Initial Adding ROCM HAL Backend to Experimental (#5943)
Initial pass at integrating ROCm into IREE so that we can codegen for and run on AMD GPUs. It follows steps similar to thomasraoux's CUDA backend. Since ROCm does not have a graph or CommandBuffer by default, we implement ROCm's command buffer using the stream API on the default stream. Tested and passes most CTS tests except:
semaphore_submission_test + semaphore_test -> some functionality is not implemented for the ROCm backend yet
command_buffer_test -> CommandBufferTest.CopySubBuffer
In the next patch:
-Complete semaphore functionality
-Squash CommandBuffer bugs
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0e8d39d..6ef40dd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -88,6 +88,7 @@
option(IREE_BUILD_EXPERIMENTAL_REMOTING "Builds experimental remoting support." OFF)
option(IREE_BUILD_EXPERIMENTAL_JAVA_BINDINGS "Builds the experimental java bindings." OFF)
+option(IREE_BUILD_EXPERIMENTAL_ROCM "Builds the experimental ROCM Backend." OFF)
#-------------------------------------------------------------------------------
# Derived flags based on primary options
@@ -446,6 +447,15 @@
add_subdirectory(iree/testing)
add_subdirectory(iree/test)
add_subdirectory(iree/vm)
+# Experimental ROCm HAL backend: adds the ROCm header target and the driver
+# directory, then appends an lld directory to CMAKE_MODULE_PATH and two lld
+# include directories to LLD_INCLUDE_DIRS.
+# NOTE(review): the lld paths are rooted at iree/third_party/llvm-project while
+# the ROCm headers are taken from ${IREE_ROOT_DIR}/third_party - confirm layout.
+if(${IREE_BUILD_EXPERIMENTAL_ROCM})
+ add_subdirectory(build_tools/third_party/rocm EXCLUDE_FROM_ALL)
+ add_subdirectory(experimental/rocm)
+ list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_BINARY_DIR}/iree/lib/cmake/lld")
+ list(APPEND LLD_INCLUDE_DIRS
+ ${CMAKE_CURRENT_SOURCE_DIR}/iree/third_party/llvm-project/lld/include
+ ${CMAKE_CURRENT_BINARY_DIR}/iree/third_party/llvm-project/llvm/tools/lld/include
+ )
+endif()
if(${IREE_BUILD_COMPILER})
add_subdirectory(iree/compiler)
diff --git a/build_tools/third_party/rocm/CMakeLists.txt b/build_tools/third_party/rocm/CMakeLists.txt
new file mode 100644
index 0000000..95ddb61
--- /dev/null
+++ b/build_tools/third_party/rocm/CMakeLists.txt
@@ -0,0 +1,29 @@
+# Copyright 2021 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Directory that holds the ROCm/HIP headers under third_party.
+set(ROCM_HEADERS_API_ROOT "${IREE_ROOT_DIR}/third_party/rocm/include")
+
+# Exposes hip/hip_runtime.h, with its include directory, as a linkable target.
+external_cc_library(
+  NAME
+    rocm_headers
+  PACKAGE
+    rocm_headers
+  ROOT
+    "${ROCM_HEADERS_API_ROOT}"
+  INCLUDES
+    "${ROCM_HEADERS_API_ROOT}"
+  HDRS
+    "hip/hip_runtime.h"
+)
+
diff --git a/experimental/rocm/CMakeLists.txt b/experimental/rocm/CMakeLists.txt
new file mode 100644
index 0000000..0e3f824
--- /dev/null
+++ b/experimental/rocm/CMakeLists.txt
@@ -0,0 +1,93 @@
+# Copyright 2021 Google LLC
+
+# Skip this directory unless the experimental ROCm backend is enabled.
+# The variable name is passed unexpanded so that an unset option evaluates as
+# false instead of expanding to a malformed `if(NOT )` condition.
+if(NOT IREE_BUILD_EXPERIMENTAL_ROCM)
+  return()
+endif()
+
+iree_add_all_subdirs()
+
+# ROCm HAL driver library. api.h is the only public header; everything else is
+# an implementation source. Sources are listed alphabetically.
+iree_cc_library(
+  NAME
+    rocm
+  HDRS
+    "api.h"
+  SRCS
+    "api.h"
+    "context_wrapper.h"
+    "descriptor_set_layout.c"
+    "descriptor_set_layout.h"
+    "direct_command_buffer.c"
+    "direct_command_buffer.h"
+    "event_semaphore.c"
+    "event_semaphore.h"
+    "executable_layout.c"
+    "executable_layout.h"
+    "native_executable.c"
+    "native_executable.h"
+    "nop_executable_cache.c"
+    "nop_executable_cache.h"
+    "rocm_allocator.c"
+    "rocm_allocator.h"
+    "rocm_buffer.c"
+    "rocm_buffer.h"
+    "rocm_device.c"
+    "rocm_device.h"
+    "rocm_driver.c"
+    "rocm_event.c"
+    "rocm_event.h"
+    "status_util.c"
+    "status_util.h"
+  DEPS
+    ::dynamic_symbols
+    iree::base
+    iree::base::core_headers
+    iree::base::internal
+    iree::base::internal::flatcc
+    iree::base::internal::synchronization
+    iree::base::logging
+    iree::base::status
+    iree::base::tracing
+    iree::hal
+    iree::schemas::rocm_executable_def_c_fbs
+  INCLUDES
+    "${CMAKE_CURRENT_LIST_DIR}/../.."
+    "${PROJECT_BINARY_DIR}"
+  PUBLIC
+)
+
+# Adds -D__HIP_PLATFORM_HCC__ to the compile line of the targets in this
+# directory.
+# NOTE(review): presumably selects the AMD code path of the HIP headers -
+# confirm; a target-scoped definition would avoid the directory-wide effect.
+add_definitions(-D__HIP_PLATFORM_HCC__)
+
+# Runtime loader for the HIP entry points listed in dynamic_symbol_tables.h.
+iree_cc_library(
+  NAME
+    dynamic_symbols
+  SRCS
+    "rocm_headers.h"
+    "dynamic_symbols.c"
+  HDRS
+    "dynamic_symbols.h"
+  TEXTUAL_HDRS
+    "dynamic_symbol_tables.h"
+  DEPS
+    rocm_headers
+    iree::base::core_headers
+    iree::base::internal::dynamic_library
+    iree::base::tracing
+  INCLUDES
+    "${CMAKE_CURRENT_LIST_DIR}/../.."
+  PUBLIC
+)
+
+# Test target for the dynamic symbol loader, labelled "driver=rocm".
+iree_cc_test(
+  NAME
+    dynamic_symbols_test
+  LABELS
+    "driver=rocm"
+  SRCS
+    "dynamic_symbols_test.cc"
+  DEPS
+    ::dynamic_symbols
+    iree::testing::gtest
+    iree::testing::gtest_main
+)
+
+### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
diff --git a/experimental/rocm/api.h b/experimental/rocm/api.h
new file mode 100644
index 0000000..ec38b00
--- /dev/null
+++ b/experimental/rocm/api.h
@@ -0,0 +1,55 @@
+// Copyright 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// See iree/base/api.h for documentation on the API conventions used.
+
+#ifndef IREE_HAL_ROCM_API_H_
+#define IREE_HAL_ROCM_API_H_
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+//===----------------------------------------------------------------------===//
+// iree_hal_rocm_driver_t
+//===----------------------------------------------------------------------===//
+
+// ROCM driver creation options.
+typedef struct {
+ // Index of the default ROCM device to use within the list of available
+ // devices.
+ int default_device_index;
+} iree_hal_rocm_driver_options_t;
+
+// Initializes |out_options|; presumably to default values (the implementation
+// is not part of this header).
+IREE_API_EXPORT void iree_hal_rocm_driver_options_initialize(
+ iree_hal_rocm_driver_options_t *out_options);
+
+// Creates a ROCM HAL driver that manages its own hipCtx_t.
+//
+// |out_driver| must be released by the caller (see |iree_hal_driver_release|).
+IREE_API_EXPORT iree_status_t iree_hal_rocm_driver_create(
+ iree_string_view_t identifier,
+ const iree_hal_rocm_driver_options_t *options,
+ iree_allocator_t host_allocator, iree_hal_driver_t **out_driver);
+
+// TODO(thomasraoux): Support importing a hipCtx_t from app.
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // IREE_HAL_ROCM_API_H_
diff --git a/experimental/rocm/context_wrapper.h b/experimental/rocm/context_wrapper.h
new file mode 100644
index 0000000..6637923
--- /dev/null
+++ b/experimental/rocm/context_wrapper.h
@@ -0,0 +1,30 @@
+// Copyright 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef IREE_HAL_ROCM_CONTEXT_WRAPPER_H_
+#define IREE_HAL_ROCM_CONTEXT_WRAPPER_H_
+
+#include "experimental/rocm/dynamic_symbols.h"
+#include "experimental/rocm/rocm_headers.h"
+#include "iree/hal/api.h"
+
+// Structure to wrap all objects constant within a context. This makes it
+// simpler to pass it to the different objects and saves memory.
+typedef struct {
+ // HIP context handle.
+ hipCtx_t rocm_context;
+ // Host allocator that objects created from this wrapper use to allocate and
+ // free their host-side structs.
+ iree_allocator_t host_allocator;
+ // Table of dynamically resolved HIP entry points.
+ iree_hal_rocm_dynamic_symbols_t *syms;
+} iree_hal_rocm_context_wrapper_t;
+
+#endif // IREE_HAL_ROCM_CONTEXT_WRAPPER_H_
diff --git a/experimental/rocm/descriptor_set_layout.c b/experimental/rocm/descriptor_set_layout.c
new file mode 100644
index 0000000..59cd8cc
--- /dev/null
+++ b/experimental/rocm/descriptor_set_layout.c
@@ -0,0 +1,78 @@
+// Copyright 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "experimental/rocm/descriptor_set_layout.h"
+
+#include "experimental/rocm/status_util.h"
+#include "iree/base/tracing.h"
+
+// Descriptor set layout object of the ROCm backend. No binding information is
+// stored; only the owning context is kept.
+typedef struct {
+ // HAL resource header, initialized with the ROCm layout vtable on creation.
+ iree_hal_resource_t resource;
+ // Context wrapper whose host allocator is used to free this object.
+ iree_hal_rocm_context_wrapper_t *context;
+} iree_hal_rocm_descriptor_set_layout_t;
+
+// Forward declaration; the vtable is defined at the end of this file.
+extern const iree_hal_descriptor_set_layout_vtable_t
+ iree_hal_rocm_descriptor_set_layout_vtable;
+
+// Downcasts a base layout pointer to the ROCm type after
+// IREE_HAL_ASSERT_TYPE has checked it against the ROCm vtable.
+static iree_hal_rocm_descriptor_set_layout_t *
+iree_hal_rocm_descriptor_set_layout_cast(
+ iree_hal_descriptor_set_layout_t *base_value) {
+ IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_rocm_descriptor_set_layout_vtable);
+ return (iree_hal_rocm_descriptor_set_layout_t *)base_value;
+}
+
+// Allocates a descriptor set layout with |context|'s host allocator.
+//
+// |usage_type|, |binding_count| and |bindings| are accepted but not recorded.
+// On success |out_descriptor_set_layout| receives the new layout; when the
+// allocation fails it stays NULL and the failing status is returned.
+iree_status_t iree_hal_rocm_descriptor_set_layout_create(
+    iree_hal_rocm_context_wrapper_t *context,
+    iree_hal_descriptor_set_layout_usage_type_t usage_type,
+    iree_host_size_t binding_count,
+    const iree_hal_descriptor_set_layout_binding_t *bindings,
+    iree_hal_descriptor_set_layout_t **out_descriptor_set_layout) {
+  IREE_ASSERT_ARGUMENT(context);
+  IREE_ASSERT_ARGUMENT(!binding_count || bindings);
+  IREE_ASSERT_ARGUMENT(out_descriptor_set_layout);
+  *out_descriptor_set_layout = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_hal_rocm_descriptor_set_layout_t *layout = NULL;
+  iree_status_t status = iree_allocator_malloc(
+      context->host_allocator, sizeof(*layout), (void **)&layout);
+  if (!iree_status_is_ok(status)) {
+    IREE_TRACE_ZONE_END(z0);
+    return status;
+  }
+
+  iree_hal_resource_initialize(&iree_hal_rocm_descriptor_set_layout_vtable,
+                               &layout->resource);
+  layout->context = context;
+  *out_descriptor_set_layout = (iree_hal_descriptor_set_layout_t *)layout;
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Frees a layout with the host allocator of the context it was created from.
+static void iree_hal_rocm_descriptor_set_layout_destroy(
+    iree_hal_descriptor_set_layout_t *base_descriptor_set_layout) {
+  iree_hal_rocm_descriptor_set_layout_t *layout =
+      iree_hal_rocm_descriptor_set_layout_cast(base_descriptor_set_layout);
+  // Copy the allocator out before the struct that references it is released.
+  iree_allocator_t host_allocator = layout->context->host_allocator;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_allocator_free(host_allocator, layout);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Vtable for ROCm descriptor set layouts; destroy is the only entry set here.
+const iree_hal_descriptor_set_layout_vtable_t
+ iree_hal_rocm_descriptor_set_layout_vtable = {
+ .destroy = iree_hal_rocm_descriptor_set_layout_destroy,
+};
diff --git a/experimental/rocm/descriptor_set_layout.h b/experimental/rocm/descriptor_set_layout.h
new file mode 100644
index 0000000..c658d3e
--- /dev/null
+++ b/experimental/rocm/descriptor_set_layout.h
@@ -0,0 +1,36 @@
+// Copyright 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef IREE_HAL_ROCM_DESCRIPTOR_SET_LAYOUT_H_
+#define IREE_HAL_ROCM_DESCRIPTOR_SET_LAYOUT_H_
+
+#include "experimental/rocm/context_wrapper.h"
+#include "iree/hal/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+// Creates a ROCm descriptor set layout allocated with |context|'s host
+// allocator and returns it in |out_descriptor_set_layout|.
+iree_status_t iree_hal_rocm_descriptor_set_layout_create(
+ iree_hal_rocm_context_wrapper_t *context,
+ iree_hal_descriptor_set_layout_usage_type_t usage_type,
+ iree_host_size_t binding_count,
+ const iree_hal_descriptor_set_layout_binding_t *bindings,
+ iree_hal_descriptor_set_layout_t **out_descriptor_set_layout);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // IREE_HAL_ROCM_DESCRIPTOR_SET_LAYOUT_H_
diff --git a/experimental/rocm/direct_command_buffer.c b/experimental/rocm/direct_command_buffer.c
new file mode 100644
index 0000000..24f03c2
--- /dev/null
+++ b/experimental/rocm/direct_command_buffer.c
@@ -0,0 +1,345 @@
+// Copyright 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "experimental/rocm/direct_command_buffer.h"
+
+#include "experimental/rocm/native_executable.h"
+#include "experimental/rocm/rocm_buffer.h"
+#include "experimental/rocm/rocm_event.h"
+#include "experimental/rocm/status_util.h"
+#include "iree/base/tracing.h"
+
+// Command buffer implementation that issues HIP calls directly.
+// This records the commands on the calling thread without additional threading
+// indirection.
+
+typedef struct {
+ // HAL resource header, initialized with the command buffer vtable.
+ iree_hal_resource_t resource;
+ // Context wrapper providing the host allocator and the HIP symbol table.
+ iree_hal_rocm_context_wrapper_t *context;
+ // Values handed to iree_hal_rocm_direct_command_buffer_allocate.
+ iree_hal_command_buffer_mode_t mode;
+ iree_hal_command_category_t allowed_categories;
+ iree_hal_queue_affinity_t queue_affinity;
+ // Size in bytes of this struct plus its trailing storage.
+ size_t total_size;
+ // Keep track of the current set of kernel arguments.
+ // One pointer per binding slot; each points at a hipDeviceptr_t cell stored
+ // after the pointer array in the same allocation.
+ void *current_descriptor[];
+} iree_hal_rocm_direct_command_buffer_t;
+
+// Number of binding slots reserved in every command buffer.
+static const size_t max_binding_count = 64;
+
+// Forward declaration; the vtable is defined at the end of this file.
+extern const iree_hal_command_buffer_vtable_t
+ iree_hal_rocm_direct_command_buffer_vtable;
+
+// Downcasts a base command buffer pointer to the ROCm type after
+// IREE_HAL_ASSERT_TYPE has checked it against the ROCm vtable.
+static iree_hal_rocm_direct_command_buffer_t *
+iree_hal_rocm_direct_command_buffer_cast(
+ iree_hal_command_buffer_t *base_value) {
+ IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_rocm_direct_command_buffer_vtable);
+ return (iree_hal_rocm_direct_command_buffer_t *)base_value;
+}
+
+// Creates a command buffer whose trailing storage holds |max_binding_count|
+// kernel-argument slots, each pre-pointed at its own hipDeviceptr_t cell.
+//
+// |out_command_buffer| is reset to NULL up front so that it never holds an
+// indeterminate value when the host allocation fails. On failure the
+// allocator's status is returned.
+iree_status_t iree_hal_rocm_direct_command_buffer_allocate(
+    iree_hal_rocm_context_wrapper_t *context,
+    iree_hal_command_buffer_mode_t mode,
+    iree_hal_command_category_t command_categories,
+    iree_hal_queue_affinity_t queue_affinity,
+    iree_hal_command_buffer_t **out_command_buffer) {
+  IREE_ASSERT_ARGUMENT(context);
+  IREE_ASSERT_ARGUMENT(out_command_buffer);
+  *out_command_buffer = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // A single allocation holds the struct, the array of argument pointers and
+  // the array of device pointers those arguments refer to.
+  iree_hal_rocm_direct_command_buffer_t *command_buffer = NULL;
+  size_t total_size = sizeof(*command_buffer) +
+                      max_binding_count * sizeof(void *) +
+                      max_binding_count * sizeof(hipDeviceptr_t);
+  iree_status_t status = iree_allocator_malloc(
+      context->host_allocator, total_size, (void **)&command_buffer);
+  if (iree_status_is_ok(status)) {
+    iree_hal_resource_initialize(&iree_hal_rocm_direct_command_buffer_vtable,
+                                 &command_buffer->resource);
+    command_buffer->context = context;
+    command_buffer->mode = mode;
+    command_buffer->allowed_categories = command_categories;
+    command_buffer->queue_affinity = queue_affinity;
+    // The device pointer cells start right after the last argument pointer.
+    hipDeviceptr_t *device_ptrs =
+        (hipDeviceptr_t *)(command_buffer->current_descriptor +
+                           max_binding_count);
+    for (size_t i = 0; i < max_binding_count; i++) {
+      command_buffer->current_descriptor[i] = &device_ptrs[i];
+    }
+    command_buffer->total_size = total_size;
+
+    *out_command_buffer = (iree_hal_command_buffer_t *)command_buffer;
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Releases the command buffer's single host allocation.
+static void iree_hal_rocm_direct_command_buffer_destroy(
+    iree_hal_command_buffer_t *base_command_buffer) {
+  iree_hal_rocm_direct_command_buffer_t *self =
+      iree_hal_rocm_direct_command_buffer_cast(base_command_buffer);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Copy the allocator out first; it is reached through the struct being
+  // freed.
+  iree_allocator_t host_allocator = self->context->host_allocator;
+  iree_allocator_free(host_allocator, self);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Returns the mode the command buffer was allocated with.
+static iree_hal_command_buffer_mode_t iree_hal_rocm_direct_command_buffer_mode(
+    const iree_hal_command_buffer_t *base_command_buffer) {
+  // Plain cast: the checked cast helper only accepts non-const pointers.
+  return ((const iree_hal_rocm_direct_command_buffer_t *)base_command_buffer)
+      ->mode;
+}
+
+// Returns the command categories the command buffer was allocated with.
+static iree_hal_command_category_t
+iree_hal_rocm_direct_command_buffer_allowed_categories(
+    const iree_hal_command_buffer_t *base_command_buffer) {
+  // Plain cast: the checked cast helper only accepts non-const pointers.
+  return ((const iree_hal_rocm_direct_command_buffer_t *)base_command_buffer)
+      ->allowed_categories;
+}
+
+// No-op: nothing is set up when recording begins.
+static iree_status_t iree_hal_rocm_direct_command_buffer_begin(
+ iree_hal_command_buffer_t *base_command_buffer) {
+ return iree_ok_status();
+}
+
+// No-op: nothing is finalized when recording ends.
+static iree_status_t iree_hal_rocm_direct_command_buffer_end(
+ iree_hal_command_buffer_t *base_command_buffer) {
+ return iree_ok_status();
+}
+
+// Execution barrier stub: every argument is ignored and OK is returned.
+static iree_status_t iree_hal_rocm_direct_command_buffer_execution_barrier(
+ iree_hal_command_buffer_t *base_command_buffer,
+ iree_hal_execution_stage_t source_stage_mask,
+ iree_hal_execution_stage_t target_stage_mask,
+ iree_hal_execution_barrier_flags_t flags,
+ iree_host_size_t memory_barrier_count,
+ const iree_hal_memory_barrier_t *memory_barriers,
+ iree_host_size_t buffer_barrier_count,
+ const iree_hal_buffer_barrier_t *buffer_barriers) {
+ // TODO: Implement barrier
+ return iree_ok_status();
+}
+
+// Event signal stub: every argument is ignored and OK is returned.
+static iree_status_t iree_hal_rocm_direct_command_buffer_signal_event(
+ iree_hal_command_buffer_t *base_command_buffer, iree_hal_event_t *event,
+ iree_hal_execution_stage_t source_stage_mask) {
+ // TODO: Implement event signaling
+ return iree_ok_status();
+}
+
+// Event reset stub: every argument is ignored and OK is returned.
+static iree_status_t iree_hal_rocm_direct_command_buffer_reset_event(
+ iree_hal_command_buffer_t *base_command_buffer, iree_hal_event_t *event,
+ iree_hal_execution_stage_t source_stage_mask) {
+ // TODO: Implement event reset
+ return iree_ok_status();
+}
+
+// Event wait stub: every argument is ignored and OK is returned.
+static iree_status_t iree_hal_rocm_direct_command_buffer_wait_events(
+ iree_hal_command_buffer_t *base_command_buffer,
+ iree_host_size_t event_count, const iree_hal_event_t **events,
+ iree_hal_execution_stage_t source_stage_mask,
+ iree_hal_execution_stage_t target_stage_mask,
+ iree_host_size_t memory_barrier_count,
+ const iree_hal_memory_barrier_t *memory_barriers,
+ iree_host_size_t buffer_barrier_count,
+ const iree_hal_buffer_barrier_t *buffer_barriers) {
+ // TODO: Implement event wait
+ return iree_ok_status();
+}
+
+// Discard is a no-op for this backend; |buffer| is not touched.
+static iree_status_t iree_hal_rocm_direct_command_buffer_discard_buffer(
+ iree_hal_command_buffer_t *base_command_buffer, iree_hal_buffer_t *buffer) {
+ // nothing to do.
+ return iree_ok_status();
+}
+
+// Expands a 1-, 2- or 4-byte fill pattern to a 32-bit value by repeating it.
+// Any other |pattern_length| yields 0.
+static uint32_t iree_hal_rocm_splat_pattern(const void *pattern,
+                                            size_t pattern_length) {
+  if (pattern_length == 1) {
+    // Multiplying by 0x01010101 copies the byte into all four byte lanes.
+    uint32_t byte_value = *(const uint8_t *)(pattern);
+    return byte_value * 0x01010101u;
+  }
+  if (pattern_length == 2) {
+    // Multiplying by 0x00010001 copies the half-word into both halves.
+    uint32_t half_value = *(const uint16_t *)(pattern);
+    return half_value * 0x00010001u;
+  }
+  if (pattern_length == 4) {
+    return *(const uint32_t *)(pattern);
+  }
+  return 0;  // Already verified that this should not be possible.
+}
+
+// Fills |length| bytes of |target_buffer|, starting |target_offset| bytes into
+// the buffer's own subspan, by issuing hipMemsetAsync on the NULL stream.
+//
+// NOTE(review): the pattern is widened to 32 bits and passed as the int value
+// of hipMemsetAsync; that API is documented as a byte-wise fill, so 2- and
+// 4-byte patterns may not be written as intended - confirm, and consider a
+// 16/32-bit memset entry point.
+static iree_status_t iree_hal_rocm_direct_command_buffer_fill_buffer(
+    iree_hal_command_buffer_t *base_command_buffer,
+    iree_hal_buffer_t *target_buffer, iree_device_size_t target_offset,
+    iree_device_size_t length, const void *pattern,
+    iree_host_size_t pattern_length) {
+  iree_hal_rocm_direct_command_buffer_t *command_buffer =
+      iree_hal_rocm_direct_command_buffer_cast(base_command_buffer);
+
+  // Absolute device address of the first byte to fill.
+  hipDeviceptr_t base_ptr = iree_hal_rocm_buffer_device_pointer(
+      iree_hal_buffer_allocated_buffer(target_buffer));
+  target_offset += iree_hal_buffer_byte_offset(target_buffer);
+  uint32_t splat = iree_hal_rocm_splat_pattern(pattern, pattern_length);
+  hipDeviceptr_t fill_start = base_ptr + target_offset;
+
+  // TODO(raikonenfnu): Currently using NULL stream, need to figure out way to
+  // access proper stream from command buffer
+  ROCM_RETURN_IF_ERROR(
+      command_buffer->context->syms,
+      hipMemsetAsync(fill_start, (int)splat, (size_t)length, 0),
+      "hipMemsetAsync");
+  return iree_ok_status();
+}
+
+// Not implemented yet: always returns IREE_STATUS_UNIMPLEMENTED.
+static iree_status_t iree_hal_rocm_direct_command_buffer_update_buffer(
+ iree_hal_command_buffer_t *base_command_buffer, const void *source_buffer,
+ iree_host_size_t source_offset, iree_hal_buffer_t *target_buffer,
+ iree_device_size_t target_offset, iree_device_size_t length) {
+ return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+ "need rocm implementation");
+}
+
+// Copies |length| bytes device-to-device from |source_buffer| to
+// |target_buffer| with hipMemcpyAsync on the NULL stream.
+//
+// |source_offset| and |target_offset| are relative to each buffer's subspan.
+// They are combined with the buffer's byte offset and applied to the raw
+// device pointers, so copies between sub-ranges do not start at the base of
+// the allocations.
+static iree_status_t iree_hal_rocm_direct_command_buffer_copy_buffer(
+    iree_hal_command_buffer_t *base_command_buffer,
+    iree_hal_buffer_t *source_buffer, iree_device_size_t source_offset,
+    iree_hal_buffer_t *target_buffer, iree_device_size_t target_offset,
+    iree_device_size_t length) {
+  iree_hal_rocm_direct_command_buffer_t *command_buffer =
+      iree_hal_rocm_direct_command_buffer_cast(base_command_buffer);
+
+  hipDeviceptr_t target_device_buffer = iree_hal_rocm_buffer_device_pointer(
+      iree_hal_buffer_allocated_buffer(target_buffer));
+  target_offset += iree_hal_buffer_byte_offset(target_buffer);
+  hipDeviceptr_t source_device_buffer = iree_hal_rocm_buffer_device_pointer(
+      iree_hal_buffer_allocated_buffer(source_buffer));
+  source_offset += iree_hal_buffer_byte_offset(source_buffer);
+  // Same pointer-plus-offset form that fill_buffer uses for its destination.
+  hipDeviceptr_t dst = target_device_buffer + target_offset;
+  hipDeviceptr_t src = source_device_buffer + source_offset;
+  // TODO(raikonenfnu): Currently using NULL stream, need to figure out way to
+  // access proper stream from command buffer
+  ROCM_RETURN_IF_ERROR(
+      command_buffer->context->syms,
+      hipMemcpyAsync(dst, src, length, hipMemcpyDeviceToDevice, 0),
+      "hipMemcpyAsync");
+  return iree_ok_status();
+}
+
+// Not implemented yet: always returns IREE_STATUS_UNIMPLEMENTED.
+static iree_status_t iree_hal_rocm_direct_command_buffer_push_constants(
+ iree_hal_command_buffer_t *base_command_buffer,
+ iree_hal_executable_layout_t *executable_layout, iree_host_size_t offset,
+ const void *values, iree_host_size_t values_length) {
+ return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+ "need rocm implementation");
+}
+
+// Stores the device address of every binding in the kernel-argument slot
+// selected by its binding index. The address is the buffer's device pointer
+// plus the buffer's byte offset plus the binding's own offset.
+//
+// |executable_layout| and |set| are not consulted. Returns
+// IREE_STATUS_OUT_OF_RANGE when a binding index does not fit in the
+// |max_binding_count| slots; the check is kept in release builds because an
+// out-of-range index would otherwise be written through an invalid pointer.
+static iree_status_t iree_hal_rocm_direct_command_buffer_push_descriptor_set(
+    iree_hal_command_buffer_t *base_command_buffer,
+    iree_hal_executable_layout_t *executable_layout, uint32_t set,
+    iree_host_size_t binding_count,
+    const iree_hal_descriptor_set_binding_t *bindings) {
+  iree_hal_rocm_direct_command_buffer_t *command_buffer =
+      iree_hal_rocm_direct_command_buffer_cast(base_command_buffer);
+  for (iree_host_size_t i = 0; i < binding_count; i++) {
+    uint32_t arg_index = bindings[i].binding;
+    if (arg_index >= max_binding_count) {
+      return iree_make_status(
+          IREE_STATUS_OUT_OF_RANGE,
+          "binding index %u is larger than the max expected (%zu)", arg_index,
+          max_binding_count);
+    }
+    hipDeviceptr_t device_ptr =
+        iree_hal_rocm_buffer_device_pointer(
+            iree_hal_buffer_allocated_buffer(bindings[i].buffer)) +
+        iree_hal_buffer_byte_offset(bindings[i].buffer) + bindings[i].offset;
+    *((hipDeviceptr_t *)command_buffer->current_descriptor[arg_index]) =
+        device_ptr;
+  }
+  return iree_ok_status();
+}
+
+// Not implemented yet: always returns IREE_STATUS_UNIMPLEMENTED.
+static iree_status_t iree_hal_rocm_direct_command_buffer_bind_descriptor_set(
+ iree_hal_command_buffer_t *base_command_buffer,
+ iree_hal_executable_layout_t *executable_layout, uint32_t set,
+ iree_hal_descriptor_set_t *descriptor_set,
+ iree_host_size_t dynamic_offset_count,
+ const iree_device_size_t *dynamic_offsets) {
+ return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+ "need rocm implementation");
+}
+
+// Launches |entry_point| of |executable| on the NULL stream with a grid of
+// |workgroup_x| x |workgroup_y| x |workgroup_z| blocks. The block dimensions
+// are queried from the executable, and the kernel arguments are the device
+// pointers last stored by push_descriptor_set.
+//
+// Returns the error of the block size query or of hipModuleLaunchKernel.
+static iree_status_t iree_hal_rocm_direct_command_buffer_dispatch(
+    iree_hal_command_buffer_t *base_command_buffer,
+    iree_hal_executable_t *executable, int32_t entry_point,
+    uint32_t workgroup_x, uint32_t workgroup_y, uint32_t workgroup_z) {
+  iree_hal_rocm_direct_command_buffer_t *command_buffer =
+      iree_hal_rocm_direct_command_buffer_cast(base_command_buffer);
+
+  int32_t block_size_x, block_size_y, block_size_z;
+  IREE_RETURN_IF_ERROR(iree_hal_rocm_native_executable_block_size(
+      executable, entry_point, &block_size_x, &block_size_y, &block_size_z));
+  hipFunction_t func =
+      iree_hal_rocm_native_executable_for_entry_point(executable, entry_point);
+  // TODO(raikonenfnu): Currently using NULL stream, need to figure out way to
+  // access proper stream from command buffer
+  // The two literal zeros are the unsigned int argument (shared memory size in
+  // the HIP API) and the stream; the trailing NULL is the unused extra list.
+  ROCM_RETURN_IF_ERROR(
+      command_buffer->context->syms,
+      hipModuleLaunchKernel(func, workgroup_x, workgroup_y, workgroup_z,
+                            block_size_x, block_size_y, block_size_z, 0, 0,
+                            command_buffer->current_descriptor, NULL),
+      "hipModuleLaunchKernel");
+  return iree_ok_status();
+}
+
+// Not implemented yet: always returns IREE_STATUS_UNIMPLEMENTED.
+static iree_status_t iree_hal_rocm_direct_command_buffer_dispatch_indirect(
+ iree_hal_command_buffer_t *base_command_buffer,
+ iree_hal_executable_t *executable, int32_t entry_point,
+ iree_hal_buffer_t *workgroups_buffer,
+ iree_device_size_t workgroups_offset) {
+ return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
+ "need rocm implementation");
+}
+
+// Vtable binding the command buffer interface to the functions of this file.
+// Several entries are stubs that return OK or IREE_STATUS_UNIMPLEMENTED.
+const iree_hal_command_buffer_vtable_t
+ iree_hal_rocm_direct_command_buffer_vtable = {
+ .destroy = iree_hal_rocm_direct_command_buffer_destroy,
+ .mode = iree_hal_rocm_direct_command_buffer_mode,
+ .allowed_categories =
+ iree_hal_rocm_direct_command_buffer_allowed_categories,
+ .begin = iree_hal_rocm_direct_command_buffer_begin,
+ .end = iree_hal_rocm_direct_command_buffer_end,
+ .execution_barrier =
+ iree_hal_rocm_direct_command_buffer_execution_barrier,
+ .signal_event = iree_hal_rocm_direct_command_buffer_signal_event,
+ .reset_event = iree_hal_rocm_direct_command_buffer_reset_event,
+ .wait_events = iree_hal_rocm_direct_command_buffer_wait_events,
+ .discard_buffer = iree_hal_rocm_direct_command_buffer_discard_buffer,
+ .fill_buffer = iree_hal_rocm_direct_command_buffer_fill_buffer,
+ .update_buffer = iree_hal_rocm_direct_command_buffer_update_buffer,
+ .copy_buffer = iree_hal_rocm_direct_command_buffer_copy_buffer,
+ .push_constants = iree_hal_rocm_direct_command_buffer_push_constants,
+ .push_descriptor_set =
+ iree_hal_rocm_direct_command_buffer_push_descriptor_set,
+ .bind_descriptor_set =
+ iree_hal_rocm_direct_command_buffer_bind_descriptor_set,
+ .dispatch = iree_hal_rocm_direct_command_buffer_dispatch,
+ .dispatch_indirect =
+ iree_hal_rocm_direct_command_buffer_dispatch_indirect,
+};
diff --git a/experimental/rocm/direct_command_buffer.h b/experimental/rocm/direct_command_buffer.h
new file mode 100644
index 0000000..41ffb9c
--- /dev/null
+++ b/experimental/rocm/direct_command_buffer.h
@@ -0,0 +1,51 @@
+// Copyright 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef IREE_HAL_ROCM_DIRECT_COMMAND_BUFFER_H_
+#define IREE_HAL_ROCM_DIRECT_COMMAND_BUFFER_H_
+
+#include "experimental/rocm/context_wrapper.h"
+#include "experimental/rocm/dynamic_symbols.h"
+#include "experimental/rocm/rocm_headers.h"
+#include "iree/hal/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+// ROCM Kernel Information Structure
+// Groups a kernel handle with its grid dimensions, block dimensions and
+// argument array.
+// NOTE(review): no use of this type is visible in the sources reviewed here -
+// confirm it is still needed.
+typedef struct {
+ hipFunction_t func;
+ unsigned int gridDimX;
+ unsigned int gridDimY;
+ unsigned int gridDimZ;
+ unsigned int blockDimX;
+ unsigned int blockDimY;
+ unsigned int blockDimZ;
+ void **kernelParams;
+} hip_launch_params;
+
+// Creates a rocm direct command buffer.
+// The object is allocated with |context|'s host allocator and returned in
+// |out_command_buffer|.
+iree_status_t iree_hal_rocm_direct_command_buffer_allocate(
+ iree_hal_rocm_context_wrapper_t *context,
+ iree_hal_command_buffer_mode_t mode,
+ iree_hal_command_category_t command_categories,
+ iree_hal_queue_affinity_t queue_affinity,
+ iree_hal_command_buffer_t **out_command_buffer);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // IREE_HAL_ROCM_DIRECT_COMMAND_BUFFER_H_
diff --git a/experimental/rocm/dynamic_symbol_tables.h b/experimental/rocm/dynamic_symbol_tables.h
new file mode 100644
index 0000000..f67aa30
--- /dev/null
+++ b/experimental/rocm/dynamic_symbol_tables.h
@@ -0,0 +1,53 @@
+// Copyright 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// X-macro table of the HIP entry points resolved at runtime. The including
+// file defines RC_PFN_DECL(name, argument types...) and RC_PFN_STR_DECL(...)
+// before including this header and undefines them afterwards (see
+// dynamic_symbols.c).
+RC_PFN_DECL(hipCtxCreate, hipCtx_t *, unsigned int, hipDevice_t)
+RC_PFN_DECL(hipCtxDestroy, hipCtx_t)
+RC_PFN_DECL(hipDeviceGet, hipDevice_t *, int) // No direct, need to modify
+RC_PFN_DECL(hipGetDeviceCount, int *)
+RC_PFN_DECL(hipDeviceGetName, char *, int,
+ hipDevice_t) // No direct, need to modify
+RC_PFN_STR_DECL(
+ hipGetErrorName,
+ hipError_t) // Unlike other functions hipGetErrorName(hipError_t) return
+ // const char* instead of hipError_t so it uses a different
+ // macro
+RC_PFN_STR_DECL(
+ hipGetErrorString,
+ hipError_t) // Unlike other functions hipGetErrorString(hipError_t) return
+ // const char* instead of hipError_t so it uses a different
+ // macro
+RC_PFN_DECL(hipInit, unsigned int)
+RC_PFN_DECL(hipModuleLaunchKernel, hipFunction_t, unsigned int, unsigned int,
+ unsigned int, unsigned int, unsigned int, unsigned int,
+ unsigned int, hipStream_t, void **, void **)
+RC_PFN_DECL(hipMemset, void *, int, size_t)
+RC_PFN_DECL(hipMemsetAsync, void *, int, size_t, hipStream_t)
+RC_PFN_DECL(hipMemcpy, void *, const void *, size_t, hipMemcpyKind)
+RC_PFN_DECL(hipMemcpyAsync, void *, const void *, size_t, hipMemcpyKind,
+ hipStream_t)
+RC_PFN_DECL(hipMalloc, void **, size_t)
+RC_PFN_DECL(hipFree, void *)
+RC_PFN_DECL(hipHostFree, void *)
+// NOTE(review): HIP's public header declares hipMemAllocHost(void **, size_t);
+// the extra unsigned int matches hipHostMalloc - confirm the intended symbol.
+RC_PFN_DECL(hipMemAllocHost, void **, size_t, unsigned int)
+RC_PFN_DECL(hipHostGetDevicePointer, void **, void *, unsigned int)
+RC_PFN_DECL(hipModuleGetFunction, hipFunction_t *, hipModule_t, const char *)
+RC_PFN_DECL(hipModuleLoadDataEx, hipModule_t *, const void *, unsigned int,
+ hipJitOption *, void **)
+RC_PFN_DECL(hipModuleLoadData, hipModule_t *, const void *)
+RC_PFN_DECL(hipModuleUnload, hipModule_t)
+RC_PFN_DECL(hipStreamCreateWithFlags, hipStream_t *, unsigned int)
+RC_PFN_DECL(hipStreamDestroy, hipStream_t)
+RC_PFN_DECL(hipStreamSynchronize, hipStream_t)
+RC_PFN_DECL(hipStreamWaitEvent, hipStream_t, hipEvent_t, unsigned int)
diff --git a/experimental/rocm/dynamic_symbols.c b/experimental/rocm/dynamic_symbols.c
new file mode 100644
index 0000000..b198c87
--- /dev/null
+++ b/experimental/rocm/dynamic_symbols.c
@@ -0,0 +1,90 @@
+// Copyright 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "experimental/rocm/dynamic_symbols.h"
+
+#include <stddef.h>
+#include <string.h>
+
+#include "iree/base/internal/dynamic_library.h"
+#include "iree/base/target_platform.h"
+#include "iree/base/tracing.h"
+
+// Candidate file names handed to iree_dynamic_library_load_from_files when
+// locating the HIP runtime library.
+static const char *kROCMLoaderSearchNames[] = {
+#if defined(IREE_PLATFORM_WINDOWS)
+    "amdhip64.dll",
+#else
+    "libamdhip64.so",
+#endif
+};
+
+// Looks up every symbol listed in dynamic_symbol_tables.h in
+// |syms->loader_library| and stores the address in the same-named member of
+// |syms|. Returns the status of the first failing lookup, if any.
+static iree_status_t iree_hal_rocm_dynamic_symbols_resolve_all(
+    iree_hal_rocm_dynamic_symbols_t *syms) {
+#define RC_PFN_DECL(rocmSymbolName, ...)                               \
+  {                                                                    \
+    static const char *kName = #rocmSymbolName;                        \
+    IREE_RETURN_IF_ERROR(iree_dynamic_library_lookup_symbol(           \
+        syms->loader_library, kName, (void **)&syms->rocmSymbolName)); \
+  }
+// String-returning entries are looked up exactly like all other entries.
+#define RC_PFN_STR_DECL(rocmSymbolName, ...) \
+  RC_PFN_DECL(rocmSymbolName, __VA_ARGS__)
+#include "experimental/rocm/dynamic_symbol_tables.h"
+#undef RC_PFN_DECL
+#undef RC_PFN_STR_DECL
+  return iree_ok_status();
+}
+
+// Zeroes |out_syms|, loads the HIP runtime library and resolves all symbols.
+// A library that cannot be found is reported as IREE_STATUS_UNAVAILABLE. If
+// loading fails for another reason, or symbol resolution fails, |out_syms| is
+// deinitialized before the failing status is returned.
+iree_status_t iree_hal_rocm_dynamic_symbols_initialize(
+    iree_allocator_t allocator, iree_hal_rocm_dynamic_symbols_t *out_syms) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  memset(out_syms, 0, sizeof(*out_syms));
+  iree_status_t status = iree_dynamic_library_load_from_files(
+      IREE_ARRAYSIZE(kROCMLoaderSearchNames), kROCMLoaderSearchNames,
+      IREE_DYNAMIC_LIBRARY_FLAG_NONE, allocator, &out_syms->loader_library);
+  if (iree_status_is_not_found(status)) {
+    iree_status_ignore(status);
+    // Every exit path must close the trace zone opened above.
+    IREE_TRACE_ZONE_END(z0);
+    return iree_make_status(
+        IREE_STATUS_UNAVAILABLE,
+        "ROCM/HIP runtime library not available; ensure installed and on path");
+  }
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_rocm_dynamic_symbols_resolve_all(out_syms);
+  }
+  if (!iree_status_is_ok(status)) {
+    iree_hal_rocm_dynamic_symbols_deinitialize(out_syms);
+  }
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Releases the loader library held by |syms| and zeroes the whole struct.
+void iree_hal_rocm_dynamic_symbols_deinitialize(
+    iree_hal_rocm_dynamic_symbols_t *syms) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  iree_dynamic_library_release(syms->loader_library);
+  memset(syms, 0, sizeof(*syms));
+  IREE_TRACE_ZONE_END(z0);
+}
diff --git a/experimental/rocm/dynamic_symbols.h b/experimental/rocm/dynamic_symbols.h
new file mode 100644
index 0000000..e5f5891
--- /dev/null
+++ b/experimental/rocm/dynamic_symbols.h
@@ -0,0 +1,58 @@
+// Copyright 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef IREE_HAL_ROCM_DYNAMIC_SYMBOLS_H_
+#define IREE_HAL_ROCM_DYNAMIC_SYMBOLS_H_
+
+#include "experimental/rocm/rocm_headers.h"
+#include "iree/base/api.h"
+#include "iree/base/internal/dynamic_library.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Holds a dynamically loaded subset of the ROCM/HIP API. One function pointer
+// is declared per entry of `dynamic_symbol_tables.h`, and loading fails if any
+// of those symbols is unavailable. Signatures are intended to match the HIP
+// declarations made visible through `rocm_headers.h`.
+typedef struct {
+  iree_dynamic_library_t *loader_library;
+
+#define RC_PFN_DECL(rocmSymbolName, ...) \
+  hipError_t (*rocmSymbolName)(__VA_ARGS__);
+#define RC_PFN_STR_DECL(rocmSymbolName, ...) \
+  const char *(*rocmSymbolName)(__VA_ARGS__);
+#include "experimental/rocm/dynamic_symbol_tables.h"
+#undef RC_PFN_DECL
+#undef RC_PFN_STR_DECL
+} iree_hal_rocm_dynamic_symbols_t;
+
+// Loads the ROCM/HIP library and fills |out_syms| in-place with its symbols.
+// Pair every successful call with iree_hal_rocm_dynamic_symbols_deinitialize
+// to release the library resources.
+iree_status_t iree_hal_rocm_dynamic_symbols_initialize(
+    iree_allocator_t allocator, iree_hal_rocm_dynamic_symbols_t *out_syms);
+
+// Unloads the library backing |syms| and invalidates all of its function
+// pointers. The pointers _may_ keep working if something else still holds the
+// library loaded, so be careful not to rely on that.
+void iree_hal_rocm_dynamic_symbols_deinitialize(
+    iree_hal_rocm_dynamic_symbols_t *syms);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_ROCM_DYNAMIC_SYMBOLS_H_
diff --git a/experimental/rocm/dynamic_symbols_test.cc b/experimental/rocm/dynamic_symbols_test.cc
new file mode 100644
index 0000000..541d04c
--- /dev/null
+++ b/experimental/rocm/dynamic_symbols_test.cc
@@ -0,0 +1,59 @@
+// Copyright 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "experimental/rocm/dynamic_symbols.h"
+
+#include "iree/testing/gtest.h"
+#include "iree/testing/status_matchers.h"
+
+namespace iree {
+namespace hal {
+namespace rocm {
+namespace {
+
+// Asserts that the HIP call |expr| returned hipSuccess.
+#define ROCM_CHECK_ERRORS(expr)    \
+  {                                \
+    hipError_t status = expr;      \
+    ASSERT_EQ(hipSuccess, status); \
+  }
+
+// Loads the HIP symbols through the system loader and calls a few of them.
+// The test is skipped when the symbols cannot be loaded.
+TEST(DynamicSymbolsTest, CreateFromSystemLoader) {
+  iree_hal_rocm_dynamic_symbols_t symbols;
+  iree_status_t status = iree_hal_rocm_dynamic_symbols_initialize(
+      iree_allocator_system(), &symbols);
+  if (!iree_status_is_ok(status)) {
+    // Consume the failing status so that it is not leaked by the skip.
+    iree_status_ignore(status);
+    IREE_LOG(WARNING) << "Symbols cannot be loaded, skipping test.";
+    GTEST_SKIP();
+  }
+
+  int device_count = 0;
+  ROCM_CHECK_ERRORS(symbols.hipInit(0));
+  ROCM_CHECK_ERRORS(symbols.hipGetDeviceCount(&device_count));
+  if (device_count > 0) {
+    hipDevice_t device;
+    ROCM_CHECK_ERRORS(symbols.hipDeviceGet(&device, /*ordinal=*/0));
+  }
+
+  iree_hal_rocm_dynamic_symbols_deinitialize(&symbols);
+}
+
+}  // namespace
+}  // namespace rocm
+}  // namespace hal
+}  // namespace iree
diff --git a/experimental/rocm/event_semaphore.c b/experimental/rocm/event_semaphore.c
new file mode 100644
index 0000000..0cdfa02
--- /dev/null
+++ b/experimental/rocm/event_semaphore.c
@@ -0,0 +1,112 @@
+// Copyright 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "experimental/rocm/event_semaphore.h"
+
+#include "experimental/rocm/status_util.h"
+#include "iree/base/tracing.h"
+
+// Placeholder semaphore; it only records the context it was created from and
+// the requested initial value.
+typedef struct {
+  iree_hal_resource_t resource;
+  iree_hal_rocm_context_wrapper_t *context;
+  uint64_t initial_value;
+} iree_hal_rocm_semaphore_t;
+
+extern const iree_hal_semaphore_vtable_t iree_hal_rocm_semaphore_vtable;
+
+// Downcasts |base_value| after asserting it uses the ROCM semaphore vtable.
+static iree_hal_rocm_semaphore_t *iree_hal_rocm_semaphore_cast(
+    iree_hal_semaphore_t *base_value) {
+  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_rocm_semaphore_vtable);
+  return (iree_hal_rocm_semaphore_t *)base_value;
+}
+
+// Allocates a semaphore from |context->host_allocator| and returns it in
+// |out_semaphore|. On allocation failure |out_semaphore| is left untouched.
+iree_status_t iree_hal_rocm_semaphore_create(
+    iree_hal_rocm_context_wrapper_t *context, uint64_t initial_value,
+    iree_hal_semaphore_t **out_semaphore) {
+  IREE_ASSERT_ARGUMENT(context);
+  IREE_ASSERT_ARGUMENT(out_semaphore);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_hal_rocm_semaphore_t *semaphore = NULL;
+  iree_status_t status = iree_allocator_malloc(
+      context->host_allocator, sizeof(*semaphore), (void **)&semaphore);
+  if (iree_status_is_ok(status)) {
+    iree_hal_resource_initialize(&iree_hal_rocm_semaphore_vtable,
+                                 &semaphore->resource);
+    semaphore->context = context;
+    semaphore->initial_value = initial_value;
+    *out_semaphore = (iree_hal_semaphore_t *)semaphore;
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Frees the semaphore with the host allocator of its context.
+static void iree_hal_rocm_semaphore_destroy(
+    iree_hal_semaphore_t *base_semaphore) {
+  iree_hal_rocm_semaphore_t *semaphore =
+      iree_hal_rocm_semaphore_cast(base_semaphore);
+  iree_allocator_t host_allocator = semaphore->context->host_allocator;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_allocator_free(host_allocator, semaphore);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Not implemented yet: writes 0 to |out_value| and returns UNIMPLEMENTED.
+static iree_status_t iree_hal_rocm_semaphore_query(
+    iree_hal_semaphore_t *base_semaphore, uint64_t *out_value) {
+  // TODO: Support semaphores completely.
+  *out_value = 0;
+  return iree_make_status(IREE_STATUS_UNIMPLEMENTED, "Not impemented on rocm");
+}
+
+// No-op signal that always reports success.
+static iree_status_t iree_hal_rocm_semaphore_signal(
+    iree_hal_semaphore_t *base_semaphore, uint64_t new_value) {
+  // TODO: Support semaphores completely. Return OK currently as everything is
+  // synchronized for each submit to allow things to run.
+  return iree_ok_status();
+}
+
+// Failure state is not tracked yet. The HAL semaphore API hands ownership of
+// |status| to the implementation, so it is released here rather than leaked.
+static void iree_hal_rocm_semaphore_fail(iree_hal_semaphore_t *base_semaphore,
+                                         iree_status_t status) {
+  iree_status_ignore(status);
+}
+
+// No-op wait that always reports success.
+static iree_status_t iree_hal_rocm_semaphore_wait(
+    iree_hal_semaphore_t *base_semaphore, uint64_t value,
+    iree_timeout_t timeout) {
+  // TODO: Support semaphores completely. Return OK currently as everything is
+  // synchronized for each submit to allow things to run.
+  return iree_ok_status();
+}
+
+const iree_hal_semaphore_vtable_t iree_hal_rocm_semaphore_vtable = {
+    .destroy = iree_hal_rocm_semaphore_destroy,
+    .query = iree_hal_rocm_semaphore_query,
+    .signal = iree_hal_rocm_semaphore_signal,
+    .fail = iree_hal_rocm_semaphore_fail,
+    .wait = iree_hal_rocm_semaphore_wait,
+};
diff --git a/experimental/rocm/event_semaphore.h b/experimental/rocm/event_semaphore.h
new file mode 100644
index 0000000..952e3e5
--- /dev/null
+++ b/experimental/rocm/event_semaphore.h
@@ -0,0 +1,36 @@
+// Copyright 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef IREE_HAL_ROCM_SEMAPHORE_H_
+#define IREE_HAL_ROCM_SEMAPHORE_H_
+
+#include "experimental/rocm/context_wrapper.h"
+#include "experimental/rocm/status_util.h"
+#include "iree/hal/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Creates a ROCM semaphore with the given |initial_value| and returns it in
+// |out_semaphore|.
+iree_status_t iree_hal_rocm_semaphore_create(
+    iree_hal_rocm_context_wrapper_t *context, uint64_t initial_value,
+    iree_hal_semaphore_t **out_semaphore);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_ROCM_SEMAPHORE_H_
diff --git a/experimental/rocm/executable_layout.c b/experimental/rocm/executable_layout.c
new file mode 100644
index 0000000..e7c666f
--- /dev/null
+++ b/experimental/rocm/executable_layout.c
@@ -0,0 +1,99 @@
+// Copyright 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "experimental/rocm/executable_layout.h"
+
+#include "experimental/rocm/status_util.h"
+#include "iree/base/tracing.h"
+
+// Executable layout that keeps references to its descriptor set layouts;
+// |set_layouts| holds |set_layout_count| entries allocated inline.
+typedef struct {
+  iree_hal_resource_t resource;
+  iree_hal_rocm_context_wrapper_t *context;
+  iree_host_size_t set_layout_count;
+  iree_hal_descriptor_set_layout_t *set_layouts[];
+} iree_hal_rocm_executable_layout_t;
+
+extern const iree_hal_executable_layout_vtable_t
+    iree_hal_rocm_executable_layout_vtable;
+
+// Downcasts |base_value| after asserting it uses the ROCM layout vtable.
+static iree_hal_rocm_executable_layout_t *iree_hal_rocm_executable_layout_cast(
+    iree_hal_executable_layout_t *base_value) {
+  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_rocm_executable_layout_vtable);
+  return (iree_hal_rocm_executable_layout_t *)base_value;
+}
+
+// Creates an executable layout that retains the |set_layout_count| descriptor
+// set layouts in |set_layouts|. |push_constant_count| is currently unused.
+iree_status_t iree_hal_rocm_executable_layout_create(
+    iree_hal_rocm_context_wrapper_t *context, iree_host_size_t set_layout_count,
+    iree_hal_descriptor_set_layout_t **set_layouts,
+    iree_host_size_t push_constant_count,
+    iree_hal_executable_layout_t **out_executable_layout) {
+  IREE_ASSERT_ARGUMENT(context);
+  IREE_ASSERT_ARGUMENT(!set_layout_count || set_layouts);
+  IREE_ASSERT_ARGUMENT(out_executable_layout);
+  *out_executable_layout = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // The layout does not do anything yet beyond keeping the set layouts alive.
+  // TODO: Build the argument layout here, handling both push constants and
+  // buffers.
+  iree_hal_rocm_executable_layout_t *layout = NULL;
+  const iree_host_size_t set_layouts_size =
+      set_layout_count * sizeof(layout->set_layouts[0]);
+  iree_status_t status = iree_allocator_malloc(
+      context->host_allocator, sizeof(*layout) + set_layouts_size,
+      (void **)&layout);
+  if (!iree_status_is_ok(status)) {
+    IREE_TRACE_ZONE_END(z0);
+    return status;
+  }
+
+  iree_hal_resource_initialize(&iree_hal_rocm_executable_layout_vtable,
+                               &layout->resource);
+  layout->context = context;
+  layout->set_layout_count = set_layout_count;
+  for (iree_host_size_t i = 0; i < set_layout_count; i++) {
+    iree_hal_descriptor_set_layout_retain(set_layouts[i]);
+    layout->set_layouts[i] = set_layouts[i];
+  }
+  *out_executable_layout = (iree_hal_executable_layout_t *)layout;
+
+  IREE_TRACE_ZONE_END(z0);
+  return iree_ok_status();
+}
+
+// Releases every retained descriptor set layout and frees the layout itself.
+static void iree_hal_rocm_executable_layout_destroy(
+    iree_hal_executable_layout_t *base_executable_layout) {
+  iree_hal_rocm_executable_layout_t *layout =
+      iree_hal_rocm_executable_layout_cast(base_executable_layout);
+  iree_allocator_t host_allocator = layout->context->host_allocator;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  for (iree_host_size_t i = 0; i < layout->set_layout_count; i++) {
+    iree_hal_descriptor_set_layout_release(layout->set_layouts[i]);
+  }
+  iree_allocator_free(host_allocator, layout);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+const iree_hal_executable_layout_vtable_t
+    iree_hal_rocm_executable_layout_vtable = {
+        .destroy = iree_hal_rocm_executable_layout_destroy,
+};
diff --git a/experimental/rocm/executable_layout.h b/experimental/rocm/executable_layout.h
new file mode 100644
index 0000000..8c36713
--- /dev/null
+++ b/experimental/rocm/executable_layout.h
@@ -0,0 +1,37 @@
+// Copyright 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef IREE_HAL_ROCM_EXECUTABLE_LAYOUT_H_
+#define IREE_HAL_ROCM_EXECUTABLE_LAYOUT_H_
+
+#include "experimental/rocm/context_wrapper.h"
+#include "iree/hal/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Creates an executable layout referencing the |set_layout_count| descriptor
+// set layouts in |set_layouts| and returns it in |out_executable_layout|.
+iree_status_t iree_hal_rocm_executable_layout_create(
+    iree_hal_rocm_context_wrapper_t *context, iree_host_size_t set_layout_count,
+    iree_hal_descriptor_set_layout_t **set_layouts,
+    iree_host_size_t push_constant_count,
+    iree_hal_executable_layout_t **out_executable_layout);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_ROCM_EXECUTABLE_LAYOUT_H_
diff --git a/experimental/rocm/native_executable.c b/experimental/rocm/native_executable.c
new file mode 100644
index 0000000..3448228
--- /dev/null
+++ b/experimental/rocm/native_executable.c
@@ -0,0 +1,188 @@
+// Copyright 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "experimental/rocm/native_executable.h"
+
+#include "experimental/rocm/status_util.h"
+#include "iree/base/tracing.h"
+
+// flatcc schemas:
+#include "iree/base/internal/flatcc.h"
+#include "iree/schemas/rocm_executable_def_reader.h"
+#include "iree/schemas/rocm_executable_def_verifier.h"
+
+// One kernel entry point together with its launch block size.
+typedef struct {
+  hipFunction_t rocm_function;
+  uint32_t block_size_x;
+  uint32_t block_size_y;
+  uint32_t block_size_z;
+} iree_hal_rocm_native_executable_function_t;
+
+// Executable backed by a single HIP module; |entry_functions| holds
+// |entry_count| elements allocated inline after the struct.
+typedef struct {
+  iree_hal_resource_t resource;
+  iree_hal_rocm_context_wrapper_t *context;
+  iree_host_size_t entry_count;
+  hipModule_t module;
+  iree_hal_rocm_native_executable_function_t entry_functions[];
+} iree_hal_rocm_native_executable_t;
+
+extern const iree_hal_executable_vtable_t
+    iree_hal_rocm_native_executable_vtable;
+
+// Downcasts |base_value| after asserting it uses the ROCM executable vtable.
+static iree_hal_rocm_native_executable_t *iree_hal_rocm_native_executable_cast(
+    iree_hal_executable_t *base_value) {
+  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_rocm_native_executable_vtable);
+  return (iree_hal_rocm_native_executable_t *)base_value;
+}
+
+// Unloads |module|, reporting any HIP error as a status.
+static iree_status_t iree_hal_rocm_native_executable_unload_module(
+    iree_hal_rocm_context_wrapper_t *context, hipModule_t module) {
+  ROCM_RETURN_IF_ERROR(context->syms, hipModuleUnload(module),
+                       "hipModuleUnload");
+  return iree_ok_status();
+}
+
+// Unloads the module owned by |executable| (if one was loaded) and frees the
+// executable. Also used to clean up after a partially failed creation.
+static void iree_hal_rocm_native_executable_free(
+    iree_hal_rocm_native_executable_t *executable) {
+  iree_allocator_t host_allocator = executable->context->host_allocator;
+  if (executable->module) {
+    // Teardown cannot report failures, so an unload error is dropped.
+    iree_status_ignore(iree_hal_rocm_native_executable_unload_module(
+        executable->context, executable->module));
+  }
+  iree_allocator_free(host_allocator, executable);
+}
+
+// Loads the HSACO image of |executable_def| into |executable->module| and
+// fills |executable->entry_functions| with one function and block size per
+// entry point. On failure a module that was already loaded stays recorded in
+// |executable| so that the caller can unload it.
+static iree_status_t iree_hal_rocm_native_executable_load(
+    iree_hal_rocm_context_wrapper_t *context,
+    iree_ROCMExecutableDef_table_t executable_def,
+    iree_hal_rocm_native_executable_t *executable) {
+  flatbuffers_string_t hsaco_image =
+      iree_ROCMExecutableDef_hsaco_image_get(executable_def);
+  flatbuffers_string_vec_t entry_points_vec =
+      iree_ROCMExecutableDef_entry_points_get(executable_def);
+  iree_ROCMBlockSizeDef_vec_t block_sizes_vec =
+      iree_ROCMExecutableDef_block_sizes_get(executable_def);
+
+  ROCM_RETURN_IF_ERROR(
+      context->syms,
+      hipModuleLoadDataEx(&executable->module, hsaco_image, 0, NULL, NULL),
+      "hipModuleLoadDataEx");
+
+  // TODO: Verify that block_sizes_vec has one element per entry point.
+  for (iree_host_size_t i = 0; i < executable->entry_count; i++) {
+    hipFunction_t function = NULL;
+    const char *entry_name = flatbuffers_string_vec_at(entry_points_vec, i);
+    ROCM_RETURN_IF_ERROR(
+        context->syms,
+        hipModuleGetFunction(&function, executable->module, entry_name),
+        "hipModuleGetFunction");
+    executable->entry_functions[i].rocm_function = function;
+    executable->entry_functions[i].block_size_x = block_sizes_vec[i].x;
+    executable->entry_functions[i].block_size_y = block_sizes_vec[i].y;
+    executable->entry_functions[i].block_size_z = block_sizes_vec[i].z;
+  }
+  return iree_ok_status();
+}
+
+// Creates an executable from the ROCMExecutableDef flatbuffer carried by
+// |executable_spec|. On failure nothing is returned in |out_executable| and
+// all partially created state (allocation, loaded module) is released.
+iree_status_t iree_hal_rocm_native_executable_create(
+    iree_hal_rocm_context_wrapper_t *context,
+    const iree_hal_executable_spec_t *executable_spec,
+    iree_hal_executable_t **out_executable) {
+  IREE_ASSERT_ARGUMENT(context);
+  IREE_ASSERT_ARGUMENT(executable_spec);
+  IREE_ASSERT_ARGUMENT(out_executable);
+  *out_executable = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // TODO: Verify the flat buffer.
+  iree_ROCMExecutableDef_table_t executable_def =
+      iree_ROCMExecutableDef_as_root(executable_spec->executable_data.data);
+  iree_host_size_t entry_count = flatbuffers_string_vec_len(
+      iree_ROCMExecutableDef_entry_points_get(executable_def));
+
+  iree_hal_rocm_native_executable_t *executable = NULL;
+  iree_host_size_t total_size =
+      sizeof(*executable) +
+      entry_count * sizeof(iree_hal_rocm_native_executable_function_t);
+  iree_status_t status = iree_allocator_malloc(
+      context->host_allocator, total_size, (void **)&executable);
+  if (iree_status_is_ok(status)) {
+    iree_hal_resource_initialize(&iree_hal_rocm_native_executable_vtable,
+                                 &executable->resource);
+    executable->context = context;
+    executable->entry_count = entry_count;
+    executable->module = NULL;
+    status = iree_hal_rocm_native_executable_load(context, executable_def,
+                                                  executable);
+  }
+
+  if (iree_status_is_ok(status)) {
+    *out_executable = (iree_hal_executable_t *)executable;
+  } else if (executable) {
+    iree_hal_rocm_native_executable_free(executable);
+  }
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Returns the HIP function of |entry_point|; the index is not bounds-checked.
+hipFunction_t iree_hal_rocm_native_executable_for_entry_point(
+    iree_hal_executable_t *base_executable, int32_t entry_point) {
+  iree_hal_rocm_native_executable_t *executable =
+      iree_hal_rocm_native_executable_cast(base_executable);
+  return executable->entry_functions[entry_point].rocm_function;
+}
+
+// Writes the block size recorded for |entry_point| to |x|, |y| and |z|.
+iree_status_t iree_hal_rocm_native_executable_block_size(
+    iree_hal_executable_t *base_executable, int32_t entry_point, uint32_t *x,
+    uint32_t *y, uint32_t *z) {
+  iree_hal_rocm_native_executable_t *executable =
+      iree_hal_rocm_native_executable_cast(base_executable);
+  *x = executable->entry_functions[entry_point].block_size_x;
+  *y = executable->entry_functions[entry_point].block_size_y;
+  *z = executable->entry_functions[entry_point].block_size_z;
+  return iree_ok_status();
+}
+
+// Unloads the HIP module and frees the executable.
+static void iree_hal_rocm_native_executable_destroy(
+    iree_hal_executable_t *base_executable) {
+  iree_hal_rocm_native_executable_t *executable =
+      iree_hal_rocm_native_executable_cast(base_executable);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_hal_rocm_native_executable_free(executable);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+const iree_hal_executable_vtable_t iree_hal_rocm_native_executable_vtable = {
+    .destroy = iree_hal_rocm_native_executable_destroy,
+};
diff --git a/experimental/rocm/native_executable.h b/experimental/rocm/native_executable.h
new file mode 100644
index 0000000..d1ff352
--- /dev/null
+++ b/experimental/rocm/native_executable.h
@@ -0,0 +1,46 @@
+// Copyright 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef IREE_HAL_ROCM_NATIVE_EXECUTABLE_H_
+#define IREE_HAL_ROCM_NATIVE_EXECUTABLE_H_
+
+#include "experimental/rocm/context_wrapper.h"
+#include "experimental/rocm/rocm_headers.h"
+#include "iree/hal/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Creates an executable from a HSACO module. The module may hold several
+// kernels; each is extracted together with its associated block size.
+iree_status_t iree_hal_rocm_native_executable_create(
+    iree_hal_rocm_context_wrapper_t *context,
+    const iree_hal_executable_spec_t *executable_spec,
+    iree_hal_executable_t **out_executable);
+
+// Returns the HIP function of the given |entry_point| within the executable.
+hipFunction_t iree_hal_rocm_native_executable_for_entry_point(
+    iree_hal_executable_t *executable, int32_t entry_point);
+
+// Returns the block size of the given |entry_point| within the executable.
+iree_status_t iree_hal_rocm_native_executable_block_size(
+    iree_hal_executable_t *executable, int32_t entry_point, uint32_t *x,
+    uint32_t *y, uint32_t *z);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_ROCM_NATIVE_EXECUTABLE_H_
diff --git a/experimental/rocm/nop_executable_cache.c b/experimental/rocm/nop_executable_cache.c
new file mode 100644
index 0000000..e225bab
--- /dev/null
+++ b/experimental/rocm/nop_executable_cache.c
@@ -0,0 +1,105 @@
+// Copyright 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "experimental/rocm/nop_executable_cache.h"
+
+#include "experimental/rocm/native_executable.h"
+#include "iree/base/tracing.h"
+
+// Cache object without any cached state; it only remembers its context.
+typedef struct {
+  iree_hal_resource_t resource;
+  iree_hal_rocm_context_wrapper_t *context;
+} iree_hal_rocm_nop_executable_cache_t;
+
+extern const iree_hal_executable_cache_vtable_t
+    iree_hal_rocm_nop_executable_cache_vtable;
+
+// Downcasts |base_value| after asserting it uses the nop cache vtable.
+static iree_hal_rocm_nop_executable_cache_t *
+iree_hal_rocm_nop_executable_cache_cast(
+    iree_hal_executable_cache_t *base_value) {
+  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_rocm_nop_executable_cache_vtable);
+  return (iree_hal_rocm_nop_executable_cache_t *)base_value;
+}
+
+// Allocates the cache from |context->host_allocator| and returns it in
+// |out_executable_cache|. |identifier| is currently unused.
+iree_status_t iree_hal_rocm_nop_executable_cache_create(
+    iree_hal_rocm_context_wrapper_t *context, iree_string_view_t identifier,
+    iree_hal_executable_cache_t **out_executable_cache) {
+  IREE_ASSERT_ARGUMENT(context);
+  IREE_ASSERT_ARGUMENT(out_executable_cache);
+  *out_executable_cache = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_hal_rocm_nop_executable_cache_t *executable_cache = NULL;
+  iree_status_t status =
+      iree_allocator_malloc(context->host_allocator, sizeof(*executable_cache),
+                            (void **)&executable_cache);
+  if (iree_status_is_ok(status)) {
+    iree_hal_resource_initialize(&iree_hal_rocm_nop_executable_cache_vtable,
+                                 &executable_cache->resource);
+    executable_cache->context = context;
+
+    *out_executable_cache = (iree_hal_executable_cache_t *)executable_cache;
+  }
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Frees the cache with the host allocator of its context.
+static void iree_hal_rocm_nop_executable_cache_destroy(
+    iree_hal_executable_cache_t *base_executable_cache) {
+  iree_hal_rocm_nop_executable_cache_t *executable_cache =
+      iree_hal_rocm_nop_executable_cache_cast(base_executable_cache);
+  iree_allocator_t host_allocator = executable_cache->context->host_allocator;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_allocator_free(host_allocator, executable_cache);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Reports whether |executable_format| can be prepared by this cache.
+// NOTE(review): "PTXE" is the only accepted format identifier; confirm that
+// the ROCM compiler target emits this same string.
+static bool iree_hal_rocm_nop_executable_cache_can_prepare_format(
+    iree_hal_executable_cache_t *base_executable_cache,
+    iree_hal_executable_caching_mode_t caching_mode,
+    iree_string_view_t executable_format) {
+  return iree_string_view_equal(executable_format,
+                                iree_make_cstring_view("PTXE"));
+}
+
+// Builds a new native executable for every request; nothing is looked up or
+// stored in the cache.
+static iree_status_t iree_hal_rocm_nop_executable_cache_prepare_executable(
+    iree_hal_executable_cache_t *base_executable_cache,
+    const iree_hal_executable_spec_t *executable_spec,
+    iree_hal_executable_t **out_executable) {
+  iree_hal_rocm_nop_executable_cache_t *executable_cache =
+      iree_hal_rocm_nop_executable_cache_cast(base_executable_cache);
+  return iree_hal_rocm_native_executable_create(
+      executable_cache->context, executable_spec, out_executable);
+}
+
+const iree_hal_executable_cache_vtable_t
+    iree_hal_rocm_nop_executable_cache_vtable = {
+        .destroy = iree_hal_rocm_nop_executable_cache_destroy,
+        .can_prepare_format =
+            iree_hal_rocm_nop_executable_cache_can_prepare_format,
+        .prepare_executable =
+            iree_hal_rocm_nop_executable_cache_prepare_executable,
+};
diff --git a/experimental/rocm/nop_executable_cache.h b/experimental/rocm/nop_executable_cache.h
new file mode 100644
index 0000000..72af2ca
--- /dev/null
+++ b/experimental/rocm/nop_executable_cache.h
@@ -0,0 +1,36 @@
+// Copyright 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef IREE_HAL_ROCM_NOP_EXECUTABLE_CACHE_H_
+#define IREE_HAL_ROCM_NOP_EXECUTABLE_CACHE_H_
+
+#include "experimental/rocm/context_wrapper.h"
+#include "iree/hal/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Creates an executable cache that performs no caching whatsoever. Useful for
+// isolating pipeline caching behavior and for verifying compilation behavior
+// on its own.
+iree_status_t iree_hal_rocm_nop_executable_cache_create(
+    iree_hal_rocm_context_wrapper_t *context, iree_string_view_t identifier,
+    iree_hal_executable_cache_t **out_executable_cache);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_ROCM_NOP_EXECUTABLE_CACHE_H_
diff --git a/experimental/rocm/registration/CMakeLists.txt b/experimental/rocm/registration/CMakeLists.txt
new file mode 100644
index 0000000..fd6e66a
--- /dev/null
+++ b/experimental/rocm/registration/CMakeLists.txt
@@ -0,0 +1,33 @@
+# Copyright 2021 Google LLC
+
+iree_add_all_subdirs()
+
+# The ROCM driver registration library is only defined when the experimental
+# ROCM backend is enabled.
+if(${IREE_BUILD_EXPERIMENTAL_ROCM})
+
+# INCLUDES points three directories above this one so that
+# "experimental/rocm/..." include paths resolve.
+iree_cc_library(
+  NAME
+    registration
+  HDRS
+    "driver_module.h"
+  SRCS
+    "driver_module.c"
+  DEPS
+    iree::base::core_headers
+    iree::base::status
+    iree::base::tracing
+    iree::hal
+    experimental::rocm
+  INCLUDES
+    "${CMAKE_CURRENT_LIST_DIR}/../../.."
+  DEFINES
+    "IREE_BUILD_EXPERIMENTAL_ROCM=1"
+  PUBLIC
+)
+
+endif()
+
+### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
diff --git a/experimental/rocm/registration/driver_module.c b/experimental/rocm/registration/driver_module.c
new file mode 100644
index 0000000..0ff0bb9
--- /dev/null
+++ b/experimental/rocm/registration/driver_module.c
@@ -0,0 +1,71 @@
+// Copyright 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "experimental/rocm/registration/driver_module.h"
+
+#include <inttypes.h>
+
+#include "experimental/rocm/api.h"
+#include "iree/base/target_platform.h"
+#include "iree/base/tracing.h"
+
+#define IREE_HAL_ROCM_DRIVER_ID 0x524f434d0au // ASCII "ROCM" followed by 0x0a
+
+// Reports the drivers this factory can create.
+//
+// Exactly one entry ("rocm") is exposed. The table has static storage
+// duration, so the pointer written to |out_driver_infos| stays valid after
+// the call; |out_driver_info_count| receives the number of entries.
+// |self| is unused. Always returns OK.
+static iree_status_t iree_hal_rocm_driver_factory_enumerate(
+    void *self, const iree_hal_driver_info_t **out_driver_infos,
+    iree_host_size_t *out_driver_info_count) {
+  // NOTE: supported ROCM versions or feature sets could be queried here.
+  static const iree_hal_driver_info_t kRocmDriverInfos[] = {
+      {
+          .driver_id = IREE_HAL_ROCM_DRIVER_ID,
+          .driver_name = iree_string_view_literal("rocm"),
+          .full_name = iree_string_view_literal("ROCM (dynamic)"),
+      },
+  };
+  *out_driver_infos = kRocmDriverInfos;
+  *out_driver_info_count = IREE_ARRAYSIZE(kRocmDriverInfos);
+  return iree_ok_status();
+}
+
+// Creates the ROCM driver when |driver_id| matches IREE_HAL_ROCM_DRIVER_ID.
+//
+// |out_driver| is cleared up front. Any other ID yields
+// IREE_STATUS_UNAVAILABLE; otherwise the driver is created with default
+// options using |allocator| and the creation status is returned.
+// |self| is unused.
+static iree_status_t iree_hal_rocm_driver_factory_try_create(
+    void *self, iree_hal_driver_id_t driver_id, iree_allocator_t allocator,
+    iree_hal_driver_t **out_driver) {
+  IREE_ASSERT_ARGUMENT(out_driver);
+  *out_driver = NULL;
+
+  if (driver_id == IREE_HAL_ROCM_DRIVER_ID) {
+    IREE_TRACE_ZONE_BEGIN(z0);
+
+    // If more than one driver is ever exposed (different rocm versions, etc)
+    // this is where each would get its own name.
+    iree_string_view_t driver_name = iree_make_cstring_view("rocm");
+
+    iree_hal_rocm_driver_options_t options;
+    iree_hal_rocm_driver_options_initialize(&options);
+    iree_status_t create_status =
+        iree_hal_rocm_driver_create(driver_name, &options, allocator, out_driver);
+
+    IREE_TRACE_ZONE_END(z0);
+    return create_status;
+  }
+
+  return iree_make_status(IREE_STATUS_UNAVAILABLE,
+                          "no driver with ID %016" PRIu64
+                          " is provided by this factory",
+                          driver_id);
+}
+
+// Registers the ROCM driver factory with |registry|.
+//
+// The factory descriptor has static storage duration, so the pointer handed
+// to the registry remains valid after this function returns. Returns the
+// status reported by the registry.
+IREE_API_EXPORT iree_status_t
+iree_hal_rocm_driver_module_register(iree_hal_driver_registry_t *registry) {
+  static const iree_hal_driver_factory_t kRocmDriverFactory = {
+      .self = NULL,
+      .try_create = iree_hal_rocm_driver_factory_try_create,
+      .enumerate = iree_hal_rocm_driver_factory_enumerate,
+  };
+  iree_status_t status =
+      iree_hal_driver_registry_register_factory(registry, &kRocmDriverFactory);
+  return status;
+}
diff --git a/experimental/rocm/registration/driver_module.h b/experimental/rocm/registration/driver_module.h
new file mode 100644
index 0000000..376a21d
--- /dev/null
+++ b/experimental/rocm/registration/driver_module.h
@@ -0,0 +1,31 @@
+// Copyright 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef IREE_HAL_ROCM_REGISTRATION_DRIVER_MODULE_H_
+#define IREE_HAL_ROCM_REGISTRATION_DRIVER_MODULE_H_
+
+#include "iree/hal/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+IREE_API_EXPORT iree_status_t
+iree_hal_rocm_driver_module_register(iree_hal_driver_registry_t *registry);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // IREE_HAL_ROCM_REGISTRATION_DRIVER_MODULE_H_
diff --git a/experimental/rocm/rocm_allocator.c b/experimental/rocm/rocm_allocator.c
new file mode 100644
index 0000000..9b24628
--- /dev/null
+++ b/experimental/rocm/rocm_allocator.c
@@ -0,0 +1,175 @@
+// Copyright 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "experimental/rocm/rocm_allocator.h"
+
+#include "experimental/rocm/rocm_buffer.h"
+#include "experimental/rocm/status_util.h"
+#include "iree/base/tracing.h"
+
+typedef struct iree_hal_rocm_allocator_s {
+ iree_hal_resource_t resource;  // Initialized with the allocator vtable on create.
+ iree_hal_rocm_context_wrapper_t *context;  // Stored as passed to create; not freed by destroy.
+} iree_hal_rocm_allocator_t;
+
+extern const iree_hal_allocator_vtable_t iree_hal_rocm_allocator_vtable;
+
+// Downcasts a generic HAL allocator to the ROCM implementation after
+// asserting that it was created with the ROCM allocator vtable.
+static iree_hal_rocm_allocator_t *iree_hal_rocm_allocator_cast(
+    iree_hal_allocator_t *base_value) {
+  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_rocm_allocator_vtable);
+  iree_hal_rocm_allocator_t *rocm_allocator =
+      (iree_hal_rocm_allocator_t *)base_value;
+  return rocm_allocator;
+}
+
+// Creates a ROCM allocator that uses |context| for its host allocator and
+// ROCM symbols.
+//
+// The allocator struct itself comes from context->host_allocator. On success
+// the new allocator is stored in |out_allocator|; on allocation failure the
+// failing status is returned and |out_allocator| is not written.
+iree_status_t iree_hal_rocm_allocator_create(
+    iree_hal_rocm_context_wrapper_t *context,
+    iree_hal_allocator_t **out_allocator) {
+  IREE_ASSERT_ARGUMENT(context);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_hal_rocm_allocator_t *rocm_allocator = NULL;
+  iree_status_t status =
+      iree_allocator_malloc(context->host_allocator, sizeof(*rocm_allocator),
+                            (void **)&rocm_allocator);
+  if (!iree_status_is_ok(status)) {
+    IREE_TRACE_ZONE_END(z0);
+    return status;
+  }
+
+  iree_hal_resource_initialize(&iree_hal_rocm_allocator_vtable,
+                               &rocm_allocator->resource);
+  rocm_allocator->context = context;
+  *out_allocator = (iree_hal_allocator_t *)rocm_allocator;
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Frees the allocator struct using the host allocator of its context.
+// The context itself is left untouched.
+static void iree_hal_rocm_allocator_destroy(
+    iree_hal_allocator_t *base_allocator) {
+  iree_hal_rocm_allocator_t *rocm_allocator =
+      iree_hal_rocm_allocator_cast(base_allocator);
+  // Read the host allocator before the struct that references it is freed.
+  iree_allocator_t host_allocator = rocm_allocator->context->host_allocator;
+
+  IREE_TRACE_ZONE_BEGIN(z0);
+  iree_allocator_free(host_allocator, rocm_allocator);
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Returns the host allocator recorded in the allocator's context.
+// No vtable type assertion is performed here; the pointer is cast directly.
+static iree_allocator_t iree_hal_rocm_allocator_host_allocator(
+    const iree_hal_allocator_t *base_allocator) {
+  return ((iree_hal_rocm_allocator_t *)base_allocator)
+      ->context->host_allocator;
+}
+
+// Computes the compatibility bits for a buffer with the given properties.
+//
+// Every request is reported as allocatable. Queue transfer/dispatch
+// compatibility is added only for device-visible memory, and only for usage
+// bits that are both intended and allowed. |base_allocator| and
+// |allocation_size| are not consulted.
+static iree_hal_buffer_compatibility_t
+iree_hal_rocm_allocator_query_buffer_compatibility(
+    iree_hal_allocator_t *base_allocator, iree_hal_memory_type_t memory_type,
+    iree_hal_buffer_usage_t allowed_usage,
+    iree_hal_buffer_usage_t intended_usage,
+    iree_device_size_t allocation_size) {
+  // TODO(benvanik): check to ensure the allocator can serve the memory type.
+
+  // Usage the buffer itself does not permit can never be honored, so mask it
+  // out before deriving the compatibility flags.
+  const iree_hal_buffer_usage_t effective_usage =
+      intended_usage & allowed_usage;
+
+  // All buffers can be allocated on the heap.
+  iree_hal_buffer_compatibility_t result =
+      IREE_HAL_BUFFER_COMPATIBILITY_ALLOCATABLE;
+
+  // Only device-visible buffers can be used on the queue.
+  if (!iree_all_bits_set(memory_type, IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE)) {
+    return result;
+  }
+
+  if (iree_all_bits_set(effective_usage, IREE_HAL_BUFFER_USAGE_TRANSFER)) {
+    result |= IREE_HAL_BUFFER_COMPATIBILITY_QUEUE_TRANSFER;
+  }
+  if (iree_all_bits_set(effective_usage, IREE_HAL_BUFFER_USAGE_DISPATCH)) {
+    result |= IREE_HAL_BUFFER_COMPATIBILITY_QUEUE_DISPATCH;
+  }
+  return result;
+}
+
+// Allocates ROCM memory and wraps it in an iree_hal_buffer_t.
+//
+// Host-visible requests allocate mapped host memory (write-combined unless
+// host-cached is requested) and then look up the matching device pointer;
+// all other requests allocate device memory. A zero-byte request is bumped
+// to 4 bytes. If any step fails the allocation is released again and the
+// failing status is returned.
+static iree_status_t iree_hal_rocm_allocator_allocate_buffer(
+    iree_hal_allocator_t *base_allocator, iree_hal_memory_type_t memory_type,
+    iree_hal_buffer_usage_t allowed_usage, iree_host_size_t allocation_size,
+    iree_hal_buffer_t **out_buffer) {
+  iree_hal_rocm_allocator_t *rocm_allocator =
+      iree_hal_rocm_allocator_cast(base_allocator);
+
+  // Zero-byte requests do happen in real-world use; round them up so the
+  // underlying allocation calls are never asked for an empty allocation.
+  if (allocation_size == 0) allocation_size = 4;
+
+  void *host_ptr = NULL;
+  hipDeviceptr_t device_ptr = 0;
+  iree_status_t status;
+  if (!iree_all_bits_set(memory_type, IREE_HAL_MEMORY_TYPE_HOST_VISIBLE)) {
+    // Device-only memory.
+    status = ROCM_RESULT_TO_STATUS(rocm_allocator->context->syms,
+                                   hipMalloc(&device_ptr, allocation_size));
+  } else {
+    // Host-visible memory: mapped host allocation plus its device alias.
+    unsigned int flags =
+        iree_all_bits_set(memory_type, IREE_HAL_MEMORY_TYPE_HOST_CACHED)
+            ? hipHostMallocMapped
+            : (hipHostMallocMapped | hipHostMallocWriteCombined);
+    status = ROCM_RESULT_TO_STATUS(
+        rocm_allocator->context->syms,
+        hipMemAllocHost(&host_ptr, allocation_size, flags));
+    if (iree_status_is_ok(status)) {
+      status = ROCM_RESULT_TO_STATUS(
+          rocm_allocator->context->syms,
+          hipHostGetDevicePointer(&device_ptr, host_ptr, /*flags=*/0));
+    }
+  }
+
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_rocm_buffer_wrap(
+        (iree_hal_allocator_t *)rocm_allocator, memory_type,
+        IREE_HAL_MEMORY_ACCESS_ALL, allowed_usage, allocation_size,
+        /*byte_offset=*/0,
+        /*byte_length=*/allocation_size, device_ptr, host_ptr, out_buffer);
+  }
+
+  // Any failure above leaves no buffer owning the memory, so release it here.
+  if (!iree_status_is_ok(status)) {
+    iree_hal_rocm_allocator_free(base_allocator, device_ptr, host_ptr,
+                                 memory_type);
+  }
+  return status;
+}
+
+// Releases memory previously obtained by this allocator.
+//
+// |memory_type| selects which pointer is released: host-visible memory frees
+// |host_ptr| with hipHostFree, everything else frees |device_ptr| with
+// hipFree. Errors from the ROCM calls are ignored.
+void iree_hal_rocm_allocator_free(iree_hal_allocator_t *base_allocator,
+                                  hipDeviceptr_t device_ptr, void *host_ptr,
+                                  iree_hal_memory_type_t memory_type) {
+  iree_hal_rocm_allocator_t *rocm_allocator =
+      iree_hal_rocm_allocator_cast(base_allocator);
+  if (!iree_all_bits_set(memory_type, IREE_HAL_MEMORY_TYPE_HOST_VISIBLE)) {
+    ROCM_IGNORE_ERROR(rocm_allocator->context->syms, hipFree(device_ptr));
+    return;
+  }
+  ROCM_IGNORE_ERROR(rocm_allocator->context->syms, hipHostFree(host_ptr));
+}
+
+// Wrapping externally owned memory is not supported by this allocator: every
+// call fails with IREE_STATUS_UNAVAILABLE and no argument is inspected.
+static iree_status_t iree_hal_rocm_allocator_wrap_buffer(
+    iree_hal_allocator_t *base_allocator, iree_hal_memory_type_t memory_type,
+    iree_hal_memory_access_t allowed_access,
+    iree_hal_buffer_usage_t allowed_usage, iree_byte_span_t data,
+    iree_allocator_t data_allocator, iree_hal_buffer_t **out_buffer) {
+  iree_status_t unsupported = iree_make_status(
+      IREE_STATUS_UNAVAILABLE, "wrapping of external buffers not supported");
+  return unsupported;
+}
+
+const iree_hal_allocator_vtable_t iree_hal_rocm_allocator_vtable = {  // ROCM allocator method table.
+ .destroy = iree_hal_rocm_allocator_destroy,
+ .host_allocator = iree_hal_rocm_allocator_host_allocator,
+ .query_buffer_compatibility =
+ iree_hal_rocm_allocator_query_buffer_compatibility,
+ .allocate_buffer = iree_hal_rocm_allocator_allocate_buffer,
+ .wrap_buffer = iree_hal_rocm_allocator_wrap_buffer,  // Always fails: unsupported.
+};
diff --git a/experimental/rocm/rocm_allocator.h b/experimental/rocm/rocm_allocator.h
new file mode 100644
index 0000000..e802684
--- /dev/null
+++ b/experimental/rocm/rocm_allocator.h
@@ -0,0 +1,40 @@
+// Copyright 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef IREE_HAL_ROCM_ALLOCATOR_H_
+#define IREE_HAL_ROCM_ALLOCATOR_H_
+
+#include "experimental/rocm/context_wrapper.h"
+#include "experimental/rocm/status_util.h"
+#include "iree/hal/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+// Creates a ROCM allocator bound to |context|, returned in |out_allocator|.
+iree_status_t iree_hal_rocm_allocator_create(
+ iree_hal_rocm_context_wrapper_t *context,
+ iree_hal_allocator_t **out_allocator);
+
+// Frees an allocation represented by the given device or host pointer.
+void iree_hal_rocm_allocator_free(iree_hal_allocator_t *allocator,
+ hipDeviceptr_t device_ptr, void *host_ptr,
+ iree_hal_memory_type_t memory_type);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // IREE_HAL_ROCM_ALLOCATOR_H_
diff --git a/experimental/rocm/rocm_buffer.c b/experimental/rocm/rocm_buffer.c
new file mode 100644
index 0000000..afb7810
--- /dev/null
+++ b/experimental/rocm/rocm_buffer.c
@@ -0,0 +1,140 @@
+// Copyright 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "experimental/rocm/rocm_buffer.h"
+
+#include "experimental/rocm/rocm_allocator.h"
+#include "experimental/rocm/status_util.h"
+#include "iree/base/tracing.h"
+
+typedef struct iree_hal_rocm_buffer_s {
+ iree_hal_buffer_t base;
+ void *host_ptr;  // Base address used by map_range; passed to the allocator on destroy.
+ hipDeviceptr_t device_ptr;  // Value returned by iree_hal_rocm_buffer_device_pointer.
+} iree_hal_rocm_buffer_t;
+
+extern const iree_hal_buffer_vtable_t iree_hal_rocm_buffer_vtable;
+
+// Downcasts a generic HAL buffer to the ROCM implementation after asserting
+// that it was created with the ROCM buffer vtable.
+static iree_hal_rocm_buffer_t *iree_hal_rocm_buffer_cast(
+    iree_hal_buffer_t *base_value) {
+  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_rocm_buffer_vtable);
+  iree_hal_rocm_buffer_t *rocm_buffer = (iree_hal_rocm_buffer_t *)base_value;
+  return rocm_buffer;
+}
+
+// Wraps an existing ROCM allocation in a newly allocated iree_hal_buffer_t.
+//
+// The buffer struct is allocated from the host allocator of |allocator| and
+// records |device_ptr| and |host_ptr| as given, along with the memory type,
+// access, usage, size, and offset/length values. The buffer is its own
+// allocated_buffer.
+//
+// On success |out_buffer| receives the new buffer. If the struct cannot be
+// allocated the allocation error is returned and |out_buffer| is not written,
+// so callers must check the returned status before using |out_buffer|.
+iree_status_t iree_hal_rocm_buffer_wrap(
+    iree_hal_allocator_t *allocator, iree_hal_memory_type_t memory_type,
+    iree_hal_memory_access_t allowed_access,
+    iree_hal_buffer_usage_t allowed_usage, iree_device_size_t allocation_size,
+    iree_device_size_t byte_offset, iree_device_size_t byte_length,
+    hipDeviceptr_t device_ptr, void *host_ptr, iree_hal_buffer_t **out_buffer) {
+  IREE_ASSERT_ARGUMENT(allocator);
+  IREE_ASSERT_ARGUMENT(out_buffer);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_hal_rocm_buffer_t *buffer = NULL;
+  iree_status_t status =
+      iree_allocator_malloc(iree_hal_allocator_host_allocator(allocator),
+                            sizeof(*buffer), (void **)&buffer);
+  if (iree_status_is_ok(status)) {
+    iree_hal_resource_initialize(&iree_hal_rocm_buffer_vtable,
+                                 &buffer->base.resource);
+    buffer->base.allocator = allocator;
+    buffer->base.allocated_buffer = &buffer->base;
+    buffer->base.allocation_size = allocation_size;
+    buffer->base.byte_offset = byte_offset;
+    buffer->base.byte_length = byte_length;
+    buffer->base.memory_type = memory_type;
+    buffer->base.allowed_access = allowed_access;
+    buffer->base.allowed_usage = allowed_usage;
+    buffer->host_ptr = host_ptr;
+    buffer->device_ptr = device_ptr;
+    *out_buffer = &buffer->base;
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  // Propagate the allocation result. Returning OK unconditionally would hide
+  // a failed allocation and let the caller use an unset |out_buffer|.
+  return status;
+}
+
+// Destroys a ROCM buffer: hands the device/host pointers back to the
+// allocator that owns the buffer and then frees the buffer struct itself.
+static void iree_hal_rocm_buffer_destroy(iree_hal_buffer_t *base_buffer) {
+  iree_hal_rocm_buffer_t *rocm_buffer = iree_hal_rocm_buffer_cast(base_buffer);
+  // Fetch the host allocator up front, before the struct is freed.
+  iree_hal_allocator_t *owning_allocator =
+      iree_hal_buffer_allocator(base_buffer);
+  iree_allocator_t host_allocator =
+      iree_hal_allocator_host_allocator(owning_allocator);
+
+  IREE_TRACE_ZONE_BEGIN(z0);
+  iree_hal_rocm_allocator_free(rocm_buffer->base.allocator,
+                               rocm_buffer->device_ptr, rocm_buffer->host_ptr,
+                               rocm_buffer->base.memory_type);
+  iree_allocator_free(host_allocator, rocm_buffer);
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Maps a byte range of a host-visible buffer into host address space.
+//
+// The mapping is simply the buffer's host pointer advanced by
+// |local_byte_offset|; the result is stored in |out_data_ptr|. Buffers that
+// are not host visible cannot be mapped and yield IREE_STATUS_INTERNAL.
+// |mapping_mode| is not consulted.
+//
+// In builds without NDEBUG, a mapping that requests discard access has its
+// |local_byte_length| bytes filled with 0xCD to make stale reads obvious.
+static iree_status_t iree_hal_rocm_buffer_map_range(
+    iree_hal_buffer_t *base_buffer, iree_hal_mapping_mode_t mapping_mode,
+    iree_hal_memory_access_t memory_access,
+    iree_device_size_t local_byte_offset, iree_device_size_t local_byte_length,
+    void **out_data_ptr) {
+  iree_hal_rocm_buffer_t *buffer = iree_hal_rocm_buffer_cast(base_buffer);
+
+  if (!iree_all_bits_set(buffer->base.memory_type,
+                         IREE_HAL_MEMORY_TYPE_HOST_VISIBLE)) {
+    return iree_make_status(IREE_STATUS_INTERNAL,
+                            "trying to map memory not host visible");
+  }
+
+  uint8_t *data_ptr = (uint8_t *)(buffer->host_ptr) + local_byte_offset;
+  // If we mapped for discard scribble over the bytes. This is not a mandated
+  // behavior but it will make debugging issues easier. Alternatively for
+  // heap buffers we could reallocate them such that ASAN yells, but that
+  // would only work if the entire buffer was discarded.
+#ifndef NDEBUG
+  if (iree_any_bit_set(memory_access, IREE_HAL_MEMORY_ACCESS_DISCARD)) {
+    // |data_ptr| already points at the start of the mapped range, so the
+    // offset must not be applied a second time or the fill would land
+    // outside the requested range.
+    memset(data_ptr, 0xCD, local_byte_length);
+  }
+#endif  // !NDEBUG
+  *out_data_ptr = data_ptr;
+  return iree_ok_status();
+}
+
+static void iree_hal_rocm_buffer_unmap_range(
+ iree_hal_buffer_t *base_buffer, iree_device_size_t local_byte_offset,
+ iree_device_size_t local_byte_length, void *data_ptr) {
+ // No-op: map_range only computes a pointer, so there is nothing to undo.
+}
+
+static iree_status_t iree_hal_rocm_buffer_invalidate_range(
+ iree_hal_buffer_t *base_buffer, iree_device_size_t local_byte_offset,
+ iree_device_size_t local_byte_length) {
+ // No-op: always reports success without touching the buffer.
+ return iree_ok_status();
+}
+
+static iree_status_t iree_hal_rocm_buffer_flush_range(
+ iree_hal_buffer_t *base_buffer, iree_device_size_t local_byte_offset,
+ iree_device_size_t local_byte_length) {
+ // No-op: always reports success without touching the buffer.
+ return iree_ok_status();
+}
+
+// Returns the device pointer recorded for |base_buffer| when it was wrapped.
+// NOTE(review): the stored field is a hipDeviceptr_t while this function is
+// declared to return void ** -- confirm the intended type against callers.
+void **iree_hal_rocm_buffer_device_pointer(iree_hal_buffer_t *base_buffer) {
+  return iree_hal_rocm_buffer_cast(base_buffer)->device_ptr;
+}
+
+const iree_hal_buffer_vtable_t iree_hal_rocm_buffer_vtable = {  // ROCM buffer method table.
+ .destroy = iree_hal_rocm_buffer_destroy,
+ .map_range = iree_hal_rocm_buffer_map_range,
+ .unmap_range = iree_hal_rocm_buffer_unmap_range,  // No-op.
+ .invalidate_range = iree_hal_rocm_buffer_invalidate_range,  // No-op.
+ .flush_range = iree_hal_rocm_buffer_flush_range,  // No-op.
+};
diff --git a/experimental/rocm/rocm_buffer.h b/experimental/rocm/rocm_buffer.h
new file mode 100644
index 0000000..f8a2a1c
--- /dev/null
+++ b/experimental/rocm/rocm_buffer.h
@@ -0,0 +1,42 @@
+// Copyright 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef IREE_HAL_ROCM_BUFFER_H_
+#define IREE_HAL_ROCM_BUFFER_H_
+
+#include "experimental/rocm/rocm_headers.h"
+#include "iree/hal/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+// Wraps a rocm allocation in an iree_hal_buffer_t.
+iree_status_t iree_hal_rocm_buffer_wrap(
+ iree_hal_allocator_t *allocator, iree_hal_memory_type_t memory_type,
+ iree_hal_memory_access_t allowed_access,
+ iree_hal_buffer_usage_t allowed_usage, iree_device_size_t allocation_size,
+ iree_device_size_t byte_offset, iree_device_size_t byte_length,
+ hipDeviceptr_t device_ptr, void *host_ptr, iree_hal_buffer_t **out_buffer);
+
+// Returns the rocm base pointer for the given |buffer|.
+// This is the entire allocated_buffer and must be offset by the buffer
+// byte_offset and byte_length when used.
+void **iree_hal_rocm_buffer_device_pointer(iree_hal_buffer_t *buffer);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // IREE_HAL_ROCM_BUFFER_H_
diff --git a/experimental/rocm/rocm_device.c b/experimental/rocm/rocm_device.c
new file mode 100644
index 0000000..a110fd3
--- /dev/null
+++ b/experimental/rocm/rocm_device.c
@@ -0,0 +1,298 @@
+// Copyright 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "experimental/rocm/rocm_device.h"
+
+#include "experimental/rocm/api.h"
+#include "experimental/rocm/descriptor_set_layout.h"
+#include "experimental/rocm/direct_command_buffer.h"
+#include "experimental/rocm/dynamic_symbols.h"
+#include "experimental/rocm/event_semaphore.h"
+#include "experimental/rocm/executable_layout.h"
+#include "experimental/rocm/nop_executable_cache.h"
+#include "experimental/rocm/rocm_allocator.h"
+#include "experimental/rocm/rocm_event.h"
+#include "experimental/rocm/status_util.h"
+#include "iree/base/tracing.h"
+
+//===----------------------------------------------------------------------===//
+// iree_hal_rocm_device_t
+//===----------------------------------------------------------------------===//
+
+typedef struct {
+ iree_hal_resource_t resource;
+ iree_string_view_t identifier;  // Filled from storage placed right after this struct.
+
+ // Optional driver that owns the ROCM symbols. We retain it for our lifetime
+ // to ensure the symbols remain valid.
+ iree_hal_driver_t *driver;
+
+ hipDevice_t device;
+
+ // TODO: support multiple streams.
+ hipStream_t stream;
+ iree_hal_rocm_context_wrapper_t context_wrapper;  // rocm_context, host_allocator, syms.
+ iree_hal_allocator_t *device_allocator;  // Released when the device is destroyed.
+
+} iree_hal_rocm_device_t;
+
+extern const iree_hal_device_vtable_t iree_hal_rocm_device_vtable;
+
+// Downcasts a generic HAL device to the ROCM implementation after asserting
+// that it was created with the ROCM device vtable.
+static iree_hal_rocm_device_t *iree_hal_rocm_device_cast(
+    iree_hal_device_t *base_value) {
+  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_rocm_device_vtable);
+  iree_hal_rocm_device_t *rocm_device = (iree_hal_rocm_device_t *)base_value;
+  return rocm_device;
+}
+
+// Tears down a ROCM device.
+//
+// Order: release the device allocator, destroy the stream (errors ignored),
+// release the retained driver, then free the device struct.
+// NOTE(review): the hipCtx_t stored in context_wrapper.rocm_context is not
+// destroyed here -- confirm who is expected to destroy it.
+static void iree_hal_rocm_device_destroy(iree_hal_device_t *base_device) {
+  iree_hal_rocm_device_t *rocm_device = iree_hal_rocm_device_cast(base_device);
+  // Fetch the host allocator up front, before the struct is freed.
+  iree_allocator_t host_allocator = iree_hal_device_host_allocator(base_device);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // No buffers using the allocator should be live at this point.
+  iree_hal_allocator_release(rocm_device->device_allocator);
+  ROCM_IGNORE_ERROR(rocm_device->context_wrapper.syms,
+                    hipStreamDestroy(rocm_device->stream));
+
+  // The driver is released only after the last ROCM call above.
+  iree_hal_driver_release(rocm_device->driver);
+
+  iree_allocator_free(host_allocator, rocm_device);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Allocates and initializes the device struct around already-created ROCM
+// handles.
+//
+// The struct is allocated from |host_allocator| with trailing storage for a
+// copy of |identifier|. |driver| is retained for the lifetime of the device.
+// |rocm_device|, |stream|, |context| and |syms| are recorded as given, and a
+// device allocator is created on top of the embedded context wrapper.
+//
+// On success |out_device| receives the device, which from then on destroys
+// |stream| when it is destroyed. On failure only what this function acquired
+// (the driver reference and the struct memory) is undone: |stream| and
+// |context| remain owned by the caller, which performs its own cleanup of
+// them. Routing the failure through the regular device destroy path would
+// destroy |stream| here and a second time in the caller.
+static iree_status_t iree_hal_rocm_device_create_internal(
+    iree_hal_driver_t *driver, iree_string_view_t identifier,
+    hipDevice_t rocm_device, hipStream_t stream, hipCtx_t context,
+    iree_hal_rocm_dynamic_symbols_t *syms, iree_allocator_t host_allocator,
+    iree_hal_device_t **out_device) {
+  iree_hal_rocm_device_t *device = NULL;
+  iree_host_size_t total_size = sizeof(*device) + identifier.size;
+  IREE_RETURN_IF_ERROR(
+      iree_allocator_malloc(host_allocator, total_size, (void **)&device));
+  memset(device, 0, total_size);
+  iree_hal_resource_initialize(&iree_hal_rocm_device_vtable, &device->resource);
+  device->driver = driver;
+  iree_hal_driver_retain(device->driver);
+  // The identifier is copied into the storage directly after the struct.
+  uint8_t *buffer_ptr = (uint8_t *)device + sizeof(*device);
+  buffer_ptr += iree_string_view_append_to_buffer(
+      identifier, &device->identifier, (char *)buffer_ptr);
+  device->device = rocm_device;
+  device->stream = stream;
+  device->context_wrapper.rocm_context = context;
+  device->context_wrapper.host_allocator = host_allocator;
+  device->context_wrapper.syms = syms;
+  iree_status_t status = iree_hal_rocm_allocator_create(
+      &device->context_wrapper, &device->device_allocator);
+  if (iree_status_is_ok(status)) {
+    *out_device = (iree_hal_device_t *)device;
+  } else {
+    // Undo only what was acquired here; the caller still owns |stream| and
+    // |context| and cleans them up itself when this function fails.
+    iree_hal_driver_release(device->driver);
+    iree_allocator_free(host_allocator, device);
+  }
+  return status;
+}
+
+// Creates a ROCM HAL device for |device|.
+//
+// A context is created first (failure returns immediately), then a
+// non-blocking stream, then the device object wrapping both. If stream or
+// device creation fails, the stream (when one was created) and the context
+// are destroyed before the failing status is returned. On success
+// |out_device| receives the new device.
+iree_status_t iree_hal_rocm_device_create(iree_hal_driver_t *driver,
+                                          iree_string_view_t identifier,
+                                          iree_hal_rocm_dynamic_symbols_t *syms,
+                                          hipDevice_t device,
+                                          iree_allocator_t host_allocator,
+                                          iree_hal_device_t **out_device) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  hipCtx_t context = NULL;
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, ROCM_RESULT_TO_STATUS(syms, hipCtxCreate(&context, 0, device)));
+  // Start from NULL so the cleanup below never reads an indeterminate handle
+  // when stream creation itself fails.
+  hipStream_t stream = NULL;
+  iree_status_t status = ROCM_RESULT_TO_STATUS(
+      syms, hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
+
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_rocm_device_create_internal(driver, identifier, device,
+                                                  stream, context, syms,
+                                                  host_allocator, out_device);
+  }
+  if (!iree_status_is_ok(status)) {
+    if (stream) {
+      syms->hipStreamDestroy(stream);
+    }
+    syms->hipCtxDestroy(context);
+  }
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Returns the identifier string recorded on the device at creation.
+static iree_string_view_t iree_hal_rocm_device_id(
+    iree_hal_device_t *base_device) {
+  return iree_hal_rocm_device_cast(base_device)->identifier;
+}
+
+// Returns the host allocator stored in the device's context wrapper.
+static iree_allocator_t iree_hal_rocm_device_host_allocator(
+    iree_hal_device_t *base_device) {
+  return iree_hal_rocm_device_cast(base_device)->context_wrapper.host_allocator;
+}
+
+// Returns the device allocator created together with the device.
+// No additional reference is taken on the returned allocator.
+static iree_hal_allocator_t *iree_hal_rocm_device_allocator(
+    iree_hal_device_t *base_device) {
+  return iree_hal_rocm_device_cast(base_device)->device_allocator;
+}
+
+// Queries an int32 device configuration value by |key|.
+//
+// No keys are currently recognized: |out_value| is set to 0 and
+// IREE_STATUS_NOT_FOUND is returned with the requested key in the message.
+// |base_device| is not consulted.
+static iree_status_t iree_hal_rocm_device_query_i32(
+    iree_hal_device_t *base_device, iree_string_view_t key,
+    int32_t *out_value) {
+  *out_value = 0;
+  // |key| is a sized string view, so its length is passed as the precision
+  // ("%.*s"). A "%*.s" conversion would instead treat the length as a field
+  // width with zero precision and print only padding.
+  return iree_make_status(IREE_STATUS_NOT_FOUND,
+                          "unknown device configuration key value '%.*s'",
+                          (int)key.size, key.data);
+}
+
+// Creates a command buffer by forwarding to the ROCM direct command buffer
+// implementation with this device's context wrapper.
+static iree_status_t iree_hal_rocm_device_create_command_buffer(
+    iree_hal_device_t *base_device, iree_hal_command_buffer_mode_t mode,
+    iree_hal_command_category_t command_categories,
+    iree_hal_queue_affinity_t queue_affinity,
+    iree_hal_command_buffer_t **out_command_buffer) {
+  iree_hal_rocm_context_wrapper_t *context =
+      &iree_hal_rocm_device_cast(base_device)->context_wrapper;
+  return iree_hal_rocm_direct_command_buffer_allocate(
+      context, mode, command_categories, queue_affinity, out_command_buffer);
+}
+
+// Non-push descriptor sets are not implemented: every call fails with
+// IREE_STATUS_UNIMPLEMENTED and no argument is inspected.
+static iree_status_t iree_hal_rocm_device_create_descriptor_set(
+    iree_hal_device_t *base_device,
+    iree_hal_descriptor_set_layout_t *set_layout,
+    iree_host_size_t binding_count,
+    const iree_hal_descriptor_set_binding_t *bindings,
+    iree_hal_descriptor_set_t **out_descriptor_set) {
+  iree_status_t unimplemented = iree_make_status(
+      IREE_STATUS_UNIMPLEMENTED, "non-push descriptor sets still need work");
+  return unimplemented;
+}
+
+// Creates a descriptor set layout by forwarding to the ROCM descriptor set
+// layout implementation with this device's context wrapper.
+static iree_status_t iree_hal_rocm_device_create_descriptor_set_layout(
+    iree_hal_device_t *base_device,
+    iree_hal_descriptor_set_layout_usage_type_t usage_type,
+    iree_host_size_t binding_count,
+    const iree_hal_descriptor_set_layout_binding_t *bindings,
+    iree_hal_descriptor_set_layout_t **out_descriptor_set_layout) {
+  iree_hal_rocm_context_wrapper_t *context =
+      &iree_hal_rocm_device_cast(base_device)->context_wrapper;
+  return iree_hal_rocm_descriptor_set_layout_create(
+      context, usage_type, binding_count, bindings, out_descriptor_set_layout);
+}
+
+// Creates an event by forwarding to the ROCM event implementation with this
+// device's context wrapper.
+static iree_status_t iree_hal_rocm_device_create_event(
+    iree_hal_device_t *base_device, iree_hal_event_t **out_event) {
+  iree_hal_rocm_context_wrapper_t *context =
+      &iree_hal_rocm_device_cast(base_device)->context_wrapper;
+  return iree_hal_rocm_event_create(context, out_event);
+}
+
+// Creates an executable cache; this backend always uses the no-op cache
+// implementation, created with this device's context wrapper.
+static iree_status_t iree_hal_rocm_device_create_executable_cache(
+    iree_hal_device_t *base_device, iree_string_view_t identifier,
+    iree_hal_executable_cache_t **out_executable_cache) {
+  iree_hal_rocm_context_wrapper_t *context =
+      &iree_hal_rocm_device_cast(base_device)->context_wrapper;
+  return iree_hal_rocm_nop_executable_cache_create(context, identifier,
+                                                   out_executable_cache);
+}
+
+// Creates an executable layout by forwarding to the ROCM executable layout
+// implementation. Note the callee takes the set layouts before the push
+// constant count, the reverse of this function's parameter order.
+static iree_status_t iree_hal_rocm_device_create_executable_layout(
+    iree_hal_device_t *base_device, iree_host_size_t push_constants,
+    iree_host_size_t set_layout_count,
+    iree_hal_descriptor_set_layout_t **set_layouts,
+    iree_hal_executable_layout_t **out_executable_layout) {
+  iree_hal_rocm_context_wrapper_t *context =
+      &iree_hal_rocm_device_cast(base_device)->context_wrapper;
+  return iree_hal_rocm_executable_layout_create(context, set_layout_count,
+                                                set_layouts, push_constants,
+                                                out_executable_layout);
+}
+
+// Creates a semaphore starting at |initial_value| by forwarding to the ROCM
+// semaphore implementation with this device's context wrapper.
+static iree_status_t iree_hal_rocm_device_create_semaphore(
+    iree_hal_device_t *base_device, uint64_t initial_value,
+    iree_hal_semaphore_t **out_semaphore) {
+  iree_hal_rocm_context_wrapper_t *context =
+      &iree_hal_rocm_device_cast(base_device)->context_wrapper;
+  return iree_hal_rocm_semaphore_create(context, initial_value, out_semaphore);
+}
+
+static iree_status_t iree_hal_rocm_device_queue_submit(
+ iree_hal_device_t *base_device,
+ iree_hal_command_category_t command_categories,
+ iree_hal_queue_affinity_t queue_affinity, iree_host_size_t batch_count,
+ const iree_hal_submission_batch_t *batches) {
+ iree_hal_rocm_device_t *device = iree_hal_rocm_device_cast(base_device);
+ // TODO(raikonenfnu): Once semaphores are implemented, wait on semaphores.
+ // TODO(thomasraoux): Conservatively synchronize after every submit until we
+ // support semaphores.
+ // TODO(raikonenfnu): This currently synchronizes the default/null stream;
+ // switch to device->stream once command buffers record onto it.
+ ROCM_RETURN_IF_ERROR(device->context_wrapper.syms, hipStreamSynchronize(0),
+ "hipStreamSynchronize");
+ return iree_ok_status();
+}
+
+static iree_status_t iree_hal_rocm_device_submit_and_wait(
+ iree_hal_device_t *base_device,
+ iree_hal_command_category_t command_categories,
+ iree_hal_queue_affinity_t queue_affinity, iree_host_size_t batch_count,
+ const iree_hal_submission_batch_t *batches,
+ iree_hal_semaphore_t *wait_semaphore, uint64_t wait_value,
+ iree_timeout_t timeout) {
+ // Submit the batches; a submission error is returned immediately.
+ IREE_RETURN_IF_ERROR(iree_hal_rocm_device_queue_submit(
+ base_device, command_categories, queue_affinity, batch_count, batches));
+
+ // ...then wait on |wait_semaphore| with |wait_value| and |timeout|.
+ return iree_hal_semaphore_wait(wait_semaphore, wait_value, timeout);
+}
+
+// Multi-semaphore waits are not implemented: every call fails with
+// IREE_STATUS_UNIMPLEMENTED and no argument is inspected.
+static iree_status_t iree_hal_rocm_device_wait_semaphores(
+    iree_hal_device_t *base_device, iree_hal_wait_mode_t wait_mode,
+    const iree_hal_semaphore_list_t *semaphore_list, iree_timeout_t timeout) {
+  iree_status_t unimplemented =
+      iree_make_status(IREE_STATUS_UNIMPLEMENTED, "semaphore not implemented");
+  return unimplemented;
+}
+
+static iree_status_t iree_hal_rocm_device_wait_idle(
+ iree_hal_device_t *base_device, iree_timeout_t timeout) {
+ iree_hal_rocm_device_t *device = iree_hal_rocm_device_cast(base_device);
+ // Blocks until device->stream is done; |timeout| is currently ignored.
+ // TODO(thomasraoux): HIP doesn't support a deadline for wait, figure out how
+ // to handle it better.
+ ROCM_RETURN_IF_ERROR(device->context_wrapper.syms,
+ hipStreamSynchronize(device->stream),
+ "hipStreamSynchronize");
+ return iree_ok_status();
+}
+
+const iree_hal_device_vtable_t iree_hal_rocm_device_vtable = {  // ROCM device method table.
+ .destroy = iree_hal_rocm_device_destroy,
+ .id = iree_hal_rocm_device_id,
+ .host_allocator = iree_hal_rocm_device_host_allocator,
+ .device_allocator = iree_hal_rocm_device_allocator,
+ .query_i32 = iree_hal_rocm_device_query_i32,
+ .create_command_buffer = iree_hal_rocm_device_create_command_buffer,
+ .create_descriptor_set = iree_hal_rocm_device_create_descriptor_set,  // Unimplemented.
+ .create_descriptor_set_layout =
+ iree_hal_rocm_device_create_descriptor_set_layout,
+ .create_event = iree_hal_rocm_device_create_event,
+ .create_executable_cache = iree_hal_rocm_device_create_executable_cache,
+ .create_executable_layout = iree_hal_rocm_device_create_executable_layout,
+ .create_semaphore = iree_hal_rocm_device_create_semaphore,
+ .queue_submit = iree_hal_rocm_device_queue_submit,
+ .submit_and_wait = iree_hal_rocm_device_submit_and_wait,
+ .wait_semaphores = iree_hal_rocm_device_wait_semaphores,  // Unimplemented.
+ .wait_idle = iree_hal_rocm_device_wait_idle,
+};
diff --git a/experimental/rocm/rocm_device.h b/experimental/rocm/rocm_device.h
new file mode 100644
index 0000000..e3504c6
--- /dev/null
+++ b/experimental/rocm/rocm_device.h
@@ -0,0 +1,38 @@
+// Copyright 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef IREE_HAL_ROCM_ROCM_DEVICE_H_
+#define IREE_HAL_ROCM_ROCM_DEVICE_H_
+
+#include "experimental/rocm/api.h"
+#include "experimental/rocm/dynamic_symbols.h"
+#include "iree/hal/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+// Creates a device that owns and manages its own hipContext.
+//
+// |identifier| names the device, |syms| is the table of dynamically loaded HIP
+// entry points and |device| is the HIP device handle to wrap. The new device
+// is returned through |out_device|.
+// NOTE(review): the lifetime requirements on |driver| and |syms| are not
+// visible from this declaration - confirm against rocm_device.c.
+iree_status_t iree_hal_rocm_device_create(iree_hal_driver_t *driver,
+ iree_string_view_t identifier,
+ iree_hal_rocm_dynamic_symbols_t *syms,
+ hipDevice_t device,
+ iree_allocator_t host_allocator,
+ iree_hal_device_t **out_device);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // IREE_HAL_ROCM_ROCM_DEVICE_H_
diff --git a/experimental/rocm/rocm_driver.c b/experimental/rocm/rocm_driver.c
new file mode 100644
index 0000000..219c6a3
--- /dev/null
+++ b/experimental/rocm/rocm_driver.c
@@ -0,0 +1,211 @@
+// Copyright 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "experimental/rocm/api.h"
+#include "experimental/rocm/dynamic_symbols.h"
+#include "experimental/rocm/rocm_device.h"
+#include "experimental/rocm/status_util.h"
+#include "iree/base/tracing.h"
+
+// ROCM implementation of iree_hal_driver_t. The struct is allocated together
+// with trailing storage that holds the characters of |identifier|.
+typedef struct {
+ // Base HAL resource; initialized with iree_hal_rocm_driver_vtable.
+ iree_hal_resource_t resource;
+ // Allocator the driver was created with; also used to free it on destroy.
+ iree_allocator_t host_allocator;
+ // Identifier used for the driver in the IREE driver registry.
+ // We allow overriding so that multiple ROCM versions can be exposed in the
+ // same process.
+ iree_string_view_t identifier;
+ // Index of the HIP device selected when device creation is requested with a
+ // device id of 0.
+ int default_device_index;
+ // ROCM symbols.
+ iree_hal_rocm_dynamic_symbols_t syms;
+} iree_hal_rocm_driver_t;
+
+// Fixed buffer size, in bytes, reserved for each device name: it sizes both
+// the temporary buffer handed to hipDeviceGetName and the per-device name slot
+// allocated when enumerating devices.
+#define IREE_MAX_ROCM_DEVICE_NAME_LENGTH 100
+
+extern const iree_hal_driver_vtable_t iree_hal_rocm_driver_vtable;
+
+// Downcasts a generic HAL driver to the ROCM driver type after asserting that
+// |base_value| carries iree_hal_rocm_driver_vtable.
+static iree_hal_rocm_driver_t *iree_hal_rocm_driver_cast(
+ iree_hal_driver_t *base_value) {
+ IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_rocm_driver_vtable);
+ return (iree_hal_rocm_driver_t *)base_value;
+}
+
+// Resets |out_options| to its defaults: every byte is zeroed and the default
+// device index is set to 0 (the first enumerated device).
+IREE_API_EXPORT void iree_hal_rocm_driver_options_initialize(
+ iree_hal_rocm_driver_options_t *out_options) {
+ memset(out_options, 0, sizeof(*out_options));
+ // Already zero after the memset; kept explicit to document the default.
+ out_options->default_device_index = 0;
+}
+
+// Allocates the driver together with trailing storage for a private copy of
+// |identifier|, records the options and loads the HIP symbols. On success
+// |out_driver| receives the new driver; on failure the partially constructed
+// driver is released and |out_driver| is left untouched.
+static iree_status_t iree_hal_rocm_driver_create_internal(
+    iree_string_view_t identifier,
+    const iree_hal_rocm_driver_options_t *options,
+    iree_allocator_t host_allocator, iree_hal_driver_t **out_driver) {
+  // Single allocation laid out as [driver struct][identifier characters].
+  iree_hal_rocm_driver_t *driver = NULL;
+  const iree_host_size_t struct_size = sizeof(*driver);
+  IREE_RETURN_IF_ERROR(iree_allocator_malloc(
+      host_allocator, struct_size + identifier.size, (void **)&driver));
+
+  iree_hal_resource_initialize(&iree_hal_rocm_driver_vtable, &driver->resource);
+  driver->host_allocator = host_allocator;
+  iree_string_view_append_to_buffer(identifier, &driver->identifier,
+                                    (char *)driver + struct_size);
+  driver->default_device_index = options->default_device_index;
+
+  iree_status_t status =
+      iree_hal_rocm_dynamic_symbols_initialize(host_allocator, &driver->syms);
+  if (!iree_status_is_ok(status)) {
+    // Presumably drops the only reference and so destroys the driver; confirm
+    // against iree_hal_driver_release.
+    iree_hal_driver_release((iree_hal_driver_t *)driver);
+    return status;
+  }
+  *out_driver = (iree_hal_driver_t *)driver;
+  return status;
+}
+
+// Vtable destroy hook: unloads the HIP symbols and frees the driver
+// allocation (which also holds the identifier storage).
+static void iree_hal_rocm_driver_destroy(iree_hal_driver_t *base_driver) {
+ iree_hal_rocm_driver_t *driver = iree_hal_rocm_driver_cast(base_driver);
+ // Copy the allocator out first: |driver| itself is freed below.
+ iree_allocator_t host_allocator = driver->host_allocator;
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ iree_hal_rocm_dynamic_symbols_deinitialize(&driver->syms);
+ iree_allocator_free(host_allocator, driver);
+
+ IREE_TRACE_ZONE_END(z0);
+}
+
+// Creates a ROCM HAL driver named |identifier| and configured by |options|.
+//
+// |out_driver| is reset to NULL up front (matching
+// iree_hal_rocm_event_create) so callers never observe a stale pointer when
+// creation fails; on success it receives the new driver. Returns the status
+// of the underlying creation, e.g. a failure to load the HIP symbols.
+IREE_API_EXPORT iree_status_t iree_hal_rocm_driver_create(
+    iree_string_view_t identifier,
+    const iree_hal_rocm_driver_options_t *options,
+    iree_allocator_t host_allocator, iree_hal_driver_t **out_driver) {
+  IREE_ASSERT_ARGUMENT(options);
+  IREE_ASSERT_ARGUMENT(out_driver);
+  *out_driver = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_status_t status = iree_hal_rocm_driver_create_internal(
+      identifier, options, host_allocator, out_driver);
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Populates device information from the given ROCM physical device handle.
+// |out_device_info| must point to valid memory. The device name is copied
+// into |buffer_ptr|, which must have room for up to
+// IREE_MAX_ROCM_DEVICE_NAME_LENGTH bytes; the returned pointer is
+// |buffer_ptr| advanced by the amount iree_string_view_append_to_buffer
+// reports.
+static uint8_t *iree_hal_rocm_populate_device_info(
+    hipDevice_t device, iree_hal_rocm_dynamic_symbols_t *syms,
+    uint8_t *buffer_ptr, iree_hal_device_info_t *out_device_info) {
+  // Zero-fill so that a failed (and deliberately ignored) name query yields
+  // an empty name instead of strlen() running over uninitialized stack memory.
+  char device_name[IREE_MAX_ROCM_DEVICE_NAME_LENGTH] = {0};
+  ROCM_IGNORE_ERROR(syms,
+                    hipDeviceGetName(device_name, sizeof(device_name), device));
+  // Guarantee termination even if the runtime filled the whole buffer.
+  device_name[sizeof(device_name) - 1] = '\0';
+
+  memset(out_device_info, 0, sizeof(*out_device_info));
+  out_device_info->device_id = (iree_hal_device_id_t)device;
+
+  iree_string_view_t device_name_string =
+      iree_make_string_view(device_name, strlen(device_name));
+  buffer_ptr += iree_string_view_append_to_buffer(
+      device_name_string, &out_device_info->name, (char *)buffer_ptr);
+  return buffer_ptr;
+}
+
+// Enumerates the HIP devices visible to the driver.
+//
+// On success |out_device_infos| receives a single allocation made from
+// |host_allocator| (the info array followed by the storage backing each
+// device name) and |out_device_info_count| the number of entries. On failure
+// the allocation is freed, both outputs are left untouched and the failing
+// status is returned.
+static iree_status_t iree_hal_rocm_driver_query_available_devices(
+    iree_hal_driver_t *base_driver, iree_allocator_t host_allocator,
+    iree_hal_device_info_t **out_device_infos,
+    iree_host_size_t *out_device_info_count) {
+  iree_hal_rocm_driver_t *driver = iree_hal_rocm_driver_cast(base_driver);
+  // Query the number of available ROCM devices.
+  int device_count = 0;
+  ROCM_RETURN_IF_ERROR(&driver->syms, hipGetDeviceCount(&device_count),
+                       "hipGetDeviceCount");
+
+  // Allocate the return infos plus one fixed-size name slot per device.
+  iree_hal_device_info_t *device_infos = NULL;
+  const iree_host_size_t infos_size =
+      device_count * sizeof(iree_hal_device_info_t);
+  const iree_host_size_t total_size =
+      infos_size +
+      device_count * (IREE_MAX_ROCM_DEVICE_NAME_LENGTH * sizeof(char));
+  iree_status_t status =
+      iree_allocator_malloc(host_allocator, total_size, (void **)&device_infos);
+  if (iree_status_is_ok(status)) {
+    uint8_t *buffer_ptr = (uint8_t *)device_infos + infos_size;
+    for (int i = 0; i < device_count; ++i) {
+      hipDevice_t device;
+      // Assign to the outer |status| (do not redeclare it) so that a failure
+      // here reaches the cleanup path below instead of being silently dropped.
+      status = ROCM_RESULT_TO_STATUS(&driver->syms, hipDeviceGet(&device, i),
+                                     "hipDeviceGet");
+      if (!iree_status_is_ok(status)) break;
+      buffer_ptr = iree_hal_rocm_populate_device_info(
+          device, &driver->syms, buffer_ptr, &device_infos[i]);
+    }
+  }
+  if (iree_status_is_ok(status)) {
+    *out_device_info_count = device_count;
+    *out_device_infos = device_infos;
+  } else {
+    iree_allocator_free(host_allocator, device_infos);
+  }
+  return status;
+}
+
+// Resolves |default_device_index| to a HIP device handle stored in
+// |out_device|. Returns IREE_STATUS_NOT_FOUND when no device is enumerated or
+// the index lies outside [0, device_count); HIP failures are returned as
+// converted statuses. |host_allocator| is unused.
+static iree_status_t iree_hal_rocm_driver_select_default_device(
+    iree_hal_rocm_dynamic_symbols_t *syms, int default_device_index,
+    iree_allocator_t host_allocator, hipDevice_t *out_device) {
+  int device_count = 0;
+  ROCM_RETURN_IF_ERROR(syms, hipGetDeviceCount(&device_count),
+                       "hipGetDeviceCount");
+  // A negative index can never name a device: reject it here instead of
+  // forwarding it to hipDeviceGet. The upper bound also covers the
+  // zero-device case.
+  if (default_device_index < 0 || default_device_index >= device_count) {
+    return iree_make_status(IREE_STATUS_NOT_FOUND,
+                            "default device %d not found (of %d enumerated)",
+                            default_device_index, device_count);
+  }
+  hipDevice_t device;
+  ROCM_RETURN_IF_ERROR(syms, hipDeviceGet(&device, default_device_index),
+                       "hipDeviceGet");
+  *out_device = device;
+  return iree_ok_status();
+}
+
+// Vtable hook that creates a HAL device for |device_id|.
+//
+// Initializes HIP, then wraps the requested device; a |device_id| of 0 selects
+// the driver's default device index instead. The device is always created
+// with the identifier "rocm".
+// NOTE(review): a device id of 0 cannot be told apart from "use the default",
+// so HIP device 0 is only reachable when it is also the default - confirm
+// that this is intended.
+static iree_status_t iree_hal_rocm_driver_create_device(
+    iree_hal_driver_t *base_driver, iree_hal_device_id_t device_id,
+    iree_allocator_t host_allocator, iree_hal_device_t **out_device) {
+  iree_hal_rocm_driver_t *driver = iree_hal_rocm_driver_cast(base_driver);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, ROCM_RESULT_TO_STATUS(&driver->syms, hipInit(0), "hipInit"));
+
+  // Either the caller named a device (enumerated earlier) or we fall back to
+  // the default chosen when the driver was created.
+  hipDevice_t device = (hipDevice_t)device_id;
+  if (device == 0) {
+    IREE_RETURN_AND_END_ZONE_IF_ERROR(
+        z0, iree_hal_rocm_driver_select_default_device(
+                &driver->syms, driver->default_device_index, host_allocator,
+                &device));
+  }
+
+  // Attempt to create the device.
+  iree_status_t status = iree_hal_rocm_device_create(
+      base_driver, iree_make_cstring_view("rocm"), &driver->syms, device,
+      host_allocator, out_device);
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Dispatch table that binds the iree_hal_driver_t entry points to the ROCM
+// implementations defined in this file.
+const iree_hal_driver_vtable_t iree_hal_rocm_driver_vtable = {
+ .destroy = iree_hal_rocm_driver_destroy,
+ .query_available_devices = iree_hal_rocm_driver_query_available_devices,
+ .create_device = iree_hal_rocm_driver_create_device,
+};
diff --git a/experimental/rocm/rocm_event.c b/experimental/rocm/rocm_event.c
new file mode 100644
index 0000000..a496ba4
--- /dev/null
+++ b/experimental/rocm/rocm_event.c
@@ -0,0 +1,67 @@
+// Copyright 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "experimental/rocm/rocm_event.h"
+
+#include "experimental/rocm/status_util.h"
+#include "iree/base/tracing.h"
+
+// Placeholder event: it carries no device-side state and performs no
+// synchronization yet. It only exists so that HAL event handles can be
+// created and destroyed.
+typedef struct {
+ // Base HAL resource; initialized with iree_hal_rocm_event_vtable.
+ iree_hal_resource_t resource;
+ // Context the event was created from; stored as a borrowed pointer and used
+ // to find the host allocator on destroy.
+ iree_hal_rocm_context_wrapper_t *context_wrapper;
+} iree_hal_rocm_event_t;
+
+extern const iree_hal_event_vtable_t iree_hal_rocm_event_vtable;
+
+// Downcasts a generic HAL event to the ROCM event type after asserting that
+// |base_value| carries iree_hal_rocm_event_vtable.
+static iree_hal_rocm_event_t *iree_hal_rocm_event_cast(
+ iree_hal_event_t *base_value) {
+ IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_rocm_event_vtable);
+ return (iree_hal_rocm_event_t *)base_value;
+}
+
+// Allocates a placeholder event from |context_wrapper|'s host allocator.
+// |out_event| is cleared first and only set when the allocation succeeds; the
+// allocation status is returned either way.
+iree_status_t iree_hal_rocm_event_create(
+    iree_hal_rocm_context_wrapper_t *context_wrapper,
+    iree_hal_event_t **out_event) {
+  IREE_ASSERT_ARGUMENT(context_wrapper);
+  IREE_ASSERT_ARGUMENT(out_event);
+  *out_event = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_hal_rocm_event_t *new_event = NULL;
+  iree_status_t status = iree_allocator_malloc(
+      context_wrapper->host_allocator, sizeof(*new_event), (void **)&new_event);
+  if (!iree_status_is_ok(status)) {
+    IREE_TRACE_ZONE_END(z0);
+    return status;
+  }
+
+  iree_hal_resource_initialize(&iree_hal_rocm_event_vtable,
+                               &new_event->resource);
+  new_event->context_wrapper = context_wrapper;
+  *out_event = (iree_hal_event_t *)new_event;
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Vtable destroy hook: frees the event with the host allocator of the context
+// it was created from. There is no device-side state to release.
+static void iree_hal_rocm_event_destroy(iree_hal_event_t *base_event) {
+ iree_hal_rocm_event_t *event = iree_hal_rocm_event_cast(base_event);
+ // Copy the allocator out first: |event| itself is freed below.
+ iree_allocator_t host_allocator = event->context_wrapper->host_allocator;
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ iree_allocator_free(host_allocator, event);
+
+ IREE_TRACE_ZONE_END(z0);
+}
+
+// Dispatch table for ROCM events; destruction is the only operation provided.
+const iree_hal_event_vtable_t iree_hal_rocm_event_vtable = {
+ .destroy = iree_hal_rocm_event_destroy,
+};
diff --git a/experimental/rocm/rocm_event.h b/experimental/rocm/rocm_event.h
new file mode 100644
index 0000000..d97d023
--- /dev/null
+++ b/experimental/rocm/rocm_event.h
@@ -0,0 +1,38 @@
+// Copyright 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef IREE_HAL_ROCM_EVENT_H_
+#define IREE_HAL_ROCM_EVENT_H_
+
+#include "experimental/rocm/context_wrapper.h"
+#include "experimental/rocm/rocm_headers.h"
+#include "iree/hal/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+// Creates a dummy event object. No device-side resource is created: the event
+// only records |context_wrapper| and is allocated from its host allocator.
+// |out_event| receives the event on success and is set to NULL otherwise.
+// NOTE(review): signaling/waiting on these events is not implemented in
+// rocm_event.c; how they will map onto ROCm synchronization is still open.
+iree_status_t iree_hal_rocm_event_create(
+ iree_hal_rocm_context_wrapper_t *context_wrapper,
+ iree_hal_event_t **out_event);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // IREE_HAL_ROCM_EVENT_H_
diff --git a/experimental/rocm/rocm_headers.h b/experimental/rocm/rocm_headers.h
new file mode 100644
index 0000000..866d0ac
--- /dev/null
+++ b/experimental/rocm/rocm_headers.h
@@ -0,0 +1,20 @@
+// Copyright 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef IREE_HAL_ROCM_ROCM_HEADERS_H_
+#define IREE_HAL_ROCM_ROCM_HEADERS_H_
+
+#include "hip/hip_runtime.h"
+
+#endif // IREE_HAL_ROCM_ROCM_HEADERS_H_
diff --git a/experimental/rocm/status_util.c b/experimental/rocm/status_util.c
new file mode 100644
index 0000000..31304e5
--- /dev/null
+++ b/experimental/rocm/status_util.c
@@ -0,0 +1,38 @@
+// Copyright 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "experimental/rocm/status_util.h"
+
+#include "experimental/rocm/dynamic_symbols.h"
+
+// Maps a HIP result code to an IREE status: hipSuccess becomes OK, anything
+// else becomes IREE_STATUS_INTERNAL carrying the HIP error name, numeric code
+// and description. |file| and |line| are accepted but not referenced here.
+iree_status_t iree_hal_rocm_result_to_status(
+    iree_hal_rocm_dynamic_symbols_t *syms, hipError_t result, const char *file,
+    uint32_t line) {
+  if (IREE_LIKELY(result == hipSuccess)) return iree_ok_status();
+
+  // Ask the runtime for its own description of the error first.
+  const char *error_name = syms->hipGetErrorName(result);
+  const char *error_string = syms->hipGetErrorString(result);
+
+  // hipErrorUnknown is reported with fixed generic text instead.
+  if (result == hipErrorUnknown) {
+    error_name = "UNKNOWN";
+    error_string = "Unknown error.";
+  }
+
+  return iree_make_status(IREE_STATUS_INTERNAL,
+                          "rocm driver error '%s' (%d): %s", error_name, result,
+                          error_string);
+}
diff --git a/experimental/rocm/status_util.h b/experimental/rocm/status_util.h
new file mode 100644
index 0000000..6a46462
--- /dev/null
+++ b/experimental/rocm/status_util.h
@@ -0,0 +1,60 @@
+// Copyright 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef IREE_HAL_ROCM_STATUS_UTIL_H_
+#define IREE_HAL_ROCM_STATUS_UTIL_H_
+
+#include "experimental/rocm/dynamic_symbols.h"
+#include "iree/base/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+// Invokes the HIP call |expr| through the symbol table |syms| (it expands to
+// `(syms)->expr`) and converts the resulting hipError_t to an iree_status_t.
+// Any trailing arguments are accepted but not used.
+//
+// Usage:
+//   iree_status_t status =
+//       ROCM_RESULT_TO_STATUS(syms, hipDoThing(...), "hipDoThing");
+#define ROCM_RESULT_TO_STATUS(syms, expr, ...) \
+ iree_hal_rocm_result_to_status((syms), ((syms)->expr), __FILE__, __LINE__)
+
+// IREE_RETURN_IF_ERROR but implicitly converts the hipError_t return value to
+// a Status. |expr| is invoked through |syms| (as `(syms)->expr`) and the
+// trailing arguments are forwarded to IREE_RETURN_IF_ERROR.
+//
+// Usage:
+//   ROCM_RETURN_IF_ERROR(syms, hipDoThing(...), "message");
+#define ROCM_RETURN_IF_ERROR(syms, expr, ...) \
+ IREE_RETURN_IF_ERROR(iree_hal_rocm_result_to_status((syms), ((syms)->expr), \
+ __FILE__, __LINE__), \
+ __VA_ARGS__)
+
+// IREE_IGNORE_ERROR but implicitly converts the hipError_t return value to a
+// Status. |expr| is invoked through |syms| (as `(syms)->expr`) and any failure
+// is discarded.
+//
+// Usage:
+//   ROCM_IGNORE_ERROR(syms, hipDoThing(...));
+#define ROCM_IGNORE_ERROR(syms, expr) \
+ IREE_IGNORE_ERROR(iree_hal_rocm_result_to_status((syms), ((syms)->expr), \
+ __FILE__, __LINE__))
+
+// Converts a hipError_t to a Status object. |syms| supplies the HIP functions
+// used to look up the error name and description; |file| and |line| identify
+// the call site and are filled in by the ROCM_* macros above.
+iree_status_t iree_hal_rocm_result_to_status(
+ iree_hal_rocm_dynamic_symbols_t *syms, hipError_t result, const char *file,
+ uint32_t line);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // IREE_HAL_ROCM_STATUS_UTIL_H_
diff --git a/iree/hal/drivers/CMakeLists.txt b/iree/hal/drivers/CMakeLists.txt
index 64ea93b..e69d2cb 100644
--- a/iree/hal/drivers/CMakeLists.txt
+++ b/iree/hal/drivers/CMakeLists.txt
@@ -29,6 +29,9 @@
if(${IREE_HAL_DRIVER_VULKAN})
list(APPEND IREE_HAL_DRIVER_MODULES iree::hal::vulkan::registration)
endif()
+# Add the experimental ROCM driver registration to the HAL driver modules when
+# IREE_BUILD_EXPERIMENTAL_ROCM is enabled.
+if(${IREE_BUILD_EXPERIMENTAL_ROCM})
+ list(APPEND IREE_HAL_DRIVER_MODULES experimental::rocm::registration)
+endif()
iree_cc_library(
NAME
diff --git a/iree/hal/drivers/init.c b/iree/hal/drivers/init.c
index fe50a0c..e30e871 100644
--- a/iree/hal/drivers/init.c
+++ b/iree/hal/drivers/init.c
@@ -36,6 +36,10 @@
#include "iree/hal/vulkan/registration/driver_module.h"
#endif // IREE_HAL_HAVE_VULKAN_DRIVER_MODULE
+#if defined(IREE_BUILD_EXPERIMENTAL_ROCM)
+#include "experimental/rocm/registration/driver_module.h"
+#endif // IREE_BUILD_EXPERIMENTAL_ROCM
+
IREE_API_EXPORT iree_status_t
iree_hal_register_all_available_drivers(iree_hal_driver_registry_t* registry) {
IREE_TRACE_ZONE_BEGIN(z0);
@@ -65,6 +69,11 @@
z0, iree_hal_vulkan_driver_module_register(registry));
#endif // IREE_HAL_HAVE_VULKAN_DRIVER_MODULE
+#if defined(IREE_BUILD_EXPERIMENTAL_ROCM)
+ IREE_RETURN_AND_END_ZONE_IF_ERROR(
+ z0, iree_hal_rocm_driver_module_register(registry));
+#endif // IREE_BUILD_EXPERIMENTAL_ROCM
+
IREE_TRACE_ZONE_END(z0);
return iree_ok_status();
}
diff --git a/third_party/rocm/LICENSE b/third_party/rocm/LICENSE
new file mode 100644
index 0000000..7c79cca
--- /dev/null
+++ b/third_party/rocm/LICENSE
@@ -0,0 +1,27 @@
+Copyright (c) 2018 Advanced Micro Devices, Inc. All Rights Reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+with the Software without restriction, including without limitation the
+rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+sell copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimers.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimers in the
+ documentation and/or other materials provided with the distribution.
+
+ * Neither the names of Advanced Micro Devices, Inc. nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this Software without specific prior written permission.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
+THE SOFTWARE.
\ No newline at end of file
diff --git a/third_party/rocm/README.txt b/third_party/rocm/README.txt
new file mode 100644
index 0000000..202619d
--- /dev/null
+++ b/third_party/rocm/README.txt
@@ -0,0 +1,3 @@
+This folder contains the subset of ROCM SDK headers needed to build the experimental IREE ROCM backend.
+It will also contain the amdgcn bitcode (.bc) files: LLVM modules used to import __oc* functions
+during ROCm HSACO (code object) kernel compilation.
diff --git a/third_party/rocm/UPDATING.md b/third_party/rocm/UPDATING.md
new file mode 100644
index 0000000..2cab28e
--- /dev/null
+++ b/third_party/rocm/UPDATING.md
@@ -0,0 +1,15 @@
+These headers come from the ROCM SDK.
+
+ROCm does not currently support in-place updates, so updating means uninstalling and then reinstalling ROCm.
+To update, reinstall the ROCM SDK locally:
+```
+sudo apt autoremove rocm-opencl rocm-dkms rocm-dev rocm-utils && sudo reboot
+sudo apt-get install rocm-dkms
+```
+
+Copy HIP and HSA headers, version.txt and libdevice.10.bc:
+```
+cp -RL /opt/rocm/include/hip ./include/
+cp -RL /opt/rocm/include/hsa ./include/
+cp /opt/rocm/.info/version version.txt
+```
diff --git a/third_party/rocm/include/hip/channel_descriptor.h b/third_party/rocm/include/hip/channel_descriptor.h
new file mode 100644
index 0000000..842701b
--- /dev/null
+++ b/third_party/rocm/include/hip/channel_descriptor.h
@@ -0,0 +1,39 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_CHANNEL_DESCRIPTOR_H
+#define HIP_INCLUDE_HIP_CHANNEL_DESCRIPTOR_H
+
+// Some standard header files, these are included by hc.hpp and so want to make them avail on both
+// paths to provide a consistent include env and avoid "missing symbol" errors that only appears
+// on NVCC path:
+
+
+#if defined(__HIP_PLATFORM_HCC__) && !defined(__HIP_PLATFORM_NVCC__)
+#include <hip/hcc_detail/channel_descriptor.h>
+#elif defined(__HIP_PLATFORM_NVCC__) && !defined(__HIP_PLATFORM_HCC__)
+#include <hip/nvcc_detail/channel_descriptor.h>
+#else
+#error("Must define exactly one of __HIP_PLATFORM_HCC__ or __HIP_PLATFORM_NVCC__");
+#endif
+
+#endif
diff --git a/third_party/rocm/include/hip/device_functions.h b/third_party/rocm/include/hip/device_functions.h
new file mode 100644
index 0000000..f6059f2
--- /dev/null
+++ b/third_party/rocm/include/hip/device_functions.h
@@ -0,0 +1,36 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_DEVICE_FUNCTIONS_H
+#define HIP_INCLUDE_HIP_DEVICE_FUNCTIONS_H
+
+#include <hip/hip_common.h>
+
+#if defined(__HIP_PLATFORM_HCC__) && !defined(__HIP_PLATFORM_NVCC__)
+#include <hip/hcc_detail/device_functions.h>
+#elif defined(__HIP_PLATFORM_NVCC__) && !defined(__HIP_PLATFORM_HCC__)
+#include <device_functions.h>
+#else
+#error("Must define exactly one of __HIP_PLATFORM_HCC__ or __HIP_PLATFORM_NVCC__");
+#endif
+
+#endif
diff --git a/third_party/rocm/include/hip/driver_types.h b/third_party/rocm/include/hip/driver_types.h
new file mode 100644
index 0000000..d428ec7
--- /dev/null
+++ b/third_party/rocm/include/hip/driver_types.h
@@ -0,0 +1,36 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_DRIVER_TYPES_H
+#define HIP_INCLUDE_HIP_DRIVER_TYPES_H
+
+#include <hip/hip_common.h>
+
+#if defined(__HIP_PLATFORM_HCC__) && !defined(__HIP_PLATFORM_NVCC__)
+#include <hip/hcc_detail/driver_types.h>
+#elif defined(__HIP_PLATFORM_NVCC__) && !defined(__HIP_PLATFORM_HCC__)
+#include "driver_types.h"
+#else
+#error("Must define exactly one of __HIP_PLATFORM_HCC__ or __HIP_PLATFORM_NVCC__");
+#endif
+
+#endif
diff --git a/third_party/rocm/include/hip/hcc_detail/channel_descriptor.h b/third_party/rocm/include/hip/hcc_detail/channel_descriptor.h
new file mode 100644
index 0000000..417451f
--- /dev/null
+++ b/third_party/rocm/include/hip/hcc_detail/channel_descriptor.h
@@ -0,0 +1,354 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_CHANNEL_DESCRIPTOR_H
+#define HIP_INCLUDE_HIP_HCC_DETAIL_CHANNEL_DESCRIPTOR_H
+
+#include <hip/hip_common.h>
+#include <hip/hcc_detail/driver_types.h>
+#include <hip/hcc_detail/hip_vector_types.h>
+
+#ifdef __cplusplus
+
+#if __HIP_ROCclr__
+extern "C" {
+#endif
+HIP_PUBLIC_API
+hipChannelFormatDesc hipCreateChannelDesc(int x, int y, int z, int w, hipChannelFormatKind f);
+#if __HIP_ROCclr__
+}
+#endif
+
+// Descriptor for one half-precision channel: sizeof(unsigned short) * 8 bits in x, float kind.
+static inline hipChannelFormatDesc hipCreateChannelDescHalf() {
+    return hipCreateChannelDesc((int)sizeof(unsigned short) * 8, 0, 0, 0, hipChannelFormatKindFloat);
+}
+
+// Descriptor for a one-element half vector: sizeof(unsigned short) * 8 bits in x, float kind.
+static inline hipChannelFormatDesc hipCreateChannelDescHalf1() {
+    return hipCreateChannelDesc((int)sizeof(unsigned short) * 8, 0, 0, 0, hipChannelFormatKindFloat);
+}
+
+static inline hipChannelFormatDesc hipCreateChannelDescHalf2() {  // descriptor for a two-element half vector
+    int e = (int)sizeof(unsigned short) * 8;  // bits per half-precision channel
+    return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindFloat);  // both x and y are populated, as in every other 2-vector specialization
+}
+
+template <typename T>
+static inline hipChannelFormatDesc hipCreateChannelDesc() {  // generic fallback, used for any T without an explicit specialization below
+ return hipCreateChannelDesc(0, 0, 0, 0, hipChannelFormatKindNone);  // all four channel widths zero, kind None
+}
+
+// Specialization for plain char: one signed channel of sizeof(char) * 8 bits in x.
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<char>() {
+    return hipCreateChannelDesc((int)sizeof(char) * 8, 0, 0, 0, hipChannelFormatKindSigned);
+}
+
+// Specialization for signed char: one signed channel of sizeof(signed char) * 8 bits in x.
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<signed char>() {
+    return hipCreateChannelDesc((int)sizeof(signed char) * 8, 0, 0, 0, hipChannelFormatKindSigned);
+}
+
+// Specialization for unsigned char: one unsigned channel of sizeof(unsigned char) * 8 bits in x.
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<unsigned char>() {
+    return hipCreateChannelDesc((int)sizeof(unsigned char) * 8, 0, 0, 0, hipChannelFormatKindUnsigned);
+}
+
+// Specialization for uchar1: one unsigned channel of sizeof(unsigned char) * 8 bits in x.
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<uchar1>() {
+    return hipCreateChannelDesc((int)sizeof(unsigned char) * 8, 0, 0, 0, hipChannelFormatKindUnsigned);
+}
+
+// Specialization for char1: one signed channel of sizeof(signed char) * 8 bits in x.
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<char1>() {
+    return hipCreateChannelDesc((int)sizeof(signed char) * 8, 0, 0, 0, hipChannelFormatKindSigned);
+}
+
+// Specialization for uchar2: unsigned channels of sizeof(unsigned char) * 8 bits in x and y.
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<uchar2>() {
+    int e = (int)sizeof(unsigned char) * 8;
+    return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindUnsigned);
+}
+
+// Specialization for char2: signed channels of sizeof(signed char) * 8 bits in x and y.
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<char2>() {
+    const int channel_bits = (int)sizeof(signed char) * 8;
+    return hipCreateChannelDesc(channel_bits, channel_bits, 0, 0, hipChannelFormatKindSigned);
+}
+
+#ifndef __GNUC__ // vector3 is the same as vector4
+// Specialization for uchar3: unsigned channels of sizeof(unsigned char) * 8 bits in x, y and z.
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<uchar3>() {
+    int e = (int)sizeof(unsigned char) * 8;
+    return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindUnsigned);
+}
+
+// Specialization for char3: signed channels of sizeof(signed char) * 8 bits in x, y and z.
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<char3>() {
+    const int channel_bits = (int)sizeof(signed char) * 8;
+    return hipCreateChannelDesc(channel_bits, channel_bits, channel_bits, 0, hipChannelFormatKindSigned);
+}
+#endif
+
+// Specialization for uchar4: unsigned channels of sizeof(unsigned char) * 8 bits in x, y, z and w.
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<uchar4>() {
+    int e = (int)sizeof(unsigned char) * 8;
+    return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindUnsigned);
+}
+
+// Specialization for char4: signed channels of sizeof(signed char) * 8 bits in x, y, z and w.
+template <> inline hipChannelFormatDesc hipCreateChannelDesc<char4>() {
+    const int channel_bits = (int)sizeof(signed char) * 8;
+    return hipCreateChannelDesc(channel_bits, channel_bits, channel_bits, channel_bits, hipChannelFormatKindSigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<unsigned short>() {
+ int e = (int)sizeof(unsigned short) * 8;
+ return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<signed short>() {
+ int e = (int)sizeof(signed short) * 8;
+ return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<ushort1>() {
+ int e = (int)sizeof(unsigned short) * 8;
+ return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<short1>() {
+ int e = (int)sizeof(signed short) * 8;
+ return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<ushort2>() {
+ int e = (int)sizeof(unsigned short) * 8;
+ return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindUnsigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<short2>() {
+ int e = (int)sizeof(signed short) * 8;
+ return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindSigned);
+}
+
+#ifndef __GNUC__
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<ushort3>() {
+ int e = (int)sizeof(unsigned short) * 8;
+ return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindUnsigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<short3>() {
+ int e = (int)sizeof(signed short) * 8;
+ return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindSigned);
+}
+#endif
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<ushort4>() {
+ int e = (int)sizeof(unsigned short) * 8;
+ return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindUnsigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<short4>() {
+ int e = (int)sizeof(signed short) * 8;
+ return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindSigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<unsigned int>() {
+ int e = (int)sizeof(unsigned int) * 8;
+ return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<signed int>() {
+ int e = (int)sizeof(signed int) * 8;
+ return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<uint1>() {
+ int e = (int)sizeof(unsigned int) * 8;
+ return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<int1>() {
+ int e = (int)sizeof(signed int) * 8;
+ return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<uint2>() {
+ int e = (int)sizeof(unsigned int) * 8;
+ return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindUnsigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<int2>() {
+ int e = (int)sizeof(signed int) * 8;
+ return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindSigned);
+}
+
+#ifndef __GNUC__
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<uint3>() {
+ int e = (int)sizeof(unsigned int) * 8;
+ return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindUnsigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<int3>() {
+ int e = (int)sizeof(signed int) * 8;
+ return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindSigned);
+}
+#endif
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<uint4>() {
+ int e = (int)sizeof(unsigned int) * 8;
+ return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindUnsigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<int4>() {
+ int e = (int)sizeof(signed int) * 8;
+ return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindSigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<float>() {
+ int e = (int)sizeof(float) * 8;
+ return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindFloat);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<float1>() {
+ int e = (int)sizeof(float) * 8;
+ return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindFloat);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<float2>() {
+ int e = (int)sizeof(float) * 8;
+ return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindFloat);
+}
+
+#ifndef __GNUC__
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<float3>() {
+ int e = (int)sizeof(float) * 8;
+ return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindFloat);
+}
+#endif
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<float4>() {
+ int e = (int)sizeof(float) * 8;
+ return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindFloat);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<unsigned long>() {
+ int e = (int)sizeof(unsigned long) * 8;
+ return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<signed long>() {
+ int e = (int)sizeof(signed long) * 8;
+ return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<ulong1>() {
+ int e = (int)sizeof(unsigned long) * 8;
+ return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<long1>() {
+ int e = (int)sizeof(signed long) * 8;
+ return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<ulong2>() {
+ int e = (int)sizeof(unsigned long) * 8;
+ return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindUnsigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<long2>() {
+ int e = (int)sizeof(signed long) * 8;
+ return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindSigned);
+}
+
+#ifndef __GNUC__
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<ulong3>() {
+ int e = (int)sizeof(unsigned long) * 8;
+ return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindUnsigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<long3>() {
+ int e = (int)sizeof(signed long) * 8;
+ return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindSigned);
+}
+#endif
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<ulong4>() {
+ int e = (int)sizeof(unsigned long) * 8;
+ return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindUnsigned);
+}
+
+template <>
+inline hipChannelFormatDesc hipCreateChannelDesc<long4>() {
+ int e = (int)sizeof(signed long) * 8;
+ return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindSigned);
+}
+
+#else
+
+struct hipChannelFormatDesc hipCreateChannelDesc(int x, int y, int z, int w,
+ enum hipChannelFormatKind f);
+
+#endif
+
+#endif
diff --git a/third_party/rocm/include/hip/hcc_detail/concepts.hpp b/third_party/rocm/include/hip/hcc_detail/concepts.hpp
new file mode 100644
index 0000000..373cefb
--- /dev/null
+++ b/third_party/rocm/include/hip/hcc_detail/concepts.hpp
@@ -0,0 +1,30 @@
+/*
+Copyright (c) 2015-present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+namespace hip_impl // Documentation only.
+{
+#define requires(...)
+
+#define FunctionalProcedure typename
+} // namespace hip_impl
diff --git a/third_party/rocm/include/hip/hcc_detail/cuda/cuda.h b/third_party/rocm/include/hip/hcc_detail/cuda/cuda.h
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/third_party/rocm/include/hip/hcc_detail/cuda/cuda.h
@@ -0,0 +1 @@
+
diff --git a/third_party/rocm/include/hip/hcc_detail/cuda/math_functions.h b/third_party/rocm/include/hip/hcc_detail/cuda/math_functions.h
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/third_party/rocm/include/hip/hcc_detail/cuda/math_functions.h
@@ -0,0 +1 @@
+
diff --git a/third_party/rocm/include/hip/hcc_detail/device_functions.h b/third_party/rocm/include/hip/hcc_detail/device_functions.h
new file mode 100644
index 0000000..515b4cc
--- /dev/null
+++ b/third_party/rocm/include/hip/hcc_detail/device_functions.h
@@ -0,0 +1,1431 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_DEVICE_FUNCTIONS_H
+#define HIP_INCLUDE_HIP_HCC_DETAIL_DEVICE_FUNCTIONS_H
+
+#include "host_defines.h"
+#include "math_fwd.h"
+
+#include <hip/hip_runtime_api.h>
+#include <stddef.h>
+
+
+#include <hip/hip_vector_types.h>
+#include <hip/hcc_detail/device_library_decls.h>
+#include <hip/hcc_detail/llvm_intrinsics.h>
+
+#if __HIP_CLANG_ONLY__ && __HIP_ROCclr__ && !_WIN32
+extern "C" __device__ int printf(const char *fmt, ...);
+#else
+#if HC_FEATURE_PRINTF
+template <typename... All>
+static inline __device__ void printf(const char* format, All... all) {
+ hc::printf(format, all...);
+}
+#else
+template <typename... All>
+static inline __device__ void printf(const char* format, All... all) {}
+#endif // HC_FEATURE_PRINTF
+#endif // __HIP_CLANG_ONLY__ && __HIP_ROCclr__
+
+/*
+Integer Intrinsics
+*/
+
+// integer intrinsic functions: __popc __clz __ffs __brev
+__device__ static inline unsigned int __popc(unsigned int input) {
+ return __builtin_popcount(input);
+}
+__device__ static inline unsigned int __popcll(unsigned long long int input) {
+ return __builtin_popcountll(input);
+}
+
+__device__ static inline int __clz(int input) {
+ return __ockl_clz_u32((uint)input);
+}
+
+__device__ static inline int __clzll(long long int input) {
+ return __ockl_clz_u64((ullong)input);
+}
+
+__device__ static inline unsigned int __ffs(unsigned int input) {
+ return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
+}
+
+__device__ static inline unsigned int __ffsll(unsigned long long int input) {
+ return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
+}
+
+__device__ static inline unsigned int __ffs(int input) {
+ return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
+}
+
+__device__ static inline unsigned int __ffsll(long long int input) {
+ return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
+}
+
+__device__ static inline unsigned int __brev(unsigned int input) {
+ return __builtin_bitreverse32(input);
+}
+
+__device__ static inline unsigned long long int __brevll(unsigned long long int input) {
+ return __builtin_bitreverse64(input);
+}
+
+__device__ static inline unsigned int __lastbit_u32_u64(uint64_t input) {  // index of the lowest set bit; -1 (as unsigned) when input is 0
+    return input == 0 ? -1 : __builtin_ctzll(input);  // ctzll covers all 64 bits; ctzl would drop the high word where long is 32-bit
+}
+
+__device__ static inline unsigned int __bitextract_u32(unsigned int src0, unsigned int src1, unsigned int src2) {  // extracts a bit field from src0
+ uint32_t offset = src1 & 31;  // first bit of the field, reduced modulo 32
+ uint32_t width = src2 & 31;  // field width in bits, reduced modulo 32
+ return width == 0 ? 0 : (src0 << (32 - offset - width)) >> (32 - width);  // move the field to the top, then down to bit 0. NOTE(review): offset + width > 32 makes the left-shift count wrap around — confirm callers never pass that
+}
+
+__device__ static inline uint64_t __bitextract_u64(uint64_t src0, unsigned int src1, unsigned int src2) {  // 64-bit variant: extracts a bit field from src0
+ uint64_t offset = src1 & 63;  // first bit of the field, reduced modulo 64
+ uint64_t width = src2 & 63;  // field width in bits, reduced modulo 64
+ return width == 0 ? 0 : (src0 << (64 - offset - width)) >> (64 - width);  // move the field to the top, then down to bit 0. NOTE(review): offset + width > 64 makes the left-shift count wrap around — confirm callers never pass that
+}
+
+// Returns src0 with the src3-bit field starting at bit src2 replaced by the low bits of src1 (src2 and src3 are reduced modulo 32).
+__device__ static inline unsigned int __bitinsert_u32(unsigned int src0, unsigned int src1, unsigned int src2, unsigned int src3) {
+    const uint32_t offset = src2 & 31, width = src3 & 31;
+    const uint32_t mask = (1u << width) - 1;  // unsigned literal: (1 << 31) - 1 overflows a signed int
+    return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
+}
+
+// 64-bit bit-field insert: returns src0 with the src3-bit field starting at bit src2
+// replaced by the low src3 bits of src1; src2 and src3 are both reduced modulo 64.
+__device__ static inline uint64_t __bitinsert_u64(uint64_t src0, uint64_t src1, unsigned int src2, unsigned int src3) {
+    const uint64_t shift = src2 & 63, bits = src3 & 63, field = (1ULL << bits) - 1;
+    return ((src0 & ~(field << shift)) | ((src1 & field) << shift));
+}
+
+__device__ static unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s);
+__device__ static unsigned int __hadd(int x, int y);
+__device__ static int __mul24(int x, int y);
+__device__ static long long int __mul64hi(long long int x, long long int y);
+__device__ static int __mulhi(int x, int y);
+__device__ static int __rhadd(int x, int y);
+__device__ static unsigned int __sad(int x, int y,unsigned int z);
+__device__ static unsigned int __uhadd(unsigned int x, unsigned int y);
+__device__ static int __umul24(unsigned int x, unsigned int y);
+__device__ static unsigned long long int __umul64hi(unsigned long long int x, unsigned long long int y);
+__device__ static unsigned int __umulhi(unsigned int x, unsigned int y);
+__device__ static unsigned int __urhadd(unsigned int x, unsigned int y);
+__device__ static unsigned int __usad(unsigned int x, unsigned int y, unsigned int z);
+
+struct ucharHolder {
+ union {
+ unsigned char c[4];
+ unsigned int ui;
+ };
+} __attribute__((aligned(4)));
+
+struct uchar2Holder {
+ union {
+ unsigned int ui[2];
+ unsigned char c[8];
+ };
+} __attribute__((aligned(8)));
+
+__device__
+static inline unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s) {
+    // Bytes 0-3 of the 8-byte source pool come from x, bytes 4-7 from y.
+    const unsigned long long pool = ((unsigned long long)y << 32) | x;
+    unsigned int out = 0;
+    // Selector nibble i (bits [4i+2 : 4i] of s) names the pool byte copied to result byte i.
+    // Masking to 3 bits keeps the index in 0..7; the previous code indexed the 8-byte
+    // pool with a whole byte of s and could read out of bounds.
+    for (int i = 0; i < 4; ++i) {
+        const unsigned int pick = (s >> (4 * i)) & 0x7u;
+        out |= (unsigned int)((pool >> (8 * pick)) & 0xffu) << (8 * i);
+    }
+    return out;
+}
+
+// Signed average rounded toward negative infinity, i.e. (x + y) >> 1, computed
+// without forming x + y so it cannot overflow: x + y == 2*(x & y) + (x ^ y).
+// Relies on >> of a negative int being an arithmetic shift.
+__device__ static inline unsigned int __hadd(int x, int y) {
+    return (unsigned int)((x & y) + ((x ^ y) >> 1));
+}
+
+__device__ static inline int __mul24(int x, int y) {
+ return __ockl_mul24_i32(x, y);
+}
+
+__device__ static inline long long __mul64hi(long long int x, long long int y) {  // high half of the signed product x * y, built from 32-bit pieces (assumes ulong/long are 64-bit — TODO confirm)
+ ulong x0 = (ulong)x & 0xffffffffUL;  // low 32 bits of x, unsigned
+ long x1 = x >> 32;  // high 32 bits of x, signed
+ ulong y0 = (ulong)y & 0xffffffffUL;  // low 32 bits of y, unsigned
+ long y1 = y >> 32;  // high 32 bits of y, signed
+ ulong z0 = x0*y0;  // low * low partial product
+ long t = x1*y0 + (z0 >> 32);  // high * low plus the upper half of z0
+ long z1 = t & 0xffffffffL;  // low 32 bits of t, still to be combined with x0*y1
+ long z2 = t >> 32;  // upper part of t, added directly into the result
+ z1 = x0*y1 + z1;  // add the low * high cross term
+ return x1*y1 + z2 + (z1 >> 32);  // high * high plus the carries from both cross terms
+}
+
+__device__ static inline int __mulhi(int x, int y) {
+ return __ockl_mul_hi_i32(x, y);
+}
+
+// Signed average rounded toward positive infinity, i.e. (x + y + 1) >> 1, computed
+// without forming x + y so it cannot overflow: x + y == 2*(x | y) - (x ^ y).
+// Relies on >> of a negative int being an arithmetic shift.
+__device__ static inline int __rhadd(int x, int y) {
+    return (x | y) - ((x ^ y) >> 1);
+}
+__device__ static inline unsigned int __sad(int x, int y, unsigned int z) {
+ return x > y ? x - y + z : y - x + z;
+}
+__device__ static inline unsigned int __uhadd(unsigned int x, unsigned int y) {  // unsigned average, rounded down
+    return (x & y) + ((x ^ y) >> 1);  // x + y == 2*(x & y) + (x ^ y); forming x + y directly can wrap and lose the carry
+}
+__device__ static inline int __umul24(unsigned int x, unsigned int y) {
+ return __ockl_mul24_u32(x, y);
+}
+
+// Upper 64 bits of the unsigned product x * y, assembled from 32-bit halves
+// (assumes the project's ulong typedef is 64 bits wide — TODO confirm).
+__device__ static inline unsigned long long __umul64hi(unsigned long long int x, unsigned long long int y) {
+    const ulong x_lo = x & 0xffffffffUL;
+    const ulong x_hi = x >> 32;
+    const ulong y_lo = y & 0xffffffffUL;
+    const ulong y_hi = y >> 32;
+    const ulong lo_lo = x_lo * y_lo;
+    const ulong mid = x_hi * y_lo + (lo_lo >> 32);
+    const ulong mid_hi = mid >> 32;
+    const ulong cross = x_lo * y_hi + (mid & 0xffffffffUL);
+    return x_hi * y_hi + mid_hi + (cross >> 32);
+}
+
+__device__ static inline unsigned int __umulhi(unsigned int x, unsigned int y) {
+ return __ockl_mul_hi_u32(x, y);
+}
+__device__ static inline unsigned int __urhadd(unsigned int x, unsigned int y) {  // unsigned average, rounded up
+    return (x | y) - ((x ^ y) >> 1);  // x + y == 2*(x | y) - (x ^ y); forming x + y + 1 directly can wrap and lose the carry
+}
+__device__ static inline unsigned int __usad(unsigned int x, unsigned int y, unsigned int z) {
+ return __ockl_sadd_u32(x, y, z);
+}
+
+// Calling thread's lane index, obtained by chaining mbcnt_lo and mbcnt_hi with an all-ones mask.
+__device__ static inline unsigned int __lane_id() {
+    return __builtin_amdgcn_mbcnt_hi(-1, __builtin_amdgcn_mbcnt_lo(-1, 0));
+}
+
+__device__
+static inline unsigned int __mbcnt_lo(unsigned int x, unsigned int y) {return __builtin_amdgcn_mbcnt_lo(x,y);};
+
+__device__
+static inline unsigned int __mbcnt_hi(unsigned int x, unsigned int y) {return __builtin_amdgcn_mbcnt_hi(x,y);};
+
+/*
+HIP specific device functions
+*/
+
+__device__ static inline unsigned __hip_ds_bpermute(int index, unsigned src) {  // unsigned wrapper around __builtin_amdgcn_ds_bpermute
+ union { int i; unsigned u; float f; } tmp; tmp.u = src;  // view src's bits as the int the builtin takes
+ tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i);
+ return tmp.u;  // hand the result bits back as unsigned
+}
+
+__device__ static inline float __hip_ds_bpermutef(int index, float src) {  // float wrapper around __builtin_amdgcn_ds_bpermute
+ union { int i; unsigned u; float f; } tmp; tmp.f = src;  // store as float so the bit pattern, not the numeric value, reaches the builtin
+ tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i);
+ return tmp.f;  // reinterpret the result bits as float
+}
+
+__device__ static inline unsigned __hip_ds_permute(int index, unsigned src) {  // unsigned wrapper around __builtin_amdgcn_ds_permute
+ union { int i; unsigned u; float f; } tmp; tmp.u = src;  // view src's bits as the int the builtin takes
+ tmp.i = __builtin_amdgcn_ds_permute(index, tmp.i);
+ return tmp.u;  // hand the result bits back as unsigned
+}
+
+__device__ static inline float __hip_ds_permutef(int index, float src) {  // float wrapper around __builtin_amdgcn_ds_permute
+    union { int i; unsigned u; float f; } tmp; tmp.f = src;  // store as float: assigning through .u would convert the value instead of keeping the bits
+    tmp.i = __builtin_amdgcn_ds_permute(index, tmp.i);
+    return tmp.f;  // reinterpret the permuted bits as float, matching __hip_ds_bpermutef
+}
+
+#define __hip_ds_swizzle(src, pattern) __hip_ds_swizzle_N<(pattern)>((src))
+#define __hip_ds_swizzlef(src, pattern) __hip_ds_swizzlef_N<(pattern)>((src))
+
+template <int pattern>
+__device__ static inline unsigned __hip_ds_swizzle_N(unsigned int src) {
+ union { int i; unsigned u; float f; } tmp; tmp.u = src;
+#if defined(__HCC__)
+ tmp.i = __llvm_amdgcn_ds_swizzle(tmp.i, pattern);
+#else
+ tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
+#endif
+ return tmp.u;
+}
+
+template <int pattern>
+__device__ static inline float __hip_ds_swizzlef_N(float src) {
+ union { int i; unsigned u; float f; } tmp; tmp.f = src;
+#if defined(__HCC__)
+ tmp.i = __llvm_amdgcn_ds_swizzle(tmp.i, pattern);
+#else
+ tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
+#endif
+ return tmp.f;
+}
+
+#define __hip_move_dpp(src, dpp_ctrl, row_mask, bank_mask, bound_ctrl) \
+ __hip_move_dpp_N<(dpp_ctrl), (row_mask), (bank_mask), (bound_ctrl)>((src))
+
+template <int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl>
+__device__ static inline int __hip_move_dpp_N(int src) {
+ return __builtin_amdgcn_mov_dpp(src, dpp_ctrl, row_mask, bank_mask,
+ bound_ctrl);
+}
+
+// FIXME: Remove the following workaround once the clang change is released.
+// This is for backward compatibility with older clang which does not define
+// __AMDGCN_WAVEFRONT_SIZE. It does not consider -mwavefrontsize64.
+#ifndef __AMDGCN_WAVEFRONT_SIZE
+#if __gfx1010__ || __gfx1011__ || __gfx1012__ || __gfx1030__ || __gfx1031__
+#define __AMDGCN_WAVEFRONT_SIZE 32
+#else
+#define __AMDGCN_WAVEFRONT_SIZE 64
+#endif
+#endif
+static constexpr int warpSize = __AMDGCN_WAVEFRONT_SIZE;
+
+// Returns var as held by lane (src_lane mod width) of the caller's width-lane segment;
+// width is expected to be a power of two, so the modulo is taken with a mask.
+__device__ inline int __shfl(int var, int src_lane, int width = warpSize) {
+    const int self = __lane_id();
+    const int index = (src_lane & (width - 1)) + (self & ~(width - 1));  // wrapped lane + start of own segment
+    return __builtin_amdgcn_ds_bpermute(index << 2, var);
+}
+// Unsigned overload: routes var's bit pattern through the int overload of __shfl.
+__device__ inline unsigned int __shfl(unsigned int var, int src_lane, int width = warpSize) {
+    union { int i; unsigned u; float f; } bits;
+    bits.u = var;
+    bits.i = __shfl(bits.i, src_lane, width);
+    return bits.u;
+}
+// Float overload: routes var's bit pattern through the int overload of __shfl.
+__device__ inline float __shfl(float var, int src_lane, int width = warpSize) {
+    union { int i; unsigned u; float f; } bits;
+    bits.f = var;
+    bits.i = __shfl(bits.i, src_lane, width);
+    return bits.f;
+}
+__device__
+inline
+double __shfl(double var, int src_lane, int width = warpSize) {
+ static_assert(sizeof(double) == 2 * sizeof(int), "");
+ static_assert(sizeof(double) == sizeof(uint64_t), "");
+
+ int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+ tmp[0] = __shfl(tmp[0], src_lane, width);
+ tmp[1] = __shfl(tmp[1], src_lane, width);
+
+ uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+ double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+ return tmp1;
+}
+__device__
+inline
+long __shfl(long var, int src_lane, int width = warpSize)
+{
+ #ifndef _MSC_VER
+ static_assert(sizeof(long) == 2 * sizeof(int), "");
+ static_assert(sizeof(long) == sizeof(uint64_t), "");
+
+ int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+ tmp[0] = __shfl(tmp[0], src_lane, width);
+ tmp[1] = __shfl(tmp[1], src_lane, width);
+
+ uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+ long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+ return tmp1;
+ #else
+ static_assert(sizeof(long) == sizeof(int), "");
+ return static_cast<long>(__shfl(static_cast<int>(var), src_lane, width));
+ #endif
+}
+__device__
+inline
+unsigned long __shfl(unsigned long var, int src_lane, int width = warpSize) {
+ #ifndef _MSC_VER
+ static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
+ static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
+
+ unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+ tmp[0] = __shfl(tmp[0], src_lane, width);
+ tmp[1] = __shfl(tmp[1], src_lane, width);
+
+ uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+ unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+ return tmp1;
+ #else
+ static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
+ return static_cast<unsigned long>(__shfl(static_cast<unsigned int>(var), src_lane, width));
+ #endif
+}
+__device__
+inline
+long long __shfl(long long var, int src_lane, int width = warpSize)
+{
+ static_assert(sizeof(long long) == 2 * sizeof(int), "");
+ static_assert(sizeof(long long) == sizeof(uint64_t), "");
+
+ int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+ tmp[0] = __shfl(tmp[0], src_lane, width);
+ tmp[1] = __shfl(tmp[1], src_lane, width);
+
+ uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+ long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+ return tmp1;
+}
+__device__
+inline
+unsigned long long __shfl(unsigned long long var, int src_lane, int width = warpSize) {
+ static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
+ static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
+
+ unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+ tmp[0] = __shfl(tmp[0], src_lane, width);
+ tmp[1] = __shfl(tmp[1], src_lane, width);
+
+ uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+ unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+ return tmp1;
+}
+
+__device__
+inline
+int __shfl_up(int var, unsigned int lane_delta, int width = warpSize) {  // reads var from the lane lane_delta below the caller
+ int self = __lane_id();
+ int index = self - lane_delta;  // candidate source lane (unsigned difference converted back to int)
+ index = (index < (self & ~(width-1)))?self:index;  // source would fall before the start of this width-lane segment: use the caller's own lane
+ return __builtin_amdgcn_ds_bpermute(index<<2, var);  // the builtin is passed index * 4
+}
+__device__
+inline
+unsigned int __shfl_up(unsigned int var, unsigned int lane_delta, int width = warpSize) {
+ union { int i; unsigned u; float f; } tmp; tmp.u = var;
+ tmp.i = __shfl_up(tmp.i, lane_delta, width);
+ return tmp.u;
+}
+__device__
+inline
+float __shfl_up(float var, unsigned int lane_delta, int width = warpSize) {
+ union { int i; unsigned u; float f; } tmp; tmp.f = var;
+ tmp.i = __shfl_up(tmp.i, lane_delta, width);
+ return tmp.f;
+}
+__device__
+inline
+double __shfl_up(double var, unsigned int lane_delta, int width = warpSize) {
+ static_assert(sizeof(double) == 2 * sizeof(int), "");
+ static_assert(sizeof(double) == sizeof(uint64_t), "");
+
+ int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+ tmp[0] = __shfl_up(tmp[0], lane_delta, width);
+ tmp[1] = __shfl_up(tmp[1], lane_delta, width);
+
+ uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+ double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+ return tmp1;
+}
+__device__
+inline
+long __shfl_up(long var, unsigned int lane_delta, int width = warpSize)
+{
+ #ifndef _MSC_VER
+ static_assert(sizeof(long) == 2 * sizeof(int), "");
+ static_assert(sizeof(long) == sizeof(uint64_t), "");
+
+ int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+ tmp[0] = __shfl_up(tmp[0], lane_delta, width);
+ tmp[1] = __shfl_up(tmp[1], lane_delta, width);
+
+ uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+ long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+ return tmp1;
+ #else
+ static_assert(sizeof(long) == sizeof(int), "");
+ return static_cast<long>(__shfl_up(static_cast<int>(var), lane_delta, width));
+ #endif
+}
+
+__device__
+inline
+unsigned long __shfl_up(unsigned long var, unsigned int lane_delta, int width = warpSize)
+{
+ #ifndef _MSC_VER
+ static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
+ static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
+
+ unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+ tmp[0] = __shfl_up(tmp[0], lane_delta, width);
+ tmp[1] = __shfl_up(tmp[1], lane_delta, width);
+
+ uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+ unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+ return tmp1;
+ #else
+ static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
+ return static_cast<unsigned long>(__shfl_up(static_cast<unsigned int>(var), lane_delta, width));
+ #endif
+}
+
+__device__
+inline
+long long __shfl_up(long long var, unsigned int lane_delta, int width = warpSize)
+{
+ static_assert(sizeof(long long) == 2 * sizeof(int), "");
+ static_assert(sizeof(long long) == sizeof(uint64_t), "");
+ int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
+ tmp[0] = __shfl_up(tmp[0], lane_delta, width);
+ tmp[1] = __shfl_up(tmp[1], lane_delta, width);
+ uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
+ long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
+ return tmp1;
+}
+
+// Shuffle-up overload for `unsigned long long`: the 64-bit value is split
+// into two `unsigned int` words, each word is shuffled on its own, and the two
+// results are recombined.
+__device__
+inline
+unsigned long long __shfl_up(unsigned long long var, unsigned int lane_delta, int width = warpSize)
+{
+    static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
+    static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
+
+    unsigned int words[2];
+    __builtin_memcpy(words, &var, sizeof(words));
+    words[0] = __shfl_up(words[0], lane_delta, width);
+    words[1] = __shfl_up(words[1], lane_delta, width);
+
+    // words[1] supplies bits 63:32 and words[0] bits 31:0 of the result.
+    uint64_t packed = (static_cast<uint64_t>(words[1]) << 32ull) | static_cast<uint32_t>(words[0]);
+    unsigned long long result;
+    __builtin_memcpy(&result, &packed, sizeof(packed));
+    return result;
+}
+
+// Shuffle-down for `int`: each lane reads `var` from the lane `lane_delta`
+// positions above it. If the lane's position inside its `width`-sized group
+// plus `lane_delta` reaches `width`, the lane reads its own value instead.
+// The group position is computed with `& (width - 1)`, which assumes `width`
+// is a power of two.
+__device__
+inline
+int __shfl_down(int var, unsigned int lane_delta, int width = warpSize) {
+    const int lane = __lane_id();
+    const int reach = (int)((lane & (width - 1)) + lane_delta);
+    int source = lane + lane_delta;
+    if (reach >= width) {
+        source = lane;
+    }
+    // The source lane index is scaled by 4 before being passed to ds_bpermute.
+    return __builtin_amdgcn_ds_bpermute(source << 2, var);
+}
+// Shuffle-down overload for `unsigned int`: the bits are viewed as `int`
+// through a union, shuffled with the `int` overload, and read back as
+// `unsigned`.
+__device__
+inline
+unsigned int __shfl_down(unsigned int var, unsigned int lane_delta, int width = warpSize) {
+    union { int i; unsigned u; float f; } bits;
+    bits.u = var;
+    bits.i = __shfl_down(bits.i, lane_delta, width);
+    return bits.u;
+}
+// Shuffle-down overload for `float`: the bits are viewed as `int` through a
+// union, shuffled with the `int` overload, and read back as `float`.
+__device__
+inline
+float __shfl_down(float var, unsigned int lane_delta, int width = warpSize) {
+    union { int i; unsigned u; float f; } bits;
+    bits.f = var;
+    bits.i = __shfl_down(bits.i, lane_delta, width);
+    return bits.f;
+}
+// Shuffle-down overload for `double`: the 64-bit pattern is split into two
+// `int` words, each word is shuffled on its own, and the two results are
+// recombined into a `double`.
+__device__
+inline
+double __shfl_down(double var, unsigned int lane_delta, int width = warpSize) {
+    static_assert(sizeof(double) == sizeof(uint64_t), "");
+    static_assert(sizeof(double) == 2 * sizeof(int), "");
+
+    int words[2];
+    __builtin_memcpy(words, &var, sizeof(words));
+    words[0] = __shfl_down(words[0], lane_delta, width);
+    words[1] = __shfl_down(words[1], lane_delta, width);
+
+    // words[1] supplies bits 63:32; words[0] is zero-extended into bits 31:0.
+    uint64_t packed = (static_cast<uint64_t>(words[1]) << 32ull) | static_cast<uint32_t>(words[0]);
+    double result;
+    __builtin_memcpy(&result, &packed, sizeof(packed));
+    return result;
+}
+// Shuffle-down overload for `long`.
+// Outside MSVC the type is asserted to be 64 bits wide and is shuffled as two
+// `int` words; under MSVC it is asserted to be 32 bits wide and the call is
+// forwarded with the value cast to `int`.
+__device__
+inline
+long __shfl_down(long var, unsigned int lane_delta, int width = warpSize)
+{
+    #ifndef _MSC_VER
+    static_assert(sizeof(long) == sizeof(uint64_t), "");
+    static_assert(sizeof(long) == 2 * sizeof(int), "");
+
+    int words[2];
+    __builtin_memcpy(words, &var, sizeof(words));
+    words[0] = __shfl_down(words[0], lane_delta, width);
+    words[1] = __shfl_down(words[1], lane_delta, width);
+
+    // words[1] supplies bits 63:32; words[0] is zero-extended into bits 31:0.
+    uint64_t packed = (static_cast<uint64_t>(words[1]) << 32ull) | static_cast<uint32_t>(words[0]);
+    long result;
+    __builtin_memcpy(&result, &packed, sizeof(packed));
+    return result;
+    #else
+    static_assert(sizeof(long) == sizeof(int), "");
+    return static_cast<long>(__shfl_down(static_cast<int>(var), lane_delta, width));
+    #endif
+}
+// Shuffle-down overload for `unsigned long`.
+// Outside MSVC the type is asserted to be 64 bits wide and is shuffled as two
+// `unsigned int` words; under MSVC it is asserted to be 32 bits wide and the
+// call is forwarded with the value cast to `unsigned int`.
+__device__
+inline
+unsigned long __shfl_down(unsigned long var, unsigned int lane_delta, int width = warpSize)
+{
+    #ifndef _MSC_VER
+    static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
+    static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
+
+    unsigned int words[2];
+    __builtin_memcpy(words, &var, sizeof(words));
+    words[0] = __shfl_down(words[0], lane_delta, width);
+    words[1] = __shfl_down(words[1], lane_delta, width);
+
+    // words[1] supplies bits 63:32 and words[0] bits 31:0 of the result.
+    uint64_t packed = (static_cast<uint64_t>(words[1]) << 32ull) | static_cast<uint32_t>(words[0]);
+    unsigned long result;
+    __builtin_memcpy(&result, &packed, sizeof(packed));
+    return result;
+    #else
+    static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
+    return static_cast<unsigned long>(__shfl_down(static_cast<unsigned int>(var), lane_delta, width));
+    #endif
+}
+// Shuffle-down overload for `long long`: the 64-bit value is split into two
+// `int` words, each word is shuffled on its own, and the two results are
+// recombined.
+__device__
+inline
+long long __shfl_down(long long var, unsigned int lane_delta, int width = warpSize)
+{
+    static_assert(sizeof(long long) == sizeof(uint64_t), "");
+    static_assert(sizeof(long long) == 2 * sizeof(int), "");
+
+    int words[2];
+    __builtin_memcpy(words, &var, sizeof(words));
+    words[0] = __shfl_down(words[0], lane_delta, width);
+    words[1] = __shfl_down(words[1], lane_delta, width);
+
+    // words[1] supplies bits 63:32; words[0] is zero-extended into bits 31:0.
+    uint64_t packed = (static_cast<uint64_t>(words[1]) << 32ull) | static_cast<uint32_t>(words[0]);
+    long long result;
+    __builtin_memcpy(&result, &packed, sizeof(packed));
+    return result;
+}
+// Shuffle-down overload for `unsigned long long`: the 64-bit value is split
+// into two `unsigned int` words, each word is shuffled on its own, and the two
+// results are recombined.
+__device__
+inline
+unsigned long long __shfl_down(unsigned long long var, unsigned int lane_delta, int width = warpSize)
+{
+    static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
+    static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
+
+    unsigned int words[2];
+    __builtin_memcpy(words, &var, sizeof(words));
+    words[0] = __shfl_down(words[0], lane_delta, width);
+    words[1] = __shfl_down(words[1], lane_delta, width);
+
+    // words[1] supplies bits 63:32 and words[0] bits 31:0 of the result.
+    uint64_t packed = (static_cast<uint64_t>(words[1]) << 32ull) | static_cast<uint32_t>(words[0]);
+    unsigned long long result;
+    __builtin_memcpy(&result, &packed, sizeof(packed));
+    return result;
+}
+
+// Shuffle-xor for `int`: each lane reads `var` from the lane whose index is
+// its own index XOR `lane_mask`. If that partner index is at or beyond
+// `(lane + width) & ~(width - 1)`, the lane reads its own value instead.
+// The masking assumes `width` is a power of two.
+__device__
+inline
+int __shfl_xor(int var, int lane_mask, int width = warpSize) {
+    const int lane = __lane_id();
+    const int group_end = (lane + width) & ~(width - 1);
+    const int partner = lane ^ lane_mask;
+    const int source = (partner >= group_end) ? lane : partner;
+    // The source lane index is scaled by 4 before being passed to ds_bpermute.
+    return __builtin_amdgcn_ds_bpermute(source << 2, var);
+}
+// Shuffle-xor overload for `unsigned int`: the bits are viewed as `int`
+// through a union, shuffled with the `int` overload, and read back as
+// `unsigned`.
+__device__
+inline
+unsigned int __shfl_xor(unsigned int var, int lane_mask, int width = warpSize) {
+    union { int i; unsigned u; float f; } bits;
+    bits.u = var;
+    bits.i = __shfl_xor(bits.i, lane_mask, width);
+    return bits.u;
+}
+// Shuffle-xor overload for `float`: the bits are viewed as `int` through a
+// union, shuffled with the `int` overload, and read back as `float`.
+__device__
+inline
+float __shfl_xor(float var, int lane_mask, int width = warpSize) {
+    union { int i; unsigned u; float f; } bits;
+    bits.f = var;
+    bits.i = __shfl_xor(bits.i, lane_mask, width);
+    return bits.f;
+}
+// Shuffle-xor overload for `double`: the 64-bit pattern is split into two
+// `int` words, each word is shuffled on its own, and the two results are
+// recombined into a `double`.
+__device__
+inline
+double __shfl_xor(double var, int lane_mask, int width = warpSize) {
+    static_assert(sizeof(double) == sizeof(uint64_t), "");
+    static_assert(sizeof(double) == 2 * sizeof(int), "");
+
+    int words[2];
+    __builtin_memcpy(words, &var, sizeof(words));
+    words[0] = __shfl_xor(words[0], lane_mask, width);
+    words[1] = __shfl_xor(words[1], lane_mask, width);
+
+    // words[1] supplies bits 63:32; words[0] is zero-extended into bits 31:0.
+    uint64_t packed = (static_cast<uint64_t>(words[1]) << 32ull) | static_cast<uint32_t>(words[0]);
+    double result;
+    __builtin_memcpy(&result, &packed, sizeof(packed));
+    return result;
+}
+// Shuffle-xor overload for `long`.
+// Outside MSVC the type is asserted to be 64 bits wide and is shuffled as two
+// `int` words; under MSVC it is asserted to be 32 bits wide and the call is
+// forwarded with the value cast to `int`.
+__device__
+inline
+long __shfl_xor(long var, int lane_mask, int width = warpSize)
+{
+    #ifndef _MSC_VER
+    static_assert(sizeof(long) == sizeof(uint64_t), "");
+    static_assert(sizeof(long) == 2 * sizeof(int), "");
+
+    int words[2];
+    __builtin_memcpy(words, &var, sizeof(words));
+    words[0] = __shfl_xor(words[0], lane_mask, width);
+    words[1] = __shfl_xor(words[1], lane_mask, width);
+
+    // words[1] supplies bits 63:32; words[0] is zero-extended into bits 31:0.
+    uint64_t packed = (static_cast<uint64_t>(words[1]) << 32ull) | static_cast<uint32_t>(words[0]);
+    long result;
+    __builtin_memcpy(&result, &packed, sizeof(packed));
+    return result;
+    #else
+    static_assert(sizeof(long) == sizeof(int), "");
+    return static_cast<long>(__shfl_xor(static_cast<int>(var), lane_mask, width));
+    #endif
+}
+// Shuffle-xor overload for `unsigned long`.
+// Outside MSVC the type is asserted to be 64 bits wide and is shuffled as two
+// `unsigned int` words; under MSVC it is asserted to be 32 bits wide and the
+// call is forwarded with the value cast to `unsigned int`.
+__device__
+inline
+unsigned long __shfl_xor(unsigned long var, int lane_mask, int width = warpSize)
+{
+    #ifndef _MSC_VER
+    static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
+    static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
+
+    unsigned int words[2];
+    __builtin_memcpy(words, &var, sizeof(words));
+    words[0] = __shfl_xor(words[0], lane_mask, width);
+    words[1] = __shfl_xor(words[1], lane_mask, width);
+
+    // words[1] supplies bits 63:32 and words[0] bits 31:0 of the result.
+    uint64_t packed = (static_cast<uint64_t>(words[1]) << 32ull) | static_cast<uint32_t>(words[0]);
+    unsigned long result;
+    __builtin_memcpy(&result, &packed, sizeof(packed));
+    return result;
+    #else
+    static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
+    return static_cast<unsigned long>(__shfl_xor(static_cast<unsigned int>(var), lane_mask, width));
+    #endif
+}
+// Shuffle-xor overload for `long long`: the 64-bit value is split into two
+// `int` words, each word is shuffled on its own, and the two results are
+// recombined.
+__device__
+inline
+long long __shfl_xor(long long var, int lane_mask, int width = warpSize)
+{
+    static_assert(sizeof(long long) == sizeof(uint64_t), "");
+    static_assert(sizeof(long long) == 2 * sizeof(int), "");
+
+    int words[2];
+    __builtin_memcpy(words, &var, sizeof(words));
+    words[0] = __shfl_xor(words[0], lane_mask, width);
+    words[1] = __shfl_xor(words[1], lane_mask, width);
+
+    // words[1] supplies bits 63:32; words[0] is zero-extended into bits 31:0.
+    uint64_t packed = (static_cast<uint64_t>(words[1]) << 32ull) | static_cast<uint32_t>(words[0]);
+    long long result;
+    __builtin_memcpy(&result, &packed, sizeof(packed));
+    return result;
+}
+// Shuffle-xor overload for `unsigned long long`: the 64-bit value is split
+// into two `unsigned int` words, each word is shuffled on its own, and the two
+// results are recombined.
+__device__
+inline
+unsigned long long __shfl_xor(unsigned long long var, int lane_mask, int width = warpSize)
+{
+    static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
+    static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
+
+    unsigned int words[2];
+    __builtin_memcpy(words, &var, sizeof(words));
+    words[0] = __shfl_xor(words[0], lane_mask, width);
+    words[1] = __shfl_xor(words[1], lane_mask, width);
+
+    // words[1] supplies bits 63:32 and words[0] bits 31:0 of the result.
+    uint64_t packed = (static_cast<uint64_t>(words[1]) << 32ull) | static_cast<uint32_t>(words[0]);
+    unsigned long long result;
+    __builtin_memcpy(&result, &packed, sizeof(packed));
+    return result;
+}
+// Byte-lane masks for a 32-bit word, used by the __hip_hc_*8pk helpers:
+// MASK1 keeps bytes 0 and 2, MASK2 keeps bytes 1 and 3.
+#define MASK1 0x00ff00ff
+#define MASK2 0xff00ff00
+
+// Packed add on the `w` members of two char4 values. The MASK1 fields and the
+// MASK2 fields are summed separately and re-masked, so a carry out of one
+// field is dropped rather than propagated into its neighbour.
+// NOTE(review): only `.w` is read and written; `x`, `y` and `z` of the result
+// are left unset. Confirm this matches the intended layout of char4.
+__device__ static inline char4 __hip_hc_add8pk(char4 in1, char4 in2) {
+    unsigned lhs = in1.w & MASK1;
+    unsigned rhs = in2.w & MASK1;
+    const unsigned low_fields = (lhs + rhs) & MASK1;
+
+    lhs = in1.w & MASK2;
+    rhs = in2.w & MASK2;
+    const unsigned high_fields = (lhs + rhs) & MASK2;
+
+    char4 out;
+    out.w = low_fields;
+    out.w = out.w | high_fields;
+    return out;
+}
+
+// Packed subtract on the `w` members of two char4 values. The MASK1 fields
+// and the MASK2 fields are subtracted separately (in unsigned arithmetic) and
+// re-masked, so a borrow out of one field does not reach its neighbour.
+// NOTE(review): only `.w` is read and written; `x`, `y` and `z` of the result
+// are left unset. Confirm this matches the intended layout of char4.
+__device__ static inline char4 __hip_hc_sub8pk(char4 in1, char4 in2) {
+    unsigned lhs = in1.w & MASK1;
+    unsigned rhs = in2.w & MASK1;
+    const unsigned low_fields = (lhs - rhs) & MASK1;
+
+    lhs = in1.w & MASK2;
+    rhs = in2.w & MASK2;
+    const unsigned high_fields = (lhs - rhs) & MASK2;
+
+    char4 out;
+    out.w = low_fields;
+    out.w = out.w | high_fields;
+    return out;
+}
+
+// Packed multiply on the `w` members of two char4 values. The MASK1-masked
+// operands and the MASK2-masked operands are multiplied separately (in
+// unsigned arithmetic) and each product is re-masked before being merged.
+// NOTE(review): only `.w` is read and written; `x`, `y` and `z` of the result
+// are left unset. Confirm this matches the intended layout of char4.
+__device__ static inline char4 __hip_hc_mul8pk(char4 in1, char4 in2) {
+    unsigned lhs = in1.w & MASK1;
+    unsigned rhs = in2.w & MASK1;
+    const unsigned low_fields = (lhs * rhs) & MASK1;
+
+    lhs = in1.w & MASK2;
+    rhs = in2.w & MASK2;
+    const unsigned high_fields = (lhs * rhs) & MASK2;
+
+    char4 out;
+    out.w = low_fields;
+    out.w = out.w | high_fields;
+    return out;
+}
+
+/*
+ * Rounding modes are not yet supported in HIP
+ * TODO: Conversion functions are not correct, need to fix when BE is ready
+*/
+
+// double -> float conversions. The cast names the destination type so the
+// narrowing is explicit instead of happening implicitly at the return.
+// All four variants perform the same conversion: the directed-rounding
+// suffixes (_rd, _ru, _rz) are not applied here.
+__device__ static inline float __double2float_rd(double x) { return (float)x; }
+__device__ static inline float __double2float_rn(double x) { return (float)x; }
+__device__ static inline float __double2float_ru(double x) { return (float)x; }
+__device__ static inline float __double2float_rz(double x) { return (float)x; }
+
+// Returns the second `int`-sized word (element 1) of the object
+// representation of `x`, i.e. the word at the higher address.
+__device__ static inline int __double2hiint(double x) {
+    static_assert(sizeof(double) == 2 * sizeof(int), "");
+
+    int words[2];
+    __builtin_memcpy(words, &x, sizeof(words));
+    const int high_word = words[1];
+    return high_word;
+}
+// Returns the first `int`-sized word (element 0) of the object
+// representation of `x`, i.e. the word at the lower address.
+__device__ static inline int __double2loint(double x) {
+    static_assert(sizeof(double) == 2 * sizeof(int), "");
+
+    int words[2];
+    __builtin_memcpy(words, &x, sizeof(words));
+    const int low_word = words[0];
+    return low_word;
+}
+
+// double -> int conversions.
+// NOTE(review): all four variants perform the same plain cast; the _rd, _rn
+// and _ru suffixes are not reflected in the code (compare __float2int_*, which
+// rounds with __ocml_floor/rint/ceil/trunc before casting).
+__device__ static inline int __double2int_rd(double x) { return (int)x; }
+__device__ static inline int __double2int_rn(double x) { return (int)x; }
+__device__ static inline int __double2int_ru(double x) { return (int)x; }
+__device__ static inline int __double2int_rz(double x) { return (int)x; }
+
+// double -> long long conversions.
+// NOTE(review): all four variants perform the same plain cast; the _rd, _rn
+// and _ru suffixes are not reflected in the code.
+__device__ static inline long long int __double2ll_rd(double x) { return (long long int)x; }
+__device__ static inline long long int __double2ll_rn(double x) { return (long long int)x; }
+__device__ static inline long long int __double2ll_ru(double x) { return (long long int)x; }
+__device__ static inline long long int __double2ll_rz(double x) { return (long long int)x; }
+
+// double -> unsigned int conversions.
+// NOTE(review): all four variants perform the same plain cast; the _rd, _rn
+// and _ru suffixes are not reflected in the code.
+__device__ static inline unsigned int __double2uint_rd(double x) { return (unsigned int)x; }
+__device__ static inline unsigned int __double2uint_rn(double x) { return (unsigned int)x; }
+__device__ static inline unsigned int __double2uint_ru(double x) { return (unsigned int)x; }
+__device__ static inline unsigned int __double2uint_rz(double x) { return (unsigned int)x; }
+
+// double -> unsigned long long conversions.
+// NOTE(review): all four variants perform the same plain cast; the _rd, _rn
+// and _ru suffixes are not reflected in the code.
+__device__ static inline unsigned long long int __double2ull_rd(double x) {
+ return (unsigned long long int)x;
+}
+__device__ static inline unsigned long long int __double2ull_rn(double x) {
+ return (unsigned long long int)x;
+}
+__device__ static inline unsigned long long int __double2ull_ru(double x) {
+ return (unsigned long long int)x;
+}
+__device__ static inline unsigned long long int __double2ull_rz(double x) {
+ return (unsigned long long int)x;
+}
+
+// Reinterprets the bits of a double as a long long (bit copy, no numeric
+// conversion).
+__device__ static inline long long int __double_as_longlong(double x) {
+    static_assert(sizeof(long long) == sizeof(double), "");
+
+    long long bits;
+    __builtin_memcpy(&bits, &x, sizeof(bits));
+    return bits;
+}
+
+/*
+__device__ unsigned short __float2half_rn(float x);
+__device__ float __half2float(unsigned short);
+
+The device functions above are not valid in HIP.
+Use
+__device__ __half __float2half_rn(float x);
+__device__ float __half2float(__half);
+from hip_fp16.h instead.
+
+CUDA implements half as unsigned short, whereas HIP does not.
+
+*/
+
+// float -> int conversions: the value is first rounded in floating point with
+// the OCML routine named in each body (floor, rint, ceil, trunc) and the
+// rounded value is then cast to int.
+__device__ static inline int __float2int_rd(float x) { return static_cast<int>(__ocml_floor_f32(x)); }
+__device__ static inline int __float2int_rn(float x) { return static_cast<int>(__ocml_rint_f32(x)); }
+__device__ static inline int __float2int_ru(float x) { return static_cast<int>(__ocml_ceil_f32(x)); }
+__device__ static inline int __float2int_rz(float x) { return static_cast<int>(__ocml_trunc_f32(x)); }
+
+// float -> long long conversions. Each variant rounds in floating point with
+// the OCML routine matching its suffix (the same scheme __float2int_* uses)
+// and then casts the rounded value, so _rd, _rn and _ru no longer all
+// truncate toward zero:
+//   _rd: __ocml_floor_f32   _rn: __ocml_rint_f32
+//   _ru: __ocml_ceil_f32    _rz: __ocml_trunc_f32
+__device__ static inline long long int __float2ll_rd(float x) { return (long long int)__ocml_floor_f32(x); }
+__device__ static inline long long int __float2ll_rn(float x) { return (long long int)__ocml_rint_f32(x); }
+__device__ static inline long long int __float2ll_ru(float x) { return (long long int)__ocml_ceil_f32(x); }
+__device__ static inline long long int __float2ll_rz(float x) { return (long long int)__ocml_trunc_f32(x); }
+
+// float -> unsigned int conversions. Each variant rounds in floating point
+// with the OCML routine matching its suffix (the same scheme __float2int_*
+// uses) and then casts the rounded value, so _rd, _rn and _ru no longer all
+// truncate toward zero:
+//   _rd: __ocml_floor_f32   _rn: __ocml_rint_f32
+//   _ru: __ocml_ceil_f32    _rz: __ocml_trunc_f32
+__device__ static inline unsigned int __float2uint_rd(float x) { return (unsigned int)__ocml_floor_f32(x); }
+__device__ static inline unsigned int __float2uint_rn(float x) { return (unsigned int)__ocml_rint_f32(x); }
+__device__ static inline unsigned int __float2uint_ru(float x) { return (unsigned int)__ocml_ceil_f32(x); }
+__device__ static inline unsigned int __float2uint_rz(float x) { return (unsigned int)__ocml_trunc_f32(x); }
+
+// float -> unsigned long long conversions. Each variant rounds in floating
+// point with the OCML routine matching its suffix (the same scheme
+// __float2int_* uses) and then casts the rounded value, so _rd, _rn and _ru
+// no longer all truncate toward zero.
+__device__ static inline unsigned long long int __float2ull_rd(float x) {
+    return (unsigned long long int)__ocml_floor_f32(x);
+}
+__device__ static inline unsigned long long int __float2ull_rn(float x) {
+    return (unsigned long long int)__ocml_rint_f32(x);
+}
+__device__ static inline unsigned long long int __float2ull_ru(float x) {
+    return (unsigned long long int)__ocml_ceil_f32(x);
+}
+__device__ static inline unsigned long long int __float2ull_rz(float x) {
+    return (unsigned long long int)__ocml_trunc_f32(x);
+}
+
+// Reinterprets the bits of a float as an int (bit copy, no numeric
+// conversion).
+__device__ static inline int __float_as_int(float x) {
+    static_assert(sizeof(int) == sizeof(float), "");
+
+    int bits;
+    __builtin_memcpy(&bits, &x, sizeof(bits));
+    return bits;
+}
+
+// Reinterprets the bits of a float as an unsigned int (bit copy, no numeric
+// conversion).
+__device__ static inline unsigned int __float_as_uint(float x) {
+    static_assert(sizeof(unsigned int) == sizeof(float), "");
+
+    unsigned int bits;
+    __builtin_memcpy(&bits, &x, sizeof(bits));
+    return bits;
+}
+
+// Builds a double from two ints: `hi` becomes bits 63:32 and `lo` (zero
+// extended) becomes bits 31:0 of the 64-bit pattern that is then copied into
+// the result.
+__device__ static inline double __hiloint2double(int hi, int lo) {
+    static_assert(sizeof(double) == sizeof(uint64_t), "");
+
+    uint64_t packed = (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
+
+    double value;
+    __builtin_memcpy(&value, &packed, sizeof(packed));
+    return value;
+}
+
+// Converts an int to double with an ordinary cast.
+__device__ static inline double __int2double_rn(int x) { return static_cast<double>(x); }
+
+// int -> float conversions. All four variants perform the same ordinary
+// cast; the rounding-mode suffix does not change the code that runs.
+__device__ static inline float __int2float_rd(int x) { return static_cast<float>(x); }
+__device__ static inline float __int2float_rn(int x) { return static_cast<float>(x); }
+__device__ static inline float __int2float_ru(int x) { return static_cast<float>(x); }
+__device__ static inline float __int2float_rz(int x) { return static_cast<float>(x); }
+
+// Reinterprets the bits of an int as a float (bit copy, no numeric
+// conversion).
+__device__ static inline float __int_as_float(int x) {
+    static_assert(sizeof(float) == sizeof(int), "");
+
+    float value;
+    __builtin_memcpy(&value, &x, sizeof(value));
+    return value;
+}
+
+// long long -> double conversions. All four variants perform the same
+// ordinary cast; the rounding-mode suffix does not change the code that runs.
+__device__ static inline double __ll2double_rd(long long int x) { return static_cast<double>(x); }
+__device__ static inline double __ll2double_rn(long long int x) { return static_cast<double>(x); }
+__device__ static inline double __ll2double_ru(long long int x) { return static_cast<double>(x); }
+__device__ static inline double __ll2double_rz(long long int x) { return static_cast<double>(x); }
+
+// long long -> float conversions. All four variants perform the same
+// ordinary cast; the rounding-mode suffix does not change the code that runs.
+__device__ static inline float __ll2float_rd(long long int x) { return static_cast<float>(x); }
+__device__ static inline float __ll2float_rn(long long int x) { return static_cast<float>(x); }
+__device__ static inline float __ll2float_ru(long long int x) { return static_cast<float>(x); }
+__device__ static inline float __ll2float_rz(long long int x) { return static_cast<float>(x); }
+
+// Reinterprets the bits of a long long as a double (bit copy, no numeric
+// conversion).
+__device__ static inline double __longlong_as_double(long long int x) {
+    static_assert(sizeof(double) == sizeof(long long), "");
+
+    double value;
+    __builtin_memcpy(&value, &x, sizeof(value));
+    return value;
+}
+
+// Converts an unsigned int to double. The parameter is unsigned so that
+// values above INT_MAX convert to the matching positive double instead of
+// being reinterpreted as negative numbers first.
+__device__ static inline double __uint2double_rn(unsigned int x) { return (double)x; }
+
+// unsigned int -> float conversions. All four variants perform the same
+// ordinary cast; the rounding-mode suffix does not change the code that runs.
+__device__ static inline float __uint2float_rd(unsigned int x) { return static_cast<float>(x); }
+__device__ static inline float __uint2float_rn(unsigned int x) { return static_cast<float>(x); }
+__device__ static inline float __uint2float_ru(unsigned int x) { return static_cast<float>(x); }
+__device__ static inline float __uint2float_rz(unsigned int x) { return static_cast<float>(x); }
+
+// Reinterprets the bits of an unsigned int as a float (bit copy, no numeric
+// conversion).
+__device__ static inline float __uint_as_float(unsigned int x) {
+    static_assert(sizeof(float) == sizeof(unsigned int), "");
+
+    float value;
+    __builtin_memcpy(&value, &x, sizeof(value));
+    return value;
+}
+
+// unsigned long long -> double conversions. All four variants perform the
+// same ordinary cast; the rounding-mode suffix does not change the code that
+// runs.
+__device__ static inline double __ull2double_rd(unsigned long long int x) { return static_cast<double>(x); }
+__device__ static inline double __ull2double_rn(unsigned long long int x) { return static_cast<double>(x); }
+__device__ static inline double __ull2double_ru(unsigned long long int x) { return static_cast<double>(x); }
+__device__ static inline double __ull2double_rz(unsigned long long int x) { return static_cast<double>(x); }
+
+// unsigned long long -> float conversions. All four variants perform the
+// same ordinary cast; the rounding-mode suffix does not change the code that
+// runs.
+__device__ static inline float __ull2float_rd(unsigned long long int x) { return static_cast<float>(x); }
+__device__ static inline float __ull2float_rn(unsigned long long int x) { return static_cast<float>(x); }
+__device__ static inline float __ull2float_ru(unsigned long long int x) { return static_cast<float>(x); }
+__device__ static inline float __ull2float_rz(unsigned long long int x) { return static_cast<float>(x); }
+
+// __HCC_OR_HIP_CLANG__ is 1 when compiling with HCC (__HCC__ defined) or with
+// clang in HIP mode (__clang__ and __HIP__ both defined), and 0 otherwise.
+#if defined(__HCC__)
+#define __HCC_OR_HIP_CLANG__ 1
+#elif defined(__clang__) && defined(__HIP__)
+#define __HCC_OR_HIP_CLANG__ 1
+#else
+#define __HCC_OR_HIP_CLANG__ 0
+#endif
+
+#if __HCC_OR_HIP_CLANG__
+
+// Clock functions (declarations; the inline definitions follow under
+// __HIP_DEVICE_COMPILE__).
+__device__ long long int __clock64();
+__device__ long long int __clock();
+__device__ long long int clock64();
+__device__ long long int clock();
+// hip.amdgcn.bc - named sync (declaration; defined under
+// __HIP_DEVICE_COMPILE__).
+__device__ void __named_sync(int a, int b);
+
+#ifdef __HIP_DEVICE_COMPILE__
+
+// Clock functions
+#if __HCC__
+extern "C" uint64_t __clock_u64() __HC__;
+#endif
+
+// Returns the value of __builtin_readcyclecounter() as a long long.
+__device__
+inline __attribute__((always_inline))
+long long int __clock64() {
+    return static_cast<long long int>(__builtin_readcyclecounter());
+}
+
+// Same counter as __clock64(); simply forwards to it.
+__device__
+inline __attribute__((always_inline))
+long long int __clock() {
+    return __clock64();
+}
+
+// Unprefixed alias: forwards to __clock64().
+__device__
+inline __attribute__((always_inline))
+long long int clock64() {
+    return __clock64();
+}
+
+// Unprefixed alias: forwards to __clock().
+__device__
+inline __attribute__((always_inline))
+long long int clock() {
+    return __clock();
+}
+
+// hip.amdgcn.bc - named sync
+// Executes __builtin_amdgcn_s_barrier(); both arguments are ignored.
+__device__
+inline
+void __named_sync(int a, int b) {
+    __builtin_amdgcn_s_barrier();
+}
+
+#endif // __HIP_DEVICE_COMPILE__
+
+// warp vote function __all __any __ballot
+// __all: forwards `predicate` to __ockl_wfall_i32 and returns its result as
+// an int.
+__device__
+inline
+int __all(int predicate) {
+    const int verdict = __ockl_wfall_i32(predicate);
+    return verdict;
+}
+
+// __any: forwards `predicate` to __ockl_wfany_i32 and returns its result as
+// an int.
+__device__
+inline
+int __any(int predicate) {
+    const int verdict = __ockl_wfany_i32(predicate);
+    return verdict;
+}
+
+// XXX from llvm/include/llvm/IR/InstrTypes.h
+#define ICMP_NE 33
+
+// Returns the result of __builtin_amdgcn_uicmp comparing `predicate` against
+// 0 with the ICMP_NE ("not equal") condition code.
+// Presumably one result bit per lane of the wavefront; see the
+// llvm.amdgcn.icmp intrinsic documentation to confirm.
+__device__
+inline
+unsigned long long int __ballot(int predicate) {
+    const unsigned long long int votes = __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
+    return votes;
+}
+
+// Same computation as __ballot(): the result of __builtin_amdgcn_uicmp
+// comparing `predicate` against 0 with ICMP_NE.
+__device__
+inline
+unsigned long long int __ballot64(int predicate) {
+    const unsigned long long int votes = __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
+    return votes;
+}
+
+// hip.amdgcn.bc - lanemask
+// Returns the bits of __ballot64(1) that belong to lanes with an index above
+// the calling lane's index.
+__device__
+inline
+uint64_t __lanemask_gt()
+{
+    const uint32_t lane = __ockl_lane_u32();
+    // Lane 63 has no higher lanes; returning early also avoids a shift by 64.
+    if (lane == 63) {
+        return 0;
+    }
+    const uint64_t active = __ballot64(1);
+    const uint64_t above = (~((uint64_t)0)) << (lane + 1);
+    return above & active;
+}
+
+// Returns the bits of __ballot64(1) that belong to lanes with an index below
+// the calling lane's index.
+__device__
+inline
+uint64_t __lanemask_lt()
+{
+    uint32_t lane = __ockl_lane_u32();
+    // Hold the ballot in an unsigned 64-bit value, as __lanemask_gt does: it
+    // is a bit mask, and bit 63 may be set.
+    uint64_t ballot = __ballot64(1);
+    uint64_t mask = ((uint64_t)1 << lane) - (uint64_t)1;
+    return mask & ballot;
+}
+
+// Returns a mask with only the calling lane's bit set (1 << lane index).
+__device__
+inline
+uint64_t __lanemask_eq()
+{
+    uint32_t lane = __ockl_lane_u32();
+    // Unsigned on purpose: for lane 63 the mask has bit 63 set, which does
+    // not fit in a signed 64-bit integer.
+    uint64_t mask = ((uint64_t)1 << lane);
+    return mask;
+}
+
+
+// Returns `p` unchanged.
+__device__ inline void* __local_to_generic(void* p) {
+    return p;
+}
+
+// __get_dynamicgroupbaseptr: defined inline for device compilation, declared
+// only otherwise.
+// The device definition passes __llvm_amdgcn_groupstaticsize() through
+// __to_local and __local_to_generic and returns the resulting pointer.
+// Presumably this is the first byte after the statically sized group memory,
+// i.e. the start of the dynamic part; confirm against the runtime.
+#ifdef __HIP_DEVICE_COMPILE__
+__device__
+inline
+void* __get_dynamicgroupbaseptr()
+{
+ // Get group segment base pointer.
+ return (char*)__local_to_generic((void*)__to_local(__llvm_amdgcn_groupstaticsize()));
+}
+#else
+__device__
+void* __get_dynamicgroupbaseptr();
+#endif // __HIP_DEVICE_COMPILE__
+
+// Alias for __get_dynamicgroupbaseptr(); used by the HIP_DYNAMIC_SHARED macro
+// in the hip-clang branch of this header.
+__device__
+inline
+void *__amdgcn_get_dynamicgroupbaseptr() {
+    void* base = __get_dynamicgroupbaseptr();
+    return base;
+}
+
+#if defined(__HCC__) && (__hcc_major__ < 3) && (__hcc_minor__ < 3)
+// hip.amdgcn.bc - sync threads
+// Flag value passed by __syncthreads() to __barrier().
+#define __CLK_LOCAL_MEM_FENCE 0x01
+// Bit-flag type taken by __atomic_work_item_fence / __work_group_barrier.
+typedef unsigned __cl_mem_fence_flags;
+
+// Fence scopes; each enumerator takes the value of the corresponding
+// __OPENCL_MEMORY_SCOPE_* macro.
+typedef enum __memory_scope {
+ __memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM,
+ __memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP,
+ __memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE,
+ __memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES,
+ __memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP
+} __memory_scope;
+
+// enum values aligned with what clang uses in EmitAtomicExpr()
+// (each enumerator takes the value of the corresponding __ATOMIC_* macro).
+typedef enum __memory_order
+{
+ __memory_order_relaxed = __ATOMIC_RELAXED,
+ __memory_order_acquire = __ATOMIC_ACQUIRE,
+ __memory_order_release = __ATOMIC_RELEASE,
+ __memory_order_acq_rel = __ATOMIC_ACQ_REL,
+ __memory_order_seq_cst = __ATOMIC_SEQ_CST
+} __memory_order;
+
+// Emits the fence intrinsic selected by (`scope`, `order`).
+// A relaxed order, or the work-item scope, emits nothing. `flags` is accepted
+// but never consulted: the same fence is issued whatever its value.
+__device__
+inline
+static void
+__atomic_work_item_fence(__cl_mem_fence_flags flags, __memory_order order, __memory_scope scope)
+{
+    // We're tying global-happens-before and local-happens-before together as does HSA
+    if (order == __memory_order_relaxed) {
+        return;
+    }
+
+    switch (scope) {
+    case __memory_scope_work_item:
+        break;
+
+    case __memory_scope_sub_group:
+        switch (order) {
+        case __memory_order_relaxed: break;
+        case __memory_order_acquire: __llvm_fence_acq_sg(); break;
+        case __memory_order_release: __llvm_fence_rel_sg(); break;
+        case __memory_order_acq_rel: __llvm_fence_ar_sg(); break;
+        case __memory_order_seq_cst: __llvm_fence_sc_sg(); break;
+        }
+        break;
+
+    case __memory_scope_work_group:
+        switch (order) {
+        case __memory_order_relaxed: break;
+        case __memory_order_acquire: __llvm_fence_acq_wg(); break;
+        case __memory_order_release: __llvm_fence_rel_wg(); break;
+        case __memory_order_acq_rel: __llvm_fence_ar_wg(); break;
+        case __memory_order_seq_cst: __llvm_fence_sc_wg(); break;
+        }
+        break;
+
+    case __memory_scope_device:
+        switch (order) {
+        case __memory_order_relaxed: break;
+        case __memory_order_acquire: __llvm_fence_acq_dev(); break;
+        case __memory_order_release: __llvm_fence_rel_dev(); break;
+        case __memory_order_acq_rel: __llvm_fence_ar_dev(); break;
+        case __memory_order_seq_cst: __llvm_fence_sc_dev(); break;
+        }
+        break;
+
+    case __memory_scope_all_svm_devices:
+        switch (order) {
+        case __memory_order_relaxed: break;
+        case __memory_order_acquire: __llvm_fence_acq_sys(); break;
+        case __memory_order_release: __llvm_fence_rel_sys(); break;
+        case __memory_order_acq_rel: __llvm_fence_ar_sys(); break;
+        case __memory_order_seq_cst: __llvm_fence_sc_sys(); break;
+        }
+        break;
+    }
+}
+#endif
+
+// Memory Fence Functions
+// __threadfence: sequentially consistent fence at device scope.
+__device__ inline static void __threadfence()
+{
+    __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_device);
+}
+
+// __threadfence_block: sequentially consistent fence at work-group scope.
+__device__ inline static void __threadfence_block()
+{
+    __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_work_group);
+}
+
+// __threadfence_system: sequentially consistent fence at all-SVM-devices
+// scope.
+__device__ inline static void __threadfence_system()
+{
+    __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_all_svm_devices);
+}
+
+// abort
+// Device-side abort(): executes __builtin_trap(). Declared weak.
+__device__
+inline
+__attribute__((weak))
+void abort() {
+    __builtin_trap();
+}
+
+
+#endif // __HCC_OR_HIP_CLANG__
+
+#ifdef __HCC__
+
+/**
+ * extern __shared__
+ */
+
+// Macro to replace extern __shared__ declarations
+// to local variable definitions
+#define HIP_DYNAMIC_SHARED(type, var) type* var = (type*)__get_dynamicgroupbaseptr();
+
+#define HIP_DYNAMIC_SHARED_ATTRIBUTE
+
+
+#elif defined(__clang__) && defined(__HIP__)
+
+// The noinline attribute helps encapsulate the printf expansion,
+// which otherwise has a performance impact just by increasing the
+// size of the calling function. Additionally, the weak attribute
+// allows the function to exist as a global although its definition is
+// included in every compilation unit.
+#if defined(_WIN32) || defined(_WIN64)
+// Windows device-side assertion handler: the message, file and line
+// arguments are currently ignored and the function only traps.
+extern "C" __device__ __attribute__((noinline)) __attribute__((weak))
+void _wassert(const wchar_t *_msg, const wchar_t *_file, unsigned _line) {
+ // FIXME: Need `wchar_t` support to generate assertion message.
+ __builtin_trap();
+}
+#else /* defined(_WIN32) || defined(_WIN64) */
+// Device-side assertion handler: prints the file, line, function and the
+// failed assertion text with printf, then traps.
+extern "C" __device__ __attribute__((noinline)) __attribute__((weak))
+void __assert_fail(const char * __assertion,
+ const char *__file,
+ unsigned int __line,
+ const char *__function)
+{
+ printf("%s:%u: %s: Device-side assertion `%s' failed.\n", __file, __line,
+ __function, __assertion);
+ __builtin_trap();
+}
+
+// Variant of the assertion handler that takes an extra `charsize` argument.
+// It prints nothing: every argument is ignored and the function only traps.
+extern "C" __device__ __attribute__((noinline)) __attribute__((weak))
+void __assertfail(const char * __assertion,
+ const char *__file,
+ unsigned int __line,
+ const char *__function,
+ size_t charsize)
+{
+ // ignore all the args for now.
+ __builtin_trap();
+}
+#endif /* defined(_WIN32) || defined(_WIN64) */
+
+// Executes __builtin_amdgcn_s_barrier(). When `flags` is non-zero the barrier
+// is bracketed by a release fence before it and an acquire fence after it,
+// both at `scope`; when `flags` is zero only the barrier is executed.
+__device__
+inline
+static void __work_group_barrier(__cl_mem_fence_flags flags, __memory_scope scope)
+{
+    if (!flags) {
+        __builtin_amdgcn_s_barrier();
+        return;
+    }
+
+    __atomic_work_item_fence(flags, __memory_order_release, scope);
+    __builtin_amdgcn_s_barrier();
+    __atomic_work_item_fence(flags, __memory_order_acquire, scope);
+}
+
+// Work-group barrier: `n` is converted to fence flags and passed to
+// __work_group_barrier with work-group scope.
+__device__
+inline
+static void __barrier(int n)
+{
+    const __cl_mem_fence_flags fence_flags = (__cl_mem_fence_flags)n;
+    __work_group_barrier(fence_flags, __memory_scope_work_group);
+}
+
+// __syncthreads: calls __barrier with the __CLK_LOCAL_MEM_FENCE flag.
+__device__
+inline
+__attribute__((convergent))
+void __syncthreads() {
+    __barrier(__CLK_LOCAL_MEM_FENCE);
+}
+
+// Normalizes `predicate` to 0 or 1 and returns __ockl_wgred_add_i32 of that
+// flag (presumably a work-group-wide sum, i.e. the number of threads with a
+// non-zero predicate; confirm against the OCKL documentation).
+__device__
+inline
+__attribute__((convergent))
+int __syncthreads_count(int predicate)
+{
+    const int flag = !!predicate;
+    return __ockl_wgred_add_i32(flag);
+}
+
+// Normalizes `predicate` to 0 or 1 and returns __ockl_wgred_and_i32 of that
+// flag (presumably a work-group-wide AND; confirm against the OCKL
+// documentation).
+__device__
+inline
+__attribute__((convergent))
+int __syncthreads_and(int predicate)
+{
+    const int flag = !!predicate;
+    return __ockl_wgred_and_i32(flag);
+}
+
+// Normalizes `predicate` to 0 or 1 and returns __ockl_wgred_or_i32 of that
+// flag (presumably a work-group-wide OR; confirm against the OCKL
+// documentation).
+__device__
+inline
+__attribute__((convergent))
+int __syncthreads_or(int predicate)
+{
+    const int flag = !!predicate;
+    return __ockl_wgred_or_i32(flag);
+}
+
+// hip.amdgcn.bc - device routine
+/*
+ HW_ID Register bit structure
+ WAVE_ID 3:0 Wave buffer slot number. 0-9.
+ SIMD_ID 5:4 SIMD which the wave is assigned to within the CU.
+ PIPE_ID 7:6 Pipeline from which the wave was dispatched.
+ CU_ID 11:8 Compute Unit the wave is assigned to.
+ SH_ID 12 Shader Array (within an SE) the wave is assigned to.
+ SE_ID 14:13 Shader Engine the wave is assigned to.
+ TG_ID 19:16 Thread-group ID
+ VM_ID 23:20 Virtual Memory ID
+ QUEUE_ID 26:24 Queue from which this wave was dispatched.
+ STATE_ID 29:27 State ID (graphics only, not compute).
+ ME_ID 31:30 Micro-engine ID.
+ */
+
+#define HW_ID 4
+
+#define HW_ID_CU_ID_SIZE 4
+#define HW_ID_CU_ID_OFFSET 8
+
+#define HW_ID_SE_ID_SIZE 2
+#define HW_ID_SE_ID_OFFSET 13
+
+/*
+ Encoding of parameter bitmask
+ HW_ID 5:0 HW_ID
+ OFFSET 10:6 Range: 0..31
+ SIZE 15:11 Range: 1..32
+ */
+
+#define GETREG_IMMED(SZ,OFF,REG) (((SZ) << 11) | ((OFF) << 6) | (REG))
+
+/*
+ __smid returns the wave's assigned Compute Unit and Shader Engine.
+ The Compute Unit, CU_ID returned in bits 3:0, and Shader Engine, SE_ID in bits 5:4.
+ Note: the results vary over time.
+ SZ minus 1 since SIZE is 1-based.
+*/
+// Reads the CU_ID and SE_ID fields of the HW_ID register with s_getreg and
+// returns them packed as (SE_ID << HW_ID_CU_ID_SIZE) + CU_ID.
+__device__
+inline
+unsigned __smid(void)
+{
+    const unsigned compute_unit = __builtin_amdgcn_s_getreg(
+        GETREG_IMMED(HW_ID_CU_ID_SIZE-1, HW_ID_CU_ID_OFFSET, HW_ID));
+    const unsigned shader_engine = __builtin_amdgcn_s_getreg(
+        GETREG_IMMED(HW_ID_SE_ID_SIZE-1, HW_ID_SE_ID_OFFSET, HW_ID));
+
+    /* Each shader engine has 16 CU, so SE_ID sits above the 4-bit CU_ID. */
+    const unsigned packed = (shader_engine << HW_ID_CU_ID_SIZE) + compute_unit;
+    return packed;
+}
+
+// Macro to replace extern __shared__ declarations
+// to local variable definitions
+#define HIP_DYNAMIC_SHARED(type, var) \
+ type* var = (type*)__amdgcn_get_dynamicgroupbaseptr();
+
+#define HIP_DYNAMIC_SHARED_ATTRIBUTE
+
+
+#endif //defined(__clang__) && defined(__HIP__)
+
+
+// loop unrolling
+// Byte-wise copy of `size` bytes from `src` to `dst`, unrolled four bytes per
+// iteration, with the remaining 0-3 bytes copied afterwards. Returns `dst`.
+static inline __device__ void* __hip_hc_memcpy(void* dst, const void* src, size_t size) {
+    auto out = static_cast<unsigned char*>(dst);
+    auto in = static_cast<const unsigned char*>(src);
+
+    for (; size >= 4u; size -= 4u, in += 4u, out += 4u) {
+        out[0] = in[0];
+        out[1] = in[1];
+        out[2] = in[2];
+        out[3] = in[3];
+    }
+
+    // Tail: at most three bytes remain; copy them from the highest index down.
+    if (size == 3) {
+        out[2] = in[2];
+    }
+    if (size >= 2) {
+        out[1] = in[1];
+    }
+    if (size >= 1) {
+        out[0] = in[0];
+    }
+
+    return dst;
+}
+
+// Byte-wise fill of `size` bytes at `dst` with `val`, unrolled four bytes per
+// iteration, with the remaining 0-3 bytes written afterwards. Returns `dst`.
+static inline __device__ void* __hip_hc_memset(void* dst, unsigned char val, size_t size) {
+    auto out = static_cast<unsigned char*>(dst);
+
+    for (; size >= 4u; size -= 4u, out += 4u) {
+        out[0] = val;
+        out[1] = val;
+        out[2] = val;
+        out[3] = val;
+    }
+
+    // Tail: at most three bytes remain; write them from the highest index down.
+    if (size == 3) {
+        out[2] = val;
+    }
+    if (size >= 2) {
+        out[1] = val;
+    }
+    if (size >= 1) {
+        out[0] = val;
+    }
+
+    return dst;
+}
+#ifndef __OPENMP_AMDGCN__
+// Device-side memcpy: forwards to __hip_hc_memcpy and returns its result.
+static inline __device__ void* memcpy(void* dst, const void* src, size_t size) {
+    void* copied_to = __hip_hc_memcpy(dst, src, size);
+    return copied_to;
+}
+
+// Device-side memset: `val` is converted to unsigned char and the fill is
+// done by __hip_hc_memset, whose result is returned.
+static inline __device__ void* memset(void* ptr, int val, size_t size) {
+    return __hip_hc_memset(ptr, static_cast<unsigned char>(val), size);
+}
+#endif // !__OPENMP_AMDGCN__
+#endif
diff --git a/third_party/rocm/include/hip/hcc_detail/device_library_decls.h b/third_party/rocm/include/hip/hcc_detail/device_library_decls.h
new file mode 100644
index 0000000..90aef16
--- /dev/null
+++ b/third_party/rocm/include/hip/hcc_detail/device_library_decls.h
@@ -0,0 +1,139 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/**
+ * @file hcc_detail/device_library_decls.h
+ * @brief Contains declarations for types and functions in device library.
+ */
+
+#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_DEVICE_LIBRARY_DECLS_H
+#define HIP_INCLUDE_HIP_HCC_DETAIL_DEVICE_LIBRARY_DECLS_H
+
+#include "hip/hcc_detail/host_defines.h"
+
+typedef unsigned char uchar;
+typedef unsigned short ushort;
+typedef unsigned int uint;
+typedef unsigned long ulong;
+typedef unsigned long long ullong;
+
+extern "C" __device__ __attribute__((const)) bool __ockl_wfany_i32(int);
+extern "C" __device__ __attribute__((const)) bool __ockl_wfall_i32(int);
+extern "C" __device__ uint __ockl_activelane_u32(void);
+
+extern "C" __device__ __attribute__((const)) uint __ockl_mul24_u32(uint, uint);
+extern "C" __device__ __attribute__((const)) int __ockl_mul24_i32(int, int);
+extern "C" __device__ __attribute__((const)) uint __ockl_mul_hi_u32(uint, uint);
+extern "C" __device__ __attribute__((const)) int __ockl_mul_hi_i32(int, int);
+extern "C" __device__ __attribute__((const)) uint __ockl_sadd_u32(uint, uint, uint);
+
+extern "C" __device__ __attribute__((const)) uchar __ockl_clz_u8(uchar);
+extern "C" __device__ __attribute__((const)) ushort __ockl_clz_u16(ushort);
+extern "C" __device__ __attribute__((const)) uint __ockl_clz_u32(uint);
+extern "C" __device__ __attribute__((const)) ullong __ockl_clz_u64(ullong);
+
+extern "C" __device__ __attribute__((const)) float __ocml_floor_f32(float);
+extern "C" __device__ __attribute__((const)) float __ocml_rint_f32(float);
+extern "C" __device__ __attribute__((const)) float __ocml_ceil_f32(float);
+extern "C" __device__ __attribute__((const)) float __ocml_trunc_f32(float);
+
+extern "C" __device__ __attribute__((const)) float __ocml_fmin_f32(float, float);
+extern "C" __device__ __attribute__((const)) float __ocml_fmax_f32(float, float);
+
+extern "C" __device__ __attribute__((convergent)) void __ockl_gws_init(uint nwm1, uint rid);
+extern "C" __device__ __attribute__((convergent)) void __ockl_gws_barrier(uint nwm1, uint rid);
+
+extern "C" __device__ __attribute__((const)) uint32_t __ockl_lane_u32();
+extern "C" __device__ __attribute__((const)) int __ockl_grid_is_valid(void);
+extern "C" __device__ __attribute__((convergent)) void __ockl_grid_sync(void);
+extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_num_grids(void);
+extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_grid_rank(void);
+extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_size(void);
+extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_thread_rank(void);
+extern "C" __device__ __attribute__((const)) int __ockl_multi_grid_is_valid(void);
+extern "C" __device__ __attribute__((convergent)) void __ockl_multi_grid_sync(void);
+
+extern "C" __device__ void __ockl_atomic_add_noret_f32(float*, float);
+
+extern "C" __device__ __attribute__((convergent)) int __ockl_wgred_add_i32(int a);
+extern "C" __device__ __attribute__((convergent)) int __ockl_wgred_and_i32(int a);
+extern "C" __device__ __attribute__((convergent)) int __ockl_wgred_or_i32(int a);
+
+
+// Introduce local address space: __local tags a declaration with address_space(3).
+#define __local __attribute__((address_space(3)))
+// __to_local: casts the unsigned value x to a __local void*; only compiled when __HIP_DEVICE_COMPILE__ is defined.
+#ifdef __HIP_DEVICE_COMPILE__
+__device__ inline static __local void* __to_local(unsigned x) { return (__local void*)x; }
+#endif //__HIP_DEVICE_COMPILE__
+
+#if defined(__HCC__) && (__hcc_major__ < 3) && (__hcc_minor__ < 3)
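+// __llvm_fence* functions from device-libs/irif/src/fence.ll. NOTE(review): the guard above tests major < 3 and minor < 3 independently (e.g. 2.5 fails it) - confirm this is intended.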
+extern "C" __device__ void __llvm_fence_acq_sg(void);
+extern "C" __device__ void __llvm_fence_acq_wg(void);
+extern "C" __device__ void __llvm_fence_acq_dev(void);
+extern "C" __device__ void __llvm_fence_acq_sys(void);
+
+extern "C" __device__ void __llvm_fence_rel_sg(void);
+extern "C" __device__ void __llvm_fence_rel_wg(void);
+extern "C" __device__ void __llvm_fence_rel_dev(void);
+extern "C" __device__ void __llvm_fence_rel_sys(void);
+
+extern "C" __device__ void __llvm_fence_ar_sg(void);
+extern "C" __device__ void __llvm_fence_ar_wg(void);
+extern "C" __device__ void __llvm_fence_ar_dev(void);
+extern "C" __device__ void __llvm_fence_ar_sys(void);
+
+
+extern "C" __device__ void __llvm_fence_sc_sg(void);
+extern "C" __device__ void __llvm_fence_sc_wg(void);
+extern "C" __device__ void __llvm_fence_sc_dev(void);
+extern "C" __device__ void __llvm_fence_sc_sys(void);
+#else
+// Using hip.amdgcn.bc - sync threads
+#define __CLK_LOCAL_MEM_FENCE 0x01
+typedef unsigned __cl_mem_fence_flags;
+
+typedef enum __memory_scope {
+ __memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM,
+ __memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP,
+ __memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE,
+ __memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES,
+ __memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP
+} __memory_scope;
+
+// enum values aligned with what clang uses in EmitAtomicExpr()
+typedef enum __memory_order
+{
+ __memory_order_relaxed = __ATOMIC_RELAXED,
+ __memory_order_acquire = __ATOMIC_ACQUIRE,
+ __memory_order_release = __ATOMIC_RELEASE,
+ __memory_order_acq_rel = __ATOMIC_ACQ_REL,
+ __memory_order_seq_cst = __ATOMIC_SEQ_CST
+} __memory_order;
+
+// Linked from hip.amdgcn.bc
+extern "C" __device__ void
+__atomic_work_item_fence(__cl_mem_fence_flags, __memory_order, __memory_scope);
+#endif
+
+#endif
diff --git a/third_party/rocm/include/hip/hcc_detail/driver_types.h b/third_party/rocm/include/hip/hcc_detail/driver_types.h
new file mode 100644
index 0000000..7db78e5
--- /dev/null
+++ b/third_party/rocm/include/hip/hcc_detail/driver_types.h
@@ -0,0 +1,466 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_DRIVER_TYPES_H
+#define HIP_INCLUDE_HIP_HCC_DETAIL_DRIVER_TYPES_H
+
+#ifndef __cplusplus
+#include <stdbool.h>
+#endif
+
+typedef void* hipDeviceptr_t;
+typedef enum hipChannelFormatKind {
+ hipChannelFormatKindSigned = 0,
+ hipChannelFormatKindUnsigned = 1,
+ hipChannelFormatKindFloat = 2,
+ hipChannelFormatKindNone = 3
+}hipChannelFormatKind;
+
+typedef struct hipChannelFormatDesc {
+ int x;
+ int y;
+ int z;
+ int w;
+ enum hipChannelFormatKind f;
+}hipChannelFormatDesc;
+
+#define HIP_TRSA_OVERRIDE_FORMAT 0x01
+#define HIP_TRSF_READ_AS_INTEGER 0x01
+#define HIP_TRSF_NORMALIZED_COORDINATES 0x02
+#define HIP_TRSF_SRGB 0x10
+
+typedef enum hipArray_Format {
+ HIP_AD_FORMAT_UNSIGNED_INT8 = 0x01,
+ HIP_AD_FORMAT_UNSIGNED_INT16 = 0x02,
+ HIP_AD_FORMAT_UNSIGNED_INT32 = 0x03,
+ HIP_AD_FORMAT_SIGNED_INT8 = 0x08,
+ HIP_AD_FORMAT_SIGNED_INT16 = 0x09,
+ HIP_AD_FORMAT_SIGNED_INT32 = 0x0a,
+ HIP_AD_FORMAT_HALF = 0x10,
+ HIP_AD_FORMAT_FLOAT = 0x20
+}hipArray_Format;
+
+typedef struct HIP_ARRAY_DESCRIPTOR {
+ size_t Width;
+ size_t Height;
+ enum hipArray_Format Format;
+ unsigned int NumChannels;
+}HIP_ARRAY_DESCRIPTOR;
+
+typedef struct HIP_ARRAY3D_DESCRIPTOR {
+ size_t Width;
+ size_t Height;
+ size_t Depth;
+ enum hipArray_Format Format;
+ unsigned int NumChannels;
+ unsigned int Flags;
+}HIP_ARRAY3D_DESCRIPTOR;
+
+typedef struct hipArray {
+ void* data; // FIXME: generalize this
+ struct hipChannelFormatDesc desc;
+ unsigned int type;
+ unsigned int width;
+ unsigned int height;
+ unsigned int depth;
+ enum hipArray_Format Format;
+ unsigned int NumChannels;
+ bool isDrv;
+ unsigned int textureType;
+}hipArray;
+
+typedef struct hip_Memcpy2D {
+ size_t srcXInBytes;
+ size_t srcY;
+ hipMemoryType srcMemoryType;
+ const void* srcHost;
+ hipDeviceptr_t srcDevice;
+ hipArray* srcArray;
+ size_t srcPitch;
+ size_t dstXInBytes;
+ size_t dstY;
+ hipMemoryType dstMemoryType;
+ void* dstHost;
+ hipDeviceptr_t dstDevice;
+ hipArray* dstArray;
+ size_t dstPitch;
+ size_t WidthInBytes;
+ size_t Height;
+} hip_Memcpy2D;
+
+
+typedef struct hipArray* hipArray_t;
+typedef hipArray_t hiparray;
+typedef const struct hipArray* hipArray_const_t;
+
+// TODO: It needs to be modified since it was just copied from hipArray.
+struct hipMipmappedArray {
+ void* data; // FIXME: generalize this
+ struct hipChannelFormatDesc desc;
+ unsigned int width;
+ unsigned int height;
+ unsigned int depth;
+};
+
+typedef struct hipMipmappedArray* hipMipmappedArray_t;
+
+typedef const struct hipMipmappedArray* hipMipmappedArray_const_t;
+
+/**
+ * hip resource types
+ */
+typedef enum hipResourceType {
+ hipResourceTypeArray = 0x00,
+ hipResourceTypeMipmappedArray = 0x01,
+ hipResourceTypeLinear = 0x02,
+ hipResourceTypePitch2D = 0x03
+}hipResourceType;
+
+typedef enum HIPresourcetype_enum {
+ HIP_RESOURCE_TYPE_ARRAY = 0x00, /**< Array resource */
+ HIP_RESOURCE_TYPE_MIPMAPPED_ARRAY = 0x01, /**< Mipmapped array resource */
+ HIP_RESOURCE_TYPE_LINEAR = 0x02, /**< Linear resource */
+ HIP_RESOURCE_TYPE_PITCH2D = 0x03 /**< Pitch 2D resource */
+} HIPresourcetype;
+
+/**
+ * hip address modes
+ */
+typedef enum HIPaddress_mode_enum {
+ HIP_TR_ADDRESS_MODE_WRAP = 0,
+ HIP_TR_ADDRESS_MODE_CLAMP = 1,
+ HIP_TR_ADDRESS_MODE_MIRROR = 2,
+ HIP_TR_ADDRESS_MODE_BORDER = 3
+} HIPaddress_mode;
+
+/**
+ * hip filter modes
+ */
+typedef enum HIPfilter_mode_enum {
+ HIP_TR_FILTER_MODE_POINT = 0,
+ HIP_TR_FILTER_MODE_LINEAR = 1
+} HIPfilter_mode;
+
+/**
+ * Texture descriptor
+ */
+typedef struct HIP_TEXTURE_DESC_st {
+ HIPaddress_mode addressMode[3]; /**< Address modes */
+ HIPfilter_mode filterMode; /**< Filter mode */
+ unsigned int flags; /**< Flags */
+ unsigned int maxAnisotropy; /**< Maximum anisotropy ratio */
+ HIPfilter_mode mipmapFilterMode; /**< Mipmap filter mode */
+ float mipmapLevelBias; /**< Mipmap level bias */
+ float minMipmapLevelClamp; /**< Mipmap minimum level clamp */
+ float maxMipmapLevelClamp; /**< Mipmap maximum level clamp */
+ float borderColor[4]; /**< Border Color */
+ int reserved[12];
+} HIP_TEXTURE_DESC;
+
+/**
+ * hip texture resource view formats
+ */
+typedef enum hipResourceViewFormat {
+ hipResViewFormatNone = 0x00,
+ hipResViewFormatUnsignedChar1 = 0x01,
+ hipResViewFormatUnsignedChar2 = 0x02,
+ hipResViewFormatUnsignedChar4 = 0x03,
+ hipResViewFormatSignedChar1 = 0x04,
+ hipResViewFormatSignedChar2 = 0x05,
+ hipResViewFormatSignedChar4 = 0x06,
+ hipResViewFormatUnsignedShort1 = 0x07,
+ hipResViewFormatUnsignedShort2 = 0x08,
+ hipResViewFormatUnsignedShort4 = 0x09,
+ hipResViewFormatSignedShort1 = 0x0a,
+ hipResViewFormatSignedShort2 = 0x0b,
+ hipResViewFormatSignedShort4 = 0x0c,
+ hipResViewFormatUnsignedInt1 = 0x0d,
+ hipResViewFormatUnsignedInt2 = 0x0e,
+ hipResViewFormatUnsignedInt4 = 0x0f,
+ hipResViewFormatSignedInt1 = 0x10,
+ hipResViewFormatSignedInt2 = 0x11,
+ hipResViewFormatSignedInt4 = 0x12,
+ hipResViewFormatHalf1 = 0x13,
+ hipResViewFormatHalf2 = 0x14,
+ hipResViewFormatHalf4 = 0x15,
+ hipResViewFormatFloat1 = 0x16,
+ hipResViewFormatFloat2 = 0x17,
+ hipResViewFormatFloat4 = 0x18,
+ hipResViewFormatUnsignedBlockCompressed1 = 0x19,
+ hipResViewFormatUnsignedBlockCompressed2 = 0x1a,
+ hipResViewFormatUnsignedBlockCompressed3 = 0x1b,
+ hipResViewFormatUnsignedBlockCompressed4 = 0x1c,
+ hipResViewFormatSignedBlockCompressed4 = 0x1d,
+ hipResViewFormatUnsignedBlockCompressed5 = 0x1e,
+ hipResViewFormatSignedBlockCompressed5 = 0x1f,
+ hipResViewFormatUnsignedBlockCompressed6H = 0x20,
+ hipResViewFormatSignedBlockCompressed6H = 0x21,
+ hipResViewFormatUnsignedBlockCompressed7 = 0x22
+}hipResourceViewFormat;
+
+typedef enum HIPresourceViewFormat_enum
+{
+ HIP_RES_VIEW_FORMAT_NONE = 0x00, /**< No resource view format (use underlying resource format) */
+ HIP_RES_VIEW_FORMAT_UINT_1X8 = 0x01, /**< 1 channel unsigned 8-bit integers */
+ HIP_RES_VIEW_FORMAT_UINT_2X8 = 0x02, /**< 2 channel unsigned 8-bit integers */
+ HIP_RES_VIEW_FORMAT_UINT_4X8 = 0x03, /**< 4 channel unsigned 8-bit integers */
+ HIP_RES_VIEW_FORMAT_SINT_1X8 = 0x04, /**< 1 channel signed 8-bit integers */
+ HIP_RES_VIEW_FORMAT_SINT_2X8 = 0x05, /**< 2 channel signed 8-bit integers */
+ HIP_RES_VIEW_FORMAT_SINT_4X8 = 0x06, /**< 4 channel signed 8-bit integers */
+ HIP_RES_VIEW_FORMAT_UINT_1X16 = 0x07, /**< 1 channel unsigned 16-bit integers */
+ HIP_RES_VIEW_FORMAT_UINT_2X16 = 0x08, /**< 2 channel unsigned 16-bit integers */
+ HIP_RES_VIEW_FORMAT_UINT_4X16 = 0x09, /**< 4 channel unsigned 16-bit integers */
+ HIP_RES_VIEW_FORMAT_SINT_1X16 = 0x0a, /**< 1 channel signed 16-bit integers */
+ HIP_RES_VIEW_FORMAT_SINT_2X16 = 0x0b, /**< 2 channel signed 16-bit integers */
+ HIP_RES_VIEW_FORMAT_SINT_4X16 = 0x0c, /**< 4 channel signed 16-bit integers */
+ HIP_RES_VIEW_FORMAT_UINT_1X32 = 0x0d, /**< 1 channel unsigned 32-bit integers */
+ HIP_RES_VIEW_FORMAT_UINT_2X32 = 0x0e, /**< 2 channel unsigned 32-bit integers */
+ HIP_RES_VIEW_FORMAT_UINT_4X32 = 0x0f, /**< 4 channel unsigned 32-bit integers */
+ HIP_RES_VIEW_FORMAT_SINT_1X32 = 0x10, /**< 1 channel signed 32-bit integers */
+ HIP_RES_VIEW_FORMAT_SINT_2X32 = 0x11, /**< 2 channel signed 32-bit integers */
+ HIP_RES_VIEW_FORMAT_SINT_4X32 = 0x12, /**< 4 channel signed 32-bit integers */
+ HIP_RES_VIEW_FORMAT_FLOAT_1X16 = 0x13, /**< 1 channel 16-bit floating point */
+ HIP_RES_VIEW_FORMAT_FLOAT_2X16 = 0x14, /**< 2 channel 16-bit floating point */
+ HIP_RES_VIEW_FORMAT_FLOAT_4X16 = 0x15, /**< 4 channel 16-bit floating point */
+ HIP_RES_VIEW_FORMAT_FLOAT_1X32 = 0x16, /**< 1 channel 32-bit floating point */
+ HIP_RES_VIEW_FORMAT_FLOAT_2X32 = 0x17, /**< 2 channel 32-bit floating point */
+ HIP_RES_VIEW_FORMAT_FLOAT_4X32 = 0x18, /**< 4 channel 32-bit floating point */
+ HIP_RES_VIEW_FORMAT_UNSIGNED_BC1 = 0x19, /**< Block compressed 1 */
+ HIP_RES_VIEW_FORMAT_UNSIGNED_BC2 = 0x1a, /**< Block compressed 2 */
+ HIP_RES_VIEW_FORMAT_UNSIGNED_BC3 = 0x1b, /**< Block compressed 3 */
+ HIP_RES_VIEW_FORMAT_UNSIGNED_BC4 = 0x1c, /**< Block compressed 4 unsigned */
+ HIP_RES_VIEW_FORMAT_SIGNED_BC4 = 0x1d, /**< Block compressed 4 signed */
+ HIP_RES_VIEW_FORMAT_UNSIGNED_BC5 = 0x1e, /**< Block compressed 5 unsigned */
+ HIP_RES_VIEW_FORMAT_SIGNED_BC5 = 0x1f, /**< Block compressed 5 signed */
+ HIP_RES_VIEW_FORMAT_UNSIGNED_BC6H = 0x20, /**< Block compressed 6 unsigned half-float */
+ HIP_RES_VIEW_FORMAT_SIGNED_BC6H = 0x21, /**< Block compressed 6 signed half-float */
+ HIP_RES_VIEW_FORMAT_UNSIGNED_BC7 = 0x22 /**< Block compressed 7 */
+} HIPresourceViewFormat;
+
+/**
+ * HIP resource descriptor
+ */
+typedef struct hipResourceDesc {
+ enum hipResourceType resType;
+
+ union {
+ struct {
+ hipArray_t array;
+ } array;
+ struct {
+ hipMipmappedArray_t mipmap;
+ } mipmap;
+ struct {
+ void* devPtr;
+ struct hipChannelFormatDesc desc;
+ size_t sizeInBytes;
+ } linear;
+ struct {
+ void* devPtr;
+ struct hipChannelFormatDesc desc;
+ size_t width;
+ size_t height;
+ size_t pitchInBytes;
+ } pitch2D;
+ } res;
+}hipResourceDesc;
+
+typedef struct HIP_RESOURCE_DESC_st
+{
+ HIPresourcetype resType; /**< Resource type */
+
+ union {
+ struct {
+ hipArray_t hArray; /**< HIP array */
+ } array;
+ struct {
+ hipMipmappedArray_t hMipmappedArray; /**< HIP mipmapped array */
+ } mipmap;
+ struct {
+ hipDeviceptr_t devPtr; /**< Device pointer */
+ hipArray_Format format; /**< Array format */
+ unsigned int numChannels; /**< Channels per array element */
+ size_t sizeInBytes; /**< Size in bytes */
+ } linear;
+ struct {
+ hipDeviceptr_t devPtr; /**< Device pointer */
+ hipArray_Format format; /**< Array format */
+ unsigned int numChannels; /**< Channels per array element */
+ size_t width; /**< Width of the array in elements */
+ size_t height; /**< Height of the array in elements */
+ size_t pitchInBytes; /**< Pitch between two rows in bytes */
+ } pitch2D;
+ struct {
+ int reserved[32];
+ } reserved;
+ } res;
+
+ unsigned int flags; /**< Flags (must be zero) */
+} HIP_RESOURCE_DESC;
+
+/**
+ * hip resource view descriptor
+ */
+struct hipResourceViewDesc {
+ enum hipResourceViewFormat format;
+ size_t width;
+ size_t height;
+ size_t depth;
+ unsigned int firstMipmapLevel;
+ unsigned int lastMipmapLevel;
+ unsigned int firstLayer;
+ unsigned int lastLayer;
+};
+
+/**
+ * Resource view descriptor
+ */
+typedef struct HIP_RESOURCE_VIEW_DESC_st
+{
+ HIPresourceViewFormat format; /**< Resource view format */
+ size_t width; /**< Width of the resource view */
+ size_t height; /**< Height of the resource view */
+ size_t depth; /**< Depth of the resource view */
+ unsigned int firstMipmapLevel; /**< First defined mipmap level */
+ unsigned int lastMipmapLevel; /**< Last defined mipmap level */
+ unsigned int firstLayer; /**< First layer index */
+ unsigned int lastLayer; /**< Last layer index */
+ unsigned int reserved[16];
+} HIP_RESOURCE_VIEW_DESC;
+
+/**
+ * Memory copy types
+ *
+ */
+typedef enum hipMemcpyKind {
+ hipMemcpyHostToHost = 0, ///< Host-to-Host Copy
+ hipMemcpyHostToDevice = 1, ///< Host-to-Device Copy
+ hipMemcpyDeviceToHost = 2, ///< Device-to-Host Copy
+ hipMemcpyDeviceToDevice = 3, ///< Device-to-Device Copy
+ hipMemcpyDefault =
+ 4 ///< Runtime will automatically determine copy-kind based on virtual addresses.
+} hipMemcpyKind;
+
+typedef struct hipPitchedPtr {
+ void* ptr;
+ size_t pitch;
+ size_t xsize;
+ size_t ysize;
+}hipPitchedPtr;
+
+typedef struct hipExtent {
+ size_t width; // Width in elements when referring to array memory, in bytes when referring to
+ // linear memory
+ size_t height;
+ size_t depth;
+}hipExtent;
+
+typedef struct hipPos {
+ size_t x;
+ size_t y;
+ size_t z;
+}hipPos;
+
+typedef struct hipMemcpy3DParms {
+ hipArray_t srcArray;
+ struct hipPos srcPos;
+ struct hipPitchedPtr srcPtr;
+ hipArray_t dstArray;
+ struct hipPos dstPos;
+ struct hipPitchedPtr dstPtr;
+ struct hipExtent extent;
+ enum hipMemcpyKind kind;
+} hipMemcpy3DParms;
+
+typedef struct HIP_MEMCPY3D {
+ unsigned int srcXInBytes;
+ unsigned int srcY;
+ unsigned int srcZ;
+ unsigned int srcLOD;
+ hipMemoryType srcMemoryType;
+ const void* srcHost;
+ hipDeviceptr_t srcDevice;
+ hipArray_t srcArray;
+ unsigned int srcPitch;
+ unsigned int srcHeight;
+ unsigned int dstXInBytes;
+ unsigned int dstY;
+ unsigned int dstZ;
+ unsigned int dstLOD;
+ hipMemoryType dstMemoryType;
+ void* dstHost;
+ hipDeviceptr_t dstDevice;
+ hipArray_t dstArray;
+ unsigned int dstPitch;
+ unsigned int dstHeight;
+ unsigned int WidthInBytes;
+ unsigned int Height;
+ unsigned int Depth;
+} HIP_MEMCPY3D;
+
+static inline struct hipPitchedPtr make_hipPitchedPtr(void* d, size_t p, size_t xsz,
+                                                      size_t ysz) {
+    /*
+     * Packs a base pointer, a pitch and the x/y sizes into a hipPitchedPtr.
+     * The initialiser lists the values in the struct's member declaration
+     * order: ptr, pitch, xsize, ysize.
+     */
+    struct hipPitchedPtr packed = {d, p, xsz, ysz};
+
+    return packed;
+}
+
+static inline struct hipPos make_hipPos(size_t x, size_t y, size_t z) {
+    /*
+     * Bundles three coordinates into a hipPos. The initialiser follows
+     * the struct's member declaration order: x, y, z.
+     */
+    struct hipPos pos = {x, y, z};
+
+    return pos;
+}
+
+static inline struct hipExtent make_hipExtent(size_t w, size_t h, size_t d) {
+    /*
+     * Bundles three dimensions into a hipExtent. The initialiser follows
+     * the struct's member declaration order: width, height, depth.
+     */
+    struct hipExtent extent = {w, h, d};
+
+    return extent;
+}
+
+typedef enum hipFunction_attribute {
+ HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
+ HIP_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES,
+ HIP_FUNC_ATTRIBUTE_CONST_SIZE_BYTES,
+ HIP_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES,
+ HIP_FUNC_ATTRIBUTE_NUM_REGS,
+ HIP_FUNC_ATTRIBUTE_PTX_VERSION,
+ HIP_FUNC_ATTRIBUTE_BINARY_VERSION,
+ HIP_FUNC_ATTRIBUTE_CACHE_MODE_CA,
+ HIP_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
+ HIP_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT,
+ HIP_FUNC_ATTRIBUTE_MAX
+}hipFunction_attribute;
+
+#endif
diff --git a/third_party/rocm/include/hip/hcc_detail/elfio/elf_types.hpp b/third_party/rocm/include/hip/hcc_detail/elfio/elf_types.hpp
new file mode 100644
index 0000000..a17b700
--- /dev/null
+++ b/third_party/rocm/include/hip/hcc_detail/elfio/elf_types.hpp
@@ -0,0 +1,748 @@
+/*
+Copyright (C) 2001-2015 by Serge Lamikhov-Center
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef ELFTYPES_H
+#define ELFTYPES_H
+
+#ifndef ELFIO_NO_OWN_TYPES
+#if !defined(ELFIO_NO_CSTDINT) && !defined(ELFIO_NO_INTTYPES)
+#include <stdint.h>
+#else
+typedef unsigned char uint8_t;
+typedef signed char int8_t;
+typedef unsigned short uint16_t;
+typedef signed short int16_t;
+#ifdef _MSC_VER
+typedef unsigned __int32 uint32_t;
+typedef signed __int32 int32_t;
+typedef unsigned __int64 uint64_t;
+typedef signed __int64 int64_t;
+#else
+typedef unsigned int uint32_t;
+typedef signed int int32_t;
+typedef unsigned long long uint64_t;
+typedef signed long long int64_t;
+#endif // _MSC_VER
+#endif // ELFIO_NO_CSTDINT
+#endif // ELFIO_NO_OWN_TYPES
+
+namespace ELFIO {
+
+// Attention! Platform-dependent definitions.
+typedef uint16_t Elf_Half;
+typedef uint32_t Elf_Word;
+typedef int32_t Elf_Sword;
+typedef uint64_t Elf_Xword;
+typedef int64_t Elf_Sxword;
+
+typedef uint32_t Elf32_Addr;
+typedef uint32_t Elf32_Off;
+typedef uint64_t Elf64_Addr;
+typedef uint64_t Elf64_Off;
+
+#define Elf32_Half Elf_Half
+#define Elf64_Half Elf_Half
+#define Elf32_Word Elf_Word
+#define Elf64_Word Elf_Word
+#define Elf32_Sword Elf_Sword
+#define Elf64_Sword Elf_Sword
+
+///////////////////////
+// ELF Header Constants
+
+// File type
+#define ET_NONE 0
+#define ET_REL 1
+#define ET_EXEC 2
+#define ET_DYN 3
+#define ET_CORE 4
+#define ET_LOOS 0xFE00
+#define ET_HIOS 0xFEFF
+#define ET_LOPROC 0xFF00
+#define ET_HIPROC 0xFFFF
+
+
+#define EM_NONE 0 // No machine
+#define EM_M32 1 // AT&T WE 32100
+#define EM_SPARC 2 // SUN SPARC
+#define EM_386 3 // Intel 80386
+#define EM_68K 4 // Motorola m68k family
+#define EM_88K 5 // Motorola m88k family
+#define EM_486 6 // Intel 80486; reserved for future use
+#define EM_860 7 // Intel 80860
+#define EM_MIPS 8 // MIPS R3000 (officially, big-endian only)
+#define EM_S370 9 // IBM System/370
+#define EM_MIPS_RS3_LE 10 // MIPS R3000 little-endian (Oct 4 1999 Draft) Deprecated
+#define EM_res011 11 // Reserved
+#define EM_res012 12 // Reserved
+#define EM_res013 13 // Reserved
+#define EM_res014 14 // Reserved
+#define EM_PARISC 15 // HPPA
+#define EM_res016 16 // Reserved
+#define EM_VPP550 17 // Fujitsu VPP500
+#define EM_SPARC32PLUS 18 // Sun's "v8plus"
+#define EM_960 19 // Intel 80960
+#define EM_PPC 20 // PowerPC
+#define EM_PPC64 21 // 64-bit PowerPC
+#define EM_S390 22 // IBM S/390
+#define EM_SPU 23 // Sony/Toshiba/IBM SPU
+#define EM_res024 24 // Reserved
+#define EM_res025 25 // Reserved
+#define EM_res026 26 // Reserved
+#define EM_res027 27 // Reserved
+#define EM_res028 28 // Reserved
+#define EM_res029 29 // Reserved
+#define EM_res030 30 // Reserved
+#define EM_res031 31 // Reserved
+#define EM_res032 32 // Reserved
+#define EM_res033 33 // Reserved
+#define EM_res034 34 // Reserved
+#define EM_res035 35 // Reserved
+#define EM_V800 36 // NEC V800 series
+#define EM_FR20 37 // Fujitsu FR20
+#define EM_RH32 38 // TRW RH32
+#define EM_MCORE 39 // Motorola M*Core // May also be taken by Fujitsu MMA
+#define EM_RCE 39 // Old name for MCore
+#define EM_ARM 40 // ARM
+#define EM_OLD_ALPHA 41 // Digital Alpha
+#define EM_SH 42 // Renesas (formerly Hitachi) / SuperH SH
+#define EM_SPARCV9 43 // SPARC v9 64-bit
+#define EM_TRICORE 44 // Siemens Tricore embedded processor
+#define EM_ARC 45 // ARC Cores
+#define EM_H8_300 46 // Renesas (formerly Hitachi) H8/300
+#define EM_H8_300H 47 // Renesas (formerly Hitachi) H8/300H
+#define EM_H8S 48 // Renesas (formerly Hitachi) H8S
+#define EM_H8_500 49 // Renesas (formerly Hitachi) H8/500
+#define EM_IA_64 50 // Intel IA-64 Processor
+#define EM_MIPS_X 51 // Stanford MIPS-X
+#define EM_COLDFIRE 52 // Motorola Coldfire
+#define EM_68HC12 53 // Motorola M68HC12
+#define EM_MMA 54 // Fujitsu Multimedia Accelerator
+#define EM_PCP 55 // Siemens PCP
+#define EM_NCPU 56 // Sony nCPU embedded RISC processor
+#define EM_NDR1 57 // Denso NDR1 microprocessor
+#define EM_STARCORE 58 // Motorola Star*Core processor
+#define EM_ME16 59 // Toyota ME16 processor
+#define EM_ST100 60 // STMicroelectronics ST100 processor
+#define EM_TINYJ 61 // Advanced Logic Corp. TinyJ embedded processor
+#define EM_X86_64 62 // Advanced Micro Devices X86-64 processor
+#define EM_PDSP 63 // Sony DSP Processor
+#define EM_PDP10 64 // Digital Equipment Corp. PDP-10
+#define EM_PDP11 65 // Digital Equipment Corp. PDP-11
+#define EM_FX66 66 // Siemens FX66 microcontroller
+#define EM_ST9PLUS 67 // STMicroelectronics ST9+ 8/16 bit microcontroller
+#define EM_ST7 68 // STMicroelectronics ST7 8-bit microcontroller
+#define EM_68HC16 69 // Motorola MC68HC16 Microcontroller
+#define EM_68HC11 70 // Motorola MC68HC11 Microcontroller
+#define EM_68HC08 71 // Motorola MC68HC08 Microcontroller
+#define EM_68HC05 72 // Motorola MC68HC05 Microcontroller
+#define EM_SVX 73 // Silicon Graphics SVx
+#define EM_ST19 74 // STMicroelectronics ST19 8-bit cpu
+#define EM_VAX 75 // Digital VAX
+#define EM_CRIS 76 // Axis Communications 32-bit embedded processor
+#define EM_JAVELIN 77 // Infineon Technologies 32-bit embedded cpu
+#define EM_FIREPATH 78 // Element 14 64-bit DSP processor
+#define EM_ZSP 79 // LSI Logic's 16-bit DSP processor
+#define EM_MMIX 80 // Donald Knuth's educational 64-bit processor
+#define EM_HUANY 81 // Harvard's machine-independent format
+#define EM_PRISM 82 // SiTera Prism
+#define EM_AVR 83 // Atmel AVR 8-bit microcontroller
+#define EM_FR30 84 // Fujitsu FR30
+#define EM_D10V 85 // Mitsubishi D10V
+#define EM_D30V 86 // Mitsubishi D30V
+#define EM_V850 87 // NEC v850
+#define EM_M32R 88 // Renesas M32R (formerly Mitsubishi M32R)
+#define EM_MN10300 89 // Matsushita MN10300
+#define EM_MN10200 90 // Matsushita MN10200
+#define EM_PJ 91 // picoJava
+#define EM_OPENRISC 92 // OpenRISC 32-bit embedded processor
+#define EM_ARC_A5 93 // ARC Cores Tangent-A5
+#define EM_XTENSA 94 // Tensilica Xtensa Architecture
+#define EM_VIDEOCORE 95 // Alphamosaic VideoCore processor
+#define EM_TMM_GPP 96 // Thompson Multimedia General Purpose Processor
+#define EM_NS32K 97 // National Semiconductor 32000 series
+#define EM_TPC 98 // Tenor Network TPC processor
+#define EM_SNP1K 99 // Trebia SNP 1000 processor
+#define EM_ST200 100 // STMicroelectronics ST200 microcontroller
+#define EM_IP2K 101 // Ubicom IP2022 micro controller
+#define EM_MAX 102 // MAX Processor
+#define EM_CR 103 // National Semiconductor CompactRISC
+#define EM_F2MC16 104 // Fujitsu F2MC16
+#define EM_MSP430 105 // TI msp430 micro controller
+#define EM_BLACKFIN 106 // ADI Blackfin
+#define EM_SE_C33 107 // S1C33 Family of Seiko Epson processors
+#define EM_SEP 108 // Sharp embedded microprocessor
+#define EM_ARCA 109 // Arca RISC Microprocessor
+#define EM_UNICORE 110 // Microprocessor series from PKU-Unity Ltd. and MPRC of Peking University
+#define EM_EXCESS 111 // eXcess: 16/32/64-bit configurable embedded CPU
+#define EM_DXP 112 // Icera Semiconductor Inc. Deep Execution Processor
+#define EM_ALTERA_NIOS2 113 // Altera Nios II soft-core processor
+#define EM_CRX 114 // National Semiconductor CRX
+#define EM_XGATE 115 // Motorola XGATE embedded processor
+#define EM_C166 116 // Infineon C16x/XC16x processor
+#define EM_M16C 117 // Renesas M16C series microprocessors
+#define EM_DSPIC30F 118 // Microchip Technology dsPIC30F Digital Signal Controller
+#define EM_CE 119 // Freescale Communication Engine RISC core
+#define EM_M32C 120 // Renesas M32C series microprocessors
+#define EM_res121 121 // Reserved
+#define EM_res122 122 // Reserved
+#define EM_res123 123 // Reserved
+#define EM_res124 124 // Reserved
+#define EM_res125 125 // Reserved
+#define EM_res126 126 // Reserved
+#define EM_res127 127 // Reserved
+#define EM_res128 128 // Reserved
+#define EM_res129 129 // Reserved
+#define EM_res130 130 // Reserved
+#define EM_TSK3000 131 // Altium TSK3000 core
+#define EM_RS08 132 // Freescale RS08 embedded processor
+#define EM_res133 133 // Reserved
+#define EM_ECOG2 134 // Cyan Technology eCOG2 microprocessor
+#define EM_SCORE 135 // Sunplus Score
+#define EM_SCORE7 135 // Sunplus S+core7 RISC processor
+#define EM_DSP24 136 // New Japan Radio (NJR) 24-bit DSP Processor
+#define EM_VIDEOCORE3 137 // Broadcom VideoCore III processor
+#define EM_LATTICEMICO32 138 // RISC processor for Lattice FPGA architecture
+#define EM_SE_C17 139 // Seiko Epson C17 family
+#define EM_TI_C6000 140 // Texas Instruments TMS320C6000 DSP family
+#define EM_TI_C2000 141 // Texas Instruments TMS320C2000 DSP family
+#define EM_TI_C5500 142 // Texas Instruments TMS320C55x DSP family
+#define EM_res143 143 // Reserved
+#define EM_res144 144 // Reserved
+#define EM_res145 145 // Reserved
+#define EM_res146 146 // Reserved
+#define EM_res147 147 // Reserved
+#define EM_res148 148 // Reserved
+#define EM_res149 149 // Reserved
+#define EM_res150 150 // Reserved
+#define EM_res151 151 // Reserved
+#define EM_res152 152 // Reserved
+#define EM_res153 153 // Reserved
+#define EM_res154 154 // Reserved
+#define EM_res155 155 // Reserved
+#define EM_res156 156 // Reserved
+#define EM_res157 157 // Reserved
+#define EM_res158 158 // Reserved
+#define EM_res159 159 // Reserved
+#define EM_MMDSP_PLUS 160 // STMicroelectronics 64bit VLIW Data Signal Processor
+#define EM_CYPRESS_M8C 161 // Cypress M8C microprocessor
+#define EM_R32C 162 // Renesas R32C series microprocessors
+#define EM_TRIMEDIA 163 // NXP Semiconductors TriMedia architecture family
+#define EM_QDSP6 164 // QUALCOMM DSP6 Processor
+#define EM_8051 165 // Intel 8051 and variants
+#define EM_STXP7X 166 // STMicroelectronics STxP7x family
+#define EM_NDS32 167 // Andes Technology compact code size embedded RISC processor family
+#define EM_ECOG1 168 // Cyan Technology eCOG1X family
+#define EM_ECOG1X 168 // Cyan Technology eCOG1X family
+#define EM_MAXQ30 169 // Dallas Semiconductor MAXQ30 Core Micro-controllers
+#define EM_XIMO16 170 // New Japan Radio (NJR) 16-bit DSP Processor
+#define EM_MANIK 171 // M2000 Reconfigurable RISC Microprocessor
+#define EM_CRAYNV2 172 // Cray Inc. NV2 vector architecture
+#define EM_RX 173 // Renesas RX family
+#define EM_METAG 174 // Imagination Technologies META processor architecture
+#define EM_MCST_ELBRUS 175 // MCST Elbrus general purpose hardware architecture
+#define EM_ECOG16 176 // Cyan Technology eCOG16 family
+#define EM_CR16 177 // National Semiconductor CompactRISC 16-bit processor
+#define EM_ETPU 178 // Freescale Extended Time Processing Unit
+#define EM_SLE9X 179 // Infineon Technologies SLE9X core
+#define EM_L1OM 180 // Intel L1OM
+#define EM_INTEL181 181 // Reserved by Intel
+#define EM_INTEL182 182 // Reserved by Intel
+#define EM_res183 183 // Reserved by ARM
+#define EM_res184 184 // Reserved by ARM
+#define EM_AVR32 185 // Atmel Corporation 32-bit microprocessor family
+#define EM_STM8 186 // STMicroelectronics STM8 8-bit microcontroller
+#define EM_TILE64 187 // Tilera TILE64 multicore architecture family
+#define EM_TILEPRO 188 // Tilera TILEPro multicore architecture family
+#define EM_MICROBLAZE 189 // Xilinx MicroBlaze 32-bit RISC soft processor core
+#define EM_CUDA 190 // NVIDIA CUDA architecture
+#define EM_TILEGX 191 // Tilera TILE-Gx multicore architecture family
+#define EM_CLOUDSHIELD 192 // CloudShield architecture family
+#define EM_COREA_1ST 193 // KIPO-KAIST Core-A 1st generation processor family
+#define EM_COREA_2ND 194 // KIPO-KAIST Core-A 2nd generation processor family
+#define EM_ARC_COMPACT2 195 // Synopsys ARCompact V2
+#define EM_OPEN8 196 // Open8 8-bit RISC soft processor core
+#define EM_RL78 197 // Renesas RL78 family
+#define EM_VIDEOCORE5 198 // Broadcom VideoCore V processor
+#define EM_78KOR 199 // Renesas 78KOR family
+#define EM_56800EX 200 // Freescale 56800EX Digital Signal Controller (DSC)
+#define EM_BA1 201 // Beyond BA1 CPU architecture
+#define EM_BA2 202 // Beyond BA2 CPU architecture
+#define EM_XCORE 203 // XMOS xCORE processor family
+#define EM_MCHP_PIC 204 // Microchip 8-bit PIC(r) family
+#define EM_INTEL205 205 // Reserved by Intel
+#define EM_INTEL206 206 // Reserved by Intel
+#define EM_INTEL207 207 // Reserved by Intel
+#define EM_INTEL208 208 // Reserved by Intel
+#define EM_INTEL209 209 // Reserved by Intel
+#define EM_KM32 210 // KM211 KM32 32-bit processor
+#define EM_KMX32 211 // KM211 KMX32 32-bit processor
+#define EM_KMX16 212 // KM211 KMX16 16-bit processor
+#define EM_KMX8 213 // KM211 KMX8 8-bit processor
+#define EM_KVARC 214 // KM211 KVARC processor
+#define EM_CDP 215 // Paneve CDP architecture family
+#define EM_COGE 216 // Cognitive Smart Memory Processor
+#define EM_COOL 217 // iCelero CoolEngine
+#define EM_NORC 218 // Nanoradio Optimized RISC
+#define EM_CSR_KALIMBA 219 // CSR Kalimba architecture family
+#define EM_Z80 220 // Zilog Z80
+#define EM_VISIUM 221 // Controls and Data Services VISIUMcore processor
+#define EM_FT32 222 // FTDI Chip FT32 high performance 32-bit RISC architecture
+#define EM_MOXIE 223 // Moxie processor family
+#define EM_AMDGPU 224 // AMD GPU architecture
+#define EM_RISCV 243 // RISC-V
+#define EM_LANAI 244 // Lanai processor
+#define EM_CEVA 245 // CEVA Processor Architecture Family
+#define EM_CEVA_X2 246 // CEVA X2 Processor Family
+#define EM_BPF 247 // Linux BPF - in-kernel virtual machine
+
+// File version
+#define EV_NONE 0
+#define EV_CURRENT 1
+
+// Identification index
+#define EI_MAG0 0
+#define EI_MAG1 1
+#define EI_MAG2 2
+#define EI_MAG3 3
+#define EI_CLASS 4
+#define EI_DATA 5
+#define EI_VERSION 6
+#define EI_OSABI 7
+#define EI_ABIVERSION 8
+#define EI_PAD 9
+#define EI_NIDENT 16
+
+// Magic number
+#define ELFMAG0 0x7F
+#define ELFMAG1 'E'
+#define ELFMAG2 'L'
+#define ELFMAG3 'F'
+
+// File class
+#define ELFCLASSNONE 0
+#define ELFCLASS32 1
+#define ELFCLASS64 2
+
+// Encoding
+#define ELFDATANONE 0
+#define ELFDATA2LSB 1
+#define ELFDATA2MSB 2
+
+// OS extensions
+#define ELFOSABI_NONE 0 // No extensions or unspecified
+#define ELFOSABI_HPUX 1 // Hewlett-Packard HP-UX
+#define ELFOSABI_NETBSD 2 // NetBSD
+#define ELFOSABI_LINUX 3 // Linux
+#define ELFOSABI_SOLARIS 6 // Sun Solaris
+#define ELFOSABI_AIX 7 // AIX
+#define ELFOSABI_IRIX 8 // IRIX
+#define ELFOSABI_FREEBSD 9 // FreeBSD
+#define ELFOSABI_TRU64 10 // Compaq TRU64 UNIX
+#define ELFOSABI_MODESTO 11 // Novell Modesto
+#define ELFOSABI_OPENBSD 12 // Open BSD
+#define ELFOSABI_OPENVMS 13 // Open VMS
+#define ELFOSABI_NSK 14 // Hewlett-Packard Non-Stop Kernel
+#define ELFOSABI_AROS 15 // Amiga Research OS
+#define ELFOSABI_FENIXOS 16 // The FenixOS highly scalable multi-core OS
+// 64-255 Architecture-specific value range
+
+
+/////////////////////
+// Sections constants
+
+// Section indexes
+#define SHN_UNDEF 0
+#define SHN_LORESERVE 0xFF00
+#define SHN_LOPROC 0xFF00
+#define SHN_HIPROC 0xFF1F
+#define SHN_LOOS 0xFF20
+#define SHN_HIOS 0xFF3F
+#define SHN_ABS 0xFFF1
+#define SHN_COMMON 0xFFF2
+#define SHN_XINDEX 0xFFFF
+#define SHN_HIRESERVE 0xFFFF
+
+// Section types
+#define SHT_NULL 0
+#define SHT_PROGBITS 1
+#define SHT_SYMTAB 2
+#define SHT_STRTAB 3
+#define SHT_RELA 4
+#define SHT_HASH 5
+#define SHT_DYNAMIC 6
+#define SHT_NOTE 7
+#define SHT_NOBITS 8
+#define SHT_REL 9
+#define SHT_SHLIB 10
+#define SHT_DYNSYM 11
+#define SHT_INIT_ARRAY 14
+#define SHT_FINI_ARRAY 15
+#define SHT_PREINIT_ARRAY 16
+#define SHT_GROUP 17
+#define SHT_SYMTAB_SHNDX 18
+#define SHT_LOOS 0x60000000
+#define SHT_HIOS 0x6fffffff
+#define SHT_LOPROC 0x70000000
+#define SHT_HIPROC 0x7FFFFFFF
+#define SHT_LOUSER 0x80000000
+#define SHT_HIUSER 0xFFFFFFFF
+
+// Section attribute flags
+#define SHF_WRITE 0x1
+#define SHF_ALLOC 0x2
+#define SHF_EXECINSTR 0x4
+#define SHF_MERGE 0x10
+#define SHF_STRINGS 0x20
+#define SHF_INFO_LINK 0x40
+#define SHF_LINK_ORDER 0x80
+#define SHF_OS_NONCONFORMING 0x100
+#define SHF_GROUP 0x200
+#define SHF_TLS 0x400
+#define SHF_MASKOS 0x0ff00000
+#define SHF_MASKPROC 0xF0000000
+
+// Section group flags
+#define GRP_COMDAT 0x1
+#define GRP_MASKOS 0x0ff00000
+#define GRP_MASKPROC 0xf0000000
+
+// Symbol binding
+#define STB_LOCAL 0
+#define STB_GLOBAL 1
+#define STB_WEAK 2
+#define STB_LOOS 10
+#define STB_HIOS 12
+#define STB_MULTIDEF 13
+#define STB_LOPROC 13
+#define STB_HIPROC 15
+
+// Symbol types
+#define STT_NOTYPE 0
+#define STT_OBJECT 1
+#define STT_FUNC 2
+#define STT_SECTION 3
+#define STT_FILE 4
+#define STT_COMMON 5
+#define STT_TLS 6
+#define STT_LOOS 10
+#define STT_HIOS 12
+#define STT_LOPROC 13
+#define STT_HIPROC 15
+
+// Symbol visibility
+#define STV_DEFAULT 0
+#define STV_INTERNAL 1
+#define STV_HIDDEN 2
+#define STV_PROTECTED 3
+
+// Undefined name
+#define STN_UNDEF 0
+
+// Relocation types
+#define R_386_NONE 0
+#define R_X86_64_NONE 0
+#define R_386_32 1
+#define R_X86_64_64 1
+#define R_386_PC32 2
+#define R_X86_64_PC32 2
+#define R_386_GOT32 3
+#define R_X86_64_GOT32 3
+#define R_386_PLT32 4
+#define R_X86_64_PLT32 4
+#define R_386_COPY 5
+#define R_X86_64_COPY 5
+#define R_386_GLOB_DAT 6
+#define R_X86_64_GLOB_DAT 6
+#define R_386_JMP_SLOT 7
+#define R_X86_64_JUMP_SLOT 7
+#define R_386_RELATIVE 8
+#define R_X86_64_RELATIVE 8
+#define R_386_GOTOFF 9
+#define R_X86_64_GOTPCREL 9
+#define R_386_GOTPC 10
+#define R_X86_64_32 10
+#define R_X86_64_32S 11
+#define R_X86_64_16 12
+#define R_X86_64_PC16 13
+#define R_X86_64_8 14
+#define R_X86_64_PC8 15
+#define R_X86_64_DTPMOD64 16
+#define R_X86_64_DTPOFF64 17
+#define R_X86_64_TPOFF64 18
+#define R_X86_64_TLSGD 19
+#define R_X86_64_TLSLD 20
+#define R_X86_64_DTPOFF32 21
+#define R_X86_64_GOTTPOFF 22
+#define R_X86_64_TPOFF32 23
+#define R_X86_64_PC64 24
+#define R_X86_64_GOTOFF64 25
+#define R_X86_64_GOTPC32 26
+#define R_X86_64_GOT64 27
+#define R_X86_64_GOTPCREL64 28
+#define R_X86_64_GOTPC64 29
+#define R_X86_64_GOTPLT64 30
+#define R_X86_64_PLTOFF64 31
+#define R_X86_64_GOTPC32_TLSDESC 34
+#define R_X86_64_TLSDESC_CALL 35
+#define R_X86_64_TLSDESC 36
+#define R_X86_64_IRELATIVE 37
+#define R_X86_64_GNU_VTINHERIT 250
+#define R_X86_64_GNU_VTENTRY 251
+
+// Segment types
+#define PT_NULL 0
+#define PT_LOAD 1
+#define PT_DYNAMIC 2
+#define PT_INTERP 3
+#define PT_NOTE 4
+#define PT_SHLIB 5
+#define PT_PHDR 6
+#define PT_TLS 7
+#define PT_LOOS 0x60000000
+#define PT_HIOS 0x6fffffff
+#define PT_LOPROC 0x70000000
+#define PT_HIPROC 0x7FFFFFFF
+
+// Segment flags
+#define PF_X 1 // Execute
+#define PF_W 2 // Write
+#define PF_R 4 // Read
+#define PF_MASKOS 0x0ff00000 // Unspecified
+#define PF_MASKPROC 0xf0000000 // Unspecified
+
+// Dynamic Array Tags
+#define DT_NULL 0
+#define DT_NEEDED 1
+#define DT_PLTRELSZ 2
+#define DT_PLTGOT 3
+#define DT_HASH 4
+#define DT_STRTAB 5
+#define DT_SYMTAB 6
+#define DT_RELA 7
+#define DT_RELASZ 8
+#define DT_RELAENT 9
+#define DT_STRSZ 10
+#define DT_SYMENT 11
+#define DT_INIT 12
+#define DT_FINI 13
+#define DT_SONAME 14
+#define DT_RPATH 15
+#define DT_SYMBOLIC 16
+#define DT_REL 17
+#define DT_RELSZ 18
+#define DT_RELENT 19
+#define DT_PLTREL 20
+#define DT_DEBUG 21
+#define DT_TEXTREL 22
+#define DT_JMPREL 23
+#define DT_BIND_NOW 24
+#define DT_INIT_ARRAY 25
+#define DT_FINI_ARRAY 26
+#define DT_INIT_ARRAYSZ 27
+#define DT_FINI_ARRAYSZ 28
+#define DT_RUNPATH 29
+#define DT_FLAGS 30
+#define DT_ENCODING 32 // same value as DT_PREINIT_ARRAY below; the ELF specification assigns 32 to both
+#define DT_PREINIT_ARRAY 32
+#define DT_PREINIT_ARRAYSZ 33
+#define DT_MAXPOSTAGS 34
+#define DT_LOOS 0x6000000D
+#define DT_HIOS 0x6ffff000
+#define DT_LOPROC 0x70000000
+#define DT_HIPROC 0x7FFFFFFF
+
+// DT_FLAGS values
+#define DF_ORIGIN 0x1
+#define DF_SYMBOLIC 0x2
+#define DF_TEXTREL 0x4
+#define DF_BIND_NOW 0x8
+#define DF_STATIC_TLS 0x10
+
+
+// ELF file header
+struct Elf32_Ehdr { // ELF file header, 32-bit class; field meanings follow the ELF specification
+ unsigned char e_ident[EI_NIDENT]; // identification bytes, indexed by the EI_* constants
+ Elf_Half e_type; // object file type
+ Elf_Half e_machine; // target architecture (an EM_* value)
+ Elf_Word e_version; // file version (an EV_* value)
+ Elf32_Addr e_entry; // entry point virtual address
+ Elf32_Off e_phoff; // file offset of the program (segment) header table
+ Elf32_Off e_shoff; // file offset of the section header table
+ Elf_Word e_flags; // processor-specific flags
+ Elf_Half e_ehsize; // size of this header in bytes
+ Elf_Half e_phentsize; // size of one program header entry
+ Elf_Half e_phnum; // number of program header entries
+ Elf_Half e_shentsize; // size of one section header entry
+ Elf_Half e_shnum; // number of section header entries
+ Elf_Half e_shstrndx; // index of the section that holds the section names
+};
+
+struct Elf64_Ehdr { // ELF file header, 64-bit class; same field order as Elf32_Ehdr
+ unsigned char e_ident[EI_NIDENT]; // identification bytes, indexed by the EI_* constants
+ Elf_Half e_type;
+ Elf_Half e_machine;
+ Elf_Word e_version;
+ Elf64_Addr e_entry; // widened to Elf64_Addr compared with Elf32_Ehdr
+ Elf64_Off e_phoff; // widened to Elf64_Off compared with Elf32_Ehdr
+ Elf64_Off e_shoff; // widened to Elf64_Off compared with Elf32_Ehdr
+ Elf_Word e_flags;
+ Elf_Half e_ehsize;
+ Elf_Half e_phentsize;
+ Elf_Half e_phnum;
+ Elf_Half e_shentsize;
+ Elf_Half e_shnum;
+ Elf_Half e_shstrndx;
+};
+
+
+// Section header
+struct Elf32_Shdr { // section header table entry, 32-bit class; field meanings follow the ELF specification
+ Elf_Word sh_name; // offset of the section name in the section-name string table
+ Elf_Word sh_type; // section type (an SHT_* value)
+ Elf_Word sh_flags; // attribute flags (SHF_* bits)
+ Elf32_Addr sh_addr; // virtual address of the section in memory
+ Elf32_Off sh_offset; // file offset of the section contents
+ Elf_Word sh_size; // size of the section in bytes
+ Elf_Word sh_link; // link to another section; meaning depends on sh_type
+ Elf_Word sh_info; // extra information; meaning depends on sh_type
+ Elf_Word sh_addralign; // required address alignment
+ Elf_Word sh_entsize; // size of one entry for sections holding fixed-size records
+};
+
+struct Elf64_Shdr { // section header table entry, 64-bit class; same field order as Elf32_Shdr
+ Elf_Word sh_name;
+ Elf_Word sh_type;
+ Elf_Xword sh_flags; // Elf_Xword here, Elf_Word in Elf32_Shdr
+ Elf64_Addr sh_addr;
+ Elf64_Off sh_offset;
+ Elf_Xword sh_size; // Elf_Xword here, Elf_Word in Elf32_Shdr
+ Elf_Word sh_link;
+ Elf_Word sh_info;
+ Elf_Xword sh_addralign; // Elf_Xword here, Elf_Word in Elf32_Shdr
+ Elf_Xword sh_entsize; // Elf_Xword here, Elf_Word in Elf32_Shdr
+};
+
+
+// Segment header
+struct Elf32_Phdr { // program (segment) header entry, 32-bit class; field meanings follow the ELF specification
+ Elf_Word p_type; // segment type (a PT_* value)
+ Elf32_Off p_offset; // file offset of the segment
+ Elf32_Addr p_vaddr; // virtual address of the segment
+ Elf32_Addr p_paddr; // physical address of the segment
+ Elf_Word p_filesz; // number of bytes the segment occupies in the file
+ Elf_Word p_memsz; // number of bytes the segment occupies in memory
+ Elf_Word p_flags; // permission flags (PF_* bits); note the different position in Elf64_Phdr
+ Elf_Word p_align; // alignment; the ELF specification treats 0 and 1 as "no alignment"
+};
+
+struct Elf64_Phdr { // program (segment) header entry, 64-bit class; field order differs from Elf32_Phdr
+ Elf_Word p_type;
+ Elf_Word p_flags; // placed directly after p_type here, whereas Elf32_Phdr has it after p_memsz
+ Elf64_Off p_offset;
+ Elf64_Addr p_vaddr;
+ Elf64_Addr p_paddr;
+ Elf_Xword p_filesz;
+ Elf_Xword p_memsz;
+ Elf_Xword p_align;
+};
+
+
+// Symbol table entry
+struct Elf32_Sym { // symbol table entry, 32-bit class; field meanings follow the ELF specification
+ Elf_Word st_name; // offset of the symbol name in the associated string table
+ Elf32_Addr st_value; // symbol value (typically an address)
+ Elf_Word st_size; // size of the object the symbol refers to
+ unsigned char st_info; // binding and type packed together; see ELF_ST_BIND / ELF_ST_TYPE / ELF_ST_INFO
+ unsigned char st_other; // visibility in the low bits; see ELF_ST_VISIBILITY
+ Elf_Half st_shndx; // index of the section the symbol is defined in (or an SHN_* value)
+};
+
+struct Elf64_Sym { // symbol table entry, 64-bit class; field order differs from Elf32_Sym
+ Elf_Word st_name;
+ unsigned char st_info; // here st_info/st_other/st_shndx come before st_value and st_size
+ unsigned char st_other;
+ Elf_Half st_shndx;
+ Elf64_Addr st_value;
+ Elf_Xword st_size;
+};
+
+
+#define ELF_ST_BIND(i) ((i) >> 4) // upper 4 bits of a symbol's st_info: the binding (STB_*)
+#define ELF_ST_TYPE(i) ((i)&0xf) // lower 4 bits of a symbol's st_info: the type (STT_*)
+#define ELF_ST_INFO(b, t) (((b) << 4) + ((t)&0xf)) // packs binding b and type t back into one st_info value
+
+#define ELF_ST_VISIBILITY(o) ((o)&0x3) // lower 2 bits of st_other, i.e. a value in the STV_* range 0..3
+
+
+// Relocation entries
+struct Elf32_Rel { // relocation entry without addend, 32-bit class
+ Elf32_Addr r_offset; // location the relocation applies to
+ Elf_Word r_info; // symbol index and relocation type; see ELF32_R_SYM / ELF32_R_TYPE
+};
+
+struct Elf32_Rela { // relocation entry with explicit addend, 32-bit class
+ Elf32_Addr r_offset; // location the relocation applies to
+ Elf_Word r_info; // symbol index and relocation type; see ELF32_R_SYM / ELF32_R_TYPE
+ Elf_Sword r_addend; // signed constant addend
+};
+
+struct Elf64_Rel { // relocation entry without addend, 64-bit class
+ Elf64_Addr r_offset; // location the relocation applies to
+ Elf_Xword r_info; // symbol index and relocation type; see ELF64_R_SYM / ELF64_R_TYPE
+};
+
+struct Elf64_Rela { // relocation entry with explicit addend, 64-bit class
+ Elf64_Addr r_offset; // location the relocation applies to
+ Elf_Xword r_info; // symbol index and relocation type; see ELF64_R_SYM / ELF64_R_TYPE
+ Elf_Sxword r_addend; // signed constant addend
+};
+
+
+#define ELF32_R_SYM(i) ((i) >> 8) // symbol index: everything above the low 8 bits of r_info
+#define ELF32_R_TYPE(i) ((unsigned char)(i)) // relocation type: the low 8 bits of r_info
+#define ELF32_R_INFO(s, t) (((s) << 8) + (unsigned char)(t)) // packs symbol index s and type t into r_info
+
+#define ELF64_R_SYM(i) ((i) >> 32) // symbol index: the upper 32 bits of r_info
+#define ELF64_R_TYPE(i) ((i)&0xffffffffL) // relocation type: the lower 32 bits of r_info
+#define ELF64_R_INFO(s, t) ((((int64_t)(s)) << 32) + ((t)&0xffffffffL)) // (s) is parenthesised so the cast covers the whole argument before the 32-bit shift
+
+// Dynamic structure
+struct Elf32_Dyn { // dynamic section entry, 32-bit class
+ Elf_Sword d_tag; // entry kind (a DT_* value)
+ union { // which member is meaningful depends on d_tag
+ Elf_Word d_val; // integer value
+ Elf32_Addr d_ptr; // address value
+ } d_un;
+};
+
+struct Elf64_Dyn { // dynamic section entry, 64-bit class
+ Elf_Sxword d_tag; // entry kind (a DT_* value)
+ union { // which member is meaningful depends on d_tag
+ Elf_Xword d_val; // integer value
+ Elf64_Addr d_ptr; // address value
+ } d_un;
+};
+
+} // namespace ELFIO
+
+#endif // ELFTYPES_H
diff --git a/third_party/rocm/include/hip/hcc_detail/elfio/elfio.hpp b/third_party/rocm/include/hip/hcc_detail/elfio/elfio.hpp
new file mode 100644
index 0000000..6bc0418
--- /dev/null
+++ b/third_party/rocm/include/hip/hcc_detail/elfio/elfio.hpp
@@ -0,0 +1,740 @@
+/*
+Copyright (C) 2001-2015 by Serge Lamikhov-Center
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef ELFIO_HPP
+#define ELFIO_HPP
+
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4996)
+#pragma warning(disable : 4355)
+#pragma warning(disable : 4244)
+#endif
+
+#include <string>
+#include <iostream>
+#include <fstream>
+#include <algorithm>
+#include <vector>
+#include <deque>
+#include <iterator>
+#include <typeinfo>
+
+#include "elf_types.hpp"
+#include "elfio_utils.hpp"
+#include "elfio_header.hpp"
+#include "elfio_section.hpp"
+#include "elfio_segment.hpp"
+#include "elfio_strings.hpp"
+
+#define ELFIO_HEADER_ACCESS_GET(TYPE, FNAME) \
+ TYPE get_##FNAME() const { return header->get_##FNAME(); } // declares a read-only get_<FNAME>() that forwards to header; header must be non-null
+
+#define ELFIO_HEADER_ACCESS_GET_SET(TYPE, FNAME) \
+ TYPE get_##FNAME() const { return header->get_##FNAME(); } \
+ void set_##FNAME(TYPE val) { header->set_##FNAME(val); } // declares get_<FNAME>() and set_<FNAME>(), both forwarding to header; header must be non-null
+
+namespace ELFIO {
+
+//------------------------------------------------------------------------------
+class elfio {
+ public:
+ //------------------------------------------------------------------------------
+ elfio() : sections(this), segments(this) { // default-constructs a fresh ELFCLASS32 / ELFDATA2LSB image via create()
+ header = 0; // must be null before create() runs: clean() executes `delete header`
+ current_file_pos = 0;
+ create(ELFCLASS32, ELFDATA2LSB);
+ }
+
+ //------------------------------------------------------------------------------
+ ~elfio() { clean(); } // clean() deletes the header and every section and segment held in sections_ / segments_
+
+ //------------------------------------------------------------------------------
+ void create(unsigned char file_class, unsigned char encoding) { // resets this object to a fresh image
+   clean(); // drop any previously held header, sections and segments
+   convertor.setup(encoding);
+   header = create_header(file_class, encoding); // null unless file_class is ELFCLASS32 or ELFCLASS64
+   if (header != 0) create_mandatory_sections(); // that call dereferences header, so skip it for an unsupported class
+ }
+
+ //------------------------------------------------------------------------------
+ bool load(const std::string& file_name) {
+   // Opens the named file for binary reading and hands the stream to the
+   // std::istream overload; a file that cannot be opened yields false.
+   std::ifstream input(file_name.c_str(), std::ios::in | std::ios::binary);
+   if (input.fail()) {
+     return false;
+   }
+   return load(input);
+ }
+
+ //------------------------------------------------------------------------------
+ bool load(std::istream& stream) {
+   // Parses an ELF image from `stream`, replacing whatever this object held.
+   // Returns false if the identification bytes are short or lack the ELF
+   // magic, the file class is unsupported, or the header fails to load.
+   clean();
+
+   // Read the identification bytes from the very start of the stream
+   unsigned char ident[EI_NIDENT];
+   stream.seekg(0);
+   stream.read(reinterpret_cast<char*>(&ident), sizeof(ident));
+   if (stream.gcount() != sizeof(ident)) {
+     return false;
+   }
+
+   // Reject anything that does not start with the ELF magic
+   const bool magic_ok = ident[EI_MAG0] == ELFMAG0 && ident[EI_MAG1] == ELFMAG1 &&
+                         ident[EI_MAG2] == ELFMAG2 && ident[EI_MAG3] == ELFMAG3;
+   if (!magic_ok) {
+     return false;
+   }
+
+   const unsigned char file_class = ident[EI_CLASS];
+   if (file_class != ELFCLASS64 && file_class != ELFCLASS32) {
+     return false;
+   }
+
+   convertor.setup(ident[EI_DATA]);
+   header = create_header(file_class, ident[EI_DATA]);
+   if (header == 0 || !header->load(stream)) {
+     return false;
+   }
+   load_sections(stream);
+   load_segments(stream);
+   return true;
+ }
+
+ //------------------------------------------------------------------------------
+ bool save(const std::string& file_name) {
+   // Lays out all sections and segments and writes the image to file_name.
+   // Returns false if there is no header (e.g. after a failed load()), the
+   // file cannot be opened, or any layout/write stage reports failure.
+   if (header == 0) { // checked before opening so the target file is not truncated
+     return false;
+   }
+   std::ofstream f(file_name.c_str(), std::ios::out | std::ios::binary);
+   if (!f) {
+     return false;
+   }
+
+   // Define layout specific header fields
+   // The position of the segment table is fixed after the header.
+   // The position of the section table is variable and needs to be fixed
+   // before saving.
+   header->set_segments_num(segments.size());
+   header->set_segments_offset(segments.size() ? header->get_header_size() : 0);
+   header->set_sections_num(sections.size());
+   header->set_sections_offset(0);
+
+   // Layout the first section right after the segment table
+   current_file_pos = header->get_header_size() +
+                      header->get_segment_entry_size() * header->get_segments_num();
+
+   bool is_still_good = layout_segments_and_their_sections(); // later stages are skipped once one fails
+   is_still_good = is_still_good && layout_sections_without_segments();
+   is_still_good = is_still_good && layout_section_table();
+   is_still_good = is_still_good && save_header(f);
+   is_still_good = is_still_good && save_sections(f);
+   is_still_good = is_still_good && save_segments(f);
+   f.close();
+   return is_still_good;
+ }
+
+ //------------------------------------------------------------------------------
+ // ELF header access functions: each line expands to get_<name>() (plus set_<name>() for GET_SET) forwarding to header
+ ELFIO_HEADER_ACCESS_GET(unsigned char, class);
+ ELFIO_HEADER_ACCESS_GET(unsigned char, elf_version);
+ ELFIO_HEADER_ACCESS_GET(unsigned char, encoding);
+ ELFIO_HEADER_ACCESS_GET(Elf_Word, version);
+ ELFIO_HEADER_ACCESS_GET(Elf_Half, header_size);
+ ELFIO_HEADER_ACCESS_GET(Elf_Half, section_entry_size);
+ ELFIO_HEADER_ACCESS_GET(Elf_Half, segment_entry_size);
+
+ ELFIO_HEADER_ACCESS_GET_SET(unsigned char, os_abi);
+ ELFIO_HEADER_ACCESS_GET_SET(unsigned char, abi_version);
+ ELFIO_HEADER_ACCESS_GET_SET(Elf_Half, type);
+ ELFIO_HEADER_ACCESS_GET_SET(Elf_Half, machine);
+ ELFIO_HEADER_ACCESS_GET_SET(Elf_Word, flags);
+ ELFIO_HEADER_ACCESS_GET_SET(Elf64_Addr, entry); // 64-bit types are used for both file classes
+ ELFIO_HEADER_ACCESS_GET_SET(Elf64_Off, sections_offset);
+ ELFIO_HEADER_ACCESS_GET_SET(Elf64_Off, segments_offset);
+ ELFIO_HEADER_ACCESS_GET_SET(Elf_Half, section_name_str_index);
+
+ //------------------------------------------------------------------------------
+ const endianess_convertor& get_convertor() const { return convertor; } // the convertor that create() / load() configure via setup(encoding)
+
+ //------------------------------------------------------------------------------
+ Elf_Xword get_default_entry_size(Elf_Word section_type) const {
+   // Default size in bytes of one entry for the section types that hold
+   // fixed-size records (relocations, symbols, dynamic entries); 0 for
+   // every other section type.
+   const bool has_records = section_type == SHT_RELA || section_type == SHT_REL ||
+                            section_type == SHT_SYMTAB || section_type == SHT_DYNAMIC;
+   if (!has_records) {
+     return 0; // the header is not consulted on this path
+   }
+
+   // Record layouts differ between the 32- and 64-bit file classes
+   const bool is_64bit = header->get_class() == ELFCLASS64;
+   Elf_Xword entry_size = 0;
+   switch (section_type) {
+     case SHT_RELA:
+       entry_size = is_64bit ? sizeof(Elf64_Rela) : sizeof(Elf32_Rela);
+       break;
+     case SHT_REL:
+       entry_size = is_64bit ? sizeof(Elf64_Rel) : sizeof(Elf32_Rel);
+       break;
+     case SHT_SYMTAB:
+       entry_size = is_64bit ? sizeof(Elf64_Sym) : sizeof(Elf32_Sym);
+       break;
+     default: // SHT_DYNAMIC, the only record type left
+       entry_size = is_64bit ? sizeof(Elf64_Dyn) : sizeof(Elf32_Dyn);
+       break;
+   }
+
+   return entry_size;
+ }
+
+ //------------------------------------------------------------------------------
+ private:
+ //------------------------------------------------------------------------------
+ void clean() {
+   // Frees the header plus every owned section and segment, leaving the
+   // object empty. Harmless when nothing has been allocated yet.
+   delete header;
+   header = 0;
+
+   for (size_t i = 0; i < sections_.size(); ++i) {
+     delete sections_[i];
+   }
+   sections_.clear();
+
+   for (size_t i = 0; i < segments_.size(); ++i) {
+     delete segments_[i];
+   }
+   segments_.clear();
+ }
+
+ //------------------------------------------------------------------------------
+ elf_header* create_header(unsigned char file_class, unsigned char encoding) {
+   // Allocates a header object matching the requested file class and
+   // returns it to the caller, which stores it in `header` and later frees
+   // it in clean(). An unsupported class yields a null pointer.
+   switch (file_class) {
+     case ELFCLASS64:
+       return new elf_header_impl<Elf64_Ehdr>(&convertor, encoding);
+     case ELFCLASS32:
+       return new elf_header_impl<Elf32_Ehdr>(&convertor, encoding);
+     default:
+       return 0;
+   }
+ }
+
+ //------------------------------------------------------------------------------
+ section* create_section() {
+   // Allocates a section object matching the header's file class, gives it
+   // the next free index and appends it to sections_, which owns it from
+   // then on. Returns null, and changes nothing, when the file class is
+   // neither ELFCLASS64 nor ELFCLASS32.
+   const unsigned char file_class = get_class();
+   if (file_class != ELFCLASS64 && file_class != ELFCLASS32) {
+     return 0;
+   }
+
+   section* created = (file_class == ELFCLASS64)
+                          ? static_cast<section*>(new section_impl<Elf64_Shdr>(&convertor))
+                          : static_cast<section*>(new section_impl<Elf32_Shdr>(&convertor));
+   created->set_index((Elf_Half)sections_.size());
+   sections_.push_back(created);
+   return created;
+ }
+
+
+ //------------------------------------------------------------------------------
+ segment* create_segment() {
+   // Segment counterpart of create_section(): allocate, index, append to segments_
+   segment* created = 0;
+   switch (header->get_class()) {
+     case ELFCLASS64:
+       created = new segment_impl<Elf64_Phdr>(&convertor);
+       break;
+     case ELFCLASS32:
+       created = new segment_impl<Elf32_Phdr>(&convertor);
+       break;
+     default:
+       return 0; // unsupported file class: nothing is created or appended
+   }
+   created->set_index((Elf_Half)segments_.size()); // index = position it is about to take
+   segments_.push_back(created);
+   return created;
+ }
+
+ //------------------------------------------------------------------------------
+ void create_mandatory_sections() { // creates the null section (index 0) and the ".shstrtab" string section
+ // Create null section without calling to 'add_section' as no string
+ // section containing section names exists yet
+ section* sec0 = create_section(); // NOTE(review): create_section() returns null for an unsupported class; not checked here
+ sec0->set_index(0);
+ sec0->set_name("");
+ sec0->set_name_string_offset(0);
+
+ set_section_name_str_index(1); // set before adding; presumably sections.add() gives the new section index 1 - confirm
+ section* shstrtab = sections.add(".shstrtab");
+ shstrtab->set_type(SHT_STRTAB);
+ shstrtab->set_addr_align(1);
+ }
+
+ //------------------------------------------------------------------------------
+ Elf_Half load_sections(std::istream& stream) {
+   // Loads every section header named by the ELF header, then resolves section names; returns the count
+   Elf_Half entry_size = header->get_section_entry_size();
+   Elf_Half num = header->get_sections_num();
+   Elf64_Off offset = header->get_sections_offset();
+
+   for (Elf_Half i = 0; i < num; ++i) {
+     section* sec = create_section();
+     // Widen before multiplying: both factors come from the file and would otherwise be multiplied as int
+     sec->load(stream, (std::streamoff)offset + (std::streamoff)i * entry_size);
+     sec->set_index(i);
+     // To mark that the section is not permitted to reassign address
+     // during layout calculation
+     sec->set_address(sec->get_address());
+   }
+
+   Elf_Half shstrndx = get_section_name_str_index();
+   if (SHN_UNDEF != shstrndx) { // SHN_UNDEF means the file has no section-name string table
+     string_section_accessor str_reader(sections[shstrndx]);
+     for (Elf_Half i = 0; i < num; ++i) {
+       Elf_Word name_offset = sections[i]->get_name_string_offset();
+       const char* p = str_reader.get_string(name_offset);
+       if (p != 0) {
+         sections[i]->set_name(p);
+       }
+     }
+   }
+   return num;
+ }
+
+ //------------------------------------------------------------------------------
+ bool load_segments(std::istream& stream) {
+   // Loads every program header named by the ELF header and records, for
+   // each segment, which of the already-loaded sections lie inside it.
+   Elf_Half entry_size = header->get_segment_entry_size();
+   Elf_Half num = header->get_segments_num();
+   Elf64_Off offset = header->get_segments_offset();
+
+   for (Elf_Half i = 0; i < num; ++i) {
+     segment* seg;
+     unsigned char file_class = header->get_class();
+     if (file_class == ELFCLASS64) {
+       seg = new segment_impl<Elf64_Phdr>(&convertor);
+     } else if (file_class == ELFCLASS32) {
+       seg = new segment_impl<Elf32_Phdr>(&convertor);
+     } else {
+       return false;
+     }
+
+     // Widen before multiplying: both factors come from the file and would otherwise be multiplied as int
+     seg->load(stream, (std::streamoff)offset + (std::streamoff)i * entry_size);
+     seg->set_index(i);
+
+     // Add sections to the segments (similar to readelfs algorithm)
+     Elf64_Off segBaseOffset = seg->get_offset();
+     Elf64_Off segEndOffset = segBaseOffset + seg->get_file_size();
+     Elf64_Off segVBaseAddr = seg->get_virtual_address();
+     Elf64_Off segVEndAddr = segVBaseAddr + seg->get_memory_size();
+     for (Elf_Half j = 0; j < sections.size(); ++j) {
+       const section* psec = sections[j];
+       // SHF_ALLOC sections are matched based on the virtual address
+       // otherwise the file offset is matched
+       if (psec->get_flags() & SHF_ALLOC
+               ? (segVBaseAddr <= psec->get_address() &&
+                  psec->get_address() + psec->get_size() <= segVEndAddr)
+               : (segBaseOffset <= psec->get_offset() &&
+                  psec->get_offset() + psec->get_size() <= segEndOffset)) {
+         seg->add_section_index(psec->get_index(), psec->get_addr_align());
+       }
+     }
+
+     // Add section into the segments' container
+     segments_.push_back(seg);
+   }
+   return true;
+ }
+
+ //------------------------------------------------------------------------------
+ bool save_header(std::ofstream& f) { return header->save(f); } // forwards to the header object and returns its result; header must be non-null
+
+ //------------------------------------------------------------------------------
+ bool save_sections(std::ofstream& f) {
+   // Saves every section, passing the position of its entry in the section
+   // header table together with the section's own file offset. Always true.
+   for (std::vector<section*>::iterator it = sections_.begin(); it != sections_.end(); ++it) {
+     section* current = *it;
+     std::streampos table_slot = (std::streamoff)header->get_sections_offset() +
+                                 header->get_section_entry_size() * current->get_index();
+     current->save(f, table_slot, current->get_offset());
+   }
+   return true;
+ }
+
+ //------------------------------------------------------------------------------
+ bool save_segments(std::ofstream& f) {
+   // Saves every segment, passing the position of its entry in the program
+   // header table together with the segment's own file offset. Always true.
+   for (size_t i = 0; i != segments_.size(); ++i) {
+     segment* current = segments_[i];
+     std::streampos table_slot =
+         header->get_segments_offset() + header->get_segment_entry_size() * current->get_index();
+     current->save(f, table_slot, current->get_offset());
+   }
+   return true;
+ }
+
+ //------------------------------------------------------------------------------
+ bool is_section_without_segment(unsigned int section_index) {
+   // True when no segment lists section_index among its sections
+   for (unsigned int j = 0; j < segments.size(); ++j) {
+     for (unsigned int k = 0; k < segments[j]->get_sections_num(); ++k) {
+       if (segments[j]->get_section_index_at(k) == section_index) {
+         return false;
+       }
+     }
+   }
+   return true;
+ }
+
+ //------------------------------------------------------------------------------
+ bool is_subsequence_of(segment* seg1, segment* seg2) {
+   // Return 'true' if seg1 has fewer sections than seg2 and every one of
+   // them also appears in seg2. std::includes requires sorted ranges, so
+   // this relies on segments keeping their section indices ordered.
+   const std::vector<Elf_Half>& inner = seg1->get_sections();
+   const std::vector<Elf_Half>& outer = seg2->get_sections();
+
+   if (inner.size() >= outer.size()) {
+     return false; // an equally large or larger list never counts
+   }
+
+   return std::includes(outer.begin(), outer.end(), inner.begin(), inner.end());
+ }
+
+ //------------------------------------------------------------------------------
+ std::vector<segment*> get_ordered_segments() { // returns segments_ reordered so a segment comes after every segment that is_subsequence_of() says contains it
+ std::vector<segment*> res;
+ std::deque<segment*> worklist;
+
+ res.reserve(segments.size());
+ std::copy(segments_.begin(), segments_.end(), std::back_inserter(worklist));
+
+ // Bring the segments which start at address 0 to the front
+ size_t nextSlot = 0; // NOTE(review): if worklist[0] itself has offset 0, `i != nextSlot` skips it without advancing nextSlot, so a later offset-0 segment is swapped in front of it - confirm intended
+ for (size_t i = 0; i < worklist.size(); ++i) {
+ if (i != nextSlot && worklist[i]->is_offset_initialized() &&
+ worklist[i]->get_offset() == 0) {
+ std::swap(worklist[i], worklist[nextSlot]);
+ ++nextSlot;
+ }
+ }
+
+ while (!worklist.empty()) { // repeatedly take the front segment and either emit it or defer it
+ segment* seg = worklist.front();
+ worklist.pop_front();
+
+ size_t i = 0;
+ for (; i < worklist.size(); ++i) { // look for a still-pending segment that contains seg's sections
+ if (is_subsequence_of(seg, worklist[i])) {
+ break;
+ }
+ }
+
+ if (i < worklist.size()) // a containing segment is still pending: retry seg later
+ worklist.push_back(seg);
+ else
+ res.push_back(seg);
+ }
+
+ return res;
+ }
+
+
+ //------------------------------------------------------------------------------
+ bool layout_sections_without_segments() {
+   // Assigns file offsets to the sections that no segment refers to, placing
+   // them at current_file_pos in index order. Always returns true.
+   for (unsigned int i = 0; i < sections_.size(); ++i) {
+     if (!is_section_without_segment(i)) {
+       continue; // sections owned by a segment are laid out with that segment
+     }
+     section* sec = sections_[i];
+     const Elf_Xword alignment = sec->get_addr_align();
+     const Elf_Xword misfit = alignment > 1 ? current_file_pos % alignment : 0;
+     if (misfit != 0) {
+       current_file_pos += alignment - misfit; // round up to the next aligned position
+     }
+     if (sec->get_index() != 0) sec->set_offset(current_file_pos);
+     if (SHT_NOBITS != sec->get_type() && SHT_NULL != sec->get_type()) {
+       current_file_pos += sec->get_size(); // NOBITS / NULL sections take no file space
+     }
+   }
+   return true;
+ }
+
+
+ //------------------------------------------------------------------------------
+ bool layout_segments_and_their_sections() {
+   // Assigns file offsets (and missing addresses) to every section that
+   // belongs to a segment, then records each segment's offset, file size
+   // and memory size. Advances current_file_pos; always returns true.
+   std::vector<segment*> worklist;
+   std::vector<bool> section_generated(sections.size(), false);
+
+   // Get segments in a order in where segments which contain a
+   // sub sequence of other segments are located at the end
+   worklist = get_ordered_segments();
+
+   for (unsigned int i = 0; i < worklist.size(); ++i) {
+     Elf_Xword segment_memory = 0;
+     Elf_Xword segment_filesize = 0;
+     Elf_Xword seg_start_pos = current_file_pos;
+     segment* seg = worklist[i];
+
+     // Special case: PHDR segment
+     // This segment contains the program headers but no sections
+     if (seg->get_type() == PT_PHDR && seg->get_sections_num() == 0) {
+       seg_start_pos = header->get_segments_offset();
+       segment_memory = segment_filesize =
+           header->get_segment_entry_size() * header->get_segments_num();
+     }
+     // Special case:
+     // Segments which start with the NULL section and have further sections
+     else if (seg->get_sections_num() > 1 &&
+              sections[seg->get_section_index_at(0)]->get_type() == SHT_NULL) {
+       seg_start_pos = 0;
+       segment_memory = segment_filesize = current_file_pos;
+     }
+     // New segments with not generated sections
+     // have to be aligned
+     else if (seg->get_sections_num() && !section_generated[seg->get_section_index_at(0)]) {
+       // p_align values of 0 and 1 both mean "no alignment" in the ELF
+       // specification; treat 0 as 1 so the modulo never divides by zero
+       Elf_Xword seg_align = seg->get_align() ? seg->get_align() : 1;
+       Elf64_Off cur_page_alignment = current_file_pos % seg_align;
+       Elf64_Off req_page_alignment = seg->get_virtual_address() % seg_align;
+       Elf64_Off error = req_page_alignment - cur_page_alignment;
+       current_file_pos += (seg_align + error) % seg_align;
+       seg_start_pos = current_file_pos;
+     } else if (seg->get_sections_num()) {
+       seg_start_pos = sections[seg->get_section_index_at(0)]->get_offset();
+     }
+
+     // Write segment's data
+     for (unsigned int j = 0; j < seg->get_sections_num(); ++j) {
+       Elf_Half index = seg->get_section_index_at(j);
+       section* sec = sections[index];
+
+       // The NULL section is always generated
+       if (SHT_NULL == sec->get_type()) {
+         section_generated[index] = true;
+         continue;
+       }
+
+       Elf_Xword secAlign = 0;
+       // Fix up the alignment
+       if (!section_generated[index] && sec->is_address_initialized() &&
+           SHT_NOBITS != sec->get_type() && SHT_NULL != sec->get_type()) {
+         // Align the sections based on the virtual addresses
+         // when possible (this is what matters for execution)
+         Elf64_Off req_offset = sec->get_address() - seg->get_virtual_address();
+         Elf64_Off cur_offset = current_file_pos - seg_start_pos;
+         secAlign = req_offset - cur_offset;
+       } else if (!section_generated[index]) {
+         // If no address has been specified then only the section
+         // alignment constraint has to be matched
+         Elf_Xword align = sec->get_addr_align();
+         if (align == 0) {
+           align = 1;
+         }
+         Elf64_Off error = current_file_pos % align;
+         secAlign = (align - error) % align;
+       } else {
+         // Alignment for already generated sections
+         secAlign = sec->get_offset() - seg_start_pos - segment_filesize;
+       }
+
+       // Determine the segment file and memory sizes
+       // Special case .tbss section (NOBITS) in non TLS segment
+       if ((sec->get_flags() & SHF_ALLOC) &&
+           !((sec->get_flags() & SHF_TLS) && (seg->get_type() != PT_TLS) &&
+             (SHT_NOBITS == sec->get_type())))
+         segment_memory += sec->get_size() + secAlign;
+       if (SHT_NOBITS != sec->get_type() && SHT_NULL != sec->get_type())
+         segment_filesize += sec->get_size() + secAlign;
+
+       // Nothing to be done when generating nested segments
+       if (section_generated[index]) {
+         continue;
+       }
+
+       current_file_pos += secAlign;
+       // Set the section addresses when missing
+       if (!sec->is_address_initialized())
+         sec->set_address(seg->get_virtual_address() + current_file_pos - seg_start_pos);
+       if (0 != sec->get_index()) sec->set_offset(current_file_pos);
+
+       if (SHT_NOBITS != sec->get_type() && SHT_NULL != sec->get_type())
+         current_file_pos += sec->get_size();
+       section_generated[index] = true;
+     }
+
+     seg->set_file_size(segment_filesize);
+     seg->set_memory_size(segment_memory);
+     seg->set_offset(seg_start_pos);
+   }
+
+   return true;
+ }
+
+ //------------------------------------------------------------------------------
+ bool layout_section_table() {
+     // Place the section header table at the 4-byte aligned end of the image.
+     const Elf64_Off misalignment = current_file_pos % 4;
+     current_file_pos += (misalignment == 0) ? 0 : 4 - misalignment;
+     header->set_sections_offset(current_file_pos);
+     return true;
+ }
+
+
+ //------------------------------------------------------------------------------
+ public:
+ friend class Sections;
+ class Sections {
+   public:
+     //------------------------------------------------------------------------------
+     // Binds the accessor to the elfio object whose sections it exposes.
+     Sections(elfio* parent_) : parent(parent_) {}
+
+     //------------------------------------------------------------------------------
+     // Number of sections, converted to Elf_Half.
+     Elf_Half size() const { return static_cast<Elf_Half>(parent->sections_.size()); }
+
+     //------------------------------------------------------------------------------
+     // Section at position `index`, or a null pointer when `index` is out of range.
+     section* operator[](unsigned int index) const {
+         if (index >= parent->sections_.size()) {
+             return 0;
+         }
+         return parent->sections_[index];
+     }
+
+     //------------------------------------------------------------------------------
+     // First section whose name equals `name`, or a null pointer if there is none.
+     section* operator[](const std::string& name) const {
+         typedef std::vector<section*>::const_iterator section_iter;
+         const section_iter last = parent->sections_.end();
+         for (section_iter cur = parent->sections_.begin(); cur != last; ++cur) {
+             if ((*cur)->get_name() == name) {
+                 return *cur;
+             }
+         }
+         return 0;
+     }
+
+     //------------------------------------------------------------------------------
+     // Creates a section called `name` and appends the name to the section that
+     // parent->get_section_name_str_index() designates as the name string table.
+     section* add(const std::string& name) {
+         section* created = parent->create_section();
+         created->set_name(name);
+
+         const Elf_Half table_index = parent->get_section_name_str_index();
+         section* name_table = parent->sections_[table_index];
+         string_section_accessor name_writer(name_table);
+         const Elf_Word name_offset = name_writer.add_string(name);
+         created->set_name_string_offset(name_offset);
+         return created;
+     }
+
+     //------------------------------------------------------------------------------
+     // Iteration over the parent's section vector: first element ...
+     std::vector<section*>::iterator begin() { return parent->sections_.begin(); }
+
+     //------------------------------------------------------------------------------
+     std::vector<section*>::iterator end() { return parent->sections_.end(); }  // ... and past-the-end
+
+     //------------------------------------------------------------------------------
+   private:
+     elfio* parent;  // the elfio object this accessor reads from and adds to
+ } sections;
+
+ //------------------------------------------------------------------------------
+ public:
+ friend class Segments;
+ class Segments {
+   public:
+     // Binds the accessor to the elfio object whose segments it exposes.
+     Segments(elfio* parent_) : parent(parent_) {}
+
+     // Number of segments, converted to Elf_Half.
+     Elf_Half size() const { return (Elf_Half)parent->segments_.size(); }
+
+     // Segment at position `index`, or a null pointer when `index` is out of range.
+     // The range check mirrors Sections::operator[]; an unchecked vector access
+     // with a bad index would be undefined behavior.
+     segment* operator[](unsigned int index) const {
+         return index < parent->segments_.size() ? parent->segments_[index] : 0;
+     }
+
+     // Creates a new segment through the parent and returns it.
+     segment* add() { return parent->create_segment(); }
+
+     // Iterators over the parent's segment vector.
+     std::vector<segment*>::iterator begin() { return parent->segments_.begin(); }
+     std::vector<segment*>::iterator end() { return parent->segments_.end(); }
+
+   private:
+     elfio* parent;  // the elfio object this accessor reads from and adds to
+ } segments;
+
+ //------------------------------------------------------------------------------
+ private:
+ elf_header* header;
+ std::vector<section*> sections_;
+ std::vector<segment*> segments_;
+ endianess_convertor convertor;
+
+ Elf_Xword current_file_pos;
+};
+
+} // namespace ELFIO
+
+#include "elfio_symbols.hpp"
+#include "elfio_note.hpp"
+#include "elfio_relocation.hpp"
+#include "elfio_dynamic.hpp"
+
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+#endif // ELFIO_HPP
diff --git a/third_party/rocm/include/hip/hcc_detail/elfio/elfio_dump.hpp b/third_party/rocm/include/hip/hcc_detail/elfio/elfio_dump.hpp
new file mode 100644
index 0000000..c40a010
--- /dev/null
+++ b/third_party/rocm/include/hip/hcc_detail/elfio/elfio_dump.hpp
@@ -0,0 +1,825 @@
+/*
+Copyright (C) 2001-2015 by Serge Lamikhov-Center
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef ELFIO_DUMP_HPP
+#define ELFIO_DUMP_HPP
+
+#include <algorithm>
+#include <string>
+#include <ostream>
+#include <sstream>
+#include <iomanip>
+#include "elfio.hpp"
+
+namespace ELFIO {
+
+// Display names for ELF class values; searched by dump::str_class().
+static struct class_table_t {
+ const char key;
+ const char* str;
+} class_table[] = {
+ {ELFCLASS32, "ELF32"},
+ {ELFCLASS64, "ELF64"},
+};
+
+// Display names for ELF data-encoding values; searched by dump::str_endian().
+static struct endian_table_t {
+ const char key;
+ const char* str;
+} endian_table[] = {
+ {ELFDATANONE, "None"},
+ {ELFDATA2LSB, "Little endian"},
+ {ELFDATA2MSB, "Big endian"},
+};
+
+// Display names for ELF version values; searched by dump::str_version().
+static struct version_table_t {
+ const Elf64_Word key;
+ const char* str;
+} version_table[] = {
+ {EV_NONE, "None"},
+ {EV_CURRENT, "Current"},
+};
+
+// Display names for ELF object file types; searched by dump::str_type().
+static struct type_table_t {
+ const Elf32_Half key;
+ const char* str;
+} type_table[] = {
+ {ET_NONE, "No file type"}, {ET_REL, "Relocatable file"}, {ET_EXEC, "Executable file"},
+ {ET_DYN, "Shared object file"}, {ET_CORE, "Core file"},
+};
+
+// Display names for ELF machine (architecture) codes; searched by dump::str_machine().
+static struct machine_table_t {
+ const Elf64_Half key;
+ const char* str;
+} machine_table[] = {
+ {EM_NONE, "No machine"},
+ {EM_M32, "AT&T WE 32100"},
+ {EM_SPARC, "SUN SPARC"},
+ {EM_386, "Intel 80386"},
+ {EM_68K, "Motorola m68k family"},
+ {EM_88K, "Motorola m88k family"},
+ {EM_486, "Intel 80486// Reserved for future use"},
+ {EM_860, "Intel 80860"},
+ {EM_MIPS, "MIPS R3000 (officially, big-endian only)"},
+ {EM_S370, "IBM System/370"},
+ {EM_MIPS_RS3_LE, "MIPS R3000 little-endian (Oct 4 1999 Draft) Deprecated"},
+ {EM_res011, "Reserved"},
+ {EM_res012, "Reserved"},
+ {EM_res013, "Reserved"},
+ {EM_res014, "Reserved"},
+ {EM_PARISC, "HPPA"},
+ {EM_res016, "Reserved"},
+ {EM_VPP550, "Fujitsu VPP500"},
+ {EM_SPARC32PLUS, "Sun's v8plus"},
+ {EM_960, "Intel 80960"},
+ {EM_PPC, "PowerPC"},
+ {EM_PPC64, "64-bit PowerPC"},
+ {EM_S390, "IBM S/390"},
+ {EM_SPU, "Sony/Toshiba/IBM SPU"},
+ {EM_res024, "Reserved"},
+ {EM_res025, "Reserved"},
+ {EM_res026, "Reserved"},
+ {EM_res027, "Reserved"},
+ {EM_res028, "Reserved"},
+ {EM_res029, "Reserved"},
+ {EM_res030, "Reserved"},
+ {EM_res031, "Reserved"},
+ {EM_res032, "Reserved"},
+ {EM_res033, "Reserved"},
+ {EM_res034, "Reserved"},
+ {EM_res035, "Reserved"},
+ {EM_V800, "NEC V800 series"},
+ {EM_FR20, "Fujitsu FR20"},
+ {EM_RH32, "TRW RH32"},
+ {EM_MCORE, "Motorola M*Core // May also be taken by Fujitsu MMA"},
+ {EM_RCE, "Old name for MCore"},
+ {EM_ARM, "ARM"},
+ {EM_OLD_ALPHA, "Digital Alpha"},
+ {EM_SH, "Renesas (formerly Hitachi) / SuperH SH"},
+ {EM_SPARCV9, "SPARC v9 64-bit"},
+ {EM_TRICORE, "Siemens Tricore embedded processor"},
+ {EM_ARC, "ARC Cores"},
+ {EM_H8_300, "Renesas (formerly Hitachi) H8/300"},
+ {EM_H8_300H, "Renesas (formerly Hitachi) H8/300H"},
+ {EM_H8S, "Renesas (formerly Hitachi) H8S"},
+ {EM_H8_500, "Renesas (formerly Hitachi) H8/500"},
+ {EM_IA_64, "Intel IA-64 Processor"},
+ {EM_MIPS_X, "Stanford MIPS-X"},
+ {EM_COLDFIRE, "Motorola Coldfire"},
+ {EM_68HC12, "Motorola M68HC12"},
+ {EM_MMA, "Fujitsu Multimedia Accelerator"},
+ {EM_PCP, "Siemens PCP"},
+ {EM_NCPU, "Sony nCPU embedded RISC processor"},
+ {EM_NDR1, "Denso NDR1 microprocessor"},
+ {EM_STARCORE, "Motorola Star*Core processor"},
+ {EM_ME16, "Toyota ME16 processor"},
+ {EM_ST100, "STMicroelectronics ST100 processor"},
+ {EM_TINYJ, "Advanced Logic Corp. TinyJ embedded processor"},
+ {EM_X86_64, "Advanced Micro Devices X86-64 processor"},
+ {EM_PDSP, "Sony DSP Processor"},
+ {EM_PDP10, "Digital Equipment Corp. PDP-10"},
+ {EM_PDP11, "Digital Equipment Corp. PDP-11"},
+ {EM_FX66, "Siemens FX66 microcontroller"},
+ {EM_ST9PLUS, "STMicroelectronics ST9+ 8/16 bit microcontroller"},
+ {EM_ST7, "STMicroelectronics ST7 8-bit microcontroller"},
+ {EM_68HC16, "Motorola MC68HC16 Microcontroller"},
+ {EM_68HC11, "Motorola MC68HC11 Microcontroller"},
+ {EM_68HC08, "Motorola MC68HC08 Microcontroller"},
+ {EM_68HC05, "Motorola MC68HC05 Microcontroller"},
+ {EM_SVX, "Silicon Graphics SVx"},
+ {EM_ST19, "STMicroelectronics ST19 8-bit cpu"},
+ {EM_VAX, "Digital VAX"},
+ {EM_CRIS, "Axis Communications 32-bit embedded processor"},
+ {EM_JAVELIN, "Infineon Technologies 32-bit embedded cpu"},
+ {EM_FIREPATH, "Element 14 64-bit DSP processor"},
+ {EM_ZSP, "LSI Logic's 16-bit DSP processor"},
+ {EM_MMIX, "Donald Knuth's educational 64-bit processor"},
+ {EM_HUANY, "Harvard's machine-independent format"},
+ {EM_PRISM, "SiTera Prism"},
+ {EM_AVR, "Atmel AVR 8-bit microcontroller"},
+ {EM_FR30, "Fujitsu FR30"},
+ {EM_D10V, "Mitsubishi D10V"},
+ {EM_D30V, "Mitsubishi D30V"},
+ {EM_V850, "NEC v850"},
+ {EM_M32R, "Renesas M32R (formerly Mitsubishi M32R)"},
+ {EM_MN10300, "Matsushita MN10300"},
+ {EM_MN10200, "Matsushita MN10200"},
+ {EM_PJ, "picoJava"},
+ {EM_OPENRISC, "OpenRISC 32-bit embedded processor"},
+ {EM_ARC_A5, "ARC Cores Tangent-A5"},
+ {EM_XTENSA, "Tensilica Xtensa Architecture"},
+ {EM_VIDEOCORE, "Alphamosaic VideoCore processor"},
+ {EM_TMM_GPP, "Thompson Multimedia General Purpose Processor"},
+ {EM_NS32K, "National Semiconductor 32000 series"},
+ {EM_TPC, "Tenor Network TPC processor"},
+ {EM_SNP1K, "Trebia SNP 1000 processor"},
+ {EM_ST200, "STMicroelectronics ST200 microcontroller"},
+ {EM_IP2K, "Ubicom IP2022 micro controller"},
+ {EM_MAX, "MAX Processor"},
+ {EM_CR, "National Semiconductor CompactRISC"},
+ {EM_F2MC16, "Fujitsu F2MC16"},
+ {EM_MSP430, "TI msp430 micro controller"},
+ {EM_BLACKFIN, "ADI Blackfin"},
+ {EM_SE_C33, "S1C33 Family of Seiko Epson processors"},
+ {EM_SEP, "Sharp embedded microprocessor"},
+ {EM_ARCA, "Arca RISC Microprocessor"},
+ {EM_UNICORE, "Microprocessor series from PKU-Unity Ltd. and MPRC of Peking University"},
+ {EM_EXCESS, "eXcess: 16/32/64-bit configurable embedded CPU"},
+ {EM_DXP, "Icera Semiconductor Inc. Deep Execution Processor"},
+ {EM_ALTERA_NIOS2, "Altera Nios II soft-core processor"},
+ {EM_CRX, "National Semiconductor CRX"},
+ {EM_XGATE, "Motorola XGATE embedded processor"},
+ {EM_C166, "Infineon C16x/XC16x processor"},
+ {EM_M16C, "Renesas M16C series microprocessors"},
+ {EM_DSPIC30F, "Microchip Technology dsPIC30F Digital Signal Controller"},
+ {EM_CE, "Freescale Communication Engine RISC core"},
+ {EM_M32C, "Renesas M32C series microprocessors"},
+ {EM_res121, "Reserved"},
+ {EM_res122, "Reserved"},
+ {EM_res123, "Reserved"},
+ {EM_res124, "Reserved"},
+ {EM_res125, "Reserved"},
+ {EM_res126, "Reserved"},
+ {EM_res127, "Reserved"},
+ {EM_res128, "Reserved"},
+ {EM_res129, "Reserved"},
+ {EM_res130, "Reserved"},
+ {EM_TSK3000, "Altium TSK3000 core"},
+ {EM_RS08, "Freescale RS08 embedded processor"},
+ {EM_res133, "Reserved"},
+ {EM_ECOG2, "Cyan Technology eCOG2 microprocessor"},
+ {EM_SCORE, "Sunplus Score"},
+ {EM_SCORE7, "Sunplus S+core7 RISC processor"},
+ {EM_DSP24, "New Japan Radio (NJR) 24-bit DSP Processor"},
+ {EM_VIDEOCORE3, "Broadcom VideoCore III processor"},
+ {EM_LATTICEMICO32, "RISC processor for Lattice FPGA architecture"},
+ {EM_SE_C17, "Seiko Epson C17 family"},
+ {EM_TI_C6000, "Texas Instruments TMS320C6000 DSP family"},
+ {EM_TI_C2000, "Texas Instruments TMS320C2000 DSP family"},
+ {EM_TI_C5500, "Texas Instruments TMS320C55x DSP family"},
+ {EM_res143, "Reserved"},
+ {EM_res144, "Reserved"},
+ {EM_res145, "Reserved"},
+ {EM_res146, "Reserved"},
+ {EM_res147, "Reserved"},
+ {EM_res148, "Reserved"},
+ {EM_res149, "Reserved"},
+ {EM_res150, "Reserved"},
+ {EM_res151, "Reserved"},
+ {EM_res152, "Reserved"},
+ {EM_res153, "Reserved"},
+ {EM_res154, "Reserved"},
+ {EM_res155, "Reserved"},
+ {EM_res156, "Reserved"},
+ {EM_res157, "Reserved"},
+ {EM_res158, "Reserved"},
+ {EM_res159, "Reserved"},
+ {EM_MMDSP_PLUS, "STMicroelectronics 64bit VLIW Data Signal Processor"},
+ {EM_CYPRESS_M8C, "Cypress M8C microprocessor"},
+ {EM_R32C, "Renesas R32C series microprocessors"},
+ {EM_TRIMEDIA, "NXP Semiconductors TriMedia architecture family"},
+ {EM_QDSP6, "QUALCOMM DSP6 Processor"},
+ {EM_8051, "Intel 8051 and variants"},
+ {EM_STXP7X, "STMicroelectronics STxP7x family"},
+ {EM_NDS32, "Andes Technology compact code size embedded RISC processor family"},
+ {EM_ECOG1, "Cyan Technology eCOG1X family"},
+ {EM_ECOG1X, "Cyan Technology eCOG1X family"},
+ {EM_MAXQ30, "Dallas Semiconductor MAXQ30 Core Micro-controllers"},
+ {EM_XIMO16, "New Japan Radio (NJR) 16-bit DSP Processor"},
+ {EM_MANIK, "M2000 Reconfigurable RISC Microprocessor"},
+ {EM_CRAYNV2, "Cray Inc. NV2 vector architecture"},
+ {EM_RX, "Renesas RX family"},
+ {EM_METAG, "Imagination Technologies META processor architecture"},
+ {EM_MCST_ELBRUS, "MCST Elbrus general purpose hardware architecture"},
+ {EM_ECOG16, "Cyan Technology eCOG16 family"},
+ {EM_CR16, "National Semiconductor CompactRISC 16-bit processor"},
+ {EM_ETPU, "Freescale Extended Time Processing Unit"},
+ {EM_SLE9X, "Infineon Technologies SLE9X core"},
+ {EM_L1OM, "Intel L1OM"},
+ {EM_INTEL181, "Reserved by Intel"},
+ {EM_INTEL182, "Reserved by Intel"},
+ {EM_res183, "Reserved by ARM"},
+ {EM_res184, "Reserved by ARM"},
+ {EM_AVR32, "Atmel Corporation 32-bit microprocessor family"},
+ {EM_STM8, "STMicroelectronics STM8 8-bit microcontroller"},
+ {EM_TILE64, "Tilera TILE64 multicore architecture family"},
+ {EM_TILEPRO, "Tilera TILEPro multicore architecture family"},
+ {EM_MICROBLAZE, "Xilinx MicroBlaze 32-bit RISC soft processor core"},
+ {EM_CUDA, "NVIDIA CUDA architecture "},
+};
+
+// Display names for section header types; searched by dump::str_section_type().
+static struct section_type_table_t {
+ const Elf64_Half key;
+ const char* str;
+} section_type_table[] = {
+ {SHT_NULL, "NULL"},
+ {SHT_PROGBITS, "PROGBITS"},
+ {SHT_SYMTAB, "SYMTAB"},
+ {SHT_STRTAB, "STRTAB"},
+ {SHT_RELA, "RELA"},
+ {SHT_HASH, "HASH"},
+ {SHT_DYNAMIC, "DYNAMIC"},
+ {SHT_NOTE, "NOTE"},
+ {SHT_NOBITS, "NOBITS"},
+ {SHT_REL, "REL"},
+ {SHT_SHLIB, "SHLIB"},
+ {SHT_DYNSYM, "DYNSYM"},
+ {SHT_INIT_ARRAY, "INIT_ARRAY"},
+ {SHT_FINI_ARRAY, "FINI_ARRAY"},
+ {SHT_PREINIT_ARRAY, "PREINIT_ARRAY"},
+ {SHT_GROUP, "GROUP"},
+ {SHT_SYMTAB_SHNDX, "SYMTAB_SHNDX "},
+};
+
+// Display names for program header (segment) types; searched by dump::str_segment_type().
+static struct segment_type_table_t {
+ const Elf_Word key;
+ const char* str;
+} segment_type_table[] = {
+ {PT_NULL, "NULL"}, {PT_LOAD, "LOAD"}, {PT_DYNAMIC, "DYNAMIC"}, {PT_INTERP, "INTERP"},
+ {PT_NOTE, "NOTE"}, {PT_SHLIB, "SHLIB"}, {PT_PHDR, "PHDR"}, {PT_TLS, "TLS"},
+};
+
+// Letter strings for segment flag values 0-7 (combinations of X, W, R); searched by dump::str_segment_flag().
+static struct segment_flag_table_t {
+ const Elf_Word key;
+ const char* str;
+} segment_flag_table[] = {
+ {0, ""}, {1, "X"}, {2, "W"}, {3, "WX"}, {4, "R"}, {5, "RX"}, {6, "RW"}, {7, "RWX"},
+};
+
+// Display names for symbol binding values; searched by dump::str_symbol_bind().
+static struct symbol_bind_t {
+ const Elf_Word key;
+ const char* str;
+} symbol_bind_table[] = {
+ {STB_LOCAL, "LOCAL"}, {STB_GLOBAL, "GLOBAL"}, {STB_WEAK, "WEAK"},
+ {STB_LOOS, "LOOS"}, {STB_HIOS, "HIOS"}, {STB_MULTIDEF, "MULTIDEF"},
+ {STB_LOPROC, "LOPROC"}, {STB_HIPROC, "HIPROC"},
+};
+
+// Display names for symbol type values; searched by dump::str_symbol_type().
+static struct symbol_type_t {
+ const Elf_Word key;
+ const char* str;
+} symbol_type_table[] = {
+ {STT_NOTYPE, "NOTYPE"}, {STT_OBJECT, "OBJECT"}, {STT_FUNC, "FUNC"},
+ {STT_SECTION, "SECTION"}, {STT_FILE, "FILE"}, {STT_COMMON, "COMMON"},
+ {STT_TLS, "TLS"}, {STT_LOOS, "LOOS"}, {STT_HIOS, "HIOS"},
+ {STT_LOPROC, "LOPROC"}, {STT_HIPROC, "HIPROC"},
+};
+
+// Display names for dynamic section tags; searched by dump::str_dynamic_tag().
+static struct dynamic_tag_t {
+ const Elf_Word key;
+ const char* str;
+} dynamic_tag_table[] = {
+ {DT_NULL, "NULL"},
+ {DT_NEEDED, "NEEDED"},
+ {DT_PLTRELSZ, "PLTRELSZ"},
+ {DT_PLTGOT, "PLTGOT"},
+ {DT_HASH, "HASH"},
+ {DT_STRTAB, "STRTAB"},
+ {DT_SYMTAB, "SYMTAB"},
+ {DT_RELA, "RELA"},
+ {DT_RELASZ, "RELASZ"},
+ {DT_RELAENT, "RELAENT"},
+ {DT_STRSZ, "STRSZ"},
+ {DT_SYMENT, "SYMENT"},
+ {DT_INIT, "INIT"},
+ {DT_FINI, "FINI"},
+ {DT_SONAME, "SONAME"},
+ {DT_RPATH, "RPATH"},
+ {DT_SYMBOLIC, "SYMBOLIC"},
+ {DT_REL, "REL"},
+ {DT_RELSZ, "RELSZ"},
+ {DT_RELENT, "RELENT"},
+ {DT_PLTREL, "PLTREL"},
+ {DT_DEBUG, "DEBUG"},
+ {DT_TEXTREL, "TEXTREL"},
+ {DT_JMPREL, "JMPREL"},
+ {DT_BIND_NOW, "BIND_NOW"},
+ {DT_INIT_ARRAY, "INIT_ARRAY"},
+ {DT_FINI_ARRAY, "FINI_ARRAY"},
+ {DT_INIT_ARRAYSZ, "INIT_ARRAYSZ"},
+ {DT_FINI_ARRAYSZ, "FINI_ARRAYSZ"},
+ {DT_RUNPATH, "RUNPATH"},
+ {DT_FLAGS, "FLAGS"},
+ {DT_ENCODING, "ENCODING"},
+ {DT_PREINIT_ARRAY, "PREINIT_ARRAY"},
+ {DT_PREINIT_ARRAYSZ, "PREINIT_ARRAYSZ"},
+ {DT_MAXPOSTAGS, "MAXPOSTAGS"},
+};
+// Upper bound on the number of bytes dump::section_data() and dump::segment_data() print.
+static const ELFIO::Elf_Xword MAX_DATA_ENTRIES = 64;
+
+//------------------------------------------------------------------------------
+class dump {
+#define DUMP_DEC_FORMAT(width) std::setw(width) << std::setfill(' ') << std::dec << std::right  // decimal, right-aligned, space-padded
+#define DUMP_HEX_FORMAT(width) std::setw(width) << std::setfill('0') << std::hex << std::right  // hex, right-aligned, zero-padded
+#define DUMP_STR_FORMAT(width) std::setw(width) << std::setfill(' ') << std::hex << std::left  // left-aligned, space-padded; also selects hex
+
+ public:
+ //------------------------------------------------------------------------------
+ static void header(std::ostream& out, const elfio& reader) {
+     // Prints the ELF header fields of `reader`. The format flags are saved and
+     // restored so that std::hex (used for Entry/Flags) does not leak to the caller.
+     const std::ios_base::fmtflags original_flags = out.flags();
+     out << "ELF Header" << std::endl << std::endl
+         << " Class: " << str_class(reader.get_class()) << std::endl
+         << " Encoding: " << str_endian(reader.get_encoding()) << std::endl
+         << " ELFVersion: " << str_version(reader.get_elf_version()) << std::endl
+         << " Type: " << str_type(reader.get_type()) << std::endl
+         << " Machine: " << str_machine(reader.get_machine()) << std::endl
+         << " Version: " << str_version(reader.get_version()) << std::endl
+         << " Entry: " << "0x" << std::hex << reader.get_entry() << std::endl
+         << " Flags: " << "0x" << std::hex << reader.get_flags() << std::endl << std::endl;
+     out.flags(original_flags);
+ }
+
+ //------------------------------------------------------------------------------
+ static void section_headers(std::ostream& out, const elfio& reader) {
+     // Prints a column legend, one row per section (via section_header) and the
+     // key to the flag letters; prints nothing for a file without sections.
+     const Elf_Half total = reader.sections.size();
+     if (total == 0) {
+         return;
+     }
+
+     out << "Section Headers:" << std::endl;
+     if (reader.get_class() != ELFCLASS32) {
+         // Wide layout: the legend is split over two lines.
+         out << "[ Nr ] Type Addr Size ES Flg" << std::endl;
+         out << " Lk Inf Al Name" << std::endl;
+     } else {
+         // 32-bit layout: the whole legend fits on one line.
+         out << "[ Nr ] Type Addr Size ES Flg Lk Inf Al Name" << std::endl;
+     }
+
+     for (Elf_Half index = 0; index != total; ++index) {
+         section_header(out, index, reader.sections[index], reader.get_class());
+     }
+     out << "Key to Flags: W (write), A (alloc), X (execute)\n\n" << std::endl;
+ }
+
+ //------------------------------------------------------------------------------
+ static void section_header(std::ostream& out, Elf_Half no, const section* sec,
+                            unsigned char elf_class) {
+     // Prints one row of the section table for `sec`, numbered `no`: one line for
+     // ELFCLASS32, two wider lines otherwise. Format flags are restored on exit.
+     const std::ios_base::fmtflags saved_flags = out.flags();
+     const bool is_elf32 = (elf_class == ELFCLASS32);
+     const int addr_width = is_elf32 ? 8 : 16;  // address and size columns
+
+     out << "[" << DUMP_DEC_FORMAT(5) << no << "] ";
+     out << DUMP_STR_FORMAT(17) << str_section_type(sec->get_type()) << " ";
+     out << DUMP_HEX_FORMAT(addr_width) << sec->get_address() << " ";
+     out << DUMP_HEX_FORMAT(addr_width) << sec->get_size() << " ";
+     if (is_elf32) {
+         out << DUMP_HEX_FORMAT(2) << sec->get_entry_size() << " ";
+         out << DUMP_STR_FORMAT(3) << section_flags(sec->get_flags()) << " ";
+         out << DUMP_HEX_FORMAT(2) << sec->get_link() << " ";
+         out << DUMP_HEX_FORMAT(3) << sec->get_info() << " ";
+         out << DUMP_HEX_FORMAT(2) << sec->get_addr_align() << " ";
+     } else {
+         out << DUMP_HEX_FORMAT(4) << sec->get_entry_size() << " ";
+         out << DUMP_STR_FORMAT(3) << section_flags(sec->get_flags()) << " " << std::endl;
+         out << " " << DUMP_HEX_FORMAT(4) << sec->get_link() << " ";
+         out << DUMP_HEX_FORMAT(4) << sec->get_info() << " ";
+         out << DUMP_HEX_FORMAT(4) << sec->get_addr_align() << " ";
+     }
+     out << DUMP_STR_FORMAT(17) << sec->get_name() << " " << std::endl;
+     out.flags(saved_flags);
+ }
+
+ //------------------------------------------------------------------------------
+ static void segment_headers(std::ostream& out, const elfio& reader) {
+     // Prints a column legend, one row per segment (via segment_header) and a
+     // trailing blank line; prints nothing for a file without segments.
+     const Elf_Half total = reader.segments.size();
+     if (total == 0) {
+         return;
+     }
+
+     out << "Segment headers:" << std::endl;
+     if (reader.get_class() != ELFCLASS32) {
+         // Wide layout: the legend is split over two lines.
+         out << "[ Nr ] Type VirtAddr PhysAddr Flags" << std::endl;
+         out << " FileSize Mem.Size Align" << std::endl;
+     } else {
+         // 32-bit layout: the whole legend fits on one line.
+         out << "[ Nr ] Type VirtAddr PhysAddr FileSize Mem.Size Flags Align" << std::endl;
+     }
+     for (Elf_Half index = 0; index != total; ++index) {
+         segment_header(out, index, reader.segments[index], reader.get_class());
+     }
+     out << std::endl;
+ }
+
+ //------------------------------------------------------------------------------
+ static void segment_header(std::ostream& out, Elf_Half no, const segment* seg,
+                            unsigned int elf_class) {
+     // Prints one row of the segment table for `seg`: one line for ELFCLASS32, two
+     // wider lines otherwise. Format flags and the fill character are both restored.
+     const std::ios_base::fmtflags original_flags = out.flags();
+     const char original_fill = out.fill();
+     const bool is_elf32 = (elf_class == ELFCLASS32);
+     const int width = is_elf32 ? 8 : 16;  // every numeric column
+     out << "[" << DUMP_DEC_FORMAT(5) << no << "] ";
+     out << DUMP_STR_FORMAT(14) << str_segment_type(seg->get_type()) << " ";
+     out << DUMP_HEX_FORMAT(width) << seg->get_virtual_address() << " ";
+     out << DUMP_HEX_FORMAT(width) << seg->get_physical_address() << " ";
+     if (is_elf32) {
+         out << DUMP_HEX_FORMAT(width) << seg->get_file_size() << " ";
+         out << DUMP_HEX_FORMAT(width) << seg->get_memory_size() << " ";
+         out << DUMP_STR_FORMAT(8) << str_segment_flag(seg->get_flags()) << " ";
+     } else {
+         out << DUMP_STR_FORMAT(16) << str_segment_flag(seg->get_flags()) << " " << std::endl;
+         out << " " << DUMP_HEX_FORMAT(width) << seg->get_file_size() << " ";
+         out << DUMP_HEX_FORMAT(width) << seg->get_memory_size() << " ";
+     }
+     out << DUMP_HEX_FORMAT(width) << seg->get_align() << " " << std::endl;
+     out.flags(original_flags);
+     out.fill(original_fill);  // the last hex column left '0' as the fill character
+ }
+
+ //------------------------------------------------------------------------------
+ static void symbol_tables(std::ostream& out, const elfio& reader) {
+     // Prints every non-empty SHT_SYMTAB / SHT_DYNSYM section, one row per symbol.
+     Elf_Half n = reader.sections.size();
+     for (Elf_Half i = 0; i < n; ++i) {  // For all sections
+         section* sec = reader.sections[i];
+         if (SHT_SYMTAB == sec->get_type() || SHT_DYNSYM == sec->get_type()) {
+             symbol_section_accessor symbols(reader, sec);
+             Elf_Xword sym_no = symbols.get_symbols_num();
+             if (sym_no > 0) {
+                 out << "Symbol table (" << sec->get_name() << ")" << std::endl;
+                 if (reader.get_class() == ELFCLASS32) {  // Output for 32-bit
+                     out << "[ Nr ] Value Size Type Bind Sect Name" << std::endl;
+                 } else {  // Output for 64-bit
+                     out << "[ Nr ] Value Size Type Bind Sect" << std::endl
+                         << " Name" << std::endl;
+                 }
+                 // The counter is as wide as sym_no: an Elf_Half counter (16 bits per
+                 // the ELF specification) wraps before reaching a count above 65535.
+                 for (Elf_Xword sym = 0; sym < sym_no; ++sym) {
+                     std::string name;
+                     Elf64_Addr value = 0;
+                     Elf_Xword size = 0;
+                     unsigned char bind = 0;
+                     unsigned char type = 0;
+                     Elf_Half section = 0;
+                     unsigned char other = 0;
+                     symbols.get_symbol(sym, name, value, size, bind, type, section, other);
+                     symbol_table(out, sym, name, value, size, bind, type, section,
+                                  reader.get_class());
+                 }
+                 out << std::endl;
+             }
+         }
+     }
+ }
+
+ //------------------------------------------------------------------------------
+ static void symbol_table(std::ostream& out, Elf_Xword no, const std::string& name,
+                          Elf64_Addr value, Elf_Xword size, unsigned char bind,
+                          unsigned char type, Elf_Half section, unsigned int elf_class) {
+     // Prints one symbol row. `no` is as wide as a symbol count, so row numbers
+     // above 65535 are not truncated; `name` is only read. Flags are restored on exit.
+     std::ios_base::fmtflags original_flags = out.flags();
+     if (elf_class == ELFCLASS32) {  // Output for 32-bit
+         out << "[" << DUMP_DEC_FORMAT(5) << no << "] " << DUMP_HEX_FORMAT(8) << value << " "
+             << DUMP_HEX_FORMAT(8) << size << " " << DUMP_STR_FORMAT(7) << str_symbol_type(type)
+             << " " << DUMP_STR_FORMAT(8) << str_symbol_bind(bind) << " " << DUMP_DEC_FORMAT(5)
+             << section << " " << DUMP_STR_FORMAT(1) << name << " " << std::endl;
+     } else {  // Output for 64-bit
+         out << "[" << DUMP_DEC_FORMAT(5) << no << "] " << DUMP_HEX_FORMAT(16) << value << " "
+             << DUMP_HEX_FORMAT(16) << size << " " << DUMP_STR_FORMAT(7) << str_symbol_type(type)
+             << " " << DUMP_STR_FORMAT(8) << str_symbol_bind(bind) << " " << DUMP_DEC_FORMAT(5)
+             << section << " " << std::endl
+             << " " << DUMP_STR_FORMAT(1) << name << " " << std::endl;
+     }
+     out.flags(original_flags);
+ }
+
+ //------------------------------------------------------------------------------
+ static void notes(std::ostream& out, const elfio& reader) {
+     // Prints the entries of every non-empty SHT_NOTE section, one row each via note().
+     Elf_Half no = reader.sections.size();
+     for (Elf_Half i = 0; i < no; ++i) {  // For all sections
+         section* sec = reader.sections[i];
+         if (SHT_NOTE == sec->get_type()) {  // Look at notes
+             note_section_accessor notes(reader, sec);
+             int no_notes = notes.get_notes_num();
+             // Check the note count; the section count `no` is always > 0 in this loop.
+             if (no_notes > 0) {
+                 out << "Note section (" << sec->get_name() << ")" << std::endl
+                     << " No Type Name" << std::endl;
+                 for (int j = 0; j < no_notes; ++j) {  // For all notes
+                     Elf_Word type = 0;
+                     std::string name;
+                     void* desc = 0;
+                     Elf_Word descsz = 0;
+                     if (notes.get_note(j, type, name, desc, descsz)) {
+                         // 'name' usually contains \0 at the end. Try to fix it
+                         name = name.c_str();
+                         note(out, j, type, name);
+                     }
+                 }
+                 out << std::endl;
+             }
+         }
+     }
+ }
+
+ //------------------------------------------------------------------------------
+ static void note(std::ostream& out, int no, Elf_Word type, const std::string& name) {  // Prints one row of a note table.
+ out << " [" << DUMP_DEC_FORMAT(2) << no << "] " << DUMP_HEX_FORMAT(8) << type << " "  // index (decimal), note type (hex)
+ << DUMP_STR_FORMAT(1) << name << std::endl;  // note name; the stream's format flags are not restored
+ }
+
+ //------------------------------------------------------------------------------
+ static void dynamic_tags(std::ostream& out, const elfio& reader) {
+     // Prints every non-empty SHT_DYNAMIC section, one row per entry via dynamic_tag().
+     Elf_Half n = reader.sections.size();
+     for (Elf_Half i = 0; i < n; ++i) {  // For all sections
+         section* sec = reader.sections[i];
+         if (SHT_DYNAMIC == sec->get_type()) {
+             dynamic_section_accessor dynamic(reader, sec);
+             Elf_Xword dyn_no = dynamic.get_entries_num();
+             if (dyn_no > 0) {
+                 out << "Dynamic section (" << sec->get_name() << ")" << std::endl;
+                 out << "[ Nr ] Tag Name/Value" << std::endl;
+                 // Unsigned counter of dyn_no's type; its own name avoids shadowing `i`.
+                 for (Elf_Xword entry = 0; entry < dyn_no; ++entry) {
+                     Elf_Xword tag = 0;
+                     Elf_Xword value = 0;
+                     std::string str;
+                     dynamic.get_entry(entry, tag, value, str);
+                     dynamic_tag(out, entry, tag, value, str, reader.get_class());
+                     if (DT_NULL == tag) {
+                         break;  // stop after the first DT_NULL entry
+                     }
+                 }
+                 out << std::endl;
+             }
+         }
+     }
+ }
+
+ //------------------------------------------------------------------------------
+ static void dynamic_tag(std::ostream& out, Elf_Xword no, Elf_Xword tag, Elf_Xword value,
+                         const std::string& str, unsigned int /*elf_class*/) {
+     // Prints one row: index, tag name, then `str` if it is non-empty, else `value` in hex.
+     out << "[" << DUMP_DEC_FORMAT(5) << no << "] " << DUMP_STR_FORMAT(16)
+         << str_dynamic_tag(tag) << " ";
+     if (str.empty()) {
+         out << DUMP_HEX_FORMAT(16) << value << " " << std::endl;
+     } else {
+         out << DUMP_STR_FORMAT(32) << str << " " << std::endl;
+     }
+ }
+
+ //------------------------------------------------------------------------------
+ static void section_data(std::ostream& out, const section* sec) {
+     // Prints the section name followed by a hex dump of at most MAX_DATA_ENTRIES
+     // bytes of its data, 16 bytes per row, each row prefixed with its offset.
+     // Format flags and the fill character are restored before returning.
+     out << sec->get_name() << std::endl;
+     const char* pdata = sec->get_data();
+     if (!pdata) {
+         return;
+     }
+     const std::ios_base::fmtflags original_flags = out.flags();
+     const char original_fill = out.fill();
+     const ELFIO::Elf_Xword count = std::min(sec->get_size(), MAX_DATA_ENTRIES);
+     for (ELFIO::Elf_Xword i = 0; i < count; ++i) {
+         if (i % 16 == 0) {
+             out << "[" << DUMP_HEX_FORMAT(8) << i << "]";
+         }
+         out << " " << DUMP_HEX_FORMAT(2) << (pdata[i] & 0x000000FF);
+         if (i % 16 == 15) {
+             out << std::endl;
+         }
+     }
+     if (count % 16 != 0) {  // finish a partially filled last row
+         out << std::endl;
+     }
+     out.flags(original_flags);
+     out.fill(original_fill);  // DUMP_HEX_FORMAT left '0' as the fill character
+ }
+
+ //------------------------------------------------------------------------------
+ static void section_datas(std::ostream& out, const elfio& reader) {
+     // Dumps the data of every section except index 0 and SHT_NOBITS sections
+     // (via section_data), framed by a heading and a trailing blank line.
+     const Elf_Half total = reader.sections.size();
+     if (total == 0) {
+         return;
+     }
+
+     out << "Section Data:" << std::endl;
+     // Iteration starts at index 1, so section 0 is never dumped.
+     for (Elf_Half index = 1; index < total; ++index) {
+         section* current = reader.sections[index];
+         if (current->get_type() != SHT_NOBITS) {
+             section_data(out, current);
+         }
+     }
+
+     out << std::endl;
+ }
+
+ //------------------------------------------------------------------------------
+ static void segment_data(std::ostream& out, Elf_Half no, const segment* seg) {
+     // Prints "Segment # <no>" followed by a hex dump of at most MAX_DATA_ENTRIES
+     // bytes of the segment's data, 16 bytes per row with an offset prefix.
+     // Format flags and the fill character are restored before returning.
+     out << "Segment # " << no << std::endl;
+     const char* pdata = seg->get_data();
+     if (!pdata) {
+         return;
+     }
+     const std::ios_base::fmtflags original_flags = out.flags();
+     const char original_fill = out.fill();
+     const ELFIO::Elf_Xword count = std::min(seg->get_file_size(), MAX_DATA_ENTRIES);
+     for (ELFIO::Elf_Xword i = 0; i < count; ++i) {
+         if (i % 16 == 0) {
+             out << "[" << DUMP_HEX_FORMAT(8) << i << "]";
+         }
+         out << " " << DUMP_HEX_FORMAT(2) << (pdata[i] & 0x000000FF);
+         if (i % 16 == 15) {
+             out << std::endl;
+         }
+     }
+     if (count % 16 != 0) {  // finish a partially filled last row
+         out << std::endl;
+     }
+     out.flags(original_flags);
+     out.fill(original_fill);  // DUMP_HEX_FORMAT left '0' as the fill character
+ }
+
+ //------------------------------------------------------------------------------
+ static void segment_datas(std::ostream& out, const elfio& reader) {
+     // Dumps every segment via segment_data, framed by a heading and a trailing
+     // blank line; prints nothing for a file without segments.
+     const Elf_Half total = reader.segments.size();
+     if (total == 0) {
+         return;
+     }
+
+     out << "Segment Data:" << std::endl;
+     for (Elf_Half index = 0; index != total; ++index) {  // For all segments
+         segment* current = reader.segments[index];
+         segment_data(out, index, current);
+     }
+
+     out << std::endl;
+ }
+
+ private:
+ //------------------------------------------------------------------------------
+ template <typename T, typename K>
+ std::string static find_value_in_table(const T& table, const K& key) {
+     // Linear search of `table` (a reference to an array of {key, str} records)
+     // for the first record whose key equals `key`; "?" signals "not found".
+     const unsigned int count = sizeof(table) / sizeof(table[0]);
+     for (unsigned int pos = 0; pos != count; ++pos) {
+         if (table[pos].key == key) {
+             return table[pos].str;
+         }
+     }
+     return "?";
+ }
+
+
+ //------------------------------------------------------------------------------
+ template <typename T, typename K>
+ static std::string format_assoc(const T& table, const K& key) {
+     // Display name of `key` from `table`; an unknown key becomes "? (0x<hex>)".
+     std::string str = find_value_in_table(table, key);
+     if (str == "?") {
+         std::ostringstream oss;
+         oss << str << " (0x" << std::hex << +key << ")";  // +key: char-typed keys print as numbers
+         str = oss.str();
+     }
+     return str;
+ }
+
+
+ //------------------------------------------------------------------------------
+ template <typename T>
+ static std::string format_assoc(const T& table, const char key) {
+     return format_assoc(table, static_cast<int>(key));  // int, so an unknown key prints as a number
+ }
+
+
+ //------------------------------------------------------------------------------
+ static std::string section_flags(Elf_Xword flags) {
+     // Letters W, A, X (in this order) for SHF_WRITE, SHF_ALLOC, SHF_EXECINSTR; other bits ignored.
+     std::string letters;
+     if ((flags & SHF_WRITE) != 0) {
+         letters.push_back('W');
+     }
+     if ((flags & SHF_ALLOC) != 0) {
+         letters.push_back('A');
+     }
+     if ((flags & SHF_EXECINSTR) != 0) {
+         letters.push_back('X');
+     }
+     return letters;
+ }
+
+
+//------------------------------------------------------------------------------
+#define STR_FUNC_TABLE(name) \
+ template <typename T> \
+ static std::string str_##name(const T key) { \
+ return format_assoc(name##_table, key); \
+ }
+
+ STR_FUNC_TABLE(class)  // each line defines str_<name>(key), which formats key via <name>_table
+ STR_FUNC_TABLE(endian)  // str_endian()
+ STR_FUNC_TABLE(version)  // str_version()
+ STR_FUNC_TABLE(type)  // str_type()
+ STR_FUNC_TABLE(machine)  // str_machine()
+ STR_FUNC_TABLE(section_type)  // str_section_type()
+ STR_FUNC_TABLE(segment_type)  // str_segment_type()
+ STR_FUNC_TABLE(segment_flag)  // str_segment_flag()
+ STR_FUNC_TABLE(symbol_bind)  // str_symbol_bind()
+ STR_FUNC_TABLE(symbol_type)  // str_symbol_type()
+ STR_FUNC_TABLE(dynamic_tag)  // str_dynamic_tag()
+
+#undef STR_FUNC_TABLE
+#undef DUMP_DEC_FORMAT
+#undef DUMP_HEX_FORMAT
+#undef DUMP_STR_FORMAT
+}; // class dump
+
+
+}; // namespace ELFIO
+
+#endif // ELFIO_DUMP_HPP
diff --git a/third_party/rocm/include/hip/hcc_detail/elfio/elfio_dynamic.hpp b/third_party/rocm/include/hip/hcc_detail/elfio/elfio_dynamic.hpp
new file mode 100644
index 0000000..53a6e28
--- /dev/null
+++ b/third_party/rocm/include/hip/hcc_detail/elfio/elfio_dynamic.hpp
@@ -0,0 +1,217 @@
+/*
+Copyright (C) 2001-2015 by Serge Lamikhov-Center
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef ELFIO_DYNAMIC_HPP
+#define ELFIO_DYNAMIC_HPP
+
+namespace ELFIO {
+
+//------------------------------------------------------------------------------
+class dynamic_section_accessor {
+  // Reads and appends the tag/value entries of one section, picking the 32- or
+  // 64-bit entry layout from the class of the file the section belongs to.
+ public:
+  // Stores the file and section to work on; nothing is read at this point.
+  dynamic_section_accessor(const elfio& elf_file_, section* section_)
+      : elf_file(elf_file_), dynamic_section(section_) {}
+
+  // Number of entries: section size divided by the entry size, or 0 when the
+  // section reports an entry size of 0.
+  Elf_Xword get_entries_num() const {
+    const Elf_Xword entry_size = dynamic_section->get_entry_size();
+    if (0 == entry_size) {
+      return 0;
+    }
+    return dynamic_section->get_size() / entry_size;
+  }
+
+  // Reads entry `index` into `tag` and `value`. For DT_NEEDED, DT_SONAME,
+  // DT_RPATH and DT_RUNPATH, `str` receives the string that `value` selects in
+  // the linked string table; for any other tag `str` is cleared.
+  // Returns false when `index` is out of range (outputs untouched) or when the
+  // string lookup returns a null pointer (`str` cleared).
+  bool get_entry(Elf_Xword index, Elf_Xword& tag, Elf_Xword& value, std::string& str) const {
+    if (index >= get_entries_num()) {
+      return false;
+    }
+
+    if (ELFCLASS32 == elf_file.get_class()) {
+      generic_get_entry_dyn<Elf32_Dyn>(index, tag, value);
+    } else {
+      generic_get_entry_dyn<Elf64_Dyn>(index, tag, value);
+    }
+
+    // Only these tags may carry a string table reference.
+    const bool refers_to_string =
+        tag == DT_NEEDED || tag == DT_SONAME || tag == DT_RPATH || tag == DT_RUNPATH;
+    if (!refers_to_string) {
+      str.clear();
+      return true;
+    }
+
+    string_section_accessor strsec = elf_file.sections[get_string_table_index()];
+    const char* text = strsec.get_string(value);
+    if (0 == text) {
+      str.clear();
+      return false;
+    }
+    str = text;
+    return true;
+  }
+
+  // Appends a (tag, value) entry, stored as Elf32_Dyn or Elf64_Dyn depending on
+  // the file class. Both arguments are non-const references but are only read.
+  void add_entry(Elf_Xword& tag, Elf_Xword& value) {
+    if (ELFCLASS32 == elf_file.get_class()) {
+      generic_add_entry<Elf32_Dyn>(tag, value);
+    } else {
+      generic_add_entry<Elf64_Dyn>(tag, value);
+    }
+  }
+
+  // Adds `str` to the linked string table and appends an entry whose value is
+  // whatever add_string() returned for it.
+  void add_entry(Elf_Xword& tag, std::string& str) {
+    string_section_accessor strsec = elf_file.sections[get_string_table_index()];
+    Elf_Xword string_ref = strsec.add_string(str);
+    add_entry(tag, string_ref);
+  }
+
+ private:
+  // The section's link field, used as the index of its string table section.
+  Elf_Half get_string_table_index() const { return (Elf_Half)dynamic_section->get_link(); }
+
+  // Decodes entry `index` as a `T` (Elf32_Dyn or Elf64_Dyn). When the section
+  // has no data, or the entry would extend past the section size, DT_NULL with
+  // a value of 0 is reported instead.
+  template <class T>
+  void generic_get_entry_dyn(Elf_Xword index, Elf_Xword& tag, Elf_Xword& value) const {
+    const endianess_convertor& convertor = elf_file.get_convertor();
+    const char* raw = dynamic_section->get_data();
+    const Elf_Xword entry_size = dynamic_section->get_entry_size();
+
+    // Check unusual case when dynamic section has no data
+    if (0 == raw || (index + 1) * entry_size > dynamic_section->get_size()) {
+      tag = DT_NULL;
+      value = 0;
+      return;
+    }
+
+    const T* pEntry = reinterpret_cast<const T*>(raw + index * entry_size);
+    tag = convertor(pEntry->d_tag);
+    switch (tag) {
+      // Tags whose value is always reported as 0.
+      case DT_NULL:
+      case DT_SYMBOLIC:
+      case DT_TEXTREL:
+      case DT_BIND_NOW:
+        value = 0;
+        break;
+
+      // Tags read through the d_val member of the union.
+      case DT_NEEDED:
+      case DT_PLTRELSZ:
+      case DT_RELASZ:
+      case DT_RELAENT:
+      case DT_STRSZ:
+      case DT_SYMENT:
+      case DT_SONAME:
+      case DT_RPATH:
+      case DT_RELSZ:
+      case DT_RELENT:
+      case DT_PLTREL:
+      case DT_INIT_ARRAYSZ:
+      case DT_FINI_ARRAYSZ:
+      case DT_RUNPATH:
+      case DT_FLAGS:
+      case DT_PREINIT_ARRAYSZ:
+        value = convertor(pEntry->d_un.d_val);
+        break;
+
+      // Everything else is read through d_ptr: DT_PLTGOT, DT_HASH, DT_STRTAB,
+      // DT_SYMTAB, DT_RELA, DT_INIT, DT_FINI, DT_REL, DT_DEBUG, DT_JMPREL,
+      // DT_INIT_ARRAY, DT_FINI_ARRAY, DT_PREINIT_ARRAY and any tag not listed.
+      default:
+        value = convertor(pEntry->d_un.d_ptr);
+        break;
+    }
+  }
+
+  // Builds a `T` entry from (tag, value), passing both through the file's
+  // endianess convertor, and appends it to the section data.
+  template <class T>
+  void generic_add_entry(Elf_Xword tag, Elf_Xword value) {
+    const endianess_convertor& convertor = elf_file.get_convertor();
+
+    T entry;
+
+    switch (tag) {
+      // These tags always store 0, whatever `value` the caller passed.
+      case DT_NULL:
+      case DT_SYMBOLIC:
+      case DT_TEXTREL:
+      case DT_BIND_NOW:
+        value = 0;
+        entry.d_un.d_val = convertor(value);
+        break;
+
+      // Tags written through the d_val member of the union.
+      case DT_NEEDED:
+      case DT_PLTRELSZ:
+      case DT_RELASZ:
+      case DT_RELAENT:
+      case DT_STRSZ:
+      case DT_SYMENT:
+      case DT_SONAME:
+      case DT_RPATH:
+      case DT_RELSZ:
+      case DT_RELENT:
+      case DT_PLTREL:
+      case DT_INIT_ARRAYSZ:
+      case DT_FINI_ARRAYSZ:
+      case DT_RUNPATH:
+      case DT_FLAGS:
+      case DT_PREINIT_ARRAYSZ:
+        entry.d_un.d_val = convertor(value);
+        break;
+
+      // Everything else is written through d_ptr: DT_PLTGOT, DT_HASH, DT_STRTAB,
+      // DT_SYMTAB, DT_RELA, DT_INIT, DT_FINI, DT_REL, DT_DEBUG, DT_JMPREL,
+      // DT_INIT_ARRAY, DT_FINI_ARRAY, DT_PREINIT_ARRAY and any tag not listed.
+      default:
+        entry.d_un.d_ptr = convertor(value);
+        break;
+    }
+
+    entry.d_tag = convertor(tag);
+
+    dynamic_section->append_data(reinterpret_cast<char*>(&entry), sizeof(entry));
+  }
+
+ private:
+  const elfio& elf_file;
+  section* dynamic_section;
+};
+
+} // namespace ELFIO
+
+#endif // ELFIO_DYNAMIC_HPP
diff --git a/third_party/rocm/include/hip/hcc_detail/elfio/elfio_header.hpp b/third_party/rocm/include/hip/hcc_detail/elfio/elfio_header.hpp
new file mode 100644
index 0000000..b95f0a9
--- /dev/null
+++ b/third_party/rocm/include/hip/hcc_detail/elfio/elfio_header.hpp
@@ -0,0 +1,142 @@
+/*
+Copyright (C) 2001-2015 by Serge Lamikhov-Center
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef ELF_HEADER_HPP
+#define ELF_HEADER_HPP
+
+#include <iostream>
+
+namespace ELFIO {
+
+class elf_header {  // abstract ELF header interface; elf_header_impl<T> derives from it
+ public:
+ virtual ~elf_header(){};
+ virtual bool load(std::istream& stream) = 0;  // read the header from `stream`
+ virtual bool save(std::ostream& stream) const = 0;  // write the header to `stream`
+ // ELF header functions. The *_DECL macros are defined outside this header;
+ // presumably GET_ACCESS_DECL declares get_<name>() only -- TODO confirm.
+ ELFIO_GET_ACCESS_DECL(unsigned char, class);
+ ELFIO_GET_ACCESS_DECL(unsigned char, elf_version);
+ ELFIO_GET_ACCESS_DECL(unsigned char, encoding);
+ ELFIO_GET_ACCESS_DECL(Elf_Word, version);
+ ELFIO_GET_ACCESS_DECL(Elf_Half, header_size);
+ ELFIO_GET_ACCESS_DECL(Elf_Half, section_entry_size);
+ ELFIO_GET_ACCESS_DECL(Elf_Half, segment_entry_size);
+ // Presumably GET_SET_ACCESS_DECL declares get_<name>() and set_<name>() -- TODO confirm.
+ ELFIO_GET_SET_ACCESS_DECL(unsigned char, os_abi);
+ ELFIO_GET_SET_ACCESS_DECL(unsigned char, abi_version);
+ ELFIO_GET_SET_ACCESS_DECL(Elf_Half, type);
+ ELFIO_GET_SET_ACCESS_DECL(Elf_Half, machine);
+ ELFIO_GET_SET_ACCESS_DECL(Elf_Word, flags);
+ ELFIO_GET_SET_ACCESS_DECL(Elf64_Addr, entry);
+ ELFIO_GET_SET_ACCESS_DECL(Elf_Half, sections_num);
+ ELFIO_GET_SET_ACCESS_DECL(Elf64_Off, sections_offset);
+ ELFIO_GET_SET_ACCESS_DECL(Elf_Half, segments_num);
+ ELFIO_GET_SET_ACCESS_DECL(Elf64_Off, segments_offset);
+ ELFIO_GET_SET_ACCESS_DECL(Elf_Half, section_name_str_index);
+};
+
+
+template <class T>
+struct elf_header_impl_types;  // declared only; specialized below per ELF header type
+template <>
+struct elf_header_impl_types<Elf32_Ehdr> {  // companions of the 32-bit header
+  static const unsigned char file_class = ELFCLASS32;
+  typedef Elf32_Shdr Shdr_type;
+  typedef Elf32_Phdr Phdr_type;
+};
+template <>
+struct elf_header_impl_types<Elf64_Ehdr> {  // companions of the 64-bit header
+  static const unsigned char file_class = ELFCLASS64;
+  typedef Elf64_Shdr Shdr_type;
+  typedef Elf64_Phdr Phdr_type;
+};
+
+template <class T>
+class elf_header_impl : public elf_header {
+ public:
+  // Zero-fills the header, then sets the magic, class, encoding and version
+  // bytes and the size fields; multi-byte fields go through `convertor_`.
+  elf_header_impl(endianess_convertor* convertor_, unsigned char encoding)
+      : convertor(convertor_) {
+    std::fill_n(reinterpret_cast<char*>(&header), sizeof(header), '\0');
+    header.e_ident[EI_MAG0] = ELFMAG0;
+    header.e_ident[EI_MAG1] = ELFMAG1;
+    header.e_ident[EI_MAG2] = ELFMAG2;
+    header.e_ident[EI_MAG3] = ELFMAG3;
+    header.e_ident[EI_CLASS] = elf_header_impl_types<T>::file_class;
+    header.e_ident[EI_DATA] = encoding;
+    header.e_ident[EI_VERSION] = EV_CURRENT;
+    // Store each field in native form first, then convert it in place.
+    header.e_version = EV_CURRENT;
+    header.e_ehsize = (sizeof(header));
+    header.e_phentsize = sizeof(typename elf_header_impl_types<T>::Phdr_type);
+    header.e_shentsize = sizeof(typename elf_header_impl_types<T>::Shdr_type);
+    header.e_version = (*convertor)(header.e_version);
+    header.e_ehsize = (*convertor)(header.e_ehsize);
+    header.e_phentsize = (*convertor)(header.e_phentsize);
+    header.e_shentsize = (*convertor)(header.e_shentsize);
+    header.e_shstrndx = (*convertor)((Elf_Half)1);
+  }
+
+  // Reads sizeof(header) bytes from offset 0; true only if all of them were read.
+  bool load(std::istream& stream) {
+    stream.seekg(0);
+    stream.read(reinterpret_cast<char*>(&header), sizeof(header));
+    return (stream.gcount() == sizeof(header));
+  }
+
+  // Writes the raw header at offset 0; true if the stream is good() afterwards.
+  bool save(std::ostream& stream) const {
+    stream.seekp(0);
+    stream.write(reinterpret_cast<const char*>(&header), sizeof(header));
+    return stream.good();
+  }
+
+  // ELF header functions
+  ELFIO_GET_ACCESS(unsigned char, class, header.e_ident[EI_CLASS]);
+  ELFIO_GET_ACCESS(unsigned char, elf_version, header.e_ident[EI_VERSION]);
+  ELFIO_GET_ACCESS(unsigned char, encoding, header.e_ident[EI_DATA]);
+  ELFIO_GET_ACCESS(Elf_Word, version, header.e_version);
+  ELFIO_GET_ACCESS(Elf_Half, header_size, header.e_ehsize);
+  ELFIO_GET_ACCESS(Elf_Half, section_entry_size, header.e_shentsize);
+  ELFIO_GET_ACCESS(Elf_Half, segment_entry_size, header.e_phentsize);
+  ELFIO_GET_SET_ACCESS(unsigned char, os_abi, header.e_ident[EI_OSABI]);
+  ELFIO_GET_SET_ACCESS(unsigned char, abi_version, header.e_ident[EI_ABIVERSION]);
+  ELFIO_GET_SET_ACCESS(Elf_Half, type, header.e_type);
+  ELFIO_GET_SET_ACCESS(Elf_Half, machine, header.e_machine);
+  ELFIO_GET_SET_ACCESS(Elf_Word, flags, header.e_flags);
+  ELFIO_GET_SET_ACCESS(Elf_Half, section_name_str_index, header.e_shstrndx);
+  ELFIO_GET_SET_ACCESS(Elf64_Addr, entry, header.e_entry);
+  ELFIO_GET_SET_ACCESS(Elf_Half, sections_num, header.e_shnum);
+  ELFIO_GET_SET_ACCESS(Elf64_Off, sections_offset, header.e_shoff);
+  ELFIO_GET_SET_ACCESS(Elf_Half, segments_num, header.e_phnum);
+  ELFIO_GET_SET_ACCESS(Elf64_Off, segments_offset, header.e_phoff);
+
+ private:
+  T header;
+  endianess_convertor* convertor;
+};
+
+} // namespace ELFIO
+
+#endif // ELF_HEADER_HPP
diff --git a/third_party/rocm/include/hip/hcc_detail/elfio/elfio_note.hpp b/third_party/rocm/include/hip/hcc_detail/elfio/elfio_note.hpp
new file mode 100644
index 0000000..e350c85
--- /dev/null
+++ b/third_party/rocm/include/hip/hcc_detail/elfio/elfio_note.hpp
@@ -0,0 +1,144 @@
+/*
+Copyright (C) 2001-2015 by Serge Lamikhov-Center
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef ELFIO_NOTE_HPP
+#define ELFIO_NOTE_HPP
+
+namespace ELFIO {
+
+//------------------------------------------------------------------------------
+// The available documentation is inconsistent. The SCO documentation
+// (http://www.sco.com/developers/gabi/latest/ch5.pheader.html#note_section)
+// requires 8 byte entries alignment for 64-bit ELF file,
+// but Oracle's definition uses the same structure
+// for 32-bit and 64-bit formats.
+// (https://docs.oracle.com/cd/E23824_01/html/819-0690/chapter6-18048.html)
+//
+// It looks like EM_X86_64 Linux implementation is similar to Oracle's
+// definition. Therefore, the same alignment works for both formats
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+class note_section_accessor {
+ public:
+  // Stores the file/section pair and indexes the notes already in the section.
+  note_section_accessor(const elfio& elf_file_, section* section_)
+      : elf_file(elf_file_), note_section(section_) {
+    process_section();
+  }
+
+  // Number of note start offsets currently indexed.
+  Elf_Word get_notes_num() const { return (Elf_Word)note_start_positions.size(); }
+
+  // Reads note `index`: `type`, `name` (namesz - 1 bytes; empty when namesz is 0)
+  // and the descriptor (`desc` is 0 when `descSize` is 0). Returns false when
+  // `index` is not an indexed note or the stored sizes do not fit the section.
+  bool get_note(Elf_Word index, Elf_Word& type, std::string& name, void*& desc,
+                Elf_Word& descSize) const {
+    // Bound by the note count: the section byte size cannot bound this index.
+    if (index >= note_start_positions.size()) {
+      return false;
+    }
+
+    const Elf_Xword align = sizeof(Elf_Word);
+    const Elf_Xword start = note_start_positions[index];
+    const Elf_Xword total = note_section->get_size();
+    if (total < 3 * align || start > total - 3 * align) {  // the 3-word header must fit
+      return false;
+    }
+    const char* pData = note_section->get_data() + start;
+    const endianess_convertor& convertor = elf_file.get_convertor();
+    type = convertor(*(const Elf_Word*)(pData + 2 * align));
+    const Elf_Word namesz = convertor(*(const Elf_Word*)(pData));
+    descSize = convertor(*(const Elf_Word*)(pData + align));
+    // 64-bit arithmetic, so oversized namesz/descSize cannot wrap past the check.
+    const Elf_Xword remaining = total - start - 3 * align;
+    const Elf_Xword name_span = (((Elf_Xword)namesz + align - 1) / align) * align;
+    if (namesz > remaining || (0 != descSize && name_span + descSize > remaining)) {
+      return false;
+    }
+    // With namesz == 0 there is no final byte to drop; namesz - 1 would wrap.
+    name.assign(pData + 3 * align, (0 == namesz) ? 0 : namesz - 1);
+    desc = 0;
+    if (0 != descSize) desc = const_cast<char*>(pData + 3 * align + name_span);
+    return true;
+  }
+
+  // Appends a note: three words (name length incl. terminator, descriptor size,
+  // type), the NUL-terminated name and the descriptor, each zero-padded to 4 bytes.
+  void add_note(Elf_Word type, const std::string& name, const void* desc, Elf_Word descSize) {
+    const endianess_convertor& convertor = elf_file.get_convertor();
+    if (0 == desc) descSize = 0;  // no bytes are written, so the header must not promise any
+    int align = sizeof(Elf_Word);
+    Elf_Word nameLen = (Elf_Word)name.size() + 1;
+    Elf_Word nameLenConv = convertor(nameLen);
+    std::string buffer(reinterpret_cast<char*>(&nameLenConv), align);
+    Elf_Word descSizeConv = convertor(descSize);
+    buffer.append(reinterpret_cast<char*>(&descSizeConv), align);
+    type = convertor(type);
+    buffer.append(reinterpret_cast<char*>(&type), align);
+    buffer.append(name);
+    buffer.append(1, '\x00');
+    const char pad[] = {'\0', '\0', '\0', '\0'};
+    if (nameLen % align != 0) {
+      buffer.append(pad, align - nameLen % align);
+    }
+    if (desc != 0 && descSize != 0) {
+      buffer.append(reinterpret_cast<const char*>(desc), descSize);
+      if (descSize % align != 0) {
+        buffer.append(pad, align - descSize % align);
+      }
+    }
+    note_start_positions.push_back(note_section->get_size());
+    note_section->append_data(buffer);
+  }
+
+ private:
+  // Walks the section and records the start offset of every note header.
+  void process_section() {
+    const endianess_convertor& convertor = elf_file.get_convertor();
+    const char* data = note_section->get_data();
+    Elf_Xword size = note_section->get_size();
+    Elf_Xword current = 0;
+    note_start_positions.clear();
+    if (0 == data || 0 == size) {  // Is it empty?
+      return;
+    }
+    const Elf_Xword align = sizeof(Elf_Word);
+    while (current + 3 * align <= size) {
+      note_start_positions.push_back(current);
+      const Elf_Xword namesz = convertor(*(const Elf_Word*)(data + current));
+      const Elf_Xword descsz = convertor(*(const Elf_Word*)(data + current + align));
+      // Padding is computed in 64 bits so that sizes near 2^32 cannot wrap.
+      current += 3 * align + ((namesz + align - 1) / align) * align +
+                 ((descsz + align - 1) / align) * align;
+    }
+  }
+
+  const elfio& elf_file;
+  section* note_section;
+  std::vector<Elf_Xword> note_start_positions;
+};
+
+} // namespace ELFIO
+
+#endif // ELFIO_NOTE_HPP
diff --git a/third_party/rocm/include/hip/hcc_detail/elfio/elfio_relocation.hpp b/third_party/rocm/include/hip/hcc_detail/elfio/elfio_relocation.hpp
new file mode 100644
index 0000000..270c911
--- /dev/null
+++ b/third_party/rocm/include/hip/hcc_detail/elfio/elfio_relocation.hpp
@@ -0,0 +1,280 @@
+/*
+Copyright (C) 2001-2015 by Serge Lamikhov-Center
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef ELFIO_RELOCATION_HPP
+#define ELFIO_RELOCATION_HPP
+
+namespace ELFIO {
+
+template <typename T>
+struct get_sym_and_type;  // declared only; specialized per relocation record type
+
+// 32-bit records: the info word is narrowed to Elf_Word before being split.
+template <>
+struct get_sym_and_type<Elf32_Rel> {
+  static int get_r_sym(Elf_Xword info) { return ELF32_R_SYM((Elf_Word)info); }
+  static int get_r_type(Elf_Xword info) { return ELF32_R_TYPE((Elf_Word)info); }
+};
+// Elf32_Rela splits its info word exactly like Elf32_Rel.
+template <>
+struct get_sym_and_type<Elf32_Rela> : get_sym_and_type<Elf32_Rel> {};
+
+// 64-bit records: the info word is passed to the ELF64 macros unchanged.
+template <>
+struct get_sym_and_type<Elf64_Rel> {
+  static int get_r_sym(Elf_Xword info) { return ELF64_R_SYM(info); }
+  static int get_r_type(Elf_Xword info) { return ELF64_R_TYPE(info); }
+};
+// Elf64_Rela splits its info word exactly like Elf64_Rel.
+template <>
+struct get_sym_and_type<Elf64_Rela> : get_sym_and_type<Elf64_Rel> {};
+
+
+//------------------------------------------------------------------------------
+class relocation_section_accessor {
+ public:
+  // Binds the accessor to `section_` of `elf_file_`; nothing is read here.
+  relocation_section_accessor(const elfio& elf_file_, section* section_)
+      : elf_file(elf_file_), relocation_section(section_) {}
+
+  // Number of entries (section size / entry size), or 0 for an entry size of 0.
+  Elf_Xword get_entries_num() const {
+    Elf_Xword nRet = 0;
+    if (0 != relocation_section->get_entry_size()) {
+      nRet = relocation_section->get_size() / relocation_section->get_entry_size();
+    }
+    return nRet;
+  }
+
+  // Decodes entry `index` into `offset`, `symbol`, `type` and `addend` (the
+  // addend is 0 for SHT_REL sections). Returns false, leaving the outputs
+  // untouched, when `index` is out of range, the section has no data, or the
+  // section is neither SHT_REL nor SHT_RELA.
+  bool get_entry(Elf_Xword index, Elf64_Addr& offset, Elf_Word& symbol, Elf_Word& type,
+                 Elf_Sxword& addend) const {
+    if (index >= get_entries_num() || 0 == relocation_section->get_data()) {
+      return false;
+    }
+
+    // Only SHT_REL and SHT_RELA can be decoded; report failure for anything
+    // else instead of returning success with the outputs left unwritten.
+    const Elf_Word kind = relocation_section->get_type();
+    if (SHT_REL != kind && SHT_RELA != kind) {
+      return false;
+    }
+
+    if (elf_file.get_class() == ELFCLASS32) {
+      if (SHT_REL == kind) {
+        generic_get_entry_rel<Elf32_Rel>(index, offset, symbol, type, addend);
+      } else {
+        generic_get_entry_rela<Elf32_Rela>(index, offset, symbol, type, addend);
+      }
+    } else {
+      if (SHT_REL == kind) {
+        generic_get_entry_rel<Elf64_Rel>(index, offset, symbol, type, addend);
+      } else {
+        generic_get_entry_rela<Elf64_Rela>(index, offset, symbol, type, addend);
+      }
+    }
+
+    return true;
+  }
+
+  // Like the overload above, and additionally resolves the referenced symbol
+  // (name and value) and computes `calcValue` for the R_386_* relocation types
+  // handled below; every other type yields 0. `calcValue` is only set on success.
+  bool get_entry(Elf_Xword index, Elf64_Addr& offset, Elf64_Addr& symbolValue,
+                 std::string& symbolName, Elf_Word& type, Elf_Sxword& addend,
+                 Elf_Sxword& calcValue) const {
+    Elf_Word symbol = 0;  // stays defined even when the basic decode fails
+    bool ret = get_entry(index, offset, symbol, type, addend);
+
+    // Find the symbol
+    Elf_Xword size;
+    unsigned char bind;
+    unsigned char symbolType;
+    Elf_Half section;
+    unsigned char other;
+
+    symbol_section_accessor symbols(elf_file, elf_file.sections[get_symbol_table_index()]);
+    ret = ret && symbols.get_symbol(symbol, symbolName, symbolValue, size, bind, symbolType,
+                                    section, other);
+
+    if (ret) {  // Was it successful?
+      switch (type) {
+        case R_386_32:  // S + A
+          calcValue = symbolValue + addend;
+          break;
+        case R_386_PC32:  // S + A - P
+          calcValue = symbolValue + addend - offset;
+          break;
+        case R_386_GLOB_DAT:  // S
+        case R_386_JMP_SLOT:  // S
+          calcValue = symbolValue;
+          break;
+        case R_386_RELATIVE:  // B + A
+          calcValue = addend;
+          break;
+        case R_386_NONE:    // none
+        case R_386_GOT32:   // G + A - P
+        case R_386_PLT32:   // L + A - P
+        case R_386_COPY:    // none
+        case R_386_GOTOFF:  // S + A - GOT
+        case R_386_GOTPC:   // GOT + A - P
+        default:            // Not recognized symbol!
+          calcValue = 0;
+          break;
+      }
+    }
+
+    return ret;
+  }
+
+  // Appends an entry without addend (Elf32_Rel / Elf64_Rel by file class).
+  void add_entry(Elf64_Addr offset, Elf_Xword info) {
+    if (elf_file.get_class() == ELFCLASS32) {
+      generic_add_entry<Elf32_Rel>(offset, info);
+    } else {
+      generic_add_entry<Elf64_Rel>(offset, info);
+    }
+  }
+
+  // Packs `symbol` and `type` into an info word for the file class and appends
+  // an entry without addend.
+  void add_entry(Elf64_Addr offset, Elf_Word symbol, unsigned char type) {
+    Elf_Xword info;
+    if (elf_file.get_class() == ELFCLASS32) {
+      info = ELF32_R_INFO((Elf_Xword)symbol, type);
+    } else {
+      info = ELF64_R_INFO((Elf_Xword)symbol, type);
+    }
+
+    add_entry(offset, info);
+  }
+
+  // Appends an entry with addend (Elf32_Rela / Elf64_Rela by file class).
+  void add_entry(Elf64_Addr offset, Elf_Xword info, Elf_Sxword addend) {
+    if (elf_file.get_class() == ELFCLASS32) {
+      generic_add_entry<Elf32_Rela>(offset, info, addend);
+    } else {
+      generic_add_entry<Elf64_Rela>(offset, info, addend);
+    }
+  }
+
+  // Packs `symbol` and `type` into an info word for the file class and appends
+  // an entry with addend.
+  void add_entry(Elf64_Addr offset, Elf_Word symbol, unsigned char type, Elf_Sxword addend) {
+    Elf_Xword info;
+    if (elf_file.get_class() == ELFCLASS32) {
+      info = ELF32_R_INFO((Elf_Xword)symbol, type);
+    } else {
+      info = ELF64_R_INFO((Elf_Xword)symbol, type);
+    }
+
+    add_entry(offset, info, addend);
+  }
+
+  // Adds `str` to the string table, adds a symbol that refers to it, then
+  // appends an addend-less relocation entry against that new symbol.
+  void add_entry(string_section_accessor str_writer, const char* str,
+                 symbol_section_accessor sym_writer, Elf64_Addr value, Elf_Word size,
+                 unsigned char sym_info, unsigned char other, Elf_Half shndx, Elf64_Addr offset,
+                 unsigned char type) {
+    Elf_Word str_index = str_writer.add_string(str);
+    Elf_Word sym_index = sym_writer.add_symbol(str_index, value, size, sym_info, other, shndx);
+    add_entry(offset, sym_index, type);
+  }
+
+ private:
+  // The section's link field, used as the index of the symbol table section.
+  Elf_Half get_symbol_table_index() const { return (Elf_Half)relocation_section->get_link(); }
+
+  // Decodes entry `index` as a `T` without addend; `addend` is set to 0. The
+  // caller must have checked that the section has data and `index` is valid.
+  template <class T>
+  void generic_get_entry_rel(Elf_Xword index, Elf64_Addr& offset, Elf_Word& symbol,
+                             Elf_Word& type, Elf_Sxword& addend) const {
+    const endianess_convertor& convertor = elf_file.get_convertor();
+
+    const T* pEntry = reinterpret_cast<const T*>(relocation_section->get_data() +
+                                                 index * relocation_section->get_entry_size());
+    offset = convertor(pEntry->r_offset);
+    Elf_Xword tmp = convertor(pEntry->r_info);
+    symbol = get_sym_and_type<T>::get_r_sym(tmp);
+    type = get_sym_and_type<T>::get_r_type(tmp);
+    addend = 0;
+  }
+
+  // Same as generic_get_entry_rel(), but also reads the entry's r_addend.
+  template <class T>
+  void generic_get_entry_rela(Elf_Xword index, Elf64_Addr& offset, Elf_Word& symbol,
+                              Elf_Word& type, Elf_Sxword& addend) const {
+    const endianess_convertor& convertor = elf_file.get_convertor();
+
+    const T* pEntry = reinterpret_cast<const T*>(relocation_section->get_data() +
+                                                 index * relocation_section->get_entry_size());
+    offset = convertor(pEntry->r_offset);
+    Elf_Xword tmp = convertor(pEntry->r_info);
+    symbol = get_sym_and_type<T>::get_r_sym(tmp);
+    type = get_sym_and_type<T>::get_r_type(tmp);
+    addend = convertor(pEntry->r_addend);
+  }
+
+  // Builds a `T` entry without addend, converts its fields and appends it.
+  template <class T>
+  void generic_add_entry(Elf64_Addr offset, Elf_Xword info) {
+    const endianess_convertor& convertor = elf_file.get_convertor();
+
+    T entry;
+    entry.r_offset = offset;
+    entry.r_info = info;
+    entry.r_offset = convertor(entry.r_offset);
+    entry.r_info = convertor(entry.r_info);
+
+    relocation_section->append_data(reinterpret_cast<char*>(&entry), sizeof(entry));
+  }
+
+  // Builds a `T` entry including the addend, converts its fields and appends
+  // it to the section.
+  template <class T>
+  void generic_add_entry(Elf64_Addr offset, Elf_Xword info, Elf_Sxword addend) {
+    const endianess_convertor& convertor = elf_file.get_convertor();
+
+    T entry;
+    entry.r_offset = offset;
+    entry.r_info = info;
+    entry.r_addend = addend;
+    entry.r_offset = convertor(entry.r_offset);
+    entry.r_info = convertor(entry.r_info);
+    entry.r_addend = convertor(entry.r_addend);
+
+    relocation_section->append_data(reinterpret_cast<char*>(&entry), sizeof(entry));
+  }
+
+ private:
+  const elfio& elf_file;
+  section* relocation_section;
+};
+
+} // namespace ELFIO
+
+#endif // ELFIO_RELOCATION_HPP
diff --git a/third_party/rocm/include/hip/hcc_detail/elfio/elfio_section.hpp b/third_party/rocm/include/hip/hcc_detail/elfio/elfio_section.hpp
new file mode 100644
index 0000000..6106fc7
--- /dev/null
+++ b/third_party/rocm/include/hip/hcc_detail/elfio/elfio_section.hpp
@@ -0,0 +1,240 @@
+/*
+Copyright (C) 2001-2015 by Serge Lamikhov-Center
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef ELFIO_SECTION_HPP
+#define ELFIO_SECTION_HPP
+
+#include <string>
+#include <iostream>
+
+namespace ELFIO {
+// Abstract interface to one ELF section: header-field accessors plus raw content access.
+class section {
+ friend class elfio;  // lets elfio call the protected members below
+
+ public:
+ virtual ~section(){};
+ // Each *_ACCESS_DECL line declares a pure-virtual get_<name>() and, for GET_SET, a set_<name>().
+ ELFIO_GET_ACCESS_DECL(Elf_Half, index);
+ ELFIO_GET_SET_ACCESS_DECL(std::string, name);
+ ELFIO_GET_SET_ACCESS_DECL(Elf_Word, type);
+ ELFIO_GET_SET_ACCESS_DECL(Elf_Xword, flags);
+ ELFIO_GET_SET_ACCESS_DECL(Elf_Word, info);
+ ELFIO_GET_SET_ACCESS_DECL(Elf_Word, link);
+ ELFIO_GET_SET_ACCESS_DECL(Elf_Xword, addr_align);
+ ELFIO_GET_SET_ACCESS_DECL(Elf_Xword, entry_size);
+ ELFIO_GET_SET_ACCESS_DECL(Elf64_Addr, address);
+ ELFIO_GET_SET_ACCESS_DECL(Elf_Xword, size);
+ ELFIO_GET_SET_ACCESS_DECL(Elf_Word, name_string_offset);
+ // Content access: a read-only pointer, plus set/append overloads taking (pointer, size) or std::string.
+ virtual const char* get_data() const = 0;
+ virtual void set_data(const char* pData, Elf_Word size) = 0;
+ virtual void set_data(const std::string& data) = 0;
+ virtual void append_data(const char* pData, Elf_Word size) = 0;
+ virtual void append_data(const std::string& data) = 0;
+
+ protected:
+ ELFIO_GET_SET_ACCESS_DECL(Elf64_Off, offset);
+ ELFIO_SET_ACCESS_DECL(Elf_Half, index);  // the matching getter is public (declared above)
+ // Stream I/O hooks; the parameters name the header position and, for save, the data position.
+ virtual void load(std::istream& f, std::streampos header_offset) = 0;
+ virtual void save(std::ostream& f, std::streampos header_offset,
+ std::streampos data_offset) = 0;
+ virtual bool is_address_initialized() const = 0;
+};
+
+// Concrete section: an ELF section header of type T plus a heap buffer owned by this object.
+template <class T>
+class section_impl : public section {
+ public:
+    // Zeroes the header and starts without a data buffer; convertor_ is kept but not owned.
+    section_impl(const endianess_convertor* convertor_)
+        : index(0), data(0), data_size(0), convertor(convertor_), is_address_set(false) {
+        std::fill_n(reinterpret_cast<char*>(&header), sizeof(header), '\0');
+    }
+
+    // Frees the data buffer.
+    ~section_impl() { delete[] data; }
+
+    // Header field accessors; values pass through the convertor on every read and write.
+    ELFIO_GET_SET_ACCESS(Elf_Word, type, header.sh_type);
+    ELFIO_GET_SET_ACCESS(Elf_Xword, flags, header.sh_flags);
+    ELFIO_GET_SET_ACCESS(Elf_Xword, size, header.sh_size);
+    ELFIO_GET_SET_ACCESS(Elf_Word, link, header.sh_link);
+    ELFIO_GET_SET_ACCESS(Elf_Word, info, header.sh_info);
+    ELFIO_GET_SET_ACCESS(Elf_Xword, addr_align, header.sh_addralign);
+    ELFIO_GET_SET_ACCESS(Elf_Xword, entry_size, header.sh_entsize);
+    ELFIO_GET_SET_ACCESS(Elf_Word, name_string_offset, header.sh_name);
+    ELFIO_GET_ACCESS(Elf64_Addr, address, header.sh_addr);
+
+    // Index given through set_index(); 0 until then.
+    Elf_Half get_index() const { return index; }
+
+    // Name given through set_name().
+    std::string get_name() const { return name; }
+
+    // Stores a copy of name_.
+    void set_name(std::string name_) { name = name_; }
+
+    // Sets the section address and remembers that it was set explicitly.
+    void set_address(Elf64_Addr value) {
+        header.sh_addr = value;
+        header.sh_addr = (*convertor)(header.sh_addr);
+        is_address_set = true;
+    }
+
+    // True once set_address() has been called.
+    bool is_address_initialized() const { return is_address_set; }
+
+    // Owned buffer, or null when none has been allocated.
+    const char* get_data() const { return data; }
+
+    // Replaces the contents with a copy of size bytes from raw_data. SHT_NOBITS sections only
+    // record the size; if the allocation fails the section ends up empty with size 0.
+    void set_data(const char* raw_data, Elf_Word size) {
+        if (get_type() != SHT_NOBITS) {
+            delete[] data;
+            try {
+                data = new char[size];
+            } catch (const std::bad_alloc&) {
+                data = 0;
+                size = 0;
+            }
+            // The capacity must describe the buffer just allocated even when nothing is
+            // copied; a stale larger value would let append_data() write past its end.
+            data_size = size;
+            if (0 != data && 0 != raw_data) {
+                std::copy(raw_data, raw_data + size, data);
+            }
+        }
+
+        set_size(size);
+    }
+
+    // Replaces the contents with the bytes of str_data (no terminator is added).
+    void set_data(const std::string& str_data) {
+        return set_data(str_data.c_str(), (Elf_Word)str_data.size());
+    }
+
+    // Appends size bytes from raw_data, growing the buffer when needed. Does nothing for
+    // SHT_NOBITS sections; if growing fails, contents, size and capacity stay as they were.
+    void append_data(const char* raw_data, Elf_Word size) {
+        if (get_type() != SHT_NOBITS) {
+            if (get_size() + size < data_size) {
+                std::copy(raw_data, raw_data + size, data + get_size());
+            } else {
+                // Commit the doubled capacity only after the allocation succeeded, so a
+                // failure cannot leave data_size larger than the buffer really is.
+                const Elf_Word new_capacity = 2 * (data_size + size);
+                char* new_data;
+                try {
+                    new_data = new char[new_capacity];
+                } catch (const std::bad_alloc&) {
+                    new_data = 0;
+                    size = 0;
+                }
+                if (0 != new_data) {
+                    std::copy(data, data + get_size(), new_data);
+                    std::copy(raw_data, raw_data + size, new_data + get_size());
+                    delete[] data;
+                    data = new_data;
+                    data_size = new_capacity;
+                }
+            }
+            set_size(get_size() + size);
+        }
+    }
+
+    // Appends the bytes of str_data (no terminator is added).
+    void append_data(const std::string& str_data) {
+        return append_data(str_data.c_str(), (Elf_Word)str_data.size());
+    }
+
+ protected:
+    // File offset of the section contents, stored in the header.
+    ELFIO_GET_SET_ACCESS(Elf64_Off, offset, header.sh_offset);
+
+    // Records the section's index.
+    void set_index(Elf_Half value) { index = value; }
+
+    // Reads the header found at header_offset, then the contents, unless the type is
+    // SHT_NULL or SHT_NOBITS or a buffer already exists.
+    void load(std::istream& stream, std::streampos header_offset) {
+        std::fill_n(reinterpret_cast<char*>(&header), sizeof(header), '\0');
+        stream.seekg(header_offset);
+        stream.read(reinterpret_cast<char*>(&header), sizeof(header));
+
+        Elf_Xword size = get_size();
+        if (0 == data && SHT_NULL != get_type() && SHT_NOBITS != get_type()) {
+            try {
+                data = new char[size];
+            } catch (const std::bad_alloc&) {
+                data = 0;
+                data_size = 0;
+            }
+            if (0 != data && 0 != size) {  // never read into a buffer that failed to allocate
+                stream.seekg((*convertor)(header.sh_offset));
+                stream.read(data, size);
+                data_size = size;
+            }
+        }
+    }
+
+    // Writes the header at header_offset and, when there are any, the contents at data_offset.
+    void save(std::ostream& f, std::streampos header_offset, std::streampos data_offset) {
+        if (0 != get_index()) {  // the section with index 0 keeps the offset it already has
+            header.sh_offset = data_offset;
+            header.sh_offset = (*convertor)(header.sh_offset);
+        }
+
+        save_header(f, header_offset);
+        if (get_type() != SHT_NOBITS && get_type() != SHT_NULL && get_size() != 0 && data != 0) {
+            save_data(f, data_offset);
+        }
+    }
+
+ private:
+    // Writes the raw header bytes at header_offset.
+    void save_header(std::ostream& f, std::streampos header_offset) const {
+        f.seekp(header_offset);
+        f.write(reinterpret_cast<const char*>(&header), sizeof(header));
+    }
+
+    // Writes get_size() bytes of the buffer at data_offset.
+    void save_data(std::ostream& f, std::streampos data_offset) const {
+        f.seekp(data_offset);
+        f.write(get_data(), get_size());
+    }
+
+ private:
+    T header;                              // section header, fields kept in file byte order
+    Elf_Half index;                        // set through set_index()
+    std::string name;                      // set through set_name()
+    char* data;                            // owned buffer, null when absent
+    Elf_Word data_size;                    // allocated capacity of data in bytes
+    const endianess_convertor* convertor;  // not owned
+    bool is_address_set;                   // true after set_address()
+};
+
+} // namespace ELFIO
+
+#endif // ELFIO_SECTION_HPP
diff --git a/third_party/rocm/include/hip/hcc_detail/elfio/elfio_segment.hpp b/third_party/rocm/include/hip/hcc_detail/elfio/elfio_segment.hpp
new file mode 100644
index 0000000..59e37ec
--- /dev/null
+++ b/third_party/rocm/include/hip/hcc_detail/elfio/elfio_segment.hpp
@@ -0,0 +1,178 @@
+/*
+Copyright (C) 2001-2015 by Serge Lamikhov-Center
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef ELFIO_SEGMENT_HPP
+#define ELFIO_SEGMENT_HPP
+
+#include <iostream>
+#include <vector>
+
+namespace ELFIO {
+// Abstract interface to one ELF segment: program-header accessors and its list of section indices.
+class segment {
+ friend class elfio;  // lets elfio call the protected members below
+
+ public:
+ virtual ~segment(){};
+ // Each *_ACCESS_DECL line declares a pure-virtual get_<name>() and, for GET_SET, a set_<name>().
+ ELFIO_GET_ACCESS_DECL(Elf_Half, index);
+ ELFIO_GET_SET_ACCESS_DECL(Elf_Word, type);
+ ELFIO_GET_SET_ACCESS_DECL(Elf_Word, flags);
+ ELFIO_GET_SET_ACCESS_DECL(Elf_Xword, align);
+ ELFIO_GET_SET_ACCESS_DECL(Elf64_Addr, virtual_address);
+ ELFIO_GET_SET_ACCESS_DECL(Elf64_Addr, physical_address);
+ ELFIO_GET_SET_ACCESS_DECL(Elf_Xword, file_size);
+ ELFIO_GET_SET_ACCESS_DECL(Elf_Xword, memory_size);
+ ELFIO_GET_ACCESS_DECL(Elf64_Off, offset);
+
+ virtual const char* get_data() const = 0;
+ // Section membership, expressed as section indices.
+ virtual Elf_Half add_section_index(Elf_Half index, Elf_Xword addr_align) = 0;
+ virtual Elf_Half get_sections_num() const = 0;
+ virtual Elf_Half get_section_index_at(Elf_Half num) const = 0;
+ virtual bool is_offset_initialized() const = 0;
+
+ protected:
+ ELFIO_SET_ACCESS_DECL(Elf64_Off, offset);  // the matching getter is public (declared above)
+ ELFIO_SET_ACCESS_DECL(Elf_Half, index);
+ // Stream I/O hooks; the parameters name the header position and, for save, the data position.
+ virtual const std::vector<Elf_Half>& get_sections() const = 0;
+ virtual void load(std::istream& stream, std::streampos header_offset) = 0;
+ virtual void save(std::ostream& f, std::streampos header_offset,
+ std::streampos data_offset) = 0;
+};
+
+
+// Concrete segment: a program header of type T, the indices of the sections added to it,
+// and, after load(), a heap copy of the segment's bytes owned by this object.
+template <class T>
+class segment_impl : public segment {
+ public:
+    // Zeroes the program header and starts without data; convertor_ is kept but not owned.
+    segment_impl(endianess_convertor* convertor_)
+        : index(0), data(0), convertor(convertor_), is_offset_set(false) {
+        std::fill_n(reinterpret_cast<char*>(&ph), sizeof(ph), '\0');
+    }
+
+    // Frees the loaded bytes.
+    virtual ~segment_impl() { delete[] data; }
+
+    // Program header field accessors; values pass through the convertor on read and write.
+    ELFIO_GET_SET_ACCESS(Elf_Word, type, ph.p_type);
+    ELFIO_GET_SET_ACCESS(Elf_Word, flags, ph.p_flags);
+    ELFIO_GET_SET_ACCESS(Elf_Xword, align, ph.p_align);
+    ELFIO_GET_SET_ACCESS(Elf64_Addr, virtual_address, ph.p_vaddr);
+    ELFIO_GET_SET_ACCESS(Elf64_Addr, physical_address, ph.p_paddr);
+    ELFIO_GET_SET_ACCESS(Elf_Xword, file_size, ph.p_filesz);
+    ELFIO_GET_SET_ACCESS(Elf_Xword, memory_size, ph.p_memsz);
+    ELFIO_GET_ACCESS(Elf64_Off, offset, ph.p_offset);
+
+    // Index given through set_index(); 0 until then.
+    Elf_Half get_index() const { return index; }
+
+    // Bytes read by load(), or null when nothing was read.
+    const char* get_data() const { return data; }
+
+    // Records sec_index as a member of this segment and raises the segment alignment to
+    // addr_align when that is larger. Returns the number of recorded sections.
+    Elf_Half add_section_index(Elf_Half sec_index, Elf_Xword addr_align) {
+        sections.push_back(sec_index);
+        if (addr_align > get_align()) {
+            set_align(addr_align);
+        }
+
+        return (Elf_Half)sections.size();
+    }
+
+    // Number of recorded section indices.
+    Elf_Half get_sections_num() const { return (Elf_Half)sections.size(); }
+
+    // Section index stored at position num; an out-of-range num yields -1 converted to Elf_Half.
+    Elf_Half get_section_index_at(Elf_Half num) const {
+        if (num < sections.size()) {
+            return sections[num];
+        }
+
+        return -1;
+    }
+
+ protected:
+    // Sets the file offset and remembers that it was set.
+    void set_offset(Elf64_Off value) {
+        ph.p_offset = value;
+        ph.p_offset = (*convertor)(ph.p_offset);
+        is_offset_set = true;
+    }
+
+    // True after set_offset() or load().
+    bool is_offset_initialized() const { return is_offset_set; }
+
+    // The recorded section indices, in insertion order.
+    const std::vector<Elf_Half>& get_sections() const { return sections; }
+
+    // Records the segment's index.
+    void set_index(Elf_Half value) { index = value; }
+
+    // Reads the program header found at header_offset and, unless the type is PT_NULL or
+    // the file size is 0, the segment's bytes. An allocation failure leaves data null.
+    void load(std::istream& stream, std::streampos header_offset) {
+        stream.seekg(header_offset);
+        stream.read(reinterpret_cast<char*>(&ph), sizeof(ph));
+        is_offset_set = true;
+
+        // Drop bytes from an earlier load so they are neither leaked nor left stale.
+        delete[] data;
+        data = 0;
+        if (PT_NULL != get_type() && 0 != get_file_size()) {
+            stream.seekg((*convertor)(ph.p_offset));
+            Elf_Xword size = get_file_size();
+            try {
+                data = new char[size];
+            } catch (const std::bad_alloc&) {
+                data = 0;
+            }
+            if (0 != data) {
+                stream.read(data, size);
+            }
+        }
+    }
+
+    // Stores data_offset as the segment's file offset and writes the header at header_offset.
+    void save(std::ostream& f, std::streampos header_offset, std::streampos data_offset) {
+        ph.p_offset = data_offset;
+        ph.p_offset = (*convertor)(ph.p_offset);
+        f.seekp(header_offset);
+        f.write(reinterpret_cast<const char*>(&ph), sizeof(ph));
+    }
+
+ private:
+    T ph;                            // program header, fields kept in file byte order
+    Elf_Half index;                  // set through set_index()
+    char* data;                      // owned copy of the segment bytes, null when absent
+    std::vector<Elf_Half> sections;  // indices passed to add_section_index()
+    endianess_convertor* convertor;  // not owned
+    bool is_offset_set;              // true after set_offset() or load()
+};
+
+} // namespace ELFIO
+
+#endif // ELFIO_SEGMENT_HPP
diff --git a/third_party/rocm/include/hip/hcc_detail/elfio/elfio_strings.hpp b/third_party/rocm/include/hip/hcc_detail/elfio/elfio_strings.hpp
new file mode 100644
index 0000000..07adc3a
--- /dev/null
+++ b/third_party/rocm/include/hip/hcc_detail/elfio/elfio_strings.hpp
@@ -0,0 +1,84 @@
+/*
+Copyright (C) 2001-2015 by Serge Lamikhov-Center
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef ELFIO_STRINGS_HPP
+#define ELFIO_STRINGS_HPP
+
+#include <cstdlib>
+#include <cstring>
+#include <string>
+
+namespace ELFIO {
+
+// Reads and appends NUL-terminated strings in a string table section.
+class string_section_accessor {
+ public:
+    // section_ may be null; every operation then returns 0 without touching anything.
+    string_section_accessor(section* section_) : string_section(section_) {}
+
+    // Returns the string stored at byte offset index, or 0 when there is no section or
+    // data, the offset is past the end, or no terminating NUL follows inside the section.
+    const char* get_string(Elf_Word index) const {
+        if (string_section) {
+            const char* data = string_section->get_data();
+            const Elf_Xword size = string_section->get_size();
+            if (0 != data && index < size) {
+                // An entry without a NUL before the end of the section would make callers
+                // read past the buffer, so it is reported as absent.
+                const std::size_t remaining = static_cast<std::size_t>(size - index);
+                if (0 != std::memchr(data + index, '\0', remaining)) {
+                    return data + index;
+                }
+            }
+        }
+
+        return 0;
+    }
+
+    // Appends str with its terminator and returns the byte offset it was stored at.
+    // Returns 0 without storing anything when there is no section.
+    Elf_Word add_string(const char* str) {
+        Elf_Word current_position = 0;
+        if (string_section) {
+            // Strings are added to the end of the current section data
+            current_position = (Elf_Word)string_section->get_size();
+            if (current_position == 0) {  // an empty table first gets a lone NUL at offset 0
+                char empty_string = '\0';
+                string_section->append_data(&empty_string, 1);
+                current_position++;
+            }
+            string_section->append_data(str, (Elf_Word)std::strlen(str) + 1);
+        }
+
+        return current_position;
+    }
+
+    // Same as above, taking the text up to the first NUL of str.
+    Elf_Word add_string(const std::string& str) { return add_string(str.c_str()); }
+
+ private:
+    section* string_section;  // not owned; may be null
+};
+
+} // namespace ELFIO
+
+#endif // ELFIO_STRINGS_HPP
diff --git a/third_party/rocm/include/hip/hcc_detail/elfio/elfio_symbols.hpp b/third_party/rocm/include/hip/hcc_detail/elfio/elfio_symbols.hpp
new file mode 100644
index 0000000..8184bcd
--- /dev/null
+++ b/third_party/rocm/include/hip/hcc_detail/elfio/elfio_symbols.hpp
@@ -0,0 +1,220 @@
+/*
+Copyright (C) 2001-2015 by Serge Lamikhov-Center
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef ELFIO_SYMBOLS_HPP
+#define ELFIO_SYMBOLS_HPP
+
+namespace ELFIO {
+
+// Reads and appends entries of a symbol table section that belongs to elf_file.
+class symbol_section_accessor {
+ public:
+    // Binds to symbol_section_ and looks for a SHT_HASH section linked to it.
+    symbol_section_accessor(const elfio& elf_file_, section* symbol_section_)
+        : elf_file(elf_file_), symbol_section(symbol_section_) {
+        find_hash_section();
+    }
+
+    // Section size divided by the entry size; 0 when the entry size is 0.
+    Elf_Xword get_symbols_num() const {
+        Elf_Xword nRet = 0;
+        if (0 != symbol_section->get_entry_size()) {
+            nRet = symbol_section->get_size() / symbol_section->get_entry_size();
+        }
+
+        return nRet;
+    }
+
+    // Reads the symbol at position index into the output parameters, using the 32- or
+    // 64-bit entry layout according to the file class. Returns false when nothing was read.
+    bool get_symbol(Elf_Xword index, std::string& name, Elf64_Addr& value, Elf_Xword& size,
+                    unsigned char& bind, unsigned char& type, Elf_Half& section_index,
+                    unsigned char& other) const {
+        if (elf_file.get_class() == ELFCLASS32) {
+            return generic_get_symbol<Elf32_Sym>(index, name, value, size, bind, type,
+                                                 section_index, other);
+        }
+        return generic_get_symbol<Elf64_Sym>(index, name, value, size, bind, type,
+                                             section_index, other);
+    }
+
+    // Looks a symbol up by name through the hash section. Returns false when there is no
+    // usable hash section or the name is not found. The output parameters are overwritten
+    // while the chain is walked, so they are meaningful only when true is returned.
+    bool get_symbol(const std::string& name, Elf64_Addr& value, Elf_Xword& size,
+                    unsigned char& bind, unsigned char& type, Elf_Half& section_index,
+                    unsigned char& other) const {
+        bool ret = false;
+
+        const char* table = (0 != get_hash_table_index()) ? hash_section->get_data() : 0;
+        if (0 != table && hash_section->get_size() >= 2 * sizeof(Elf_Word)) {
+            const endianess_convertor& convertor = elf_file.get_convertor();
+            const Elf_Word* words = reinterpret_cast<const Elf_Word*>(table);
+            const Elf_Word nbucket = convertor(words[0]);
+            const Elf_Word nchain = convertor(words[1]);
+            const Elf_Xword needed = (2 + (Elf_Xword)nbucket + nchain) * sizeof(Elf_Word);
+            // Ignore a table with no buckets (division by zero below) or one that claims
+            // more bucket/chain words than the section actually holds.
+            if (0 != nbucket && hash_section->get_size() >= needed) {
+                Elf_Word val = elf_hash((const unsigned char*)name.c_str());
+                Elf_Word y = convertor(words[2 + val % nbucket]);
+                std::string str;
+                get_symbol(y, str, value, size, bind, type, section_index, other);
+                while (str != name && STN_UNDEF != y && y < nchain) {
+                    y = convertor(words[2 + (Elf_Xword)nbucket + y]);
+                    get_symbol(y, str, value, size, bind, type, section_index, other);
+                }
+                ret = (str == name);
+            }
+        }
+
+        return ret;
+    }
+
+    // Appends a symbol whose name is the string-table offset name and returns the index
+    // of the new entry.
+    Elf_Word add_symbol(Elf_Word name, Elf64_Addr value, Elf_Xword size, unsigned char info,
+                        unsigned char other, Elf_Half shndx) {
+        // An empty table first receives an all-zero entry, which takes index 0.
+        if (symbol_section->get_size() == 0) {
+            if (elf_file.get_class() == ELFCLASS32) {
+                generic_add_symbol<Elf32_Sym>(0, 0, 0, 0, 0, 0);
+            } else {
+                generic_add_symbol<Elf64_Sym>(0, 0, 0, 0, 0, 0);
+            }
+        }
+
+        if (elf_file.get_class() == ELFCLASS32) {
+            return generic_add_symbol<Elf32_Sym>(name, value, size, info, other, shndx);
+        }
+        return generic_add_symbol<Elf64_Sym>(name, value, size, info, other, shndx);
+    }
+
+    // Same, with the info byte composed from bind and type by ELF_ST_INFO.
+    Elf_Word add_symbol(Elf_Word name, Elf64_Addr value, Elf_Xword size, unsigned char bind,
+                        unsigned char type, unsigned char other, Elf_Half shndx) {
+        return add_symbol(name, value, size, ELF_ST_INFO(bind, type), other, shndx);
+    }
+
+    // Adds str to the string table through pStrWriter, then appends the symbol naming it.
+    Elf_Word add_symbol(string_section_accessor& pStrWriter, const char* str, Elf64_Addr value,
+                        Elf_Xword size, unsigned char info, unsigned char other, Elf_Half shndx) {
+        Elf_Word index = pStrWriter.add_string(str);
+        return add_symbol(index, value, size, info, other, shndx);
+    }
+
+    // Same, with the info byte composed from bind and type by ELF_ST_INFO.
+    Elf_Word add_symbol(string_section_accessor& pStrWriter, const char* str, Elf64_Addr value,
+                        Elf_Xword size, unsigned char bind, unsigned char type, unsigned char other,
+                        Elf_Half shndx) {
+        return add_symbol(pStrWriter, str, value, size, ELF_ST_INFO(bind, type), other, shndx);
+    }
+
+ private:
+    // Remembers the first SHT_HASH section whose link is this symbol table. The type check
+    // matters because other section kinds (relocations, for one) link to it as well.
+    void find_hash_section() {
+        hash_section = 0;
+        hash_section_index = 0;
+        Elf_Half nSecNo = elf_file.sections.size();
+        for (Elf_Half i = 0; i < nSecNo && 0 == hash_section_index; ++i) {
+            const section* sec = elf_file.sections[i];
+            if (SHT_HASH == sec->get_type() &&
+                sec->get_link() == symbol_section->get_index()) {
+                hash_section = sec;
+                hash_section_index = i;
+            }
+        }
+    }
+
+    // The symbol table's link field names its string table.
+    Elf_Half get_string_table_index() const { return (Elf_Half)symbol_section->get_link(); }
+
+    // Index of the hash section found at construction; 0 when there is none.
+    Elf_Half get_hash_table_index() const { return hash_section_index; }
+
+    // Reads entry index using layout T. When the entry's name offset cannot be resolved in
+    // the string table, name is left as the caller passed it.
+    template <class T>
+    bool generic_get_symbol(Elf_Xword index, std::string& name, Elf64_Addr& value, Elf_Xword& size,
+                            unsigned char& bind, unsigned char& type, Elf_Half& section_index,
+                            unsigned char& other) const {
+        bool ret = false;
+
+        // A missing buffer, or entries narrower than T, would make the read run off the data.
+        const char* entries = symbol_section->get_data();
+        const Elf_Xword stride = symbol_section->get_entry_size();
+        if (0 != entries && stride >= sizeof(T) && index < get_symbols_num()) {
+            const T* pSym = reinterpret_cast<const T*>(entries + index * stride);
+            const endianess_convertor& convertor = elf_file.get_convertor();
+
+            section* string_section = elf_file.sections[get_string_table_index()];
+            string_section_accessor str_reader(string_section);
+            const char* pStr = str_reader.get_string(convertor(pSym->st_name));
+            if (0 != pStr) {
+                name = pStr;
+            }
+            value = convertor(pSym->st_value);
+            size = convertor(pSym->st_size);
+            bind = ELF_ST_BIND(pSym->st_info);
+            type = ELF_ST_TYPE(pSym->st_info);
+            section_index = convertor(pSym->st_shndx);
+            other = pSym->st_other;
+
+            ret = true;
+        }
+
+        return ret;
+    }
+
+    // Appends one entry of layout T, converting each field to the file's byte order, and
+    // returns the new entry's index (section size divided by sizeof(T), minus one).
+    template <class T>
+    Elf_Word generic_add_symbol(Elf_Word name, Elf64_Addr value, Elf_Xword size, unsigned char info,
+                                unsigned char other, Elf_Half shndx) {
+        const endianess_convertor& convertor = elf_file.get_convertor();
+
+        T entry;
+        entry.st_name = convertor(name);
+        entry.st_value = value;
+        entry.st_value = convertor(entry.st_value);
+        entry.st_size = size;
+        entry.st_size = convertor(entry.st_size);
+        entry.st_info = convertor(info);
+        entry.st_other = convertor(other);
+        entry.st_shndx = convertor(shndx);
+
+        symbol_section->append_data(reinterpret_cast<char*>(&entry), sizeof(entry));
+
+        return symbol_section->get_size() / sizeof(entry) - 1;
+    }
+
+ private:
+    const elfio& elf_file;
+    section* symbol_section;      // not owned
+    Elf_Half hash_section_index;  // 0 when no hash section was found
+    const section* hash_section;  // null when no hash section was found
+};
+
+} // namespace ELFIO
+
+#endif // ELFIO_SYMBOLS_HPP
diff --git a/third_party/rocm/include/hip/hcc_detail/elfio/elfio_utils.hpp b/third_party/rocm/include/hip/hcc_detail/elfio/elfio_utils.hpp
new file mode 100644
index 0000000..b1bb00e
--- /dev/null
+++ b/third_party/rocm/include/hip/hcc_detail/elfio/elfio_utils.hpp
@@ -0,0 +1,157 @@
+/*
+Copyright (C) 2001-2015 by Serge Lamikhov-Center
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef ELFIO_UTILS_HPP
+#define ELFIO_UTILS_HPP
+// Accessor generators: FIELD is read and written through (*convertor), so a pointer named convertor must be visible where the macro expands.
+#define ELFIO_GET_ACCESS(TYPE, NAME, FIELD) \
+ TYPE get_##NAME() const { return (*convertor)(FIELD); }
+#define ELFIO_SET_ACCESS(TYPE, NAME, FIELD) \
+ void set_##NAME(TYPE value) { \
+ FIELD = value; \
+ FIELD = (*convertor)(FIELD); \
+ }
+#define ELFIO_GET_SET_ACCESS(TYPE, NAME, FIELD) \
+ TYPE get_##NAME() const { return (*convertor)(FIELD); } \
+ void set_##NAME(TYPE value) { \
+ FIELD = value; \
+ FIELD = (*convertor)(FIELD); \
+ }
+// Pure-virtual declarations of the same get_<NAME>/set_<NAME> members, for abstract interface classes.
+#define ELFIO_GET_ACCESS_DECL(TYPE, NAME) virtual TYPE get_##NAME() const = 0
+
+#define ELFIO_SET_ACCESS_DECL(TYPE, NAME) virtual void set_##NAME(TYPE value) = 0
+
+#define ELFIO_GET_SET_ACCESS_DECL(TYPE, NAME) \
+ virtual TYPE get_##NAME() const = 0; \
+ virtual void set_##NAME(TYPE value) = 0
+
+namespace ELFIO {
+
+// Converts integers between host byte order and the byte order of an ELF file. The same
+// call serves both directions, because reversing the bytes twice restores the value.
+class endianess_convertor {
+ public:
+    // Starts in pass-through mode: values come back unchanged until setup() says otherwise.
+    endianess_convertor() : need_conversion(false) {}
+
+    // Turns byte reversal on exactly when elf_file_encoding differs from the encoding
+    // detected for the host, and off otherwise.
+    void setup(unsigned char elf_file_encoding) {
+        const unsigned char host = get_host_encoding();
+        need_conversion = (host != elf_file_encoding);
+    }
+
+    // 64-bit values: all eight bytes are reversed when conversion is on.
+    uint64_t operator()(uint64_t value) const {
+        if (need_conversion) {
+            // Swap the 32-bit halves, then the 16-bit pairs, then the adjacent bytes.
+            value = (value << 32) | (value >> 32);
+            value = ((value & 0x0000FFFF0000FFFFull) << 16) |
+                    ((value >> 16) & 0x0000FFFF0000FFFFull);
+            value = ((value & 0x00FF00FF00FF00FFull) << 8) |
+                    ((value >> 8) & 0x00FF00FF00FF00FFull);
+        }
+        return value;
+    }
+
+    // Signed 64-bit values: reversed through their unsigned representation.
+    int64_t operator()(int64_t value) const {
+        if (need_conversion) {
+            const uint64_t swapped = (*this)((uint64_t)value);
+            return (int64_t)swapped;
+        }
+        return value;
+    }
+
+    // 32-bit values: all four bytes are reversed when conversion is on.
+    uint32_t operator()(uint32_t value) const {
+        if (need_conversion) {
+            // Swap the 16-bit halves, then the adjacent bytes.
+            value = (value << 16) | (value >> 16);
+            value = ((value & 0x00FF00FFu) << 8) | ((value >> 8) & 0x00FF00FFu);
+        }
+        return value;
+    }
+
+    // Signed 32-bit values: reversed through their unsigned representation.
+    int32_t operator()(int32_t value) const {
+        if (need_conversion) {
+            const uint32_t swapped = (*this)((uint32_t)value);
+            return (int32_t)swapped;
+        }
+        return value;
+    }
+
+    // 16-bit values: the two bytes trade places when conversion is on.
+    uint16_t operator()(uint16_t value) const {
+        if (need_conversion) {
+            value = (uint16_t)(((value & 0x00FF) << 8) | (value >> 8));
+        }
+        return value;
+    }
+
+    // Signed 16-bit values: reversed through their unsigned representation.
+    int16_t operator()(int16_t value) const {
+        if (need_conversion) {
+            const uint16_t swapped = (*this)((uint16_t)value);
+            return (int16_t)swapped;
+        }
+        return value;
+    }
+
+    // Single bytes have no byte order and are returned as they are.
+    int8_t operator()(int8_t value) const { return value; }
+
+    // Unsigned single bytes likewise.
+    uint8_t operator()(uint8_t value) const { return value; }
+
+ private:
+    // Detects the host encoding by looking at the first byte in memory of an int holding 1:
+    // a 1 there means the low-order byte comes first.
+    unsigned char get_host_encoding() const {
+        const int probe = 1;
+        const bool low_byte_first = (1 == *(const char*)&probe);
+        return low_byte_first ? ELFDATA2LSB : ELFDATA2MSB;
+    }
+
+ private:
+    // True when values must be byte-reversed.
+    bool need_conversion;
+};
+
+
+// Hashes the NUL-terminated string name. Each character is shifted in four bits at a time;
+// whenever the top four bits become set they are XORed into bits 4-7 and then cleared.
+inline uint32_t elf_hash(const unsigned char* name) {
+    uint32_t hash = 0;
+    for (; *name; ++name) {
+        hash = (hash << 4) + *name;
+        const uint32_t top = hash & 0xf0000000;
+        hash = (hash ^ (top >> 24)) & ~top;
+    }
+    return hash;
+}
+
+} // namespace ELFIO
+
+#endif // ELFIO_UTILS_HPP
diff --git a/third_party/rocm/include/hip/hcc_detail/functional_grid_launch.hpp b/third_party/rocm/include/hip/hcc_detail/functional_grid_launch.hpp
new file mode 100644
index 0000000..efe6a60
--- /dev/null
+++ b/third_party/rocm/include/hip/hcc_detail/functional_grid_launch.hpp
@@ -0,0 +1,218 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#include "concepts.hpp"
+#include "helpers.hpp"
+#include "program_state.hpp"
+#include "hip_runtime_api.h"
+
+#include <cstdint>
+#include <cstring>
+#include <stdexcept>
+#include <tuple>
+#include <type_traits>
+#include <utility>
+
+hipError_t ihipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList, int numDevices,
+                                               unsigned int flags, hip_impl::program_state& ps);  // declared only; not called in this header
+// Overload taking an explicit program_state; the template wrapper of the same name later in this header forwards here.
+hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim,
+                                      dim3 blockDim, void** args,
+                                      size_t sharedMem, hipStream_t stream,
+                                      hip_impl::program_state& ps);
+// Overload taking an explicit program_state; the three-argument inline wrapper later in this header forwards here.
+hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList,
+                                                 int numDevices,
+                                                 unsigned int flags,
+                                                 hip_impl::program_state& ps);
+
+#pragma GCC visibility push(hidden)
+
+namespace hip_impl {
+template <typename T, typename std::enable_if<std::is_integral<T>{}>::type* = nullptr>
+inline T round_up_to_next_multiple_nonnegative(T x, T y) {  // intended for x >= 0 and y != 0
+    const T padded = x + y - 1;     // pushes x past the next multiple unless it already is one
+    return padded - (padded % y);   // dropping the remainder lands exactly on that multiple
+}
+
+// Recursion terminator: selected when n == sizeof...(Ts), i.e. every tuple
+// element has been handled, so the accumulated buffer is returned untouched.
+template <std::size_t n, typename... Ts,
+          typename std::enable_if<n == sizeof...(Ts)>::type* = nullptr>
+inline hip_impl::kernarg make_kernarg(
+    const std::tuple<Ts...>& /*formals*/,
+    const kernargs_size_align& /*size_align*/,
+    hip_impl::kernarg kernarg) {
+    return kernarg;
+}
+
+// Packs formal argument n into the buffer, then recurses on argument n + 1.
+// The buffer is padded up to the argument's alignment, grown by the
+// argument's size, and the argument bytes are copied into the new tail.
+template <std::size_t n, typename... Ts,
+          typename std::enable_if<n != sizeof...(Ts)>::type* = nullptr>
+inline hip_impl::kernarg make_kernarg(
+    const std::tuple<Ts...>& formals,
+    const kernargs_size_align& size_align,
+    hip_impl::kernarg kernarg) {
+    using T = typename std::tuple_element<n, std::tuple<Ts...>>::type;
+
+    static_assert(
+        !std::is_reference<T>{},
+        "A __global__ function cannot have a reference as one of its "
+        "arguments.");
+    #if defined(HIP_STRICT)
+    static_assert(
+        std::is_trivially_copyable<T>{},
+        "Only TriviallyCopyable types can be arguments to a __global__ "
+        "function");
+    #endif
+
+    const auto arg_bytes = size_align.size(n);
+    const auto aligned_offset = round_up_to_next_multiple_nonnegative(
+        kernarg.size(), size_align.alignment(n));
+    kernarg.resize(aligned_offset + arg_bytes);
+
+    std::memcpy(kernarg.data() + aligned_offset, &std::get<n>(formals), arg_bytes);
+    return make_kernarg<n + 1>(formals, size_align, std::move(kernarg));
+}
+
+// Builds the kernel-argument buffer for `kernel`: converts the actuals to the
+// kernel's formal parameter types, then packs them one by one using the
+// size/alignment information the program state returns for the kernel address.
+template <typename... Formals, typename... Actuals>
+inline hip_impl::kernarg make_kernarg(
+    void (*kernel)(Formals...), std::tuple<Actuals...> actuals) {
+    static_assert(sizeof...(Formals) == sizeof...(Actuals),
+        "The count of formal arguments must match the count of actuals.");
+    if (sizeof...(Formals) == 0) return {};  // no arguments: empty buffer
+
+    std::tuple<Formals...> to_formals{std::move(actuals)};
+    hip_impl::kernarg buffer;
+    buffer.reserve(sizeof(to_formals));
+
+    const auto kernel_address = reinterpret_cast<std::uintptr_t>(kernel);
+    const auto& layout = hip_impl::get_program_state().get_kernargs_size_align(kernel_address);
+    return make_kernarg<0>(to_formals, layout, std::move(buffer));
+}
+
+
+HIP_INTERNAL_EXPORTED_API hsa_agent_t target_agent(hipStream_t stream);
+
+// Looks up the kernel descriptor for `function_address` on the agent returned
+// by target_agent(stream) and launches it through hipModuleLaunchKernel.
+// NOTE(review): whatever hipModuleLaunchKernel returns is not propagated.
+inline __attribute__((visibility("hidden")))
+void hipLaunchKernelGGLImpl(
+    std::uintptr_t function_address,
+    const dim3& numBlocks,
+    const dim3& dimBlocks,
+    std::uint32_t sharedMemBytes,
+    hipStream_t stream,
+    void** kernarg) {
+    const auto& kd = hip_impl::get_program_state().kernel_descriptor(
+        function_address, target_agent(stream));
+    hipModuleLaunchKernel(kd, numBlocks.x, numBlocks.y, numBlocks.z,
+                          dimBlocks.x, dimBlocks.y, dimBlocks.z, sharedMemBytes,
+                          stream, nullptr, kernarg);
+}
+} // Namespace hip_impl.
+
+
+// Resolves `kernel` to its descriptor on the agent returned by target_agent(0)
+// and forwards the query to hipModuleOccupancyMaxPotentialBlockSize.
+template <class T>
+inline hipError_t hipOccupancyMaxPotentialBlockSize(
+    int* gridSize, int* blockSize, T kernel,
+    size_t dynSharedMemPerBlk = 0, int blockSizeLimit = 0) {
+    using namespace hip_impl;
+    hip_impl::hip_init();
+
+    const auto kernel_address = reinterpret_cast<std::uintptr_t>(kernel);
+    auto f = get_program_state().kernel_descriptor(kernel_address, target_agent(0));
+    return hipModuleOccupancyMaxPotentialBlockSize(
+        gridSize, blockSize, f, dynSharedMemPerBlk, blockSizeLimit);
+}
+
+// Same query as hipOccupancyMaxPotentialBlockSize, except that any `flags`
+// value other than hipOccupancyDefault is rejected with hipErrorNotSupported.
+template <class T>
+inline hipError_t hipOccupancyMaxPotentialBlockSizeWithFlags(
+    int* gridSize, int* blockSize, T kernel,
+    size_t dynSharedMemPerBlk = 0, int blockSizeLimit = 0, unsigned int flags = 0 ) {
+    using namespace hip_impl;
+    hip_impl::hip_init();
+    if (flags != hipOccupancyDefault) { return hipErrorNotSupported; }
+
+    const auto kernel_address = reinterpret_cast<std::uintptr_t>(kernel);
+    auto f = get_program_state().kernel_descriptor(kernel_address, target_agent(0));
+    return hipModuleOccupancyMaxPotentialBlockSize(
+        gridSize, blockSize, f, dynSharedMemPerBlk, blockSizeLimit);
+}
+
+// Packs `args` into a kernel-argument buffer and launches `kernel` via
+// hipLaunchKernelGGLImpl, handing the buffer over through the
+// HIP_LAUNCH_PARAM_* pointer / size / end entries.
+template <typename... Args, typename F = void (*)(Args...)>
+inline void hipLaunchKernelGGL(
+    F kernel, const dim3& numBlocks, const dim3& dimBlocks,
+    std::uint32_t sharedMemBytes, hipStream_t stream, Args... args) {
+    hip_impl::hip_init();
+    auto packed_args = hip_impl::make_kernarg(
+        kernel, std::tuple<Args...>{std::move(args)...});
+    std::size_t packed_size = packed_args.size();
+    void* launch_config[]{
+        HIP_LAUNCH_PARAM_BUFFER_POINTER, packed_args.data(),
+        HIP_LAUNCH_PARAM_BUFFER_SIZE, &packed_size,
+        HIP_LAUNCH_PARAM_END};
+
+    const auto kernel_address = reinterpret_cast<std::uintptr_t>(kernel);
+    hip_impl::hipLaunchKernelGGLImpl(
+        kernel_address, numBlocks, dimBlocks, sharedMemBytes, stream, launch_config);
+}
+
+// Convenience wrapper: calls hip_impl::hip_init(), then forwards to the
+// overload taking an explicit program_state, passing `f` as an untyped pointer.
+template <typename F>
+inline __attribute__((visibility("hidden")))
+hipError_t hipLaunchCooperativeKernel(
+    F f, dim3 gridDim, dim3 blockDim, void** args, size_t sharedMem, hipStream_t stream) {
+    hip_impl::hip_init();
+    void* const kernel_ptr = reinterpret_cast<void*>(f);
+    return hipLaunchCooperativeKernel(
+        kernel_ptr, gridDim, blockDim, args, sharedMem, stream, hip_impl::get_program_state());
+}
+
+// Convenience wrapper: calls hip_impl::hip_init() and forwards to the
+// four-argument overload, supplying hip_impl::get_program_state().
+inline __attribute__((visibility("hidden")))
+hipError_t hipLaunchCooperativeKernelMultiDevice(
+    hipLaunchParams* launchParamsList, int numDevices, unsigned int flags) {
+    hip_impl::hip_init();
+    return hipLaunchCooperativeKernelMultiDevice(
+        launchParamsList, numDevices, flags,
+        hip_impl::get_program_state());
+}
+
+#pragma GCC visibility pop
diff --git a/third_party/rocm/include/hip/hcc_detail/grid_launch.h b/third_party/rocm/include/hip/hcc_detail/grid_launch.h
new file mode 100644
index 0000000..22841a5
--- /dev/null
+++ b/third_party/rocm/include/hip/hcc_detail/grid_launch.h
@@ -0,0 +1,67 @@
+#pragma once
+
+#include <stdint.h>
+
+#include <hc_defines.h>
+
+#define GRID_LAUNCH_VERSION 20
+
+// Extern definitions
+namespace hc{
+class completion_future;  // forward declaration; grid_launch_parm only stores a pointer to it
+class accelerator_view;   // forward declaration; grid_launch_parm only stores a pointer to it
+}
+
+
+// 3 dim structure for groups and grids.
+typedef struct gl_dim3 {
+    int x, y, z;  // extent along each axis
+    gl_dim3(uint32_t _x = 1, uint32_t _y = 1, uint32_t _z = 1)  // every axis defaults to 1
+        : x(_x), y(_y), z(_z) {}  // note: the uint32_t arguments are stored as int
+} gl_dim3;
+
+typedef enum gl_barrier_bit {
+    barrier_bit_queue_default = 0,  // values are spelled out; they match the implicit numbering
+    barrier_bit_none = 1,
+    barrier_bit_wait = 2
+} gl_barrier_bit;
+
+
+// grid_launch_parm contains information used to launch the kernel.
+typedef struct grid_launch_parm
+{
+    //! Grid dimensions
+    gl_dim3 grid_dim;
+
+    //! Group dimensions
+    gl_dim3 group_dim;
+
+    //! Amount of dynamic group memory to use with the kernel launch.
+    //! This memory is in addition to the amount used statically in the kernel.
+    unsigned int dynamic_group_mem_bytes;
+
+    //! Controls the barrier bit on a per-packet basis:
+    //! See gl_barrier_bit description.
+    //! Placeholder; it is not used to control packet dispatch yet.
+    enum gl_barrier_bit barrier_bit;
+
+    //! Value of packet fences to apply to launch.
+    //! These correspond to the value of bits 9:14 in the AQL packet,
+    //! see HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE and hsa_fence_scope_t.
+    unsigned int launch_fence;
+
+    //! Pointer to the accelerator_view where the kernel should execute.
+    //! If NULL, the default view on the default accelerator is used.
+    hc::accelerator_view *av;
+
+    //! Pointer to the completion_future used to track the status of the command.
+    //! If NULL, the command does not write status. In this case,
+    //! synchronization can be enforced with queue-level waits or
+    //! waiting on younger commands.
+    hc::completion_future *cf;
+    //! Defaulted constructor; no member initializers are declared in this struct.
+    grid_launch_parm() = default;
+} grid_launch_parm;
+
+
+extern void init_grid_launch(grid_launch_parm *gl);  // NOTE(review): grid_launch.hpp defines grid_launch_init(), not init_grid_launch() -- confirm which name is intended
diff --git a/third_party/rocm/include/hip/hcc_detail/grid_launch.hpp b/third_party/rocm/include/hip/hcc_detail/grid_launch.hpp
new file mode 100644
index 0000000..04ce7e0
--- /dev/null
+++ b/third_party/rocm/include/hip/hcc_detail/grid_launch.hpp
@@ -0,0 +1,50 @@
+#pragma once
+
+#include "grid_launch.h"
+#include "hc.hpp"
+
+class grid_launch_parm_cxx : public grid_launch_parm  // adds custom (de)serialization hooks to grid_launch_parm
+{
+public:
+    grid_launch_parm_cxx() = default;
+
+    // Customized serialization: av and cf are not needed in the kernel, so only
+    // the six extents are appended (grid x, y, z, then group x, y, z), each as sizeof(int) bytes.
+    __attribute__((annotate("serialize")))
+    void __cxxamp_serialize(Kalmar::Serialize& s) const {
+        s.Append(sizeof(int), &grid_dim.x);
+        s.Append(sizeof(int), &grid_dim.y);
+        s.Append(sizeof(int), &grid_dim.z);
+        s.Append(sizeof(int), &group_dim.x);
+        s.Append(sizeof(int), &group_dim.y);
+        s.Append(sizeof(int), &group_dim.z);
+    }
+    // Deserializing constructor: takes the six extents in the order they were appended; no other inherited field is assigned here.
+    __attribute__((annotate("user_deserialize")))
+    grid_launch_parm_cxx(int grid_dim_x, int grid_dim_y, int grid_dim_z,
+                         int group_dim_x, int group_dim_y, int group_dim_z) {
+        grid_dim.x  = grid_dim_x;
+        grid_dim.y  = grid_dim_y;
+        grid_dim.z  = grid_dim_z;
+        group_dim.x = group_dim_x;
+        group_dim.y = group_dim_y;
+        group_dim.z = group_dim_z;
+    }
+};
+
+
+// Resets *lp to launch defaults: 1x1x1 grid and group, no dynamic group
+// memory, the queue-default barrier bit, launch_fence set to -1, a pointer to
+// a function-local static default accelerator view, and no completion future.
+extern inline void grid_launch_init(grid_launch_parm *lp) {
+    lp->grid_dim = gl_dim3(1, 1, 1);
+    lp->group_dim = gl_dim3(1, 1, 1);
+    lp->dynamic_group_mem_bytes = 0;
+    lp->barrier_bit = barrier_bit_queue_default;
+    lp->launch_fence = -1;
+
+    // TODO - set to NULL?
+    static hc::accelerator_view default_view = hc::accelerator().get_default_view();
+    lp->av = &default_view;
+    lp->cf = NULL;
+}
+
diff --git a/third_party/rocm/include/hip/hcc_detail/grid_launch_GGL.hpp b/third_party/rocm/include/hip/hcc_detail/grid_launch_GGL.hpp
new file mode 100644
index 0000000..1c05279
--- /dev/null
+++ b/third_party/rocm/include/hip/hcc_detail/grid_launch_GGL.hpp
@@ -0,0 +1,30 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#pragma once
+
+#if GENERIC_GRID_LAUNCH == 1
+#if __hcc_workweek__ >= 17481
+#include "functional_grid_launch.hpp"
+#else
+#include "macro_based_grid_launch.hpp"
+#endif
+#endif // GENERIC_GRID_LAUNCH
\ No newline at end of file
diff --git a/third_party/rocm/include/hip/hcc_detail/helpers.hpp b/third_party/rocm/include/hip/hcc_detail/helpers.hpp
new file mode 100644
index 0000000..b94b126
--- /dev/null
+++ b/third_party/rocm/include/hip/hcc_detail/helpers.hpp
@@ -0,0 +1,137 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+#include "concepts.hpp"
+
+#include <functional>   // For std::reference_wrapper.
+#include <type_traits>  // For std::conditional, std::decay, std::enable_if, std::false_type, std::result_of and std::true_type.
+#include <utility>      // For std::declval.
+
+#ifdef __has_include                            // Check if __has_include is present
+#  if __has_include(<version>)                  // Check for version header
+#    include <version>
+#    if defined(__cpp_lib_is_invocable) && !defined(HIP_HAS_INVOCABLE)
+#      define HIP_HAS_INVOCABLE __cpp_lib_is_invocable
+#    endif
+#    if defined(__cpp_lib_result_of_sfinae) && !defined(HIP_HAS_RESULT_OF_SFINAE)
+#      define HIP_HAS_RESULT_OF_SFINAE __cpp_lib_result_of_sfinae
+#    endif
+#  endif
+#endif
+// Fall back to 0 when detection above did not define the flags, so later #if tests never see an undefined name.
+#ifndef HIP_HAS_INVOCABLE
+#define HIP_HAS_INVOCABLE 0
+#endif
+
+#ifndef HIP_HAS_RESULT_OF_SFINAE
+#define HIP_HAS_RESULT_OF_SFINAE 0
+#endif
+
+namespace std {  // TODO: these should be removed as soon as possible.
+#if (__cplusplus < 201406L)  // NOTE(review): already implied by the stricter inner check below
+#if (__cplusplus < 201402L)  // pre-C++14 only: supply the *_t alias templates
+template <bool cond, typename T = void>
+using enable_if_t = typename enable_if<cond, T>::type;
+template <bool cond, typename T, typename U>
+using conditional_t = typename conditional<cond, T, U>::type;
+template <typename T>
+using decay_t = typename decay<T>::type;
+template <FunctionalProcedure F, typename... Ts>  // FunctionalProcedure is not defined in this header; presumably from concepts.hpp -- confirm
+using result_of_t = typename result_of<F(Ts...)>::type;
+template <typename T>
+using remove_reference_t = typename remove_reference<T>::type;
+#endif  // __cplusplus < 201402L
+#endif  // __cplusplus < 201406L
+}  // namespace std
+
+namespace hip_impl {
+template <typename...>
+using void_t_ = void;  // maps any list of types to void; used below to make a specialization depend on an expression's validity
+
+#if HIP_HAS_INVOCABLE  // library provides std::is_invocable: defer to it
+template <typename, typename = void>
+struct is_callable_impl;
+
+template <FunctionalProcedure F, typename... Ts>
+struct is_callable_impl<F(Ts...)> : std::is_invocable<F, Ts...> {};
+#elif HIP_HAS_RESULT_OF_SFINAE  // std::result_of is SFINAE-friendly: true only if its ::type exists
+template <typename, typename = void>
+struct is_callable_impl : std::false_type {};
+
+template <FunctionalProcedure F, typename... Ts>
+struct is_callable_impl<F(Ts...), void_t_<typename std::result_of<F(Ts...)>::type > > : std::true_type {};
+#else  // neither available: detect via the declaration-only simple_invoke overloads below (data member through an object first)
+template <class Base, class T, class Derived>
+auto simple_invoke(T Base::*pmd, Derived&& ref)
+-> decltype(static_cast<Derived&&>(ref).*pmd);
+// Pointer to member applied through a dereferenceable pointer-like object.
+template <class PMD, class Pointer>
+auto simple_invoke(PMD&& pmd, Pointer&& ptr)
+-> decltype((*static_cast<Pointer&&>(ptr)).*static_cast<PMD&&>(pmd));
+// Data member accessed through a std::reference_wrapper.
+template <class Base, class T, class Derived>
+auto simple_invoke(T Base::*pmd, const std::reference_wrapper<Derived>& ref)
+-> decltype(ref.get().*pmd);
+// Member function called on an object with arguments.
+template <class Base, class T, class Derived, class... Args>
+auto simple_invoke(T Base::*pmf, Derived&& ref, Args&&... args)
+-> decltype((static_cast<Derived&&>(ref).*pmf)(static_cast<Args&&>(args)...));
+// Member function called through a dereferenceable pointer-like object.
+template <class PMF, class Pointer, class... Args>
+auto simple_invoke(PMF&& pmf, Pointer&& ptr, Args&&... args)
+-> decltype(((*static_cast<Pointer&&>(ptr)).*static_cast<PMF&&>(pmf))(static_cast<Args&&>(args)...));
+// Member function called through a std::reference_wrapper.
+template <class Base, class T, class Derived, class... Args>
+auto simple_invoke(T Base::*pmf, const std::reference_wrapper<Derived>& ref, Args&&... args)
+-> decltype((ref.get().*pmf)(static_cast<Args&&>(args)...));
+// Plain callable invoked as f(xs...).
+template<class F, class... Ts>
+auto simple_invoke(F&& f, Ts&&... xs)
+-> decltype(f(static_cast<Ts&&>(xs)...));
+
+template <typename, typename = void>
+struct is_callable_impl : std::false_type {};
+// True when some simple_invoke overload accepts (F, Ts...).
+template <FunctionalProcedure F, typename... Ts>
+struct is_callable_impl<F(Ts...), void_t_<decltype(simple_invoke(std::declval<F>(), std::declval<Ts>()...))> >
+    : std::true_type {};
+
+#endif
+
+template <typename Call>
+struct is_callable : is_callable_impl<Call> {};  // public trait; inherits from whichever is_callable_impl variant was selected above
+// count_macro_args_hip_(...) expands to its argument count (0 to 31); the ", ##__VA_ARGS__" form drops the comma when no arguments are given.
+#define count_macro_args_impl_hip_(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, \
+                                   _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25,  \
+                                   _26, _27, _28, _29, _30, _31, _n, ...)                       \
+    _n
+#define count_macro_args_hip_(...)                                                              \
+    count_macro_args_impl_hip_(, ##__VA_ARGS__, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, \
+                               19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, \
+                               0)
+// overload_macro_hip_(macro, args...) pastes the argument count onto `macro` and invokes the result with the same arguments.
+#define overloaded_macro_expand_hip_(macro, arg_cnt) macro##arg_cnt
+#define overload_macro_impl_hip_(macro, arg_cnt) overloaded_macro_expand_hip_(macro, arg_cnt)
+#define overload_macro_hip_(macro, ...) \
+    overload_macro_impl_hip_(macro, count_macro_args_hip_(__VA_ARGS__))(__VA_ARGS__)
+} // namespace hip_impl
diff --git a/third_party/rocm/include/hip/hcc_detail/hip_atomic.h b/third_party/rocm/include/hip/hcc_detail/hip_atomic.h
new file mode 100644
index 0000000..a1370ce
--- /dev/null
+++ b/third_party/rocm/include/hip/hcc_detail/hip_atomic.h
@@ -0,0 +1,286 @@
+#pragma once
+
+#include "device_functions.h"
+
+// atomicCAS: relaxed strong compare-and-swap. If *address equals `compare`,
+// `val` is stored. Every overload returns the value *address held before the
+// call (__atomic_compare_exchange_n writes it into the expected slot on failure).
+__device__ inline
+int atomicCAS(int* address, int compare, int val)
+{
+    int expected = compare;
+    __atomic_compare_exchange_n(
+        address, &expected, val, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED);
+    return expected;
+}
+
+__device__ inline
+unsigned int atomicCAS(
+    unsigned int* address, unsigned int compare, unsigned int val)
+{
+    unsigned int expected = compare;
+    __atomic_compare_exchange_n(
+        address, &expected, val, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED);
+    return expected;
+}
+
+__device__ inline
+unsigned long long atomicCAS(
+    unsigned long long* address, unsigned long long compare, unsigned long long val)
+{
+    unsigned long long expected = compare;
+    __atomic_compare_exchange_n(
+        address, &expected, val, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED);
+    return expected;
+}
+
+// atomicAdd (int, unsigned int, unsigned long long, float): relaxed
+// fetch-and-add. Each overload returns the value stored at `address`
+// before `val` was added.
+__device__ inline int atomicAdd(int* address, int val)
+{
+    return __atomic_fetch_add(address, val, __ATOMIC_RELAXED);
+}
+
+__device__ inline unsigned int atomicAdd(unsigned int* address, unsigned int val)
+{
+    return __atomic_fetch_add(address, val, __ATOMIC_RELAXED);
+}
+
+__device__ inline
+unsigned long long atomicAdd(unsigned long long* address, unsigned long long val)
+{
+    return __atomic_fetch_add(address, val, __ATOMIC_RELAXED);
+}
+
+// NOTE(review): relies on the compiler accepting a float operand for
+// __atomic_fetch_add -- confirm against the targeted compiler version.
+__device__ inline float atomicAdd(float* address, float val)
+{
+    return __atomic_fetch_add(address, val, __ATOMIC_RELAXED);
+}
+
+// Adds `val` to *address through __ockl_atomic_add_noret_f32 without handing
+// back the previous value. Marked deprecated in favour of atomicAdd.
+DEPRECATED("use atomicAdd instead")
+__device__ inline void atomicAddNoRet(float* address, float val)
+{
+    __ockl_atomic_add_noret_f32(address, val);
+}
+
+__device__
+inline
+double atomicAdd(double* address, double val)  // returns the value held at `address` before the add
+{
+    unsigned long long* uaddr{reinterpret_cast<unsigned long long*>(address)};
+    unsigned long long r{__atomic_load_n(uaddr, __ATOMIC_RELAXED)};
+    // Emulated with a 64-bit CAS loop: reinterpret the bits, add, try to swap.
+    unsigned long long old;
+    do {
+        old = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
+        // The snapshot in r no longer matches memory: refresh it and retry.
+        if (r != old) { r = old; continue; }
+        // atomicCAS returns the prior contents; equal to `old` means the swap took effect.
+        r = atomicCAS(
+            uaddr, r, __double_as_longlong(val + __longlong_as_double(r)));
+
+        if (r == old) break;
+    } while (true);
+
+    return __longlong_as_double(r);
+}
+
+// atomicSub: relaxed fetch-and-subtract; returns the value stored at
+// `address` before `val` was subtracted.
+__device__ inline int atomicSub(int* address, int val)
+{
+    return __atomic_fetch_sub(address, val, __ATOMIC_RELAXED);
+}
+
+__device__ inline
+unsigned int atomicSub(unsigned int* address, unsigned int val)
+{
+    return __atomic_fetch_sub(address, val, __ATOMIC_RELAXED);
+}
+
+// atomicExch (int, unsigned int, unsigned long long, float): relaxed
+// exchange. Each overload stores `val` and returns the value that was
+// stored at `address` beforehand.
+__device__ inline int atomicExch(int* address, int val)
+{
+    return __atomic_exchange_n(address, val, __ATOMIC_RELAXED);
+}
+
+__device__ inline unsigned int atomicExch(unsigned int* address, unsigned int val)
+{
+    return __atomic_exchange_n(address, val, __ATOMIC_RELAXED);
+}
+
+__device__ inline
+unsigned long long atomicExch(unsigned long long* address, unsigned long long val)
+{
+    return __atomic_exchange_n(address, val, __ATOMIC_RELAXED);
+}
+
+// The float overload exchanges the 32-bit pattern through an unsigned int view.
+__device__ inline float atomicExch(float* address, float val)
+{
+    unsigned int* const bits_address = reinterpret_cast<unsigned int*>(address);
+    const unsigned int previous_bits =
+        __atomic_exchange_n(bits_address, __float_as_uint(val), __ATOMIC_RELAXED);
+    return __uint_as_float(previous_bits);
+}
+
+// atomicMin (32-bit overloads): relaxed fetch-and-min; returns the value
+// stored at `address` before the update.
+__device__ inline int atomicMin(int* address, int val)
+{
+    return __atomic_fetch_min(address, val, __ATOMIC_RELAXED);
+}
+
+__device__ inline
+unsigned int atomicMin(unsigned int* address, unsigned int val)
+{
+    return __atomic_fetch_min(address, val, __ATOMIC_RELAXED);
+}
+// atomicMin (64-bit unsigned), emulated with a CAS loop. Returns the value
+// *address held before this call's update (or its current value when no
+// update was needed), consistent with the int/unsigned int overloads.
+__device__ inline
+unsigned long long atomicMin(
+    unsigned long long* address, unsigned long long val)
+{
+    unsigned long long old{__atomic_load_n(address, __ATOMIC_RELAXED)};
+    while (val < old) {
+        // atomicCAS returns the prior contents; a match means the store happened.
+        const unsigned long long seen = atomicCAS(address, old, val);
+        if (seen == old) break;
+        old = seen;  // lost a race: retry against the newer value
+    }
+    return old;
+}
+
+// atomicMax (32-bit overloads): relaxed fetch-and-max; returns the value
+// stored at `address` before the update.
+__device__ inline int atomicMax(int* address, int val)
+{
+    return __atomic_fetch_max(address, val, __ATOMIC_RELAXED);
+}
+
+__device__ inline
+unsigned int atomicMax(unsigned int* address, unsigned int val)
+{
+    return __atomic_fetch_max(address, val, __ATOMIC_RELAXED);
+}
+// atomicMax (64-bit unsigned), emulated with a CAS loop. Returns the value
+// *address held before this call's update (or its current value when no
+// update was needed), consistent with the int/unsigned int overloads.
+__device__ inline
+unsigned long long atomicMax(
+    unsigned long long* address, unsigned long long val)
+{
+    unsigned long long old{__atomic_load_n(address, __ATOMIC_RELAXED)};
+    while (old < val) {
+        // atomicCAS returns the prior contents; a match means the store happened.
+        const unsigned long long seen = atomicCAS(address, old, val);
+        if (seen == old) break;
+        old = seen;  // lost a race: retry against the newer value
+    }
+    return old;
+}
+
+__device__
+inline
+unsigned int atomicInc(unsigned int* address, unsigned int val)  // thin wrapper over the amdgcn atomic.inc intrinsic
+{
+    __device__
+    extern
+    unsigned int __builtin_amdgcn_atomic_inc(
+        unsigned int*,
+        unsigned int,
+        unsigned int,
+        unsigned int,
+        bool) __asm("llvm.amdgcn.atomic.inc.i32.p0i32");
+    // The local declaration above binds the name to the LLVM intrinsic through its asm label; last argument's meaning is not shown here -- TODO confirm.
+    return __builtin_amdgcn_atomic_inc(
+        address, val, __ATOMIC_RELAXED, 1 /* Device scope */, false);
+}
+
+__device__
+inline
+unsigned int atomicDec(unsigned int* address, unsigned int val)  // thin wrapper over the amdgcn atomic.dec intrinsic
+{
+    __device__
+    extern
+    unsigned int __builtin_amdgcn_atomic_dec(
+        unsigned int*,
+        unsigned int,
+        unsigned int,
+        unsigned int,
+        bool) __asm("llvm.amdgcn.atomic.dec.i32.p0i32");
+    // The local declaration above binds the name to the LLVM intrinsic through its asm label; last argument's meaning is not shown here -- TODO confirm.
+    return __builtin_amdgcn_atomic_dec(
+        address, val, __ATOMIC_RELAXED, 1 /* Device scope */, false);
+}
+
+// atomicAnd: relaxed fetch-and-AND for int, unsigned int and unsigned long
+// long. Each overload returns the value stored at `address` before `val`
+// was combined into it.
+__device__ inline int atomicAnd(int* address, int val)
+{
+    return __atomic_fetch_and(address, val, __ATOMIC_RELAXED);
+}
+
+__device__ inline unsigned int atomicAnd(unsigned int* address, unsigned int val)
+{
+    return __atomic_fetch_and(address, val, __ATOMIC_RELAXED);
+}
+
+__device__ inline
+unsigned long long atomicAnd(
+    unsigned long long* address, unsigned long long val)
+{
+    return __atomic_fetch_and(address, val, __ATOMIC_RELAXED);
+}
+
+// atomicOr: relaxed fetch-and-OR for int, unsigned int and unsigned long
+// long. Each overload returns the value stored at `address` before `val`
+// was combined into it.
+__device__ inline int atomicOr(int* address, int val)
+{
+    return __atomic_fetch_or(address, val, __ATOMIC_RELAXED);
+}
+
+__device__ inline unsigned int atomicOr(unsigned int* address, unsigned int val)
+{
+    return __atomic_fetch_or(address, val, __ATOMIC_RELAXED);
+}
+
+__device__ inline
+unsigned long long atomicOr(
+    unsigned long long* address, unsigned long long val)
+{
+    return __atomic_fetch_or(address, val, __ATOMIC_RELAXED);
+}
+
+// atomicXor: relaxed fetch-and-XOR for int, unsigned int and unsigned long
+// long. Each overload returns the value stored at `address` before `val`
+// was combined into it.
+__device__ inline int atomicXor(int* address, int val)
+{
+    return __atomic_fetch_xor(address, val, __ATOMIC_RELAXED);
+}
+
+__device__ inline unsigned int atomicXor(unsigned int* address, unsigned int val)
+{
+    return __atomic_fetch_xor(address, val, __ATOMIC_RELAXED);
+}
+
+__device__ inline
+unsigned long long atomicXor(
+    unsigned long long* address, unsigned long long val)
+{
+    return __atomic_fetch_xor(address, val, __ATOMIC_RELAXED);
+}
+
+// TODO: add scoped atomics i.e. atomic{*}_system && atomic{*}_block.
diff --git a/third_party/rocm/include/hip/hcc_detail/hip_common.h b/third_party/rocm/include/hip/hcc_detail/hip_common.h
new file mode 100644
index 0000000..2e2abac
--- /dev/null
+++ b/third_party/rocm/include/hip/hcc_detail/hip_common.h
@@ -0,0 +1,40 @@
+/*
+Copyright (c) 2019 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COMMON_H
+#define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COMMON_H
+
+#if defined(__HCC__)                            // __HCC__ defined: HCC-only flag set
+#define __HCC_OR_HIP_CLANG__ 1
+#define __HCC_ONLY__ 1
+#define __HIP_CLANG_ONLY__ 0
+#elif defined(__clang__) && defined(__HIP__)    // clang with __HIP__ defined: HIP-Clang-only flag set
+#define __HCC_OR_HIP_CLANG__ 1
+#define __HCC_ONLY__ 0
+#define __HIP_CLANG_ONLY__ 1
+#else                                           // neither: all three flags are 0
+#define __HCC_OR_HIP_CLANG__ 0
+#define __HCC_ONLY__ 0
+#define __HIP_CLANG_ONLY__ 0
+#endif
+
+#endif // HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COMMON_H
diff --git a/third_party/rocm/include/hip/hcc_detail/hip_complex.h b/third_party/rocm/include/hip/hcc_detail/hip_complex.h
new file mode 100644
index 0000000..11648ce
--- /dev/null
+++ b/third_party/rocm/include/hip/hcc_detail/hip_complex.h
@@ -0,0 +1,304 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COMPLEX_H
+#define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COMPLEX_H
+
+#include "hip/hcc_detail/hip_vector_types.h"
+
+// TODO: Clang has a bug which allows device functions to call std functions
+// when std functions are introduced into default namespace by using statement.
+// math.h may be included after this bug is fixed.
+#if __cplusplus
+#include <cmath>
+#else
+#include "math.h"
+#endif
+
+#if __cplusplus  // C++ only: macros that generate operators for a type with x and y members; first, unary minus (negates both)
+#define COMPLEX_NEG_OP_OVERLOAD(type)                                                          \
+    __device__ __host__ static inline type operator-(const type& op) {                        \
+        type ret;                                                                              \
+        ret.x = -op.x;                                                                         \
+        ret.y = -op.y;                                                                         \
+        return ret;                                                                            \
+    }
+// Equality: both components must compare equal.
+#define COMPLEX_EQ_OP_OVERLOAD(type)                                                           \
+    __device__ __host__ static inline bool operator==(const type& lhs, const type& rhs) {     \
+        return lhs.x == rhs.x && lhs.y == rhs.y;                                               \
+    }
+// Inequality: negation of operator==, so COMPLEX_EQ_OP_OVERLOAD must be instantiated for the same type.
+#define COMPLEX_NE_OP_OVERLOAD(type)                                                           \
+    __device__ __host__ static inline bool operator!=(const type& lhs, const type& rhs) {     \
+        return !(lhs == rhs);                                                                  \
+    }
+// Component-wise addition.
+#define COMPLEX_ADD_OP_OVERLOAD(type)                                                          \
+    __device__ __host__ static inline type operator+(const type& lhs, const type& rhs) {      \
+        type ret;                                                                              \
+        ret.x = lhs.x + rhs.x;                                                                 \
+        ret.y = lhs.y + rhs.y;                                                                 \
+        return ret;                                                                            \
+    }
+// Component-wise subtraction.
+#define COMPLEX_SUB_OP_OVERLOAD(type)                                                          \
+    __device__ __host__ static inline type operator-(const type& lhs, const type& rhs) {      \
+        type ret;                                                                              \
+        ret.x = lhs.x - rhs.x;                                                                 \
+        ret.y = lhs.y - rhs.y;                                                                 \
+        return ret;                                                                            \
+    }
+// Complex product: x = lhs.x*rhs.x - lhs.y*rhs.y, y = lhs.x*rhs.y + lhs.y*rhs.x.
+#define COMPLEX_MUL_OP_OVERLOAD(type)                                                          \
+    __device__ __host__ static inline type operator*(const type& lhs, const type& rhs) {      \
+        type ret;                                                                              \
+        ret.x = lhs.x * rhs.x - lhs.y * rhs.y;                                                 \
+        ret.y = lhs.x * rhs.y + lhs.y * rhs.x;                                                 \
+        return ret;                                                                            \
+    }
+// Complex quotient: numerators are computed first, then each is divided by rhs.x^2 + rhs.y^2 (no guard against a zero divisor).
+#define COMPLEX_DIV_OP_OVERLOAD(type)                                                          \
+    __device__ __host__ static inline type operator/(const type& lhs, const type& rhs) {      \
+        type ret;                                                                              \
+        ret.x = (lhs.x * rhs.x + lhs.y * rhs.y);                                               \
+        ret.y = (rhs.x * lhs.y - lhs.x * rhs.y);                                               \
+        ret.x = ret.x / (rhs.x * rhs.x + rhs.y * rhs.y);                                       \
+        ret.y = ret.y / (rhs.x * rhs.x + rhs.y * rhs.y);                                       \
+        return ret;                                                                            \
+    }
+// In-place component-wise addition; returns lhs.
+#define COMPLEX_ADD_PREOP_OVERLOAD(type)                                                       \
+    __device__ __host__ static inline type& operator+=(type& lhs, const type& rhs) {          \
+        lhs.x += rhs.x;                                                                        \
+        lhs.y += rhs.y;                                                                        \
+        return lhs;                                                                            \
+    }
+// In-place component-wise subtraction; returns lhs.
+#define COMPLEX_SUB_PREOP_OVERLOAD(type)                                                       \
+    __device__ __host__ static inline type& operator-=(type& lhs, const type& rhs) {          \
+        lhs.x -= rhs.x;                                                                        \
+        lhs.y -= rhs.y;                                                                        \
+        return lhs;                                                                            \
+    }
+// In-place product; implemented via operator*, so COMPLEX_MUL_OP_OVERLOAD must be instantiated for the same type.
+#define COMPLEX_MUL_PREOP_OVERLOAD(type)                                                       \
+    __device__ __host__ static inline type& operator*=(type& lhs, const type& rhs) {          \
+        lhs = lhs * rhs;                                                                       \
+        return lhs;                                                                            \
+    }
+// In-place quotient; implemented via operator/, so COMPLEX_DIV_OP_OVERLOAD must be instantiated for the same type.
+#define COMPLEX_DIV_PREOP_OVERLOAD(type)                                                       \
+    __device__ __host__ static inline type& operator/=(type& lhs, const type& rhs) {          \
+        lhs = lhs / rhs;                                                                       \
+        return lhs;                                                                            \
+    }
+// Scaling by a scalar of type1 on the right-hand side only (complex * scalar).
+#define COMPLEX_SCALAR_PRODUCT(type, type1)                                                    \
+    __device__ __host__ static inline type operator*(const type& lhs, type1 rhs) {            \
+        type ret;                                                                              \
+        ret.x = lhs.x * rhs;                                                                   \
+        ret.y = lhs.y * rhs;                                                                   \
+        return ret;                                                                            \
+    }
+
+#endif
+
+typedef float2 hipFloatComplex;
+
+__device__ __host__ static inline float hipCrealf(hipFloatComplex z) { return z.x; }
+
+__device__ __host__ static inline float hipCimagf(hipFloatComplex z) { return z.y; }
+
+__device__ __host__ static inline hipFloatComplex make_hipFloatComplex(float a, float b) {
+ hipFloatComplex z;
+ z.x = a;
+ z.y = b;
+ return z;
+}
+
+__device__ __host__ static inline hipFloatComplex hipConjf(hipFloatComplex z) {
+ hipFloatComplex ret;
+ ret.x = z.x;
+ ret.y = -z.y;
+ return ret;
+}
+
+__device__ __host__ static inline float hipCsqabsf(hipFloatComplex z) {
+ return z.x * z.x + z.y * z.y;
+}
+
+__device__ __host__ static inline hipFloatComplex hipCaddf(hipFloatComplex p, hipFloatComplex q) {
+ return make_hipFloatComplex(p.x + q.x, p.y + q.y);
+}
+
+__device__ __host__ static inline hipFloatComplex hipCsubf(hipFloatComplex p, hipFloatComplex q) {
+ return make_hipFloatComplex(p.x - q.x, p.y - q.y);
+}
+
+__device__ __host__ static inline hipFloatComplex hipCmulf(hipFloatComplex p, hipFloatComplex q) {
+ return make_hipFloatComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y);
+}
+
+__device__ __host__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatComplex q) {
+ float sqabs = hipCsqabsf(q); // |q|^2; not checked for zero before dividing
+ hipFloatComplex ret;
+ ret.x = (p.x * q.x + p.y * q.y) / sqabs; // Re(p * conj(q)) / |q|^2
+ ret.y = (p.y * q.x - p.x * q.y) / sqabs; // Im(p * conj(q)) / |q|^2
+ return ret;
+}
+
+__device__ __host__ static inline float hipCabsf(hipFloatComplex z) { return sqrtf(hipCsqabsf(z)); }
+
+
+typedef double2 hipDoubleComplex;
+
+__device__ __host__ static inline double hipCreal(hipDoubleComplex z) { return z.x; }
+
+__device__ __host__ static inline double hipCimag(hipDoubleComplex z) { return z.y; }
+
+__device__ __host__ static inline hipDoubleComplex make_hipDoubleComplex(double a, double b) {
+ hipDoubleComplex z;
+ z.x = a;
+ z.y = b;
+ return z;
+}
+
+__device__ __host__ static inline hipDoubleComplex hipConj(hipDoubleComplex z) {
+ hipDoubleComplex ret;
+ ret.x = z.x;
+ ret.y = -z.y;
+ return ret;
+}
+
+__device__ __host__ static inline double hipCsqabs(hipDoubleComplex z) {
+ return z.x * z.x + z.y * z.y;
+}
+
+__device__ __host__ static inline hipDoubleComplex hipCadd(hipDoubleComplex p, hipDoubleComplex q) {
+ return make_hipDoubleComplex(p.x + q.x, p.y + q.y);
+}
+
+__device__ __host__ static inline hipDoubleComplex hipCsub(hipDoubleComplex p, hipDoubleComplex q) {
+ return make_hipDoubleComplex(p.x - q.x, p.y - q.y);
+}
+
+__device__ __host__ static inline hipDoubleComplex hipCmul(hipDoubleComplex p, hipDoubleComplex q) {
+ return make_hipDoubleComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y);
+}
+
+__device__ __host__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleComplex q) {
+ double sqabs = hipCsqabs(q); // |q|^2; not checked for zero before dividing
+ hipDoubleComplex ret;
+ ret.x = (p.x * q.x + p.y * q.y) / sqabs; // Re(p * conj(q)) / |q|^2
+ ret.y = (p.y * q.x - p.x * q.y) / sqabs; // Im(p * conj(q)) / |q|^2
+ return ret;
+}
+
+// Magnitude |z| of a double complex; uses sqrt (not sqrtf) so the result keeps double precision.
+__device__ __host__ static inline double hipCabs(hipDoubleComplex z) { return sqrt(hipCsqabs(z)); }
+
+#if __cplusplus
+
+COMPLEX_NEG_OP_OVERLOAD(hipFloatComplex)
+COMPLEX_EQ_OP_OVERLOAD(hipFloatComplex)
+COMPLEX_NE_OP_OVERLOAD(hipFloatComplex)
+COMPLEX_ADD_OP_OVERLOAD(hipFloatComplex)
+COMPLEX_SUB_OP_OVERLOAD(hipFloatComplex)
+COMPLEX_MUL_OP_OVERLOAD(hipFloatComplex)
+COMPLEX_DIV_OP_OVERLOAD(hipFloatComplex)
+COMPLEX_ADD_PREOP_OVERLOAD(hipFloatComplex)
+COMPLEX_SUB_PREOP_OVERLOAD(hipFloatComplex)
+COMPLEX_MUL_PREOP_OVERLOAD(hipFloatComplex)
+COMPLEX_DIV_PREOP_OVERLOAD(hipFloatComplex)
+COMPLEX_SCALAR_PRODUCT(hipFloatComplex, unsigned short)
+COMPLEX_SCALAR_PRODUCT(hipFloatComplex, signed short)
+COMPLEX_SCALAR_PRODUCT(hipFloatComplex, unsigned int)
+COMPLEX_SCALAR_PRODUCT(hipFloatComplex, signed int)
+COMPLEX_SCALAR_PRODUCT(hipFloatComplex, float)
+COMPLEX_SCALAR_PRODUCT(hipFloatComplex, unsigned long)
+COMPLEX_SCALAR_PRODUCT(hipFloatComplex, signed long)
+COMPLEX_SCALAR_PRODUCT(hipFloatComplex, double)
+COMPLEX_SCALAR_PRODUCT(hipFloatComplex, signed long long)
+COMPLEX_SCALAR_PRODUCT(hipFloatComplex, unsigned long long)
+
+COMPLEX_NEG_OP_OVERLOAD(hipDoubleComplex)
+COMPLEX_EQ_OP_OVERLOAD(hipDoubleComplex)
+COMPLEX_NE_OP_OVERLOAD(hipDoubleComplex)
+COMPLEX_ADD_OP_OVERLOAD(hipDoubleComplex)
+COMPLEX_SUB_OP_OVERLOAD(hipDoubleComplex)
+COMPLEX_MUL_OP_OVERLOAD(hipDoubleComplex)
+COMPLEX_DIV_OP_OVERLOAD(hipDoubleComplex)
+COMPLEX_ADD_PREOP_OVERLOAD(hipDoubleComplex)
+COMPLEX_SUB_PREOP_OVERLOAD(hipDoubleComplex)
+COMPLEX_MUL_PREOP_OVERLOAD(hipDoubleComplex)
+COMPLEX_DIV_PREOP_OVERLOAD(hipDoubleComplex)
+COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, unsigned short)
+COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, signed short)
+COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, unsigned int)
+COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, signed int)
+COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, float)
+COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, unsigned long)
+COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, signed long)
+COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, double)
+COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, signed long long)
+COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, unsigned long long)
+
+#endif
+
+
+typedef hipFloatComplex hipComplex;
+
+__device__ __host__ static inline hipComplex make_hipComplex(float x, float y) {
+ return make_hipFloatComplex(x, y);
+}
+
+__device__ __host__ static inline hipFloatComplex hipComplexDoubleToFloat(hipDoubleComplex z) {
+ return make_hipFloatComplex((float)z.x, (float)z.y);
+}
+
+__device__ __host__ static inline hipDoubleComplex hipComplexFloatToDouble(hipFloatComplex z) {
+ return make_hipDoubleComplex((double)z.x, (double)z.y);
+}
+
+__device__ __host__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q, hipComplex r) {
+ float real = (p.x * q.x) + r.x; // first partial of Re(p*q + r)
+ float imag = (q.x * p.y) + r.y; // first partial of Im(p*q + r)
+
+ real = -(p.y * q.y) + real; // subtract p.y*q.y to complete the real part
+ imag = (p.x * q.y) + imag; // add p.x*q.y to complete the imaginary part
+
+ return make_hipComplex(real, imag);
+}
+
+__device__ __host__ static inline hipDoubleComplex hipCfma(hipDoubleComplex p, hipDoubleComplex q,
+ hipDoubleComplex r) {
+ double real = (p.x * q.x) + r.x; // first partial of Re(p*q + r)
+ double imag = (q.x * p.y) + r.y; // first partial of Im(p*q + r)
+
+ real = -(p.y * q.y) + real; // subtract p.y*q.y to complete the real part
+ imag = (p.x * q.y) + imag; // add p.x*q.y to complete the imaginary part
+
+ return make_hipDoubleComplex(real, imag);
+}
+
+#endif //HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COMPLEX_H
diff --git a/third_party/rocm/include/hip/hcc_detail/hip_cooperative_groups.h b/third_party/rocm/include/hip/hcc_detail/hip_cooperative_groups.h
new file mode 100644
index 0000000..353bdc5
--- /dev/null
+++ b/third_party/rocm/include/hip/hcc_detail/hip_cooperative_groups.h
@@ -0,0 +1,304 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/**
+ * @file hcc_detail/hip_cooperative_groups.h
+ *
+ * @brief Device side implementation of `Cooperative Group` feature.
+ *
+ * Defines new types and device API wrappers related to `Cooperative Group`
+ * feature, which the programmer can directly use in his kernel(s) in order to
+ * make use of this feature.
+ */
+#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_H
+#define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_H
+
+#if __cplusplus
+#include <hip/hcc_detail/hip_cooperative_groups_helper.h>
+
+namespace cooperative_groups {
+
+/** \brief The base type of all cooperative group types
+ *
+ * \details Holds the key properties of a constructed cooperative group type
+ * object, like the group type, its size, etc
+ */
+class thread_group {
+ protected:
+ uint32_t _type; // thread_group type
+ uint32_t _size; // total number of threads in the thread_group
+ uint64_t _mask; // Lanemask for coalesced and tiled partitioned group types,
+ // LSB represents lane 0, and MSB represents lane 63
+
+ // Construct a thread group, and set thread group type and other essential
+ // thread group properties. This generic thread group is directly constructed
+ // only when the group is supposed to contain only the calling thread
+ // (through the API - `this_thread()`), and in all other cases, this thread
+ // group object is a sub-object of some other derived thread group object
+ __CG_QUALIFIER__ thread_group(internal::group_type type, uint32_t size,
+ uint64_t mask = (uint64_t)0) {
+ _type = type;
+ _size = size;
+ _mask = mask;
+ }
+
+ public:
+ // Total number of threads in the thread group, and this serves the purpose
+ // for all derived cooperative group types since their `size` is directly
+ // saved during the construction
+ __CG_QUALIFIER__ uint32_t size() const {
+ return _size;
+ }
+ // Rank of the calling thread within [0, size())
+ __CG_QUALIFIER__ uint32_t thread_rank() const;
+ // Is this cooperative group type valid?
+ __CG_QUALIFIER__ bool is_valid() const;
+ // synchronize the threads in the thread group
+ __CG_QUALIFIER__ void sync() const;
+};
+
+/** \brief The multi-grid cooperative group type
+ *
+ * \details Represents an inter-device cooperative group type where the
+ * participating threads within the group span across multiple
+ * devices, running the (same) kernel on these devices
+ */
+class multi_grid_group : public thread_group {
+ // Only these friend functions are allowed to construct an object of this class
+ // and access its resources
+ friend __CG_QUALIFIER__ multi_grid_group this_multi_grid();
+
+ protected:
+ // Construct multi-grid thread group (through the API this_multi_grid())
+ explicit __CG_QUALIFIER__ multi_grid_group(uint32_t size)
+ : thread_group(internal::cg_multi_grid, size) { }
+
+ public:
+ // Number of invocations participating in this multi-grid group. In other
+ // words, the number of GPUs
+ __CG_QUALIFIER__ uint32_t num_grids() {
+ return internal::multi_grid::num_grids();
+ }
+ // Rank of this invocation. In other words, an ID number within the range
+ // [0, num_grids()) of the GPU, this kernel is running on
+ __CG_QUALIFIER__ uint32_t grid_rank() {
+ return internal::multi_grid::grid_rank();
+ }
+ __CG_QUALIFIER__ uint32_t thread_rank() const {
+ return internal::multi_grid::thread_rank();
+ }
+ __CG_QUALIFIER__ bool is_valid() const {
+ return internal::multi_grid::is_valid();
+ }
+ __CG_QUALIFIER__ void sync() const {
+ internal::multi_grid::sync();
+ }
+};
+
+/** \brief User exposed API interface to construct multi-grid cooperative
+ * group type object - `multi_grid_group`
+ *
+ * \details User is not allowed to directly construct an object of type
+ * `multi_grid_group`. Instead, he should construct it through this
+ * API function
+ */
+__CG_QUALIFIER__ multi_grid_group
+this_multi_grid() {
+ return multi_grid_group(internal::multi_grid::size());
+}
+
+/** \brief The grid cooperative group type
+ *
+ * \details Represents an inter-workgroup cooperative group type where the
+ * participating threads within the group spans across multiple
+ * workgroups running the (same) kernel on the same device
+ */
+class grid_group : public thread_group {
+ // Only these friend functions are allowed to construct an object of this class
+ // and access its resources
+ friend __CG_QUALIFIER__ grid_group this_grid();
+
+ protected:
+ // Construct grid thread group (through the API this_grid())
+ explicit __CG_QUALIFIER__ grid_group(uint32_t size)
+ : thread_group(internal::cg_grid, size) { }
+
+ public:
+ __CG_QUALIFIER__ uint32_t thread_rank() const {
+ return internal::grid::thread_rank();
+ }
+ __CG_QUALIFIER__ bool is_valid() const {
+ return internal::grid::is_valid();
+ }
+ __CG_QUALIFIER__ void sync() const {
+ internal::grid::sync();
+ }
+};
+
+/** \brief User exposed API interface to construct grid cooperative group type
+ * object - `grid_group`
+ *
+ * \details User is not allowed to directly construct an object of type
+ * `grid_group`. Instead, he should construct it through this
+ * API function
+ */
+__CG_QUALIFIER__ grid_group
+this_grid() {
+ return grid_group(internal::grid::size());
+}
+
+/** \brief The workgroup (thread-block in CUDA terminology) cooperative group
+ * type
+ *
+ * \details Represents an intra-workgroup cooperative group type where the
+ * participating threads within the group are exactly the same threads
+ * which participate in the currently executing `workgroup`
+ */
+class thread_block : public thread_group {
+ // Only these friend functions are allowed to construct an object of this
+ // class and access its resources
+ friend __CG_QUALIFIER__ thread_block this_thread_block();
+
+ protected:
+ // Construct a workgroup thread group (through the API this_thread_block())
+ explicit __CG_QUALIFIER__ thread_block(uint32_t size)
+ : thread_group(internal::cg_workgroup, size) { }
+
+ public:
+ // 3-dimensional block index within the grid
+ __CG_QUALIFIER__ dim3 group_index() {
+ return internal::workgroup::group_index();
+ }
+ // 3-dimensional thread index within the block
+ __CG_QUALIFIER__ dim3 thread_index() {
+ return internal::workgroup::thread_index();
+ }
+ __CG_QUALIFIER__ uint32_t thread_rank() const {
+ return internal::workgroup::thread_rank();
+ }
+ __CG_QUALIFIER__ bool is_valid() const {
+ return internal::workgroup::is_valid();
+ }
+ __CG_QUALIFIER__ void sync() const {
+ internal::workgroup::sync();
+ }
+};
+
+/** \brief User exposed API interface to construct workgroup cooperative
+ * group type object - `thread_block`
+ *
+ * \details User is not allowed to directly construct an object of type
+ * `thread_block`. Instead, he should construct it through this API
+ * function
+ */
+__CG_QUALIFIER__ thread_block
+this_thread_block() {
+ return thread_block(internal::workgroup::size());
+}
+
+/**
+ * Implementation of all publicly exposed base class APIs
+ */
+__CG_QUALIFIER__ uint32_t thread_group::thread_rank() const {
+ switch (this->_type) { // dispatch by hand on the group type stored at construction
+ case internal::cg_multi_grid: {
+ return (static_cast<const multi_grid_group*>(this)->thread_rank());
+ }
+ case internal::cg_grid: {
+ return (static_cast<const grid_group*>(this)->thread_rank());
+ }
+ case internal::cg_workgroup: {
+ return (static_cast<const thread_block*>(this)->thread_rank());
+ }
+ default: {
+ assert(false && "invalid cooperative group type");
+ return -1; // converts to UINT32_MAX because the return type is uint32_t
+ }
+ }
+}
+
+__CG_QUALIFIER__ bool thread_group::is_valid() const {
+ switch (this->_type) {
+ case internal::cg_multi_grid: {
+ return (static_cast<const multi_grid_group*>(this)->is_valid());
+ }
+ case internal::cg_grid: {
+ return (static_cast<const grid_group*>(this)->is_valid());
+ }
+ case internal::cg_workgroup: {
+ return (static_cast<const thread_block*>(this)->is_valid());
+ }
+ default: {
+ assert(false && "invalid cooperative group type");
+ return false;
+ }
+ }
+}
+
+__CG_QUALIFIER__ void thread_group::sync() const {
+ switch (this->_type) {
+ case internal::cg_multi_grid: {
+ static_cast<const multi_grid_group*>(this)->sync();
+ break;
+ }
+ case internal::cg_grid: {
+ static_cast<const grid_group*>(this)->sync();
+ break;
+ }
+ case internal::cg_workgroup: {
+ static_cast<const thread_block*>(this)->sync();
+ break;
+ }
+ default: {
+ assert(false && "invalid cooperative group type");
+ }
+ }
+}
+
+/**
+ * Implementation of publicly exposed `wrapper` APIs on top of basic cooperative
+ * group type APIs
+ */
+template <class CGTy>
+__CG_QUALIFIER__ uint32_t group_size(CGTy const &g) {
+ return g.size();
+}
+
+template <class CGTy>
+__CG_QUALIFIER__ uint32_t thread_rank(CGTy const &g) {
+ return g.thread_rank();
+}
+
+template <class CGTy>
+__CG_QUALIFIER__ bool is_valid(CGTy const &g) {
+ return g.is_valid();
+}
+
+template <class CGTy>
+__CG_QUALIFIER__ void sync(CGTy const &g) {
+ g.sync();
+}
+
+} // namespace cooperative_groups
+
+#endif // __cplusplus
+#endif // HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_H
diff --git a/third_party/rocm/include/hip/hcc_detail/hip_cooperative_groups_helper.h b/third_party/rocm/include/hip/hcc_detail/hip_cooperative_groups_helper.h
new file mode 100644
index 0000000..4e10c0d
--- /dev/null
+++ b/third_party/rocm/include/hip/hcc_detail/hip_cooperative_groups_helper.h
@@ -0,0 +1,182 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/**
+ * @file hcc_detail/hip_cooperative_groups_helper.h
+ *
+ * @brief Device side implementation of cooperative group feature.
+ *
+ * Defines helper constructs and APIs which aid the types and device API
+ * wrappers defined within `hcc_detail/hip_cooperative_groups.h`.
+ */
+#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
+#define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
+
+#if __cplusplus
+#include <hip/hcc_detail/hip_runtime_api.h>
+#include <hip/hcc_detail/device_functions.h>
+
+#if !defined(__align__)
+#define __align__(x) __attribute__((aligned(x)))
+#endif
+
+#if !defined(__CG_QUALIFIER__)
+#define __CG_QUALIFIER__ __device__ __forceinline__
+#endif
+
+#if !defined(__CG_STATIC_QUALIFIER__)
+#define __CG_STATIC_QUALIFIER__ __device__ static __forceinline__
+#endif
+
+#if !defined(WAVEFRONT_SIZE)
+#define WAVEFRONT_SIZE 64
+#endif
+
+namespace cooperative_groups {
+
+namespace internal {
+
+/** \brief Enums representing different cooperative group types
+ */
+typedef enum {
+ cg_invalid,
+ cg_multi_grid,
+ cg_grid,
+ cg_workgroup
+} group_type;
+
+/**
+ * Functionalities related to multi-grid cooperative group type
+ */
+namespace multi_grid {
+
+__CG_STATIC_QUALIFIER__ uint32_t num_grids() {
+ return (uint32_t)__ockl_multi_grid_num_grids();
+}
+
+__CG_STATIC_QUALIFIER__ uint32_t grid_rank() {
+ return (uint32_t)__ockl_multi_grid_grid_rank();
+}
+
+__CG_STATIC_QUALIFIER__ uint32_t size() {
+ return (uint32_t)__ockl_multi_grid_size();
+}
+
+__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
+ return (uint32_t)__ockl_multi_grid_thread_rank();
+}
+
+__CG_STATIC_QUALIFIER__ bool is_valid() {
+ return (bool)__ockl_multi_grid_is_valid();
+}
+
+__CG_STATIC_QUALIFIER__ void sync() {
+ __ockl_multi_grid_sync();
+}
+
+} // namespace multi_grid
+
+/**
+ * Functionalities related to grid cooperative group type
+ */
+namespace grid {
+
+__CG_STATIC_QUALIFIER__ uint32_t size() {
+ return (uint32_t)((hipBlockDim_z * hipGridDim_z) *
+ (hipBlockDim_y * hipGridDim_y) *
+ (hipBlockDim_x * hipGridDim_x));
+}
+
+__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
+ // Linearized (x-fastest) index of the workgroup that contains the calling thread
+ uint32_t blkIdx =
+ (uint32_t)((hipBlockIdx_z * hipGridDim_y * hipGridDim_x) +
+ (hipBlockIdx_y * hipGridDim_x) +
+ (hipBlockIdx_x));
+
+ // Number of threads in all workgroups that precede the current workgroup:
+ // workgroup index times threads per workgroup
+ uint32_t num_threads_till_current_workgroup =
+ (uint32_t)(blkIdx * (hipBlockDim_x * hipBlockDim_y * hipBlockDim_z));
+
+ // Linearized (x-fastest) index of the calling thread within its workgroup
+ uint32_t local_thread_rank =
+ (uint32_t)((hipThreadIdx_z * hipBlockDim_y * hipBlockDim_x) +
+ (hipThreadIdx_y * hipBlockDim_x) +
+ (hipThreadIdx_x));
+
+ return (num_threads_till_current_workgroup + local_thread_rank);
+}
+
+__CG_STATIC_QUALIFIER__ bool is_valid() {
+ return (bool)__ockl_grid_is_valid();
+}
+
+__CG_STATIC_QUALIFIER__ void sync() {
+ __ockl_grid_sync();
+}
+
+} // namespace grid
+
+/**
+ * Functionalities related to `workgroup` (thread_block in CUDA terminology)
+ * cooperative group type
+ */
+namespace workgroup {
+
+__CG_STATIC_QUALIFIER__ dim3 group_index() {
+ return (dim3((uint32_t)hipBlockIdx_x, (uint32_t)hipBlockIdx_y,
+ (uint32_t)hipBlockIdx_z));
+}
+
+__CG_STATIC_QUALIFIER__ dim3 thread_index() {
+ return (dim3((uint32_t)hipThreadIdx_x, (uint32_t)hipThreadIdx_y,
+ (uint32_t)hipThreadIdx_z));
+}
+
+__CG_STATIC_QUALIFIER__ uint32_t size() {
+ return((uint32_t)(hipBlockDim_x * hipBlockDim_y * hipBlockDim_z));
+}
+
+__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
+ return ((uint32_t)((hipThreadIdx_z * hipBlockDim_y * hipBlockDim_x) +
+ (hipThreadIdx_y * hipBlockDim_x) +
+ (hipThreadIdx_x)));
+}
+
+__CG_STATIC_QUALIFIER__ bool is_valid() {
+ //TODO(mahesha) any functionality need to be added here? I believe not
+ return true;
+}
+
+__CG_STATIC_QUALIFIER__ void sync() {
+ __syncthreads();
+}
+
+} // namespace workgroup
+
+} // namespace internal
+
+} // namespace cooperative_groups
+
+#endif // __cplusplus
+#endif // HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
diff --git a/third_party/rocm/include/hip/hcc_detail/hip_fp16.h b/third_party/rocm/include/hip/hcc_detail/hip_fp16.h
new file mode 100644
index 0000000..af004a8
--- /dev/null
+++ b/third_party/rocm/include/hip/hcc_detail/hip_fp16.h
@@ -0,0 +1,1658 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_FP16_H
+#define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_FP16_H
+
+#include <hip/hcc_detail/hip_common.h>
+
+#include "hip/hcc_detail/host_defines.h"
+#include <assert.h>
+#if defined(__cplusplus)
+ #include <algorithm>
+ #include <type_traits>
+ #include <utility>
+#endif
+
+#if __HCC_OR_HIP_CLANG__
+ typedef _Float16 _Float16_2 __attribute__((ext_vector_type(2)));
+
+ struct __half_raw {
+ union {
+ static_assert(sizeof(_Float16) == sizeof(unsigned short), "");
+
+ _Float16 data;
+ unsigned short x;
+ };
+ };
+
+ struct __half2_raw {
+ union {
+ static_assert(sizeof(_Float16_2) == sizeof(unsigned short[2]), "");
+
+ _Float16_2 data;
+ struct {
+ unsigned short x;
+ unsigned short y;
+ };
+ };
+ };
+
+ #if defined(__cplusplus)
+ #include "hip_fp16_math_fwd.h"
+ #include "hip_vector_types.h"
+ #include "host_defines.h"
+
+ namespace std
+ {
+ template<> struct is_floating_point<_Float16> : std::true_type {};
+ }
+
+ template<bool cond, typename T = void>
+ using Enable_if_t = typename std::enable_if<cond, T>::type;
+
+ // BEGIN STRUCT __HALF
+ struct __half {
+ protected:
+ union {
+ static_assert(sizeof(_Float16) == sizeof(unsigned short), "");
+
+ _Float16 data;
+ unsigned short __x;
+ };
+ public:
+ // CREATORS
+ __host__ __device__
+ __half() = default;
+ __host__ __device__
+ __half(const __half_raw& x) : data{x.data} {}
+ #if !defined(__HIP_NO_HALF_CONVERSIONS__)
+ __host__ __device__
+ __half(decltype(data) x) : data{x} {}
+ template<
+ typename T,
+ Enable_if_t<std::is_floating_point<T>{}>* = nullptr>
+ __host__ __device__
+ __half(T x) : data{static_cast<_Float16>(x)} {}
+ #endif
+ __host__ __device__
+ __half(const __half&) = default;
+ __host__ __device__
+ __half(__half&&) = default;
+ __host__ __device__
+ ~__half() = default;
+
+ // CREATORS - INTEGRAL TYPES (host and device)
+ #if !defined(__HIP_NO_HALF_CONVERSIONS__)
+ template<
+ typename T, Enable_if_t<std::is_integral<T>{}>* = nullptr>
+ __host__ __device__
+ __half(T x) : data{static_cast<_Float16>(x)} {}
+ #endif
+
+ // MANIPULATORS
+ __host__ __device__
+ __half& operator=(const __half&) = default;
+ __host__ __device__
+ __half& operator=(__half&&) = default;
+ __host__ __device__
+ __half& operator=(const __half_raw& x)
+ {
+ data = x.data;
+ return *this;
+ }
+ __host__ __device__
+ volatile __half& operator=(const __half_raw& x) volatile
+ {
+ data = x.data;
+ return *this;
+ }
+ volatile __half& operator=(const volatile __half_raw& x) volatile
+ {
+ data = x.data;
+ return *this;
+ }
+ __half& operator=(__half_raw&& x)
+ {
+ data = x.data;
+ return *this;
+ }
+ volatile __half& operator=(__half_raw&& x) volatile
+ {
+ data = x.data;
+ return *this;
+ }
+ volatile __half& operator=(volatile __half_raw&& x) volatile
+ {
+ data = x.data;
+ return *this;
+ }
+ #if !defined(__HIP_NO_HALF_CONVERSIONS__)
+ template<
+ typename T,
+ Enable_if_t<std::is_floating_point<T>{}>* = nullptr>
+ __host__ __device__
+ __half& operator=(T x)
+ {
+ data = static_cast<_Float16>(x);
+ return *this;
+ }
+ #endif
+
+ // MANIPULATORS - DEVICE ONLY
+ #if !defined(__HIP_NO_HALF_CONVERSIONS__)
+ template<
+ typename T, Enable_if_t<std::is_integral<T>{}>* = nullptr>
+ __device__
+ __half& operator=(T x)
+ {
+ data = static_cast<_Float16>(x);
+ return *this;
+ }
+ #endif
+
+ #if !defined(__HIP_NO_HALF_OPERATORS__)
+ __device__
+ __half& operator+=(const __half& x)
+ {
+ data += x.data;
+ return *this;
+ }
+ __device__
+ __half& operator-=(const __half& x)
+ {
+ data -= x.data;
+ return *this;
+ }
+ __device__
+ __half& operator*=(const __half& x)
+ {
+ data *= x.data;
+ return *this;
+ }
+ __device__
+ __half& operator/=(const __half& x)
+ {
+ data /= x.data;
+ return *this;
+ }
+ __device__
+ __half& operator++() { ++data; return *this; }
+ __device__
+ __half operator++(int)
+ {
+ __half tmp{*this};
+ ++*this;
+ return tmp;
+ }
+ __device__
+ __half& operator--() { --data; return *this; }
+ __device__
+ __half operator--(int)
+ {
+ __half tmp{*this};
+ --*this;
+ return tmp;
+ }
+ #endif
+
+ // ACCESSORS
+ #if !defined(__HIP_NO_HALF_CONVERSIONS__)
+ template<
+ typename T,
+ Enable_if_t<std::is_floating_point<T>{}>* = nullptr>
+ __host__ __device__
+ operator T() const { return data; }
+ #endif
+ __host__ __device__
+ operator __half_raw() const { return __half_raw{data}; }
+ __host__ __device__
+ operator __half_raw() const volatile
+ {
+ return __half_raw{data};
+ }
+
+ #if !defined(__HIP_NO_HALF_CONVERSIONS__)
+ template<
+ typename T, Enable_if_t<std::is_integral<T>{}>* = nullptr>
+ __host__ __device__
+ operator T() const { return data; }
+ #endif
+
+ #if !defined(__HIP_NO_HALF_OPERATORS__)
+ __device__
+ __half operator+() const { return *this; }
+ __device__
+ __half operator-() const
+ {
+ __half tmp{*this};
+ tmp.data = -tmp.data;
+ return tmp;
+ }
+ #endif
+
+ // FRIENDS
+ #if !defined(__HIP_NO_HALF_OPERATORS__)
+ friend
+ inline
+ __device__
+ __half operator+(const __half& x, const __half& y)
+ {
+ return __half{x} += y;
+ }
+ friend
+ inline
+ __device__
+ __half operator-(const __half& x, const __half& y)
+ {
+ return __half{x} -= y;
+ }
+ friend
+ inline
+ __device__
+ __half operator*(const __half& x, const __half& y)
+ {
+ return __half{x} *= y;
+ }
+ friend
+ inline
+ __device__
+ __half operator/(const __half& x, const __half& y)
+ {
+ return __half{x} /= y;
+ }
+ friend
+ inline
+ __device__
+ bool operator==(const __half& x, const __half& y)
+ {
+ return x.data == y.data;
+ }
+ friend
+ inline
+ __device__
+ bool operator!=(const __half& x, const __half& y)
+ {
+ return !(x == y);
+ }
+ friend
+ inline
+ __device__
+ bool operator<(const __half& x, const __half& y)
+ {
+ return x.data < y.data;
+ }
+ friend
+ inline
+ __device__
+ bool operator>(const __half& x, const __half& y)
+ {
+ return y.data < x.data;
+ }
+ friend
+ inline
+ __device__
+ bool operator<=(const __half& x, const __half& y)
+ {
+ return !(y < x);
+ }
+ friend
+ inline
+ __device__
+ bool operator>=(const __half& x, const __half& y)
+ {
+ return !(x < y);
+ }
+ #endif // !defined(__HIP_NO_HALF_OPERATORS__)
+ };
+ // END STRUCT __HALF
+
+ // BEGIN STRUCT __HALF2
+ struct __half2 {
+ protected:
+ union {
+ static_assert(
+ sizeof(_Float16_2) == sizeof(unsigned short[2]), "");
+
+ _Float16_2 data;
+ struct {
+ unsigned short x;
+ unsigned short y;
+ };
+ };
+ public:
+ // CREATORS
+ __host__ __device__
+ __half2() = default;
+ __host__ __device__
+ __half2(const __half2_raw& x) : data{x.data} {}
+ __host__ __device__
+ __half2(decltype(data) x) : data{x} {}
+ __host__ __device__
+ __half2(const __half& x, const __half& y)
+ :
+ data{
+ static_cast<__half_raw>(x).data,
+ static_cast<__half_raw>(y).data}
+ {}
+ __host__ __device__
+ __half2(const __half2&) = default;
+ __host__ __device__
+ __half2(__half2&&) = default;
+ __host__ __device__
+ ~__half2() = default;
+
+ // MANIPULATORS
+ __host__ __device__
+ __half2& operator=(const __half2&) = default;
+ __host__ __device__
+ __half2& operator=(__half2&&) = default;
+ __host__ __device__
+ __half2& operator=(const __half2_raw& x)
+ {
+ data = x.data;
+ return *this;
+ }
+
+ // MANIPULATORS - DEVICE ONLY
+ #if !defined(__HIP_NO_HALF_OPERATORS__)
+ __device__
+ __half2& operator+=(const __half2& x)
+ {
+ data += x.data;
+ return *this;
+ }
+ __device__
+ __half2& operator-=(const __half2& x)
+ {
+ data -= x.data;
+ return *this;
+ }
+ __device__
+ __half2& operator*=(const __half2& x)
+ {
+ data *= x.data;
+ return *this;
+ }
+ __device__
+ __half2& operator/=(const __half2& x)
+ {
+ data /= x.data;
+ return *this;
+ }
+ __device__
+ __half2& operator++() { return *this += _Float16_2{1, 1}; }
+ __device__
+ __half2 operator++(int)
+ {
+ __half2 tmp{*this};
+ ++*this;
+ return tmp;
+ }
+ __device__
+ __half2& operator--() { return *this -= _Float16_2{1, 1}; }
+ __device__
+ __half2 operator--(int)
+ {
+ __half2 tmp{*this};
+ --*this;
+ return tmp;
+ }
+ #endif
+
+ // ACCESSORS
+ __host__ __device__
+ operator decltype(data)() const { return data; }
+ __host__ __device__
+ operator __half2_raw() const { return __half2_raw{data}; }
+
+ // ACCESSORS - DEVICE ONLY
+ #if !defined(__HIP_NO_HALF_OPERATORS__)
+ __device__
+ __half2 operator+() const { return *this; }
+ __device__
+ __half2 operator-() const
+ {
+ __half2 tmp{*this};
+ tmp.data = -tmp.data;
+ return tmp;
+ }
+ #endif
+
+ // FRIENDS - device-only binary operators; removed when __HIP_NO_HALF_OPERATORS__ is defined
+ #if !defined(__HIP_NO_HALF_OPERATORS__)
+ friend
+ inline
+ __device__
+ __half2 operator+(const __half2& x, const __half2& y)
+ {
+ return __half2{x} += y; // copy x, then reuse the compound assignment
+ }
+ friend
+ inline
+ __device__
+ __half2 operator-(const __half2& x, const __half2& y)
+ {
+ return __half2{x} -= y; // copy x, then reuse the compound assignment
+ }
+ friend
+ inline
+ __device__
+ __half2 operator*(const __half2& x, const __half2& y)
+ {
+ return __half2{x} *= y; // copy x, then reuse the compound assignment
+ }
+ friend
+ inline
+ __device__
+ __half2 operator/(const __half2& x, const __half2& y)
+ {
+ return __half2{x} /= y; // copy x, then reuse the compound assignment
+ }
+ friend
+ inline
+ __device__
+ bool operator==(const __half2& x, const __half2& y)
+ {
+ auto r = x.data == y.data; // lane-wise comparison result
+ return r.x != 0 && r.y != 0; // true only when BOTH lanes compare equal
+ }
+ friend
+ inline
+ __device__
+ bool operator!=(const __half2& x, const __half2& y)
+ {
+ return !(x == y); // true when at least one lane differs
+ }
+ friend
+ inline
+ __device__
+ bool operator<(const __half2& x, const __half2& y)
+ {
+ auto r = x.data < y.data; // lane-wise comparison result
+ return r.x != 0 && r.y != 0; // true only when BOTH lanes of x are less
+ }
+ friend
+ inline
+ __device__
+ bool operator>(const __half2& x, const __half2& y)
+ {
+ return y < x; // both lanes of y less than the matching lanes of x
+ }
+ friend
+ inline
+ __device__
+ bool operator<=(const __half2& x, const __half2& y)
+ {
+ return !(y < x); // NOTE(review): true unless BOTH lanes of y are less than x's - not an all-lanes <=
+ }
+ friend
+ inline
+ __device__
+ bool operator>=(const __half2& x, const __half2& y)
+ {
+ return !(x < y); // NOTE(review): true unless BOTH lanes of x are less than y's - not an all-lanes >=
+ }
+ #endif // !defined(__HIP_NO_HALF_OPERATORS__)
+ };
+ // END STRUCT __HALF2
+
+ namespace
+ {
+ inline
+ __host__ __device__
+ __half2 make_half2(__half x, __half y) // pair (x, y) built with the two-argument __half2 constructor
+ {
+ return __half2{x, y};
+ }
+
+ inline
+ __host__ __device__
+ __half __low2half(__half2 x) // extracts the .x lane
+ {
+ return __half{__half_raw{static_cast<__half2_raw>(x).data.x}};
+ }
+
+ inline
+ __host__ __device__
+ __half __high2half(__half2 x) // extracts the .y lane
+ {
+ return __half{__half_raw{static_cast<__half2_raw>(x).data.y}};
+ }
+
+ inline
+ __host__ __device__
+ __half2 __half2half2(__half x) // same scalar in both lanes
+ {
+ return __half2{x, x};
+ }
+
+ inline
+ __host__ __device__
+ __half2 __halves2half2(__half x, __half y) // same body as make_half2
+ {
+ return __half2{x, y};
+ }
+
+ inline
+ __host__ __device__
+ __half2 __low2half2(__half2 x) // (x.x, x.x)
+ {
+ return __half2{
+ _Float16_2{
+ static_cast<__half2_raw>(x).data.x,
+ static_cast<__half2_raw>(x).data.x}};
+ }
+
+ inline
+ __host__ __device__
+ __half2 __high2half2(__half2 x) // (x.y, x.y)
+ {
+ return __half2_raw{
+ _Float16_2{
+ static_cast<__half2_raw>(x).data.y,
+ static_cast<__half2_raw>(x).data.y}};
+ }
+
+ inline
+ __host__ __device__
+ __half2 __lows2half2(__half2 x, __half2 y) // (x.x, y.x)
+ {
+ return __half2_raw{
+ _Float16_2{
+ static_cast<__half2_raw>(x).data.x,
+ static_cast<__half2_raw>(y).data.x}};
+ }
+
+ inline
+ __host__ __device__
+ __half2 __highs2half2(__half2 x, __half2 y) // (x.y, y.y)
+ {
+ return __half2_raw{
+ _Float16_2{
+ static_cast<__half2_raw>(x).data.y,
+ static_cast<__half2_raw>(y).data.y}};
+ }
+
+ inline
+ __host__ __device__
+ __half2 __lowhigh2highlow(__half2 x) // lanes swapped: (x.y, x.x)
+ {
+ return __half2_raw{
+ _Float16_2{
+ static_cast<__half2_raw>(x).data.y,
+ static_cast<__half2_raw>(x).data.x}};
+ }
+
+ // Bitcasts: these go through the __half_raw::x member rather than the _Float16 'data' member.
+ inline
+ __device__
+ short __half_as_short(__half x)
+ {
+ return static_cast<__half_raw>(x).x; // member x converted to the short return type
+ }
+
+ inline
+ __device__
+ unsigned short __half_as_ushort(__half x)
+ {
+ return static_cast<__half_raw>(x).x;
+ }
+
+ inline
+ __device__
+ __half __short_as_half(short x)
+ {
+ __half_raw r; r.x = x; // only member x is written
+ return r;
+ }
+
+ inline
+ __device__
+ __half __ushort_as_half(unsigned short x)
+ {
+ __half_raw r; r.x = x; // only member x is written
+ return r;
+ }
+
+ // TODO: rounding behaviour is not correct: every variant below performs the same static_cast, so the _rn/_rz/_rd/_ru suffix selects nothing.
+ // float -> half | half2
+ inline
+ __device__ __host__
+ __half __float2half(float x)
+ {
+ return __half_raw{static_cast<_Float16>(x)};
+ }
+ inline
+ __device__ __host__
+ __half __float2half_rn(float x) // same body as __float2half
+ {
+ return __half_raw{static_cast<_Float16>(x)};
+ }
+ inline
+ __device__ __host__
+ __half __float2half_rz(float x) // same body as __float2half (see TODO)
+ {
+ return __half_raw{static_cast<_Float16>(x)};
+ }
+ inline
+ __device__ __host__
+ __half __float2half_rd(float x) // same body as __float2half (see TODO)
+ {
+ return __half_raw{static_cast<_Float16>(x)};
+ }
+ inline
+ __device__ __host__
+ __half __float2half_ru(float x) // same body as __float2half (see TODO)
+ {
+ return __half_raw{static_cast<_Float16>(x)};
+ }
+ inline
+ __device__ __host__
+ __half2 __float2half2_rn(float x) // converted x placed in both lanes
+ {
+ return __half2_raw{
+ _Float16_2{
+ static_cast<_Float16>(x), static_cast<_Float16>(x)}};
+ }
+ inline
+ __device__ __host__
+ __half2 __floats2half2_rn(float x, float y) // (x, y) converted lane by lane
+ {
+ return __half2_raw{_Float16_2{
+ static_cast<_Float16>(x), static_cast<_Float16>(y)}};
+ }
+ inline
+ __device__ __host__
+ __half2 __float22half2_rn(float2 x) // forwards the .x/.y members to __floats2half2_rn
+ {
+ return __floats2half2_rn(x.x, x.y);
+ }
+
+ // half | half2 -> float (implicit _Float16 -> float conversion of the stored value)
+ inline
+ __device__ __host__
+ float __half2float(__half x)
+ {
+ return static_cast<__half_raw>(x).data;
+ }
+ inline
+ __device__ __host__
+ float __low2float(__half2 x) // converts the .x lane
+ {
+ return static_cast<__half2_raw>(x).data.x;
+ }
+ inline
+ __device__ __host__
+ float __high2float(__half2 x) // converts the .y lane
+ {
+ return static_cast<__half2_raw>(x).data.y;
+ }
+ inline
+ __device__ __host__
+ float2 __half22float2(__half2 x) // both lanes, packed with make_float2
+ {
+ return make_float2(
+ static_cast<__half2_raw>(x).data.x,
+ static_cast<__half2_raw>(x).data.y);
+ }
+
+ // half -> int; the value is rounded to an integral half first, so the suffix selects the rounding mode.
+ inline
+ __device__
+ int __half2int_rn(__half x) // nearest; __ocml_rint_f16 is assumed to round ties to even (default mode)
+ {
+ return __ocml_rint_f16(static_cast<__half_raw>(x).data);
+ }
+ inline
+ __device__
+ int __half2int_rz(__half x) // toward zero: the plain floating->integer conversion already truncates
+ {
+ return static_cast<__half_raw>(x).data;
+ }
+ inline
+ __device__
+ int __half2int_rd(__half x) // toward negative infinity
+ {
+ return __ocml_floor_f16(static_cast<__half_raw>(x).data);
+ }
+ inline
+ __device__
+ int __half2int_ru(__half x) // toward positive infinity
+ {
+ return __ocml_ceil_f16(static_cast<__half_raw>(x).data);
+ }
+
+ // int -> half; NOTE(review): all four variants share one body, so the suffix does not select a rounding mode here.
+ inline
+ __device__
+ __half __int2half_rn(int x)
+ {
+ return __half_raw{static_cast<_Float16>(x)};
+ }
+ inline
+ __device__
+ __half __int2half_rz(int x)
+ {
+ return __half_raw{static_cast<_Float16>(x)};
+ }
+ inline
+ __device__
+ __half __int2half_rd(int x)
+ {
+ return __half_raw{static_cast<_Float16>(x)};
+ }
+ inline
+ __device__
+ __half __int2half_ru(int x)
+ {
+ return __half_raw{static_cast<_Float16>(x)};
+ }
+
+ // half -> short; the value is rounded to an integral half first, so the suffix selects the rounding mode.
+ inline
+ __device__
+ short __half2short_rn(__half x) // nearest; __ocml_rint_f16 is assumed to round ties to even (default mode)
+ {
+ return __ocml_rint_f16(static_cast<__half_raw>(x).data);
+ }
+ inline
+ __device__
+ short __half2short_rz(__half x) // toward zero: the plain floating->integer conversion already truncates
+ {
+ return static_cast<__half_raw>(x).data;
+ }
+ inline
+ __device__
+ short __half2short_rd(__half x) // toward negative infinity
+ {
+ return __ocml_floor_f16(static_cast<__half_raw>(x).data);
+ }
+ inline
+ __device__
+ short __half2short_ru(__half x) // toward positive infinity
+ {
+ return __ocml_ceil_f16(static_cast<__half_raw>(x).data);
+ }
+
+ // short -> half; NOTE(review): all four variants share one body, so the suffix does not select a rounding mode here.
+ inline
+ __device__
+ __half __short2half_rn(short x)
+ {
+ return __half_raw{static_cast<_Float16>(x)};
+ }
+ inline
+ __device__
+ __half __short2half_rz(short x)
+ {
+ return __half_raw{static_cast<_Float16>(x)};
+ }
+ inline
+ __device__
+ __half __short2half_rd(short x)
+ {
+ return __half_raw{static_cast<_Float16>(x)};
+ }
+ inline
+ __device__
+ __half __short2half_ru(short x)
+ {
+ return __half_raw{static_cast<_Float16>(x)};
+ }
+
+ // half -> long long; the value is rounded to an integral half first, so the suffix selects the rounding mode.
+ inline
+ __device__
+ long long __half2ll_rn(__half x) // nearest; __ocml_rint_f16 is assumed to round ties to even (default mode)
+ {
+ return __ocml_rint_f16(static_cast<__half_raw>(x).data);
+ }
+ inline
+ __device__
+ long long __half2ll_rz(__half x) // toward zero: the plain floating->integer conversion already truncates
+ {
+ return static_cast<__half_raw>(x).data;
+ }
+ inline
+ __device__
+ long long __half2ll_rd(__half x) // toward negative infinity
+ {
+ return __ocml_floor_f16(static_cast<__half_raw>(x).data);
+ }
+ inline
+ __device__
+ long long __half2ll_ru(__half x) // toward positive infinity
+ {
+ return __ocml_ceil_f16(static_cast<__half_raw>(x).data);
+ }
+
+ // long long -> half; NOTE(review): all four variants share one body, so the suffix does not select a rounding mode here.
+ inline
+ __device__
+ __half __ll2half_rn(long long x)
+ {
+ return __half_raw{static_cast<_Float16>(x)};
+ }
+ inline
+ __device__
+ __half __ll2half_rz(long long x)
+ {
+ return __half_raw{static_cast<_Float16>(x)};
+ }
+ inline
+ __device__
+ __half __ll2half_rd(long long x)
+ {
+ return __half_raw{static_cast<_Float16>(x)};
+ }
+ inline
+ __device__
+ __half __ll2half_ru(long long x)
+ {
+ return __half_raw{static_cast<_Float16>(x)};
+ }
+
+ // half -> unsigned int; NOTE(review): all four variants share one body (a plain implicit conversion), so the suffix selects nothing.
+ inline
+ __device__
+ unsigned int __half2uint_rn(__half x)
+ {
+ return static_cast<__half_raw>(x).data;
+ }
+ inline
+ __device__
+ unsigned int __half2uint_rz(__half x)
+ {
+ return static_cast<__half_raw>(x).data;
+ }
+ inline
+ __device__
+ unsigned int __half2uint_rd(__half x)
+ {
+ return static_cast<__half_raw>(x).data;
+ }
+ inline
+ __device__
+ unsigned int __half2uint_ru(__half x)
+ {
+ return static_cast<__half_raw>(x).data;
+ }
+
+ // unsigned int -> half; NOTE(review): all four variants share one body, so the suffix selects nothing.
+ inline
+ __device__
+ __half __uint2half_rn(unsigned int x)
+ {
+ return __half_raw{static_cast<_Float16>(x)};
+ }
+ inline
+ __device__
+ __half __uint2half_rz(unsigned int x)
+ {
+ return __half_raw{static_cast<_Float16>(x)};
+ }
+ inline
+ __device__
+ __half __uint2half_rd(unsigned int x)
+ {
+ return __half_raw{static_cast<_Float16>(x)};
+ }
+ inline
+ __device__
+ __half __uint2half_ru(unsigned int x)
+ {
+ return __half_raw{static_cast<_Float16>(x)};
+ }
+
+ // half -> unsigned short; NOTE(review): all four variants share one body (a plain implicit conversion), so the suffix selects nothing.
+ inline
+ __device__
+ unsigned short __half2ushort_rn(__half x)
+ {
+ return static_cast<__half_raw>(x).data;
+ }
+ inline
+ __device__
+ unsigned short __half2ushort_rz(__half x)
+ {
+ return static_cast<__half_raw>(x).data;
+ }
+ inline
+ __device__
+ unsigned short __half2ushort_rd(__half x)
+ {
+ return static_cast<__half_raw>(x).data;
+ }
+ inline
+ __device__
+ unsigned short __half2ushort_ru(__half x)
+ {
+ return static_cast<__half_raw>(x).data;
+ }
+
+ // unsigned short -> half; NOTE(review): all four variants share one body, so the suffix selects nothing.
+ inline
+ __device__
+ __half __ushort2half_rn(unsigned short x)
+ {
+ return __half_raw{static_cast<_Float16>(x)};
+ }
+ inline
+ __device__
+ __half __ushort2half_rz(unsigned short x)
+ {
+ return __half_raw{static_cast<_Float16>(x)};
+ }
+ inline
+ __device__
+ __half __ushort2half_rd(unsigned short x)
+ {
+ return __half_raw{static_cast<_Float16>(x)};
+ }
+ inline
+ __device__
+ __half __ushort2half_ru(unsigned short x)
+ {
+ return __half_raw{static_cast<_Float16>(x)};
+ }
+
+ // half -> unsigned long long; NOTE(review): all four variants share one body (a plain implicit conversion), so the suffix selects nothing.
+ inline
+ __device__
+ unsigned long long __half2ull_rn(__half x)
+ {
+ return static_cast<__half_raw>(x).data;
+ }
+ inline
+ __device__
+ unsigned long long __half2ull_rz(__half x)
+ {
+ return static_cast<__half_raw>(x).data;
+ }
+ inline
+ __device__
+ unsigned long long __half2ull_rd(__half x)
+ {
+ return static_cast<__half_raw>(x).data;
+ }
+ inline
+ __device__
+ unsigned long long __half2ull_ru(__half x)
+ {
+ return static_cast<__half_raw>(x).data;
+ }
+
+ // unsigned long long -> half; NOTE(review): all four variants share one body, so the suffix selects nothing.
+ inline
+ __device__
+ __half __ull2half_rn(unsigned long long x)
+ {
+ return __half_raw{static_cast<_Float16>(x)};
+ }
+ inline
+ __device__
+ __half __ull2half_rz(unsigned long long x)
+ {
+ return __half_raw{static_cast<_Float16>(x)};
+ }
+ inline
+ __device__
+ __half __ull2half_rd(unsigned long long x)
+ {
+ return __half_raw{static_cast<_Float16>(x)};
+ }
+ inline
+ __device__
+ __half __ull2half_ru(unsigned long long x)
+ {
+ return __half_raw{static_cast<_Float16>(x)};
+ }
+
+ // Load primitives: every variant below is a plain dereference of ptr; the name suffix changes nothing in the body.
+ inline
+ __device__
+ __half __ldg(const __half* ptr) { return *ptr; }
+ inline
+ __device__
+ __half __ldcg(const __half* ptr) { return *ptr; }
+ inline
+ __device__
+ __half __ldca(const __half* ptr) { return *ptr; }
+ inline
+ __device__
+ __half __ldcs(const __half* ptr) { return *ptr; }
+
+ inline
+ __host__ __device__
+ __half2 __ldg(const __half2* ptr) { return *ptr; } // __half2 overloads are also marked __host__
+ inline
+ __host__ __device__
+ __half2 __ldcg(const __half2* ptr) { return *ptr; }
+ inline
+ __host__ __device__
+ __half2 __ldca(const __half2* ptr) { return *ptr; }
+ inline
+ __host__ __device__
+ __half2 __ldcs(const __half2* ptr) { return *ptr; }
+
+ // Relations: scalar comparisons applied to the _Float16 'data' members, returning bool.
+ inline
+ __device__
+ bool __heq(__half x, __half y) // x == y
+ {
+ return static_cast<__half_raw>(x).data ==
+ static_cast<__half_raw>(y).data;
+ }
+ inline
+ __device__
+ bool __hne(__half x, __half y) // x != y
+ {
+ return static_cast<__half_raw>(x).data !=
+ static_cast<__half_raw>(y).data;
+ }
+ inline
+ __device__
+ bool __hle(__half x, __half y) // x <= y
+ {
+ return static_cast<__half_raw>(x).data <=
+ static_cast<__half_raw>(y).data;
+ }
+ inline
+ __device__
+ bool __hge(__half x, __half y) // x >= y
+ {
+ return static_cast<__half_raw>(x).data >=
+ static_cast<__half_raw>(y).data;
+ }
+ inline
+ __device__
+ bool __hlt(__half x, __half y) // x < y
+ {
+ return static_cast<__half_raw>(x).data <
+ static_cast<__half_raw>(y).data;
+ }
+ inline
+ __device__
+ bool __hgt(__half x, __half y) // x > y
+ {
+ return static_cast<__half_raw>(x).data >
+ static_cast<__half_raw>(y).data;
+ }
+ inline // NOTE(review): each 'u' variant below only forwards to its non-'u' counterpart; NaN gets no special handling here
+ __device__
+ bool __hequ(__half x, __half y) { return __heq(x, y); }
+ inline
+ __device__
+ bool __hneu(__half x, __half y) { return __hne(x, y); }
+ inline
+ __device__
+ bool __hleu(__half x, __half y) { return __hle(x, y); }
+ inline
+ __device__
+ bool __hgeu(__half x, __half y) { return __hge(x, y); }
+ inline
+ __device__
+ bool __hltu(__half x, __half y) { return __hlt(x, y); }
+ inline
+ __device__
+ bool __hgtu(__half x, __half y) { return __hgt(x, y); }
+
+ inline // lane-wise relations: the comparison result is negated and converted to a _Float16_2 mask
+ __host__ __device__
+ __half2 __heq2(__half2 x, __half2 y)
+ {
+ auto r = static_cast<__half2_raw>(x).data ==
+ static_cast<__half2_raw>(y).data;
+ return __builtin_convertvector(-r, _Float16_2); // presumably 1.0 for true lanes, 0.0 otherwise - depends on the vector-compare encoding
+ }
+ inline
+ __host__ __device__
+ __half2 __hne2(__half2 x, __half2 y)
+ {
+ auto r = static_cast<__half2_raw>(x).data !=
+ static_cast<__half2_raw>(y).data;
+ return __builtin_convertvector(-r, _Float16_2);
+ }
+ inline
+ __host__ __device__
+ __half2 __hle2(__half2 x, __half2 y)
+ {
+ auto r = static_cast<__half2_raw>(x).data <=
+ static_cast<__half2_raw>(y).data;
+ return __builtin_convertvector(-r, _Float16_2);
+ }
+ inline
+ __host__ __device__
+ __half2 __hge2(__half2 x, __half2 y)
+ {
+ auto r = static_cast<__half2_raw>(x).data >=
+ static_cast<__half2_raw>(y).data;
+ return __builtin_convertvector(-r, _Float16_2);
+ }
+ inline
+ __host__ __device__
+ __half2 __hlt2(__half2 x, __half2 y)
+ {
+ auto r = static_cast<__half2_raw>(x).data <
+ static_cast<__half2_raw>(y).data;
+ return __builtin_convertvector(-r, _Float16_2);
+ }
+ inline
+ __host__ __device__
+ __half2 __hgt2(__half2 x, __half2 y)
+ {
+ auto r = static_cast<__half2_raw>(x).data >
+ static_cast<__half2_raw>(y).data;
+ return __builtin_convertvector(-r, _Float16_2);
+ }
+ inline // NOTE(review): each 'u' variant below only forwards to its non-'u' counterpart
+ __host__ __device__
+ __half2 __hequ2(__half2 x, __half2 y) { return __heq2(x, y); }
+ inline
+ __host__ __device__
+ __half2 __hneu2(__half2 x, __half2 y) { return __hne2(x, y); }
+ inline
+ __host__ __device__
+ __half2 __hleu2(__half2 x, __half2 y) { return __hle2(x, y); }
+ inline
+ __host__ __device__
+ __half2 __hgeu2(__half2 x, __half2 y) { return __hge2(x, y); }
+ inline
+ __host__ __device__
+ __half2 __hltu2(__half2 x, __half2 y) { return __hlt2(x, y); }
+ inline
+ __host__ __device__
+ __half2 __hgtu2(__half2 x, __half2 y) { return __hgt2(x, y); }
+
+ inline // __hb*2: reduce the lane-wise mask to one bool that is true only when BOTH lanes are nonzero
+ __host__ __device__
+ bool __hbeq2(__half2 x, __half2 y)
+ {
+ auto r = static_cast<__half2_raw>(__heq2(x, y));
+ return r.data.x != 0 && r.data.y != 0;
+ }
+ inline
+ __host__ __device__
+ bool __hbne2(__half2 x, __half2 y) // true only when both lanes differ (not the negation of __hbeq2)
+ {
+ auto r = static_cast<__half2_raw>(__hne2(x, y));
+ return r.data.x != 0 && r.data.y != 0;
+ }
+ inline
+ __host__ __device__
+ bool __hble2(__half2 x, __half2 y)
+ {
+ auto r = static_cast<__half2_raw>(__hle2(x, y));
+ return r.data.x != 0 && r.data.y != 0;
+ }
+ inline
+ __host__ __device__
+ bool __hbge2(__half2 x, __half2 y)
+ {
+ auto r = static_cast<__half2_raw>(__hge2(x, y));
+ return r.data.x != 0 && r.data.y != 0;
+ }
+ inline
+ __host__ __device__
+ bool __hblt2(__half2 x, __half2 y)
+ {
+ auto r = static_cast<__half2_raw>(__hlt2(x, y));
+ return r.data.x != 0 && r.data.y != 0;
+ }
+ inline
+ __host__ __device__
+ bool __hbgt2(__half2 x, __half2 y)
+ {
+ auto r = static_cast<__half2_raw>(__hgt2(x, y));
+ return r.data.x != 0 && r.data.y != 0;
+ }
+ inline // NOTE(review): each 'u' variant below only forwards to its non-'u' counterpart
+ __host__ __device__
+ bool __hbequ2(__half2 x, __half2 y) { return __hbeq2(x, y); }
+ inline
+ __host__ __device__
+ bool __hbneu2(__half2 x, __half2 y) { return __hbne2(x, y); }
+ inline
+ __host__ __device__
+ bool __hbleu2(__half2 x, __half2 y) { return __hble2(x, y); }
+ inline
+ __host__ __device__
+ bool __hbgeu2(__half2 x, __half2 y) { return __hbge2(x, y); }
+ inline
+ __host__ __device__
+ bool __hbltu2(__half2 x, __half2 y) { return __hblt2(x, y); }
+ inline
+ __host__ __device__
+ bool __hbgtu2(__half2 x, __half2 y) { return __hbgt2(x, y); }
+
+ // Arithmetic
+ inline
+ __device__
+ __half __clamp_01(__half x) // saturates x to [0, 1]; a NaN input yields 0 so *_sat results are never NaN
+ {
+ auto r = static_cast<__half_raw>(x);
+ // !(x >= 0) holds for negative values and for NaN, since every ordered comparison with NaN is false.
+ if (!__hge(x, __half_raw{0})) return __half_raw{0};
+ if (__hlt(__half_raw{1}, x)) return __half_raw{1};
+ return r;
+ }
+
+ inline
+ __device__
+ __half __hadd(__half x, __half y) // x + y on the _Float16 members
+ {
+ return __half_raw{
+ static_cast<__half_raw>(x).data +
+ static_cast<__half_raw>(y).data};
+ }
+ inline
+ __device__
+ __half __habs(__half x) // forwards to __ocml_fabs_f16
+ {
+ return __half_raw{
+ __ocml_fabs_f16(static_cast<__half_raw>(x).data)};
+ }
+ inline
+ __device__
+ __half __hsub(__half x, __half y) // x - y
+ {
+ return __half_raw{
+ static_cast<__half_raw>(x).data -
+ static_cast<__half_raw>(y).data};
+ }
+ inline
+ __device__
+ __half __hmul(__half x, __half y) // x * y
+ {
+ return __half_raw{
+ static_cast<__half_raw>(x).data *
+ static_cast<__half_raw>(y).data};
+ }
+ inline
+ __device__
+ __half __hadd_sat(__half x, __half y) // __hadd followed by __clamp_01
+ {
+ return __clamp_01(__hadd(x, y));
+ }
+ inline
+ __device__
+ __half __hsub_sat(__half x, __half y) // __hsub followed by __clamp_01
+ {
+ return __clamp_01(__hsub(x, y));
+ }
+ inline
+ __device__
+ __half __hmul_sat(__half x, __half y) // __hmul followed by __clamp_01
+ {
+ return __clamp_01(__hmul(x, y));
+ }
+ inline
+ __device__
+ __half __hfma(__half x, __half y, __half z) // forwards (x, y, z) to __ocml_fma_f16
+ {
+ return __half_raw{__ocml_fma_f16(
+ static_cast<__half_raw>(x).data,
+ static_cast<__half_raw>(y).data,
+ static_cast<__half_raw>(z).data)};
+ }
+ inline
+ __device__
+ __half __hfma_sat(__half x, __half y, __half z) // __hfma followed by __clamp_01
+ {
+ return __clamp_01(__hfma(x, y, z));
+ }
+ inline
+ __device__
+ __half __hdiv(__half x, __half y) // x / y
+ {
+ return __half_raw{
+ static_cast<__half_raw>(x).data /
+ static_cast<__half_raw>(y).data};
+ }
+
+ inline
+ __host__ __device__
+ __half2 __hadd2(__half2 x, __half2 y) // lane-wise x + y
+ {
+ return __half2_raw{
+ static_cast<__half2_raw>(x).data +
+ static_cast<__half2_raw>(y).data};
+ }
+ inline
+ __host__ __device__
+ __half2 __habs2(__half2 x) // forwards to __ocml_fabs_2f16
+ {
+ return __half2_raw{
+ __ocml_fabs_2f16(static_cast<__half2_raw>(x).data)};
+ }
+ inline
+ __host__ __device__
+ __half2 __hsub2(__half2 x, __half2 y) // lane-wise x - y
+ {
+ return __half2_raw{
+ static_cast<__half2_raw>(x).data -
+ static_cast<__half2_raw>(y).data};
+ }
+ inline
+ __host__ __device__
+ __half2 __hmul2(__half2 x, __half2 y) // lane-wise x * y
+ {
+ return __half2_raw{
+ static_cast<__half2_raw>(x).data *
+ static_cast<__half2_raw>(y).data};
+ }
+ inline
+ __host__ __device__ // NOTE(review): marked __host__ too, yet the body calls __clamp_01, which is declared __device__ only
+ __half2 __hadd2_sat(__half2 x, __half2 y)
+ {
+ auto r = static_cast<__half2_raw>(__hadd2(x, y));
+ return __half2{
+ __clamp_01(__half_raw{r.data.x}),
+ __clamp_01(__half_raw{r.data.y})}; // each lane clamped separately
+ }
+ inline
+ __host__ __device__
+ __half2 __hsub2_sat(__half2 x, __half2 y)
+ {
+ auto r = static_cast<__half2_raw>(__hsub2(x, y));
+ return __half2{
+ __clamp_01(__half_raw{r.data.x}),
+ __clamp_01(__half_raw{r.data.y})};
+ }
+ inline
+ __host__ __device__
+ __half2 __hmul2_sat(__half2 x, __half2 y)
+ {
+ auto r = static_cast<__half2_raw>(__hmul2(x, y));
+ return __half2{
+ __clamp_01(__half_raw{r.data.x}),
+ __clamp_01(__half_raw{r.data.y})};
+ }
+ inline
+ __host__ __device__
+ __half2 __hfma2(__half2 x, __half2 y, __half2 z)
+ {
+ return __half2_raw{__ocml_fma_2f16(x, y, z)}; // __half2 arguments are passed directly (implicit conversion)
+ }
+ inline
+ __host__ __device__
+ __half2 __hfma2_sat(__half2 x, __half2 y, __half2 z)
+ {
+ auto r = static_cast<__half2_raw>(__hfma2(x, y, z));
+ return __half2{
+ __clamp_01(__half_raw{r.data.x}),
+ __clamp_01(__half_raw{r.data.y})};
+ }
+ inline
+ __host__ __device__
+ __half2 __h2div(__half2 x, __half2 y) // lane-wise x / y
+ {
+ return __half2_raw{
+ static_cast<__half2_raw>(x).data /
+ static_cast<__half2_raw>(y).data};
+ }
+
+ // Math functions
+ #if (__hcc_workweek__ >= 19015) || __HIP_CLANG_ONLY__
+ inline
+ __device__
+ float amd_mixed_dot(__half2 a, __half2 b, float c, bool saturate) { // thin wrapper: forwards all four arguments to __ockl_fdot2
+ return __ockl_fdot2(static_cast<__half2_raw>(a).data,
+ static_cast<__half2_raw>(b).data,
+ c, saturate);
+ }
+ #endif
+ inline // scalar math: each function below unwraps the _Float16, calls one library routine and rewraps the result
+ __device__
+ __half htrunc(__half x) // __ocml_trunc_f16
+ {
+ return __half_raw{
+ __ocml_trunc_f16(static_cast<__half_raw>(x).data)};
+ }
+ inline
+ __device__
+ __half hceil(__half x) // __ocml_ceil_f16
+ {
+ return __half_raw{
+ __ocml_ceil_f16(static_cast<__half_raw>(x).data)};
+ }
+ inline
+ __device__
+ __half hfloor(__half x) // __ocml_floor_f16
+ {
+ return __half_raw{
+ __ocml_floor_f16(static_cast<__half_raw>(x).data)};
+ }
+ inline
+ __device__
+ __half hrint(__half x) // __ocml_rint_f16
+ {
+ return __half_raw{
+ __ocml_rint_f16(static_cast<__half_raw>(x).data)};
+ }
+ inline
+ __device__
+ __half hsin(__half x) // __ocml_sin_f16
+ {
+ return __half_raw{
+ __ocml_sin_f16(static_cast<__half_raw>(x).data)};
+ }
+ inline
+ __device__
+ __half hcos(__half x) // __ocml_cos_f16
+ {
+ return __half_raw{
+ __ocml_cos_f16(static_cast<__half_raw>(x).data)};
+ }
+ inline
+ __device__
+ __half hexp(__half x) // __ocml_exp_f16
+ {
+ return __half_raw{
+ __ocml_exp_f16(static_cast<__half_raw>(x).data)};
+ }
+ inline
+ __device__
+ __half hexp2(__half x) // __ocml_exp2_f16
+ {
+ return __half_raw{
+ __ocml_exp2_f16(static_cast<__half_raw>(x).data)};
+ }
+ inline
+ __device__
+ __half hexp10(__half x) // __ocml_exp10_f16
+ {
+ return __half_raw{
+ __ocml_exp10_f16(static_cast<__half_raw>(x).data)};
+ }
+ inline
+ __device__
+ __half hlog2(__half x) // __ocml_log2_f16
+ {
+ return __half_raw{
+ __ocml_log2_f16(static_cast<__half_raw>(x).data)};
+ }
+ inline
+ __device__
+ __half hlog(__half x) // __ocml_log_f16
+ {
+ return __half_raw{
+ __ocml_log_f16(static_cast<__half_raw>(x).data)};
+ }
+ inline
+ __device__
+ __half hlog10(__half x) // __ocml_log10_f16
+ {
+ return __half_raw{
+ __ocml_log10_f16(static_cast<__half_raw>(x).data)};
+ }
+ inline
+ __device__
+ __half hrcp(__half x) // __llvm_amdgcn_rcp_f16 (the only routine here without the __ocml_ prefix)
+ {
+ return __half_raw{
+ __llvm_amdgcn_rcp_f16(static_cast<__half_raw>(x).data)};
+ }
+ inline
+ __device__
+ __half hrsqrt(__half x) // __ocml_rsqrt_f16
+ {
+ return __half_raw{
+ __ocml_rsqrt_f16(static_cast<__half_raw>(x).data)};
+ }
+ inline
+ __device__
+ __half hsqrt(__half x) // __ocml_sqrt_f16
+ {
+ return __half_raw{
+ __ocml_sqrt_f16(static_cast<__half_raw>(x).data)};
+ }
+ inline
+ __device__
+ bool __hisinf(__half x) // result of __ocml_isinf_f16 converted to bool
+ {
+ return __ocml_isinf_f16(static_cast<__half_raw>(x).data);
+ }
+ inline
+ __device__
+ bool __hisnan(__half x) // result of __ocml_isnan_f16 converted to bool
+ {
+ return __ocml_isnan_f16(static_cast<__half_raw>(x).data);
+ }
+ inline
+ __device__
+ __half __hneg(__half x) // unary minus on the _Float16 member
+ {
+ return __half_raw{-static_cast<__half_raw>(x).data};
+ }
+
+ inline // vector math: x is passed directly to the two-lane library routine and the result wrapped in __half2_raw
+ __host__ __device__
+ __half2 h2trunc(__half2 x) // __ocml_trunc_2f16
+ {
+ return __half2_raw{__ocml_trunc_2f16(x)};
+ }
+ inline
+ __host__ __device__
+ __half2 h2ceil(__half2 x) // __ocml_ceil_2f16
+ {
+ return __half2_raw{__ocml_ceil_2f16(x)};
+ }
+ inline
+ __host__ __device__
+ __half2 h2floor(__half2 x) // __ocml_floor_2f16
+ {
+ return __half2_raw{__ocml_floor_2f16(x)};
+ }
+ inline
+ __host__ __device__
+ __half2 h2rint(__half2 x) // __ocml_rint_2f16
+ {
+ return __half2_raw{__ocml_rint_2f16(x)};
+ }
+ inline
+ __host__ __device__
+ __half2 h2sin(__half2 x) // __ocml_sin_2f16
+ {
+ return __half2_raw{__ocml_sin_2f16(x)};
+ }
+ inline
+ __host__ __device__
+ __half2 h2cos(__half2 x) // __ocml_cos_2f16
+ {
+ return __half2_raw{__ocml_cos_2f16(x)};
+ }
+ inline
+ __host__ __device__
+ __half2 h2exp(__half2 x) // __ocml_exp_2f16
+ {
+ return __half2_raw{__ocml_exp_2f16(x)};
+ }
+ inline
+ __host__ __device__
+ __half2 h2exp2(__half2 x) // __ocml_exp2_2f16
+ {
+ return __half2_raw{__ocml_exp2_2f16(x)};
+ }
+ inline
+ __host__ __device__
+ __half2 h2exp10(__half2 x) // __ocml_exp10_2f16
+ {
+ return __half2_raw{__ocml_exp10_2f16(x)};
+ }
+ inline
+ __host__ __device__
+ __half2 h2log2(__half2 x) // __ocml_log2_2f16
+ {
+ return __half2_raw{__ocml_log2_2f16(x)};
+ }
+ inline // the five wrappers below wrap the library result in __half2_raw before returning it
+ __host__ __device__
+ __half2 h2log(__half2 x) { return __half2_raw{__ocml_log_2f16(x)}; }
+ inline
+ __host__ __device__
+ __half2 h2log10(__half2 x) { return __half2_raw{__ocml_log10_2f16(x)}; }
+ inline
+ __host__ __device__
+ __half2 h2rcp(__half2 x) { return __half2_raw{__llvm_amdgcn_rcp_2f16(x)}; }
+ inline
+ __host__ __device__
+ __half2 h2rsqrt(__half2 x) { return __half2_raw{__ocml_rsqrt_2f16(x)}; }
+ inline
+ __host__ __device__
+ __half2 h2sqrt(__half2 x) { return __half2_raw{__ocml_sqrt_2f16(x)}; }
+ inline
+ __host__ __device__
+ __half2 __hisinf2(__half2 x) // per-lane result of __ocml_isinf_2f16, converted to _Float16
+ {
+ auto r = __ocml_isinf_2f16(x);
+ return __half2_raw{_Float16_2{
+ static_cast<_Float16>(r.x), static_cast<_Float16>(r.y)}};
+ }
+ inline
+ __host__ __device__
+ __half2 __hisnan2(__half2 x) // per-lane result of __ocml_isnan_2f16, converted to _Float16
+ {
+ auto r = __ocml_isnan_2f16(x);
+ return __half2_raw{_Float16_2{
+ static_cast<_Float16>(r.x), static_cast<_Float16>(r.y)}};
+ }
+ inline
+ __host__ __device__
+ __half2 __hneg2(__half2 x) // unary minus applied to the whole vector
+ {
+ return __half2_raw{-static_cast<__half2_raw>(x).data};
+ }
+ } // Anonymous namespace.
+
+ #if !defined(HIP_NO_HALF)
+ using half = __half;
+ using half2 = __half2;
+ #endif
+ #endif // defined(__cplusplus)
+#elif defined(__GNUC__)
+ #include "hip_fp16_gcc.h"
+#endif // !defined(__clang__) && defined(__GNUC__)
+
+#endif // HIP_INCLUDE_HIP_HCC_DETAIL_HIP_FP16_H
diff --git a/third_party/rocm/include/hip/hcc_detail/hip_fp16_gcc.h b/third_party/rocm/include/hip/hcc_detail/hip_fp16_gcc.h
new file mode 100644
index 0000000..480fd81
--- /dev/null
+++ b/third_party/rocm/include/hip/hcc_detail/hip_fp16_gcc.h
@@ -0,0 +1,254 @@
+#pragma once
+
+#if defined(__cplusplus)
+ #include <cstring>
+#endif
+
+struct __half_raw { // plain 16-bit storage; declared outside the __cplusplus guard
+ unsigned short x;
+};
+
+struct __half2_raw { // two 16-bit storage fields, x first
+ unsigned short x;
+ unsigned short y;
+};
+
+#if defined(__cplusplus)
+ struct __half;
+
+ __half __float2half(float);
+ float __half2float(__half);
+
+ // BEGIN STRUCT __HALF
+ struct __half { // host-side half type: a 16-bit pattern converted through __float2half / __half2float
+ protected:
+ unsigned short __x;
+ public:
+ // CREATORS
+ __half() = default;
+ __half(const __half_raw& x) : __x(x.x) {}
+ #if !defined(__HIP_NO_HALF_CONVERSIONS__)
+ __half(float x) : __x(__float2half(x).__x) {}
+ __half(double x) : __x(__float2half(static_cast<float>(x)).__x) {}
+ #endif
+ __half(const __half&) = default;
+ __half(__half&&) = default;
+ ~__half() = default;
+
+ // MANIPULATORS
+ __half& operator=(const __half&) = default;
+ __half& operator=(__half&&) = default;
+ __half& operator=(const __half_raw& x) { __x = x.x; return *this; }
+ #if !defined(__HIP_NO_HALF_CONVERSIONS__)
+ __half& operator=(float x)
+ {
+ __x = __float2half(x).__x;
+ return *this;
+ }
+ __half& operator=(double x) // narrows to float, then reuses the float overload
+ {
+ return operator=(static_cast<float>(x));
+ }
+ #endif
+
+ // ACCESSORS
+ operator float() const { return __half2float(*this); }
+ operator __half_raw() const { return __half_raw{__x}; }
+ };
+ // END STRUCT __HALF
+
+ // BEGIN STRUCT __HALF2
+ struct __half2 { // host-side pair of __half values
+ protected:
+ __half x;
+ __half y;
+ public:
+ // CREATORS
+ __half2() = default;
+ __half2(const __half2_raw& ix)
+ :
+ x{reinterpret_cast<const __half&>(ix.x)}, // views the unsigned short field as a __half
+ y{reinterpret_cast<const __half&>(ix.y)}
+ {}
+ __half2(const __half& ix, const __half& iy) : x{ix}, y{iy} {}
+ __half2(const __half2&) = default;
+ __half2(__half2&&) = default;
+ ~__half2() = default;
+
+ // MANIPULATORS
+ __half2& operator=(const __half2&) = default;
+ __half2& operator=(__half2&&) = default;
+ __half2& operator=(const __half2_raw& ix)
+ {
+ x = reinterpret_cast<const __half_raw&>(ix.x); // views the field as __half_raw, then assigns
+ y = reinterpret_cast<const __half_raw&>(ix.y);
+ return *this;
+ }
+
+ // ACCESSORS
+ operator __half2_raw() const
+ {
+ return __half2_raw{
+ reinterpret_cast<const unsigned short&>(x), // views each __half member as unsigned short
+ reinterpret_cast<const unsigned short&>(y)};
+ }
+ };
+ // END STRUCT __HALF2
+
+ inline // float -> 16-bit pattern without final rounding; outputs sgn (sign in bit 15) and rem (discarded bits)
+ unsigned short __internal_float2half(
+ float flt, unsigned int& sgn, unsigned int& rem)
+ {
+ unsigned int x{};
+ std::memcpy(&x, &flt, sizeof(flt)); // copy the float's object representation into x
+
+ unsigned int u = (x & 0x7fffffffU); // magnitude bits (sign cleared)
+ sgn = ((x >> 16) & 0x8000U); // float sign bit moved to bit 15
+
+ // NaN/+Inf/-Inf
+ if (u >= 0x7f800000U) {
+ rem = 0;
+ return static_cast<unsigned short>(
+ (u == 0x7f800000U) ? (sgn | 0x7c00U) : 0x7fffU); // signed infinity, or 0x7fff for NaN
+ }
+ // Overflows
+ if (u > 0x477fefffU) {
+ rem = 0x80000000U;
+ return static_cast<unsigned short>(sgn | 0x7bffU); // callers may increment this, turning 0x7bff into 0x7c00
+ }
+ // Normal numbers
+ if (u >= 0x38800000U) {
+ rem = u << 19; // the 13 dropped mantissa bits, left-aligned
+ u -= 0x38000000U; // exponent rebias
+ return static_cast<unsigned short>(sgn | (u >> 13));
+ }
+ // +0/-0
+ if (u < 0x33000001U) { // also taken for nonzero magnitudes below this threshold
+ rem = u;
+ return static_cast<unsigned short>(sgn);
+ }
+ // Denormal numbers
+ unsigned int exponent = u >> 23;
+ unsigned int mantissa = (u & 0x7fffffU);
+ unsigned int shift = 0x7eU - exponent;
+ mantissa |= 0x800000U; // restore the implicit leading 1
+ rem = mantissa << (32 - shift); // bits shifted out below, left-aligned
+ return static_cast<unsigned short>(sgn | (mantissa >> shift));
+ }
+
+ inline
+ __half __float2half(float x)
+ {
+ unsigned int sign{};
+ unsigned int remainder{};
+ __half_raw bits;
+ bits.x = __internal_float2half(x, sign, remainder);
+ const bool odd_tie = remainder == 0x80000000U && (bits.x & 0x1);
+ if (remainder > 0x80000000U || odd_tie) ++bits.x;
+ return bits;
+ }
+
+ inline
+ __half __float2half_rn(float x) { return __float2half(x); }
+
+ inline
+ __half __float2half_rz(float x)
+ {
+ unsigned int sign{};
+ unsigned int remainder{};
+ __half_raw bits;
+ // The helper's result is returned without any adjustment.
+ bits.x = __internal_float2half(x, sign, remainder);
+ return bits;
+ }
+
+ inline
+ __half __float2half_rd(float x)
+ {
+ unsigned int sign{};
+ unsigned int remainder{};
+ __half_raw bits;
+ bits.x = __internal_float2half(x, sign, remainder);
+ // Negative input with discarded bits: step the pattern up by one.
+ if (sign && remainder) ++bits.x;
+ return bits;
+ }
+
+ inline
+ __half __float2half_ru(float x)
+ {
+ unsigned int sign{};
+ unsigned int remainder{};
+ __half_raw bits;
+ bits.x = __internal_float2half(x, sign, remainder);
+ // Non-negative input with discarded bits: step the pattern up by one.
+ if (!sign && remainder) ++bits.x;
+ return bits;
+ }
+
+ inline
+ __half2 __float2half2_rn(float x) // converts x with __float2half_rn and stores it in both members
+ {
+ return __half2{__float2half_rn(x), __float2half_rn(x)};
+ }
+
+ inline
+ __half2 __floats2half2_rn(float x, float y) // converts x and y separately with __float2half_rn
+ {
+ return __half2{__float2half_rn(x), __float2half_rn(y)};
+ }
+
+ inline // expands a 16-bit half pattern (1 sign, 5 exponent, 10 mantissa bits) into a float
+ float __internal_half2float(unsigned short x)
+ {
+ unsigned int sign = ((x >> 15) & 1);
+ unsigned int exponent = ((x >> 10) & 0x1f);
+ unsigned int mantissa = ((x & 0x3ff) << 13); // aligned to the 23-bit float mantissa
+
+ if (exponent == 0x1fU) { /* NaN or Inf */
+ mantissa = (mantissa ? (sign = 0, 0x7fffffU) : 0); // NaN: sign cleared, mantissa all ones
+ exponent = 0xffU;
+ } else if (!exponent) { /* Denorm or Zero */
+ if (mantissa) {
+ unsigned int msb;
+ exponent = 0x71U;
+ do {
+ msb = (mantissa & 0x400000U);
+ mantissa <<= 1; /* normalize */
+ --exponent;
+ } while (!msb);
+ mantissa &= 0x7fffffU; /* 1.mantissa is implicit */
+ }
+ } else {
+ exponent += 0x70U; // rebias: 127 - 15
+ }
+ unsigned int u = ((sign << 31) | (exponent << 23) | mantissa);
+ float f;
+ std::memcpy(&f, &u, sizeof(u)); // qualified: <cstring> only guarantees the std:: name
+
+ return f;
+ }
+
+ inline
+ float __half2float(__half x) // decodes the 16-bit pattern obtained via __half_raw
+ {
+ return __internal_half2float(static_cast<__half_raw>(x).x);
+ }
+
+ inline
+ float __low2float(__half2 x) // decodes the .x field of the raw pair
+ {
+ return __internal_half2float(static_cast<__half2_raw>(x).x);
+ }
+
+ inline
+ float __high2float(__half2 x) // decodes the .y field of the raw pair
+ {
+ return __internal_half2float(static_cast<__half2_raw>(x).y);
+ }
+
+ #if !defined(HIP_NO_HALF)
+ using half = __half;
+ using half2 = __half2;
+ #endif
+#endif // defined(__cplusplus)
diff --git a/third_party/rocm/include/hip/hcc_detail/hip_fp16_math_fwd.h b/third_party/rocm/include/hip/hcc_detail/hip_fp16_math_fwd.h
new file mode 100644
index 0000000..53a2c66
--- /dev/null
+++ b/third_party/rocm/include/hip/hcc_detail/hip_fp16_math_fwd.h
@@ -0,0 +1,86 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+//
+// Half Math Functions
+//
+
+#include "host_defines.h"
+#if !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
+extern "C"
+{
+ __device__ __attribute__((const)) _Float16 __ocml_ceil_f16(_Float16);
+ __device__ _Float16 __ocml_cos_f16(_Float16);
+ __device__ __attribute__((pure)) _Float16 __ocml_exp_f16(_Float16);
+ __device__ __attribute__((pure)) _Float16 __ocml_exp10_f16(_Float16);
+ __device__ __attribute__((pure)) _Float16 __ocml_exp2_f16(_Float16);
+ __device__ __attribute__((const)) _Float16 __ocml_floor_f16(_Float16);
+ __device__ __attribute__((const))
+ _Float16 __ocml_fma_f16(_Float16, _Float16, _Float16);
+ __device__ __attribute__((const)) _Float16 __ocml_fabs_f16(_Float16);
+ __device__ __attribute__((const)) int __ocml_isinf_f16(_Float16);
+ __device__ __attribute__((const)) int __ocml_isnan_f16(_Float16);
+ __device__ __attribute__((pure)) _Float16 __ocml_log_f16(_Float16);
+ __device__ __attribute__((pure)) _Float16 __ocml_log10_f16(_Float16);
+ __device__ __attribute__((pure)) _Float16 __ocml_log2_f16(_Float16);
+ __device__ __attribute__((pure)) _Float16 __ocml_pown_f16(_Float16, int);
+ __device__ __attribute__((const)) _Float16 __llvm_amdgcn_rcp_f16(_Float16);
+ __device__ __attribute__((const)) _Float16 __ocml_rint_f16(_Float16);
+ __device__ __attribute__((const)) _Float16 __ocml_rsqrt_f16(_Float16);
+ __device__ _Float16 __ocml_sin_f16(_Float16);
+ __device__ __attribute__((const)) _Float16 __ocml_sqrt_f16(_Float16);
+ __device__ __attribute__((const)) _Float16 __ocml_trunc_f16(_Float16);
+
+ typedef _Float16 __2f16 __attribute__((ext_vector_type(2)));
+ typedef short __2i16 __attribute__((ext_vector_type(2)));
+
+ #if (__hcc_workweek__ >= 19015) || __HIP_CLANG_ONLY__
+ __device__ __attribute__((const)) float __ockl_fdot2(__2f16 a, __2f16 b, float c, bool s);
+ #endif
+
+ __device__ __attribute__((const)) __2f16 __ocml_ceil_2f16(__2f16);
+ __device__ __attribute__((const)) __2f16 __ocml_fabs_2f16(__2f16);
+ __device__ __2f16 __ocml_cos_2f16(__2f16);
+ __device__ __attribute__((pure)) __2f16 __ocml_exp_2f16(__2f16);
+ __device__ __attribute__((pure)) __2f16 __ocml_exp10_2f16(__2f16);
+ __device__ __attribute__((pure)) __2f16 __ocml_exp2_2f16(__2f16);
+ __device__ __attribute__((const)) __2f16 __ocml_floor_2f16(__2f16);
+ __device__ __attribute__((const)) __2f16 __ocml_fma_2f16(__2f16, __2f16, __2f16);
+ __device__ __attribute__((const)) __2i16 __ocml_isinf_2f16(__2f16);
+ __device__ __attribute__((const)) __2i16 __ocml_isnan_2f16(__2f16);
+ __device__ __attribute__((pure)) __2f16 __ocml_log_2f16(__2f16);
+ __device__ __attribute__((pure)) __2f16 __ocml_log10_2f16(__2f16);
+ __device__ __attribute__((pure)) __2f16 __ocml_log2_2f16(__2f16);
+ __device__ inline // two-lane reciprocal assembled from the scalar routine
+ __2f16 __llvm_amdgcn_rcp_2f16(__2f16 x) // Not currently exposed by ROCDL.
+ {
+ return __2f16{__llvm_amdgcn_rcp_f16(x.x), __llvm_amdgcn_rcp_f16(x.y)}; // applied to .x and .y separately
+ }
+ __device__ __attribute__((const)) __2f16 __ocml_rint_2f16(__2f16);
+ __device__ __attribute__((const)) __2f16 __ocml_rsqrt_2f16(__2f16);
+ __device__ __2f16 __ocml_sin_2f16(__2f16);
+ __device__ __attribute__((const)) __2f16 __ocml_sqrt_2f16(__2f16);
+ __device__ __attribute__((const)) __2f16 __ocml_trunc_2f16(__2f16);
+}
+#endif // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
diff --git a/third_party/rocm/include/hip/hcc_detail/hip_ldg.h b/third_party/rocm/include/hip/hcc_detail/hip_ldg.h
new file mode 100644
index 0000000..ab86955
--- /dev/null
+++ b/third_party/rocm/include/hip/hcc_detail/hip_ldg.h
@@ -0,0 +1,103 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_LDG_H
+#define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_LDG_H
+
+#if defined(__HCC_OR_HIP_CLANG__)
+#if __hcc_workweek__ >= 16164 || __HIP_CLANG_ONLY__
+#include "hip_vector_types.h"
+#include "host_defines.h"
+
+// Overloads of __ldg for scalar and vector element types. Every overload here
+// is a plain dereference of its pointer argument and returns the value by copy.
+// char family
+__device__ static inline char __ldg(const char* ptr) { return *ptr; }
+
+__device__ static inline char2 __ldg(const char2* ptr) { return *ptr; }
+
+__device__ static inline char4 __ldg(const char4* ptr) { return *ptr; }
+
+__device__ static inline signed char __ldg(const signed char* ptr) { return *ptr; }
+
+__device__ static inline unsigned char __ldg(const unsigned char* ptr) { return *ptr; }
+
+// short family
+__device__ static inline short __ldg(const short* ptr) { return *ptr; }
+
+__device__ static inline short2 __ldg(const short2* ptr) { return *ptr; }
+
+__device__ static inline short4 __ldg(const short4* ptr) { return *ptr; }
+
+__device__ static inline unsigned short __ldg(const unsigned short* ptr) { return *ptr; }
+
+// int family
+__device__ static inline int __ldg(const int* ptr) { return *ptr; }
+
+__device__ static inline int2 __ldg(const int2* ptr) { return *ptr; }
+
+__device__ static inline int4 __ldg(const int4* ptr) { return *ptr; }
+
+__device__ static inline unsigned int __ldg(const unsigned int* ptr) { return *ptr; }
+
+// long family
+__device__ static inline long __ldg(const long* ptr) { return *ptr; }
+
+__device__ static inline unsigned long __ldg(const unsigned long* ptr) { return *ptr; }
+
+// long long family
+__device__ static inline long long __ldg(const long long* ptr) { return *ptr; }
+
+__device__ static inline longlong2 __ldg(const longlong2* ptr) { return *ptr; }
+
+__device__ static inline unsigned long long __ldg(const unsigned long long* ptr) { return *ptr; }
+
+// unsigned vector types
+__device__ static inline uchar2 __ldg(const uchar2* ptr) { return *ptr; }
+
+__device__ static inline uchar4 __ldg(const uchar4* ptr) { return *ptr; }
+
+__device__ static inline ushort2 __ldg(const ushort2* ptr) { return *ptr; }
+
+__device__ static inline uint2 __ldg(const uint2* ptr) { return *ptr; }
+
+__device__ static inline uint4 __ldg(const uint4* ptr) { return *ptr; }
+
+__device__ static inline ulonglong2 __ldg(const ulonglong2* ptr) { return *ptr; }
+
+// float family
+__device__ static inline float __ldg(const float* ptr) { return *ptr; }
+
+__device__ static inline float2 __ldg(const float2* ptr) { return *ptr; }
+
+__device__ static inline float4 __ldg(const float4* ptr) { return *ptr; }
+
+// double family
+__device__ static inline double __ldg(const double* ptr) { return *ptr; }
+
+__device__ static inline double2 __ldg(const double2* ptr) { return *ptr; }
+
+#endif // __hcc_workweek__ || __HIP_CLANG_ONLY__
+
+#endif // defined(__HCC_OR_HIP_CLANG__)
+
+#endif // HIP_INCLUDE_HIP_HCC_DETAIL_HIP_LDG_H
diff --git a/third_party/rocm/include/hip/hcc_detail/hip_memory.h b/third_party/rocm/include/hip/hcc_detail/hip_memory.h
new file mode 100644
index 0000000..0c00614
--- /dev/null
+++ b/third_party/rocm/include/hip/hcc_detail/hip_memory.h
@@ -0,0 +1,114 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_MEMORY_H
+#define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_MEMORY_H
+
+// Implementation of malloc and free device functions.
+// HIP heap is implemented as a global array with fixed size. Users may define
+// __HIP_SIZE_OF_PAGE and __HIP_NUM_PAGES to have a larger heap.
+
+#if (__HCC__ || __HIP__) && __HIP_ENABLE_DEVICE_MALLOC__
+
+// Size of one heap page in bytes (default 64); may be defined by the user beforehand.
+#ifndef __HIP_SIZE_OF_PAGE
+#define __HIP_SIZE_OF_PAGE 64
+#endif
+
+// Total number of pages in the heap (default 16 * 64 * 64 = 65536); also user-overridable.
+#ifndef __HIP_NUM_PAGES
+#define __HIP_NUM_PAGES (16 * 64 * 64)
+#endif
+// Heap size in bytes: pages * page size (65536 * 64 = 4 MiB with the defaults).
+#define __HIP_SIZE_OF_HEAP (__HIP_NUM_PAGES * __HIP_SIZE_OF_PAGE)
+
+#if __HIP__ && __HIP_DEVICE_COMPILE__  // both macros set: define the heap storage here
+__attribute__((weak)) __device__ char __hip_device_heap[__HIP_SIZE_OF_HEAP];  // backing bytes handed out by __hip_malloc
+__attribute__((weak)) __device__
+ uint32_t __hip_device_page_flag[__HIP_NUM_PAGES];  // one entry per page: 0 = free, 1 = in use, 2 = last page of an allocation
+#else  // otherwise only declare the two arrays (unsized, defined elsewhere)
+extern __device__ char __hip_device_heap[];
+extern __device__ uint32_t __hip_device_page_flag[];
+#endif
+
+// Device-side malloc over the fixed-size HIP heap. Pages are split evenly between
+// all work-items of the launch; each work-item takes whole pages from its own
+// slice (first fit). Page flags: 0 = free, 1 = in use, 2 = last page in use.
+// Returns nullptr when size is 0 or no run of free pages in the slice fits it.
+extern "C" inline __device__ void* __hip_malloc(size_t size) {
+    if (size == 0 || size > __HIP_SIZE_OF_HEAP) {
+        return (void*)nullptr;
+    }
+    // Full grid extents in x and y give every work-item a unique linear id.
+    uint32_t gridWidth = hipBlockDim_x * hipGridDim_x;
+    uint32_t gridHeight = hipBlockDim_y * hipGridDim_y;
+    uint32_t totalThreads = gridWidth * gridHeight * hipBlockDim_z * hipGridDim_z;
+    uint32_t currentWorkItem = (hipThreadIdx_x + hipBlockDim_x * hipBlockIdx_x)
+        + (hipThreadIdx_y + hipBlockDim_y * hipBlockIdx_y) * gridWidth
+        + (hipThreadIdx_z + hipBlockDim_z * hipBlockIdx_z) * gridWidth * gridHeight;
+
+    uint32_t pagesPerWorkItem = __HIP_NUM_PAGES / totalThreads;
+    // Round up so a request that is not a multiple of the page size is covered.
+    uint32_t stride = (uint32_t)((size + __HIP_SIZE_OF_PAGE - 1) / __HIP_SIZE_OF_PAGE);
+    if (stride > pagesPerWorkItem) {
+        return (void*)nullptr;
+    }
+    uint32_t start = pagesPerWorkItem * currentWorkItem;
+    // First fit: find `stride` consecutive free pages inside this slice only.
+    uint32_t freeRun = 0;
+    for (uint32_t page = start; page < start + pagesPerWorkItem; page++) {
+        freeRun = (__hip_device_page_flag[page] == 0) ? freeRun + 1 : 0;
+        if (freeRun < stride) continue;
+        uint32_t first = page + 1 - stride;
+        for (uint32_t i = first; i < page; i++) __hip_device_page_flag[i] = 1;
+        __hip_device_page_flag[page] = 2;
+        // Page-aligned address, so __hip_free maps it back to page `first`.
+        return (void*)((char*)__hip_device_heap + (size_t)first * __HIP_SIZE_OF_PAGE);
+    }
+    return (void*)nullptr;
+}
+
+// Device-side free: releases the pages of the __hip_malloc allocation that starts
+// at ptr. Null and non-heap pointers are ignored. Always returns nullptr.
+extern "C" inline __device__ void* __hip_free(void* ptr) {
+    uint64_t heapBase = (uint64_t)__hip_device_heap;
+    uint64_t address = (uint64_t)ptr;
+    // Reject pointers outside the heap so the flag array is never indexed out of bounds.
+    if (ptr == nullptr || address < heapBase
+        || address - heapBase >= (uint64_t)__HIP_SIZE_OF_HEAP) {
+        return nullptr;
+    }
+
+    uint32_t offsetPage = (uint32_t)((address - heapBase) / __HIP_SIZE_OF_PAGE);
+    // Clear flags up to and including the page marked 2 (end of the allocation);
+    // a page already marked 0 or the end of the flag array also ends the walk.
+    while (offsetPage < __HIP_NUM_PAGES && __hip_device_page_flag[offsetPage] != 0) {
+        bool isLastPage = (__hip_device_page_flag[offsetPage] == 2);
+        __hip_device_page_flag[offsetPage++] = 0;
+        if (isLastPage) break;
+    }
+    return nullptr;
+}
+
+#endif
+
+#endif // HIP_INCLUDE_HIP_HCC_DETAIL_HIP_MEMORY_H
diff --git a/third_party/rocm/include/hip/hcc_detail/hip_prof_str.h b/third_party/rocm/include/hip/hcc_detail/hip_prof_str.h
new file mode 100644
index 0000000..cb297b2
--- /dev/null
+++ b/third_party/rocm/include/hip/hcc_detail/hip_prof_str.h
@@ -0,0 +1,5127 @@
+// automatically generated sources
+#ifndef _HIP_PROF_STR_H
+#define _HIP_PROF_STR_H
+#define HIP_PROF_VER 1
+
+// Dummy API primitives: each initializer below ignores cb_data and expands to an empty block; the same names alias HIP_API_ID_NUMBER in hip_api_id_t.
+#define INIT_NONE_CB_ARGS_DATA(cb_data) {};
+#define INIT_hipTexRefGetAddress_CB_ARGS_DATA(cb_data) {};
+#define INIT_hipTexRefSetBorderColor_CB_ARGS_DATA(cb_data) {};
+#define INIT_hipMemcpyDtoA_CB_ARGS_DATA(cb_data) {};
+#define INIT_hipArrayGetDescriptor_CB_ARGS_DATA(cb_data) {};
+#define INIT_hipTexObjectGetResourceViewDesc_CB_ARGS_DATA(cb_data) {};
+#define INIT_hipMemcpyAtoHAsync_CB_ARGS_DATA(cb_data) {};
+#define INIT_hipDestroyTextureObject_CB_ARGS_DATA(cb_data) {};
+#define INIT_hipArray3DGetDescriptor_CB_ARGS_DATA(cb_data) {};
+#define INIT_hipTexRefSetAddress_CB_ARGS_DATA(cb_data) {};
+#define INIT_hipArrayDestroy_CB_ARGS_DATA(cb_data) {};
+#define INIT_hipTexRefGetMaxAnisotropy_CB_ARGS_DATA(cb_data) {};
+#define INIT_hipTexRefSetMipmapFilterMode_CB_ARGS_DATA(cb_data) {};
+#define INIT_hipDeviceGetCount_CB_ARGS_DATA(cb_data) {};
+#define INIT_hipMemcpyArrayToArray_CB_ARGS_DATA(cb_data) {};
+#define INIT_hipBindTexture2D_CB_ARGS_DATA(cb_data) {};
+#define INIT_hipCreateTextureObject_CB_ARGS_DATA(cb_data) {};
+#define INIT_hipMemcpyHtoAAsync_CB_ARGS_DATA(cb_data) {};
+#define INIT_hipMemcpyAtoA_CB_ARGS_DATA(cb_data) {};
+#define INIT_hipMemcpyAtoD_CB_ARGS_DATA(cb_data) {};
+#define INIT_hipBindTextureToMipmappedArray_CB_ARGS_DATA(cb_data) {};
+#define INIT_hipTexRefGetMipmapLevelClamp_CB_ARGS_DATA(cb_data) {};
+#define INIT_hipBindTextureToArray_CB_ARGS_DATA(cb_data) {};
+#define INIT_hipTexRefSetFlags_CB_ARGS_DATA(cb_data) {};
+#define INIT_hipTexRefSetFormat_CB_ARGS_DATA(cb_data) {};
+#define INIT_hipTexObjectGetTextureDesc_CB_ARGS_DATA(cb_data) {};
+#define INIT_hipTexObjectDestroy_CB_ARGS_DATA(cb_data) {};
+#define INIT_hipMemcpy2DArrayToArray_CB_ARGS_DATA(cb_data) {};
+#define INIT_hipTexRefGetArray_CB_ARGS_DATA(cb_data) {};
+#define INIT_hipGetTextureReference_CB_ARGS_DATA(cb_data) {};
+#define INIT_hipMipmappedArrayDestroy_CB_ARGS_DATA(cb_data) {};
+#define INIT_hipTexRefGetFilterMode_CB_ARGS_DATA(cb_data) {};
+#define INIT_hipTexRefGetFormat_CB_ARGS_DATA(cb_data) {};
+#define INIT_hipTexRefSetArray_CB_ARGS_DATA(cb_data) {};
+#define INIT_hipMemcpyToArrayAsync_CB_ARGS_DATA(cb_data) {};
+#define INIT_hipTexRefSetAddress2D_CB_ARGS_DATA(cb_data) {};
+#define INIT_hipGetTextureObjectResourceViewDesc_CB_ARGS_DATA(cb_data) {};
+#define INIT_hipTexRefGetFlags_CB_ARGS_DATA(cb_data) {};
+#define INIT_hipUnbindTexture_CB_ARGS_DATA(cb_data) {};
+#define INIT_hipTexRefGetMipmapLevelBias_CB_ARGS_DATA(cb_data) {};
+#define INIT_hipTexRefSetFilterMode_CB_ARGS_DATA(cb_data) {};
+#define INIT_hipGetTextureAlignmentOffset_CB_ARGS_DATA(cb_data) {};
+#define INIT_hipMipmappedArrayGetLevel_CB_ARGS_DATA(cb_data) {};
+#define INIT_hipCreateSurfaceObject_CB_ARGS_DATA(cb_data) {};
+#define INIT_hipMipmappedArrayCreate_CB_ARGS_DATA(cb_data) {};
+#define INIT_hipTexObjectGetResourceDesc_CB_ARGS_DATA(cb_data) {};
+#define INIT_hipGetChannelDesc_CB_ARGS_DATA(cb_data) {};
+#define INIT_hipTexRefGetAddressMode_CB_ARGS_DATA(cb_data) {};
+#define INIT_hipGetTextureObjectResourceDesc_CB_ARGS_DATA(cb_data) {};
+#define INIT_hipModuleLaunchKernelExt_CB_ARGS_DATA(cb_data) {};
+#define INIT_hipMemcpy2DToArrayAsync_CB_ARGS_DATA(cb_data) {};
+#define INIT_hipTexRefGetBorderColor_CB_ARGS_DATA(cb_data) {};
+#define INIT_hipDestroySurfaceObject_CB_ARGS_DATA(cb_data) {};
+#define INIT_hipTexRefGetMipmapFilterMode_CB_ARGS_DATA(cb_data) {};
+#define INIT_hipTexRefSetMaxAnisotropy_CB_ARGS_DATA(cb_data) {};
+#define INIT_hipTexObjectCreate_CB_ARGS_DATA(cb_data) {};
+#define INIT_hipTexRefSetAddressMode_CB_ARGS_DATA(cb_data) {};
+#define INIT_hipTexRefSetMipmapLevelBias_CB_ARGS_DATA(cb_data) {};
+#define INIT_hipMemcpyFromArrayAsync_CB_ARGS_DATA(cb_data) {};
+#define INIT_hipBindTexture_CB_ARGS_DATA(cb_data) {};
+#define INIT_hipTexRefSetMipmappedArray_CB_ARGS_DATA(cb_data) {};
+#define INIT_hipTexRefGetMipmappedArray_CB_ARGS_DATA(cb_data) {};
+#define INIT_hipSetValidDevices_CB_ARGS_DATA(cb_data) {};
+#define INIT_ihipModuleLaunchKernel_CB_ARGS_DATA(cb_data) {};
+#define INIT_hipTexRefSetMipmapLevelClamp_CB_ARGS_DATA(cb_data) {};
+#define INIT_hipGetTextureObjectTextureDesc_CB_ARGS_DATA(cb_data) {};
+
+// HIP API callback ID enumeration: one explicit ID per API (0..190), then names that alias HIP_API_ID_NUMBER.
+enum hip_api_id_t {
+ HIP_API_ID_hipDrvMemcpy3DAsync = 0,
+ HIP_API_ID_hipDeviceEnablePeerAccess = 1,
+ HIP_API_ID_hipFuncSetSharedMemConfig = 2,
+ HIP_API_ID_hipMemcpyToSymbolAsync = 3,
+ HIP_API_ID_hipMallocPitch = 4,
+ HIP_API_ID_hipMalloc = 5,
+ HIP_API_ID_hipMemsetD16 = 6,
+ HIP_API_ID_hipExtStreamGetCUMask = 7,
+ HIP_API_ID_hipEventRecord = 8,
+ HIP_API_ID_hipCtxSynchronize = 9,
+ HIP_API_ID_hipSetDevice = 10,
+ HIP_API_ID_hipCtxGetApiVersion = 11,
+ HIP_API_ID_hipMemcpyFromSymbolAsync = 12,
+ HIP_API_ID_hipExtGetLinkTypeAndHopCount = 13,
+ HIP_API_ID___hipPopCallConfiguration = 14,
+ HIP_API_ID_hipModuleOccupancyMaxActiveBlocksPerMultiprocessor = 15,
+ HIP_API_ID_hipMemset3D = 16,
+ HIP_API_ID_hipStreamCreateWithPriority = 17,
+ HIP_API_ID_hipMemcpy2DToArray = 18,
+ HIP_API_ID_hipMemsetD8Async = 19,
+ HIP_API_ID_hipCtxGetCacheConfig = 20,
+ HIP_API_ID_hipModuleGetFunction = 21,
+ HIP_API_ID_hipStreamWaitEvent = 22,
+ HIP_API_ID_hipDeviceGetStreamPriorityRange = 23,
+ HIP_API_ID_hipModuleLoad = 24,
+ HIP_API_ID_hipDevicePrimaryCtxSetFlags = 25,
+ HIP_API_ID_hipLaunchCooperativeKernel = 26,
+ HIP_API_ID_hipLaunchCooperativeKernelMultiDevice = 27,
+ HIP_API_ID_hipMemcpyAsync = 28,
+ HIP_API_ID_hipMalloc3DArray = 29,
+ HIP_API_ID_hipMallocHost = 30,
+ HIP_API_ID_hipCtxGetCurrent = 31,
+ HIP_API_ID_hipDevicePrimaryCtxGetState = 32,
+ HIP_API_ID_hipEventQuery = 33,
+ HIP_API_ID_hipEventCreate = 34,
+ HIP_API_ID_hipMemGetAddressRange = 35,
+ HIP_API_ID_hipMemcpyFromSymbol = 36,
+ HIP_API_ID_hipArrayCreate = 37,
+ HIP_API_ID_hipStreamAttachMemAsync = 38,
+ HIP_API_ID_hipStreamGetFlags = 39,
+ HIP_API_ID_hipMallocArray = 40,
+ HIP_API_ID_hipCtxGetSharedMemConfig = 41,
+ HIP_API_ID_hipDeviceDisablePeerAccess = 42,
+ HIP_API_ID_hipModuleOccupancyMaxPotentialBlockSize = 43,
+ HIP_API_ID_hipMemPtrGetInfo = 44,
+ HIP_API_ID_hipFuncGetAttribute = 45,
+ HIP_API_ID_hipCtxGetFlags = 46,
+ HIP_API_ID_hipStreamDestroy = 47,
+ HIP_API_ID___hipPushCallConfiguration = 48,
+ HIP_API_ID_hipMemset3DAsync = 49,
+ HIP_API_ID_hipDeviceGetPCIBusId = 50,
+ HIP_API_ID_hipInit = 51,
+ HIP_API_ID_hipMemcpyAtoH = 52,
+ HIP_API_ID_hipStreamGetPriority = 53,
+ HIP_API_ID_hipMemset2D = 54,
+ HIP_API_ID_hipMemset2DAsync = 55,
+ HIP_API_ID_hipDeviceCanAccessPeer = 56,
+ HIP_API_ID_hipLaunchByPtr = 57,
+ HIP_API_ID_hipMemPrefetchAsync = 58,
+ HIP_API_ID_hipCtxDestroy = 59,
+ HIP_API_ID_hipMemsetD16Async = 60,
+ HIP_API_ID_hipModuleUnload = 61,
+ HIP_API_ID_hipHostUnregister = 62,
+ HIP_API_ID_hipProfilerStop = 63,
+ HIP_API_ID_hipExtStreamCreateWithCUMask = 64,
+ HIP_API_ID_hipStreamSynchronize = 65,
+ HIP_API_ID_hipFreeHost = 66,
+ HIP_API_ID_hipDeviceSetCacheConfig = 67,
+ HIP_API_ID_hipGetErrorName = 68,
+ HIP_API_ID_hipMemcpyHtoD = 69,
+ HIP_API_ID_hipModuleGetGlobal = 70,
+ HIP_API_ID_hipMemcpyHtoA = 71,
+ HIP_API_ID_hipCtxCreate = 72,
+ HIP_API_ID_hipMemcpy2D = 73,
+ HIP_API_ID_hipIpcCloseMemHandle = 74,
+ HIP_API_ID_hipChooseDevice = 75,
+ HIP_API_ID_hipDeviceSetSharedMemConfig = 76,
+ HIP_API_ID_hipMallocMipmappedArray = 77,
+ HIP_API_ID_hipSetupArgument = 78,
+ HIP_API_ID_hipIpcGetEventHandle = 79,
+ HIP_API_ID_hipFreeArray = 80,
+ HIP_API_ID_hipCtxSetCacheConfig = 81,
+ HIP_API_ID_hipFuncSetCacheConfig = 82,
+ HIP_API_ID_hipLaunchKernel = 83,
+ HIP_API_ID_hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags = 84,
+ HIP_API_ID_hipModuleGetTexRef = 85,
+ HIP_API_ID_hipFuncSetAttribute = 86,
+ HIP_API_ID_hipEventElapsedTime = 87,
+ HIP_API_ID_hipConfigureCall = 88,
+ HIP_API_ID_hipMemAdvise = 89,
+ HIP_API_ID_hipMemcpy3DAsync = 90,
+ HIP_API_ID_hipEventDestroy = 91,
+ HIP_API_ID_hipCtxPopCurrent = 92,
+ HIP_API_ID_hipGetSymbolAddress = 93,
+ HIP_API_ID_hipHostGetFlags = 94,
+ HIP_API_ID_hipHostMalloc = 95,
+ HIP_API_ID_hipCtxSetSharedMemConfig = 96,
+ HIP_API_ID_hipFreeMipmappedArray = 97,
+ HIP_API_ID_hipMemGetInfo = 98,
+ HIP_API_ID_hipDeviceReset = 99,
+ HIP_API_ID_hipMemset = 100,
+ HIP_API_ID_hipMemsetD8 = 101,
+ HIP_API_ID_hipMemcpyParam2DAsync = 102,
+ HIP_API_ID_hipHostRegister = 103,
+ HIP_API_ID_hipDriverGetVersion = 104,
+ HIP_API_ID_hipArray3DCreate = 105,
+ HIP_API_ID_hipIpcOpenMemHandle = 106,
+ HIP_API_ID_hipGetLastError = 107,
+ HIP_API_ID_hipGetDeviceFlags = 108,
+ HIP_API_ID_hipDeviceGetSharedMemConfig = 109,
+ HIP_API_ID_hipDrvMemcpy3D = 110,
+ HIP_API_ID_hipMemcpy2DFromArray = 111,
+ HIP_API_ID_hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags = 112,
+ HIP_API_ID_hipSetDeviceFlags = 113,
+ HIP_API_ID_hipHccModuleLaunchKernel = 114,
+ HIP_API_ID_hipFree = 115,
+ HIP_API_ID_hipOccupancyMaxPotentialBlockSize = 116,
+ HIP_API_ID_hipDeviceGetAttribute = 117,
+ HIP_API_ID_hipDeviceComputeCapability = 118,
+ HIP_API_ID_hipCtxDisablePeerAccess = 119,
+ HIP_API_ID_hipMallocManaged = 120,
+ HIP_API_ID_hipDeviceGetByPCIBusId = 121,
+ HIP_API_ID_hipIpcGetMemHandle = 122,
+ HIP_API_ID_hipMemcpyHtoDAsync = 123,
+ HIP_API_ID_hipCtxGetDevice = 124,
+ HIP_API_ID_hipMemcpyDtoD = 125,
+ HIP_API_ID_hipModuleLoadData = 126,
+ HIP_API_ID_hipDevicePrimaryCtxRelease = 127,
+ HIP_API_ID_hipOccupancyMaxActiveBlocksPerMultiprocessor = 128,
+ HIP_API_ID_hipCtxSetCurrent = 129,
+ HIP_API_ID_hipGetErrorString = 130,
+ HIP_API_ID_hipStreamCreate = 131,
+ HIP_API_ID_hipDevicePrimaryCtxRetain = 132,
+ HIP_API_ID_hipDeviceGet = 133,
+ HIP_API_ID_hipStreamCreateWithFlags = 134,
+ HIP_API_ID_hipMemcpyFromArray = 135,
+ HIP_API_ID_hipMemcpy2DAsync = 136,
+ HIP_API_ID_hipFuncGetAttributes = 137,
+ HIP_API_ID_hipGetSymbolSize = 138,
+ HIP_API_ID_hipHostFree = 139,
+ HIP_API_ID_hipEventCreateWithFlags = 140,
+ HIP_API_ID_hipStreamQuery = 141,
+ HIP_API_ID_hipMemcpy3D = 142,
+ HIP_API_ID_hipMemcpyToSymbol = 143,
+ HIP_API_ID_hipMemcpy = 144,
+ HIP_API_ID_hipPeekAtLastError = 145,
+ HIP_API_ID_hipExtLaunchMultiKernelMultiDevice = 146,
+ HIP_API_ID_hipHostAlloc = 147,
+ HIP_API_ID_hipStreamAddCallback = 148,
+ HIP_API_ID_hipMemcpyToArray = 149,
+ HIP_API_ID_hipMemsetD32 = 150,
+ HIP_API_ID_hipExtModuleLaunchKernel = 151,
+ HIP_API_ID_hipDeviceSynchronize = 152,
+ HIP_API_ID_hipDeviceGetCacheConfig = 153,
+ HIP_API_ID_hipMalloc3D = 154,
+ HIP_API_ID_hipPointerGetAttributes = 155,
+ HIP_API_ID_hipMemsetAsync = 156,
+ HIP_API_ID_hipDeviceGetName = 157,
+ HIP_API_ID_hipModuleOccupancyMaxPotentialBlockSizeWithFlags = 158,
+ HIP_API_ID_hipCtxPushCurrent = 159,
+ HIP_API_ID_hipMemcpyPeer = 160,
+ HIP_API_ID_hipEventSynchronize = 161,
+ HIP_API_ID_hipMemcpyDtoDAsync = 162,
+ HIP_API_ID_hipProfilerStart = 163,
+ HIP_API_ID_hipExtMallocWithFlags = 164,
+ HIP_API_ID_hipCtxEnablePeerAccess = 165,
+ HIP_API_ID_hipMemAllocHost = 166,
+ HIP_API_ID_hipMemcpyDtoHAsync = 167,
+ HIP_API_ID_hipModuleLaunchKernel = 168,
+ HIP_API_ID_hipMemAllocPitch = 169,
+ HIP_API_ID_hipExtLaunchKernel = 170,
+ HIP_API_ID_hipMemcpy2DFromArrayAsync = 171,
+ HIP_API_ID_hipDeviceGetLimit = 172,
+ HIP_API_ID_hipModuleLoadDataEx = 173,
+ HIP_API_ID_hipRuntimeGetVersion = 174,
+ HIP_API_ID_hipMemRangeGetAttribute = 175,
+ HIP_API_ID_hipDeviceGetP2PAttribute = 176,
+ HIP_API_ID_hipMemcpyPeerAsync = 177,
+ HIP_API_ID_hipGetDeviceProperties = 178,
+ HIP_API_ID_hipMemcpyDtoH = 179,
+ HIP_API_ID_hipMemcpyWithStream = 180,
+ HIP_API_ID_hipDeviceTotalMem = 181,
+ HIP_API_ID_hipHostGetDevicePointer = 182,
+ HIP_API_ID_hipMemRangeGetAttributes = 183,
+ HIP_API_ID_hipMemcpyParam2D = 184,
+ HIP_API_ID_hipDevicePrimaryCtxReset = 185,
+ HIP_API_ID_hipGetMipmappedArrayLevel = 186,
+ HIP_API_ID_hipMemsetD32Async = 187,
+ HIP_API_ID_hipGetDevice = 188,
+ HIP_API_ID_hipGetDeviceCount = 189,
+ HIP_API_ID_hipIpcOpenEventHandle = 190,
+ HIP_API_ID_NUMBER = 191,  // one past the last explicitly numbered ID above
+ // Every name below shares the value HIP_API_ID_NUMBER; the list matches the dummy INIT_*_CB_ARGS_DATA macros.
+ HIP_API_ID_NONE = HIP_API_ID_NUMBER,
+ HIP_API_ID_hipTexRefGetAddress = HIP_API_ID_NUMBER,
+ HIP_API_ID_hipTexRefSetBorderColor = HIP_API_ID_NUMBER,
+ HIP_API_ID_hipMemcpyDtoA = HIP_API_ID_NUMBER,
+ HIP_API_ID_hipArrayGetDescriptor = HIP_API_ID_NUMBER,
+ HIP_API_ID_hipTexObjectGetResourceViewDesc = HIP_API_ID_NUMBER,
+ HIP_API_ID_hipMemcpyAtoHAsync = HIP_API_ID_NUMBER,
+ HIP_API_ID_hipDestroyTextureObject = HIP_API_ID_NUMBER,
+ HIP_API_ID_hipArray3DGetDescriptor = HIP_API_ID_NUMBER,
+ HIP_API_ID_hipTexRefSetAddress = HIP_API_ID_NUMBER,
+ HIP_API_ID_hipArrayDestroy = HIP_API_ID_NUMBER,
+ HIP_API_ID_hipTexRefGetMaxAnisotropy = HIP_API_ID_NUMBER,
+ HIP_API_ID_hipTexRefSetMipmapFilterMode = HIP_API_ID_NUMBER,
+ HIP_API_ID_hipDeviceGetCount = HIP_API_ID_NUMBER,
+ HIP_API_ID_hipMemcpyArrayToArray = HIP_API_ID_NUMBER,
+ HIP_API_ID_hipBindTexture2D = HIP_API_ID_NUMBER,
+ HIP_API_ID_hipCreateTextureObject = HIP_API_ID_NUMBER,
+ HIP_API_ID_hipMemcpyHtoAAsync = HIP_API_ID_NUMBER,
+ HIP_API_ID_hipMemcpyAtoA = HIP_API_ID_NUMBER,
+ HIP_API_ID_hipMemcpyAtoD = HIP_API_ID_NUMBER,
+ HIP_API_ID_hipBindTextureToMipmappedArray = HIP_API_ID_NUMBER,
+ HIP_API_ID_hipTexRefGetMipmapLevelClamp = HIP_API_ID_NUMBER,
+ HIP_API_ID_hipBindTextureToArray = HIP_API_ID_NUMBER,
+ HIP_API_ID_hipTexRefSetFlags = HIP_API_ID_NUMBER,
+ HIP_API_ID_hipTexRefSetFormat = HIP_API_ID_NUMBER,
+ HIP_API_ID_hipTexObjectGetTextureDesc = HIP_API_ID_NUMBER,
+ HIP_API_ID_hipTexObjectDestroy = HIP_API_ID_NUMBER,
+ HIP_API_ID_hipMemcpy2DArrayToArray = HIP_API_ID_NUMBER,
+ HIP_API_ID_hipTexRefGetArray = HIP_API_ID_NUMBER,
+ HIP_API_ID_hipGetTextureReference = HIP_API_ID_NUMBER,
+ HIP_API_ID_hipMipmappedArrayDestroy = HIP_API_ID_NUMBER,
+ HIP_API_ID_hipTexRefGetFilterMode = HIP_API_ID_NUMBER,
+ HIP_API_ID_hipTexRefGetFormat = HIP_API_ID_NUMBER,
+ HIP_API_ID_hipTexRefSetArray = HIP_API_ID_NUMBER,
+ HIP_API_ID_hipMemcpyToArrayAsync = HIP_API_ID_NUMBER,
+ HIP_API_ID_hipTexRefSetAddress2D = HIP_API_ID_NUMBER,
+ HIP_API_ID_hipGetTextureObjectResourceViewDesc = HIP_API_ID_NUMBER,
+ HIP_API_ID_hipTexRefGetFlags = HIP_API_ID_NUMBER,
+ HIP_API_ID_hipUnbindTexture = HIP_API_ID_NUMBER,
+ HIP_API_ID_hipTexRefGetMipmapLevelBias = HIP_API_ID_NUMBER,
+ HIP_API_ID_hipTexRefSetFilterMode = HIP_API_ID_NUMBER,
+ HIP_API_ID_hipGetTextureAlignmentOffset = HIP_API_ID_NUMBER,
+ HIP_API_ID_hipMipmappedArrayGetLevel = HIP_API_ID_NUMBER,
+ HIP_API_ID_hipCreateSurfaceObject = HIP_API_ID_NUMBER,
+ HIP_API_ID_hipMipmappedArrayCreate = HIP_API_ID_NUMBER,
+ HIP_API_ID_hipTexObjectGetResourceDesc = HIP_API_ID_NUMBER,
+ HIP_API_ID_hipGetChannelDesc = HIP_API_ID_NUMBER,
+ HIP_API_ID_hipTexRefGetAddressMode = HIP_API_ID_NUMBER,
+ HIP_API_ID_hipGetTextureObjectResourceDesc = HIP_API_ID_NUMBER,
+ HIP_API_ID_hipModuleLaunchKernelExt = HIP_API_ID_NUMBER,
+ HIP_API_ID_hipMemcpy2DToArrayAsync = HIP_API_ID_NUMBER,
+ HIP_API_ID_hipTexRefGetBorderColor = HIP_API_ID_NUMBER,
+ HIP_API_ID_hipDestroySurfaceObject = HIP_API_ID_NUMBER,
+ HIP_API_ID_hipTexRefGetMipmapFilterMode = HIP_API_ID_NUMBER,
+ HIP_API_ID_hipTexRefSetMaxAnisotropy = HIP_API_ID_NUMBER,
+ HIP_API_ID_hipTexObjectCreate = HIP_API_ID_NUMBER,
+ HIP_API_ID_hipTexRefSetAddressMode = HIP_API_ID_NUMBER,
+ HIP_API_ID_hipTexRefSetMipmapLevelBias = HIP_API_ID_NUMBER,
+ HIP_API_ID_hipMemcpyFromArrayAsync = HIP_API_ID_NUMBER,
+ HIP_API_ID_hipBindTexture = HIP_API_ID_NUMBER,
+ HIP_API_ID_hipTexRefSetMipmappedArray = HIP_API_ID_NUMBER,
+ HIP_API_ID_hipTexRefGetMipmappedArray = HIP_API_ID_NUMBER,
+ HIP_API_ID_hipSetValidDevices = HIP_API_ID_NUMBER,
+ HIP_API_ID_ihipModuleLaunchKernel = HIP_API_ID_NUMBER,
+ HIP_API_ID_hipTexRefSetMipmapLevelClamp = HIP_API_ID_NUMBER,
+ HIP_API_ID_hipGetTextureObjectTextureDesc = HIP_API_ID_NUMBER,
+};
+
+// Return HIP API string by given ID
+static inline const char* hip_api_name(const uint32_t id) {
+ switch(id) {
+ case HIP_API_ID_hipDrvMemcpy3DAsync: return "hipDrvMemcpy3DAsync";
+ case HIP_API_ID_hipDeviceEnablePeerAccess: return "hipDeviceEnablePeerAccess";
+ case HIP_API_ID_hipFuncSetSharedMemConfig: return "hipFuncSetSharedMemConfig";
+ case HIP_API_ID_hipMemcpyToSymbolAsync: return "hipMemcpyToSymbolAsync";
+ case HIP_API_ID_hipMallocPitch: return "hipMallocPitch";
+ case HIP_API_ID_hipMalloc: return "hipMalloc";
+ case HIP_API_ID_hipMemsetD16: return "hipMemsetD16";
+ case HIP_API_ID_hipExtStreamGetCUMask: return "hipExtStreamGetCUMask";
+ case HIP_API_ID_hipEventRecord: return "hipEventRecord";
+ case HIP_API_ID_hipCtxSynchronize: return "hipCtxSynchronize";
+ case HIP_API_ID_hipSetDevice: return "hipSetDevice";
+ case HIP_API_ID_hipCtxGetApiVersion: return "hipCtxGetApiVersion";
+ case HIP_API_ID_hipMemcpyFromSymbolAsync: return "hipMemcpyFromSymbolAsync";
+ case HIP_API_ID_hipExtGetLinkTypeAndHopCount: return "hipExtGetLinkTypeAndHopCount";
+ case HIP_API_ID___hipPopCallConfiguration: return "__hipPopCallConfiguration";
+ case HIP_API_ID_hipModuleOccupancyMaxActiveBlocksPerMultiprocessor: return "hipModuleOccupancyMaxActiveBlocksPerMultiprocessor";
+ case HIP_API_ID_hipMemset3D: return "hipMemset3D";
+ case HIP_API_ID_hipStreamCreateWithPriority: return "hipStreamCreateWithPriority";
+ case HIP_API_ID_hipMemcpy2DToArray: return "hipMemcpy2DToArray";
+ case HIP_API_ID_hipMemsetD8Async: return "hipMemsetD8Async";
+ case HIP_API_ID_hipCtxGetCacheConfig: return "hipCtxGetCacheConfig";
+ case HIP_API_ID_hipModuleGetFunction: return "hipModuleGetFunction";
+ case HIP_API_ID_hipStreamWaitEvent: return "hipStreamWaitEvent";
+ case HIP_API_ID_hipDeviceGetStreamPriorityRange: return "hipDeviceGetStreamPriorityRange";
+ case HIP_API_ID_hipModuleLoad: return "hipModuleLoad";
+ case HIP_API_ID_hipDevicePrimaryCtxSetFlags: return "hipDevicePrimaryCtxSetFlags";
+ case HIP_API_ID_hipLaunchCooperativeKernel: return "hipLaunchCooperativeKernel";
+ case HIP_API_ID_hipLaunchCooperativeKernelMultiDevice: return "hipLaunchCooperativeKernelMultiDevice";
+ case HIP_API_ID_hipMemcpyAsync: return "hipMemcpyAsync";
+ case HIP_API_ID_hipMalloc3DArray: return "hipMalloc3DArray";
+ case HIP_API_ID_hipMallocHost: return "hipMallocHost";
+ case HIP_API_ID_hipCtxGetCurrent: return "hipCtxGetCurrent";
+ case HIP_API_ID_hipDevicePrimaryCtxGetState: return "hipDevicePrimaryCtxGetState";
+ case HIP_API_ID_hipEventQuery: return "hipEventQuery";
+ case HIP_API_ID_hipEventCreate: return "hipEventCreate";
+ case HIP_API_ID_hipMemGetAddressRange: return "hipMemGetAddressRange";
+ case HIP_API_ID_hipMemcpyFromSymbol: return "hipMemcpyFromSymbol";
+ case HIP_API_ID_hipArrayCreate: return "hipArrayCreate";
+ case HIP_API_ID_hipStreamAttachMemAsync: return "hipStreamAttachMemAsync";
+ case HIP_API_ID_hipStreamGetFlags: return "hipStreamGetFlags";
+ case HIP_API_ID_hipMallocArray: return "hipMallocArray";
+ case HIP_API_ID_hipCtxGetSharedMemConfig: return "hipCtxGetSharedMemConfig";
+ case HIP_API_ID_hipDeviceDisablePeerAccess: return "hipDeviceDisablePeerAccess";
+ case HIP_API_ID_hipModuleOccupancyMaxPotentialBlockSize: return "hipModuleOccupancyMaxPotentialBlockSize";
+ case HIP_API_ID_hipMemPtrGetInfo: return "hipMemPtrGetInfo";
+ case HIP_API_ID_hipFuncGetAttribute: return "hipFuncGetAttribute";
+ case HIP_API_ID_hipCtxGetFlags: return "hipCtxGetFlags";
+ case HIP_API_ID_hipStreamDestroy: return "hipStreamDestroy";
+ case HIP_API_ID___hipPushCallConfiguration: return "__hipPushCallConfiguration";
+ case HIP_API_ID_hipMemset3DAsync: return "hipMemset3DAsync";
+ case HIP_API_ID_hipDeviceGetPCIBusId: return "hipDeviceGetPCIBusId";
+ case HIP_API_ID_hipInit: return "hipInit";
+ case HIP_API_ID_hipMemcpyAtoH: return "hipMemcpyAtoH";
+ case HIP_API_ID_hipStreamGetPriority: return "hipStreamGetPriority";
+ case HIP_API_ID_hipMemset2D: return "hipMemset2D";
+ case HIP_API_ID_hipMemset2DAsync: return "hipMemset2DAsync";
+ case HIP_API_ID_hipDeviceCanAccessPeer: return "hipDeviceCanAccessPeer";
+ case HIP_API_ID_hipLaunchByPtr: return "hipLaunchByPtr";
+ case HIP_API_ID_hipMemPrefetchAsync: return "hipMemPrefetchAsync";
+ case HIP_API_ID_hipCtxDestroy: return "hipCtxDestroy";
+ case HIP_API_ID_hipMemsetD16Async: return "hipMemsetD16Async";
+ case HIP_API_ID_hipModuleUnload: return "hipModuleUnload";
+ case HIP_API_ID_hipHostUnregister: return "hipHostUnregister";
+ case HIP_API_ID_hipProfilerStop: return "hipProfilerStop";
+ case HIP_API_ID_hipExtStreamCreateWithCUMask: return "hipExtStreamCreateWithCUMask";
+ case HIP_API_ID_hipStreamSynchronize: return "hipStreamSynchronize";
+ case HIP_API_ID_hipFreeHost: return "hipFreeHost";
+ case HIP_API_ID_hipDeviceSetCacheConfig: return "hipDeviceSetCacheConfig";
+ case HIP_API_ID_hipGetErrorName: return "hipGetErrorName";
+ case HIP_API_ID_hipMemcpyHtoD: return "hipMemcpyHtoD";
+ case HIP_API_ID_hipModuleGetGlobal: return "hipModuleGetGlobal";
+ case HIP_API_ID_hipMemcpyHtoA: return "hipMemcpyHtoA";
+ case HIP_API_ID_hipCtxCreate: return "hipCtxCreate";
+ case HIP_API_ID_hipMemcpy2D: return "hipMemcpy2D";
+ case HIP_API_ID_hipIpcCloseMemHandle: return "hipIpcCloseMemHandle";
+ case HIP_API_ID_hipChooseDevice: return "hipChooseDevice";
+ case HIP_API_ID_hipDeviceSetSharedMemConfig: return "hipDeviceSetSharedMemConfig";
+ case HIP_API_ID_hipMallocMipmappedArray: return "hipMallocMipmappedArray";
+ case HIP_API_ID_hipSetupArgument: return "hipSetupArgument";
+ case HIP_API_ID_hipIpcGetEventHandle: return "hipIpcGetEventHandle";
+ case HIP_API_ID_hipFreeArray: return "hipFreeArray";
+ case HIP_API_ID_hipCtxSetCacheConfig: return "hipCtxSetCacheConfig";
+ case HIP_API_ID_hipFuncSetCacheConfig: return "hipFuncSetCacheConfig";
+ case HIP_API_ID_hipLaunchKernel: return "hipLaunchKernel";
+ case HIP_API_ID_hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags: return "hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags";
+ case HIP_API_ID_hipModuleGetTexRef: return "hipModuleGetTexRef";
+ case HIP_API_ID_hipFuncSetAttribute: return "hipFuncSetAttribute";
+ case HIP_API_ID_hipEventElapsedTime: return "hipEventElapsedTime";
+ case HIP_API_ID_hipConfigureCall: return "hipConfigureCall";
+ case HIP_API_ID_hipMemAdvise: return "hipMemAdvise";
+ case HIP_API_ID_hipMemcpy3DAsync: return "hipMemcpy3DAsync";
+ case HIP_API_ID_hipEventDestroy: return "hipEventDestroy";
+ case HIP_API_ID_hipCtxPopCurrent: return "hipCtxPopCurrent";
+ case HIP_API_ID_hipGetSymbolAddress: return "hipGetSymbolAddress";
+ case HIP_API_ID_hipHostGetFlags: return "hipHostGetFlags";
+ case HIP_API_ID_hipHostMalloc: return "hipHostMalloc";
+ case HIP_API_ID_hipCtxSetSharedMemConfig: return "hipCtxSetSharedMemConfig";
+ case HIP_API_ID_hipFreeMipmappedArray: return "hipFreeMipmappedArray";
+ case HIP_API_ID_hipMemGetInfo: return "hipMemGetInfo";
+ case HIP_API_ID_hipDeviceReset: return "hipDeviceReset";
+ case HIP_API_ID_hipMemset: return "hipMemset";
+ case HIP_API_ID_hipMemsetD8: return "hipMemsetD8";
+ case HIP_API_ID_hipMemcpyParam2DAsync: return "hipMemcpyParam2DAsync";
+ case HIP_API_ID_hipHostRegister: return "hipHostRegister";
+ case HIP_API_ID_hipDriverGetVersion: return "hipDriverGetVersion";
+ case HIP_API_ID_hipArray3DCreate: return "hipArray3DCreate";
+ case HIP_API_ID_hipIpcOpenMemHandle: return "hipIpcOpenMemHandle";
+ case HIP_API_ID_hipGetLastError: return "hipGetLastError";
+ case HIP_API_ID_hipGetDeviceFlags: return "hipGetDeviceFlags";
+ case HIP_API_ID_hipDeviceGetSharedMemConfig: return "hipDeviceGetSharedMemConfig";
+ case HIP_API_ID_hipDrvMemcpy3D: return "hipDrvMemcpy3D";
+ case HIP_API_ID_hipMemcpy2DFromArray: return "hipMemcpy2DFromArray";
+ case HIP_API_ID_hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags: return "hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags";
+ case HIP_API_ID_hipSetDeviceFlags: return "hipSetDeviceFlags";
+ case HIP_API_ID_hipHccModuleLaunchKernel: return "hipHccModuleLaunchKernel";
+ case HIP_API_ID_hipFree: return "hipFree";
+ case HIP_API_ID_hipOccupancyMaxPotentialBlockSize: return "hipOccupancyMaxPotentialBlockSize";
+ case HIP_API_ID_hipDeviceGetAttribute: return "hipDeviceGetAttribute";
+ case HIP_API_ID_hipDeviceComputeCapability: return "hipDeviceComputeCapability";
+ case HIP_API_ID_hipCtxDisablePeerAccess: return "hipCtxDisablePeerAccess";
+ case HIP_API_ID_hipMallocManaged: return "hipMallocManaged";
+ case HIP_API_ID_hipDeviceGetByPCIBusId: return "hipDeviceGetByPCIBusId";
+ case HIP_API_ID_hipIpcGetMemHandle: return "hipIpcGetMemHandle";
+ case HIP_API_ID_hipMemcpyHtoDAsync: return "hipMemcpyHtoDAsync";
+ case HIP_API_ID_hipCtxGetDevice: return "hipCtxGetDevice";
+ case HIP_API_ID_hipMemcpyDtoD: return "hipMemcpyDtoD";
+ case HIP_API_ID_hipModuleLoadData: return "hipModuleLoadData";
+ case HIP_API_ID_hipDevicePrimaryCtxRelease: return "hipDevicePrimaryCtxRelease";
+ case HIP_API_ID_hipOccupancyMaxActiveBlocksPerMultiprocessor: return "hipOccupancyMaxActiveBlocksPerMultiprocessor";
+ case HIP_API_ID_hipCtxSetCurrent: return "hipCtxSetCurrent";
+ case HIP_API_ID_hipGetErrorString: return "hipGetErrorString";
+ case HIP_API_ID_hipStreamCreate: return "hipStreamCreate";
+ case HIP_API_ID_hipDevicePrimaryCtxRetain: return "hipDevicePrimaryCtxRetain";
+ case HIP_API_ID_hipDeviceGet: return "hipDeviceGet";
+ case HIP_API_ID_hipStreamCreateWithFlags: return "hipStreamCreateWithFlags";
+ case HIP_API_ID_hipMemcpyFromArray: return "hipMemcpyFromArray";
+ case HIP_API_ID_hipMemcpy2DAsync: return "hipMemcpy2DAsync";
+ case HIP_API_ID_hipFuncGetAttributes: return "hipFuncGetAttributes";
+ case HIP_API_ID_hipGetSymbolSize: return "hipGetSymbolSize";
+ case HIP_API_ID_hipHostFree: return "hipHostFree";
+ case HIP_API_ID_hipEventCreateWithFlags: return "hipEventCreateWithFlags";
+ case HIP_API_ID_hipStreamQuery: return "hipStreamQuery";
+ case HIP_API_ID_hipMemcpy3D: return "hipMemcpy3D";
+ case HIP_API_ID_hipMemcpyToSymbol: return "hipMemcpyToSymbol";
+ case HIP_API_ID_hipMemcpy: return "hipMemcpy";
+ case HIP_API_ID_hipPeekAtLastError: return "hipPeekAtLastError";
+ case HIP_API_ID_hipExtLaunchMultiKernelMultiDevice: return "hipExtLaunchMultiKernelMultiDevice";
+ case HIP_API_ID_hipHostAlloc: return "hipHostAlloc";
+ case HIP_API_ID_hipStreamAddCallback: return "hipStreamAddCallback";
+ case HIP_API_ID_hipMemcpyToArray: return "hipMemcpyToArray";
+ case HIP_API_ID_hipMemsetD32: return "hipMemsetD32";
+ case HIP_API_ID_hipExtModuleLaunchKernel: return "hipExtModuleLaunchKernel";
+ case HIP_API_ID_hipDeviceSynchronize: return "hipDeviceSynchronize";
+ case HIP_API_ID_hipDeviceGetCacheConfig: return "hipDeviceGetCacheConfig";
+ case HIP_API_ID_hipMalloc3D: return "hipMalloc3D";
+ case HIP_API_ID_hipPointerGetAttributes: return "hipPointerGetAttributes";
+ case HIP_API_ID_hipMemsetAsync: return "hipMemsetAsync";
+ case HIP_API_ID_hipDeviceGetName: return "hipDeviceGetName";
+ case HIP_API_ID_hipModuleOccupancyMaxPotentialBlockSizeWithFlags: return "hipModuleOccupancyMaxPotentialBlockSizeWithFlags";
+ case HIP_API_ID_hipCtxPushCurrent: return "hipCtxPushCurrent";
+ case HIP_API_ID_hipMemcpyPeer: return "hipMemcpyPeer";
+ case HIP_API_ID_hipEventSynchronize: return "hipEventSynchronize";
+ case HIP_API_ID_hipMemcpyDtoDAsync: return "hipMemcpyDtoDAsync";
+ case HIP_API_ID_hipProfilerStart: return "hipProfilerStart";
+ case HIP_API_ID_hipExtMallocWithFlags: return "hipExtMallocWithFlags";
+ case HIP_API_ID_hipCtxEnablePeerAccess: return "hipCtxEnablePeerAccess";
+ case HIP_API_ID_hipMemAllocHost: return "hipMemAllocHost";
+ case HIP_API_ID_hipMemcpyDtoHAsync: return "hipMemcpyDtoHAsync";
+ case HIP_API_ID_hipModuleLaunchKernel: return "hipModuleLaunchKernel";
+ case HIP_API_ID_hipMemAllocPitch: return "hipMemAllocPitch";
+ case HIP_API_ID_hipExtLaunchKernel: return "hipExtLaunchKernel";
+ case HIP_API_ID_hipMemcpy2DFromArrayAsync: return "hipMemcpy2DFromArrayAsync";
+ case HIP_API_ID_hipDeviceGetLimit: return "hipDeviceGetLimit";
+ case HIP_API_ID_hipModuleLoadDataEx: return "hipModuleLoadDataEx";
+ case HIP_API_ID_hipRuntimeGetVersion: return "hipRuntimeGetVersion";
+ case HIP_API_ID_hipMemRangeGetAttribute: return "hipMemRangeGetAttribute";
+ case HIP_API_ID_hipDeviceGetP2PAttribute: return "hipDeviceGetP2PAttribute";
+ case HIP_API_ID_hipMemcpyPeerAsync: return "hipMemcpyPeerAsync";
+ case HIP_API_ID_hipGetDeviceProperties: return "hipGetDeviceProperties";
+ case HIP_API_ID_hipMemcpyDtoH: return "hipMemcpyDtoH";
+ case HIP_API_ID_hipMemcpyWithStream: return "hipMemcpyWithStream";
+ case HIP_API_ID_hipDeviceTotalMem: return "hipDeviceTotalMem";
+ case HIP_API_ID_hipHostGetDevicePointer: return "hipHostGetDevicePointer";
+ case HIP_API_ID_hipMemRangeGetAttributes: return "hipMemRangeGetAttributes";
+ case HIP_API_ID_hipMemcpyParam2D: return "hipMemcpyParam2D";
+ case HIP_API_ID_hipDevicePrimaryCtxReset: return "hipDevicePrimaryCtxReset";
+ case HIP_API_ID_hipGetMipmappedArrayLevel: return "hipGetMipmappedArrayLevel";
+ case HIP_API_ID_hipMemsetD32Async: return "hipMemsetD32Async";
+ case HIP_API_ID_hipGetDevice: return "hipGetDevice";
+ case HIP_API_ID_hipGetDeviceCount: return "hipGetDeviceCount";
+ case HIP_API_ID_hipIpcOpenEventHandle: return "hipIpcOpenEventHandle";
+ };
+ return "unknown";
+};
+
+#include <string.h>
+// Return the HIP API ID whose name exactly matches 'name' (case-sensitive strcmp against each known API name, tried in order); returns HIP_API_ID_NUMBER when nothing matches. 'name' is handed to strcmp unchecked, so it must be a non-NULL, NUL-terminated string.
+static inline uint32_t hipApiIdByName(const char* name) {
+ if (strcmp("hipDrvMemcpy3DAsync", name) == 0) return HIP_API_ID_hipDrvMemcpy3DAsync;
+ if (strcmp("hipDeviceEnablePeerAccess", name) == 0) return HIP_API_ID_hipDeviceEnablePeerAccess;
+ if (strcmp("hipFuncSetSharedMemConfig", name) == 0) return HIP_API_ID_hipFuncSetSharedMemConfig;
+ if (strcmp("hipMemcpyToSymbolAsync", name) == 0) return HIP_API_ID_hipMemcpyToSymbolAsync;
+ if (strcmp("hipMallocPitch", name) == 0) return HIP_API_ID_hipMallocPitch;
+ if (strcmp("hipMalloc", name) == 0) return HIP_API_ID_hipMalloc;
+ if (strcmp("hipMemsetD16", name) == 0) return HIP_API_ID_hipMemsetD16;
+ if (strcmp("hipExtStreamGetCUMask", name) == 0) return HIP_API_ID_hipExtStreamGetCUMask;
+ if (strcmp("hipEventRecord", name) == 0) return HIP_API_ID_hipEventRecord;
+ if (strcmp("hipCtxSynchronize", name) == 0) return HIP_API_ID_hipCtxSynchronize;
+ if (strcmp("hipSetDevice", name) == 0) return HIP_API_ID_hipSetDevice;
+ if (strcmp("hipCtxGetApiVersion", name) == 0) return HIP_API_ID_hipCtxGetApiVersion;
+ if (strcmp("hipMemcpyFromSymbolAsync", name) == 0) return HIP_API_ID_hipMemcpyFromSymbolAsync;
+ if (strcmp("hipExtGetLinkTypeAndHopCount", name) == 0) return HIP_API_ID_hipExtGetLinkTypeAndHopCount;
+ if (strcmp("__hipPopCallConfiguration", name) == 0) return HIP_API_ID___hipPopCallConfiguration;
+ if (strcmp("hipModuleOccupancyMaxActiveBlocksPerMultiprocessor", name) == 0) return HIP_API_ID_hipModuleOccupancyMaxActiveBlocksPerMultiprocessor;
+ if (strcmp("hipMemset3D", name) == 0) return HIP_API_ID_hipMemset3D;
+ if (strcmp("hipStreamCreateWithPriority", name) == 0) return HIP_API_ID_hipStreamCreateWithPriority;
+ if (strcmp("hipMemcpy2DToArray", name) == 0) return HIP_API_ID_hipMemcpy2DToArray;
+ if (strcmp("hipMemsetD8Async", name) == 0) return HIP_API_ID_hipMemsetD8Async;
+ if (strcmp("hipCtxGetCacheConfig", name) == 0) return HIP_API_ID_hipCtxGetCacheConfig;
+ if (strcmp("hipModuleGetFunction", name) == 0) return HIP_API_ID_hipModuleGetFunction;
+ if (strcmp("hipStreamWaitEvent", name) == 0) return HIP_API_ID_hipStreamWaitEvent;
+ if (strcmp("hipDeviceGetStreamPriorityRange", name) == 0) return HIP_API_ID_hipDeviceGetStreamPriorityRange;
+ if (strcmp("hipModuleLoad", name) == 0) return HIP_API_ID_hipModuleLoad;
+ if (strcmp("hipDevicePrimaryCtxSetFlags", name) == 0) return HIP_API_ID_hipDevicePrimaryCtxSetFlags;
+ if (strcmp("hipLaunchCooperativeKernel", name) == 0) return HIP_API_ID_hipLaunchCooperativeKernel;
+ if (strcmp("hipLaunchCooperativeKernelMultiDevice", name) == 0) return HIP_API_ID_hipLaunchCooperativeKernelMultiDevice;
+ if (strcmp("hipMemcpyAsync", name) == 0) return HIP_API_ID_hipMemcpyAsync;
+ if (strcmp("hipMalloc3DArray", name) == 0) return HIP_API_ID_hipMalloc3DArray;
+ if (strcmp("hipMallocHost", name) == 0) return HIP_API_ID_hipMallocHost;
+ if (strcmp("hipCtxGetCurrent", name) == 0) return HIP_API_ID_hipCtxGetCurrent;
+ if (strcmp("hipDevicePrimaryCtxGetState", name) == 0) return HIP_API_ID_hipDevicePrimaryCtxGetState;
+ if (strcmp("hipEventQuery", name) == 0) return HIP_API_ID_hipEventQuery;
+ if (strcmp("hipEventCreate", name) == 0) return HIP_API_ID_hipEventCreate;
+ if (strcmp("hipMemGetAddressRange", name) == 0) return HIP_API_ID_hipMemGetAddressRange;
+ if (strcmp("hipMemcpyFromSymbol", name) == 0) return HIP_API_ID_hipMemcpyFromSymbol;
+ if (strcmp("hipArrayCreate", name) == 0) return HIP_API_ID_hipArrayCreate;
+ if (strcmp("hipStreamAttachMemAsync", name) == 0) return HIP_API_ID_hipStreamAttachMemAsync;
+ if (strcmp("hipStreamGetFlags", name) == 0) return HIP_API_ID_hipStreamGetFlags;
+ if (strcmp("hipMallocArray", name) == 0) return HIP_API_ID_hipMallocArray;
+ if (strcmp("hipCtxGetSharedMemConfig", name) == 0) return HIP_API_ID_hipCtxGetSharedMemConfig;
+ if (strcmp("hipDeviceDisablePeerAccess", name) == 0) return HIP_API_ID_hipDeviceDisablePeerAccess;
+ if (strcmp("hipModuleOccupancyMaxPotentialBlockSize", name) == 0) return HIP_API_ID_hipModuleOccupancyMaxPotentialBlockSize;
+ if (strcmp("hipMemPtrGetInfo", name) == 0) return HIP_API_ID_hipMemPtrGetInfo;
+ if (strcmp("hipFuncGetAttribute", name) == 0) return HIP_API_ID_hipFuncGetAttribute;
+ if (strcmp("hipCtxGetFlags", name) == 0) return HIP_API_ID_hipCtxGetFlags;
+ if (strcmp("hipStreamDestroy", name) == 0) return HIP_API_ID_hipStreamDestroy;
+ if (strcmp("__hipPushCallConfiguration", name) == 0) return HIP_API_ID___hipPushCallConfiguration;
+ if (strcmp("hipMemset3DAsync", name) == 0) return HIP_API_ID_hipMemset3DAsync;
+ if (strcmp("hipDeviceGetPCIBusId", name) == 0) return HIP_API_ID_hipDeviceGetPCIBusId;
+ if (strcmp("hipInit", name) == 0) return HIP_API_ID_hipInit;
+ if (strcmp("hipMemcpyAtoH", name) == 0) return HIP_API_ID_hipMemcpyAtoH;
+ if (strcmp("hipStreamGetPriority", name) == 0) return HIP_API_ID_hipStreamGetPriority;
+ if (strcmp("hipMemset2D", name) == 0) return HIP_API_ID_hipMemset2D;
+ if (strcmp("hipMemset2DAsync", name) == 0) return HIP_API_ID_hipMemset2DAsync;
+ if (strcmp("hipDeviceCanAccessPeer", name) == 0) return HIP_API_ID_hipDeviceCanAccessPeer;
+ if (strcmp("hipLaunchByPtr", name) == 0) return HIP_API_ID_hipLaunchByPtr;
+ if (strcmp("hipMemPrefetchAsync", name) == 0) return HIP_API_ID_hipMemPrefetchAsync;
+ if (strcmp("hipCtxDestroy", name) == 0) return HIP_API_ID_hipCtxDestroy;
+ if (strcmp("hipMemsetD16Async", name) == 0) return HIP_API_ID_hipMemsetD16Async;
+ if (strcmp("hipModuleUnload", name) == 0) return HIP_API_ID_hipModuleUnload;
+ if (strcmp("hipHostUnregister", name) == 0) return HIP_API_ID_hipHostUnregister;
+ if (strcmp("hipProfilerStop", name) == 0) return HIP_API_ID_hipProfilerStop;
+ if (strcmp("hipExtStreamCreateWithCUMask", name) == 0) return HIP_API_ID_hipExtStreamCreateWithCUMask;
+ if (strcmp("hipStreamSynchronize", name) == 0) return HIP_API_ID_hipStreamSynchronize;
+ if (strcmp("hipFreeHost", name) == 0) return HIP_API_ID_hipFreeHost;
+ if (strcmp("hipDeviceSetCacheConfig", name) == 0) return HIP_API_ID_hipDeviceSetCacheConfig;
+ if (strcmp("hipGetErrorName", name) == 0) return HIP_API_ID_hipGetErrorName;
+ if (strcmp("hipMemcpyHtoD", name) == 0) return HIP_API_ID_hipMemcpyHtoD;
+ if (strcmp("hipModuleGetGlobal", name) == 0) return HIP_API_ID_hipModuleGetGlobal;
+ if (strcmp("hipMemcpyHtoA", name) == 0) return HIP_API_ID_hipMemcpyHtoA;
+ if (strcmp("hipCtxCreate", name) == 0) return HIP_API_ID_hipCtxCreate;
+ if (strcmp("hipMemcpy2D", name) == 0) return HIP_API_ID_hipMemcpy2D;
+ if (strcmp("hipIpcCloseMemHandle", name) == 0) return HIP_API_ID_hipIpcCloseMemHandle;
+ if (strcmp("hipChooseDevice", name) == 0) return HIP_API_ID_hipChooseDevice;
+ if (strcmp("hipDeviceSetSharedMemConfig", name) == 0) return HIP_API_ID_hipDeviceSetSharedMemConfig;
+ if (strcmp("hipMallocMipmappedArray", name) == 0) return HIP_API_ID_hipMallocMipmappedArray;
+ if (strcmp("hipSetupArgument", name) == 0) return HIP_API_ID_hipSetupArgument;
+ if (strcmp("hipIpcGetEventHandle", name) == 0) return HIP_API_ID_hipIpcGetEventHandle;
+ if (strcmp("hipFreeArray", name) == 0) return HIP_API_ID_hipFreeArray;
+ if (strcmp("hipCtxSetCacheConfig", name) == 0) return HIP_API_ID_hipCtxSetCacheConfig;
+ if (strcmp("hipFuncSetCacheConfig", name) == 0) return HIP_API_ID_hipFuncSetCacheConfig;
+ if (strcmp("hipLaunchKernel", name) == 0) return HIP_API_ID_hipLaunchKernel;
+ if (strcmp("hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", name) == 0) return HIP_API_ID_hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags;
+ if (strcmp("hipModuleGetTexRef", name) == 0) return HIP_API_ID_hipModuleGetTexRef;
+ if (strcmp("hipFuncSetAttribute", name) == 0) return HIP_API_ID_hipFuncSetAttribute;
+ if (strcmp("hipEventElapsedTime", name) == 0) return HIP_API_ID_hipEventElapsedTime;
+ if (strcmp("hipConfigureCall", name) == 0) return HIP_API_ID_hipConfigureCall;
+ if (strcmp("hipMemAdvise", name) == 0) return HIP_API_ID_hipMemAdvise;
+ if (strcmp("hipMemcpy3DAsync", name) == 0) return HIP_API_ID_hipMemcpy3DAsync;
+ if (strcmp("hipEventDestroy", name) == 0) return HIP_API_ID_hipEventDestroy;
+ if (strcmp("hipCtxPopCurrent", name) == 0) return HIP_API_ID_hipCtxPopCurrent;
+ if (strcmp("hipGetSymbolAddress", name) == 0) return HIP_API_ID_hipGetSymbolAddress;
+ if (strcmp("hipHostGetFlags", name) == 0) return HIP_API_ID_hipHostGetFlags;
+ if (strcmp("hipHostMalloc", name) == 0) return HIP_API_ID_hipHostMalloc;
+ if (strcmp("hipCtxSetSharedMemConfig", name) == 0) return HIP_API_ID_hipCtxSetSharedMemConfig;
+ if (strcmp("hipFreeMipmappedArray", name) == 0) return HIP_API_ID_hipFreeMipmappedArray;
+ if (strcmp("hipMemGetInfo", name) == 0) return HIP_API_ID_hipMemGetInfo;
+ if (strcmp("hipDeviceReset", name) == 0) return HIP_API_ID_hipDeviceReset;
+ if (strcmp("hipMemset", name) == 0) return HIP_API_ID_hipMemset;
+ if (strcmp("hipMemsetD8", name) == 0) return HIP_API_ID_hipMemsetD8;
+ if (strcmp("hipMemcpyParam2DAsync", name) == 0) return HIP_API_ID_hipMemcpyParam2DAsync;
+ if (strcmp("hipHostRegister", name) == 0) return HIP_API_ID_hipHostRegister;
+ if (strcmp("hipDriverGetVersion", name) == 0) return HIP_API_ID_hipDriverGetVersion;
+ if (strcmp("hipArray3DCreate", name) == 0) return HIP_API_ID_hipArray3DCreate;
+ if (strcmp("hipIpcOpenMemHandle", name) == 0) return HIP_API_ID_hipIpcOpenMemHandle;
+ if (strcmp("hipGetLastError", name) == 0) return HIP_API_ID_hipGetLastError;
+ if (strcmp("hipGetDeviceFlags", name) == 0) return HIP_API_ID_hipGetDeviceFlags;
+ if (strcmp("hipDeviceGetSharedMemConfig", name) == 0) return HIP_API_ID_hipDeviceGetSharedMemConfig;
+ if (strcmp("hipDrvMemcpy3D", name) == 0) return HIP_API_ID_hipDrvMemcpy3D;
+ if (strcmp("hipMemcpy2DFromArray", name) == 0) return HIP_API_ID_hipMemcpy2DFromArray;
+ if (strcmp("hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", name) == 0) return HIP_API_ID_hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags;
+ if (strcmp("hipSetDeviceFlags", name) == 0) return HIP_API_ID_hipSetDeviceFlags;
+ if (strcmp("hipHccModuleLaunchKernel", name) == 0) return HIP_API_ID_hipHccModuleLaunchKernel;
+ if (strcmp("hipFree", name) == 0) return HIP_API_ID_hipFree;
+ if (strcmp("hipOccupancyMaxPotentialBlockSize", name) == 0) return HIP_API_ID_hipOccupancyMaxPotentialBlockSize;
+ if (strcmp("hipDeviceGetAttribute", name) == 0) return HIP_API_ID_hipDeviceGetAttribute;
+ if (strcmp("hipDeviceComputeCapability", name) == 0) return HIP_API_ID_hipDeviceComputeCapability;
+ if (strcmp("hipCtxDisablePeerAccess", name) == 0) return HIP_API_ID_hipCtxDisablePeerAccess;
+ if (strcmp("hipMallocManaged", name) == 0) return HIP_API_ID_hipMallocManaged;
+ if (strcmp("hipDeviceGetByPCIBusId", name) == 0) return HIP_API_ID_hipDeviceGetByPCIBusId;
+ if (strcmp("hipIpcGetMemHandle", name) == 0) return HIP_API_ID_hipIpcGetMemHandle;
+ if (strcmp("hipMemcpyHtoDAsync", name) == 0) return HIP_API_ID_hipMemcpyHtoDAsync;
+ if (strcmp("hipCtxGetDevice", name) == 0) return HIP_API_ID_hipCtxGetDevice;
+ if (strcmp("hipMemcpyDtoD", name) == 0) return HIP_API_ID_hipMemcpyDtoD;
+ if (strcmp("hipModuleLoadData", name) == 0) return HIP_API_ID_hipModuleLoadData;
+ if (strcmp("hipDevicePrimaryCtxRelease", name) == 0) return HIP_API_ID_hipDevicePrimaryCtxRelease;
+ if (strcmp("hipOccupancyMaxActiveBlocksPerMultiprocessor", name) == 0) return HIP_API_ID_hipOccupancyMaxActiveBlocksPerMultiprocessor;
+ if (strcmp("hipCtxSetCurrent", name) == 0) return HIP_API_ID_hipCtxSetCurrent;
+ if (strcmp("hipGetErrorString", name) == 0) return HIP_API_ID_hipGetErrorString;
+ if (strcmp("hipStreamCreate", name) == 0) return HIP_API_ID_hipStreamCreate;
+ if (strcmp("hipDevicePrimaryCtxRetain", name) == 0) return HIP_API_ID_hipDevicePrimaryCtxRetain;
+ if (strcmp("hipDeviceGet", name) == 0) return HIP_API_ID_hipDeviceGet;
+ if (strcmp("hipStreamCreateWithFlags", name) == 0) return HIP_API_ID_hipStreamCreateWithFlags;
+ if (strcmp("hipMemcpyFromArray", name) == 0) return HIP_API_ID_hipMemcpyFromArray;
+ if (strcmp("hipMemcpy2DAsync", name) == 0) return HIP_API_ID_hipMemcpy2DAsync;
+ if (strcmp("hipFuncGetAttributes", name) == 0) return HIP_API_ID_hipFuncGetAttributes;
+ if (strcmp("hipGetSymbolSize", name) == 0) return HIP_API_ID_hipGetSymbolSize;
+ if (strcmp("hipHostFree", name) == 0) return HIP_API_ID_hipHostFree;
+ if (strcmp("hipEventCreateWithFlags", name) == 0) return HIP_API_ID_hipEventCreateWithFlags;
+ if (strcmp("hipStreamQuery", name) == 0) return HIP_API_ID_hipStreamQuery;
+ if (strcmp("hipMemcpy3D", name) == 0) return HIP_API_ID_hipMemcpy3D;
+ if (strcmp("hipMemcpyToSymbol", name) == 0) return HIP_API_ID_hipMemcpyToSymbol;
+ if (strcmp("hipMemcpy", name) == 0) return HIP_API_ID_hipMemcpy;
+ if (strcmp("hipPeekAtLastError", name) == 0) return HIP_API_ID_hipPeekAtLastError;
+ if (strcmp("hipExtLaunchMultiKernelMultiDevice", name) == 0) return HIP_API_ID_hipExtLaunchMultiKernelMultiDevice;
+ if (strcmp("hipHostAlloc", name) == 0) return HIP_API_ID_hipHostAlloc;
+ if (strcmp("hipStreamAddCallback", name) == 0) return HIP_API_ID_hipStreamAddCallback;
+ if (strcmp("hipMemcpyToArray", name) == 0) return HIP_API_ID_hipMemcpyToArray;
+ if (strcmp("hipMemsetD32", name) == 0) return HIP_API_ID_hipMemsetD32;
+ if (strcmp("hipExtModuleLaunchKernel", name) == 0) return HIP_API_ID_hipExtModuleLaunchKernel;
+ if (strcmp("hipDeviceSynchronize", name) == 0) return HIP_API_ID_hipDeviceSynchronize;
+ if (strcmp("hipDeviceGetCacheConfig", name) == 0) return HIP_API_ID_hipDeviceGetCacheConfig;
+ if (strcmp("hipMalloc3D", name) == 0) return HIP_API_ID_hipMalloc3D;
+ if (strcmp("hipPointerGetAttributes", name) == 0) return HIP_API_ID_hipPointerGetAttributes;
+ if (strcmp("hipMemsetAsync", name) == 0) return HIP_API_ID_hipMemsetAsync;
+ if (strcmp("hipDeviceGetName", name) == 0) return HIP_API_ID_hipDeviceGetName;
+ if (strcmp("hipModuleOccupancyMaxPotentialBlockSizeWithFlags", name) == 0) return HIP_API_ID_hipModuleOccupancyMaxPotentialBlockSizeWithFlags;
+ if (strcmp("hipCtxPushCurrent", name) == 0) return HIP_API_ID_hipCtxPushCurrent;
+ if (strcmp("hipMemcpyPeer", name) == 0) return HIP_API_ID_hipMemcpyPeer;
+ if (strcmp("hipEventSynchronize", name) == 0) return HIP_API_ID_hipEventSynchronize;
+ if (strcmp("hipMemcpyDtoDAsync", name) == 0) return HIP_API_ID_hipMemcpyDtoDAsync;
+ if (strcmp("hipProfilerStart", name) == 0) return HIP_API_ID_hipProfilerStart;
+ if (strcmp("hipExtMallocWithFlags", name) == 0) return HIP_API_ID_hipExtMallocWithFlags;
+ if (strcmp("hipCtxEnablePeerAccess", name) == 0) return HIP_API_ID_hipCtxEnablePeerAccess;
+ if (strcmp("hipMemAllocHost", name) == 0) return HIP_API_ID_hipMemAllocHost;
+ if (strcmp("hipMemcpyDtoHAsync", name) == 0) return HIP_API_ID_hipMemcpyDtoHAsync;
+ if (strcmp("hipModuleLaunchKernel", name) == 0) return HIP_API_ID_hipModuleLaunchKernel;
+ if (strcmp("hipMemAllocPitch", name) == 0) return HIP_API_ID_hipMemAllocPitch;
+ if (strcmp("hipExtLaunchKernel", name) == 0) return HIP_API_ID_hipExtLaunchKernel;
+ if (strcmp("hipMemcpy2DFromArrayAsync", name) == 0) return HIP_API_ID_hipMemcpy2DFromArrayAsync;
+ if (strcmp("hipDeviceGetLimit", name) == 0) return HIP_API_ID_hipDeviceGetLimit;
+ if (strcmp("hipModuleLoadDataEx", name) == 0) return HIP_API_ID_hipModuleLoadDataEx;
+ if (strcmp("hipRuntimeGetVersion", name) == 0) return HIP_API_ID_hipRuntimeGetVersion;
+ if (strcmp("hipMemRangeGetAttribute", name) == 0) return HIP_API_ID_hipMemRangeGetAttribute;
+ if (strcmp("hipDeviceGetP2PAttribute", name) == 0) return HIP_API_ID_hipDeviceGetP2PAttribute;
+ if (strcmp("hipMemcpyPeerAsync", name) == 0) return HIP_API_ID_hipMemcpyPeerAsync;
+ if (strcmp("hipGetDeviceProperties", name) == 0) return HIP_API_ID_hipGetDeviceProperties;
+ if (strcmp("hipMemcpyDtoH", name) == 0) return HIP_API_ID_hipMemcpyDtoH;
+ if (strcmp("hipMemcpyWithStream", name) == 0) return HIP_API_ID_hipMemcpyWithStream;
+ if (strcmp("hipDeviceTotalMem", name) == 0) return HIP_API_ID_hipDeviceTotalMem;
+ if (strcmp("hipHostGetDevicePointer", name) == 0) return HIP_API_ID_hipHostGetDevicePointer;
+ if (strcmp("hipMemRangeGetAttributes", name) == 0) return HIP_API_ID_hipMemRangeGetAttributes;
+ if (strcmp("hipMemcpyParam2D", name) == 0) return HIP_API_ID_hipMemcpyParam2D;
+ if (strcmp("hipDevicePrimaryCtxReset", name) == 0) return HIP_API_ID_hipDevicePrimaryCtxReset;
+ if (strcmp("hipGetMipmappedArrayLevel", name) == 0) return HIP_API_ID_hipGetMipmappedArrayLevel;
+ if (strcmp("hipMemsetD32Async", name) == 0) return HIP_API_ID_hipMemsetD32Async;
+ if (strcmp("hipGetDevice", name) == 0) return HIP_API_ID_hipGetDevice;
+ if (strcmp("hipGetDeviceCount", name) == 0) return HIP_API_ID_hipGetDeviceCount;
+ if (strcmp("hipIpcOpenEventHandle", name) == 0) return HIP_API_ID_hipIpcOpenEventHandle;
+ return HIP_API_ID_NUMBER;
+}
+
+// HIP API callbacks data structure
+typedef struct hip_api_data_s {
+ uint64_t correlation_id;
+ uint32_t phase;
+ union {
+ struct {
+ const HIP_MEMCPY3D* pCopy;
+ HIP_MEMCPY3D pCopy__val;
+ hipStream_t stream;
+ } hipDrvMemcpy3DAsync;
+ struct {
+ int peerDeviceId;
+ unsigned int flags;
+ } hipDeviceEnablePeerAccess;
+ struct {
+ const void* func;
+ hipSharedMemConfig config;
+ } hipFuncSetSharedMemConfig;
+ struct {
+ const void* symbol;
+ const void* src;
+ size_t sizeBytes;
+ size_t offset;
+ hipMemcpyKind kind;
+ hipStream_t stream;
+ } hipMemcpyToSymbolAsync;
+ struct {
+ void** ptr;
+ void* ptr__val;
+ size_t* pitch;
+ size_t pitch__val;
+ size_t width;
+ size_t height;
+ } hipMallocPitch;
+ struct {
+ void** ptr;
+ void* ptr__val;
+ size_t size;
+ } hipMalloc;
+ struct {
+ hipDeviceptr_t dest;
+ unsigned short value;
+ size_t count;
+ } hipMemsetD16;
+ struct {
+ hipStream_t stream;
+ unsigned int cuMaskSize;
+ unsigned int* cuMask;
+ unsigned int cuMask__val;
+ } hipExtStreamGetCUMask;
+ struct {
+ hipEvent_t event;
+ hipStream_t stream;
+ } hipEventRecord;
+ struct {
+ int deviceId;
+ } hipSetDevice;
+ struct {
+ hipCtx_t ctx;
+ int* apiVersion;
+ int apiVersion__val;
+ } hipCtxGetApiVersion;
+ struct {
+ void* dst;
+ const void* symbol;
+ size_t sizeBytes;
+ size_t offset;
+ hipMemcpyKind kind;
+ hipStream_t stream;
+ } hipMemcpyFromSymbolAsync;
+ struct {
+ int device1;
+ int device2;
+ unsigned int* linktype;
+ unsigned int linktype__val;
+ unsigned int* hopcount;
+ unsigned int hopcount__val;
+ } hipExtGetLinkTypeAndHopCount;
+ struct {
+ dim3* gridDim;
+ dim3 gridDim__val;
+ dim3* blockDim;
+ dim3 blockDim__val;
+ size_t* sharedMem;
+ size_t sharedMem__val;
+ hipStream_t* stream;
+ hipStream_t stream__val;
+ } __hipPopCallConfiguration;
+ struct {
+ int* numBlocks;
+ int numBlocks__val;
+ hipFunction_t f;
+ int blockSize;
+ size_t dynSharedMemPerBlk;
+ } hipModuleOccupancyMaxActiveBlocksPerMultiprocessor;
+ struct {
+ hipPitchedPtr pitchedDevPtr;
+ int value;
+ hipExtent extent;
+ } hipMemset3D;
+ struct {
+ hipStream_t* stream;
+ hipStream_t stream__val;
+ unsigned int flags;
+ int priority;
+ } hipStreamCreateWithPriority;
+ struct {
+ hipArray* dst;
+ hipArray dst__val;
+ size_t wOffset;
+ size_t hOffset;
+ const void* src;
+ size_t spitch;
+ size_t width;
+ size_t height;
+ hipMemcpyKind kind;
+ } hipMemcpy2DToArray;
+ struct {
+ hipDeviceptr_t dest;
+ unsigned char value;
+ size_t count;
+ hipStream_t stream;
+ } hipMemsetD8Async;
+ struct {
+ hipFuncCache_t* cacheConfig;
+ hipFuncCache_t cacheConfig__val;
+ } hipCtxGetCacheConfig;
+ struct {
+ hipFunction_t* function;
+ hipFunction_t function__val;
+ hipModule_t module;
+ const char* kname;
+ char kname__val;
+ } hipModuleGetFunction;
+ struct {
+ hipStream_t stream;
+ hipEvent_t event;
+ unsigned int flags;
+ } hipStreamWaitEvent;
+ struct {
+ int* leastPriority;
+ int leastPriority__val;
+ int* greatestPriority;
+ int greatestPriority__val;
+ } hipDeviceGetStreamPriorityRange;
+ struct {
+ hipModule_t* module;
+ hipModule_t module__val;
+ const char* fname;
+ char fname__val;
+ } hipModuleLoad;
+ struct {
+ hipDevice_t dev;
+ unsigned int flags;
+ } hipDevicePrimaryCtxSetFlags;
+ struct {
+ const void* f;
+ dim3 gridDim;
+ dim3 blockDimX;
+ void** kernelParams;
+ void* kernelParams__val;
+ unsigned int sharedMemBytes;
+ hipStream_t stream;
+ } hipLaunchCooperativeKernel;
+ struct {
+ hipLaunchParams* launchParamsList;
+ hipLaunchParams launchParamsList__val;
+ int numDevices;
+ unsigned int flags;
+ } hipLaunchCooperativeKernelMultiDevice;
+ struct {
+ void* dst;
+ const void* src;
+ size_t sizeBytes;
+ hipMemcpyKind kind;
+ hipStream_t stream;
+ } hipMemcpyAsync;
+ struct {
+ hipArray_t* array;
+ hipArray_t array__val;
+ const hipChannelFormatDesc* desc;
+ hipChannelFormatDesc desc__val;
+ hipExtent extent;
+ unsigned int flags;
+ } hipMalloc3DArray;
+ struct {
+ void** ptr;
+ void* ptr__val;
+ size_t size;
+ } hipMallocHost;
+ struct {
+ hipCtx_t* ctx;
+ hipCtx_t ctx__val;
+ } hipCtxGetCurrent;
+ struct {
+ hipDevice_t dev;
+ unsigned int* flags;
+ unsigned int flags__val;
+ int* active;
+ int active__val;
+ } hipDevicePrimaryCtxGetState;
+ struct {
+ hipEvent_t event;
+ } hipEventQuery;
+ struct {
+ hipEvent_t* event;
+ hipEvent_t event__val;
+ } hipEventCreate;
+ struct {
+ hipDeviceptr_t* pbase;
+ hipDeviceptr_t pbase__val;
+ size_t* psize;
+ size_t psize__val;
+ hipDeviceptr_t dptr;
+ } hipMemGetAddressRange;
+ struct {
+ void* dst;
+ const void* symbol;
+ size_t sizeBytes;
+ size_t offset;
+ hipMemcpyKind kind;
+ } hipMemcpyFromSymbol;
+ struct {
+ hipArray** pHandle;
+ hipArray* pHandle__val;
+ const HIP_ARRAY_DESCRIPTOR* pAllocateArray;
+ HIP_ARRAY_DESCRIPTOR pAllocateArray__val;
+ } hipArrayCreate;
+ struct {
+ hipStream_t stream;
+ hipDeviceptr_t* dev_ptr;
+ hipDeviceptr_t dev_ptr__val;
+ size_t length;
+ unsigned int flags;
+ } hipStreamAttachMemAsync;
+ struct {
+ hipStream_t stream;
+ unsigned int* flags;
+ unsigned int flags__val;
+ } hipStreamGetFlags;
+ struct {
+ hipArray** array;
+ hipArray* array__val;
+ const hipChannelFormatDesc* desc;
+ hipChannelFormatDesc desc__val;
+ size_t width;
+ size_t height;
+ unsigned int flags;
+ } hipMallocArray;
+ struct {
+ hipSharedMemConfig* pConfig;
+ hipSharedMemConfig pConfig__val;
+ } hipCtxGetSharedMemConfig;
+ struct {
+ int peerDeviceId;
+ } hipDeviceDisablePeerAccess;
+ struct {
+ int* gridSize;
+ int gridSize__val;
+ int* blockSize;
+ int blockSize__val;
+ hipFunction_t f;
+ size_t dynSharedMemPerBlk;
+ int blockSizeLimit;
+ } hipModuleOccupancyMaxPotentialBlockSize;
+ struct {
+ void* ptr;
+ size_t* size;
+ size_t size__val;
+ } hipMemPtrGetInfo;
+ struct {
+ int* value;
+ int value__val;
+ hipFunction_attribute attrib;
+ hipFunction_t hfunc;
+ } hipFuncGetAttribute;
+ struct {
+ unsigned int* flags;
+ unsigned int flags__val;
+ } hipCtxGetFlags;
+ struct {
+ hipStream_t stream;
+ } hipStreamDestroy;
+ struct {
+ dim3 gridDim;
+ dim3 blockDim;
+ size_t sharedMem;
+ hipStream_t stream;
+ } __hipPushCallConfiguration;
+ struct {
+ hipPitchedPtr pitchedDevPtr;
+ int value;
+ hipExtent extent;
+ hipStream_t stream;
+ } hipMemset3DAsync;
+ struct {
+ char* pciBusId;
+ char pciBusId__val;
+ int len;
+ int device;
+ } hipDeviceGetPCIBusId;
+ struct {
+ unsigned int flags;
+ } hipInit;
+ struct {
+ void* dst;
+ hipArray* srcArray;
+ hipArray srcArray__val;
+ size_t srcOffset;
+ size_t count;
+ } hipMemcpyAtoH;
+ struct {
+ hipStream_t stream;
+ int* priority;
+ int priority__val;
+ } hipStreamGetPriority;
+ struct {
+ void* dst;
+ size_t pitch;
+ int value;
+ size_t width;
+ size_t height;
+ } hipMemset2D;
+ struct {
+ void* dst;
+ size_t pitch;
+ int value;
+ size_t width;
+ size_t height;
+ hipStream_t stream;
+ } hipMemset2DAsync;
+ struct {
+ int* canAccessPeer;
+ int canAccessPeer__val;
+ int deviceId;
+ int peerDeviceId;
+ } hipDeviceCanAccessPeer;
+ struct {
+ const void* hostFunction;
+ } hipLaunchByPtr;
+ struct {
+ const void* dev_ptr;
+ size_t count;
+ int device;
+ hipStream_t stream;
+ } hipMemPrefetchAsync;
+ struct {
+ hipCtx_t ctx;
+ } hipCtxDestroy;
+ struct {
+ hipDeviceptr_t dest;
+ unsigned short value;
+ size_t count;
+ hipStream_t stream;
+ } hipMemsetD16Async;
+ struct {
+ hipModule_t module;
+ } hipModuleUnload;
+ struct {
+ void* hostPtr;
+ } hipHostUnregister;
+ struct {
+ hipStream_t* stream;
+ hipStream_t stream__val;
+ unsigned int cuMaskSize;
+ const unsigned int* cuMask;
+ unsigned int cuMask__val;
+ } hipExtStreamCreateWithCUMask;
+ struct {
+ hipStream_t stream;
+ } hipStreamSynchronize;
+ struct {
+ void* ptr;
+ } hipFreeHost;
+ struct {
+ hipFuncCache_t cacheConfig;
+ } hipDeviceSetCacheConfig;
+ struct {
+ hipDeviceptr_t dst;
+ void* src;
+ size_t sizeBytes;
+ } hipMemcpyHtoD;
+ struct {
+ hipDeviceptr_t* dptr;
+ hipDeviceptr_t dptr__val;
+ size_t* bytes;
+ size_t bytes__val;
+ hipModule_t hmod;
+ const char* name;
+ char name__val;
+ } hipModuleGetGlobal;
+ struct {
+ hipArray* dstArray;
+ hipArray dstArray__val;
+ size_t dstOffset;
+ const void* srcHost;
+ size_t count;
+ } hipMemcpyHtoA;
+ struct {
+ hipCtx_t* ctx;
+ hipCtx_t ctx__val;
+ unsigned int flags;
+ hipDevice_t device;
+ } hipCtxCreate;
+ struct {
+ void* dst;
+ size_t dpitch;
+ const void* src;
+ size_t spitch;
+ size_t width;
+ size_t height;
+ hipMemcpyKind kind;
+ } hipMemcpy2D;
+ struct {
+ void* devPtr;
+ } hipIpcCloseMemHandle;
+ struct {
+ int* device;
+ int device__val;
+ const hipDeviceProp_t* prop;
+ hipDeviceProp_t prop__val;
+ } hipChooseDevice;
+ struct {
+ hipSharedMemConfig config;
+ } hipDeviceSetSharedMemConfig;
+ struct {
+ hipMipmappedArray_t* mipmappedArray;
+ hipMipmappedArray_t mipmappedArray__val;
+ const hipChannelFormatDesc* desc;
+ hipChannelFormatDesc desc__val;
+ hipExtent extent;
+ unsigned int numLevels;
+ unsigned int flags;
+ } hipMallocMipmappedArray;
+ struct {
+ const void* arg;
+ size_t size;
+ size_t offset;
+ } hipSetupArgument;
+ struct {
+ hipIpcEventHandle_t* handle;
+ hipIpcEventHandle_t handle__val;
+ hipEvent_t event;
+ } hipIpcGetEventHandle;
+ struct {
+ hipArray* array;
+ hipArray array__val;
+ } hipFreeArray;
+ struct {
+ hipFuncCache_t cacheConfig;
+ } hipCtxSetCacheConfig;
+ struct {
+ const void* func;
+ hipFuncCache_t config;
+ } hipFuncSetCacheConfig;
+ struct {
+ const void* function_address;
+ dim3 numBlocks;
+ dim3 dimBlocks;
+ void** args;
+ void* args__val;
+ size_t sharedMemBytes;
+ hipStream_t stream;
+ } hipLaunchKernel;
+ struct {
+ int* numBlocks;
+ int numBlocks__val;
+ hipFunction_t f;
+ int blockSize;
+ size_t dynSharedMemPerBlk;
+ unsigned int flags;
+ } hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags;
+ struct {
+ textureReference** texRef;
+ textureReference* texRef__val;
+ hipModule_t hmod;
+ const char* name;
+ char name__val;
+ } hipModuleGetTexRef;
+ struct {
+ const void* func;
+ hipFuncAttribute attr;
+ int value;
+ } hipFuncSetAttribute;
+ struct {
+ float* ms;
+ float ms__val;
+ hipEvent_t start;
+ hipEvent_t stop;
+ } hipEventElapsedTime;
+ struct {
+ dim3 gridDim;
+ dim3 blockDim;
+ size_t sharedMem;
+ hipStream_t stream;
+ } hipConfigureCall;
+ struct {
+ const void* dev_ptr;
+ size_t count;
+ hipMemoryAdvise advice;
+ int device;
+ } hipMemAdvise;
+ struct {
+ const hipMemcpy3DParms* p;
+ hipMemcpy3DParms p__val;
+ hipStream_t stream;
+ } hipMemcpy3DAsync;
+ struct {
+ hipEvent_t event;
+ } hipEventDestroy;
+ struct {
+ hipCtx_t* ctx;
+ hipCtx_t ctx__val;
+ } hipCtxPopCurrent;
+ struct {
+ void** devPtr;
+ void* devPtr__val;
+ const void* symbol;
+ } hipGetSymbolAddress;
+ struct {
+ unsigned int* flagsPtr;
+ unsigned int flagsPtr__val;
+ void* hostPtr;
+ } hipHostGetFlags;
+ struct {
+ void** ptr;
+ void* ptr__val;
+ size_t size;
+ unsigned int flags;
+ } hipHostMalloc;
+ struct {
+ hipSharedMemConfig config;
+ } hipCtxSetSharedMemConfig;
+ struct {
+ hipMipmappedArray_t mipmappedArray;
+ } hipFreeMipmappedArray;
+ struct {
+ size_t* free;
+ size_t free__val;
+ size_t* total;
+ size_t total__val;
+ } hipMemGetInfo;
+ struct {
+ void* dst;
+ int value;
+ size_t sizeBytes;
+ } hipMemset;
+ struct {
+ hipDeviceptr_t dest;
+ unsigned char value;
+ size_t count;
+ } hipMemsetD8;
+ struct {
+ const hip_Memcpy2D* pCopy;
+ hip_Memcpy2D pCopy__val;
+ hipStream_t stream;
+ } hipMemcpyParam2DAsync;
+ struct {
+ void* hostPtr;
+ size_t sizeBytes;
+ unsigned int flags;
+ } hipHostRegister;
+ struct {
+ int* driverVersion;
+ int driverVersion__val;
+ } hipDriverGetVersion;
+ struct {
+ hipArray** array;
+ hipArray* array__val;
+ const HIP_ARRAY3D_DESCRIPTOR* pAllocateArray;
+ HIP_ARRAY3D_DESCRIPTOR pAllocateArray__val;
+ } hipArray3DCreate;
+ struct {
+ void** devPtr;
+ void* devPtr__val;
+ hipIpcMemHandle_t handle;
+ unsigned int flags;
+ } hipIpcOpenMemHandle;
+ struct {
+ unsigned int* flags;
+ unsigned int flags__val;
+ } hipGetDeviceFlags;
+ struct {
+ hipSharedMemConfig* pConfig;
+ hipSharedMemConfig pConfig__val;
+ } hipDeviceGetSharedMemConfig;
+ struct {
+ const HIP_MEMCPY3D* pCopy;
+ HIP_MEMCPY3D pCopy__val;
+ } hipDrvMemcpy3D;
+ struct {
+ void* dst;
+ size_t dpitch;
+ hipArray_const_t src;
+ size_t wOffset;
+ size_t hOffset;
+ size_t width;
+ size_t height;
+ hipMemcpyKind kind;
+ } hipMemcpy2DFromArray;
+ struct {
+ int* numBlocks;
+ int numBlocks__val;
+ const void* f;
+ int blockSize;
+ size_t dynamicSMemSize;
+ unsigned int flags;
+ } hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags;
+ struct {
+ unsigned int flags;
+ } hipSetDeviceFlags;
+ struct {
+ hipFunction_t f;
+ unsigned int globalWorkSizeX;
+ unsigned int globalWorkSizeY;
+ unsigned int globalWorkSizeZ;
+ unsigned int blockDimX;
+ unsigned int blockDimY;
+ unsigned int blockDimZ;
+ size_t sharedMemBytes;
+ hipStream_t hStream;
+ void** kernelParams;
+ void* kernelParams__val;
+ void** extra;
+ void* extra__val;
+ hipEvent_t startEvent;
+ hipEvent_t stopEvent;
+ } hipHccModuleLaunchKernel;
+ struct {
+ void* ptr;
+ } hipFree;
+ struct {
+ int* gridSize;
+ int gridSize__val;
+ int* blockSize;
+ int blockSize__val;
+ const void* f;
+ size_t dynSharedMemPerBlk;
+ int blockSizeLimit;
+ } hipOccupancyMaxPotentialBlockSize;
+ struct {
+ int* pi;
+ int pi__val;
+ hipDeviceAttribute_t attr;
+ int deviceId;
+ } hipDeviceGetAttribute;
+ struct {
+ int* major;
+ int major__val;
+ int* minor;
+ int minor__val;
+ hipDevice_t device;
+ } hipDeviceComputeCapability;
+ struct {
+ hipCtx_t peerCtx;
+ } hipCtxDisablePeerAccess;
+ struct {
+ void** dev_ptr;
+ void* dev_ptr__val;
+ size_t size;
+ unsigned int flags;
+ } hipMallocManaged;
+ struct {
+ int* device;
+ int device__val;
+ const char* pciBusId;
+ char pciBusId__val;
+ } hipDeviceGetByPCIBusId;
+ struct {
+ hipIpcMemHandle_t* handle;
+ hipIpcMemHandle_t handle__val;
+ void* devPtr;
+ } hipIpcGetMemHandle;
+ struct {
+ hipDeviceptr_t dst;
+ void* src;
+ size_t sizeBytes;
+ hipStream_t stream;
+ } hipMemcpyHtoDAsync;
+ struct {
+ hipDevice_t* device;
+ hipDevice_t device__val;
+ } hipCtxGetDevice;
+ struct {
+ hipDeviceptr_t dst;
+ hipDeviceptr_t src;
+ size_t sizeBytes;
+ } hipMemcpyDtoD;
+ struct {
+ hipModule_t* module;
+ hipModule_t module__val;
+ const void* image;
+ } hipModuleLoadData;
+ struct {
+ hipDevice_t dev;
+ } hipDevicePrimaryCtxRelease;
+ struct {
+ int* numBlocks;
+ int numBlocks__val;
+ const void* f;
+ int blockSize;
+ size_t dynamicSMemSize;
+ } hipOccupancyMaxActiveBlocksPerMultiprocessor;
+ struct {
+ hipCtx_t ctx;
+ } hipCtxSetCurrent;
+ struct {
+ hipStream_t* stream;
+ hipStream_t stream__val;
+ } hipStreamCreate;
+ struct {
+ hipCtx_t* pctx;
+ hipCtx_t pctx__val;
+ hipDevice_t dev;
+ } hipDevicePrimaryCtxRetain;
+ struct {
+ hipDevice_t* device;
+ hipDevice_t device__val;
+ int ordinal;
+ } hipDeviceGet;
+ struct {
+ hipStream_t* stream;
+ hipStream_t stream__val;
+ unsigned int flags;
+ } hipStreamCreateWithFlags;
+ struct {
+ void* dst;
+ hipArray_const_t srcArray;
+ size_t wOffset;
+ size_t hOffset;
+ size_t count;
+ hipMemcpyKind kind;
+ } hipMemcpyFromArray;
+ struct {
+ void* dst;
+ size_t dpitch;
+ const void* src;
+ size_t spitch;
+ size_t width;
+ size_t height;
+ hipMemcpyKind kind;
+ hipStream_t stream;
+ } hipMemcpy2DAsync;
+ struct {
+ hipFuncAttributes* attr;
+ hipFuncAttributes attr__val;
+ const void* func;
+ } hipFuncGetAttributes;
+ struct {
+ size_t* size;
+ size_t size__val;
+ const void* symbol;
+ } hipGetSymbolSize;
+ struct {
+ void* ptr;
+ } hipHostFree;
+ struct {
+ hipEvent_t* event;
+ hipEvent_t event__val;
+ unsigned int flags;
+ } hipEventCreateWithFlags;
+ struct {
+ hipStream_t stream;
+ } hipStreamQuery;
+ struct {
+ const hipMemcpy3DParms* p;
+ hipMemcpy3DParms p__val;
+ } hipMemcpy3D;
+ struct {
+ const void* symbol;
+ const void* src;
+ size_t sizeBytes;
+ size_t offset;
+ hipMemcpyKind kind;
+ } hipMemcpyToSymbol;
+ struct {
+ void* dst;
+ const void* src;
+ size_t sizeBytes;
+ hipMemcpyKind kind;
+ } hipMemcpy;
+ struct {
+ hipLaunchParams* launchParamsList;
+ hipLaunchParams launchParamsList__val;
+ int numDevices;
+ unsigned int flags;
+ } hipExtLaunchMultiKernelMultiDevice;
+ struct {
+ void** ptr;
+ void* ptr__val;
+ size_t size;
+ unsigned int flags;
+ } hipHostAlloc;
+ struct {
+ hipStream_t stream;
+ hipStreamCallback_t callback;
+ void* userData;
+ unsigned int flags;
+ } hipStreamAddCallback;
+ struct {
+ hipArray* dst;
+ hipArray dst__val;
+ size_t wOffset;
+ size_t hOffset;
+ const void* src;
+ size_t count;
+ hipMemcpyKind kind;
+ } hipMemcpyToArray;
+ struct {
+ hipDeviceptr_t dest;
+ int value;
+ size_t count;
+ } hipMemsetD32;
+ struct {
+ hipFunction_t f;
+ unsigned int globalWorkSizeX;
+ unsigned int globalWorkSizeY;
+ unsigned int globalWorkSizeZ;
+ unsigned int localWorkSizeX;
+ unsigned int localWorkSizeY;
+ unsigned int localWorkSizeZ;
+ size_t sharedMemBytes;
+ hipStream_t hStream;
+ void** kernelParams;
+ void* kernelParams__val;
+ void** extra;
+ void* extra__val;
+ hipEvent_t startEvent;
+ hipEvent_t stopEvent;
+ unsigned int flags;
+ } hipExtModuleLaunchKernel;
+ struct {
+ hipFuncCache_t* cacheConfig;
+ hipFuncCache_t cacheConfig__val;
+ } hipDeviceGetCacheConfig;
+ struct {
+ hipPitchedPtr* pitchedDevPtr;
+ hipPitchedPtr pitchedDevPtr__val;
+ hipExtent extent;
+ } hipMalloc3D;
+ struct {
+ hipPointerAttribute_t* attributes;
+ hipPointerAttribute_t attributes__val;
+ const void* ptr;
+ } hipPointerGetAttributes;
+ struct {
+ void* dst;
+ int value;
+ size_t sizeBytes;
+ hipStream_t stream;
+ } hipMemsetAsync;
+ struct {
+ char* name;
+ char name__val;
+ int len;
+ hipDevice_t device;
+ } hipDeviceGetName;
+ struct {
+ int* gridSize;
+ int gridSize__val;
+ int* blockSize;
+ int blockSize__val;
+ hipFunction_t f;
+ size_t dynSharedMemPerBlk;
+ int blockSizeLimit;
+ unsigned int flags;
+ } hipModuleOccupancyMaxPotentialBlockSizeWithFlags;
+ struct {
+ hipCtx_t ctx;
+ } hipCtxPushCurrent;
+ struct {
+ void* dst;
+ int dstDeviceId;
+ const void* src;
+ int srcDeviceId;
+ size_t sizeBytes;
+ } hipMemcpyPeer;
+ struct {
+ hipEvent_t event;
+ } hipEventSynchronize;
+ struct {
+ hipDeviceptr_t dst;
+ hipDeviceptr_t src;
+ size_t sizeBytes;
+ hipStream_t stream;
+ } hipMemcpyDtoDAsync;
+ struct {
+ void** ptr;
+ void* ptr__val;
+ size_t sizeBytes;
+ unsigned int flags;
+ } hipExtMallocWithFlags;
+ struct {
+ hipCtx_t peerCtx;
+ unsigned int flags;
+ } hipCtxEnablePeerAccess;
+ struct {
+ void** ptr;
+ void* ptr__val;
+ size_t size;
+ } hipMemAllocHost;
+ struct {
+ void* dst;
+ hipDeviceptr_t src;
+ size_t sizeBytes;
+ hipStream_t stream;
+ } hipMemcpyDtoHAsync;
+ struct {
+ hipFunction_t f;
+ unsigned int gridDimX;
+ unsigned int gridDimY;
+ unsigned int gridDimZ;
+ unsigned int blockDimX;
+ unsigned int blockDimY;
+ unsigned int blockDimZ;
+ unsigned int sharedMemBytes;
+ hipStream_t stream;
+ void** kernelParams;
+ void* kernelParams__val;
+ void** extra;
+ void* extra__val;
+ } hipModuleLaunchKernel;
+ struct {
+ hipDeviceptr_t* dptr;
+ hipDeviceptr_t dptr__val;
+ size_t* pitch;
+ size_t pitch__val;
+ size_t widthInBytes;
+ size_t height;
+ unsigned int elementSizeBytes;
+ } hipMemAllocPitch;
+ struct {
+ const void* function_address;
+ dim3 numBlocks;
+ dim3 dimBlocks;
+ void** args;
+ void* args__val;
+ size_t sharedMemBytes;
+ hipStream_t stream;
+ hipEvent_t startEvent;
+ hipEvent_t stopEvent;
+ int flags;
+ } hipExtLaunchKernel;
+ struct {
+ void* dst;
+ size_t dpitch;
+ hipArray_const_t src;
+ size_t wOffset;
+ size_t hOffset;
+ size_t width;
+ size_t height;
+ hipMemcpyKind kind;
+ hipStream_t stream;
+ } hipMemcpy2DFromArrayAsync;
+ struct {
+ size_t* pValue;
+ size_t pValue__val;
+ enum hipLimit_t limit;
+ } hipDeviceGetLimit;
+ struct {
+ hipModule_t* module;
+ hipModule_t module__val;
+ const void* image;
+ unsigned int numOptions;
+ hipJitOption* options;
+ hipJitOption options__val;
+ void** optionsValues;
+ void* optionsValues__val;
+ } hipModuleLoadDataEx;
+ struct {
+ int* runtimeVersion;
+ int runtimeVersion__val;
+ } hipRuntimeGetVersion;
+ struct {
+ void* data;
+ size_t data_size;
+ hipMemRangeAttribute attribute;
+ const void* dev_ptr;
+ size_t count;
+ } hipMemRangeGetAttribute;
+ struct {
+ int* value;
+ int value__val;
+ hipDeviceP2PAttr attr;
+ int srcDevice;
+ int dstDevice;
+ } hipDeviceGetP2PAttribute;
+ struct {
+ void* dst;
+ int dstDeviceId;
+ const void* src;
+ int srcDevice;
+ size_t sizeBytes;
+ hipStream_t stream;
+ } hipMemcpyPeerAsync;
+ struct {
+ hipDeviceProp_t* props;
+ hipDeviceProp_t props__val;
+ hipDevice_t device;
+ } hipGetDeviceProperties;
+ struct {
+ void* dst;
+ hipDeviceptr_t src;
+ size_t sizeBytes;
+ } hipMemcpyDtoH;
+ struct {
+ void* dst;
+ const void* src;
+ size_t sizeBytes;
+ hipMemcpyKind kind;
+ hipStream_t stream;
+ } hipMemcpyWithStream;
+ struct {
+ size_t* bytes;
+ size_t bytes__val;
+ hipDevice_t device;
+ } hipDeviceTotalMem;
+ struct {
+ void** devPtr;
+ void* devPtr__val;
+ void* hstPtr;
+ unsigned int flags;
+ } hipHostGetDevicePointer;
+ struct {
+ void** data;
+ void* data__val;
+ size_t* data_sizes;
+ size_t data_sizes__val;
+ hipMemRangeAttribute* attributes;
+ hipMemRangeAttribute attributes__val;
+ size_t num_attributes;
+ const void* dev_ptr;
+ size_t count;
+ } hipMemRangeGetAttributes;
+ struct {
+ const hip_Memcpy2D* pCopy;
+ hip_Memcpy2D pCopy__val;
+ } hipMemcpyParam2D;
+ struct {
+ hipDevice_t dev;
+ } hipDevicePrimaryCtxReset;
+ struct {
+ hipArray_t* levelArray;
+ hipArray_t levelArray__val;
+ hipMipmappedArray_const_t mipmappedArray;
+ unsigned int level;
+ } hipGetMipmappedArrayLevel;
+ struct {
+ hipDeviceptr_t dst;
+ int value;
+ size_t count;
+ hipStream_t stream;
+ } hipMemsetD32Async;
+ struct {
+ int* deviceId;
+ int deviceId__val;
+ } hipGetDevice;
+ struct {
+ int* count;
+ int count__val;
+ } hipGetDeviceCount;
+ struct {
+ hipEvent_t* event;
+ hipEvent_t event__val;
+ hipIpcEventHandle_t handle;
+ } hipIpcOpenEventHandle;
+ } args;
+} hip_api_data_t;
+
+// HIP API callbacks args data filling macros
+// hipDrvMemcpy3DAsync[('const HIP_MEMCPY3D*', 'pCopy'), ('hipStream_t', 'stream')]: records the call arguments into cb_data.args.hipDrvMemcpy3DAsync. Reads pCopy and stream from the expansion scope; expands to a brace block plus a trailing ';'.
+#define INIT_hipDrvMemcpy3DAsync_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipDrvMemcpy3DAsync.pCopy = (const HIP_MEMCPY3D*)pCopy; \
+ cb_data.args.hipDrvMemcpy3DAsync.stream = (hipStream_t)stream; \
+};
+// hipDeviceEnablePeerAccess[('int', 'peerDeviceId'), ('unsigned int', 'flags')]: records the call arguments into cb_data.args.hipDeviceEnablePeerAccess. Reads peerDeviceId and flags from the expansion scope; expands to a brace block plus a trailing ';'.
+#define INIT_hipDeviceEnablePeerAccess_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipDeviceEnablePeerAccess.peerDeviceId = (int)peerDeviceId; \
+ cb_data.args.hipDeviceEnablePeerAccess.flags = (unsigned int)flags; \
+};
+// hipFuncSetSharedMemConfig[('const void*', 'func'), ('hipSharedMemConfig', 'config')]: records the call arguments into cb_data.args.hipFuncSetSharedMemConfig. Reads func and config from the expansion scope; expands to a brace block plus a trailing ';'.
+#define INIT_hipFuncSetSharedMemConfig_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipFuncSetSharedMemConfig.func = (const void*)func; \
+ cb_data.args.hipFuncSetSharedMemConfig.config = (hipSharedMemConfig)config; \
+};
+// hipMemcpyToSymbolAsync[('const void*', 'symbol'), ('const void*', 'src'), ('size_t', 'sizeBytes'), ('size_t', 'offset'), ('hipMemcpyKind', 'kind'), ('hipStream_t', 'stream')]: records the call arguments into cb_data.args.hipMemcpyToSymbolAsync. Reads the same-named identifiers from the expansion scope; expands to a brace block plus a trailing ';'.
+#define INIT_hipMemcpyToSymbolAsync_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipMemcpyToSymbolAsync.symbol = (const void*)symbol; \
+ cb_data.args.hipMemcpyToSymbolAsync.src = (const void*)src; \
+ cb_data.args.hipMemcpyToSymbolAsync.sizeBytes = (size_t)sizeBytes; \
+ cb_data.args.hipMemcpyToSymbolAsync.offset = (size_t)offset; \
+ cb_data.args.hipMemcpyToSymbolAsync.kind = (hipMemcpyKind)kind; \
+ cb_data.args.hipMemcpyToSymbolAsync.stream = (hipStream_t)stream; \
+};
+// hipMallocPitch[('void**', 'ptr'), ('size_t*', 'pitch'), ('size_t', 'width'), ('size_t', 'height')]: records the call arguments into cb_data.args.hipMallocPitch. Reads ptr, pitch, width and height from the expansion scope; expands to a brace block plus a trailing ';'.
+#define INIT_hipMallocPitch_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipMallocPitch.ptr = (void**)ptr; \
+ cb_data.args.hipMallocPitch.pitch = (size_t*)pitch; \
+ cb_data.args.hipMallocPitch.width = (size_t)width; \
+ cb_data.args.hipMallocPitch.height = (size_t)height; \
+};
+// hipMalloc[('void**', 'ptr'), ('size_t', 'size')]: records the call arguments into cb_data.args.hipMalloc. Reads ptr and sizeBytes from the expansion scope (field 'size' is filled from the identifier sizeBytes, not size); expands to a brace block plus a trailing ';'.
+#define INIT_hipMalloc_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipMalloc.ptr = (void**)ptr; \
+ cb_data.args.hipMalloc.size = (size_t)sizeBytes; \
+};
+// hipMemsetD16[('hipDeviceptr_t', 'dest'), ('unsigned short', 'value'), ('size_t', 'count')]: records the call arguments into cb_data.args.hipMemsetD16. Reads dst, value and count from the expansion scope (field 'dest' is filled from the identifier dst); expands to a brace block plus a trailing ';'.
+#define INIT_hipMemsetD16_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipMemsetD16.dest = (hipDeviceptr_t)dst; \
+ cb_data.args.hipMemsetD16.value = (unsigned short)value; \
+ cb_data.args.hipMemsetD16.count = (size_t)count; \
+};
+// hipExtStreamGetCUMask[('hipStream_t', 'stream'), ('unsigned int', 'cuMaskSize'), ('unsigned int*', 'cuMask')]: records the call arguments into cb_data.args.hipExtStreamGetCUMask. Reads stream, cuMaskSize and cuMask from the expansion scope; expands to a brace block plus a trailing ';'.
+#define INIT_hipExtStreamGetCUMask_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipExtStreamGetCUMask.stream = (hipStream_t)stream; \
+ cb_data.args.hipExtStreamGetCUMask.cuMaskSize = (unsigned int)cuMaskSize; \
+ cb_data.args.hipExtStreamGetCUMask.cuMask = (unsigned int*)cuMask; \
+};
+// hipEventRecord[('hipEvent_t', 'event'), ('hipStream_t', 'stream')]: records the call arguments into cb_data.args.hipEventRecord. Reads event and stream from the expansion scope; expands to a brace block plus a trailing ';'.
+#define INIT_hipEventRecord_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipEventRecord.event = (hipEvent_t)event; \
+ cb_data.args.hipEventRecord.stream = (hipStream_t)stream; \
+};
+// hipCtxSynchronize[]: the API takes no arguments, so this macro stores nothing; it expands to an empty brace block plus a trailing ';' and does not touch cb_data.
+#define INIT_hipCtxSynchronize_CB_ARGS_DATA(cb_data) { \
+};
+// hipSetDevice[('int', 'deviceId')]: records the call argument into cb_data.args.hipSetDevice. Reads device from the expansion scope (field 'deviceId' is filled from the identifier device); expands to a brace block plus a trailing ';'.
+#define INIT_hipSetDevice_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipSetDevice.deviceId = (int)device; \
+};
+// hipCtxGetApiVersion[('hipCtx_t', 'ctx'), ('int*', 'apiVersion')]: records the call arguments into cb_data.args.hipCtxGetApiVersion. Reads ctx and apiVersion from the expansion scope; expands to a brace block plus a trailing ';'.
+#define INIT_hipCtxGetApiVersion_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipCtxGetApiVersion.ctx = (hipCtx_t)ctx; \
+ cb_data.args.hipCtxGetApiVersion.apiVersion = (int*)apiVersion; \
+};
+// hipMemcpyFromSymbolAsync[('void*', 'dst'), ('const void*', 'symbol'), ('size_t', 'sizeBytes'), ('size_t', 'offset'), ('hipMemcpyKind', 'kind'), ('hipStream_t', 'stream')]: records the call arguments into cb_data.args.hipMemcpyFromSymbolAsync. Reads the same-named identifiers from the expansion scope; expands to a brace block plus a trailing ';'.
+#define INIT_hipMemcpyFromSymbolAsync_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipMemcpyFromSymbolAsync.dst = (void*)dst; \
+ cb_data.args.hipMemcpyFromSymbolAsync.symbol = (const void*)symbol; \
+ cb_data.args.hipMemcpyFromSymbolAsync.sizeBytes = (size_t)sizeBytes; \
+ cb_data.args.hipMemcpyFromSymbolAsync.offset = (size_t)offset; \
+ cb_data.args.hipMemcpyFromSymbolAsync.kind = (hipMemcpyKind)kind; \
+ cb_data.args.hipMemcpyFromSymbolAsync.stream = (hipStream_t)stream; \
+};
+// hipExtGetLinkTypeAndHopCount[('int', 'device1'), ('int', 'device2'), ('unsigned int*', 'linktype'), ('unsigned int*', 'hopcount')]: records the call arguments into cb_data.args.hipExtGetLinkTypeAndHopCount. Reads device1, device2, linktype and hopcount from the expansion scope; expands to a brace block plus a trailing ';'.
+#define INIT_hipExtGetLinkTypeAndHopCount_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipExtGetLinkTypeAndHopCount.device1 = (int)device1; \
+ cb_data.args.hipExtGetLinkTypeAndHopCount.device2 = (int)device2; \
+ cb_data.args.hipExtGetLinkTypeAndHopCount.linktype = (unsigned int*)linktype; \
+ cb_data.args.hipExtGetLinkTypeAndHopCount.hopcount = (unsigned int*)hopcount; \
+};
+// __hipPopCallConfiguration[('dim3*', 'gridDim'), ('dim3*', 'blockDim'), ('size_t*', 'sharedMem'), ('hipStream_t*', 'stream')]: records the call arguments (all pointers) into cb_data.args.__hipPopCallConfiguration. Reads gridDim, blockDim, sharedMem and stream from the expansion scope; expands to a brace block plus a trailing ';'.
+#define INIT___hipPopCallConfiguration_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.__hipPopCallConfiguration.gridDim = (dim3*)gridDim; \
+ cb_data.args.__hipPopCallConfiguration.blockDim = (dim3*)blockDim; \
+ cb_data.args.__hipPopCallConfiguration.sharedMem = (size_t*)sharedMem; \
+ cb_data.args.__hipPopCallConfiguration.stream = (hipStream_t*)stream; \
+};
+// hipModuleOccupancyMaxActiveBlocksPerMultiprocessor[('int*', 'numBlocks'), ('hipFunction_t', 'f'), ('int', 'blockSize'), ('size_t', 'dynSharedMemPerBlk')]: records the call arguments into cb_data.args.hipModuleOccupancyMaxActiveBlocksPerMultiprocessor. Reads numBlocks, f, blockSize and dynSharedMemPerBlk from the expansion scope; expands to a brace block plus a trailing ';'.
+#define INIT_hipModuleOccupancyMaxActiveBlocksPerMultiprocessor_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipModuleOccupancyMaxActiveBlocksPerMultiprocessor.numBlocks = (int*)numBlocks; \
+ cb_data.args.hipModuleOccupancyMaxActiveBlocksPerMultiprocessor.f = (hipFunction_t)f; \
+ cb_data.args.hipModuleOccupancyMaxActiveBlocksPerMultiprocessor.blockSize = (int)blockSize; \
+ cb_data.args.hipModuleOccupancyMaxActiveBlocksPerMultiprocessor.dynSharedMemPerBlk = (size_t)dynSharedMemPerBlk; \
+};
+// hipMemset3D[('hipPitchedPtr', 'pitchedDevPtr'), ('int', 'value'), ('hipExtent', 'extent')]: records the call arguments into cb_data.args.hipMemset3D. Reads pitchedDevPtr, value and extent from the expansion scope; expands to a brace block plus a trailing ';'.
+#define INIT_hipMemset3D_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipMemset3D.pitchedDevPtr = (hipPitchedPtr)pitchedDevPtr; \
+ cb_data.args.hipMemset3D.value = (int)value; \
+ cb_data.args.hipMemset3D.extent = (hipExtent)extent; \
+};
+// hipStreamCreateWithPriority[('hipStream_t*', 'stream'), ('unsigned int', 'flags'), ('int', 'priority')]: records the call arguments into cb_data.args.hipStreamCreateWithPriority. Reads stream, flags and priority from the expansion scope; expands to a brace block plus a trailing ';'.
+#define INIT_hipStreamCreateWithPriority_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipStreamCreateWithPriority.stream = (hipStream_t*)stream; \
+ cb_data.args.hipStreamCreateWithPriority.flags = (unsigned int)flags; \
+ cb_data.args.hipStreamCreateWithPriority.priority = (int)priority; \
+};
+// hipMemcpy2DToArray[('hipArray*', 'dst'), ('size_t', 'wOffset'), ('size_t', 'hOffset'), ('const void*', 'src'), ('size_t', 'spitch'), ('size_t', 'width'), ('size_t', 'height'), ('hipMemcpyKind', 'kind')]: records the call arguments into cb_data.args.hipMemcpy2DToArray. Reads the same-named identifiers from the expansion scope; expands to a brace block plus a trailing ';'.
+#define INIT_hipMemcpy2DToArray_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipMemcpy2DToArray.dst = (hipArray*)dst; \
+ cb_data.args.hipMemcpy2DToArray.wOffset = (size_t)wOffset; \
+ cb_data.args.hipMemcpy2DToArray.hOffset = (size_t)hOffset; \
+ cb_data.args.hipMemcpy2DToArray.src = (const void*)src; \
+ cb_data.args.hipMemcpy2DToArray.spitch = (size_t)spitch; \
+ cb_data.args.hipMemcpy2DToArray.width = (size_t)width; \
+ cb_data.args.hipMemcpy2DToArray.height = (size_t)height; \
+ cb_data.args.hipMemcpy2DToArray.kind = (hipMemcpyKind)kind; \
+};
+// hipMemsetD8Async[('hipDeviceptr_t', 'dest'), ('unsigned char', 'value'), ('size_t', 'count'), ('hipStream_t', 'stream')]: records the call arguments into cb_data.args.hipMemsetD8Async. Reads dst, value, count and stream from the expansion scope (field 'dest' is filled from the identifier dst); expands to a brace block plus a trailing ';'.
+#define INIT_hipMemsetD8Async_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipMemsetD8Async.dest = (hipDeviceptr_t)dst; \
+ cb_data.args.hipMemsetD8Async.value = (unsigned char)value; \
+ cb_data.args.hipMemsetD8Async.count = (size_t)count; \
+ cb_data.args.hipMemsetD8Async.stream = (hipStream_t)stream; \
+};
+// hipCtxGetCacheConfig[('hipFuncCache_t*', 'cacheConfig')]: records the call argument into cb_data.args.hipCtxGetCacheConfig. Reads cacheConfig from the expansion scope; expands to a brace block plus a trailing ';'.
+#define INIT_hipCtxGetCacheConfig_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipCtxGetCacheConfig.cacheConfig = (hipFuncCache_t*)cacheConfig; \
+};
+// hipModuleGetFunction[('hipFunction_t*', 'function'), ('hipModule_t', 'module'), ('const char*', 'kname')]: records the call arguments into cb_data.args.hipModuleGetFunction. Reads hfunc, hmod and name from the expansion scope (fields 'function', 'module', 'kname' respectively); 'kname' receives a strdup() copy of name, or NULL when name is NULL. NOTE(review): the matching free() for that copy is not visible here - confirm who releases it. Expands to a brace block plus a trailing ';'.
+#define INIT_hipModuleGetFunction_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipModuleGetFunction.function = (hipFunction_t*)hfunc; \
+ cb_data.args.hipModuleGetFunction.module = (hipModule_t)hmod; \
+ cb_data.args.hipModuleGetFunction.kname = (name) ? strdup(name) : NULL; \
+};
+// hipStreamWaitEvent[('hipStream_t', 'stream'), ('hipEvent_t', 'event'), ('unsigned int', 'flags')]: records the call arguments into cb_data.args.hipStreamWaitEvent. Reads stream, event and flags from the expansion scope; expands to a brace block plus a trailing ';'.
+#define INIT_hipStreamWaitEvent_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipStreamWaitEvent.stream = (hipStream_t)stream; \
+ cb_data.args.hipStreamWaitEvent.event = (hipEvent_t)event; \
+ cb_data.args.hipStreamWaitEvent.flags = (unsigned int)flags; \
+};
+// hipDeviceGetStreamPriorityRange[('int*', 'leastPriority'), ('int*', 'greatestPriority')]: records the call arguments into cb_data.args.hipDeviceGetStreamPriorityRange. Reads leastPriority and greatestPriority from the expansion scope; expands to a brace block plus a trailing ';'.
+#define INIT_hipDeviceGetStreamPriorityRange_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipDeviceGetStreamPriorityRange.leastPriority = (int*)leastPriority; \
+ cb_data.args.hipDeviceGetStreamPriorityRange.greatestPriority = (int*)greatestPriority; \
+};
+// hipModuleLoad[('hipModule_t*', 'module'), ('const char*', 'fname')]: records the call arguments into cb_data.args.hipModuleLoad. Reads module and fname from the expansion scope; 'fname' receives a strdup() copy of fname, or NULL when fname is NULL. NOTE(review): the matching free() for that copy is not visible here - confirm who releases it. Expands to a brace block plus a trailing ';'.
+#define INIT_hipModuleLoad_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipModuleLoad.module = (hipModule_t*)module; \
+ cb_data.args.hipModuleLoad.fname = (fname) ? strdup(fname) : NULL; \
+};
+// hipDevicePrimaryCtxSetFlags[('hipDevice_t', 'dev'), ('unsigned int', 'flags')]: records the call arguments into cb_data.args.hipDevicePrimaryCtxSetFlags. Reads dev and flags from the expansion scope; expands to a brace block plus a trailing ';'.
+#define INIT_hipDevicePrimaryCtxSetFlags_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipDevicePrimaryCtxSetFlags.dev = (hipDevice_t)dev; \
+ cb_data.args.hipDevicePrimaryCtxSetFlags.flags = (unsigned int)flags; \
+};
+// hipLaunchCooperativeKernel[('const void*', 'f'), ('dim3', 'gridDim'), ('dim3', 'blockDimX'), ('void**', 'kernelParams'), ('unsigned int', 'sharedMemBytes'), ('hipStream_t', 'stream')]: records the call arguments into cb_data.args.hipLaunchCooperativeKernel. Reads f, gridDim, blockDim, kernelParams, sharedMemBytes and hStream from the expansion scope (field 'blockDimX' is filled from blockDim, field 'stream' from hStream); expands to a brace block plus a trailing ';'.
+#define INIT_hipLaunchCooperativeKernel_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipLaunchCooperativeKernel.f = (const void*)f; \
+ cb_data.args.hipLaunchCooperativeKernel.gridDim = (dim3)gridDim; \
+ cb_data.args.hipLaunchCooperativeKernel.blockDimX = (dim3)blockDim; \
+ cb_data.args.hipLaunchCooperativeKernel.kernelParams = (void**)kernelParams; \
+ cb_data.args.hipLaunchCooperativeKernel.sharedMemBytes = (unsigned int)sharedMemBytes; \
+ cb_data.args.hipLaunchCooperativeKernel.stream = (hipStream_t)hStream; \
+};
+// hipLaunchCooperativeKernelMultiDevice[('hipLaunchParams*', 'launchParamsList'), ('int', 'numDevices'), ('unsigned int', 'flags')]: records the call arguments into cb_data.args.hipLaunchCooperativeKernelMultiDevice. Reads launchParamsList, numDevices and flags from the expansion scope; expands to a brace block plus a trailing ';'.
+#define INIT_hipLaunchCooperativeKernelMultiDevice_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipLaunchCooperativeKernelMultiDevice.launchParamsList = (hipLaunchParams*)launchParamsList; \
+ cb_data.args.hipLaunchCooperativeKernelMultiDevice.numDevices = (int)numDevices; \
+ cb_data.args.hipLaunchCooperativeKernelMultiDevice.flags = (unsigned int)flags; \
+};
+// hipMemcpyAsync[('void*', 'dst'), ('const void*', 'src'), ('size_t', 'sizeBytes'), ('hipMemcpyKind', 'kind'), ('hipStream_t', 'stream')]: records the call arguments into cb_data.args.hipMemcpyAsync. Reads dst, src, sizeBytes, kind and stream from the expansion scope; expands to a brace block plus a trailing ';'.
+#define INIT_hipMemcpyAsync_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipMemcpyAsync.dst = (void*)dst; \
+ cb_data.args.hipMemcpyAsync.src = (const void*)src; \
+ cb_data.args.hipMemcpyAsync.sizeBytes = (size_t)sizeBytes; \
+ cb_data.args.hipMemcpyAsync.kind = (hipMemcpyKind)kind; \
+ cb_data.args.hipMemcpyAsync.stream = (hipStream_t)stream; \
+};
+// hipMalloc3DArray[('hipArray_t*', 'array'), ('const hipChannelFormatDesc*', 'desc'), ('hipExtent', 'extent'), ('unsigned int', 'flags')]: records the call arguments into cb_data.args.hipMalloc3DArray. Reads array, desc, extent and flags from the expansion scope; expands to a brace block plus a trailing ';'.
+#define INIT_hipMalloc3DArray_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipMalloc3DArray.array = (hipArray_t*)array; \
+ cb_data.args.hipMalloc3DArray.desc = (const hipChannelFormatDesc*)desc; \
+ cb_data.args.hipMalloc3DArray.extent = (hipExtent)extent; \
+ cb_data.args.hipMalloc3DArray.flags = (unsigned int)flags; \
+};
+// hipMallocHost[('void**', 'ptr'), ('size_t', 'size')]: records the call arguments into cb_data.args.hipMallocHost. Reads ptr and size from the expansion scope; expands to a brace block plus a trailing ';'.
+#define INIT_hipMallocHost_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipMallocHost.ptr = (void**)ptr; \
+ cb_data.args.hipMallocHost.size = (size_t)size; \
+};
+// hipCtxGetCurrent[('hipCtx_t*', 'ctx')]: records the call argument into cb_data.args.hipCtxGetCurrent. Reads ctx from the expansion scope; expands to a brace block plus a trailing ';'.
+#define INIT_hipCtxGetCurrent_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipCtxGetCurrent.ctx = (hipCtx_t*)ctx; \
+};
+// hipDevicePrimaryCtxGetState[('hipDevice_t', 'dev'), ('unsigned int*', 'flags'), ('int*', 'active')]: records the call arguments into cb_data.args.hipDevicePrimaryCtxGetState. Reads dev, flags and active from the expansion scope; expands to a brace block plus a trailing ';'.
+#define INIT_hipDevicePrimaryCtxGetState_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipDevicePrimaryCtxGetState.dev = (hipDevice_t)dev; \
+ cb_data.args.hipDevicePrimaryCtxGetState.flags = (unsigned int*)flags; \
+ cb_data.args.hipDevicePrimaryCtxGetState.active = (int*)active; \
+};
+// hipEventQuery[('hipEvent_t', 'event')]: records the call argument into cb_data.args.hipEventQuery. Reads event from the expansion scope; expands to a brace block plus a trailing ';'.
+#define INIT_hipEventQuery_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipEventQuery.event = (hipEvent_t)event; \
+};
+// hipEventCreate[('hipEvent_t*', 'event')]: records the call argument into cb_data.args.hipEventCreate. Reads event (a pointer to the event handle) from the expansion scope; expands to a brace block plus a trailing ';'.
+#define INIT_hipEventCreate_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipEventCreate.event = (hipEvent_t*)event; \
+};
+// hipMemGetAddressRange[('hipDeviceptr_t*', 'pbase'), ('size_t*', 'psize'), ('hipDeviceptr_t', 'dptr')]: records the call arguments into cb_data.args.hipMemGetAddressRange. Reads pbase, psize and dptr from the expansion scope; expands to a brace block plus a trailing ';'.
+#define INIT_hipMemGetAddressRange_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipMemGetAddressRange.pbase = (hipDeviceptr_t*)pbase; \
+ cb_data.args.hipMemGetAddressRange.psize = (size_t*)psize; \
+ cb_data.args.hipMemGetAddressRange.dptr = (hipDeviceptr_t)dptr; \
+};
+// hipMemcpyFromSymbol[('void*', 'dst'), ('const void*', 'symbol'), ('size_t', 'sizeBytes'), ('size_t', 'offset'), ('hipMemcpyKind', 'kind')]: records the call arguments into cb_data.args.hipMemcpyFromSymbol. Reads dst, symbol, sizeBytes, offset and kind from the expansion scope; expands to a brace block plus a trailing ';'.
+#define INIT_hipMemcpyFromSymbol_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipMemcpyFromSymbol.dst = (void*)dst; \
+ cb_data.args.hipMemcpyFromSymbol.symbol = (const void*)symbol; \
+ cb_data.args.hipMemcpyFromSymbol.sizeBytes = (size_t)sizeBytes; \
+ cb_data.args.hipMemcpyFromSymbol.offset = (size_t)offset; \
+ cb_data.args.hipMemcpyFromSymbol.kind = (hipMemcpyKind)kind; \
+};
+// hipArrayCreate[('hipArray**', 'pHandle'), ('const HIP_ARRAY_DESCRIPTOR*', 'pAllocateArray')]: records the call arguments into cb_data.args.hipArrayCreate. Reads array and pAllocateArray from the expansion scope (field 'pHandle' is filled from the identifier array); expands to a brace block plus a trailing ';'.
+#define INIT_hipArrayCreate_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipArrayCreate.pHandle = (hipArray**)array; \
+ cb_data.args.hipArrayCreate.pAllocateArray = (const HIP_ARRAY_DESCRIPTOR*)pAllocateArray; \
+};
+// hipStreamAttachMemAsync[('hipStream_t', 'stream'), ('hipDeviceptr_t*', 'dev_ptr'), ('size_t', 'length'), ('unsigned int', 'flags')]: records the call arguments into cb_data.args.hipStreamAttachMemAsync. Reads stream, dev_ptr, length and flags from the expansion scope; expands to a brace block plus a trailing ';'.
+#define INIT_hipStreamAttachMemAsync_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipStreamAttachMemAsync.stream = (hipStream_t)stream; \
+ cb_data.args.hipStreamAttachMemAsync.dev_ptr = (hipDeviceptr_t*)dev_ptr; \
+ cb_data.args.hipStreamAttachMemAsync.length = (size_t)length; \
+ cb_data.args.hipStreamAttachMemAsync.flags = (unsigned int)flags; \
+};
+// hipStreamGetFlags[('hipStream_t', 'stream'), ('unsigned int*', 'flags')]: records the call arguments into cb_data.args.hipStreamGetFlags. Reads stream and flags from the expansion scope; expands to a brace block plus a trailing ';'.
+#define INIT_hipStreamGetFlags_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipStreamGetFlags.stream = (hipStream_t)stream; \
+ cb_data.args.hipStreamGetFlags.flags = (unsigned int*)flags; \
+};
+// hipMallocArray[('hipArray**', 'array'), ('const hipChannelFormatDesc*', 'desc'), ('size_t', 'width'), ('size_t', 'height'), ('unsigned int', 'flags')]: records the call arguments into cb_data.args.hipMallocArray. Reads array, desc, width, height and flags from the expansion scope; expands to a brace block plus a trailing ';'.
+#define INIT_hipMallocArray_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipMallocArray.array = (hipArray**)array; \
+ cb_data.args.hipMallocArray.desc = (const hipChannelFormatDesc*)desc; \
+ cb_data.args.hipMallocArray.width = (size_t)width; \
+ cb_data.args.hipMallocArray.height = (size_t)height; \
+ cb_data.args.hipMallocArray.flags = (unsigned int)flags; \
+};
+// hipCtxGetSharedMemConfig[('hipSharedMemConfig*', 'pConfig')]: records the call argument into cb_data.args.hipCtxGetSharedMemConfig. Reads pConfig from the expansion scope; expands to a brace block plus a trailing ';'.
+#define INIT_hipCtxGetSharedMemConfig_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipCtxGetSharedMemConfig.pConfig = (hipSharedMemConfig*)pConfig; \
+};
+// hipDeviceDisablePeerAccess[('int', 'peerDeviceId')]: records the call argument into cb_data.args.hipDeviceDisablePeerAccess. Reads peerDeviceId from the expansion scope; expands to a brace block plus a trailing ';'.
+#define INIT_hipDeviceDisablePeerAccess_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipDeviceDisablePeerAccess.peerDeviceId = (int)peerDeviceId; \
+};
+// hipModuleOccupancyMaxPotentialBlockSize[('int*', 'gridSize'), ('int*', 'blockSize'), ('hipFunction_t', 'f'), ('size_t', 'dynSharedMemPerBlk'), ('int', 'blockSizeLimit')]: records the call arguments into cb_data.args.hipModuleOccupancyMaxPotentialBlockSize. Reads gridSize, blockSize, f, dynSharedMemPerBlk and blockSizeLimit from the expansion scope; expands to a brace block plus a trailing ';'.
+#define INIT_hipModuleOccupancyMaxPotentialBlockSize_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipModuleOccupancyMaxPotentialBlockSize.gridSize = (int*)gridSize; \
+ cb_data.args.hipModuleOccupancyMaxPotentialBlockSize.blockSize = (int*)blockSize; \
+ cb_data.args.hipModuleOccupancyMaxPotentialBlockSize.f = (hipFunction_t)f; \
+ cb_data.args.hipModuleOccupancyMaxPotentialBlockSize.dynSharedMemPerBlk = (size_t)dynSharedMemPerBlk; \
+ cb_data.args.hipModuleOccupancyMaxPotentialBlockSize.blockSizeLimit = (int)blockSizeLimit; \
+};
+// hipMemPtrGetInfo[('void*', 'ptr'), ('size_t*', 'size')]: records the call arguments into cb_data.args.hipMemPtrGetInfo. Reads ptr and size (a size_t pointer) from the expansion scope; expands to a brace block plus a trailing ';'.
+#define INIT_hipMemPtrGetInfo_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipMemPtrGetInfo.ptr = (void*)ptr; \
+ cb_data.args.hipMemPtrGetInfo.size = (size_t*)size; \
+};
+// hipFuncGetAttribute[('int*', 'value'), ('hipFunction_attribute', 'attrib'), ('hipFunction_t', 'hfunc')]: records the call arguments into cb_data.args.hipFuncGetAttribute. Reads value, attrib and hfunc from the expansion scope; expands to a brace block plus a trailing ';'.
+#define INIT_hipFuncGetAttribute_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipFuncGetAttribute.value = (int*)value; \
+ cb_data.args.hipFuncGetAttribute.attrib = (hipFunction_attribute)attrib; \
+ cb_data.args.hipFuncGetAttribute.hfunc = (hipFunction_t)hfunc; \
+};
+// hipCtxGetFlags[('unsigned int*', 'flags')]: records the call argument into cb_data.args.hipCtxGetFlags. Reads flags (an unsigned int pointer) from the expansion scope; expands to a brace block plus a trailing ';'.
+#define INIT_hipCtxGetFlags_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipCtxGetFlags.flags = (unsigned int*)flags; \
+};
+// hipStreamDestroy[('hipStream_t', 'stream')]: records the call argument into cb_data.args.hipStreamDestroy. Reads stream from the expansion scope; expands to a brace block plus a trailing ';'.
+#define INIT_hipStreamDestroy_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipStreamDestroy.stream = (hipStream_t)stream; \
+};
+// __hipPushCallConfiguration[('dim3', 'gridDim'), ('dim3', 'blockDim'), ('size_t', 'sharedMem'), ('hipStream_t', 'stream')]: records the call arguments (by value) into cb_data.args.__hipPushCallConfiguration. Reads gridDim, blockDim, sharedMem and stream from the expansion scope; expands to a brace block plus a trailing ';'.
+#define INIT___hipPushCallConfiguration_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.__hipPushCallConfiguration.gridDim = (dim3)gridDim; \
+ cb_data.args.__hipPushCallConfiguration.blockDim = (dim3)blockDim; \
+ cb_data.args.__hipPushCallConfiguration.sharedMem = (size_t)sharedMem; \
+ cb_data.args.__hipPushCallConfiguration.stream = (hipStream_t)stream; \
+};
+// hipMemset3DAsync[('hipPitchedPtr', 'pitchedDevPtr'), ('int', 'value'), ('hipExtent', 'extent'), ('hipStream_t', 'stream')]: records the call arguments into cb_data.args.hipMemset3DAsync. Reads pitchedDevPtr, value, extent and stream from the expansion scope; expands to a brace block plus a trailing ';'.
+#define INIT_hipMemset3DAsync_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipMemset3DAsync.pitchedDevPtr = (hipPitchedPtr)pitchedDevPtr; \
+ cb_data.args.hipMemset3DAsync.value = (int)value; \
+ cb_data.args.hipMemset3DAsync.extent = (hipExtent)extent; \
+ cb_data.args.hipMemset3DAsync.stream = (hipStream_t)stream; \
+};
+// hipDeviceGetPCIBusId[('char*', 'pciBusId'), ('int', 'len'), ('int', 'device')]: records the call arguments into cb_data.args.hipDeviceGetPCIBusId. Reads pciBusId, len and device from the expansion scope; the pciBusId pointer is stored as-is (no string copy is made here). Expands to a brace block plus a trailing ';'.
+#define INIT_hipDeviceGetPCIBusId_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipDeviceGetPCIBusId.pciBusId = (char*)pciBusId; \
+ cb_data.args.hipDeviceGetPCIBusId.len = (int)len; \
+ cb_data.args.hipDeviceGetPCIBusId.device = (int)device; \
+};
+// hipInit[('unsigned int', 'flags')]: records the call argument into cb_data.args.hipInit. Reads flags from the expansion scope; expands to a brace block plus a trailing ';'.
+#define INIT_hipInit_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipInit.flags = (unsigned int)flags; \
+};
+// hipMemcpyAtoH[('void*', 'dst'), ('hipArray*', 'srcArray'), ('size_t', 'srcOffset'), ('size_t', 'count')]: records the call arguments into cb_data.args.hipMemcpyAtoH. Reads dstHost, srcArray, srcOffset and ByteCount from the expansion scope (field 'dst' is filled from dstHost, field 'count' from ByteCount); expands to a brace block plus a trailing ';'.
+#define INIT_hipMemcpyAtoH_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipMemcpyAtoH.dst = (void*)dstHost; \
+ cb_data.args.hipMemcpyAtoH.srcArray = (hipArray*)srcArray; \
+ cb_data.args.hipMemcpyAtoH.srcOffset = (size_t)srcOffset; \
+ cb_data.args.hipMemcpyAtoH.count = (size_t)ByteCount; \
+};
+// hipStreamGetPriority[('hipStream_t', 'stream'), ('int*', 'priority')]: records the call arguments into cb_data.args.hipStreamGetPriority. Reads stream and priority from the expansion scope; expands to a brace block plus a trailing ';'.
+#define INIT_hipStreamGetPriority_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipStreamGetPriority.stream = (hipStream_t)stream; \
+ cb_data.args.hipStreamGetPriority.priority = (int*)priority; \
+};
+// hipMemset2D[('void*', 'dst'), ('size_t', 'pitch'), ('int', 'value'), ('size_t', 'width'), ('size_t', 'height')]: records the call arguments into cb_data.args.hipMemset2D. Reads dst, pitch, value, width and height from the expansion scope; expands to a brace block plus a trailing ';'.
+#define INIT_hipMemset2D_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipMemset2D.dst = (void*)dst; \
+ cb_data.args.hipMemset2D.pitch = (size_t)pitch; \
+ cb_data.args.hipMemset2D.value = (int)value; \
+ cb_data.args.hipMemset2D.width = (size_t)width; \
+ cb_data.args.hipMemset2D.height = (size_t)height; \
+};
+// hipMemset2DAsync[('void*', 'dst'), ('size_t', 'pitch'), ('int', 'value'), ('size_t', 'width'), ('size_t', 'height'), ('hipStream_t', 'stream')]: records the call arguments into cb_data.args.hipMemset2DAsync. Reads dst, pitch, value, width, height and stream from the expansion scope; expands to a brace block plus a trailing ';'.
+#define INIT_hipMemset2DAsync_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipMemset2DAsync.dst = (void*)dst; \
+ cb_data.args.hipMemset2DAsync.pitch = (size_t)pitch; \
+ cb_data.args.hipMemset2DAsync.value = (int)value; \
+ cb_data.args.hipMemset2DAsync.width = (size_t)width; \
+ cb_data.args.hipMemset2DAsync.height = (size_t)height; \
+ cb_data.args.hipMemset2DAsync.stream = (hipStream_t)stream; \
+};
+// hipDeviceCanAccessPeer(int* canAccessPeer, int deviceId, int peerDeviceId): fills cb_data.args.hipDeviceCanAccessPeer from identifiers in scope at the expansion site (same names except canAccessPeer <- canAccess).
+#define INIT_hipDeviceCanAccessPeer_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipDeviceCanAccessPeer.canAccessPeer = (int*)canAccess; \
+ cb_data.args.hipDeviceCanAccessPeer.deviceId = (int)deviceId; \
+ cb_data.args.hipDeviceCanAccessPeer.peerDeviceId = (int)peerDeviceId; \
+};
+// hipLaunchByPtr(const void* hostFunction): copies the same-named identifier in scope at the expansion site into cb_data.args.hipLaunchByPtr.
+#define INIT_hipLaunchByPtr_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipLaunchByPtr.hostFunction = (const void*)hostFunction; \
+};
+// hipMemPrefetchAsync(const void* dev_ptr, size_t count, int device, hipStream_t stream): copies the same-named identifiers in scope at the expansion site into cb_data.args.hipMemPrefetchAsync.
+#define INIT_hipMemPrefetchAsync_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipMemPrefetchAsync.dev_ptr = (const void*)dev_ptr; \
+ cb_data.args.hipMemPrefetchAsync.count = (size_t)count; \
+ cb_data.args.hipMemPrefetchAsync.device = (int)device; \
+ cb_data.args.hipMemPrefetchAsync.stream = (hipStream_t)stream; \
+};
+// hipCtxDestroy(hipCtx_t ctx): copies the same-named identifier in scope at the expansion site into cb_data.args.hipCtxDestroy.
+#define INIT_hipCtxDestroy_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipCtxDestroy.ctx = (hipCtx_t)ctx; \
+};
+// hipMemsetD16Async(hipDeviceptr_t dest, unsigned short value, size_t count, hipStream_t stream): fills cb_data.args.hipMemsetD16Async from identifiers in scope at the expansion site (same names except dest <- dst).
+#define INIT_hipMemsetD16Async_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipMemsetD16Async.dest = (hipDeviceptr_t)dst; \
+ cb_data.args.hipMemsetD16Async.value = (unsigned short)value; \
+ cb_data.args.hipMemsetD16Async.count = (size_t)count; \
+ cb_data.args.hipMemsetD16Async.stream = (hipStream_t)stream; \
+};
+// hipModuleUnload(hipModule_t module): fills cb_data.args.hipModuleUnload from the expansion site; the module field is read from the identifier hmod.
+#define INIT_hipModuleUnload_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipModuleUnload.module = (hipModule_t)hmod; \
+};
+// hipHostUnregister(void* hostPtr): copies the same-named identifier in scope at the expansion site into cb_data.args.hipHostUnregister.
+#define INIT_hipHostUnregister_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipHostUnregister.hostPtr = (void*)hostPtr; \
+};
+// hipProfilerStop(): no arguments are listed for this API, so the macro expands to an empty block and leaves cb_data untouched.
+#define INIT_hipProfilerStop_CB_ARGS_DATA(cb_data) { \
+};
+// hipExtStreamCreateWithCUMask(hipStream_t* stream, unsigned int cuMaskSize, const unsigned int* cuMask): copies the same-named identifiers in scope at the expansion site into cb_data.args.hipExtStreamCreateWithCUMask.
+#define INIT_hipExtStreamCreateWithCUMask_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipExtStreamCreateWithCUMask.stream = (hipStream_t*)stream; \
+ cb_data.args.hipExtStreamCreateWithCUMask.cuMaskSize = (unsigned int)cuMaskSize; \
+ cb_data.args.hipExtStreamCreateWithCUMask.cuMask = (const unsigned int*)cuMask; \
+};
+// hipStreamSynchronize(hipStream_t stream): copies the same-named identifier in scope at the expansion site into cb_data.args.hipStreamSynchronize.
+#define INIT_hipStreamSynchronize_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipStreamSynchronize.stream = (hipStream_t)stream; \
+};
+// hipFreeHost(void* ptr): copies the same-named identifier in scope at the expansion site into cb_data.args.hipFreeHost.
+#define INIT_hipFreeHost_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipFreeHost.ptr = (void*)ptr; \
+};
+// hipDeviceSetCacheConfig(hipFuncCache_t cacheConfig): copies the same-named identifier in scope at the expansion site into cb_data.args.hipDeviceSetCacheConfig.
+#define INIT_hipDeviceSetCacheConfig_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipDeviceSetCacheConfig.cacheConfig = (hipFuncCache_t)cacheConfig; \
+};
+// hipGetErrorName(): no arguments are listed for this API here, so the macro expands to an empty block and leaves cb_data untouched.
+#define INIT_hipGetErrorName_CB_ARGS_DATA(cb_data) { \
+};
+// hipMemcpyHtoD(hipDeviceptr_t dst, void* src, size_t sizeBytes): fills cb_data.args.hipMemcpyHtoD from identifiers in scope at the expansion site (dst <- dstDevice, src <- srcHost, sizeBytes <- ByteCount).
+#define INIT_hipMemcpyHtoD_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipMemcpyHtoD.dst = (hipDeviceptr_t)dstDevice; \
+ cb_data.args.hipMemcpyHtoD.src = (void*)srcHost; \
+ cb_data.args.hipMemcpyHtoD.sizeBytes = (size_t)ByteCount; \
+};
+// hipModuleGetGlobal(hipDeviceptr_t* dptr, size_t* bytes, hipModule_t hmod, const char* name): copies the same-named identifiers from the expansion site into cb_data.args.hipModuleGetGlobal; a non-NULL name is duplicated with strdup() (NULL stays NULL) and is not freed by this macro.
+#define INIT_hipModuleGetGlobal_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipModuleGetGlobal.dptr = (hipDeviceptr_t*)dptr; \
+ cb_data.args.hipModuleGetGlobal.bytes = (size_t*)bytes; \
+ cb_data.args.hipModuleGetGlobal.hmod = (hipModule_t)hmod; \
+ cb_data.args.hipModuleGetGlobal.name = (name) ? strdup(name) : NULL; \
+};
+// hipMemcpyHtoA(hipArray* dstArray, size_t dstOffset, const void* srcHost, size_t count): fills cb_data.args.hipMemcpyHtoA from identifiers in scope at the expansion site (same names except count <- ByteCount).
+#define INIT_hipMemcpyHtoA_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipMemcpyHtoA.dstArray = (hipArray*)dstArray; \
+ cb_data.args.hipMemcpyHtoA.dstOffset = (size_t)dstOffset; \
+ cb_data.args.hipMemcpyHtoA.srcHost = (const void*)srcHost; \
+ cb_data.args.hipMemcpyHtoA.count = (size_t)ByteCount; \
+};
+// hipCtxCreate(hipCtx_t* ctx, unsigned int flags, hipDevice_t device): copies the same-named identifiers in scope at the expansion site into cb_data.args.hipCtxCreate.
+#define INIT_hipCtxCreate_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipCtxCreate.ctx = (hipCtx_t*)ctx; \
+ cb_data.args.hipCtxCreate.flags = (unsigned int)flags; \
+ cb_data.args.hipCtxCreate.device = (hipDevice_t)device; \
+};
+// hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind): copies the same-named identifiers in scope at the expansion site into cb_data.args.hipMemcpy2D.
+#define INIT_hipMemcpy2D_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipMemcpy2D.dst = (void*)dst; \
+ cb_data.args.hipMemcpy2D.dpitch = (size_t)dpitch; \
+ cb_data.args.hipMemcpy2D.src = (const void*)src; \
+ cb_data.args.hipMemcpy2D.spitch = (size_t)spitch; \
+ cb_data.args.hipMemcpy2D.width = (size_t)width; \
+ cb_data.args.hipMemcpy2D.height = (size_t)height; \
+ cb_data.args.hipMemcpy2D.kind = (hipMemcpyKind)kind; \
+};
+// hipIpcCloseMemHandle(void* devPtr): fills cb_data.args.hipIpcCloseMemHandle from the expansion site; the devPtr field is read from the identifier dev_ptr.
+#define INIT_hipIpcCloseMemHandle_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipIpcCloseMemHandle.devPtr = (void*)dev_ptr; \
+};
+// hipChooseDevice(int* device, const hipDeviceProp_t* prop): fills cb_data.args.hipChooseDevice from identifiers in scope at the expansion site (same names except prop <- properties).
+#define INIT_hipChooseDevice_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipChooseDevice.device = (int*)device; \
+ cb_data.args.hipChooseDevice.prop = (const hipDeviceProp_t*)properties; \
+};
+// hipDeviceSetSharedMemConfig(hipSharedMemConfig config): copies the same-named identifier in scope at the expansion site into cb_data.args.hipDeviceSetSharedMemConfig.
+#define INIT_hipDeviceSetSharedMemConfig_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipDeviceSetSharedMemConfig.config = (hipSharedMemConfig)config; \
+};
+// hipMallocMipmappedArray(hipMipmappedArray_t* mipmappedArray, const hipChannelFormatDesc* desc, hipExtent extent, unsigned int numLevels, unsigned int flags): copies the same-named identifiers in scope at the expansion site into cb_data.args.hipMallocMipmappedArray.
+#define INIT_hipMallocMipmappedArray_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipMallocMipmappedArray.mipmappedArray = (hipMipmappedArray_t*)mipmappedArray; \
+ cb_data.args.hipMallocMipmappedArray.desc = (const hipChannelFormatDesc*)desc; \
+ cb_data.args.hipMallocMipmappedArray.extent = (hipExtent)extent; \
+ cb_data.args.hipMallocMipmappedArray.numLevels = (unsigned int)numLevels; \
+ cb_data.args.hipMallocMipmappedArray.flags = (unsigned int)flags; \
+};
+// hipSetupArgument(const void* arg, size_t size, size_t offset): copies the same-named identifiers in scope at the expansion site into cb_data.args.hipSetupArgument.
+#define INIT_hipSetupArgument_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipSetupArgument.arg = (const void*)arg; \
+ cb_data.args.hipSetupArgument.size = (size_t)size; \
+ cb_data.args.hipSetupArgument.offset = (size_t)offset; \
+};
+// hipIpcGetEventHandle(hipIpcEventHandle_t* handle, hipEvent_t event): copies the same-named identifiers in scope at the expansion site into cb_data.args.hipIpcGetEventHandle.
+#define INIT_hipIpcGetEventHandle_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipIpcGetEventHandle.handle = (hipIpcEventHandle_t*)handle; \
+ cb_data.args.hipIpcGetEventHandle.event = (hipEvent_t)event; \
+};
+// hipFreeArray(hipArray* array): copies the same-named identifier in scope at the expansion site into cb_data.args.hipFreeArray.
+#define INIT_hipFreeArray_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipFreeArray.array = (hipArray*)array; \
+};
+// hipCtxSetCacheConfig(hipFuncCache_t cacheConfig): copies the same-named identifier in scope at the expansion site into cb_data.args.hipCtxSetCacheConfig.
+#define INIT_hipCtxSetCacheConfig_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipCtxSetCacheConfig.cacheConfig = (hipFuncCache_t)cacheConfig; \
+};
+// hipFuncSetCacheConfig(const void* func, hipFuncCache_t config): fills cb_data.args.hipFuncSetCacheConfig from identifiers in scope at the expansion site (same names except config <- cacheConfig).
+#define INIT_hipFuncSetCacheConfig_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipFuncSetCacheConfig.func = (const void*)func; \
+ cb_data.args.hipFuncSetCacheConfig.config = (hipFuncCache_t)cacheConfig; \
+};
+// hipLaunchKernel(const void* function_address, dim3 numBlocks, dim3 dimBlocks, void** args, size_t sharedMemBytes, hipStream_t stream): fills cb_data.args.hipLaunchKernel from identifiers in scope at the expansion site (same names except function_address <- hostFunction, numBlocks <- gridDim, dimBlocks <- blockDim).
+#define INIT_hipLaunchKernel_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipLaunchKernel.function_address = (const void*)hostFunction; \
+ cb_data.args.hipLaunchKernel.numBlocks = (dim3)gridDim; \
+ cb_data.args.hipLaunchKernel.dimBlocks = (dim3)blockDim; \
+ cb_data.args.hipLaunchKernel.args = (void**)args; \
+ cb_data.args.hipLaunchKernel.sharedMemBytes = (size_t)sharedMemBytes; \
+ cb_data.args.hipLaunchKernel.stream = (hipStream_t)stream; \
+};
+// hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int* numBlocks, hipFunction_t f, int blockSize, size_t dynSharedMemPerBlk, unsigned int flags): copies the same-named identifiers in scope at the expansion site into the matching cb_data.args entry.
+#define INIT_hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.numBlocks = (int*)numBlocks; \
+ cb_data.args.hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.f = (hipFunction_t)f; \
+ cb_data.args.hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.blockSize = (int)blockSize; \
+ cb_data.args.hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.dynSharedMemPerBlk = (size_t)dynSharedMemPerBlk; \
+ cb_data.args.hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.flags = (unsigned int)flags; \
+};
+// hipModuleGetTexRef(textureReference** texRef, hipModule_t hmod, const char* name): copies the same-named identifiers from the expansion site into cb_data.args.hipModuleGetTexRef; a non-NULL name is duplicated with strdup() (NULL stays NULL) and is not freed by this macro.
+#define INIT_hipModuleGetTexRef_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipModuleGetTexRef.texRef = (textureReference**)texRef; \
+ cb_data.args.hipModuleGetTexRef.hmod = (hipModule_t)hmod; \
+ cb_data.args.hipModuleGetTexRef.name = (name) ? strdup(name) : NULL; \
+};
+// hipFuncSetAttribute(const void* func, hipFuncAttribute attr, int value): copies the same-named identifiers in scope at the expansion site into cb_data.args.hipFuncSetAttribute.
+#define INIT_hipFuncSetAttribute_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipFuncSetAttribute.func = (const void*)func; \
+ cb_data.args.hipFuncSetAttribute.attr = (hipFuncAttribute)attr; \
+ cb_data.args.hipFuncSetAttribute.value = (int)value; \
+};
+// hipEventElapsedTime(float* ms, hipEvent_t start, hipEvent_t stop): copies the same-named identifiers in scope at the expansion site into cb_data.args.hipEventElapsedTime.
+#define INIT_hipEventElapsedTime_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipEventElapsedTime.ms = (float*)ms; \
+ cb_data.args.hipEventElapsedTime.start = (hipEvent_t)start; \
+ cb_data.args.hipEventElapsedTime.stop = (hipEvent_t)stop; \
+};
+// hipConfigureCall(dim3 gridDim, dim3 blockDim, size_t sharedMem, hipStream_t stream): copies the same-named identifiers in scope at the expansion site into cb_data.args.hipConfigureCall.
+#define INIT_hipConfigureCall_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipConfigureCall.gridDim = (dim3)gridDim; \
+ cb_data.args.hipConfigureCall.blockDim = (dim3)blockDim; \
+ cb_data.args.hipConfigureCall.sharedMem = (size_t)sharedMem; \
+ cb_data.args.hipConfigureCall.stream = (hipStream_t)stream; \
+};
+// hipMemAdvise(const void* dev_ptr, size_t count, hipMemoryAdvise advice, int device): copies the same-named identifiers in scope at the expansion site into cb_data.args.hipMemAdvise.
+#define INIT_hipMemAdvise_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipMemAdvise.dev_ptr = (const void*)dev_ptr; \
+ cb_data.args.hipMemAdvise.count = (size_t)count; \
+ cb_data.args.hipMemAdvise.advice = (hipMemoryAdvise)advice; \
+ cb_data.args.hipMemAdvise.device = (int)device; \
+};
+// hipMemcpy3DAsync(const hipMemcpy3DParms* p, hipStream_t stream): copies the same-named identifiers in scope at the expansion site into cb_data.args.hipMemcpy3DAsync (the pointer p is stored, not the struct it points to).
+#define INIT_hipMemcpy3DAsync_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipMemcpy3DAsync.p = (const hipMemcpy3DParms*)p; \
+ cb_data.args.hipMemcpy3DAsync.stream = (hipStream_t)stream; \
+};
+// hipEventDestroy(hipEvent_t event): copies the same-named identifier in scope at the expansion site into cb_data.args.hipEventDestroy.
+#define INIT_hipEventDestroy_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipEventDestroy.event = (hipEvent_t)event; \
+};
+// hipCtxPopCurrent(hipCtx_t* ctx): copies the same-named identifier in scope at the expansion site into cb_data.args.hipCtxPopCurrent.
+#define INIT_hipCtxPopCurrent_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipCtxPopCurrent.ctx = (hipCtx_t*)ctx; \
+};
+// hipGetSymbolAddress(void** devPtr, const void* symbol): copies the same-named identifiers in scope at the expansion site into cb_data.args.hipGetSymbolAddress.
+#define INIT_hipGetSymbolAddress_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipGetSymbolAddress.devPtr = (void**)devPtr; \
+ cb_data.args.hipGetSymbolAddress.symbol = (const void*)symbol; \
+};
+// hipHostGetFlags(unsigned int* flagsPtr, void* hostPtr): copies the same-named identifiers in scope at the expansion site into cb_data.args.hipHostGetFlags.
+#define INIT_hipHostGetFlags_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipHostGetFlags.flagsPtr = (unsigned int*)flagsPtr; \
+ cb_data.args.hipHostGetFlags.hostPtr = (void*)hostPtr; \
+};
+// hipHostMalloc(void** ptr, size_t size, unsigned int flags): fills cb_data.args.hipHostMalloc from identifiers in scope at the expansion site (same names except size <- sizeBytes).
+#define INIT_hipHostMalloc_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipHostMalloc.ptr = (void**)ptr; \
+ cb_data.args.hipHostMalloc.size = (size_t)sizeBytes; \
+ cb_data.args.hipHostMalloc.flags = (unsigned int)flags; \
+};
+// hipCtxSetSharedMemConfig(hipSharedMemConfig config): copies the same-named identifier in scope at the expansion site into cb_data.args.hipCtxSetSharedMemConfig.
+#define INIT_hipCtxSetSharedMemConfig_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipCtxSetSharedMemConfig.config = (hipSharedMemConfig)config; \
+};
+// hipFreeMipmappedArray(hipMipmappedArray_t mipmappedArray): copies the same-named identifier in scope at the expansion site into cb_data.args.hipFreeMipmappedArray.
+#define INIT_hipFreeMipmappedArray_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipFreeMipmappedArray.mipmappedArray = (hipMipmappedArray_t)mipmappedArray; \
+};
+// hipMemGetInfo(size_t* free, size_t* total): copies the same-named identifiers in scope at the expansion site into cb_data.args.hipMemGetInfo (here `free` is the argument name, not a call to free()).
+#define INIT_hipMemGetInfo_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipMemGetInfo.free = (size_t*)free; \
+ cb_data.args.hipMemGetInfo.total = (size_t*)total; \
+};
+// hipDeviceReset(): no arguments are listed for this API, so the macro expands to an empty block and leaves cb_data untouched.
+#define INIT_hipDeviceReset_CB_ARGS_DATA(cb_data) { \
+};
+// hipMemset(void* dst, int value, size_t sizeBytes): copies the same-named identifiers in scope at the expansion site into cb_data.args.hipMemset.
+#define INIT_hipMemset_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipMemset.dst = (void*)dst; \
+ cb_data.args.hipMemset.value = (int)value; \
+ cb_data.args.hipMemset.sizeBytes = (size_t)sizeBytes; \
+};
+// hipMemsetD8(hipDeviceptr_t dest, unsigned char value, size_t count): fills cb_data.args.hipMemsetD8 from identifiers in scope at the expansion site (same names except dest <- dst).
+#define INIT_hipMemsetD8_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipMemsetD8.dest = (hipDeviceptr_t)dst; \
+ cb_data.args.hipMemsetD8.value = (unsigned char)value; \
+ cb_data.args.hipMemsetD8.count = (size_t)count; \
+};
+// hipMemcpyParam2DAsync(const hip_Memcpy2D* pCopy, hipStream_t stream): copies the same-named identifiers in scope at the expansion site into cb_data.args.hipMemcpyParam2DAsync (the pointer pCopy is stored, not the struct it points to).
+#define INIT_hipMemcpyParam2DAsync_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipMemcpyParam2DAsync.pCopy = (const hip_Memcpy2D*)pCopy; \
+ cb_data.args.hipMemcpyParam2DAsync.stream = (hipStream_t)stream; \
+};
+// hipHostRegister(void* hostPtr, size_t sizeBytes, unsigned int flags): copies the same-named identifiers in scope at the expansion site into cb_data.args.hipHostRegister.
+#define INIT_hipHostRegister_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipHostRegister.hostPtr = (void*)hostPtr; \
+ cb_data.args.hipHostRegister.sizeBytes = (size_t)sizeBytes; \
+ cb_data.args.hipHostRegister.flags = (unsigned int)flags; \
+};
+// hipDriverGetVersion(int* driverVersion): copies the same-named identifier in scope at the expansion site into cb_data.args.hipDriverGetVersion.
+#define INIT_hipDriverGetVersion_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipDriverGetVersion.driverVersion = (int*)driverVersion; \
+};
+// hipArray3DCreate(hipArray** array, const HIP_ARRAY3D_DESCRIPTOR* pAllocateArray): copies the same-named identifiers in scope at the expansion site into cb_data.args.hipArray3DCreate.
+#define INIT_hipArray3DCreate_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipArray3DCreate.array = (hipArray**)array; \
+ cb_data.args.hipArray3DCreate.pAllocateArray = (const HIP_ARRAY3D_DESCRIPTOR*)pAllocateArray; \
+};
+// hipIpcOpenMemHandle(void** devPtr, hipIpcMemHandle_t handle, unsigned int flags): fills cb_data.args.hipIpcOpenMemHandle from identifiers in scope at the expansion site (same names except devPtr <- dev_ptr).
+#define INIT_hipIpcOpenMemHandle_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipIpcOpenMemHandle.devPtr = (void**)dev_ptr; \
+ cb_data.args.hipIpcOpenMemHandle.handle = (hipIpcMemHandle_t)handle; \
+ cb_data.args.hipIpcOpenMemHandle.flags = (unsigned int)flags; \
+};
+// hipGetLastError(): no arguments are listed for this API, so the macro expands to an empty block and leaves cb_data untouched.
+#define INIT_hipGetLastError_CB_ARGS_DATA(cb_data) { \
+};
+// hipGetDeviceFlags(unsigned int* flags): copies the same-named identifier in scope at the expansion site into cb_data.args.hipGetDeviceFlags.
+#define INIT_hipGetDeviceFlags_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipGetDeviceFlags.flags = (unsigned int*)flags; \
+};
+// hipDeviceGetSharedMemConfig(hipSharedMemConfig* pConfig): copies the same-named identifier in scope at the expansion site into cb_data.args.hipDeviceGetSharedMemConfig.
+#define INIT_hipDeviceGetSharedMemConfig_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipDeviceGetSharedMemConfig.pConfig = (hipSharedMemConfig*)pConfig; \
+};
+// hipDrvMemcpy3D(const HIP_MEMCPY3D* pCopy): copies the same-named identifier in scope at the expansion site into cb_data.args.hipDrvMemcpy3D (the pointer is stored, not the struct it points to).
+#define INIT_hipDrvMemcpy3D_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipDrvMemcpy3D.pCopy = (const HIP_MEMCPY3D*)pCopy; \
+};
+// hipMemcpy2DFromArray(void* dst, size_t dpitch, hipArray_const_t src, size_t wOffset, size_t hOffset, size_t width, size_t height, hipMemcpyKind kind): fills cb_data.args.hipMemcpy2DFromArray from identifiers in scope at the expansion site (same names except wOffset <- wOffsetSrc; hOffset is read from hOffset).
+#define INIT_hipMemcpy2DFromArray_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipMemcpy2DFromArray.dst = (void*)dst; \
+ cb_data.args.hipMemcpy2DFromArray.dpitch = (size_t)dpitch; \
+ cb_data.args.hipMemcpy2DFromArray.src = (hipArray_const_t)src; \
+ cb_data.args.hipMemcpy2DFromArray.wOffset = (size_t)wOffsetSrc; \
+ cb_data.args.hipMemcpy2DFromArray.hOffset = (size_t)hOffset; \
+ cb_data.args.hipMemcpy2DFromArray.width = (size_t)width; \
+ cb_data.args.hipMemcpy2DFromArray.height = (size_t)height; \
+ cb_data.args.hipMemcpy2DFromArray.kind = (hipMemcpyKind)kind; \
+};
+// hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int* numBlocks, const void* f, int blockSize, size_t dynamicSMemSize, unsigned int flags): copies the same-named identifiers in scope at the expansion site into the matching cb_data.args entry.
+#define INIT_hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.numBlocks = (int*)numBlocks; \
+ cb_data.args.hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.f = (const void*)f; \
+ cb_data.args.hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.blockSize = (int)blockSize; \
+ cb_data.args.hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.dynamicSMemSize = (size_t)dynamicSMemSize; \
+ cb_data.args.hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.flags = (unsigned int)flags; \
+};
+// hipSetDeviceFlags(unsigned int flags): copies the same-named identifier in scope at the expansion site into cb_data.args.hipSetDeviceFlags.
+#define INIT_hipSetDeviceFlags_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipSetDeviceFlags.flags = (unsigned int)flags; \
+};
+// hipHccModuleLaunchKernel(hipFunction_t f, unsigned int globalWorkSizeX, unsigned int globalWorkSizeY, unsigned int globalWorkSizeZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, size_t sharedMemBytes, hipStream_t hStream, void** kernelParams, void** extra, hipEvent_t startEvent, hipEvent_t stopEvent): copies the same-named identifiers in scope at the expansion site into cb_data.args.hipHccModuleLaunchKernel.
+#define INIT_hipHccModuleLaunchKernel_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipHccModuleLaunchKernel.f = (hipFunction_t)f; \
+ cb_data.args.hipHccModuleLaunchKernel.globalWorkSizeX = (unsigned int)globalWorkSizeX; \
+ cb_data.args.hipHccModuleLaunchKernel.globalWorkSizeY = (unsigned int)globalWorkSizeY; \
+ cb_data.args.hipHccModuleLaunchKernel.globalWorkSizeZ = (unsigned int)globalWorkSizeZ; \
+ cb_data.args.hipHccModuleLaunchKernel.blockDimX = (unsigned int)blockDimX; \
+ cb_data.args.hipHccModuleLaunchKernel.blockDimY = (unsigned int)blockDimY; \
+ cb_data.args.hipHccModuleLaunchKernel.blockDimZ = (unsigned int)blockDimZ; \
+ cb_data.args.hipHccModuleLaunchKernel.sharedMemBytes = (size_t)sharedMemBytes; \
+ cb_data.args.hipHccModuleLaunchKernel.hStream = (hipStream_t)hStream; \
+ cb_data.args.hipHccModuleLaunchKernel.kernelParams = (void**)kernelParams; \
+ cb_data.args.hipHccModuleLaunchKernel.extra = (void**)extra; \
+ cb_data.args.hipHccModuleLaunchKernel.startEvent = (hipEvent_t)startEvent; \
+ cb_data.args.hipHccModuleLaunchKernel.stopEvent = (hipEvent_t)stopEvent; \
+};
+// hipFree(void* ptr): copies the same-named identifier in scope at the expansion site into cb_data.args.hipFree.
+#define INIT_hipFree_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipFree.ptr = (void*)ptr; \
+};
+// hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize, const void* f, size_t dynSharedMemPerBlk, int blockSizeLimit): copies the same-named identifiers in scope at the expansion site into cb_data.args.hipOccupancyMaxPotentialBlockSize.
+#define INIT_hipOccupancyMaxPotentialBlockSize_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipOccupancyMaxPotentialBlockSize.gridSize = (int*)gridSize; \
+ cb_data.args.hipOccupancyMaxPotentialBlockSize.blockSize = (int*)blockSize; \
+ cb_data.args.hipOccupancyMaxPotentialBlockSize.f = (const void*)f; \
+ cb_data.args.hipOccupancyMaxPotentialBlockSize.dynSharedMemPerBlk = (size_t)dynSharedMemPerBlk; \
+ cb_data.args.hipOccupancyMaxPotentialBlockSize.blockSizeLimit = (int)blockSizeLimit; \
+};
+// hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int deviceId): fills cb_data.args.hipDeviceGetAttribute from identifiers in scope at the expansion site (same names except deviceId <- device).
+#define INIT_hipDeviceGetAttribute_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipDeviceGetAttribute.pi = (int*)pi; \
+ cb_data.args.hipDeviceGetAttribute.attr = (hipDeviceAttribute_t)attr; \
+ cb_data.args.hipDeviceGetAttribute.deviceId = (int)device; \
+};
+// hipDeviceComputeCapability(int* major, int* minor, hipDevice_t device): copies the same-named identifiers in scope at the expansion site into cb_data.args.hipDeviceComputeCapability.
+#define INIT_hipDeviceComputeCapability_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipDeviceComputeCapability.major = (int*)major; \
+ cb_data.args.hipDeviceComputeCapability.minor = (int*)minor; \
+ cb_data.args.hipDeviceComputeCapability.device = (hipDevice_t)device; \
+};
+// hipCtxDisablePeerAccess(hipCtx_t peerCtx): copies the same-named identifier in scope at the expansion site into cb_data.args.hipCtxDisablePeerAccess.
+#define INIT_hipCtxDisablePeerAccess_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipCtxDisablePeerAccess.peerCtx = (hipCtx_t)peerCtx; \
+};
+// hipMallocManaged(void** dev_ptr, size_t size, unsigned int flags): copies the same-named identifiers in scope at the expansion site into cb_data.args.hipMallocManaged.
+#define INIT_hipMallocManaged_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipMallocManaged.dev_ptr = (void**)dev_ptr; \
+ cb_data.args.hipMallocManaged.size = (size_t)size; \
+ cb_data.args.hipMallocManaged.flags = (unsigned int)flags; \
+};
+// hipDeviceGetByPCIBusId(int* device, const char* pciBusId): fills cb_data.args.hipDeviceGetByPCIBusId from the expansion site; pciBusId is a strdup() copy of pciBusIdstr when that is non-NULL (else NULL) and is not freed by this macro.
+#define INIT_hipDeviceGetByPCIBusId_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipDeviceGetByPCIBusId.device = (int*)device; \
+ cb_data.args.hipDeviceGetByPCIBusId.pciBusId = (pciBusIdstr) ? strdup(pciBusIdstr) : NULL; \
+};
+// hipIpcGetMemHandle(hipIpcMemHandle_t* handle, void* devPtr): fills cb_data.args.hipIpcGetMemHandle from identifiers in scope at the expansion site (same names except devPtr <- dev_ptr).
+#define INIT_hipIpcGetMemHandle_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipIpcGetMemHandle.handle = (hipIpcMemHandle_t*)handle; \
+ cb_data.args.hipIpcGetMemHandle.devPtr = (void*)dev_ptr; \
+};
+// hipMemcpyHtoDAsync(hipDeviceptr_t dst, void* src, size_t sizeBytes, hipStream_t stream): fills cb_data.args.hipMemcpyHtoDAsync from identifiers in scope at the expansion site (dst <- dstDevice, src <- srcHost, sizeBytes <- ByteCount, stream <- stream).
+#define INIT_hipMemcpyHtoDAsync_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipMemcpyHtoDAsync.dst = (hipDeviceptr_t)dstDevice; \
+ cb_data.args.hipMemcpyHtoDAsync.src = (void*)srcHost; \
+ cb_data.args.hipMemcpyHtoDAsync.sizeBytes = (size_t)ByteCount; \
+ cb_data.args.hipMemcpyHtoDAsync.stream = (hipStream_t)stream; \
+};
+// hipCtxGetDevice(hipDevice_t* device): copies the same-named identifier in scope at the expansion site into cb_data.args.hipCtxGetDevice.
+#define INIT_hipCtxGetDevice_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipCtxGetDevice.device = (hipDevice_t*)device; \
+};
+// hipMemcpyDtoD(hipDeviceptr_t dst, hipDeviceptr_t src, size_t sizeBytes): fills cb_data.args.hipMemcpyDtoD from identifiers in scope at the expansion site (dst <- dstDevice, src <- srcDevice, sizeBytes <- ByteCount).
+#define INIT_hipMemcpyDtoD_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipMemcpyDtoD.dst = (hipDeviceptr_t)dstDevice; \
+ cb_data.args.hipMemcpyDtoD.src = (hipDeviceptr_t)srcDevice; \
+ cb_data.args.hipMemcpyDtoD.sizeBytes = (size_t)ByteCount; \
+};
+// hipModuleLoadData(hipModule_t* module, const void* image): copies the same-named identifiers in scope at the expansion site into cb_data.args.hipModuleLoadData.
+#define INIT_hipModuleLoadData_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipModuleLoadData.module = (hipModule_t*)module; \
+ cb_data.args.hipModuleLoadData.image = (const void*)image; \
+};
+// hipDevicePrimaryCtxRelease(hipDevice_t dev): copies the same-named identifier in scope at the expansion site into cb_data.args.hipDevicePrimaryCtxRelease.
+#define INIT_hipDevicePrimaryCtxRelease_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipDevicePrimaryCtxRelease.dev = (hipDevice_t)dev; \
+};
+// hipOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks, const void* f, int blockSize, size_t dynamicSMemSize): copies the same-named identifiers in scope at the expansion site into cb_data.args.hipOccupancyMaxActiveBlocksPerMultiprocessor.
+#define INIT_hipOccupancyMaxActiveBlocksPerMultiprocessor_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipOccupancyMaxActiveBlocksPerMultiprocessor.numBlocks = (int*)numBlocks; \
+ cb_data.args.hipOccupancyMaxActiveBlocksPerMultiprocessor.f = (const void*)f; \
+ cb_data.args.hipOccupancyMaxActiveBlocksPerMultiprocessor.blockSize = (int)blockSize; \
+ cb_data.args.hipOccupancyMaxActiveBlocksPerMultiprocessor.dynamicSMemSize = (size_t)dynamicSMemSize; \
+};
+// hipCtxSetCurrent(hipCtx_t ctx): copies the same-named identifier in scope at the expansion site into cb_data.args.hipCtxSetCurrent.
+#define INIT_hipCtxSetCurrent_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipCtxSetCurrent.ctx = (hipCtx_t)ctx; \
+};
+// hipGetErrorString(): no arguments are listed for this API here, so the macro expands to an empty block and leaves cb_data untouched.
+#define INIT_hipGetErrorString_CB_ARGS_DATA(cb_data) { \
+};
+// hipStreamCreate(hipStream_t* stream): copies the same-named identifier in scope at the expansion site into cb_data.args.hipStreamCreate.
+#define INIT_hipStreamCreate_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipStreamCreate.stream = (hipStream_t*)stream; \
+};
+// hipDevicePrimaryCtxRetain(hipCtx_t* pctx, hipDevice_t dev): copies the same-named identifiers in scope at the expansion site into cb_data.args.hipDevicePrimaryCtxRetain.
+#define INIT_hipDevicePrimaryCtxRetain_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipDevicePrimaryCtxRetain.pctx = (hipCtx_t*)pctx; \
+ cb_data.args.hipDevicePrimaryCtxRetain.dev = (hipDevice_t)dev; \
+};
+// hipDeviceGet(hipDevice_t* device, int ordinal): fills cb_data.args.hipDeviceGet from identifiers in scope at the expansion site (same names except ordinal <- deviceId).
+#define INIT_hipDeviceGet_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipDeviceGet.device = (hipDevice_t*)device; \
+ cb_data.args.hipDeviceGet.ordinal = (int)deviceId; \
+};
+// hipStreamCreateWithFlags(hipStream_t* stream, unsigned int flags): copies the same-named identifiers in scope at the expansion site into cb_data.args.hipStreamCreateWithFlags.
+#define INIT_hipStreamCreateWithFlags_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipStreamCreateWithFlags.stream = (hipStream_t*)stream; \
+ cb_data.args.hipStreamCreateWithFlags.flags = (unsigned int)flags; \
+};
+// hipMemcpyFromArray(void* dst, hipArray_const_t srcArray, size_t wOffset, size_t hOffset, size_t count, hipMemcpyKind kind): fills cb_data.args.hipMemcpyFromArray from identifiers in scope at the expansion site (same names except srcArray <- src, wOffset <- wOffsetSrc).
+#define INIT_hipMemcpyFromArray_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipMemcpyFromArray.dst = (void*)dst; \
+ cb_data.args.hipMemcpyFromArray.srcArray = (hipArray_const_t)src; \
+ cb_data.args.hipMemcpyFromArray.wOffset = (size_t)wOffsetSrc; \
+ cb_data.args.hipMemcpyFromArray.hOffset = (size_t)hOffset; \
+ cb_data.args.hipMemcpyFromArray.count = (size_t)count; \
+ cb_data.args.hipMemcpyFromArray.kind = (hipMemcpyKind)kind; \
+};
+// hipMemcpy2DAsync(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind, hipStream_t stream): copies the same-named identifiers in scope at the expansion site into cb_data.args.hipMemcpy2DAsync.
+#define INIT_hipMemcpy2DAsync_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipMemcpy2DAsync.dst = (void*)dst; \
+ cb_data.args.hipMemcpy2DAsync.dpitch = (size_t)dpitch; \
+ cb_data.args.hipMemcpy2DAsync.src = (const void*)src; \
+ cb_data.args.hipMemcpy2DAsync.spitch = (size_t)spitch; \
+ cb_data.args.hipMemcpy2DAsync.width = (size_t)width; \
+ cb_data.args.hipMemcpy2DAsync.height = (size_t)height; \
+ cb_data.args.hipMemcpy2DAsync.kind = (hipMemcpyKind)kind; \
+ cb_data.args.hipMemcpy2DAsync.stream = (hipStream_t)stream; \
+};
+// hipFuncGetAttributes(hipFuncAttributes* attr, const void* func): copies the same-named identifiers in scope at the expansion site into cb_data.args.hipFuncGetAttributes.
+#define INIT_hipFuncGetAttributes_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipFuncGetAttributes.attr = (hipFuncAttributes*)attr; \
+ cb_data.args.hipFuncGetAttributes.func = (const void*)func; \
+};
+// hipGetSymbolSize(size_t* size, const void* symbol): fills cb_data.args.hipGetSymbolSize from identifiers in scope at the expansion site (same names except size <- sizePtr).
+#define INIT_hipGetSymbolSize_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipGetSymbolSize.size = (size_t*)sizePtr; \
+ cb_data.args.hipGetSymbolSize.symbol = (const void*)symbol; \
+};
+// hipHostFree(void* ptr): copies the same-named identifier in scope at the expansion site into cb_data.args.hipHostFree.
+#define INIT_hipHostFree_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipHostFree.ptr = (void*)ptr; \
+};
+// hipEventCreateWithFlags(hipEvent_t* event, unsigned int flags): copies the same-named identifiers in scope at the expansion site into cb_data.args.hipEventCreateWithFlags.
+#define INIT_hipEventCreateWithFlags_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipEventCreateWithFlags.event = (hipEvent_t*)event; \
+ cb_data.args.hipEventCreateWithFlags.flags = (unsigned int)flags; \
+};
+// hipStreamQuery(hipStream_t stream): copies the same-named identifier in scope at the expansion site into cb_data.args.hipStreamQuery.
+#define INIT_hipStreamQuery_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipStreamQuery.stream = (hipStream_t)stream; \
+};
+// hipMemcpy3D(const hipMemcpy3DParms* p): copies the same-named identifier in scope at the expansion site into cb_data.args.hipMemcpy3D (the pointer is stored, not the struct it points to).
+#define INIT_hipMemcpy3D_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipMemcpy3D.p = (const hipMemcpy3DParms*)p; \
+};
+// hipMemcpyToSymbol(const void* symbol, const void* src, size_t sizeBytes, size_t offset, hipMemcpyKind kind): copies the same-named identifiers in scope at the expansion site into cb_data.args.hipMemcpyToSymbol.
+#define INIT_hipMemcpyToSymbol_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipMemcpyToSymbol.symbol = (const void*)symbol; \
+ cb_data.args.hipMemcpyToSymbol.src = (const void*)src; \
+ cb_data.args.hipMemcpyToSymbol.sizeBytes = (size_t)sizeBytes; \
+ cb_data.args.hipMemcpyToSymbol.offset = (size_t)offset; \
+ cb_data.args.hipMemcpyToSymbol.kind = (hipMemcpyKind)kind; \
+};
+// hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind): copies the same-named identifiers in scope at the expansion site into cb_data.args.hipMemcpy.
+#define INIT_hipMemcpy_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipMemcpy.dst = (void*)dst; \
+ cb_data.args.hipMemcpy.src = (const void*)src; \
+ cb_data.args.hipMemcpy.sizeBytes = (size_t)sizeBytes; \
+ cb_data.args.hipMemcpy.kind = (hipMemcpyKind)kind; \
+};
+// hipPeekAtLastError(): no arguments are listed for this API, so the macro expands to an empty block and leaves cb_data untouched.
+#define INIT_hipPeekAtLastError_CB_ARGS_DATA(cb_data) { \
+};
+// hipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList, int numDevices, unsigned int flags): copies the same-named identifiers in scope at the expansion site into cb_data.args.hipExtLaunchMultiKernelMultiDevice.
+#define INIT_hipExtLaunchMultiKernelMultiDevice_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipExtLaunchMultiKernelMultiDevice.launchParamsList = (hipLaunchParams*)launchParamsList; \
+ cb_data.args.hipExtLaunchMultiKernelMultiDevice.numDevices = (int)numDevices; \
+ cb_data.args.hipExtLaunchMultiKernelMultiDevice.flags = (unsigned int)flags; \
+};
+// hipHostAlloc(void** ptr, size_t size, unsigned int flags): fills cb_data.args.hipHostAlloc from identifiers in scope at the expansion site (same names except size <- sizeBytes).
+#define INIT_hipHostAlloc_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipHostAlloc.ptr = (void**)ptr; \
+ cb_data.args.hipHostAlloc.size = (size_t)sizeBytes; \
+ cb_data.args.hipHostAlloc.flags = (unsigned int)flags; \
+};
+// hipStreamAddCallback(hipStream_t stream, hipStreamCallback_t callback, void* userData, unsigned int flags): copies the same-named identifiers in scope at the expansion site into cb_data.args.hipStreamAddCallback.
+#define INIT_hipStreamAddCallback_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipStreamAddCallback.stream = (hipStream_t)stream; \
+ cb_data.args.hipStreamAddCallback.callback = (hipStreamCallback_t)callback; \
+ cb_data.args.hipStreamAddCallback.userData = (void*)userData; \
+ cb_data.args.hipStreamAddCallback.flags = (unsigned int)flags; \
+};
+// hipMemcpyToArray(hipArray* dst, size_t wOffset, size_t hOffset, const void* src, size_t count, hipMemcpyKind kind): copies the same-named identifiers in scope at the expansion site into cb_data.args.hipMemcpyToArray.
+#define INIT_hipMemcpyToArray_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipMemcpyToArray.dst = (hipArray*)dst; \
+ cb_data.args.hipMemcpyToArray.wOffset = (size_t)wOffset; \
+ cb_data.args.hipMemcpyToArray.hOffset = (size_t)hOffset; \
+ cb_data.args.hipMemcpyToArray.src = (const void*)src; \
+ cb_data.args.hipMemcpyToArray.count = (size_t)count; \
+ cb_data.args.hipMemcpyToArray.kind = (hipMemcpyKind)kind; \
+};
+// hipMemsetD32(hipDeviceptr_t dest, int value, size_t count): fills cb_data.args.hipMemsetD32 from identifiers in scope at the expansion site (same names except dest <- dst).
+#define INIT_hipMemsetD32_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipMemsetD32.dest = (hipDeviceptr_t)dst; \
+ cb_data.args.hipMemsetD32.value = (int)value; \
+ cb_data.args.hipMemsetD32.count = (size_t)count; \
+};
+// hipExtModuleLaunchKernel(hipFunction_t f, unsigned int globalWorkSizeX, unsigned int globalWorkSizeY, unsigned int globalWorkSizeZ, unsigned int localWorkSizeX, unsigned int localWorkSizeY, unsigned int localWorkSizeZ, size_t sharedMemBytes, hipStream_t hStream, void** kernelParams, void** extra, hipEvent_t startEvent, hipEvent_t stopEvent, unsigned int flags): copies the same-named identifiers in scope at the expansion site into cb_data.args.hipExtModuleLaunchKernel.
+#define INIT_hipExtModuleLaunchKernel_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipExtModuleLaunchKernel.f = (hipFunction_t)f; \
+ cb_data.args.hipExtModuleLaunchKernel.globalWorkSizeX = (unsigned int)globalWorkSizeX; \
+ cb_data.args.hipExtModuleLaunchKernel.globalWorkSizeY = (unsigned int)globalWorkSizeY; \
+ cb_data.args.hipExtModuleLaunchKernel.globalWorkSizeZ = (unsigned int)globalWorkSizeZ; \
+ cb_data.args.hipExtModuleLaunchKernel.localWorkSizeX = (unsigned int)localWorkSizeX; \
+ cb_data.args.hipExtModuleLaunchKernel.localWorkSizeY = (unsigned int)localWorkSizeY; \
+ cb_data.args.hipExtModuleLaunchKernel.localWorkSizeZ = (unsigned int)localWorkSizeZ; \
+ cb_data.args.hipExtModuleLaunchKernel.sharedMemBytes = (size_t)sharedMemBytes; \
+ cb_data.args.hipExtModuleLaunchKernel.hStream = (hipStream_t)hStream; \
+ cb_data.args.hipExtModuleLaunchKernel.kernelParams = (void**)kernelParams; \
+ cb_data.args.hipExtModuleLaunchKernel.extra = (void**)extra; \
+ cb_data.args.hipExtModuleLaunchKernel.startEvent = (hipEvent_t)startEvent; \
+ cb_data.args.hipExtModuleLaunchKernel.stopEvent = (hipEvent_t)stopEvent; \
+ cb_data.args.hipExtModuleLaunchKernel.flags = (unsigned int)flags; \
+};
+// hipDeviceSynchronize(): the API has no arguments, so this macro expands to an empty block and records nothing in cb_data.
+#define INIT_hipDeviceSynchronize_CB_ARGS_DATA(cb_data) { \
+};
+// hipDeviceGetCacheConfig(hipFuncCache_t* cacheConfig): stores the in-scope `cacheConfig` pointer into cb_data.args.hipDeviceGetCacheConfig (the pointer itself, not the pointee).
+#define INIT_hipDeviceGetCacheConfig_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipDeviceGetCacheConfig.cacheConfig = (hipFuncCache_t*)cacheConfig; \
+};
+// hipMalloc3D(hipPitchedPtr* pitchedDevPtr, hipExtent extent): fills cb_data.args.hipMalloc3D from the same-named variables in scope where the macro is expanded.
+#define INIT_hipMalloc3D_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipMalloc3D.pitchedDevPtr = (hipPitchedPtr*)pitchedDevPtr; \
+ cb_data.args.hipMalloc3D.extent = (hipExtent)extent; \
+};
+// hipPointerGetAttributes(hipPointerAttribute_t* attributes, const void* ptr): fills cb_data.args.hipPointerGetAttributes from the same-named variables in scope where the macro is expanded.
+#define INIT_hipPointerGetAttributes_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipPointerGetAttributes.attributes = (hipPointerAttribute_t*)attributes; \
+ cb_data.args.hipPointerGetAttributes.ptr = (const void*)ptr; \
+};
+// hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t stream): fills cb_data.args.hipMemsetAsync from the same-named variables in scope where the macro is expanded.
+#define INIT_hipMemsetAsync_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipMemsetAsync.dst = (void*)dst; \
+ cb_data.args.hipMemsetAsync.value = (int)value; \
+ cb_data.args.hipMemsetAsync.sizeBytes = (size_t)sizeBytes; \
+ cb_data.args.hipMemsetAsync.stream = (hipStream_t)stream; \
+};
+// hipDeviceGetName(char* name, int len, hipDevice_t device): fills cb_data.args.hipDeviceGetName from the same-named variables in scope where the macro is expanded; `name` is stored as a pointer, not copied.
+#define INIT_hipDeviceGetName_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipDeviceGetName.name = (char*)name; \
+ cb_data.args.hipDeviceGetName.len = (int)len; \
+ cb_data.args.hipDeviceGetName.device = (hipDevice_t)device; \
+};
+// hipModuleOccupancyMaxPotentialBlockSizeWithFlags(int* gridSize, int* blockSize, hipFunction_t f, size_t dynSharedMemPerBlk, int blockSizeLimit, unsigned int flags): fills the matching cb_data.args member from the same-named variables in scope where the macro is expanded.
+#define INIT_hipModuleOccupancyMaxPotentialBlockSizeWithFlags_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipModuleOccupancyMaxPotentialBlockSizeWithFlags.gridSize = (int*)gridSize; \
+ cb_data.args.hipModuleOccupancyMaxPotentialBlockSizeWithFlags.blockSize = (int*)blockSize; \
+ cb_data.args.hipModuleOccupancyMaxPotentialBlockSizeWithFlags.f = (hipFunction_t)f; \
+ cb_data.args.hipModuleOccupancyMaxPotentialBlockSizeWithFlags.dynSharedMemPerBlk = (size_t)dynSharedMemPerBlk; \
+ cb_data.args.hipModuleOccupancyMaxPotentialBlockSizeWithFlags.blockSizeLimit = (int)blockSizeLimit; \
+ cb_data.args.hipModuleOccupancyMaxPotentialBlockSizeWithFlags.flags = (unsigned int)flags; \
+};
+// hipCtxPushCurrent(hipCtx_t ctx): stores the in-scope `ctx` variable into cb_data.args.hipCtxPushCurrent.
+#define INIT_hipCtxPushCurrent_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipCtxPushCurrent.ctx = (hipCtx_t)ctx; \
+};
+// hipMemcpyPeer(void* dst, int dstDeviceId, const void* src, int srcDeviceId, size_t sizeBytes): fills cb_data.args.hipMemcpyPeer from variables in scope where the macro is expanded; note the `dstDeviceId`/`srcDeviceId` fields are read from variables named `dstDevice`/`srcDevice`.
+#define INIT_hipMemcpyPeer_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipMemcpyPeer.dst = (void*)dst; \
+ cb_data.args.hipMemcpyPeer.dstDeviceId = (int)dstDevice; \
+ cb_data.args.hipMemcpyPeer.src = (const void*)src; \
+ cb_data.args.hipMemcpyPeer.srcDeviceId = (int)srcDevice; \
+ cb_data.args.hipMemcpyPeer.sizeBytes = (size_t)sizeBytes; \
+};
+// hipEventSynchronize(hipEvent_t event): stores the in-scope `event` variable into cb_data.args.hipEventSynchronize.
+#define INIT_hipEventSynchronize_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipEventSynchronize.event = (hipEvent_t)event; \
+};
+// hipMemcpyDtoDAsync(hipDeviceptr_t dst, hipDeviceptr_t src, size_t sizeBytes, hipStream_t stream): fills cb_data.args.hipMemcpyDtoDAsync from variables in scope where the macro is expanded; note `dst`/`src`/`sizeBytes` are read from variables named `dstDevice`/`srcDevice`/`ByteCount`.
+#define INIT_hipMemcpyDtoDAsync_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipMemcpyDtoDAsync.dst = (hipDeviceptr_t)dstDevice; \
+ cb_data.args.hipMemcpyDtoDAsync.src = (hipDeviceptr_t)srcDevice; \
+ cb_data.args.hipMemcpyDtoDAsync.sizeBytes = (size_t)ByteCount; \
+ cb_data.args.hipMemcpyDtoDAsync.stream = (hipStream_t)stream; \
+};
+// hipProfilerStart(): the API has no arguments, so this macro expands to an empty block and records nothing in cb_data.
+#define INIT_hipProfilerStart_CB_ARGS_DATA(cb_data) { \
+};
+// hipExtMallocWithFlags(void** ptr, size_t sizeBytes, unsigned int flags): fills cb_data.args.hipExtMallocWithFlags from the same-named variables in scope where the macro is expanded.
+#define INIT_hipExtMallocWithFlags_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipExtMallocWithFlags.ptr = (void**)ptr; \
+ cb_data.args.hipExtMallocWithFlags.sizeBytes = (size_t)sizeBytes; \
+ cb_data.args.hipExtMallocWithFlags.flags = (unsigned int)flags; \
+};
+// hipCtxEnablePeerAccess(hipCtx_t peerCtx, unsigned int flags): fills cb_data.args.hipCtxEnablePeerAccess from the same-named variables in scope where the macro is expanded.
+#define INIT_hipCtxEnablePeerAccess_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipCtxEnablePeerAccess.peerCtx = (hipCtx_t)peerCtx; \
+ cb_data.args.hipCtxEnablePeerAccess.flags = (unsigned int)flags; \
+};
+// hipMemAllocHost(void** ptr, size_t size): fills cb_data.args.hipMemAllocHost from the same-named variables in scope where the macro is expanded.
+#define INIT_hipMemAllocHost_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipMemAllocHost.ptr = (void**)ptr; \
+ cb_data.args.hipMemAllocHost.size = (size_t)size; \
+};
+// hipMemcpyDtoHAsync(void* dst, hipDeviceptr_t src, size_t sizeBytes, hipStream_t stream): fills cb_data.args.hipMemcpyDtoHAsync from variables in scope where the macro is expanded; note `dst`/`src`/`sizeBytes` are read from variables named `dstHost`/`srcDevice`/`ByteCount`.
+#define INIT_hipMemcpyDtoHAsync_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipMemcpyDtoHAsync.dst = (void*)dstHost; \
+ cb_data.args.hipMemcpyDtoHAsync.src = (hipDeviceptr_t)srcDevice; \
+ cb_data.args.hipMemcpyDtoHAsync.sizeBytes = (size_t)ByteCount; \
+ cb_data.args.hipMemcpyDtoHAsync.stream = (hipStream_t)stream; \
+};
+// hipModuleLaunchKernel(hipFunction_t f, unsigned int gridDimX/Y/Z, unsigned int blockDimX/Y/Z, unsigned int sharedMemBytes, hipStream_t stream, void** kernelParams, void** extra): fills cb_data.args.hipModuleLaunchKernel from variables in scope where the macro is expanded; note the `stream` field is read from a variable named `hStream`.
+#define INIT_hipModuleLaunchKernel_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipModuleLaunchKernel.f = (hipFunction_t)f; \
+ cb_data.args.hipModuleLaunchKernel.gridDimX = (unsigned int)gridDimX; \
+ cb_data.args.hipModuleLaunchKernel.gridDimY = (unsigned int)gridDimY; \
+ cb_data.args.hipModuleLaunchKernel.gridDimZ = (unsigned int)gridDimZ; \
+ cb_data.args.hipModuleLaunchKernel.blockDimX = (unsigned int)blockDimX; \
+ cb_data.args.hipModuleLaunchKernel.blockDimY = (unsigned int)blockDimY; \
+ cb_data.args.hipModuleLaunchKernel.blockDimZ = (unsigned int)blockDimZ; \
+ cb_data.args.hipModuleLaunchKernel.sharedMemBytes = (unsigned int)sharedMemBytes; \
+ cb_data.args.hipModuleLaunchKernel.stream = (hipStream_t)hStream; \
+ cb_data.args.hipModuleLaunchKernel.kernelParams = (void**)kernelParams; \
+ cb_data.args.hipModuleLaunchKernel.extra = (void**)extra; \
+};
+// hipMemAllocPitch(hipDeviceptr_t* dptr, size_t* pitch, size_t widthInBytes, size_t height, unsigned int elementSizeBytes): fills cb_data.args.hipMemAllocPitch from the same-named variables in scope where the macro is expanded.
+#define INIT_hipMemAllocPitch_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipMemAllocPitch.dptr = (hipDeviceptr_t*)dptr; \
+ cb_data.args.hipMemAllocPitch.pitch = (size_t*)pitch; \
+ cb_data.args.hipMemAllocPitch.widthInBytes = (size_t)widthInBytes; \
+ cb_data.args.hipMemAllocPitch.height = (size_t)height; \
+ cb_data.args.hipMemAllocPitch.elementSizeBytes = (unsigned int)elementSizeBytes; \
+};
+// hipExtLaunchKernel(const void* function_address, dim3 numBlocks, dim3 dimBlocks, void** args, size_t sharedMemBytes, hipStream_t stream, hipEvent_t startEvent, hipEvent_t stopEvent, int flags): fills cb_data.args.hipExtLaunchKernel from variables in scope where the macro is expanded; note `function_address`/`numBlocks`/`dimBlocks` are read from variables named `hostFunction`/`gridDim`/`blockDim`.
+#define INIT_hipExtLaunchKernel_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipExtLaunchKernel.function_address = (const void*)hostFunction; \
+ cb_data.args.hipExtLaunchKernel.numBlocks = (dim3)gridDim; \
+ cb_data.args.hipExtLaunchKernel.dimBlocks = (dim3)blockDim; \
+ cb_data.args.hipExtLaunchKernel.args = (void**)args; \
+ cb_data.args.hipExtLaunchKernel.sharedMemBytes = (size_t)sharedMemBytes; \
+ cb_data.args.hipExtLaunchKernel.stream = (hipStream_t)stream; \
+ cb_data.args.hipExtLaunchKernel.startEvent = (hipEvent_t)startEvent; \
+ cb_data.args.hipExtLaunchKernel.stopEvent = (hipEvent_t)stopEvent; \
+ cb_data.args.hipExtLaunchKernel.flags = (int)flags; \
+};
+// hipMemcpy2DFromArrayAsync(void* dst, size_t dpitch, hipArray_const_t src, size_t wOffset, size_t hOffset, size_t width, size_t height, hipMemcpyKind kind, hipStream_t stream): fills cb_data.args.hipMemcpy2DFromArrayAsync from variables in scope where the macro is expanded; note `wOffset`/`hOffset` are read from variables named `wOffsetSrc`/`hOffsetSrc`.
+#define INIT_hipMemcpy2DFromArrayAsync_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipMemcpy2DFromArrayAsync.dst = (void*)dst; \
+ cb_data.args.hipMemcpy2DFromArrayAsync.dpitch = (size_t)dpitch; \
+ cb_data.args.hipMemcpy2DFromArrayAsync.src = (hipArray_const_t)src; \
+ cb_data.args.hipMemcpy2DFromArrayAsync.wOffset = (size_t)wOffsetSrc; \
+ cb_data.args.hipMemcpy2DFromArrayAsync.hOffset = (size_t)hOffsetSrc; \
+ cb_data.args.hipMemcpy2DFromArrayAsync.width = (size_t)width; \
+ cb_data.args.hipMemcpy2DFromArrayAsync.height = (size_t)height; \
+ cb_data.args.hipMemcpy2DFromArrayAsync.kind = (hipMemcpyKind)kind; \
+ cb_data.args.hipMemcpy2DFromArrayAsync.stream = (hipStream_t)stream; \
+};
+// hipDeviceGetLimit(size_t* pValue, hipLimit_t limit): fills cb_data.args.hipDeviceGetLimit from the same-named variables in scope where the macro is expanded.
+#define INIT_hipDeviceGetLimit_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipDeviceGetLimit.pValue = (size_t*)pValue; \
+ cb_data.args.hipDeviceGetLimit.limit = (hipLimit_t)limit; \
+};
+// hipModuleLoadDataEx(hipModule_t* module, const void* image, unsigned int numOptions, hipJitOption* options, void** optionsValues): fills cb_data.args.hipModuleLoadDataEx from the same-named variables in scope where the macro is expanded.
+#define INIT_hipModuleLoadDataEx_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipModuleLoadDataEx.module = (hipModule_t*)module; \
+ cb_data.args.hipModuleLoadDataEx.image = (const void*)image; \
+ cb_data.args.hipModuleLoadDataEx.numOptions = (unsigned int)numOptions; \
+ cb_data.args.hipModuleLoadDataEx.options = (hipJitOption*)options; \
+ cb_data.args.hipModuleLoadDataEx.optionsValues = (void**)optionsValues; \
+};
+// hipRuntimeGetVersion(int* runtimeVersion): stores the in-scope `runtimeVersion` pointer into cb_data.args.hipRuntimeGetVersion (the pointer itself, not the pointee).
+#define INIT_hipRuntimeGetVersion_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipRuntimeGetVersion.runtimeVersion = (int*)runtimeVersion; \
+};
+// hipMemRangeGetAttribute(void* data, size_t data_size, hipMemRangeAttribute attribute, const void* dev_ptr, size_t count): fills cb_data.args.hipMemRangeGetAttribute from the same-named variables in scope where the macro is expanded.
+#define INIT_hipMemRangeGetAttribute_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipMemRangeGetAttribute.data = (void*)data; \
+ cb_data.args.hipMemRangeGetAttribute.data_size = (size_t)data_size; \
+ cb_data.args.hipMemRangeGetAttribute.attribute = (hipMemRangeAttribute)attribute; \
+ cb_data.args.hipMemRangeGetAttribute.dev_ptr = (const void*)dev_ptr; \
+ cb_data.args.hipMemRangeGetAttribute.count = (size_t)count; \
+};
+// hipDeviceGetP2PAttribute(int* value, hipDeviceP2PAttr attr, int srcDevice, int dstDevice): fills cb_data.args.hipDeviceGetP2PAttribute from the same-named variables in scope where the macro is expanded.
+#define INIT_hipDeviceGetP2PAttribute_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipDeviceGetP2PAttribute.value = (int*)value; \
+ cb_data.args.hipDeviceGetP2PAttribute.attr = (hipDeviceP2PAttr)attr; \
+ cb_data.args.hipDeviceGetP2PAttribute.srcDevice = (int)srcDevice; \
+ cb_data.args.hipDeviceGetP2PAttribute.dstDevice = (int)dstDevice; \
+};
+// hipMemcpyPeerAsync(void* dst, int dstDeviceId, const void* src, int srcDevice, size_t sizeBytes, hipStream_t stream): fills cb_data.args.hipMemcpyPeerAsync from variables in scope where the macro is expanded; note the `dstDeviceId` field is read from a variable named `dstDevice`.
+#define INIT_hipMemcpyPeerAsync_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipMemcpyPeerAsync.dst = (void*)dst; \
+ cb_data.args.hipMemcpyPeerAsync.dstDeviceId = (int)dstDevice; \
+ cb_data.args.hipMemcpyPeerAsync.src = (const void*)src; \
+ cb_data.args.hipMemcpyPeerAsync.srcDevice = (int)srcDevice; \
+ cb_data.args.hipMemcpyPeerAsync.sizeBytes = (size_t)sizeBytes; \
+ cb_data.args.hipMemcpyPeerAsync.stream = (hipStream_t)stream; \
+};
+// hipGetDeviceProperties(hipDeviceProp_t* props, hipDevice_t device): fills cb_data.args.hipGetDeviceProperties from the same-named variables in scope where the macro is expanded.
+#define INIT_hipGetDeviceProperties_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipGetDeviceProperties.props = (hipDeviceProp_t*)props; \
+ cb_data.args.hipGetDeviceProperties.device = (hipDevice_t)device; \
+};
+// hipMemcpyDtoH(void* dst, hipDeviceptr_t src, size_t sizeBytes): fills cb_data.args.hipMemcpyDtoH from variables in scope where the macro is expanded; note `dst`/`src`/`sizeBytes` are read from variables named `dstHost`/`srcDevice`/`ByteCount`.
+#define INIT_hipMemcpyDtoH_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipMemcpyDtoH.dst = (void*)dstHost; \
+ cb_data.args.hipMemcpyDtoH.src = (hipDeviceptr_t)srcDevice; \
+ cb_data.args.hipMemcpyDtoH.sizeBytes = (size_t)ByteCount; \
+};
+// hipMemcpyWithStream(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream): fills cb_data.args.hipMemcpyWithStream from the same-named variables in scope where the macro is expanded.
+#define INIT_hipMemcpyWithStream_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipMemcpyWithStream.dst = (void*)dst; \
+ cb_data.args.hipMemcpyWithStream.src = (const void*)src; \
+ cb_data.args.hipMemcpyWithStream.sizeBytes = (size_t)sizeBytes; \
+ cb_data.args.hipMemcpyWithStream.kind = (hipMemcpyKind)kind; \
+ cb_data.args.hipMemcpyWithStream.stream = (hipStream_t)stream; \
+};
+// hipDeviceTotalMem(size_t* bytes, hipDevice_t device): fills cb_data.args.hipDeviceTotalMem from the same-named variables in scope where the macro is expanded.
+#define INIT_hipDeviceTotalMem_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipDeviceTotalMem.bytes = (size_t*)bytes; \
+ cb_data.args.hipDeviceTotalMem.device = (hipDevice_t)device; \
+};
+// hipHostGetDevicePointer(void** devPtr, void* hstPtr, unsigned int flags): fills cb_data.args.hipHostGetDevicePointer from variables in scope where the macro is expanded; note `devPtr`/`hstPtr` are read from variables named `devicePointer`/`hostPointer`.
+#define INIT_hipHostGetDevicePointer_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipHostGetDevicePointer.devPtr = (void**)devicePointer; \
+ cb_data.args.hipHostGetDevicePointer.hstPtr = (void*)hostPointer; \
+ cb_data.args.hipHostGetDevicePointer.flags = (unsigned int)flags; \
+};
+// hipMemRangeGetAttributes(void** data, size_t* data_sizes, hipMemRangeAttribute* attributes, size_t num_attributes, const void* dev_ptr, size_t count): fills cb_data.args.hipMemRangeGetAttributes from the same-named variables in scope where the macro is expanded.
+#define INIT_hipMemRangeGetAttributes_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipMemRangeGetAttributes.data = (void**)data; \
+ cb_data.args.hipMemRangeGetAttributes.data_sizes = (size_t*)data_sizes; \
+ cb_data.args.hipMemRangeGetAttributes.attributes = (hipMemRangeAttribute*)attributes; \
+ cb_data.args.hipMemRangeGetAttributes.num_attributes = (size_t)num_attributes; \
+ cb_data.args.hipMemRangeGetAttributes.dev_ptr = (const void*)dev_ptr; \
+ cb_data.args.hipMemRangeGetAttributes.count = (size_t)count; \
+};
+// hipMemcpyParam2D(const hip_Memcpy2D* pCopy): stores the in-scope `pCopy` pointer into cb_data.args.hipMemcpyParam2D (the pointer itself, not the pointee).
+#define INIT_hipMemcpyParam2D_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipMemcpyParam2D.pCopy = (const hip_Memcpy2D*)pCopy; \
+};
+// hipDevicePrimaryCtxReset(hipDevice_t dev): stores the in-scope `dev` variable into cb_data.args.hipDevicePrimaryCtxReset.
+#define INIT_hipDevicePrimaryCtxReset_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipDevicePrimaryCtxReset.dev = (hipDevice_t)dev; \
+};
+// hipGetMipmappedArrayLevel(hipArray_t* levelArray, hipMipmappedArray_const_t mipmappedArray, unsigned int level): fills cb_data.args.hipGetMipmappedArrayLevel from the same-named variables in scope where the macro is expanded.
+#define INIT_hipGetMipmappedArrayLevel_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipGetMipmappedArrayLevel.levelArray = (hipArray_t*)levelArray; \
+ cb_data.args.hipGetMipmappedArrayLevel.mipmappedArray = (hipMipmappedArray_const_t)mipmappedArray; \
+ cb_data.args.hipGetMipmappedArrayLevel.level = (unsigned int)level; \
+};
+// hipMemsetD32Async(hipDeviceptr_t dst, int value, size_t count, hipStream_t stream): fills cb_data.args.hipMemsetD32Async from the same-named variables in scope where the macro is expanded.
+#define INIT_hipMemsetD32Async_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipMemsetD32Async.dst = (hipDeviceptr_t)dst; \
+ cb_data.args.hipMemsetD32Async.value = (int)value; \
+ cb_data.args.hipMemsetD32Async.count = (size_t)count; \
+ cb_data.args.hipMemsetD32Async.stream = (hipStream_t)stream; \
+};
+// hipGetDevice(int* deviceId): stores the in-scope `deviceId` pointer into cb_data.args.hipGetDevice (the pointer itself, not the pointee).
+#define INIT_hipGetDevice_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipGetDevice.deviceId = (int*)deviceId; \
+};
+// hipGetDeviceCount(int* count): stores the in-scope `count` pointer into cb_data.args.hipGetDeviceCount (the pointer itself, not the pointee).
+#define INIT_hipGetDeviceCount_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipGetDeviceCount.count = (int*)count; \
+};
+// hipIpcOpenEventHandle(hipEvent_t* event, hipIpcEventHandle_t handle): fills cb_data.args.hipIpcOpenEventHandle from the same-named variables in scope where the macro is expanded.
+#define INIT_hipIpcOpenEventHandle_CB_ARGS_DATA(cb_data) { \
+ cb_data.args.hipIpcOpenEventHandle.event = (hipEvent_t*)event; \
+ cb_data.args.hipIpcOpenEventHandle.handle = (hipIpcEventHandle_t)handle; \
+};
+#define INIT_CB_ARGS_DATA(cb_id, cb_data) INIT_##cb_id##_CB_ARGS_DATA(cb_data) /* Dispatcher: token-pastes cb_id to select the per-API INIT_<cb_id>_CB_ARGS_DATA macro and forwards cb_data to it. */
+#if HIP_PROF_HIP_API_STRING
+
+// HIP API args filling method
+static inline void hipApiArgsInit(hip_api_id_t id, hip_api_data_t* data) {
+ switch (id) {
+// hipDrvMemcpy3DAsync[('const HIP_MEMCPY3D*', 'pCopy'), ('hipStream_t', 'stream')]
+ case HIP_API_ID_hipDrvMemcpy3DAsync:
+ if (data->args.hipDrvMemcpy3DAsync.pCopy) data->args.hipDrvMemcpy3DAsync.pCopy__val = *(data->args.hipDrvMemcpy3DAsync.pCopy);
+ break;
+// hipDeviceEnablePeerAccess[('int', 'peerDeviceId'), ('unsigned int', 'flags')]
+ case HIP_API_ID_hipDeviceEnablePeerAccess:
+ break;
+// hipFuncSetSharedMemConfig[('const void*', 'func'), ('hipSharedMemConfig', 'config')]
+ case HIP_API_ID_hipFuncSetSharedMemConfig:
+ break;
+// hipMemcpyToSymbolAsync[('const void*', 'symbol'), ('const void*', 'src'), ('size_t', 'sizeBytes'), ('size_t', 'offset'), ('hipMemcpyKind', 'kind'), ('hipStream_t', 'stream')]
+ case HIP_API_ID_hipMemcpyToSymbolAsync:
+ break;
+// hipMallocPitch[('void**', 'ptr'), ('size_t*', 'pitch'), ('size_t', 'width'), ('size_t', 'height')]
+ case HIP_API_ID_hipMallocPitch:
+ if (data->args.hipMallocPitch.ptr) data->args.hipMallocPitch.ptr__val = *(data->args.hipMallocPitch.ptr);
+ if (data->args.hipMallocPitch.pitch) data->args.hipMallocPitch.pitch__val = *(data->args.hipMallocPitch.pitch);
+ break;
+// hipMalloc[('void**', 'ptr'), ('size_t', 'size')]
+ case HIP_API_ID_hipMalloc:
+ if (data->args.hipMalloc.ptr) data->args.hipMalloc.ptr__val = *(data->args.hipMalloc.ptr);
+ break;
+// hipMemsetD16[('hipDeviceptr_t', 'dest'), ('unsigned short', 'value'), ('size_t', 'count')]
+ case HIP_API_ID_hipMemsetD16:
+ break;
+// hipExtStreamGetCUMask[('hipStream_t', 'stream'), ('unsigned int', 'cuMaskSize'), ('unsigned int*', 'cuMask')]
+ case HIP_API_ID_hipExtStreamGetCUMask:
+ if (data->args.hipExtStreamGetCUMask.cuMask) data->args.hipExtStreamGetCUMask.cuMask__val = *(data->args.hipExtStreamGetCUMask.cuMask);
+ break;
+// hipEventRecord[('hipEvent_t', 'event'), ('hipStream_t', 'stream')]
+ case HIP_API_ID_hipEventRecord:
+ break;
+// hipCtxSynchronize[]
+ case HIP_API_ID_hipCtxSynchronize:
+ break;
+// hipSetDevice[('int', 'deviceId')]
+ case HIP_API_ID_hipSetDevice:
+ break;
+// hipCtxGetApiVersion[('hipCtx_t', 'ctx'), ('int*', 'apiVersion')]
+ case HIP_API_ID_hipCtxGetApiVersion:
+ if (data->args.hipCtxGetApiVersion.apiVersion) data->args.hipCtxGetApiVersion.apiVersion__val = *(data->args.hipCtxGetApiVersion.apiVersion);
+ break;
+// hipMemcpyFromSymbolAsync[('void*', 'dst'), ('const void*', 'symbol'), ('size_t', 'sizeBytes'), ('size_t', 'offset'), ('hipMemcpyKind', 'kind'), ('hipStream_t', 'stream')]
+ case HIP_API_ID_hipMemcpyFromSymbolAsync:
+ break;
+// hipExtGetLinkTypeAndHopCount[('int', 'device1'), ('int', 'device2'), ('unsigned int*', 'linktype'), ('unsigned int*', 'hopcount')]
+ case HIP_API_ID_hipExtGetLinkTypeAndHopCount:
+ if (data->args.hipExtGetLinkTypeAndHopCount.linktype) data->args.hipExtGetLinkTypeAndHopCount.linktype__val = *(data->args.hipExtGetLinkTypeAndHopCount.linktype);
+ if (data->args.hipExtGetLinkTypeAndHopCount.hopcount) data->args.hipExtGetLinkTypeAndHopCount.hopcount__val = *(data->args.hipExtGetLinkTypeAndHopCount.hopcount);
+ break;
+// __hipPopCallConfiguration[('dim3*', 'gridDim'), ('dim3*', 'blockDim'), ('size_t*', 'sharedMem'), ('hipStream_t*', 'stream')]
+ case HIP_API_ID___hipPopCallConfiguration:
+ if (data->args.__hipPopCallConfiguration.gridDim) data->args.__hipPopCallConfiguration.gridDim__val = *(data->args.__hipPopCallConfiguration.gridDim);
+ if (data->args.__hipPopCallConfiguration.blockDim) data->args.__hipPopCallConfiguration.blockDim__val = *(data->args.__hipPopCallConfiguration.blockDim);
+ if (data->args.__hipPopCallConfiguration.sharedMem) data->args.__hipPopCallConfiguration.sharedMem__val = *(data->args.__hipPopCallConfiguration.sharedMem);
+ if (data->args.__hipPopCallConfiguration.stream) data->args.__hipPopCallConfiguration.stream__val = *(data->args.__hipPopCallConfiguration.stream);
+ break;
+// hipModuleOccupancyMaxActiveBlocksPerMultiprocessor[('int*', 'numBlocks'), ('hipFunction_t', 'f'), ('int', 'blockSize'), ('size_t', 'dynSharedMemPerBlk')]
+ case HIP_API_ID_hipModuleOccupancyMaxActiveBlocksPerMultiprocessor:
+ if (data->args.hipModuleOccupancyMaxActiveBlocksPerMultiprocessor.numBlocks) data->args.hipModuleOccupancyMaxActiveBlocksPerMultiprocessor.numBlocks__val = *(data->args.hipModuleOccupancyMaxActiveBlocksPerMultiprocessor.numBlocks);
+ break;
+// hipMemset3D[('hipPitchedPtr', 'pitchedDevPtr'), ('int', 'value'), ('hipExtent', 'extent')]
+ case HIP_API_ID_hipMemset3D:
+ break;
+// hipStreamCreateWithPriority[('hipStream_t*', 'stream'), ('unsigned int', 'flags'), ('int', 'priority')]
+ case HIP_API_ID_hipStreamCreateWithPriority:
+ if (data->args.hipStreamCreateWithPriority.stream) data->args.hipStreamCreateWithPriority.stream__val = *(data->args.hipStreamCreateWithPriority.stream);
+ break;
+// hipMemcpy2DToArray[('hipArray*', 'dst'), ('size_t', 'wOffset'), ('size_t', 'hOffset'), ('const void*', 'src'), ('size_t', 'spitch'), ('size_t', 'width'), ('size_t', 'height'), ('hipMemcpyKind', 'kind')]
+ case HIP_API_ID_hipMemcpy2DToArray:
+ if (data->args.hipMemcpy2DToArray.dst) data->args.hipMemcpy2DToArray.dst__val = *(data->args.hipMemcpy2DToArray.dst);
+ break;
+// hipMemsetD8Async[('hipDeviceptr_t', 'dest'), ('unsigned char', 'value'), ('size_t', 'count'), ('hipStream_t', 'stream')]
+ case HIP_API_ID_hipMemsetD8Async:
+ break;
+// hipCtxGetCacheConfig[('hipFuncCache_t*', 'cacheConfig')]
+ case HIP_API_ID_hipCtxGetCacheConfig:
+ if (data->args.hipCtxGetCacheConfig.cacheConfig) data->args.hipCtxGetCacheConfig.cacheConfig__val = *(data->args.hipCtxGetCacheConfig.cacheConfig);
+ break;
+// hipModuleGetFunction[('hipFunction_t*', 'function'), ('hipModule_t', 'module'), ('const char*', 'kname')]
+ case HIP_API_ID_hipModuleGetFunction:
+ if (data->args.hipModuleGetFunction.function) data->args.hipModuleGetFunction.function__val = *(data->args.hipModuleGetFunction.function);
+ if (data->args.hipModuleGetFunction.kname) data->args.hipModuleGetFunction.kname__val = *(data->args.hipModuleGetFunction.kname);
+ break;
+// hipStreamWaitEvent[('hipStream_t', 'stream'), ('hipEvent_t', 'event'), ('unsigned int', 'flags')]
+ case HIP_API_ID_hipStreamWaitEvent:
+ break;
+// hipDeviceGetStreamPriorityRange[('int*', 'leastPriority'), ('int*', 'greatestPriority')]
+ case HIP_API_ID_hipDeviceGetStreamPriorityRange:
+ if (data->args.hipDeviceGetStreamPriorityRange.leastPriority) data->args.hipDeviceGetStreamPriorityRange.leastPriority__val = *(data->args.hipDeviceGetStreamPriorityRange.leastPriority);
+ if (data->args.hipDeviceGetStreamPriorityRange.greatestPriority) data->args.hipDeviceGetStreamPriorityRange.greatestPriority__val = *(data->args.hipDeviceGetStreamPriorityRange.greatestPriority);
+ break;
+// hipModuleLoad[('hipModule_t*', 'module'), ('const char*', 'fname')]
+ case HIP_API_ID_hipModuleLoad:
+ if (data->args.hipModuleLoad.module) data->args.hipModuleLoad.module__val = *(data->args.hipModuleLoad.module);
+ if (data->args.hipModuleLoad.fname) data->args.hipModuleLoad.fname__val = *(data->args.hipModuleLoad.fname);
+ break;
+// hipDevicePrimaryCtxSetFlags[('hipDevice_t', 'dev'), ('unsigned int', 'flags')]
+ case HIP_API_ID_hipDevicePrimaryCtxSetFlags:
+ break;
+// hipLaunchCooperativeKernel[('const void*', 'f'), ('dim3', 'gridDim'), ('dim3', 'blockDimX'), ('void**', 'kernelParams'), ('unsigned int', 'sharedMemBytes'), ('hipStream_t', 'stream')]
+ case HIP_API_ID_hipLaunchCooperativeKernel:
+ if (data->args.hipLaunchCooperativeKernel.kernelParams) data->args.hipLaunchCooperativeKernel.kernelParams__val = *(data->args.hipLaunchCooperativeKernel.kernelParams);
+ break;
+// hipLaunchCooperativeKernelMultiDevice[('hipLaunchParams*', 'launchParamsList'), ('int', 'numDevices'), ('unsigned int', 'flags')]
+ case HIP_API_ID_hipLaunchCooperativeKernelMultiDevice:
+ if (data->args.hipLaunchCooperativeKernelMultiDevice.launchParamsList) data->args.hipLaunchCooperativeKernelMultiDevice.launchParamsList__val = *(data->args.hipLaunchCooperativeKernelMultiDevice.launchParamsList);
+ break;
+// hipMemcpyAsync[('void*', 'dst'), ('const void*', 'src'), ('size_t', 'sizeBytes'), ('hipMemcpyKind', 'kind'), ('hipStream_t', 'stream')]
+ case HIP_API_ID_hipMemcpyAsync:
+ break;
+// hipMalloc3DArray[('hipArray_t*', 'array'), ('const hipChannelFormatDesc*', 'desc'), ('hipExtent', 'extent'), ('unsigned int', 'flags')]
+ case HIP_API_ID_hipMalloc3DArray:
+ if (data->args.hipMalloc3DArray.array) data->args.hipMalloc3DArray.array__val = *(data->args.hipMalloc3DArray.array);
+ if (data->args.hipMalloc3DArray.desc) data->args.hipMalloc3DArray.desc__val = *(data->args.hipMalloc3DArray.desc);
+ break;
+// hipMallocHost[('void**', 'ptr'), ('size_t', 'size')]
+ case HIP_API_ID_hipMallocHost:
+ if (data->args.hipMallocHost.ptr) data->args.hipMallocHost.ptr__val = *(data->args.hipMallocHost.ptr);
+ break;
+// hipCtxGetCurrent[('hipCtx_t*', 'ctx')]
+ case HIP_API_ID_hipCtxGetCurrent:
+ if (data->args.hipCtxGetCurrent.ctx) data->args.hipCtxGetCurrent.ctx__val = *(data->args.hipCtxGetCurrent.ctx);
+ break;
+// hipDevicePrimaryCtxGetState[('hipDevice_t', 'dev'), ('unsigned int*', 'flags'), ('int*', 'active')]
+ case HIP_API_ID_hipDevicePrimaryCtxGetState:
+ if (data->args.hipDevicePrimaryCtxGetState.flags) data->args.hipDevicePrimaryCtxGetState.flags__val = *(data->args.hipDevicePrimaryCtxGetState.flags);
+ if (data->args.hipDevicePrimaryCtxGetState.active) data->args.hipDevicePrimaryCtxGetState.active__val = *(data->args.hipDevicePrimaryCtxGetState.active);
+ break;
+// hipEventQuery[('hipEvent_t', 'event')]
+ case HIP_API_ID_hipEventQuery:
+ break;
+// hipEventCreate[('hipEvent_t*', 'event')]
+ case HIP_API_ID_hipEventCreate:
+ if (data->args.hipEventCreate.event) data->args.hipEventCreate.event__val = *(data->args.hipEventCreate.event);
+ break;
+// hipMemGetAddressRange[('hipDeviceptr_t*', 'pbase'), ('size_t*', 'psize'), ('hipDeviceptr_t', 'dptr')]
+ case HIP_API_ID_hipMemGetAddressRange:
+ if (data->args.hipMemGetAddressRange.pbase) data->args.hipMemGetAddressRange.pbase__val = *(data->args.hipMemGetAddressRange.pbase);
+ if (data->args.hipMemGetAddressRange.psize) data->args.hipMemGetAddressRange.psize__val = *(data->args.hipMemGetAddressRange.psize);
+ break;
+// hipMemcpyFromSymbol[('void*', 'dst'), ('const void*', 'symbol'), ('size_t', 'sizeBytes'), ('size_t', 'offset'), ('hipMemcpyKind', 'kind')]
+ case HIP_API_ID_hipMemcpyFromSymbol:
+ break;
+// hipArrayCreate[('hipArray**', 'pHandle'), ('const HIP_ARRAY_DESCRIPTOR*', 'pAllocateArray')]
+ case HIP_API_ID_hipArrayCreate:
+ if (data->args.hipArrayCreate.pHandle) data->args.hipArrayCreate.pHandle__val = *(data->args.hipArrayCreate.pHandle);
+ if (data->args.hipArrayCreate.pAllocateArray) data->args.hipArrayCreate.pAllocateArray__val = *(data->args.hipArrayCreate.pAllocateArray);
+ break;
+// hipStreamAttachMemAsync[('hipStream_t', 'stream'), ('hipDeviceptr_t*', 'dev_ptr'), ('size_t', 'length'), ('unsigned int', 'flags')]
+ case HIP_API_ID_hipStreamAttachMemAsync:
+ if (data->args.hipStreamAttachMemAsync.dev_ptr) data->args.hipStreamAttachMemAsync.dev_ptr__val = *(data->args.hipStreamAttachMemAsync.dev_ptr);
+ break;
+// hipStreamGetFlags[('hipStream_t', 'stream'), ('unsigned int*', 'flags')]
+ case HIP_API_ID_hipStreamGetFlags:
+ if (data->args.hipStreamGetFlags.flags) data->args.hipStreamGetFlags.flags__val = *(data->args.hipStreamGetFlags.flags);
+ break;
+// hipMallocArray[('hipArray**', 'array'), ('const hipChannelFormatDesc*', 'desc'), ('size_t', 'width'), ('size_t', 'height'), ('unsigned int', 'flags')]
+ case HIP_API_ID_hipMallocArray:
+ if (data->args.hipMallocArray.array) data->args.hipMallocArray.array__val = *(data->args.hipMallocArray.array);
+ if (data->args.hipMallocArray.desc) data->args.hipMallocArray.desc__val = *(data->args.hipMallocArray.desc);
+ break;
+// hipCtxGetSharedMemConfig[('hipSharedMemConfig*', 'pConfig')]
+ case HIP_API_ID_hipCtxGetSharedMemConfig:
+ if (data->args.hipCtxGetSharedMemConfig.pConfig) data->args.hipCtxGetSharedMemConfig.pConfig__val = *(data->args.hipCtxGetSharedMemConfig.pConfig);
+ break;
+// hipDeviceDisablePeerAccess[('int', 'peerDeviceId')]
+ case HIP_API_ID_hipDeviceDisablePeerAccess:
+ break;
+// hipModuleOccupancyMaxPotentialBlockSize[('int*', 'gridSize'), ('int*', 'blockSize'), ('hipFunction_t', 'f'), ('size_t', 'dynSharedMemPerBlk'), ('int', 'blockSizeLimit')]
+ case HIP_API_ID_hipModuleOccupancyMaxPotentialBlockSize:
+ if (data->args.hipModuleOccupancyMaxPotentialBlockSize.gridSize) data->args.hipModuleOccupancyMaxPotentialBlockSize.gridSize__val = *(data->args.hipModuleOccupancyMaxPotentialBlockSize.gridSize);
+ if (data->args.hipModuleOccupancyMaxPotentialBlockSize.blockSize) data->args.hipModuleOccupancyMaxPotentialBlockSize.blockSize__val = *(data->args.hipModuleOccupancyMaxPotentialBlockSize.blockSize);
+ break;
+// hipMemPtrGetInfo[('void*', 'ptr'), ('size_t*', 'size')]
+ case HIP_API_ID_hipMemPtrGetInfo:
+ if (data->args.hipMemPtrGetInfo.size) data->args.hipMemPtrGetInfo.size__val = *(data->args.hipMemPtrGetInfo.size);
+ break;
+// hipFuncGetAttribute[('int*', 'value'), ('hipFunction_attribute', 'attrib'), ('hipFunction_t', 'hfunc')]
+ case HIP_API_ID_hipFuncGetAttribute:
+ if (data->args.hipFuncGetAttribute.value) data->args.hipFuncGetAttribute.value__val = *(data->args.hipFuncGetAttribute.value);
+ break;
+// hipCtxGetFlags[('unsigned int*', 'flags')]
+ case HIP_API_ID_hipCtxGetFlags:
+ if (data->args.hipCtxGetFlags.flags) data->args.hipCtxGetFlags.flags__val = *(data->args.hipCtxGetFlags.flags);
+ break;
+// hipStreamDestroy[('hipStream_t', 'stream')]
+ case HIP_API_ID_hipStreamDestroy:
+ break;
+// __hipPushCallConfiguration[('dim3', 'gridDim'), ('dim3', 'blockDim'), ('size_t', 'sharedMem'), ('hipStream_t', 'stream')]
+ case HIP_API_ID___hipPushCallConfiguration:
+ break;
+// hipMemset3DAsync[('hipPitchedPtr', 'pitchedDevPtr'), ('int', 'value'), ('hipExtent', 'extent'), ('hipStream_t', 'stream')]
+ case HIP_API_ID_hipMemset3DAsync:
+ break;
+// hipDeviceGetPCIBusId[('char*', 'pciBusId'), ('int', 'len'), ('int', 'device')]
+ case HIP_API_ID_hipDeviceGetPCIBusId:
+ data->args.hipDeviceGetPCIBusId.pciBusId = (data->args.hipDeviceGetPCIBusId.pciBusId) ? strdup(data->args.hipDeviceGetPCIBusId.pciBusId) : NULL;
+ break;
+// hipInit[('unsigned int', 'flags')]
+ case HIP_API_ID_hipInit:
+ break;
+// hipMemcpyAtoH[('void*', 'dst'), ('hipArray*', 'srcArray'), ('size_t', 'srcOffset'), ('size_t', 'count')]
+ case HIP_API_ID_hipMemcpyAtoH:
+ if (data->args.hipMemcpyAtoH.srcArray) data->args.hipMemcpyAtoH.srcArray__val = *(data->args.hipMemcpyAtoH.srcArray);
+ break;
+// hipStreamGetPriority[('hipStream_t', 'stream'), ('int*', 'priority')]
+ case HIP_API_ID_hipStreamGetPriority:
+ if (data->args.hipStreamGetPriority.priority) data->args.hipStreamGetPriority.priority__val = *(data->args.hipStreamGetPriority.priority);
+ break;
+// hipMemset2D[('void*', 'dst'), ('size_t', 'pitch'), ('int', 'value'), ('size_t', 'width'), ('size_t', 'height')]
+ case HIP_API_ID_hipMemset2D:
+ break;
+// hipMemset2DAsync[('void*', 'dst'), ('size_t', 'pitch'), ('int', 'value'), ('size_t', 'width'), ('size_t', 'height'), ('hipStream_t', 'stream')]
+ case HIP_API_ID_hipMemset2DAsync:
+ break;
+// hipDeviceCanAccessPeer[('int*', 'canAccessPeer'), ('int', 'deviceId'), ('int', 'peerDeviceId')]
+ case HIP_API_ID_hipDeviceCanAccessPeer:
+ if (data->args.hipDeviceCanAccessPeer.canAccessPeer) data->args.hipDeviceCanAccessPeer.canAccessPeer__val = *(data->args.hipDeviceCanAccessPeer.canAccessPeer);
+ break;
+// hipLaunchByPtr[('const void*', 'hostFunction')]
+ case HIP_API_ID_hipLaunchByPtr:
+ break;
+// hipMemPrefetchAsync[('const void*', 'dev_ptr'), ('size_t', 'count'), ('int', 'device'), ('hipStream_t', 'stream')]
+ case HIP_API_ID_hipMemPrefetchAsync:
+ break;
+// hipCtxDestroy[('hipCtx_t', 'ctx')]
+ case HIP_API_ID_hipCtxDestroy:
+ break;
+// hipMemsetD16Async[('hipDeviceptr_t', 'dest'), ('unsigned short', 'value'), ('size_t', 'count'), ('hipStream_t', 'stream')]
+ case HIP_API_ID_hipMemsetD16Async:
+ break;
+// hipModuleUnload[('hipModule_t', 'module')]
+ case HIP_API_ID_hipModuleUnload:
+ break;
+// hipHostUnregister[('void*', 'hostPtr')]
+ case HIP_API_ID_hipHostUnregister:
+ break;
+// hipProfilerStop[]
+ case HIP_API_ID_hipProfilerStop:
+ break;
+// hipExtStreamCreateWithCUMask[('hipStream_t*', 'stream'), ('unsigned int', 'cuMaskSize'), ('const unsigned int*', 'cuMask')]
+ case HIP_API_ID_hipExtStreamCreateWithCUMask:
+ if (data->args.hipExtStreamCreateWithCUMask.stream) data->args.hipExtStreamCreateWithCUMask.stream__val = *(data->args.hipExtStreamCreateWithCUMask.stream);
+ if (data->args.hipExtStreamCreateWithCUMask.cuMask) data->args.hipExtStreamCreateWithCUMask.cuMask__val = *(data->args.hipExtStreamCreateWithCUMask.cuMask);
+ break;
+// hipStreamSynchronize[('hipStream_t', 'stream')]
+ case HIP_API_ID_hipStreamSynchronize:
+ break;
+// hipFreeHost[('void*', 'ptr')]
+ case HIP_API_ID_hipFreeHost:
+ break;
+// hipDeviceSetCacheConfig[('hipFuncCache_t', 'cacheConfig')]
+ case HIP_API_ID_hipDeviceSetCacheConfig:
+ break;
+// hipGetErrorName[]
+ case HIP_API_ID_hipGetErrorName:
+ break;
+// hipMemcpyHtoD[('hipDeviceptr_t', 'dst'), ('void*', 'src'), ('size_t', 'sizeBytes')]
+ case HIP_API_ID_hipMemcpyHtoD:
+ break;
+// hipModuleGetGlobal[('hipDeviceptr_t*', 'dptr'), ('size_t*', 'bytes'), ('hipModule_t', 'hmod'), ('const char*', 'name')]
+ case HIP_API_ID_hipModuleGetGlobal:
+ if (data->args.hipModuleGetGlobal.dptr) data->args.hipModuleGetGlobal.dptr__val = *(data->args.hipModuleGetGlobal.dptr);
+ if (data->args.hipModuleGetGlobal.bytes) data->args.hipModuleGetGlobal.bytes__val = *(data->args.hipModuleGetGlobal.bytes);
+ if (data->args.hipModuleGetGlobal.name) data->args.hipModuleGetGlobal.name__val = *(data->args.hipModuleGetGlobal.name);
+ break;
+// hipMemcpyHtoA[('hipArray*', 'dstArray'), ('size_t', 'dstOffset'), ('const void*', 'srcHost'), ('size_t', 'count')]
+ case HIP_API_ID_hipMemcpyHtoA:
+ if (data->args.hipMemcpyHtoA.dstArray) data->args.hipMemcpyHtoA.dstArray__val = *(data->args.hipMemcpyHtoA.dstArray);
+ break;
+// hipCtxCreate[('hipCtx_t*', 'ctx'), ('unsigned int', 'flags'), ('hipDevice_t', 'device')]
+ case HIP_API_ID_hipCtxCreate:
+ if (data->args.hipCtxCreate.ctx) data->args.hipCtxCreate.ctx__val = *(data->args.hipCtxCreate.ctx);
+ break;
+// hipMemcpy2D[('void*', 'dst'), ('size_t', 'dpitch'), ('const void*', 'src'), ('size_t', 'spitch'), ('size_t', 'width'), ('size_t', 'height'), ('hipMemcpyKind', 'kind')]
+ case HIP_API_ID_hipMemcpy2D:
+ break;
+// hipIpcCloseMemHandle[('void*', 'devPtr')]
+ case HIP_API_ID_hipIpcCloseMemHandle:
+ break;
+// hipChooseDevice[('int*', 'device'), ('const hipDeviceProp_t*', 'prop')]
+ case HIP_API_ID_hipChooseDevice:
+ if (data->args.hipChooseDevice.device) data->args.hipChooseDevice.device__val = *(data->args.hipChooseDevice.device);
+ if (data->args.hipChooseDevice.prop) data->args.hipChooseDevice.prop__val = *(data->args.hipChooseDevice.prop);
+ break;
+// hipDeviceSetSharedMemConfig[('hipSharedMemConfig', 'config')]
+ case HIP_API_ID_hipDeviceSetSharedMemConfig:
+ break;
+// hipMallocMipmappedArray[('hipMipmappedArray_t*', 'mipmappedArray'), ('const hipChannelFormatDesc*', 'desc'), ('hipExtent', 'extent'), ('unsigned int', 'numLevels'), ('unsigned int', 'flags')]
+ case HIP_API_ID_hipMallocMipmappedArray:
+ if (data->args.hipMallocMipmappedArray.mipmappedArray) data->args.hipMallocMipmappedArray.mipmappedArray__val = *(data->args.hipMallocMipmappedArray.mipmappedArray);
+ if (data->args.hipMallocMipmappedArray.desc) data->args.hipMallocMipmappedArray.desc__val = *(data->args.hipMallocMipmappedArray.desc);
+ break;
+// hipSetupArgument[('const void*', 'arg'), ('size_t', 'size'), ('size_t', 'offset')]
+ case HIP_API_ID_hipSetupArgument:
+ break;
+// hipIpcGetEventHandle[('hipIpcEventHandle_t*', 'handle'), ('hipEvent_t', 'event')]
+ case HIP_API_ID_hipIpcGetEventHandle:
+ if (data->args.hipIpcGetEventHandle.handle) data->args.hipIpcGetEventHandle.handle__val = *(data->args.hipIpcGetEventHandle.handle);
+ break;
+// hipFreeArray[('hipArray*', 'array')]
+ case HIP_API_ID_hipFreeArray:
+ if (data->args.hipFreeArray.array) data->args.hipFreeArray.array__val = *(data->args.hipFreeArray.array);
+ break;
+// hipCtxSetCacheConfig[('hipFuncCache_t', 'cacheConfig')]
+ case HIP_API_ID_hipCtxSetCacheConfig:
+ break;
+// hipFuncSetCacheConfig[('const void*', 'func'), ('hipFuncCache_t', 'config')]
+ case HIP_API_ID_hipFuncSetCacheConfig:
+ break;
+// hipLaunchKernel[('const void*', 'function_address'), ('dim3', 'numBlocks'), ('dim3', 'dimBlocks'), ('void**', 'args'), ('size_t', 'sharedMemBytes'), ('hipStream_t', 'stream')]
+ case HIP_API_ID_hipLaunchKernel:
+ if (data->args.hipLaunchKernel.args) data->args.hipLaunchKernel.args__val = *(data->args.hipLaunchKernel.args);
+ break;
+// hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags[('int*', 'numBlocks'), ('hipFunction_t', 'f'), ('int', 'blockSize'), ('size_t', 'dynSharedMemPerBlk'), ('unsigned int', 'flags')]
+ case HIP_API_ID_hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags:
+ if (data->args.hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.numBlocks) data->args.hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.numBlocks__val = *(data->args.hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.numBlocks);
+ break;
+// hipModuleGetTexRef[('textureReference**', 'texRef'), ('hipModule_t', 'hmod'), ('const char*', 'name')]
+ case HIP_API_ID_hipModuleGetTexRef:
+ if (data->args.hipModuleGetTexRef.texRef) data->args.hipModuleGetTexRef.texRef__val = *(data->args.hipModuleGetTexRef.texRef);
+ if (data->args.hipModuleGetTexRef.name) data->args.hipModuleGetTexRef.name__val = *(data->args.hipModuleGetTexRef.name);
+ break;
+// hipFuncSetAttribute[('const void*', 'func'), ('hipFuncAttribute', 'attr'), ('int', 'value')]
+ case HIP_API_ID_hipFuncSetAttribute:
+ break;
+// hipEventElapsedTime[('float*', 'ms'), ('hipEvent_t', 'start'), ('hipEvent_t', 'stop')]
+ case HIP_API_ID_hipEventElapsedTime:
+ if (data->args.hipEventElapsedTime.ms) data->args.hipEventElapsedTime.ms__val = *(data->args.hipEventElapsedTime.ms);
+ break;
+// hipConfigureCall[('dim3', 'gridDim'), ('dim3', 'blockDim'), ('size_t', 'sharedMem'), ('hipStream_t', 'stream')]
+ case HIP_API_ID_hipConfigureCall:
+ break;
+// hipMemAdvise[('const void*', 'dev_ptr'), ('size_t', 'count'), ('hipMemoryAdvise', 'advice'), ('int', 'device')]
+ case HIP_API_ID_hipMemAdvise:
+ break;
+// hipMemcpy3DAsync[('const hipMemcpy3DParms*', 'p'), ('hipStream_t', 'stream')]
+ case HIP_API_ID_hipMemcpy3DAsync:
+ if (data->args.hipMemcpy3DAsync.p) data->args.hipMemcpy3DAsync.p__val = *(data->args.hipMemcpy3DAsync.p);
+ break;
+// hipEventDestroy[('hipEvent_t', 'event')]
+ case HIP_API_ID_hipEventDestroy:
+ break;
+// hipCtxPopCurrent[('hipCtx_t*', 'ctx')]
+ case HIP_API_ID_hipCtxPopCurrent:
+ if (data->args.hipCtxPopCurrent.ctx) data->args.hipCtxPopCurrent.ctx__val = *(data->args.hipCtxPopCurrent.ctx);
+ break;
+// hipGetSymbolAddress[('void**', 'devPtr'), ('const void*', 'symbol')]
+ case HIP_API_ID_hipGetSymbolAddress:
+ if (data->args.hipGetSymbolAddress.devPtr) data->args.hipGetSymbolAddress.devPtr__val = *(data->args.hipGetSymbolAddress.devPtr);
+ break;
+// hipHostGetFlags[('unsigned int*', 'flagsPtr'), ('void*', 'hostPtr')]
+ case HIP_API_ID_hipHostGetFlags:
+ if (data->args.hipHostGetFlags.flagsPtr) data->args.hipHostGetFlags.flagsPtr__val = *(data->args.hipHostGetFlags.flagsPtr);
+ break;
+// hipHostMalloc[('void**', 'ptr'), ('size_t', 'size'), ('unsigned int', 'flags')]
+ case HIP_API_ID_hipHostMalloc:
+ if (data->args.hipHostMalloc.ptr) data->args.hipHostMalloc.ptr__val = *(data->args.hipHostMalloc.ptr);
+ break;
+// hipCtxSetSharedMemConfig[('hipSharedMemConfig', 'config')]
+ case HIP_API_ID_hipCtxSetSharedMemConfig:
+ break;
+// hipFreeMipmappedArray[('hipMipmappedArray_t', 'mipmappedArray')]
+ case HIP_API_ID_hipFreeMipmappedArray:
+ break;
+// hipMemGetInfo[('size_t*', 'free'), ('size_t*', 'total')]
+ case HIP_API_ID_hipMemGetInfo:
+ if (data->args.hipMemGetInfo.free) data->args.hipMemGetInfo.free__val = *(data->args.hipMemGetInfo.free);
+ if (data->args.hipMemGetInfo.total) data->args.hipMemGetInfo.total__val = *(data->args.hipMemGetInfo.total);
+ break;
+// hipDeviceReset[]
+ case HIP_API_ID_hipDeviceReset:
+ break;
+// hipMemset[('void*', 'dst'), ('int', 'value'), ('size_t', 'sizeBytes')]
+ case HIP_API_ID_hipMemset:
+ break;
+// hipMemsetD8[('hipDeviceptr_t', 'dest'), ('unsigned char', 'value'), ('size_t', 'count')]
+ case HIP_API_ID_hipMemsetD8:
+ break;
+// hipMemcpyParam2DAsync[('const hip_Memcpy2D*', 'pCopy'), ('hipStream_t', 'stream')]
+ case HIP_API_ID_hipMemcpyParam2DAsync:
+ if (data->args.hipMemcpyParam2DAsync.pCopy) data->args.hipMemcpyParam2DAsync.pCopy__val = *(data->args.hipMemcpyParam2DAsync.pCopy);
+ break;
+// hipHostRegister[('void*', 'hostPtr'), ('size_t', 'sizeBytes'), ('unsigned int', 'flags')]
+ case HIP_API_ID_hipHostRegister:
+ break;
+// hipDriverGetVersion[('int*', 'driverVersion')]
+ case HIP_API_ID_hipDriverGetVersion:
+ if (data->args.hipDriverGetVersion.driverVersion) data->args.hipDriverGetVersion.driverVersion__val = *(data->args.hipDriverGetVersion.driverVersion);
+ break;
+// hipArray3DCreate[('hipArray**', 'array'), ('const HIP_ARRAY3D_DESCRIPTOR*', 'pAllocateArray')]
+ case HIP_API_ID_hipArray3DCreate:
+ if (data->args.hipArray3DCreate.array) data->args.hipArray3DCreate.array__val = *(data->args.hipArray3DCreate.array);
+ if (data->args.hipArray3DCreate.pAllocateArray) data->args.hipArray3DCreate.pAllocateArray__val = *(data->args.hipArray3DCreate.pAllocateArray);
+ break;
+// hipIpcOpenMemHandle[('void**', 'devPtr'), ('hipIpcMemHandle_t', 'handle'), ('unsigned int', 'flags')]
+ case HIP_API_ID_hipIpcOpenMemHandle:
+ if (data->args.hipIpcOpenMemHandle.devPtr) data->args.hipIpcOpenMemHandle.devPtr__val = *(data->args.hipIpcOpenMemHandle.devPtr);
+ break;
+// hipGetLastError[]
+ case HIP_API_ID_hipGetLastError:
+ break;
+// hipGetDeviceFlags[('unsigned int*', 'flags')]
+ case HIP_API_ID_hipGetDeviceFlags:
+ if (data->args.hipGetDeviceFlags.flags) data->args.hipGetDeviceFlags.flags__val = *(data->args.hipGetDeviceFlags.flags);
+ break;
+// hipDeviceGetSharedMemConfig[('hipSharedMemConfig*', 'pConfig')]
+ case HIP_API_ID_hipDeviceGetSharedMemConfig:
+ if (data->args.hipDeviceGetSharedMemConfig.pConfig) data->args.hipDeviceGetSharedMemConfig.pConfig__val = *(data->args.hipDeviceGetSharedMemConfig.pConfig);
+ break;
+// hipDrvMemcpy3D[('const HIP_MEMCPY3D*', 'pCopy')]
+ case HIP_API_ID_hipDrvMemcpy3D:
+ if (data->args.hipDrvMemcpy3D.pCopy) data->args.hipDrvMemcpy3D.pCopy__val = *(data->args.hipDrvMemcpy3D.pCopy);
+ break;
+// hipMemcpy2DFromArray[('void*', 'dst'), ('size_t', 'dpitch'), ('hipArray_const_t', 'src'), ('size_t', 'wOffset'), ('size_t', 'hOffset'), ('size_t', 'width'), ('size_t', 'height'), ('hipMemcpyKind', 'kind')]
+ case HIP_API_ID_hipMemcpy2DFromArray:
+ break;
+// hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags[('int*', 'numBlocks'), ('const void*', 'f'), ('int', 'blockSize'), ('size_t', 'dynamicSMemSize'), ('unsigned int', 'flags')]
+ case HIP_API_ID_hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags:
+ if (data->args.hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.numBlocks) data->args.hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.numBlocks__val = *(data->args.hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.numBlocks);
+ break;
+// hipSetDeviceFlags[('unsigned int', 'flags')]
+ case HIP_API_ID_hipSetDeviceFlags:
+ break;
+// hipHccModuleLaunchKernel[('hipFunction_t', 'f'), ('unsigned int', 'globalWorkSizeX'), ('unsigned int', 'globalWorkSizeY'), ('unsigned int', 'globalWorkSizeZ'), ('unsigned int', 'blockDimX'), ('unsigned int', 'blockDimY'), ('unsigned int', 'blockDimZ'), ('size_t', 'sharedMemBytes'), ('hipStream_t', 'hStream'), ('void**', 'kernelParams'), ('void**', 'extra'), ('hipEvent_t', 'startEvent'), ('hipEvent_t', 'stopEvent')]
+ case HIP_API_ID_hipHccModuleLaunchKernel:
+ if (data->args.hipHccModuleLaunchKernel.kernelParams) data->args.hipHccModuleLaunchKernel.kernelParams__val = *(data->args.hipHccModuleLaunchKernel.kernelParams);
+ if (data->args.hipHccModuleLaunchKernel.extra) data->args.hipHccModuleLaunchKernel.extra__val = *(data->args.hipHccModuleLaunchKernel.extra);
+ break;
+// hipFree[('void*', 'ptr')]
+ case HIP_API_ID_hipFree:
+ break;
+// hipOccupancyMaxPotentialBlockSize[('int*', 'gridSize'), ('int*', 'blockSize'), ('const void*', 'f'), ('size_t', 'dynSharedMemPerBlk'), ('int', 'blockSizeLimit')]
+ case HIP_API_ID_hipOccupancyMaxPotentialBlockSize:
+ if (data->args.hipOccupancyMaxPotentialBlockSize.gridSize) data->args.hipOccupancyMaxPotentialBlockSize.gridSize__val = *(data->args.hipOccupancyMaxPotentialBlockSize.gridSize);
+ if (data->args.hipOccupancyMaxPotentialBlockSize.blockSize) data->args.hipOccupancyMaxPotentialBlockSize.blockSize__val = *(data->args.hipOccupancyMaxPotentialBlockSize.blockSize);
+ break;
+// hipDeviceGetAttribute[('int*', 'pi'), ('hipDeviceAttribute_t', 'attr'), ('int', 'deviceId')]
+ case HIP_API_ID_hipDeviceGetAttribute:
+ if (data->args.hipDeviceGetAttribute.pi) data->args.hipDeviceGetAttribute.pi__val = *(data->args.hipDeviceGetAttribute.pi);
+ break;
+// hipDeviceComputeCapability[('int*', 'major'), ('int*', 'minor'), ('hipDevice_t', 'device')]
+ case HIP_API_ID_hipDeviceComputeCapability:
+ if (data->args.hipDeviceComputeCapability.major) data->args.hipDeviceComputeCapability.major__val = *(data->args.hipDeviceComputeCapability.major);
+ if (data->args.hipDeviceComputeCapability.minor) data->args.hipDeviceComputeCapability.minor__val = *(data->args.hipDeviceComputeCapability.minor);
+ break;
+// hipCtxDisablePeerAccess[('hipCtx_t', 'peerCtx')]
+ case HIP_API_ID_hipCtxDisablePeerAccess:
+ break;
+// hipMallocManaged[('void**', 'dev_ptr'), ('size_t', 'size'), ('unsigned int', 'flags')]
+ case HIP_API_ID_hipMallocManaged:
+ if (data->args.hipMallocManaged.dev_ptr) data->args.hipMallocManaged.dev_ptr__val = *(data->args.hipMallocManaged.dev_ptr);
+ break;
+// hipDeviceGetByPCIBusId[('int*', 'device'), ('const char*', 'pciBusId')]
+ case HIP_API_ID_hipDeviceGetByPCIBusId:
+ if (data->args.hipDeviceGetByPCIBusId.device) data->args.hipDeviceGetByPCIBusId.device__val = *(data->args.hipDeviceGetByPCIBusId.device);
+ if (data->args.hipDeviceGetByPCIBusId.pciBusId) data->args.hipDeviceGetByPCIBusId.pciBusId__val = *(data->args.hipDeviceGetByPCIBusId.pciBusId);
+ break;
+// hipIpcGetMemHandle[('hipIpcMemHandle_t*', 'handle'), ('void*', 'devPtr')]
+ case HIP_API_ID_hipIpcGetMemHandle:
+ if (data->args.hipIpcGetMemHandle.handle) data->args.hipIpcGetMemHandle.handle__val = *(data->args.hipIpcGetMemHandle.handle);
+ break;
+// hipMemcpyHtoDAsync[('hipDeviceptr_t', 'dst'), ('void*', 'src'), ('size_t', 'sizeBytes'), ('hipStream_t', 'stream')]
+ case HIP_API_ID_hipMemcpyHtoDAsync:
+ break;
+// hipCtxGetDevice[('hipDevice_t*', 'device')]
+ case HIP_API_ID_hipCtxGetDevice:
+ if (data->args.hipCtxGetDevice.device) data->args.hipCtxGetDevice.device__val = *(data->args.hipCtxGetDevice.device);
+ break;
+// hipMemcpyDtoD[('hipDeviceptr_t', 'dst'), ('hipDeviceptr_t', 'src'), ('size_t', 'sizeBytes')]
+ case HIP_API_ID_hipMemcpyDtoD:
+ break;
+// hipModuleLoadData[('hipModule_t*', 'module'), ('const void*', 'image')]
+ case HIP_API_ID_hipModuleLoadData:
+ if (data->args.hipModuleLoadData.module) data->args.hipModuleLoadData.module__val = *(data->args.hipModuleLoadData.module);
+ break;
+// hipDevicePrimaryCtxRelease[('hipDevice_t', 'dev')]
+ case HIP_API_ID_hipDevicePrimaryCtxRelease:
+ break;
+// hipOccupancyMaxActiveBlocksPerMultiprocessor[('int*', 'numBlocks'), ('const void*', 'f'), ('int', 'blockSize'), ('size_t', 'dynamicSMemSize')]
+ case HIP_API_ID_hipOccupancyMaxActiveBlocksPerMultiprocessor:
+ if (data->args.hipOccupancyMaxActiveBlocksPerMultiprocessor.numBlocks) data->args.hipOccupancyMaxActiveBlocksPerMultiprocessor.numBlocks__val = *(data->args.hipOccupancyMaxActiveBlocksPerMultiprocessor.numBlocks);
+ break;
+// hipCtxSetCurrent[('hipCtx_t', 'ctx')]
+ case HIP_API_ID_hipCtxSetCurrent:
+ break;
+// hipGetErrorString[]
+ case HIP_API_ID_hipGetErrorString:
+ break;
+// hipStreamCreate[('hipStream_t*', 'stream')]
+ case HIP_API_ID_hipStreamCreate:
+ if (data->args.hipStreamCreate.stream) data->args.hipStreamCreate.stream__val = *(data->args.hipStreamCreate.stream);
+ break;
+// hipDevicePrimaryCtxRetain[('hipCtx_t*', 'pctx'), ('hipDevice_t', 'dev')]
+ case HIP_API_ID_hipDevicePrimaryCtxRetain:
+ if (data->args.hipDevicePrimaryCtxRetain.pctx) data->args.hipDevicePrimaryCtxRetain.pctx__val = *(data->args.hipDevicePrimaryCtxRetain.pctx);
+ break;
+// hipDeviceGet[('hipDevice_t*', 'device'), ('int', 'ordinal')]
+ case HIP_API_ID_hipDeviceGet:
+ if (data->args.hipDeviceGet.device) data->args.hipDeviceGet.device__val = *(data->args.hipDeviceGet.device);
+ break;
+// hipStreamCreateWithFlags[('hipStream_t*', 'stream'), ('unsigned int', 'flags')]
+ case HIP_API_ID_hipStreamCreateWithFlags:
+ if (data->args.hipStreamCreateWithFlags.stream) data->args.hipStreamCreateWithFlags.stream__val = *(data->args.hipStreamCreateWithFlags.stream);
+ break;
+// hipMemcpyFromArray[('void*', 'dst'), ('hipArray_const_t', 'srcArray'), ('size_t', 'wOffset'), ('size_t', 'hOffset'), ('size_t', 'count'), ('hipMemcpyKind', 'kind')]
+ case HIP_API_ID_hipMemcpyFromArray:
+ break;
+// hipMemcpy2DAsync[('void*', 'dst'), ('size_t', 'dpitch'), ('const void*', 'src'), ('size_t', 'spitch'), ('size_t', 'width'), ('size_t', 'height'), ('hipMemcpyKind', 'kind'), ('hipStream_t', 'stream')]
+ case HIP_API_ID_hipMemcpy2DAsync:
+ break;
+// hipFuncGetAttributes[('hipFuncAttributes*', 'attr'), ('const void*', 'func')]
+ case HIP_API_ID_hipFuncGetAttributes:
+ if (data->args.hipFuncGetAttributes.attr) data->args.hipFuncGetAttributes.attr__val = *(data->args.hipFuncGetAttributes.attr);
+ break;
+// hipGetSymbolSize[('size_t*', 'size'), ('const void*', 'symbol')]
+ case HIP_API_ID_hipGetSymbolSize:
+ if (data->args.hipGetSymbolSize.size) data->args.hipGetSymbolSize.size__val = *(data->args.hipGetSymbolSize.size);
+ break;
+// hipHostFree[('void*', 'ptr')]
+ case HIP_API_ID_hipHostFree:
+ break;
+// hipEventCreateWithFlags[('hipEvent_t*', 'event'), ('unsigned int', 'flags')]
+ case HIP_API_ID_hipEventCreateWithFlags:
+ if (data->args.hipEventCreateWithFlags.event) data->args.hipEventCreateWithFlags.event__val = *(data->args.hipEventCreateWithFlags.event);
+ break;
+// hipStreamQuery[('hipStream_t', 'stream')]
+ case HIP_API_ID_hipStreamQuery:
+ break;
+// hipMemcpy3D[('const hipMemcpy3DParms*', 'p')]
+ case HIP_API_ID_hipMemcpy3D:
+ if (data->args.hipMemcpy3D.p) data->args.hipMemcpy3D.p__val = *(data->args.hipMemcpy3D.p);
+ break;
+// hipMemcpyToSymbol[('const void*', 'symbol'), ('const void*', 'src'), ('size_t', 'sizeBytes'), ('size_t', 'offset'), ('hipMemcpyKind', 'kind')]
+ case HIP_API_ID_hipMemcpyToSymbol:
+ break;
+// hipMemcpy[('void*', 'dst'), ('const void*', 'src'), ('size_t', 'sizeBytes'), ('hipMemcpyKind', 'kind')]
+ case HIP_API_ID_hipMemcpy:
+ break;
+// hipPeekAtLastError[]
+ case HIP_API_ID_hipPeekAtLastError:
+ break;
+// hipExtLaunchMultiKernelMultiDevice[('hipLaunchParams*', 'launchParamsList'), ('int', 'numDevices'), ('unsigned int', 'flags')]
+ case HIP_API_ID_hipExtLaunchMultiKernelMultiDevice:
+ if (data->args.hipExtLaunchMultiKernelMultiDevice.launchParamsList) data->args.hipExtLaunchMultiKernelMultiDevice.launchParamsList__val = *(data->args.hipExtLaunchMultiKernelMultiDevice.launchParamsList);
+ break;
+// hipHostAlloc[('void**', 'ptr'), ('size_t', 'size'), ('unsigned int', 'flags')]
+ case HIP_API_ID_hipHostAlloc:
+ if (data->args.hipHostAlloc.ptr) data->args.hipHostAlloc.ptr__val = *(data->args.hipHostAlloc.ptr);
+ break;
+// hipStreamAddCallback[('hipStream_t', 'stream'), ('hipStreamCallback_t', 'callback'), ('void*', 'userData'), ('unsigned int', 'flags')]
+ case HIP_API_ID_hipStreamAddCallback:
+ break;
+// hipMemcpyToArray[('hipArray*', 'dst'), ('size_t', 'wOffset'), ('size_t', 'hOffset'), ('const void*', 'src'), ('size_t', 'count'), ('hipMemcpyKind', 'kind')]
+ case HIP_API_ID_hipMemcpyToArray:
+ if (data->args.hipMemcpyToArray.dst) data->args.hipMemcpyToArray.dst__val = *(data->args.hipMemcpyToArray.dst);
+ break;
+// hipMemsetD32[('hipDeviceptr_t', 'dest'), ('int', 'value'), ('size_t', 'count')]
+ case HIP_API_ID_hipMemsetD32:
+ break;
+// hipExtModuleLaunchKernel[('hipFunction_t', 'f'), ('unsigned int', 'globalWorkSizeX'), ('unsigned int', 'globalWorkSizeY'), ('unsigned int', 'globalWorkSizeZ'), ('unsigned int', 'localWorkSizeX'), ('unsigned int', 'localWorkSizeY'), ('unsigned int', 'localWorkSizeZ'), ('size_t', 'sharedMemBytes'), ('hipStream_t', 'hStream'), ('void**', 'kernelParams'), ('void**', 'extra'), ('hipEvent_t', 'startEvent'), ('hipEvent_t', 'stopEvent'), ('unsigned int', 'flags')]
+ case HIP_API_ID_hipExtModuleLaunchKernel:
+ if (data->args.hipExtModuleLaunchKernel.kernelParams) data->args.hipExtModuleLaunchKernel.kernelParams__val = *(data->args.hipExtModuleLaunchKernel.kernelParams);
+ if (data->args.hipExtModuleLaunchKernel.extra) data->args.hipExtModuleLaunchKernel.extra__val = *(data->args.hipExtModuleLaunchKernel.extra);
+ break;
+// hipDeviceSynchronize[]
+ case HIP_API_ID_hipDeviceSynchronize:
+ break;
+// hipDeviceGetCacheConfig[('hipFuncCache_t*', 'cacheConfig')]
+ case HIP_API_ID_hipDeviceGetCacheConfig:
+ if (data->args.hipDeviceGetCacheConfig.cacheConfig) data->args.hipDeviceGetCacheConfig.cacheConfig__val = *(data->args.hipDeviceGetCacheConfig.cacheConfig);
+ break;
+// hipMalloc3D[('hipPitchedPtr*', 'pitchedDevPtr'), ('hipExtent', 'extent')]
+ case HIP_API_ID_hipMalloc3D:
+ if (data->args.hipMalloc3D.pitchedDevPtr) data->args.hipMalloc3D.pitchedDevPtr__val = *(data->args.hipMalloc3D.pitchedDevPtr);
+ break;
+// hipPointerGetAttributes[('hipPointerAttribute_t*', 'attributes'), ('const void*', 'ptr')]
+ case HIP_API_ID_hipPointerGetAttributes:
+ if (data->args.hipPointerGetAttributes.attributes) data->args.hipPointerGetAttributes.attributes__val = *(data->args.hipPointerGetAttributes.attributes);
+ break;
+// hipMemsetAsync[('void*', 'dst'), ('int', 'value'), ('size_t', 'sizeBytes'), ('hipStream_t', 'stream')]
+ case HIP_API_ID_hipMemsetAsync:
+ break;
+// hipDeviceGetName[('char*', 'name'), ('int', 'len'), ('hipDevice_t', 'device')]
+ case HIP_API_ID_hipDeviceGetName:
+ data->args.hipDeviceGetName.name = (data->args.hipDeviceGetName.name) ? strdup(data->args.hipDeviceGetName.name) : NULL;
+ break;
+// hipModuleOccupancyMaxPotentialBlockSizeWithFlags[('int*', 'gridSize'), ('int*', 'blockSize'), ('hipFunction_t', 'f'), ('size_t', 'dynSharedMemPerBlk'), ('int', 'blockSizeLimit'), ('unsigned int', 'flags')]
+ case HIP_API_ID_hipModuleOccupancyMaxPotentialBlockSizeWithFlags:
+ if (data->args.hipModuleOccupancyMaxPotentialBlockSizeWithFlags.gridSize) data->args.hipModuleOccupancyMaxPotentialBlockSizeWithFlags.gridSize__val = *(data->args.hipModuleOccupancyMaxPotentialBlockSizeWithFlags.gridSize);
+ if (data->args.hipModuleOccupancyMaxPotentialBlockSizeWithFlags.blockSize) data->args.hipModuleOccupancyMaxPotentialBlockSizeWithFlags.blockSize__val = *(data->args.hipModuleOccupancyMaxPotentialBlockSizeWithFlags.blockSize);
+ break;
+// hipCtxPushCurrent[('hipCtx_t', 'ctx')]
+ case HIP_API_ID_hipCtxPushCurrent:
+ break;
+// hipMemcpyPeer[('void*', 'dst'), ('int', 'dstDeviceId'), ('const void*', 'src'), ('int', 'srcDeviceId'), ('size_t', 'sizeBytes')]
+ case HIP_API_ID_hipMemcpyPeer:
+ break;
+// hipEventSynchronize[('hipEvent_t', 'event')]
+ case HIP_API_ID_hipEventSynchronize:
+ break;
+// hipMemcpyDtoDAsync[('hipDeviceptr_t', 'dst'), ('hipDeviceptr_t', 'src'), ('size_t', 'sizeBytes'), ('hipStream_t', 'stream')]
+ case HIP_API_ID_hipMemcpyDtoDAsync:
+ break;
+// hipProfilerStart[]
+ case HIP_API_ID_hipProfilerStart:
+ break;
+// hipExtMallocWithFlags[('void**', 'ptr'), ('size_t', 'sizeBytes'), ('unsigned int', 'flags')]
+ case HIP_API_ID_hipExtMallocWithFlags:
+ if (data->args.hipExtMallocWithFlags.ptr) data->args.hipExtMallocWithFlags.ptr__val = *(data->args.hipExtMallocWithFlags.ptr);
+ break;
+// hipCtxEnablePeerAccess[('hipCtx_t', 'peerCtx'), ('unsigned int', 'flags')]
+ case HIP_API_ID_hipCtxEnablePeerAccess:
+ break;
+// hipMemAllocHost[('void**', 'ptr'), ('size_t', 'size')]
+ case HIP_API_ID_hipMemAllocHost:
+ if (data->args.hipMemAllocHost.ptr) data->args.hipMemAllocHost.ptr__val = *(data->args.hipMemAllocHost.ptr);
+ break;
+// hipMemcpyDtoHAsync[('void*', 'dst'), ('hipDeviceptr_t', 'src'), ('size_t', 'sizeBytes'), ('hipStream_t', 'stream')]
+ case HIP_API_ID_hipMemcpyDtoHAsync:
+ break;
+// hipModuleLaunchKernel[('hipFunction_t', 'f'), ('unsigned int', 'gridDimX'), ('unsigned int', 'gridDimY'), ('unsigned int', 'gridDimZ'), ('unsigned int', 'blockDimX'), ('unsigned int', 'blockDimY'), ('unsigned int', 'blockDimZ'), ('unsigned int', 'sharedMemBytes'), ('hipStream_t', 'stream'), ('void**', 'kernelParams'), ('void**', 'extra')]
+ case HIP_API_ID_hipModuleLaunchKernel:
+ if (data->args.hipModuleLaunchKernel.kernelParams) data->args.hipModuleLaunchKernel.kernelParams__val = *(data->args.hipModuleLaunchKernel.kernelParams);
+ if (data->args.hipModuleLaunchKernel.extra) data->args.hipModuleLaunchKernel.extra__val = *(data->args.hipModuleLaunchKernel.extra);
+ break;
+// hipMemAllocPitch[('hipDeviceptr_t*', 'dptr'), ('size_t*', 'pitch'), ('size_t', 'widthInBytes'), ('size_t', 'height'), ('unsigned int', 'elementSizeBytes')]
+ case HIP_API_ID_hipMemAllocPitch:
+ if (data->args.hipMemAllocPitch.dptr) data->args.hipMemAllocPitch.dptr__val = *(data->args.hipMemAllocPitch.dptr);
+ if (data->args.hipMemAllocPitch.pitch) data->args.hipMemAllocPitch.pitch__val = *(data->args.hipMemAllocPitch.pitch);
+ break;
+// hipExtLaunchKernel[('const void*', 'function_address'), ('dim3', 'numBlocks'), ('dim3', 'dimBlocks'), ('void**', 'args'), ('size_t', 'sharedMemBytes'), ('hipStream_t', 'stream'), ('hipEvent_t', 'startEvent'), ('hipEvent_t', 'stopEvent'), ('int', 'flags')]
+ case HIP_API_ID_hipExtLaunchKernel:
+ if (data->args.hipExtLaunchKernel.args) data->args.hipExtLaunchKernel.args__val = *(data->args.hipExtLaunchKernel.args);
+ break;
+// hipMemcpy2DFromArrayAsync[('void*', 'dst'), ('size_t', 'dpitch'), ('hipArray_const_t', 'src'), ('size_t', 'wOffset'), ('size_t', 'hOffset'), ('size_t', 'width'), ('size_t', 'height'), ('hipMemcpyKind', 'kind'), ('hipStream_t', 'stream')]
+ case HIP_API_ID_hipMemcpy2DFromArrayAsync:
+ break;
+// hipDeviceGetLimit[('size_t*', 'pValue'), ('hipLimit_t', 'limit')]
+ case HIP_API_ID_hipDeviceGetLimit:
+ if (data->args.hipDeviceGetLimit.pValue) data->args.hipDeviceGetLimit.pValue__val = *(data->args.hipDeviceGetLimit.pValue);
+ break;
+// hipModuleLoadDataEx[('hipModule_t*', 'module'), ('const void*', 'image'), ('unsigned int', 'numOptions'), ('hipJitOption*', 'options'), ('void**', 'optionsValues')]
+ case HIP_API_ID_hipModuleLoadDataEx:
+ if (data->args.hipModuleLoadDataEx.module) data->args.hipModuleLoadDataEx.module__val = *(data->args.hipModuleLoadDataEx.module);
+ if (data->args.hipModuleLoadDataEx.options) data->args.hipModuleLoadDataEx.options__val = *(data->args.hipModuleLoadDataEx.options);
+ if (data->args.hipModuleLoadDataEx.optionsValues) data->args.hipModuleLoadDataEx.optionsValues__val = *(data->args.hipModuleLoadDataEx.optionsValues);
+ break;
+// hipRuntimeGetVersion[('int*', 'runtimeVersion')]
+ case HIP_API_ID_hipRuntimeGetVersion:
+ if (data->args.hipRuntimeGetVersion.runtimeVersion) data->args.hipRuntimeGetVersion.runtimeVersion__val = *(data->args.hipRuntimeGetVersion.runtimeVersion);
+ break;
+// hipMemRangeGetAttribute[('void*', 'data'), ('size_t', 'data_size'), ('hipMemRangeAttribute', 'attribute'), ('const void*', 'dev_ptr'), ('size_t', 'count')]
+ case HIP_API_ID_hipMemRangeGetAttribute:
+ break;
+// hipDeviceGetP2PAttribute[('int*', 'value'), ('hipDeviceP2PAttr', 'attr'), ('int', 'srcDevice'), ('int', 'dstDevice')]
+ case HIP_API_ID_hipDeviceGetP2PAttribute:
+ if (data->args.hipDeviceGetP2PAttribute.value) data->args.hipDeviceGetP2PAttribute.value__val = *(data->args.hipDeviceGetP2PAttribute.value);
+ break;
+// hipMemcpyPeerAsync[('void*', 'dst'), ('int', 'dstDeviceId'), ('const void*', 'src'), ('int', 'srcDevice'), ('size_t', 'sizeBytes'), ('hipStream_t', 'stream')]
+ case HIP_API_ID_hipMemcpyPeerAsync:
+ break;
+// hipGetDeviceProperties[('hipDeviceProp_t*', 'props'), ('hipDevice_t', 'device')]
+ case HIP_API_ID_hipGetDeviceProperties:
+ if (data->args.hipGetDeviceProperties.props) data->args.hipGetDeviceProperties.props__val = *(data->args.hipGetDeviceProperties.props);
+ break;
+// hipMemcpyDtoH[('void*', 'dst'), ('hipDeviceptr_t', 'src'), ('size_t', 'sizeBytes')]
+ case HIP_API_ID_hipMemcpyDtoH:
+ break;
+// hipMemcpyWithStream[('void*', 'dst'), ('const void*', 'src'), ('size_t', 'sizeBytes'), ('hipMemcpyKind', 'kind'), ('hipStream_t', 'stream')]
+ case HIP_API_ID_hipMemcpyWithStream:
+ break;
+// hipDeviceTotalMem[('size_t*', 'bytes'), ('hipDevice_t', 'device')]
+ case HIP_API_ID_hipDeviceTotalMem:
+ if (data->args.hipDeviceTotalMem.bytes) data->args.hipDeviceTotalMem.bytes__val = *(data->args.hipDeviceTotalMem.bytes);
+ break;
+// hipHostGetDevicePointer[('void**', 'devPtr'), ('void*', 'hstPtr'), ('unsigned int', 'flags')]
+ case HIP_API_ID_hipHostGetDevicePointer:
+ if (data->args.hipHostGetDevicePointer.devPtr) data->args.hipHostGetDevicePointer.devPtr__val = *(data->args.hipHostGetDevicePointer.devPtr);
+ break;
+// hipMemRangeGetAttributes[('void**', 'data'), ('size_t*', 'data_sizes'), ('hipMemRangeAttribute*', 'attributes'), ('size_t', 'num_attributes'), ('const void*', 'dev_ptr'), ('size_t', 'count')]
+ case HIP_API_ID_hipMemRangeGetAttributes:
+ if (data->args.hipMemRangeGetAttributes.data) data->args.hipMemRangeGetAttributes.data__val = *(data->args.hipMemRangeGetAttributes.data);
+ if (data->args.hipMemRangeGetAttributes.data_sizes) data->args.hipMemRangeGetAttributes.data_sizes__val = *(data->args.hipMemRangeGetAttributes.data_sizes);
+ if (data->args.hipMemRangeGetAttributes.attributes) data->args.hipMemRangeGetAttributes.attributes__val = *(data->args.hipMemRangeGetAttributes.attributes);
+ break;
+// hipMemcpyParam2D[('const hip_Memcpy2D*', 'pCopy')]
+ case HIP_API_ID_hipMemcpyParam2D:
+ if (data->args.hipMemcpyParam2D.pCopy) data->args.hipMemcpyParam2D.pCopy__val = *(data->args.hipMemcpyParam2D.pCopy);
+ break;
+// hipDevicePrimaryCtxReset[('hipDevice_t', 'dev')]
+ case HIP_API_ID_hipDevicePrimaryCtxReset:
+ break;
+// hipGetMipmappedArrayLevel[('hipArray_t*', 'levelArray'), ('hipMipmappedArray_const_t', 'mipmappedArray'), ('unsigned int', 'level')]
+ case HIP_API_ID_hipGetMipmappedArrayLevel:
+ if (data->args.hipGetMipmappedArrayLevel.levelArray) data->args.hipGetMipmappedArrayLevel.levelArray__val = *(data->args.hipGetMipmappedArrayLevel.levelArray);
+ break;
+// hipMemsetD32Async[('hipDeviceptr_t', 'dst'), ('int', 'value'), ('size_t', 'count'), ('hipStream_t', 'stream')]
+ case HIP_API_ID_hipMemsetD32Async:
+ break;
+// hipGetDevice[('int*', 'deviceId')]
+ case HIP_API_ID_hipGetDevice:
+ if (data->args.hipGetDevice.deviceId) data->args.hipGetDevice.deviceId__val = *(data->args.hipGetDevice.deviceId);
+ break;
+// hipGetDeviceCount[('int*', 'count')]
+ case HIP_API_ID_hipGetDeviceCount:
+ if (data->args.hipGetDeviceCount.count) data->args.hipGetDeviceCount.count__val = *(data->args.hipGetDeviceCount.count);
+ break;
+// hipIpcOpenEventHandle[('hipEvent_t*', 'event'), ('hipIpcEventHandle_t', 'handle')]
+ case HIP_API_ID_hipIpcOpenEventHandle:
+ if (data->args.hipIpcOpenEventHandle.event) data->args.hipIpcOpenEventHandle.event__val = *(data->args.hipIpcOpenEventHandle.event);
+ break;
+ default: break;
+ };
+}
+
+#include <sstream>
+#include <string>
+// HIP API string method: formats an API call as "name(param=value, ...)" from its ID and recorded arguments
+static inline const char* hipApiString(hip_api_id_t id, const hip_api_data_t* data) {
+ std::ostringstream oss;
+ switch (id) {
+ case HIP_API_ID_hipDrvMemcpy3DAsync:
+ oss << "hipDrvMemcpy3DAsync(";
+ if (data->args.hipDrvMemcpy3DAsync.pCopy == NULL) oss << "pCopy=NULL";
+ else oss << "pCopy=" << data->args.hipDrvMemcpy3DAsync.pCopy__val;
+ oss << ", stream=" << data->args.hipDrvMemcpy3DAsync.stream;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipDeviceEnablePeerAccess:
+ oss << "hipDeviceEnablePeerAccess(";
+ oss << "peerDeviceId=" << data->args.hipDeviceEnablePeerAccess.peerDeviceId;
+ oss << ", flags=" << data->args.hipDeviceEnablePeerAccess.flags;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipFuncSetSharedMemConfig:
+ oss << "hipFuncSetSharedMemConfig(";
+ oss << "func=" << data->args.hipFuncSetSharedMemConfig.func;
+ oss << ", config=" << data->args.hipFuncSetSharedMemConfig.config;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipMemcpyToSymbolAsync:
+ oss << "hipMemcpyToSymbolAsync(";
+ oss << "symbol=" << data->args.hipMemcpyToSymbolAsync.symbol;
+ oss << ", src=" << data->args.hipMemcpyToSymbolAsync.src;
+ oss << ", sizeBytes=" << data->args.hipMemcpyToSymbolAsync.sizeBytes;
+ oss << ", offset=" << data->args.hipMemcpyToSymbolAsync.offset;
+ oss << ", kind=" << data->args.hipMemcpyToSymbolAsync.kind;
+ oss << ", stream=" << data->args.hipMemcpyToSymbolAsync.stream;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipMallocPitch:
+ oss << "hipMallocPitch(";
+ if (data->args.hipMallocPitch.ptr == NULL) oss << "ptr=NULL";
+ else oss << "ptr=" << data->args.hipMallocPitch.ptr__val;
+ if (data->args.hipMallocPitch.pitch == NULL) oss << ", pitch=NULL";
+ else oss << ", pitch=" << data->args.hipMallocPitch.pitch__val;
+ oss << ", width=" << data->args.hipMallocPitch.width;
+ oss << ", height=" << data->args.hipMallocPitch.height;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipMalloc:
+ oss << "hipMalloc(";
+ if (data->args.hipMalloc.ptr == NULL) oss << "ptr=NULL";
+ else oss << "ptr=" << data->args.hipMalloc.ptr__val;
+ oss << ", size=" << data->args.hipMalloc.size;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipMemsetD16:
+ oss << "hipMemsetD16(";
+ oss << "dest=" << data->args.hipMemsetD16.dest;
+ oss << ", value=" << data->args.hipMemsetD16.value;
+ oss << ", count=" << data->args.hipMemsetD16.count;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipExtStreamGetCUMask:
+ oss << "hipExtStreamGetCUMask(";
+ oss << "stream=" << data->args.hipExtStreamGetCUMask.stream;
+ oss << ", cuMaskSize=" << data->args.hipExtStreamGetCUMask.cuMaskSize;
+ if (data->args.hipExtStreamGetCUMask.cuMask == NULL) oss << ", cuMask=NULL";
+ else oss << ", cuMask=" << data->args.hipExtStreamGetCUMask.cuMask__val;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipEventRecord:
+ oss << "hipEventRecord(";
+ oss << "event=" << data->args.hipEventRecord.event;
+ oss << ", stream=" << data->args.hipEventRecord.stream;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipCtxSynchronize:
+ oss << "hipCtxSynchronize(";
+ oss << ")";
+ break;
+ case HIP_API_ID_hipSetDevice:
+ oss << "hipSetDevice(";
+ oss << "deviceId=" << data->args.hipSetDevice.deviceId;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipCtxGetApiVersion:
+ oss << "hipCtxGetApiVersion(";
+ oss << "ctx=" << data->args.hipCtxGetApiVersion.ctx;
+ if (data->args.hipCtxGetApiVersion.apiVersion == NULL) oss << ", apiVersion=NULL";
+ else oss << ", apiVersion=" << data->args.hipCtxGetApiVersion.apiVersion__val;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipMemcpyFromSymbolAsync:
+ oss << "hipMemcpyFromSymbolAsync(";
+ oss << "dst=" << data->args.hipMemcpyFromSymbolAsync.dst;
+ oss << ", symbol=" << data->args.hipMemcpyFromSymbolAsync.symbol;
+ oss << ", sizeBytes=" << data->args.hipMemcpyFromSymbolAsync.sizeBytes;
+ oss << ", offset=" << data->args.hipMemcpyFromSymbolAsync.offset;
+ oss << ", kind=" << data->args.hipMemcpyFromSymbolAsync.kind;
+ oss << ", stream=" << data->args.hipMemcpyFromSymbolAsync.stream;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipExtGetLinkTypeAndHopCount:
+ oss << "hipExtGetLinkTypeAndHopCount(";
+ oss << "device1=" << data->args.hipExtGetLinkTypeAndHopCount.device1;
+ oss << ", device2=" << data->args.hipExtGetLinkTypeAndHopCount.device2;
+ if (data->args.hipExtGetLinkTypeAndHopCount.linktype == NULL) oss << ", linktype=NULL";
+ else oss << ", linktype=" << data->args.hipExtGetLinkTypeAndHopCount.linktype__val;
+ if (data->args.hipExtGetLinkTypeAndHopCount.hopcount == NULL) oss << ", hopcount=NULL";
+ else oss << ", hopcount=" << data->args.hipExtGetLinkTypeAndHopCount.hopcount__val;
+ oss << ")";
+ break;
+ case HIP_API_ID___hipPopCallConfiguration:
+ oss << "__hipPopCallConfiguration(";
+ if (data->args.__hipPopCallConfiguration.gridDim == NULL) oss << "gridDim=NULL";
+ else oss << "gridDim=" << data->args.__hipPopCallConfiguration.gridDim__val;
+ if (data->args.__hipPopCallConfiguration.blockDim == NULL) oss << ", blockDim=NULL";
+ else oss << ", blockDim=" << data->args.__hipPopCallConfiguration.blockDim__val;
+ if (data->args.__hipPopCallConfiguration.sharedMem == NULL) oss << ", sharedMem=NULL";
+ else oss << ", sharedMem=" << data->args.__hipPopCallConfiguration.sharedMem__val;
+ if (data->args.__hipPopCallConfiguration.stream == NULL) oss << ", stream=NULL";
+ else oss << ", stream=" << data->args.__hipPopCallConfiguration.stream__val;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipModuleOccupancyMaxActiveBlocksPerMultiprocessor:
+ oss << "hipModuleOccupancyMaxActiveBlocksPerMultiprocessor(";
+ if (data->args.hipModuleOccupancyMaxActiveBlocksPerMultiprocessor.numBlocks == NULL) oss << "numBlocks=NULL";
+ else oss << "numBlocks=" << data->args.hipModuleOccupancyMaxActiveBlocksPerMultiprocessor.numBlocks__val;
+ oss << ", f=" << data->args.hipModuleOccupancyMaxActiveBlocksPerMultiprocessor.f;
+ oss << ", blockSize=" << data->args.hipModuleOccupancyMaxActiveBlocksPerMultiprocessor.blockSize;
+ oss << ", dynSharedMemPerBlk=" << data->args.hipModuleOccupancyMaxActiveBlocksPerMultiprocessor.dynSharedMemPerBlk;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipMemset3D:
+ oss << "hipMemset3D(";
+ oss << "pitchedDevPtr=" << data->args.hipMemset3D.pitchedDevPtr;
+ oss << ", value=" << data->args.hipMemset3D.value;
+ oss << ", extent=" << data->args.hipMemset3D.extent;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipStreamCreateWithPriority:
+ oss << "hipStreamCreateWithPriority(";
+ if (data->args.hipStreamCreateWithPriority.stream == NULL) oss << "stream=NULL";
+ else oss << "stream=" << data->args.hipStreamCreateWithPriority.stream__val;
+ oss << ", flags=" << data->args.hipStreamCreateWithPriority.flags;
+ oss << ", priority=" << data->args.hipStreamCreateWithPriority.priority;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipMemcpy2DToArray:
+ oss << "hipMemcpy2DToArray(";
+ if (data->args.hipMemcpy2DToArray.dst == NULL) oss << "dst=NULL";
+ else oss << "dst=" << data->args.hipMemcpy2DToArray.dst__val;
+ oss << ", wOffset=" << data->args.hipMemcpy2DToArray.wOffset;
+ oss << ", hOffset=" << data->args.hipMemcpy2DToArray.hOffset;
+ oss << ", src=" << data->args.hipMemcpy2DToArray.src;
+ oss << ", spitch=" << data->args.hipMemcpy2DToArray.spitch;
+ oss << ", width=" << data->args.hipMemcpy2DToArray.width;
+ oss << ", height=" << data->args.hipMemcpy2DToArray.height;
+ oss << ", kind=" << data->args.hipMemcpy2DToArray.kind;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipMemsetD8Async:
+ oss << "hipMemsetD8Async(";
+ oss << "dest=" << data->args.hipMemsetD8Async.dest;
+ oss << ", value=" << data->args.hipMemsetD8Async.value;
+ oss << ", count=" << data->args.hipMemsetD8Async.count;
+ oss << ", stream=" << data->args.hipMemsetD8Async.stream;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipCtxGetCacheConfig:
+ oss << "hipCtxGetCacheConfig(";
+ if (data->args.hipCtxGetCacheConfig.cacheConfig == NULL) oss << "cacheConfig=NULL";
+ else oss << "cacheConfig=" << data->args.hipCtxGetCacheConfig.cacheConfig__val;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipModuleGetFunction:
+ oss << "hipModuleGetFunction(";
+ if (data->args.hipModuleGetFunction.function == NULL) oss << "function=NULL";
+ else oss << "function=" << data->args.hipModuleGetFunction.function__val;
+ oss << ", module=" << data->args.hipModuleGetFunction.module;
+ if (data->args.hipModuleGetFunction.kname == NULL) oss << ", kname=NULL";
+ else oss << ", kname=" << data->args.hipModuleGetFunction.kname__val;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipStreamWaitEvent:
+ oss << "hipStreamWaitEvent(";
+ oss << "stream=" << data->args.hipStreamWaitEvent.stream;
+ oss << ", event=" << data->args.hipStreamWaitEvent.event;
+ oss << ", flags=" << data->args.hipStreamWaitEvent.flags;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipDeviceGetStreamPriorityRange:
+ oss << "hipDeviceGetStreamPriorityRange(";
+ if (data->args.hipDeviceGetStreamPriorityRange.leastPriority == NULL) oss << "leastPriority=NULL";
+ else oss << "leastPriority=" << data->args.hipDeviceGetStreamPriorityRange.leastPriority__val;
+ if (data->args.hipDeviceGetStreamPriorityRange.greatestPriority == NULL) oss << ", greatestPriority=NULL";
+ else oss << ", greatestPriority=" << data->args.hipDeviceGetStreamPriorityRange.greatestPriority__val;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipModuleLoad:
+ oss << "hipModuleLoad(";
+ if (data->args.hipModuleLoad.module == NULL) oss << "module=NULL";
+ else oss << "module=" << data->args.hipModuleLoad.module__val;
+ if (data->args.hipModuleLoad.fname == NULL) oss << ", fname=NULL";
+ else oss << ", fname=" << data->args.hipModuleLoad.fname__val;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipDevicePrimaryCtxSetFlags:
+ oss << "hipDevicePrimaryCtxSetFlags(";
+ oss << "dev=" << data->args.hipDevicePrimaryCtxSetFlags.dev;
+ oss << ", flags=" << data->args.hipDevicePrimaryCtxSetFlags.flags;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipLaunchCooperativeKernel:
+ oss << "hipLaunchCooperativeKernel(";
+ oss << "f=" << data->args.hipLaunchCooperativeKernel.f;
+ oss << ", gridDim=" << data->args.hipLaunchCooperativeKernel.gridDim;
+ oss << ", blockDimX=" << data->args.hipLaunchCooperativeKernel.blockDimX;
+ if (data->args.hipLaunchCooperativeKernel.kernelParams == NULL) oss << ", kernelParams=NULL";
+ else oss << ", kernelParams=" << data->args.hipLaunchCooperativeKernel.kernelParams__val;
+ oss << ", sharedMemBytes=" << data->args.hipLaunchCooperativeKernel.sharedMemBytes;
+ oss << ", stream=" << data->args.hipLaunchCooperativeKernel.stream;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipLaunchCooperativeKernelMultiDevice:
+ oss << "hipLaunchCooperativeKernelMultiDevice(";
+ if (data->args.hipLaunchCooperativeKernelMultiDevice.launchParamsList == NULL) oss << "launchParamsList=NULL";
+ else oss << "launchParamsList=" << data->args.hipLaunchCooperativeKernelMultiDevice.launchParamsList__val;
+ oss << ", numDevices=" << data->args.hipLaunchCooperativeKernelMultiDevice.numDevices;
+ oss << ", flags=" << data->args.hipLaunchCooperativeKernelMultiDevice.flags;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipMemcpyAsync:
+ oss << "hipMemcpyAsync(";
+ oss << "dst=" << data->args.hipMemcpyAsync.dst;
+ oss << ", src=" << data->args.hipMemcpyAsync.src;
+ oss << ", sizeBytes=" << data->args.hipMemcpyAsync.sizeBytes;
+ oss << ", kind=" << data->args.hipMemcpyAsync.kind;
+ oss << ", stream=" << data->args.hipMemcpyAsync.stream;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipMalloc3DArray:
+ oss << "hipMalloc3DArray(";
+ if (data->args.hipMalloc3DArray.array == NULL) oss << "array=NULL";
+ else oss << "array=" << data->args.hipMalloc3DArray.array__val;
+ if (data->args.hipMalloc3DArray.desc == NULL) oss << ", desc=NULL";
+ else oss << ", desc=" << data->args.hipMalloc3DArray.desc__val;
+ oss << ", extent=" << data->args.hipMalloc3DArray.extent;
+ oss << ", flags=" << data->args.hipMalloc3DArray.flags;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipMallocHost:
+ oss << "hipMallocHost(";
+ if (data->args.hipMallocHost.ptr == NULL) oss << "ptr=NULL";
+ else oss << "ptr=" << data->args.hipMallocHost.ptr__val;
+ oss << ", size=" << data->args.hipMallocHost.size;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipCtxGetCurrent:
+ oss << "hipCtxGetCurrent(";
+ if (data->args.hipCtxGetCurrent.ctx == NULL) oss << "ctx=NULL";
+ else oss << "ctx=" << data->args.hipCtxGetCurrent.ctx__val;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipDevicePrimaryCtxGetState:
+ oss << "hipDevicePrimaryCtxGetState(";
+ oss << "dev=" << data->args.hipDevicePrimaryCtxGetState.dev;
+ if (data->args.hipDevicePrimaryCtxGetState.flags == NULL) oss << ", flags=NULL";
+ else oss << ", flags=" << data->args.hipDevicePrimaryCtxGetState.flags__val;
+ if (data->args.hipDevicePrimaryCtxGetState.active == NULL) oss << ", active=NULL";
+ else oss << ", active=" << data->args.hipDevicePrimaryCtxGetState.active__val;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipEventQuery:
+ oss << "hipEventQuery(";
+ oss << "event=" << data->args.hipEventQuery.event;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipEventCreate:
+ oss << "hipEventCreate(";
+ if (data->args.hipEventCreate.event == NULL) oss << "event=NULL";
+ else oss << "event=" << data->args.hipEventCreate.event__val;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipMemGetAddressRange:
+ oss << "hipMemGetAddressRange(";
+ if (data->args.hipMemGetAddressRange.pbase == NULL) oss << "pbase=NULL";
+ else oss << "pbase=" << data->args.hipMemGetAddressRange.pbase__val;
+ if (data->args.hipMemGetAddressRange.psize == NULL) oss << ", psize=NULL";
+ else oss << ", psize=" << data->args.hipMemGetAddressRange.psize__val;
+ oss << ", dptr=" << data->args.hipMemGetAddressRange.dptr;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipMemcpyFromSymbol:
+ oss << "hipMemcpyFromSymbol(";
+ oss << "dst=" << data->args.hipMemcpyFromSymbol.dst;
+ oss << ", symbol=" << data->args.hipMemcpyFromSymbol.symbol;
+ oss << ", sizeBytes=" << data->args.hipMemcpyFromSymbol.sizeBytes;
+ oss << ", offset=" << data->args.hipMemcpyFromSymbol.offset;
+ oss << ", kind=" << data->args.hipMemcpyFromSymbol.kind;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipArrayCreate:
+ oss << "hipArrayCreate(";
+ if (data->args.hipArrayCreate.pHandle == NULL) oss << "pHandle=NULL";
+ else oss << "pHandle=" << (void*)data->args.hipArrayCreate.pHandle__val;
+ if (data->args.hipArrayCreate.pAllocateArray == NULL) oss << ", pAllocateArray=NULL";
+ else oss << ", pAllocateArray=" << data->args.hipArrayCreate.pAllocateArray__val;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipStreamAttachMemAsync:
+ oss << "hipStreamAttachMemAsync(";
+ oss << "stream=" << data->args.hipStreamAttachMemAsync.stream;
+ if (data->args.hipStreamAttachMemAsync.dev_ptr == NULL) oss << ", dev_ptr=NULL";
+ else oss << ", dev_ptr=" << data->args.hipStreamAttachMemAsync.dev_ptr__val;
+ oss << ", length=" << data->args.hipStreamAttachMemAsync.length;
+ oss << ", flags=" << data->args.hipStreamAttachMemAsync.flags;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipStreamGetFlags:
+ oss << "hipStreamGetFlags(";
+ oss << "stream=" << data->args.hipStreamGetFlags.stream;
+ if (data->args.hipStreamGetFlags.flags == NULL) oss << ", flags=NULL";
+ else oss << ", flags=" << data->args.hipStreamGetFlags.flags__val;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipMallocArray:
+ oss << "hipMallocArray(";
+ if (data->args.hipMallocArray.array == NULL) oss << "array=NULL";
+ else oss << "array=" << (void*)data->args.hipMallocArray.array__val;
+ if (data->args.hipMallocArray.desc == NULL) oss << ", desc=NULL";
+ else oss << ", desc=" << data->args.hipMallocArray.desc__val;
+ oss << ", width=" << data->args.hipMallocArray.width;
+ oss << ", height=" << data->args.hipMallocArray.height;
+ oss << ", flags=" << data->args.hipMallocArray.flags;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipCtxGetSharedMemConfig:
+ oss << "hipCtxGetSharedMemConfig(";
+ if (data->args.hipCtxGetSharedMemConfig.pConfig == NULL) oss << "pConfig=NULL";
+ else oss << "pConfig=" << data->args.hipCtxGetSharedMemConfig.pConfig__val;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipDeviceDisablePeerAccess:
+ oss << "hipDeviceDisablePeerAccess(";
+ oss << "peerDeviceId=" << data->args.hipDeviceDisablePeerAccess.peerDeviceId;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipModuleOccupancyMaxPotentialBlockSize:
+ oss << "hipModuleOccupancyMaxPotentialBlockSize(";
+ if (data->args.hipModuleOccupancyMaxPotentialBlockSize.gridSize == NULL) oss << "gridSize=NULL";
+ else oss << "gridSize=" << data->args.hipModuleOccupancyMaxPotentialBlockSize.gridSize__val;
+ if (data->args.hipModuleOccupancyMaxPotentialBlockSize.blockSize == NULL) oss << ", blockSize=NULL";
+ else oss << ", blockSize=" << data->args.hipModuleOccupancyMaxPotentialBlockSize.blockSize__val;
+ oss << ", f=" << data->args.hipModuleOccupancyMaxPotentialBlockSize.f;
+ oss << ", dynSharedMemPerBlk=" << data->args.hipModuleOccupancyMaxPotentialBlockSize.dynSharedMemPerBlk;
+ oss << ", blockSizeLimit=" << data->args.hipModuleOccupancyMaxPotentialBlockSize.blockSizeLimit;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipMemPtrGetInfo:
+ oss << "hipMemPtrGetInfo(";
+ oss << "ptr=" << data->args.hipMemPtrGetInfo.ptr;
+ if (data->args.hipMemPtrGetInfo.size == NULL) oss << ", size=NULL";
+ else oss << ", size=" << data->args.hipMemPtrGetInfo.size__val;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipFuncGetAttribute:
+ oss << "hipFuncGetAttribute(";
+ if (data->args.hipFuncGetAttribute.value == NULL) oss << "value=NULL";
+ else oss << "value=" << data->args.hipFuncGetAttribute.value__val;
+ oss << ", attrib=" << data->args.hipFuncGetAttribute.attrib;
+ oss << ", hfunc=" << data->args.hipFuncGetAttribute.hfunc;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipCtxGetFlags:
+ oss << "hipCtxGetFlags(";
+ if (data->args.hipCtxGetFlags.flags == NULL) oss << "flags=NULL";
+ else oss << "flags=" << data->args.hipCtxGetFlags.flags__val;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipStreamDestroy:
+ oss << "hipStreamDestroy(";
+ oss << "stream=" << data->args.hipStreamDestroy.stream;
+ oss << ")";
+ break;
+ case HIP_API_ID___hipPushCallConfiguration:
+ oss << "__hipPushCallConfiguration(";
+ oss << "gridDim=" << data->args.__hipPushCallConfiguration.gridDim;
+ oss << ", blockDim=" << data->args.__hipPushCallConfiguration.blockDim;
+ oss << ", sharedMem=" << data->args.__hipPushCallConfiguration.sharedMem;
+ oss << ", stream=" << data->args.__hipPushCallConfiguration.stream;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipMemset3DAsync:
+ oss << "hipMemset3DAsync(";
+ oss << "pitchedDevPtr=" << data->args.hipMemset3DAsync.pitchedDevPtr;
+ oss << ", value=" << data->args.hipMemset3DAsync.value;
+ oss << ", extent=" << data->args.hipMemset3DAsync.extent;
+ oss << ", stream=" << data->args.hipMemset3DAsync.stream;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipDeviceGetPCIBusId:
+ oss << "hipDeviceGetPCIBusId(";
+ if (data->args.hipDeviceGetPCIBusId.pciBusId == NULL) oss << "pciBusId=NULL";
+ else oss << "pciBusId=" << data->args.hipDeviceGetPCIBusId.pciBusId__val;
+ oss << ", len=" << data->args.hipDeviceGetPCIBusId.len;
+ oss << ", device=" << data->args.hipDeviceGetPCIBusId.device;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipInit:
+ oss << "hipInit(";
+ oss << "flags=" << data->args.hipInit.flags;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipMemcpyAtoH:
+ oss << "hipMemcpyAtoH(";
+ oss << "dst=" << data->args.hipMemcpyAtoH.dst;
+ if (data->args.hipMemcpyAtoH.srcArray == NULL) oss << ", srcArray=NULL";
+ else oss << ", srcArray=" << data->args.hipMemcpyAtoH.srcArray__val;
+ oss << ", srcOffset=" << data->args.hipMemcpyAtoH.srcOffset;
+ oss << ", count=" << data->args.hipMemcpyAtoH.count;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipStreamGetPriority:
+ oss << "hipStreamGetPriority(";
+ oss << "stream=" << data->args.hipStreamGetPriority.stream;
+ if (data->args.hipStreamGetPriority.priority == NULL) oss << ", priority=NULL";
+ else oss << ", priority=" << data->args.hipStreamGetPriority.priority__val;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipMemset2D:
+ oss << "hipMemset2D(";
+ oss << "dst=" << data->args.hipMemset2D.dst;
+ oss << ", pitch=" << data->args.hipMemset2D.pitch;
+ oss << ", value=" << data->args.hipMemset2D.value;
+ oss << ", width=" << data->args.hipMemset2D.width;
+ oss << ", height=" << data->args.hipMemset2D.height;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipMemset2DAsync:
+ oss << "hipMemset2DAsync(";
+ oss << "dst=" << data->args.hipMemset2DAsync.dst;
+ oss << ", pitch=" << data->args.hipMemset2DAsync.pitch;
+ oss << ", value=" << data->args.hipMemset2DAsync.value;
+ oss << ", width=" << data->args.hipMemset2DAsync.width;
+ oss << ", height=" << data->args.hipMemset2DAsync.height;
+ oss << ", stream=" << data->args.hipMemset2DAsync.stream;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipDeviceCanAccessPeer:
+ oss << "hipDeviceCanAccessPeer(";
+ if (data->args.hipDeviceCanAccessPeer.canAccessPeer == NULL) oss << "canAccessPeer=NULL";
+ else oss << "canAccessPeer=" << data->args.hipDeviceCanAccessPeer.canAccessPeer__val;
+ oss << ", deviceId=" << data->args.hipDeviceCanAccessPeer.deviceId;
+ oss << ", peerDeviceId=" << data->args.hipDeviceCanAccessPeer.peerDeviceId;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipLaunchByPtr:
+ oss << "hipLaunchByPtr(";
+ oss << "hostFunction=" << data->args.hipLaunchByPtr.hostFunction;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipMemPrefetchAsync:
+ oss << "hipMemPrefetchAsync(";
+ oss << "dev_ptr=" << data->args.hipMemPrefetchAsync.dev_ptr;
+ oss << ", count=" << data->args.hipMemPrefetchAsync.count;
+ oss << ", device=" << data->args.hipMemPrefetchAsync.device;
+ oss << ", stream=" << data->args.hipMemPrefetchAsync.stream;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipCtxDestroy:
+ oss << "hipCtxDestroy(";
+ oss << "ctx=" << data->args.hipCtxDestroy.ctx;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipMemsetD16Async:
+ oss << "hipMemsetD16Async(";
+ oss << "dest=" << data->args.hipMemsetD16Async.dest;
+ oss << ", value=" << data->args.hipMemsetD16Async.value;
+ oss << ", count=" << data->args.hipMemsetD16Async.count;
+ oss << ", stream=" << data->args.hipMemsetD16Async.stream;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipModuleUnload:
+ oss << "hipModuleUnload(";
+ oss << "module=" << data->args.hipModuleUnload.module;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipHostUnregister:
+ oss << "hipHostUnregister(";
+ oss << "hostPtr=" << data->args.hipHostUnregister.hostPtr;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipProfilerStop:
+ oss << "hipProfilerStop(";
+ oss << ")";
+ break;
+ case HIP_API_ID_hipExtStreamCreateWithCUMask:
+ oss << "hipExtStreamCreateWithCUMask(";
+ if (data->args.hipExtStreamCreateWithCUMask.stream == NULL) oss << "stream=NULL";
+ else oss << "stream=" << data->args.hipExtStreamCreateWithCUMask.stream__val;
+ oss << ", cuMaskSize=" << data->args.hipExtStreamCreateWithCUMask.cuMaskSize;
+ if (data->args.hipExtStreamCreateWithCUMask.cuMask == NULL) oss << ", cuMask=NULL";
+ else oss << ", cuMask=" << data->args.hipExtStreamCreateWithCUMask.cuMask__val;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipStreamSynchronize:
+ oss << "hipStreamSynchronize(";
+ oss << "stream=" << data->args.hipStreamSynchronize.stream;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipFreeHost:
+ oss << "hipFreeHost(";
+ oss << "ptr=" << data->args.hipFreeHost.ptr;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipDeviceSetCacheConfig:
+ oss << "hipDeviceSetCacheConfig(";
+ oss << "cacheConfig=" << data->args.hipDeviceSetCacheConfig.cacheConfig;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipGetErrorName:
+ oss << "hipGetErrorName(";
+ oss << ")";
+ break;
+ case HIP_API_ID_hipMemcpyHtoD:
+ oss << "hipMemcpyHtoD(";
+ oss << "dst=" << data->args.hipMemcpyHtoD.dst;
+ oss << ", src=" << data->args.hipMemcpyHtoD.src;
+ oss << ", sizeBytes=" << data->args.hipMemcpyHtoD.sizeBytes;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipModuleGetGlobal:
+ oss << "hipModuleGetGlobal(";
+ if (data->args.hipModuleGetGlobal.dptr == NULL) oss << "dptr=NULL";
+ else oss << "dptr=" << data->args.hipModuleGetGlobal.dptr__val;
+ if (data->args.hipModuleGetGlobal.bytes == NULL) oss << ", bytes=NULL";
+ else oss << ", bytes=" << data->args.hipModuleGetGlobal.bytes__val;
+ oss << ", hmod=" << data->args.hipModuleGetGlobal.hmod;
+ if (data->args.hipModuleGetGlobal.name == NULL) oss << ", name=NULL";
+ else oss << ", name=" << data->args.hipModuleGetGlobal.name__val;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipMemcpyHtoA:
+ oss << "hipMemcpyHtoA(";
+ if (data->args.hipMemcpyHtoA.dstArray == NULL) oss << "dstArray=NULL";
+ else oss << "dstArray=" << data->args.hipMemcpyHtoA.dstArray__val;
+ oss << ", dstOffset=" << data->args.hipMemcpyHtoA.dstOffset;
+ oss << ", srcHost=" << data->args.hipMemcpyHtoA.srcHost;
+ oss << ", count=" << data->args.hipMemcpyHtoA.count;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipCtxCreate:
+ oss << "hipCtxCreate(";
+ if (data->args.hipCtxCreate.ctx == NULL) oss << "ctx=NULL";
+ else oss << "ctx=" << data->args.hipCtxCreate.ctx__val;
+ oss << ", flags=" << data->args.hipCtxCreate.flags;
+ oss << ", device=" << data->args.hipCtxCreate.device;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipMemcpy2D:
+ oss << "hipMemcpy2D(";
+ oss << "dst=" << data->args.hipMemcpy2D.dst;
+ oss << ", dpitch=" << data->args.hipMemcpy2D.dpitch;
+ oss << ", src=" << data->args.hipMemcpy2D.src;
+ oss << ", spitch=" << data->args.hipMemcpy2D.spitch;
+ oss << ", width=" << data->args.hipMemcpy2D.width;
+ oss << ", height=" << data->args.hipMemcpy2D.height;
+ oss << ", kind=" << data->args.hipMemcpy2D.kind;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipIpcCloseMemHandle:
+ oss << "hipIpcCloseMemHandle(";
+ oss << "devPtr=" << data->args.hipIpcCloseMemHandle.devPtr;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipChooseDevice:
+ oss << "hipChooseDevice(";
+ if (data->args.hipChooseDevice.device == NULL) oss << "device=NULL";
+ else oss << "device=" << data->args.hipChooseDevice.device__val;
+ if (data->args.hipChooseDevice.prop == NULL) oss << ", prop=NULL";
+ else oss << ", prop=" << data->args.hipChooseDevice.prop__val;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipDeviceSetSharedMemConfig:
+ oss << "hipDeviceSetSharedMemConfig(";
+ oss << "config=" << data->args.hipDeviceSetSharedMemConfig.config;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipMallocMipmappedArray:
+ oss << "hipMallocMipmappedArray(";
+ if (data->args.hipMallocMipmappedArray.mipmappedArray == NULL) oss << "mipmappedArray=NULL";
+ else oss << "mipmappedArray=" << data->args.hipMallocMipmappedArray.mipmappedArray__val;
+ if (data->args.hipMallocMipmappedArray.desc == NULL) oss << ", desc=NULL";
+ else oss << ", desc=" << data->args.hipMallocMipmappedArray.desc__val;
+ oss << ", extent=" << data->args.hipMallocMipmappedArray.extent;
+ oss << ", numLevels=" << data->args.hipMallocMipmappedArray.numLevels;
+ oss << ", flags=" << data->args.hipMallocMipmappedArray.flags;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipSetupArgument:
+ oss << "hipSetupArgument(";
+ oss << "arg=" << data->args.hipSetupArgument.arg;
+ oss << ", size=" << data->args.hipSetupArgument.size;
+ oss << ", offset=" << data->args.hipSetupArgument.offset;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipIpcGetEventHandle:
+ oss << "hipIpcGetEventHandle(";
+ if (data->args.hipIpcGetEventHandle.handle == NULL) oss << "handle=NULL";
+ else oss << "handle=" << data->args.hipIpcGetEventHandle.handle__val;
+ oss << ", event=" << data->args.hipIpcGetEventHandle.event;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipFreeArray:
+ oss << "hipFreeArray(";
+ if (data->args.hipFreeArray.array == NULL) oss << "array=NULL";
+ else oss << "array=" << data->args.hipFreeArray.array__val;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipCtxSetCacheConfig:
+ oss << "hipCtxSetCacheConfig(";
+ oss << "cacheConfig=" << data->args.hipCtxSetCacheConfig.cacheConfig;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipFuncSetCacheConfig:
+ oss << "hipFuncSetCacheConfig(";
+ oss << "func=" << data->args.hipFuncSetCacheConfig.func;
+ oss << ", config=" << data->args.hipFuncSetCacheConfig.config;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipLaunchKernel:
+ oss << "hipLaunchKernel(";
+ oss << "function_address=" << data->args.hipLaunchKernel.function_address;
+ oss << ", numBlocks=" << data->args.hipLaunchKernel.numBlocks;
+ oss << ", dimBlocks=" << data->args.hipLaunchKernel.dimBlocks;
+ if (data->args.hipLaunchKernel.args == NULL) oss << ", args=NULL";
+ else oss << ", args=" << data->args.hipLaunchKernel.args__val;
+ oss << ", sharedMemBytes=" << data->args.hipLaunchKernel.sharedMemBytes;
+ oss << ", stream=" << data->args.hipLaunchKernel.stream;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags:
+ oss << "hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(";
+ if (data->args.hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.numBlocks == NULL) oss << "numBlocks=NULL";
+ else oss << "numBlocks=" << data->args.hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.numBlocks__val;
+ oss << ", f=" << data->args.hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.f;
+ oss << ", blockSize=" << data->args.hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.blockSize;
+ oss << ", dynSharedMemPerBlk=" << data->args.hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.dynSharedMemPerBlk;
+ oss << ", flags=" << data->args.hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.flags;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipModuleGetTexRef:
+ oss << "hipModuleGetTexRef(";
+ if (data->args.hipModuleGetTexRef.texRef == NULL) oss << "texRef=NULL";
+ else oss << "texRef=" << (void*)data->args.hipModuleGetTexRef.texRef__val;
+ oss << ", hmod=" << data->args.hipModuleGetTexRef.hmod;
+ if (data->args.hipModuleGetTexRef.name == NULL) oss << ", name=NULL";
+ else oss << ", name=" << data->args.hipModuleGetTexRef.name__val;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipFuncSetAttribute:
+ oss << "hipFuncSetAttribute(";
+ oss << "func=" << data->args.hipFuncSetAttribute.func;
+ oss << ", attr=" << data->args.hipFuncSetAttribute.attr;
+ oss << ", value=" << data->args.hipFuncSetAttribute.value;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipEventElapsedTime:
+ oss << "hipEventElapsedTime(";
+ if (data->args.hipEventElapsedTime.ms == NULL) oss << "ms=NULL";
+ else oss << "ms=" << data->args.hipEventElapsedTime.ms__val;
+ oss << ", start=" << data->args.hipEventElapsedTime.start;
+ oss << ", stop=" << data->args.hipEventElapsedTime.stop;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipConfigureCall:
+ oss << "hipConfigureCall(";
+ oss << "gridDim=" << data->args.hipConfigureCall.gridDim;
+ oss << ", blockDim=" << data->args.hipConfigureCall.blockDim;
+ oss << ", sharedMem=" << data->args.hipConfigureCall.sharedMem;
+ oss << ", stream=" << data->args.hipConfigureCall.stream;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipMemAdvise:
+ oss << "hipMemAdvise(";
+ oss << "dev_ptr=" << data->args.hipMemAdvise.dev_ptr;
+ oss << ", count=" << data->args.hipMemAdvise.count;
+ oss << ", advice=" << data->args.hipMemAdvise.advice;
+ oss << ", device=" << data->args.hipMemAdvise.device;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipMemcpy3DAsync:
+ oss << "hipMemcpy3DAsync(";
+ if (data->args.hipMemcpy3DAsync.p == NULL) oss << "p=NULL";
+ else oss << "p=" << data->args.hipMemcpy3DAsync.p__val;
+ oss << ", stream=" << data->args.hipMemcpy3DAsync.stream;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipEventDestroy:
+ oss << "hipEventDestroy(";
+ oss << "event=" << data->args.hipEventDestroy.event;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipCtxPopCurrent:
+ oss << "hipCtxPopCurrent(";
+ if (data->args.hipCtxPopCurrent.ctx == NULL) oss << "ctx=NULL";
+ else oss << "ctx=" << data->args.hipCtxPopCurrent.ctx__val;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipGetSymbolAddress:
+ oss << "hipGetSymbolAddress(";
+ if (data->args.hipGetSymbolAddress.devPtr == NULL) oss << "devPtr=NULL";
+ else oss << "devPtr=" << data->args.hipGetSymbolAddress.devPtr__val;
+ oss << ", symbol=" << data->args.hipGetSymbolAddress.symbol;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipHostGetFlags:
+ oss << "hipHostGetFlags(";
+ if (data->args.hipHostGetFlags.flagsPtr == NULL) oss << "flagsPtr=NULL";
+ else oss << "flagsPtr=" << data->args.hipHostGetFlags.flagsPtr__val;
+ oss << ", hostPtr=" << data->args.hipHostGetFlags.hostPtr;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipHostMalloc:
+ oss << "hipHostMalloc(";
+ if (data->args.hipHostMalloc.ptr == NULL) oss << "ptr=NULL";
+ else oss << "ptr=" << data->args.hipHostMalloc.ptr__val;
+ oss << ", size=" << data->args.hipHostMalloc.size;
+ oss << ", flags=" << data->args.hipHostMalloc.flags;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipCtxSetSharedMemConfig:
+ oss << "hipCtxSetSharedMemConfig(";
+ oss << "config=" << data->args.hipCtxSetSharedMemConfig.config;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipFreeMipmappedArray:
+ oss << "hipFreeMipmappedArray(";
+ oss << "mipmappedArray=" << data->args.hipFreeMipmappedArray.mipmappedArray;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipMemGetInfo:
+ oss << "hipMemGetInfo(";
+ if (data->args.hipMemGetInfo.free == NULL) oss << "free=NULL";
+ else oss << "free=" << data->args.hipMemGetInfo.free__val;
+ if (data->args.hipMemGetInfo.total == NULL) oss << ", total=NULL";
+ else oss << ", total=" << data->args.hipMemGetInfo.total__val;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipDeviceReset:
+ oss << "hipDeviceReset(";
+ oss << ")";
+ break;
+ case HIP_API_ID_hipMemset:
+ oss << "hipMemset(";
+ oss << "dst=" << data->args.hipMemset.dst;
+ oss << ", value=" << data->args.hipMemset.value;
+ oss << ", sizeBytes=" << data->args.hipMemset.sizeBytes;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipMemsetD8:
+ oss << "hipMemsetD8(";
+ oss << "dest=" << data->args.hipMemsetD8.dest;
+ oss << ", value=" << data->args.hipMemsetD8.value;
+ oss << ", count=" << data->args.hipMemsetD8.count;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipMemcpyParam2DAsync:
+ oss << "hipMemcpyParam2DAsync(";
+ if (data->args.hipMemcpyParam2DAsync.pCopy == NULL) oss << "pCopy=NULL";
+ else oss << "pCopy=" << data->args.hipMemcpyParam2DAsync.pCopy__val;
+ oss << ", stream=" << data->args.hipMemcpyParam2DAsync.stream;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipHostRegister:
+ oss << "hipHostRegister(";
+ oss << "hostPtr=" << data->args.hipHostRegister.hostPtr;
+ oss << ", sizeBytes=" << data->args.hipHostRegister.sizeBytes;
+ oss << ", flags=" << data->args.hipHostRegister.flags;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipDriverGetVersion:
+ oss << "hipDriverGetVersion(";
+ if (data->args.hipDriverGetVersion.driverVersion == NULL) oss << "driverVersion=NULL";
+ else oss << "driverVersion=" << data->args.hipDriverGetVersion.driverVersion__val;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipArray3DCreate:
+ oss << "hipArray3DCreate(";
+ if (data->args.hipArray3DCreate.array == NULL) oss << "array=NULL";
+ else oss << "array=" << (void*)data->args.hipArray3DCreate.array__val;
+ if (data->args.hipArray3DCreate.pAllocateArray == NULL) oss << ", pAllocateArray=NULL";
+ else oss << ", pAllocateArray=" << data->args.hipArray3DCreate.pAllocateArray__val;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipIpcOpenMemHandle:
+ oss << "hipIpcOpenMemHandle(";
+ if (data->args.hipIpcOpenMemHandle.devPtr == NULL) oss << "devPtr=NULL";
+ else oss << "devPtr=" << data->args.hipIpcOpenMemHandle.devPtr__val;
+ oss << ", handle=" << data->args.hipIpcOpenMemHandle.handle;
+ oss << ", flags=" << data->args.hipIpcOpenMemHandle.flags;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipGetLastError:
+ oss << "hipGetLastError(";
+ oss << ")";
+ break;
+ case HIP_API_ID_hipGetDeviceFlags:
+ oss << "hipGetDeviceFlags(";
+ if (data->args.hipGetDeviceFlags.flags == NULL) oss << "flags=NULL";
+ else oss << "flags=" << data->args.hipGetDeviceFlags.flags__val;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipDeviceGetSharedMemConfig:
+ oss << "hipDeviceGetSharedMemConfig(";
+ if (data->args.hipDeviceGetSharedMemConfig.pConfig == NULL) oss << "pConfig=NULL";
+ else oss << "pConfig=" << data->args.hipDeviceGetSharedMemConfig.pConfig__val;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipDrvMemcpy3D:
+ oss << "hipDrvMemcpy3D(";
+ if (data->args.hipDrvMemcpy3D.pCopy == NULL) oss << "pCopy=NULL";
+ else oss << "pCopy=" << data->args.hipDrvMemcpy3D.pCopy__val;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipMemcpy2DFromArray:
+ oss << "hipMemcpy2DFromArray(";
+ oss << "dst=" << data->args.hipMemcpy2DFromArray.dst;
+ oss << ", dpitch=" << data->args.hipMemcpy2DFromArray.dpitch;
+ oss << ", src=" << data->args.hipMemcpy2DFromArray.src;
+ oss << ", wOffset=" << data->args.hipMemcpy2DFromArray.wOffset;
+ oss << ", hOffset=" << data->args.hipMemcpy2DFromArray.hOffset;
+ oss << ", width=" << data->args.hipMemcpy2DFromArray.width;
+ oss << ", height=" << data->args.hipMemcpy2DFromArray.height;
+ oss << ", kind=" << data->args.hipMemcpy2DFromArray.kind;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags:
+ oss << "hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(";
+ if (data->args.hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.numBlocks == NULL) oss << "numBlocks=NULL";
+ else oss << "numBlocks=" << data->args.hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.numBlocks__val;
+ oss << ", f=" << data->args.hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.f;
+ oss << ", blockSize=" << data->args.hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.blockSize;
+ oss << ", dynamicSMemSize=" << data->args.hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.dynamicSMemSize;
+ oss << ", flags=" << data->args.hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.flags;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipSetDeviceFlags:
+ oss << "hipSetDeviceFlags(";
+ oss << "flags=" << data->args.hipSetDeviceFlags.flags;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipHccModuleLaunchKernel:
+ oss << "hipHccModuleLaunchKernel(";
+ oss << "f=" << data->args.hipHccModuleLaunchKernel.f;
+ oss << ", globalWorkSizeX=" << data->args.hipHccModuleLaunchKernel.globalWorkSizeX;
+ oss << ", globalWorkSizeY=" << data->args.hipHccModuleLaunchKernel.globalWorkSizeY;
+ oss << ", globalWorkSizeZ=" << data->args.hipHccModuleLaunchKernel.globalWorkSizeZ;
+ oss << ", blockDimX=" << data->args.hipHccModuleLaunchKernel.blockDimX;
+ oss << ", blockDimY=" << data->args.hipHccModuleLaunchKernel.blockDimY;
+ oss << ", blockDimZ=" << data->args.hipHccModuleLaunchKernel.blockDimZ;
+ oss << ", sharedMemBytes=" << data->args.hipHccModuleLaunchKernel.sharedMemBytes;
+ oss << ", hStream=" << data->args.hipHccModuleLaunchKernel.hStream;
+ if (data->args.hipHccModuleLaunchKernel.kernelParams == NULL) oss << ", kernelParams=NULL";
+ else oss << ", kernelParams=" << data->args.hipHccModuleLaunchKernel.kernelParams__val;
+ if (data->args.hipHccModuleLaunchKernel.extra == NULL) oss << ", extra=NULL";
+ else oss << ", extra=" << data->args.hipHccModuleLaunchKernel.extra__val;
+ oss << ", startEvent=" << data->args.hipHccModuleLaunchKernel.startEvent;
+ oss << ", stopEvent=" << data->args.hipHccModuleLaunchKernel.stopEvent;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipFree:
+ oss << "hipFree(";
+ oss << "ptr=" << data->args.hipFree.ptr;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipOccupancyMaxPotentialBlockSize:
+ oss << "hipOccupancyMaxPotentialBlockSize(";
+ if (data->args.hipOccupancyMaxPotentialBlockSize.gridSize == NULL) oss << "gridSize=NULL";
+ else oss << "gridSize=" << data->args.hipOccupancyMaxPotentialBlockSize.gridSize__val;
+ if (data->args.hipOccupancyMaxPotentialBlockSize.blockSize == NULL) oss << ", blockSize=NULL";
+ else oss << ", blockSize=" << data->args.hipOccupancyMaxPotentialBlockSize.blockSize__val;
+ oss << ", f=" << data->args.hipOccupancyMaxPotentialBlockSize.f;
+ oss << ", dynSharedMemPerBlk=" << data->args.hipOccupancyMaxPotentialBlockSize.dynSharedMemPerBlk;
+ oss << ", blockSizeLimit=" << data->args.hipOccupancyMaxPotentialBlockSize.blockSizeLimit;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipDeviceGetAttribute:
+ oss << "hipDeviceGetAttribute(";
+ if (data->args.hipDeviceGetAttribute.pi == NULL) oss << "pi=NULL";
+ else oss << "pi=" << data->args.hipDeviceGetAttribute.pi__val;
+ oss << ", attr=" << data->args.hipDeviceGetAttribute.attr;
+ oss << ", deviceId=" << data->args.hipDeviceGetAttribute.deviceId;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipDeviceComputeCapability:
+ oss << "hipDeviceComputeCapability(";
+ if (data->args.hipDeviceComputeCapability.major == NULL) oss << "major=NULL";
+ else oss << "major=" << data->args.hipDeviceComputeCapability.major__val;
+ if (data->args.hipDeviceComputeCapability.minor == NULL) oss << ", minor=NULL";
+ else oss << ", minor=" << data->args.hipDeviceComputeCapability.minor__val;
+ oss << ", device=" << data->args.hipDeviceComputeCapability.device;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipCtxDisablePeerAccess:
+ oss << "hipCtxDisablePeerAccess(";
+ oss << "peerCtx=" << data->args.hipCtxDisablePeerAccess.peerCtx;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipMallocManaged:
+ oss << "hipMallocManaged(";
+ if (data->args.hipMallocManaged.dev_ptr == NULL) oss << "dev_ptr=NULL";
+ else oss << "dev_ptr=" << data->args.hipMallocManaged.dev_ptr__val;
+ oss << ", size=" << data->args.hipMallocManaged.size;
+ oss << ", flags=" << data->args.hipMallocManaged.flags;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipDeviceGetByPCIBusId:
+ oss << "hipDeviceGetByPCIBusId(";
+ if (data->args.hipDeviceGetByPCIBusId.device == NULL) oss << "device=NULL";
+ else oss << "device=" << data->args.hipDeviceGetByPCIBusId.device__val;
+ if (data->args.hipDeviceGetByPCIBusId.pciBusId == NULL) oss << ", pciBusId=NULL";
+ else oss << ", pciBusId=" << data->args.hipDeviceGetByPCIBusId.pciBusId__val;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipIpcGetMemHandle:
+ oss << "hipIpcGetMemHandle(";
+ if (data->args.hipIpcGetMemHandle.handle == NULL) oss << "handle=NULL";
+ else oss << "handle=" << data->args.hipIpcGetMemHandle.handle__val;
+ oss << ", devPtr=" << data->args.hipIpcGetMemHandle.devPtr;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipMemcpyHtoDAsync:
+ oss << "hipMemcpyHtoDAsync(";
+ oss << "dst=" << data->args.hipMemcpyHtoDAsync.dst;
+ oss << ", src=" << data->args.hipMemcpyHtoDAsync.src;
+ oss << ", sizeBytes=" << data->args.hipMemcpyHtoDAsync.sizeBytes;
+ oss << ", stream=" << data->args.hipMemcpyHtoDAsync.stream;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipCtxGetDevice:
+ oss << "hipCtxGetDevice(";
+ if (data->args.hipCtxGetDevice.device == NULL) oss << "device=NULL";
+ else oss << "device=" << data->args.hipCtxGetDevice.device__val;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipMemcpyDtoD:
+ oss << "hipMemcpyDtoD(";
+ oss << "dst=" << data->args.hipMemcpyDtoD.dst;
+ oss << ", src=" << data->args.hipMemcpyDtoD.src;
+ oss << ", sizeBytes=" << data->args.hipMemcpyDtoD.sizeBytes;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipModuleLoadData:
+ oss << "hipModuleLoadData(";
+ if (data->args.hipModuleLoadData.module == NULL) oss << "module=NULL";
+ else oss << "module=" << data->args.hipModuleLoadData.module__val;
+ oss << ", image=" << data->args.hipModuleLoadData.image;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipDevicePrimaryCtxRelease:
+ oss << "hipDevicePrimaryCtxRelease(";
+ oss << "dev=" << data->args.hipDevicePrimaryCtxRelease.dev;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipOccupancyMaxActiveBlocksPerMultiprocessor:
+ oss << "hipOccupancyMaxActiveBlocksPerMultiprocessor(";
+ if (data->args.hipOccupancyMaxActiveBlocksPerMultiprocessor.numBlocks == NULL) oss << "numBlocks=NULL";
+ else oss << "numBlocks=" << data->args.hipOccupancyMaxActiveBlocksPerMultiprocessor.numBlocks__val;
+ oss << ", f=" << data->args.hipOccupancyMaxActiveBlocksPerMultiprocessor.f;
+ oss << ", blockSize=" << data->args.hipOccupancyMaxActiveBlocksPerMultiprocessor.blockSize;
+ oss << ", dynamicSMemSize=" << data->args.hipOccupancyMaxActiveBlocksPerMultiprocessor.dynamicSMemSize;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipCtxSetCurrent:
+ oss << "hipCtxSetCurrent(";
+ oss << "ctx=" << data->args.hipCtxSetCurrent.ctx;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipGetErrorString:
+ oss << "hipGetErrorString(";
+ oss << ")";
+ break;
+ case HIP_API_ID_hipStreamCreate:
+ oss << "hipStreamCreate(";
+ if (data->args.hipStreamCreate.stream == NULL) oss << "stream=NULL";
+ else oss << "stream=" << data->args.hipStreamCreate.stream__val;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipDevicePrimaryCtxRetain:
+ oss << "hipDevicePrimaryCtxRetain(";
+ if (data->args.hipDevicePrimaryCtxRetain.pctx == NULL) oss << "pctx=NULL";
+ else oss << "pctx=" << data->args.hipDevicePrimaryCtxRetain.pctx__val;
+ oss << ", dev=" << data->args.hipDevicePrimaryCtxRetain.dev;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipDeviceGet:
+ oss << "hipDeviceGet(";
+ if (data->args.hipDeviceGet.device == NULL) oss << "device=NULL";
+ else oss << "device=" << data->args.hipDeviceGet.device__val;
+ oss << ", ordinal=" << data->args.hipDeviceGet.ordinal;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipStreamCreateWithFlags:
+ oss << "hipStreamCreateWithFlags(";
+ if (data->args.hipStreamCreateWithFlags.stream == NULL) oss << "stream=NULL";
+ else oss << "stream=" << data->args.hipStreamCreateWithFlags.stream__val;
+ oss << ", flags=" << data->args.hipStreamCreateWithFlags.flags;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipMemcpyFromArray:
+ oss << "hipMemcpyFromArray(";
+ oss << "dst=" << data->args.hipMemcpyFromArray.dst;
+ oss << ", srcArray=" << data->args.hipMemcpyFromArray.srcArray;
+ oss << ", wOffset=" << data->args.hipMemcpyFromArray.wOffset;
+ oss << ", hOffset=" << data->args.hipMemcpyFromArray.hOffset;
+ oss << ", count=" << data->args.hipMemcpyFromArray.count;
+ oss << ", kind=" << data->args.hipMemcpyFromArray.kind;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipMemcpy2DAsync:
+ oss << "hipMemcpy2DAsync(";
+ oss << "dst=" << data->args.hipMemcpy2DAsync.dst;
+ oss << ", dpitch=" << data->args.hipMemcpy2DAsync.dpitch;
+ oss << ", src=" << data->args.hipMemcpy2DAsync.src;
+ oss << ", spitch=" << data->args.hipMemcpy2DAsync.spitch;
+ oss << ", width=" << data->args.hipMemcpy2DAsync.width;
+ oss << ", height=" << data->args.hipMemcpy2DAsync.height;
+ oss << ", kind=" << data->args.hipMemcpy2DAsync.kind;
+ oss << ", stream=" << data->args.hipMemcpy2DAsync.stream;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipFuncGetAttributes:
+ oss << "hipFuncGetAttributes(";
+ if (data->args.hipFuncGetAttributes.attr == NULL) oss << "attr=NULL";
+ else oss << "attr=" << data->args.hipFuncGetAttributes.attr__val;
+ oss << ", func=" << data->args.hipFuncGetAttributes.func;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipGetSymbolSize:
+ oss << "hipGetSymbolSize(";
+ if (data->args.hipGetSymbolSize.size == NULL) oss << "size=NULL";
+ else oss << "size=" << data->args.hipGetSymbolSize.size__val;
+ oss << ", symbol=" << data->args.hipGetSymbolSize.symbol;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipHostFree:
+ oss << "hipHostFree(";
+ oss << "ptr=" << data->args.hipHostFree.ptr;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipEventCreateWithFlags:
+ oss << "hipEventCreateWithFlags(";
+ if (data->args.hipEventCreateWithFlags.event == NULL) oss << "event=NULL";
+ else oss << "event=" << data->args.hipEventCreateWithFlags.event__val;
+ oss << ", flags=" << data->args.hipEventCreateWithFlags.flags;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipStreamQuery:
+ oss << "hipStreamQuery(";
+ oss << "stream=" << data->args.hipStreamQuery.stream;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipMemcpy3D:
+ oss << "hipMemcpy3D(";
+ if (data->args.hipMemcpy3D.p == NULL) oss << "p=NULL";
+ else oss << "p=" << data->args.hipMemcpy3D.p__val;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipMemcpyToSymbol:
+ oss << "hipMemcpyToSymbol(";
+ oss << "symbol=" << data->args.hipMemcpyToSymbol.symbol;
+ oss << ", src=" << data->args.hipMemcpyToSymbol.src;
+ oss << ", sizeBytes=" << data->args.hipMemcpyToSymbol.sizeBytes;
+ oss << ", offset=" << data->args.hipMemcpyToSymbol.offset;
+ oss << ", kind=" << data->args.hipMemcpyToSymbol.kind;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipMemcpy:
+ oss << "hipMemcpy(";
+ oss << "dst=" << data->args.hipMemcpy.dst;
+ oss << ", src=" << data->args.hipMemcpy.src;
+ oss << ", sizeBytes=" << data->args.hipMemcpy.sizeBytes;
+ oss << ", kind=" << data->args.hipMemcpy.kind;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipPeekAtLastError:
+ oss << "hipPeekAtLastError(";
+ oss << ")";
+ break;
+ case HIP_API_ID_hipExtLaunchMultiKernelMultiDevice:
+ oss << "hipExtLaunchMultiKernelMultiDevice(";
+ if (data->args.hipExtLaunchMultiKernelMultiDevice.launchParamsList == NULL) oss << "launchParamsList=NULL";
+ else oss << "launchParamsList=" << data->args.hipExtLaunchMultiKernelMultiDevice.launchParamsList__val;
+ oss << ", numDevices=" << data->args.hipExtLaunchMultiKernelMultiDevice.numDevices;
+ oss << ", flags=" << data->args.hipExtLaunchMultiKernelMultiDevice.flags;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipHostAlloc:
+ oss << "hipHostAlloc(";
+ if (data->args.hipHostAlloc.ptr == NULL) oss << "ptr=NULL";
+ else oss << "ptr=" << data->args.hipHostAlloc.ptr__val;
+ oss << ", size=" << data->args.hipHostAlloc.size;
+ oss << ", flags=" << data->args.hipHostAlloc.flags;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipStreamAddCallback:
+ oss << "hipStreamAddCallback(";
+ oss << "stream=" << data->args.hipStreamAddCallback.stream;
+ oss << ", callback=" << data->args.hipStreamAddCallback.callback;
+ oss << ", userData=" << data->args.hipStreamAddCallback.userData;
+ oss << ", flags=" << data->args.hipStreamAddCallback.flags;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipMemcpyToArray:
+ oss << "hipMemcpyToArray(";
+ if (data->args.hipMemcpyToArray.dst == NULL) oss << "dst=NULL";
+ else oss << "dst=" << data->args.hipMemcpyToArray.dst__val;
+ oss << ", wOffset=" << data->args.hipMemcpyToArray.wOffset;
+ oss << ", hOffset=" << data->args.hipMemcpyToArray.hOffset;
+ oss << ", src=" << data->args.hipMemcpyToArray.src;
+ oss << ", count=" << data->args.hipMemcpyToArray.count;
+ oss << ", kind=" << data->args.hipMemcpyToArray.kind;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipMemsetD32:
+ oss << "hipMemsetD32(";
+ oss << "dest=" << data->args.hipMemsetD32.dest;
+ oss << ", value=" << data->args.hipMemsetD32.value;
+ oss << ", count=" << data->args.hipMemsetD32.count;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipExtModuleLaunchKernel:
+ oss << "hipExtModuleLaunchKernel(";
+ oss << "f=" << data->args.hipExtModuleLaunchKernel.f;
+ oss << ", globalWorkSizeX=" << data->args.hipExtModuleLaunchKernel.globalWorkSizeX;
+ oss << ", globalWorkSizeY=" << data->args.hipExtModuleLaunchKernel.globalWorkSizeY;
+ oss << ", globalWorkSizeZ=" << data->args.hipExtModuleLaunchKernel.globalWorkSizeZ;
+ oss << ", localWorkSizeX=" << data->args.hipExtModuleLaunchKernel.localWorkSizeX;
+ oss << ", localWorkSizeY=" << data->args.hipExtModuleLaunchKernel.localWorkSizeY;
+ oss << ", localWorkSizeZ=" << data->args.hipExtModuleLaunchKernel.localWorkSizeZ;
+ oss << ", sharedMemBytes=" << data->args.hipExtModuleLaunchKernel.sharedMemBytes;
+ oss << ", hStream=" << data->args.hipExtModuleLaunchKernel.hStream;
+ if (data->args.hipExtModuleLaunchKernel.kernelParams == NULL) oss << ", kernelParams=NULL";
+ else oss << ", kernelParams=" << data->args.hipExtModuleLaunchKernel.kernelParams__val;
+ if (data->args.hipExtModuleLaunchKernel.extra == NULL) oss << ", extra=NULL";
+ else oss << ", extra=" << data->args.hipExtModuleLaunchKernel.extra__val;
+ oss << ", startEvent=" << data->args.hipExtModuleLaunchKernel.startEvent;
+ oss << ", stopEvent=" << data->args.hipExtModuleLaunchKernel.stopEvent;
+ oss << ", flags=" << data->args.hipExtModuleLaunchKernel.flags;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipDeviceSynchronize:
+ oss << "hipDeviceSynchronize(";
+ oss << ")";
+ break;
+ case HIP_API_ID_hipDeviceGetCacheConfig:
+ oss << "hipDeviceGetCacheConfig(";
+ if (data->args.hipDeviceGetCacheConfig.cacheConfig == NULL) oss << "cacheConfig=NULL";
+ else oss << "cacheConfig=" << data->args.hipDeviceGetCacheConfig.cacheConfig__val;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipMalloc3D:
+ oss << "hipMalloc3D(";
+ if (data->args.hipMalloc3D.pitchedDevPtr == NULL) oss << "pitchedDevPtr=NULL";
+ else oss << "pitchedDevPtr=" << data->args.hipMalloc3D.pitchedDevPtr__val;
+ oss << ", extent=" << data->args.hipMalloc3D.extent;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipPointerGetAttributes:
+ oss << "hipPointerGetAttributes(";
+ if (data->args.hipPointerGetAttributes.attributes == NULL) oss << "attributes=NULL";
+ else oss << "attributes=" << data->args.hipPointerGetAttributes.attributes__val;
+ oss << ", ptr=" << data->args.hipPointerGetAttributes.ptr;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipMemsetAsync:
+ oss << "hipMemsetAsync(";
+ oss << "dst=" << data->args.hipMemsetAsync.dst;
+ oss << ", value=" << data->args.hipMemsetAsync.value;
+ oss << ", sizeBytes=" << data->args.hipMemsetAsync.sizeBytes;
+ oss << ", stream=" << data->args.hipMemsetAsync.stream;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipDeviceGetName:
+ oss << "hipDeviceGetName(";
+ if (data->args.hipDeviceGetName.name == NULL) oss << "name=NULL";
+ else oss << "name=" << data->args.hipDeviceGetName.name__val;
+ oss << ", len=" << data->args.hipDeviceGetName.len;
+ oss << ", device=" << data->args.hipDeviceGetName.device;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipModuleOccupancyMaxPotentialBlockSizeWithFlags:
+ oss << "hipModuleOccupancyMaxPotentialBlockSizeWithFlags(";
+ if (data->args.hipModuleOccupancyMaxPotentialBlockSizeWithFlags.gridSize == NULL) oss << "gridSize=NULL";
+ else oss << "gridSize=" << data->args.hipModuleOccupancyMaxPotentialBlockSizeWithFlags.gridSize__val;
+ if (data->args.hipModuleOccupancyMaxPotentialBlockSizeWithFlags.blockSize == NULL) oss << ", blockSize=NULL";
+ else oss << ", blockSize=" << data->args.hipModuleOccupancyMaxPotentialBlockSizeWithFlags.blockSize__val;
+ oss << ", f=" << data->args.hipModuleOccupancyMaxPotentialBlockSizeWithFlags.f;
+ oss << ", dynSharedMemPerBlk=" << data->args.hipModuleOccupancyMaxPotentialBlockSizeWithFlags.dynSharedMemPerBlk;
+ oss << ", blockSizeLimit=" << data->args.hipModuleOccupancyMaxPotentialBlockSizeWithFlags.blockSizeLimit;
+ oss << ", flags=" << data->args.hipModuleOccupancyMaxPotentialBlockSizeWithFlags.flags;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipCtxPushCurrent:
+ oss << "hipCtxPushCurrent(";
+ oss << "ctx=" << data->args.hipCtxPushCurrent.ctx;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipMemcpyPeer:
+ oss << "hipMemcpyPeer(";
+ oss << "dst=" << data->args.hipMemcpyPeer.dst;
+ oss << ", dstDeviceId=" << data->args.hipMemcpyPeer.dstDeviceId;
+ oss << ", src=" << data->args.hipMemcpyPeer.src;
+ oss << ", srcDeviceId=" << data->args.hipMemcpyPeer.srcDeviceId;
+ oss << ", sizeBytes=" << data->args.hipMemcpyPeer.sizeBytes;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipEventSynchronize:
+ oss << "hipEventSynchronize(";
+ oss << "event=" << data->args.hipEventSynchronize.event;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipMemcpyDtoDAsync:
+ oss << "hipMemcpyDtoDAsync(";
+ oss << "dst=" << data->args.hipMemcpyDtoDAsync.dst;
+ oss << ", src=" << data->args.hipMemcpyDtoDAsync.src;
+ oss << ", sizeBytes=" << data->args.hipMemcpyDtoDAsync.sizeBytes;
+ oss << ", stream=" << data->args.hipMemcpyDtoDAsync.stream;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipProfilerStart:
+ oss << "hipProfilerStart(";
+ oss << ")";
+ break;
+ case HIP_API_ID_hipExtMallocWithFlags:
+ oss << "hipExtMallocWithFlags(";
+ if (data->args.hipExtMallocWithFlags.ptr == NULL) oss << "ptr=NULL";
+ else oss << "ptr=" << data->args.hipExtMallocWithFlags.ptr__val;
+ oss << ", sizeBytes=" << data->args.hipExtMallocWithFlags.sizeBytes;
+ oss << ", flags=" << data->args.hipExtMallocWithFlags.flags;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipCtxEnablePeerAccess:
+ oss << "hipCtxEnablePeerAccess(";
+ oss << "peerCtx=" << data->args.hipCtxEnablePeerAccess.peerCtx;
+ oss << ", flags=" << data->args.hipCtxEnablePeerAccess.flags;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipMemAllocHost:
+ oss << "hipMemAllocHost(";
+ if (data->args.hipMemAllocHost.ptr == NULL) oss << "ptr=NULL";
+ else oss << "ptr=" << data->args.hipMemAllocHost.ptr__val;
+ oss << ", size=" << data->args.hipMemAllocHost.size;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipMemcpyDtoHAsync:
+ oss << "hipMemcpyDtoHAsync(";
+ oss << "dst=" << data->args.hipMemcpyDtoHAsync.dst;
+ oss << ", src=" << data->args.hipMemcpyDtoHAsync.src;
+ oss << ", sizeBytes=" << data->args.hipMemcpyDtoHAsync.sizeBytes;
+ oss << ", stream=" << data->args.hipMemcpyDtoHAsync.stream;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipModuleLaunchKernel:
+ oss << "hipModuleLaunchKernel(";
+ oss << "f=" << data->args.hipModuleLaunchKernel.f;
+ oss << ", gridDimX=" << data->args.hipModuleLaunchKernel.gridDimX;
+ oss << ", gridDimY=" << data->args.hipModuleLaunchKernel.gridDimY;
+ oss << ", gridDimZ=" << data->args.hipModuleLaunchKernel.gridDimZ;
+ oss << ", blockDimX=" << data->args.hipModuleLaunchKernel.blockDimX;
+ oss << ", blockDimY=" << data->args.hipModuleLaunchKernel.blockDimY;
+ oss << ", blockDimZ=" << data->args.hipModuleLaunchKernel.blockDimZ;
+ oss << ", sharedMemBytes=" << data->args.hipModuleLaunchKernel.sharedMemBytes;
+ oss << ", stream=" << data->args.hipModuleLaunchKernel.stream;
+ if (data->args.hipModuleLaunchKernel.kernelParams == NULL) oss << ", kernelParams=NULL";
+ else oss << ", kernelParams=" << data->args.hipModuleLaunchKernel.kernelParams__val;
+ if (data->args.hipModuleLaunchKernel.extra == NULL) oss << ", extra=NULL";
+ else oss << ", extra=" << data->args.hipModuleLaunchKernel.extra__val;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipMemAllocPitch:
+ oss << "hipMemAllocPitch(";
+ if (data->args.hipMemAllocPitch.dptr == NULL) oss << "dptr=NULL";
+ else oss << "dptr=" << data->args.hipMemAllocPitch.dptr__val;
+ if (data->args.hipMemAllocPitch.pitch == NULL) oss << ", pitch=NULL";
+ else oss << ", pitch=" << data->args.hipMemAllocPitch.pitch__val;
+ oss << ", widthInBytes=" << data->args.hipMemAllocPitch.widthInBytes;
+ oss << ", height=" << data->args.hipMemAllocPitch.height;
+ oss << ", elementSizeBytes=" << data->args.hipMemAllocPitch.elementSizeBytes;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipExtLaunchKernel:
+ oss << "hipExtLaunchKernel(";
+ oss << "function_address=" << data->args.hipExtLaunchKernel.function_address;
+ oss << ", numBlocks=" << data->args.hipExtLaunchKernel.numBlocks;
+ oss << ", dimBlocks=" << data->args.hipExtLaunchKernel.dimBlocks;
+ if (data->args.hipExtLaunchKernel.args == NULL) oss << ", args=NULL";
+ else oss << ", args=" << data->args.hipExtLaunchKernel.args__val;
+ oss << ", sharedMemBytes=" << data->args.hipExtLaunchKernel.sharedMemBytes;
+ oss << ", stream=" << data->args.hipExtLaunchKernel.stream;
+ oss << ", startEvent=" << data->args.hipExtLaunchKernel.startEvent;
+ oss << ", stopEvent=" << data->args.hipExtLaunchKernel.stopEvent;
+ oss << ", flags=" << data->args.hipExtLaunchKernel.flags;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipMemcpy2DFromArrayAsync:
+ oss << "hipMemcpy2DFromArrayAsync(";
+ oss << "dst=" << data->args.hipMemcpy2DFromArrayAsync.dst;
+ oss << ", dpitch=" << data->args.hipMemcpy2DFromArrayAsync.dpitch;
+ oss << ", src=" << data->args.hipMemcpy2DFromArrayAsync.src;
+ oss << ", wOffset=" << data->args.hipMemcpy2DFromArrayAsync.wOffset;
+ oss << ", hOffset=" << data->args.hipMemcpy2DFromArrayAsync.hOffset;
+ oss << ", width=" << data->args.hipMemcpy2DFromArrayAsync.width;
+ oss << ", height=" << data->args.hipMemcpy2DFromArrayAsync.height;
+ oss << ", kind=" << data->args.hipMemcpy2DFromArrayAsync.kind;
+ oss << ", stream=" << data->args.hipMemcpy2DFromArrayAsync.stream;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipDeviceGetLimit:
+ oss << "hipDeviceGetLimit(";
+ if (data->args.hipDeviceGetLimit.pValue == NULL) oss << "pValue=NULL";
+ else oss << "pValue=" << data->args.hipDeviceGetLimit.pValue__val;
+ oss << ", limit=" << data->args.hipDeviceGetLimit.limit;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipModuleLoadDataEx:
+ oss << "hipModuleLoadDataEx(";
+ if (data->args.hipModuleLoadDataEx.module == NULL) oss << "module=NULL";
+ else oss << "module=" << data->args.hipModuleLoadDataEx.module__val;
+ oss << ", image=" << data->args.hipModuleLoadDataEx.image;
+ oss << ", numOptions=" << data->args.hipModuleLoadDataEx.numOptions;
+ if (data->args.hipModuleLoadDataEx.options == NULL) oss << ", options=NULL";
+ else oss << ", options=" << data->args.hipModuleLoadDataEx.options__val;
+ if (data->args.hipModuleLoadDataEx.optionsValues == NULL) oss << ", optionsValues=NULL";
+ else oss << ", optionsValues=" << data->args.hipModuleLoadDataEx.optionsValues__val;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipRuntimeGetVersion:
+ oss << "hipRuntimeGetVersion(";
+ if (data->args.hipRuntimeGetVersion.runtimeVersion == NULL) oss << "runtimeVersion=NULL";
+ else oss << "runtimeVersion=" << data->args.hipRuntimeGetVersion.runtimeVersion__val;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipMemRangeGetAttribute:
+ oss << "hipMemRangeGetAttribute(";
+ oss << "data=" << data->args.hipMemRangeGetAttribute.data;
+ oss << ", data_size=" << data->args.hipMemRangeGetAttribute.data_size;
+ oss << ", attribute=" << data->args.hipMemRangeGetAttribute.attribute;
+ oss << ", dev_ptr=" << data->args.hipMemRangeGetAttribute.dev_ptr;
+ oss << ", count=" << data->args.hipMemRangeGetAttribute.count;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipDeviceGetP2PAttribute:
+ oss << "hipDeviceGetP2PAttribute(";
+ if (data->args.hipDeviceGetP2PAttribute.value == NULL) oss << "value=NULL";
+ else oss << "value=" << data->args.hipDeviceGetP2PAttribute.value__val;
+ oss << ", attr=" << data->args.hipDeviceGetP2PAttribute.attr;
+ oss << ", srcDevice=" << data->args.hipDeviceGetP2PAttribute.srcDevice;
+ oss << ", dstDevice=" << data->args.hipDeviceGetP2PAttribute.dstDevice;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipMemcpyPeerAsync:
+ oss << "hipMemcpyPeerAsync(";
+ oss << "dst=" << data->args.hipMemcpyPeerAsync.dst;
+ oss << ", dstDeviceId=" << data->args.hipMemcpyPeerAsync.dstDeviceId;
+ oss << ", src=" << data->args.hipMemcpyPeerAsync.src;
+ oss << ", srcDevice=" << data->args.hipMemcpyPeerAsync.srcDevice;
+ oss << ", sizeBytes=" << data->args.hipMemcpyPeerAsync.sizeBytes;
+ oss << ", stream=" << data->args.hipMemcpyPeerAsync.stream;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipGetDeviceProperties:
+ oss << "hipGetDeviceProperties(";
+ if (data->args.hipGetDeviceProperties.props == NULL) oss << "props=NULL";
+ else oss << "props=" << data->args.hipGetDeviceProperties.props__val;
+ oss << ", device=" << data->args.hipGetDeviceProperties.device;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipMemcpyDtoH:
+ oss << "hipMemcpyDtoH(";
+ oss << "dst=" << data->args.hipMemcpyDtoH.dst;
+ oss << ", src=" << data->args.hipMemcpyDtoH.src;
+ oss << ", sizeBytes=" << data->args.hipMemcpyDtoH.sizeBytes;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipMemcpyWithStream:
+ oss << "hipMemcpyWithStream(";
+ oss << "dst=" << data->args.hipMemcpyWithStream.dst;
+ oss << ", src=" << data->args.hipMemcpyWithStream.src;
+ oss << ", sizeBytes=" << data->args.hipMemcpyWithStream.sizeBytes;
+ oss << ", kind=" << data->args.hipMemcpyWithStream.kind;
+ oss << ", stream=" << data->args.hipMemcpyWithStream.stream;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipDeviceTotalMem:
+ oss << "hipDeviceTotalMem(";
+ if (data->args.hipDeviceTotalMem.bytes == NULL) oss << "bytes=NULL";
+ else oss << "bytes=" << data->args.hipDeviceTotalMem.bytes__val;
+ oss << ", device=" << data->args.hipDeviceTotalMem.device;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipHostGetDevicePointer:
+ oss << "hipHostGetDevicePointer(";
+ if (data->args.hipHostGetDevicePointer.devPtr == NULL) oss << "devPtr=NULL";
+ else oss << "devPtr=" << data->args.hipHostGetDevicePointer.devPtr__val;
+ oss << ", hstPtr=" << data->args.hipHostGetDevicePointer.hstPtr;
+ oss << ", flags=" << data->args.hipHostGetDevicePointer.flags;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipMemRangeGetAttributes:
+ oss << "hipMemRangeGetAttributes(";
+ if (data->args.hipMemRangeGetAttributes.data == NULL) oss << "data=NULL";
+ else oss << "data=" << data->args.hipMemRangeGetAttributes.data__val;
+ if (data->args.hipMemRangeGetAttributes.data_sizes == NULL) oss << ", data_sizes=NULL";
+ else oss << ", data_sizes=" << data->args.hipMemRangeGetAttributes.data_sizes__val;
+ if (data->args.hipMemRangeGetAttributes.attributes == NULL) oss << ", attributes=NULL";
+ else oss << ", attributes=" << data->args.hipMemRangeGetAttributes.attributes__val;
+ oss << ", num_attributes=" << data->args.hipMemRangeGetAttributes.num_attributes;
+ oss << ", dev_ptr=" << data->args.hipMemRangeGetAttributes.dev_ptr;
+ oss << ", count=" << data->args.hipMemRangeGetAttributes.count;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipMemcpyParam2D:
+ oss << "hipMemcpyParam2D(";
+ if (data->args.hipMemcpyParam2D.pCopy == NULL) oss << "pCopy=NULL";
+ else oss << "pCopy=" << data->args.hipMemcpyParam2D.pCopy__val;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipDevicePrimaryCtxReset:
+ oss << "hipDevicePrimaryCtxReset(";
+ oss << "dev=" << data->args.hipDevicePrimaryCtxReset.dev;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipGetMipmappedArrayLevel:
+ oss << "hipGetMipmappedArrayLevel(";
+ if (data->args.hipGetMipmappedArrayLevel.levelArray == NULL) oss << "levelArray=NULL";
+ else oss << "levelArray=" << data->args.hipGetMipmappedArrayLevel.levelArray__val;
+ oss << ", mipmappedArray=" << data->args.hipGetMipmappedArrayLevel.mipmappedArray;
+ oss << ", level=" << data->args.hipGetMipmappedArrayLevel.level;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipMemsetD32Async:
+ oss << "hipMemsetD32Async(";
+ oss << "dst=" << data->args.hipMemsetD32Async.dst;
+ oss << ", value=" << data->args.hipMemsetD32Async.value;
+ oss << ", count=" << data->args.hipMemsetD32Async.count;
+ oss << ", stream=" << data->args.hipMemsetD32Async.stream;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipGetDevice:
+ oss << "hipGetDevice(";
+ if (data->args.hipGetDevice.deviceId == NULL) oss << "deviceId=NULL";
+ else oss << "deviceId=" << data->args.hipGetDevice.deviceId__val;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipGetDeviceCount:
+ oss << "hipGetDeviceCount(";
+ if (data->args.hipGetDeviceCount.count == NULL) oss << "count=NULL";
+ else oss << "count=" << data->args.hipGetDeviceCount.count__val;
+ oss << ")";
+ break;
+ case HIP_API_ID_hipIpcOpenEventHandle:
+ oss << "hipIpcOpenEventHandle(";
+ if (data->args.hipIpcOpenEventHandle.event == NULL) oss << "event=NULL";
+ else oss << "event=" << data->args.hipIpcOpenEventHandle.event__val;
+ oss << ", handle=" << data->args.hipIpcOpenEventHandle.handle;
+ oss << ")";
+ break;
+ default: oss << "unknown";
+ };
+ return strdup(oss.str().c_str());
+}
+#endif // HIP_PROF_HIP_API_STRING
+#endif // _HIP_PROF_STR_H
diff --git a/third_party/rocm/include/hip/hcc_detail/hip_runtime.h b/third_party/rocm/include/hip/hcc_detail/hip_runtime.h
new file mode 100644
index 0000000..5411bb3
--- /dev/null
+++ b/third_party/rocm/include/hip/hcc_detail/hip_runtime.h
@@ -0,0 +1,612 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/**
+ * @file hcc_detail/hip_runtime.h
+ * @brief Contains definitions of APIs for HIP runtime.
+ */
+
+//#pragma once
+#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_RUNTIME_H
+#define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_RUNTIME_H
+
+#include <hip/hcc_detail/hip_common.h>
+
+//---
+// Top part of file can be compiled with any compiler
+
+//#include <cstring>
+#if __cplusplus
+#include <cmath>
+#include <cstdint>
+#else
+#include <math.h>
+#include <string.h>
+#include <stddef.h>
+#endif //__cplusplus
+
+// __hip_malloc is not working. Disable it by default.
+#ifndef __HIP_ENABLE_DEVICE_MALLOC__
+#define __HIP_ENABLE_DEVICE_MALLOC__ 0
+#endif
+
+#if __HCC_OR_HIP_CLANG__
+
+#if __HIP__
+#if !defined(__align__)
+#define __align__(x) __attribute__((aligned(x)))
+#endif
+#endif
+
+#define CUDA_SUCCESS hipSuccess
+
+#include <hip/hip_runtime_api.h>
+#endif // __HCC_OR_HIP_CLANG__
+
+#if __HCC__
+// define HIP_ENABLE_PRINTF to enable printf
+#ifdef HIP_ENABLE_PRINTF
+#define HCC_ENABLE_ACCELERATOR_PRINTF 1
+#endif
+
+//---
+// Remainder of this file only compiles with HCC
+#if defined __HCC__
+#include "grid_launch.h"
+#include "hc_printf.hpp"
+// TODO-HCC-GL - change this to typedef.
+// typedef grid_launch_parm hipLaunchParm ;
+
+#if GENERIC_GRID_LAUNCH == 0
+#define hipLaunchParm grid_launch_parm
+#else
+namespace hip_impl {
+struct Empty_launch_parm {};
+} // namespace hip_impl
+#define hipLaunchParm hip_impl::Empty_launch_parm
+#endif // GENERIC_GRID_LAUNCH
+
+#if defined(GRID_LAUNCH_VERSION) and (GRID_LAUNCH_VERSION >= 20) || GENERIC_GRID_LAUNCH == 1
+#else // Use field names for grid_launch 2.0 structure, if HCC supports GL 2.0.
+#error(HCC must support GRID_LAUNCH_20)
+#endif // GRID_LAUNCH_VERSION
+
+#endif // HCC
+
+#if GENERIC_GRID_LAUNCH == 1 && defined __HCC__
+#include "grid_launch_GGL.hpp"
+#endif // GENERIC_GRID_LAUNCH
+
+#endif // HCC
+
+#if __HCC_OR_HIP_CLANG__
+extern int HIP_TRACE_API;
+
+#ifdef __cplusplus
+#include <hip/hcc_detail/hip_ldg.h>
+#endif
+#include <hip/hcc_detail/hip_atomic.h>
+#include <hip/hcc_detail/host_defines.h>
+#include <hip/hcc_detail/device_functions.h>
+#include <hip/hcc_detail/surface_functions.h>
+#if __HCC__
+ #include <hip/hcc_detail/math_functions.h>
+ #include <hip/hcc_detail/texture_functions.h>
+#else
+ #include <hip/hcc_detail/texture_fetch_functions.h>
+ #include <hip/hcc_detail/texture_indirect_functions.h>
+#endif
+// TODO-HCC remove old definitions ; ~1602 hcc supports __HCC_ACCELERATOR__ define.
+#if defined(__KALMAR_ACCELERATOR__) && !defined(__HCC_ACCELERATOR__)
+#define __HCC_ACCELERATOR__ __KALMAR_ACCELERATOR__
+#endif
+
+// TODO-HCC add a dummy implementation of assert, need to replace with a proper kernel exit call.
+#if defined(__HCC__) && __HIP_DEVICE_COMPILE__ == 1
+#undef assert
+#define assert(COND) \
+ { \
+ if (!(COND)) { \
+ abort(); \
+ } \
+ }
+#endif
+
+
+// Feature tests:
+#if (defined(__HCC_ACCELERATOR__) && (__HCC_ACCELERATOR__ != 0)) || __HIP_DEVICE_COMPILE__
+// Device compile and not host compile:
+
+// 32-bit Atomics:
+#define __HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__ (1)
+#define __HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__ (1)
+#define __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__ (1)
+#define __HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__ (1)
+#define __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__ (1)
+
+// 64-bit Atomics:
+#define __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__ (1)
+#define __HIP_ARCH_HAS_SHARED_INT64_ATOMICS__ (0)
+
+// Doubles
+#define __HIP_ARCH_HAS_DOUBLES__ (1)
+
+// warp cross-lane operations:
+#define __HIP_ARCH_HAS_WARP_VOTE__ (1)
+#define __HIP_ARCH_HAS_WARP_BALLOT__ (1)
+#define __HIP_ARCH_HAS_WARP_SHUFFLE__ (1)
+#define __HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__ (0)
+
+// sync
+#define __HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__ (1)
+#define __HIP_ARCH_HAS_SYNC_THREAD_EXT__ (0)
+
+// misc
+#define __HIP_ARCH_HAS_SURFACE_FUNCS__ (0)
+#define __HIP_ARCH_HAS_3DGRID__ (1)
+#define __HIP_ARCH_HAS_DYNAMIC_PARALLEL__ (0)
+
+#endif /* Device feature flags */
+
+
+#define launch_bounds_impl0(requiredMaxThreadsPerBlock) \
+ __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock)))
+#define launch_bounds_impl1(requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor) \
+ __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock), \
+ amdgpu_waves_per_eu(minBlocksPerMultiprocessor)))
+#define select_impl_(_1, _2, impl_, ...) impl_
+#define __launch_bounds__(...) \
+ select_impl_(__VA_ARGS__, launch_bounds_impl1, launch_bounds_impl0)(__VA_ARGS__)
+
+// Detect if we are compiling C++ mode or C mode
+#if defined(__cplusplus)
+#define __HCC_CPP__
+#elif defined(__STDC_VERSION__)
+#define __HCC_C__
+#endif
+
+__host__ inline void* __get_dynamicgroupbaseptr() { return nullptr; }
+
+#if __HIP_ARCH_GFX701__ == 0
+
+__device__ unsigned __hip_ds_bpermute(int index, unsigned src);
+__device__ float __hip_ds_bpermutef(int index, float src);
+__device__ unsigned __hip_ds_permute(int index, unsigned src);
+__device__ float __hip_ds_permutef(int index, float src);
+
+template <int pattern>
+__device__ unsigned __hip_ds_swizzle_N(unsigned int src);
+template <int pattern>
+__device__ float __hip_ds_swizzlef_N(float src);
+
+template <int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl>
+__device__ int __hip_move_dpp_N(int src);
+
+#endif // __HIP_ARCH_GFX701__ == 0
+
+#endif // __HCC_OR_HIP_CLANG__
+
+#if defined __HCC__
+
+namespace hip_impl {
+ struct GroupId {
+ using R = decltype(hc_get_group_id(0));
+
+ __device__
+ R operator()(std::uint32_t x) const noexcept { return hc_get_group_id(x); }
+ };
+ struct GroupSize {
+ using R = decltype(hc_get_group_size(0));
+
+ __device__
+ R operator()(std::uint32_t x) const noexcept {
+ return hc_get_group_size(x);
+ }
+ };
+ struct NumGroups {
+ using R = decltype(hc_get_num_groups(0));
+
+ __device__
+ R operator()(std::uint32_t x) const noexcept {
+ return hc_get_num_groups(x);
+ }
+ };
+ struct WorkitemId {
+ using R = decltype(hc_get_workitem_id(0));
+
+ __device__
+ R operator()(std::uint32_t x) const noexcept {
+ return hc_get_workitem_id(x);
+ }
+ };
+} // Namespace hip_impl.
+
+template <typename F>
+struct Coordinates {
+ using R = decltype(F{}(0));
+
+ struct X { __device__ operator R() const noexcept { return F{}(0); } };
+ struct Y { __device__ operator R() const noexcept { return F{}(1); } };
+ struct Z { __device__ operator R() const noexcept { return F{}(2); } };
+
+ static constexpr X x{};
+ static constexpr Y y{};
+ static constexpr Z z{};
+};
+
+inline
+__device__
+std::uint32_t operator*(Coordinates<hip_impl::NumGroups>::X,
+ Coordinates<hip_impl::GroupSize>::X) noexcept {
+ return hc_get_grid_size(0);
+}
+inline
+__device__
+std::uint32_t operator*(Coordinates<hip_impl::GroupSize>::X,
+ Coordinates<hip_impl::NumGroups>::X) noexcept {
+ return hc_get_grid_size(0);
+}
+inline
+__device__
+std::uint32_t operator*(Coordinates<hip_impl::NumGroups>::Y,
+ Coordinates<hip_impl::GroupSize>::Y) noexcept {
+ return hc_get_grid_size(1);
+}
+inline
+__device__
+std::uint32_t operator*(Coordinates<hip_impl::GroupSize>::Y,
+ Coordinates<hip_impl::NumGroups>::Y) noexcept {
+ return hc_get_grid_size(1);
+}
+inline
+__device__
+std::uint32_t operator*(Coordinates<hip_impl::NumGroups>::Z,
+ Coordinates<hip_impl::GroupSize>::Z) noexcept {
+ return hc_get_grid_size(2);
+}
+inline
+__device__
+std::uint32_t operator*(Coordinates<hip_impl::GroupSize>::Z,
+ Coordinates<hip_impl::NumGroups>::Z) noexcept {
+ return hc_get_grid_size(2);
+}
+
+static constexpr Coordinates<hip_impl::GroupSize> blockDim{};
+static constexpr Coordinates<hip_impl::GroupId> blockIdx{};
+static constexpr Coordinates<hip_impl::NumGroups> gridDim{};
+static constexpr Coordinates<hip_impl::WorkitemId> threadIdx{};
+
+#define hipThreadIdx_x (hc_get_workitem_id(0))
+#define hipThreadIdx_y (hc_get_workitem_id(1))
+#define hipThreadIdx_z (hc_get_workitem_id(2))
+
+#define hipBlockIdx_x (hc_get_group_id(0))
+#define hipBlockIdx_y (hc_get_group_id(1))
+#define hipBlockIdx_z (hc_get_group_id(2))
+
+#define hipBlockDim_x (hc_get_group_size(0))
+#define hipBlockDim_y (hc_get_group_size(1))
+#define hipBlockDim_z (hc_get_group_size(2))
+
+#define hipGridDim_x (hc_get_num_groups(0))
+#define hipGridDim_y (hc_get_num_groups(1))
+#define hipGridDim_z (hc_get_num_groups(2))
+
+#endif // defined __HCC__
+
+#ifndef __OPENMP_AMDGCN__
+#if __HCC_OR_HIP_CLANG__
+#if !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
+#if __HIP_ENABLE_DEVICE_MALLOC__ // defaults to 0 near the top of this header unless already defined
+extern "C" __device__ void* __hip_malloc(size_t);
+extern "C" __device__ void* __hip_free(void* ptr);
+static inline __device__ void* malloc(size_t size) { return __hip_malloc(size); } // device-side malloc forwards to __hip_malloc
+static inline __device__ void* free(void* ptr) { return __hip_free(ptr); } // returns __hip_free's void* result (the standard C free() returns void)
+#else // device malloc disabled: both stubs trap as soon as they are called
+static inline __device__ void* malloc(size_t size) { __builtin_trap(); return nullptr; } // the return only satisfies the signature
+static inline __device__ void* free(void* ptr) { __builtin_trap(); return nullptr; } // the return only satisfies the signature
+#endif
+#endif // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
+#endif //__HCC_OR_HIP_CLANG__
+#endif // !__OPENMP_AMDGCN__
+
+#ifdef __HCC__
+
+#define __syncthreads() hc_barrier(CLK_LOCAL_MEM_FENCE)
+
+#define HIP_KERNEL_NAME(...) (__VA_ARGS__)
+#define HIP_SYMBOL(X) #X
+
+#if defined __HCC_CPP__
+extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block,
+ grid_launch_parm* lp, const char* kernelNameStr, bool lockAcquired = 0);
+extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, size_t block,
+ grid_launch_parm* lp, const char* kernelNameStr, bool lockAcquired = 0);
+extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, dim3 block,
+ grid_launch_parm* lp, const char* kernelNameStr, bool lockAcquired = 0);
+extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, size_t block,
+ grid_launch_parm* lp, const char* kernelNameStr, bool lockAcquired = 0);
+extern void ihipPostLaunchKernel(const char* kernelName, hipStream_t stream, grid_launch_parm& lp, bool unlockPostponed = 0);
+
+#if GENERIC_GRID_LAUNCH == 0
+//#warning "Original hipLaunchKernel defined"
+// Due to multiple overloaded versions of ihipPreLaunchKernel, the numBlocks3D and blockDim3D can be
+// either size_t or dim3 types
+#define hipLaunchKernel(_kernelName, _numBlocks3D, _blockDim3D, _groupMemBytes, _stream, ...) \
+ do { \
+ grid_launch_parm lp; \
+ lp.dynamic_group_mem_bytes = _groupMemBytes; \
+ hipStream_t trueStream = \
+ (ihipPreLaunchKernel(_stream, _numBlocks3D, _blockDim3D, &lp, #_kernelName)); \
+ _kernelName(lp, ##__VA_ARGS__); \
+ ihipPostLaunchKernel(#_kernelName, trueStream, lp); \
+ } while (0)
+#endif // GENERIC_GRID_LAUNCH
+
+#elif defined(__HCC_C__)
+
+// TODO - develop C interface.
+
+#endif //__HCC_CPP__
+
+// End doxygen API:
+/**
+ * @}
+ */
+
+//
+// hip-clang functions
+//
+#elif defined(__clang__) && defined(__HIP__)
+
+#define HIP_KERNEL_NAME(...) __VA_ARGS__
+#define HIP_SYMBOL(X) X
+
+typedef int hipLaunchParm;
+
+template <std::size_t n, typename... Ts,
+ typename std::enable_if<n == sizeof...(Ts)>::type* = nullptr>
+void pArgs(const std::tuple<Ts...>&, void*) {} // recursion terminator, selected when n == sizeof...(Ts); takes void*, so the void** passed by the recursive overload converts implicitly
+
+template <std::size_t n, typename... Ts,
+ typename std::enable_if<n != sizeof...(Ts)>::type* = nullptr>
+void pArgs(const std::tuple<Ts...>& formals, void** _vargs) { // stores the address of tuple element n in _vargs[n], then recurses with n + 1
+ using T = typename std::tuple_element<n, std::tuple<Ts...> >::type; // type of the n-th tuple element
+
+ static_assert(!std::is_reference<T>{},
+ "A __global__ function cannot have a reference as one of its "
+ "arguments.");
+#if defined(HIP_STRICT)
+ static_assert(std::is_trivially_copyable<T>{},
+ "Only TriviallyCopyable types can be arguments to a __global__ "
+ "function");
+#endif
+ _vargs[n] = const_cast<void*>(reinterpret_cast<const void*>(&std::get<n>(formals))); // pointer refers into `formals`, so `formals` must outlive every use of _vargs
+ return pArgs<n + 1>(formals, _vargs);
+}
+
+template <typename... Formals, typename... Actuals>
+std::tuple<Formals...> validateArgsCountType(void (*kernel)(Formals...), std::tuple<Actuals...>(actuals)) {
+ static_assert(sizeof...(Formals) == sizeof...(Actuals), "Argument Count Mismatch");
+ std::tuple<Formals...> to_formals{std::move(actuals)};
+ return to_formals;
+}
+
+#if defined(HIP_TEMPLATE_KERNEL_LAUNCH)
+template <typename... Args, typename F = void (*)(Args...)> // launches `kernel` with `args` through hipLaunchKernel
+void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
+ std::uint32_t sharedMemBytes, hipStream_t stream, Args... args) {
+ constexpr size_t count = sizeof...(Args); // number of kernel arguments supplied by the caller
+ auto tup_ = std::tuple<Args...>{args...};
+ auto tup = validateArgsCountType(kernel, tup_); // converts the actuals to the kernel's formal parameter types
+ void* _Args[count > 0 ? count : 1]; // never zero-length: ISO C++ forbids zero-size arrays (argument-less kernels)
+ pArgs<0>(tup, _Args); // fills _Args[i] with the address of the i-th element of `tup`
+
+ auto k = reinterpret_cast<void*>(kernel);
+ hipLaunchKernel(k, numBlocks, dimBlocks, _Args, sharedMemBytes, stream);
+}
+#else
+#define hipLaunchKernelGGLInternal(kernelName, numBlocks, numThreads, memPerBlock, streamId, ...) \
+ do { \
+ kernelName<<<(numBlocks), (numThreads), (memPerBlock), (streamId)>>>(__VA_ARGS__); \
+ } while (0)
+
+#define hipLaunchKernelGGL(kernelName, ...) hipLaunchKernelGGLInternal((kernelName), __VA_ARGS__)
+#endif
+
+#include <hip/hip_runtime_api.h>
+extern "C" __device__ __attribute__((const)) size_t __ockl_get_local_id(uint);
+extern "C" __device__ __attribute__((const)) size_t __ockl_get_group_id(uint);
+extern "C" __device__ __attribute__((const)) size_t __ockl_get_local_size(uint);
+extern "C" __device__ __attribute__((const)) size_t __ockl_get_num_groups(uint);
+struct __HIP_BlockIdx { // functor behind `blockIdx`: forwards dimension x to __ockl_get_group_id
+ __device__
+ std::uint32_t operator()(std::uint32_t x) const noexcept { return __ockl_get_group_id(x); } // size_t result is converted to std::uint32_t
+};
+struct __HIP_BlockDim { // functor behind `blockDim`: forwards dimension x to __ockl_get_local_size
+ __device__
+ std::uint32_t operator()(std::uint32_t x) const noexcept {
+ return __ockl_get_local_size(x);
+ }
+};
+struct __HIP_GridDim { // functor behind `gridDim`: forwards dimension x to __ockl_get_num_groups
+ __device__
+ std::uint32_t operator()(std::uint32_t x) const noexcept {
+ return __ockl_get_num_groups(x);
+ }
+};
+struct __HIP_ThreadIdx { // functor behind `threadIdx`: forwards dimension x to __ockl_get_local_id
+ __device__
+ std::uint32_t operator()(std::uint32_t x) const noexcept {
+ return __ockl_get_local_id(x);
+ }
+};
+
+template <typename F>
+struct __HIP_Coordinates { // exposes x/y/z members whose values are produced by functor F each time they are read
+ using R = decltype(F{}(0)); // value type returned by F
+
+ struct X { __device__ operator R() const noexcept { return F{}(0); } }; // empty tag type; converting it to R calls F{}(0)
+ struct Y { __device__ operator R() const noexcept { return F{}(1); } }; // same for dimension 1
+ struct Z { __device__ operator R() const noexcept { return F{}(2); } }; // same for dimension 2
+
+ static constexpr X x{};
+ static constexpr Y y{};
+ static constexpr Z z{};
+#ifdef __cplusplus
+ __device__ operator dim3() const { return dim3(x, y, z); } // packs the three components into a dim3
+#endif
+
+};
+template <typename F>
+#if !defined(_MSC_VER)
+__attribute__((weak))
+#endif
+constexpr typename __HIP_Coordinates<F>::X __HIP_Coordinates<F>::x;
+template <typename F>
+#if !defined(_MSC_VER)
+__attribute__((weak))
+#endif
+constexpr typename __HIP_Coordinates<F>::Y __HIP_Coordinates<F>::y;
+template <typename F>
+#if !defined(_MSC_VER)
+__attribute__((weak))
+#endif
+constexpr typename __HIP_Coordinates<F>::Z __HIP_Coordinates<F>::z;
+
+extern "C" __device__ __attribute__((const)) size_t __ockl_get_global_size(uint);
+inline
+__device__
+std::uint32_t operator*(__HIP_Coordinates<__HIP_GridDim>::X,
+ __HIP_Coordinates<__HIP_BlockDim>::X) noexcept {
+ return __ockl_get_global_size(0);
+}
+inline
+__device__
+std::uint32_t operator*(__HIP_Coordinates<__HIP_BlockDim>::X,
+ __HIP_Coordinates<__HIP_GridDim>::X) noexcept {
+ return __ockl_get_global_size(0);
+}
+inline
+__device__
+std::uint32_t operator*(__HIP_Coordinates<__HIP_GridDim>::Y,
+ __HIP_Coordinates<__HIP_BlockDim>::Y) noexcept {
+ return __ockl_get_global_size(1);
+}
+inline
+__device__
+std::uint32_t operator*(__HIP_Coordinates<__HIP_BlockDim>::Y,
+ __HIP_Coordinates<__HIP_GridDim>::Y) noexcept {
+ return __ockl_get_global_size(1);
+}
+inline
+__device__
+std::uint32_t operator*(__HIP_Coordinates<__HIP_GridDim>::Z,
+ __HIP_Coordinates<__HIP_BlockDim>::Z) noexcept {
+ return __ockl_get_global_size(2);
+}
+inline
+__device__
+std::uint32_t operator*(__HIP_Coordinates<__HIP_BlockDim>::Z,
+ __HIP_Coordinates<__HIP_GridDim>::Z) noexcept {
+ return __ockl_get_global_size(2);
+}
+
+static constexpr __HIP_Coordinates<__HIP_BlockDim> blockDim{};
+static constexpr __HIP_Coordinates<__HIP_BlockIdx> blockIdx{};
+static constexpr __HIP_Coordinates<__HIP_GridDim> gridDim{};
+static constexpr __HIP_Coordinates<__HIP_ThreadIdx> threadIdx{};
+
+extern "C" __device__ __attribute__((const)) size_t __ockl_get_local_id(uint);
+#define hipThreadIdx_x (__ockl_get_local_id(0))
+#define hipThreadIdx_y (__ockl_get_local_id(1))
+#define hipThreadIdx_z (__ockl_get_local_id(2))
+
+extern "C" __device__ __attribute__((const)) size_t __ockl_get_group_id(uint);
+#define hipBlockIdx_x (__ockl_get_group_id(0))
+#define hipBlockIdx_y (__ockl_get_group_id(1))
+#define hipBlockIdx_z (__ockl_get_group_id(2))
+
+extern "C" __device__ __attribute__((const)) size_t __ockl_get_local_size(uint);
+#define hipBlockDim_x (__ockl_get_local_size(0))
+#define hipBlockDim_y (__ockl_get_local_size(1))
+#define hipBlockDim_z (__ockl_get_local_size(2))
+
+extern "C" __device__ __attribute__((const)) size_t __ockl_get_num_groups(uint);
+#define hipGridDim_x (__ockl_get_num_groups(0))
+#define hipGridDim_y (__ockl_get_num_groups(1))
+#define hipGridDim_z (__ockl_get_num_groups(2))
+
+#include <hip/hcc_detail/math_functions.h>
+
+#if __HIP_HCC_COMPAT_MODE__
+// Define HCC work item functions in terms of HIP builtin variables.
+#pragma push_macro("__DEFINE_HCC_FUNC")
+#define __DEFINE_HCC_FUNC(hc_fun,hip_var) \
+inline __device__ __attribute__((always_inline)) uint hc_get_##hc_fun(uint i) { \
+ if (i==0) \
+ return hip_var.x; \
+ else if(i==1) \
+ return hip_var.y; \
+ else \
+ return hip_var.z; \
+}
+
+__DEFINE_HCC_FUNC(workitem_id, threadIdx)
+__DEFINE_HCC_FUNC(group_id, blockIdx)
+__DEFINE_HCC_FUNC(group_size, blockDim)
+__DEFINE_HCC_FUNC(num_groups, gridDim)
+#pragma pop_macro("__DEFINE_HCC_FUNC")
+
+extern "C" __device__ __attribute__((const)) size_t __ockl_get_global_id(uint);
+inline __device__ __attribute__((always_inline)) uint
+hc_get_workitem_absolute_id(int dim)
+{
+ return (uint)__ockl_get_global_id(dim);
+}
+
+#endif
+
+#if !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
+// Support std::complex.
+#if !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__
+#pragma push_macro("__CUDA__")
+#define __CUDA__
+#include <__clang_cuda_math_forward_declares.h>
+#include <__clang_cuda_complex_builtins.h>
+// Workaround for using libc++ with HIP-Clang.
+// The following headers requires clang include path before standard C++ include path.
+// However libc++ include path requires to be before clang include path.
+// To workaround this, we pass -isystem with the parent directory of clang include
+// path instead of the clang include path itself.
+#include <include/cuda_wrappers/algorithm>
+#include <include/cuda_wrappers/complex>
+#include <include/cuda_wrappers/new>
+#undef __CUDA__
+#pragma pop_macro("__CUDA__")
+#endif // !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__
+#endif // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
+#endif // defined(__clang__) && defined(__HIP__)
+
+#include <hip/hcc_detail/hip_memory.h>
+
+#endif // HIP_INCLUDE_HIP_HCC_DETAIL_HIP_RUNTIME_H
diff --git a/third_party/rocm/include/hip/hcc_detail/hip_runtime_api.h b/third_party/rocm/include/hip/hcc_detail/hip_runtime_api.h
new file mode 100644
index 0000000..1980004
--- /dev/null
+++ b/third_party/rocm/include/hip/hcc_detail/hip_runtime_api.h
@@ -0,0 +1,4358 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+//#pragma once
+#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_RUNTIME_API_H
+#define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_RUNTIME_API_H
+/**
+ * @file hcc_detail/hip_runtime_api.h
+ * @brief Contains C function APIs for HIP runtime. This file does not use any HCC builtin or
+ * special language extensions (-hc mode); those functions are in hip_runtime.h.
+ */
+#include <stdint.h>
+#include <stddef.h>
+
+#ifndef GENERIC_GRID_LAUNCH
+#define GENERIC_GRID_LAUNCH 1
+#endif
+
+#ifndef __HIP_ROCclr__
+#define __HIP_ROCclr__ 0
+#endif
+
+#include <hip/hcc_detail/host_defines.h>
+#include <hip/hcc_detail/driver_types.h>
+#include <hip/hcc_detail/hip_texture_types.h>
+#include <hip/hcc_detail/hip_surface_types.h>
+
+#if !__HIP_ROCclr__ && defined(__cplusplus)
+#include <hsa/hsa.h>
+#include <hip/hcc_detail/program_state.hpp>
+#endif
+
+#if defined(_MSC_VER)
+#define DEPRECATED(msg) __declspec(deprecated(msg))
+#else // !defined(_MSC_VER)
+#define DEPRECATED(msg) __attribute__ ((deprecated(msg)))
+#endif // !defined(_MSC_VER)
+
+#define DEPRECATED_MSG "This API is marked as deprecated and may not be supported in future releases. For more details please refer https://github.com/ROCm-Developer-Tools/HIP/blob/master/docs/markdown/hip_deprecated_api_list.md"
+
+#if defined(__HCC__) && (__hcc_workweek__ < 16155)
+#error("This version of HIP requires a newer version of HCC.");
+#endif
+
+#define HIP_LAUNCH_PARAM_BUFFER_POINTER ((void*)0x01)
+#define HIP_LAUNCH_PARAM_BUFFER_SIZE ((void*)0x02)
+#define HIP_LAUNCH_PARAM_END ((void*)0x03)
+
+#ifdef __cplusplus
+ #define __dparm(x) \
+ = x
+#else
+ #define __dparm(x)
+#endif
+
+#ifdef __GNUC__
+#pragma GCC visibility push (default)
+#endif
+
+#ifdef __cplusplus
+
+namespace hip_impl {
+hipError_t hip_init();
+} // namespace hip_impl
+#endif
+
+// Structure definitions:
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//---
+// API-visible structures
+typedef struct ihipCtx_t* hipCtx_t;
+
+// Note many APIs also use integer deviceIds as an alternative to the device pointer:
+typedef int hipDevice_t;
+
+typedef enum hipDeviceP2PAttr {
+ hipDevP2PAttrPerformanceRank = 0,
+ hipDevP2PAttrAccessSupported,
+ hipDevP2PAttrNativeAtomicSupported,
+ hipDevP2PAttrHipArrayAccessSupported
+} hipDeviceP2PAttr;
+
+typedef struct ihipStream_t* hipStream_t;
+
+#define hipIpcMemLazyEnablePeerAccess 0
+
+#define HIP_IPC_HANDLE_SIZE 64
+
+typedef struct hipIpcMemHandle_st {
+ char reserved[HIP_IPC_HANDLE_SIZE];
+} hipIpcMemHandle_t;
+
+#if __HIP_ROCclr__
+// TODO: IPC event handle currently unsupported
+struct ihipIpcEventHandle_t;
+typedef struct ihipIpcEventHandle_t* hipIpcEventHandle_t;
+#else
+typedef struct hipIpcEventHandle_st {
+ char reserved[HIP_IPC_HANDLE_SIZE];
+} hipIpcEventHandle_t;
+#endif
+typedef struct ihipModule_t* hipModule_t;
+
+typedef struct ihipModuleSymbol_t* hipFunction_t;
+
+typedef struct hipFuncAttributes {
+ int binaryVersion;
+ int cacheModeCA;
+ size_t constSizeBytes;
+ size_t localSizeBytes;
+ int maxDynamicSharedSizeBytes;
+ int maxThreadsPerBlock;
+ int numRegs;
+ int preferredShmemCarveout;
+ int ptxVersion;
+ size_t sharedSizeBytes;
+} hipFuncAttributes;
+
+typedef struct ihipEvent_t* hipEvent_t;
+
+enum hipLimit_t {
+ hipLimitMallocHeapSize = 0x02,
+};
+
+/**
+ * @addtogroup GlobalDefs More
+ * @{
+ */
+//! Flags that can be used with hipStreamCreateWithFlags
+#define hipStreamDefault \
+ 0x00 ///< Default stream creation flags. These are used with hipStreamCreate().
+#define hipStreamNonBlocking 0x01 ///< Stream does not implicitly synchronize with null stream
+
+
+//! Flags that can be used with hipEventCreateWithFlags:
+#define hipEventDefault 0x0 ///< Default flags
+#define hipEventBlockingSync \
+ 0x1 ///< Waiting will yield CPU. Power-friendly and usage-friendly but may increase latency.
+#define hipEventDisableTiming \
+ 0x2 ///< Disable event's capability to record timing information. May improve performance.
+#define hipEventInterprocess 0x4 ///< Event can support IPC. @warning - not supported in HIP.
+#define hipEventReleaseToDevice \
+ 0x40000000 ///< Use a device-scope release when recording this event. This flag is useful to
+ ///< obtain more precise timings of commands between events. The flag is a no-op on
+ ///< CUDA platforms.
+#define hipEventReleaseToSystem \
+ 0x80000000 ///< Use a system-scope release when recording this event. This flag is
+ ///< useful to make non-coherent host memory visible to the host. The flag is a
+ ///< no-op on CUDA platforms.
+
+
+//! Flags that can be used with hipHostMalloc
+#define hipHostMallocDefault 0x0
+#define hipHostMallocPortable 0x1 ///< Memory is considered allocated by all contexts.
+#define hipHostMallocMapped \
+ 0x2 ///< Map the allocation into the address space for the current device. The device pointer
+ ///< can be obtained with #hipHostGetDevicePointer.
+#define hipHostMallocWriteCombined 0x4
+#define hipHostMallocNumaUser \
+ 0x20000000 ///< Host memory allocation will follow numa policy set by user
+
+#define hipHostMallocCoherent \
+ 0x40000000 ///< Allocate coherent memory. Overrides HIP_COHERENT_HOST_ALLOC for specific
+ ///< allocation.
+#define hipHostMallocNonCoherent \
+ 0x80000000 ///< Allocate non-coherent memory. Overrides HIP_COHERENT_HOST_ALLOC for specific
+ ///< allocation.
+
+#define hipMemAttachGlobal 0x01 ///< Memory can be accessed by any stream on any device
+#define hipMemAttachHost 0x02 ///< Memory cannot be accessed by any stream on any device
+#define hipMemAttachSingle 0x04 ///< Memory can only be accessed by a single stream on
+ ///< the associated device
+
+#define hipDeviceMallocDefault 0x0
+#define hipDeviceMallocFinegrained 0x1 ///< Memory is allocated in fine grained region of device.
+
+//! Flags that can be used with hipHostRegister
+#define hipHostRegisterDefault 0x0 ///< Memory is Mapped and Portable
+#define hipHostRegisterPortable 0x1 ///< Memory is considered registered by all contexts.
+#define hipHostRegisterMapped \
+ 0x2 ///< Map the allocation into the address space for the current device. The device pointer
+ ///< can be obtained with #hipHostGetDevicePointer.
+#define hipHostRegisterIoMemory 0x4 ///< Not supported.
+#define hipExtHostRegisterCoarseGrained 0x8 ///< Coarse Grained host memory lock
+
+#define hipDeviceScheduleAuto 0x0 ///< Automatically select between Spin and Yield
+#define hipDeviceScheduleSpin \
+ 0x1 ///< Dedicate a CPU core to spin-wait. Provides lowest latency, but burns a CPU core and
+ ///< may consume more power.
+#define hipDeviceScheduleYield \
+ 0x2 ///< Yield the CPU to the operating system when waiting. May increase latency, but lowers
+ ///< power and is friendlier to other threads in the system.
+#define hipDeviceScheduleBlockingSync 0x4
+#define hipDeviceScheduleMask 0x7
+
+#define hipDeviceMapHost 0x8  ///< Allow mapping host memory; always allowed on ROCm, so the flag is ignored.
+#define hipDeviceLmemResizeToMax 0x10  ///< Single-bit flag (0x16 aliased the Yield/BlockingSync schedule bits); ROCm silently ignores it.
+
+#define hipArrayDefault 0x00 ///< Default HIP array allocation flag
+#define hipArrayLayered 0x01
+#define hipArraySurfaceLoadStore 0x02
+#define hipArrayCubemap 0x04
+#define hipArrayTextureGather 0x08
+
+#define hipOccupancyDefault 0x00
+
+#define hipCooperativeLaunchMultiDeviceNoPreSync 0x01
+#define hipCooperativeLaunchMultiDeviceNoPostSync 0x02
+
+#define hipCpuDeviceId ((int)-1)
+#define hipInvalidDeviceId ((int)-2)
+
+// Flags that can be used with hipExtLaunch Set of APIs
+#define hipExtAnyOrderLaunch 0x01 ///< AnyOrderLaunch of kernels
+
+/**
+ * @brief HIP Memory Advise values
+ * @enum
+ * @ingroup Enumerations
+ */
+typedef enum hipMemoryAdvise {
+ hipMemAdviseSetReadMostly = 1, ///< Data will mostly be read and only occasionally
+ ///< be written to
+ hipMemAdviseUnsetReadMostly = 2, ///< Undo the effect of hipMemAdviseSetReadMostly
+ hipMemAdviseSetPreferredLocation = 3, ///< Set the preferred location for the data as
+ ///< the specified device
+ hipMemAdviseUnsetPreferredLocation = 4, ///< Clear the preferred location for the data
+ hipMemAdviseSetAccessedBy = 5, ///< Data will be accessed by the specified device,
+ ///< so prevent page faults as much as possible
+ hipMemAdviseUnsetAccessedBy = 6 ///< Let the Unified Memory subsystem decide on
+ ///< the page faulting policy for the specified device
+} hipMemoryAdvise;
+
+/**
+ * @brief HIP range attributes
+ * @enum
+ * @ingroup Enumerations
+ */
+typedef enum hipMemRangeAttribute {
+ hipMemRangeAttributeReadMostly = 1, ///< Whether the range will mostly be read and
+ ///< only occasionally be written to
+ hipMemRangeAttributePreferredLocation = 2, ///< The preferred location of the range
+ hipMemRangeAttributeAccessedBy = 3, ///< Memory range has hipMemAdviseSetAccessedBy
+ ///< set for specified device
+ hipMemRangeAttributeLastPrefetchLocation = 4,///< The last location to which the range was prefetched
+} hipMemRangeAttribute;
+
+/**
+ * @brief JIT option identifiers, numbered consecutively from 0; hipJitOptionNumOptions is the count.
+ * @enum
+ * @ingroup Enumerations
+ */
+typedef enum hipJitOption {
+ hipJitOptionMaxRegisters = 0,
+ hipJitOptionThreadsPerBlock,
+ hipJitOptionWallTime,
+ hipJitOptionInfoLogBuffer,
+ hipJitOptionInfoLogBufferSizeBytes,
+ hipJitOptionErrorLogBuffer,
+ hipJitOptionErrorLogBufferSizeBytes,
+ hipJitOptionOptimizationLevel,
+ hipJitOptionTargetFromContext,
+ hipJitOptionTarget,
+ hipJitOptionFallbackStrategy,
+ hipJitOptionGenerateDebugInfo,
+ hipJitOptionLogVerbose,
+ hipJitOptionGenerateLineInfo,
+ hipJitOptionCacheMode,
+ hipJitOptionSm3xOpt,
+ hipJitOptionFastCompile,
+ hipJitOptionNumOptions ///< Sentinel: number of options declared above, not a real option.
+} hipJitOption;
+
+/**
+ * @warning On AMD devices and some Nvidia devices, these hints and controls are ignored.
+ */
+typedef enum hipFuncAttribute {
+ hipFuncAttributeMaxDynamicSharedMemorySize = 8,
+ hipFuncAttributePreferredSharedMemoryCarveout = 9,
+ hipFuncAttributeMax ///< Sentinel: implicitly 10, one past the last attribute value.
+} hipFuncAttribute;
+
+/**
+ * Cache preference hints taken by hipDeviceSetCacheConfig and hipFuncSetCacheConfig.
+ * @warning On AMD devices and some Nvidia devices, these hints and controls are ignored. */
+typedef enum hipFuncCache_t {
+ hipFuncCachePreferNone, ///< no preference for shared memory or L1 (default)
+ hipFuncCachePreferShared, ///< prefer larger shared memory and smaller L1 cache
+ hipFuncCachePreferL1, ///< prefer larger L1 cache and smaller shared memory
+ hipFuncCachePreferEqual, ///< prefer equal size L1 cache and shared memory
+} hipFuncCache_t;
+
+/**
+ * @warning On AMD devices and some Nvidia devices, these hints and controls are ignored.
+ */
+typedef enum hipSharedMemConfig {
+ hipSharedMemBankSizeDefault, ///< The compiler selects a device-specific value for the banking.
+ hipSharedMemBankSizeFourByte, ///< Shared mem is banked at 4-byte intervals and performs best
+ ///< when adjacent threads access data 4 bytes apart.
+ hipSharedMemBankSizeEightByte ///< Shared mem is banked at 8-byte intervals and performs best
+ ///< when adjacent threads access data 8 bytes apart.
+} hipSharedMemConfig;
+
+/**
+ * Three-component (x, y, z) unsigned extent; used for the grid and block dimensions of a launch.
+ * When compiled as C++, the constructor defaults every omitted component to 1.
+ */
+typedef struct dim3 {
+ uint32_t x; ///< extent along x
+ uint32_t y; ///< extent along y
+ uint32_t z; ///< extent along z
+#ifdef __cplusplus
+ __host__ __device__ dim3(uint32_t _x = 1, uint32_t _y = 1, uint32_t _z = 1) : x(_x), y(_y), z(_z){};
+#endif
+} dim3;
+
+typedef struct hipLaunchParams_t {
+ void* func; ///< Device function symbol
+ dim3 gridDim; ///< Grid dimensions
+ dim3 blockDim; ///< Block dimensions
+ void **args; ///< Arguments
+ size_t sharedMem; ///< Shared memory
+ hipStream_t stream; ///< Stream identifier
+} hipLaunchParams;
+
+#if __HIP_HAS_GET_PCH
+/**
+ * Internal use only. This API may change in the future
+ * Pre-Compiled header for online compilation
+ *
+ */
+ void __hipGetPCH(const char** pch, unsigned int*size);
+#endif
+
+
+// Doxygen end group GlobalDefs
+/** @} */
+
+
+//-------------------------------------------------------------------------------------------------
+
+
+// The handle allows the async commands to use the stream even if the parent hipStream_t goes
+// out-of-scope.
+// typedef class ihipStream_t * hipStream_t;
+
+
+/*
+ * Opaque structure allows the true event (pointed at by the handle) to remain "live" even if the
+ * surrounding hipEvent_t goes out-of-scope. This is handy for cases where the hipEvent_t goes
+ * out-of-scope but the true event is being written by some async queue or device */
+// typedef struct hipEvent_t {
+// struct ihipEvent_t *_handle;
+//} hipEvent_t;
+
+
+/**
+ * @defgroup API HIP API
+ * @{
+ *
+ * Defines the HIP API. See the individual sections for more information.
+ */
+
+
+/**
+ * @defgroup Driver Initialization and Version
+ * @{
+ * This section describes the initialization and version functions of HIP runtime API.
+ *
+ */
+
+/**
+ * @brief Explicitly initializes the HIP runtime.
+ *
+ * Most HIP APIs implicitly initialize the HIP runtime.
+ * This API provides control over the timing of the initialization.
+ */
+// TODO-ctx - more description on error codes.
+hipError_t hipInit(unsigned int flags);
+
+/**
+ * @brief Returns the approximate HIP driver version.
+ *
+ * @param [out] driverVersion
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ * @warning The HIP feature set does not correspond to an exact CUDA SDK driver revision.
+ * This function always set *driverVersion to 4 as an approximation though HIP supports
+ * some features which were introduced in later CUDA SDK revisions.
+ * HIP apps code should not rely on the driver revision number here and should
+ * use arch feature flags to test device capabilities or conditional compilation.
+ *
+ * @see hipRuntimeGetVersion
+ */
+hipError_t hipDriverGetVersion(int* driverVersion);
+
+/**
+ * @brief Returns the approximate HIP Runtime version.
+ *
+ * @param [out] runtimeVersion
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ * @warning On HIP/HCC path this function returns HIP runtime patch version however on
+ * HIP/NVCC path this function return CUDA runtime version.
+ *
+ * @see hipDriverGetVersion
+ */
+hipError_t hipRuntimeGetVersion(int* runtimeVersion);
+
+
+/**
+ * @brief Returns a handle to a compute device
+ * @param [out] device
+ * @param [in] ordinal
+ *
+ * @returns #hipSuccess, #hipErrorInvalidDevice
+ */
+hipError_t hipDeviceGet(hipDevice_t* device, int ordinal);
+
+/**
+ * @brief Returns the compute capability of the device
+ * @param [out] major
+ * @param [out] minor
+ * @param [in] device
+ *
+ * @returns #hipSuccess, #hipErrorInvalidDevice
+ */
+hipError_t hipDeviceComputeCapability(int* major, int* minor, hipDevice_t device);
+
+/**
+ * @brief Returns an identifier string for the device.
+ * @param [out] name
+ * @param [in] len
+ * @param [in] device
+ *
+ * @returns #hipSuccess, #hipErrorInvalidDevice
+ */
+hipError_t hipDeviceGetName(char* name, int len, hipDevice_t device);
+
+
+/**
+ * @brief Returns a value for attr of link between two devices
+ * @param [out] value
+ * @param [in] attr
+ * @param [in] srcDevice
+ * @param [in] dstDevice
+ *
+ * @returns #hipSuccess, #hipErrorInvalidDevice
+ */
+hipError_t hipDeviceGetP2PAttribute(int* value, hipDeviceP2PAttr attr,
+ int srcDevice, int dstDevice);
+
+/**
+ * @brief Returns a PCI Bus Id string for the device, overloaded to take int device ID.
+ * @param [out] pciBusId
+ * @param [in] len
+ * @param [in] device
+ *
+ * @returns #hipSuccess, #hipErrorInvalidDevice
+ */
+hipError_t hipDeviceGetPCIBusId(char* pciBusId, int len, int device);
+
+
+/**
+ * @brief Returns a handle to a compute device.
+ * @param [out] device handle
+ * @param [in] PCI Bus ID
+ *
+ * @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue
+ */
+hipError_t hipDeviceGetByPCIBusId(int* device, const char* pciBusId);
+
+
+/**
+ * @brief Returns the total amount of memory on the device.
+ * @param [out] bytes
+ * @param [in] device
+ *
+ * @returns #hipSuccess, #hipErrorInvalidDevice
+ */
+hipError_t hipDeviceTotalMem(size_t* bytes, hipDevice_t device);
+
+
+// doxygen end initialization
+/**
+ * @}
+ */
+
+/**
+ * @defgroup Device Device Management
+ * @{
+ * This section describes the device management functions of HIP runtime API.
+ */
+
+/**
+ * @brief Waits on all active streams on current device
+ *
+ * When this command is invoked, the host thread is blocked until all the commands associated
+ * with streams on the device have completed. HIP does not support multiple blocking modes (yet!).
+ *
+ * @returns #hipSuccess
+ *
+ * @see hipSetDevice, hipDeviceReset
+ */
+hipError_t hipDeviceSynchronize(void);
+
+
+/**
+ * @brief The state of current device is discarded and updated to a fresh state.
+ *
+ * Calling this function deletes all streams created, memory allocated, kernels running, events
+ * created. Make sure that no other thread is using the device or streams, memory, kernels, events
+ * associated with the current device.
+ *
+ * @returns #hipSuccess
+ *
+ * @see hipDeviceSynchronize
+ */
+hipError_t hipDeviceReset(void);
+
+
+/**
+ * @brief Set default device to be used for subsequent hip API calls from this thread.
+ *
+ * @param[in] deviceId Valid device in range 0...(hipGetDeviceCount()-1).
+ *
+ * Sets @p deviceId as the default device for the calling host thread. Valid device ids are 0...
+ * (hipGetDeviceCount()-1).
+ *
+ * Many HIP APIs implicitly use the "default device" :
+ *
+ * - Any device memory subsequently allocated from this host thread (using hipMalloc) will be
+ * allocated on device.
+ * - Any streams or events created from this host thread will be associated with device.
+ * - Any kernels launched from this host thread (using hipLaunchKernel) will be executed on device
+ * (unless a specific stream is specified, in which case the device associated with that stream will
+ * be used).
+ *
+ * This function may be called from any host thread. Multiple host threads may use the same device.
+ * This function does no synchronization with the previous or new device, and has very little
+ * runtime overhead. Applications can use hipSetDevice to quickly switch the default device before
+ * making a HIP runtime call which uses the default device.
+ *
+ * The default device is stored in thread-local-storage for each thread.
+ * Thread-pool implementations may inherit the default device of the previous thread. A good
+ * practice is to always call hipSetDevice at the start of HIP coding sequence to establish a known
+ * standard device.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorDeviceAlreadyInUse
+ *
+ * @see hipGetDevice, hipGetDeviceCount
+ */
+hipError_t hipSetDevice(int deviceId);
+
+
+/**
+ * @brief Return the default device id for the calling host thread.
+ *
+ * @param [out] deviceId *deviceId is written with the default device
+ *
+ * HIP maintains a default device for each thread using thread-local-storage.
+ * This device is used implicitly for HIP runtime APIs called by this thread.
+ * hipGetDevice returns in * @p deviceId the default device for the calling host thread.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue
+ *
+ * @see hipSetDevice, hipGetDeviceCount
+ */
+hipError_t hipGetDevice(int* deviceId);
+
+
+/**
+ * @brief Return number of compute-capable devices.
+ *
+ * @param [out] count Returns number of compute-capable devices.
+ *
+ * @returns #hipSuccess, #hipErrorNoDevice
+ *
+ *
+ * Returns in @p *count the number of devices that have ability to run compute commands. If there
+ * are no such devices, then @ref hipGetDeviceCount will return #hipErrorNoDevice. If 1 or more
+ * devices can be found, then hipGetDeviceCount returns #hipSuccess.
+ */
+hipError_t hipGetDeviceCount(int* count);
+
+/**
+ * @brief Query for a specific device attribute.
+ *
+ * @param [out] pi pointer to value to return
+ * @param [in] attr attribute to query
+ * @param [in] deviceId which device to query for information
+ *
+ * @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue
+ */
+hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int deviceId);
+
+/**
+ * @brief Returns device properties.
+ *
+ * @param [out] prop written with device properties
+ * @param [in] deviceId which device to query for information
+ *
+ * @return #hipSuccess, #hipErrorInvalidDevice
+ * @bug HCC always returns 0 for maxThreadsPerMultiProcessor
+ * @bug HCC always returns 0 for regsPerBlock
+ * @bug HCC always returns 0 for l2CacheSize
+ *
+ * Populates hipGetDeviceProperties with information for the specified device.
+ */
+hipError_t hipGetDeviceProperties(hipDeviceProp_t* prop, int deviceId);
+
+
+/**
+ * @brief Set L1/Shared cache partition.
+ *
+ * @param [in] cacheConfig
+ *
+ * @returns #hipSuccess, #hipErrorNotInitialized
+ * Note: AMD devices and some Nvidia GPUS do not support reconfigurable cache. This hint is ignored
+ * on those architectures.
+ *
+ */
+hipError_t hipDeviceSetCacheConfig(hipFuncCache_t cacheConfig);
+
+
+/**
+ * @brief Get L1/Shared cache partition.
+ *
+ * @param [out] cacheConfig
+ *
+ * @returns #hipSuccess, #hipErrorNotInitialized
+ * Note: AMD devices and some Nvidia GPUS do not support reconfigurable cache. This hint is ignored
+ * on those architectures.
+ *
+ */
+hipError_t hipDeviceGetCacheConfig(hipFuncCache_t* cacheConfig);
+
+/**
+ * @brief Get Resource limits of current device
+ *
+ * @param [out] pValue
+ * @param [in] limit
+ *
+ * @returns #hipSuccess, #hipErrorUnsupportedLimit, #hipErrorInvalidValue
+ * Note: Currently, only hipLimitMallocHeapSize is available
+ *
+ */
+hipError_t hipDeviceGetLimit(size_t* pValue, enum hipLimit_t limit);
+
+
+/**
+ * @brief Returns bank width of shared memory for current device
+ *
+ * @param [out] pConfig
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized
+ *
+ * Note: AMD devices and some Nvidia GPUS do not support shared cache banking, and the hint is
+ * ignored on those architectures.
+ *
+ */
+hipError_t hipDeviceGetSharedMemConfig(hipSharedMemConfig* pConfig);
+
+/**
+ * @brief Gets the flags set for current device
+ *
+ * @param [out] flags
+ *
+ * @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue
+ */
+hipError_t hipGetDeviceFlags(unsigned int* flags);
+
+/**
+ * @brief The bank width of shared memory on current device is set
+ *
+ * @param [in] config
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized
+ *
+ * Note: AMD devices and some Nvidia GPUS do not support shared cache banking, and the hint is
+ * ignored on those architectures.
+ *
+ */
+hipError_t hipDeviceSetSharedMemConfig(hipSharedMemConfig config);
+
+/**
+ * @brief The current device behavior is changed according to the flags passed.
+ *
+ * @param [in] flags
+ *
+ * The schedule flags impact how HIP waits for the completion of a command running on a device.
+ * hipDeviceScheduleSpin : HIP runtime will actively spin in the thread which submitted the
+ * work until the command completes. This offers the lowest latency, but will consume a CPU core
+ * and may increase power. hipDeviceScheduleYield : The HIP runtime will yield the CPU to
+ * system so that other tasks can use it. This may increase latency to detect the completion but
+ * will consume less power and is friendlier to other tasks in the system.
+ * hipDeviceScheduleBlockingSync : On ROCm platform, this is a synonym for hipDeviceScheduleYield.
+ * hipDeviceScheduleAuto : Use a heuristic to select between Spin and Yield modes. If the
+ * number of HIP contexts is greater than the number of logical processors in the system, use Spin
+ * scheduling. Else use Yield scheduling.
+ *
+ *
+ * hipDeviceMapHost : Allow mapping host memory. On ROCM, this is always allowed and
+ * the flag is ignored. hipDeviceLmemResizeToMax : @warning ROCm silently ignores this flag.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorSetOnActiveProcess
+ *
+ *
+ */
+hipError_t hipSetDeviceFlags(unsigned flags);
+
+/**
+ * @brief Device which matches hipDeviceProp_t is returned
+ *
+ * @param [out] device ID
+ * @param [in] device properties pointer
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ */
+hipError_t hipChooseDevice(int* device, const hipDeviceProp_t* prop);
+
+/**
+ * @brief Returns the link type and hop count between two devices
+ *
+ * @param [in] device1 Ordinal for device1
+ * @param [in] device2 Ordinal for device2
+ * @param [out] linktype Returns the link type (See hsa_amd_link_info_type_t) between the two devices
+ * @param [out] hopcount Returns the hop count between the two devices
+ *
+ * Queries and returns the HSA link type and the hop count between the two specified devices.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorRuntimeOther
+ */
+hipError_t hipExtGetLinkTypeAndHopCount(int device1, int device2, uint32_t* linktype, uint32_t* hopcount);
+
+
+// TODO: implement IPC apis
+
+/**
+ * @brief Gets an interprocess memory handle for an existing device memory
+ * allocation
+ *
+ * Takes a pointer to the base of an existing device memory allocation created
+ * with hipMalloc and exports it for use in another process. This is a
+ * lightweight operation and may be called multiple times on an allocation
+ * without adverse effects.
+ *
+ * If a region of memory is freed with hipFree and a subsequent call
+ * to hipMalloc returns memory with the same device address,
+ * hipIpcGetMemHandle will return a unique handle for the
+ * new memory.
+ *
+ * @param handle - Pointer to user allocated hipIpcMemHandle to return
+ * the handle in.
+ * @param devPtr - Base pointer to previously allocated device memory
+ *
+ * @returns
+ * hipSuccess,
+ * hipErrorInvalidHandle,
+ * hipErrorOutOfMemory,
+ * hipErrorMapFailed,
+ *
+ */
+hipError_t hipIpcGetMemHandle(hipIpcMemHandle_t* handle, void* devPtr);
+
+/**
+ * @brief Opens an interprocess memory handle exported from another process
+ * and returns a device pointer usable in the local process.
+ *
+ * Maps memory exported from another process with hipIpcGetMemHandle into
+ * the current device address space. For contexts on different devices
+ * hipIpcOpenMemHandle can attempt to enable peer access between the
+ * devices as if the user called hipDeviceEnablePeerAccess. This behavior is
+ * controlled by the hipIpcMemLazyEnablePeerAccess flag.
+ * hipDeviceCanAccessPeer can determine if a mapping is possible.
+ *
+ * Contexts that may open hipIpcMemHandles are restricted in the following way.
+ * hipIpcMemHandles from each device in a given process may only be opened
+ * by one context per device per other process.
+ *
+ * Memory returned from hipIpcOpenMemHandle must be freed with
+ * hipIpcCloseMemHandle.
+ *
+ * Calling hipFree on an exported memory region before calling
+ * hipIpcCloseMemHandle in the importing context will result in undefined
+ * behavior.
+ *
+ * @param devPtr - Returned device pointer
+ * @param handle - hipIpcMemHandle to open
+ * @param flags - Flags for this operation. Must be specified as hipIpcMemLazyEnablePeerAccess
+ *
+ * @returns
+ * hipSuccess,
+ * hipErrorMapFailed,
+ * hipErrorInvalidHandle,
+ * hipErrorTooManyPeers
+ *
+ * @note No guarantees are made about the address returned in @p *devPtr.
+ * In particular, multiple processes may not receive the same address for the same @p handle.
+ *
+ */
+hipError_t hipIpcOpenMemHandle(void** devPtr, hipIpcMemHandle_t handle, unsigned int flags);
+
+/**
+ * @brief Close memory mapped with hipIpcOpenMemHandle
+ *
+ * Unmaps memory returned by hipIpcOpenMemHandle. The original allocation
+ * in the exporting process as well as imported mappings in other processes
+ * will be unaffected.
+ *
+ * Any resources used to enable peer access will be freed if this is the
+ * last mapping using them.
+ *
+ * @param devPtr - Device pointer returned by hipIpcOpenMemHandle
+ *
+ * @returns
+ * hipSuccess,
+ * hipErrorMapFailed,
+ * hipErrorInvalidHandle,
+ *
+ */
+hipError_t hipIpcCloseMemHandle(void* devPtr);
+
+
+hipError_t hipIpcGetEventHandle(hipIpcEventHandle_t* handle, hipEvent_t event);
+hipError_t hipIpcOpenEventHandle(hipEvent_t* event, hipIpcEventHandle_t handle);
+
+// end doxygen Device
+/**
+ * @}
+ */
+
+/**
+ *
+ * @defgroup Execution Execution Control
+ * @{
+ * This section describes the execution control functions of HIP runtime API.
+ *
+ */
+/**
+ * @brief Set attribute for a specific function
+ *
+ * @param [in] func;
+ * @param [in] attr;
+ * @param [in] value;
+ *
+ * @returns #hipSuccess, #hipErrorInvalidDeviceFunction, #hipErrorInvalidValue
+ *
+ * Note: AMD devices and some Nvidia GPUS do not support shared cache banking, and the hint is
+ * ignored on those architectures.
+ *
+ */
+hipError_t hipFuncSetAttribute(const void* func, hipFuncAttribute attr, int value);
+
+/**
+ * @brief Set Cache configuration for a specific function
+ *
+ * @param [in] config;
+ *
+ * @returns #hipSuccess, #hipErrorNotInitialized
+ * Note: AMD devices and some Nvidia GPUS do not support reconfigurable cache. This hint is ignored
+ * on those architectures.
+ *
+ */
+hipError_t hipFuncSetCacheConfig(const void* func, hipFuncCache_t config);
+
+/**
+ * @brief Set shared memory configuration for a specific function
+ *
+ * @param [in] func
+ * @param [in] config
+ *
+ * @returns #hipSuccess, #hipErrorInvalidDeviceFunction, #hipErrorInvalidValue
+ *
+ * Note: AMD devices and some Nvidia GPUS do not support shared cache banking, and the hint is
+ * ignored on those architectures.
+ *
+ */
+hipError_t hipFuncSetSharedMemConfig(const void* func, hipSharedMemConfig config);
+
+//doxygen end execution
+/**
+ * @}
+ */
+
+/**
+ *-------------------------------------------------------------------------------------------------
+ *-------------------------------------------------------------------------------------------------
+ * @defgroup Error Error Handling
+ * @{
+ * This section describes the error handling functions of HIP runtime API.
+ */
+
+/**
+ * @brief Return last error returned by any HIP runtime API call and resets the stored error code to
+ * #hipSuccess
+ *
+ * @returns return code from last HIP called from the active host thread
+ *
+ * Returns the last error that has been returned by any of the runtime calls in the same host
+ * thread, and then resets the saved error to #hipSuccess.
+ *
+ * @see hipGetErrorString, hipGetLastError, hipPeekAtLastError, hipError_t
+ */
+hipError_t hipGetLastError(void);
+
+
+/**
+ * @brief Return last error returned by any HIP runtime API call.
+ *
+ * @return #hipSuccess
+ *
+ * Returns the last error that has been returned by any of the runtime calls in the same host
+ * thread. Unlike hipGetLastError, this function does not reset the saved error code.
+ *
+ * @see hipGetErrorString, hipGetLastError, hipPeekAtLastError, hipError_t
+ */
+hipError_t hipPeekAtLastError(void);
+
+
+/**
+ * @brief Return name of the specified error code in text form.
+ *
+ * @param hip_error Error code to convert to name.
+ * @return const char pointer to the NULL-terminated error name
+ *
+ * @see hipGetErrorString, hipGetLastError, hipPeekAtLastError, hipError_t
+ */
+const char* hipGetErrorName(hipError_t hip_error);
+
+
+/**
+ * @brief Return handy text string message to explain the error which occurred
+ *
+ * @param hipError Error code to convert to string.
+ * @return const char pointer to the NULL-terminated error string
+ *
+ * @warning : on HCC, this function returns the name of the error (same as hipGetErrorName)
+ *
+ * @see hipGetErrorName, hipGetLastError, hipPeekAtLastError, hipError_t
+ */
+const char* hipGetErrorString(hipError_t hipError);
+
+// end doxygen Error
+/**
+ * @}
+ */
+
+
+/**
+ *-------------------------------------------------------------------------------------------------
+ *-------------------------------------------------------------------------------------------------
+ * @defgroup Stream Stream Management
+ * @{
+ * This section describes the stream management functions of HIP runtime API.
+ * The following Stream APIs are not (yet) supported in HIP:
+ * - cudaStreamAttachMemAsync
+ */
+
+
+/**
+ * @brief Create an asynchronous stream.
+ *
+ * @param[in, out] stream Valid pointer to hipStream_t. This function writes the memory with the
+ * newly created stream.
+ * @return #hipSuccess, #hipErrorInvalidValue
+ *
+ * Create a new asynchronous stream. @p stream returns an opaque handle that can be used to
+ * reference the newly created stream in subsequent hipStream* commands. The stream is allocated on
+ * the heap and will remain allocated even if the handle goes out-of-scope. To release the memory
+ * used by the stream, application must call hipStreamDestroy.
+ *
+ * @return #hipSuccess, #hipErrorInvalidValue
+ *
+ * @see hipStreamCreateWithFlags, hipStreamCreateWithPriority, hipStreamSynchronize, hipStreamWaitEvent, hipStreamDestroy
+ */
+hipError_t hipStreamCreate(hipStream_t* stream);
+
+
+/**
+ * @brief Create an asynchronous stream.
+ *
+ * @param[in, out] stream Pointer to new stream
+ * @param[in ] flags to control stream creation.
+ * @return #hipSuccess, #hipErrorInvalidValue
+ *
+ * Create a new asynchronous stream. @p stream returns an opaque handle that can be used to
+ * reference the newly created stream in subsequent hipStream* commands. The stream is allocated on
+ * the heap and will remain allocated even if the handle goes out-of-scope. To release the memory
+ * used by the stream, application must call hipStreamDestroy. Flags controls behavior of the
+ * stream. See #hipStreamDefault, #hipStreamNonBlocking.
+ *
+ *
+ * @see hipStreamCreate, hipStreamCreateWithPriority, hipStreamSynchronize, hipStreamWaitEvent, hipStreamDestroy
+ */
+
+hipError_t hipStreamCreateWithFlags(hipStream_t* stream, unsigned int flags);
+
+
+/**
+ * @brief Create an asynchronous stream with the specified priority.
+ *
+ * @param[in, out] stream Pointer to new stream
+ * @param[in ] flags to control stream creation.
+ * @param[in ] priority of the stream. Lower numbers represent higher priorities.
+ * @return #hipSuccess, #hipErrorInvalidValue
+ *
+ * Create a new asynchronous stream with the specified priority. @p stream returns an opaque handle
+ * that can be used to reference the newly created stream in subsequent hipStream* commands. The
+ * stream is allocated on the heap and will remain allocated even if the handle goes out-of-scope.
+ * To release the memory used by the stream, application must call hipStreamDestroy. Flags controls
+ * behavior of the stream. See #hipStreamDefault, #hipStreamNonBlocking.
+ *
+ *
+ * @see hipStreamCreate, hipStreamSynchronize, hipStreamWaitEvent, hipStreamDestroy
+ */
+
+hipError_t hipStreamCreateWithPriority(hipStream_t* stream, unsigned int flags, int priority);
+
+
+/**
+ * @brief Returns numerical values that correspond to the least and greatest stream priority.
+ *
+ * @param[in, out] leastPriority pointer in which value corresponding to least priority is returned.
+ * @param[in, out] greatestPriority pointer in which value corresponding to greatest priority is returned.
+ *
+ * Returns in *leastPriority and *greatestPriority the numerical values that correspond to the least
+ * and greatest stream priority respectively. Stream priorities follow a convention where lower numbers
+ * imply greater priorities. The range of meaningful stream priorities is given by
+ * [*greatestPriority, *leastPriority]. If the user attempts to create a stream with a priority value
+ * that is outside the meaningful range as specified by this API, the priority is automatically
+ * clamped to within the valid range.
+ */
+
+hipError_t hipDeviceGetStreamPriorityRange(int* leastPriority, int* greatestPriority);
+
+
+/**
+ * @brief Destroys the specified stream.
+ *
+ * @param[in] stream Stream to destroy. The handle is passed by value, so this function does not
+ * write a stream handle back to the caller.
+ * @return #hipSuccess, #hipErrorInvalidHandle
+ *
+ * Destroys the specified stream.
+ *
+ * If commands are still executing on the specified stream, some may complete execution before the
+ * queue is deleted.
+ *
+ * The queue may be destroyed while some commands are still inflight, or may wait for all commands
+ * queued to the stream before destroying it.
+ *
+ * @see hipStreamCreate, hipStreamCreateWithFlags, hipStreamCreateWithPriority, hipStreamQuery, hipStreamWaitEvent,
+ * hipStreamSynchronize
+ */
+hipError_t hipStreamDestroy(hipStream_t stream);
+
+
+/**
+ * @brief Return #hipSuccess if all of the operations in the specified @p stream have completed, or
+ * #hipErrorNotReady if not.
+ *
+ * @param[in] stream stream to query
+ *
+ * @return #hipSuccess, #hipErrorNotReady, #hipErrorInvalidHandle
+ *
+ * This is thread-safe and returns a snapshot of the current state of the queue. However, if other
+ * host threads are sending work to the stream, the status may change immediately after the function
+ * is called. It is typically used for debug.
+ *
+ * @see hipStreamCreate, hipStreamCreateWithFlags, hipStreamCreateWithPriority, hipStreamWaitEvent, hipStreamSynchronize,
+ * hipStreamDestroy
+ */
+hipError_t hipStreamQuery(hipStream_t stream);
+
+
+/**
+ * @brief Wait for all commands in stream to complete.
+ *
+ * @param[in] stream stream identifier.
+ *
+ * @return #hipSuccess, #hipErrorInvalidHandle
+ *
+ * This command is host-synchronous : the host will block until the specified stream is empty.
+ *
+ * This command follows standard null-stream semantics. Specifically, specifying the null stream
+ * will cause the command to wait for other streams on the same device to complete all pending
+ * operations.
+ *
+ * This command honors the hipDeviceLaunchBlocking flag, which controls whether the wait is active
+ * or blocking.
+ *
+ * @see hipStreamCreate, hipStreamCreateWithFlags, hipStreamCreateWithPriority, hipStreamWaitEvent, hipStreamDestroy
+ *
+ */
+hipError_t hipStreamSynchronize(hipStream_t stream);
+
+
+/**
+ * @brief Make the specified compute stream wait for an event
+ *
+ * @param[in] stream stream to make wait.
+ * @param[in] event event to wait on
+ * @param[in] flags control operation [must be 0]
+ *
+ * @return #hipSuccess, #hipErrorInvalidHandle
+ *
+ * This function inserts a wait operation into the specified stream.
+ * All future work submitted to @p stream will wait until @p event reports completion before
+ * beginning execution.
+ *
+ * This function only waits for commands in the current stream to complete. Notably, this function
+ * does not implicitly wait for commands in the default stream to complete, even if the specified
+ * stream is created with hipStreamNonBlocking = 0.
+ *
+ * @see hipStreamCreate, hipStreamCreateWithFlags, hipStreamCreateWithPriority, hipStreamSynchronize, hipStreamDestroy
+ */
+hipError_t hipStreamWaitEvent(hipStream_t stream, hipEvent_t event, unsigned int flags);
+
+
+/**
+ * @brief Return flags associated with this stream.
+ *
+ * @param[in] stream stream to be queried
+ * @param[in,out] flags Pointer to an unsigned integer in which the stream's flags are returned
+ *
+ * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidHandle
+ *
+ * Return flags associated with this stream in *@p flags.
+ *
+ * @see hipStreamCreateWithFlags
+ */
+hipError_t hipStreamGetFlags(hipStream_t stream, unsigned int* flags);
+
+
+/**
+ * @brief Query the priority of a stream.
+ *
+ * @param[in] stream stream to be queried
+ * @param[in,out] priority Pointer to an integer in which the stream's priority is returned
+ *
+ * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidHandle
+ *
+ * Query the priority of a stream. The priority is returned in *@p priority.
+ *
+ * @see hipStreamCreateWithFlags, hipStreamCreateWithPriority
+ */
+hipError_t hipStreamGetPriority(hipStream_t stream, int* priority);
+
+
+/**
+ * @brief Create an asynchronous stream with the specified CU mask.
+ *
+ * @param[in, out] stream Pointer to new stream
+ * @param[in ] cuMaskSize Size of CU mask bit array passed in.
+ * @param[in ] cuMask Bit-vector representing the CU mask. Each active bit represents using one CU.
+ * The first 32 bits represent the first 32 CUs, and so on. If its size is greater than physical
+ * CU number (i.e., multiProcessorCount member of hipDeviceProp_t), the extra elements are ignored.
+ * It is user's responsibility to make sure the input is meaningful.
+ * @return #hipSuccess, #hipErrorInvalidHandle, #hipErrorInvalidValue
+ *
+ * Create a new asynchronous stream with the specified CU mask. @p stream returns an opaque handle
+ * that can be used to reference the newly created stream in subsequent hipStream* commands. The
+ * stream is allocated on the heap and will remain allocated even if the handle goes out-of-scope.
+ * To release the memory used by the stream, application must call hipStreamDestroy.
+ *
+ *
+ * @see hipStreamCreate, hipStreamSynchronize, hipStreamWaitEvent, hipStreamDestroy
+ */
+hipError_t hipExtStreamCreateWithCUMask(hipStream_t* stream, uint32_t cuMaskSize, const uint32_t* cuMask);
+
+
+/**
+ * @brief Get CU mask associated with an asynchronous stream
+ *
+ * @param[in] stream stream to be queried
+ * @param[in] cuMaskSize number of the block of memories (uint32_t *) allocated by user
+ * @param[out] cuMask Pointer to a pre-allocated block of memories (uint32_t *) in which
+ * the stream's CU mask is returned. The CU mask is returned in chunks of 32 bits where
+ * each active bit represents one active CU
+ * @return #hipSuccess, #hipErrorInvalidHandle, #hipErrorInvalidValue
+ *
+ * @see hipStreamCreate, hipStreamSynchronize, hipStreamWaitEvent, hipStreamDestroy
+ */
+hipError_t hipExtStreamGetCUMask(hipStream_t stream, uint32_t cuMaskSize, uint32_t* cuMask);
+
+/**
+ * Stream callback function pointer type.
+ *
+ * A callback of this type receives the stream, a hipError_t status and the opaque
+ * user data pointer. This is the callback type accepted by hipStreamAddCallback.
+ */
+typedef void (*hipStreamCallback_t)(hipStream_t stream, hipError_t status, void* userData);
+
+/**
+ * @brief Adds a callback to be called on the host after all currently enqueued
+ * items in the stream have completed. For each
+ * hipStreamAddCallback call, a callback will be executed exactly once.
+ * The callback will block later work in the stream until it is finished.
+ * @param[in] stream - Stream to add callback to
+ * @param[in] callback - The function to call once preceding stream operations are complete
+ * @param[in] userData - User specified data to be passed to the callback function
+ * @param[in] flags - Reserved for future use, must be 0
+ * @return #hipSuccess, #hipErrorInvalidHandle, #hipErrorNotSupported
+ *
+ * @see hipStreamCreate, hipStreamCreateWithFlags, hipStreamQuery, hipStreamSynchronize,
+ * hipStreamWaitEvent, hipStreamDestroy, hipStreamCreateWithPriority
+ *
+ */
+hipError_t hipStreamAddCallback(hipStream_t stream, hipStreamCallback_t callback, void* userData,
+ unsigned int flags);
+
+
+// end doxygen Stream
+/**
+ * @}
+ */
+
+
+/**
+ *-------------------------------------------------------------------------------------------------
+ *-------------------------------------------------------------------------------------------------
+ * @defgroup Event Event Management
+ * @{
+ * This section describes the event management functions of HIP runtime API.
+ */
+
+/**
+ * @brief Create an event with the specified flags
+ *
+ * @param[in,out] event Returns the newly created event.
+ * @param[in] flags Flags to control event behavior. Valid values are #hipEventDefault,
+ #hipEventBlockingSync, #hipEventDisableTiming, #hipEventInterprocess
+
+ * #hipEventDefault : Default flag. The event will use active synchronization and will support
+ timing. Active synchronization provides lowest possible latency at the expense of dedicating a
+ CPU to poll on the event.
+ * #hipEventBlockingSync : The event will use blocking synchronization : if hipEventSynchronize is
+ called on this event, the thread will block until the event completes. This can increase latency
+ for the synchronization but can result in lower power and more resources for other CPU threads.
+ * #hipEventDisableTiming : Disable recording of timing information.
+
+ * @warning On AMD platform, hipEventInterprocess support is under development. Use of this flag
+ will return an error.
+ *
+ * @returns #hipSuccess, #hipErrorNotInitialized, #hipErrorInvalidValue,
+ #hipErrorLaunchFailure, #hipErrorOutOfMemory
+ *
+ * @see hipEventCreate, hipEventSynchronize, hipEventDestroy, hipEventElapsedTime
+ */
+hipError_t hipEventCreateWithFlags(hipEvent_t* event, unsigned flags);
+
+
+/**
+ * Create an event
+ *
+ * @param[in,out] event Returns the newly created event.
+ *
+ * @returns #hipSuccess, #hipErrorNotInitialized, #hipErrorInvalidValue,
+ * #hipErrorLaunchFailure, #hipErrorOutOfMemory
+ *
+ * @see hipEventCreateWithFlags, hipEventRecord, hipEventQuery, hipEventSynchronize,
+ * hipEventDestroy, hipEventElapsedTime
+ */
+hipError_t hipEventCreate(hipEvent_t* event);
+
+
+/**
+ * @brief Record an event in the specified stream.
+ *
+ * @param[in] event event to record.
+ * @param[in] stream stream in which to record event.
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized,
+ * #hipErrorInvalidHandle, #hipErrorLaunchFailure
+ *
+ * hipEventQuery() or hipEventSynchronize() must be used to determine when the event
+ * transitions from "recording" (after hipEventRecord() is called) to "recorded"
+ * (when timestamps are set, if requested).
+ *
+ * Events which are recorded in a non-NULL stream will transition
+ * from "recording" to "recorded" state when they reach the head of
+ * the specified stream, after all previous
+ * commands in that stream have completed executing.
+ *
+ * If hipEventRecord() has been previously called on this event, then this call will overwrite any
+ * existing state in event.
+ *
+ * If this function is called on an event that is currently being recorded, results are undefined
+ * - either outstanding recording may save state into the event, and the order is not guaranteed.
+ *
+ * @see hipEventCreate, hipEventCreateWithFlags, hipEventQuery, hipEventSynchronize,
+ * hipEventDestroy, hipEventElapsedTime
+ *
+ */
+#ifdef __cplusplus
+hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream = NULL);
+#else
+hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream);
+#endif
+
+/**
+ * @brief Destroy the specified event.
+ *
+ * @param[in] event Event to destroy.
+ * @returns #hipSuccess, #hipErrorNotInitialized, #hipErrorInvalidValue,
+ * #hipErrorLaunchFailure
+ *
+ * Releases memory associated with the event. If the event is recording but has not completed
+ * recording when hipEventDestroy() is called, the function will return immediately and the
+ * completion_future resources will be released later, when the hipDevice is synchronized.
+ *
+ * @see hipEventCreate, hipEventCreateWithFlags, hipEventQuery, hipEventSynchronize, hipEventRecord,
+ * hipEventElapsedTime
+ */
+hipError_t hipEventDestroy(hipEvent_t event);
+
+
+/**
+ * @brief Wait for an event to complete.
+ *
+ * This function will block until the event is ready, waiting for all previous work in the stream
+ * specified when event was recorded with hipEventRecord().
+ *
+ * If hipEventRecord() has not been called on @p event, this function returns immediately.
+ *
+ * TODO-hip- This function needs to support hipEventBlockingSync parameter.
+ *
+ * @param[in] event Event on which to wait.
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized,
+ * #hipErrorInvalidHandle, #hipErrorLaunchFailure
+ *
+ * @see hipEventCreate, hipEventCreateWithFlags, hipEventQuery, hipEventDestroy, hipEventRecord,
+ * hipEventElapsedTime
+ */
+hipError_t hipEventSynchronize(hipEvent_t event);
+
+
+/**
+ * @brief Return the elapsed time between two events.
+ *
+ * @param[out] ms : Return time between start and stop in ms.
+ * @param[in] start : Start event.
+ * @param[in] stop : Stop event.
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorNotReady, #hipErrorInvalidHandle,
+ * #hipErrorNotInitialized, #hipErrorLaunchFailure
+ *
+ * Computes the elapsed time between two events. Time is computed in ms, with
+ * a resolution of approximately 1 us.
+ *
+ * Events which are recorded in a NULL stream will block until all commands
+ * on all other streams complete execution, and then record the timestamp.
+ *
+ * Events which are recorded in a non-NULL stream will record their timestamp
+ * when they reach the head of the specified stream, after all previous
+ * commands in that stream have completed executing. Thus the time that
+ * the event recorded may be significantly after the host calls hipEventRecord().
+ *
+ * If hipEventRecord() has not been called on either event, then #hipErrorInvalidHandle is
+ * returned. If hipEventRecord() has been called on both events, but the timestamp has not yet been
+ * recorded on one or both events (that is, hipEventQuery() would return #hipErrorNotReady on at
+ * least one of the events), then #hipErrorNotReady is returned.
+ *
+ * @see hipEventCreate, hipEventCreateWithFlags, hipEventQuery, hipEventDestroy, hipEventRecord,
+ * hipEventSynchronize
+ */
+hipError_t hipEventElapsedTime(float* ms, hipEvent_t start, hipEvent_t stop);
+
+
+/**
+ * @brief Query event status
+ *
+ * @param[in] event Event to query.
+ * @returns #hipSuccess, #hipErrorNotReady, #hipErrorInvalidHandle, #hipErrorInvalidValue,
+ * #hipErrorNotInitialized, #hipErrorLaunchFailure
+ *
+ * Query the status of the specified event. This function will return #hipSuccess if all
+ * commands in the appropriate stream (specified to hipEventRecord()) have completed, or if
+ * hipEventRecord() was not called on the event. If that work has not completed, then
+ * #hipErrorNotReady is returned.
+ *
+ * @see hipEventCreate, hipEventCreateWithFlags, hipEventRecord, hipEventDestroy,
+ * hipEventSynchronize, hipEventElapsedTime
+ */
+hipError_t hipEventQuery(hipEvent_t event);
+
+
+// end doxygen Events
+/**
+ * @}
+ */
+
+
+/**
+ *-------------------------------------------------------------------------------------------------
+ *-------------------------------------------------------------------------------------------------
+ * @defgroup Memory Memory Management
+ * @{
+ * This section describes the memory management functions of HIP runtime API.
+ * The following CUDA APIs are not currently supported:
+ * - cudaMalloc3D
+ * - cudaMalloc3DArray
+ * - TODO - more 2D, 3D, array APIs here.
+ *
+ *
+ */
+
+/**
+ * @brief Return attributes for the specified pointer
+ *
+ * @param[out] attributes attributes for the specified pointer
+ * @param[in] ptr pointer to get attributes for
+ *
+ * @return #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue
+ *
+ * @see hipGetDeviceCount, hipGetDevice, hipSetDevice, hipChooseDevice
+ */
+hipError_t hipPointerGetAttributes(hipPointerAttribute_t* attributes, const void* ptr);
+
+/**
+ * @brief Allocate memory on the default accelerator
+ *
+ * @param[out] ptr Pointer to the allocated memory
+ * @param[in] size Requested memory size
+ *
+ * If size is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is returned.
+ *
+ * @return #hipSuccess, #hipErrorOutOfMemory, #hipErrorInvalidValue (bad context, null *ptr)
+ *
+ * @see hipMallocPitch, hipFree, hipMallocArray, hipFreeArray, hipMalloc3D, hipMalloc3DArray,
+ * hipHostFree, hipHostMalloc
+ */
+hipError_t hipMalloc(void** ptr, size_t size);
+
+/**
+ * @brief Allocate memory on the default accelerator
+ *
+ * @param[out] ptr Pointer to the allocated memory
+ * @param[in] sizeBytes Requested memory size
+ * @param[in] flags Type of memory allocation
+ *
+ * If sizeBytes is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is returned.
+ *
+ * @return #hipSuccess, #hipErrorOutOfMemory, #hipErrorInvalidValue (bad context, null *ptr)
+ *
+ * @see hipMallocPitch, hipFree, hipMallocArray, hipFreeArray, hipMalloc3D, hipMalloc3DArray,
+ * hipHostFree, hipHostMalloc
+ */
+hipError_t hipExtMallocWithFlags(void** ptr, size_t sizeBytes, unsigned int flags);
+
+/**
+ * @brief Allocate pinned host memory [Deprecated]
+ *
+ * @param[out] ptr Pointer to the allocated host pinned memory
+ * @param[in] size Requested memory size
+ *
+ * If size is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is returned.
+ *
+ * @return #hipSuccess, #hipErrorOutOfMemory
+ *
+ * @deprecated use hipHostMalloc() instead
+ */
+DEPRECATED("use hipHostMalloc instead")
+hipError_t hipMallocHost(void** ptr, size_t size);
+
+/**
+ * @brief Allocate pinned host memory [Deprecated]
+ *
+ * @param[out] ptr Pointer to the allocated host pinned memory
+ * @param[in] size Requested memory size
+ *
+ * If size is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is returned.
+ *
+ * @return #hipSuccess, #hipErrorOutOfMemory
+ *
+ * @deprecated use hipHostMalloc() instead
+ */
+DEPRECATED("use hipHostMalloc instead")
+hipError_t hipMemAllocHost(void** ptr, size_t size);
+
+/**
+ * @brief Allocate device accessible page locked host memory
+ *
+ * @param[out] ptr Pointer to the allocated host pinned memory
+ * @param[in] size Requested memory size
+ * @param[in] flags Type of host memory allocation
+ *
+ * If size is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is returned.
+ *
+ * @return #hipSuccess, #hipErrorOutOfMemory
+ *
+ * @see hipSetDeviceFlags, hipHostFree
+ */
+hipError_t hipHostMalloc(void** ptr, size_t size, unsigned int flags);
+
+/**
+ *-------------------------------------------------------------------------------------------------
+ *-------------------------------------------------------------------------------------------------
+ * @addtogroup MemoryM Managed Memory (ROCm HMM)
+ * @{
+ * @ingroup Memory
+ * This section describes the managed memory management functions of HIP runtime API.
+ *
+ */
+
+/**
+ * @brief Allocates memory that will be automatically managed by AMD HMM.
+ *
+ * @param [out] dev_ptr - pointer to allocated device memory
+ * @param [in] size - requested allocation size in bytes
+ * @param [in] flags - must be either hipMemAttachGlobal or hipMemAttachHost
+ * (defaults to hipMemAttachGlobal)
+ *
+ * @returns #hipSuccess, #hipErrorMemoryAllocation, #hipErrorNotSupported, #hipErrorInvalidValue
+ */
+hipError_t hipMallocManaged(void** dev_ptr,
+ size_t size,
+ unsigned int flags __dparm(hipMemAttachGlobal));
+
+/**
+ * @brief Prefetches memory to the specified destination device using AMD HMM.
+ *
+ * @param [in] dev_ptr pointer to be prefetched
+ * @param [in] count size in bytes for prefetching
+ * @param [in] device destination device to prefetch to
+ * @param [in] stream stream to enqueue prefetch operation
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ */
+hipError_t hipMemPrefetchAsync(const void* dev_ptr,
+ size_t count,
+ int device,
+ hipStream_t stream __dparm(0));
+
+/**
+ * @brief Advise about the usage of a given memory range to AMD HMM.
+ *
+ * @param [in] dev_ptr pointer to memory to set the advice for
+ * @param [in] count size in bytes of the memory range
+ * @param [in] advice advice to be applied for the specified memory range
+ * @param [in] device device to apply the advice for
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ */
+hipError_t hipMemAdvise(const void* dev_ptr,
+ size_t count,
+ hipMemoryAdvise advice,
+ int device);
+
+/**
+ * @brief Query an attribute of a given memory range in AMD HMM.
+ *
+ * @param [in,out] data a pointer to a memory location where the result of each
+ * attribute query will be written to
+ * @param [in] data_size the size of data
+ * @param [in] attribute the attribute to query
+ * @param [in] dev_ptr start of the range to query
+ * @param [in] count size of the range to query
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ */
+hipError_t hipMemRangeGetAttribute(void* data,
+ size_t data_size,
+ hipMemRangeAttribute attribute,
+ const void* dev_ptr,
+ size_t count);
+
+/**
+ * @brief Query attributes of a given memory range in AMD HMM.
+ *
+ * @param [in,out] data a two-dimensional array containing pointers to memory locations
+ * where the result of each attribute query will be written to
+ * @param [in] data_sizes an array, containing the sizes of each result
+ * @param [in] attributes an array of attributes to query (num_attributes and the number
+ * of attributes in this array should match)
+ * @param [in] num_attributes the number of attributes to query
+ * @param [in] dev_ptr start of the range to query
+ * @param [in] count size of the range to query
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ */
+hipError_t hipMemRangeGetAttributes(void** data,
+ size_t* data_sizes,
+ hipMemRangeAttribute* attributes,
+ size_t num_attributes,
+ const void* dev_ptr,
+ size_t count);
+
+/**
+ * @brief Attach memory to a stream asynchronously in AMD HMM.
+ *
+ * @param [in] stream - stream in which to enqueue the attach operation
+ * @param [in] dev_ptr - pointer to memory (must be a pointer to managed memory or
+ * to a valid host-accessible region of system-allocated memory)
+ * @param [in] length - length of memory (defaults to zero)
+ * @param [in] flags - must be one of hipMemAttachGlobal, hipMemAttachHost or
+ * hipMemAttachSingle (defaults to hipMemAttachSingle)
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ */
+hipError_t hipStreamAttachMemAsync(hipStream_t stream,
+ hipDeviceptr_t* dev_ptr,
+ size_t length __dparm(0),
+ unsigned int flags __dparm(hipMemAttachSingle));
+
+// end doxygen Managed Memory
+/**
+ * @}
+ */
+
+/**
+ * @brief Allocate device accessible page locked host memory [Deprecated]
+ *
+ * @param[out] ptr Pointer to the allocated host pinned memory
+ * @param[in] size Requested memory size
+ * @param[in] flags Type of host memory allocation
+ *
+ * If size is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is returned.
+ *
+ * @return #hipSuccess, #hipErrorOutOfMemory
+ *
+ * @deprecated use hipHostMalloc() instead
+ */
+DEPRECATED("use hipHostMalloc instead")
+hipError_t hipHostAlloc(void** ptr, size_t size, unsigned int flags);
+
+/**
+ * @brief Get Device pointer from Host Pointer allocated through hipHostMalloc
+ *
+ * @param[out] devPtr Device Pointer mapped to passed host pointer
+ * @param[in] hstPtr Host Pointer allocated through hipHostMalloc
+ * @param[in] flags Flags to be passed for extension
+ *
+ * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorOutOfMemory
+ *
+ * @see hipSetDeviceFlags, hipHostMalloc
+ */
+hipError_t hipHostGetDevicePointer(void** devPtr, void* hstPtr, unsigned int flags);
+
+/**
+ * @brief Return flags associated with host pointer
+ *
+ * @param[out] flagsPtr Memory location to store flags
+ * @param[in] hostPtr Host Pointer allocated through hipHostMalloc
+ * @return #hipSuccess, #hipErrorInvalidValue
+ *
+ * @see hipHostMalloc
+ */
+hipError_t hipHostGetFlags(unsigned int* flagsPtr, void* hostPtr);
+
+/**
+ * @brief Register host memory so it can be accessed from the current device.
+ *
+ * @param[in] hostPtr Pointer to host memory to be registered.
+ * @param[in] sizeBytes size of the host memory
+ * @param[in] flags See below.
+ *
+ * Flags:
+ * - #hipHostRegisterDefault Memory is Mapped and Portable
+ * - #hipHostRegisterPortable Memory is considered registered by all contexts. HIP only supports
+ * one context so this is always assumed true.
+ * - #hipHostRegisterMapped Map the allocation into the address space for the current device.
+ * The device pointer can be obtained with #hipHostGetDevicePointer.
+ *
+ *
+ * After registering the memory, use #hipHostGetDevicePointer to obtain the mapped device pointer.
+ * On many systems, the mapped device pointer will have a different value than the mapped host
+ * pointer. Applications must use the device pointer in device code, and the host pointer in host
+ * code.
+ *
+ * On some systems, registered memory is pinned. On some systems, registered memory may not
+ * actually be pinned but uses OS or hardware facilities to allow GPU access to the host memory.
+ *
+ * Developers are strongly encouraged to register memory blocks which are aligned to the host
+ * cache-line size (typically 64 bytes, but this can be obtained from the CPUID instruction).
+ *
+ * If registering non-aligned pointers, the application must take care when registering pointers
+ * from the same cache line on different devices. HIP's coarse-grained synchronization model does
+ * not guarantee correct results if different devices write to different parts of the same cache
+ * block - typically one of the writes will "win" and overwrite data from the other registered
+ * memory region.
+ *
+ * @return #hipSuccess, #hipErrorOutOfMemory
+ *
+ * @see hipHostUnregister, hipHostGetFlags, hipHostGetDevicePointer
+ */
+hipError_t hipHostRegister(void* hostPtr, size_t sizeBytes, unsigned int flags);
+
+/**
+ * @brief Un-register host pointer
+ *
+ * @param[in] hostPtr Host pointer previously registered with #hipHostRegister
+ * @return Error code
+ *
+ * @see hipHostRegister
+ */
+hipError_t hipHostUnregister(void* hostPtr);
+
+/**
+ * Allocates at least width (in bytes) * height bytes of linear memory.
+ * Padding may occur to ensure alignment requirements are met for the given row.
+ * The change in width size due to padding will be returned in *pitch.
+ * Currently the alignment is set to 128 bytes.
+ *
+ * @param[out] ptr Pointer to the allocated device memory
+ * @param[out] pitch Pitch for allocation (in bytes)
+ * @param[in] width Requested pitched allocation width (in bytes)
+ * @param[in] height Requested pitched allocation height
+ *
+ * If size is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is returned.
+ *
+ * @return Error code
+ *
+ * @see hipMalloc, hipFree, hipMallocArray, hipFreeArray, hipHostFree, hipMalloc3D,
+ * hipMalloc3DArray, hipHostMalloc
+ */
+
+hipError_t hipMallocPitch(void** ptr, size_t* pitch, size_t width, size_t height);
+
+/**
+ * Allocates at least widthInBytes * height bytes of linear memory.
+ * Padding may occur to ensure alignment requirements are met for the given row.
+ * The change in width size due to padding will be returned in *pitch.
+ * Currently the alignment is set to 128 bytes.
+ *
+ * @param[out] dptr Pointer to the allocated device memory
+ * @param[out] pitch Pitch for allocation (in bytes)
+ * @param[in] widthInBytes Requested pitched allocation width (in bytes)
+ * @param[in] height Requested pitched allocation height
+ * @param[in] elementSizeBytes Element size in bytes. NOTE(review): how this value affects the
+ * allocation is not described here - confirm against the implementation.
+ *
+ * If size is 0, no memory is allocated, *dptr returns nullptr, and hipSuccess is returned.
+ * The intended usage of pitch is as a separate parameter of the allocation, used to compute addresses within the 2D array.
+ * Given the row and column of an array element of type T, the address is computed as:
+ * T* pElement = (T*)((char*)BaseAddress + Row * Pitch) + Column;
+ *
+ * @return Error code
+ *
+ * @see hipMalloc, hipFree, hipMallocArray, hipFreeArray, hipHostFree, hipMalloc3D,
+ * hipMalloc3DArray, hipHostMalloc
+ */
+
+hipError_t hipMemAllocPitch(hipDeviceptr_t* dptr, size_t* pitch, size_t widthInBytes, size_t height, unsigned int elementSizeBytes);
+
+/**
+ * @brief Free memory allocated by the hcc hip memory allocation API.
+ * This API performs an implicit hipDeviceSynchronize() call.
+ * If pointer is NULL, the hip runtime is initialized and hipSuccess is returned.
+ *
+ * @param[in] ptr Pointer to memory to be freed
+ * @return #hipSuccess
+ * @return #hipErrorInvalidDevicePointer (if pointer is invalid, including host pointers allocated
+ * with hipHostMalloc)
+ *
+ * @see hipMalloc, hipMallocPitch, hipMallocArray, hipFreeArray, hipHostFree, hipMalloc3D,
+ * hipMalloc3DArray, hipHostMalloc
+ */
+hipError_t hipFree(void* ptr);
+
+/**
+ * @brief Free memory allocated by the hcc hip host memory allocation API. [Deprecated]
+ *
+ * @param[in] ptr Pointer to memory to be freed
+ * @return #hipSuccess,
+ * #hipErrorInvalidValue (if pointer is invalid, including device pointers allocated with
+ * hipMalloc)
+ *
+ * @deprecated use hipHostFree() instead
+ */
+DEPRECATED("use hipHostFree instead")
+hipError_t hipFreeHost(void* ptr);
+
+/**
+ * @brief Free memory allocated by the hcc hip host memory allocation API
+ * This API performs an implicit hipDeviceSynchronize() call.
+ * If pointer is NULL, the hip runtime is initialized and hipSuccess is returned.
+ *
+ * @param[in] ptr Pointer to memory to be freed
+ * @return #hipSuccess,
+ * #hipErrorInvalidValue (if pointer is invalid, including device pointers allocated with
+ * hipMalloc)
+ *
+ * @see hipMalloc, hipMallocPitch, hipFree, hipMallocArray, hipFreeArray, hipMalloc3D,
+ * hipMalloc3DArray, hipHostMalloc
+ */
+hipError_t hipHostFree(void* ptr);
+
+/**
+ * @brief Copy data from src to dst.
+ *
+ * It supports memory from host to device,
+ * device to host, device to device and host to host
+ * The src and dst must not overlap.
+ *
+ * For hipMemcpy, the copy is always performed by the current device (set by hipSetDevice).
+ * For multi-gpu or peer-to-peer configurations, it is recommended to set the current device to the
+ * device where the src data is physically located. For optimal peer-to-peer copies, the copy device
+ * must be able to access the src and dst pointers (by calling hipDeviceEnablePeerAccess with copy
+ * agent as the current device and src/dest as the peerDevice argument). If this is not done, the
+ * hipMemcpy will still work, but will perform the copy using a staging buffer on the host.
+ * Calling hipMemcpy with dst and src pointers that do not match the hipMemcpyKind results in
+ * undefined behavior.
+ *
+ * @param[out] dst Data being copy to
+ * @param[in] src Data being copy from
+ * @param[in] sizeBytes Data size in bytes
+ * @param[in] kind Memory copy type
+ * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorMemoryFree, #hipErrorUnknown
+ *
+ * @see hipArrayCreate, hipArrayDestroy, hipArrayGetDescriptor, hipMemAlloc, hipMemAllocHost,
+ * hipMemAllocPitch, hipMemcpy2D, hipMemcpy2DAsync, hipMemcpy2DUnaligned, hipMemcpyAtoA,
+ * hipMemcpyAtoD, hipMemcpyAtoH, hipMemcpyAtoHAsync, hipMemcpyDtoA, hipMemcpyDtoD,
+ * hipMemcpyDtoDAsync, hipMemcpyDtoH, hipMemcpyDtoHAsync, hipMemcpyHtoA, hipMemcpyHtoAAsync,
+ * hipMemcpyHtoDAsync, hipMemFree, hipMemFreeHost, hipMemGetAddressRange, hipMemGetInfo,
+ * hipMemHostAlloc, hipMemHostGetDevicePointer
+ */
+hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind);
+
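+// Variant of hipMemcpy that additionally takes a stream argument.
+// NOTE(review): whether this call blocks the host until the copy completes is not
+// documented here - confirm against the runtime implementation.
+// TODO: Add full Doxygen description.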
+hipError_t hipMemcpyWithStream(void* dst, const void* src, size_t sizeBytes,
+ hipMemcpyKind kind, hipStream_t stream);
+/**
+ * @brief Copy data from Host to Device
+ *
+ * @param[out] dst Data being copy to
+ * @param[in] src Data being copy from
+ * @param[in] sizeBytes Data size in bytes
+ *
+ * @return #hipSuccess, #hipErrorDeInitialized, #hipErrorNotInitialized, #hipErrorInvalidContext,
+ * #hipErrorInvalidValue
+ *
+ * @see hipArrayCreate, hipArrayDestroy, hipArrayGetDescriptor, hipMemAlloc, hipMemAllocHost,
+ * hipMemAllocPitch, hipMemcpy2D, hipMemcpy2DAsync, hipMemcpy2DUnaligned, hipMemcpyAtoA,
+ * hipMemcpyAtoD, hipMemcpyAtoH, hipMemcpyAtoHAsync, hipMemcpyDtoA, hipMemcpyDtoD,
+ * hipMemcpyDtoDAsync, hipMemcpyDtoH, hipMemcpyDtoHAsync, hipMemcpyHtoA, hipMemcpyHtoAAsync,
+ * hipMemcpyHtoDAsync, hipMemFree, hipMemFreeHost, hipMemGetAddressRange, hipMemGetInfo,
+ * hipMemHostAlloc, hipMemHostGetDevicePointer
+ */
+hipError_t hipMemcpyHtoD(hipDeviceptr_t dst, void* src, size_t sizeBytes);
+
+/**
+ * @brief Copy data from Device to Host
+ *
+ * @param[out] dst Data being copy to
+ * @param[in] src Data being copy from
+ * @param[in] sizeBytes Data size in bytes
+ *
+ * @return #hipSuccess, #hipErrorDeInitialized, #hipErrorNotInitialized, #hipErrorInvalidContext,
+ * #hipErrorInvalidValue
+ *
+ * @see hipArrayCreate, hipArrayDestroy, hipArrayGetDescriptor, hipMemAlloc, hipMemAllocHost,
+ * hipMemAllocPitch, hipMemcpy2D, hipMemcpy2DAsync, hipMemcpy2DUnaligned, hipMemcpyAtoA,
+ * hipMemcpyAtoD, hipMemcpyAtoH, hipMemcpyAtoHAsync, hipMemcpyDtoA, hipMemcpyDtoD,
+ * hipMemcpyDtoDAsync, hipMemcpyDtoH, hipMemcpyDtoHAsync, hipMemcpyHtoA, hipMemcpyHtoAAsync,
+ * hipMemcpyHtoDAsync, hipMemFree, hipMemFreeHost, hipMemGetAddressRange, hipMemGetInfo,
+ * hipMemHostAlloc, hipMemHostGetDevicePointer
+ */
+hipError_t hipMemcpyDtoH(void* dst, hipDeviceptr_t src, size_t sizeBytes);
+
+/**
+ * @brief Copy data from Device to Device
+ *
+ * @param[out] dst Data being copy to
+ * @param[in] src Data being copy from
+ * @param[in] sizeBytes Data size in bytes
+ *
+ * @return #hipSuccess, #hipErrorDeInitialized, #hipErrorNotInitialized, #hipErrorInvalidContext,
+ * #hipErrorInvalidValue
+ *
+ * @see hipArrayCreate, hipArrayDestroy, hipArrayGetDescriptor, hipMemAlloc, hipMemAllocHost,
+ * hipMemAllocPitch, hipMemcpy2D, hipMemcpy2DAsync, hipMemcpy2DUnaligned, hipMemcpyAtoA,
+ * hipMemcpyAtoD, hipMemcpyAtoH, hipMemcpyAtoHAsync, hipMemcpyDtoA, hipMemcpyDtoD,
+ * hipMemcpyDtoDAsync, hipMemcpyDtoH, hipMemcpyDtoHAsync, hipMemcpyHtoA, hipMemcpyHtoAAsync,
+ * hipMemcpyHtoDAsync, hipMemFree, hipMemFreeHost, hipMemGetAddressRange, hipMemGetInfo,
+ * hipMemHostAlloc, hipMemHostGetDevicePointer
+ */
+hipError_t hipMemcpyDtoD(hipDeviceptr_t dst, hipDeviceptr_t src, size_t sizeBytes);
+
+/**
+ * @brief Copy data from Host to Device asynchronously
+ *
+ * @param[out] dst Data being copy to
+ * @param[in] src Data being copy from
+ * @param[in] sizeBytes Data size in bytes
+ * @param[in] stream Stream identifier
+ *
+ * @return #hipSuccess, #hipErrorDeInitialized, #hipErrorNotInitialized, #hipErrorInvalidContext,
+ * #hipErrorInvalidValue
+ *
+ * @see hipArrayCreate, hipArrayDestroy, hipArrayGetDescriptor, hipMemAlloc, hipMemAllocHost,
+ * hipMemAllocPitch, hipMemcpy2D, hipMemcpy2DAsync, hipMemcpy2DUnaligned, hipMemcpyAtoA,
+ * hipMemcpyAtoD, hipMemcpyAtoH, hipMemcpyAtoHAsync, hipMemcpyDtoA, hipMemcpyDtoD,
+ * hipMemcpyDtoDAsync, hipMemcpyDtoH, hipMemcpyDtoHAsync, hipMemcpyHtoA, hipMemcpyHtoAAsync,
+ * hipMemcpyHtoDAsync, hipMemFree, hipMemFreeHost, hipMemGetAddressRange, hipMemGetInfo,
+ * hipMemHostAlloc, hipMemHostGetDevicePointer
+ */
+hipError_t hipMemcpyHtoDAsync(hipDeviceptr_t dst, void* src, size_t sizeBytes, hipStream_t stream);
+
+/**
+ * @brief Copy data from Device to Host asynchronously
+ *
+ * @param[out] dst Data being copy to
+ * @param[in] src Data being copy from
+ * @param[in] sizeBytes Data size in bytes
+ * @param[in] stream Stream identifier
+ *
+ * @return #hipSuccess, #hipErrorDeInitialized, #hipErrorNotInitialized, #hipErrorInvalidContext,
+ * #hipErrorInvalidValue
+ *
+ * @see hipArrayCreate, hipArrayDestroy, hipArrayGetDescriptor, hipMemAlloc, hipMemAllocHost,
+ * hipMemAllocPitch, hipMemcpy2D, hipMemcpy2DAsync, hipMemcpy2DUnaligned, hipMemcpyAtoA,
+ * hipMemcpyAtoD, hipMemcpyAtoH, hipMemcpyAtoHAsync, hipMemcpyDtoA, hipMemcpyDtoD,
+ * hipMemcpyDtoDAsync, hipMemcpyDtoH, hipMemcpyDtoHAsync, hipMemcpyHtoA, hipMemcpyHtoAAsync,
+ * hipMemcpyHtoDAsync, hipMemFree, hipMemFreeHost, hipMemGetAddressRange, hipMemGetInfo,
+ * hipMemHostAlloc, hipMemHostGetDevicePointer
+ */
+hipError_t hipMemcpyDtoHAsync(void* dst, hipDeviceptr_t src, size_t sizeBytes, hipStream_t stream);
+
+/**
+ * @brief Copy data from Device to Device asynchronously
+ *
+ * @param[out] dst Data being copied to
+ * @param[in] src Data being copied from
+ * @param[in] sizeBytes Data size in bytes
+ * @param[in] stream Stream on which the copy is enqueued
+ * @return #hipSuccess, #hipErrorDeInitialized, #hipErrorNotInitialized, #hipErrorInvalidContext,
+ * #hipErrorInvalidValue
+ *
+ * @see hipArrayCreate, hipArrayDestroy, hipArrayGetDescriptor, hipMemAlloc, hipMemAllocHost,
+ * hipMemAllocPitch, hipMemcpy2D, hipMemcpy2DAsync, hipMemcpy2DUnaligned, hipMemcpyAtoA,
+ * hipMemcpyAtoD, hipMemcpyAtoH, hipMemcpyAtoHAsync, hipMemcpyDtoA, hipMemcpyDtoD,
+ * hipMemcpyDtoDAsync, hipMemcpyDtoH, hipMemcpyDtoHAsync, hipMemcpyHtoA, hipMemcpyHtoAAsync,
+ * hipMemcpyHtoDAsync, hipMemFree, hipMemFreeHost, hipMemGetAddressRange, hipMemGetInfo,
+ * hipMemHostAlloc, hipMemHostGetDevicePointer
+ */
+hipError_t hipMemcpyDtoDAsync(hipDeviceptr_t dst, hipDeviceptr_t src, size_t sizeBytes,
+ hipStream_t stream);
+
+#if __HIP_ROCclr__
+hipError_t hipModuleGetGlobal(hipDeviceptr_t* dptr, size_t* bytes,
+ hipModule_t hmod, const char* name);
+
+hipError_t hipGetSymbolAddress(void** devPtr, const void* symbol);
+hipError_t hipGetSymbolSize(size_t* size, const void* symbol);
+hipError_t hipMemcpyToSymbol(const void* symbol, const void* src,
+ size_t sizeBytes, size_t offset __dparm(0),
+ hipMemcpyKind kind __dparm(hipMemcpyHostToDevice));
+hipError_t hipMemcpyToSymbolAsync(const void* symbol, const void* src,
+ size_t sizeBytes, size_t offset,
+ hipMemcpyKind kind, hipStream_t stream __dparm(0));
+hipError_t hipMemcpyFromSymbol(void* dst, const void* symbol,
+ size_t sizeBytes, size_t offset __dparm(0),
+ hipMemcpyKind kind __dparm(hipMemcpyDeviceToHost));
+hipError_t hipMemcpyFromSymbolAsync(void* dst, const void* symbol,
+ size_t sizeBytes, size_t offset,
+ hipMemcpyKind kind,
+ hipStream_t stream __dparm(0));
+#else
+hipError_t hipModuleGetGlobal(void**, size_t*, hipModule_t, const char*);
+
+#ifdef __cplusplus //Start : Not supported in gcc
+namespace hip_impl {
+inline
+__attribute__((visibility("hidden")))
+hipError_t read_agent_global_from_process(hipDeviceptr_t* dptr, size_t* bytes,
+ const char* name);
+} // Namespace hip_impl.
+
+
+/**
+ * @brief Stores in @p devPtr the device address of the symbol named by @p symbolName
+ *
+ * @param[out] devPtr - Pointer to a pointer to the memory referred to by the symbol
+ * @param[in] symbolName - Symbol on device
+ * @return #hipSuccess, #hipErrorNotInitialized, #hipErrorNotFound
+ *
+ * @see hipGetSymbolSize, hipMemcpyToSymbol, hipMemcpyFromSymbol, hipMemcpyToSymbolAsync,
+ * hipMemcpyFromSymbolAsync
+ */
+inline __attribute__((visibility("hidden")))
+hipError_t hipGetSymbolAddress(void** devPtr, const void* symbolName) {
+    hip_impl::hip_init();
+    // The size out-parameter is required by the helper but unused here.
+    size_t unusedSize = 0;
+    const char* symbolText = static_cast<const char*>(symbolName);
+    return hip_impl::read_agent_global_from_process(devPtr, &unusedSize, symbolText);
+}
+
+
+/**
+ * @brief Stores in @p size the size of the symbol named by @p symbolName
+ *
+ * @param[out] size - Pointer to the size of the symbol
+ * @param[in] symbolName - Symbol on device
+ * @return #hipSuccess, #hipErrorNotInitialized, #hipErrorNotFound
+ *
+ * @see hipGetSymbolAddress, hipMemcpyToSymbol, hipMemcpyFromSymbol, hipMemcpyToSymbolAsync,
+ * hipMemcpyFromSymbolAsync
+ */
+inline __attribute__((visibility("hidden")))
+hipError_t hipGetSymbolSize(size_t* size, const void* symbolName) {
+    hip_impl::hip_init();
+    // The address out-parameter is required by the helper but unused here.
+    void* unusedAddr = nullptr;
+    const char* symbolText = static_cast<const char*>(symbolName);
+    return hip_impl::read_agent_global_from_process(&unusedAddr, size, symbolText);
+}
+#endif // End : Not supported in gcc
+
+#if defined(__cplusplus)
+} // extern "C"
+#endif
+
+#ifdef __cplusplus
+namespace hip_impl {
+hipError_t hipMemcpyToSymbol(void*, const void*, size_t, size_t, hipMemcpyKind,
+ const char*);
+} // Namespace hip_impl.
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/**
+ * @brief Copies @p sizeBytes bytes from the memory area pointed to by @p src to the memory area
+ * located @p offset bytes from the start of symbol @p symbolName.
+ *
+ * The memory areas may not overlap. Symbol can either be a variable that resides in global or
+ * constant memory space, or it can be a character string, naming a variable that resides in global
+ * or constant memory space. Kind can be either hipMemcpyHostToDevice or hipMemcpyDeviceToDevice.
+ * TODO: cudaErrorInvalidSymbol and cudaErrorInvalidMemcpyDirection are not supported, use
+ * hipErrorUnknown for now.
+ *
+ * @param[in] symbolName - Symbol destination on device
+ * @param[in] src - Data being copied from
+ * @param[in] sizeBytes - Data size in bytes
+ * @param[in] offset - Offset from start of symbol in bytes
+ * @param[in] kind - Type of transfer
+ * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidSymbol, #hipErrorMemoryFree, #hipErrorUnknown
+ *
+ * @see hipMemcpy, hipMemcpy2D, hipMemcpyToArray, hipMemcpy2DToArray, hipMemcpyFromArray,
+ * hipMemcpy2DFromArray, hipMemcpyArrayToArray, hipMemcpy2DArrayToArray, hipMemcpyFromSymbol,
+ * hipMemcpyAsync, hipMemcpy2DAsync, hipMemcpyToArrayAsync, hipMemcpy2DToArrayAsync,
+ * hipMemcpyFromArrayAsync, hipMemcpy2DFromArrayAsync, hipMemcpyToSymbolAsync,
+ * hipMemcpyFromSymbolAsync
+ */
+#ifdef __cplusplus
+inline __attribute__((visibility("hidden")))
+hipError_t hipMemcpyToSymbol(const void* symbolName, const void* src,
+                             size_t sizeBytes, size_t offset __dparm(0),
+                             hipMemcpyKind kind __dparm(hipMemcpyHostToDevice)) {
+    if (symbolName == nullptr) {
+        return hipErrorInvalidSymbol;
+    }
+    const char* symbolText = static_cast<const char*>(symbolName);
+    hipDeviceptr_t symbolAddr = nullptr;
+    // The lookup status is not checked; the resolved address is handed to hip_impl as-is.
+    hipGetSymbolAddress(&symbolAddr, symbolText);
+    return hip_impl::hipMemcpyToSymbol(symbolAddr, src, sizeBytes, offset, kind, symbolText);
+}
+#endif
+
+#if defined(__cplusplus)
+} // extern "C"
+#endif
+
+#ifdef __cplusplus
+namespace hip_impl {
+hipError_t hipMemcpyToSymbolAsync(void*, const void*, size_t, size_t,
+ hipMemcpyKind, hipStream_t, const char*);
+hipError_t hipMemcpyFromSymbol(void*, const void*, size_t, size_t,
+ hipMemcpyKind, const char*);
+hipError_t hipMemcpyFromSymbolAsync(void*, const void*, size_t, size_t,
+ hipMemcpyKind, hipStream_t, const char*);
+} // Namespace hip_impl.
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/**
+ * @brief Copies @p sizeBytes bytes from the memory area pointed to by @p src to the memory area
+ * located @p offset bytes from the start of symbol @p symbolName.
+ *
+ * The memory areas may not overlap. Symbol can either be a variable that resides in global or
+ * constant memory space, or it can be a character string, naming a variable that resides in global
+ * or constant memory space. Kind can be either hipMemcpyHostToDevice or hipMemcpyDeviceToDevice.
+ * The call is asynchronous with respect to the host, so it may return before the copy is complete.
+ * TODO: cudaErrorInvalidSymbol and cudaErrorInvalidMemcpyDirection are not supported, use
+ * hipErrorUnknown for now.
+ *
+ * @param[in] symbolName - Symbol destination on device
+ * @param[in] src - Data being copied from
+ * @param[in] sizeBytes - Data size in bytes
+ * @param[in] offset - Offset from start of symbol in bytes
+ * @param[in] kind - Type of transfer
+ * @param[in] stream - Stream identifier
+ * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidSymbol, #hipErrorMemoryFree, #hipErrorUnknown
+ *
+ * @see hipMemcpy, hipMemcpy2D, hipMemcpyToArray, hipMemcpy2DToArray, hipMemcpyFromArray,
+ * hipMemcpy2DFromArray, hipMemcpyArrayToArray, hipMemcpy2DArrayToArray, hipMemcpyFromSymbol,
+ * hipMemcpyAsync, hipMemcpy2DAsync, hipMemcpyToArrayAsync, hipMemcpy2DToArrayAsync,
+ * hipMemcpyFromArrayAsync, hipMemcpy2DFromArrayAsync, hipMemcpyToSymbolAsync,
+ * hipMemcpyFromSymbolAsync
+ */
+
+#ifdef __cplusplus //Start : Not supported in gcc
+inline __attribute__((visibility("hidden")))
+hipError_t hipMemcpyToSymbolAsync(const void* symbolName, const void* src,
+                                  size_t sizeBytes, size_t offset,
+                                  hipMemcpyKind kind, hipStream_t stream __dparm(0)) {
+    if (symbolName == nullptr) {
+        return hipErrorInvalidSymbol;
+    }
+    const char* symbolText = static_cast<const char*>(symbolName);
+    hipDeviceptr_t symbolAddr = nullptr;
+    // The lookup status is not checked; the resolved address is handed to hip_impl as-is.
+    hipGetSymbolAddress(&symbolAddr, symbolName);
+    return hip_impl::hipMemcpyToSymbolAsync(symbolAddr, src, sizeBytes, offset, kind,
+                                            stream, symbolText);
+}
+
+// Resolves symbolName to a device address (status unchecked) and forwards the copy to hip_impl.
+inline __attribute__((visibility("hidden")))
+hipError_t hipMemcpyFromSymbol(void* dst, const void* symbolName,
+                               size_t sizeBytes, size_t offset __dparm(0),
+                               hipMemcpyKind kind __dparm(hipMemcpyDeviceToHost)) {
+    if (symbolName == nullptr) {
+        return hipErrorInvalidSymbol;
+    }
+    const char* symbolText = static_cast<const char*>(symbolName);
+    hipDeviceptr_t symbolAddr = nullptr;
+    hipGetSymbolAddress(&symbolAddr, symbolName);
+    return hip_impl::hipMemcpyFromSymbol(dst, symbolAddr, sizeBytes, offset, kind, symbolText);
+}
+
+// Resolves symbolName to a device address (status unchecked) and forwards the copy to hip_impl.
+inline __attribute__((visibility("hidden")))
+hipError_t hipMemcpyFromSymbolAsync(void* dst, const void* symbolName,
+                                    size_t sizeBytes, size_t offset,
+                                    hipMemcpyKind kind,
+                                    hipStream_t stream __dparm(0)) {
+    if (symbolName == nullptr) {
+        return hipErrorInvalidSymbol;
+    }
+    const char* symbolText = static_cast<const char*>(symbolName);
+    hipDeviceptr_t symbolAddr = nullptr;
+    hipGetSymbolAddress(&symbolAddr, symbolName);
+    return hip_impl::hipMemcpyFromSymbolAsync(dst, symbolAddr, sizeBytes, offset, kind,
+                                              stream, symbolText);
+}
+#endif // End : Not supported in gcc
+
+#endif // __HIP_ROCclr__
+/**
+ * @brief Copy data from src to dst asynchronously.
+ *
+ * @warning If host or dest are not pinned, the memory copy will be performed synchronously. For
+ * best performance, use hipHostMalloc to allocate host memory that is transferred asynchronously.
+ *
+ * @warning on HCC hipMemcpyAsync does not support overlapped H2D and D2H copies.
+ * For hipMemcpy, the copy is always performed by the device associated with the specified stream.
+ *
+ * For multi-gpu or peer-to-peer configurations, it is recommended to use a stream which is
+ * attached to the device where the src data is physically located. For optimal peer-to-peer copies,
+ * the copy device must be able to access the src and dst pointers (by calling
+ * hipDeviceEnablePeerAccess with copy agent as the current device and src/dest as the peerDevice
+ * argument). If this is not done, the hipMemcpy will still work, but will perform the copy using a
+ * staging buffer on the host.
+ * @param[out] dst Data being copied to
+ * @param[in] src Data being copied from
+ * @param[in] sizeBytes Data size in bytes
+ * @param[in] kind Type of transfer
+ * @param[in] stream Stream on which the copy is enqueued
+ * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorMemoryFree, #hipErrorUnknown
+ *
+ * @see hipMemcpy, hipMemcpy2D, hipMemcpyToArray, hipMemcpy2DToArray, hipMemcpyFromArray,
+ * hipMemcpy2DFromArray, hipMemcpyArrayToArray, hipMemcpy2DArrayToArray, hipMemcpyToSymbol,
+ * hipMemcpyFromSymbol, hipMemcpy2DAsync, hipMemcpyToArrayAsync, hipMemcpy2DToArrayAsync,
+ * hipMemcpyFromArrayAsync, hipMemcpy2DFromArrayAsync, hipMemcpyToSymbolAsync,
+ * hipMemcpyFromSymbolAsync
+ */
+hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind,
+ hipStream_t stream __dparm(0));
+
+/**
+ * @brief Fills the first sizeBytes bytes of the memory area pointed to by dst with the constant
+ * byte value @p value.
+ *
+ * @param[out] dst Data being filled
+ * @param[in] value Constant value to be set
+ * @param[in] sizeBytes Data size in bytes
+ * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized
+ */
+hipError_t hipMemset(void* dst, int value, size_t sizeBytes);
+
+/**
+ * @brief Fills the first count bytes of the memory area pointed to by dest with the constant
+ * byte value @p value.
+ *
+ * @param[out] dest Data ptr to be filled
+ * @param[in] value Constant value to be set
+ * @param[in] count Number of values to be set
+ * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized
+ */
+hipError_t hipMemsetD8(hipDeviceptr_t dest, unsigned char value, size_t count);
+
+/**
+ * @brief Fills the first count bytes of the memory area pointed to by dest with the constant
+ * byte value @p value.
+ *
+ * hipMemsetD8Async() is asynchronous with respect to the host, so the call may return before the
+ * memset is complete. The operation can optionally be associated to a stream by passing a non-zero
+ * stream argument. If stream is non-zero, the operation may overlap with operations in other
+ * streams.
+ *
+ * @param[out] dest Data ptr to be filled
+ * @param[in] value Constant value to be set
+ * @param[in] count Number of values to be set
+ * @param[in] stream - Stream identifier
+ * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized
+ */
+hipError_t hipMemsetD8Async(hipDeviceptr_t dest, unsigned char value, size_t count, hipStream_t stream __dparm(0));
+
+/**
+ * @brief Fills the first count short values of the memory area pointed to by dest with the
+ * constant short value @p value.
+ *
+ * @param[out] dest Data ptr to be filled
+ * @param[in] value Constant value to be set
+ * @param[in] count Number of values to be set
+ * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized
+ */
+hipError_t hipMemsetD16(hipDeviceptr_t dest, unsigned short value, size_t count);
+
+/**
+ * @brief Fills the first count short values of the memory area pointed to by dest with the
+ * constant short value @p value.
+ *
+ * hipMemsetD16Async() is asynchronous with respect to the host, so the call may return before the
+ * memset is complete. The operation can optionally be associated to a stream by passing a non-zero
+ * stream argument. If stream is non-zero, the operation may overlap with operations in other
+ * streams.
+ *
+ * @param[out] dest Data ptr to be filled
+ * @param[in] value Constant value to be set
+ * @param[in] count Number of values to be set
+ * @param[in] stream - Stream identifier
+ * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized
+ */
+hipError_t hipMemsetD16Async(hipDeviceptr_t dest, unsigned short value, size_t count, hipStream_t stream __dparm(0));
+
+/**
+ * @brief Fills the memory area pointed to by dest with the constant integer
+ * value the specified number of times.
+ *
+ * @param[out] dest Data being filled
+ * @param[in] value Constant value to be set
+ * @param[in] count Number of values to be set
+ * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized
+ */
+hipError_t hipMemsetD32(hipDeviceptr_t dest, int value, size_t count);
+
+/**
+ * @brief Fills the first sizeBytes bytes of the memory area pointed to by dst with the constant
+ * byte value @p value.
+ *
+ * hipMemsetAsync() is asynchronous with respect to the host, so the call may return before the
+ * memset is complete. The operation can optionally be associated to a stream by passing a non-zero
+ * stream argument. If stream is non-zero, the operation may overlap with operations in other
+ * streams.
+ *
+ * @param[out] dst Pointer to device memory
+ * @param[in] value - Value to set for each byte of specified memory
+ * @param[in] sizeBytes - Size in bytes to set
+ * @param[in] stream - Stream identifier
+ * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorMemoryFree
+ */
+hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t stream __dparm(0));
+
+/**
+ * @brief Fills the memory area pointed to by dst with the constant integer
+ * value the specified number of times.
+ *
+ * hipMemsetD32Async() is asynchronous with respect to the host, so the call may return before the
+ * memset is complete. The operation can optionally be associated to a stream by passing a non-zero
+ * stream argument. If stream is non-zero, the operation may overlap with operations in other
+ * streams.
+ *
+ * @param[out] dst Pointer to device memory
+ * @param[in] value - Integer value to be set
+ * @param[in] count - number of values to be set
+ * @param[in] stream - Stream identifier
+ * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorMemoryFree
+ */
+hipError_t hipMemsetD32Async(hipDeviceptr_t dst, int value, size_t count,
+ hipStream_t stream __dparm(0));
+
+/**
+ * @brief Fills the memory area pointed to by dst with the constant value.
+ *
+ * @param[out] dst Pointer to device memory
+ * @param[in] pitch - data size in bytes
+ * @param[in] value - constant value to be set
+ * @param[in] width
+ * @param[in] height
+ * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorMemoryFree
+ */
+
+hipError_t hipMemset2D(void* dst, size_t pitch, int value, size_t width, size_t height);
+
+/**
+ * @brief Fills asynchronously the memory area pointed to by dst with the constant value.
+ *
+ * @param[out] dst Pointer to device memory
+ * @param[in] pitch - data size in bytes
+ * @param[in] value - constant value to be set
+ * @param[in] width
+ * @param[in] height
+ * @param[in] stream
+ * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorMemoryFree
+ */
+
+hipError_t hipMemset2DAsync(void* dst, size_t pitch, int value, size_t width, size_t height,hipStream_t stream __dparm(0));
+
+/**
+ * @brief Fills synchronously the memory area pointed to by pitchedDevPtr with the constant value.
+ *
+ * @param[in] pitchedDevPtr
+ * @param[in] value - constant value to be set
+ * @param[in] extent
+ * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorMemoryFree
+ */
+hipError_t hipMemset3D(hipPitchedPtr pitchedDevPtr, int value, hipExtent extent );
+
+/**
+ * @brief Fills asynchronously the memory area pointed to by pitchedDevPtr with the constant value.
+ *
+ * @param[in] pitchedDevPtr
+ * @param[in] value - constant value to be set
+ * @param[in] extent
+ * @param[in] stream
+ * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorMemoryFree
+ */
+hipError_t hipMemset3DAsync(hipPitchedPtr pitchedDevPtr, int value, hipExtent extent ,hipStream_t stream __dparm(0));
+
+/**
+ * @brief Query memory info.
+ * Return snapshot of free memory, and total allocatable memory on the device.
+ *
+ * Returns in *free a snapshot of the current free memory.
+ * @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue
+ * @warning On HCC, the free memory only accounts for memory allocated by this process and may be
+ *optimistic.
+ **/
+hipError_t hipMemGetInfo(size_t* free, size_t* total);
+
+
+hipError_t hipMemPtrGetInfo(void* ptr, size_t* size);
+
+
+/**
+ * @brief Allocate an array on the device.
+ *
+ * @param[out] array Pointer to allocated array in device memory
+ * @param[in] desc Requested channel format
+ * @param[in] width Requested array allocation width
+ * @param[in] height Requested array allocation height
+ * @param[in] flags Requested properties of allocated array
+ * @return #hipSuccess, #hipErrorOutOfMemory
+ *
+ * @see hipMalloc, hipMallocPitch, hipFree, hipFreeArray, hipHostMalloc, hipHostFree
+ */
+hipError_t hipMallocArray(hipArray** array, const hipChannelFormatDesc* desc, size_t width,
+ size_t height __dparm(0), unsigned int flags __dparm(hipArrayDefault));
+hipError_t hipArrayCreate(hipArray** pHandle, const HIP_ARRAY_DESCRIPTOR* pAllocateArray);
+
+hipError_t hipArray3DCreate(hipArray** array, const HIP_ARRAY3D_DESCRIPTOR* pAllocateArray);
+
+hipError_t hipMalloc3D(hipPitchedPtr* pitchedDevPtr, hipExtent extent);
+
+/**
+ * @brief Frees an array on the device.
+ *
+ * @param[in] array Pointer to array to free
+ * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized
+ *
+ * @see hipMalloc, hipMallocPitch, hipFree, hipMallocArray, hipHostMalloc, hipHostFree
+ */
+hipError_t hipFreeArray(hipArray* array);
+
+/**
+ * @brief Frees a mipmapped array on the device
+ *
+ * @param[in] mipmappedArray - Pointer to mipmapped array to free
+ *
+ * @return #hipSuccess, #hipErrorInvalidValue
+ */
+hipError_t hipFreeMipmappedArray(hipMipmappedArray_t mipmappedArray);
+
+/**
+ * @brief Allocate an array on the device.
+ *
+ * @param[out] array Pointer to allocated array in device memory
+ * @param[in] desc Requested channel format
+ * @param[in] extent Requested array allocation width, height and depth
+ * @param[in] flags Requested properties of allocated array
+ * @return #hipSuccess, #hipErrorOutOfMemory
+ *
+ * @see hipMalloc, hipMallocPitch, hipFree, hipFreeArray, hipHostMalloc, hipHostFree
+ */
+
+hipError_t hipMalloc3DArray(hipArray** array, const struct hipChannelFormatDesc* desc,
+ struct hipExtent extent, unsigned int flags);
+
+/**
+ * @brief Allocate a mipmapped array on the device
+ *
+ * @param[out] mipmappedArray - Pointer to allocated mipmapped array in device memory
+ * @param[in] desc - Requested channel format
+ * @param[in] extent - Requested allocation size (width field in elements)
+ * @param[in] numLevels - Number of mipmap levels to allocate
+ * @param[in] flags - Flags for extensions
+ *
+ * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorMemoryAllocation
+ */
+hipError_t hipMallocMipmappedArray(
+ hipMipmappedArray_t *mipmappedArray,
+ const struct hipChannelFormatDesc* desc,
+ struct hipExtent extent,
+ unsigned int numLevels,
+ unsigned int flags __dparm(0));
+
+/**
+ * @brief Gets a mipmap level of a HIP mipmapped array
+ *
+ * @param[out] levelArray - Returned mipmap level HIP array
+ * @param[in] mipmappedArray - HIP mipmapped array
+ * @param[in] level - Mipmap level
+ *
+ * @return #hipSuccess, #hipErrorInvalidValue
+ */
+hipError_t hipGetMipmappedArrayLevel(
+ hipArray_t *levelArray,
+ hipMipmappedArray_const_t mipmappedArray,
+ unsigned int level);
+
+/**
+ * @brief Copies data between host and device.
+ *
+ * @param[in] dst Destination memory address
+ * @param[in] dpitch Pitch of destination memory
+ * @param[in] src Source memory address
+ * @param[in] spitch Pitch of source memory
+ * @param[in] width Width of matrix transfer (columns in bytes)
+ * @param[in] height Height of matrix transfer (rows)
+ * @param[in] kind Type of transfer
+ * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue,
+ * #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
+ *
+ * @see hipMemcpy, hipMemcpyToArray, hipMemcpy2DToArray, hipMemcpyFromArray, hipMemcpyToSymbol,
+ * hipMemcpyAsync
+ */
+hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width,
+ size_t height, hipMemcpyKind kind);
+
+/**
+ * @brief Copies memory for 2D arrays.
+ * @param[in] pCopy Parameters for the memory copy
+ * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue,
+ * #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
+ *
+ * @see hipMemcpy, hipMemcpy2D, hipMemcpyToArray, hipMemcpy2DToArray, hipMemcpyFromArray,
+ * hipMemcpyToSymbol, hipMemcpyAsync
+*/
+hipError_t hipMemcpyParam2D(const hip_Memcpy2D* pCopy);
+
+/**
+ * @brief Copies memory for 2D arrays.
+ * @param[in] pCopy Parameters for the memory copy
+ * @param[in] stream Stream to use
+ * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue,
+ * #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
+ *
+ * @see hipMemcpy, hipMemcpy2D, hipMemcpyToArray, hipMemcpy2DToArray, hipMemcpyFromArray,
+ * hipMemcpyToSymbol, hipMemcpyAsync
+*/
+hipError_t hipMemcpyParam2DAsync(const hip_Memcpy2D* pCopy, hipStream_t stream __dparm(0));
+
+/**
+ * @brief Copies data between host and device.
+ *
+ * @param[in] dst Destination memory address
+ * @param[in] dpitch Pitch of destination memory
+ * @param[in] src Source memory address
+ * @param[in] spitch Pitch of source memory
+ * @param[in] width Width of matrix transfer (columns in bytes)
+ * @param[in] height Height of matrix transfer (rows)
+ * @param[in] kind Type of transfer
+ * @param[in] stream Stream to use
+ * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue,
+ * #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
+ *
+ * @see hipMemcpy, hipMemcpyToArray, hipMemcpy2DToArray, hipMemcpyFromArray, hipMemcpyToSymbol,
+ * hipMemcpyAsync
+ */
+hipError_t hipMemcpy2DAsync(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width,
+ size_t height, hipMemcpyKind kind, hipStream_t stream __dparm(0));
+
+/**
+ * @brief Copies data between host and device.
+ *
+ * @param[in] dst Destination memory address
+ * @param[in] wOffset Destination starting X offset
+ * @param[in] hOffset Destination starting Y offset
+ * @param[in] src Source memory address
+ * @param[in] spitch Pitch of source memory
+ * @param[in] width Width of matrix transfer (columns in bytes)
+ * @param[in] height Height of matrix transfer (rows)
+ * @param[in] kind Type of transfer
+ * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue,
+ * #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
+ *
+ * @see hipMemcpy, hipMemcpyToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol,
+ * hipMemcpyAsync
+ */
+hipError_t hipMemcpy2DToArray(hipArray* dst, size_t wOffset, size_t hOffset, const void* src,
+ size_t spitch, size_t width, size_t height, hipMemcpyKind kind);
+
+/**
+ * @brief Copies data between host and device.
+ *
+ * @param[in] dst Destination memory address
+ * @param[in] wOffset Destination starting X offset
+ * @param[in] hOffset Destination starting Y offset
+ * @param[in] src Source memory address
+ * @param[in] count size in bytes to copy
+ * @param[in] kind Type of transfer
+ * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue,
+ * #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
+ *
+ * @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol,
+ * hipMemcpyAsync
+ */
+DEPRECATED(DEPRECATED_MSG)
+hipError_t hipMemcpyToArray(hipArray* dst, size_t wOffset, size_t hOffset, const void* src,
+ size_t count, hipMemcpyKind kind);
+
+/**
+ * @brief Copies data between host and device.
+ *
+ * @param[in] dst Destination memory address
+ * @param[in] srcArray Source memory address
+ * @param[in] wOffset Source starting X offset
+ * @param[in] hOffset Source starting Y offset
+ * @param[in] count Size in bytes to copy
+ * @param[in] kind Type of transfer
+ * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue,
+ * #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
+ *
+ * @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol,
+ * hipMemcpyAsync
+ */
+DEPRECATED(DEPRECATED_MSG)
+hipError_t hipMemcpyFromArray(void* dst, hipArray_const_t srcArray, size_t wOffset, size_t hOffset,
+ size_t count, hipMemcpyKind kind);
+
+/**
+ * @brief Copies data between host and device.
+ *
+ * @param[in] dst Destination memory address
+ * @param[in] dpitch Pitch of destination memory
+ * @param[in] src Source memory address
+ * @param[in] wOffset Source starting X offset
+ * @param[in] hOffset Source starting Y offset
+ * @param[in] width Width of matrix transfer (columns in bytes)
+ * @param[in] height Height of matrix transfer (rows)
+ * @param[in] kind Type of transfer
+ * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue,
+ * #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
+ *
+ * @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol,
+ * hipMemcpyAsync
+ */
+hipError_t hipMemcpy2DFromArray( void* dst, size_t dpitch, hipArray_const_t src, size_t wOffset, size_t hOffset, size_t width, size_t height, hipMemcpyKind kind);
+
+/**
+ * @brief Copies data between host and device asynchronously.
+ *
+ * @param[in] dst Destination memory address
+ * @param[in] dpitch Pitch of destination memory
+ * @param[in] src Source memory address
+ * @param[in] wOffset Source starting X offset
+ * @param[in] hOffset Source starting Y offset
+ * @param[in] width Width of matrix transfer (columns in bytes)
+ * @param[in] height Height of matrix transfer (rows)
+ * @param[in] kind Type of transfer
+ * @param[in] stream Accelerator view which the copy is being enqueued
+ * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue,
+ * #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
+ *
+ * @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol,
+ * hipMemcpyAsync
+ */
+hipError_t hipMemcpy2DFromArrayAsync( void* dst, size_t dpitch, hipArray_const_t src, size_t wOffset, size_t hOffset, size_t width, size_t height, hipMemcpyKind kind, hipStream_t stream __dparm(0));
+
+/**
+ * @brief Copies data between host and device.
+ *
+ * @param[in] dst Destination memory address
+ * @param[in] srcArray Source array
+ * @param[in] srcOffset Offset in bytes of source array
+ * @param[in] count Size of memory copy in bytes
+ * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue,
+ * #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
+ *
+ * @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol,
+ * hipMemcpyAsync
+ */
+hipError_t hipMemcpyAtoH(void* dst, hipArray* srcArray, size_t srcOffset, size_t count);
+
+/**
+ * @brief Copies data between host and device.
+ *
+ * @param[in] dstArray Destination memory address
+ * @param[in] dstOffset Offset in bytes of destination array
+ * @param[in] srcHost Source host pointer
+ * @param[in] count Size of memory copy in bytes
+ * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue,
+ * #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
+ *
+ * @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol,
+ * hipMemcpyAsync
+ */
+hipError_t hipMemcpyHtoA(hipArray* dstArray, size_t dstOffset, const void* srcHost, size_t count);
+
+/**
+ * @brief Copies data between host and device.
+ *
+ * @param[in] p 3D memory copy parameters
+ * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue,
+ * #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
+ *
+ * @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol,
+ * hipMemcpyAsync
+ */
+hipError_t hipMemcpy3D(const struct hipMemcpy3DParms* p);
+
+/**
+ * @brief Copies data between host and device asynchronously.
+ *
+ * @param[in] p 3D memory copy parameters
+ * @param[in] stream Stream to use
+ * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue,
+ * #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
+ *
+ * @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol,
+ * hipMemcpyAsync
+ */
+hipError_t hipMemcpy3DAsync(const struct hipMemcpy3DParms* p, hipStream_t stream __dparm(0));
+
+/**
+ * @brief Copies data between host and device.
+ *
+ * @param[in] pCopy 3D memory copy parameters
+ * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue,
+ * #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
+ *
+ * @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol,
+ * hipMemcpyAsync
+ */
+hipError_t hipDrvMemcpy3D(const HIP_MEMCPY3D* pCopy);
+
+/**
+ * @brief Copies data between host and device asynchronously.
+ *
+ * @param[in] pCopy 3D memory copy parameters
+ * @param[in] stream Stream to use
+ * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue,
+ * #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection
+ *
+ * @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol,
+ * hipMemcpyAsync
+ */
+hipError_t hipDrvMemcpy3DAsync(const HIP_MEMCPY3D* pCopy, hipStream_t stream);
+
+// doxygen end Memory
+/**
+ * @}
+ */
+
+
+/**
+ *-------------------------------------------------------------------------------------------------
+ *-------------------------------------------------------------------------------------------------
+ * @defgroup PeerToPeer PeerToPeer Device Memory Access
+ * @{
+ * @warning PeerToPeer support is experimental.
+ * This section describes the PeerToPeer device memory access functions of HIP runtime API.
+ */
+
+/**
+ * @brief Determine if a device can access a peer's memory.
+ *
+ * @param [out] canAccessPeer Returns the peer access capability (0 or 1)
+ * @param [in] deviceId - device from where memory may be accessed.
+ * @param [in] peerDeviceId - device where memory is physically located
+ *
+ * Returns "1" in @p canAccessPeer if the specified @p deviceId is capable
+ * of directly accessing memory physically located on @p peerDeviceId, or "0" if not.
+ *
+ * Returns "0" in @p canAccessPeer if deviceId == peerDeviceId, and both are valid devices : a
+ * device is not a peer of itself.
+ *
+ * @returns #hipSuccess,
+ * @returns #hipErrorInvalidDevice if deviceId or peerDeviceId are not valid devices
+ */
+hipError_t hipDeviceCanAccessPeer(int* canAccessPeer, int deviceId, int peerDeviceId);
+
+
+/**
+ * @brief Enable direct access from current device's virtual address space to memory allocations
+ * physically located on a peer device.
+ *
+ * Memory which is already allocated on peer device will be mapped into the address space of the
+ * current device. In addition, all future memory allocations on peerDeviceId will be mapped into
+ * the address space of the current device when the memory is allocated. The peer memory remains
+ * accessible from the current device until a call to hipDeviceDisablePeerAccess or hipDeviceReset.
+ *
+ *
+ * @param [in] peerDeviceId
+ * @param [in] flags
+ *
+ * @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue,
+ * @returns #hipErrorPeerAccessAlreadyEnabled if peer access is already enabled for this device.
+ */
+hipError_t hipDeviceEnablePeerAccess(int peerDeviceId, unsigned int flags);
+
+
+/**
+ * @brief Disable direct access from current device's virtual address space to memory allocations
+ * physically located on a peer device.
+ *
+ * Returns hipErrorPeerAccessNotEnabled if direct access to memory on peerDevice has not yet been
+ * enabled from the current device.
+ *
+ * @param [in] peerDeviceId
+ *
+ * @returns #hipSuccess, #hipErrorPeerAccessNotEnabled
+ */
+hipError_t hipDeviceDisablePeerAccess(int peerDeviceId);
+
+/**
+ * @brief Get information on memory allocations.
+ *
+ * @param [out] pbase - Base pointer address
+ * @param [out] psize - Size of allocation
+ * @param [in] dptr - Device pointer
+ *
+ * @returns #hipSuccess, #hipErrorInvalidDevicePointer
+ *
+ * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,
+ * hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
+ */
+hipError_t hipMemGetAddressRange(hipDeviceptr_t* pbase, size_t* psize, hipDeviceptr_t dptr);
+
+#ifndef USE_PEER_NON_UNIFIED
+#define USE_PEER_NON_UNIFIED 1
+#endif
+
+#if USE_PEER_NON_UNIFIED == 1
+/**
+ * @brief Copies memory from one device to memory on another device.
+ *
+ * @param [out] dst - Destination device pointer.
+ * @param [in] dstDeviceId - Destination device
+ * @param [in] src - Source device pointer
+ * @param [in] srcDeviceId - Source device
+ * @param [in] sizeBytes - Size of memory copy in bytes
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidDevice
+ */
+hipError_t hipMemcpyPeer(void* dst, int dstDeviceId, const void* src, int srcDeviceId,
+ size_t sizeBytes);
+
+/**
+ * @brief Copies memory from one device to memory on another device.
+ *
+ * @param [out] dst - Destination device pointer.
+ * @param [in] dstDeviceId - Destination device
+ * @param [in] src - Source device pointer
+ * @param [in] srcDevice - Source device
+ * @param [in] sizeBytes - Size of memory copy in bytes
+ * @param [in] stream - Stream identifier
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidDevice
+ */
+hipError_t hipMemcpyPeerAsync(void* dst, int dstDeviceId, const void* src, int srcDevice,
+ size_t sizeBytes, hipStream_t stream __dparm(0));
+#endif
+
+
+// doxygen end PeerToPeer
+/**
+ * @}
+ */
+
+
+/**
+ *-------------------------------------------------------------------------------------------------
+ *-------------------------------------------------------------------------------------------------
+ * @defgroup Context Context Management
+ * @{
+ * This section describes the context management functions of HIP runtime API.
+ */
+
+/**
+ *
+ * @addtogroup ContextD Context Management [Deprecated]
+ * @{
+ * @ingroup Context
+ * This section describes the deprecated context management functions of HIP runtime API.
+ */
+
+/**
+ * @brief Create a context and set it as current/default context
+ *
+ * @param [out] ctx
+ * @param [in] flags
+ * @param [in] device associated device handle
+ *
+ * @return #hipSuccess
+ *
+ * @see hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent, hipCtxPushCurrent,
+ * hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
+ */
+DEPRECATED(DEPRECATED_MSG)
+hipError_t hipCtxCreate(hipCtx_t* ctx, unsigned int flags, hipDevice_t device);
+
+/**
+ * @brief Destroy a HIP context.
+ *
+ * @param [in] ctx Context to destroy
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue
+ *
+ * @see hipCtxCreate, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,hipCtxSetCurrent,
+ * hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize , hipCtxGetDevice
+ */
+DEPRECATED(DEPRECATED_MSG)
+hipError_t hipCtxDestroy(hipCtx_t ctx);
+
+/**
+ * @brief Pop the current/default context and return the popped context.
+ *
+ * @param [out] ctx
+ *
+ * @returns #hipSuccess, #hipErrorInvalidContext
+ *
+ * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxSetCurrent, hipCtxGetCurrent,
+ * hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
+ */
+DEPRECATED(DEPRECATED_MSG)
+hipError_t hipCtxPopCurrent(hipCtx_t* ctx);
+
+/**
+ * @brief Push the context to be set as current/ default context
+ *
+ * @param [in] ctx
+ *
+ * @returns #hipSuccess, #hipErrorInvalidContext
+ *
+ * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,
+ * hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize , hipCtxGetDevice
+ */
+DEPRECATED(DEPRECATED_MSG)
+hipError_t hipCtxPushCurrent(hipCtx_t ctx);
+
+/**
+ * @brief Set the passed context as current/default
+ *
+ * @param [in] ctx
+ *
+ * @returns #hipSuccess, #hipErrorInvalidContext
+ *
+ * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,
+ * hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize , hipCtxGetDevice
+ */
+DEPRECATED(DEPRECATED_MSG)
+hipError_t hipCtxSetCurrent(hipCtx_t ctx);
+
+/**
+ * @brief Get the handle of the current/ default context
+ *
+ * @param [out] ctx
+ *
+ * @returns #hipSuccess, #hipErrorInvalidContext
+ *
+ * @see hipCtxCreate, hipCtxDestroy, hipCtxGetDevice, hipCtxGetFlags, hipCtxPopCurrent,
+ * hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
+ */
+DEPRECATED(DEPRECATED_MSG)
+hipError_t hipCtxGetCurrent(hipCtx_t* ctx);
+
+/**
+ * @brief Get the handle of the device associated with current/default context
+ *
+ * @param [out] device
+ *
+ * @returns #hipSuccess, #hipErrorInvalidContext
+ *
+ * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,
+ * hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize
+ */
+
+DEPRECATED(DEPRECATED_MSG)
+hipError_t hipCtxGetDevice(hipDevice_t* device);
+
+/**
+ * @brief Returns the approximate HIP api version.
+ *
+ * @param [in] ctx Context to check
+ * @param [out] apiVersion
+ *
+ * @return #hipSuccess
+ *
+ * @warning The HIP feature set does not correspond to an exact CUDA SDK api revision.
+ * This function always set *apiVersion to 4 as an approximation though HIP supports
+ * some features which were introduced in later CUDA SDK revisions.
+ * HIP apps code should not rely on the api revision number here and should
+ * use arch feature flags to test device capabilities or conditional compilation.
+ *
+ * @see hipCtxCreate, hipCtxDestroy, hipCtxGetDevice, hipCtxGetFlags, hipCtxPopCurrent,
+ * hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
+ */
+DEPRECATED(DEPRECATED_MSG)
+hipError_t hipCtxGetApiVersion(hipCtx_t ctx, int* apiVersion);
+
+/**
+ * @brief Get Cache configuration for a specific function
+ *
+ * @param [out] cacheConfig
+ *
+ * @return #hipSuccess
+ *
+ * @warning AMD devices and some Nvidia GPUS do not support reconfigurable cache. This hint is
+ * ignored on those architectures.
+ *
+ * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,
+ * hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
+ */
+DEPRECATED(DEPRECATED_MSG)
+hipError_t hipCtxGetCacheConfig(hipFuncCache_t* cacheConfig);
+
+/**
+ * @brief Set L1/Shared cache partition.
+ *
+ * @param [in] cacheConfig
+ *
+ * @return #hipSuccess
+ *
+ * @warning AMD devices and some Nvidia GPUS do not support reconfigurable cache. This hint is
+ * ignored on those architectures.
+ *
+ * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,
+ * hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
+ */
+DEPRECATED(DEPRECATED_MSG)
+hipError_t hipCtxSetCacheConfig(hipFuncCache_t cacheConfig);
+
+/**
+ * @brief Set Shared memory bank configuration.
+ *
+ * @param [in] config
+ *
+ * @return #hipSuccess
+ *
+ * @warning AMD devices and some Nvidia GPUS do not support shared cache banking, and the hint is
+ * ignored on those architectures.
+ *
+ * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,
+ * hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
+ */
+DEPRECATED(DEPRECATED_MSG)
+hipError_t hipCtxSetSharedMemConfig(hipSharedMemConfig config);
+
+/**
+ * @brief Get Shared memory bank configuration.
+ *
+ * @param [out] pConfig
+ *
+ * @return #hipSuccess
+ *
+ * @warning AMD devices and some Nvidia GPUS do not support shared cache banking, and the hint is
+ * ignored on those architectures.
+ *
+ * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,
+ * hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
+ */
+DEPRECATED(DEPRECATED_MSG)
+hipError_t hipCtxGetSharedMemConfig(hipSharedMemConfig* pConfig);
+
+/**
+ * @brief Blocks until the default context has completed all preceding requested tasks.
+ *
+ * @return #hipSuccess
+ *
+ * @warning This function waits for all streams on the default context to complete execution, and
+ * then returns.
+ *
+ * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,
+ * hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxGetDevice
+ */
+DEPRECATED(DEPRECATED_MSG)
+hipError_t hipCtxSynchronize(void);
+
+/**
+ * @brief Return flags used for creating default context.
+ *
+ * @param [out] flags
+ *
+ * @returns #hipSuccess
+ *
+ * @see hipCtxCreate, hipCtxDestroy, hipCtxPopCurrent, hipCtxGetCurrent, hipCtxGetCurrent,
+ * hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
+ */
+DEPRECATED(DEPRECATED_MSG)
+hipError_t hipCtxGetFlags(unsigned int* flags);
+
+/**
+ * @brief Enables direct access to memory allocations in a peer context.
+ *
+ * Memory which already allocated on peer device will be mapped into the address space of the
+ * current device. In addition, all future memory allocations on peerDeviceId will be mapped into
+ * the address space of the current device when the memory is allocated. The peer memory remains
+ * accessible from the current device until a call to hipDeviceDisablePeerAccess or hipDeviceReset.
+ *
+ *
+ * @param [in] peerCtx
+ * @param [in] flags
+ *
+ * @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue,
+ * #hipErrorPeerAccessAlreadyEnabled
+ *
+ * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,
+ * hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
+ * @warning PeerToPeer support is experimental.
+ */
+DEPRECATED(DEPRECATED_MSG)
+hipError_t hipCtxEnablePeerAccess(hipCtx_t peerCtx, unsigned int flags);
+
+/**
+ * @brief Disable direct access from current context's virtual address space to memory allocations
+ * physically located on a peer context. Disables direct access to memory allocations in a peer
+ * context and unregisters any registered allocations.
+ *
+ * Returns hipErrorPeerAccessNotEnabled if direct access to memory on peerDevice has not yet been
+ * enabled from the current device.
+ *
+ * @param [in] peerCtx
+ *
+ * @returns #hipSuccess, #hipErrorPeerAccessNotEnabled
+ *
+ * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,
+ * hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
+ * @warning PeerToPeer support is experimental.
+ */
+DEPRECATED(DEPRECATED_MSG)
+hipError_t hipCtxDisablePeerAccess(hipCtx_t peerCtx);
+
+// doxygen end Context deprecated
+/**
+ * @}
+ */
+
+/**
+ * @brief Get the state of the primary context.
+ *
+ * @param [in] dev Device to get primary context flags for
+ * @param [out] flags Pointer to store flags
+ * @param [out] active Pointer to store context state; 0 = inactive, 1 = active
+ *
+ * @returns #hipSuccess
+ *
+ * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,
+ * hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
+ */
+hipError_t hipDevicePrimaryCtxGetState(hipDevice_t dev, unsigned int* flags, int* active);
+
+/**
+ * @brief Release the primary context on the GPU.
+ *
+ * @param [in] dev Device which primary context is released
+ *
+ * @returns #hipSuccess
+ *
+ * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,
+ * hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
+ * @warning This function returns #hipSuccess though doesn't release the primaryCtx by design on
+ * HIP/HCC path.
+ */
+hipError_t hipDevicePrimaryCtxRelease(hipDevice_t dev);
+
+/**
+ * @brief Retain the primary context on the GPU.
+ *
+ * @param [out] pctx Returned context handle of the new context
+ * @param [in] dev Device which primary context is retained
+ *
+ * @returns #hipSuccess
+ *
+ * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,
+ * hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
+ */
+hipError_t hipDevicePrimaryCtxRetain(hipCtx_t* pctx, hipDevice_t dev);
+
+/**
+ * @brief Resets the primary context on the GPU.
+ *
+ * @param [in] dev Device which primary context is reset
+ *
+ * @returns #hipSuccess
+ *
+ * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,
+ * hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
+ */
+hipError_t hipDevicePrimaryCtxReset(hipDevice_t dev);
+
+/**
+ * @brief Set flags for the primary context.
+ *
+ * @param [in] dev Device for which the primary context flags are set
+ * @param [in] flags New flags for the device
+ *
+ * @returns #hipSuccess, #hipErrorContextAlreadyInUse
+ *
+ * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,
+ * hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice
+ */
+hipError_t hipDevicePrimaryCtxSetFlags(hipDevice_t dev, unsigned int flags);
+
+// doxygen end Context Management
+/**
+ * @}
+ */
+
+/**
+ *
+ * @defgroup Module Module Management
+ * @{
+ * This section describes the module management functions of HIP runtime API.
+ *
+ */
+
+/**
+ * @brief Loads code object from file into a hipModule_t
+ *
+ * @param [in] fname
+ * @param [out] module
+ *
+ * @returns hipSuccess, hipErrorInvalidValue, hipErrorInvalidContext, hipErrorFileNotFound,
+ * hipErrorOutOfMemory, hipErrorSharedObjectInitFailed, hipErrorNotInitialized
+ *
+ *
+ */
+hipError_t hipModuleLoad(hipModule_t* module, const char* fname);
+
+/**
+ * @brief Frees the module
+ *
+ * @param [in] module
+ *
+ * @returns hipSuccess, hipErrorInvalidValue
+ * module is freed and the code objects associated with it are destroyed
+ *
+ */
+
+hipError_t hipModuleUnload(hipModule_t module);
+
+/**
+ * @brief Function with kname will be extracted if present in module
+ *
+ * @param [in] module
+ * @param [in] kname
+ * @param [out] function
+ *
+ * @returns hipSuccess, hipErrorInvalidValue, hipErrorInvalidContext, hipErrorNotInitialized,
+ * hipErrorNotFound,
+ */
+hipError_t hipModuleGetFunction(hipFunction_t* function, hipModule_t module, const char* kname);
+
+/**
+ * @brief Find out attributes for a given function.
+ *
+ * @param [out] attr
+ * @param [in] func
+ *
+ * @returns hipSuccess, hipErrorInvalidValue, hipErrorInvalidDeviceFunction
+ */
+
+hipError_t hipFuncGetAttributes(struct hipFuncAttributes* attr, const void* func);
+
+/**
+ * @brief Find out a specific attribute for a given function.
+ *
+ * @param [out] value
+ * @param [in] attrib
+ * @param [in] hfunc
+ *
+ * @returns hipSuccess, hipErrorInvalidValue, hipErrorInvalidDeviceFunction
+ */
+hipError_t hipFuncGetAttribute(int* value, hipFunction_attribute attrib, hipFunction_t hfunc);
+
+#if !__HIP_ROCclr__
+#if defined(__cplusplus)
+} // extern "C"
+#endif
+
+#ifdef __cplusplus
+namespace hip_impl {
+ class agent_globals_impl;  // forward declaration only; used here solely through a pointer
+ class agent_globals {  // looks up agent globals by name, per module or process-wide
+ public:
+ agent_globals();
+ ~agent_globals();
+ agent_globals(const agent_globals&) = delete;  // copy construction is disabled
+
+ hipError_t read_agent_global_from_module(hipDeviceptr_t* dptr, size_t* bytes,
+ hipModule_t hmod, const char* name);
+ hipError_t read_agent_global_from_process(hipDeviceptr_t* dptr, size_t* bytes,
+ const char* name);
+ private:
+ agent_globals_impl* impl;  // opaque implementation; presumably owned by this object - confirm in the defining .cpp
+ };
+
+ inline
+ __attribute__((visibility("hidden")))
+ agent_globals& get_agent_globals() {  // accessor for the shared agent_globals instance
+ static agent_globals ag;  // function-local static: constructed on the first call
+ return ag;
+ }
+
+ extern "C"
+ inline
+ __attribute__((visibility("hidden")))
+ hipError_t read_agent_global_from_process(hipDeviceptr_t* dptr, size_t* bytes,
+ const char* name) {  // C-linkage wrapper around the member function of the same name
+ return get_agent_globals().read_agent_global_from_process(dptr, bytes, name);
+ }
+} // Namespace hip_impl.
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+/**
+ * @brief returns device memory pointer and size of the kernel present in the module with symbol @p
+ * name
+ *
+ * @param [out] dptr
+ * @param [out] bytes
+ * @param [in] hmod
+ * @param [in] name
+ *
+ * @returns hipSuccess, hipErrorInvalidValue, hipErrorNotInitialized
+ */
+hipError_t hipModuleGetGlobal(hipDeviceptr_t* dptr, size_t* bytes,
+ hipModule_t hmod, const char* name);
+#endif // __HIP_ROCclr__
+
+/**
+ * @brief returns the handle of the texture reference with the name from the module.
+ *
+ * @param [in] hmod
+ * @param [in] name
+ * @param [out] texRef
+ *
+ * @returns hipSuccess, hipErrorNotInitialized, hipErrorNotFound, hipErrorInvalidValue
+ */
+hipError_t hipModuleGetTexRef(textureReference** texRef, hipModule_t hmod, const char* name);
+
+/**
+ * @brief builds module from code object which resides in host memory. Image is pointer to that
+ * location.
+ *
+ * @param [in] image
+ * @param [out] module
+ *
+ * @returns hipSuccess, hipErrorNotInitialized, hipErrorOutOfMemory
+ */
+hipError_t hipModuleLoadData(hipModule_t* module, const void* image);
+
+/**
+ * @brief builds module from code object which resides in host memory. Image is pointer to that
+ * location. Options are not used. hipModuleLoadData is called.
+ *
+ * @param [in] image
+ * @param [out] module
+ * @param [in] numOptions number of options
+ * @param [in] options options for JIT
+ * @param [in] optionValues option values for JIT
+ *
+ * @returns hipSuccess, hipErrorNotInitialized, hipErrorOutOfMemory
+ */
+hipError_t hipModuleLoadDataEx(hipModule_t* module, const void* image, unsigned int numOptions,
+ hipJitOption* options, void** optionValues);
+
+/**
+ * @brief launches kernel f with launch parameters and shared memory on stream with arguments passed
+ * to kernelParams or extra
+ *
+ * @param [in] f Kernel to launch.
+ * @param [in] gridDimX X grid dimension specified as multiple of blockDimX.
+ * @param [in] gridDimY Y grid dimension specified as multiple of blockDimY.
+ * @param [in] gridDimZ Z grid dimension specified as multiple of blockDimZ.
+ * @param [in] blockDimX X block dimension specified in work-items
+ * @param [in] blockDimY Y block dimension specified in work-items
+ * @param [in] blockDimZ Z block dimension specified in work-items
+ * @param [in] sharedMemBytes Amount of dynamic shared memory to allocate for this kernel. The
+ * kernel can access this with HIP_DYNAMIC_SHARED.
+ * @param [in] stream Stream where the kernel should be dispatched. May be 0, in which case the
+ * default stream is used with associated synchronization rules.
+ * @param [in] kernelParams
+ * @param [in] extra Pointer to kernel arguments. These are passed directly to the kernel and
+ * must be in the memory layout and alignment expected by the kernel.
+ *
+ * @returns hipSuccess, hipErrorInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue
+ *
+ * @warning kernelParams argument is not yet implemented in HIP. Please use extra instead. Please
+ * refer to hip_porting_driver_api.md for sample usage.
+ */
+hipError_t hipModuleLaunchKernel(hipFunction_t f, unsigned int gridDimX, unsigned int gridDimY,
+ unsigned int gridDimZ, unsigned int blockDimX,
+ unsigned int blockDimY, unsigned int blockDimZ,
+ unsigned int sharedMemBytes, hipStream_t stream,
+ void** kernelParams, void** extra);
+
+
+#if __HIP_ROCclr__ && !defined(__HCC__)
+/**
+ * @brief launches kernel f with launch parameters and shared memory on stream with arguments passed
+ * to kernelParams, where thread blocks can cooperate and synchronize as they execute
+ *
+ * @param [in] f Kernel to launch.
+ * @param [in] gridDim Grid dimensions specified as multiple of the block dimensions.
+ * @param [in] blockDimX Block dimensions (all three, despite the name) specified in work-items
+ * @param [in] kernelParams A list of kernel arguments
+ * @param [in] sharedMemBytes Amount of dynamic shared memory to allocate for this kernel. The
+ * kernel can access this with HIP_DYNAMIC_SHARED.
+ * @param [in] stream Stream where the kernel should be dispatched. May be 0, in which case the
+ * default stream is used with associated synchronization rules.
+ *
+ * @returns hipSuccess, hipErrorInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue, hipErrorCooperativeLaunchTooLarge
+ */
+hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim, dim3 blockDimX,
+ void** kernelParams, unsigned int sharedMemBytes,
+ hipStream_t stream);
+
+/**
+ * @brief Launches kernels on multiple devices where thread blocks can cooperate and
+ * synchronize as they execute.
+ *
+ * @param [in] launchParamsList List of launch parameters, one per device.
+ * @param [in] numDevices Size of the launchParamsList array.
+ * @param [in] flags Flags to control launch behavior.
+ *
+ * @returns hipSuccess, hipErrorInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue, hipErrorCooperativeLaunchTooLarge
+ */
+hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList,
+ int numDevices, unsigned int flags);
+
+#endif
+
+/**
+ * @brief Launches kernels on multiple devices and guarantees all specified kernels are dispatched
+ * on respective streams before enqueuing any other work on the specified streams from any other threads
+ *
+ *
+ * @param [in] launchParamsList List of launch parameters, one per device.
+ * @param [in] numDevices Size of the launchParamsList array.
+ * @param [in] flags Flags to control launch behavior.
+ *
+ * @returns hipSuccess, hipErrorInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue
+ */
+hipError_t hipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList,
+ int numDevices, unsigned int flags);
+
+
+// doxygen end Module
+/**
+ * @}
+ */
+
+/**
+ *
+ * @defgroup Occupancy Occupancy
+ * @{
+ * This section describes the occupancy functions of HIP runtime API.
+ *
+ */
+
+/**
+ * @brief determine the grid and block sizes to achieve maximum occupancy for a kernel
+ *
+ * @param [out] gridSize minimum grid size for maximum potential occupancy
+ * @param [out] blockSize block size for maximum potential occupancy
+ * @param [in] f kernel function for which occupancy is calculated
+ * @param [in] dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block
+ * @param [in] blockSizeLimit the maximum block size for the kernel, use 0 for no limit
+ *
+ * @returns hipSuccess, hipErrorInvalidDevice, hipErrorInvalidValue
+ */
+
+//TODO - Match CUoccupancyB2DSize
+hipError_t hipModuleOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize,
+ hipFunction_t f, size_t dynSharedMemPerBlk,
+ int blockSizeLimit);
+
+/**
+ * @brief determine the grid and block sizes to achieve maximum occupancy for a kernel
+ *
+ * @param [out] gridSize minimum grid size for maximum potential occupancy
+ * @param [out] blockSize block size for maximum potential occupancy
+ * @param [in] f kernel function for which occupancy is calculated
+ * @param [in] dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block
+ * @param [in] blockSizeLimit the maximum block size for the kernel, use 0 for no limit
+ * @param [in] flags Extra flags for occupancy calculation (only default supported)
+ *
+ * @returns hipSuccess, hipErrorInvalidDevice, hipErrorInvalidValue
+ */
+//TODO - Match CUoccupancyB2DSize
+hipError_t hipModuleOccupancyMaxPotentialBlockSizeWithFlags(int* gridSize, int* blockSize,
+ hipFunction_t f, size_t dynSharedMemPerBlk,
+ int blockSizeLimit, unsigned int flags);
+
+/**
+ * @brief Returns occupancy for a device function.
+ *
+ * @param [out] numBlocks Returned occupancy
+ * @param [in] f Kernel function (hipFunction_t) for which occupancy is calculated
+ * @param [in] blockSize Block size the kernel is intended to be launched with
+ * @param [in] dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block
+ */
+hipError_t hipModuleOccupancyMaxActiveBlocksPerMultiprocessor(
+ int* numBlocks, hipFunction_t f, int blockSize, size_t dynSharedMemPerBlk);
+
+/**
+ * @brief Returns occupancy for a device function.
+ *
+ * @param [out] numBlocks Returned occupancy
+ * @param [in] f Kernel function (hipFunction_t) for which occupancy is calculated
+ * @param [in] blockSize Block size the kernel is intended to be launched with
+ * @param [in] dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block
+ * @param [in] flags Extra flags for occupancy calculation (only default supported)
+ */
+hipError_t hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
+ int* numBlocks, hipFunction_t f, int blockSize, size_t dynSharedMemPerBlk, unsigned int flags);
+
+/**
+ * @brief Returns occupancy for a device function.
+ *
+ * @param [out] numBlocks Returned occupancy
+ * @param [in] f Kernel function for which occupancy is calculated
+ * @param [in] blockSize Block size the kernel is intended to be launched with
+ * @param [in] dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block
+ */
+hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(
+ int* numBlocks, const void* f, int blockSize, size_t dynSharedMemPerBlk);
+
+/**
+ * @brief Returns occupancy for a device function.
+ *
+ * @param [out] numBlocks Returned occupancy
+ * @param [in] f Kernel function for which occupancy is calculated
+ * @param [in] blockSize Block size the kernel is intended to be launched with
+ * @param [in] dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block
+ * @param [in] flags Extra flags for occupancy calculation (currently ignored)
+ */
+hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
+ int* numBlocks, const void* f, int blockSize, size_t dynSharedMemPerBlk, unsigned int flags __dparm(hipOccupancyDefault));
+
+/**
+ * @brief determine the grid and block sizes to achieve maximum occupancy for a kernel
+ *
+ * @param [out] gridSize minimum grid size for maximum potential occupancy
+ * @param [out] blockSize block size for maximum potential occupancy
+ * @param [in] f kernel function for which occupancy is calculated
+ * @param [in] dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block
+ * @param [in] blockSizeLimit the maximum block size for the kernel, use 0 for no limit
+ *
+ * @returns hipSuccess, hipErrorInvalidDevice, hipErrorInvalidValue
+ */
+hipError_t hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize,
+ const void* f, size_t dynSharedMemPerBlk,
+ int blockSizeLimit);
+
+// doxygen end Occupancy
+/**
+ * @}
+ */
+
+
+/**
+ *-------------------------------------------------------------------------------------------------
+ *-------------------------------------------------------------------------------------------------
+ * @defgroup Profiler Profiler Control[Deprecated]
+ * @{
+ * This section describes the profiler control functions of HIP runtime API.
+ *
+ * @warning The cudaProfilerInitialize API format for "configFile" is not supported.
+ *
+ */
+
+
+// TODO - expand descriptions:
+/**
+ * @brief Start recording of profiling information. Takes no arguments.
+ * When using this API, start the profiler with profiling disabled. (--startdisabled)
+ * @warning : hipProfilerStart API is under development.
+ */
+DEPRECATED("use roctracer/rocTX instead")
+hipError_t hipProfilerStart(void);
+
+
+/**
+ * @brief Stop recording of profiling information. Takes no arguments.
+ * When using this API, start the profiler with profiling disabled. (--startdisabled)
+ * @warning : hipProfilerStop API is under development.
+ */
+DEPRECATED("use roctracer/rocTX instead")
+hipError_t hipProfilerStop(void);
+
+// doxygen end profiler
+/**
+ * @}
+ */
+
+/**
+ *-------------------------------------------------------------------------------------------------
+ *-------------------------------------------------------------------------------------------------
+ * @defgroup Clang Launch API to support the triple-chevron syntax
+ * @{
+ * This section describes the API to support the triple-chevron syntax.
+ */
+
+/**
+ * @brief Configure a kernel launch.
+ *
+ * @param [in] gridDim grid dimension specified as multiple of blockDim.
+ * @param [in] blockDim block dimensions specified in work-items
+ * @param [in] sharedMem Amount of dynamic shared memory to allocate for this kernel. The
+ * kernel can access this with HIP_DYNAMIC_SHARED.
+ * @param [in] stream Stream where the kernel should be dispatched. May be 0, in which case the
+ * default stream is used with associated synchronization rules.
+ *
+ * @returns hipSuccess, hipInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue
+ *
+ */
+hipError_t hipConfigureCall(dim3 gridDim, dim3 blockDim, size_t sharedMem __dparm(0), hipStream_t stream __dparm(0));
+
+
+/**
+ * @brief Set a kernel argument.
+ *
+ * @returns hipSuccess, hipErrorInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue
+ *
+ * @param [in] arg Pointer to the argument in host memory.
+ * @param [in] size Size of the argument.
+ * @param [in] offset Offset of the argument on the argument stack.
+ *
+ */
+hipError_t hipSetupArgument(const void* arg, size_t size, size_t offset);
+
+
+/**
+ * @brief Launch a kernel.
+ *
+ * @param [in] func Kernel to launch.
+ *
+ * @returns hipSuccess, hipInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue
+ *
+ */
+hipError_t hipLaunchByPtr(const void* func);
+
+
+/**
+ * @brief Push configuration of a kernel launch.
+ *
+ * @param [in] gridDim grid dimension specified as multiple of blockDim.
+ * @param [in] blockDim block dimensions specified in work-items
+ * @param [in] sharedMem Amount of dynamic shared memory to allocate for this kernel. The
+ * kernel can access this with HIP_DYNAMIC_SHARED.
+ * @param [in] stream Stream where the kernel should be dispatched. May be 0, in which case the
+ * default stream is used with associated synchronization rules.
+ *
+ * @returns hipSuccess, hipErrorInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue
+ *
+ */
+
+hipError_t __hipPushCallConfiguration(dim3 gridDim,
+ dim3 blockDim,
+ size_t sharedMem __dparm(0),
+ hipStream_t stream __dparm(0));
+
+/**
+ * @brief Pop configuration of a kernel launch.
+ *
+ * @param [out] gridDim grid dimension specified as multiple of blockDim.
+ * @param [out] blockDim block dimensions specified in work-items
+ * @param [out] sharedMem Amount of dynamic shared memory to allocate for this kernel. The
+ * kernel can access this with HIP_DYNAMIC_SHARED.
+ * @param [out] stream Stream where the kernel should be dispatched. May be 0, in which case the
+ * default stream is used with associated synchronization rules.
+ *
+ * @returns hipSuccess, hipErrorInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue
+ *
+ */
+hipError_t __hipPopCallConfiguration(dim3 *gridDim,
+ dim3 *blockDim,
+ size_t *sharedMem,
+ hipStream_t *stream);
+
+/**
+ * @brief C compliant kernel launch API
+ *
+ * @param [in] function_address - kernel stub function pointer.
+ * @param [in] numBlocks - number of blocks
+ * @param [in] dimBlocks - dimension of a block
+ * @param [in] args - kernel arguments
+ * @param [in] sharedMemBytes - Amount of dynamic shared memory to allocate for this kernel. The
+ * Kernel can access this with HIP_DYNAMIC_SHARED.
+ * @param [in] stream - Stream where the kernel should be dispatched. May be 0, in which case the
+ * default stream is used with associated synchronization rules.
+ *
+ * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidDevice
+ *
+ */
+hipError_t hipLaunchKernel(const void* function_address,
+ dim3 numBlocks,
+ dim3 dimBlocks,
+ void** args,
+ size_t sharedMemBytes __dparm(0),
+ hipStream_t stream __dparm(0));
+
+#if __HIP_ROCclr__ || !defined(__HCC__)
+//TODO: Move this to hip_ext.h
+hipError_t hipExtLaunchKernel(const void* function_address, dim3 numBlocks, dim3 dimBlocks,
+ void** args, size_t sharedMemBytes, hipStream_t stream,
+ hipEvent_t startEvent, hipEvent_t stopEvent, int flags);
+// doxygen end Clang launch
+/**
+ * @}
+ */
+
+DEPRECATED(DEPRECATED_MSG)
+hipError_t hipBindTexture(
+ size_t* offset,
+ const textureReference* tex,
+ const void* devPtr,
+ const hipChannelFormatDesc* desc,
+ size_t size __dparm(UINT_MAX));
+
+DEPRECATED(DEPRECATED_MSG)
+hipError_t hipBindTexture2D(
+ size_t* offset,
+ const textureReference* tex,
+ const void* devPtr,
+ const hipChannelFormatDesc* desc,
+ size_t width,
+ size_t height,
+ size_t pitch);
+
+DEPRECATED(DEPRECATED_MSG)
+hipError_t hipBindTextureToArray(
+ const textureReference* tex,
+ hipArray_const_t array,
+ const hipChannelFormatDesc* desc);
+
+hipError_t hipBindTextureToMipmappedArray(
+ const textureReference* tex,
+ hipMipmappedArray_const_t mipmappedArray,
+ const hipChannelFormatDesc* desc);
+
+DEPRECATED(DEPRECATED_MSG)
+hipError_t hipGetTextureAlignmentOffset(
+ size_t* offset,
+ const textureReference* texref);
+
+hipError_t hipGetTextureReference(
+ const textureReference** texref,
+ const void* symbol);
+
+DEPRECATED(DEPRECATED_MSG)
+hipError_t hipUnbindTexture(const textureReference* tex);
+
+hipError_t hipCreateTextureObject(
+ hipTextureObject_t* pTexObject,
+ const hipResourceDesc* pResDesc,
+ const hipTextureDesc* pTexDesc,
+ const struct hipResourceViewDesc* pResViewDesc);
+
+hipError_t hipDestroyTextureObject(hipTextureObject_t textureObject);
+
+hipError_t hipGetChannelDesc(
+ hipChannelFormatDesc* desc,
+ hipArray_const_t array);
+
+hipError_t hipGetTextureObjectResourceDesc(
+ hipResourceDesc* pResDesc,
+ hipTextureObject_t textureObject);
+
+hipError_t hipGetTextureObjectResourceViewDesc(
+ struct hipResourceViewDesc* pResViewDesc,
+ hipTextureObject_t textureObject);
+
+hipError_t hipGetTextureObjectTextureDesc(
+ hipTextureDesc* pTexDesc,
+ hipTextureObject_t textureObject);
+
+hipError_t hipTexRefGetAddress(
+ hipDeviceptr_t* dev_ptr,
+ const textureReference* texRef);
+
+hipError_t hipTexRefGetAddressMode(
+ enum hipTextureAddressMode* pam,
+ const textureReference* texRef,
+ int dim);
+
+hipError_t hipTexRefGetFilterMode(
+ enum hipTextureFilterMode* pfm,
+ const textureReference* texRef);
+
+hipError_t hipTexRefGetFlags(
+ unsigned int* pFlags,
+ const textureReference* texRef);
+
+hipError_t hipTexRefGetFormat(
+ hipArray_Format* pFormat,
+ int* pNumChannels,
+ const textureReference* texRef);
+
+hipError_t hipTexRefGetMaxAnisotropy(
+ int* pmaxAnsio,
+ const textureReference* texRef);
+
+hipError_t hipTexRefGetMipmapFilterMode(
+ enum hipTextureFilterMode* pfm,
+ const textureReference* texRef);
+
+hipError_t hipTexRefGetMipmapLevelBias(
+ float* pbias,
+ const textureReference* texRef);
+
+hipError_t hipTexRefGetMipmapLevelClamp(
+ float* pminMipmapLevelClamp,
+ float* pmaxMipmapLevelClamp,
+ const textureReference* texRef);
+
+hipError_t hipTexRefGetMipMappedArray(
+ hipMipmappedArray_t* pArray,
+ const textureReference* texRef);
+
+hipError_t hipTexRefSetAddress(
+ size_t* ByteOffset,
+ textureReference* texRef,
+ hipDeviceptr_t dptr,
+ size_t bytes);
+
+hipError_t hipTexRefSetAddress2D(
+ textureReference* texRef,
+ const HIP_ARRAY_DESCRIPTOR* desc,
+ hipDeviceptr_t dptr,
+ size_t Pitch);
+
+hipError_t hipTexRefSetAddressMode(
+ textureReference* texRef,
+ int dim,
+ enum hipTextureAddressMode am);
+
+hipError_t hipTexRefSetArray(
+ textureReference* tex,
+ hipArray_const_t array,
+ unsigned int flags);
+
+hipError_t hipTexRefSetBorderColor(
+ textureReference* texRef,
+ float* pBorderColor);
+
+hipError_t hipTexRefSetFilterMode(
+ textureReference* texRef,
+ enum hipTextureFilterMode fm);
+
+hipError_t hipTexRefSetFlags(
+ textureReference* texRef,
+ unsigned int Flags);
+
+hipError_t hipTexRefSetFormat(
+ textureReference* texRef,
+ hipArray_Format fmt,
+ int NumPackedComponents);
+
+hipError_t hipTexRefSetMaxAnisotropy(
+ textureReference* texRef,
+ unsigned int maxAniso);
+
+hipError_t hipTexRefSetMipmapFilterMode(
+ textureReference* texRef,
+ enum hipTextureFilterMode fm);
+
+hipError_t hipTexRefSetMipmapLevelBias(
+ textureReference* texRef,
+ float bias);
+
+hipError_t hipTexRefSetMipmapLevelClamp(
+ textureReference* texRef,
+ float minMipMapLevelClamp,
+ float maxMipMapLevelClamp);
+
+hipError_t hipTexRefSetMipmappedArray(
+ textureReference* texRef,
+ struct hipMipmappedArray* mipmappedArray,
+ unsigned int Flags);
+
+hipError_t hipMipmappedArrayCreate(
+ hipMipmappedArray_t* pHandle,
+ HIP_ARRAY3D_DESCRIPTOR* pMipmappedArrayDesc,
+ unsigned int numMipmapLevels);
+
+hipError_t hipMipmappedArrayDestroy(
+ hipMipmappedArray_t hMipmappedArray);
+
+hipError_t hipMipmappedArrayGetLevel(
+ hipArray_t* pLevelArray,
+ hipMipmappedArray_t hMipMappedArray,
+ unsigned int level);
+
+hipError_t hipTexObjectCreate(
+ hipTextureObject_t* pTexObject,
+ const HIP_RESOURCE_DESC* pResDesc,
+ const HIP_TEXTURE_DESC* pTexDesc,
+ const HIP_RESOURCE_VIEW_DESC* pResViewDesc);
+
+hipError_t hipTexObjectDestroy(
+ hipTextureObject_t texObject);
+
+hipError_t hipTexObjectGetResourceDesc(
+ HIP_RESOURCE_DESC* pResDesc,
+ hipTextureObject_t texObject);
+
+hipError_t hipTexObjectGetResourceViewDesc(
+ HIP_RESOURCE_VIEW_DESC* pResViewDesc,
+ hipTextureObject_t texObject);
+
+hipError_t hipTexObjectGetTextureDesc(
+ HIP_TEXTURE_DESC* pTexDesc,
+ hipTextureObject_t texObject);
+#endif
+
+/**
+ * @}
+ */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#if defined(__cplusplus) && !defined(__HCC__) && defined(__clang__) && defined(__HIP__)
+template <typename T>
+static hipError_t __host__ inline hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize,
+ T f, size_t dynSharedMemPerBlk = 0, int blockSizeLimit = 0) {
+ return hipOccupancyMaxPotentialBlockSize(gridSize, blockSize, reinterpret_cast<const void*>(f),dynSharedMemPerBlk,blockSizeLimit);
+}
+
+template <typename T> // NOTE(review): 'flags' is accepted but not forwarded by the call below.
+static hipError_t __host__ inline hipOccupancyMaxPotentialBlockSizeWithFlags(int* gridSize, int* blockSize,
+ T f, size_t dynSharedMemPerBlk = 0, int blockSizeLimit = 0, unsigned int flags = 0 ) {
+ return hipOccupancyMaxPotentialBlockSize(gridSize, blockSize, reinterpret_cast<const void*>(f),dynSharedMemPerBlk,blockSizeLimit);
+}
+#endif // defined(__cplusplus) && !defined(__HCC__) && defined(__clang__) && defined(__HIP__)
+
+#if defined(__cplusplus) && !defined(__HCC__)
+
+template <typename T>
+hipError_t hipGetSymbolAddress(void** devPtr, const T &symbol) {
+ return ::hipGetSymbolAddress(devPtr, (const void *)&symbol);
+}
+
+template <typename T>
+hipError_t hipGetSymbolSize(size_t* size, const T &symbol) {
+ return ::hipGetSymbolSize(size, (const void *)&symbol);
+}
+
+template <typename T>
+hipError_t hipMemcpyToSymbol(const T& symbol, const void* src, size_t sizeBytes,
+ size_t offset __dparm(0),
+ hipMemcpyKind kind __dparm(hipMemcpyHostToDevice)) {
+ return ::hipMemcpyToSymbol((const void*)&symbol, src, sizeBytes, offset, kind);
+}
+
+template <typename T>
+hipError_t hipMemcpyToSymbolAsync(const T& symbol, const void* src, size_t sizeBytes, size_t offset,
+ hipMemcpyKind kind, hipStream_t stream __dparm(0)) {
+ return ::hipMemcpyToSymbolAsync((const void*)&symbol, src, sizeBytes, offset, kind, stream);
+}
+
+template <typename T>
+hipError_t hipMemcpyFromSymbol(void* dst, const T &symbol,
+ size_t sizeBytes, size_t offset __dparm(0),
+ hipMemcpyKind kind __dparm(hipMemcpyDeviceToHost)) {
+ return ::hipMemcpyFromSymbol(dst, (const void*)&symbol, sizeBytes, offset, kind);
+}
+
+template <typename T>
+hipError_t hipMemcpyFromSymbolAsync(void* dst, const T& symbol, size_t sizeBytes, size_t offset,
+ hipMemcpyKind kind, hipStream_t stream __dparm(0)) {
+ return ::hipMemcpyFromSymbolAsync(dst, (const void*)&symbol, sizeBytes, offset, kind, stream);
+}
+
+#endif
+
+#if USE_PROF_API
+#include <hip/hcc_detail/hip_prof_str.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/**
+ * Callback/Activity API
+ */
+hipError_t hipRegisterApiCallback(uint32_t id, void* fun, void* arg);
+hipError_t hipRemoveApiCallback(uint32_t id);
+hipError_t hipRegisterActivityCallback(uint32_t id, void* fun, void* arg);
+hipError_t hipRemoveActivityCallback(uint32_t id);
+const char* hipApiName(uint32_t id);
+const char* hipKernelNameRef(const hipFunction_t f);
+const char* hipKernelNameRefByPtr(const void* hostFunction, hipStream_t stream);
+int hipGetStreamDeviceId(hipStream_t stream);
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#ifdef __cplusplus
+
+template <class T>
+inline hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(
+ int* numBlocks, T f, int blockSize, size_t dynSharedMemPerBlk) {
+ return hipOccupancyMaxActiveBlocksPerMultiprocessor(
+ numBlocks, reinterpret_cast<const void*>(f), blockSize, dynSharedMemPerBlk);
+}
+
+template <class T>
+inline hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
+ int* numBlocks, T f, int blockSize, size_t dynSharedMemPerBlk, unsigned int flags) {
+ return hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
+ numBlocks, reinterpret_cast<const void*>(f), blockSize, dynSharedMemPerBlk, flags);
+}
+
+class TlsData;
+
+#if !__HIP_ROCclr__
+DEPRECATED(DEPRECATED_MSG)
+hipError_t hipBindTexture(size_t* offset, textureReference* tex, const void* devPtr,
+ const hipChannelFormatDesc* desc, size_t size = UINT_MAX);
+#endif
+
+#if !__HIP_ROCclr__
+hipError_t ihipBindTextureImpl(TlsData *tls, int dim, enum hipTextureReadMode readMode, size_t* offset,
+ const void* devPtr, const struct hipChannelFormatDesc* desc,
+ size_t size, textureReference* tex);
+#endif
+
+/*
+ * @brief hipBindTexture Binds size bytes of the memory area pointed to by @p devPtr to the texture
+ *reference tex.
+ *
+ * @p desc describes how the memory is interpreted when fetching values from the texture. The @p
+ *offset parameter is an optional byte offset as with the low-level hipBindTexture() function. Any
+ *memory previously bound to tex is unbound.
+ *
+ * @param[in] offset - Offset in bytes
+ * @param[out] tex - texture to bind
+ * @param[in] devPtr - Memory area on device
+ * @param[in] desc - Channel format
+ * @param[in] size - Size of the memory area pointed to by devPtr
+ * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorMemoryFree, #hipErrorUnknown
+ **/
+#if !__HIP_ROCclr__
+template <class T, int dim, enum hipTextureReadMode readMode>
+DEPRECATED(DEPRECATED_MSG)
+hipError_t hipBindTexture(size_t* offset, struct texture<T, dim, readMode>& tex, const void* devPtr,
+ const struct hipChannelFormatDesc& desc, size_t size = UINT_MAX) {
+ return ihipBindTextureImpl(nullptr, dim, readMode, offset, devPtr, &desc, size, &tex);
+}
+#endif
+
+/*
+ * @brief hipBindTexture Binds size bytes of the memory area pointed to by @p devPtr to the texture
+ *reference tex.
+ *
+ * @p desc describes how the memory is interpreted when fetching values from the texture. The @p
+ *offset parameter is an optional byte offset as with the low-level hipBindTexture() function. Any
+ *memory previously bound to tex is unbound.
+ *
+ * @param[in] offset - Offset in bytes
+ * @param[in] tex - texture to bind
+ * @param[in] devPtr - Memory area on device
+ * @param[in] size - Size of the memory area pointed to by devPtr
+ * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorMemoryFree, #hipErrorUnknown
+ **/
+#if !__HIP_ROCclr__
+template <class T, int dim, enum hipTextureReadMode readMode>
+DEPRECATED(DEPRECATED_MSG)
+hipError_t hipBindTexture(size_t* offset, struct texture<T, dim, readMode>& tex, const void* devPtr,
+ size_t size = UINT_MAX) {
+ return ihipBindTextureImpl(nullptr, dim, readMode, offset, devPtr, &(tex.channelDesc), size, &tex);
+}
+#endif
+
+// C API
+#if !__HIP_ROCclr__
+DEPRECATED(DEPRECATED_MSG)
+hipError_t hipBindTexture2D(size_t* offset, textureReference* tex, const void* devPtr,
+ const hipChannelFormatDesc* desc, size_t width, size_t height,
+ size_t pitch);
+#endif
+
+#if !__HIP_ROCclr__
+hipError_t ihipBindTexture2DImpl(int dim, enum hipTextureReadMode readMode, size_t* offset,
+ const void* devPtr, const struct hipChannelFormatDesc* desc,
+ size_t width, size_t height, textureReference* tex, size_t pitch);
+#endif
+
+#if !__HIP_ROCclr__
+template <class T, int dim, enum hipTextureReadMode readMode>
+DEPRECATED(DEPRECATED_MSG)
+hipError_t hipBindTexture2D(size_t* offset, struct texture<T, dim, readMode>& tex,
+ const void* devPtr, size_t width, size_t height, size_t pitch) {
+ return ihipBindTexture2DImpl(dim, readMode, offset, devPtr, &(tex.channelDesc), width, height,
+ &tex, pitch); // forward pitch: it is the trailing parameter of ihipBindTexture2DImpl
+}
+#endif
+
+#if !__HIP_ROCclr__
+template <class T, int dim, enum hipTextureReadMode readMode>
+DEPRECATED(DEPRECATED_MSG)
+hipError_t hipBindTexture2D(size_t* offset, struct texture<T, dim, readMode>& tex,
+ const void* devPtr, const struct hipChannelFormatDesc& desc,
+ size_t width, size_t height, size_t pitch) {
+ return ihipBindTexture2DImpl(dim, readMode, offset, devPtr, &desc, width, height, &tex, pitch); // pitch is the impl's trailing parameter
+}
+#endif
+
+// C API
+#if !__HIP_ROCclr__
+DEPRECATED(DEPRECATED_MSG)
+hipError_t hipBindTextureToArray(textureReference* tex, hipArray_const_t array,
+ const hipChannelFormatDesc* desc);
+#endif
+
+#if !__HIP_ROCclr__
+hipError_t ihipBindTextureToArrayImpl(TlsData *tls, int dim, enum hipTextureReadMode readMode,
+ hipArray_const_t array,
+ const struct hipChannelFormatDesc& desc,
+ textureReference* tex);
+#endif
+
+#if !__HIP_ROCclr__
+template <class T, int dim, enum hipTextureReadMode readMode>
+DEPRECATED(DEPRECATED_MSG)
+hipError_t hipBindTextureToArray(struct texture<T, dim, readMode>& tex, hipArray_const_t array) {
+ return ihipBindTextureToArrayImpl(nullptr, dim, readMode, array, tex.channelDesc, &tex);
+}
+#endif
+
+#if !__HIP_ROCclr__
+template <class T, int dim, enum hipTextureReadMode readMode>
+DEPRECATED(DEPRECATED_MSG)
+hipError_t hipBindTextureToArray(struct texture<T, dim, readMode>& tex, hipArray_const_t array,
+ const struct hipChannelFormatDesc& desc) {
+ return ihipBindTextureToArrayImpl(nullptr, dim, readMode, array, desc, &tex);
+}
+#endif
+
+#if !__HIP_ROCclr__
+template <class T, int dim, enum hipTextureReadMode readMode>
+DEPRECATED(DEPRECATED_MSG)
+inline static hipError_t hipBindTextureToArray(struct texture<T, dim, readMode> *tex,
+ hipArray_const_t array,
+ const struct hipChannelFormatDesc* desc) {
+ return ihipBindTextureToArrayImpl(nullptr, dim, readMode, array, *desc, tex);
+}
+#endif
+
+// C API
+#if !__HIP_ROCclr__
+hipError_t hipBindTextureToMipmappedArray(const textureReference* tex,
+ hipMipmappedArray_const_t mipmappedArray,
+ const hipChannelFormatDesc* desc);
+#endif
+
+#if !__HIP_ROCclr__
+template <class T, int dim, enum hipTextureReadMode readMode>
+hipError_t hipBindTextureToMipmappedArray(const texture<T, dim, readMode>& tex,
+ hipMipmappedArray_const_t mipmappedArray) {
+ return hipSuccess;
+}
+#endif
+
+#if !__HIP_ROCclr__
+template <class T, int dim, enum hipTextureReadMode readMode>
+hipError_t hipBindTextureToMipmappedArray(const texture<T, dim, readMode>& tex,
+ hipMipmappedArray_const_t mipmappedArray,
+ const hipChannelFormatDesc& desc) {
+ return hipSuccess;
+}
+#endif
+
+#if __HIP_ROCclr__ && !defined(__HCC__)
+
+template <typename F>
+inline hipError_t hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize,
+ F kernel, size_t dynSharedMemPerBlk, uint32_t blockSizeLimit) {
+return hipOccupancyMaxPotentialBlockSize(gridSize, blockSize,(hipFunction_t)kernel, dynSharedMemPerBlk, blockSizeLimit);
+}
+
+template <class T>
+inline hipError_t hipLaunchCooperativeKernel(T f, dim3 gridDim, dim3 blockDim,
+ void** kernelParams, unsigned int sharedMemBytes, hipStream_t stream) {
+ return hipLaunchCooperativeKernel(reinterpret_cast<const void*>(f), gridDim,
+ blockDim, kernelParams, sharedMemBytes, stream);
+}
+
+template <class T>
+inline hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList,
+ unsigned int numDevices, unsigned int flags = 0) {
+ return hipLaunchCooperativeKernelMultiDevice(launchParamsList, numDevices, flags);
+}
+
+
+template <class T>
+inline hipError_t hipExtLaunchMultiKernelMultiDevice(hipLaunchParams* launchParamsList,
+ unsigned int numDevices, unsigned int flags = 0) {
+ return hipExtLaunchMultiKernelMultiDevice(launchParamsList, numDevices, flags);
+}
+
+#endif
+
+/*
+ * @brief Unbinds the texture bound to @p tex
+ *
+ * @param[in] tex - texture to unbind
+ *
+ * @return #hipSuccess
+ **/
+#if !__HIP_ROCclr__
+DEPRECATED(DEPRECATED_MSG)
+hipError_t hipUnbindTexture(const textureReference* tex);
+#endif
+
+#if !__HIP_ROCclr__
+extern hipError_t ihipUnbindTextureImpl(const hipTextureObject_t& textureObject);
+#endif
+
+#if !__HIP_ROCclr__
+template <class T, int dim, enum hipTextureReadMode readMode>
+DEPRECATED(DEPRECATED_MSG)
+hipError_t hipUnbindTexture(struct texture<T, dim, readMode>& tex) {
+ return ihipUnbindTextureImpl(tex.textureObject);
+}
+#endif
+
+#if !__HIP_ROCclr__
+hipError_t hipGetChannelDesc(hipChannelFormatDesc* desc, hipArray_const_t array);
+
+DEPRECATED(DEPRECATED_MSG)
+hipError_t hipGetTextureAlignmentOffset(size_t* offset, const textureReference* texref);
+
+hipError_t hipGetTextureReference(const textureReference** texref, const void* symbol);
+
+hipError_t hipCreateTextureObject(hipTextureObject_t* pTexObject, const hipResourceDesc* pResDesc,
+ const hipTextureDesc* pTexDesc,
+ const hipResourceViewDesc* pResViewDesc);
+
+hipError_t hipDestroyTextureObject(hipTextureObject_t textureObject);
+
+hipError_t hipGetTextureObjectResourceDesc(hipResourceDesc* pResDesc,
+ hipTextureObject_t textureObject);
+hipError_t hipGetTextureObjectResourceViewDesc(hipResourceViewDesc* pResViewDesc,
+ hipTextureObject_t textureObject);
+hipError_t hipGetTextureObjectTextureDesc(hipTextureDesc* pTexDesc,
+ hipTextureObject_t textureObject);
+hipError_t hipTexRefSetArray(textureReference* tex, hipArray_const_t array, unsigned int flags);
+
+hipError_t hipTexRefGetArray(hipArray_t* array, textureReference tex);
+
+hipError_t hipTexRefSetAddressMode(textureReference* tex, int dim, hipTextureAddressMode am);
+
+hipError_t hipTexRefGetAddressMode(hipTextureAddressMode* am, textureReference tex, int dim);
+
+hipError_t hipTexRefSetFilterMode(textureReference* tex, hipTextureFilterMode fm);
+
+hipError_t hipTexRefSetFlags(textureReference* tex, unsigned int flags);
+
+hipError_t hipTexRefSetFormat(textureReference* tex, hipArray_Format fmt, int NumPackedComponents);
+
+hipError_t hipTexRefSetAddress(size_t* offset, textureReference* tex, hipDeviceptr_t devPtr,
+ size_t size);
+
+hipError_t hipTexRefGetAddress(hipDeviceptr_t* dev_ptr, textureReference tex);
+
+hipError_t hipTexRefSetAddress2D(textureReference* tex, const HIP_ARRAY_DESCRIPTOR* desc,
+ hipDeviceptr_t devPtr, size_t pitch);
+#endif
+
+hipError_t hipCreateSurfaceObject(hipSurfaceObject_t* pSurfObject, const hipResourceDesc* pResDesc);
+
+hipError_t hipDestroySurfaceObject(hipSurfaceObject_t surfaceObject);
+
+#if __HIP_ROCclr__
+template <class T, int dim, enum hipTextureReadMode readMode>
+DEPRECATED(DEPRECATED_MSG)
+static inline hipError_t hipBindTexture(size_t* offset, const struct texture<T, dim, readMode>& tex,
+ const void* devPtr, size_t size = UINT_MAX) {
+ return hipBindTexture(offset, &tex, devPtr, &tex.channelDesc, size);
+}
+
+template <class T, int dim, enum hipTextureReadMode readMode>
+DEPRECATED(DEPRECATED_MSG)
+static inline hipError_t
+ hipBindTexture(size_t* offset, const struct texture<T, dim, readMode>& tex, const void* devPtr,
+ const struct hipChannelFormatDesc& desc, size_t size = UINT_MAX) {
+ return hipBindTexture(offset, &tex, devPtr, &desc, size);
+}
+
+template<class T, int dim, enum hipTextureReadMode readMode>
+DEPRECATED(DEPRECATED_MSG)
+static inline hipError_t hipBindTexture2D(
+ size_t *offset,
+ const struct texture<T, dim, readMode> &tex,
+ const void *devPtr,
+ size_t width,
+ size_t height,
+ size_t pitch)
+{
+ return hipBindTexture2D(offset, &tex, devPtr, &tex.channelDesc, width, height, pitch);
+}
+
+template<class T, int dim, enum hipTextureReadMode readMode>
+DEPRECATED(DEPRECATED_MSG)
+static inline hipError_t hipBindTexture2D(
+ size_t *offset,
+ const struct texture<T, dim, readMode> &tex,
+ const void *devPtr,
+ const struct hipChannelFormatDesc &desc,
+ size_t width,
+ size_t height,
+ size_t pitch)
+{
+ return hipBindTexture2D(offset, &tex, devPtr, &desc, width, height, pitch);
+}
+
+template<class T, int dim, enum hipTextureReadMode readMode>
+DEPRECATED(DEPRECATED_MSG)
+static inline hipError_t hipBindTextureToArray(
+ const struct texture<T, dim, readMode> &tex,
+ hipArray_const_t array)
+{
+ struct hipChannelFormatDesc desc;
+ hipError_t err = hipGetChannelDesc(&desc, array);
+ return (err == hipSuccess) ? hipBindTextureToArray(&tex, array, &desc) : err;
+}
+
+template<class T, int dim, enum hipTextureReadMode readMode>
+DEPRECATED(DEPRECATED_MSG)
+static inline hipError_t hipBindTextureToArray(
+ const struct texture<T, dim, readMode> &tex,
+ hipArray_const_t array,
+ const struct hipChannelFormatDesc &desc)
+{
+ return hipBindTextureToArray(&tex, array, &desc);
+}
+
+template<class T, int dim, enum hipTextureReadMode readMode>
+static inline hipError_t hipBindTextureToMipmappedArray(
+ const struct texture<T, dim, readMode> &tex,
+ hipMipmappedArray_const_t mipmappedArray)
+{
+ struct hipChannelFormatDesc desc;
+ hipArray_t levelArray;
+ hipError_t err = hipGetMipmappedArrayLevel(&levelArray, mipmappedArray, 0);
+ if (err != hipSuccess) {
+ return err;
+ }
+ err = hipGetChannelDesc(&desc, levelArray);
+ return (err == hipSuccess) ? hipBindTextureToMipmappedArray(&tex, mipmappedArray, &desc) : err;
+}
+
+template<class T, int dim, enum hipTextureReadMode readMode>
+static inline hipError_t hipBindTextureToMipmappedArray(
+ const struct texture<T, dim, readMode> &tex,
+ hipMipmappedArray_const_t mipmappedArray,
+ const struct hipChannelFormatDesc &desc)
+{
+ return hipBindTextureToMipmappedArray(&tex, mipmappedArray, &desc);
+}
+
+template<class T, int dim, enum hipTextureReadMode readMode>
+DEPRECATED(DEPRECATED_MSG)
+static inline hipError_t hipUnbindTexture(
+ const struct texture<T, dim, readMode> &tex)
+{
+ return hipUnbindTexture(&tex);
+}
+#endif
+
+// doxygen end Texture
+/**
+ * @}
+ */
+
+
+#endif
+
+#ifdef __GNUC__
+#pragma GCC visibility pop
+#endif
+
+// doxygen end HIP API
+/**
+ * @}
+ */
+
+#endif
diff --git a/third_party/rocm/include/hip/hcc_detail/hip_runtime_prof.h b/third_party/rocm/include/hip/hcc_detail/hip_runtime_prof.h
new file mode 100644
index 0000000..ffd8b0a
--- /dev/null
+++ b/third_party/rocm/include/hip/hcc_detail/hip_runtime_prof.h
@@ -0,0 +1,77 @@
+/*
+Copyright (c) 2019 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_RUNTIME_PROF_H
+#define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_RUNTIME_PROF_H
+
+// HIP ROCclr Op IDs enumeration
+enum HipVdiOpId {
+ kHipVdiOpIdDispatch = 0,
+ kHipVdiOpIdCopy = 1,
+ kHipVdiOpIdBarrier = 2,
+ kHipVdiOpIdNumber = 3
+};
+
+// Types of ROCclr commands
+enum HipVdiCommandKind {
+ kHipVdiCommandKernel = 0x11F0,
+ kHipVdiMemcpyDeviceToHost = 0x11F3,
+ kHipHipVdiMemcpyHostToDevice = 0x11F4,
+ kHipVdiMemcpyDeviceToDevice = 0x11F5,
+ kHipVidMemcpyDeviceToHostRect = 0x1201,
+ kHipVdiMemcpyHostToDeviceRect = 0x1202,
+ kHipVdiMemcpyDeviceToDeviceRect = 0x1203,
+ kHipVdiFillMemory = 0x1207,
+};
+
+/**
+ * @brief Initializes activity callback
+ *
+ * @param [in] id_callback Event ID callback function
+ * @param [in] op_callback Event operation callback function
+ * @param [in] arg Arguments passed into callback
+ *
+ * @returns None
+ */
+void hipInitActivityCallback(void* id_callback, void* op_callback, void* arg);
+
+/**
+ * @brief Enables activity callback
+ *
+ * @param [in] op Operation, which will trigger a callback (@see HipVdiOpId)
+ * @param [in] enable Enable state for the callback
+ *
+ * @returns True if successful
+ */
+bool hipEnableActivityCallback(uint32_t op, bool enable);
+
+/**
+ * @brief Returns the description string for the operation kind
+ *
+ * @param [in] id Command kind id (@see HipVdiCommandKind)
+ *
+ * @returns A pointer to a const string with the command description
+ */
+const char* hipGetCmdName(uint32_t id);
+
+#endif // HIP_INCLUDE_HIP_HCC_DETAIL_HIP_RUNTIME_PROF_H
+
diff --git a/third_party/rocm/include/hip/hcc_detail/hip_surface_types.h b/third_party/rocm/include/hip/hcc_detail/hip_surface_types.h
new file mode 100644
index 0000000..f74c01d
--- /dev/null
+++ b/third_party/rocm/include/hip/hcc_detail/hip_surface_types.h
@@ -0,0 +1,54 @@
+/*
+Copyright (c) 2015- present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/**
+ * @file hcc_detail/hip_surface_types.h
+ * @brief Defines surface types for HIP runtime.
+ */
+
+#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_SURFACE_TYPES_H
+#define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_SURFACE_TYPES_H
+
+#include <hip/hcc_detail/driver_types.h>
+
+/**
+ * An opaque value that represents a hip surface object
+ */
+typedef unsigned long long hipSurfaceObject_t;
+
+/**
+ * hip surface reference
+ */
+struct surfaceReference {
+ hipSurfaceObject_t surfaceObject;
+};
+
+/**
+ * hip surface boundary modes
+ */
+enum hipSurfaceBoundaryMode {
+ hipBoundaryModeZero = 0,
+ hipBoundaryModeTrap = 1,
+ hipBoundaryModeClamp = 2
+};
+
+#endif /* !HIP_INCLUDE_HIP_HCC_DETAIL_HIP_SURFACE_TYPES_H */
diff --git a/third_party/rocm/include/hip/hcc_detail/hip_texture_types.h b/third_party/rocm/include/hip/hcc_detail/hip_texture_types.h
new file mode 100644
index 0000000..a46b236
--- /dev/null
+++ b/third_party/rocm/include/hip/hcc_detail/hip_texture_types.h
@@ -0,0 +1,96 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/**
+ * @file hcc_detail/hip_texture_types.h
+ * @brief Defines the texture types for HIP runtime.
+ */
+
+#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_TEXTURE_TYPES_H
+#define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_TEXTURE_TYPES_H
+
+/*******************************************************************************
+ * *
+ * *
+ * *
+ *******************************************************************************/
+#include <limits.h>
+//#include <hip/hcc_detail/driver_types.h>
+#include <hip/hcc_detail/channel_descriptor.h>
+#include <hip/hcc_detail/texture_types.h>
+
+#if __cplusplus
+
+/*******************************************************************************
+ * *
+ * *
+ * *
+ *******************************************************************************/
+#if __HIP__
+#define __HIP_TEXTURE_ATTRIB __attribute__((device_builtin_texture_type))
+#else
+#define __HIP_TEXTURE_ATTRIB
+#endif
+
+typedef textureReference* hipTexRef;
+
+template <class T, int texType = hipTextureType1D,
+ enum hipTextureReadMode mode = hipReadModeElementType>
+struct __HIP_TEXTURE_ATTRIB texture : public textureReference {
+ texture(int norm = 0, enum hipTextureFilterMode fMode = hipFilterModePoint,
+ enum hipTextureAddressMode aMode = hipAddressModeClamp) {
+ normalized = norm;
+ readMode = mode;
+ filterMode = fMode;
+ addressMode[0] = aMode;
+ addressMode[1] = aMode;
+ addressMode[2] = aMode;
+ channelDesc = hipCreateChannelDesc<T>();
+ sRGB = 0;
+ textureObject = nullptr;
+ maxAnisotropy = 0;
+ mipmapLevelBias = 0;
+ minMipmapLevelClamp = 0;
+ maxMipmapLevelClamp = 0;
+ }
+
+ texture(int norm, enum hipTextureFilterMode fMode, enum hipTextureAddressMode aMode,
+ struct hipChannelFormatDesc desc) {
+ normalized = norm;
+ readMode = mode;
+ filterMode = fMode;
+ addressMode[0] = aMode;
+ addressMode[1] = aMode;
+ addressMode[2] = aMode;
+ channelDesc = desc;
+ sRGB = 0;
+ textureObject = nullptr;
+ maxAnisotropy = 0;
+ mipmapLevelBias = 0;
+ minMipmapLevelClamp = 0;
+ maxMipmapLevelClamp = 0;
+ }
+};
+
+#endif /* __cplusplus */
+
+#endif /* !HIP_INCLUDE_HIP_HCC_DETAIL_HIP_TEXTURE_TYPES_H */
diff --git a/third_party/rocm/include/hip/hcc_detail/hip_vector_types.h b/third_party/rocm/include/hip/hcc_detail/hip_vector_types.h
new file mode 100644
index 0000000..69525c5
--- /dev/null
+++ b/third_party/rocm/include/hip/hcc_detail/hip_vector_types.h
@@ -0,0 +1,1593 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/**
+ * @file hcc_detail/hip_vector_types.h
+ * @brief Defines the different new vector types for the HIP runtime.
+ */
+
+#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_VECTOR_TYPES_H
+#define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_VECTOR_TYPES_H
+
+#if defined(__HCC__) && (__hcc_workweek__ < 16032)
+#error("This version of HIP requires a newer version of HCC.");
+#endif
+
+#include "hip/hcc_detail/host_defines.h"
+
+#if defined(__has_attribute)
+ #if __has_attribute(ext_vector_type)
+ #define __NATIVE_VECTOR__(n, T) T __attribute__((ext_vector_type(n)))
+ #else
+ #define __NATIVE_VECTOR__(n, T) T[n]
+ #endif
+
+#if defined(__cplusplus)
+ #include <array>
+ #include <iosfwd>
+ #include <type_traits>
+
+ namespace hip_impl {
+ template<typename, typename, unsigned int> struct Scalar_accessor;
+ } // Namespace hip_impl.
+
+ namespace std {
+ template<typename T, typename U, unsigned int n>
+ struct is_integral<hip_impl::Scalar_accessor<T, U, n>>
+ : is_integral<T> {};
+ template<typename T, typename U, unsigned int n>
+ struct is_floating_point<hip_impl::Scalar_accessor<T, U, n>>
+ : is_floating_point<T> {};
+ } // Namespace std.
+
+ namespace hip_impl {
+ template<typename T, typename Vector, unsigned int idx>
+ struct Scalar_accessor {
+ struct Address {
+ const Scalar_accessor* p;
+
+ __host__ __device__
+ operator const T*() const noexcept {
+ return &reinterpret_cast<const T*>(p)[idx];
+ }
+ __host__ __device__
+ operator const T*() const volatile noexcept {
+ return &reinterpret_cast<const T*>(p)[idx];
+ }
+ __host__ __device__
+ operator T*() noexcept {
+ return &reinterpret_cast<T*>(
+ const_cast<Scalar_accessor*>(p))[idx];
+ }
+ __host__ __device__
+ operator T*() volatile noexcept {
+ return &reinterpret_cast<T*>(
+ const_cast<Scalar_accessor*>(p))[idx];
+ }
+ };
+
+ friend
+ inline
+ std::ostream& operator<<(std::ostream& os,
+ const Scalar_accessor& x) noexcept {
+ return os << x.data[idx];
+ }
+ friend
+ inline
+ std::istream& operator>>(std::istream& is,
+ Scalar_accessor& x) noexcept {
+ T tmp;
+ is >> tmp;
+ x.data[idx] = tmp;
+
+ return is;
+ }
+
+ // Idea from https://t0rakka.silvrback.com/simd-scalar-accessor
+ Vector data;
+
+ __host__ __device__
+ operator T() const noexcept { return data[idx]; }
+ __host__ __device__
+ operator T() const volatile noexcept { return data[idx]; }
+
+#ifdef __HIP_ENABLE_VECTOR_SCALAR_ACCESSORY_ENUM_CONVERSION__
+ // The conversions to enum are fairly ghastly, but unfortunately used in
+ // some pre-existing, difficult to modify, code.
+ template<
+ typename U,
+ typename std::enable_if<
+ !std::is_same<U, T>{} &&
+ std::is_enum<U>{} &&
+ std::is_convertible<
+ T, typename std::enable_if<std::is_enum<U>::value, std::underlying_type<U>>::type::type>{}>::type* = nullptr>
+ __host__ __device__
+ operator U() const noexcept { return static_cast<U>(data[idx]); }
+ template<
+ typename U,
+ typename std::enable_if<
+ !std::is_same<U, T>{} &&
+ std::is_enum<U>{} &&
+ std::is_convertible<
+ T, typename std::enable_if<std::is_enum<U>::value, std::underlying_type<U>>::type::type>{}>::type* = nullptr>
+ __host__ __device__
+ operator U() const volatile noexcept { return static_cast<U>(data[idx]); }
+#endif
+
+ __host__ __device__
+ operator T&() noexcept {
+ return reinterpret_cast<
+ T (&)[sizeof(Vector) / sizeof(T)]>(data)[idx];
+ }
+ __host__ __device__
+ operator volatile T&() volatile noexcept {
+ return reinterpret_cast<
+ volatile T (&)[sizeof(Vector) / sizeof(T)]>(data)[idx];
+ }
+
+ __host__ __device__
+ Address operator&() const noexcept { return Address{this}; }
+
+ __host__ __device__
+ Scalar_accessor& operator=(const Scalar_accessor& x) noexcept {
+ data[idx] = x.data[idx];
+
+ return *this;
+ }
+ __host__ __device__
+ Scalar_accessor& operator=(T x) noexcept {
+ data[idx] = x;
+
+ return *this;
+ }
+ __host__ __device__
+ volatile Scalar_accessor& operator=(T x) volatile noexcept {
+ data[idx] = x;
+
+ return *this;
+ }
+
+ __host__ __device__
+ Scalar_accessor& operator++() noexcept {
+ ++data[idx];
+ return *this;
+ }
+ __host__ __device__  // Post-increment: bumps the element and yields the value it held before.
+ T operator++(int) noexcept {
+     const T previous = data[idx];
+     ++data[idx];
+     return previous;  // not *this: that would convert to the already-incremented element
+ }
+ __host__ __device__
+ Scalar_accessor& operator--() noexcept {
+ --data[idx];
+ return *this;
+ }
+ __host__ __device__  // Post-decrement: lowers the element and yields the value it held before.
+ T operator--(int) noexcept {
+     const T previous = data[idx];
+     --data[idx];
+     return previous;  // not *this: that would convert to the already-decremented element
+ }
+
+ // TODO: convertibility is too restrictive, constraint should be on
+ // the operator being invocable with a value of type U.
+ template<
+ typename U,
+ typename std::enable_if<
+ std::is_convertible<U, T>{}>::type* = nullptr>
+ __host__ __device__
+ Scalar_accessor& operator+=(U x) noexcept {
+ data[idx] += x;
+ return *this;
+ }
+ template<
+ typename U,
+ typename std::enable_if<
+ std::is_convertible<U, T>{}>::type* = nullptr>
+ __host__ __device__
+ Scalar_accessor& operator-=(U x) noexcept {
+ data[idx] -= x;
+ return *this;
+ }
+
+ template<
+ typename U,
+ typename std::enable_if<
+ std::is_convertible<U, T>{}>::type* = nullptr>
+ __host__ __device__
+ Scalar_accessor& operator*=(U x) noexcept {
+ data[idx] *= x;
+ return *this;
+ }
+ template<
+ typename U,
+ typename std::enable_if<
+ std::is_convertible<U, T>{}>::type* = nullptr>
+ __host__ __device__
+ Scalar_accessor& operator/=(U x) noexcept {
+ data[idx] /= x;
+ return *this;
+ }
+ template<
+ typename U = T,
+ typename std::enable_if<std::is_convertible<U, T>{} &&
+ std::is_integral<U>{}>::type* = nullptr>
+ __host__ __device__
+ Scalar_accessor& operator%=(U x) noexcept {
+ data[idx] %= x;
+ return *this;
+ }
+
+ template<
+ typename U = T,
+ typename std::enable_if<std::is_convertible<U, T>{} &&
+ std::is_integral<U>{}>::type* = nullptr>
+ __host__ __device__
+ Scalar_accessor& operator>>=(U x) noexcept {
+ data[idx] >>= x;
+ return *this;
+ }
+ template<
+ typename U = T,
+ typename std::enable_if<std::is_convertible<U, T>{} &&
+ std::is_integral<U>{}>::type* = nullptr>
+ __host__ __device__
+ Scalar_accessor& operator<<=(U x) noexcept {
+ data[idx] <<= x;
+ return *this;
+ }
+ template<
+ typename U = T,
+ typename std::enable_if<std::is_convertible<U, T>{} &&
+ std::is_integral<U>{}>::type* = nullptr>
+ __host__ __device__
+ Scalar_accessor& operator&=(U x) noexcept {
+ data[idx] &= x;
+ return *this;
+ }
+ template<
+ typename U = T,
+ typename std::enable_if<std::is_convertible<U, T>{} &&
+ std::is_integral<U>{}>::type* = nullptr>
+ __host__ __device__
+ Scalar_accessor& operator|=(U x) noexcept {
+ data[idx] |= x;
+ return *this;
+ }
+ template<
+ typename U = T,
+ typename std::enable_if<std::is_convertible<U, T>{} &&
+ std::is_integral<U>{}>::type* = nullptr>
+ __host__ __device__
+ Scalar_accessor& operator^=(U x) noexcept {
+ data[idx] ^= x;
+ return *this;
+ }
+ };
+
+ inline
+ constexpr
+ unsigned int next_pot(unsigned int x) {  // x == 1 would evaluate __builtin_clz(0), whose result is undefined
+ // Rounds x up to a power of two; x is returned unchanged if it already is one. Precondition: x > 1.
+ return 1u << (32u - __builtin_clz(x - 1u));  // 32 - clz(x - 1) is the bit length of x - 1 (assumes 32-bit unsigned int)
+ }
+ } // Namespace hip_impl.
+
+ template<typename T, unsigned int n> struct HIP_vector_base;
+
+ template<typename T>
+ struct HIP_vector_base<T, 1> {
+ using Native_vec_ = __NATIVE_VECTOR__(1, T);
+
+ union {
+ Native_vec_ data;
+#if __HIP_CLANG_ONLY__
+ struct {
+ T x;
+ };
+#else
+ hip_impl::Scalar_accessor<T, Native_vec_, 0> x;
+#endif
+ };
+
+ using value_type = T;
+
+ __host__ __device__
+ HIP_vector_base() = default;
+ __host__ __device__
+ explicit
+ constexpr
+ HIP_vector_base(T x) noexcept : data{x} {}
+ __host__ __device__
+ constexpr
+ HIP_vector_base(const HIP_vector_base&) = default;
+ __host__ __device__
+ constexpr
+ HIP_vector_base(HIP_vector_base&&) = default;
+ __host__ __device__
+ ~HIP_vector_base() = default;
+
+ __host__ __device__
+ HIP_vector_base& operator=(const HIP_vector_base& x) noexcept {
+ #if __has_attribute(ext_vector_type)
+ data = x.data;
+ #else
+ data[0] = x.data[0];
+ #endif
+
+ return *this;
+ }
+ };
+
+ template<typename T>
+ struct HIP_vector_base<T, 2> {
+ using Native_vec_ = __NATIVE_VECTOR__(2, T);
+
+ union
+ #if !__has_attribute(ext_vector_type)
+ alignas(hip_impl::next_pot(2 * sizeof(T)))
+ #endif
+ {
+ Native_vec_ data;
+#if __HIP_CLANG_ONLY__
+ struct {
+ T x;
+ T y;
+ };
+#else
+ hip_impl::Scalar_accessor<T, Native_vec_, 0> x;
+ hip_impl::Scalar_accessor<T, Native_vec_, 1> y;
+#endif
+ };
+
+ using value_type = T;
+
+ __host__ __device__
+ HIP_vector_base() = default;
+ __host__ __device__
+ explicit
+ constexpr
+ HIP_vector_base(T x) noexcept : data{x, x} {}
+ __host__ __device__
+ constexpr
+ HIP_vector_base(T x, T y) noexcept : data{x, y} {}
+ __host__ __device__
+ constexpr
+ HIP_vector_base(const HIP_vector_base&) = default;
+ __host__ __device__
+ constexpr
+ HIP_vector_base(HIP_vector_base&&) = default;
+ __host__ __device__
+ ~HIP_vector_base() = default;
+
+ __host__ __device__
+ HIP_vector_base& operator=(const HIP_vector_base& x) noexcept {
+ #if __has_attribute(ext_vector_type)
+ data = x.data;
+ #else
+ data[0] = x.data[0];
+ data[1] = x.data[1];
+ #endif
+
+ return *this;
+ }
+ };
+
+ template<typename T>
+ struct HIP_vector_base<T, 3> {
+ struct Native_vec_ {
+ T d[3];
+
+ __host__ __device__
+ Native_vec_() = default;
+
+ __host__ __device__
+ explicit
+ constexpr
+ Native_vec_(T x) noexcept : d{x, x, x} {}
+ __host__ __device__
+ constexpr
+ Native_vec_(T x, T y, T z) noexcept : d{x, y, z} {}
+ __host__ __device__
+ constexpr
+ Native_vec_(const Native_vec_&) = default;
+ __host__ __device__
+ constexpr
+ Native_vec_(Native_vec_&&) = default;
+ __host__ __device__
+ ~Native_vec_() = default;
+
+ __host__ __device__
+ Native_vec_& operator=(const Native_vec_&) = default;
+ __host__ __device__
+ Native_vec_& operator=(Native_vec_&&) = default;
+
+ __host__ __device__
+ T& operator[](unsigned int idx) noexcept { return d[idx]; }
+ __host__ __device__
+ T operator[](unsigned int idx) const noexcept { return d[idx]; }
+
+ __host__ __device__
+ Native_vec_& operator+=(const Native_vec_& x) noexcept
+ {
+ for (auto i = 0u; i != 3u; ++i) d[i] += x.d[i];
+ return *this;
+ }
+ __host__ __device__
+ Native_vec_& operator-=(const Native_vec_& x) noexcept
+ {
+ for (auto i = 0u; i != 3u; ++i) d[i] -= x.d[i];
+ return *this;
+ }
+
+ __host__ __device__
+ Native_vec_& operator*=(const Native_vec_& x) noexcept
+ {
+ for (auto i = 0u; i != 3u; ++i) d[i] *= x.d[i];
+ return *this;
+ }
+ __host__ __device__
+ Native_vec_& operator/=(const Native_vec_& x) noexcept
+ {
+ for (auto i = 0u; i != 3u; ++i) d[i] /= x.d[i];
+ return *this;
+ }
+
+ template<
+ typename U = T,
+ typename std::enable_if<std::is_signed<U>{}>::type* = nullptr>
+ __host__ __device__
+ Native_vec_ operator-() const noexcept
+ {
+ auto r{*this};
+ for (auto&& x : r.d) x = -x;
+ return r;
+ }
+
+ template<
+ typename U = T,
+ typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
+ __host__ __device__
+ Native_vec_ operator~() const noexcept
+ {
+ auto r{*this};
+ for (auto&& x : r.d) x = ~x;
+ return r;
+ }
+ template<
+ typename U = T,
+ typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
+ __host__ __device__
+ Native_vec_& operator%=(const Native_vec_& x) noexcept
+ {
+ for (auto i = 0u; i != 3u; ++i) d[i] %= x.d[i];
+ return *this;
+ }
+ template<
+ typename U = T,
+ typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
+ __host__ __device__
+ Native_vec_& operator^=(const Native_vec_& x) noexcept
+ {
+ for (auto i = 0u; i != 3u; ++i) d[i] ^= x.d[i];
+ return *this;
+ }
+ template<
+ typename U = T,
+ typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
+ __host__ __device__
+ Native_vec_& operator|=(const Native_vec_& x) noexcept
+ {
+ for (auto i = 0u; i != 3u; ++i) d[i] |= x.d[i];
+ return *this;
+ }
+ template<
+ typename U = T,
+ typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
+ __host__ __device__
+ Native_vec_& operator&=(const Native_vec_& x) noexcept
+ {
+ for (auto i = 0u; i != 3u; ++i) d[i] &= x.d[i];
+ return *this;
+ }
+ template<
+ typename U = T,
+ typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
+ __host__ __device__
+ Native_vec_& operator>>=(const Native_vec_& x) noexcept
+ {
+ for (auto i = 0u; i != 3u; ++i) d[i] >>= x.d[i];
+ return *this;
+ }
+ template<
+ typename U = T,
+ typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
+ __host__ __device__
+ Native_vec_& operator<<=(const Native_vec_& x) noexcept
+ {
+ for (auto i = 0u; i != 3u; ++i) d[i] <<= x.d[i];
+ return *this;
+ }
+
+ using Vec3_cmp = int __attribute__((vector_size(4 * sizeof(int))));
+ __host__ __device__
+ Vec3_cmp operator==(const Native_vec_& x) const noexcept
+ {
+ return Vec3_cmp{d[0] == x.d[0], d[1] == x.d[1], d[2] == x.d[2]};
+ }
+ };
+
+ union {
+ Native_vec_ data;
+ struct {
+ T x;
+ T y;
+ T z;
+ };
+ };
+
+ using value_type = T;
+
+ __host__ __device__
+ HIP_vector_base() = default;
+ __host__ __device__
+ explicit
+ constexpr
+ HIP_vector_base(T x) noexcept : data{x, x, x} {}
+ __host__ __device__
+ constexpr
+ HIP_vector_base(T x, T y, T z) noexcept : data{x, y, z} {}
+ __host__ __device__
+ constexpr
+ HIP_vector_base(const HIP_vector_base&) = default;
+ __host__ __device__
+ constexpr
+ HIP_vector_base(HIP_vector_base&&) = default;
+ __host__ __device__
+ ~HIP_vector_base() = default;
+
+ __host__ __device__
+ HIP_vector_base& operator=(const HIP_vector_base&) = default;
+ __host__ __device__
+ HIP_vector_base& operator=(HIP_vector_base&&) = default;
+ };
+
+ template<typename T>
+ struct HIP_vector_base<T, 4> {
+ using Native_vec_ = __NATIVE_VECTOR__(4, T);
+
+ union
+ #if !__has_attribute(ext_vector_type)
+ alignas(hip_impl::next_pot(4 * sizeof(T)))
+ #endif
+ {
+ Native_vec_ data;
+#if __HIP_CLANG_ONLY__
+ struct {
+ T x;
+ T y;
+ T z;
+ T w;
+ };
+#else
+ hip_impl::Scalar_accessor<T, Native_vec_, 0> x;
+ hip_impl::Scalar_accessor<T, Native_vec_, 1> y;
+ hip_impl::Scalar_accessor<T, Native_vec_, 2> z;
+ hip_impl::Scalar_accessor<T, Native_vec_, 3> w;
+#endif
+ };
+
+ using value_type = T;
+
+ __host__ __device__
+ HIP_vector_base() = default;
+ __host__ __device__
+ explicit
+ constexpr
+ HIP_vector_base(T x) noexcept : data{x, x, x, x} {}
+ __host__ __device__
+ constexpr
+ HIP_vector_base(T x, T y, T z, T w) noexcept : data{x, y, z, w} {}
+ __host__ __device__
+ constexpr
+ HIP_vector_base(const HIP_vector_base&) = default;
+ __host__ __device__
+ constexpr
+ HIP_vector_base(HIP_vector_base&&) = default;
+ __host__ __device__
+ ~HIP_vector_base() = default;
+
+ __host__ __device__
+ HIP_vector_base& operator=(const HIP_vector_base& x) noexcept {
+ #if __has_attribute(ext_vector_type)
+ data = x.data;
+ #else
+ data[0] = x.data[0];
+ data[1] = x.data[1];
+ data[2] = x.data[2];
+ data[3] = x.data[3];
+ #endif
+
+ return *this;
+ }
+ };
+
+ template<typename T, unsigned int rank>
+ struct HIP_vector_type : public HIP_vector_base<T, rank> {
+ using HIP_vector_base<T, rank>::data;
+ using typename HIP_vector_base<T, rank>::Native_vec_;
+
+ __host__ __device__
+ HIP_vector_type() = default;
+ template<
+ typename U,
+ typename std::enable_if<
+ std::is_convertible<U, T>{}>::type* = nullptr>
+ __host__ __device__
+ explicit
+ constexpr
+ HIP_vector_type(U x) noexcept
+ : HIP_vector_base<T, rank>{static_cast<T>(x)}
+ {}
+ template< // TODO: constrain based on type as well.
+ typename... Us,
+ typename std::enable_if<
+ (rank > 1) && sizeof...(Us) == rank>::type* = nullptr>
+ __host__ __device__
+ constexpr
+ HIP_vector_type(Us... xs) noexcept
+ : HIP_vector_base<T, rank>{static_cast<T>(xs)...}
+ {}
+ __host__ __device__
+ constexpr
+ HIP_vector_type(const HIP_vector_type&) = default;
+ __host__ __device__
+ constexpr
+ HIP_vector_type(HIP_vector_type&&) = default;
+ __host__ __device__
+ ~HIP_vector_type() = default;
+
+ __host__ __device__
+ HIP_vector_type& operator=(const HIP_vector_type&) = default;
+ __host__ __device__
+ HIP_vector_type& operator=(HIP_vector_type&&) = default;
+
+ // Operators
+ __host__ __device__
+ HIP_vector_type& operator++() noexcept
+ {
+ return *this += HIP_vector_type{1};
+ }
+ __host__ __device__
+ HIP_vector_type operator++(int) noexcept
+ {
+ auto tmp(*this);
+ ++*this;
+ return tmp;
+ }
+
+ __host__ __device__
+ HIP_vector_type& operator--() noexcept
+ {
+ return *this -= HIP_vector_type{1};
+ }
+ __host__ __device__
+ HIP_vector_type operator--(int) noexcept
+ {
+ auto tmp(*this);
+ --*this;
+ return tmp;
+ }
+
+ __host__ __device__
+ HIP_vector_type& operator+=(const HIP_vector_type& x) noexcept
+ {
+ data += x.data;
+ return *this;
+ }
+ template<
+ typename U,
+ typename std::enable_if<
+ std::is_convertible<U, T>{}>::type* = nullptr>
+ __host__ __device__
+ HIP_vector_type& operator+=(U x) noexcept
+ {
+ return *this += HIP_vector_type{x};
+ }
+
+ __host__ __device__
+ HIP_vector_type& operator-=(const HIP_vector_type& x) noexcept
+ {
+ data -= x.data;
+ return *this;
+ }
+ template<
+ typename U,
+ typename std::enable_if<
+ std::is_convertible<U, T>{}>::type* = nullptr>
+ __host__ __device__
+ HIP_vector_type& operator-=(U x) noexcept
+ {
+ return *this -= HIP_vector_type{x};
+ }
+
+ __host__ __device__
+ HIP_vector_type& operator*=(const HIP_vector_type& x) noexcept
+ {
+ data *= x.data;
+ return *this;
+ }
+ template<
+ typename U,
+ typename std::enable_if<
+ std::is_convertible<U, T>{}>::type* = nullptr>
+ __host__ __device__
+ HIP_vector_type& operator*=(U x) noexcept
+ {
+ return *this *= HIP_vector_type{x};
+ }
+
+ __host__ __device__
+ HIP_vector_type& operator/=(const HIP_vector_type& x) noexcept
+ {
+ data /= x.data;
+ return *this;
+ }
+ template<
+ typename U,
+ typename std::enable_if<
+ std::is_convertible<U, T>{}>::type* = nullptr>
+ __host__ __device__
+ HIP_vector_type& operator/=(U x) noexcept
+ {
+ return *this /= HIP_vector_type{x};
+ }
+
+ template<
+ typename U = T,
+ typename std::enable_if<std::is_signed<U>{}>::type* = nullptr>
+ __host__ __device__
+ HIP_vector_type operator-() const noexcept
+ {
+ auto tmp(*this);
+ tmp.data = -tmp.data;
+ return tmp;
+ }
+
+ template<
+ typename U = T,
+ typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
+ __host__ __device__
+ HIP_vector_type operator~() const noexcept
+ {
+ HIP_vector_type r{*this};
+ r.data = ~r.data;
+ return r;
+ }
+
+ template<
+ typename U = T,
+ typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
+ __host__ __device__
+ HIP_vector_type& operator%=(const HIP_vector_type& x) noexcept
+ {
+ data %= x.data;
+ return *this;
+ }
+
+ template<
+ typename U = T,
+ typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
+ __host__ __device__
+ HIP_vector_type& operator^=(const HIP_vector_type& x) noexcept
+ {
+ data ^= x.data;
+ return *this;
+ }
+
+ template<
+ typename U = T,
+ typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
+ __host__ __device__
+ HIP_vector_type& operator|=(const HIP_vector_type& x) noexcept
+ {
+ data |= x.data;
+ return *this;
+ }
+
+ template<
+ typename U = T,
+ typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
+ __host__ __device__
+ HIP_vector_type& operator&=(const HIP_vector_type& x) noexcept
+ {
+ data &= x.data;
+ return *this;
+ }
+
+ template<
+ typename U = T,
+ typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
+ __host__ __device__
+ HIP_vector_type& operator>>=(const HIP_vector_type& x) noexcept
+ {
+ data >>= x.data;
+ return *this;
+ }
+
+ template<
+ typename U = T,
+ typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
+ __host__ __device__
+ HIP_vector_type& operator<<=(const HIP_vector_type& x) noexcept
+ {
+ data <<= x.data;
+ return *this;
+ }
+ };
+
+ template<typename T, unsigned int n>
+ __host__ __device__
+ inline
+ constexpr
+ HIP_vector_type<T, n> operator+(
+ const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept
+ {
+ return HIP_vector_type<T, n>{x} += y;
+ }
+ template<typename T, unsigned int n, typename U>
+ __host__ __device__
+ inline
+ constexpr
+ HIP_vector_type<T, n> operator+(
+ const HIP_vector_type<T, n>& x, U y) noexcept
+ {
+ return HIP_vector_type<T, n>{x} += HIP_vector_type<T, n>{y};
+ }
+ template<typename T, unsigned int n, typename U>
+ __host__ __device__
+ inline
+ constexpr
+ HIP_vector_type<T, n> operator+(
+ U x, const HIP_vector_type<T, n>& y) noexcept
+ {
+ return HIP_vector_type<T, n>{x} += y;
+ }
+
+ template<typename T, unsigned int n>
+ __host__ __device__
+ inline
+ constexpr
+ HIP_vector_type<T, n> operator-(
+ const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept
+ {
+ return HIP_vector_type<T, n>{x} -= y;
+ }
+ template<typename T, unsigned int n, typename U>
+ __host__ __device__
+ inline
+ constexpr
+ HIP_vector_type<T, n> operator-(
+ const HIP_vector_type<T, n>& x, U y) noexcept
+ {
+ return HIP_vector_type<T, n>{x} -= HIP_vector_type<T, n>{y};
+ }
+ template<typename T, unsigned int n, typename U>
+ __host__ __device__
+ inline
+ constexpr
+ HIP_vector_type<T, n> operator-(
+ U x, const HIP_vector_type<T, n>& y) noexcept
+ {
+ return HIP_vector_type<T, n>{x} -= y;
+ }
+
+ template<typename T, unsigned int n>
+ __host__ __device__
+ inline
+ constexpr
+ HIP_vector_type<T, n> operator*(
+ const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept
+ {
+ return HIP_vector_type<T, n>{x} *= y;
+ }
+ template<typename T, unsigned int n, typename U>
+ __host__ __device__
+ inline
+ constexpr
+ HIP_vector_type<T, n> operator*(
+ const HIP_vector_type<T, n>& x, U y) noexcept
+ {
+ return HIP_vector_type<T, n>{x} *= HIP_vector_type<T, n>{y};
+ }
+ template<typename T, unsigned int n, typename U>
+ __host__ __device__
+ inline
+ constexpr
+ HIP_vector_type<T, n> operator*(
+ U x, const HIP_vector_type<T, n>& y) noexcept
+ {
+ return HIP_vector_type<T, n>{x} *= y;
+ }
+
+ template<typename T, unsigned int n>
+ __host__ __device__
+ inline
+ constexpr
+ HIP_vector_type<T, n> operator/(
+ const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept
+ {
+ return HIP_vector_type<T, n>{x} /= y;
+ }
+ template<typename T, unsigned int n, typename U>
+ __host__ __device__
+ inline
+ constexpr
+ HIP_vector_type<T, n> operator/(
+ const HIP_vector_type<T, n>& x, U y) noexcept
+ {
+ return HIP_vector_type<T, n>{x} /= HIP_vector_type<T, n>{y};
+ }
+ template<typename T, unsigned int n, typename U>
+ __host__ __device__
+ inline
+ constexpr
+ HIP_vector_type<T, n> operator/(
+ U x, const HIP_vector_type<T, n>& y) noexcept
+ {
+ return HIP_vector_type<T, n>{x} /= y;
+ }
+
+ template<typename V>  // NOTE: despite the name, yields true iff none of x[0..n] equals zero.
+ __host__ __device__
+ inline
+ constexpr
+ bool _hip_any_zero(const V& x, int n) noexcept  // n: highest index to inspect
+ {
+ return  // recursion walks n down to 0; reaching n == -1 means every inspected element was non-zero
+ (n == -1) ? true : ((x[n] == 0) ? false : _hip_any_zero(x, n - 1));
+ }
+
+ template<typename T, unsigned int n>  // Equality of two vectors of the same element type and rank.
+ __host__ __device__
+ inline
+ constexpr
+ bool operator==(
+ const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept
+ {
+ return _hip_any_zero(x.data == y.data, n - 1);  // true when no comparison result in [0, n-1] is zero; presumably `==` on data is element-wise -- confirm per Native_vec_
+ }
+ template<typename T, unsigned int n, typename U>
+ __host__ __device__
+ inline
+ constexpr
+ bool operator==(const HIP_vector_type<T, n>& x, U y) noexcept
+ {
+ return x == HIP_vector_type<T, n>{y};
+ }
+ template<typename T, unsigned int n, typename U>
+ __host__ __device__
+ inline
+ constexpr
+ bool operator==(U x, const HIP_vector_type<T, n>& y) noexcept
+ {
+ return HIP_vector_type<T, n>{x} == y;
+ }
+
+ template<typename T, unsigned int n>
+ __host__ __device__
+ inline
+ constexpr
+ bool operator!=(
+ const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept
+ {
+ return !(x == y);
+ }
+ template<typename T, unsigned int n, typename U>
+ __host__ __device__
+ inline
+ constexpr
+ bool operator!=(const HIP_vector_type<T, n>& x, U y) noexcept
+ {
+ return !(x == y);
+ }
+ template<typename T, unsigned int n, typename U>
+ __host__ __device__
+ inline
+ constexpr
+ bool operator!=(U x, const HIP_vector_type<T, n>& y) noexcept
+ {
+ return !(x == y);
+ }
+
+ template<  // vec % vec, element-wise; offered only when T is an integral type
+     typename T,
+     unsigned int n,
+     typename std::enable_if<std::is_integral<T>{}>::type* = nullptr>  // ::type is what makes this SFINAE
+ __host__ __device__
+ inline
+ constexpr
+ HIP_vector_type<T, n> operator%(
+     const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept
+ {
+     return HIP_vector_type<T, n>{x} %= y;
+ }
+ template<  // vec % scalar: y is first expanded to a vector whose elements all equal y
+     typename T,
+     unsigned int n,
+     typename U,
+     typename std::enable_if<std::is_integral<T>{}>::type* = nullptr>
+ __host__ __device__
+ inline
+ constexpr
+ HIP_vector_type<T, n> operator%(
+     const HIP_vector_type<T, n>& x, U y) noexcept
+ {
+     return HIP_vector_type<T, n>{x} %= HIP_vector_type<T, n>{y};
+ }
+ template<  // scalar % vec: x is first expanded to a vector whose elements all equal x
+     typename T,
+     unsigned int n,
+     typename U,
+     typename std::enable_if<std::is_integral<T>{}>::type* = nullptr>
+ __host__ __device__
+ inline
+ constexpr
+ HIP_vector_type<T, n> operator%(
+     U x, const HIP_vector_type<T, n>& y) noexcept
+ {
+     return HIP_vector_type<T, n>{x} %= y;
+ }
+
+ template<  // vec ^ vec, element-wise XOR; offered only when T is an integral type
+     typename T,
+     unsigned int n,
+     typename std::enable_if<std::is_integral<T>{}>::type* = nullptr>  // ::type is what makes this SFINAE
+ __host__ __device__
+ inline
+ constexpr
+ HIP_vector_type<T, n> operator^(
+     const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept
+ {
+     return HIP_vector_type<T, n>{x} ^= y;
+ }
+ template<  // vec ^ scalar: y is first expanded to a vector whose elements all equal y
+     typename T,
+     unsigned int n,
+     typename U,
+     typename std::enable_if<std::is_integral<T>{}>::type* = nullptr>
+ __host__ __device__
+ inline
+ constexpr
+ HIP_vector_type<T, n> operator^(
+     const HIP_vector_type<T, n>& x, U y) noexcept
+ {
+     return HIP_vector_type<T, n>{x} ^= HIP_vector_type<T, n>{y};
+ }
+ template<  // scalar ^ vec: x is first expanded to a vector whose elements all equal x
+     typename T,
+     unsigned int n,
+     typename U,
+     typename std::enable_if<std::is_integral<T>{}>::type* = nullptr>
+ __host__ __device__
+ inline
+ constexpr
+ HIP_vector_type<T, n> operator^(
+     U x, const HIP_vector_type<T, n>& y) noexcept
+ {
+     return HIP_vector_type<T, n>{x} ^= y;
+ }
+
+ template<  // vec | vec, element-wise OR; offered only when T is an integral type
+     typename T,
+     unsigned int n,
+     typename std::enable_if<std::is_integral<T>{}>::type* = nullptr>  // ::type is what makes this SFINAE
+ __host__ __device__
+ inline
+ constexpr
+ HIP_vector_type<T, n> operator|(
+     const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept
+ {
+     return HIP_vector_type<T, n>{x} |= y;
+ }
+ template<  // vec | scalar: y is first expanded to a vector whose elements all equal y
+     typename T,
+     unsigned int n,
+     typename U,
+     typename std::enable_if<std::is_integral<T>{}>::type* = nullptr>
+ __host__ __device__
+ inline
+ constexpr
+ HIP_vector_type<T, n> operator|(
+     const HIP_vector_type<T, n>& x, U y) noexcept
+ {
+     return HIP_vector_type<T, n>{x} |= HIP_vector_type<T, n>{y};
+ }
+ template<  // scalar | vec: x is first expanded to a vector whose elements all equal x
+     typename T,
+     unsigned int n,
+     typename U,
+     typename std::enable_if<std::is_integral<T>{}>::type* = nullptr>
+ __host__ __device__
+ inline
+ constexpr
+ HIP_vector_type<T, n> operator|(
+     U x, const HIP_vector_type<T, n>& y) noexcept
+ {
+     return HIP_vector_type<T, n>{x} |= y;
+ }
+
+ template<  // vec & vec, element-wise AND; offered only when T is an integral type
+     typename T,
+     unsigned int n,
+     typename std::enable_if<std::is_integral<T>{}>::type* = nullptr>  // ::type is what makes this SFINAE
+ __host__ __device__
+ inline
+ constexpr
+ HIP_vector_type<T, n> operator&(
+     const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept
+ {
+     return HIP_vector_type<T, n>{x} &= y;
+ }
+ template<  // vec & scalar: y is first expanded to a vector whose elements all equal y
+     typename T,
+     unsigned int n,
+     typename U,
+     typename std::enable_if<std::is_integral<T>{}>::type* = nullptr>
+ __host__ __device__
+ inline
+ constexpr
+ HIP_vector_type<T, n> operator&(
+     const HIP_vector_type<T, n>& x, U y) noexcept
+ {
+     return HIP_vector_type<T, n>{x} &= HIP_vector_type<T, n>{y};
+ }
+ template<  // scalar & vec: x is first expanded to a vector whose elements all equal x
+     typename T,
+     unsigned int n,
+     typename U,
+     typename std::enable_if<std::is_integral<T>{}>::type* = nullptr>
+ __host__ __device__
+ inline
+ constexpr
+ HIP_vector_type<T, n> operator&(
+     U x, const HIP_vector_type<T, n>& y) noexcept
+ {
+     return HIP_vector_type<T, n>{x} &= y;
+ }
+
+ template<  // vec >> vec: each element of x shifted right by the matching element of y; integral T only
+     typename T,
+     unsigned int n,
+     typename std::enable_if<std::is_integral<T>{}>::type* = nullptr>  // ::type is what makes this SFINAE
+ __host__ __device__
+ inline
+ constexpr
+ HIP_vector_type<T, n> operator>>(
+     const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept
+ {
+     return HIP_vector_type<T, n>{x} >>= y;
+ }
+ template<  // vec >> scalar: y is first expanded to a vector whose elements all equal y
+     typename T,
+     unsigned int n,
+     typename U,
+     typename std::enable_if<std::is_integral<T>{}>::type* = nullptr>
+ __host__ __device__
+ inline
+ constexpr
+ HIP_vector_type<T, n> operator>>(
+     const HIP_vector_type<T, n>& x, U y) noexcept
+ {
+     return HIP_vector_type<T, n>{x} >>= HIP_vector_type<T, n>{y};
+ }
+ template<  // scalar >> vec: x is first expanded to a vector whose elements all equal x
+     typename T,
+     unsigned int n,
+     typename U,
+     typename std::enable_if<std::is_integral<T>{}>::type* = nullptr>
+ __host__ __device__
+ inline
+ constexpr
+ HIP_vector_type<T, n> operator>>(
+     U x, const HIP_vector_type<T, n>& y) noexcept
+ {
+     return HIP_vector_type<T, n>{x} >>= y;
+ }
+
+ template<  // vec << vec: each element of x shifted left by the matching element of y; integral T only
+     typename T,
+     unsigned int n,
+     typename std::enable_if<std::is_integral<T>{}>::type* = nullptr>  // ::type is what makes this SFINAE
+ __host__ __device__
+ inline
+ constexpr
+ HIP_vector_type<T, n> operator<<(
+     const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept
+ {
+     return HIP_vector_type<T, n>{x} <<= y;
+ }
+ template<  // vec << scalar: y is first expanded to a vector whose elements all equal y
+     typename T,
+     unsigned int n,
+     typename U,
+     typename std::enable_if<std::is_integral<T>{}>::type* = nullptr>
+ __host__ __device__
+ inline
+ constexpr
+ HIP_vector_type<T, n> operator<<(
+     const HIP_vector_type<T, n>& x, U y) noexcept
+ {
+     return HIP_vector_type<T, n>{x} <<= HIP_vector_type<T, n>{y};
+ }
+ template<  // scalar << vec: x is first expanded to a vector whose elements all equal x
+     typename T,
+     unsigned int n,
+     typename U,
+     typename std::enable_if<std::is_arithmetic<U>::value>::type* = nullptr,  // keeps non-arithmetic left operands (e.g. stream objects) out
+     typename std::enable_if<std::is_integral<T>{}>::type* = nullptr>  // ::type is what makes this SFINAE
+ __host__ __device__
+ inline
+ constexpr
+ HIP_vector_type<T, n> operator<<(
+     U x, const HIP_vector_type<T, n>& y) noexcept
+ {
+     return HIP_vector_type<T, n>{x} <<= y;
+ }
+
+ #define __MAKE_VECTOR_TYPE__(CUDA_name, T) \
+ using CUDA_name##1 = HIP_vector_type<T, 1>;\
+ using CUDA_name##2 = HIP_vector_type<T, 2>;\
+ using CUDA_name##3 = HIP_vector_type<T, 3>;\
+ using CUDA_name##4 = HIP_vector_type<T, 4>;
+#else
+ #define __MAKE_VECTOR_TYPE__(CUDA_name, T) \
+ typedef struct {\
+ T x;\
+ } CUDA_name##1;\
+ typedef struct {\
+ T x;\
+ T y;\
+ } CUDA_name##2;\
+ typedef struct {\
+ T x;\
+ T y;\
+ T z;\
+ } CUDA_name##3;\
+ typedef struct {\
+ T x;\
+ T y;\
+ T z;\
+ T w;\
+ } CUDA_name##4;
+#endif
+
+__MAKE_VECTOR_TYPE__(uchar, unsigned char);
+__MAKE_VECTOR_TYPE__(char, char);
+__MAKE_VECTOR_TYPE__(ushort, unsigned short);
+__MAKE_VECTOR_TYPE__(short, short);
+__MAKE_VECTOR_TYPE__(uint, unsigned int);
+__MAKE_VECTOR_TYPE__(int, int);
+__MAKE_VECTOR_TYPE__(ulong, unsigned long);
+__MAKE_VECTOR_TYPE__(long, long);
+__MAKE_VECTOR_TYPE__(ulonglong, unsigned long long);
+__MAKE_VECTOR_TYPE__(longlong, long long);
+__MAKE_VECTOR_TYPE__(float, float);
+__MAKE_VECTOR_TYPE__(double, double);
+
+#ifdef __cplusplus  // C++: each macro defines make_<type>() that brace-initializes the result
+#define DECLOP_MAKE_ONE_COMPONENT(comp, type) \
+ static inline __device__ __host__ \
+ type make_##type(comp x) { type r{x}; return r; }
+
+#define DECLOP_MAKE_TWO_COMPONENT(comp, type) \
+ static inline __device__ __host__ \
+ type make_##type(comp x, comp y) { type r{x, y}; return r; }
+
+#define DECLOP_MAKE_THREE_COMPONENT(comp, type) \
+ static inline __device__ __host__ \
+ type make_##type(comp x, comp y, comp z) { type r{x, y, z}; return r; }
+
+#define DECLOP_MAKE_FOUR_COMPONENT(comp, type) \
+ static inline __device__ __host__ \
+ type make_##type(comp x, comp y, comp z, comp w) { \
+ type r{x, y, z, w}; \
+ return r; \
+ }
+#else  // C: same make_<type>() signatures, but members x/y/z/w are assigned one by one
+ #define DECLOP_MAKE_ONE_COMPONENT(comp, type) \
+ static inline __device__ __host__ \
+ type make_##type(comp x) { type r; r.x =x; return r; }
+
+ #define DECLOP_MAKE_TWO_COMPONENT(comp, type) \
+ static inline __device__ __host__ \
+ type make_##type(comp x, comp y) { type r; r.x=x; r.y=y; return r; }
+
+ #define DECLOP_MAKE_THREE_COMPONENT(comp, type) \
+ static inline __device__ __host__ \
+ type make_##type(comp x, comp y, comp z) { type r; r.x=x; r.y=y; r.z=z; return r; }
+
+ #define DECLOP_MAKE_FOUR_COMPONENT(comp, type) \
+ static inline __device__ __host__ \
+ type make_##type(comp x, comp y, comp z, comp w) { \
+ type r; r.x=x; r.y=y; r.z=z; r.w=w; \
+ return r; \
+ }
+#endif  // __cplusplus
+
+DECLOP_MAKE_ONE_COMPONENT(unsigned char, uchar1);
+DECLOP_MAKE_TWO_COMPONENT(unsigned char, uchar2);
+DECLOP_MAKE_THREE_COMPONENT(unsigned char, uchar3);
+DECLOP_MAKE_FOUR_COMPONENT(unsigned char, uchar4);
+
+DECLOP_MAKE_ONE_COMPONENT(signed char, char1);
+DECLOP_MAKE_TWO_COMPONENT(signed char, char2);
+DECLOP_MAKE_THREE_COMPONENT(signed char, char3);
+DECLOP_MAKE_FOUR_COMPONENT(signed char, char4);
+
+DECLOP_MAKE_ONE_COMPONENT(unsigned short, ushort1);
+DECLOP_MAKE_TWO_COMPONENT(unsigned short, ushort2);
+DECLOP_MAKE_THREE_COMPONENT(unsigned short, ushort3);
+DECLOP_MAKE_FOUR_COMPONENT(unsigned short, ushort4);
+
+DECLOP_MAKE_ONE_COMPONENT(signed short, short1);
+DECLOP_MAKE_TWO_COMPONENT(signed short, short2);
+DECLOP_MAKE_THREE_COMPONENT(signed short, short3);
+DECLOP_MAKE_FOUR_COMPONENT(signed short, short4);
+
+DECLOP_MAKE_ONE_COMPONENT(unsigned int, uint1);
+DECLOP_MAKE_TWO_COMPONENT(unsigned int, uint2);
+DECLOP_MAKE_THREE_COMPONENT(unsigned int, uint3);
+DECLOP_MAKE_FOUR_COMPONENT(unsigned int, uint4);
+
+DECLOP_MAKE_ONE_COMPONENT(signed int, int1);
+DECLOP_MAKE_TWO_COMPONENT(signed int, int2);
+DECLOP_MAKE_THREE_COMPONENT(signed int, int3);
+DECLOP_MAKE_FOUR_COMPONENT(signed int, int4);
+
+DECLOP_MAKE_ONE_COMPONENT(float, float1);
+DECLOP_MAKE_TWO_COMPONENT(float, float2);
+DECLOP_MAKE_THREE_COMPONENT(float, float3);
+DECLOP_MAKE_FOUR_COMPONENT(float, float4);
+
+DECLOP_MAKE_ONE_COMPONENT(double, double1);
+DECLOP_MAKE_TWO_COMPONENT(double, double2);
+DECLOP_MAKE_THREE_COMPONENT(double, double3);
+DECLOP_MAKE_FOUR_COMPONENT(double, double4);
+
+DECLOP_MAKE_ONE_COMPONENT(unsigned long, ulong1);
+DECLOP_MAKE_TWO_COMPONENT(unsigned long, ulong2);
+DECLOP_MAKE_THREE_COMPONENT(unsigned long, ulong3);
+DECLOP_MAKE_FOUR_COMPONENT(unsigned long, ulong4);
+
+DECLOP_MAKE_ONE_COMPONENT(signed long, long1);
+DECLOP_MAKE_TWO_COMPONENT(signed long, long2);
+DECLOP_MAKE_THREE_COMPONENT(signed long, long3);
+DECLOP_MAKE_FOUR_COMPONENT(signed long, long4);
+
+DECLOP_MAKE_ONE_COMPONENT(unsigned long long, ulonglong1);
+DECLOP_MAKE_TWO_COMPONENT(unsigned long long, ulonglong2);
+DECLOP_MAKE_THREE_COMPONENT(unsigned long long, ulonglong3);
+DECLOP_MAKE_FOUR_COMPONENT(unsigned long long, ulonglong4);
+
+DECLOP_MAKE_ONE_COMPONENT(signed long long, longlong1);
+DECLOP_MAKE_TWO_COMPONENT(signed long long, longlong2);
+DECLOP_MAKE_THREE_COMPONENT(signed long long, longlong3);
+DECLOP_MAKE_FOUR_COMPONENT(signed long long, longlong4);
+#else // !defined(__has_attribute)
+
+#if defined(_MSC_VER)
+#include <mmintrin.h>
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#include <immintrin.h>
+
+typedef union { char data; } char1;
+typedef union { char data[2]; } char2;
+typedef union { char data[4]; } char4;
+typedef union { char4 data; } char3;
+typedef union { __m64 data; } char8;
+typedef union { __m128i data; } char16;
+
+typedef union { unsigned char data; } uchar1;
+typedef union { unsigned char data[2]; } uchar2;
+typedef union { unsigned char data[4]; } uchar4;
+typedef union { uchar4 data; } uchar3;
+typedef union { __m64 data; } uchar8;
+typedef union { __m128i data; } uchar16;
+
+typedef union { short data; } short1;
+typedef union { short data[2]; } short2;
+typedef union { __m64 data; } short4;
+typedef union { short4 data; } short3;
+typedef union { __m128i data; } short8;
+typedef union { __m128i data[2]; } short16;
+
+typedef union { unsigned short data; } ushort1;
+typedef union { unsigned short data[2]; } ushort2;
+typedef union { __m64 data; } ushort4;
+typedef union { ushort4 data; } ushort3;
+typedef union { __m128i data; } ushort8;
+typedef union { __m128i data[2]; } ushort16;
+
+typedef union { int data; } int1;
+typedef union { __m64 data; } int2;
+typedef union { __m128i data; } int4;
+typedef union { int4 data; } int3;
+typedef union { __m128i data[2]; } int8;
+typedef union { __m128i data[4];} int16;
+
+typedef union { unsigned int data; } uint1;
+typedef union { __m64 data; } uint2;
+typedef union { __m128i data; } uint4;
+typedef union { uint4 data; } uint3;
+typedef union { __m128i data[2]; } uint8;
+typedef union { __m128i data[4]; } uint16;
+
+#if !defined(_WIN64)  // 32-bit MSVC: long/ulong vectors reuse the int/uint-sized layouts
+typedef union { int data; } long1;
+typedef union { __m64 data; } long2;
+typedef union { __m128i data; } long4;
+typedef union { long4 data; } long3;
+typedef union { __m128i data[2]; } long8;
+typedef union { __m128i data[4]; } long16;
+
+typedef union { unsigned int data; } ulong1;
+typedef union { __m64 data; } ulong2;
+typedef union { __m128i data; } ulong4;
+typedef union { ulong4 data; } ulong3;
+typedef union { __m128i data[2]; } ulong8;
+typedef union { __m128i data[4]; } ulong16;
+#else // defined(_WIN64) -- NOTE(review): MSVC `long` stays 32-bit on Win64 (LLP64); confirm the wider storage used below is intended
+typedef union { __m64 data; } long1;
+typedef union { __m128i data; } long2;
+typedef union { __m128i data[2]; } long4;
+typedef union { long4 data; } long3;
+typedef union { __m128i data[4]; } long8;
+typedef union { __m128i data[8]; } long16;
+
+typedef union { __m64 data; } ulong1;
+typedef union { __m128i data; } ulong2;
+typedef union { __m128i data[2]; } ulong4;
+typedef union { ulong4 data; } ulong3;
+typedef union { __m128i data[4]; } ulong8;
+typedef union { __m128i data[8]; } ulong16;
+#endif // defined(_WIN64)
+
+typedef union { __m64 data; } longlong1;
+typedef union { __m128i data; } longlong2;
+typedef union { __m128i data[2]; } longlong4;
+typedef union { longlong4 data; } longlong3;
+typedef union { __m128i data[4]; } longlong8;
+typedef union { __m128i data[8]; } longlong16;
+
+typedef union { __m64 data; } ulonglong1;
+typedef union { __m128i data; } ulonglong2;
+typedef union { __m128i data[2]; } ulonglong4;
+typedef union { ulonglong4 data; } ulonglong3;
+typedef union { __m128i data[4]; } ulonglong8;
+typedef union { __m128i data[8]; } ulonglong16;
+
+typedef union { float data; } float1;
+typedef union { __m64 data; } float2;
+typedef union { __m128 data; } float4;
+typedef union { float4 data; } float3;
+typedef union { __m256 data; } float8;
+typedef union { __m256 data[2]; } float16;
+
+typedef union { double data; } double1;
+typedef union { __m128d data; } double2;
+typedef union { __m256d data; } double4;
+typedef union { double4 data; } double3;
+typedef union { __m256d data[2]; } double8;
+typedef union { __m256d data[4]; } double16;
+
+#else // !defined(_MSC_VER)
+
+typedef union { char data; } char1;
+typedef union { char data[2]; } char2;
+typedef union { char data[4]; } char4;
+typedef union { char data[8]; } char8;
+typedef union { char data[16]; } char16;
+typedef union { char4 data; } char3;
+
+typedef union { unsigned char data; } uchar1;
+typedef union { unsigned char data[2]; } uchar2;
+typedef union { unsigned char data[4]; } uchar4;
+typedef union { unsigned char data[8]; } uchar8;
+typedef union { unsigned char data[16]; } uchar16;
+typedef union { uchar4 data; } uchar3;
+
+typedef union { short data; } short1;
+typedef union { short data[2]; } short2;
+typedef union { short data[4]; } short4;
+typedef union { short data[8]; } short8;
+typedef union { short data[16]; } short16;
+typedef union { short4 data; } short3;
+
+typedef union { unsigned short data; } ushort1;
+typedef union { unsigned short data[2]; } ushort2;
+typedef union { unsigned short data[4]; } ushort4;
+typedef union { unsigned short data[8]; } ushort8;
+typedef union { unsigned short data[16]; } ushort16;
+typedef union { ushort4 data; } ushort3;
+
+typedef union { int data; } int1;
+typedef union { int data[2]; } int2;
+typedef union { int data[4]; } int4;
+typedef union { int data[8]; } int8;
+typedef union { int data[16]; } int16;
+typedef union { int4 data; } int3;
+
+typedef union { unsigned int data; } uint1;
+typedef union { unsigned int data[2]; } uint2;
+typedef union { unsigned int data[4]; } uint4;
+typedef union { unsigned int data[8]; } uint8;
+typedef union { unsigned int data[16]; } uint16;
+typedef union { uint4 data; } uint3;
+
+typedef union { long data; } long1;
+typedef union { long data[2]; } long2;
+typedef union { long data[4]; } long4;
+typedef union { long data[8]; } long8;
+typedef union { long data[16]; } long16;
+typedef union { long4 data; } long3;
+
+typedef union { unsigned long data; } ulong1;
+typedef union { unsigned long data[2]; } ulong2;
+typedef union { unsigned long data[4]; } ulong4;
+typedef union { unsigned long data[8]; } ulong8;
+typedef union { unsigned long data[16]; } ulong16;
+typedef union { ulong4 data; } ulong3;
+
+typedef union { long long data; } longlong1;
+typedef union { long long data[2]; } longlong2;
+typedef union { long long data[4]; } longlong4;
+typedef union { long long data[8]; } longlong8;
+typedef union { long long data[16]; } longlong16;
+typedef union { longlong4 data; } longlong3;
+
+typedef union { unsigned long long data; } ulonglong1;
+typedef union { unsigned long long data[2]; } ulonglong2;
+typedef union { unsigned long long data[4]; } ulonglong4;
+typedef union { unsigned long long data[8]; } ulonglong8;
+typedef union { unsigned long long data[16]; } ulonglong16;
+typedef union { ulonglong4 data; } ulonglong3;
+
+typedef union { float data; } float1;
+typedef union { float data[2]; } float2;
+typedef union { float data[4]; } float4;
+typedef union { float data[8]; } float8;
+typedef union { float data[16]; } float16;
+typedef union { float4 data; } float3;
+
+typedef union { double data; } double1;
+typedef union { double data[2]; } double2;
+typedef union { double data[4]; } double4;
+typedef union { double data[8]; } double8;
+typedef union { double data[16]; } double16;
+typedef union { double4 data; } double3;
+
+#endif // defined(_MSC_VER)
+#endif // defined(__has_attribute)
+#endif
diff --git a/third_party/rocm/include/hip/hcc_detail/hiprtc.h b/third_party/rocm/include/hip/hcc_detail/hiprtc.h
new file mode 100644
index 0000000..fecea75
--- /dev/null
+++ b/third_party/rocm/include/hip/hcc_detail/hiprtc.h
@@ -0,0 +1,94 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#ifndef HIPRTC_H
+#define HIPRTC_H
+
+/* Included ahead of the linkage block so the system header is not wrapped in extern "C". */
+#include <stdlib.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+#if !defined(_WIN32)
+#pragma GCC visibility push (default)
+#endif
+
+typedef enum hiprtcResult {  /* result code of the hiprtc* calls; typedef'd so C can say `hiprtcResult` */
+ HIPRTC_SUCCESS = 0,
+ HIPRTC_ERROR_OUT_OF_MEMORY = 1,
+ HIPRTC_ERROR_PROGRAM_CREATION_FAILURE = 2,
+ HIPRTC_ERROR_INVALID_INPUT = 3,
+ HIPRTC_ERROR_INVALID_PROGRAM = 4,
+ HIPRTC_ERROR_INVALID_OPTION = 5,
+ HIPRTC_ERROR_COMPILATION = 6,
+ HIPRTC_ERROR_BUILTIN_OPERATION_FAILURE = 7,
+ HIPRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION = 8,
+ HIPRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION = 9,
+ HIPRTC_ERROR_NAME_EXPRESSION_NOT_VALID = 10,
+ HIPRTC_ERROR_INTERNAL_ERROR = 11
+} hiprtcResult;
+
+const char* hiprtcGetErrorString(hiprtcResult result);
+
+
+hiprtcResult hiprtcVersion(int* major, int* minor);
+
+typedef struct _hiprtcProgram* hiprtcProgram;
+
+hiprtcResult hiprtcAddNameExpression(hiprtcProgram prog,
+ const char* name_expression);
+
+hiprtcResult hiprtcCompileProgram(hiprtcProgram prog,
+ int numOptions,
+ const char** options);
+
+hiprtcResult hiprtcCreateProgram(hiprtcProgram* prog,
+ const char* src,
+ const char* name,
+ int numHeaders,
+ const char** headers,
+ const char** includeNames);
+
+hiprtcResult hiprtcDestroyProgram(hiprtcProgram* prog);
+
+hiprtcResult hiprtcGetLoweredName(hiprtcProgram prog,
+ const char* name_expression,
+ const char** lowered_name);
+
+hiprtcResult hiprtcGetProgramLog(hiprtcProgram prog, char* log);
+
+hiprtcResult hiprtcGetProgramLogSize(hiprtcProgram prog,
+ size_t* logSizeRet);
+
+hiprtcResult hiprtcGetCode(hiprtcProgram prog, char* code);
+
+hiprtcResult hiprtcGetCodeSize(hiprtcProgram prog, size_t* codeSizeRet);
+
+#if !defined(_WIN32)
+#pragma GCC visibility pop
+#endif
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#endif //HIPRTC_H
diff --git a/third_party/rocm/include/hip/hcc_detail/host_defines.h b/third_party/rocm/include/hip/hcc_detail/host_defines.h
new file mode 100644
index 0000000..72f3932
--- /dev/null
+++ b/third_party/rocm/include/hip/hcc_detail/host_defines.h
@@ -0,0 +1,97 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/**
+ * @file hcc_detail/host_defines.h
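+ * @brief Defines the HIP function and variable qualifier macros (__host__, __device__, __global__, __shared__, __constant__, __noinline__, __forceinline__) for each supported compiler.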
+ */
+
+#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HOST_DEFINES_H
+#define HIP_INCLUDE_HIP_HCC_DETAIL_HOST_DEFINES_H
+
+
+// Add guard to Generic Grid Launch method
+#ifndef GENERIC_GRID_LAUNCH
+#define GENERIC_GRID_LAUNCH 1
+#endif
+
+#ifdef __HCC__
+/**
+ * Function and kernel markers
+ */
+#define __host__ __attribute__((cpu))
+#define __device__ __attribute__((hc))
+
+#if GENERIC_GRID_LAUNCH == 0
+#define __global__ __attribute__((hc_grid_launch)) __attribute__((used))
+#else
+#if __hcc_workweek__ >= 17481
+#define __global__ __attribute__((annotate("__HIP_global_function__"), cpu, hc, used))
+#else
+#define __global__ __attribute__((hc, used))
+#endif
+#endif // GENERIC_GRID_LAUNCH
+
+#define __noinline__ __attribute__((noinline))
+#define __forceinline__ inline __attribute__((always_inline))
+
+
+/*
+ * Variable Type Qualifiers:
+ */
+// _restrict is supported by the compiler
+#define __shared__ tile_static
+#define __constant__ __attribute__((hc, annotate("__HIP_constant__")))
+
+#elif defined(__clang__) && defined(__HIP__)
+
+#if !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
+#define __host__ __attribute__((host))
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+#define __shared__ __attribute__((shared))
+#define __constant__ __attribute__((constant))
+#endif // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
+
+#define __noinline__ __attribute__((noinline))
+#define __forceinline__ inline __attribute__((always_inline))
+
+#else
+
+// Neither HCC nor clang in HIP mode: the qualifiers below expand to nothing (__forceinline__ to plain inline)
+/**
+ * Function and kernel markers
+ */
+#define __host__
+#define __device__
+
+#define __global__
+
+#define __noinline__
+#define __forceinline__ inline
+
+#define __shared__
+#define __constant__
+
+#endif  // __HCC__ / (__clang__ && __HIP__) / other compilers
+
+#endif  // HIP_INCLUDE_HIP_HCC_DETAIL_HOST_DEFINES_H
diff --git a/third_party/rocm/include/hip/hcc_detail/hsa_helpers.hpp b/third_party/rocm/include/hip/hcc_detail/hsa_helpers.hpp
new file mode 100644
index 0000000..af4f0c9
--- /dev/null
+++ b/third_party/rocm/include/hip/hcc_detail/hsa_helpers.hpp
@@ -0,0 +1,102 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#pragma once
+
+#include <hsa/hsa.h>
+
+#include <cstdint>
+#include <functional>
+#include <string>
+
+namespace hip_impl {  // value-returning wrappers over HSA queries; HSA status codes are not checked
+// Reads HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS for x; nullptr if the query writes nothing.
+inline void* address(hsa_executable_symbol_t x) {
+    void* r = nullptr;
+    hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &r);
+    return r;
+}
+
+// Reads HSA_EXECUTABLE_SYMBOL_INFO_AGENT for x; value-initialized if the query writes nothing.
+inline hsa_agent_t agent(hsa_executable_symbol_t x) {
+    hsa_agent_t r = {};
+    hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_AGENT, &r);
+    return r;
+}
+
+// Reads HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE for x; 0 if the query writes nothing.
+inline std::uint32_t group_size(hsa_executable_symbol_t x) {
+    std::uint32_t r = 0u;
+    hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &r);
+    return r;
+}
+
+// Returns the first ISA that hsa_agent_iterate_isas hands to the callback for agent x.
+// The callback stores it and returns HSA_STATUS_INFO_BREAK; r stays value-initialized otherwise.
+inline hsa_isa_t isa(hsa_agent_t x) {
+    hsa_isa_t r = {};
+    hsa_agent_iterate_isas(x,
+                           [](hsa_isa_t i, void* o) {
+                               *static_cast<hsa_isa_t*>(o) = i;  // Pick the first.
+                               return HSA_STATUS_INFO_BREAK;
+                           },
+                           &r);
+    return r;
+}
+
+// Reads HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT for x; 0 if the query writes nothing.
+inline std::uint64_t kernel_object(hsa_executable_symbol_t x) {
+    std::uint64_t r = 0u;
+    hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &r);
+    return r;
+}
+
+// Returns the symbol name, sized by the NAME_LENGTH query; empty when that length is zero.
+inline std::string name(hsa_executable_symbol_t x) {
+    std::uint32_t sz = 0u;
+    hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &sz);
+    std::string r(sz, '\0');
+    // front() on an empty string is undefined, so the name is only fetched when sz > 0.
+    if (sz != 0u) hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_NAME, &r.front());
+    return r;
+}
+
+// Reads HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE for x; 0 if the query writes nothing.
+inline std::uint32_t private_size(hsa_executable_symbol_t x) {
+    std::uint32_t r = 0u;
+    hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &r);
+    return r;
+}
+
+// Reads HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE for x; 0 if the query writes nothing.
+inline std::uint32_t size(hsa_executable_symbol_t x) {
+    std::uint32_t r = 0;
+    hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, &r);
+    return r;
+}
+
+// Reads HSA_EXECUTABLE_SYMBOL_INFO_TYPE for x; value-initialized if the query writes nothing.
+inline hsa_symbol_kind_t type(hsa_executable_symbol_t x) {
+    hsa_symbol_kind_t r = {};
+    hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &r);
+    return r;
+}
+} // namespace hip_impl
\ No newline at end of file
diff --git a/third_party/rocm/include/hip/hcc_detail/library_types.h b/third_party/rocm/include/hip/hcc_detail/library_types.h
new file mode 100644
index 0000000..6fcd0dc
--- /dev/null
+++ b/third_party/rocm/include/hip/hcc_detail/library_types.h
@@ -0,0 +1,41 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_LIBRARY_TYPES_H
+#define HIP_INCLUDE_HIP_HCC_DETAIL_LIBRARY_TYPES_H
+
+typedef enum hipDataType {
+ HIP_R_16F = 2,
+ HIP_R_32F = 0,
+ HIP_R_64F = 1,
+ HIP_C_16F = 6,
+ HIP_C_32F = 4,
+ HIP_C_64F = 5
+} hipDataType;
+
+typedef enum hipLibraryPropertyType {
+ HIP_LIBRARY_MAJOR_VERSION,
+ HIP_LIBRARY_MINOR_VERSION,
+ HIP_LIBRARY_PATCH_LEVEL
+} hipLibraryPropertyType;
+
+#endif
diff --git a/third_party/rocm/include/hip/hcc_detail/llvm_intrinsics.h b/third_party/rocm/include/hip/hcc_detail/llvm_intrinsics.h
new file mode 100644
index 0000000..330b3d9
--- /dev/null
+++ b/third_party/rocm/include/hip/hcc_detail/llvm_intrinsics.h
@@ -0,0 +1,41 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/**
+ * @file hcc_detail/llvm_intrinsics.h
+ * @brief Contains declarations for wrapper functions for llvm intrinsics
+ * like llvm.amdgcn.s.barrier.
+ */
+
+#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_LLVM_INTRINSICS_H
+#define HIP_INCLUDE_HIP_HCC_DETAIL_LLVM_INTRINSICS_H
+
+#include "hip/hcc_detail/host_defines.h"
+
+// FIXME: These should all be removed and proper builtins used.
+__device__
+unsigned __llvm_amdgcn_groupstaticsize() __asm("llvm.amdgcn.groupstaticsize");
+
+__device__
+int __llvm_amdgcn_ds_swizzle(int index, int pattern) __asm("llvm.amdgcn.ds.swizzle");
+
+#endif
diff --git a/third_party/rocm/include/hip/hcc_detail/macro_based_grid_launch.hpp b/third_party/rocm/include/hip/hcc_detail/macro_based_grid_launch.hpp
new file mode 100644
index 0000000..96d449b
--- /dev/null
+++ b/third_party/rocm/include/hip/hcc_detail/macro_based_grid_launch.hpp
@@ -0,0 +1,798 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#include "concepts.hpp"
+#include "helpers.hpp"
+
+#include "hc.hpp"
+#include "hip/hip_ext.h"
+#include "hip_runtime.h"
+
+#include <functional>
+#include <iostream>
+#include <stdexcept>
+#include <type_traits>
+#include <utility>
+
+namespace hip_impl {
+namespace {
+struct New_grid_launch_tag {};
+struct Old_grid_launch_tag {};
+
+// Calls `ctor` when built and `dtor` exactly once, from whichever guard still owns it at destruction.
+template <typename C, typename D>
+class RAII_guard {
+    D dtor_;
+    bool armed_ = false;  // true only on the instance that still owes the dtor_ call
+
+   public:
+    RAII_guard() = default;  // unarmed: the destructor does nothing
+    RAII_guard(const C& ctor, D dtor) : dtor_{std::move(dtor)}, armed_{true} { ctor(); }
+    // Move-only: a copy would make dtor_ run once per copy (e.g. when a by-value return is not elided).
+    RAII_guard(const RAII_guard&) = delete;
+    RAII_guard& operator=(const RAII_guard&) = delete;
+    RAII_guard(RAII_guard&& o) : dtor_{std::move(o.dtor_)}, armed_{o.armed_} { o.armed_ = false; }
+    // Swapping hands this guard's previous obligation to `o`, which settles it when `o` is destroyed.
+    RAII_guard& operator=(RAII_guard&& o) { std::swap(dtor_, o.dtor_); std::swap(armed_, o.armed_); return *this; }
+    ~RAII_guard() { if (armed_) dtor_(); }
+};
+
+template <typename C, typename D>
+RAII_guard<C, D> make_RAII_guard(const C& ctor, D dtor) {
+ return RAII_guard<C, D>{ctor, std::move(dtor)};
+}
+
+template <FunctionalProcedure F, typename... Ts>
+using is_new_grid_launch_t = typename std::conditional<is_callable<F(Ts...)>{}, New_grid_launch_tag,
+ Old_grid_launch_tag>::type;
+} // namespace
+
+// TODO: - dispatch rank should be derived from the domain dimensions passed
+// in, and not always assumed to be 3;
+
+template <FunctionalProcedure K, typename... Ts>
+requires(Domain<K> ==
+ {Ts...}) inline void grid_launch_hip_impl_(New_grid_launch_tag, dim3 num_blocks,
+ dim3 dim_blocks, int group_mem_bytes,
+ const hc::accelerator_view& acc_v, K k) {
+ const auto d =
+ hc::extent<3>{num_blocks.z * dim_blocks.z, num_blocks.y * dim_blocks.y,
+ num_blocks.x * dim_blocks.x}
+ .tile_with_dynamic(dim_blocks.z, dim_blocks.y, dim_blocks.x, group_mem_bytes);
+
+ try {
+ hc::parallel_for_each(acc_v, d, k);
+ } catch (std::exception& ex) {
+ std::cerr << "Failed in " << __func__ << ", with exception: " << ex.what() << std::endl;
+ hip_throw(ex);
+ }
+}
+
+// TODO: these are workarounds, they should be removed.
+
+hc::accelerator_view lock_stream_hip_(hipStream_t&, void*&);
+void print_prelaunch_trace_(const char*, dim3, dim3, int, hipStream_t);
+void unlock_stream_hip_(hipStream_t, void*, const char*, hc::accelerator_view*);
+
+template <FunctionalProcedure K, typename... Ts>
+requires(Domain<K> == {Ts...}) inline void grid_launch_hip_impl_(New_grid_launch_tag,
+ dim3 num_blocks, dim3 dim_blocks,
+ int group_mem_bytes,
+ hipStream_t stream,
+ const char* kernel_name, K k) {
+ void* lck_stream = nullptr;
+ auto acc_v = lock_stream_hip_(stream, lck_stream);
+ auto stream_guard =
+ make_RAII_guard(std::bind(print_prelaunch_trace_, kernel_name, num_blocks, dim_blocks,
+ group_mem_bytes, stream),
+ std::bind(unlock_stream_hip_, stream, lck_stream, kernel_name, &acc_v));
+
+ try {
+ grid_launch_hip_impl_(New_grid_launch_tag{}, std::move(num_blocks), std::move(dim_blocks),
+ group_mem_bytes, acc_v, std::move(k));
+ } catch (std::exception& ex) {
+ std::cerr << "Failed in " << __func__ << ", with exception: " << ex.what() << std::endl;
+ hip_throw(ex);
+ }
+}
+
+template <FunctionalProcedure K, typename... Ts>
+requires(Domain<K> ==
+ {hipLaunchParm, Ts...}) inline void grid_launch_hip_impl_(Old_grid_launch_tag,
+ dim3 num_blocks, dim3 dim_blocks,
+ int group_mem_bytes,
+ hipStream_t stream, K k) {
+ grid_launch_hip_impl_(New_grid_launch_tag{}, std::move(num_blocks), std::move(dim_blocks),
+ group_mem_bytes, std::move(stream), std::move(k));
+}
+
+template <FunctionalProcedure K, typename... Ts>
+requires(Domain<K> == {hipLaunchParm, Ts...}) inline void grid_launch_hip_impl_(
+ Old_grid_launch_tag, dim3 num_blocks, dim3 dim_blocks, int group_mem_bytes, hipStream_t stream,
+ const char* kernel_name, K k) {
+ grid_launch_hip_impl_(New_grid_launch_tag{}, std::move(num_blocks), std::move(dim_blocks),
+ group_mem_bytes, std::move(stream), kernel_name, std::move(k));
+}
+
+template <FunctionalProcedure K, typename... Ts>
+requires(Domain<K> == {Ts...}) inline std::enable_if_t<
+ !std::is_function<K>::value> grid_launch_hip_(dim3 num_blocks, dim3 dim_blocks,
+ int group_mem_bytes, hipStream_t stream,
+ const char* kernel_name, K k) {
+ grid_launch_hip_impl_(is_new_grid_launch_t<K, Ts...>{}, std::move(num_blocks),
+ std::move(dim_blocks), group_mem_bytes, std::move(stream), kernel_name,
+ std::move(k));
+}
+
+template <FunctionalProcedure K, typename... Ts>
+requires(Domain<K> == {Ts...}) inline std::enable_if_t<
+ !std::is_function<K>::value> grid_launch_hip_(dim3 num_blocks, dim3 dim_blocks,
+ int group_mem_bytes, hipStream_t stream, K k) {
+ grid_launch_hip_impl_(is_new_grid_launch_t<K, Ts...>{}, std::move(num_blocks),
+ std::move(dim_blocks), group_mem_bytes, std::move(stream), std::move(k));
+}
+
+// TODO: these are temporary and purposefully noisy and disruptive.
+#define make_kernel_name_hip(k, n) \
+ HIP_kernel_functor_name_begin##_##k##_##HIP_kernel_functor_name_end##_##n
+
+#define make_kernel_functor_hip_30(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
+ p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, \
+ p22, p23, p24, p25, p26, p27) \
+ struct make_kernel_name_hip(function_name, 28) { \
+ std::decay_t<decltype(p0)> _p0_; \
+ std::decay_t<decltype(p1)> _p1_; \
+ std::decay_t<decltype(p2)> _p2_; \
+ std::decay_t<decltype(p3)> _p3_; \
+ std::decay_t<decltype(p4)> _p4_; \
+ std::decay_t<decltype(p5)> _p5_; \
+ std::decay_t<decltype(p6)> _p6_; \
+ std::decay_t<decltype(p7)> _p7_; \
+ std::decay_t<decltype(p8)> _p8_; \
+ std::decay_t<decltype(p9)> _p9_; \
+ std::decay_t<decltype(p10)> _p10_; \
+ std::decay_t<decltype(p11)> _p11_; \
+ std::decay_t<decltype(p12)> _p12_; \
+ std::decay_t<decltype(p13)> _p13_; \
+ std::decay_t<decltype(p14)> _p14_; \
+ std::decay_t<decltype(p15)> _p15_; \
+ std::decay_t<decltype(p16)> _p16_; \
+ std::decay_t<decltype(p17)> _p17_; \
+ std::decay_t<decltype(p18)> _p18_; \
+ std::decay_t<decltype(p19)> _p19_; \
+ std::decay_t<decltype(p20)> _p20_; \
+ std::decay_t<decltype(p21)> _p21_; \
+ std::decay_t<decltype(p22)> _p22_; \
+ std::decay_t<decltype(p23)> _p23_; \
+ std::decay_t<decltype(p24)> _p24_; \
+ std::decay_t<decltype(p25)> _p25_; \
+ std::decay_t<decltype(p26)> _p26_; \
+ std::decay_t<decltype(p27)> _p27_; \
+ void operator()(const hc::tiled_index<3>&) const [[hc]] { \
+ kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_, \
+ _p12_, _p13_, _p14_, _p15_, _p16_, _p17_, _p18_, _p19_, _p20_, _p21_, \
+ _p22_, _p23_, _p24_, _p25_, _p26_, _p27_); \
+ } \
+ }
+#define make_kernel_functor_hip_29(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
+ p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, \
+ p22, p23, p24, p25, p26) \
+ struct make_kernel_name_hip(function_name, 27) { \
+ std::decay_t<decltype(p0)> _p0_; \
+ std::decay_t<decltype(p1)> _p1_; \
+ std::decay_t<decltype(p2)> _p2_; \
+ std::decay_t<decltype(p3)> _p3_; \
+ std::decay_t<decltype(p4)> _p4_; \
+ std::decay_t<decltype(p5)> _p5_; \
+ std::decay_t<decltype(p6)> _p6_; \
+ std::decay_t<decltype(p7)> _p7_; \
+ std::decay_t<decltype(p8)> _p8_; \
+ std::decay_t<decltype(p9)> _p9_; \
+ std::decay_t<decltype(p10)> _p10_; \
+ std::decay_t<decltype(p11)> _p11_; \
+ std::decay_t<decltype(p12)> _p12_; \
+ std::decay_t<decltype(p13)> _p13_; \
+ std::decay_t<decltype(p14)> _p14_; \
+ std::decay_t<decltype(p15)> _p15_; \
+ std::decay_t<decltype(p16)> _p16_; \
+ std::decay_t<decltype(p17)> _p17_; \
+ std::decay_t<decltype(p18)> _p18_; \
+ std::decay_t<decltype(p19)> _p19_; \
+ std::decay_t<decltype(p20)> _p20_; \
+ std::decay_t<decltype(p21)> _p21_; \
+ std::decay_t<decltype(p22)> _p22_; \
+ std::decay_t<decltype(p23)> _p23_; \
+ std::decay_t<decltype(p24)> _p24_; \
+ std::decay_t<decltype(p25)> _p25_; \
+ std::decay_t<decltype(p26)> _p26_; \
+ void operator()(const hc::tiled_index<3>&) const [[hc]] { \
+ kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_, \
+ _p12_, _p13_, _p14_, _p15_, _p16_, _p17_, _p18_, _p19_, _p20_, _p21_, \
+ _p22_, _p23_, _p24_, _p25_, _p26_); \
+ } \
+ }
+#define make_kernel_functor_hip_28(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
+ p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, \
+ p22, p23, p24, p25) \
+ struct make_kernel_name_hip(function_name, 26) { \
+ std::decay_t<decltype(p0)> _p0_; \
+ std::decay_t<decltype(p1)> _p1_; \
+ std::decay_t<decltype(p2)> _p2_; \
+ std::decay_t<decltype(p3)> _p3_; \
+ std::decay_t<decltype(p4)> _p4_; \
+ std::decay_t<decltype(p5)> _p5_; \
+ std::decay_t<decltype(p6)> _p6_; \
+ std::decay_t<decltype(p7)> _p7_; \
+ std::decay_t<decltype(p8)> _p8_; \
+ std::decay_t<decltype(p9)> _p9_; \
+ std::decay_t<decltype(p10)> _p10_; \
+ std::decay_t<decltype(p11)> _p11_; \
+ std::decay_t<decltype(p12)> _p12_; \
+ std::decay_t<decltype(p13)> _p13_; \
+ std::decay_t<decltype(p14)> _p14_; \
+ std::decay_t<decltype(p15)> _p15_; \
+ std::decay_t<decltype(p16)> _p16_; \
+ std::decay_t<decltype(p17)> _p17_; \
+ std::decay_t<decltype(p18)> _p18_; \
+ std::decay_t<decltype(p19)> _p19_; \
+ std::decay_t<decltype(p20)> _p20_; \
+ std::decay_t<decltype(p21)> _p21_; \
+ std::decay_t<decltype(p22)> _p22_; \
+ std::decay_t<decltype(p23)> _p23_; \
+ std::decay_t<decltype(p24)> _p24_; \
+ std::decay_t<decltype(p25)> _p25_; \
+ void operator()(const hc::tiled_index<3>&) const [[hc]] { \
+ kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_, \
+ _p12_, _p13_, _p14_, _p15_, _p16_, _p17_, _p18_, _p19_, _p20_, _p21_, \
+ _p22_, _p23_, _p24_, _p25_); \
+ } \
+ }
+#define make_kernel_functor_hip_27(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
+ p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, \
+ p22, p23, p24) \
+ struct make_kernel_name_hip(function_name, 25) { \
+ std::decay_t<decltype(p0)> _p0_; \
+ std::decay_t<decltype(p1)> _p1_; \
+ std::decay_t<decltype(p2)> _p2_; \
+ std::decay_t<decltype(p3)> _p3_; \
+ std::decay_t<decltype(p4)> _p4_; \
+ std::decay_t<decltype(p5)> _p5_; \
+ std::decay_t<decltype(p6)> _p6_; \
+ std::decay_t<decltype(p7)> _p7_; \
+ std::decay_t<decltype(p8)> _p8_; \
+ std::decay_t<decltype(p9)> _p9_; \
+ std::decay_t<decltype(p10)> _p10_; \
+ std::decay_t<decltype(p11)> _p11_; \
+ std::decay_t<decltype(p12)> _p12_; \
+ std::decay_t<decltype(p13)> _p13_; \
+ std::decay_t<decltype(p14)> _p14_; \
+ std::decay_t<decltype(p15)> _p15_; \
+ std::decay_t<decltype(p16)> _p16_; \
+ std::decay_t<decltype(p17)> _p17_; \
+ std::decay_t<decltype(p18)> _p18_; \
+ std::decay_t<decltype(p19)> _p19_; \
+ std::decay_t<decltype(p20)> _p20_; \
+ std::decay_t<decltype(p21)> _p21_; \
+ std::decay_t<decltype(p22)> _p22_; \
+ std::decay_t<decltype(p23)> _p23_; \
+ std::decay_t<decltype(p24)> _p24_; \
+ void operator()(const hc::tiled_index<3>&) const [[hc]] { \
+ kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_, \
+ _p12_, _p13_, _p14_, _p15_, _p16_, _p17_, _p18_, _p19_, _p20_, _p21_, \
+ _p22_, _p23_, _p24_); \
+ } \
+ }
+#define make_kernel_functor_hip_26(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
+ p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, \
+ p22, p23) \
+ struct make_kernel_name_hip(function_name, 24) { \
+ std::decay_t<decltype(p0)> _p0_; \
+ std::decay_t<decltype(p1)> _p1_; \
+ std::decay_t<decltype(p2)> _p2_; \
+ std::decay_t<decltype(p3)> _p3_; \
+ std::decay_t<decltype(p4)> _p4_; \
+ std::decay_t<decltype(p5)> _p5_; \
+ std::decay_t<decltype(p6)> _p6_; \
+ std::decay_t<decltype(p7)> _p7_; \
+ std::decay_t<decltype(p8)> _p8_; \
+ std::decay_t<decltype(p9)> _p9_; \
+ std::decay_t<decltype(p10)> _p10_; \
+ std::decay_t<decltype(p11)> _p11_; \
+ std::decay_t<decltype(p12)> _p12_; \
+ std::decay_t<decltype(p13)> _p13_; \
+ std::decay_t<decltype(p14)> _p14_; \
+ std::decay_t<decltype(p15)> _p15_; \
+ std::decay_t<decltype(p16)> _p16_; \
+ std::decay_t<decltype(p17)> _p17_; \
+ std::decay_t<decltype(p18)> _p18_; \
+ std::decay_t<decltype(p19)> _p19_; \
+ std::decay_t<decltype(p20)> _p20_; \
+ std::decay_t<decltype(p21)> _p21_; \
+ std::decay_t<decltype(p22)> _p22_; \
+ std::decay_t<decltype(p23)> _p23_; \
+ void operator()(const hc::tiled_index<3>&) const [[hc]] { \
+ kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_, \
+ _p12_, _p13_, _p14_, _p15_, _p16_, _p17_, _p18_, _p19_, _p20_, _p21_, \
+ _p22_, _p23_); \
+ } \
+ }
+#define make_kernel_functor_hip_25(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
+ p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, \
+ p22) \
+ struct make_kernel_name_hip(function_name, 23) { /* holds one decayed copy of each of the 23 kernel arguments */ \
+ std::decay_t<decltype(p0)> _p0_; \
+ std::decay_t<decltype(p1)> _p1_; \
+ std::decay_t<decltype(p2)> _p2_; \
+ std::decay_t<decltype(p3)> _p3_; \
+ std::decay_t<decltype(p4)> _p4_; \
+ std::decay_t<decltype(p5)> _p5_; \
+ std::decay_t<decltype(p6)> _p6_; \
+ std::decay_t<decltype(p7)> _p7_; \
+ std::decay_t<decltype(p8)> _p8_; \
+ std::decay_t<decltype(p9)> _p9_; \
+ std::decay_t<decltype(p10)> _p10_; \
+ std::decay_t<decltype(p11)> _p11_; \
+ std::decay_t<decltype(p12)> _p12_; \
+ std::decay_t<decltype(p13)> _p13_; \
+ std::decay_t<decltype(p14)> _p14_; \
+ std::decay_t<decltype(p15)> _p15_; \
+ std::decay_t<decltype(p16)> _p16_; \
+ std::decay_t<decltype(p17)> _p17_; \
+ std::decay_t<decltype(p18)> _p18_; \
+ std::decay_t<decltype(p19)> _p19_; \
+ std::decay_t<decltype(p20)> _p20_; \
+ std::decay_t<decltype(p21)> _p21_; \
+ std::decay_t<decltype(p22)> _p22_; \
+ __attribute__((used, flatten)) void operator()(const hc::tiled_index<3>&) const [[hc]] { /* NOTE(review): no other make_kernel_functor_hip_N carries used/flatten - confirm this is intentional */ \
+ kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_, \
+ _p12_, _p13_, _p14_, _p15_, _p16_, _p17_, _p18_, _p19_, _p20_, _p21_, \
+ _p22_); \
+ } \
+ }
+#define make_kernel_functor_hip_24(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
+ p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21) \
+ struct make_kernel_name_hip(function_name, 22) { \
+ std::decay_t<decltype(p0)> _p0_; \
+ std::decay_t<decltype(p1)> _p1_; \
+ std::decay_t<decltype(p2)> _p2_; \
+ std::decay_t<decltype(p3)> _p3_; \
+ std::decay_t<decltype(p4)> _p4_; \
+ std::decay_t<decltype(p5)> _p5_; \
+ std::decay_t<decltype(p6)> _p6_; \
+ std::decay_t<decltype(p7)> _p7_; \
+ std::decay_t<decltype(p8)> _p8_; \
+ std::decay_t<decltype(p9)> _p9_; \
+ std::decay_t<decltype(p10)> _p10_; \
+ std::decay_t<decltype(p11)> _p11_; \
+ std::decay_t<decltype(p12)> _p12_; \
+ std::decay_t<decltype(p13)> _p13_; \
+ std::decay_t<decltype(p14)> _p14_; \
+ std::decay_t<decltype(p15)> _p15_; \
+ std::decay_t<decltype(p16)> _p16_; \
+ std::decay_t<decltype(p17)> _p17_; \
+ std::decay_t<decltype(p18)> _p18_; \
+ std::decay_t<decltype(p19)> _p19_; \
+ std::decay_t<decltype(p20)> _p20_; \
+ std::decay_t<decltype(p21)> _p21_; \
+ void operator()(const hc::tiled_index<3>&) const [[hc]] { \
+ kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_, \
+ _p12_, _p13_, _p14_, _p15_, _p16_, _p17_, _p18_, _p19_, _p20_, _p21_); \
+ } \
+ }
+#define make_kernel_functor_hip_23(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
+ p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20) \
+ struct make_kernel_name_hip(function_name, 21) { \
+ std::decay_t<decltype(p0)> _p0_; \
+ std::decay_t<decltype(p1)> _p1_; \
+ std::decay_t<decltype(p2)> _p2_; \
+ std::decay_t<decltype(p3)> _p3_; \
+ std::decay_t<decltype(p4)> _p4_; \
+ std::decay_t<decltype(p5)> _p5_; \
+ std::decay_t<decltype(p6)> _p6_; \
+ std::decay_t<decltype(p7)> _p7_; \
+ std::decay_t<decltype(p8)> _p8_; \
+ std::decay_t<decltype(p9)> _p9_; \
+ std::decay_t<decltype(p10)> _p10_; \
+ std::decay_t<decltype(p11)> _p11_; \
+ std::decay_t<decltype(p12)> _p12_; \
+ std::decay_t<decltype(p13)> _p13_; \
+ std::decay_t<decltype(p14)> _p14_; \
+ std::decay_t<decltype(p15)> _p15_; \
+ std::decay_t<decltype(p16)> _p16_; \
+ std::decay_t<decltype(p17)> _p17_; \
+ std::decay_t<decltype(p18)> _p18_; \
+ std::decay_t<decltype(p19)> _p19_; \
+ std::decay_t<decltype(p20)> _p20_; \
+ void operator()(const hc::tiled_index<3>&) const [[hc]] { \
+ kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_, \
+ _p12_, _p13_, _p14_, _p15_, _p16_, _p17_, _p18_, _p19_, _p20_); \
+ } \
+ }
+#define make_kernel_functor_hip_22(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
+ p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19) \
+ struct make_kernel_name_hip(function_name, 20) { \
+ std::decay_t<decltype(p0)> _p0_; \
+ std::decay_t<decltype(p1)> _p1_; \
+ std::decay_t<decltype(p2)> _p2_; \
+ std::decay_t<decltype(p3)> _p3_; \
+ std::decay_t<decltype(p4)> _p4_; \
+ std::decay_t<decltype(p5)> _p5_; \
+ std::decay_t<decltype(p6)> _p6_; \
+ std::decay_t<decltype(p7)> _p7_; \
+ std::decay_t<decltype(p8)> _p8_; \
+ std::decay_t<decltype(p9)> _p9_; \
+ std::decay_t<decltype(p10)> _p10_; \
+ std::decay_t<decltype(p11)> _p11_; \
+ std::decay_t<decltype(p12)> _p12_; \
+ std::decay_t<decltype(p13)> _p13_; \
+ std::decay_t<decltype(p14)> _p14_; \
+ std::decay_t<decltype(p15)> _p15_; \
+ std::decay_t<decltype(p16)> _p16_; \
+ std::decay_t<decltype(p17)> _p17_; \
+ std::decay_t<decltype(p18)> _p18_; \
+ std::decay_t<decltype(p19)> _p19_; \
+ void operator()(const hc::tiled_index<3>&) const [[hc]] { \
+ kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_, \
+ _p12_, _p13_, _p14_, _p15_, _p16_, _p17_, _p18_, _p19_); \
+ } \
+ }
+#define make_kernel_functor_hip_21(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
+ p9, p10, p11, p12, p13, p14, p15, p16, p17, p18) \
+ struct make_kernel_name_hip(function_name, 19) { \
+ std::decay_t<decltype(p0)> _p0_; \
+ std::decay_t<decltype(p1)> _p1_; \
+ std::decay_t<decltype(p2)> _p2_; \
+ std::decay_t<decltype(p3)> _p3_; \
+ std::decay_t<decltype(p4)> _p4_; \
+ std::decay_t<decltype(p5)> _p5_; \
+ std::decay_t<decltype(p6)> _p6_; \
+ std::decay_t<decltype(p7)> _p7_; \
+ std::decay_t<decltype(p8)> _p8_; \
+ std::decay_t<decltype(p9)> _p9_; \
+ std::decay_t<decltype(p10)> _p10_; \
+ std::decay_t<decltype(p11)> _p11_; \
+ std::decay_t<decltype(p12)> _p12_; \
+ std::decay_t<decltype(p13)> _p13_; \
+ std::decay_t<decltype(p14)> _p14_; \
+ std::decay_t<decltype(p15)> _p15_; \
+ std::decay_t<decltype(p16)> _p16_; \
+ std::decay_t<decltype(p17)> _p17_; \
+ std::decay_t<decltype(p18)> _p18_; \
+ void operator()(const hc::tiled_index<3>&) const [[hc]] { \
+ kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_, \
+ _p12_, _p13_, _p14_, _p15_, _p16_, _p17_, _p18_); \
+ } \
+ }
+#define make_kernel_functor_hip_20(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
+ p9, p10, p11, p12, p13, p14, p15, p16, p17) \
+ struct make_kernel_name_hip(function_name, 18) { \
+ std::decay_t<decltype(p0)> _p0_; \
+ std::decay_t<decltype(p1)> _p1_; \
+ std::decay_t<decltype(p2)> _p2_; \
+ std::decay_t<decltype(p3)> _p3_; \
+ std::decay_t<decltype(p4)> _p4_; \
+ std::decay_t<decltype(p5)> _p5_; \
+ std::decay_t<decltype(p6)> _p6_; \
+ std::decay_t<decltype(p7)> _p7_; \
+ std::decay_t<decltype(p8)> _p8_; \
+ std::decay_t<decltype(p9)> _p9_; \
+ std::decay_t<decltype(p10)> _p10_; \
+ std::decay_t<decltype(p11)> _p11_; \
+ std::decay_t<decltype(p12)> _p12_; \
+ std::decay_t<decltype(p13)> _p13_; \
+ std::decay_t<decltype(p14)> _p14_; \
+ std::decay_t<decltype(p15)> _p15_; \
+ std::decay_t<decltype(p16)> _p16_; \
+ std::decay_t<decltype(p17)> _p17_; \
+ void operator()(const hc::tiled_index<3>&) const [[hc]] { \
+ kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_, \
+ _p12_, _p13_, _p14_, _p15_, _p16_, _p17_); \
+ } \
+ }
+#define make_kernel_functor_hip_19(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
+ p9, p10, p11, p12, p13, p14, p15, p16) \
+ struct make_kernel_name_hip(function_name, 17) { \
+ std::decay_t<decltype(p0)> _p0_; \
+ std::decay_t<decltype(p1)> _p1_; \
+ std::decay_t<decltype(p2)> _p2_; \
+ std::decay_t<decltype(p3)> _p3_; \
+ std::decay_t<decltype(p4)> _p4_; \
+ std::decay_t<decltype(p5)> _p5_; \
+ std::decay_t<decltype(p6)> _p6_; \
+ std::decay_t<decltype(p7)> _p7_; \
+ std::decay_t<decltype(p8)> _p8_; \
+ std::decay_t<decltype(p9)> _p9_; \
+ std::decay_t<decltype(p10)> _p10_; \
+ std::decay_t<decltype(p11)> _p11_; \
+ std::decay_t<decltype(p12)> _p12_; \
+ std::decay_t<decltype(p13)> _p13_; \
+ std::decay_t<decltype(p14)> _p14_; \
+ std::decay_t<decltype(p15)> _p15_; \
+ std::decay_t<decltype(p16)> _p16_; \
+ void operator()(const hc::tiled_index<3>&) const [[hc]] { \
+ kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_, \
+ _p12_, _p13_, _p14_, _p15_, _p16_); \
+ } \
+ }
+#define make_kernel_functor_hip_18(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
+ p9, p10, p11, p12, p13, p14, p15) \
+ struct make_kernel_name_hip(function_name, 16) { \
+ std::decay_t<decltype(p0)> _p0_; \
+ std::decay_t<decltype(p1)> _p1_; \
+ std::decay_t<decltype(p2)> _p2_; \
+ std::decay_t<decltype(p3)> _p3_; \
+ std::decay_t<decltype(p4)> _p4_; \
+ std::decay_t<decltype(p5)> _p5_; \
+ std::decay_t<decltype(p6)> _p6_; \
+ std::decay_t<decltype(p7)> _p7_; \
+ std::decay_t<decltype(p8)> _p8_; \
+ std::decay_t<decltype(p9)> _p9_; \
+ std::decay_t<decltype(p10)> _p10_; \
+ std::decay_t<decltype(p11)> _p11_; \
+ std::decay_t<decltype(p12)> _p12_; \
+ std::decay_t<decltype(p13)> _p13_; \
+ std::decay_t<decltype(p14)> _p14_; \
+ std::decay_t<decltype(p15)> _p15_; \
+ void operator()(const hc::tiled_index<3>&) const [[hc]] { \
+ kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_, \
+ _p12_, _p13_, _p14_, _p15_); \
+ } \
+ }
+#define make_kernel_functor_hip_17(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
+ p9, p10, p11, p12, p13, p14) \
+ struct make_kernel_name_hip(function_name, 15) { \
+ std::decay_t<decltype(p0)> _p0_; \
+ std::decay_t<decltype(p1)> _p1_; \
+ std::decay_t<decltype(p2)> _p2_; \
+ std::decay_t<decltype(p3)> _p3_; \
+ std::decay_t<decltype(p4)> _p4_; \
+ std::decay_t<decltype(p5)> _p5_; \
+ std::decay_t<decltype(p6)> _p6_; \
+ std::decay_t<decltype(p7)> _p7_; \
+ std::decay_t<decltype(p8)> _p8_; \
+ std::decay_t<decltype(p9)> _p9_; \
+ std::decay_t<decltype(p10)> _p10_; \
+ std::decay_t<decltype(p11)> _p11_; \
+ std::decay_t<decltype(p12)> _p12_; \
+ std::decay_t<decltype(p13)> _p13_; \
+ std::decay_t<decltype(p14)> _p14_; \
+ void operator()(const hc::tiled_index<3>&) const [[hc]] { \
+ kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_, \
+ _p12_, _p13_, _p14_); \
+ } \
+ }
+#define make_kernel_functor_hip_16(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
+ p9, p10, p11, p12, p13) \
+ struct make_kernel_name_hip(function_name, 14) { \
+ std::decay_t<decltype(p0)> _p0_; \
+ std::decay_t<decltype(p1)> _p1_; \
+ std::decay_t<decltype(p2)> _p2_; \
+ std::decay_t<decltype(p3)> _p3_; \
+ std::decay_t<decltype(p4)> _p4_; \
+ std::decay_t<decltype(p5)> _p5_; \
+ std::decay_t<decltype(p6)> _p6_; \
+ std::decay_t<decltype(p7)> _p7_; \
+ std::decay_t<decltype(p8)> _p8_; \
+ std::decay_t<decltype(p9)> _p9_; \
+ std::decay_t<decltype(p10)> _p10_; \
+ std::decay_t<decltype(p11)> _p11_; \
+ std::decay_t<decltype(p12)> _p12_; \
+ std::decay_t<decltype(p13)> _p13_; \
+ void operator()(const hc::tiled_index<3>&) const [[hc]] { \
+ kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_, \
+ _p12_, _p13_); \
+ } \
+ }
+#define make_kernel_functor_hip_15(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
+ p9, p10, p11, p12) \
+ struct make_kernel_name_hip(function_name, 13) { \
+ std::decay_t<decltype(p0)> _p0_; \
+ std::decay_t<decltype(p1)> _p1_; \
+ std::decay_t<decltype(p2)> _p2_; \
+ std::decay_t<decltype(p3)> _p3_; \
+ std::decay_t<decltype(p4)> _p4_; \
+ std::decay_t<decltype(p5)> _p5_; \
+ std::decay_t<decltype(p6)> _p6_; \
+ std::decay_t<decltype(p7)> _p7_; \
+ std::decay_t<decltype(p8)> _p8_; \
+ std::decay_t<decltype(p9)> _p9_; \
+ std::decay_t<decltype(p10)> _p10_; \
+ std::decay_t<decltype(p11)> _p11_; \
+ std::decay_t<decltype(p12)> _p12_; \
+ void operator()(const hc::tiled_index<3>&) const [[hc]] { \
+ kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_, \
+ _p12_); \
+ } \
+ }
+#define make_kernel_functor_hip_14(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
+ p9, p10, p11) \
+ struct make_kernel_name_hip(function_name, 12) { \
+ std::decay_t<decltype(p0)> _p0_; \
+ std::decay_t<decltype(p1)> _p1_; \
+ std::decay_t<decltype(p2)> _p2_; \
+ std::decay_t<decltype(p3)> _p3_; \
+ std::decay_t<decltype(p4)> _p4_; \
+ std::decay_t<decltype(p5)> _p5_; \
+ std::decay_t<decltype(p6)> _p6_; \
+ std::decay_t<decltype(p7)> _p7_; \
+ std::decay_t<decltype(p8)> _p8_; \
+ std::decay_t<decltype(p9)> _p9_; \
+ std::decay_t<decltype(p10)> _p10_; \
+ std::decay_t<decltype(p11)> _p11_; \
+ void operator()(const hc::tiled_index<3>&) const [[hc]] { \
+ kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_, _p11_); \
+ } \
+ }
+#define make_kernel_functor_hip_13(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
+ p9, p10) \
+ struct make_kernel_name_hip(function_name, 11) { \
+ std::decay_t<decltype(p0)> _p0_; \
+ std::decay_t<decltype(p1)> _p1_; \
+ std::decay_t<decltype(p2)> _p2_; \
+ std::decay_t<decltype(p3)> _p3_; \
+ std::decay_t<decltype(p4)> _p4_; \
+ std::decay_t<decltype(p5)> _p5_; \
+ std::decay_t<decltype(p6)> _p6_; \
+ std::decay_t<decltype(p7)> _p7_; \
+ std::decay_t<decltype(p8)> _p8_; \
+ std::decay_t<decltype(p9)> _p9_; \
+ std::decay_t<decltype(p10)> _p10_; \
+ void operator()(const hc::tiled_index<3>&) const [[hc]] { \
+ kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_, _p10_); \
+ } \
+ }
+#define make_kernel_functor_hip_12(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, \
+ p9) \
+ struct make_kernel_name_hip(function_name, 10) { \
+ std::decay_t<decltype(p0)> _p0_; \
+ std::decay_t<decltype(p1)> _p1_; \
+ std::decay_t<decltype(p2)> _p2_; \
+ std::decay_t<decltype(p3)> _p3_; \
+ std::decay_t<decltype(p4)> _p4_; \
+ std::decay_t<decltype(p5)> _p5_; \
+ std::decay_t<decltype(p6)> _p6_; \
+ std::decay_t<decltype(p7)> _p7_; \
+ std::decay_t<decltype(p8)> _p8_; \
+ std::decay_t<decltype(p9)> _p9_; \
+ void operator()(const hc::tiled_index<3>&) const \
+ [[hc]] { kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_); } \
+ }
+#define make_kernel_functor_hip_11(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8) \
+ struct make_kernel_name_hip(function_name, 9) { \
+ std::decay_t<decltype(p0)> _p0_; \
+ std::decay_t<decltype(p1)> _p1_; \
+ std::decay_t<decltype(p2)> _p2_; \
+ std::decay_t<decltype(p3)> _p3_; \
+ std::decay_t<decltype(p4)> _p4_; \
+ std::decay_t<decltype(p5)> _p5_; \
+ std::decay_t<decltype(p6)> _p6_; \
+ std::decay_t<decltype(p7)> _p7_; \
+ std::decay_t<decltype(p8)> _p8_; \
+ void operator()(const hc::tiled_index<3>&) const \
+ [[hc]] { kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_); } \
+ }
+#define make_kernel_functor_hip_10(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7) \
+ struct make_kernel_name_hip(function_name, 8) { \
+ std::decay_t<decltype(p0)> _p0_; \
+ std::decay_t<decltype(p1)> _p1_; \
+ std::decay_t<decltype(p2)> _p2_; \
+ std::decay_t<decltype(p3)> _p3_; \
+ std::decay_t<decltype(p4)> _p4_; \
+ std::decay_t<decltype(p5)> _p5_; \
+ std::decay_t<decltype(p6)> _p6_; \
+ std::decay_t<decltype(p7)> _p7_; \
+ void operator()(const hc::tiled_index<3>&) const \
+ [[hc]] { kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_); } \
+ }
+#define make_kernel_functor_hip_9(function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6) \
+ struct make_kernel_name_hip(function_name, 7) { \
+ std::decay_t<decltype(p0)> _p0_; \
+ std::decay_t<decltype(p1)> _p1_; \
+ std::decay_t<decltype(p2)> _p2_; \
+ std::decay_t<decltype(p3)> _p3_; \
+ std::decay_t<decltype(p4)> _p4_; \
+ std::decay_t<decltype(p5)> _p5_; \
+ std::decay_t<decltype(p6)> _p6_; \
+ void operator()(const hc::tiled_index<3>&) const \
+ [[hc]] { kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_); } \
+ }
+#define make_kernel_functor_hip_8(function_name, kernel_name, p0, p1, p2, p3, p4, p5) \
+ struct make_kernel_name_hip(function_name, 6) { \
+ std::decay_t<decltype(p0)> _p0_; \
+ std::decay_t<decltype(p1)> _p1_; \
+ std::decay_t<decltype(p2)> _p2_; \
+ std::decay_t<decltype(p3)> _p3_; \
+ std::decay_t<decltype(p4)> _p4_; \
+ std::decay_t<decltype(p5)> _p5_; \
+ void operator()(const hc::tiled_index<3>&) const \
+ [[hc]] { kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_); } \
+ }
+#define make_kernel_functor_hip_7(function_name, kernel_name, p0, p1, p2, p3, p4) \
+ struct make_kernel_name_hip(function_name, 5) { \
+ std::decay_t<decltype(p0)> _p0_; \
+ std::decay_t<decltype(p1)> _p1_; \
+ std::decay_t<decltype(p2)> _p2_; \
+ std::decay_t<decltype(p3)> _p3_; \
+ std::decay_t<decltype(p4)> _p4_; \
+ void operator()(const hc::tiled_index<3>&) const \
+ [[hc]] { kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_); } \
+ }
+#define make_kernel_functor_hip_6(function_name, kernel_name, p0, p1, p2, p3) \
+ struct make_kernel_name_hip(function_name, 4) { \
+ std::decay_t<decltype(p0)> _p0_; \
+ std::decay_t<decltype(p1)> _p1_; \
+ std::decay_t<decltype(p2)> _p2_; \
+ std::decay_t<decltype(p3)> _p3_; \
+ void operator()(const hc::tiled_index<3>&) const \
+ [[hc]] { kernel_name(_p0_, _p1_, _p2_, _p3_); } \
+ }
+#define make_kernel_functor_hip_5(function_name, kernel_name, p0, p1, p2) \
+ struct make_kernel_name_hip(function_name, 3) { \
+ std::decay_t<decltype(p0)> _p0_; \
+ std::decay_t<decltype(p1)> _p1_; \
+ std::decay_t<decltype(p2)> _p2_; \
+ void operator()(const hc::tiled_index<3>&) const [[hc]] { kernel_name(_p0_, _p1_, _p2_); } \
+ }
+#define make_kernel_functor_hip_4(function_name, kernel_name, p0, p1) \
+ struct make_kernel_name_hip(function_name, 2) { \
+ std::decay_t<decltype(p0)> _p0_; \
+ std::decay_t<decltype(p1)> _p1_; \
+ void operator()(const hc::tiled_index<3>&) const [[hc]] { kernel_name(_p0_, _p1_); } \
+ }
+#define fofo(f, n) kernel_prefix_hip##f##kernel_suffix_hip##n /* NOTE(review): not referenced by any macro visible in this part of the header - possibly a leftover; confirm before removing */
+#define make_kernel_functor_hip_3(function_name, kernel_name, p0) \
+ struct make_kernel_name_hip(function_name, 1) { \
+ std::decay_t<decltype(p0)> _p0_; \
+ void operator()(const hc::tiled_index<3>&) const [[hc]] { kernel_name(_p0_); } \
+ }
+#define make_kernel_functor_hip_2(function_name, kernel_name) \
+ struct make_kernel_name_hip(function_name, 0) { /* no kernel arguments captured: the kernel is called with just hipLaunchParm{} */ \
+ void operator()(const hc::tiled_index<3>&) const [[hc]] { kernel_name(hipLaunchParm{}); } /* const, like every other arity's functor */ \
+ }
+#define make_kernel_functor_hip_1(...) /* expands to nothing */
+#define make_kernel_functor_hip_0(...) /* expands to nothing */
+#define make_kernel_functor_hip_(...) overload_macro_hip_(make_kernel_functor_hip_, __VA_ARGS__) /* overload_macro_hip_ is defined outside this view; presumably it appends the argument count to pick make_kernel_functor_hip_<N> - confirm */
+
+
+#define hipLaunchNamedKernelGGL(function_name, kernel_name, num_blocks, dim_blocks, \
+ group_mem_bytes, stream, ...) \
+ do { /* declare a block-local functor struct and brace-initialise one instance from the kernel arguments */ \
+ make_kernel_functor_hip_(function_name, kernel_name, __VA_ARGS__) \
+ hip_kernel_functor_impl_{__VA_ARGS__}; \
+ hip_impl::grid_launch_hip_(num_blocks, dim_blocks, group_mem_bytes, stream, #kernel_name, /* #kernel_name passes the kernel's name as a string literal */ \
+ hip_kernel_functor_impl_); \
+ } while (0)
+
+#define hipLaunchKernelGGL(kernel_name, num_blocks, dim_blocks, group_mem_bytes, stream, ...) \
+ do { /* forwards to hipLaunchNamedKernelGGL using the fixed name tag `unnamed`; ##__VA_ARGS__ drops the comma when no kernel arguments are given */ \
+ hipLaunchNamedKernelGGL(unnamed, kernel_name, num_blocks, dim_blocks, group_mem_bytes, \
+ stream, ##__VA_ARGS__); \
+ } while (0)
+
+#define hipLaunchKernel(kernel_name, num_blocks, dim_blocks, group_mem_bytes, stream, ...) \
+ do { /* forwards to hipLaunchKernelGGL with a default-constructed hipLaunchParm{} inserted as the first kernel argument */ \
+ hipLaunchKernelGGL(kernel_name, num_blocks, dim_blocks, group_mem_bytes, stream, \
+ hipLaunchParm{}, ##__VA_ARGS__); \
+ } while (0)
+} // namespace hip_impl
diff --git a/third_party/rocm/include/hip/hcc_detail/math_functions.h b/third_party/rocm/include/hip/hcc_detail/math_functions.h
new file mode 100644
index 0000000..3dbc9a2
--- /dev/null
+++ b/third_party/rocm/include/hip/hcc_detail/math_functions.h
@@ -0,0 +1,1557 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#include "hip_fp16_math_fwd.h"
+#include "hip_vector_types.h"
+#include "math_fwd.h"
+
+#include <hip/hcc_detail/host_defines.h>
+
+#include <algorithm>
+
+// assert.h is only for the host version of assert.
+// The device version of assert is implemented in hip/hcc_detail/hip_runtime.h.
+// Users should include hip_runtime.h for the device version of assert.
+#if !__HIP_DEVICE_COMPILE__
+#include <assert.h>
+#endif
+
+#include <limits.h>
+#include <limits>
+#include <stdint.h>
+
+// HCC's own math functions should be included first, otherwise there will
+// be conflicts when hip/math_functions.h is included before hip/hip_runtime.h.
+#ifdef __HCC__
+#include "kalmar_math.h"
+#endif
+
+#if _LIBCPP_VERSION && __HIP__ // only when both macros are non-zero
+namespace std {
+template <>
+struct __numeric_type<_Float16> // explicit specialisation for _Float16; presumably extends a libc++-internal trait - confirm against the libc++ version in use
+{
+ static _Float16 __test(_Float16);
+ // Members exposed by this specialisation: the type itself and value == true.
+ typedef _Float16 type;
+ static const bool value = true;
+};
+} // namespace std
+#endif // _LIBCPP_VERSION && __HIP__
+
+#pragma push_macro("__DEVICE__") // save any existing definition before redefining it below
+#pragma push_macro("__RETURN_TYPE") // presumably restored by a matching pop_macro later in the file - not visible here
+// __DEVICE__ prefixes the math functions that follow; __RETURN_TYPE is the result type of isfinite/isinf/isnan.
+#ifdef __HCC__
+#define __DEVICE__ __device__
+#define __RETURN_TYPE int
+#else // to be consistent with __clang_cuda_math_forward_declares
+#define __DEVICE__ static __device__
+#define __RETURN_TYPE bool
+#endif // __HCC__
+
+#if !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
+__DEVICE__
+inline
+uint64_t __make_mantissa_base8(const char* tagp)
+{ // Parses tagp as unsigned octal digits; returns 0 for null, empty, or any non-octal character.
+ uint64_t r = 0;
+ while (tagp && *tagp != '\0') {
+ char tmp = *tagp;
+ // A character outside '0'..'7' rejects the whole string.
+ if (tmp >= '0' && tmp <= '7') r = (r * 8u) + tmp - '0';
+ else return 0;
+
+ ++tagp;
+ }
+ // Reached the NUL terminator: every character was a valid digit.
+ return r;
+}
+
+__DEVICE__
+inline
+uint64_t __make_mantissa_base10(const char* tagp)
+{ // Parses tagp as unsigned decimal digits; returns 0 for null, empty, or any non-digit character.
+ uint64_t r = 0;
+ while (tagp && *tagp != '\0') {
+ char tmp = *tagp;
+ // A character outside '0'..'9' rejects the whole string.
+ if (tmp >= '0' && tmp <= '9') r = (r * 10u) + tmp - '0';
+ else return 0;
+
+ ++tagp;
+ }
+ // Reached the NUL terminator: every character was a valid digit.
+ return r;
+}
+
+__DEVICE__
+inline
+uint64_t __make_mantissa_base16(const char* tagp)
+{ // Parses tagp as unsigned hex digits (either case, no prefix); returns 0 for null, empty, or any other character.
+ uint64_t r = 0;
+ while (tagp && *tagp != '\0') {
+ char tmp = *tagp;
+ // A character that is not 0-9, a-f or A-F rejects the whole string.
+ if (tmp >= '0' && tmp <= '9') r = (r * 16u) + tmp - '0';
+ else if (tmp >= 'a' && tmp <= 'f') r = (r * 16u) + tmp - 'a' + 10;
+ else if (tmp >= 'A' && tmp <= 'F') r = (r * 16u) + tmp - 'A' + 10;
+ else return 0;
+
+ ++tagp;
+ }
+ // Reached the NUL terminator: every character was a valid digit.
+ return r;
+}
+
+__DEVICE__
+inline
+uint64_t __make_mantissa(const char* tagp)
+{ // Dispatches on the prefix: "0x"/"0X" -> base 16, leading '0' -> base 8, otherwise base 10.
+ if (!tagp) return 0u;
+
+ if (*tagp == '0') {
+ ++tagp;
+ // Step past the 'x'/'X' so the hex parser starts on the first digit, not on the prefix.
+ if (*tagp == 'x' || *tagp == 'X') return __make_mantissa_base16(tagp + 1);
+ else return __make_mantissa_base8(tagp);
+ }
+
+ return __make_mantissa_base10(tagp);
+}
+#endif // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
+
+// DOT FUNCTIONS: each amd_mixed_dot overload forwards (a, b, c, saturate) to one __ockl_{s,u}dot{2,4,8} call.
+#if (__hcc_workweek__ >= 19015) || __HIP_CLANG_ONLY__
+__DEVICE__
+inline
+int amd_mixed_dot(short2 a, short2 b, int c, bool saturate) { // short2 -> __ockl_sdot2
+ return __ockl_sdot2(a.data, b.data, c, saturate);
+}
+__DEVICE__
+inline
+uint amd_mixed_dot(ushort2 a, ushort2 b, uint c, bool saturate) { // ushort2 -> __ockl_udot2
+ return __ockl_udot2(a.data, b.data, c, saturate);
+}
+__DEVICE__
+inline
+int amd_mixed_dot(char4 a, char4 b, int c, bool saturate) { // char4 -> __ockl_sdot4
+ return __ockl_sdot4(a.data, b.data, c, saturate);
+}
+__DEVICE__
+inline
+uint amd_mixed_dot(uchar4 a, uchar4 b, uint c, bool saturate) { // uchar4 -> __ockl_udot4
+ return __ockl_udot4(a.data, b.data, c, saturate);
+}
+__DEVICE__
+inline
+int amd_mixed_dot(int a, int b, int c, bool saturate) { // plain int -> __ockl_sdot8; presumably eight packed 4-bit lanes - confirm
+ return __ockl_sdot8(a, b, c, saturate);
+}
+__DEVICE__
+inline
+uint amd_mixed_dot(uint a, uint b, uint c, bool saturate) { // plain uint -> __ockl_udot8; presumably eight packed 4-bit lanes - confirm
+ return __ockl_udot8(a, b, c, saturate);
+}
+#endif // (__hcc_workweek__ >= 19015) || __HIP_CLANG_ONLY__
+
+#if !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
+// BEGIN FLOAT
+__DEVICE__
+inline
+float abs(float x) { return __ocml_fabs_f32(x); }
+__DEVICE__
+inline
+float acosf(float x) { return __ocml_acos_f32(x); }
+__DEVICE__
+inline
+float acoshf(float x) { return __ocml_acosh_f32(x); }
+__DEVICE__
+inline
+float asinf(float x) { return __ocml_asin_f32(x); }
+__DEVICE__
+inline
+float asinhf(float x) { return __ocml_asinh_f32(x); }
+__DEVICE__
+inline
+float atan2f(float x, float y) { return __ocml_atan2_f32(x, y); } // NOTE(review): arguments are forwarded positionally; the C library names them (y, x), so `x` here is presumably the ordinate - confirm
+__DEVICE__
+inline
+float atanf(float x) { return __ocml_atan_f32(x); }
+__DEVICE__
+inline
+float atanhf(float x) { return __ocml_atanh_f32(x); }
+__DEVICE__
+inline
+float cbrtf(float x) { return __ocml_cbrt_f32(x); }
+__DEVICE__
+inline
+float ceilf(float x) { return __ocml_ceil_f32(x); }
+__DEVICE__
+inline
+float copysignf(float x, float y) { return __ocml_copysign_f32(x, y); }
+__DEVICE__
+inline
+float cosf(float x) { return __ocml_cos_f32(x); }
+__DEVICE__
+inline
+float coshf(float x) { return __ocml_cosh_f32(x); }
+__DEVICE__
+inline
+float cospif(float x) { return __ocml_cospi_f32(x); }
+__DEVICE__
+inline
+float cyl_bessel_i0f(float x) { return __ocml_i0_f32(x); }
+__DEVICE__
+inline
+float cyl_bessel_i1f(float x) { return __ocml_i1_f32(x); }
+__DEVICE__
+inline
+float erfcf(float x) { return __ocml_erfc_f32(x); }
+__DEVICE__
+inline
+float erfcinvf(float x) { return __ocml_erfcinv_f32(x); }
+__DEVICE__
+inline
+float erfcxf(float x) { return __ocml_erfcx_f32(x); }
+__DEVICE__
+inline
+float erff(float x) { return __ocml_erf_f32(x); }
+__DEVICE__
+inline
+float erfinvf(float x) { return __ocml_erfinv_f32(x); }
+__DEVICE__
+inline
+float exp10f(float x) { return __ocml_exp10_f32(x); }
+__DEVICE__
+inline
+float exp2f(float x) { return __ocml_exp2_f32(x); }
+__DEVICE__
+inline
+float expf(float x) { return __ocml_exp_f32(x); }
+__DEVICE__
+inline
+float expm1f(float x) { return __ocml_expm1_f32(x); }
+__DEVICE__
+inline
+float fabsf(float x) { return __ocml_fabs_f32(x); }
+__DEVICE__
+inline
+float fdimf(float x, float y) { return __ocml_fdim_f32(x, y); }
+__DEVICE__
+inline
+float fdividef(float x, float y) { return x / y; }
+__DEVICE__
+inline
+float floorf(float x) { return __ocml_floor_f32(x); }
+__DEVICE__
+inline
+float fmaf(float x, float y, float z) { return __ocml_fma_f32(x, y, z); }
+__DEVICE__
+inline
+float fmaxf(float x, float y) { return __ocml_fmax_f32(x, y); }
+__DEVICE__
+inline
+float fminf(float x, float y) { return __ocml_fmin_f32(x, y); }
+__DEVICE__
+inline
+float fmodf(float x, float y) { return __ocml_fmod_f32(x, y); }
+__DEVICE__
+inline
+float frexpf(float x, int* nptr)
+{
+ int tmp;
+ float r =
+ __ocml_frexp_f32(x, (__attribute__((address_space(5))) int*) &tmp);
+ *nptr = tmp;
+
+ return r;
+}
+__DEVICE__
+inline
+float hypotf(float x, float y) { return __ocml_hypot_f32(x, y); }
+__DEVICE__
+inline
+int ilogbf(float x) { return __ocml_ilogb_f32(x); }
+__DEVICE__
+inline
+__RETURN_TYPE isfinite(float x) { return __ocml_isfinite_f32(x); }
+__DEVICE__
+inline
+__RETURN_TYPE isinf(float x) { return __ocml_isinf_f32(x); }
+__DEVICE__
+inline
+__RETURN_TYPE isnan(float x) { return __ocml_isnan_f32(x); }
+__DEVICE__
+inline
+float j0f(float x) { return __ocml_j0_f32(x); }
+__DEVICE__
+inline
+float j1f(float x) { return __ocml_j1_f32(x); }
+__DEVICE__
+inline
+float jnf(int n, float x)
+{ // Forward recurrence x2 = (2 * i) / x * x1 - x0 seeded with j0f(x), j1f(x);
+ // n < 0 is not special-cased: the loop never runs and j1f(x) is returned.
+ // TODO: Ahmes multiplication + Miller & Brown could give O(log n) steps.
+ if (n == 0) return j0f(x);
+ if (n == 1) return j1f(x);
+
+ float x0 = j0f(x);
+ float x1 = j1f(x);
+ for (int i = 1; i < n; ++i) {
+ float x2 = (2 * i) / x * x1 - x0;
+ x0 = x1;
+ x1 = x2;
+ }
+
+ return x1;
+}
+__DEVICE__
+inline
+float ldexpf(float x, int e) { return __ocml_ldexp_f32(x, e); }
+__DEVICE__
+inline
+float lgammaf(float x) { return __ocml_lgamma_f32(x); }
+__DEVICE__
+inline
+long long int llrintf(float x) { return __ocml_rint_f32(x); }
+__DEVICE__
+inline
+long long int llroundf(float x) { return __ocml_round_f32(x); }
+__DEVICE__
+inline
+float log10f(float x) { return __ocml_log10_f32(x); }
+__DEVICE__
+inline
+float log1pf(float x) { return __ocml_log1p_f32(x); }
+__DEVICE__
+inline
+float log2f(float x) { return __ocml_log2_f32(x); }
+__DEVICE__
+inline
+float logbf(float x) { return __ocml_logb_f32(x); }
+__DEVICE__
+inline
+float logf(float x) { return __ocml_log_f32(x); }
+__DEVICE__
+inline
+long int lrintf(float x) { return __ocml_rint_f32(x); }
+__DEVICE__
+inline
+long int lroundf(float x) { return __ocml_round_f32(x); }
+__DEVICE__
+inline
+float modff(float x, float* iptr)
+{
+ float tmp;
+ float r =
+ __ocml_modf_f32(x, (__attribute__((address_space(5))) float*) &tmp);
+ *iptr = tmp;
+
+ return r;
+}
+__DEVICE__
+inline
+float nanf(const char* tagp)
+{
+ union {
+ float val;
+ struct ieee_float {
+ uint32_t mantissa : 22;
+ uint32_t quiet : 1;
+ uint32_t exponent : 8;
+ uint32_t sign : 1;
+ } bits;
+
+ static_assert(sizeof(float) == sizeof(ieee_float), "");
+ } tmp;
+
+ tmp.bits.sign = 0u;
+ tmp.bits.exponent = ~0u;
+ tmp.bits.quiet = 1u;
+ tmp.bits.mantissa = __make_mantissa(tagp);
+
+ return tmp.val;
+}
+__DEVICE__
+inline
+float nearbyintf(float x) { return __ocml_nearbyint_f32(x); }
+__DEVICE__
+inline
+float nextafterf(float x, float y) { return __ocml_nextafter_f32(x, y); }
+__DEVICE__
+inline
+float norm3df(float x, float y, float z) { return __ocml_len3_f32(x, y, z); }
+__DEVICE__
+inline
+float norm4df(float x, float y, float z, float w)
+{
+ return __ocml_len4_f32(x, y, z, w);
+}
+__DEVICE__
+inline
+float normcdff(float x) { return __ocml_ncdf_f32(x); }
+__DEVICE__
+inline
+float normcdfinvf(float x) { return __ocml_ncdfinv_f32(x); }
+__DEVICE__
+inline
+float normf(int dim, const float* a)
+{   // sqrt of the sum of squares of a[0..dim). TODO: placeholder until OCML adds support.
+    float r = 0;
+    // Bounded loop: dim <= 0 sums nothing instead of walking past the array.
+    for (int i = 0; i < dim; ++i) { r += a[i] * a[i]; }
+    return __ocml_sqrt_f32(r);
+}
+__DEVICE__
+inline
+float powf(float x, float y) { return __ocml_pow_f32(x, y); }
+__DEVICE__
+inline
+float powif(float base, int iexp) { return __ocml_pown_f32(base, iexp); }
+__DEVICE__
+inline
+float rcbrtf(float x) { return __ocml_rcbrt_f32(x); }
+__DEVICE__
+inline
+float remainderf(float x, float y) { return __ocml_remainder_f32(x, y); }
+__DEVICE__
+inline
+float remquof(float x, float y, int* quo)
+{
+ int tmp;
+ float r =
+ __ocml_remquo_f32(x, y, (__attribute__((address_space(5))) int*) &tmp);
+ *quo = tmp;
+
+ return r;
+}
+__DEVICE__
+inline
+float rhypotf(float x, float y) { return __ocml_rhypot_f32(x, y); }
+__DEVICE__
+inline
+float rintf(float x) { return __ocml_rint_f32(x); }
+__DEVICE__
+inline
+float rnorm3df(float x, float y, float z)
+{
+ return __ocml_rlen3_f32(x, y, z);
+}
+
+__DEVICE__
+inline
+float rnorm4df(float x, float y, float z, float w)
+{
+ return __ocml_rlen4_f32(x, y, z, w);
+}
+__DEVICE__
+inline
+float rnormf(int dim, const float* a)
+{   // Sum of squares of a[0..dim) passed to __ocml_rsqrt_f32. TODO: placeholder until OCML adds support.
+    float r = 0;
+    // Bounded loop: dim <= 0 sums nothing instead of walking past the array.
+    for (int i = 0; i < dim; ++i) { r += a[i] * a[i]; }
+    return __ocml_rsqrt_f32(r);
+}
+__DEVICE__
+inline
+float roundf(float x) { return __ocml_round_f32(x); }
+__DEVICE__
+inline
+float rsqrtf(float x) { return __ocml_rsqrt_f32(x); }
+__DEVICE__
+inline
+float scalblnf(float x, long int n)
+{
+ return (n < INT_MAX) ? __ocml_scalbn_f32(x, n) : __ocml_scalb_f32(x, n);
+}
+__DEVICE__
+inline
+float scalbnf(float x, int n) { return __ocml_scalbn_f32(x, n); }
+__DEVICE__
+inline
+__RETURN_TYPE signbit(float x) { return __ocml_signbit_f32(x); }
+__DEVICE__
+inline
+void sincosf(float x, float* sptr, float* cptr)
+{
+ float tmp;
+
+ *sptr =
+ __ocml_sincos_f32(x, (__attribute__((address_space(5))) float*) &tmp);
+ *cptr = tmp;
+}
+__DEVICE__
+inline
+void sincospif(float x, float* sptr, float* cptr)
+{
+ float tmp;
+
+ *sptr =
+ __ocml_sincospi_f32(x, (__attribute__((address_space(5))) float*) &tmp);
+ *cptr = tmp;
+}
+__DEVICE__
+inline
+float sinf(float x) { return __ocml_sin_f32(x); }
+__DEVICE__
+inline
+float sinhf(float x) { return __ocml_sinh_f32(x); }
+__DEVICE__
+inline
+float sinpif(float x) { return __ocml_sinpi_f32(x); }
+__DEVICE__
+inline
+float sqrtf(float x) { return __ocml_sqrt_f32(x); }
+__DEVICE__
+inline
+float tanf(float x) { return __ocml_tan_f32(x); }
+__DEVICE__
+inline
+float tanhf(float x) { return __ocml_tanh_f32(x); }
+__DEVICE__
+inline
+float tgammaf(float x) { return __ocml_tgamma_f32(x); }
+__DEVICE__
+inline
+float truncf(float x) { return __ocml_trunc_f32(x); }
+__DEVICE__
+inline
+float y0f(float x) { return __ocml_y0_f32(x); }
+__DEVICE__
+inline
+float y1f(float x) { return __ocml_y1_f32(x); }
+__DEVICE__
+inline
+float ynf(int n, float x)
+{ // Forward recurrence x2 = (2 * i) / x * x1 - x0 seeded with y0f(x), y1f(x);
+ // n < 0 is not special-cased: the loop never runs and y1f(x) is returned.
+ // TODO: Ahmes multiplication + Miller & Brown could give O(log n) steps, but
+ // it's unclear if it'd help here. Placeholder until OCML adds support.
+ if (n == 0) return y0f(x);
+ if (n == 1) return y1f(x);
+
+ float x0 = y0f(x);
+ float x1 = y1f(x);
+ for (int i = 1; i < n; ++i) {
+ float x2 = (2 * i) / x * x1 - x0;
+ x0 = x1;
+ x1 = x2;
+ }
+
+ return x1;
+}
+
+// BEGIN INTRINSICS
+__DEVICE__
+inline
+float __cosf(float x) { return __ocml_native_cos_f32(x); }
+__DEVICE__
+inline
+float __exp10f(float x) { return __ocml_native_exp10_f32(x); }
+__DEVICE__
+inline
+float __expf(float x) { return __ocml_native_exp_f32(x); }
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline
+float __fadd_rd(float x, float y) { return __ocml_add_rtn_f32(x, y); }
+#endif
+__DEVICE__
+inline
+float __fadd_rn(float x, float y) { return x + y; }
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline
+float __fadd_ru(float x, float y) { return __ocml_add_rtp_f32(x, y); }
+__DEVICE__
+inline
+float __fadd_rz(float x, float y) { return __ocml_add_rtz_f32(x, y); }
+__DEVICE__
+inline
+float __fdiv_rd(float x, float y) { return __ocml_div_rtn_f32(x, y); }
+#endif
+__DEVICE__
+inline
+float __fdiv_rn(float x, float y) { return x / y; }
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline
+float __fdiv_ru(float x, float y) { return __ocml_div_rtp_f32(x, y); }
+__DEVICE__
+inline
+float __fdiv_rz(float x, float y) { return __ocml_div_rtz_f32(x, y); }
+#endif
+__DEVICE__
+inline
+float __fdividef(float x, float y) { return x / y; }
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline
+float __fmaf_rd(float x, float y, float z)
+{
+ return __ocml_fma_rtn_f32(x, y, z);
+}
+#endif
+__DEVICE__
+inline
+float __fmaf_rn(float x, float y, float z)
+{
+ return __ocml_fma_f32(x, y, z);
+}
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline
+float __fmaf_ru(float x, float y, float z)
+{
+ return __ocml_fma_rtp_f32(x, y, z);
+}
+__DEVICE__
+inline
+float __fmaf_rz(float x, float y, float z)
+{
+ return __ocml_fma_rtz_f32(x, y, z);
+}
+__DEVICE__
+inline
+float __fmul_rd(float x, float y) { return __ocml_mul_rtn_f32(x, y); }
+#endif
+__DEVICE__
+inline
+float __fmul_rn(float x, float y) { return x * y; }
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline
+float __fmul_ru(float x, float y) { return __ocml_mul_rtp_f32(x, y); }
+__DEVICE__
+inline
+float __fmul_rz(float x, float y) { return __ocml_mul_rtz_f32(x, y); }
+__DEVICE__
+inline
+float __frcp_rd(float x) { return __llvm_amdgcn_rcp_f32(x); }
+#endif
+__DEVICE__
+inline
+float __frcp_rn(float x) { return __llvm_amdgcn_rcp_f32(x); }
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline
+float __frcp_ru(float x) { return __llvm_amdgcn_rcp_f32(x); }
+__DEVICE__
+inline
+float __frcp_rz(float x) { return __llvm_amdgcn_rcp_f32(x); }
+#endif
+__DEVICE__
+inline
+float __frsqrt_rn(float x) { return __llvm_amdgcn_rsq_f32(x); }
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline
+float __fsqrt_rd(float x) { return __ocml_sqrt_rtn_f32(x); }
+#endif
+__DEVICE__
+inline
+float __fsqrt_rn(float x) { return __ocml_native_sqrt_f32(x); }
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline
+float __fsqrt_ru(float x) { return __ocml_sqrt_rtp_f32(x); }
+__DEVICE__
+inline
+float __fsqrt_rz(float x) { return __ocml_sqrt_rtz_f32(x); }
+__DEVICE__
+inline
+float __fsub_rd(float x, float y) { return __ocml_sub_rtn_f32(x, y); }
+#endif
+__DEVICE__
+inline
+float __fsub_rn(float x, float y) { return x - y; }
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline
+float __fsub_ru(float x, float y) { return __ocml_sub_rtp_f32(x, y); }
+__DEVICE__
+inline
+float __fsub_rz(float x, float y) { return __ocml_sub_rtz_f32(x, y); }
+#endif
+__DEVICE__
+inline
+float __log10f(float x) { return __ocml_native_log10_f32(x); }
+__DEVICE__
+inline
+float __log2f(float x) { return __ocml_native_log2_f32(x); }
+__DEVICE__
+inline
+float __logf(float x) { return __ocml_native_log_f32(x); }
+__DEVICE__
+inline
+float __powf(float x, float y) { return __ocml_pow_f32(x, y); }
+__DEVICE__
+inline
+float __saturatef(float x) { return (x < 0) ? 0 : ((x > 1) ? 1 : x); }
+__DEVICE__
+inline
+void __sincosf(float x, float* sptr, float* cptr)
+{
+ *sptr = __ocml_native_sin_f32(x);
+ *cptr = __ocml_native_cos_f32(x);
+}
+__DEVICE__
+inline
+float __sinf(float x) { return __ocml_native_sin_f32(x); }
+__DEVICE__
+inline
+float __tanf(float x) { return __ocml_tan_f32(x); }
+// END INTRINSICS
+// END FLOAT
+
+// BEGIN DOUBLE
+__DEVICE__
+inline
+double abs(double x) { return __ocml_fabs_f64(x); }
+__DEVICE__
+inline
+double acos(double x) { return __ocml_acos_f64(x); }
+__DEVICE__
+inline
+double acosh(double x) { return __ocml_acosh_f64(x); }
+__DEVICE__
+inline
+double asin(double x) { return __ocml_asin_f64(x); }
+__DEVICE__
+inline
+double asinh(double x) { return __ocml_asinh_f64(x); }
+__DEVICE__
+inline
+double atan(double x) { return __ocml_atan_f64(x); }
+__DEVICE__
+inline
+double atan2(double x, double y) { return __ocml_atan2_f64(x, y); }
+__DEVICE__
+inline
+double atanh(double x) { return __ocml_atanh_f64(x); }
+__DEVICE__
+inline
+double cbrt(double x) { return __ocml_cbrt_f64(x); }
+__DEVICE__
+inline
+double ceil(double x) { return __ocml_ceil_f64(x); }
+__DEVICE__
+inline
+double copysign(double x, double y) { return __ocml_copysign_f64(x, y); }
+__DEVICE__
+inline
+double cos(double x) { return __ocml_cos_f64(x); }
+__DEVICE__
+inline
+double cosh(double x) { return __ocml_cosh_f64(x); }
+__DEVICE__
+inline
+double cospi(double x) { return __ocml_cospi_f64(x); }
+__DEVICE__
+inline
+double cyl_bessel_i0(double x) { return __ocml_i0_f64(x); }
+__DEVICE__
+inline
+double cyl_bessel_i1(double x) { return __ocml_i1_f64(x); }
+__DEVICE__
+inline
+double erf(double x) { return __ocml_erf_f64(x); }
+__DEVICE__
+inline
+double erfc(double x) { return __ocml_erfc_f64(x); }
+__DEVICE__
+inline
+double erfcinv(double x) { return __ocml_erfcinv_f64(x); }
+__DEVICE__
+inline
+double erfcx(double x) { return __ocml_erfcx_f64(x); }
+__DEVICE__
+inline
+double erfinv(double x) { return __ocml_erfinv_f64(x); }
+__DEVICE__
+inline
+double exp(double x) { return __ocml_exp_f64(x); }
+__DEVICE__
+inline
+double exp10(double x) { return __ocml_exp10_f64(x); }
+__DEVICE__
+inline
+double exp2(double x) { return __ocml_exp2_f64(x); }
+__DEVICE__
+inline
+double expm1(double x) { return __ocml_expm1_f64(x); }
+__DEVICE__
+inline
+double fabs(double x) { return __ocml_fabs_f64(x); }
+__DEVICE__
+inline
+double fdim(double x, double y) { return __ocml_fdim_f64(x, y); }
+__DEVICE__
+inline
+double floor(double x) { return __ocml_floor_f64(x); }
+__DEVICE__
+inline
+double fma(double x, double y, double z) { return __ocml_fma_f64(x, y, z); }
+__DEVICE__
+inline
+double fmax(double x, double y) { return __ocml_fmax_f64(x, y); }
+__DEVICE__
+inline
+double fmin(double x, double y) { return __ocml_fmin_f64(x, y); }
+__DEVICE__
+inline
+double fmod(double x, double y) { return __ocml_fmod_f64(x, y); }
+__DEVICE__
+inline
+double frexp(double x, int* nptr)
+{
+ int tmp;
+ double r =
+ __ocml_frexp_f64(x, (__attribute__((address_space(5))) int*) &tmp);
+ *nptr = tmp;
+
+ return r;
+}
+__DEVICE__
+inline
+double hypot(double x, double y) { return __ocml_hypot_f64(x, y); }
+__DEVICE__
+inline
+int ilogb(double x) { return __ocml_ilogb_f64(x); }
+__DEVICE__
+inline
+__RETURN_TYPE isfinite(double x) { return __ocml_isfinite_f64(x); }
+__DEVICE__
+inline
+__RETURN_TYPE isinf(double x) { return __ocml_isinf_f64(x); }
+__DEVICE__
+inline
+__RETURN_TYPE isnan(double x) { return __ocml_isnan_f64(x); }
+__DEVICE__
+inline
+double j0(double x) { return __ocml_j0_f64(x); }
+__DEVICE__
+inline
+double j1(double x) { return __ocml_j1_f64(x); }
+__DEVICE__
+inline
+double jn(int n, double x)
+{   // Forward recurrence x2 = (2 * i) / x * x1 - x0 seeded with j0(x), j1(x).
+    // n < 0 is not special-cased: the loop never runs and j1(x) is returned.
+    // TODO: Ahmes multiplication + Miller & Brown could give O(log n) steps,
+    //       but it's unclear if it'd help. Placeholder until OCML adds support.
+    if (n == 0) return j0(x);
+    if (n == 1) return j1(x);
+
+    double x0 = j0(x); // double-precision seeds; j0f/j1f would round to float
+    double x1 = j1(x);
+    for (int i = 1; i < n; ++i) {
+        double x2 = (2 * i) / x * x1 - x0;
+        x0 = x1;
+        x1 = x2;
+    }
+
+    return x1;
+}
+__DEVICE__
+inline
+double ldexp(double x, int e) { return __ocml_ldexp_f64(x, e); }
+__DEVICE__
+inline
+double lgamma(double x) { return __ocml_lgamma_f64(x); }
+__DEVICE__
+inline
+long long int llrint(double x) { return __ocml_rint_f64(x); }
+__DEVICE__
+inline
+long long int llround(double x) { return __ocml_round_f64(x); }
+__DEVICE__
+inline
+double log(double x) { return __ocml_log_f64(x); }
+__DEVICE__
+inline
+double log10(double x) { return __ocml_log10_f64(x); }
+__DEVICE__
+inline
+double log1p(double x) { return __ocml_log1p_f64(x); }
+__DEVICE__
+inline
+double log2(double x) { return __ocml_log2_f64(x); }
+__DEVICE__
+inline
+double logb(double x) { return __ocml_logb_f64(x); }
+__DEVICE__
+inline
+long int lrint(double x) { return __ocml_rint_f64(x); }
+__DEVICE__
+inline
+long int lround(double x) { return __ocml_round_f64(x); }
+__DEVICE__
+inline
+double modf(double x, double* iptr)
+{
+ double tmp;
+ double r =
+ __ocml_modf_f64(x, (__attribute__((address_space(5))) double*) &tmp);
+ *iptr = tmp;
+
+ return r;
+}
+__DEVICE__
+inline
+double nan(const char* tagp)
+{   // Builds a positive quiet NaN whose payload is __make_mantissa(tagp).
+#if !_WIN32
+    union {
+        double val;
+        struct ieee_double {
+            uint64_t mantissa : 51;
+            uint32_t quiet : 1;
+            uint32_t exponent : 11;
+            uint32_t sign : 1;
+        } bits;
+        static_assert(sizeof(double) == sizeof(ieee_double), "");
+    } tmp;
+
+    tmp.bits.sign = 0u;
+    tmp.bits.exponent = ~0u;
+    tmp.bits.quiet = 1u;
+    tmp.bits.mantissa = __make_mantissa(tagp); // truncated to 51 bits by the field
+
+    return tmp.val;
+#else
+    static_assert(sizeof(uint64_t) == sizeof(double), "");
+    uint64_t val = __make_mantissa(tagp) & ((1ull << 51) - 1); // 51-bit payload
+    val |= 0xFFFull << 51; // quiet bit 51 + exponent bits 52-62; needs a 64-bit shift
+    return *reinterpret_cast<double*>(&val);
+#endif
+}
+__DEVICE__
+inline
+double nearbyint(double x) { return __ocml_nearbyint_f64(x); }
+__DEVICE__
+inline
+double nextafter(double x, double y) { return __ocml_nextafter_f64(x, y); }
+__DEVICE__
+inline
+double norm(int dim, const double* a)
+{   // sqrt of the sum of squares of a[0..dim). TODO: placeholder until OCML adds support.
+    double r = 0;
+    // Bounded loop: dim <= 0 sums nothing instead of walking past the array.
+    for (int i = 0; i < dim; ++i) { r += a[i] * a[i]; }
+    return __ocml_sqrt_f64(r);
+}
+__DEVICE__
+inline
+double norm3d(double x, double y, double z)
+{
+ return __ocml_len3_f64(x, y, z);
+}
+__DEVICE__
+inline
+double norm4d(double x, double y, double z, double w)
+{
+ return __ocml_len4_f64(x, y, z, w);
+}
+__DEVICE__
+inline
+double normcdf(double x) { return __ocml_ncdf_f64(x); }
+__DEVICE__
+inline
+double normcdfinv(double x) { return __ocml_ncdfinv_f64(x); }
+__DEVICE__
+inline
+double pow(double x, double y) { return __ocml_pow_f64(x, y); }
+__DEVICE__
+inline
+double powi(double base, int iexp) { return __ocml_pown_f64(base, iexp); }
+__DEVICE__
+inline
+double rcbrt(double x) { return __ocml_rcbrt_f64(x); }
+__DEVICE__
+inline
+double remainder(double x, double y) { return __ocml_remainder_f64(x, y); }
+__DEVICE__
+inline
+double remquo(double x, double y, int* quo)
+{
+ int tmp;
+ double r =
+ __ocml_remquo_f64(x, y, (__attribute__((address_space(5))) int*) &tmp);
+ *quo = tmp;
+
+ return r;
+}
+__DEVICE__
+inline
+double rhypot(double x, double y) { return __ocml_rhypot_f64(x, y); }
+__DEVICE__
+inline
+double rint(double x) { return __ocml_rint_f64(x); }
+__DEVICE__
+inline
+double rnorm(int dim, const double* a)
+{   // Sum of squares of a[0..dim) passed to __ocml_rsqrt_f64. TODO: placeholder until OCML adds support.
+    double r = 0;
+    // Bounded loop: dim <= 0 sums nothing instead of walking past the array.
+    for (int i = 0; i < dim; ++i) { r += a[i] * a[i]; }
+    return __ocml_rsqrt_f64(r);
+}
+__DEVICE__
+inline
+double rnorm3d(double x, double y, double z)
+{
+ return __ocml_rlen3_f64(x, y, z);
+}
+__DEVICE__
+inline
+double rnorm4d(double x, double y, double z, double w)
+{
+ return __ocml_rlen4_f64(x, y, z, w);
+}
+__DEVICE__
+inline
+double round(double x) { return __ocml_round_f64(x); }
+__DEVICE__
+inline
+double rsqrt(double x) { return __ocml_rsqrt_f64(x); }
+__DEVICE__
+inline
+double scalbln(double x, long int n)
+{
+ return (n < INT_MAX) ? __ocml_scalbn_f64(x, n) : __ocml_scalb_f64(x, n);
+}
+__DEVICE__
+inline
+double scalbn(double x, int n) { return __ocml_scalbn_f64(x, n); }
+__DEVICE__
+inline
+__RETURN_TYPE signbit(double x) { return __ocml_signbit_f64(x); }
+__DEVICE__
+inline
+double sin(double x) { return __ocml_sin_f64(x); }
+__DEVICE__
+inline
+void sincos(double x, double* sptr, double* cptr)
+{
+ double tmp;
+ *sptr =
+ __ocml_sincos_f64(x, (__attribute__((address_space(5))) double*) &tmp);
+ *cptr = tmp;
+}
+__DEVICE__
+inline
+void sincospi(double x, double* sptr, double* cptr)
+{
+ double tmp;
+ *sptr = __ocml_sincospi_f64(
+ x, (__attribute__((address_space(5))) double*) &tmp);
+ *cptr = tmp;
+}
+__DEVICE__
+inline
+double sinh(double x) { return __ocml_sinh_f64(x); }
+__DEVICE__
+inline
+double sinpi(double x) { return __ocml_sinpi_f64(x); }
+__DEVICE__
+inline
+double sqrt(double x) { return __ocml_sqrt_f64(x); }
+__DEVICE__
+inline
+double tan(double x) { return __ocml_tan_f64(x); }
+__DEVICE__
+inline
+double tanh(double x) { return __ocml_tanh_f64(x); }
+__DEVICE__
+inline
+double tgamma(double x) { return __ocml_tgamma_f64(x); }
+__DEVICE__
+inline
+double trunc(double x) { return __ocml_trunc_f64(x); }
+__DEVICE__
+inline
+double y0(double x) { return __ocml_y0_f64(x); }
+__DEVICE__
+inline
+double y1(double x) { return __ocml_y1_f64(x); }
+__DEVICE__
+inline
+double yn(int n, double x)
+{   // Forward recurrence x2 = (2 * i) / x * x1 - x0 seeded with y0(x), y1(x).
+    // n < 0 is not special-cased: the loop never runs and y1(x) is returned.
+    // TODO: Ahmes multiplication + Miller & Brown could give O(log n) steps,
+    //       but it's unclear if it'd help. Placeholder until OCML adds support.
+    if (n == 0) return y0(x);
+    if (n == 1) return y1(x);
+
+    double x0 = y0(x); // seeds must be the double y0/y1, not the float j0f/j1f
+    double x1 = y1(x);
+    for (int i = 1; i < n; ++i) {
+        double x2 = (2 * i) / x * x1 - x0;
+        x0 = x1;
+        x1 = x2;
+    }
+
+    return x1;
+}
+
+// BEGIN INTRINSICS
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline
+double __dadd_rd(double x, double y) { return __ocml_add_rtn_f64(x, y); }
+#endif
+__DEVICE__
+inline
+double __dadd_rn(double x, double y) { return x + y; }
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline
+double __dadd_ru(double x, double y) { return __ocml_add_rtp_f64(x, y); }
+__DEVICE__
+inline
+double __dadd_rz(double x, double y) { return __ocml_add_rtz_f64(x, y); }
+__DEVICE__
+inline
+double __ddiv_rd(double x, double y) { return __ocml_div_rtn_f64(x, y); }
+#endif
+__DEVICE__
+inline
+double __ddiv_rn(double x, double y) { return x / y; }
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline
+double __ddiv_ru(double x, double y) { return __ocml_div_rtp_f64(x, y); }
+__DEVICE__
+inline
+double __ddiv_rz(double x, double y) { return __ocml_div_rtz_f64(x, y); }
+__DEVICE__
+inline
+double __dmul_rd(double x, double y) { return __ocml_mul_rtn_f64(x, y); }
+#endif
+__DEVICE__
+inline
+double __dmul_rn(double x, double y) { return x * y; }
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline
+double __dmul_ru(double x, double y) { return __ocml_mul_rtp_f64(x, y); }
+__DEVICE__
+inline
+double __dmul_rz(double x, double y) { return __ocml_mul_rtz_f64(x, y); }
+__DEVICE__
+inline
+double __drcp_rd(double x) { return __llvm_amdgcn_rcp_f64(x); }
+#endif
+__DEVICE__
+inline
+double __drcp_rn(double x) { return __llvm_amdgcn_rcp_f64(x); }
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline
+double __drcp_ru(double x) { return __llvm_amdgcn_rcp_f64(x); }
+__DEVICE__
+inline
+double __drcp_rz(double x) { return __llvm_amdgcn_rcp_f64(x); }
+__DEVICE__
+inline
+double __dsqrt_rd(double x) { return __ocml_sqrt_rtn_f64(x); }
+#endif
+__DEVICE__
+inline
+double __dsqrt_rn(double x) { return __ocml_sqrt_f64(x); }
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline
+double __dsqrt_ru(double x) { return __ocml_sqrt_rtp_f64(x); }
+__DEVICE__
+inline
+double __dsqrt_rz(double x) { return __ocml_sqrt_rtz_f64(x); }
+__DEVICE__
+inline
+double __dsub_rd(double x, double y) { return __ocml_sub_rtn_f64(x, y); }
+#endif
+__DEVICE__
+inline
+double __dsub_rn(double x, double y) { return x - y; }
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline
+double __dsub_ru(double x, double y) { return __ocml_sub_rtp_f64(x, y); }
+__DEVICE__
+inline
+double __dsub_rz(double x, double y) { return __ocml_sub_rtz_f64(x, y); }
+__DEVICE__
+inline
+double __fma_rd(double x, double y, double z)
+{
+ return __ocml_fma_rtn_f64(x, y, z);
+}
+#endif
+__DEVICE__
+inline
+double __fma_rn(double x, double y, double z)
+{
+ return __ocml_fma_f64(x, y, z);
+}
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+inline
+double __fma_ru(double x, double y, double z)
+{
+ return __ocml_fma_rtp_f64(x, y, z);
+}
+__DEVICE__
+inline
+double __fma_rz(double x, double y, double z)
+{
+ return __ocml_fma_rtz_f64(x, y, z);
+}
+#endif
+// END INTRINSICS
+// END DOUBLE
+
+// BEGIN INTEGER
+__DEVICE__
+inline
+int abs(int x)
+{
+ int sgn = x >> (sizeof(int) * CHAR_BIT - 1);
+ return (x ^ sgn) - sgn;
+}
+__DEVICE__
+inline
+long labs(long x)
+{
+ long sgn = x >> (sizeof(long) * CHAR_BIT - 1);
+ return (x ^ sgn) - sgn;
+}
+__DEVICE__
+inline
+long long llabs(long long x)
+{
+ long long sgn = x >> (sizeof(long long) * CHAR_BIT - 1);
+ return (x ^ sgn) - sgn;
+}
+
+#if defined(__cplusplus)
+ __DEVICE__
+ inline
+ long abs(long x) { return labs(x); }
+ __DEVICE__
+ inline
+ long long abs(long long x) { return llabs(x); }
+#endif
+// END INTEGER
+
+__DEVICE__
+inline _Float16 fma(_Float16 x, _Float16 y, _Float16 z) {
+ return __ocml_fma_f16(x, y, z);
+}
+
+__DEVICE__
+inline float fma(float x, float y, float z) {
+ return fmaf(x, y, z);
+}
+
+#pragma push_macro("__DEF_FLOAT_FUN")
+#pragma push_macro("__DEF_FLOAT_FUN2")
+#pragma push_macro("__DEF_FLOAT_FUN2I")
+#pragma push_macro("__HIP_OVERLOAD")
+#pragma push_macro("__HIP_OVERLOAD2")
+
+// __hip_enable_if::type is a type function which returns __T if __B is true.
+template<bool __B, class __T = void>
+struct __hip_enable_if {};
+
+template <class __T> struct __hip_enable_if<true, __T> {
+ typedef __T type;
+};
+
+// __HIP_OVERLOAD1 is used to resolve function calls with an integer argument
+// to avoid a compilation error due to ambiguity, e.g. floor(5) is resolved
+// with floor(double).
+#define __HIP_OVERLOAD1(__retty, __fn) \
+ template <typename __T> \
+ __DEVICE__ \
+ typename __hip_enable_if<std::numeric_limits<__T>::is_integer, \
+ __retty>::type \
+ __fn(__T __x) { \
+ return ::__fn((double)__x); \
+ }
+
+// __HIP_OVERLOAD2 is used to resolve function calls with mixed float/double
+// or integer arguments to avoid a compilation error due to ambiguity, e.g.
+// max(5.0f, 6.0) is resolved with max(double, double).
+#define __HIP_OVERLOAD2(__retty, __fn) \
+ template <typename __T1, typename __T2> \
+ __DEVICE__ typename __hip_enable_if< \
+ std::numeric_limits<__T1>::is_specialized && \
+ std::numeric_limits<__T2>::is_specialized, \
+ __retty>::type \
+ __fn(__T1 __x, __T2 __y) { \
+ return __fn((double)__x, (double)__y); \
+ }
+
+// Define cmath functions with float argument and returns float.
+#define __DEF_FUN1(retty, func) \
+__DEVICE__ \
+inline \
+float func(float x) \
+{ \
+ return func##f(x); \
+} \
+__HIP_OVERLOAD1(retty, func)
+
+// Define cmath functions with float argument and returns retty.
+#define __DEF_FUNI(retty, func) \
+__DEVICE__ \
+inline \
+retty func(float x) \
+{ \
+ return func##f(x); \
+} \
+__HIP_OVERLOAD1(retty, func)
+
+// define cmath functions with two float arguments.
+#define __DEF_FUN2(retty, func) \
+__DEVICE__ \
+inline \
+float func(float x, float y) \
+{ \
+ return func##f(x, y); \
+} \
+__HIP_OVERLOAD2(retty, func)
+
+__DEF_FUN1(double, acos)
+__DEF_FUN1(double, acosh)
+__DEF_FUN1(double, asin)
+__DEF_FUN1(double, asinh)
+__DEF_FUN1(double, atan)
+__DEF_FUN2(double, atan2);
+__DEF_FUN1(double, atanh)
+__DEF_FUN1(double, cbrt)
+__DEF_FUN1(double, ceil)
+__DEF_FUN2(double, copysign);
+__DEF_FUN1(double, cos)
+__DEF_FUN1(double, cosh)
+__DEF_FUN1(double, erf)
+__DEF_FUN1(double, erfc)
+__DEF_FUN1(double, exp)
+__DEF_FUN1(double, exp2)
+__DEF_FUN1(double, expm1)
+__DEF_FUN1(double, fabs)
+__DEF_FUN2(double, fdim);
+__DEF_FUN1(double, floor)
+__DEF_FUN2(double, fmax);
+__DEF_FUN2(double, fmin);
+__DEF_FUN2(double, fmod);
+//__HIP_OVERLOAD1(int, fpclassify)
+__DEF_FUN2(double, hypot);
+__DEF_FUNI(int, ilogb)
+__HIP_OVERLOAD1(bool, isfinite)
+__HIP_OVERLOAD2(bool, isgreater);
+__HIP_OVERLOAD2(bool, isgreaterequal);
+__HIP_OVERLOAD1(bool, isinf);
+__HIP_OVERLOAD2(bool, isless);
+__HIP_OVERLOAD2(bool, islessequal);
+__HIP_OVERLOAD2(bool, islessgreater);
+__HIP_OVERLOAD1(bool, isnan);
+//__HIP_OVERLOAD1(bool, isnormal)
+__HIP_OVERLOAD2(bool, isunordered);
+__DEF_FUN1(double, lgamma)
+__DEF_FUN1(double, log)
+__DEF_FUN1(double, log10)
+__DEF_FUN1(double, log1p)
+__DEF_FUN1(double, log2)
+__DEF_FUN1(double, logb)
+__DEF_FUNI(long long, llrint)
+__DEF_FUNI(long long, llround)
+__DEF_FUNI(long, lrint)
+__DEF_FUNI(long, lround)
+__DEF_FUN1(double, nearbyint);
+__DEF_FUN2(double, nextafter);
+__DEF_FUN2(double, pow);
+__DEF_FUN2(double, remainder);
+__DEF_FUN1(double, rint);
+__DEF_FUN1(double, round);
+__HIP_OVERLOAD1(bool, signbit)
+__DEF_FUN1(double, sin)
+__DEF_FUN1(double, sinh)
+__DEF_FUN1(double, sqrt)
+__DEF_FUN1(double, tan)
+__DEF_FUN1(double, tanh)
+__DEF_FUN1(double, tgamma)
+__DEF_FUN1(double, trunc);
+
+// define cmath functions with a float and an integer argument.
+#define __DEF_FLOAT_FUN2I(func) \
+__DEVICE__ \
+inline \
+float func(float x, int y) \
+{ \
+ return func##f(x, y); \
+}
+__DEF_FLOAT_FUN2I(scalbn)
+__DEF_FLOAT_FUN2I(ldexp)
+
+template<class T>
+__DEVICE__ inline T min(T arg1, T arg2) {
+ return (arg1 < arg2) ? arg1 : arg2;
+}
+
+template<class T>
+__DEVICE__ inline T max(T arg1, T arg2) {
+ return (arg1 > arg2) ? arg1 : arg2;
+}
+
+#if __HCC__
+
+__DEVICE__ inline static uint32_t min(uint32_t arg1, int32_t arg2) {
+ return min(arg1, (uint32_t) arg2);
+}
+/*__DEVICE__ inline static uint32_t min(int32_t arg1, uint32_t arg2) {
+ return min((uint32_t) arg1, arg2);
+}
+
+__DEVICE__ inline static uint64_t min(uint64_t arg1, int64_t arg2) {
+ return min(arg1, (uint64_t) arg2);
+}
+__DEVICE__ inline static uint64_t min(int64_t arg1, uint64_t arg2) {
+ return min((uint64_t) arg1, arg2);
+}
+
+__DEVICE__ inline static unsigned long long min(unsigned long long arg1, long long arg2) {
+ return min(arg1, (unsigned long long) arg2);
+}
+__DEVICE__ inline static unsigned long long min(long long arg1, unsigned long long arg2) {
+ return min((unsigned long long) arg1, arg2);
+}*/
+
+__DEVICE__ inline static uint32_t max(uint32_t arg1, int32_t arg2) {
+ return max(arg1, (uint32_t) arg2);
+}
+__DEVICE__ inline static uint32_t max(int32_t arg1, uint32_t arg2) {
+ return max((uint32_t) arg1, arg2);
+}
+
+/*__DEVICE__ inline static uint64_t max(uint64_t arg1, int64_t arg2) {
+ return max(arg1, (uint64_t) arg2);
+}
+__DEVICE__ inline static uint64_t max(int64_t arg1, uint64_t arg2) {
+ return max((uint64_t) arg1, arg2);
+}
+
+__DEVICE__ inline static unsigned long long max(unsigned long long arg1, long long arg2) {
+ return max(arg1, (unsigned long long) arg2);
+}
+__DEVICE__ inline static unsigned long long max(long long arg1, unsigned long long arg2) {
+ return max((unsigned long long) arg1, arg2);
+}*/
+#else
+__DEVICE__ inline int min(int arg1, int arg2) {
+ return (arg1 < arg2) ? arg1 : arg2;
+}
+__DEVICE__ inline int max(int arg1, int arg2) {
+ return (arg1 > arg2) ? arg1 : arg2;
+}
+
+__DEVICE__ inline int min(uint32_t arg1, int arg2) {
+ return (arg1 < arg2) ? arg1 : arg2;
+}
+__DEVICE__ inline int max(uint32_t arg1, int arg2) {
+ return (arg1 > arg2) ? arg1 : arg2;
+}
+
+__DEVICE__
+inline
+float max(float x, float y) {
+ return fmaxf(x, y);
+}
+
+__DEVICE__
+inline
+double max(double x, double y) {
+ return fmax(x, y);
+}
+
+__DEVICE__
+inline
+float min(float x, float y) {
+ return fminf(x, y);
+}
+
+__DEVICE__
+inline
+double min(double x, double y) {
+ return fmin(x, y);
+}
+
+__HIP_OVERLOAD2(double, max)
+__HIP_OVERLOAD2(double, min)
+
+#endif
+
+__host__ inline static int min(int arg1, int arg2) {
+ return std::min(arg1, arg2);
+}
+
+__host__ inline static int max(int arg1, int arg2) {
+ return std::max(arg1, arg2);
+}
+
+__DEVICE__
+inline float pow(float base, int iexp) {
+ return powif(base, iexp);
+}
+
+__DEVICE__
+inline double pow(double base, int iexp) {
+ return powi(base, iexp);
+}
+
+__DEVICE__
+inline _Float16 pow(_Float16 base, int iexp) {
+ return __ocml_pown_f16(base, iexp);
+}
+
+#pragma pop_macro("__DEF_FLOAT_FUN")
+#pragma pop_macro("__DEF_FLOAT_FUN2")
+#pragma pop_macro("__DEF_FLOAT_FUN2I")
+#pragma pop_macro("__HIP_OVERLOAD")
+#pragma pop_macro("__HIP_OVERLOAD2")
+
+#endif // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
+
+#pragma pop_macro("__DEVICE__")
+#pragma pop_macro("__RETURN_TYPE")
+
+// For backward compatibility.
+// There are HIP applications e.g. TensorFlow, expecting __HIP_ARCH_* macros
+// defined after including math_functions.h.
+#include <hip/hcc_detail/hip_runtime.h>
diff --git a/third_party/rocm/include/hip/hcc_detail/math_fwd.h b/third_party/rocm/include/hip/hcc_detail/math_fwd.h
new file mode 100644
index 0000000..c197af8
--- /dev/null
+++ b/third_party/rocm/include/hip/hcc_detail/math_fwd.h
@@ -0,0 +1,714 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#include "host_defines.h"
+#if defined(__cplusplus)
+ extern "C" {
+#endif
+
+// DOT FUNCTIONS: __ockl_{s,u}dot{2,4,8} prototypes, declared only when the guard on the next line holds.
+#if (__hcc_workweek__ >= 19015) || __HIP_CLANG_ONLY__
+__device__
+__attribute__((const))
+int __ockl_sdot2(  // NOTE(review): HIP_vector_base is not declared in this view; presumably the includer provides it — confirm
+ HIP_vector_base<short, 2>::Native_vec_,
+ HIP_vector_base<short, 2>::Native_vec_,
+ int, bool);
+
+__device__
+__attribute__((const))
+unsigned int __ockl_udot2(
+ HIP_vector_base<unsigned short, 2>::Native_vec_,
+ HIP_vector_base<unsigned short, 2>::Native_vec_,
+ unsigned int, bool);
+
+__device__
+__attribute__((const))
+int __ockl_sdot4(
+ HIP_vector_base<char, 4>::Native_vec_,
+ HIP_vector_base<char, 4>::Native_vec_,
+ int, bool);
+
+__device__
+__attribute__((const))
+unsigned int __ockl_udot4(
+ HIP_vector_base<unsigned char, 4>::Native_vec_,
+ HIP_vector_base<unsigned char, 4>::Native_vec_,
+ unsigned int, bool);
+
+__device__
+__attribute__((const))
+int __ockl_sdot8(int, int, int, bool);  // the 8-way variants take plain 32-bit integers instead of vector types
+
+__device__
+__attribute__((const))
+unsigned int __ockl_udot8(unsigned int, unsigned int, unsigned int, bool);
+#endif
+
+#if !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
+// BEGIN FLOAT: single-precision __ocml_* prototypes; only declarations appear in this header.
+__device__
+__attribute__((const))  // const: the result depends only on the argument values
+float __ocml_acos_f32(float);
+__device__
+__attribute__((pure))  // pure: no side effects, but the function may read memory other than its arguments
+float __ocml_acosh_f32(float);
+__device__
+__attribute__((const))
+float __ocml_asin_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_asinh_f32(float);
+__device__
+__attribute__((const))
+float __ocml_atan2_f32(float, float);
+__device__
+__attribute__((const))
+float __ocml_atan_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_atanh_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_cbrt_f32(float);
+__device__
+__attribute__((const))
+float __ocml_ceil_f32(float);
+// copysign is declared const: its result depends only on the two operands.
+__device__
+__attribute__((const))
+float __ocml_copysign_f32(float, float);
+__device__
+float __ocml_cos_f32(float);
+__device__
+float __ocml_native_cos_f32(float);
+// cosh is declared pure (no side effects).
+__device__
+__attribute__((pure))
+float __ocml_cosh_f32(float);
+__device__
+float __ocml_cospi_f32(float);  // declared without const/pure, like the other trig entry points in this list
+__device__
+float __ocml_i0_f32(float);  // NOTE(review): i0/i1 are presumably modified Bessel functions — confirm against the OCML docs
+__device__
+float __ocml_i1_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_erfc_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_erfcinv_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_erfcx_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_erf_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_erfinv_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_exp10_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_native_exp10_f32(float);  // NOTE(review): native_* variants are presumably faster, lower-accuracy versions — confirm
+__device__
+__attribute__((pure))
+float __ocml_exp2_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_exp_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_native_exp_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_expm1_f32(float);
+__device__
+__attribute__((const))
+float __ocml_fabs_f32(float);
+__device__
+__attribute__((const))
+float __ocml_fdim_f32(float, float);
+__device__
+__attribute__((const))
+float __ocml_floor_f32(float);
+__device__
+__attribute__((const))
+float __ocml_fma_f32(float, float, float);
+__device__
+__attribute__((const))
+float __ocml_fmax_f32(float, float);
+__device__
+__attribute__((const))
+float __ocml_fmin_f32(float, float);
+// fmod is declared const: its result depends only on the two operands.
+__device__
+__attribute__((const))
+float __ocml_fmod_f32(float, float);
+__device__
+float __ocml_frexp_f32(float, __attribute__((address_space(5))) int*);  // pointer argument is in address space 5; no const/pure attribute
+__device__
+__attribute__((const))
+float __ocml_hypot_f32(float, float);
+__device__
+__attribute__((const))
+int __ocml_ilogb_f32(float);
+__device__
+__attribute__((const))
+int __ocml_isfinite_f32(float);  // the classification predicates return int, not bool
+__device__
+__attribute__((const))
+int __ocml_isinf_f32(float);
+__device__
+__attribute__((const))
+int __ocml_isnan_f32(float);
+__device__
+float __ocml_j0_f32(float);
+__device__
+float __ocml_j1_f32(float);
+__device__
+__attribute__((const))
+float __ocml_ldexp_f32(float, int);
+__device__
+float __ocml_lgamma_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_log10_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_native_log10_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_log1p_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_log2_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_native_log2_f32(float);
+__device__
+__attribute__((const))
+float __ocml_logb_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_log_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_native_log_f32(float);
+__device__
+float __ocml_modf_f32(float, __attribute__((address_space(5))) float*);
+__device__
+__attribute__((const))
+float __ocml_nearbyint_f32(float);
+__device__
+__attribute__((const))
+float __ocml_nextafter_f32(float, float);
+__device__
+__attribute__((const))
+float __ocml_len3_f32(float, float, float);
+__device__
+__attribute__((const))
+float __ocml_len4_f32(float, float, float, float);
+__device__
+__attribute__((pure))
+float __ocml_ncdf_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_ncdfinv_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_pow_f32(float, float);
+__device__
+__attribute__((pure))
+float __ocml_pown_f32(float, int);  // integer-exponent form: second parameter is int
+__device__
+__attribute__((pure))
+float __ocml_rcbrt_f32(float);
+__device__
+__attribute__((const))
+float __ocml_remainder_f32(float, float);
+__device__
+float __ocml_remquo_f32(float, float, __attribute__((address_space(5))) int*);
+__device__
+__attribute__((const))
+float __ocml_rhypot_f32(float, float);
+__device__
+__attribute__((const))
+float __ocml_rint_f32(float);
+__device__
+__attribute__((const))
+float __ocml_rlen3_f32(float, float, float);
+__device__
+__attribute__((const))
+float __ocml_rlen4_f32(float, float, float, float);
+__device__
+__attribute__((const))
+float __ocml_round_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_rsqrt_f32(float);
+__device__
+__attribute__((const))
+float __ocml_scalb_f32(float, float);
+__device__
+__attribute__((const))
+float __ocml_scalbn_f32(float, int);
+__device__
+__attribute__((const))
+int __ocml_signbit_f32(float);
+__device__
+float __ocml_sincos_f32(float, __attribute__((address_space(5))) float*);  // NOTE(review): presumably returns one of sin/cos and stores the other through the pointer — confirm
+__device__
+float __ocml_sincospi_f32(float, __attribute__((address_space(5))) float*);
+__device__
+float __ocml_sin_f32(float);
+__device__
+float __ocml_native_sin_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_sinh_f32(float);
+__device__
+float __ocml_sinpi_f32(float);
+__device__
+__attribute__((const))
+float __ocml_sqrt_f32(float);
+__device__
+__attribute__((const))
+float __ocml_native_sqrt_f32(float);
+__device__
+float __ocml_tan_f32(float);
+__device__
+__attribute__((pure))
+float __ocml_tanh_f32(float);
+__device__
+float __ocml_tgamma_f32(float);
+__device__
+__attribute__((const))
+float __ocml_trunc_f32(float);
+__device__
+float __ocml_y0_f32(float);
+__device__
+float __ocml_y1_f32(float);
+
+// BEGIN INTRINSICS: add/sub/mul/div/sqrt/fma with a rounding suffix. NOTE(review): rte/rtn/rtp/rtz presumably mean nearest-even / toward -inf / toward +inf / toward zero — confirm.
+__device__
+__attribute__((const))
+float __ocml_add_rte_f32(float, float);
+__device__
+__attribute__((const))
+float __ocml_add_rtn_f32(float, float);
+__device__
+__attribute__((const))
+float __ocml_add_rtp_f32(float, float);
+__device__
+__attribute__((const))
+float __ocml_add_rtz_f32(float, float);
+__device__
+__attribute__((const))
+float __ocml_sub_rte_f32(float, float);
+__device__
+__attribute__((const))
+float __ocml_sub_rtn_f32(float, float);
+__device__
+__attribute__((const))
+float __ocml_sub_rtp_f32(float, float);
+__device__
+__attribute__((const))
+float __ocml_sub_rtz_f32(float, float);
+__device__
+__attribute__((const))
+float __ocml_mul_rte_f32(float, float);
+__device__
+__attribute__((const))
+float __ocml_mul_rtn_f32(float, float);
+__device__
+__attribute__((const))
+float __ocml_mul_rtp_f32(float, float);
+__device__
+__attribute__((const))
+float __ocml_mul_rtz_f32(float, float);
+__device__
+__attribute__((const))
+float __ocml_div_rte_f32(float, float);
+__device__
+__attribute__((const))
+float __ocml_div_rtn_f32(float, float);
+__device__
+__attribute__((const))
+float __ocml_div_rtp_f32(float, float);
+__device__
+__attribute__((const))
+float __ocml_div_rtz_f32(float, float);
+__device__
+__attribute__((const))
+float __ocml_sqrt_rte_f32(float);
+__device__
+__attribute__((const))
+float __ocml_sqrt_rtn_f32(float);
+__device__
+__attribute__((const))
+float __ocml_sqrt_rtp_f32(float);
+__device__
+__attribute__((const))
+float __ocml_sqrt_rtz_f32(float);
+__device__
+__attribute__((const))
+float __ocml_fma_rte_f32(float, float, float);
+__device__
+__attribute__((const))
+float __ocml_fma_rtn_f32(float, float, float);
+__device__
+__attribute__((const))
+float __ocml_fma_rtp_f32(float, float, float);
+__device__
+__attribute__((const))
+float __ocml_fma_rtz_f32(float, float, float);
+// The asm labels below give each prototype the symbol name of the named LLVM AMDGPU intrinsic.
+__device__
+__attribute__((const))
+float __llvm_amdgcn_cos_f32(float) __asm("llvm.amdgcn.cos.f32");
+__device__
+__attribute__((const))
+float __llvm_amdgcn_rcp_f32(float) __asm("llvm.amdgcn.rcp.f32");
+__device__
+__attribute__((const))
+float __llvm_amdgcn_rsq_f32(float) __asm("llvm.amdgcn.rsq.f32");
+__device__
+__attribute__((const))
+float __llvm_amdgcn_sin_f32(float) __asm("llvm.amdgcn.sin.f32");
+// END INTRINSICS
+// END FLOAT
+
+// BEGIN DOUBLE: double-precision __ocml_* prototypes; only declarations appear in this header.
+__device__
+__attribute__((const))  // const: the result depends only on the argument values
+double __ocml_acos_f64(double);
+__device__
+__attribute__((pure))  // pure: no side effects, but the function may read memory other than its arguments
+double __ocml_acosh_f64(double);
+__device__
+__attribute__((const))
+double __ocml_asin_f64(double);
+__device__
+__attribute__((pure))
+double __ocml_asinh_f64(double);
+__device__
+__attribute__((const))
+double __ocml_atan2_f64(double, double);
+__device__
+__attribute__((const))
+double __ocml_atan_f64(double);
+__device__
+__attribute__((pure))
+double __ocml_atanh_f64(double);
+__device__
+__attribute__((pure))
+double __ocml_cbrt_f64(double);
+__device__
+__attribute__((const))
+double __ocml_ceil_f64(double);
+__device__
+__attribute__((const))
+double __ocml_copysign_f64(double, double);
+__device__
+double __ocml_cos_f64(double);
+__device__
+__attribute__((pure))
+double __ocml_cosh_f64(double);
+__device__
+double __ocml_cospi_f64(double);
+__device__
+double __ocml_i0_f64(double);
+__device__
+double __ocml_i1_f64(double);
+__device__
+__attribute__((pure))
+double __ocml_erfc_f64(double);
+__device__
+__attribute__((pure))
+double __ocml_erfcinv_f64(double);
+__device__
+__attribute__((pure))
+double __ocml_erfcx_f64(double);
+__device__
+__attribute__((pure))
+double __ocml_erf_f64(double);
+__device__
+__attribute__((pure))
+double __ocml_erfinv_f64(double);
+__device__
+__attribute__((pure))
+double __ocml_exp10_f64(double);
+__device__
+__attribute__((pure))
+double __ocml_exp2_f64(double);
+__device__
+__attribute__((pure))
+double __ocml_exp_f64(double);
+__device__
+__attribute__((pure))
+double __ocml_expm1_f64(double);
+__device__
+__attribute__((const))
+double __ocml_fabs_f64(double);
+__device__
+__attribute__((const))
+double __ocml_fdim_f64(double, double);
+__device__
+__attribute__((const))
+double __ocml_floor_f64(double);
+__device__
+__attribute__((const))
+double __ocml_fma_f64(double, double, double);
+__device__
+__attribute__((const))
+double __ocml_fmax_f64(double, double);
+__device__
+__attribute__((const))
+double __ocml_fmin_f64(double, double);
+__device__
+__attribute__((const))
+double __ocml_fmod_f64(double, double);
+__device__
+double __ocml_frexp_f64(double, __attribute__((address_space(5))) int*);  // pointer argument is in address space 5; no const/pure attribute
+__device__
+__attribute__((const))
+double __ocml_hypot_f64(double, double);
+__device__
+__attribute__((const))
+int __ocml_ilogb_f64(double);
+__device__
+__attribute__((const))
+int __ocml_isfinite_f64(double);  // the classification predicates return int, not bool
+__device__
+__attribute__((const))
+int __ocml_isinf_f64(double);
+__device__
+__attribute__((const))
+int __ocml_isnan_f64(double);
+__device__
+double __ocml_j0_f64(double);
+__device__
+double __ocml_j1_f64(double);
+__device__
+__attribute__((const))
+double __ocml_ldexp_f64(double, int);
+__device__
+double __ocml_lgamma_f64(double);
+__device__
+__attribute__((pure))
+double __ocml_log10_f64(double);
+__device__
+__attribute__((pure))
+double __ocml_log1p_f64(double);
+__device__
+__attribute__((pure))
+double __ocml_log2_f64(double);
+__device__
+__attribute__((const))
+double __ocml_logb_f64(double);
+__device__
+__attribute__((pure))
+double __ocml_log_f64(double);
+__device__
+double __ocml_modf_f64(double, __attribute__((address_space(5))) double*);
+__device__
+__attribute__((const))
+double __ocml_nearbyint_f64(double);
+__device__
+__attribute__((const))
+double __ocml_nextafter_f64(double, double);
+__device__
+__attribute__((const))
+double __ocml_len3_f64(double, double, double);
+__device__
+__attribute__((const))
+double __ocml_len4_f64(double, double, double, double);
+__device__
+__attribute__((pure))
+double __ocml_ncdf_f64(double);
+__device__
+__attribute__((pure))
+double __ocml_ncdfinv_f64(double);
+__device__
+__attribute__((pure))
+double __ocml_pow_f64(double, double);
+__device__
+__attribute__((pure))
+double __ocml_pown_f64(double, int);  // integer-exponent form: second parameter is int
+__device__
+__attribute__((pure))
+double __ocml_rcbrt_f64(double);
+__device__
+__attribute__((const))
+double __ocml_remainder_f64(double, double);
+__device__
+double __ocml_remquo_f64(
+ double, double, __attribute__((address_space(5))) int*);
+__device__
+__attribute__((const))
+double __ocml_rhypot_f64(double, double);
+__device__
+__attribute__((const))
+double __ocml_rint_f64(double);
+__device__
+__attribute__((const))
+double __ocml_rlen3_f64(double, double, double);
+__device__
+__attribute__((const))
+double __ocml_rlen4_f64(double, double, double, double);
+__device__
+__attribute__((const))
+double __ocml_round_f64(double);
+__device__
+__attribute__((pure))
+double __ocml_rsqrt_f64(double);
+__device__
+__attribute__((const))
+double __ocml_scalb_f64(double, double);
+__device__
+__attribute__((const))
+double __ocml_scalbn_f64(double, int);
+__device__
+__attribute__((const))
+int __ocml_signbit_f64(double);
+__device__
+double __ocml_sincos_f64(double, __attribute__((address_space(5))) double*);  // NOTE(review): presumably returns one of sin/cos and stores the other through the pointer — confirm
+__device__
+double __ocml_sincospi_f64(double, __attribute__((address_space(5))) double*);
+__device__
+double __ocml_sin_f64(double);
+__device__
+__attribute__((pure))
+double __ocml_sinh_f64(double);
+__device__
+double __ocml_sinpi_f64(double);
+__device__
+__attribute__((const))
+double __ocml_sqrt_f64(double);
+__device__
+double __ocml_tan_f64(double);
+__device__
+__attribute__((pure))
+double __ocml_tanh_f64(double);
+__device__
+double __ocml_tgamma_f64(double);
+__device__
+__attribute__((const))
+double __ocml_trunc_f64(double);
+__device__
+double __ocml_y0_f64(double);
+__device__
+double __ocml_y1_f64(double);
+
+// BEGIN INTRINSICS: add/sub/mul/div/sqrt/fma with a rounding suffix. NOTE(review): rte/rtn/rtp/rtz presumably mean nearest-even / toward -inf / toward +inf / toward zero — confirm.
+__device__
+__attribute__((const))
+double __ocml_add_rte_f64(double, double);
+__device__
+__attribute__((const))
+double __ocml_add_rtn_f64(double, double);
+__device__
+__attribute__((const))
+double __ocml_add_rtp_f64(double, double);
+__device__
+__attribute__((const))
+double __ocml_add_rtz_f64(double, double);
+__device__
+__attribute__((const))
+double __ocml_sub_rte_f64(double, double);
+__device__
+__attribute__((const))
+double __ocml_sub_rtn_f64(double, double);
+__device__
+__attribute__((const))
+double __ocml_sub_rtp_f64(double, double);
+__device__
+__attribute__((const))
+double __ocml_sub_rtz_f64(double, double);
+__device__
+__attribute__((const))
+double __ocml_mul_rte_f64(double, double);
+__device__
+__attribute__((const))
+double __ocml_mul_rtn_f64(double, double);
+__device__
+__attribute__((const))
+double __ocml_mul_rtp_f64(double, double);
+__device__
+__attribute__((const))
+double __ocml_mul_rtz_f64(double, double);
+__device__
+__attribute__((const))
+double __ocml_div_rte_f64(double, double);
+__device__
+__attribute__((const))
+double __ocml_div_rtn_f64(double, double);
+__device__
+__attribute__((const))
+double __ocml_div_rtp_f64(double, double);
+__device__
+__attribute__((const))
+double __ocml_div_rtz_f64(double, double);
+__device__
+__attribute__((const))
+double __ocml_sqrt_rte_f64(double);
+__device__
+__attribute__((const))
+double __ocml_sqrt_rtn_f64(double);
+__device__
+__attribute__((const))
+double __ocml_sqrt_rtp_f64(double);
+__device__
+__attribute__((const))
+double __ocml_sqrt_rtz_f64(double);
+__device__
+__attribute__((const))
+double __ocml_fma_rte_f64(double, double, double);
+__device__
+__attribute__((const))
+double __ocml_fma_rtn_f64(double, double, double);
+__device__
+__attribute__((const))
+double __ocml_fma_rtp_f64(double, double, double);
+__device__
+__attribute__((const))
+double __ocml_fma_rtz_f64(double, double, double);
+// The asm labels below give each prototype the symbol name of the named LLVM AMDGPU intrinsic.
+__device__
+__attribute__((const))
+double __llvm_amdgcn_rcp_f64(double) __asm("llvm.amdgcn.rcp.f64");
+__device__
+__attribute__((const))
+double __llvm_amdgcn_rsq_f64(double) __asm("llvm.amdgcn.rsq.f64");
+// END INTRINSICS
+// END DOUBLE
+
+#endif // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
+
+#if defined(__cplusplus)
+ } // extern "C"
+#endif
diff --git a/third_party/rocm/include/hip/hcc_detail/ockl_image.h b/third_party/rocm/include/hip/hcc_detail/ockl_image.h
new file mode 100644
index 0000000..b32b23f
--- /dev/null
+++ b/third_party/rocm/include/hip/hcc_detail/ockl_image.h
@@ -0,0 +1,135 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#include <hip/hip_vector_types.h>
+
+extern "C" {
+// Qualifier applied to every descriptor pointer argument below (address space 4). NOTE(review): the macro is not #undef'd here; other headers may rely on it — confirm before scoping it.
+#define ADDRESS_SPACE_CONSTANT __attribute__((address_space(4)))
+// Image loads. NOTE(review): suffixes presumably name the image kind (1D/2D/3D, 'a' = array, 'b' = buffer, CM = cube map) — confirm.
+__device__ float4::Native_vec_ __ockl_image_load_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, int c);
+
+__device__ float4::Native_vec_ __ockl_image_load_1Db(unsigned int ADDRESS_SPACE_CONSTANT*i, int c);
+
+__device__ float4::Native_vec_ __ockl_image_load_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c);
+
+__device__ float4::Native_vec_ __ockl_image_load_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c);
+
+__device__ float4::Native_vec_ __ockl_image_load_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c);
+
+__device__ float4::Native_vec_ __ockl_image_load_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c);
+
+__device__ float4::Native_vec_ __ockl_image_load_CM(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int f);
+
+__device__ float4::Native_vec_ __ockl_image_load_CMa(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int f);
+
+__device__ float4::Native_vec_ __ockl_image_load_lod_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, int c, int l);
+
+__device__ float4::Native_vec_ __ockl_image_load_lod_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int l);
+
+__device__ float4::Native_vec_ __ockl_image_load_lod_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int l);
+
+__device__ float4::Native_vec_ __ockl_image_load_lod_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int l);
+
+__device__ float4::Native_vec_ __ockl_image_load_lod_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int l);
+
+__device__ float4::Native_vec_ __ockl_image_load_lod_CM(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int f, int l);
+
+__device__ float4::Native_vec_ __ockl_image_load_lod_CMa(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int f, int l);
+// Image stores return void. NOTE(review): p is presumably the value written at coordinate c — confirm.
+__device__ void __ockl_image_store_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, int c, float4::Native_vec_ p);
+
+__device__ void __ockl_image_store_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, float4::Native_vec_ p);
+
+__device__ void __ockl_image_store_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, float4::Native_vec_ p);
+
+__device__ void __ockl_image_store_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, float4::Native_vec_ p);
+
+__device__ void __ockl_image_store_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, float4::Native_vec_ p);
+
+__device__ void __ockl_image_store_CM(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, float4::Native_vec_ p);
+
+__device__ void __ockl_image_store_CMa(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, float4::Native_vec_ p);
+
+__device__ void __ockl_image_store_lod_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, int c, int l, float4::Native_vec_ p);
+
+__device__ void __ockl_image_store_lod_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int l, float4::Native_vec_ p);
+
+__device__ void __ockl_image_store_lod_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int l, float4::Native_vec_ p);
+
+__device__ void __ockl_image_store_lod_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int l, float4::Native_vec_ p);
+
+__device__ void __ockl_image_store_lod_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int l, float4::Native_vec_ p);
+
+__device__ void __ockl_image_store_lod_CM(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int l, float4::Native_vec_ p);
+
+__device__ void __ockl_image_store_lod_CMa(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int l, float4::Native_vec_ p);
+// Sampled reads take a second descriptor pointer s in addition to i and use float coordinates.
+__device__ float4::Native_vec_ __ockl_image_sample_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float c);
+
+__device__ float4::Native_vec_ __ockl_image_sample_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c);
+
+__device__ float4::Native_vec_ __ockl_image_sample_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c);
+
+__device__ float4::Native_vec_ __ockl_image_sample_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c);
+
+__device__ float4::Native_vec_ __ockl_image_sample_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c);
+
+__device__ float4::Native_vec_ __ockl_image_sample_CM(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c);
+
+__device__ float4::Native_vec_ __ockl_image_sample_CMa(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c);
+
+__device__ float4::Native_vec_ __ockl_image_sample_grad_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float c, float dx, float dy);
+
+__device__ float4::Native_vec_ __ockl_image_sample_grad_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c, float dx, float dy);
+
+__device__ float4::Native_vec_ __ockl_image_sample_grad_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c, float2::Native_vec_ dx, float2::Native_vec_ dy);
+
+__device__ float4::Native_vec_ __ockl_image_sample_grad_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c, float2::Native_vec_ dx, float2::Native_vec_ dy);
+
+__device__ float4::Native_vec_ __ockl_image_sample_grad_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c, float4::Native_vec_ dx, float4::Native_vec_ dy);
+
+__device__ float4::Native_vec_ __ockl_image_sample_lod_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float c, float l);
+
+__device__ float4::Native_vec_ __ockl_image_sample_lod_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c, float l);
+
+__device__ float4::Native_vec_ __ockl_image_sample_lod_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c, float l);
+
+__device__ float4::Native_vec_ __ockl_image_sample_lod_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c, float l);
+
+__device__ float4::Native_vec_ __ockl_image_sample_lod_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c, float l);
+
+__device__ float4::Native_vec_ __ockl_image_sample_lod_CM(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c, float l);
+
+__device__ float4::Native_vec_ __ockl_image_sample_lod_CMa(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c, float l);
+// gather4{r,g,b,a}: declared for 2D images only.
+__device__ float4::Native_vec_ __ockl_image_gather4r_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c);
+
+__device__ float4::Native_vec_ __ockl_image_gather4g_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c);
+
+__device__ float4::Native_vec_ __ockl_image_gather4b_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c);
+
+__device__ float4::Native_vec_ __ockl_image_gather4a_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c);
+
+}  // extern "C"
diff --git a/third_party/rocm/include/hip/hcc_detail/program_state.hpp b/third_party/rocm/include/hip/hcc_detail/program_state.hpp
new file mode 100644
index 0000000..6128a4c
--- /dev/null
+++ b/third_party/rocm/include/hip/hcc_detail/program_state.hpp
@@ -0,0 +1,107 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#include <hsa/amd_hsa_kernel_code.h>
+#include <hsa/hsa.h>
+#include <hsa/hsa_ext_amd.h>
+#include <hsa/hsa_ven_amd_loader.h>
+
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+
+#include <hip/hip_common.h>
+
+struct ihipModuleSymbol_t;
+using hipFunction_t = ihipModuleSymbol_t*;
+
+namespace hip_impl {
+
+// This section contains internal APIs that
+// need to be exported.
+#ifdef __GNUC__
+#pragma GCC visibility push (default)
+#endif
+// Byte buffer exposing data/size/reserve/resize. NOTE(review): presumably holds packed kernel arguments — confirm.
+struct kernarg_impl;
+class kernarg {
+public:
+ kernarg();
+ kernarg(kernarg&&);  // user-declared move constructor: the implicit copy constructor/assignment are deleted
+ ~kernarg();
+ std::uint8_t* data();  // accessors are non-const members
+ std::size_t size();
+ void reserve(std::size_t);
+ void resize(std::size_t);
+private:
+ kernarg_impl* impl;  // kernarg_impl is only forward-declared here; its definition lives elsewhere
+};
+
+class kernargs_size_align;
+class program_state_impl;
+// Front end over an out-of-line program_state_impl; non-copyable.
+class program_state {
+public:
+    program_state();
+    ~program_state();
+    // Copying would alias the raw impl pointer, so both copy operations are deleted.
+    program_state(const program_state&) = delete;
+    program_state& operator=(const program_state&) = delete;
+
+    hipFunction_t kernel_descriptor(std::uintptr_t, hsa_agent_t);
+    kernargs_size_align get_kernargs_size_align(std::uintptr_t);
+
+    // NOTE(review): arguments look like (code blob, its byte size, target executable, agent) — confirm.
+    hsa_executable_t load_executable(const char*, std::size_t, hsa_executable_t, hsa_agent_t);
+    hsa_executable_t load_executable_no_copy(const char*, std::size_t, hsa_executable_t,
+                                             hsa_agent_t);
+
+    void* global_addr_by_name(const char* name);
+
+private:
+    friend class agent_globals_impl;
+    program_state_impl* impl;
+};
+// Read-only view over a private handle. NOTE(review): size(n)/alignment(n) presumably describe the n-th kernel argument — confirm.
+class kernargs_size_align {
+    // program_state::get_kernargs_size_align is the only non-member granted access to handle.
+    friend kernargs_size_align program_state::get_kernargs_size_align(std::uintptr_t);
+    const void* handle;
+public:
+    std::size_t size(std::size_t n) const;
+    std::size_t alignment(std::size_t n) const;
+    const void* getHandle() const { return handle; }
+};
+
+#ifdef __GNUC__
+#pragma GCC visibility pop
+#endif
+
+// Returns a function-local static program_state, constructed on the first call.
+// NOTE(review): hidden visibility presumably gives each shared object its own instance — confirm.
+inline __attribute__((visibility("hidden"))) program_state& get_program_state() {
+    static program_state instance;
+    return instance;
+}
+} // Namespace hip_impl.
diff --git a/third_party/rocm/include/hip/hcc_detail/surface_functions.h b/third_party/rocm/include/hip/hcc_detail/surface_functions.h
new file mode 100644
index 0000000..b9cab1f
--- /dev/null
+++ b/third_party/rocm/include/hip/hcc_detail/surface_functions.h
@@ -0,0 +1,59 @@
+/*
+Copyright (c) 2018 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_SURFACE_FUNCTIONS_H
+#define HIP_INCLUDE_HIP_HCC_DETAIL_SURFACE_FUNCTIONS_H
+
+#include <hip/hcc_detail/hip_surface_types.h>
+
+#define __SURFACE_FUNCTIONS_DECL__ static inline __device__
+// Reads one element of type T from a 2D surface into *data.
+//
+// surfObj is reinterpreted as a hipArray*. x is a byte offset within the row
+// (it is divided by sizeof(T) to get the element column); y is the row index.
+// Valid columns are [0, width) and valid rows are [0, height): a column equal
+// to width or a row equal to height is out of range.
+// For an out-of-range coordinate, *data is set to 0 when boundaryMode is
+// hipBoundaryModeZero and is left untouched for any other mode.
+template <class T>
+__SURFACE_FUNCTIONS_DECL__ void surf2Dread(T* data, hipSurfaceObject_t surfObj, int x, int y,
+                                           int boundaryMode = hipBoundaryModeZero) {
+    hipArray* arrayPtr = (hipArray*)surfObj;
+    size_t width = arrayPtr->width;
+    size_t height = arrayPtr->height;
+    T* dataPtr = (T*)arrayPtr->data;
+
+    // Reject negative coordinates before any signed-to-unsigned conversion.
+    bool inRange = (x >= 0) && (y >= 0);
+    size_t xOffset = 0;
+    if (inRange) {
+        xOffset = (size_t)x / sizeof(T);
+        inRange = (xOffset < width) && ((size_t)y < height);
+    }
+
+    if (inRange) {
+        *data = *(dataPtr + (size_t)y * width + xOffset);
+    } else if (boundaryMode == hipBoundaryModeZero) {
+        *data = 0;
+    }
+}
+
+// Writes one element of type T to a 2D surface.
+//
+// surfObj is reinterpreted as a hipArray*. x is a byte offset within the row
+// (it is divided by sizeof(T) to get the element column); y is the row index.
+// Valid columns are [0, width) and valid rows are [0, height); a write to any
+// other coordinate is silently dropped. boundaryMode is accepted but not
+// consulted.
+template <class T>
+__SURFACE_FUNCTIONS_DECL__ void surf2Dwrite(T data, hipSurfaceObject_t surfObj, int x, int y,
+                                            int boundaryMode = hipBoundaryModeZero) {
+    hipArray* arrayPtr = (hipArray*)surfObj;
+    size_t width = arrayPtr->width;
+    size_t height = arrayPtr->height;
+    T* dataPtr = (T*)arrayPtr->data;
+
+    // Reject negative coordinates before any signed-to-unsigned conversion.
+    if ((x < 0) || (y < 0)) {
+        return;
+    }
+    size_t xOffset = (size_t)x / sizeof(T);
+    if ((xOffset < width) && ((size_t)y < height)) {
+        *(dataPtr + (size_t)y * width + xOffset) = data;
+    }
+}
+
+#endif
diff --git a/third_party/rocm/include/hip/hcc_detail/texture_fetch_functions.h b/third_party/rocm/include/hip/hcc_detail/texture_fetch_functions.h
new file mode 100644
index 0000000..03c1780
--- /dev/null
+++ b/third_party/rocm/include/hip/hcc_detail/texture_fetch_functions.h
@@ -0,0 +1,386 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#if defined(__cplusplus)
+
+#include <hip/hip_vector_types.h>
+#include <hip/texture_types.h>
+#include <hip/hcc_detail/ockl_image.h>
+
+#include <type_traits>
+
+// Declares the two locals used by the fetch functions below. It expects a
+// texture reference named `t` in scope and produces:
+//   i - t.textureObject viewed as a pointer to constant-address-space dwords
+//   s - i advanced by HIP_SAMPLER_OBJECT_OFFSET_DWORD dwords
+// The expansion already ends with ';'.
+#define TEXTURE_PARAMETERS_INIT \
+ unsigned int ADDRESS_SPACE_CONSTANT* i = (unsigned int ADDRESS_SPACE_CONSTANT*)t.textureObject; \
+ unsigned int ADDRESS_SPACE_CONSTANT* s = i + HIP_SAMPLER_OBJECT_OFFSET_DWORD;
+
+// Trait: `value` is true when T is one of the scalar types accepted as a
+// texture channel - float, or char/short/int in plain or unsigned form.
+template <typename T>
+struct __hip_is_tex_channel_type {
+    static constexpr bool value =
+        std::is_same<T, float>::value ||
+        std::is_same<T, int>::value || std::is_same<T, unsigned int>::value ||
+        std::is_same<T, short>::value || std::is_same<T, unsigned short>::value ||
+        std::is_same<T, char>::value || std::is_same<T, unsigned char>::value;
+};
+
+// Vector form: a HIP_vector_type<T, rank> is a channel type when it has 1, 2
+// or 4 components and its element type T is itself a channel type.
+template <typename T, unsigned int rank>
+struct __hip_is_tex_channel_type<HIP_vector_type<T, rank>> {
+    static constexpr bool value =
+        ((rank == 1) || (rank == 2) || (rank == 4)) &&
+        __hip_is_tex_channel_type<T>::value;
+};
+
+// Trait: `value` is true when T is a scalar type that may be read in
+// normalized-float mode - char or short, in plain or unsigned form.
+template <typename T>
+struct __hip_is_tex_normalized_channel_type {
+    static constexpr bool value =
+        std::is_same<T, short>::value || std::is_same<T, unsigned short>::value ||
+        std::is_same<T, char>::value || std::is_same<T, unsigned char>::value;
+};
+
+// Vector form: a HIP_vector_type<T, rank> may be read in normalized-float mode
+// when it has 1, 2 or 4 components and its element type T may be.
+template <typename T, unsigned int rank>
+struct __hip_is_tex_normalized_channel_type<HIP_vector_type<T, rank>> {
+    static constexpr bool value =
+        ((rank == 1) || (rank == 2) || (rank == 4)) &&
+        __hip_is_tex_normalized_channel_type<T>::value;
+};
+
+// Maps a texel type T and a read mode to the type returned by the fetch
+// functions. This primary template is the fallback: the alias below always
+// passes `bool` as Enable, so when no specialization matches, Enable is not
+// `void` and the static_assert reports an invalid channel type.
+template <
+ typename T,
+ hipTextureReadMode readMode,
+ typename Enable = void>
+struct __hip_tex_ret
+{
+ static_assert(std::is_same<Enable, void>::value, "Invalid channel type!");
+};
+
+// Shorthand for the mapped return type. It instantiates __hip_tex_ret with
+// Enable = bool, which is what the enable_if-guarded specializations produce.
+template <
+ typename T,
+ hipTextureReadMode readMode>
+using __hip_tex_ret_t = typename __hip_tex_ret<T, readMode, bool>::type;
+
+// Element-type reads of a valid channel type return that same type.
+template <typename T>
+struct __hip_tex_ret<T, hipReadModeElementType,
+                     typename std::enable_if<__hip_is_tex_channel_type<T>::value, bool>::type> {
+    using type = T;
+};
+
+// Element-type reads of a valid vector channel type return a vector of the
+// same rank whose element type is the mapped scalar return type.
+template <typename T, unsigned int rank>
+struct __hip_tex_ret<HIP_vector_type<T, rank>, hipReadModeElementType,
+                     typename std::enable_if<
+                         __hip_is_tex_channel_type<HIP_vector_type<T, rank>>::value,
+                         bool>::type> {
+    using type = HIP_vector_type<__hip_tex_ret_t<T, hipReadModeElementType>, rank>;
+};
+
+// Normalized-float reads of a normalizable channel type return float.
+template <typename T>
+struct __hip_tex_ret<T, hipReadModeNormalizedFloat,
+                     typename std::enable_if<__hip_is_tex_normalized_channel_type<T>::value,
+                                             bool>::type> {
+    using type = float;
+};
+
+// Normalized-float reads of a normalizable vector channel type return a vector
+// of the same rank whose element type is the mapped scalar return type.
+template <typename T, unsigned int rank>
+struct __hip_tex_ret<HIP_vector_type<T, rank>, hipReadModeNormalizedFloat,
+                     typename std::enable_if<
+                         __hip_is_tex_normalized_channel_type<HIP_vector_type<T, rank>>::value,
+                         bool>::type> {
+    using type = HIP_vector_type<__hip_tex_ret_t<T, hipReadModeNormalizedFloat>, rank>;
+};
+
+// Loads the texel at integer index x of a 1D texture reference with
+// __ockl_image_load_1Db (no sampler involved) and reinterprets the loaded
+// value as the return type selected by T and readMode.
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_tex_ret_t<T, readMode>
+tex1Dfetch(texture<T, hipTextureType1D, readMode> t, int x)
+{
+    using ret_t = __hip_tex_ret_t<T, readMode>;
+    TEXTURE_PARAMETERS_INIT;
+    auto raw = __ockl_image_load_1Db(i, x);
+    return *reinterpret_cast<ret_t*>(&raw);
+}
+
+// Samples a 1D texture reference at coordinate x with __ockl_image_sample_1D
+// and reinterprets the result as the return type selected by T and readMode.
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_tex_ret_t<T, readMode>
+tex1D(texture<T, hipTextureType1D, readMode> t, float x)
+{
+    using ret_t = __hip_tex_ret_t<T, readMode>;
+    TEXTURE_PARAMETERS_INIT;
+    auto raw = __ockl_image_sample_1D(i, s, x);
+    return *reinterpret_cast<ret_t*>(&raw);
+}
+
+// Samples a 2D texture reference at (x, y) with __ockl_image_sample_2D and
+// reinterprets the result as the return type selected by T and readMode.
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_tex_ret_t<T, readMode>
+tex2D(texture<T, hipTextureType2D, readMode> t, float x, float y)
+{
+    using ret_t = __hip_tex_ret_t<T, readMode>;
+    TEXTURE_PARAMETERS_INIT;
+    auto raw = __ockl_image_sample_2D(i, s, float2(x, y).data);
+    return *reinterpret_cast<ret_t*>(&raw);
+}
+
+// Samples a layered 1D texture reference with __ockl_image_sample_1Da. The
+// coordinate and the layer index are packed together as float2(x, layer).
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_tex_ret_t<T, readMode>
+tex1DLayered(texture<T, hipTextureType1DLayered, readMode> t, float x, int layer)
+{
+    using ret_t = __hip_tex_ret_t<T, readMode>;
+    TEXTURE_PARAMETERS_INIT;
+    auto raw = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+    return *reinterpret_cast<ret_t*>(&raw);
+}
+
+// Samples a layered 2D texture reference with __ockl_image_sample_2Da. The
+// coordinates and layer index are packed as float4(x, y, layer, 0).
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_tex_ret_t<T, readMode>
+tex2DLayered(texture<T, hipTextureType2DLayered, readMode> t, float x, float y, int layer)
+{
+    using ret_t = __hip_tex_ret_t<T, readMode>;
+    TEXTURE_PARAMETERS_INIT;
+    auto raw = __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data);
+    return *reinterpret_cast<ret_t*>(&raw);
+}
+
+// Samples a 3D texture reference at (x, y, z) with __ockl_image_sample_3D;
+// the coordinate is passed as float4(x, y, z, 0).
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_tex_ret_t<T, readMode>
+tex3D(texture<T, hipTextureType3D, readMode> t, float x, float y, float z)
+{
+    using ret_t = __hip_tex_ret_t<T, readMode>;
+    TEXTURE_PARAMETERS_INIT;
+    auto raw = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);
+    return *reinterpret_cast<ret_t*>(&raw);
+}
+
+// Samples a cubemap texture reference with __ockl_image_sample_CM; the
+// coordinate is passed as float4(x, y, z, 0).
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_tex_ret_t<T, readMode>
+texCubemap(texture<T, hipTextureTypeCubemap, readMode> t, float x, float y, float z)
+{
+    using ret_t = __hip_tex_ret_t<T, readMode>;
+    TEXTURE_PARAMETERS_INIT;
+    auto raw = __ockl_image_sample_CM(i, s, float4(x, y, z, 0.0f).data);
+    return *reinterpret_cast<ret_t*>(&raw);
+}
+
+// Samples a 1D texture reference at coordinate x and explicit level-of-detail
+// `level` with __ockl_image_sample_lod_1D.
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_tex_ret_t<T, readMode>
+tex1DLod(texture<T, hipTextureType1D, readMode> t, float x, float level)
+{
+    using ret_t = __hip_tex_ret_t<T, readMode>;
+    TEXTURE_PARAMETERS_INIT;
+    auto raw = __ockl_image_sample_lod_1D(i, s, x, level);
+    return *reinterpret_cast<ret_t*>(&raw);
+}
+
+// Samples a 2D texture reference at (x, y) and explicit level-of-detail
+// `level` with __ockl_image_sample_lod_2D.
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_tex_ret_t<T, readMode>
+tex2DLod(texture<T, hipTextureType2D, readMode> t, float x, float y, float level)
+{
+    using ret_t = __hip_tex_ret_t<T, readMode>;
+    TEXTURE_PARAMETERS_INIT;
+    auto raw = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+    return *reinterpret_cast<ret_t*>(&raw);
+}
+
+// Samples a layered 1D texture reference at explicit level-of-detail `level`
+// with __ockl_image_sample_lod_1Da; coordinate and layer are packed as
+// float2(x, layer).
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_tex_ret_t<T, readMode>
+tex1DLayeredLod(texture<T, hipTextureType1DLayered, readMode> t, float x, int layer, float level)
+{
+    using ret_t = __hip_tex_ret_t<T, readMode>;
+    TEXTURE_PARAMETERS_INIT;
+    auto raw = __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level);
+    return *reinterpret_cast<ret_t*>(&raw);
+}
+
+// Samples a layered 2D texture reference at explicit level-of-detail `level`
+// with __ockl_image_sample_lod_2Da; coordinates and layer are packed as
+// float4(x, y, layer, 0).
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_tex_ret_t<T, readMode>
+tex2DLayeredLod(texture<T, hipTextureType2DLayered, readMode> t, float x, float y, int layer,
+                float level)
+{
+    using ret_t = __hip_tex_ret_t<T, readMode>;
+    TEXTURE_PARAMETERS_INIT;
+    auto raw = __ockl_image_sample_lod_2Da(i, s, float4(x, y, layer, 0.0f).data, level);
+    return *reinterpret_cast<ret_t*>(&raw);
+}
+
+// Samples a 3D texture reference at (x, y, z) and explicit level-of-detail
+// `level` with __ockl_image_sample_lod_3D.
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_tex_ret_t<T, readMode>
+tex3DLod(texture<T, hipTextureType3D, readMode> t, float x, float y, float z, float level)
+{
+    using ret_t = __hip_tex_ret_t<T, readMode>;
+    TEXTURE_PARAMETERS_INIT;
+    auto raw = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, level);
+    return *reinterpret_cast<ret_t*>(&raw);
+}
+
+// Samples a cubemap texture reference at (x, y, z) and explicit
+// level-of-detail `level` with __ockl_image_sample_lod_CM.
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_tex_ret_t<T, readMode>
+texCubemapLod(texture<T, hipTextureTypeCubemap, readMode> t, float x, float y, float z,
+              float level)
+{
+    using ret_t = __hip_tex_ret_t<T, readMode>;
+    TEXTURE_PARAMETERS_INIT;
+    auto raw = __ockl_image_sample_lod_CM(i, s, float4(x, y, z, 0.0f).data, level);
+    return *reinterpret_cast<ret_t*>(&raw);
+}
+
+// Samples a layered cubemap texture reference with __ockl_image_sample_CMa;
+// the layer index travels in the fourth component: float4(x, y, z, layer).
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_tex_ret_t<T, readMode>
+texCubemapLayered(texture<T, hipTextureTypeCubemapLayered, readMode> t, float x, float y,
+                  float z, int layer)
+{
+    using ret_t = __hip_tex_ret_t<T, readMode>;
+    TEXTURE_PARAMETERS_INIT;
+    auto raw = __ockl_image_sample_CMa(i, s, float4(x, y, z, layer).data);
+    return *reinterpret_cast<ret_t*>(&raw);
+}
+
+// Samples a layered cubemap texture reference at explicit level-of-detail
+// `level` with __ockl_image_sample_lod_CMa; the layer index travels in the
+// fourth component: float4(x, y, z, layer).
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_tex_ret_t<T, readMode>
+texCubemapLayeredLod(texture<T, hipTextureTypeCubemapLayered, readMode> t, float x, float y,
+                     float z, int layer, float level)
+{
+    using ret_t = __hip_tex_ret_t<T, readMode>;
+    TEXTURE_PARAMETERS_INIT;
+    auto raw = __ockl_image_sample_lod_CMa(i, s, float4(x, y, z, layer).data, level);
+    return *reinterpret_cast<ret_t*>(&raw);
+}
+
+// Placeholder for gradient sampling of a cubemap texture reference. The device
+// library call it would use is not available (see the TODO below), so the
+// function ignores its coordinate and gradient arguments and always returns a
+// value-initialized result. The commented-out lines record the intended
+// implementation.
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_tex_ret_t<T, readMode> texCubemapGrad(texture<T, hipTextureTypeCubemap, readMode> t, float x, float y, float z, float4 dPdx, float4 dPdy)
+{
+ TEXTURE_PARAMETERS_INIT;
+ // TODO missing in device libs.
+ // auto tmp = __ockl_image_sample_grad_CM(i, s, float4(x, y, z, 0.0f).data, float4(dPdx.x, dPdx.y, dPdx.z, 0.0f).data, float4(dPdy.x, dPdy.y, dPdy.z, 0.0f).data);
+ // return *reinterpret_cast<__hip_tex_ret_t<T, readMode>*>(&tmp);
+ return {};
+}
+
+// Placeholder for gradient sampling of a layered cubemap texture reference.
+// The device library call it would use is not available (see the TODO below),
+// so the function ignores its coordinate, layer and gradient arguments and
+// always returns a value-initialized result. The commented-out lines record
+// the intended implementation.
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_tex_ret_t<T, readMode> texCubemapLayeredGrad(texture<T, hipTextureTypeCubemapLayered, readMode> t, float x, float y, float z, int layer, float4 dPdx, float4 dPdy)
+{
+ TEXTURE_PARAMETERS_INIT;
+ // TODO missing in device libs.
+ // auto tmp = __ockl_image_sample_grad_CMa(i, s, float4(x, y, z, layer).data, float4(dPdx.x, dPdx.y, dPdx.z, 0.0f).data, float4(dPdy.x, dPdy.y, dPdy.z, 0.0f).data);
+ // return *reinterpret_cast<__hip_tex_ret_t<T, readMode>*>(&tmp);
+ return {};
+}
+
+// Samples a 1D texture reference at coordinate x with explicit gradients
+// dPdx and dPdy through __ockl_image_sample_grad_1D.
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_tex_ret_t<T, readMode>
+tex1DGrad(texture<T, hipTextureType1D, readMode> t, float x, float dPdx, float dPdy)
+{
+    using ret_t = __hip_tex_ret_t<T, readMode>;
+    TEXTURE_PARAMETERS_INIT;
+    auto raw = __ockl_image_sample_grad_1D(i, s, x, dPdx, dPdy);
+    return *reinterpret_cast<ret_t*>(&raw);
+}
+
+// Samples a 2D texture reference at (x, y) with explicit two-component
+// gradients dPdx and dPdy through __ockl_image_sample_grad_2D.
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_tex_ret_t<T, readMode>
+tex2DGrad(texture<T, hipTextureType2D, readMode> t, float x, float y, float2 dPdx, float2 dPdy)
+{
+    using ret_t = __hip_tex_ret_t<T, readMode>;
+    TEXTURE_PARAMETERS_INIT;
+    auto raw = __ockl_image_sample_grad_2D(
+        i, s,
+        float2(x, y).data,
+        float2(dPdx.x, dPdx.y).data,
+        float2(dPdy.x, dPdy.y).data);
+    return *reinterpret_cast<ret_t*>(&raw);
+}
+
+// Samples a layered 1D texture reference with explicit gradients dPdx and
+// dPdy through __ockl_image_sample_grad_1Da; coordinate and layer are packed
+// as float2(x, layer).
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_tex_ret_t<T, readMode>
+tex1DLayeredGrad(texture<T, hipTextureType1DLayered, readMode> t, float x, int layer,
+                 float dPdx, float dPdy)
+{
+    using ret_t = __hip_tex_ret_t<T, readMode>;
+    TEXTURE_PARAMETERS_INIT;
+    auto raw = __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dPdx, dPdy);
+    return *reinterpret_cast<ret_t*>(&raw);
+}
+
+// Samples a layered 2D texture reference with explicit two-component
+// gradients through __ockl_image_sample_grad_2Da; coordinates and layer are
+// packed as float4(x, y, layer, 0).
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_tex_ret_t<T, readMode>
+tex2DLayeredGrad(texture<T, hipTextureType2DLayered, readMode> t, float x, float y, int layer,
+                 float2 dPdx, float2 dPdy)
+{
+    using ret_t = __hip_tex_ret_t<T, readMode>;
+    TEXTURE_PARAMETERS_INIT;
+    auto raw = __ockl_image_sample_grad_2Da(
+        i, s,
+        float4(x, y, layer, 0.0f).data,
+        float2(dPdx.x, dPdx.y).data,
+        float2(dPdy.x, dPdy.y).data);
+    return *reinterpret_cast<ret_t*>(&raw);
+}
+
+// Samples a 3D texture reference at (x, y, z) with explicit gradients through
+// __ockl_image_sample_grad_3D. Only the x, y and z components of dPdx and
+// dPdy are forwarded; the fourth component passed on is always 0.
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_tex_ret_t<T, readMode>
+tex3DGrad(texture<T, hipTextureType3D, readMode> t, float x, float y, float z,
+          float4 dPdx, float4 dPdy)
+{
+    using ret_t = __hip_tex_ret_t<T, readMode>;
+    TEXTURE_PARAMETERS_INIT;
+    auto raw = __ockl_image_sample_grad_3D(
+        i, s,
+        float4(x, y, z, 0.0f).data,
+        float4(dPdx.x, dPdx.y, dPdx.z, 0.0f).data,
+        float4(dPdy.x, dPdy.y, dPdy.z, 0.0f).data);
+    return *reinterpret_cast<ret_t*>(&raw);
+}
+
+// Maps a texel type T and a read mode to the type returned by tex2Dgather.
+// This primary template is the fallback: the alias below always passes `bool`
+// as Enable, so when no specialization matches, Enable is not `void` and the
+// static_assert reports an invalid channel type.
+template <
+ typename T,
+ hipTextureReadMode readMode,
+ typename Enable = void>
+struct __hip_tex2dgather_ret
+{
+ static_assert(std::is_same<Enable, void>::value, "Invalid channel type!");
+};
+
+// Shorthand for the mapped gather return type. It instantiates
+// __hip_tex2dgather_ret with Enable = bool, which is what the
+// enable_if-guarded specializations produce.
+template <
+ typename T,
+ hipTextureReadMode readMode>
+using __hip_tex2dgather_ret_t = typename __hip_tex2dgather_ret<T, readMode, bool>::type;
+
+// Element-type gather of a valid channel type T returns four values of T.
+template <typename T>
+struct __hip_tex2dgather_ret<T, hipReadModeElementType,
+                             typename std::enable_if<__hip_is_tex_channel_type<T>::value,
+                                                     bool>::type> {
+    using type = HIP_vector_type<T, 4>;
+};
+
+// Element-type gather of a valid vector channel type returns four values of
+// the vector's element type T, whatever the rank of the texel type.
+template <typename T, unsigned int rank>
+struct __hip_tex2dgather_ret<HIP_vector_type<T, rank>, hipReadModeElementType,
+                             typename std::enable_if<
+                                 __hip_is_tex_channel_type<HIP_vector_type<T, rank>>::value,
+                                 bool>::type> {
+    using type = HIP_vector_type<T, 4>;
+};
+
+// Normalized-float gather of a normalizable channel type returns float4.
+template <typename T>
+struct __hip_tex2dgather_ret<T, hipReadModeNormalizedFloat,
+                             typename std::enable_if<
+                                 __hip_is_tex_normalized_channel_type<T>::value,
+                                 bool>::type> {
+    using type = float4;
+};
+
+// Gathers one component from the texels around (x, y) of a 2D texture
+// reference and reinterprets the result as the gather return type selected by
+// T and readMode.
+//
+// comp picks the device-library gather variant: 1 -> gather4g, 2 -> gather4b,
+// 3 -> gather4a; every other value, including the default 0, -> gather4r.
+// Every branch of the switch returns, so no statement follows it.
+template <typename T, hipTextureReadMode readMode>
+static __forceinline__ __device__ __hip_tex2dgather_ret_t<T, readMode> tex2Dgather(texture<T, hipTextureType2D, readMode> t, float x, float y, int comp=0)
+{
+    TEXTURE_PARAMETERS_INIT;
+    switch (comp) {
+    case 1: {
+        auto tmp = __ockl_image_gather4g_2D(i, s, float2(x, y).data);
+        return *reinterpret_cast<__hip_tex2dgather_ret_t<T, readMode>*>(&tmp);
+    }
+    case 2: {
+        auto tmp = __ockl_image_gather4b_2D(i, s, float2(x, y).data);
+        return *reinterpret_cast<__hip_tex2dgather_ret_t<T, readMode>*>(&tmp);
+    }
+    case 3: {
+        auto tmp = __ockl_image_gather4a_2D(i, s, float2(x, y).data);
+        return *reinterpret_cast<__hip_tex2dgather_ret_t<T, readMode>*>(&tmp);
+    }
+    default: {
+        auto tmp = __ockl_image_gather4r_2D(i, s, float2(x, y).data);
+        return *reinterpret_cast<__hip_tex2dgather_ret_t<T, readMode>*>(&tmp);
+    }
+    }
+}
+
+#endif
diff --git a/third_party/rocm/include/hip/hcc_detail/texture_functions.h b/third_party/rocm/include/hip/hcc_detail/texture_functions.h
new file mode 100644
index 0000000..4a84507
--- /dev/null
+++ b/third_party/rocm/include/hip/hcc_detail/texture_functions.h
@@ -0,0 +1,11102 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_TEXTURE_FUNCTIONS_H
+#define HIP_INCLUDE_HIP_HCC_DETAIL_TEXTURE_FUNCTIONS_H
+#include <hip/hcc_detail/hip_vector_types.h>
+#include <hip/hcc_detail/hip_texture_types.h>
+
+// Declares the __hip_<scalar>{2,3,4,8,16}_vector_value_type typedefs (clang
+// ext_vector_type vectors) for float, int and uint. The helper macro is local
+// to this section: any previous definition is saved with push_macro, the
+// helper is undefined after use, and the saved definition is restored with
+// pop_macro.
+#pragma push_macro("TYPEDEF_VECTOR_VALUE_TYPE")
+#define TYPEDEF_VECTOR_VALUE_TYPE(SCALAR_TYPE) \
+typedef SCALAR_TYPE __hip_##SCALAR_TYPE##2_vector_value_type __attribute__((ext_vector_type(2))); \
+typedef SCALAR_TYPE __hip_##SCALAR_TYPE##3_vector_value_type __attribute__((ext_vector_type(3))); \
+typedef SCALAR_TYPE __hip_##SCALAR_TYPE##4_vector_value_type __attribute__((ext_vector_type(4))); \
+typedef SCALAR_TYPE __hip_##SCALAR_TYPE##8_vector_value_type __attribute__((ext_vector_type(8))); \
+typedef SCALAR_TYPE __hip_##SCALAR_TYPE##16_vector_value_type __attribute__((ext_vector_type(16)));
+
+TYPEDEF_VECTOR_VALUE_TYPE(float);
+TYPEDEF_VECTOR_VALUE_TYPE(int);
+TYPEDEF_VECTOR_VALUE_TYPE(uint);
+
+#undef TYPEDEF_VECTOR_VALUE_TYPE
+#pragma pop_macro("TYPEDEF_VECTOR_VALUE_TYPE")
+
+// Four-lane texel value viewable as float, int or uint lanes. The texture
+// functions in this header store the sampled float vector in `f` and read it
+// back through `i` or `u` to reinterpret the same bits.
+union TData {
+ __hip_float4_vector_value_type f;
+ __hip_int4_vector_value_type i;
+ __hip_uint4_vector_value_type u;
+};
+
+// Declaration specifiers shared by every texture function defined below.
+#define __TEXTURE_FUNCTIONS_DECL__ static inline __device__
+
+
+// Address space used for the texture/sampler descriptor pointers: 4 when
+// building with hcc work-week 18114 or newer, or with clang; 2 otherwise.
+#if (__hcc_workweek__ >= 18114) || __clang__
+#define ADDRESS_SPACE_CONSTANT __attribute__((address_space(4)))
+#else
+#define ADDRESS_SPACE_CONSTANT __attribute__((address_space(2)))
+#endif
+
+// Prologue for the texture-object functions. It expects a parameter named
+// `textureObject` in scope and declares:
+//   i     - textureObject viewed as a pointer to constant-address-space dwords
+//   s     - i advanced by HIP_SAMPLER_OBJECT_OFFSET_DWORD dwords
+//   texel - the TData that receives the sampled value
+// The expansion already ends with ';'.
+#define TEXTURE_PARAMETERS_INIT \
+ unsigned int ADDRESS_SPACE_CONSTANT* i = (unsigned int ADDRESS_SPACE_CONSTANT*)textureObject; \
+ unsigned int ADDRESS_SPACE_CONSTANT* s = i + HIP_SAMPLER_OBJECT_OFFSET_DWORD; \
+ TData texel;
+// Same prologue for functions that take a texture reference named `texRef`;
+// the descriptor pointer comes from texRef.textureObject.
+#define TEXTURE_REF_PARAMETERS_INIT \
+ unsigned int ADDRESS_SPACE_CONSTANT* i = (unsigned int ADDRESS_SPACE_CONSTANT*)texRef.textureObject; \
+ unsigned int ADDRESS_SPACE_CONSTANT* s = i + HIP_SAMPLER_OBJECT_OFFSET_DWORD; \
+ TData texel;
+// Epilogues that copy lanes of `texel` into the output pointer `retVal`.
+// FLOAT / SIGNED / UNSIGNED choose the texel.f / texel.i / texel.u view.
+// The unsuffixed forms assign lane x to *retVal (scalar output); the _X, _XY
+// and _XYZW forms assign the named lanes to the matching members of *retVal.
+// Each expansion already ends with ';'.
+#define TEXTURE_SET_FLOAT *retVal = texel.f.x;
+
+#define TEXTURE_SET_SIGNED *retVal = texel.i.x;
+
+#define TEXTURE_SET_UNSIGNED *retVal = texel.u.x;
+
+#define TEXTURE_SET_FLOAT_X retVal->x = texel.f.x;
+
+#define TEXTURE_SET_SIGNED_X retVal->x = texel.i.x;
+
+#define TEXTURE_SET_UNSIGNED_X retVal->x = texel.u.x;
+
+#define TEXTURE_SET_FLOAT_XY \
+ retVal->x = texel.f.x; \
+ retVal->y = texel.f.y;
+
+#define TEXTURE_SET_SIGNED_XY \
+ retVal->x = texel.i.x; \
+ retVal->y = texel.i.y;
+
+#define TEXTURE_SET_UNSIGNED_XY \
+ retVal->x = texel.u.x; \
+ retVal->y = texel.u.y;
+
+#define TEXTURE_SET_FLOAT_XYZW \
+ retVal->x = texel.f.x; \
+ retVal->y = texel.f.y; \
+ retVal->z = texel.f.z; \
+ retVal->w = texel.f.w;
+
+#define TEXTURE_SET_SIGNED_XYZW \
+ retVal->x = texel.i.x; \
+ retVal->y = texel.i.y; \
+ retVal->z = texel.i.z; \
+ retVal->w = texel.i.w;
+
+#define TEXTURE_SET_UNSIGNED_XYZW \
+ retVal->x = texel.u.x; \
+ retVal->y = texel.u.y; \
+ retVal->z = texel.u.z; \
+ retVal->w = texel.u.w;
+
+// Epilogues that return lanes of `texel` by value. Signed types read the
+// texel.i view, unsigned types the texel.u view and float types the texel.f
+// view. The unsuffixed forms return lane x as a scalar; the _X, _XY and _XYZW
+// forms build the 1-, 2- and 4-component vector with the matching make_*
+// helper. Each expansion already ends with ';'.
+#define TEXTURE_RETURN_CHAR return texel.i.x;
+
+#define TEXTURE_RETURN_UCHAR return texel.u.x;
+
+#define TEXTURE_RETURN_SHORT return texel.i.x;
+
+#define TEXTURE_RETURN_USHORT return texel.u.x;
+
+#define TEXTURE_RETURN_INT return texel.i.x;
+
+#define TEXTURE_RETURN_UINT return texel.u.x;
+
+#define TEXTURE_RETURN_SIGNED return texel.i.x;
+
+#define TEXTURE_RETURN_UNSIGNED return texel.u.x;
+
+#define TEXTURE_RETURN_CHAR_X return make_char1(texel.i.x);
+
+#define TEXTURE_RETURN_UCHAR_X return make_uchar1(texel.u.x);
+
+#define TEXTURE_RETURN_SHORT_X return make_short1(texel.i.x);
+
+#define TEXTURE_RETURN_USHORT_X return make_ushort1(texel.u.x);
+
+#define TEXTURE_RETURN_INT_X return make_int1(texel.i.x);
+
+#define TEXTURE_RETURN_UINT_X return make_uint1(texel.u.x);
+
+#define TEXTURE_RETURN_CHAR_XY return make_char2(texel.i.x, texel.i.y);
+
+#define TEXTURE_RETURN_UCHAR_XY return make_uchar2(texel.u.x, texel.u.y);
+
+#define TEXTURE_RETURN_SHORT_XY return make_short2(texel.i.x, texel.i.y);
+
+#define TEXTURE_RETURN_USHORT_XY return make_ushort2(texel.u.x, texel.u.y);
+
+#define TEXTURE_RETURN_INT_XY return make_int2(texel.i.x, texel.i.y);
+
+#define TEXTURE_RETURN_UINT_XY return make_uint2(texel.u.x, texel.u.y);
+
+#define TEXTURE_RETURN_CHAR_XYZW return make_char4(texel.i.x, texel.i.y, texel.i.z, texel.i.w);
+
+#define TEXTURE_RETURN_UCHAR_XYZW return make_uchar4(texel.u.x, texel.u.y, texel.u.z, texel.u.w);
+
+#define TEXTURE_RETURN_SHORT_XYZW return make_short4(texel.i.x, texel.i.y, texel.i.z, texel.i.w);
+
+#define TEXTURE_RETURN_USHORT_XYZW return make_ushort4(texel.u.x, texel.u.y, texel.u.z, texel.u.w);
+
+#define TEXTURE_RETURN_INT_XYZW return make_int4(texel.i.x, texel.i.y, texel.i.z, texel.i.w);
+
+#define TEXTURE_RETURN_UINT_XYZW return make_uint4(texel.u.x, texel.u.y, texel.u.z, texel.u.w);
+
+#define TEXTURE_RETURN_FLOAT return texel.f.x;
+
+#define TEXTURE_RETURN_FLOAT_X return make_float1(texel.f.x);
+
+#define TEXTURE_RETURN_FLOAT_XY return make_float2(texel.f.x, texel.f.y);
+
+#define TEXTURE_RETURN_FLOAT_XYZW return make_float4(texel.f.x, texel.f.y, texel.f.z, texel.f.w);
+
+// C-linkage declarations of the __ockl_image_sample* device functions called
+// by the texture functions below; their definitions are not in this header.
+// Every function takes the image descriptor pointer `i`, the sampler
+// descriptor pointer `s` and a coordinate `c`. The "_lod_" variants add a
+// level `l`, the "_grad_" variants add gradients `dx` and `dy`. Variants whose
+// name ends in "d" return a single float; all others return a float4 vector.
+extern "C" {
+
+// Plain sampling.
+__device__
+__hip_float4_vector_value_type __ockl_image_sample_1D(
+ unsigned int ADDRESS_SPACE_CONSTANT* i, unsigned int ADDRESS_SPACE_CONSTANT* s,
+ float c);
+
+__device__
+__hip_float4_vector_value_type __ockl_image_sample_1Da(
+ unsigned int ADDRESS_SPACE_CONSTANT* i, unsigned int ADDRESS_SPACE_CONSTANT* s,
+ __hip_float2_vector_value_type c);
+
+__device__
+__hip_float4_vector_value_type __ockl_image_sample_2D(
+ unsigned int ADDRESS_SPACE_CONSTANT* i, unsigned int ADDRESS_SPACE_CONSTANT* s,
+ __hip_float2_vector_value_type c);
+
+
+__device__
+__hip_float4_vector_value_type __ockl_image_sample_2Da(
+ unsigned int ADDRESS_SPACE_CONSTANT* i, unsigned int ADDRESS_SPACE_CONSTANT* s,
+ __hip_float4_vector_value_type c);
+
+__device__
+float __ockl_image_sample_2Dad(
+ unsigned int ADDRESS_SPACE_CONSTANT* i, unsigned int ADDRESS_SPACE_CONSTANT* s,
+ __hip_float4_vector_value_type c);
+
+__device__
+float __ockl_image_sample_2Dd(
+ unsigned int ADDRESS_SPACE_CONSTANT* i, unsigned int ADDRESS_SPACE_CONSTANT* s,
+ __hip_float2_vector_value_type c);
+
+__device__
+__hip_float4_vector_value_type __ockl_image_sample_3D(
+ unsigned int ADDRESS_SPACE_CONSTANT* i, unsigned int ADDRESS_SPACE_CONSTANT* s,
+ __hip_float4_vector_value_type c);
+
+// Sampling with explicit gradients.
+__device__
+__hip_float4_vector_value_type __ockl_image_sample_grad_1D(
+ unsigned int ADDRESS_SPACE_CONSTANT* i, unsigned int ADDRESS_SPACE_CONSTANT* s,
+ float c, float dx, float dy);
+
+__device__
+__hip_float4_vector_value_type __ockl_image_sample_grad_1Da(
+ unsigned int ADDRESS_SPACE_CONSTANT* i, unsigned int ADDRESS_SPACE_CONSTANT* s,
+ __hip_float2_vector_value_type c, float dx, float dy);
+
+__device__
+__hip_float4_vector_value_type __ockl_image_sample_grad_2D(
+ unsigned int ADDRESS_SPACE_CONSTANT* i, unsigned int ADDRESS_SPACE_CONSTANT* s,
+ __hip_float2_vector_value_type c, __hip_float2_vector_value_type dx, __hip_float2_vector_value_type dy);
+
+__device__
+__hip_float4_vector_value_type __ockl_image_sample_grad_2Da(
+ unsigned int ADDRESS_SPACE_CONSTANT* i, unsigned int ADDRESS_SPACE_CONSTANT* s,
+ __hip_float4_vector_value_type c, __hip_float2_vector_value_type dx, __hip_float2_vector_value_type dy);
+
+__device__
+float __ockl_image_sample_grad_2Dad(
+ unsigned int ADDRESS_SPACE_CONSTANT* i, unsigned int ADDRESS_SPACE_CONSTANT* s,
+ __hip_float4_vector_value_type c, __hip_float2_vector_value_type dx, __hip_float2_vector_value_type dy);
+
+__device__
+float __ockl_image_sample_grad_2Dd(
+ unsigned int ADDRESS_SPACE_CONSTANT* i, unsigned int ADDRESS_SPACE_CONSTANT* s,
+ __hip_float2_vector_value_type c, __hip_float2_vector_value_type dx, __hip_float2_vector_value_type dy);
+
+__device__
+__hip_float4_vector_value_type __ockl_image_sample_grad_3D(
+ unsigned int ADDRESS_SPACE_CONSTANT* i, unsigned int ADDRESS_SPACE_CONSTANT* s,
+ __hip_float4_vector_value_type c, __hip_float4_vector_value_type dx, __hip_float4_vector_value_type dy);
+
+// Sampling at an explicit level.
+__device__
+__hip_float4_vector_value_type __ockl_image_sample_lod_1D(
+ unsigned int ADDRESS_SPACE_CONSTANT* i, unsigned int ADDRESS_SPACE_CONSTANT* s,
+ float c, float l);
+
+__device__
+__hip_float4_vector_value_type __ockl_image_sample_lod_1Da(
+ unsigned int ADDRESS_SPACE_CONSTANT* i, unsigned int ADDRESS_SPACE_CONSTANT* s,
+ __hip_float2_vector_value_type c, float l);
+
+__device__
+__hip_float4_vector_value_type __ockl_image_sample_lod_2D(
+ unsigned int ADDRESS_SPACE_CONSTANT* i, unsigned int ADDRESS_SPACE_CONSTANT* s,
+ __hip_float2_vector_value_type c, float l);
+
+__device__
+__hip_float4_vector_value_type __ockl_image_sample_lod_2Da(
+ unsigned int ADDRESS_SPACE_CONSTANT* i, unsigned int ADDRESS_SPACE_CONSTANT* s,
+ __hip_float4_vector_value_type c, float l);
+
+__device__
+float __ockl_image_sample_lod_2Dad(
+ unsigned int ADDRESS_SPACE_CONSTANT* i, unsigned int ADDRESS_SPACE_CONSTANT* s,
+ __hip_float4_vector_value_type c, float l);
+
+__device__
+float __ockl_image_sample_lod_2Dd(
+ unsigned int ADDRESS_SPACE_CONSTANT* i, unsigned int ADDRESS_SPACE_CONSTANT* s,
+ __hip_float2_vector_value_type c, float l);
+
+__device__
+__hip_float4_vector_value_type __ockl_image_sample_lod_3D(
+ unsigned int ADDRESS_SPACE_CONSTANT* i, unsigned int ADDRESS_SPACE_CONSTANT* s,
+ __hip_float4_vector_value_type c, float l);
+}
+
+////////////////////////////////////////////////////////////
+// Texture object APIs
+////////////////////////////////////////////////////////////
+
+// 1D texture-object fetch into a char: samples at index x and stores lane x
+// of the result, viewed as signed integers, in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex1Dfetch(
+    char* retVal, hipTextureObject_t textureObject, int x)
+{
+    TEXTURE_PARAMETERS_INIT;
+    // The sampler takes a float coordinate; make the int-to-float conversion explicit.
+    texel.f = __ockl_image_sample_1D(i, s, static_cast<float>(x));
+    TEXTURE_SET_SIGNED;
+}
+
+// 1D texture-object fetch into a char1: samples at index x and stores lane x
+// of the result, viewed as signed integers, in retVal->x.
+__TEXTURE_FUNCTIONS_DECL__ void tex1Dfetch(
+    char1* retVal, hipTextureObject_t textureObject, int x)
+{
+    TEXTURE_PARAMETERS_INIT;
+    // The sampler takes a float coordinate; make the int-to-float conversion explicit.
+    texel.f = __ockl_image_sample_1D(i, s, static_cast<float>(x));
+    TEXTURE_SET_SIGNED_X;
+}
+
+// 1D texture-object fetch into a char2: samples at index x and stores lanes
+// x and y of the result, viewed as signed integers, in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex1Dfetch(
+    char2* retVal, hipTextureObject_t textureObject, int x)
+{
+    TEXTURE_PARAMETERS_INIT;
+    // The sampler takes a float coordinate; make the int-to-float conversion explicit.
+    texel.f = __ockl_image_sample_1D(i, s, static_cast<float>(x));
+    TEXTURE_SET_SIGNED_XY;
+}
+
+// 1D texture-object fetch into a char4: samples at index x and stores all
+// four lanes of the result, viewed as signed integers, in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex1Dfetch(
+    char4* retVal, hipTextureObject_t textureObject, int x)
+{
+    TEXTURE_PARAMETERS_INIT;
+    // The sampler takes a float coordinate; make the int-to-float conversion explicit.
+    texel.f = __ockl_image_sample_1D(i, s, static_cast<float>(x));
+    TEXTURE_SET_SIGNED_XYZW;
+}
+
+// 1D texture-object fetch into an unsigned char: samples at index x and
+// stores lane x of the result, viewed as unsigned integers, in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex1Dfetch(
+    unsigned char* retVal, hipTextureObject_t textureObject, int x)
+{
+    TEXTURE_PARAMETERS_INIT;
+    // The sampler takes a float coordinate; make the int-to-float conversion explicit.
+    texel.f = __ockl_image_sample_1D(i, s, static_cast<float>(x));
+    TEXTURE_SET_UNSIGNED;
+}
+
+// 1D texture-object fetch into a uchar1: samples at index x and stores lane x
+// of the result, viewed as unsigned integers, in retVal->x.
+__TEXTURE_FUNCTIONS_DECL__ void tex1Dfetch(
+    uchar1* retVal, hipTextureObject_t textureObject, int x)
+{
+    TEXTURE_PARAMETERS_INIT;
+    // The sampler takes a float coordinate; make the int-to-float conversion explicit.
+    texel.f = __ockl_image_sample_1D(i, s, static_cast<float>(x));
+    TEXTURE_SET_UNSIGNED_X;
+}
+
+// 1D texture-object fetch into a uchar2: samples at index x and stores lanes
+// x and y of the result, viewed as unsigned integers, in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex1Dfetch(
+    uchar2* retVal, hipTextureObject_t textureObject, int x)
+{
+    TEXTURE_PARAMETERS_INIT;
+    // The sampler takes a float coordinate; make the int-to-float conversion explicit.
+    texel.f = __ockl_image_sample_1D(i, s, static_cast<float>(x));
+    TEXTURE_SET_UNSIGNED_XY;
+}
+
+// 1D texture-object fetch into a uchar4: samples at index x and stores all
+// four lanes of the result, viewed as unsigned integers, in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex1Dfetch(
+    uchar4* retVal, hipTextureObject_t textureObject, int x)
+{
+    TEXTURE_PARAMETERS_INIT;
+    // The sampler takes a float coordinate; make the int-to-float conversion explicit.
+    texel.f = __ockl_image_sample_1D(i, s, static_cast<float>(x));
+    TEXTURE_SET_UNSIGNED_XYZW;
+}
+
+// 1D texture-object fetch into a short: samples at index x and stores lane x
+// of the result, viewed as signed integers, in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex1Dfetch(
+    short* retVal, hipTextureObject_t textureObject, int x)
+{
+    TEXTURE_PARAMETERS_INIT;
+    // The sampler takes a float coordinate; make the int-to-float conversion explicit.
+    texel.f = __ockl_image_sample_1D(i, s, static_cast<float>(x));
+    TEXTURE_SET_SIGNED;
+}
+
+// 1D texture-object fetch into a short1: samples at index x and stores lane x
+// of the result, viewed as signed integers, in retVal->x.
+__TEXTURE_FUNCTIONS_DECL__ void tex1Dfetch(
+    short1* retVal, hipTextureObject_t textureObject, int x)
+{
+    TEXTURE_PARAMETERS_INIT;
+    // The sampler takes a float coordinate; make the int-to-float conversion explicit.
+    texel.f = __ockl_image_sample_1D(i, s, static_cast<float>(x));
+    TEXTURE_SET_SIGNED_X;
+}
+
+// 1D texture-object fetch into a short2: samples at index x and stores lanes
+// x and y of the result, viewed as signed integers, in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex1Dfetch(
+    short2* retVal, hipTextureObject_t textureObject, int x)
+{
+    TEXTURE_PARAMETERS_INIT;
+    // The sampler takes a float coordinate; make the int-to-float conversion explicit.
+    texel.f = __ockl_image_sample_1D(i, s, static_cast<float>(x));
+    TEXTURE_SET_SIGNED_XY;
+}
+
+// 1D texture-object fetch into a short4: samples at index x and stores all
+// four lanes of the result, viewed as signed integers, in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex1Dfetch(
+    short4* retVal, hipTextureObject_t textureObject, int x)
+{
+    TEXTURE_PARAMETERS_INIT;
+    // The sampler takes a float coordinate; make the int-to-float conversion explicit.
+    texel.f = __ockl_image_sample_1D(i, s, static_cast<float>(x));
+    TEXTURE_SET_SIGNED_XYZW;
+}
+
+// 1D texture-object fetch into an unsigned short: samples at index x and
+// stores lane x of the result in *retVal.
+// The lane is read through the unsigned view of the texel, matching the other
+// unsigned overloads (unsigned char, unsigned int). The value stored is the
+// same as with the signed view, because the conversion to unsigned short
+// keeps only the low bits either way.
+__TEXTURE_FUNCTIONS_DECL__ void tex1Dfetch(unsigned short* retVal, hipTextureObject_t textureObject,
+                                           int x) {
+    TEXTURE_PARAMETERS_INIT;
+    // int x is implicitly converted to the float coordinate the sampler expects.
+    texel.f = __ockl_image_sample_1D(i, s, x);
+    TEXTURE_SET_UNSIGNED;
+}
+
+// 1D texture-object fetch into a ushort1: samples at index x and stores lane
+// x of the result, viewed as unsigned integers, in retVal->x.
+__TEXTURE_FUNCTIONS_DECL__ void tex1Dfetch(
+    ushort1* retVal, hipTextureObject_t textureObject, int x)
+{
+    TEXTURE_PARAMETERS_INIT;
+    // The sampler takes a float coordinate; make the int-to-float conversion explicit.
+    texel.f = __ockl_image_sample_1D(i, s, static_cast<float>(x));
+    TEXTURE_SET_UNSIGNED_X;
+}
+
+// 1D texture-object fetch into a ushort2: samples at index x and stores lanes
+// x and y of the result, viewed as unsigned integers, in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex1Dfetch(
+    ushort2* retVal, hipTextureObject_t textureObject, int x)
+{
+    TEXTURE_PARAMETERS_INIT;
+    // The sampler takes a float coordinate; make the int-to-float conversion explicit.
+    texel.f = __ockl_image_sample_1D(i, s, static_cast<float>(x));
+    TEXTURE_SET_UNSIGNED_XY;
+}
+
+// 1D texture-object fetch into a ushort4: samples at index x and stores all
+// four lanes of the result, viewed as unsigned integers, in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex1Dfetch(
+    ushort4* retVal, hipTextureObject_t textureObject, int x)
+{
+    TEXTURE_PARAMETERS_INIT;
+    // The sampler takes a float coordinate; make the int-to-float conversion explicit.
+    texel.f = __ockl_image_sample_1D(i, s, static_cast<float>(x));
+    TEXTURE_SET_UNSIGNED_XYZW;
+}
+
+// 1D texture-object fetch into an int: samples at index x and stores lane x
+// of the result, viewed as signed integers, in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex1Dfetch(
+    int* retVal, hipTextureObject_t textureObject, int x)
+{
+    TEXTURE_PARAMETERS_INIT;
+    // The sampler takes a float coordinate; make the int-to-float conversion explicit.
+    texel.f = __ockl_image_sample_1D(i, s, static_cast<float>(x));
+    TEXTURE_SET_SIGNED;
+}
+
+// 1D texture-object fetch into an int1: samples at index x and stores lane x
+// of the result, viewed as signed integers, in retVal->x.
+__TEXTURE_FUNCTIONS_DECL__ void tex1Dfetch(
+    int1* retVal, hipTextureObject_t textureObject, int x)
+{
+    TEXTURE_PARAMETERS_INIT;
+    // The sampler takes a float coordinate; make the int-to-float conversion explicit.
+    texel.f = __ockl_image_sample_1D(i, s, static_cast<float>(x));
+    TEXTURE_SET_SIGNED_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1Dfetch(int2* retVal, hipTextureObject_t textureObject, int x) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_SET_SIGNED_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1Dfetch(int4* retVal, hipTextureObject_t textureObject, int x) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_SET_SIGNED_XYZW;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1Dfetch(unsigned int* retVal, hipTextureObject_t textureObject,
+ int x) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_SET_UNSIGNED;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1Dfetch(uint1* retVal, hipTextureObject_t textureObject, int x) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_SET_UNSIGNED_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1Dfetch(uint2* retVal, hipTextureObject_t textureObject, int x) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_SET_UNSIGNED_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1Dfetch(uint4* retVal, hipTextureObject_t textureObject, int x) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_SET_UNSIGNED_XYZW;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1Dfetch(float* retVal, hipTextureObject_t textureObject, int x) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_SET_FLOAT;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1Dfetch(float1* retVal, hipTextureObject_t textureObject,
+ int x) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_SET_FLOAT_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1Dfetch(float2* retVal, hipTextureObject_t textureObject,
+ int x) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_SET_FLOAT_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1Dfetch(float4* retVal, hipTextureObject_t textureObject,
+ int x) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_SET_FLOAT_XYZW;
+}
+
+// Value-returning form of tex1Dfetch: declares a local T, hands its address
+// to the pointer-based tex1Dfetch overload selected for T*, and returns the
+// filled-in value.
+template <typename T>
+__TEXTURE_FUNCTIONS_DECL__ T tex1Dfetch(hipTextureObject_t textureObject, int x) {
+ T fetched;
+ T* const out = &fetched;
+ tex1Dfetch(out, textureObject, x);
+ return fetched;
+}
+
+////////////////////////////////////////////////////////////
+// tex1D, scalar char overload: samples a 1D image with
+// __ockl_image_sample_1D at the float coordinate x, places the sampled texel
+// in texel.f, and stores the result through retVal via TEXTURE_SET_SIGNED.
+// The sibling tex1D overloads differ only in the pointee type of retVal and
+// in which TEXTURE_SET_* macro performs the store.
+// NOTE(review): TEXTURE_PARAMETERS_INIT and TEXTURE_SET_SIGNED are defined
+// outside this chunk; presumably the former derives `i`, `s` and `texel`
+// from textureObject -- confirm against the macro definitions.
+__TEXTURE_FUNCTIONS_DECL__ void tex1D(char* retVal, hipTextureObject_t textureObject, float x) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_SET_SIGNED;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1D(char1* retVal, hipTextureObject_t textureObject, float x) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_SET_SIGNED_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1D(char2* retVal, hipTextureObject_t textureObject, float x) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_SET_SIGNED_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1D(char4* retVal, hipTextureObject_t textureObject, float x) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_SET_SIGNED_XYZW;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1D(unsigned char* retVal, hipTextureObject_t textureObject,
+ float x) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_SET_UNSIGNED;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1D(uchar1* retVal, hipTextureObject_t textureObject, float x) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_SET_UNSIGNED_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1D(uchar2* retVal, hipTextureObject_t textureObject, float x) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_SET_UNSIGNED_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1D(uchar4* retVal, hipTextureObject_t textureObject, float x) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_SET_UNSIGNED_XYZW;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1D(short* retVal, hipTextureObject_t textureObject, float x) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_SET_SIGNED;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1D(short1* retVal, hipTextureObject_t textureObject, float x) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_SET_SIGNED_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1D(short2* retVal, hipTextureObject_t textureObject, float x) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_SET_SIGNED_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1D(short4* retVal, hipTextureObject_t textureObject, float x) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_SET_SIGNED_XYZW;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1D(unsigned short* retVal, hipTextureObject_t textureObject,
+ float x) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_SET_UNSIGNED;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1D(ushort1* retVal, hipTextureObject_t textureObject, float x) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_SET_UNSIGNED_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1D(ushort2* retVal, hipTextureObject_t textureObject, float x) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_SET_UNSIGNED_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1D(ushort4* retVal, hipTextureObject_t textureObject, float x) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_SET_UNSIGNED_XYZW;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1D(int* retVal, hipTextureObject_t textureObject, float x) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_SET_SIGNED;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1D(int1* retVal, hipTextureObject_t textureObject, float x) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_SET_SIGNED_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1D(int2* retVal, hipTextureObject_t textureObject, float x) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_SET_SIGNED_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1D(int4* retVal, hipTextureObject_t textureObject, float x) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_SET_SIGNED_XYZW;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1D(unsigned int* retVal, hipTextureObject_t textureObject,
+ float x) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_SET_UNSIGNED;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1D(uint1* retVal, hipTextureObject_t textureObject, float x) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_SET_UNSIGNED_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1D(uint2* retVal, hipTextureObject_t textureObject, float x) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_SET_UNSIGNED_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1D(uint4* retVal, hipTextureObject_t textureObject, float x) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_SET_UNSIGNED_XYZW;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1D(float* retVal, hipTextureObject_t textureObject, float x) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_SET_FLOAT;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1D(float1* retVal, hipTextureObject_t textureObject, float x) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_SET_FLOAT_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1D(float2* retVal, hipTextureObject_t textureObject, float x) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_SET_FLOAT_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1D(float4* retVal, hipTextureObject_t textureObject, float x) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_SET_FLOAT_XYZW;
+}
+// Value-returning form of tex1D: declares a local T, hands its address to
+// the pointer-based tex1D overload selected for T*, and returns the
+// filled-in value.
+template <typename T>
+__TEXTURE_FUNCTIONS_DECL__ T tex1D(hipTextureObject_t textureObject, float x) {
+ T sampled;
+ T* const out = &sampled;
+ tex1D(out, textureObject, x);
+ return sampled;
+}
+
+////////////////////////////////////////////////////////////
+// tex1DLod, scalar char overload: samples a 1D image with
+// __ockl_image_sample_lod_1D at the float coordinate x, forwarding `level`
+// as the last argument, and stores the result through retVal via
+// TEXTURE_SET_SIGNED.
+// The sibling tex1DLod overloads differ only in the pointee type of retVal
+// and in which TEXTURE_SET_* macro performs the store.
+// NOTE(review): TEXTURE_PARAMETERS_INIT and TEXTURE_SET_SIGNED are defined
+// outside this chunk; presumably the former derives `i`, `s` and `texel`
+// from textureObject -- confirm against the macro definitions.
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLod(char* retVal, hipTextureObject_t textureObject, float x,
+ float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+ TEXTURE_SET_SIGNED;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLod(char1* retVal, hipTextureObject_t textureObject, float x,
+ float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+ TEXTURE_SET_SIGNED_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLod(char2* retVal, hipTextureObject_t textureObject, float x,
+ float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+ TEXTURE_SET_SIGNED_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLod(char4* retVal, hipTextureObject_t textureObject, float x,
+ float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+ TEXTURE_SET_SIGNED_XYZW;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLod(unsigned char* retVal, hipTextureObject_t textureObject,
+ float x, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+ TEXTURE_SET_UNSIGNED;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLod(uchar1* retVal, hipTextureObject_t textureObject, float x,
+ float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+ TEXTURE_SET_UNSIGNED_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLod(uchar2* retVal, hipTextureObject_t textureObject, float x,
+ float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+ TEXTURE_SET_UNSIGNED_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLod(uchar4* retVal, hipTextureObject_t textureObject, float x,
+ float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+ TEXTURE_SET_UNSIGNED_XYZW;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLod(short* retVal, hipTextureObject_t textureObject, float x,
+ float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+ TEXTURE_SET_SIGNED;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLod(short1* retVal, hipTextureObject_t textureObject, float x,
+ float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+ TEXTURE_SET_SIGNED_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLod(short2* retVal, hipTextureObject_t textureObject, float x,
+ float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+ TEXTURE_SET_SIGNED_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLod(short4* retVal, hipTextureObject_t textureObject, float x,
+ float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+ TEXTURE_SET_SIGNED_XYZW;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLod(unsigned short* retVal, hipTextureObject_t textureObject,
+ float x, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+ TEXTURE_SET_UNSIGNED;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLod(ushort1* retVal, hipTextureObject_t textureObject, float x,
+ float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+ TEXTURE_SET_UNSIGNED_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLod(ushort2* retVal, hipTextureObject_t textureObject, float x,
+ float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+ TEXTURE_SET_UNSIGNED_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLod(ushort4* retVal, hipTextureObject_t textureObject, float x,
+ float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+ TEXTURE_SET_UNSIGNED_XYZW;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLod(int* retVal, hipTextureObject_t textureObject, float x,
+ float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+ TEXTURE_SET_SIGNED;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLod(int1* retVal, hipTextureObject_t textureObject, float x,
+ float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+ TEXTURE_SET_SIGNED_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLod(int2* retVal, hipTextureObject_t textureObject, float x,
+ float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+ TEXTURE_SET_SIGNED_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLod(int4* retVal, hipTextureObject_t textureObject, float x,
+ float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+ TEXTURE_SET_SIGNED_XYZW;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLod(unsigned int* retVal, hipTextureObject_t textureObject,
+ float x, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+ TEXTURE_SET_UNSIGNED;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLod(uint1* retVal, hipTextureObject_t textureObject, float x,
+ float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+ TEXTURE_SET_UNSIGNED_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLod(uint2* retVal, hipTextureObject_t textureObject, float x,
+ float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+ TEXTURE_SET_UNSIGNED_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLod(uint4* retVal, hipTextureObject_t textureObject, float x,
+ float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+ TEXTURE_SET_UNSIGNED_XYZW;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLod(float* retVal, hipTextureObject_t textureObject, float x,
+ float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+ TEXTURE_SET_FLOAT;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLod(float1* retVal, hipTextureObject_t textureObject, float x,
+ float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+ TEXTURE_SET_FLOAT_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLod(float2* retVal, hipTextureObject_t textureObject, float x,
+ float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+ TEXTURE_SET_FLOAT_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLod(float4* retVal, hipTextureObject_t textureObject, float x,
+ float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+ TEXTURE_SET_FLOAT_XYZW;
+}
+
+// Value-returning form of tex1DLod: declares a local T, hands its address to
+// the pointer-based tex1DLod overload selected for T*, and returns the
+// filled-in value.
+template <typename T>
+__TEXTURE_FUNCTIONS_DECL__ T tex1DLod(hipTextureObject_t textureObject, float x, float level) {
+ T sampled;
+ T* const out = &sampled;
+ tex1DLod(out, textureObject, x, level);
+ return sampled;
+}
+
+////////////////////////////////////////////////////////////
+// tex1DGrad, scalar char overload: samples a 1D image with
+// __ockl_image_sample_grad_1D at the float coordinate x, forwarding dx and
+// dy unchanged, and stores the result through retVal via TEXTURE_SET_SIGNED.
+// The sibling tex1DGrad overloads differ only in the pointee type of retVal
+// and in which TEXTURE_SET_* macro performs the store.
+// NOTE(review): TEXTURE_PARAMETERS_INIT and TEXTURE_SET_SIGNED are defined
+// outside this chunk; presumably the former derives `i`, `s` and `texel`
+// from textureObject -- confirm against the macro definitions.
+__TEXTURE_FUNCTIONS_DECL__ void tex1DGrad(char* retVal, hipTextureObject_t textureObject, float x,
+ float dx, float dy) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);
+ TEXTURE_SET_SIGNED;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DGrad(char1* retVal, hipTextureObject_t textureObject, float x,
+ float dx, float dy) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);
+ TEXTURE_SET_SIGNED_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DGrad(char2* retVal, hipTextureObject_t textureObject, float x,
+ float dx, float dy) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);
+ TEXTURE_SET_SIGNED_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DGrad(char4* retVal, hipTextureObject_t textureObject, float x,
+ float dx, float dy) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);
+ TEXTURE_SET_SIGNED_XYZW;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DGrad(unsigned char* retVal, hipTextureObject_t textureObject,
+ float x, float dx, float dy) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);
+ TEXTURE_SET_UNSIGNED;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DGrad(uchar1* retVal, hipTextureObject_t textureObject, float x,
+ float dx, float dy) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);
+ TEXTURE_SET_UNSIGNED_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DGrad(uchar2* retVal, hipTextureObject_t textureObject, float x,
+ float dx, float dy) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);
+ TEXTURE_SET_UNSIGNED_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DGrad(uchar4* retVal, hipTextureObject_t textureObject, float x,
+ float dx, float dy) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);
+ TEXTURE_SET_UNSIGNED_XYZW;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DGrad(short* retVal, hipTextureObject_t textureObject, float x,
+ float dx, float dy) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);
+ TEXTURE_SET_SIGNED;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DGrad(short1* retVal, hipTextureObject_t textureObject, float x,
+ float dx, float dy) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);
+ TEXTURE_SET_SIGNED_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DGrad(short2* retVal, hipTextureObject_t textureObject, float x,
+ float dx, float dy) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);
+ TEXTURE_SET_SIGNED_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DGrad(short4* retVal, hipTextureObject_t textureObject, float x,
+ float dx, float dy) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);
+ TEXTURE_SET_SIGNED_XYZW;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DGrad(unsigned short* retVal, hipTextureObject_t textureObject,
+ float x, float dx, float dy) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);
+ TEXTURE_SET_UNSIGNED;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DGrad(ushort1* retVal, hipTextureObject_t textureObject,
+ float x, float dx, float dy) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);
+ TEXTURE_SET_UNSIGNED_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DGrad(ushort2* retVal, hipTextureObject_t textureObject,
+ float x, float dx, float dy) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);
+ TEXTURE_SET_UNSIGNED_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DGrad(ushort4* retVal, hipTextureObject_t textureObject,
+ float x, float dx, float dy) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);
+ TEXTURE_SET_UNSIGNED_XYZW;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DGrad(int* retVal, hipTextureObject_t textureObject, float x,
+ float dx, float dy) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);
+ TEXTURE_SET_SIGNED;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DGrad(int1* retVal, hipTextureObject_t textureObject, float x,
+ float dx, float dy) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);
+ TEXTURE_SET_SIGNED_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DGrad(int2* retVal, hipTextureObject_t textureObject, float x,
+ float dx, float dy) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);
+ TEXTURE_SET_SIGNED_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DGrad(int4* retVal, hipTextureObject_t textureObject, float x,
+ float dx, float dy) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);
+ TEXTURE_SET_SIGNED_XYZW;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DGrad(unsigned int* retVal, hipTextureObject_t textureObject,
+ float x, float dx, float dy) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);
+ TEXTURE_SET_UNSIGNED;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DGrad(uint1* retVal, hipTextureObject_t textureObject, float x,
+ float dx, float dy) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);
+ TEXTURE_SET_UNSIGNED_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DGrad(uint2* retVal, hipTextureObject_t textureObject, float x,
+ float dx, float dy) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);
+ TEXTURE_SET_UNSIGNED_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DGrad(uint4* retVal, hipTextureObject_t textureObject, float x,
+ float dx, float dy) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);
+ TEXTURE_SET_UNSIGNED_XYZW;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DGrad(float* retVal, hipTextureObject_t textureObject, float x,
+ float dx, float dy) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);
+ TEXTURE_SET_FLOAT;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DGrad(float1* retVal, hipTextureObject_t textureObject, float x,
+ float dx, float dy) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);
+ TEXTURE_SET_FLOAT_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DGrad(float2* retVal, hipTextureObject_t textureObject, float x,
+ float dx, float dy) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);
+ TEXTURE_SET_FLOAT_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DGrad(float4* retVal, hipTextureObject_t textureObject, float x,
+ float dx, float dy) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);
+ TEXTURE_SET_FLOAT_XYZW;
+}
+
+// Value-returning form of tex1DGrad: declares a local T, hands its address
+// to the pointer-based tex1DGrad overload selected for T*, and returns the
+// filled-in value.
+// This must forward to tex1DGrad. The previous body called tex1DLod with
+// five arguments, but the tex1DLod overloads take
+// (retVal, textureObject, x, level), so the call could not resolve when the
+// template was instantiated and dx/dy never reached the gradient sampler.
+template <class T>
+__TEXTURE_FUNCTIONS_DECL__ T tex1DGrad(hipTextureObject_t textureObject, float x, float dx,
+ float dy) {
+ T ret;
+ tex1DGrad(&ret, textureObject, x, dx, dy);
+ return ret;
+}
+
+////////////////////////////////////////////////////////////
+// tex2D, scalar char overload: packs (x, y) into a float2 and passes its
+// .data member to __ockl_image_sample_2D, then stores the result through
+// retVal via TEXTURE_SET_SIGNED.
+// The sibling tex2D overloads differ only in the pointee type of retVal and
+// in which TEXTURE_SET_* macro performs the store.
+// NOTE(review): TEXTURE_PARAMETERS_INIT and TEXTURE_SET_SIGNED are defined
+// outside this chunk; presumably the former derives `i`, `s` and `texel`
+// from textureObject -- confirm against the macro definitions.
+__TEXTURE_FUNCTIONS_DECL__ void tex2D(char* retVal, hipTextureObject_t textureObject, float x,
+ float y) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_SET_SIGNED;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2D(char1* retVal, hipTextureObject_t textureObject, float x,
+ float y) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_SET_SIGNED_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2D(char2* retVal, hipTextureObject_t textureObject, float x,
+ float y) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_SET_SIGNED_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2D(char4* retVal, hipTextureObject_t textureObject, float x,
+ float y) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_SET_SIGNED_XYZW;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2D(unsigned char* retVal, hipTextureObject_t textureObject,
+ float x, float y) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_SET_UNSIGNED;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2D(uchar1* retVal, hipTextureObject_t textureObject, float x,
+ float y) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_SET_UNSIGNED_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2D(uchar2* retVal, hipTextureObject_t textureObject, float x,
+ float y) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_SET_UNSIGNED_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2D(uchar4* retVal, hipTextureObject_t textureObject, float x,
+ float y) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_SET_UNSIGNED_XYZW;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2D(short* retVal, hipTextureObject_t textureObject, float x,
+ float y) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_SET_SIGNED;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2D(short1* retVal, hipTextureObject_t textureObject, float x,
+ float y) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_SET_SIGNED_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2D(short2* retVal, hipTextureObject_t textureObject, float x,
+ float y) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_SET_SIGNED_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2D(short4* retVal, hipTextureObject_t textureObject, float x,
+ float y) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_SET_SIGNED_XYZW;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2D(unsigned short* retVal, hipTextureObject_t textureObject,
+ float x, float y) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_SET_UNSIGNED;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2D(ushort1* retVal, hipTextureObject_t textureObject, float x,
+ float y) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_SET_UNSIGNED_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2D(ushort2* retVal, hipTextureObject_t textureObject, float x,
+ float y) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_SET_UNSIGNED_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2D(ushort4* retVal, hipTextureObject_t textureObject, float x,
+ float y) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_SET_UNSIGNED_XYZW;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2D(int* retVal, hipTextureObject_t textureObject, float x,
+ float y) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_SET_SIGNED;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2D(int1* retVal, hipTextureObject_t textureObject, float x,
+ float y) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_SET_SIGNED_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2D(int2* retVal, hipTextureObject_t textureObject, float x,
+ float y) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_SET_SIGNED_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2D(int4* retVal, hipTextureObject_t textureObject, float x,
+ float y) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_SET_SIGNED_XYZW;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2D(unsigned int* retVal, hipTextureObject_t textureObject,
+ float x, float y) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_SET_UNSIGNED;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2D(uint1* retVal, hipTextureObject_t textureObject, float x,
+ float y) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_SET_UNSIGNED_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2D(uint2* retVal, hipTextureObject_t textureObject, float x,
+ float y) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_SET_UNSIGNED_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2D(uint4* retVal, hipTextureObject_t textureObject, float x,
+ float y) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_SET_UNSIGNED_XYZW;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2D(float* retVal, hipTextureObject_t textureObject, float x,
+ float y) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_SET_FLOAT;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2D(float1* retVal, hipTextureObject_t textureObject, float x,
+ float y) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_SET_FLOAT_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2D(float2* retVal, hipTextureObject_t textureObject, float x,
+ float y) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_SET_FLOAT_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2D(float4* retVal, hipTextureObject_t textureObject, float x,
+ float y) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_SET_FLOAT_XYZW;
+}
+
+// Value-returning form of tex2D: declares a local T, hands its address to
+// the pointer-based tex2D overload selected for T*, and returns the
+// filled-in value.
+template <typename T>
+__TEXTURE_FUNCTIONS_DECL__ T tex2D(hipTextureObject_t textureObject, float x, float y) {
+ T sampled;
+ T* const out = &sampled;
+ tex2D(out, textureObject, x, y);
+ return sampled;
+}
+
+////////////////////////////////////////////////////////////
+// tex2DLod, scalar char overload: packs (x, y) into a float2 and passes its
+// .data member together with `level` to __ockl_image_sample_lod_2D, then
+// stores the result through retVal via TEXTURE_SET_SIGNED.
+// The sibling tex2DLod overloads differ only in the pointee type of retVal
+// and in which TEXTURE_SET_* macro performs the store.
+// NOTE(review): TEXTURE_PARAMETERS_INIT and TEXTURE_SET_SIGNED are defined
+// outside this chunk; presumably the former derives `i`, `s` and `texel`
+// from textureObject -- confirm against the macro definitions.
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLod(char* retVal, hipTextureObject_t textureObject, float x,
+ float y, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_SET_SIGNED;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLod(char1* retVal, hipTextureObject_t textureObject, float x,
+ float y, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_SET_SIGNED_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLod(char2* retVal, hipTextureObject_t textureObject, float x,
+ float y, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_SET_SIGNED_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLod(char4* retVal, hipTextureObject_t textureObject, float x,
+ float y, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_SET_SIGNED_XYZW;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLod(unsigned char* retVal, hipTextureObject_t textureObject,
+ float x, float y, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_SET_UNSIGNED;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLod(uchar1* retVal, hipTextureObject_t textureObject, float x,
+ float y, float level) {
+ TEXTURE_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel from textureObject -- TODO confirm
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);  // 2D sample at (x, y) with the caller-supplied LOD `level`
+ TEXTURE_SET_UNSIGNED_X;  // macro defined outside this chunk; presumably writes texel to *retVal -- TODO confirm
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLod(uchar2* retVal, hipTextureObject_t textureObject, float x,
+ float y, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_SET_UNSIGNED_XY;  // store macro paired with the uchar2* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLod(uchar4* retVal, hipTextureObject_t textureObject, float x,
+ float y, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_SET_UNSIGNED_XYZW;  // store macro paired with the uchar4* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLod(short* retVal, hipTextureObject_t textureObject, float x,
+ float y, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_SET_SIGNED;  // store macro paired with the short* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLod(short1* retVal, hipTextureObject_t textureObject, float x,
+ float y, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_SET_SIGNED_X;  // store macro paired with the short1* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLod(short2* retVal, hipTextureObject_t textureObject, float x,
+ float y, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_SET_SIGNED_XY;  // store macro paired with the short2* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLod(short4* retVal, hipTextureObject_t textureObject, float x,
+ float y, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_SET_SIGNED_XYZW;  // store macro paired with the short4* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLod(unsigned short* retVal, hipTextureObject_t textureObject,
+ float x, float y, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_SET_UNSIGNED;  // store macro paired with the unsigned short* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLod(ushort1* retVal, hipTextureObject_t textureObject, float x,
+ float y, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_SET_UNSIGNED_X;  // store macro paired with the ushort1* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLod(ushort2* retVal, hipTextureObject_t textureObject, float x,
+ float y, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_SET_UNSIGNED_XY;  // store macro paired with the ushort2* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLod(ushort4* retVal, hipTextureObject_t textureObject, float x,
+ float y, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_SET_UNSIGNED_XYZW;  // store macro paired with the ushort4* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLod(int* retVal, hipTextureObject_t textureObject, float x,
+ float y, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_SET_SIGNED;  // store macro paired with the int* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLod(int1* retVal, hipTextureObject_t textureObject, float x,
+ float y, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_SET_SIGNED_X;  // store macro paired with the int1* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLod(int2* retVal, hipTextureObject_t textureObject, float x,
+ float y, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_SET_SIGNED_XY;  // store macro paired with the int2* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLod(int4* retVal, hipTextureObject_t textureObject, float x,
+ float y, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_SET_SIGNED_XYZW;  // store macro paired with the int4* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLod(unsigned int* retVal, hipTextureObject_t textureObject,
+ float x, float y, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_SET_UNSIGNED;  // store macro paired with the unsigned int* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLod(uint1* retVal, hipTextureObject_t textureObject, float x,
+ float y, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_SET_UNSIGNED_X;  // store macro paired with the uint1* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLod(uint2* retVal, hipTextureObject_t textureObject, float x,
+ float y, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_SET_UNSIGNED_XY;  // store macro paired with the uint2* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLod(uint4* retVal, hipTextureObject_t textureObject, float x,
+ float y, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_SET_UNSIGNED_XYZW;  // store macro paired with the uint4* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLod(float* retVal, hipTextureObject_t textureObject, float x,
+ float y, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_SET_FLOAT;  // store macro paired with the float* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLod(float1* retVal, hipTextureObject_t textureObject, float x,
+ float y, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_SET_FLOAT_X;  // store macro paired with the float1* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLod(float2* retVal, hipTextureObject_t textureObject, float x,
+ float y, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_SET_FLOAT_XY;  // store macro paired with the float2* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLod(float4* retVal, hipTextureObject_t textureObject, float x,
+ float y, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_SET_FLOAT_XYZW;  // store macro paired with the float4* destination
+}
+
+template <typename T>  // value-returning form: forwards to the tex2DLod overload that takes a T*
+__TEXTURE_FUNCTIONS_DECL__ T tex2DLod(hipTextureObject_t textureObject,
+ float x, float y, float level) {
+ T sampled;  // written through its address by the call on the next line
+ tex2DLod(&sampled, textureObject, x, y, level);
+ return sampled;
+}
+
+////////////////////////////////////////////////////////////
+__TEXTURE_FUNCTIONS_DECL__ void tex3D(char* retVal, hipTextureObject_t textureObject, float x,
+ float y, float z) {
+ TEXTURE_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel from textureObject -- TODO confirm
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);  // 3D sample; coordinates go in as a float4 whose 4th component is fixed at 0.0f
+ TEXTURE_SET_SIGNED;  // macro defined outside this chunk; presumably writes texel to *retVal -- TODO confirm
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3D(char1* retVal, hipTextureObject_t textureObject, float x,
+ float y, float z) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);
+ TEXTURE_SET_SIGNED_X;  // store macro paired with the char1* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3D(char2* retVal, hipTextureObject_t textureObject, float x,
+ float y, float z) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);
+ TEXTURE_SET_SIGNED_XY;  // store macro paired with the char2* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3D(char4* retVal, hipTextureObject_t textureObject, float x,
+ float y, float z) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);
+ TEXTURE_SET_SIGNED_XYZW;  // store macro paired with the char4* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3D(unsigned char* retVal, hipTextureObject_t textureObject,
+ float x, float y, float z) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);
+ TEXTURE_SET_UNSIGNED;  // store macro paired with the unsigned char* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3D(uchar1* retVal, hipTextureObject_t textureObject, float x,
+ float y, float z) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);
+ TEXTURE_SET_UNSIGNED_X;  // store macro paired with the uchar1* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3D(uchar2* retVal, hipTextureObject_t textureObject, float x,
+ float y, float z) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);
+ TEXTURE_SET_UNSIGNED_XY;  // store macro paired with the uchar2* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3D(uchar4* retVal, hipTextureObject_t textureObject, float x,
+ float y, float z) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);
+ TEXTURE_SET_UNSIGNED_XYZW;  // store macro paired with the uchar4* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3D(short* retVal, hipTextureObject_t textureObject, float x,
+ float y, float z) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);
+ TEXTURE_SET_SIGNED;  // store macro paired with the short* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3D(short1* retVal, hipTextureObject_t textureObject, float x,
+ float y, float z) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);
+ TEXTURE_SET_SIGNED_X;  // store macro paired with the short1* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3D(short2* retVal, hipTextureObject_t textureObject, float x,
+ float y, float z) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);
+ TEXTURE_SET_SIGNED_XY;  // store macro paired with the short2* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3D(short4* retVal, hipTextureObject_t textureObject, float x,
+ float y, float z) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);
+ TEXTURE_SET_SIGNED_XYZW;  // store macro paired with the short4* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3D(unsigned short* retVal, hipTextureObject_t textureObject,
+ float x, float y, float z) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);
+ TEXTURE_SET_UNSIGNED;  // store macro paired with the unsigned short* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3D(ushort1* retVal, hipTextureObject_t textureObject, float x,
+ float y, float z) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);
+ TEXTURE_SET_UNSIGNED_X;  // store macro paired with the ushort1* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3D(ushort2* retVal, hipTextureObject_t textureObject, float x,
+ float y, float z) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);
+ TEXTURE_SET_UNSIGNED_XY;  // store macro paired with the ushort2* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3D(ushort4* retVal, hipTextureObject_t textureObject, float x,
+ float y, float z) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);
+ TEXTURE_SET_UNSIGNED_XYZW;  // store macro paired with the ushort4* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3D(int* retVal, hipTextureObject_t textureObject, float x,
+ float y, float z) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);
+ TEXTURE_SET_SIGNED;  // store macro paired with the int* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3D(int1* retVal, hipTextureObject_t textureObject, float x,
+ float y, float z) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);
+ TEXTURE_SET_SIGNED_X;  // store macro paired with the int1* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3D(int2* retVal, hipTextureObject_t textureObject, float x,
+ float y, float z) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);
+ TEXTURE_SET_SIGNED_XY;  // store macro paired with the int2* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3D(int4* retVal, hipTextureObject_t textureObject, float x,
+ float y, float z) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);
+ TEXTURE_SET_SIGNED_XYZW;  // store macro paired with the int4* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3D(unsigned int* retVal, hipTextureObject_t textureObject,
+ float x, float y, float z) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);
+ TEXTURE_SET_UNSIGNED;  // store macro paired with the unsigned int* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3D(uint1* retVal, hipTextureObject_t textureObject, float x,
+ float y, float z) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);
+ TEXTURE_SET_UNSIGNED_X;  // store macro paired with the uint1* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3D(uint2* retVal, hipTextureObject_t textureObject, float x,
+ float y, float z) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);
+ TEXTURE_SET_UNSIGNED_XY;  // store macro paired with the uint2* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3D(uint4* retVal, hipTextureObject_t textureObject, float x,
+ float y, float z) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);
+ TEXTURE_SET_UNSIGNED_XYZW;  // store macro paired with the uint4* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3D(float* retVal, hipTextureObject_t textureObject, float x,
+ float y, float z) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);
+ TEXTURE_SET_FLOAT;  // store macro paired with the float* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3D(float1* retVal, hipTextureObject_t textureObject, float x,
+ float y, float z) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);
+ TEXTURE_SET_FLOAT_X;  // store macro paired with the float1* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3D(float2* retVal, hipTextureObject_t textureObject, float x,
+ float y, float z) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);
+ TEXTURE_SET_FLOAT_XY;  // store macro paired with the float2* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3D(float4* retVal, hipTextureObject_t textureObject, float x,
+ float y, float z) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);
+ TEXTURE_SET_FLOAT_XYZW;  // store macro paired with the float4* destination
+}
+
+template <typename T>  // value-returning form: forwards to the tex3D overload that takes a T*
+__TEXTURE_FUNCTIONS_DECL__ T tex3D(hipTextureObject_t textureObject, float x, float y, float z) {
+ T sampled;  // written through its address by the call on the next line
+ tex3D(&sampled, textureObject, x, y, z);
+ return sampled;
+}
+
+////////////////////////////////////////////////////////////
+__TEXTURE_FUNCTIONS_DECL__ void tex3DLod(char* retVal, hipTextureObject_t textureObject, float x,
+ float y, float z, float level) {
+ TEXTURE_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel from textureObject -- TODO confirm
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,
+ level);  // 3D sample at the caller-supplied LOD; 4th coordinate component is fixed at 0.0f
+ TEXTURE_SET_SIGNED;  // macro defined outside this chunk; presumably writes texel to *retVal -- TODO confirm
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3DLod(char1* retVal, hipTextureObject_t textureObject, float x,
+ float y, float z, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,
+ level);
+ TEXTURE_SET_SIGNED_X;  // store macro paired with the char1* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3DLod(char2* retVal, hipTextureObject_t textureObject, float x,
+ float y, float z, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,
+ level);
+ TEXTURE_SET_SIGNED_XY;  // store macro paired with the char2* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3DLod(char4* retVal, hipTextureObject_t textureObject, float x,
+ float y, float z, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,
+ level);
+ TEXTURE_SET_SIGNED_XYZW;  // store macro paired with the char4* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3DLod(unsigned char* retVal, hipTextureObject_t textureObject,
+ float x, float y, float z, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,
+ level);
+ TEXTURE_SET_UNSIGNED;  // store macro paired with the unsigned char* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3DLod(uchar1* retVal, hipTextureObject_t textureObject, float x,
+ float y, float z, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,
+ level);
+ TEXTURE_SET_UNSIGNED_X;  // store macro paired with the uchar1* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3DLod(uchar2* retVal, hipTextureObject_t textureObject, float x,
+ float y, float z, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,
+ level);
+ TEXTURE_SET_UNSIGNED_XY;  // store macro paired with the uchar2* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3DLod(uchar4* retVal, hipTextureObject_t textureObject, float x,
+ float y, float z, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,
+ level);
+ TEXTURE_SET_UNSIGNED_XYZW;  // store macro paired with the uchar4* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3DLod(short* retVal, hipTextureObject_t textureObject, float x,
+ float y, float z, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,
+ level);
+ TEXTURE_SET_SIGNED;  // store macro paired with the short* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3DLod(short1* retVal, hipTextureObject_t textureObject, float x,
+ float y, float z, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,
+ level);
+ TEXTURE_SET_SIGNED_X;  // store macro paired with the short1* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3DLod(short2* retVal, hipTextureObject_t textureObject, float x,
+ float y, float z, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,
+ level);
+ TEXTURE_SET_SIGNED_XY;  // store macro paired with the short2* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3DLod(short4* retVal, hipTextureObject_t textureObject, float x,
+ float y, float z, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,
+ level);
+ TEXTURE_SET_SIGNED_XYZW;  // store macro paired with the short4* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3DLod(unsigned short* retVal, hipTextureObject_t textureObject,
+ float x, float y, float z, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,
+ level);
+ TEXTURE_SET_UNSIGNED;  // store macro paired with the unsigned short* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3DLod(ushort1* retVal, hipTextureObject_t textureObject, float x,
+ float y, float z, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,
+ level);
+ TEXTURE_SET_UNSIGNED_X;  // store macro paired with the ushort1* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3DLod(ushort2* retVal, hipTextureObject_t textureObject, float x,
+ float y, float z, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,
+ level);
+ TEXTURE_SET_UNSIGNED_XY;  // store macro paired with the ushort2* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3DLod(ushort4* retVal, hipTextureObject_t textureObject, float x,
+ float y, float z, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,
+ level);
+ TEXTURE_SET_UNSIGNED_XYZW;  // store macro paired with the ushort4* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3DLod(int* retVal, hipTextureObject_t textureObject, float x,
+ float y, float z, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,
+ level);
+ TEXTURE_SET_SIGNED;  // store macro paired with the int* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3DLod(int1* retVal, hipTextureObject_t textureObject, float x,
+ float y, float z, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,
+ level);
+ TEXTURE_SET_SIGNED_X;  // store macro paired with the int1* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3DLod(int2* retVal, hipTextureObject_t textureObject, float x,
+ float y, float z, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,
+ level);
+ TEXTURE_SET_SIGNED_XY;  // store macro paired with the int2* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3DLod(int4* retVal, hipTextureObject_t textureObject, float x,
+ float y, float z, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,
+ level);
+ TEXTURE_SET_SIGNED_XYZW;  // store macro paired with the int4* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3DLod(unsigned int* retVal, hipTextureObject_t textureObject,
+ float x, float y, float z, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,
+ level);
+ TEXTURE_SET_UNSIGNED;  // store macro paired with the unsigned int* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3DLod(uint1* retVal, hipTextureObject_t textureObject, float x,
+ float y, float z, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,
+ level);
+ TEXTURE_SET_UNSIGNED_X;  // store macro paired with the uint1* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3DLod(uint2* retVal, hipTextureObject_t textureObject, float x,
+ float y, float z, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,
+ level);
+ TEXTURE_SET_UNSIGNED_XY;  // store macro paired with the uint2* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3DLod(uint4* retVal, hipTextureObject_t textureObject, float x,
+ float y, float z, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,
+ level);
+ TEXTURE_SET_UNSIGNED_XYZW;  // store macro paired with the uint4* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3DLod(float* retVal, hipTextureObject_t textureObject, float x,
+ float y, float z, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,
+ level);
+ TEXTURE_SET_FLOAT;  // store macro paired with the float* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3DLod(float1* retVal, hipTextureObject_t textureObject, float x,
+ float y, float z, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,
+ level);
+ TEXTURE_SET_FLOAT_X;  // store macro paired with the float1* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3DLod(float2* retVal, hipTextureObject_t textureObject, float x,
+ float y, float z, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,
+ level);
+ TEXTURE_SET_FLOAT_XY;  // store macro paired with the float2* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex3DLod(float4* retVal, hipTextureObject_t textureObject, float x,
+ float y, float z, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,
+ level);
+ TEXTURE_SET_FLOAT_XYZW;  // store macro paired with the float4* destination
+}
+
+template <typename T>  // value-returning form: forwards to the tex3DLod overload that takes a T*
+__TEXTURE_FUNCTIONS_DECL__ T tex3DLod(hipTextureObject_t textureObject,
+ float x, float y, float z, float level) {
+ T sampled;  // written through its address by the call on the next line
+ tex3DLod(&sampled, textureObject, x, y, z, level);
+ return sampled;
+}
+
+////////////////////////////////////////////////////////////
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayered(char* retVal, hipTextureObject_t textureObject,
+ float x, int layer) {
+ TEXTURE_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel from textureObject -- TODO confirm
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);  // x and the int `layer` are packed into a single float2 argument
+ TEXTURE_SET_SIGNED;  // macro defined outside this chunk; presumably writes texel to *retVal -- TODO confirm
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayered(char1* retVal, hipTextureObject_t textureObject,
+ float x, int layer) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+ TEXTURE_SET_SIGNED_X;  // store macro paired with the char1* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayered(char2* retVal, hipTextureObject_t textureObject,
+ float x, int layer) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+ TEXTURE_SET_SIGNED_XY;  // store macro paired with the char2* destination
+}
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayered(char4* retVal, hipTextureObject_t textureObject,
+ float x, int layer) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+ TEXTURE_SET_SIGNED_XYZW;  // store macro paired with the char4* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayered(unsigned char* retVal,
+ hipTextureObject_t textureObject, float x, int layer) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+ TEXTURE_SET_UNSIGNED;  // store macro paired with the unsigned char* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayered(uchar1* retVal, hipTextureObject_t textureObject,
+ float x, int layer) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+ TEXTURE_SET_UNSIGNED_X;  // store macro paired with the uchar1* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayered(uchar2* retVal, hipTextureObject_t textureObject,
+ float x, int layer) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+ TEXTURE_SET_UNSIGNED_XY;  // store macro paired with the uchar2* destination
+}
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayered(uchar4* retVal, hipTextureObject_t textureObject,
+ float x, int layer) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+ TEXTURE_SET_UNSIGNED_XYZW;  // store macro paired with the uchar4* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayered(short* retVal, hipTextureObject_t textureObject,
+ float x, int layer) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+ TEXTURE_SET_SIGNED;  // store macro paired with the short* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayered(short1* retVal, hipTextureObject_t textureObject,
+ float x, int layer) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+ TEXTURE_SET_SIGNED_X;  // store macro paired with the short1* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayered(short2* retVal, hipTextureObject_t textureObject,
+ float x, int layer) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+ TEXTURE_SET_SIGNED_XY;  // store macro paired with the short2* destination
+}
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayered(short4* retVal, hipTextureObject_t textureObject,
+ float x, int layer) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+ TEXTURE_SET_SIGNED_XYZW;  // store macro paired with the short4* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayered(unsigned short* retVal,
+ hipTextureObject_t textureObject, float x, int layer) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+ TEXTURE_SET_UNSIGNED;  // store macro paired with the unsigned short* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayered(ushort1* retVal, hipTextureObject_t textureObject,
+ float x, int layer) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+ TEXTURE_SET_UNSIGNED_X;  // store macro paired with the ushort1* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayered(ushort2* retVal, hipTextureObject_t textureObject,
+ float x, int layer) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+ TEXTURE_SET_UNSIGNED_XY;  // store macro paired with the ushort2* destination
+}
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayered(ushort4* retVal, hipTextureObject_t textureObject,
+ float x, int layer) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+ TEXTURE_SET_UNSIGNED_XYZW;  // store macro paired with the ushort4* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayered(int* retVal, hipTextureObject_t textureObject, float x,
+ int layer) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+ TEXTURE_SET_SIGNED;  // store macro paired with the int* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayered(int1* retVal, hipTextureObject_t textureObject,
+ float x, int layer) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+ TEXTURE_SET_SIGNED_X;  // store macro paired with the int1* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayered(int2* retVal, hipTextureObject_t textureObject,
+ float x, int layer) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+ TEXTURE_SET_SIGNED_XY;  // store macro paired with the int2* destination
+}
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayered(int4* retVal, hipTextureObject_t textureObject,
+ float x, int layer) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+ TEXTURE_SET_SIGNED_XYZW;  // store macro paired with the int4* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayered(unsigned int* retVal, hipTextureObject_t textureObject,
+ float x, int layer) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+ TEXTURE_SET_UNSIGNED;  // store macro paired with the unsigned int* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayered(uint1* retVal, hipTextureObject_t textureObject,
+ float x, int layer) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+ TEXTURE_SET_UNSIGNED_X;  // store macro paired with the uint1* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayered(uint2* retVal, hipTextureObject_t textureObject,
+ float x, int layer) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+ TEXTURE_SET_UNSIGNED_XY;  // store macro paired with the uint2* destination
+}
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayered(uint4* retVal, hipTextureObject_t textureObject,
+ float x, int layer) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+ TEXTURE_SET_UNSIGNED_XYZW;  // store macro paired with the uint4* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayered(float* retVal, hipTextureObject_t textureObject,
+ float x, int layer) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+ TEXTURE_SET_FLOAT;  // store macro paired with the float* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayered(float1* retVal, hipTextureObject_t textureObject,
+ float x, int layer) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+ TEXTURE_SET_FLOAT_X;  // store macro paired with the float1* destination
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayered(float2* retVal, hipTextureObject_t textureObject,
+ float x, int layer) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+ TEXTURE_SET_FLOAT_XY;  // store macro paired with the float2* destination
+}
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayered(float4* retVal, hipTextureObject_t textureObject,
+ float x, int layer) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+ TEXTURE_SET_FLOAT_XYZW;  // store macro paired with the float4* destination
+}
+
+template <typename T>  // value-returning form: forwards to the tex1DLayered overload that takes a T*
+__TEXTURE_FUNCTIONS_DECL__ T tex1DLayered(hipTextureObject_t textureObject, float x, int layer) {
+ T sampled;  // written through its address by the call on the next line
+ tex1DLayered(&sampled, textureObject, x, layer);
+ return sampled;
+}
+
+////////////////////////////////////////////////////////////
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredLod(char* retVal, hipTextureObject_t textureObject,
+ float x, int layer, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f =
+ __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level);
+ TEXTURE_SET_SIGNED;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredLod(char1* retVal, hipTextureObject_t textureObject,
+ float x, int layer, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f =
+ __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level);
+ TEXTURE_SET_SIGNED_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredLod(char2* retVal, hipTextureObject_t textureObject,
+ float x, int layer, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f =
+ __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level);
+ TEXTURE_SET_SIGNED_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredLod(char4* retVal, hipTextureObject_t textureObject,
+ float x, int layer, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f =
+ __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level);
+ TEXTURE_SET_SIGNED_XYZW;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredLod(unsigned char* retVal,
+ hipTextureObject_t textureObject, float x,
+ int layer, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f =
+ __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level);
+ TEXTURE_SET_UNSIGNED;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredLod(uchar1* retVal, hipTextureObject_t textureObject,
+ float x, int layer, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f =
+ __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level);
+ TEXTURE_SET_UNSIGNED_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredLod(uchar2* retVal, hipTextureObject_t textureObject,
+ float x, int layer, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f =
+ __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level);
+ TEXTURE_SET_UNSIGNED_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredLod(uchar4* retVal, hipTextureObject_t textureObject,
+ float x, int layer, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f =
+ __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level);
+ TEXTURE_SET_UNSIGNED_XYZW;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredLod(short* retVal, hipTextureObject_t textureObject,
+ float x, int layer, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f =
+ __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level);
+ TEXTURE_SET_SIGNED;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredLod(short1* retVal, hipTextureObject_t textureObject,
+ float x, int layer, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f =
+ __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level);
+ TEXTURE_SET_SIGNED_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredLod(short2* retVal, hipTextureObject_t textureObject,
+ float x, int layer, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f =
+ __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level);
+ TEXTURE_SET_SIGNED_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredLod(short4* retVal, hipTextureObject_t textureObject,
+ float x, int layer, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f =
+ __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level);
+ TEXTURE_SET_SIGNED_XYZW;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredLod(unsigned short* retVal,
+ hipTextureObject_t textureObject, float x,
+ int layer, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f =
+ __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level);
+ TEXTURE_SET_UNSIGNED;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredLod(ushort1* retVal, hipTextureObject_t textureObject,
+ float x, int layer, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f =
+ __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level);
+ TEXTURE_SET_UNSIGNED_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredLod(ushort2* retVal, hipTextureObject_t textureObject,
+ float x, int layer, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f =
+ __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level);
+ TEXTURE_SET_UNSIGNED_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredLod(ushort4* retVal, hipTextureObject_t textureObject,
+ float x, int layer, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f =
+ __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level);
+ TEXTURE_SET_UNSIGNED_XYZW;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredLod(int* retVal, hipTextureObject_t textureObject,
+ float x, int layer, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f =
+ __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level);
+ TEXTURE_SET_SIGNED;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredLod(int1* retVal, hipTextureObject_t textureObject,
+ float x, int layer, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f =
+ __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level);
+ TEXTURE_SET_SIGNED_X;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredLod(int2* retVal, hipTextureObject_t textureObject,
+ float x, int layer, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f =
+ __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level);
+ TEXTURE_SET_SIGNED_XY;
+}
+
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredLod(int4* retVal, hipTextureObject_t textureObject,
+ float x, int layer, float level) {
+ TEXTURE_PARAMETERS_INIT;
+ texel.f =
+ __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level);
+ TEXTURE_SET_SIGNED_XYZW;
+}
+
+// tex1DLayeredLod for unsigned int: samples at (x, layer) with LOD `level` through
+// __ockl_image_sample_lod_1Da; TEXTURE_SET_UNSIGNED presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredLod(
+    unsigned int* retVal, hipTextureObject_t textureObject, float x, int layer, float level) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float2 coord(x, layer);
+    texel.f = __ockl_image_sample_lod_1Da(i, s, coord.data, level);
+    TEXTURE_SET_UNSIGNED;
+}
+
+// tex1DLayeredLod for uint1: samples at (x, layer) with LOD `level` through
+// __ockl_image_sample_lod_1Da; TEXTURE_SET_UNSIGNED_X presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredLod(
+    uint1* retVal, hipTextureObject_t textureObject, float x, int layer, float level) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float2 coord(x, layer);
+    texel.f = __ockl_image_sample_lod_1Da(i, s, coord.data, level);
+    TEXTURE_SET_UNSIGNED_X;
+}
+
+// tex1DLayeredLod for uint2: samples at (x, layer) with LOD `level` through
+// __ockl_image_sample_lod_1Da; TEXTURE_SET_UNSIGNED_XY presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredLod(
+    uint2* retVal, hipTextureObject_t textureObject, float x, int layer, float level) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float2 coord(x, layer);
+    texel.f = __ockl_image_sample_lod_1Da(i, s, coord.data, level);
+    TEXTURE_SET_UNSIGNED_XY;
+}
+
+// tex1DLayeredLod for uint4: samples at (x, layer) with LOD `level` through
+// __ockl_image_sample_lod_1Da; TEXTURE_SET_UNSIGNED_XYZW presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredLod(
+    uint4* retVal, hipTextureObject_t textureObject, float x, int layer, float level) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float2 coord(x, layer);
+    texel.f = __ockl_image_sample_lod_1Da(i, s, coord.data, level);
+    TEXTURE_SET_UNSIGNED_XYZW;
+}
+
+// tex1DLayeredLod for float: samples at (x, layer) with LOD `level` through
+// __ockl_image_sample_lod_1Da; TEXTURE_SET_FLOAT presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredLod(
+    float* retVal, hipTextureObject_t textureObject, float x, int layer, float level) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float2 coord(x, layer);
+    texel.f = __ockl_image_sample_lod_1Da(i, s, coord.data, level);
+    TEXTURE_SET_FLOAT;
+}
+
+// tex1DLayeredLod for float1: samples at (x, layer) with LOD `level` through
+// __ockl_image_sample_lod_1Da; TEXTURE_SET_FLOAT_X presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredLod(
+    float1* retVal, hipTextureObject_t textureObject, float x, int layer, float level) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float2 coord(x, layer);
+    texel.f = __ockl_image_sample_lod_1Da(i, s, coord.data, level);
+    TEXTURE_SET_FLOAT_X;
+}
+
+// tex1DLayeredLod for float2: samples at (x, layer) with LOD `level` through
+// __ockl_image_sample_lod_1Da; TEXTURE_SET_FLOAT_XY presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredLod(
+    float2* retVal, hipTextureObject_t textureObject, float x, int layer, float level) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float2 coord(x, layer);
+    texel.f = __ockl_image_sample_lod_1Da(i, s, coord.data, level);
+    TEXTURE_SET_FLOAT_XY;
+}
+
+// tex1DLayeredLod for float4: samples at (x, layer) with LOD `level` through
+// __ockl_image_sample_lod_1Da; TEXTURE_SET_FLOAT_XYZW presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredLod(
+    float4* retVal, hipTextureObject_t textureObject, float x, int layer, float level) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float2 coord(x, layer);
+    texel.f = __ockl_image_sample_lod_1Da(i, s, coord.data, level);
+    TEXTURE_SET_FLOAT_XYZW;
+}
+
+// Value-returning form of tex1DLayeredLod: forwards to the pointer-output overload selected by
+// T* and returns whatever that call left in the local.
+template <class T>
+__TEXTURE_FUNCTIONS_DECL__ T tex1DLayeredLod(
+    hipTextureObject_t textureObject, float x, int layer, float level) {
+    T result;  // default-initialized; presumably filled by the overload called next
+    tex1DLayeredLod(&result, textureObject, x, layer, level);
+    return result;
+}
+
+////////////////////////////////////////////////////////////
+// tex1DLayeredGrad for char: samples at (x, layer) with gradients dx, dy through
+// __ockl_image_sample_grad_1Da; TEXTURE_SET_SIGNED presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredGrad(
+    char* retVal, hipTextureObject_t textureObject, float x, int layer, float dx, float dy) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float2 coord(x, layer);
+    texel.f = __ockl_image_sample_grad_1Da(i, s, coord.data, dx, dy);
+    TEXTURE_SET_SIGNED;
+}
+
+// tex1DLayeredGrad for char1: samples at (x, layer) with gradients dx, dy through
+// __ockl_image_sample_grad_1Da; TEXTURE_SET_SIGNED_X presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredGrad(
+    char1* retVal, hipTextureObject_t textureObject, float x, int layer, float dx, float dy) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float2 coord(x, layer);
+    texel.f = __ockl_image_sample_grad_1Da(i, s, coord.data, dx, dy);
+    TEXTURE_SET_SIGNED_X;
+}
+
+// tex1DLayeredGrad for char2: samples at (x, layer) with gradients dx, dy through
+// __ockl_image_sample_grad_1Da; TEXTURE_SET_SIGNED_XY presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredGrad(
+    char2* retVal, hipTextureObject_t textureObject, float x, int layer, float dx, float dy) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float2 coord(x, layer);
+    texel.f = __ockl_image_sample_grad_1Da(i, s, coord.data, dx, dy);
+    TEXTURE_SET_SIGNED_XY;
+}
+
+// tex1DLayeredGrad for char4: samples at (x, layer) with gradients dx, dy through
+// __ockl_image_sample_grad_1Da; TEXTURE_SET_SIGNED_XYZW presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredGrad(
+    char4* retVal, hipTextureObject_t textureObject, float x, int layer, float dx, float dy) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float2 coord(x, layer);
+    texel.f = __ockl_image_sample_grad_1Da(i, s, coord.data, dx, dy);
+    TEXTURE_SET_SIGNED_XYZW;
+}
+
+// tex1DLayeredGrad for unsigned char: samples at (x, layer) with gradients dx, dy through
+// __ockl_image_sample_grad_1Da; TEXTURE_SET_UNSIGNED presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredGrad(
+    unsigned char* retVal, hipTextureObject_t textureObject, float x, int layer, float dx,
+    float dy) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float2 coord(x, layer);
+    texel.f = __ockl_image_sample_grad_1Da(i, s, coord.data, dx, dy);
+    TEXTURE_SET_UNSIGNED;
+}
+
+// tex1DLayeredGrad for uchar1: samples at (x, layer) with gradients dx, dy through
+// __ockl_image_sample_grad_1Da; TEXTURE_SET_UNSIGNED_X presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredGrad(
+    uchar1* retVal, hipTextureObject_t textureObject, float x, int layer, float dx, float dy) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float2 coord(x, layer);
+    texel.f = __ockl_image_sample_grad_1Da(i, s, coord.data, dx, dy);
+    TEXTURE_SET_UNSIGNED_X;
+}
+
+// tex1DLayeredGrad for uchar2: samples at (x, layer) with gradients dx, dy through
+// __ockl_image_sample_grad_1Da; TEXTURE_SET_UNSIGNED_XY presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredGrad(
+    uchar2* retVal, hipTextureObject_t textureObject, float x, int layer, float dx, float dy) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float2 coord(x, layer);
+    texel.f = __ockl_image_sample_grad_1Da(i, s, coord.data, dx, dy);
+    TEXTURE_SET_UNSIGNED_XY;
+}
+
+// tex1DLayeredGrad for uchar4: samples at (x, layer) with gradients dx, dy through
+// __ockl_image_sample_grad_1Da; TEXTURE_SET_UNSIGNED_XYZW presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredGrad(
+    uchar4* retVal, hipTextureObject_t textureObject, float x, int layer, float dx, float dy) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float2 coord(x, layer);
+    texel.f = __ockl_image_sample_grad_1Da(i, s, coord.data, dx, dy);
+    TEXTURE_SET_UNSIGNED_XYZW;
+}
+
+// tex1DLayeredGrad for short: samples at (x, layer) with gradients dx, dy through
+// __ockl_image_sample_grad_1Da; TEXTURE_SET_SIGNED presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredGrad(
+    short* retVal, hipTextureObject_t textureObject, float x, int layer, float dx, float dy) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float2 coord(x, layer);
+    texel.f = __ockl_image_sample_grad_1Da(i, s, coord.data, dx, dy);
+    TEXTURE_SET_SIGNED;
+}
+
+// tex1DLayeredGrad for short1: samples at (x, layer) with gradients dx, dy through
+// __ockl_image_sample_grad_1Da; TEXTURE_SET_SIGNED_X presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredGrad(
+    short1* retVal, hipTextureObject_t textureObject, float x, int layer, float dx, float dy) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float2 coord(x, layer);
+    texel.f = __ockl_image_sample_grad_1Da(i, s, coord.data, dx, dy);
+    TEXTURE_SET_SIGNED_X;
+}
+
+// tex1DLayeredGrad for short2: samples at (x, layer) with gradients dx, dy through
+// __ockl_image_sample_grad_1Da; TEXTURE_SET_SIGNED_XY presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredGrad(
+    short2* retVal, hipTextureObject_t textureObject, float x, int layer, float dx, float dy) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float2 coord(x, layer);
+    texel.f = __ockl_image_sample_grad_1Da(i, s, coord.data, dx, dy);
+    TEXTURE_SET_SIGNED_XY;
+}
+
+// tex1DLayeredGrad for short4: samples at (x, layer) with gradients dx, dy through
+// __ockl_image_sample_grad_1Da; TEXTURE_SET_SIGNED_XYZW presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredGrad(
+    short4* retVal, hipTextureObject_t textureObject, float x, int layer, float dx, float dy) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float2 coord(x, layer);
+    texel.f = __ockl_image_sample_grad_1Da(i, s, coord.data, dx, dy);
+    TEXTURE_SET_SIGNED_XYZW;
+}
+
+// tex1DLayeredGrad for unsigned short: samples at (x, layer) with gradients dx, dy through
+// __ockl_image_sample_grad_1Da; TEXTURE_SET_UNSIGNED presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredGrad(
+    unsigned short* retVal, hipTextureObject_t textureObject, float x, int layer, float dx,
+    float dy) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float2 coord(x, layer);
+    texel.f = __ockl_image_sample_grad_1Da(i, s, coord.data, dx, dy);
+    TEXTURE_SET_UNSIGNED;
+}
+
+// tex1DLayeredGrad for ushort1: samples at (x, layer) with gradients dx, dy through
+// __ockl_image_sample_grad_1Da; TEXTURE_SET_UNSIGNED_X presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredGrad(
+    ushort1* retVal, hipTextureObject_t textureObject, float x, int layer, float dx, float dy) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float2 coord(x, layer);
+    texel.f = __ockl_image_sample_grad_1Da(i, s, coord.data, dx, dy);
+    TEXTURE_SET_UNSIGNED_X;
+}
+
+// tex1DLayeredGrad for ushort2: samples at (x, layer) with gradients dx, dy through
+// __ockl_image_sample_grad_1Da; TEXTURE_SET_UNSIGNED_XY presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredGrad(
+    ushort2* retVal, hipTextureObject_t textureObject, float x, int layer, float dx, float dy) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float2 coord(x, layer);
+    texel.f = __ockl_image_sample_grad_1Da(i, s, coord.data, dx, dy);
+    TEXTURE_SET_UNSIGNED_XY;
+}
+
+// tex1DLayeredGrad for ushort4: samples at (x, layer) with gradients dx, dy through
+// __ockl_image_sample_grad_1Da; TEXTURE_SET_UNSIGNED_XYZW presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredGrad(
+    ushort4* retVal, hipTextureObject_t textureObject, float x, int layer, float dx, float dy) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float2 coord(x, layer);
+    texel.f = __ockl_image_sample_grad_1Da(i, s, coord.data, dx, dy);
+    TEXTURE_SET_UNSIGNED_XYZW;
+}
+
+// tex1DLayeredGrad for int: samples at (x, layer) with gradients dx, dy through
+// __ockl_image_sample_grad_1Da; TEXTURE_SET_SIGNED presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredGrad(
+    int* retVal, hipTextureObject_t textureObject, float x, int layer, float dx, float dy) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float2 coord(x, layer);
+    texel.f = __ockl_image_sample_grad_1Da(i, s, coord.data, dx, dy);
+    TEXTURE_SET_SIGNED;
+}
+
+// tex1DLayeredGrad for int1: samples at (x, layer) with gradients dx, dy through
+// __ockl_image_sample_grad_1Da; TEXTURE_SET_SIGNED_X presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredGrad(
+    int1* retVal, hipTextureObject_t textureObject, float x, int layer, float dx, float dy) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float2 coord(x, layer);
+    texel.f = __ockl_image_sample_grad_1Da(i, s, coord.data, dx, dy);
+    TEXTURE_SET_SIGNED_X;
+}
+
+// tex1DLayeredGrad for int2: samples at (x, layer) with gradients dx, dy through
+// __ockl_image_sample_grad_1Da; TEXTURE_SET_SIGNED_XY presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredGrad(
+    int2* retVal, hipTextureObject_t textureObject, float x, int layer, float dx, float dy) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float2 coord(x, layer);
+    texel.f = __ockl_image_sample_grad_1Da(i, s, coord.data, dx, dy);
+    TEXTURE_SET_SIGNED_XY;
+}
+
+// tex1DLayeredGrad for int4: samples at (x, layer) with gradients dx, dy through
+// __ockl_image_sample_grad_1Da; TEXTURE_SET_SIGNED_XYZW presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredGrad(
+    int4* retVal, hipTextureObject_t textureObject, float x, int layer, float dx, float dy) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float2 coord(x, layer);
+    texel.f = __ockl_image_sample_grad_1Da(i, s, coord.data, dx, dy);
+    TEXTURE_SET_SIGNED_XYZW;
+}
+
+// tex1DLayeredGrad for unsigned int: samples at (x, layer) with gradients dx, dy through
+// __ockl_image_sample_grad_1Da; TEXTURE_SET_UNSIGNED presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredGrad(
+    unsigned int* retVal, hipTextureObject_t textureObject, float x, int layer, float dx,
+    float dy) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float2 coord(x, layer);
+    texel.f = __ockl_image_sample_grad_1Da(i, s, coord.data, dx, dy);
+    TEXTURE_SET_UNSIGNED;
+}
+
+// tex1DLayeredGrad for uint1: samples at (x, layer) with gradients dx, dy through
+// __ockl_image_sample_grad_1Da; TEXTURE_SET_UNSIGNED_X presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredGrad(
+    uint1* retVal, hipTextureObject_t textureObject, float x, int layer, float dx, float dy) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float2 coord(x, layer);
+    texel.f = __ockl_image_sample_grad_1Da(i, s, coord.data, dx, dy);
+    TEXTURE_SET_UNSIGNED_X;
+}
+
+// tex1DLayeredGrad for uint2: samples at (x, layer) with gradients dx, dy through
+// __ockl_image_sample_grad_1Da; TEXTURE_SET_UNSIGNED_XY presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredGrad(
+    uint2* retVal, hipTextureObject_t textureObject, float x, int layer, float dx, float dy) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float2 coord(x, layer);
+    texel.f = __ockl_image_sample_grad_1Da(i, s, coord.data, dx, dy);
+    TEXTURE_SET_UNSIGNED_XY;
+}
+
+// tex1DLayeredGrad for uint4: samples at (x, layer) with gradients dx, dy through
+// __ockl_image_sample_grad_1Da; TEXTURE_SET_UNSIGNED_XYZW presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredGrad(
+    uint4* retVal, hipTextureObject_t textureObject, float x, int layer, float dx, float dy) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float2 coord(x, layer);
+    texel.f = __ockl_image_sample_grad_1Da(i, s, coord.data, dx, dy);
+    TEXTURE_SET_UNSIGNED_XYZW;
+}
+
+// tex1DLayeredGrad for float: samples at (x, layer) with gradients dx, dy through
+// __ockl_image_sample_grad_1Da; TEXTURE_SET_FLOAT presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredGrad(
+    float* retVal, hipTextureObject_t textureObject, float x, int layer, float dx, float dy) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float2 coord(x, layer);
+    texel.f = __ockl_image_sample_grad_1Da(i, s, coord.data, dx, dy);
+    TEXTURE_SET_FLOAT;
+}
+
+// tex1DLayeredGrad for float1: samples at (x, layer) with gradients dx, dy through
+// __ockl_image_sample_grad_1Da; TEXTURE_SET_FLOAT_X presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredGrad(
+    float1* retVal, hipTextureObject_t textureObject, float x, int layer, float dx, float dy) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float2 coord(x, layer);
+    texel.f = __ockl_image_sample_grad_1Da(i, s, coord.data, dx, dy);
+    TEXTURE_SET_FLOAT_X;
+}
+
+// tex1DLayeredGrad for float2: samples at (x, layer) with gradients dx, dy through
+// __ockl_image_sample_grad_1Da; TEXTURE_SET_FLOAT_XY presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredGrad(
+    float2* retVal, hipTextureObject_t textureObject, float x, int layer, float dx, float dy) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float2 coord(x, layer);
+    texel.f = __ockl_image_sample_grad_1Da(i, s, coord.data, dx, dy);
+    TEXTURE_SET_FLOAT_XY;
+}
+
+// tex1DLayeredGrad for float4: samples at (x, layer) with gradients dx, dy through
+// __ockl_image_sample_grad_1Da; TEXTURE_SET_FLOAT_XYZW presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex1DLayeredGrad(
+    float4* retVal, hipTextureObject_t textureObject, float x, int layer, float dx, float dy) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float2 coord(x, layer);
+    texel.f = __ockl_image_sample_grad_1Da(i, s, coord.data, dx, dy);
+    TEXTURE_SET_FLOAT_XYZW;
+}
+
+// Value-returning form of tex1DLayeredGrad: forwards to the pointer-output overload selected by
+// T* and returns whatever that call left in the local.
+template <class T>
+__TEXTURE_FUNCTIONS_DECL__ T tex1DLayeredGrad(
+    hipTextureObject_t textureObject, float x, int layer, float dx, float dy) {
+    T result;  // default-initialized; presumably filled by the overload called next
+    tex1DLayeredGrad(&result, textureObject, x, layer, dx, dy);
+    return result;
+}
+
+////////////////////////////////////////////////////////////
+// tex2DLayered for char: samples at the float4 coordinate (x, y, layer, 0.0f) through
+// __ockl_image_sample_2Da; TEXTURE_SET_SIGNED presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLayered(
+    char* retVal, hipTextureObject_t textureObject, float x, float y, int layer) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float4 coord(x, y, layer, 0.0f);
+    texel.f = __ockl_image_sample_2Da(i, s, coord.data);
+    TEXTURE_SET_SIGNED;
+}
+
+// tex2DLayered for char1: samples at the float4 coordinate (x, y, layer, 0.0f) through
+// __ockl_image_sample_2Da; TEXTURE_SET_SIGNED_X presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLayered(
+    char1* retVal, hipTextureObject_t textureObject, float x, float y, int layer) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float4 coord(x, y, layer, 0.0f);
+    texel.f = __ockl_image_sample_2Da(i, s, coord.data);
+    TEXTURE_SET_SIGNED_X;
+}
+
+// tex2DLayered for char2: samples at the float4 coordinate (x, y, layer, 0.0f) through
+// __ockl_image_sample_2Da; TEXTURE_SET_SIGNED_XY presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLayered(
+    char2* retVal, hipTextureObject_t textureObject, float x, float y, int layer) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float4 coord(x, y, layer, 0.0f);
+    texel.f = __ockl_image_sample_2Da(i, s, coord.data);
+    TEXTURE_SET_SIGNED_XY;
+}
+
+// tex2DLayered for char4: samples at the float4 coordinate (x, y, layer, 0.0f) through
+// __ockl_image_sample_2Da; TEXTURE_SET_SIGNED_XYZW presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLayered(
+    char4* retVal, hipTextureObject_t textureObject, float x, float y, int layer) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float4 coord(x, y, layer, 0.0f);
+    texel.f = __ockl_image_sample_2Da(i, s, coord.data);
+    TEXTURE_SET_SIGNED_XYZW;
+}
+
+// tex2DLayered for unsigned char: samples at the float4 coordinate (x, y, layer, 0.0f) through
+// __ockl_image_sample_2Da; TEXTURE_SET_UNSIGNED presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLayered(
+    unsigned char* retVal, hipTextureObject_t textureObject, float x, float y, int layer) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float4 coord(x, y, layer, 0.0f);
+    texel.f = __ockl_image_sample_2Da(i, s, coord.data);
+    TEXTURE_SET_UNSIGNED;
+}
+
+// tex2DLayered for uchar1: samples at the float4 coordinate (x, y, layer, 0.0f) through
+// __ockl_image_sample_2Da; TEXTURE_SET_UNSIGNED_X presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLayered(
+    uchar1* retVal, hipTextureObject_t textureObject, float x, float y, int layer) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float4 coord(x, y, layer, 0.0f);
+    texel.f = __ockl_image_sample_2Da(i, s, coord.data);
+    TEXTURE_SET_UNSIGNED_X;
+}
+
+// tex2DLayered for uchar2: samples at the float4 coordinate (x, y, layer, 0.0f) through
+// __ockl_image_sample_2Da; TEXTURE_SET_UNSIGNED_XY presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLayered(
+    uchar2* retVal, hipTextureObject_t textureObject, float x, float y, int layer) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float4 coord(x, y, layer, 0.0f);
+    texel.f = __ockl_image_sample_2Da(i, s, coord.data);
+    TEXTURE_SET_UNSIGNED_XY;
+}
+
+// tex2DLayered for uchar4: samples at the float4 coordinate (x, y, layer, 0.0f) through
+// __ockl_image_sample_2Da; TEXTURE_SET_UNSIGNED_XYZW presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLayered(
+    uchar4* retVal, hipTextureObject_t textureObject, float x, float y, int layer) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float4 coord(x, y, layer, 0.0f);
+    texel.f = __ockl_image_sample_2Da(i, s, coord.data);
+    TEXTURE_SET_UNSIGNED_XYZW;
+}
+
+// tex2DLayered for short: samples at the float4 coordinate (x, y, layer, 0.0f) through
+// __ockl_image_sample_2Da; TEXTURE_SET_SIGNED presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLayered(
+    short* retVal, hipTextureObject_t textureObject, float x, float y, int layer) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float4 coord(x, y, layer, 0.0f);
+    texel.f = __ockl_image_sample_2Da(i, s, coord.data);
+    TEXTURE_SET_SIGNED;
+}
+
+// tex2DLayered for short1: samples at the float4 coordinate (x, y, layer, 0.0f) through
+// __ockl_image_sample_2Da; TEXTURE_SET_SIGNED_X presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLayered(
+    short1* retVal, hipTextureObject_t textureObject, float x, float y, int layer) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float4 coord(x, y, layer, 0.0f);
+    texel.f = __ockl_image_sample_2Da(i, s, coord.data);
+    TEXTURE_SET_SIGNED_X;
+}
+
+// tex2DLayered for short2: samples at the float4 coordinate (x, y, layer, 0.0f) through
+// __ockl_image_sample_2Da; TEXTURE_SET_SIGNED_XY presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLayered(
+    short2* retVal, hipTextureObject_t textureObject, float x, float y, int layer) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float4 coord(x, y, layer, 0.0f);
+    texel.f = __ockl_image_sample_2Da(i, s, coord.data);
+    TEXTURE_SET_SIGNED_XY;
+}
+
+// tex2DLayered for short4: samples at the float4 coordinate (x, y, layer, 0.0f) through
+// __ockl_image_sample_2Da; TEXTURE_SET_SIGNED_XYZW presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLayered(
+    short4* retVal, hipTextureObject_t textureObject, float x, float y, int layer) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float4 coord(x, y, layer, 0.0f);
+    texel.f = __ockl_image_sample_2Da(i, s, coord.data);
+    TEXTURE_SET_SIGNED_XYZW;
+}
+
+// tex2DLayered for unsigned short: samples at the float4 coordinate (x, y, layer, 0.0f) through
+// __ockl_image_sample_2Da; TEXTURE_SET_UNSIGNED presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLayered(
+    unsigned short* retVal, hipTextureObject_t textureObject, float x, float y, int layer) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float4 coord(x, y, layer, 0.0f);
+    texel.f = __ockl_image_sample_2Da(i, s, coord.data);
+    TEXTURE_SET_UNSIGNED;
+}
+
+// tex2DLayered for ushort1: samples at the float4 coordinate (x, y, layer, 0.0f) through
+// __ockl_image_sample_2Da; TEXTURE_SET_UNSIGNED_X presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLayered(
+    ushort1* retVal, hipTextureObject_t textureObject, float x, float y, int layer) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float4 coord(x, y, layer, 0.0f);
+    texel.f = __ockl_image_sample_2Da(i, s, coord.data);
+    TEXTURE_SET_UNSIGNED_X;
+}
+
+// tex2DLayered for ushort2: samples at the float4 coordinate (x, y, layer, 0.0f) through
+// __ockl_image_sample_2Da; TEXTURE_SET_UNSIGNED_XY presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLayered(
+    ushort2* retVal, hipTextureObject_t textureObject, float x, float y, int layer) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float4 coord(x, y, layer, 0.0f);
+    texel.f = __ockl_image_sample_2Da(i, s, coord.data);
+    TEXTURE_SET_UNSIGNED_XY;
+}
+
+// tex2DLayered for ushort4: samples at the float4 coordinate (x, y, layer, 0.0f) through
+// __ockl_image_sample_2Da; TEXTURE_SET_UNSIGNED_XYZW presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLayered(
+    ushort4* retVal, hipTextureObject_t textureObject, float x, float y, int layer) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float4 coord(x, y, layer, 0.0f);
+    texel.f = __ockl_image_sample_2Da(i, s, coord.data);
+    TEXTURE_SET_UNSIGNED_XYZW;
+}
+
+// tex2DLayered for int: samples at the float4 coordinate (x, y, layer, 0.0f) through
+// __ockl_image_sample_2Da; TEXTURE_SET_SIGNED presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLayered(
+    int* retVal, hipTextureObject_t textureObject, float x, float y, int layer) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float4 coord(x, y, layer, 0.0f);
+    texel.f = __ockl_image_sample_2Da(i, s, coord.data);
+    TEXTURE_SET_SIGNED;
+}
+
+// tex2DLayered for int1: samples at the float4 coordinate (x, y, layer, 0.0f) through
+// __ockl_image_sample_2Da; TEXTURE_SET_SIGNED_X presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLayered(
+    int1* retVal, hipTextureObject_t textureObject, float x, float y, int layer) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float4 coord(x, y, layer, 0.0f);
+    texel.f = __ockl_image_sample_2Da(i, s, coord.data);
+    TEXTURE_SET_SIGNED_X;
+}
+
+// tex2DLayered for int2: samples at the float4 coordinate (x, y, layer, 0.0f) through
+// __ockl_image_sample_2Da; TEXTURE_SET_SIGNED_XY presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLayered(
+    int2* retVal, hipTextureObject_t textureObject, float x, float y, int layer) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float4 coord(x, y, layer, 0.0f);
+    texel.f = __ockl_image_sample_2Da(i, s, coord.data);
+    TEXTURE_SET_SIGNED_XY;
+}
+
+// tex2DLayered for int4: samples at the float4 coordinate (x, y, layer, 0.0f) through
+// __ockl_image_sample_2Da; TEXTURE_SET_SIGNED_XYZW presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLayered(
+    int4* retVal, hipTextureObject_t textureObject, float x, float y, int layer) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float4 coord(x, y, layer, 0.0f);
+    texel.f = __ockl_image_sample_2Da(i, s, coord.data);
+    TEXTURE_SET_SIGNED_XYZW;
+}
+
+// tex2DLayered for unsigned int: samples at the float4 coordinate (x, y, layer, 0.0f) through
+// __ockl_image_sample_2Da; TEXTURE_SET_UNSIGNED presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLayered(
+    unsigned int* retVal, hipTextureObject_t textureObject, float x, float y, int layer) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float4 coord(x, y, layer, 0.0f);
+    texel.f = __ockl_image_sample_2Da(i, s, coord.data);
+    TEXTURE_SET_UNSIGNED;
+}
+
+// tex2DLayered for uint1: samples at the float4 coordinate (x, y, layer, 0.0f) through
+// __ockl_image_sample_2Da; TEXTURE_SET_UNSIGNED_X presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLayered(
+    uint1* retVal, hipTextureObject_t textureObject, float x, float y, int layer) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float4 coord(x, y, layer, 0.0f);
+    texel.f = __ockl_image_sample_2Da(i, s, coord.data);
+    TEXTURE_SET_UNSIGNED_X;
+}
+
+// tex2DLayered for uint2: samples at the float4 coordinate (x, y, layer, 0.0f) through
+// __ockl_image_sample_2Da; TEXTURE_SET_UNSIGNED_XY presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLayered(
+    uint2* retVal, hipTextureObject_t textureObject, float x, float y, int layer) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float4 coord(x, y, layer, 0.0f);
+    texel.f = __ockl_image_sample_2Da(i, s, coord.data);
+    TEXTURE_SET_UNSIGNED_XY;
+}
+
+// tex2DLayered for uint4: samples at the float4 coordinate (x, y, layer, 0.0f) through
+// __ockl_image_sample_2Da; TEXTURE_SET_UNSIGNED_XYZW presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLayered(
+    uint4* retVal, hipTextureObject_t textureObject, float x, float y, int layer) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float4 coord(x, y, layer, 0.0f);
+    texel.f = __ockl_image_sample_2Da(i, s, coord.data);
+    TEXTURE_SET_UNSIGNED_XYZW;
+}
+
+// tex2DLayered for float: samples at the float4 coordinate (x, y, layer, 0.0f) through
+// __ockl_image_sample_2Da; TEXTURE_SET_FLOAT presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLayered(
+    float* retVal, hipTextureObject_t textureObject, float x, float y, int layer) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float4 coord(x, y, layer, 0.0f);
+    texel.f = __ockl_image_sample_2Da(i, s, coord.data);
+    TEXTURE_SET_FLOAT;
+}
+
+// tex2DLayered for float1: samples at the float4 coordinate (x, y, layer, 0.0f) through
+// __ockl_image_sample_2Da; TEXTURE_SET_FLOAT_X presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLayered(
+    float1* retVal, hipTextureObject_t textureObject, float x, float y, int layer) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float4 coord(x, y, layer, 0.0f);
+    texel.f = __ockl_image_sample_2Da(i, s, coord.data);
+    TEXTURE_SET_FLOAT_X;
+}
+
+// tex2DLayered for float2: samples at the float4 coordinate (x, y, layer, 0.0f) through
+// __ockl_image_sample_2Da; TEXTURE_SET_FLOAT_XY presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLayered(
+    float2* retVal, hipTextureObject_t textureObject, float x, float y, int layer) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float4 coord(x, y, layer, 0.0f);
+    texel.f = __ockl_image_sample_2Da(i, s, coord.data);
+    TEXTURE_SET_FLOAT_XY;
+}
+
+// tex2DLayered for float4: samples at the float4 coordinate (x, y, layer, 0.0f) through
+// __ockl_image_sample_2Da; TEXTURE_SET_FLOAT_XYZW presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLayered(
+    float4* retVal, hipTextureObject_t textureObject, float x, float y, int layer) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float4 coord(x, y, layer, 0.0f);
+    texel.f = __ockl_image_sample_2Da(i, s, coord.data);
+    TEXTURE_SET_FLOAT_XYZW;
+}
+
+// Value-returning form of tex2DLayered: forwards to the pointer-output overload selected by T*
+// and returns whatever that call left in the local.
+template <class T>
+__TEXTURE_FUNCTIONS_DECL__ T tex2DLayered(
+    hipTextureObject_t textureObject, float x, float y, int layer) {
+    T result;  // default-initialized; presumably filled by the overload called next
+    tex2DLayered(&result, textureObject, x, y, layer);
+    return result;
+}
+
+////////////////////////////////////////////////////////////
+// tex2DLayeredLod for char: samples at (x, y, layer, 0.0f) with LOD `level` through
+// __ockl_image_sample_lod_2Da; TEXTURE_SET_SIGNED presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLayeredLod(
+    char* retVal, hipTextureObject_t textureObject, float x, float y, int layer, float level) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float4 coord(x, y, layer, 0.0f);
+    texel.f = __ockl_image_sample_lod_2Da(i, s, coord.data, level);
+    TEXTURE_SET_SIGNED;
+}
+
+// tex2DLayeredLod for char1: samples at (x, y, layer, 0.0f) with LOD `level` through
+// __ockl_image_sample_lod_2Da; TEXTURE_SET_SIGNED_X presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLayeredLod(
+    char1* retVal, hipTextureObject_t textureObject, float x, float y, int layer, float level) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float4 coord(x, y, layer, 0.0f);
+    texel.f = __ockl_image_sample_lod_2Da(i, s, coord.data, level);
+    TEXTURE_SET_SIGNED_X;
+}
+
+// tex2DLayeredLod for char2: samples at (x, y, layer, 0.0f) with LOD `level` through
+// __ockl_image_sample_lod_2Da; TEXTURE_SET_SIGNED_XY presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLayeredLod(
+    char2* retVal, hipTextureObject_t textureObject, float x, float y, int layer, float level) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float4 coord(x, y, layer, 0.0f);
+    texel.f = __ockl_image_sample_lod_2Da(i, s, coord.data, level);
+    TEXTURE_SET_SIGNED_XY;
+}
+
+// tex2DLayeredLod for char4: samples at (x, y, layer, 0.0f) with LOD `level` through
+// __ockl_image_sample_lod_2Da; TEXTURE_SET_SIGNED_XYZW presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLayeredLod(
+    char4* retVal, hipTextureObject_t textureObject, float x, float y, int layer, float level) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float4 coord(x, y, layer, 0.0f);
+    texel.f = __ockl_image_sample_lod_2Da(i, s, coord.data, level);
+    TEXTURE_SET_SIGNED_XYZW;
+}
+
+// tex2DLayeredLod for unsigned char: samples at (x, y, layer, 0.0f) with LOD `level` through
+// __ockl_image_sample_lod_2Da; TEXTURE_SET_UNSIGNED presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLayeredLod(
+    unsigned char* retVal, hipTextureObject_t textureObject, float x, float y, int layer,
+    float level) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float4 coord(x, y, layer, 0.0f);
+    texel.f = __ockl_image_sample_lod_2Da(i, s, coord.data, level);
+    TEXTURE_SET_UNSIGNED;
+}
+
+// tex2DLayeredLod for uchar1: samples at (x, y, layer, 0.0f) with LOD `level` through
+// __ockl_image_sample_lod_2Da; TEXTURE_SET_UNSIGNED_X presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLayeredLod(
+    uchar1* retVal, hipTextureObject_t textureObject, float x, float y, int layer, float level) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float4 coord(x, y, layer, 0.0f);
+    texel.f = __ockl_image_sample_lod_2Da(i, s, coord.data, level);
+    TEXTURE_SET_UNSIGNED_X;
+}
+
+// tex2DLayeredLod for uchar2: samples at (x, y, layer, 0.0f) with LOD `level` through
+// __ockl_image_sample_lod_2Da; TEXTURE_SET_UNSIGNED_XY presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLayeredLod(
+    uchar2* retVal, hipTextureObject_t textureObject, float x, float y, int layer, float level) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float4 coord(x, y, layer, 0.0f);
+    texel.f = __ockl_image_sample_lod_2Da(i, s, coord.data, level);
+    TEXTURE_SET_UNSIGNED_XY;
+}
+
+// tex2DLayeredLod for uchar4: samples at (x, y, layer, 0.0f) with LOD `level` through
+// __ockl_image_sample_lod_2Da; TEXTURE_SET_UNSIGNED_XYZW presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLayeredLod(
+    uchar4* retVal, hipTextureObject_t textureObject, float x, float y, int layer, float level) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float4 coord(x, y, layer, 0.0f);
+    texel.f = __ockl_image_sample_lod_2Da(i, s, coord.data, level);
+    TEXTURE_SET_UNSIGNED_XYZW;
+}
+
+// tex2DLayeredLod for short: samples at (x, y, layer, 0.0f) with LOD `level` through
+// __ockl_image_sample_lod_2Da; TEXTURE_SET_SIGNED presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLayeredLod(
+    short* retVal, hipTextureObject_t textureObject, float x, float y, int layer, float level) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float4 coord(x, y, layer, 0.0f);
+    texel.f = __ockl_image_sample_lod_2Da(i, s, coord.data, level);
+    TEXTURE_SET_SIGNED;
+}
+
+// tex2DLayeredLod for short1: samples at (x, y, layer, 0.0f) with LOD `level` through
+// __ockl_image_sample_lod_2Da; TEXTURE_SET_SIGNED_X presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLayeredLod(
+    short1* retVal, hipTextureObject_t textureObject, float x, float y, int layer, float level) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float4 coord(x, y, layer, 0.0f);
+    texel.f = __ockl_image_sample_lod_2Da(i, s, coord.data, level);
+    TEXTURE_SET_SIGNED_X;
+}
+
+// tex2DLayeredLod for short2: samples at (x, y, layer, 0.0f) with LOD `level` through
+// __ockl_image_sample_lod_2Da; TEXTURE_SET_SIGNED_XY presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLayeredLod(
+    short2* retVal, hipTextureObject_t textureObject, float x, float y, int layer, float level) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float4 coord(x, y, layer, 0.0f);
+    texel.f = __ockl_image_sample_lod_2Da(i, s, coord.data, level);
+    TEXTURE_SET_SIGNED_XY;
+}
+
+// tex2DLayeredLod for short4: samples at (x, y, layer, 0.0f) with LOD `level` through
+// __ockl_image_sample_lod_2Da; TEXTURE_SET_SIGNED_XYZW presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLayeredLod(
+    short4* retVal, hipTextureObject_t textureObject, float x, float y, int layer, float level) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float4 coord(x, y, layer, 0.0f);
+    texel.f = __ockl_image_sample_lod_2Da(i, s, coord.data, level);
+    TEXTURE_SET_SIGNED_XYZW;
+}
+
+// tex2DLayeredLod for unsigned short: samples at (x, y, layer, 0.0f) with LOD `level` through
+// __ockl_image_sample_lod_2Da; TEXTURE_SET_UNSIGNED presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLayeredLod(
+    unsigned short* retVal, hipTextureObject_t textureObject, float x, float y, int layer,
+    float level) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float4 coord(x, y, layer, 0.0f);
+    texel.f = __ockl_image_sample_lod_2Da(i, s, coord.data, level);
+    TEXTURE_SET_UNSIGNED;
+}
+
+// tex2DLayeredLod for ushort1: samples at (x, y, layer, 0.0f) with LOD `level` through
+// __ockl_image_sample_lod_2Da; TEXTURE_SET_UNSIGNED_X presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLayeredLod(
+    ushort1* retVal, hipTextureObject_t textureObject, float x, float y, int layer, float level) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float4 coord(x, y, layer, 0.0f);
+    texel.f = __ockl_image_sample_lod_2Da(i, s, coord.data, level);
+    TEXTURE_SET_UNSIGNED_X;
+}
+
+// tex2DLayeredLod for ushort2: samples at (x, y, layer, 0.0f) with LOD `level` through
+// __ockl_image_sample_lod_2Da; TEXTURE_SET_UNSIGNED_XY presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLayeredLod(
+    ushort2* retVal, hipTextureObject_t textureObject, float x, float y, int layer, float level) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float4 coord(x, y, layer, 0.0f);
+    texel.f = __ockl_image_sample_lod_2Da(i, s, coord.data, level);
+    TEXTURE_SET_UNSIGNED_XY;
+}
+
+// tex2DLayeredLod for ushort4: samples at (x, y, layer, 0.0f) with LOD `level` through
+// __ockl_image_sample_lod_2Da; TEXTURE_SET_UNSIGNED_XYZW presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLayeredLod(
+    ushort4* retVal, hipTextureObject_t textureObject, float x, float y, int layer, float level) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float4 coord(x, y, layer, 0.0f);
+    texel.f = __ockl_image_sample_lod_2Da(i, s, coord.data, level);
+    TEXTURE_SET_UNSIGNED_XYZW;
+}
+
+// tex2DLayeredLod for int: samples at (x, y, layer, 0.0f) with LOD `level` through
+// __ockl_image_sample_lod_2Da; TEXTURE_SET_SIGNED presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLayeredLod(
+    int* retVal, hipTextureObject_t textureObject, float x, float y, int layer, float level) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float4 coord(x, y, layer, 0.0f);
+    texel.f = __ockl_image_sample_lod_2Da(i, s, coord.data, level);
+    TEXTURE_SET_SIGNED;
+}
+
+// tex2DLayeredLod for int1: samples at (x, y, layer, 0.0f) with LOD `level` through
+// __ockl_image_sample_lod_2Da; TEXTURE_SET_SIGNED_X presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLayeredLod(
+    int1* retVal, hipTextureObject_t textureObject, float x, float y, int layer, float level) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float4 coord(x, y, layer, 0.0f);
+    texel.f = __ockl_image_sample_lod_2Da(i, s, coord.data, level);
+    TEXTURE_SET_SIGNED_X;
+}
+
+// tex2DLayeredLod for int2: samples at (x, y, layer, 0.0f) with LOD `level` through
+// __ockl_image_sample_lod_2Da; TEXTURE_SET_SIGNED_XY presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLayeredLod(
+    int2* retVal, hipTextureObject_t textureObject, float x, float y, int layer, float level) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float4 coord(x, y, layer, 0.0f);
+    texel.f = __ockl_image_sample_lod_2Da(i, s, coord.data, level);
+    TEXTURE_SET_SIGNED_XY;
+}
+
+// tex2DLayeredLod for int4: samples at (x, y, layer, 0.0f) with LOD `level` through
+// __ockl_image_sample_lod_2Da; TEXTURE_SET_SIGNED_XYZW presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLayeredLod(
+    int4* retVal, hipTextureObject_t textureObject, float x, float y, int layer, float level) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float4 coord(x, y, layer, 0.0f);
+    texel.f = __ockl_image_sample_lod_2Da(i, s, coord.data, level);
+    TEXTURE_SET_SIGNED_XYZW;
+}
+
+// tex2DLayeredLod for unsigned int: samples at (x, y, layer, 0.0f) with LOD `level` through
+// __ockl_image_sample_lod_2Da; TEXTURE_SET_UNSIGNED presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLayeredLod(
+    unsigned int* retVal, hipTextureObject_t textureObject, float x, float y, int layer,
+    float level) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float4 coord(x, y, layer, 0.0f);
+    texel.f = __ockl_image_sample_lod_2Da(i, s, coord.data, level);
+    TEXTURE_SET_UNSIGNED;
+}
+
+// tex2DLayeredLod for uint1: samples at (x, y, layer, 0.0f) with LOD `level` through
+// __ockl_image_sample_lod_2Da; TEXTURE_SET_UNSIGNED_X presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLayeredLod(
+    uint1* retVal, hipTextureObject_t textureObject, float x, float y, int layer, float level) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float4 coord(x, y, layer, 0.0f);
+    texel.f = __ockl_image_sample_lod_2Da(i, s, coord.data, level);
+    TEXTURE_SET_UNSIGNED_X;
+}
+
+// tex2DLayeredLod for uint2: samples at (x, y, layer, 0.0f) with LOD `level` through
+// __ockl_image_sample_lod_2Da; TEXTURE_SET_UNSIGNED_XY presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLayeredLod(
+    uint2* retVal, hipTextureObject_t textureObject, float x, float y, int layer, float level) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float4 coord(x, y, layer, 0.0f);
+    texel.f = __ockl_image_sample_lod_2Da(i, s, coord.data, level);
+    TEXTURE_SET_UNSIGNED_XY;
+}
+
+// tex2DLayeredLod for uint4: samples at (x, y, layer, 0.0f) with LOD `level` through
+// __ockl_image_sample_lod_2Da; TEXTURE_SET_UNSIGNED_XYZW presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLayeredLod(
+    uint4* retVal, hipTextureObject_t textureObject, float x, float y, int layer, float level) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float4 coord(x, y, layer, 0.0f);
+    texel.f = __ockl_image_sample_lod_2Da(i, s, coord.data, level);
+    TEXTURE_SET_UNSIGNED_XYZW;
+}
+
+// tex2DLayeredLod for float: samples at (x, y, layer, 0.0f) with LOD `level` through
+// __ockl_image_sample_lod_2Da; TEXTURE_SET_FLOAT presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLayeredLod(
+    float* retVal, hipTextureObject_t textureObject, float x, float y, int layer, float level) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float4 coord(x, y, layer, 0.0f);
+    texel.f = __ockl_image_sample_lod_2Da(i, s, coord.data, level);
+    TEXTURE_SET_FLOAT;
+}
+
+// tex2DLayeredLod for float1: samples at (x, y, layer, 0.0f) with LOD `level` through
+// __ockl_image_sample_lod_2Da; TEXTURE_SET_FLOAT_X presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLayeredLod(
+    float1* retVal, hipTextureObject_t textureObject, float x, float y, int layer, float level) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float4 coord(x, y, layer, 0.0f);
+    texel.f = __ockl_image_sample_lod_2Da(i, s, coord.data, level);
+    TEXTURE_SET_FLOAT_X;
+}
+
+// tex2DLayeredLod for float2: samples at (x, y, layer, 0.0f) with LOD `level` through
+// __ockl_image_sample_lod_2Da; TEXTURE_SET_FLOAT_XY presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLayeredLod(
+    float2* retVal, hipTextureObject_t textureObject, float x, float y, int layer, float level) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float4 coord(x, y, layer, 0.0f);
+    texel.f = __ockl_image_sample_lod_2Da(i, s, coord.data, level);
+    TEXTURE_SET_FLOAT_XY;
+}
+
+// tex2DLayeredLod for float4: samples at (x, y, layer, 0.0f) with LOD `level` through
+// __ockl_image_sample_lod_2Da; TEXTURE_SET_FLOAT_XYZW presumably stores the texel in *retVal.
+__TEXTURE_FUNCTIONS_DECL__ void tex2DLayeredLod(
+    float4* retVal, hipTextureObject_t textureObject, float x, float y, int layer, float level) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel used below
+    float4 coord(x, y, layer, 0.0f);
+    texel.f = __ockl_image_sample_lod_2Da(i, s, coord.data, level);
+    TEXTURE_SET_FLOAT_XYZW;
+}
+
+// Value-returning form of tex2DLayeredLod: forwards to the pointer-output overload selected by
+// T* and returns whatever that call left in the local.
+template <class T>
+__TEXTURE_FUNCTIONS_DECL__ T tex2DLayeredLod(
+    hipTextureObject_t textureObject, float x, float y, int layer, float level) {
+    T result;  // default-initialized; presumably filled by the overload called next
+    tex2DLayeredLod(&result, textureObject, x, y, layer, level);
+    return result;
+}
+
+////////////////////////////////////////////////////////////
+// Texture Reference APIs
+////////////////////////////////////////////////////////////
+// tex1Dfetch for texture<char, ...> references: samples at int coordinate x via
+// __ockl_image_sample_1D. Setup (i, s, texel) and the return statement presumably live in the
+// two macros used below. NOTE(review): x feeds a sampling intrinsic, not a load - confirm.
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ char tex1Dfetch(texture<char, texType, mode> texRef, int x) {
+ TEXTURE_REF_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_CHAR;
+}
+
+// tex1Dfetch for texture<char1, ...> references: samples at int coordinate x via
+// __ockl_image_sample_1D. Setup (i, s, texel) and the return statement presumably live in the
+// two macros used below. NOTE(review): x feeds a sampling intrinsic, not a load - confirm.
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ char1 tex1Dfetch(texture<char1, texType, mode> texRef, int x) {
+ TEXTURE_REF_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_CHAR_X;
+}
+
+// tex1Dfetch for texture<char2, ...> references: samples at int coordinate x via
+// __ockl_image_sample_1D. Setup (i, s, texel) and the return statement presumably live in the
+// two macros used below. NOTE(review): x feeds a sampling intrinsic, not a load - confirm.
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ char2 tex1Dfetch(texture<char2, texType, mode> texRef, int x) {
+ TEXTURE_REF_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_CHAR_XY;
+}
+
+// tex1Dfetch for texture<char4, ...> references: samples at int coordinate x via
+// __ockl_image_sample_1D. Setup (i, s, texel) and the return statement presumably live in the
+// two macros used below. NOTE(review): x feeds a sampling intrinsic, not a load - confirm.
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ char4 tex1Dfetch(texture<char4, texType, mode> texRef, int x) {
+ TEXTURE_REF_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_CHAR_XYZW;
+}
+
+// tex1Dfetch for texture<unsigned char, ...> references: samples at int coordinate x via
+// __ockl_image_sample_1D. Setup (i, s, texel) and the return statement presumably live in the
+// two macros used below. NOTE(review): x feeds a sampling intrinsic, not a load - confirm.
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ unsigned char tex1Dfetch(texture<unsigned char, texType, mode> texRef,
+ int x) {
+ TEXTURE_REF_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_UCHAR;
+}
+
+// tex1Dfetch for texture<uchar1, ...> references: samples at int coordinate x via
+// __ockl_image_sample_1D. Setup (i, s, texel) and the return statement presumably live in the
+// two macros used below. NOTE(review): x feeds a sampling intrinsic, not a load - confirm.
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ uchar1 tex1Dfetch(texture<uchar1, texType, mode> texRef, int x) {
+ TEXTURE_REF_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_UCHAR_X;
+}
+
+// tex1Dfetch for texture<uchar2, ...> references: samples at int coordinate x via
+// __ockl_image_sample_1D. Setup (i, s, texel) and the return statement presumably live in the
+// two macros used below. NOTE(review): x feeds a sampling intrinsic, not a load - confirm.
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ uchar2 tex1Dfetch(texture<uchar2, texType, mode> texRef, int x) {
+ TEXTURE_REF_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_UCHAR_XY;
+}
+
+// tex1Dfetch for texture<uchar4, ...> references: samples at int coordinate x via
+// __ockl_image_sample_1D. Setup (i, s, texel) and the return statement presumably live in the
+// two macros used below. NOTE(review): x feeds a sampling intrinsic, not a load - confirm.
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ uchar4 tex1Dfetch(texture<uchar4, texType, mode> texRef, int x) {
+ TEXTURE_REF_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_UCHAR_XYZW;
+}
+
+// tex1Dfetch for texture<short, ...> references: samples at int coordinate x via
+// __ockl_image_sample_1D. Setup (i, s, texel) and the return statement presumably live in the
+// two macros used below. NOTE(review): x feeds a sampling intrinsic, not a load - confirm.
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ short tex1Dfetch(texture<short, texType, mode> texRef, int x) {
+ TEXTURE_REF_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_SHORT;
+}
+
+// tex1Dfetch for texture<short1, ...> references: samples at int coordinate x via
+// __ockl_image_sample_1D. Setup (i, s, texel) and the return statement presumably live in the
+// two macros used below. NOTE(review): x feeds a sampling intrinsic, not a load - confirm.
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ short1 tex1Dfetch(texture<short1, texType, mode> texRef, int x) {
+ TEXTURE_REF_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_SHORT_X;
+}
+
+// tex1Dfetch for texture<short2, ...> references: samples at int coordinate x via
+// __ockl_image_sample_1D. Setup (i, s, texel) and the return statement presumably live in the
+// two macros used below. NOTE(review): x feeds a sampling intrinsic, not a load - confirm.
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ short2 tex1Dfetch(texture<short2, texType, mode> texRef, int x) {
+ TEXTURE_REF_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_SHORT_XY;
+}
+
+// tex1Dfetch for texture<short4, ...> references: samples at int coordinate x via
+// __ockl_image_sample_1D. Setup (i, s, texel) and the return statement presumably live in the
+// two macros used below. NOTE(review): x feeds a sampling intrinsic, not a load - confirm.
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ short4 tex1Dfetch(texture<short4, texType, mode> texRef, int x) {
+ TEXTURE_REF_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_SHORT_XYZW;
+}
+
+// tex1Dfetch for texture<ushort1, ...> references: samples at int coordinate x via
+// __ockl_image_sample_1D. Setup (i, s, texel) and the return statement presumably live in the
+// two macros used below. NOTE(review): x feeds a sampling intrinsic, not a load - confirm.
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ ushort1 tex1Dfetch(texture<ushort1, texType, mode> texRef, int x) {
+ TEXTURE_REF_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_USHORT_X;
+}
+
+// tex1Dfetch for texture<unsigned short, ...> references: samples at int coordinate x via
+// __ockl_image_sample_1D. Setup (i, s, texel) and the return statement presumably live in the
+// two macros used below. NOTE(review): x feeds a sampling intrinsic, not a load - confirm.
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ unsigned short tex1Dfetch(texture<unsigned short, texType, mode> texRef,
+ int x) {
+ TEXTURE_REF_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_USHORT;
+}
+
+// tex1Dfetch for texture<ushort2, ...> references: samples at int coordinate x via
+// __ockl_image_sample_1D. Setup (i, s, texel) and the return statement presumably live in the
+// two macros used below. NOTE(review): x feeds a sampling intrinsic, not a load - confirm.
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ ushort2 tex1Dfetch(texture<ushort2, texType, mode> texRef, int x) {
+ TEXTURE_REF_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_USHORT_XY;
+}
+
+// tex1Dfetch for texture<ushort4, ...> references: samples at int coordinate x via
+// __ockl_image_sample_1D. Setup (i, s, texel) and the return statement presumably live in the
+// two macros used below. NOTE(review): x feeds a sampling intrinsic, not a load - confirm.
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ ushort4 tex1Dfetch(texture<ushort4, texType, mode> texRef, int x) {
+ TEXTURE_REF_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_USHORT_XYZW;
+}
+
+// tex1Dfetch for texture<int1, ...> references: samples at int coordinate x via
+// __ockl_image_sample_1D. Setup (i, s, texel) and the return statement presumably live in the
+// two macros used below. NOTE(review): x feeds a sampling intrinsic, not a load - confirm.
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ int1 tex1Dfetch(texture<int1, texType, mode> texRef, int x) {
+ TEXTURE_REF_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_INT_X;
+}
+
+// tex1Dfetch for texture<int, ...> references: samples at int coordinate x via
+// __ockl_image_sample_1D. Setup (i, s, texel) and the return statement presumably live in the
+// two macros used below. NOTE(review): x feeds a sampling intrinsic, not a load - confirm.
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ int tex1Dfetch(texture<int, texType, mode> texRef, int x) {
+ TEXTURE_REF_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_INT;
+}
+
+// tex1Dfetch for texture<int2, ...> references: samples at int coordinate x via
+// __ockl_image_sample_1D. Setup (i, s, texel) and the return statement presumably live in the
+// two macros used below. NOTE(review): x feeds a sampling intrinsic, not a load - confirm.
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ int2 tex1Dfetch(texture<int2, texType, mode> texRef, int x) {
+ TEXTURE_REF_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_INT_XY;
+}
+
+// tex1Dfetch for texture<int4, ...> references: samples at int coordinate x via
+// __ockl_image_sample_1D. Setup (i, s, texel) and the return statement presumably live in the
+// two macros used below. NOTE(review): x feeds a sampling intrinsic, not a load - confirm.
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ int4 tex1Dfetch(texture<int4, texType, mode> texRef, int x) {
+ TEXTURE_REF_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_INT_XYZW;
+}
+
+// tex1Dfetch for texture<unsigned int, ...> references: samples at int coordinate x via
+// __ockl_image_sample_1D. Setup (i, s, texel) and the return statement presumably live in the
+// two macros used below. NOTE(review): x feeds a sampling intrinsic, not a load - confirm.
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ unsigned int tex1Dfetch(texture<unsigned int, texType, mode> texRef,
+ int x) {
+ TEXTURE_REF_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_UINT;
+}
+
+// tex1Dfetch for texture<uint1, ...> references: samples at int coordinate x via
+// __ockl_image_sample_1D. Setup (i, s, texel) and the return statement presumably live in the
+// two macros used below. NOTE(review): x feeds a sampling intrinsic, not a load - confirm.
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ uint1 tex1Dfetch(texture<uint1, texType, mode> texRef, int x) {
+ TEXTURE_REF_PARAMETERS_INIT;
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_UINT_X;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1Dfetch, texture-reference form: yields uint2 for int coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ uint2 tex1Dfetch(texture<uint2, texType, mode> texRef, int x) {
+ TEXTURE_REF_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from texRef - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);  // int x is passed to the same __ockl_image_sample_1D call the float tex1D overloads use.
+ TEXTURE_RETURN_UINT_XY;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1Dfetch, texture-reference form: yields uint4 for int coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ uint4 tex1Dfetch(texture<uint4, texType, mode> texRef, int x) {
+ TEXTURE_REF_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from texRef - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);  // int x is passed to the same __ockl_image_sample_1D call the float tex1D overloads use.
+ TEXTURE_RETURN_UINT_XYZW;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1Dfetch, texture-reference form: yields float for int coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ float tex1Dfetch(texture<float, texType, mode> texRef, int x) {
+ TEXTURE_REF_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from texRef - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);  // int x is passed to the same __ockl_image_sample_1D call the float tex1D overloads use.
+ TEXTURE_RETURN_FLOAT;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1Dfetch, texture-reference form: yields float1 for int coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ float1 tex1Dfetch(texture<float1, texType, mode> texRef, int x) {
+ TEXTURE_REF_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from texRef - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);  // int x is passed to the same __ockl_image_sample_1D call the float tex1D overloads use.
+ TEXTURE_RETURN_FLOAT_X;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1Dfetch, texture-reference form: yields float2 for int coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ float2 tex1Dfetch(texture<float2, texType, mode> texRef, int x) {
+ TEXTURE_REF_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from texRef - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);  // int x is passed to the same __ockl_image_sample_1D call the float tex1D overloads use.
+ TEXTURE_RETURN_FLOAT_XY;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1Dfetch, texture-reference form: yields float4 for int coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ float4 tex1Dfetch(texture<float4, texType, mode> texRef, int x) {
+ TEXTURE_REF_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from texRef - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);  // int x is passed to the same __ockl_image_sample_1D call the float tex1D overloads use.
+ TEXTURE_RETURN_FLOAT_XYZW;  // presumably expands to the return statement - confirm.
+}
+
+////////////////////////////////////////////////////////////
+
+template <int texType, enum hipTextureReadMode mode>  // tex1Dfetch, texture-object form: yields char for int coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ char tex1Dfetch(texture<char, texType, mode> texRef,
+ hipTextureObject_t textureObject, int x) {
+ TEXTURE_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from textureObject - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);  // int x is passed to the same __ockl_image_sample_1D call the float tex1D overloads use.
+ TEXTURE_RETURN_CHAR;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1Dfetch, texture-object form: yields char1 for int coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ char1 tex1Dfetch(texture<char1, texType, mode> texRef,
+ hipTextureObject_t textureObject, int x) {
+ TEXTURE_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from textureObject - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);  // int x is passed to the same __ockl_image_sample_1D call the float tex1D overloads use.
+ TEXTURE_RETURN_CHAR_X;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1Dfetch, texture-object form: yields char2 for int coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ char2 tex1Dfetch(texture<char2, texType, mode> texRef,
+ hipTextureObject_t textureObject, int x) {
+ TEXTURE_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from textureObject - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);  // int x is passed to the same __ockl_image_sample_1D call the float tex1D overloads use.
+ TEXTURE_RETURN_CHAR_XY;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1Dfetch, texture-object form: yields char4 for int coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ char4 tex1Dfetch(texture<char4, texType, mode> texRef,
+ hipTextureObject_t textureObject, int x) {
+ TEXTURE_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from textureObject - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);  // int x is passed to the same __ockl_image_sample_1D call the float tex1D overloads use.
+ TEXTURE_RETURN_CHAR_XYZW;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1Dfetch, texture-object form: yields unsigned char for int coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ unsigned char tex1Dfetch(texture<unsigned char, texType, mode> texRef,
+ hipTextureObject_t textureObject, int x) {
+ TEXTURE_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from textureObject - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);  // int x is passed to the same __ockl_image_sample_1D call the float tex1D overloads use.
+ TEXTURE_RETURN_UCHAR;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1Dfetch, texture-object form: yields uchar1 for int coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ uchar1 tex1Dfetch(texture<uchar1, texType, mode> texRef,
+ hipTextureObject_t textureObject, int x) {
+ TEXTURE_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from textureObject - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);  // int x is passed to the same __ockl_image_sample_1D call the float tex1D overloads use.
+ TEXTURE_RETURN_UCHAR_X;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1Dfetch, texture-object form: yields uchar2 for int coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ uchar2 tex1Dfetch(texture<uchar2, texType, mode> texRef,
+ hipTextureObject_t textureObject, int x) {
+ TEXTURE_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from textureObject - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);  // int x is passed to the same __ockl_image_sample_1D call the float tex1D overloads use.
+ TEXTURE_RETURN_UCHAR_XY;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1Dfetch, texture-object form: yields uchar4 for int coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ uchar4 tex1Dfetch(texture<uchar4, texType, mode> texRef,
+ hipTextureObject_t textureObject, int x) {
+ TEXTURE_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from textureObject - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);  // int x is passed to the same __ockl_image_sample_1D call the float tex1D overloads use.
+ TEXTURE_RETURN_UCHAR_XYZW;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1Dfetch, texture-object form: yields short for int coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ short tex1Dfetch(texture<short, texType, mode> texRef,
+ hipTextureObject_t textureObject, int x) {
+ TEXTURE_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from textureObject - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);  // int x is passed to the same __ockl_image_sample_1D call the float tex1D overloads use.
+ TEXTURE_RETURN_SHORT;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1Dfetch, texture-object form: yields short1 for int coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ short1 tex1Dfetch(texture<short1, texType, mode> texRef,
+ hipTextureObject_t textureObject, int x) {
+ TEXTURE_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from textureObject - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);  // int x is passed to the same __ockl_image_sample_1D call the float tex1D overloads use.
+ TEXTURE_RETURN_SHORT_X;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1Dfetch, texture-object form: yields short2 for int coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ short2 tex1Dfetch(texture<short2, texType, mode> texRef,
+ hipTextureObject_t textureObject, int x) {
+ TEXTURE_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from textureObject - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);  // int x is passed to the same __ockl_image_sample_1D call the float tex1D overloads use.
+ TEXTURE_RETURN_SHORT_XY;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1Dfetch, texture-object form: yields short4 for int coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ short4 tex1Dfetch(texture<short4, texType, mode> texRef,
+ hipTextureObject_t textureObject, int x) {
+ TEXTURE_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from textureObject - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);  // int x is passed to the same __ockl_image_sample_1D call the float tex1D overloads use.
+ TEXTURE_RETURN_SHORT_XYZW;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1Dfetch, texture-object form: yields ushort1 for int coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ ushort1 tex1Dfetch(texture<ushort1, texType, mode> texRef,
+ hipTextureObject_t textureObject, int x) {
+ TEXTURE_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from textureObject - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);  // int x is passed to the same __ockl_image_sample_1D call the float tex1D overloads use.
+ TEXTURE_RETURN_USHORT_X;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1Dfetch, texture-object form: yields unsigned short for int coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ unsigned short tex1Dfetch(texture<unsigned short, texType, mode> texRef,
+ hipTextureObject_t textureObject, int x) {
+ TEXTURE_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from textureObject - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);  // int x is passed to the same __ockl_image_sample_1D call the float tex1D overloads use.
+ TEXTURE_RETURN_USHORT;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1Dfetch, texture-object form: yields ushort2 for int coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ ushort2 tex1Dfetch(texture<ushort2, texType, mode> texRef,
+ hipTextureObject_t textureObject, int x) {
+ TEXTURE_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from textureObject - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);  // int x is passed to the same __ockl_image_sample_1D call the float tex1D overloads use.
+ TEXTURE_RETURN_USHORT_XY;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1Dfetch, texture-object form: yields ushort4 for int coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ ushort4 tex1Dfetch(texture<ushort4, texType, mode> texRef,
+ hipTextureObject_t textureObject, int x) {
+ TEXTURE_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from textureObject - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);  // int x is passed to the same __ockl_image_sample_1D call the float tex1D overloads use.
+ TEXTURE_RETURN_USHORT_XYZW;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1Dfetch, texture-object form: yields int1 for int coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ int1 tex1Dfetch(texture<int1, texType, mode> texRef,
+ hipTextureObject_t textureObject, int x) {
+ TEXTURE_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from textureObject - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);  // int x is passed to the same __ockl_image_sample_1D call the float tex1D overloads use.
+ TEXTURE_RETURN_INT_X;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1Dfetch, texture-object form: yields int for int coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ int tex1Dfetch(texture<int, texType, mode> texRef,
+ hipTextureObject_t textureObject, int x) {
+ TEXTURE_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from textureObject - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);  // int x is passed to the same __ockl_image_sample_1D call the float tex1D overloads use.
+ TEXTURE_RETURN_INT;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1Dfetch, texture-object form: yields int2 for int coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ int2 tex1Dfetch(texture<int2, texType, mode> texRef,
+ hipTextureObject_t textureObject, int x) {
+ TEXTURE_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from textureObject - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);  // int x is passed to the same __ockl_image_sample_1D call the float tex1D overloads use.
+ TEXTURE_RETURN_INT_XY;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1Dfetch, texture-object form: yields int4 for int coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ int4 tex1Dfetch(texture<int4, texType, mode> texRef,
+ hipTextureObject_t textureObject, int x) {
+ TEXTURE_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from textureObject - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);  // int x is passed to the same __ockl_image_sample_1D call the float tex1D overloads use.
+ TEXTURE_RETURN_INT_XYZW;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1Dfetch, texture-object form: yields unsigned int for int coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ unsigned int tex1Dfetch(texture<unsigned int, texType, mode> texRef,
+ hipTextureObject_t textureObject, int x) {
+ TEXTURE_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from textureObject - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);  // int x is passed to the same __ockl_image_sample_1D call the float tex1D overloads use.
+ TEXTURE_RETURN_UINT;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1Dfetch, texture-object form: yields uint1 for int coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ uint1 tex1Dfetch(texture<uint1, texType, mode> texRef,
+ hipTextureObject_t textureObject, int x) {
+ TEXTURE_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from textureObject - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);  // int x is passed to the same __ockl_image_sample_1D call the float tex1D overloads use.
+ TEXTURE_RETURN_UINT_X;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1Dfetch, texture-object form: yields uint2 for int coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ uint2 tex1Dfetch(texture<uint2, texType, mode> texRef,
+ hipTextureObject_t textureObject, int x) {
+ TEXTURE_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from textureObject - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);  // int x is passed to the same __ockl_image_sample_1D call the float tex1D overloads use.
+ TEXTURE_RETURN_UINT_XY;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1Dfetch, texture-object form: yields uint4 for int coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ uint4 tex1Dfetch(texture<uint4, texType, mode> texRef,
+ hipTextureObject_t textureObject, int x) {
+ TEXTURE_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from textureObject - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);  // int x is passed to the same __ockl_image_sample_1D call the float tex1D overloads use.
+ TEXTURE_RETURN_UINT_XYZW;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1Dfetch, texture-object form: yields float for int coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ float tex1Dfetch(texture<float, texType, mode> texRef,
+ hipTextureObject_t textureObject, int x) {
+ TEXTURE_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from textureObject - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);  // int x is passed to the same __ockl_image_sample_1D call the float tex1D overloads use.
+ TEXTURE_RETURN_FLOAT;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1Dfetch, texture-object form: yields float1 for int coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ float1 tex1Dfetch(texture<float1, texType, mode> texRef,
+ hipTextureObject_t textureObject, int x) {
+ TEXTURE_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from textureObject - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);  // int x is passed to the same __ockl_image_sample_1D call the float tex1D overloads use.
+ TEXTURE_RETURN_FLOAT_X;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1Dfetch, texture-object form: yields float2 for int coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ float2 tex1Dfetch(texture<float2, texType, mode> texRef,
+ hipTextureObject_t textureObject, int x) {
+ TEXTURE_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from textureObject - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);  // int x is passed to the same __ockl_image_sample_1D call the float tex1D overloads use.
+ TEXTURE_RETURN_FLOAT_XY;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1Dfetch, texture-object form: yields float4 for int coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ float4 tex1Dfetch(texture<float4, texType, mode> texRef,
+ hipTextureObject_t textureObject, int x) {
+ TEXTURE_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from textureObject - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);  // int x is passed to the same __ockl_image_sample_1D call the float tex1D overloads use.
+ TEXTURE_RETURN_FLOAT_XYZW;  // presumably expands to the return statement - confirm.
+}
+
+////////////////////////////////////////////////////////////
+template <int texType, enum hipTextureReadMode mode>  // tex1D, texture-reference form: yields char for float coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ char tex1D(texture<char, texType, mode> texRef, float x) {
+ TEXTURE_REF_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from texRef - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_CHAR;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1D, texture-reference form: yields char1 for float coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ char1 tex1D(texture<char1, texType, mode> texRef, float x) {
+ TEXTURE_REF_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from texRef - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_CHAR_X;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1D, texture-reference form: yields char2 for float coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ char2 tex1D(texture<char2, texType, mode> texRef, float x) {
+ TEXTURE_REF_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from texRef - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_CHAR_XY;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1D, texture-reference form: yields char4 for float coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ char4 tex1D(texture<char4, texType, mode> texRef, float x) {
+ TEXTURE_REF_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from texRef - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_CHAR_XYZW;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1D, texture-reference form: yields unsigned char for float coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ unsigned char tex1D(texture<unsigned char, texType, mode> texRef,
+ float x) {
+ TEXTURE_REF_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from texRef - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_UCHAR;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1D, texture-reference form: yields uchar1 for float coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ uchar1 tex1D(texture<uchar1, texType, mode> texRef, float x) {
+ TEXTURE_REF_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from texRef - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_UCHAR_X;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1D, texture-reference form: yields uchar2 for float coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ uchar2 tex1D(texture<uchar2, texType, mode> texRef, float x) {
+ TEXTURE_REF_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from texRef - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_UCHAR_XY;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1D, texture-reference form: yields uchar4 for float coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ uchar4 tex1D(texture<uchar4, texType, mode> texRef, float x) {
+ TEXTURE_REF_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from texRef - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_UCHAR_XYZW;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1D, texture-reference form: yields short for float coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ short tex1D(texture<short, texType, mode> texRef, float x) {
+ TEXTURE_REF_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from texRef - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_SHORT;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1D, texture-reference form: yields short1 for float coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ short1 tex1D(texture<short1, texType, mode> texRef, float x) {
+ TEXTURE_REF_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from texRef - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_SHORT_X;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1D, texture-reference form: yields short2 for float coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ short2 tex1D(texture<short2, texType, mode> texRef, float x) {
+ TEXTURE_REF_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from texRef - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_SHORT_XY;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1D, texture-reference form: yields short4 for float coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ short4 tex1D(texture<short4, texType, mode> texRef, float x) {
+ TEXTURE_REF_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from texRef - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_SHORT_XYZW;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1D, texture-reference form: yields unsigned short for float coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ unsigned short tex1D(texture<unsigned short, texType, mode> texRef,
+ float x) {
+ TEXTURE_REF_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from texRef - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_USHORT;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1D, texture-reference form: yields ushort1 for float coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ ushort1 tex1D(texture<ushort1, texType, mode> texRef, float x) {
+ TEXTURE_REF_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from texRef - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_USHORT_X;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1D, texture-reference form: yields ushort2 for float coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ ushort2 tex1D(texture<ushort2, texType, mode> texRef, float x) {
+ TEXTURE_REF_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from texRef - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_USHORT_XY;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1D, texture-reference form: yields ushort4 for float coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ ushort4 tex1D(texture<ushort4, texType, mode> texRef, float x) {
+ TEXTURE_REF_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from texRef - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_USHORT_XYZW;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1D, texture-reference form: yields int for float coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ int tex1D(texture<int, texType, mode> texRef, float x) {
+ TEXTURE_REF_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from texRef - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_INT;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1D, texture-reference form: yields int1 for float coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ int1 tex1D(texture<int1, texType, mode> texRef, float x) {
+ TEXTURE_REF_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from texRef - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_INT_X;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1D, texture-reference form: yields int2 for float coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ int2 tex1D(texture<int2, texType, mode> texRef, float x) {
+ TEXTURE_REF_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from texRef - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_INT_XY;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1D, texture-reference form: yields int4 for float coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ int4 tex1D(texture<int4, texType, mode> texRef, float x) {
+ TEXTURE_REF_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from texRef - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_INT_XYZW;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1D, texture-reference form: yields unsigned int for float coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ unsigned int tex1D(texture<unsigned int, texType, mode> texRef, float x) {
+ TEXTURE_REF_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from texRef - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_UINT;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1D, texture-reference form: yields uint1 for float coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ uint1 tex1D(texture<uint1, texType, mode> texRef, float x) {
+ TEXTURE_REF_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from texRef - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_UINT_X;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1D, texture-reference form: yields uint2 for float coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ uint2 tex1D(texture<uint2, texType, mode> texRef, float x) {
+ TEXTURE_REF_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from texRef - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_UINT_XY;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1D, texture-reference form: yields uint4 for float coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ uint4 tex1D(texture<uint4, texType, mode> texRef, float x) {
+ TEXTURE_REF_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from texRef - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_UINT_XYZW;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1D, texture-reference form: yields float1 for float coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ float1 tex1D(texture<float1, texType, mode> texRef, float x) {
+ TEXTURE_REF_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from texRef - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_FLOAT_X;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1D, texture-reference form: yields float2 for float coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ float2 tex1D(texture<float2, texType, mode> texRef, float x) {
+ TEXTURE_REF_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from texRef - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_FLOAT_XY;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1D, texture-reference form: yields float4 for float coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ float4 tex1D(texture<float4, texType, mode> texRef, float x) {
+ TEXTURE_REF_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from texRef - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_FLOAT_XYZW;  // presumably expands to the return statement - confirm.
+}
+
+////////////////////////////////////////////////////////////
+template <int texType, enum hipTextureReadMode mode>  // tex1D, texture-object form: yields char for float coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ char tex1D(texture<char, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x) {
+ TEXTURE_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from textureObject - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_CHAR;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1D, texture-object form: yields char1 for float coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ char1 tex1D(texture<char1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x) {
+ TEXTURE_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from textureObject - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_CHAR_X;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1D, texture-object form: yields char2 for float coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ char2 tex1D(texture<char2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x) {
+ TEXTURE_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from textureObject - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_CHAR_XY;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1D, texture-object form: yields char4 for float coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ char4 tex1D(texture<char4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x) {
+ TEXTURE_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from textureObject - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_CHAR_XYZW;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1D, texture-object form: yields unsigned char for float coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ unsigned char tex1D(texture<unsigned char, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x) {
+ TEXTURE_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from textureObject - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_UCHAR;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1D, texture-object form: yields uchar1 for float coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ uchar1 tex1D(texture<uchar1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x) {
+ TEXTURE_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from textureObject - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_UCHAR_X;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1D, texture-object form: yields uchar2 for float coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ uchar2 tex1D(texture<uchar2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x) {
+ TEXTURE_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from textureObject - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_UCHAR_XY;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1D, texture-object form: yields uchar4 for float coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ uchar4 tex1D(texture<uchar4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x) {
+ TEXTURE_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from textureObject - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_UCHAR_XYZW;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1D, texture-object form: yields short for float coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ short tex1D(texture<short, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x) {
+ TEXTURE_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from textureObject - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_SHORT;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1D, texture-object form: yields short1 for float coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ short1 tex1D(texture<short1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x) {
+ TEXTURE_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from textureObject - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_SHORT_X;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1D, texture-object form: yields short2 for float coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ short2 tex1D(texture<short2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x) {
+ TEXTURE_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from textureObject - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_SHORT_XY;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1D, texture-object form: yields short4 for float coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ short4 tex1D(texture<short4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x) {
+ TEXTURE_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from textureObject - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_SHORT_XYZW;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1D, texture-object form: yields unsigned short for float coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ unsigned short tex1D(texture<unsigned short, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x) {
+ TEXTURE_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from textureObject - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_USHORT;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1D, texture-object form: yields ushort1 for float coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ ushort1 tex1D(texture<ushort1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x) {
+ TEXTURE_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from textureObject - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_USHORT_X;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1D, texture-object form: yields ushort2 for float coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ ushort2 tex1D(texture<ushort2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x) {
+ TEXTURE_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from textureObject - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_USHORT_XY;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1D, texture-object form: yields ushort4 for float coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ ushort4 tex1D(texture<ushort4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x) {
+ TEXTURE_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from textureObject - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_USHORT_XYZW;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1D, texture-object form: yields int for float coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ int tex1D(texture<int, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x) {
+ TEXTURE_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from textureObject - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_INT;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1D, texture-object form: yields int1 for float coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ int1 tex1D(texture<int1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x) {
+ TEXTURE_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from textureObject - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_INT_X;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1D, texture-object form: yields int2 for float coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ int2 tex1D(texture<int2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x) {
+ TEXTURE_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from textureObject - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_INT_XY;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1D, texture-object form: yields int4 for float coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ int4 tex1D(texture<int4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x) {
+ TEXTURE_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from textureObject - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_INT_XYZW;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1D, texture-object form: yields unsigned int for float coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ unsigned int tex1D(texture<unsigned int, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x) {
+ TEXTURE_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from textureObject - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_UINT;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1D, texture-object form: yields uint1 for float coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ uint1 tex1D(texture<uint1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x) {
+ TEXTURE_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from textureObject - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_UINT_X;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1D, texture-object form: yields uint2 for float coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ uint2 tex1D(texture<uint2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x) {
+ TEXTURE_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from textureObject - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_UINT_XY;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1D, texture-object form: yields uint4 for float coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ uint4 tex1D(texture<uint4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x) {
+ TEXTURE_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from textureObject - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_UINT_XYZW;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1D, texture-object form: yields float for float coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ float tex1D(texture<float, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x) {
+ TEXTURE_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from textureObject - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_FLOAT;  // presumably expands to the return statement - confirm.
+}
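+////// float overload of tex1D, texture-reference form (the other texture-reference overloads are defined earlier in this header).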
+
+template <int texType, enum hipTextureReadMode mode>  // tex1D, texture-reference form: yields float for float coordinate x (placed among the texture-object overloads).
+__TEXTURE_FUNCTIONS_DECL__ float tex1D(texture<float, texType, mode> texRef, float x) {
+ TEXTURE_REF_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from texRef - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_FLOAT;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1D, texture-object form: yields float1 for float coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ float1 tex1D(texture<float1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x) {
+ TEXTURE_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from textureObject - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_FLOAT_X;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1D, texture-object form: yields float2 for float coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ float2 tex1D(texture<float2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x) {
+ TEXTURE_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from textureObject - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_FLOAT_XY;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1D, texture-object form: yields float4 for float coordinate x.
+__TEXTURE_FUNCTIONS_DECL__ float4 tex1D(texture<float4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x) {
+ TEXTURE_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from textureObject - confirm.
+ texel.f = __ockl_image_sample_1D(i, s, x);
+ TEXTURE_RETURN_FLOAT_XYZW;  // presumably expands to the return statement - confirm.
+}
+
+////////////////////////////////////////////////////////////
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLod, texture-reference form: yields char for float coordinate x at the given level.
+__TEXTURE_FUNCTIONS_DECL__ char tex1DLod(texture<char, texType, mode> texRef, float x,
+ float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from texRef - confirm.
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+ TEXTURE_RETURN_CHAR;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLod, texture-reference form: yields char1 for float coordinate x at the given level.
+__TEXTURE_FUNCTIONS_DECL__ char1 tex1DLod(texture<char1, texType, mode> texRef, float x,
+ float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from texRef - confirm.
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+ TEXTURE_RETURN_CHAR_X;  // presumably expands to the return statement - confirm.
+}
+template <int texType, enum hipTextureReadMode mode>  // tex1DLod, texture-reference form: yields char2 for float coordinate x at the given level.
+__TEXTURE_FUNCTIONS_DECL__ char2 tex1DLod(texture<char2, texType, mode> texRef, float x,
+ float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from texRef - confirm.
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+ TEXTURE_RETURN_CHAR_XY;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLod, texture-reference form: yields char4 for float coordinate x at the given level.
+__TEXTURE_FUNCTIONS_DECL__ char4 tex1DLod(texture<char4, texType, mode> texRef, float x,
+ float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from texRef - confirm.
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+ TEXTURE_RETURN_CHAR_XYZW;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLod, texture-reference form: yields unsigned char for float coordinate x at the given level.
+__TEXTURE_FUNCTIONS_DECL__ unsigned char tex1DLod(texture<unsigned char, texType, mode> texRef,
+ float x, float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from texRef - confirm.
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+ TEXTURE_RETURN_UCHAR;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLod, texture-reference form: yields uchar1 for float coordinate x at the given level.
+__TEXTURE_FUNCTIONS_DECL__ uchar1 tex1DLod(texture<uchar1, texType, mode> texRef, float x,
+ float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from texRef - confirm.
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+ TEXTURE_RETURN_UCHAR_X;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLod, texture-reference form: yields uchar2 for float coordinate x at the given level.
+__TEXTURE_FUNCTIONS_DECL__ uchar2 tex1DLod(texture<uchar2, texType, mode> texRef, float x,
+ float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from texRef - confirm.
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+ TEXTURE_RETURN_UCHAR_XY;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLod, texture-reference form: yields uchar4 for float coordinate x at the given level.
+__TEXTURE_FUNCTIONS_DECL__ uchar4 tex1DLod(texture<uchar4, texType, mode> texRef, float x,
+ float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from texRef - confirm.
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+ TEXTURE_RETURN_UCHAR_XYZW;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLod, texture-reference form: yields short for float coordinate x at the given level.
+__TEXTURE_FUNCTIONS_DECL__ short tex1DLod(texture<short, texType, mode> texRef, float x,
+ float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from texRef - confirm.
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+ TEXTURE_RETURN_SHORT;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLod, texture-reference form: yields short1 for float coordinate x at the given level.
+__TEXTURE_FUNCTIONS_DECL__ short1 tex1DLod(texture<short1, texType, mode> texRef, float x,
+ float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from texRef - confirm.
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+ TEXTURE_RETURN_SHORT_X;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLod, texture-reference form: yields short2 for float coordinate x at the given level.
+__TEXTURE_FUNCTIONS_DECL__ short2 tex1DLod(texture<short2, texType, mode> texRef, float x,
+ float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from texRef - confirm.
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+ TEXTURE_RETURN_SHORT_XY;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLod, texture-reference form: yields short4 for float coordinate x at the given level.
+__TEXTURE_FUNCTIONS_DECL__ short4 tex1DLod(texture<short4, texType, mode> texRef, float x,
+ float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from texRef - confirm.
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+ TEXTURE_RETURN_SHORT_XYZW;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLod, texture-reference form: yields unsigned short for float coordinate x at the given level.
+__TEXTURE_FUNCTIONS_DECL__ unsigned short tex1DLod(texture<unsigned short, texType, mode> texRef,
+ float x, float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from texRef - confirm.
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+ TEXTURE_RETURN_USHORT;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLod, texture-reference form: yields ushort1 for float coordinate x at the given level.
+__TEXTURE_FUNCTIONS_DECL__ ushort1 tex1DLod(texture<ushort1, texType, mode> texRef, float x,
+ float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from texRef - confirm.
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+ TEXTURE_RETURN_USHORT_X;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLod, texture-reference form: yields ushort2 for float coordinate x at the given level.
+__TEXTURE_FUNCTIONS_DECL__ ushort2 tex1DLod(texture<ushort2, texType, mode> texRef, float x,
+ float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from texRef - confirm.
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+ TEXTURE_RETURN_USHORT_XY;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLod, texture-reference form: yields ushort4 for float coordinate x at the given level.
+__TEXTURE_FUNCTIONS_DECL__ ushort4 tex1DLod(texture<ushort4, texType, mode> texRef, float x,
+ float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from texRef - confirm.
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+ TEXTURE_RETURN_USHORT_XYZW;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLod, texture-reference form: yields int for float coordinate x at the given level.
+__TEXTURE_FUNCTIONS_DECL__ int tex1DLod(texture<int, texType, mode> texRef, float x, float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from texRef - confirm.
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+ TEXTURE_RETURN_INT;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLod, texture-reference form: yields int1 for float coordinate x at the given level.
+__TEXTURE_FUNCTIONS_DECL__ int1 tex1DLod(texture<int1, texType, mode> texRef, float x,
+ float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from texRef - confirm.
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+ TEXTURE_RETURN_INT_X;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLod, texture-reference form: yields int2 for float coordinate x at the given level.
+__TEXTURE_FUNCTIONS_DECL__ int2 tex1DLod(texture<int2, texType, mode> texRef, float x,
+ float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from texRef - confirm.
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+ TEXTURE_RETURN_INT_XY;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLod, texture-reference form: yields int4 for float coordinate x at the given level.
+__TEXTURE_FUNCTIONS_DECL__ int4 tex1DLod(texture<int4, texType, mode> texRef, float x,
+ float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from texRef - confirm.
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+ TEXTURE_RETURN_INT_XYZW;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLod, texture-reference form: yields unsigned int for float coordinate x at the given level.
+__TEXTURE_FUNCTIONS_DECL__ unsigned int tex1DLod(texture<unsigned int, texType, mode> texRef,
+ float x, float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from texRef - confirm.
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+ TEXTURE_RETURN_UINT;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLod, texture-reference form: yields uint1 for float coordinate x at the given level.
+__TEXTURE_FUNCTIONS_DECL__ uint1 tex1DLod(texture<uint1, texType, mode> texRef, float x,
+ float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from texRef - confirm.
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+ TEXTURE_RETURN_UINT_X;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLod, texture-reference form: yields uint2 for float coordinate x at the given level.
+__TEXTURE_FUNCTIONS_DECL__ uint2 tex1DLod(texture<uint2, texType, mode> texRef, float x,
+ float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from texRef - confirm.
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+ TEXTURE_RETURN_UINT_XY;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLod, texture-reference form: yields uint4 for float coordinate x at the given level.
+__TEXTURE_FUNCTIONS_DECL__ uint4 tex1DLod(texture<uint4, texType, mode> texRef, float x,
+ float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from texRef - confirm.
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+ TEXTURE_RETURN_UINT_XYZW;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLod, texture-reference form: yields float for float coordinate x at the given level.
+__TEXTURE_FUNCTIONS_DECL__ float tex1DLod(texture<float, texType, mode> texRef, float x,
+ float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // NOTE(review): macro defined elsewhere; presumably derives i, s, texel from texRef - confirm.
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);
+ TEXTURE_RETURN_FLOAT;  // presumably expands to the return statement - confirm.
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLod overload for a texture<float1> reference; returns float1.
+__TEXTURE_FUNCTIONS_DECL__ float1 tex1DLod(texture<float1, texType, mode> texRef, float x,
+ float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for texRef (TODO confirm)
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);  // forwards x and level; result is stored in texel.f
+ TEXTURE_RETURN_FLOAT_X;  // macro defined outside this chunk; presumably returns texel as float1 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLod overload for a texture<float2> reference; returns float2.
+__TEXTURE_FUNCTIONS_DECL__ float2 tex1DLod(texture<float2, texType, mode> texRef, float x,
+ float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for texRef (TODO confirm)
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);  // forwards x and level; result is stored in texel.f
+ TEXTURE_RETURN_FLOAT_XY;  // macro defined outside this chunk; presumably returns texel as float2 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLod overload for a texture<float4> reference; returns float4.
+__TEXTURE_FUNCTIONS_DECL__ float4 tex1DLod(texture<float4, texType, mode> texRef, float x,
+ float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for texRef (TODO confirm)
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);  // forwards x and level; result is stored in texel.f
+ TEXTURE_RETURN_FLOAT_XYZW;  // macro defined outside this chunk; presumably returns texel as float4 (TODO confirm)
+}
+
+////////////////////////////////////////////////////////////
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLod overload for texture<char> that also takes a hipTextureObject_t; returns char.
+__TEXTURE_FUNCTIONS_DECL__ char tex1DLod(texture<char, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float level) {
+ TEXTURE_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for textureObject (TODO confirm)
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);  // forwards x and level; result is stored in texel.f
+ TEXTURE_RETURN_CHAR;  // macro defined outside this chunk; presumably returns texel as char (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLod overload for texture<char1> that also takes a hipTextureObject_t; returns char1.
+__TEXTURE_FUNCTIONS_DECL__ char1 tex1DLod(texture<char1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float level) {
+ TEXTURE_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for textureObject (TODO confirm)
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);  // forwards x and level; result is stored in texel.f
+ TEXTURE_RETURN_CHAR_X;  // macro defined outside this chunk; presumably returns texel as char1 (TODO confirm)
+}
+template <int texType, enum hipTextureReadMode mode>  // tex1DLod overload for texture<char2> that also takes a hipTextureObject_t; returns char2.
+__TEXTURE_FUNCTIONS_DECL__ char2 tex1DLod(texture<char2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float level) {
+ TEXTURE_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for textureObject (TODO confirm)
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);  // forwards x and level; result is stored in texel.f
+ TEXTURE_RETURN_CHAR_XY;  // macro defined outside this chunk; presumably returns texel as char2 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLod overload for texture<char4> that also takes a hipTextureObject_t; returns char4.
+__TEXTURE_FUNCTIONS_DECL__ char4 tex1DLod(texture<char4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float level) {
+ TEXTURE_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for textureObject (TODO confirm)
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);  // forwards x and level; result is stored in texel.f
+ TEXTURE_RETURN_CHAR_XYZW;  // macro defined outside this chunk; presumably returns texel as char4 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLod overload for texture<unsigned char> that also takes a hipTextureObject_t; returns unsigned char.
+__TEXTURE_FUNCTIONS_DECL__ unsigned char tex1DLod(texture<unsigned char, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ float level) {
+ TEXTURE_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for textureObject (TODO confirm)
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);  // forwards x and level; result is stored in texel.f
+ TEXTURE_RETURN_UCHAR;  // macro defined outside this chunk; presumably returns texel as unsigned char (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLod overload for texture<uchar1> that also takes a hipTextureObject_t; returns uchar1.
+__TEXTURE_FUNCTIONS_DECL__ uchar1 tex1DLod(texture<uchar1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float level) {
+ TEXTURE_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for textureObject (TODO confirm)
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);  // forwards x and level; result is stored in texel.f
+ TEXTURE_RETURN_UCHAR_X;  // macro defined outside this chunk; presumably returns texel as uchar1 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLod overload for texture<uchar2> that also takes a hipTextureObject_t; returns uchar2.
+__TEXTURE_FUNCTIONS_DECL__ uchar2 tex1DLod(texture<uchar2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float level) {
+ TEXTURE_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for textureObject (TODO confirm)
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);  // forwards x and level; result is stored in texel.f
+ TEXTURE_RETURN_UCHAR_XY;  // macro defined outside this chunk; presumably returns texel as uchar2 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLod overload for texture<uchar4> that also takes a hipTextureObject_t; returns uchar4.
+__TEXTURE_FUNCTIONS_DECL__ uchar4 tex1DLod(texture<uchar4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float level) {
+ TEXTURE_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for textureObject (TODO confirm)
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);  // forwards x and level; result is stored in texel.f
+ TEXTURE_RETURN_UCHAR_XYZW;  // macro defined outside this chunk; presumably returns texel as uchar4 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLod overload for texture<short> that also takes a hipTextureObject_t; returns short.
+__TEXTURE_FUNCTIONS_DECL__ short tex1DLod(texture<short, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float level) {
+ TEXTURE_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for textureObject (TODO confirm)
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);  // forwards x and level; result is stored in texel.f
+ TEXTURE_RETURN_SHORT;  // macro defined outside this chunk; presumably returns texel as short (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLod overload for texture<short1> that also takes a hipTextureObject_t; returns short1.
+__TEXTURE_FUNCTIONS_DECL__ short1 tex1DLod(texture<short1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float level) {
+ TEXTURE_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for textureObject (TODO confirm)
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);  // forwards x and level; result is stored in texel.f
+ TEXTURE_RETURN_SHORT_X;  // macro defined outside this chunk; presumably returns texel as short1 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLod overload for texture<short2> that also takes a hipTextureObject_t; returns short2.
+__TEXTURE_FUNCTIONS_DECL__ short2 tex1DLod(texture<short2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float level) {
+ TEXTURE_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for textureObject (TODO confirm)
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);  // forwards x and level; result is stored in texel.f
+ TEXTURE_RETURN_SHORT_XY;  // macro defined outside this chunk; presumably returns texel as short2 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLod overload for texture<short4> that also takes a hipTextureObject_t; returns short4.
+__TEXTURE_FUNCTIONS_DECL__ short4 tex1DLod(texture<short4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float level) {
+ TEXTURE_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for textureObject (TODO confirm)
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);  // forwards x and level; result is stored in texel.f
+ TEXTURE_RETURN_SHORT_XYZW;  // macro defined outside this chunk; presumably returns texel as short4 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLod overload for texture<unsigned short> that also takes a hipTextureObject_t; returns unsigned short.
+__TEXTURE_FUNCTIONS_DECL__ unsigned short tex1DLod(texture<unsigned short, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ float level) {
+ TEXTURE_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for textureObject (TODO confirm)
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);  // forwards x and level; result is stored in texel.f
+ TEXTURE_RETURN_USHORT;  // macro defined outside this chunk; presumably returns texel as unsigned short (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLod overload for texture<ushort1> that also takes a hipTextureObject_t; returns ushort1.
+__TEXTURE_FUNCTIONS_DECL__ ushort1 tex1DLod(texture<ushort1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ float level) {
+ TEXTURE_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for textureObject (TODO confirm)
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);  // forwards x and level; result is stored in texel.f
+ TEXTURE_RETURN_USHORT_X;  // macro defined outside this chunk; presumably returns texel as ushort1 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLod overload for texture<ushort2> that also takes a hipTextureObject_t; returns ushort2.
+__TEXTURE_FUNCTIONS_DECL__ ushort2 tex1DLod(texture<ushort2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ float level) {
+ TEXTURE_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for textureObject (TODO confirm)
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);  // forwards x and level; result is stored in texel.f
+ TEXTURE_RETURN_USHORT_XY;  // macro defined outside this chunk; presumably returns texel as ushort2 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLod overload for texture<ushort4> that also takes a hipTextureObject_t; returns ushort4.
+__TEXTURE_FUNCTIONS_DECL__ ushort4 tex1DLod(texture<ushort4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ float level) {
+ TEXTURE_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for textureObject (TODO confirm)
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);  // forwards x and level; result is stored in texel.f
+ TEXTURE_RETURN_USHORT_XYZW;  // macro defined outside this chunk; presumably returns texel as ushort4 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLod overload for texture<int> that also takes a hipTextureObject_t; returns int.
+__TEXTURE_FUNCTIONS_DECL__ int tex1DLod(texture<int, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float level) {
+ TEXTURE_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for textureObject (TODO confirm)
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);  // forwards x and level; result is stored in texel.f
+ TEXTURE_RETURN_INT;  // macro defined outside this chunk; presumably returns texel as int (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLod overload for texture<int1> that also takes a hipTextureObject_t; returns int1.
+__TEXTURE_FUNCTIONS_DECL__ int1 tex1DLod(texture<int1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float level) {
+ TEXTURE_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for textureObject (TODO confirm)
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);  // forwards x and level; result is stored in texel.f
+ TEXTURE_RETURN_INT_X;  // macro defined outside this chunk; presumably returns texel as int1 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLod overload for texture<int2> that also takes a hipTextureObject_t; returns int2.
+__TEXTURE_FUNCTIONS_DECL__ int2 tex1DLod(texture<int2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float level) {
+ TEXTURE_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for textureObject (TODO confirm)
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);  // forwards x and level; result is stored in texel.f
+ TEXTURE_RETURN_INT_XY;  // macro defined outside this chunk; presumably returns texel as int2 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLod overload for texture<int4> that also takes a hipTextureObject_t; returns int4.
+__TEXTURE_FUNCTIONS_DECL__ int4 tex1DLod(texture<int4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float level) {
+ TEXTURE_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for textureObject (TODO confirm)
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);  // forwards x and level; result is stored in texel.f
+ TEXTURE_RETURN_INT_XYZW;  // macro defined outside this chunk; presumably returns texel as int4 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLod overload for texture<unsigned int> that also takes a hipTextureObject_t; returns unsigned int.
+__TEXTURE_FUNCTIONS_DECL__ unsigned int tex1DLod(texture<unsigned int, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ float level) {
+ TEXTURE_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for textureObject (TODO confirm)
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);  // forwards x and level; result is stored in texel.f
+ TEXTURE_RETURN_UINT;  // macro defined outside this chunk; presumably returns texel as unsigned int (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLod overload for texture<uint1> that also takes a hipTextureObject_t; returns uint1.
+__TEXTURE_FUNCTIONS_DECL__ uint1 tex1DLod(texture<uint1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float level) {
+ TEXTURE_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for textureObject (TODO confirm)
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);  // forwards x and level; result is stored in texel.f
+ TEXTURE_RETURN_UINT_X;  // macro defined outside this chunk; presumably returns texel as uint1 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLod overload for texture<uint2> that also takes a hipTextureObject_t; returns uint2.
+__TEXTURE_FUNCTIONS_DECL__ uint2 tex1DLod(texture<uint2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float level) {
+ TEXTURE_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for textureObject (TODO confirm)
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);  // forwards x and level; result is stored in texel.f
+ TEXTURE_RETURN_UINT_XY;  // macro defined outside this chunk; presumably returns texel as uint2 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLod overload for texture<uint4> that also takes a hipTextureObject_t; returns uint4.
+__TEXTURE_FUNCTIONS_DECL__ uint4 tex1DLod(texture<uint4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float level) {
+ TEXTURE_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for textureObject (TODO confirm)
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);  // forwards x and level; result is stored in texel.f
+ TEXTURE_RETURN_UINT_XYZW;  // macro defined outside this chunk; presumably returns texel as uint4 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLod overload for texture<float> that also takes a hipTextureObject_t; returns float.
+__TEXTURE_FUNCTIONS_DECL__ float tex1DLod(texture<float, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float level) {
+ TEXTURE_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for textureObject (TODO confirm)
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);  // forwards x and level; result is stored in texel.f
+ TEXTURE_RETURN_FLOAT;  // macro defined outside this chunk; presumably returns texel as float (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLod overload for texture<float1> that also takes a hipTextureObject_t; returns float1.
+__TEXTURE_FUNCTIONS_DECL__ float1 tex1DLod(texture<float1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float level) {
+ TEXTURE_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for textureObject (TODO confirm)
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);  // forwards x and level; result is stored in texel.f
+ TEXTURE_RETURN_FLOAT_X;  // macro defined outside this chunk; presumably returns texel as float1 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLod overload for texture<float2> that also takes a hipTextureObject_t; returns float2.
+__TEXTURE_FUNCTIONS_DECL__ float2 tex1DLod(texture<float2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float level) {
+ TEXTURE_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for textureObject (TODO confirm)
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);  // forwards x and level; result is stored in texel.f
+ TEXTURE_RETURN_FLOAT_XY;  // macro defined outside this chunk; presumably returns texel as float2 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLod overload for texture<float4> that also takes a hipTextureObject_t; returns float4.
+__TEXTURE_FUNCTIONS_DECL__ float4 tex1DLod(texture<float4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float level) {
+ TEXTURE_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for textureObject (TODO confirm)
+ texel.f = __ockl_image_sample_lod_1D(i, s, x, level);  // forwards x and level; result is stored in texel.f
+ TEXTURE_RETURN_FLOAT_XYZW;  // macro defined outside this chunk; presumably returns texel as float4 (TODO confirm)
+}
+
+////////////////////////////////////////////////////////////
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DGrad overload for a texture<char> reference; returns char.
+__TEXTURE_FUNCTIONS_DECL__ char tex1DGrad(texture<char, texType, mode> texRef, float x, float dx,
+ float dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for texRef (TODO confirm)
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);  // forwards x, dx and dy; result is stored in texel.f
+ TEXTURE_RETURN_CHAR;  // macro defined outside this chunk; presumably returns texel as char (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DGrad overload for a texture<char1> reference; returns char1.
+__TEXTURE_FUNCTIONS_DECL__ char1 tex1DGrad(texture<char1, texType, mode> texRef, float x, float dx,
+ float dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for texRef (TODO confirm)
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);  // forwards x, dx and dy; result is stored in texel.f
+ TEXTURE_RETURN_CHAR_X;  // macro defined outside this chunk; presumably returns texel as char1 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DGrad overload for a texture<char2> reference; returns char2.
+__TEXTURE_FUNCTIONS_DECL__ char2 tex1DGrad(texture<char2, texType, mode> texRef, float x, float dx,
+ float dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for texRef (TODO confirm)
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);  // forwards x, dx and dy; result is stored in texel.f
+ TEXTURE_RETURN_CHAR_XY;  // macro defined outside this chunk; presumably returns texel as char2 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DGrad overload for a texture<char4> reference; returns char4.
+__TEXTURE_FUNCTIONS_DECL__ char4 tex1DGrad(texture<char4, texType, mode> texRef, float x, float dx,
+ float dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for texRef (TODO confirm)
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);  // forwards x, dx and dy; result is stored in texel.f
+ TEXTURE_RETURN_CHAR_XYZW;  // macro defined outside this chunk; presumably returns texel as char4 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DGrad overload for a texture<unsigned char> reference; returns unsigned char.
+__TEXTURE_FUNCTIONS_DECL__ unsigned char tex1DGrad(texture<unsigned char, texType, mode> texRef,
+ float x, float dx, float dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for texRef (TODO confirm)
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);  // forwards x, dx and dy; result is stored in texel.f
+ TEXTURE_RETURN_UCHAR;  // macro defined outside this chunk; presumably returns texel as unsigned char (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DGrad overload for a texture<uchar1> reference; returns uchar1.
+__TEXTURE_FUNCTIONS_DECL__ uchar1 tex1DGrad(texture<uchar1, texType, mode> texRef, float x,
+ float dx, float dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for texRef (TODO confirm)
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);  // forwards x, dx and dy; result is stored in texel.f
+ TEXTURE_RETURN_UCHAR_X;  // macro defined outside this chunk; presumably returns texel as uchar1 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DGrad overload for a texture<uchar2> reference; returns uchar2.
+__TEXTURE_FUNCTIONS_DECL__ uchar2 tex1DGrad(texture<uchar2, texType, mode> texRef, float x,
+ float dx, float dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for texRef (TODO confirm)
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);  // forwards x, dx and dy; result is stored in texel.f
+ TEXTURE_RETURN_UCHAR_XY;  // macro defined outside this chunk; presumably returns texel as uchar2 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DGrad overload for a texture<uchar4> reference; returns uchar4.
+__TEXTURE_FUNCTIONS_DECL__ uchar4 tex1DGrad(texture<uchar4, texType, mode> texRef, float x,
+ float dx, float dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for texRef (TODO confirm)
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);  // forwards x, dx and dy; result is stored in texel.f
+ TEXTURE_RETURN_UCHAR_XYZW;  // macro defined outside this chunk; presumably returns texel as uchar4 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DGrad overload for a texture<short> reference; returns short.
+__TEXTURE_FUNCTIONS_DECL__ short tex1DGrad(texture<short, texType, mode> texRef, float x, float dx,
+ float dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for texRef (TODO confirm)
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);  // forwards x, dx and dy; result is stored in texel.f
+ TEXTURE_RETURN_SHORT;  // macro defined outside this chunk; presumably returns texel as short (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DGrad overload for a texture<short1> reference; returns short1.
+__TEXTURE_FUNCTIONS_DECL__ short1 tex1DGrad(texture<short1, texType, mode> texRef, float x,
+ float dx, float dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for texRef (TODO confirm)
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);  // forwards x, dx and dy; result is stored in texel.f
+ TEXTURE_RETURN_SHORT_X;  // macro defined outside this chunk; presumably returns texel as short1 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DGrad overload for a texture<short2> reference; returns short2.
+__TEXTURE_FUNCTIONS_DECL__ short2 tex1DGrad(texture<short2, texType, mode> texRef, float x,
+ float dx, float dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for texRef (TODO confirm)
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);  // forwards x, dx and dy; result is stored in texel.f
+ TEXTURE_RETURN_SHORT_XY;  // macro defined outside this chunk; presumably returns texel as short2 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DGrad overload for a texture<short4> reference; returns short4.
+__TEXTURE_FUNCTIONS_DECL__ short4 tex1DGrad(texture<short4, texType, mode> texRef, float x,
+ float dx, float dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for texRef (TODO confirm)
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);  // forwards x, dx and dy; result is stored in texel.f
+ TEXTURE_RETURN_SHORT_XYZW;  // macro defined outside this chunk; presumably returns texel as short4 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DGrad overload for a texture<unsigned short> reference; returns unsigned short.
+__TEXTURE_FUNCTIONS_DECL__ unsigned short tex1DGrad(texture<unsigned short, texType, mode> texRef,
+ float x, float dx, float dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for texRef (TODO confirm)
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);  // forwards x, dx and dy; result is stored in texel.f
+ TEXTURE_RETURN_USHORT;  // macro defined outside this chunk; presumably returns texel as unsigned short (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DGrad overload for a texture<ushort1> reference; returns ushort1.
+__TEXTURE_FUNCTIONS_DECL__ ushort1 tex1DGrad(texture<ushort1, texType, mode> texRef, float x,
+ float dx, float dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for texRef (TODO confirm)
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);  // forwards x, dx and dy; result is stored in texel.f
+ TEXTURE_RETURN_USHORT_X;  // macro defined outside this chunk; presumably returns texel as ushort1 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DGrad overload for a texture<ushort2> reference; returns ushort2.
+__TEXTURE_FUNCTIONS_DECL__ ushort2 tex1DGrad(texture<ushort2, texType, mode> texRef, float x,
+ float dx, float dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for texRef (TODO confirm)
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);  // forwards x, dx and dy; result is stored in texel.f
+ TEXTURE_RETURN_USHORT_XY;  // macro defined outside this chunk; presumably returns texel as ushort2 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DGrad overload for a texture<ushort4> reference; returns ushort4.
+__TEXTURE_FUNCTIONS_DECL__ ushort4 tex1DGrad(texture<ushort4, texType, mode> texRef, float x,
+ float dx, float dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for texRef (TODO confirm)
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);  // forwards x, dx and dy; result is stored in texel.f
+ TEXTURE_RETURN_USHORT_XYZW;  // macro defined outside this chunk; presumably returns texel as ushort4 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DGrad overload for a texture<int> reference; returns int.
+__TEXTURE_FUNCTIONS_DECL__ int tex1DGrad(texture<int, texType, mode> texRef, float x, float dx,
+ float dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for texRef (TODO confirm)
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);  // forwards x, dx and dy; result is stored in texel.f
+ TEXTURE_RETURN_INT;  // macro defined outside this chunk; presumably returns texel as int (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DGrad overload for a texture<int1> reference; returns int1.
+__TEXTURE_FUNCTIONS_DECL__ int1 tex1DGrad(texture<int1, texType, mode> texRef, float x, float dx,
+ float dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for texRef (TODO confirm)
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);  // forwards x, dx and dy; result is stored in texel.f
+ TEXTURE_RETURN_INT_X;  // macro defined outside this chunk; presumably returns texel as int1 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DGrad overload for a texture<int2> reference; returns int2.
+__TEXTURE_FUNCTIONS_DECL__ int2 tex1DGrad(texture<int2, texType, mode> texRef, float x, float dx,
+ float dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for texRef (TODO confirm)
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);  // forwards x, dx and dy; result is stored in texel.f
+ TEXTURE_RETURN_INT_XY;  // macro defined outside this chunk; presumably returns texel as int2 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DGrad overload for a texture<int4> reference; returns int4.
+__TEXTURE_FUNCTIONS_DECL__ int4 tex1DGrad(texture<int4, texType, mode> texRef, float x, float dx,
+ float dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for texRef (TODO confirm)
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);  // forwards x, dx and dy; result is stored in texel.f
+ TEXTURE_RETURN_INT_XYZW;  // macro defined outside this chunk; presumably returns texel as int4 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DGrad overload for a texture<unsigned int> reference; returns unsigned int.
+__TEXTURE_FUNCTIONS_DECL__ unsigned int tex1DGrad(texture<unsigned int, texType, mode> texRef,
+ float x, float dx, float dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for texRef (TODO confirm)
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);  // forwards x, dx and dy; result is stored in texel.f
+ TEXTURE_RETURN_UINT;  // macro defined outside this chunk; presumably returns texel as unsigned int (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DGrad overload for a texture<uint1> reference; returns uint1.
+__TEXTURE_FUNCTIONS_DECL__ uint1 tex1DGrad(texture<uint1, texType, mode> texRef, float x, float dx,
+ float dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for texRef (TODO confirm)
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);  // forwards x, dx and dy; result is stored in texel.f
+ TEXTURE_RETURN_UINT_X;  // macro defined outside this chunk; presumably returns texel as uint1 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DGrad overload for a texture<uint2> reference; returns uint2.
+__TEXTURE_FUNCTIONS_DECL__ uint2 tex1DGrad(texture<uint2, texType, mode> texRef, float x, float dx,
+ float dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for texRef (TODO confirm)
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);  // forwards x, dx and dy; result is stored in texel.f
+ TEXTURE_RETURN_UINT_XY;  // macro defined outside this chunk; presumably returns texel as uint2 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DGrad overload for a texture<uint4> reference; returns uint4.
+__TEXTURE_FUNCTIONS_DECL__ uint4 tex1DGrad(texture<uint4, texType, mode> texRef, float x, float dx,
+ float dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for texRef (TODO confirm)
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);  // forwards x, dx and dy; result is stored in texel.f
+ TEXTURE_RETURN_UINT_XYZW;  // macro defined outside this chunk; presumably returns texel as uint4 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DGrad overload for a texture<float> reference; returns float.
+__TEXTURE_FUNCTIONS_DECL__ float tex1DGrad(texture<float, texType, mode> texRef, float x, float dx,
+ float dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for texRef (TODO confirm)
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);  // forwards x, dx and dy; result is stored in texel.f
+ TEXTURE_RETURN_FLOAT;  // macro defined outside this chunk; presumably returns texel as float (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DGrad overload for a texture<float1> reference; returns float1.
+__TEXTURE_FUNCTIONS_DECL__ float1 tex1DGrad(texture<float1, texType, mode> texRef, float x,
+ float dx, float dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for texRef (TODO confirm)
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);  // forwards x, dx and dy; result is stored in texel.f
+ TEXTURE_RETURN_FLOAT_X;  // macro defined outside this chunk; presumably returns texel as float1 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DGrad overload for a texture<float2> reference; returns float2.
+__TEXTURE_FUNCTIONS_DECL__ float2 tex1DGrad(texture<float2, texType, mode> texRef, float x,
+ float dx, float dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for texRef (TODO confirm)
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);  // forwards x, dx and dy; result is stored in texel.f
+ TEXTURE_RETURN_FLOAT_XY;  // macro defined outside this chunk; presumably returns texel as float2 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DGrad overload for a texture<float4> reference; returns float4.
+__TEXTURE_FUNCTIONS_DECL__ float4 tex1DGrad(texture<float4, texType, mode> texRef, float x,
+ float dx, float dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for texRef (TODO confirm)
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);  // forwards x, dx and dy; result is stored in texel.f
+ TEXTURE_RETURN_FLOAT_XYZW;  // macro defined outside this chunk; presumably returns texel as float4 (TODO confirm)
+}
+
+////////////////////////////////////////////////////////////
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DGrad overload for texture<char> that also takes a hipTextureObject_t; returns char.
+__TEXTURE_FUNCTIONS_DECL__ char tex1DGrad(texture<char, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float dx,
+ float dy) {
+ TEXTURE_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for textureObject (TODO confirm)
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);  // forwards x, dx and dy; result is stored in texel.f
+ TEXTURE_RETURN_CHAR;  // macro defined outside this chunk; presumably returns texel as char (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DGrad overload for texture<char1> that also takes a hipTextureObject_t; returns char1.
+__TEXTURE_FUNCTIONS_DECL__ char1 tex1DGrad(texture<char1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float dx,
+ float dy) {
+ TEXTURE_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for textureObject (TODO confirm)
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);  // forwards x, dx and dy; result is stored in texel.f
+ TEXTURE_RETURN_CHAR_X;  // macro defined outside this chunk; presumably returns texel as char1 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DGrad overload for texture<char2> that also takes a hipTextureObject_t; returns char2.
+__TEXTURE_FUNCTIONS_DECL__ char2 tex1DGrad(texture<char2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float dx,
+ float dy) {
+ TEXTURE_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for textureObject (TODO confirm)
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);  // forwards x, dx and dy; result is stored in texel.f
+ TEXTURE_RETURN_CHAR_XY;  // macro defined outside this chunk; presumably returns texel as char2 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DGrad overload for texture<char4> that also takes a hipTextureObject_t; returns char4.
+__TEXTURE_FUNCTIONS_DECL__ char4 tex1DGrad(texture<char4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float dx,
+ float dy) {
+ TEXTURE_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for textureObject (TODO confirm)
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);  // forwards x, dx and dy; result is stored in texel.f
+ TEXTURE_RETURN_CHAR_XYZW;  // macro defined outside this chunk; presumably returns texel as char4 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DGrad overload for texture<unsigned char> that also takes a hipTextureObject_t; returns unsigned char.
+__TEXTURE_FUNCTIONS_DECL__ unsigned char tex1DGrad(texture<unsigned char, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ float dx, float dy) {
+ TEXTURE_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for textureObject (TODO confirm)
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);  // forwards x, dx and dy; result is stored in texel.f
+ TEXTURE_RETURN_UCHAR;  // macro defined outside this chunk; presumably returns texel as unsigned char (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DGrad overload for texture<uchar1> that also takes a hipTextureObject_t; returns uchar1.
+__TEXTURE_FUNCTIONS_DECL__ uchar1 tex1DGrad(texture<uchar1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float dx,
+ float dy) {
+ TEXTURE_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for textureObject (TODO confirm)
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);  // forwards x, dx and dy; result is stored in texel.f
+ TEXTURE_RETURN_UCHAR_X;  // macro defined outside this chunk; presumably returns texel as uchar1 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DGrad overload for texture<uchar2> that also takes a hipTextureObject_t; returns uchar2.
+__TEXTURE_FUNCTIONS_DECL__ uchar2 tex1DGrad(texture<uchar2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float dx,
+ float dy) {
+ TEXTURE_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for textureObject (TODO confirm)
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);  // forwards x, dx and dy; result is stored in texel.f
+ TEXTURE_RETURN_UCHAR_XY;  // macro defined outside this chunk; presumably returns texel as uchar2 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DGrad overload for texture<uchar4> that also takes a hipTextureObject_t; returns uchar4.
+__TEXTURE_FUNCTIONS_DECL__ uchar4 tex1DGrad(texture<uchar4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float dx,
+ float dy) {
+ TEXTURE_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for textureObject (TODO confirm)
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);  // forwards x, dx and dy; result is stored in texel.f
+ TEXTURE_RETURN_UCHAR_XYZW;  // macro defined outside this chunk; presumably returns texel as uchar4 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DGrad overload for texture<short> that also takes a hipTextureObject_t; returns short.
+__TEXTURE_FUNCTIONS_DECL__ short tex1DGrad(texture<short, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float dx,
+ float dy) {
+ TEXTURE_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for textureObject (TODO confirm)
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);  // forwards x, dx and dy; result is stored in texel.f
+ TEXTURE_RETURN_SHORT;  // macro defined outside this chunk; presumably returns texel as short (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DGrad overload for texture<short1> that also takes a hipTextureObject_t; returns short1.
+__TEXTURE_FUNCTIONS_DECL__ short1 tex1DGrad(texture<short1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float dx,
+ float dy) {
+ TEXTURE_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for textureObject (TODO confirm)
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);  // forwards x, dx and dy; result is stored in texel.f
+ TEXTURE_RETURN_SHORT_X;  // macro defined outside this chunk; presumably returns texel as short1 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DGrad overload for texture<short2> that also takes a hipTextureObject_t; returns short2.
+__TEXTURE_FUNCTIONS_DECL__ short2 tex1DGrad(texture<short2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float dx,
+ float dy) {
+ TEXTURE_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for textureObject (TODO confirm)
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);  // forwards x, dx and dy; result is stored in texel.f
+ TEXTURE_RETURN_SHORT_XY;  // macro defined outside this chunk; presumably returns texel as short2 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DGrad overload for texture<short4> that also takes a hipTextureObject_t; returns short4.
+__TEXTURE_FUNCTIONS_DECL__ short4 tex1DGrad(texture<short4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float dx,
+ float dy) {
+ TEXTURE_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for textureObject (TODO confirm)
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);  // forwards x, dx and dy; result is stored in texel.f
+ TEXTURE_RETURN_SHORT_XYZW;  // macro defined outside this chunk; presumably returns texel as short4 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DGrad overload for texture<unsigned short> that also takes a hipTextureObject_t; returns unsigned short.
+__TEXTURE_FUNCTIONS_DECL__ unsigned short tex1DGrad(texture<unsigned short, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ float dx, float dy) {
+ TEXTURE_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for textureObject (TODO confirm)
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);  // forwards x, dx and dy; result is stored in texel.f
+ TEXTURE_RETURN_USHORT;  // macro defined outside this chunk; presumably returns texel as unsigned short (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DGrad overload for texture<ushort1> that also takes a hipTextureObject_t; returns ushort1.
+__TEXTURE_FUNCTIONS_DECL__ ushort1 tex1DGrad(texture<ushort1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float dx,
+ float dy) {
+ TEXTURE_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for textureObject (TODO confirm)
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);  // forwards x, dx and dy; result is stored in texel.f
+ TEXTURE_RETURN_USHORT_X;  // macro defined outside this chunk; presumably returns texel as ushort1 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DGrad overload for texture<ushort2> that also takes a hipTextureObject_t; returns ushort2.
+__TEXTURE_FUNCTIONS_DECL__ ushort2 tex1DGrad(texture<ushort2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float dx,
+ float dy) {
+ TEXTURE_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for textureObject (TODO confirm)
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);  // forwards x, dx and dy; result is stored in texel.f
+ TEXTURE_RETURN_USHORT_XY;  // macro defined outside this chunk; presumably returns texel as ushort2 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DGrad overload for texture<ushort4> that also takes a hipTextureObject_t; returns ushort4.
+__TEXTURE_FUNCTIONS_DECL__ ushort4 tex1DGrad(texture<ushort4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float dx,
+ float dy) {
+ TEXTURE_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for textureObject (TODO confirm)
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);  // forwards x, dx and dy; result is stored in texel.f
+ TEXTURE_RETURN_USHORT_XYZW;  // macro defined outside this chunk; presumably returns texel as ushort4 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DGrad overload for texture<int> that also takes a hipTextureObject_t; returns int.
+__TEXTURE_FUNCTIONS_DECL__ int tex1DGrad(texture<int, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float dx,
+ float dy) {
+ TEXTURE_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for textureObject (TODO confirm)
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);  // forwards x, dx and dy; result is stored in texel.f
+ TEXTURE_RETURN_INT;  // macro defined outside this chunk; presumably returns texel as int (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DGrad overload for texture<int1> that also takes a hipTextureObject_t; returns int1.
+__TEXTURE_FUNCTIONS_DECL__ int1 tex1DGrad(texture<int1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float dx,
+ float dy) {
+ TEXTURE_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for textureObject (TODO confirm)
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);  // forwards x, dx and dy; result is stored in texel.f
+ TEXTURE_RETURN_INT_X;  // macro defined outside this chunk; presumably returns texel as int1 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DGrad overload for texture<int2> that also takes a hipTextureObject_t; returns int2.
+__TEXTURE_FUNCTIONS_DECL__ int2 tex1DGrad(texture<int2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float dx,
+ float dy) {
+ TEXTURE_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for textureObject (TODO confirm)
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);  // forwards x, dx and dy; result is stored in texel.f
+ TEXTURE_RETURN_INT_XY;  // macro defined outside this chunk; presumably returns texel as int2 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DGrad overload for texture<int4> that also takes a hipTextureObject_t; returns int4.
+__TEXTURE_FUNCTIONS_DECL__ int4 tex1DGrad(texture<int4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float dx,
+ float dy) {
+ TEXTURE_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for textureObject (TODO confirm)
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);  // forwards x, dx and dy; result is stored in texel.f
+ TEXTURE_RETURN_INT_XYZW;  // macro defined outside this chunk; presumably returns texel as int4 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DGrad overload for texture<unsigned int> that also takes a hipTextureObject_t; returns unsigned int.
+__TEXTURE_FUNCTIONS_DECL__ unsigned int tex1DGrad(texture<unsigned int, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ float dx, float dy) {
+ TEXTURE_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for textureObject (TODO confirm)
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);  // forwards x, dx and dy; result is stored in texel.f
+ TEXTURE_RETURN_UINT;  // macro defined outside this chunk; presumably returns texel as unsigned int (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DGrad overload for texture<uint1> that also takes a hipTextureObject_t; returns uint1.
+__TEXTURE_FUNCTIONS_DECL__ uint1 tex1DGrad(texture<uint1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float dx,
+ float dy) {
+ TEXTURE_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for textureObject (TODO confirm)
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);  // forwards x, dx and dy; result is stored in texel.f
+ TEXTURE_RETURN_UINT_X;  // macro defined outside this chunk; presumably returns texel as uint1 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DGrad overload for texture<uint2> that also takes a hipTextureObject_t; returns uint2.
+__TEXTURE_FUNCTIONS_DECL__ uint2 tex1DGrad(texture<uint2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float dx,
+ float dy) {
+ TEXTURE_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for textureObject (TODO confirm)
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);  // forwards x, dx and dy; result is stored in texel.f
+ TEXTURE_RETURN_UINT_XY;  // macro defined outside this chunk; presumably returns texel as uint2 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DGrad overload for texture<uint4> that also takes a hipTextureObject_t; returns uint4.
+__TEXTURE_FUNCTIONS_DECL__ uint4 tex1DGrad(texture<uint4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float dx,
+ float dy) {
+ TEXTURE_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for textureObject (TODO confirm)
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);  // forwards x, dx and dy; result is stored in texel.f
+ TEXTURE_RETURN_UINT_XYZW;  // macro defined outside this chunk; presumably returns texel as uint4 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DGrad overload for texture<float> that also takes a hipTextureObject_t; returns float.
+__TEXTURE_FUNCTIONS_DECL__ float tex1DGrad(texture<float, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float dx,
+ float dy) {
+ TEXTURE_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for textureObject (TODO confirm)
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);  // forwards x, dx and dy; result is stored in texel.f
+ TEXTURE_RETURN_FLOAT;  // macro defined outside this chunk; presumably returns texel as float (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DGrad overload for texture<float1> that also takes a hipTextureObject_t; returns float1.
+__TEXTURE_FUNCTIONS_DECL__ float1 tex1DGrad(texture<float1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float dx,
+ float dy) {
+ TEXTURE_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for textureObject (TODO confirm)
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);  // forwards x, dx and dy; result is stored in texel.f
+ TEXTURE_RETURN_FLOAT_X;  // macro defined outside this chunk; presumably returns texel as float1 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DGrad overload for texture<float2> that also takes a hipTextureObject_t; returns float2.
+__TEXTURE_FUNCTIONS_DECL__ float2 tex1DGrad(texture<float2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float dx,
+ float dy) {
+ TEXTURE_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for textureObject (TODO confirm)
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);  // forwards x, dx and dy; result is stored in texel.f
+ TEXTURE_RETURN_FLOAT_XY;  // macro defined outside this chunk; presumably returns texel as float2 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DGrad overload for texture<float4> that also takes a hipTextureObject_t; returns float4.
+__TEXTURE_FUNCTIONS_DECL__ float4 tex1DGrad(texture<float4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float dx,
+ float dy) {
+ TEXTURE_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for textureObject (TODO confirm)
+ texel.f = __ockl_image_sample_grad_1D(i, s, x, dx, dy);  // forwards x, dx and dy; result is stored in texel.f
+ TEXTURE_RETURN_FLOAT_XYZW;  // macro defined outside this chunk; presumably returns texel as float4 (TODO confirm)
+}
+
+////////////////////////////////////////////////////////////
+
+template <int texType, enum hipTextureReadMode mode>  // tex2D overload for a texture<char> reference; returns char.
+__TEXTURE_FUNCTIONS_DECL__ char tex2D(texture<char, texType, mode> texRef, float x, float y) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for texRef (TODO confirm)
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);  // x and y are packed with float2(x, y).data; result is stored in texel.f
+ TEXTURE_RETURN_CHAR;  // macro defined outside this chunk; presumably returns texel as char (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2D overload for a texture<char1> reference; returns char1.
+__TEXTURE_FUNCTIONS_DECL__ char1 tex2D(texture<char1, texType, mode> texRef, float x, float y) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for texRef (TODO confirm)
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);  // x and y are packed with float2(x, y).data; result is stored in texel.f
+ TEXTURE_RETURN_CHAR_X;  // macro defined outside this chunk; presumably returns texel as char1 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2D overload for a texture<char2> reference; returns char2.
+__TEXTURE_FUNCTIONS_DECL__ char2 tex2D(texture<char2, texType, mode> texRef, float x, float y) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for texRef (TODO confirm)
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);  // x and y are packed with float2(x, y).data; result is stored in texel.f
+ TEXTURE_RETURN_CHAR_XY;  // macro defined outside this chunk; presumably returns texel as char2 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2D overload for a texture<char4> reference; returns char4.
+__TEXTURE_FUNCTIONS_DECL__ char4 tex2D(texture<char4, texType, mode> texRef, float x, float y) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for texRef (TODO confirm)
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);  // x and y are packed with float2(x, y).data; result is stored in texel.f
+ TEXTURE_RETURN_CHAR_XYZW;  // macro defined outside this chunk; presumably returns texel as char4 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2D overload for a texture<unsigned char> reference; returns unsigned char.
+__TEXTURE_FUNCTIONS_DECL__ unsigned char tex2D(texture<unsigned char, texType, mode> texRef,
+ float x, float y) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for texRef (TODO confirm)
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);  // x and y are packed with float2(x, y).data; result is stored in texel.f
+ TEXTURE_RETURN_UCHAR;  // macro defined outside this chunk; presumably returns texel as unsigned char (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2D overload for a texture<uchar1> reference; returns uchar1.
+__TEXTURE_FUNCTIONS_DECL__ uchar1 tex2D(texture<uchar1, texType, mode> texRef, float x, float y) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for texRef (TODO confirm)
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);  // x and y are packed with float2(x, y).data; result is stored in texel.f
+ TEXTURE_RETURN_UCHAR_X;  // macro defined outside this chunk; presumably returns texel as uchar1 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2D overload for a texture<uchar2> reference; returns uchar2.
+__TEXTURE_FUNCTIONS_DECL__ uchar2 tex2D(texture<uchar2, texType, mode> texRef, float x, float y) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for texRef (TODO confirm)
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);  // x and y are packed with float2(x, y).data; result is stored in texel.f
+ TEXTURE_RETURN_UCHAR_XY;  // macro defined outside this chunk; presumably returns texel as uchar2 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2D overload for a texture<uchar4> reference; returns uchar4.
+__TEXTURE_FUNCTIONS_DECL__ uchar4 tex2D(texture<uchar4, texType, mode> texRef, float x, float y) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for texRef (TODO confirm)
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);  // x and y are packed with float2(x, y).data; result is stored in texel.f
+ TEXTURE_RETURN_UCHAR_XYZW;  // macro defined outside this chunk; presumably returns texel as uchar4 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2D overload for a texture<short> reference; returns short.
+__TEXTURE_FUNCTIONS_DECL__ short tex2D(texture<short, texType, mode> texRef, float x, float y) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for texRef (TODO confirm)
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);  // x and y are packed with float2(x, y).data; result is stored in texel.f
+ TEXTURE_RETURN_SHORT;  // macro defined outside this chunk; presumably returns texel as short (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2D overload for a texture<short1> reference; returns short1.
+__TEXTURE_FUNCTIONS_DECL__ short1 tex2D(texture<short1, texType, mode> texRef, float x, float y) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for texRef (TODO confirm)
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);  // x and y are packed with float2(x, y).data; result is stored in texel.f
+ TEXTURE_RETURN_SHORT_X;  // macro defined outside this chunk; presumably returns texel as short1 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2D overload for a texture<short2> reference; returns short2.
+__TEXTURE_FUNCTIONS_DECL__ short2 tex2D(texture<short2, texType, mode> texRef, float x, float y) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for texRef (TODO confirm)
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);  // x and y are packed with float2(x, y).data; result is stored in texel.f
+ TEXTURE_RETURN_SHORT_XY;  // macro defined outside this chunk; presumably returns texel as short2 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2D overload for a texture<short4> reference; returns short4.
+__TEXTURE_FUNCTIONS_DECL__ short4 tex2D(texture<short4, texType, mode> texRef, float x, float y) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for texRef (TODO confirm)
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);  // x and y are packed with float2(x, y).data; result is stored in texel.f
+ TEXTURE_RETURN_SHORT_XYZW;  // macro defined outside this chunk; presumably returns texel as short4 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2D overload for a texture<unsigned short> reference; returns unsigned short.
+__TEXTURE_FUNCTIONS_DECL__ unsigned short tex2D(texture<unsigned short, texType, mode> texRef,
+ float x, float y) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for texRef (TODO confirm)
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);  // x and y are packed with float2(x, y).data; result is stored in texel.f
+ TEXTURE_RETURN_USHORT;  // macro defined outside this chunk; presumably returns texel as unsigned short (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2D overload for a texture<ushort1> reference; returns ushort1.
+__TEXTURE_FUNCTIONS_DECL__ ushort1 tex2D(texture<ushort1, texType, mode> texRef, float x, float y) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for texRef (TODO confirm)
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);  // x and y are packed with float2(x, y).data; result is stored in texel.f
+ TEXTURE_RETURN_USHORT_X;  // macro defined outside this chunk; presumably returns texel as ushort1 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2D overload for a texture<ushort2> reference; returns ushort2.
+__TEXTURE_FUNCTIONS_DECL__ ushort2 tex2D(texture<ushort2, texType, mode> texRef, float x, float y) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for texRef (TODO confirm)
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);  // x and y are packed with float2(x, y).data; result is stored in texel.f
+ TEXTURE_RETURN_USHORT_XY;  // macro defined outside this chunk; presumably returns texel as ushort2 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2D overload for a texture<ushort4> reference; returns ushort4.
+__TEXTURE_FUNCTIONS_DECL__ ushort4 tex2D(texture<ushort4, texType, mode> texRef, float x, float y) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for texRef (TODO confirm)
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);  // x and y are packed with float2(x, y).data; result is stored in texel.f
+ TEXTURE_RETURN_USHORT_XYZW;  // macro defined outside this chunk; presumably returns texel as ushort4 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2D overload for a texture<int> reference; returns int.
+__TEXTURE_FUNCTIONS_DECL__ int tex2D(texture<int, texType, mode> texRef, float x, float y) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for texRef (TODO confirm)
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);  // x and y are packed with float2(x, y).data; result is stored in texel.f
+ TEXTURE_RETURN_INT;  // macro defined outside this chunk; presumably returns texel as int (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2D overload for a texture<int1> reference; returns int1.
+__TEXTURE_FUNCTIONS_DECL__ int1 tex2D(texture<int1, texType, mode> texRef, float x, float y) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro defined outside this chunk; presumably declares i, s and texel for texRef (TODO confirm)
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);  // x and y are packed with float2(x, y).data; result is stored in texel.f
+ TEXTURE_RETURN_INT_X;  // macro defined outside this chunk; presumably returns texel as int1 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2D on a texture reference; int2 result.
+__TEXTURE_FUNCTIONS_DECL__ int2 tex2D(texture<int2, texType, mode> texRef, float x, float y) {
+ TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s, texel from texRef (macro not in view)
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_RETURN_INT_XY;  // presumably returns texel as int2 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2D on a texture reference; int4 result.
+__TEXTURE_FUNCTIONS_DECL__ int4 tex2D(texture<int4, texType, mode> texRef, float x, float y) {
+ TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s, texel from texRef (macro not in view)
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_RETURN_INT_XYZW;  // presumably returns texel as int4 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2D on a texture reference; unsigned int result.
+__TEXTURE_FUNCTIONS_DECL__ unsigned int tex2D(texture<unsigned int, texType, mode> texRef, float x,
+ float y) {
+ TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s, texel from texRef (macro not in view)
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_RETURN_UINT;  // presumably returns texel as unsigned int (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2D on a texture reference; uint1 result.
+__TEXTURE_FUNCTIONS_DECL__ uint1 tex2D(texture<uint1, texType, mode> texRef, float x, float y) {
+ TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s, texel from texRef (macro not in view)
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_RETURN_UINT_X;  // presumably returns texel as uint1 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2D on a texture reference; uint2 result.
+__TEXTURE_FUNCTIONS_DECL__ uint2 tex2D(texture<uint2, texType, mode> texRef, float x, float y) {
+ TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s, texel from texRef (macro not in view)
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_RETURN_UINT_XY;  // presumably returns texel as uint2 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2D on a texture reference; uint4 result.
+__TEXTURE_FUNCTIONS_DECL__ uint4 tex2D(texture<uint4, texType, mode> texRef, float x, float y) {
+ TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s, texel from texRef (macro not in view)
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_RETURN_UINT_XYZW;  // presumably returns texel as uint4 (macro not in view)
+}
+
+
+////////////////////////////////////////////////////////////
+
+template <int texType, enum hipTextureReadMode mode>  // tex2D with an explicit textureObject; char result.
+__TEXTURE_FUNCTIONS_DECL__ char tex2D(texture<char, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y) {
+ TEXTURE_PARAMETERS_INIT;  // presumably declares i, s, texel from textureObject (macro not in view)
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_RETURN_CHAR;  // presumably returns texel as char (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2D with an explicit textureObject; char1 result.
+__TEXTURE_FUNCTIONS_DECL__ char1 tex2D(texture<char1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y) {
+ TEXTURE_PARAMETERS_INIT;  // presumably declares i, s, texel from textureObject (macro not in view)
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_RETURN_CHAR_X;  // presumably returns texel as char1 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2D with an explicit textureObject; char2 result.
+__TEXTURE_FUNCTIONS_DECL__ char2 tex2D(texture<char2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y) {
+ TEXTURE_PARAMETERS_INIT;  // presumably declares i, s, texel from textureObject (macro not in view)
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_RETURN_CHAR_XY;  // presumably returns texel as char2 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2D with an explicit textureObject; char4 result.
+__TEXTURE_FUNCTIONS_DECL__ char4 tex2D(texture<char4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y) {
+ TEXTURE_PARAMETERS_INIT;  // presumably declares i, s, texel from textureObject (macro not in view)
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_RETURN_CHAR_XYZW;  // presumably returns texel as char4 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2D with an explicit textureObject; unsigned char result.
+__TEXTURE_FUNCTIONS_DECL__ unsigned char tex2D(texture<unsigned char, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y) {
+ TEXTURE_PARAMETERS_INIT;  // presumably declares i, s, texel from textureObject (macro not in view)
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_RETURN_UCHAR;  // presumably returns texel as unsigned char (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2D with an explicit textureObject; uchar1 result.
+__TEXTURE_FUNCTIONS_DECL__ uchar1 tex2D(texture<uchar1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y) {
+ TEXTURE_PARAMETERS_INIT;  // presumably declares i, s, texel from textureObject (macro not in view)
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_RETURN_UCHAR_X;  // presumably returns texel as uchar1 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2D with an explicit textureObject; uchar2 result.
+__TEXTURE_FUNCTIONS_DECL__ uchar2 tex2D(texture<uchar2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y) {
+ TEXTURE_PARAMETERS_INIT;  // presumably declares i, s, texel from textureObject (macro not in view)
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_RETURN_UCHAR_XY;  // presumably returns texel as uchar2 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2D with an explicit textureObject; uchar4 result.
+__TEXTURE_FUNCTIONS_DECL__ uchar4 tex2D(texture<uchar4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y) {
+ TEXTURE_PARAMETERS_INIT;  // presumably declares i, s, texel from textureObject (macro not in view)
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_RETURN_UCHAR_XYZW;  // presumably returns texel as uchar4 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2D with an explicit textureObject; short result.
+__TEXTURE_FUNCTIONS_DECL__ short tex2D(texture<short, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y) {
+ TEXTURE_PARAMETERS_INIT;  // presumably declares i, s, texel from textureObject (macro not in view)
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_RETURN_SHORT;  // presumably returns texel as short (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2D with an explicit textureObject; short1 result.
+__TEXTURE_FUNCTIONS_DECL__ short1 tex2D(texture<short1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y) {
+ TEXTURE_PARAMETERS_INIT;  // presumably declares i, s, texel from textureObject (macro not in view)
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_RETURN_SHORT_X;  // presumably returns texel as short1 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2D with an explicit textureObject; short2 result.
+__TEXTURE_FUNCTIONS_DECL__ short2 tex2D(texture<short2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y) {
+ TEXTURE_PARAMETERS_INIT;  // presumably declares i, s, texel from textureObject (macro not in view)
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_RETURN_SHORT_XY;  // presumably returns texel as short2 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2D with an explicit textureObject; short4 result.
+__TEXTURE_FUNCTIONS_DECL__ short4 tex2D(texture<short4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y) {
+ TEXTURE_PARAMETERS_INIT;  // presumably declares i, s, texel from textureObject (macro not in view)
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_RETURN_SHORT_XYZW;  // presumably returns texel as short4 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2D with an explicit textureObject; unsigned short result.
+__TEXTURE_FUNCTIONS_DECL__ unsigned short tex2D(texture<unsigned short, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ float y) {
+ TEXTURE_PARAMETERS_INIT;  // presumably declares i, s, texel from textureObject (macro not in view)
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_RETURN_USHORT;  // presumably returns texel as unsigned short (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2D with an explicit textureObject; ushort1 result.
+__TEXTURE_FUNCTIONS_DECL__ ushort1 tex2D(texture<ushort1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y) {
+ TEXTURE_PARAMETERS_INIT;  // presumably declares i, s, texel from textureObject (macro not in view)
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_RETURN_USHORT_X;  // presumably returns texel as ushort1 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2D with an explicit textureObject; ushort2 result.
+__TEXTURE_FUNCTIONS_DECL__ ushort2 tex2D(texture<ushort2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y) {
+ TEXTURE_PARAMETERS_INIT;  // presumably declares i, s, texel from textureObject (macro not in view)
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_RETURN_USHORT_XY;  // presumably returns texel as ushort2 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2D with an explicit textureObject; ushort4 result.
+__TEXTURE_FUNCTIONS_DECL__ ushort4 tex2D(texture<ushort4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y) {
+ TEXTURE_PARAMETERS_INIT;  // presumably declares i, s, texel from textureObject (macro not in view)
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_RETURN_USHORT_XYZW;  // presumably returns texel as ushort4 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2D with an explicit textureObject; int result.
+__TEXTURE_FUNCTIONS_DECL__ int tex2D(texture<int, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y) {
+ TEXTURE_PARAMETERS_INIT;  // presumably declares i, s, texel from textureObject (macro not in view)
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_RETURN_INT;  // presumably returns texel as int (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2D with an explicit textureObject; int1 result.
+__TEXTURE_FUNCTIONS_DECL__ int1 tex2D(texture<int1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y) {
+ TEXTURE_PARAMETERS_INIT;  // presumably declares i, s, texel from textureObject (macro not in view)
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_RETURN_INT_X;  // presumably returns texel as int1 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2D with an explicit textureObject; int2 result.
+__TEXTURE_FUNCTIONS_DECL__ int2 tex2D(texture<int2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y) {
+ TEXTURE_PARAMETERS_INIT;  // presumably declares i, s, texel from textureObject (macro not in view)
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_RETURN_INT_XY;  // presumably returns texel as int2 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2D with an explicit textureObject; int4 result.
+__TEXTURE_FUNCTIONS_DECL__ int4 tex2D(texture<int4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y) {
+ TEXTURE_PARAMETERS_INIT;  // presumably declares i, s, texel from textureObject (macro not in view)
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_RETURN_INT_XYZW;  // presumably returns texel as int4 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2D with an explicit textureObject; unsigned int result.
+__TEXTURE_FUNCTIONS_DECL__ unsigned int tex2D(texture<unsigned int, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y) {
+ TEXTURE_PARAMETERS_INIT;  // presumably declares i, s, texel from textureObject (macro not in view)
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_RETURN_UINT;  // presumably returns texel as unsigned int (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2D with an explicit textureObject; uint1 result.
+__TEXTURE_FUNCTIONS_DECL__ uint1 tex2D(texture<uint1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y) {
+ TEXTURE_PARAMETERS_INIT;  // presumably declares i, s, texel from textureObject (macro not in view)
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_RETURN_UINT_X;  // presumably returns texel as uint1 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2D with an explicit textureObject; uint2 result.
+__TEXTURE_FUNCTIONS_DECL__ uint2 tex2D(texture<uint2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y) {
+ TEXTURE_PARAMETERS_INIT;  // presumably declares i, s, texel from textureObject (macro not in view)
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_RETURN_UINT_XY;  // presumably returns texel as uint2 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2D with an explicit textureObject; uint4 result.
+__TEXTURE_FUNCTIONS_DECL__ uint4 tex2D(texture<uint4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y) {
+ TEXTURE_PARAMETERS_INIT;  // presumably declares i, s, texel from textureObject (macro not in view)
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_RETURN_UINT_XYZW;  // presumably returns texel as uint4 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2D with an explicit textureObject; float result.
+__TEXTURE_FUNCTIONS_DECL__ float tex2D(texture<float, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y) {
+ TEXTURE_PARAMETERS_INIT;  // presumably declares i, s, texel from textureObject (macro not in view)
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_RETURN_FLOAT;  // presumably returns texel as float (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2D on a texture reference; float result.
+__TEXTURE_FUNCTIONS_DECL__ float tex2D(texture<float, texType, mode> texRef, float x, float y) {
+ TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s, texel from texRef (macro not in view)
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_RETURN_FLOAT;  // presumably returns texel as float (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2D on a texture reference; float1 result.
+__TEXTURE_FUNCTIONS_DECL__ float1 tex2D(texture<float1, texType, mode> texRef, float x, float y) {
+ TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s, texel from texRef (macro not in view)
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_RETURN_FLOAT_X;  // presumably returns texel as float1 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2D with an explicit textureObject; float1 result.
+__TEXTURE_FUNCTIONS_DECL__ float1 tex2D(texture<float1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y) {
+ TEXTURE_PARAMETERS_INIT;  // presumably declares i, s, texel from textureObject (macro not in view)
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_RETURN_FLOAT_X;  // presumably returns texel as float1 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2D on a texture reference; float2 result.
+__TEXTURE_FUNCTIONS_DECL__ float2 tex2D(texture<float2, texType, mode> texRef, float x, float y) {
+ TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s, texel from texRef (macro not in view)
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_RETURN_FLOAT_XY;  // presumably returns texel as float2 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2D with an explicit textureObject; float2 result.
+__TEXTURE_FUNCTIONS_DECL__ float2 tex2D(texture<float2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y) {
+ TEXTURE_PARAMETERS_INIT;  // presumably declares i, s, texel from textureObject (macro not in view)
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_RETURN_FLOAT_XY;  // presumably returns texel as float2 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2D on a texture reference; float4 result.
+__TEXTURE_FUNCTIONS_DECL__ float4 tex2D(texture<float4, texType, mode> texRef, float x, float y) {
+ TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s, texel from texRef (macro not in view)
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_RETURN_FLOAT_XYZW;  // presumably returns texel as float4 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2D with an explicit textureObject; float4 result.
+__TEXTURE_FUNCTIONS_DECL__ float4 tex2D(texture<float4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y) {
+ TEXTURE_PARAMETERS_INIT;  // presumably declares i, s, texel from textureObject (macro not in view)
+ texel.f = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ TEXTURE_RETURN_FLOAT_XYZW;  // presumably returns texel as float4 (macro not in view)
+}
+
+////////////////////////////////////////////////////////////
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLod on a texture reference, sampled at the given level; char result.
+__TEXTURE_FUNCTIONS_DECL__ char tex2DLod(texture<char, texType, mode> texRef, float x, float y,
+ float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s, texel from texRef (macro not in view)
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_RETURN_CHAR;  // presumably returns texel as char (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLod on a texture reference, sampled at the given level; char1 result.
+__TEXTURE_FUNCTIONS_DECL__ char1 tex2DLod(texture<char1, texType, mode> texRef, float x, float y,
+ float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s, texel from texRef (macro not in view)
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_RETURN_CHAR_X;  // presumably returns texel as char1 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLod on a texture reference, sampled at the given level; char2 result.
+__TEXTURE_FUNCTIONS_DECL__ char2 tex2DLod(texture<char2, texType, mode> texRef, float x, float y,
+ float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s, texel from texRef (macro not in view)
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_RETURN_CHAR_XY;  // presumably returns texel as char2 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLod on a texture reference, sampled at the given level; char4 result.
+__TEXTURE_FUNCTIONS_DECL__ char4 tex2DLod(texture<char4, texType, mode> texRef, float x, float y,
+ float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s, texel from texRef (macro not in view)
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_RETURN_CHAR_XYZW;  // presumably returns texel as char4 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLod on a texture reference, sampled at the given level; unsigned char result.
+__TEXTURE_FUNCTIONS_DECL__ unsigned char tex2DLod(texture<unsigned char, texType, mode> texRef,
+ float x, float y, float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s, texel from texRef (macro not in view)
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_RETURN_UCHAR;  // presumably returns texel as unsigned char (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLod on a texture reference, sampled at the given level; uchar1 result.
+__TEXTURE_FUNCTIONS_DECL__ uchar1 tex2DLod(texture<uchar1, texType, mode> texRef, float x, float y,
+ float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s, texel from texRef (macro not in view)
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_RETURN_UCHAR_X;  // presumably returns texel as uchar1 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLod on a texture reference, sampled at the given level; uchar2 result.
+__TEXTURE_FUNCTIONS_DECL__ uchar2 tex2DLod(texture<uchar2, texType, mode> texRef, float x, float y,
+ float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s, texel from texRef (macro not in view)
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_RETURN_UCHAR_XY;  // presumably returns texel as uchar2 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLod on a texture reference, sampled at the given level; uchar4 result.
+__TEXTURE_FUNCTIONS_DECL__ uchar4 tex2DLod(texture<uchar4, texType, mode> texRef, float x, float y,
+ float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s, texel from texRef (macro not in view)
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_RETURN_UCHAR_XYZW;  // presumably returns texel as uchar4 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLod on a texture reference, sampled at the given level; short result.
+__TEXTURE_FUNCTIONS_DECL__ short tex2DLod(texture<short, texType, mode> texRef, float x, float y,
+ float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s, texel from texRef (macro not in view)
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_RETURN_SHORT;  // presumably returns texel as short (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLod on a texture reference, sampled at the given level; short1 result.
+__TEXTURE_FUNCTIONS_DECL__ short1 tex2DLod(texture<short1, texType, mode> texRef, float x, float y,
+ float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s, texel from texRef (macro not in view)
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_RETURN_SHORT_X;  // presumably returns texel as short1 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLod on a texture reference, sampled at the given level; short2 result.
+__TEXTURE_FUNCTIONS_DECL__ short2 tex2DLod(texture<short2, texType, mode> texRef, float x, float y,
+ float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s, texel from texRef (macro not in view)
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_RETURN_SHORT_XY;  // presumably returns texel as short2 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLod on a texture reference, sampled at the given level; short4 result.
+__TEXTURE_FUNCTIONS_DECL__ short4 tex2DLod(texture<short4, texType, mode> texRef, float x, float y,
+ float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s, texel from texRef (macro not in view)
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_RETURN_SHORT_XYZW;  // presumably returns texel as short4 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLod on a texture reference, sampled at the given level; unsigned short result.
+__TEXTURE_FUNCTIONS_DECL__ unsigned short tex2DLod(texture<unsigned short, texType, mode> texRef,
+ float x, float y, float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s, texel from texRef (macro not in view)
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_RETURN_USHORT;  // presumably returns texel as unsigned short (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLod on a texture reference, sampled at the given level; ushort1 result.
+__TEXTURE_FUNCTIONS_DECL__ ushort1 tex2DLod(texture<ushort1, texType, mode> texRef, float x,
+ float y, float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s, texel from texRef (macro not in view)
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_RETURN_USHORT_X;  // presumably returns texel as ushort1 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLod on a texture reference, sampled at the given level; ushort2 result.
+__TEXTURE_FUNCTIONS_DECL__ ushort2 tex2DLod(texture<ushort2, texType, mode> texRef, float x,
+ float y, float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s, texel from texRef (macro not in view)
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_RETURN_USHORT_XY;  // presumably returns texel as ushort2 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLod on a texture reference, sampled at the given level; ushort4 result.
+__TEXTURE_FUNCTIONS_DECL__ ushort4 tex2DLod(texture<ushort4, texType, mode> texRef, float x,
+ float y, float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s, texel from texRef (macro not in view)
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_RETURN_USHORT_XYZW;  // presumably returns texel as ushort4 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLod on a texture reference, sampled at the given level; int result.
+__TEXTURE_FUNCTIONS_DECL__ int tex2DLod(texture<int, texType, mode> texRef, float x, float y,
+ float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s, texel from texRef (macro not in view)
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_RETURN_INT;  // presumably returns texel as int (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLod on a texture reference, sampled at the given level; int1 result.
+__TEXTURE_FUNCTIONS_DECL__ int1 tex2DLod(texture<int1, texType, mode> texRef, float x, float y,
+ float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s, texel from texRef (macro not in view)
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_RETURN_INT_X;  // presumably returns texel as int1 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLod on a texture reference, sampled at the given level; int2 result.
+__TEXTURE_FUNCTIONS_DECL__ int2 tex2DLod(texture<int2, texType, mode> texRef, float x, float y,
+ float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s, texel from texRef (macro not in view)
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_RETURN_INT_XY;  // presumably returns texel as int2 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLod on a texture reference, sampled at the given level; int4 result.
+__TEXTURE_FUNCTIONS_DECL__ int4 tex2DLod(texture<int4, texType, mode> texRef, float x, float y,
+ float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s, texel from texRef (macro not in view)
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_RETURN_INT_XYZW;  // presumably returns texel as int4 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLod on a texture reference, sampled at the given level; unsigned int result.
+__TEXTURE_FUNCTIONS_DECL__ unsigned int tex2DLod(texture<unsigned int, texType, mode> texRef,
+ float x, float y, float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s, texel from texRef (macro not in view)
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_RETURN_UINT;  // presumably returns texel as unsigned int (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLod on a texture reference, sampled at the given level; uint1 result.
+__TEXTURE_FUNCTIONS_DECL__ uint1 tex2DLod(texture<uint1, texType, mode> texRef, float x, float y,
+ float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s, texel from texRef (macro not in view)
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_RETURN_UINT_X;  // presumably returns texel as uint1 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLod on a texture reference, sampled at the given level; uint2 result.
+__TEXTURE_FUNCTIONS_DECL__ uint2 tex2DLod(texture<uint2, texType, mode> texRef, float x, float y,
+ float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s, texel from texRef (macro not in view)
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_RETURN_UINT_XY;  // presumably returns texel as uint2 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLod on a texture reference, sampled at the given level; uint4 result.
+__TEXTURE_FUNCTIONS_DECL__ uint4 tex2DLod(texture<uint4, texType, mode> texRef, float x, float y,
+ float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s, texel from texRef (macro not in view)
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_RETURN_UINT_XYZW;  // presumably returns texel as uint4 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLod on a texture reference, sampled at the given level; float result.
+__TEXTURE_FUNCTIONS_DECL__ float tex2DLod(texture<float, texType, mode> texRef, float x, float y,
+ float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s, texel from texRef (macro not in view)
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_RETURN_FLOAT;  // presumably returns texel as float (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLod on a texture reference, sampled at the given level; float1 result.
+__TEXTURE_FUNCTIONS_DECL__ float1 tex2DLod(texture<float1, texType, mode> texRef, float x, float y,
+ float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s, texel from texRef (macro not in view)
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_RETURN_FLOAT_X;  // presumably returns texel as float1 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLod on a texture reference, sampled at the given level; float2 result.
+__TEXTURE_FUNCTIONS_DECL__ float2 tex2DLod(texture<float2, texType, mode> texRef, float x, float y,
+ float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s, texel from texRef (macro not in view)
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_RETURN_FLOAT_XY;  // presumably returns texel as float2 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLod on a texture reference, sampled at the given level; float4 result.
+__TEXTURE_FUNCTIONS_DECL__ float4 tex2DLod(texture<float4, texType, mode> texRef, float x, float y,
+ float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s, texel from texRef (macro not in view)
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_RETURN_FLOAT_XYZW;  // presumably returns texel as float4 (macro not in view)
+}
+
+////////////////////////////////////////////////////////////
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLod with an explicit textureObject, sampled at the given level; char result.
+__TEXTURE_FUNCTIONS_DECL__ char tex2DLod(texture<char, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float level) {
+ TEXTURE_PARAMETERS_INIT;  // presumably declares i, s, texel from textureObject (macro not in view)
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_RETURN_CHAR;  // presumably returns texel as char (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLod with an explicit textureObject, sampled at the given level; char1 result.
+__TEXTURE_FUNCTIONS_DECL__ char1 tex2DLod(texture<char1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float level) {
+ TEXTURE_PARAMETERS_INIT;  // presumably declares i, s, texel from textureObject (macro not in view)
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_RETURN_CHAR_X;  // presumably returns texel as char1 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLod with an explicit textureObject, sampled at the given level; char2 result.
+__TEXTURE_FUNCTIONS_DECL__ char2 tex2DLod(texture<char2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float level) {
+ TEXTURE_PARAMETERS_INIT;  // presumably declares i, s, texel from textureObject (macro not in view)
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_RETURN_CHAR_XY;  // presumably returns texel as char2 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLod with an explicit textureObject, sampled at the given level; char4 result.
+__TEXTURE_FUNCTIONS_DECL__ char4 tex2DLod(texture<char4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float level) {
+ TEXTURE_PARAMETERS_INIT;  // presumably declares i, s, texel from textureObject (macro not in view)
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_RETURN_CHAR_XYZW;  // presumably returns texel as char4 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLod with an explicit textureObject, sampled at the given level; unsigned char result.
+__TEXTURE_FUNCTIONS_DECL__ unsigned char tex2DLod(texture<unsigned char, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ float y, float level) {
+ TEXTURE_PARAMETERS_INIT;  // presumably declares i, s, texel from textureObject (macro not in view)
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_RETURN_UCHAR;  // presumably returns texel as unsigned char (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLod with an explicit textureObject, sampled at the given level; uchar1 result.
+__TEXTURE_FUNCTIONS_DECL__ uchar1 tex2DLod(texture<uchar1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float level) {
+ TEXTURE_PARAMETERS_INIT;  // presumably declares i, s, texel from textureObject (macro not in view)
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_RETURN_UCHAR_X;  // presumably returns texel as uchar1 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLod with an explicit textureObject, sampled at the given level; uchar2 result.
+__TEXTURE_FUNCTIONS_DECL__ uchar2 tex2DLod(texture<uchar2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float level) {
+ TEXTURE_PARAMETERS_INIT;  // presumably declares i, s, texel from textureObject (macro not in view)
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_RETURN_UCHAR_XY;  // presumably returns texel as uchar2 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLod with an explicit textureObject, sampled at the given level; uchar4 result.
+__TEXTURE_FUNCTIONS_DECL__ uchar4 tex2DLod(texture<uchar4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float level) {
+ TEXTURE_PARAMETERS_INIT;  // presumably declares i, s, texel from textureObject (macro not in view)
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_RETURN_UCHAR_XYZW;  // presumably returns texel as uchar4 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLod with an explicit textureObject, sampled at the given level; short result.
+__TEXTURE_FUNCTIONS_DECL__ short tex2DLod(texture<short, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float level) {
+ TEXTURE_PARAMETERS_INIT;  // presumably declares i, s, texel from textureObject (macro not in view)
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_RETURN_SHORT;  // presumably returns texel as short (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLod with an explicit textureObject, sampled at the given level; short1 result.
+__TEXTURE_FUNCTIONS_DECL__ short1 tex2DLod(texture<short1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float level) {
+ TEXTURE_PARAMETERS_INIT;  // presumably declares i, s, texel from textureObject (macro not in view)
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_RETURN_SHORT_X;  // presumably returns texel as short1 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLod with an explicit textureObject, sampled at the given level; short2 result.
+__TEXTURE_FUNCTIONS_DECL__ short2 tex2DLod(texture<short2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float level) {
+ TEXTURE_PARAMETERS_INIT;  // presumably declares i, s, texel from textureObject (macro not in view)
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_RETURN_SHORT_XY;  // presumably returns texel as short2 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLod with an explicit textureObject, sampled at the given level; short4 result.
+__TEXTURE_FUNCTIONS_DECL__ short4 tex2DLod(texture<short4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float level) {
+ TEXTURE_PARAMETERS_INIT;  // presumably declares i, s, texel from textureObject (macro not in view)
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_RETURN_SHORT_XYZW;  // presumably returns texel as short4 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLod with an explicit textureObject, sampled at the given level; unsigned short result.
+__TEXTURE_FUNCTIONS_DECL__ unsigned short tex2DLod(texture<unsigned short, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ float y, float level) {
+ TEXTURE_PARAMETERS_INIT;  // presumably declares i, s, texel from textureObject (macro not in view)
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_RETURN_USHORT;  // presumably returns texel as unsigned short (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLod with an explicit textureObject, sampled at the given level; ushort1 result.
+__TEXTURE_FUNCTIONS_DECL__ ushort1 tex2DLod(texture<ushort1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float level) {
+ TEXTURE_PARAMETERS_INIT;  // presumably declares i, s, texel from textureObject (macro not in view)
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_RETURN_USHORT_X;  // presumably returns texel as ushort1 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLod with an explicit textureObject, sampled at the given level; ushort2 result.
+__TEXTURE_FUNCTIONS_DECL__ ushort2 tex2DLod(texture<ushort2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float level) {
+ TEXTURE_PARAMETERS_INIT;  // presumably declares i, s, texel from textureObject (macro not in view)
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_RETURN_USHORT_XY;  // presumably returns texel as ushort2 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLod with an explicit textureObject, sampled at the given level; ushort4 result.
+__TEXTURE_FUNCTIONS_DECL__ ushort4 tex2DLod(texture<ushort4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float level) {
+ TEXTURE_PARAMETERS_INIT;  // presumably declares i, s, texel from textureObject (macro not in view)
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_RETURN_USHORT_XYZW;  // presumably returns texel as ushort4 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLod with an explicit textureObject, sampled at the given level; int result.
+__TEXTURE_FUNCTIONS_DECL__ int tex2DLod(texture<int, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float level) {
+ TEXTURE_PARAMETERS_INIT;  // presumably declares i, s, texel from textureObject (macro not in view)
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_RETURN_INT;  // presumably returns texel as int (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLod with an explicit textureObject, sampled at the given level; int1 result.
+__TEXTURE_FUNCTIONS_DECL__ int1 tex2DLod(texture<int1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float level) {
+ TEXTURE_PARAMETERS_INIT;  // presumably declares i, s, texel from textureObject (macro not in view)
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_RETURN_INT_X;  // presumably returns texel as int1 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLod with an explicit textureObject, sampled at the given level; int2 result.
+__TEXTURE_FUNCTIONS_DECL__ int2 tex2DLod(texture<int2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float level) {
+ TEXTURE_PARAMETERS_INIT;  // presumably declares i, s, texel from textureObject (macro not in view)
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_RETURN_INT_XY;  // presumably returns texel as int2 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLod with an explicit textureObject, sampled at the given level; int4 result.
+__TEXTURE_FUNCTIONS_DECL__ int4 tex2DLod(texture<int4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float level) {
+ TEXTURE_PARAMETERS_INIT;  // presumably declares i, s, texel from textureObject (macro not in view)
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_RETURN_INT_XYZW;  // presumably returns texel as int4 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLod with an explicit textureObject, sampled at the given level; unsigned int result.
+__TEXTURE_FUNCTIONS_DECL__ unsigned int tex2DLod(texture<unsigned int, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float level) {
+ TEXTURE_PARAMETERS_INIT;  // presumably declares i, s, texel from textureObject (macro not in view)
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_RETURN_UINT;  // presumably returns texel as unsigned int (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLod with an explicit textureObject, sampled at the given level; uint1 result.
+__TEXTURE_FUNCTIONS_DECL__ uint1 tex2DLod(texture<uint1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float level) {
+ TEXTURE_PARAMETERS_INIT;  // presumably declares i, s, texel from textureObject (macro not in view)
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_RETURN_UINT_X;  // presumably returns texel as uint1 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLod with an explicit textureObject, sampled at the given level; uint2 result.
+__TEXTURE_FUNCTIONS_DECL__ uint2 tex2DLod(texture<uint2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float level) {
+ TEXTURE_PARAMETERS_INIT;  // presumably declares i, s, texel from textureObject (macro not in view)
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_RETURN_UINT_XY;  // presumably returns texel as uint2 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLod with an explicit textureObject, sampled at the given level; uint4 result.
+__TEXTURE_FUNCTIONS_DECL__ uint4 tex2DLod(texture<uint4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float level) {
+ TEXTURE_PARAMETERS_INIT;  // presumably declares i, s, texel from textureObject (macro not in view)
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_RETURN_UINT_XYZW;  // presumably returns texel as uint4 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLod with an explicit textureObject, sampled at the given level; float result.
+__TEXTURE_FUNCTIONS_DECL__ float tex2DLod(texture<float, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float level) {
+ TEXTURE_PARAMETERS_INIT;  // presumably declares i, s, texel from textureObject (macro not in view)
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_RETURN_FLOAT;  // presumably returns texel as float (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLod with an explicit textureObject, sampled at the given level; float1 result.
+__TEXTURE_FUNCTIONS_DECL__ float1 tex2DLod(texture<float1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float level) {
+ TEXTURE_PARAMETERS_INIT;  // presumably declares i, s, texel from textureObject (macro not in view)
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_RETURN_FLOAT_X;  // presumably returns texel as float1 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLod with an explicit textureObject, sampled at the given level; float2 result.
+__TEXTURE_FUNCTIONS_DECL__ float2 tex2DLod(texture<float2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float level) {
+ TEXTURE_PARAMETERS_INIT;  // presumably declares i, s, texel from textureObject (macro not in view)
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_RETURN_FLOAT_XY;  // presumably returns texel as float2 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLod with an explicit textureObject, sampled at the given level; float4 result.
+__TEXTURE_FUNCTIONS_DECL__ float4 tex2DLod(texture<float4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float level) {
+ TEXTURE_PARAMETERS_INIT;  // presumably declares i, s, texel from textureObject (macro not in view)
+ texel.f = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ TEXTURE_RETURN_FLOAT_XYZW;  // presumably returns texel as float4 (macro not in view)
+}
+
+////////////////////////////////////////////////////////////
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DGrad on a texture reference, sampled with gradients dx/dy; char result.
+__TEXTURE_FUNCTIONS_DECL__ char tex2DGrad(texture<char, texType, mode> texRef, float x, float y,
+ float2 dx, float2 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s, texel from texRef (macro not in view)
+ texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data,
+ float2(dx.x, dx.y).data,
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_CHAR;  // presumably returns texel as char (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DGrad on a texture reference, sampled with gradients dx/dy; char1 result.
+__TEXTURE_FUNCTIONS_DECL__ char1 tex2DGrad(texture<char1, texType, mode> texRef, float x, float y,
+ float2 dx, float2 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s, texel from texRef (macro not in view)
+ texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data,
+ float2(dx.x, dx.y).data,
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_CHAR_X;  // presumably returns texel as char1 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DGrad on a texture reference, sampled with gradients dx/dy; char2 result.
+__TEXTURE_FUNCTIONS_DECL__ char2 tex2DGrad(texture<char2, texType, mode> texRef, float x, float y,
+ float2 dx, float2 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s, texel from texRef (macro not in view)
+ texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data,
+ float2(dx.x, dx.y).data,
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_CHAR_XY;  // presumably returns texel as char2 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DGrad on a texture reference, sampled with gradients dx/dy; char4 result.
+__TEXTURE_FUNCTIONS_DECL__ char4 tex2DGrad(texture<char4, texType, mode> texRef, float x, float y,
+ float2 dx, float2 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s, texel from texRef (macro not in view)
+ texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data,
+ float2(dx.x, dx.y).data,
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_CHAR_XYZW;  // presumably returns texel as char4 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DGrad on a texture reference, sampled with gradients dx/dy; unsigned char result.
+__TEXTURE_FUNCTIONS_DECL__ unsigned char tex2DGrad(texture<unsigned char, texType, mode> texRef,
+ float x, float y, float2 dx, float2 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s, texel from texRef (macro not in view)
+ texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data,
+ float2(dx.x, dx.y).data,
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_UCHAR;  // presumably returns texel as unsigned char (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DGrad on a texture reference, sampled with gradients dx/dy; uchar1 result.
+__TEXTURE_FUNCTIONS_DECL__ uchar1 tex2DGrad(texture<uchar1, texType, mode> texRef, float x, float y,
+ float2 dx, float2 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s, texel from texRef (macro not in view)
+ texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data,
+ float2(dx.x, dx.y).data,
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_UCHAR_X;  // presumably returns texel as uchar1 (macro not in view)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DGrad, texture-reference form, uchar2 texels.
+__TEXTURE_FUNCTIONS_DECL__ uchar2 tex2DGrad(texture<uchar2, texType, mode> texRef, float x, float y,
+ float2 dx, float2 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from texRef (TODO confirm).
+ texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data,  // Passes the (x, y) pair and the dx/dy pairs below; result lands in texel.f.
+ float2(dx.x, dx.y).data,
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_UCHAR_XY;  // Defined outside this view; presumably converts texel to uchar2 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DGrad, texture-reference form, uchar4 texels.
+__TEXTURE_FUNCTIONS_DECL__ uchar4 tex2DGrad(texture<uchar4, texType, mode> texRef, float x, float y,
+ float2 dx, float2 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from texRef (TODO confirm).
+ texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data,  // Passes the (x, y) pair and the dx/dy pairs below; result lands in texel.f.
+ float2(dx.x, dx.y).data,
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_UCHAR_XYZW;  // Defined outside this view; presumably converts texel to uchar4 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DGrad, texture-reference form, short texels.
+__TEXTURE_FUNCTIONS_DECL__ short tex2DGrad(texture<short, texType, mode> texRef, float x, float y,
+ float2 dx, float2 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from texRef (TODO confirm).
+ texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data,  // Passes the (x, y) pair and the dx/dy pairs below; result lands in texel.f.
+ float2(dx.x, dx.y).data,
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_SHORT;  // Defined outside this view; presumably converts texel to short and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DGrad, texture-reference form, short1 texels.
+__TEXTURE_FUNCTIONS_DECL__ short1 tex2DGrad(texture<short1, texType, mode> texRef, float x, float y,
+ float2 dx, float2 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from texRef (TODO confirm).
+ texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data,  // Passes the (x, y) pair and the dx/dy pairs below; result lands in texel.f.
+ float2(dx.x, dx.y).data,
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_SHORT_X;  // Defined outside this view; presumably converts texel to short1 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DGrad, texture-reference form, short2 texels.
+__TEXTURE_FUNCTIONS_DECL__ short2 tex2DGrad(texture<short2, texType, mode> texRef, float x, float y,
+ float2 dx, float2 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from texRef (TODO confirm).
+ texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data,  // Passes the (x, y) pair and the dx/dy pairs below; result lands in texel.f.
+ float2(dx.x, dx.y).data,
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_SHORT_XY;  // Defined outside this view; presumably converts texel to short2 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DGrad, texture-reference form, short4 texels.
+__TEXTURE_FUNCTIONS_DECL__ short4 tex2DGrad(texture<short4, texType, mode> texRef, float x, float y,
+ float2 dx, float2 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from texRef (TODO confirm).
+ texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data,  // Passes the (x, y) pair and the dx/dy pairs below; result lands in texel.f.
+ float2(dx.x, dx.y).data,
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_SHORT_XYZW;  // Defined outside this view; presumably converts texel to short4 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DGrad, texture-reference form, unsigned short texels.
+__TEXTURE_FUNCTIONS_DECL__ unsigned short tex2DGrad(texture<unsigned short, texType, mode> texRef,
+ float x, float y, float2 dx, float2 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from texRef (TODO confirm).
+ texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data,  // Passes the (x, y) pair and the dx/dy pairs below; result lands in texel.f.
+ float2(dx.x, dx.y).data,
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_USHORT;  // Defined outside this view; presumably converts texel to unsigned short and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DGrad, texture-reference form, ushort1 texels.
+__TEXTURE_FUNCTIONS_DECL__ ushort1 tex2DGrad(texture<ushort1, texType, mode> texRef, float x,
+ float y, float2 dx, float2 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from texRef (TODO confirm).
+ texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data,  // Passes the (x, y) pair and the dx/dy pairs below; result lands in texel.f.
+ float2(dx.x, dx.y).data,
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_USHORT_X;  // Defined outside this view; presumably converts texel to ushort1 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DGrad, texture-reference form, ushort2 texels.
+__TEXTURE_FUNCTIONS_DECL__ ushort2 tex2DGrad(texture<ushort2, texType, mode> texRef, float x,
+ float y, float2 dx, float2 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from texRef (TODO confirm).
+ texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data,  // Passes the (x, y) pair and the dx/dy pairs below; result lands in texel.f.
+ float2(dx.x, dx.y).data,
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_USHORT_XY;  // Defined outside this view; presumably converts texel to ushort2 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DGrad, texture-reference form, ushort4 texels.
+__TEXTURE_FUNCTIONS_DECL__ ushort4 tex2DGrad(texture<ushort4, texType, mode> texRef, float x,
+ float y, float2 dx, float2 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from texRef (TODO confirm).
+ texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data,  // Passes the (x, y) pair and the dx/dy pairs below; result lands in texel.f.
+ float2(dx.x, dx.y).data,
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_USHORT_XYZW;  // Defined outside this view; presumably converts texel to ushort4 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DGrad, texture-reference form, int texels.
+__TEXTURE_FUNCTIONS_DECL__ int tex2DGrad(texture<int, texType, mode> texRef, float x, float y,
+ float2 dx, float2 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from texRef (TODO confirm).
+ texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data,  // Passes the (x, y) pair and the dx/dy pairs below; result lands in texel.f.
+ float2(dx.x, dx.y).data,
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_INT;  // Defined outside this view; presumably converts texel to int and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DGrad, texture-reference form, int1 texels.
+__TEXTURE_FUNCTIONS_DECL__ int1 tex2DGrad(texture<int1, texType, mode> texRef, float x, float y,
+ float2 dx, float2 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from texRef (TODO confirm).
+ texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data,  // Passes the (x, y) pair and the dx/dy pairs below; result lands in texel.f.
+ float2(dx.x, dx.y).data,
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_INT_X;  // Defined outside this view; presumably converts texel to int1 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DGrad, texture-reference form, int2 texels.
+__TEXTURE_FUNCTIONS_DECL__ int2 tex2DGrad(texture<int2, texType, mode> texRef, float x, float y,
+ float2 dx, float2 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from texRef (TODO confirm).
+ texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data,  // Passes the (x, y) pair and the dx/dy pairs below; result lands in texel.f.
+ float2(dx.x, dx.y).data,
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_INT_XY;  // Defined outside this view; presumably converts texel to int2 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DGrad, texture-reference form, int4 texels.
+__TEXTURE_FUNCTIONS_DECL__ int4 tex2DGrad(texture<int4, texType, mode> texRef, float x, float y,
+ float2 dx, float2 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from texRef (TODO confirm).
+ texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data,  // Passes the (x, y) pair and the dx/dy pairs below; result lands in texel.f.
+ float2(dx.x, dx.y).data,
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_INT_XYZW;  // Defined outside this view; presumably converts texel to int4 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DGrad, texture-reference form, unsigned int texels.
+__TEXTURE_FUNCTIONS_DECL__ unsigned int tex2DGrad(texture<unsigned int, texType, mode> texRef,
+ float x, float y, float2 dx, float2 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from texRef (TODO confirm).
+ texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data,  // Passes the (x, y) pair and the dx/dy pairs below; result lands in texel.f.
+ float2(dx.x, dx.y).data,
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_UINT;  // Defined outside this view; presumably converts texel to unsigned int and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DGrad, texture-reference form, uint1 texels.
+__TEXTURE_FUNCTIONS_DECL__ uint1 tex2DGrad(texture<uint1, texType, mode> texRef, float x, float y,
+ float2 dx, float2 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from texRef (TODO confirm).
+ texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data,  // Passes the (x, y) pair and the dx/dy pairs below; result lands in texel.f.
+ float2(dx.x, dx.y).data,
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_UINT_X;  // Defined outside this view; presumably converts texel to uint1 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DGrad, texture-reference form, uint2 texels.
+__TEXTURE_FUNCTIONS_DECL__ uint2 tex2DGrad(texture<uint2, texType, mode> texRef, float x, float y,
+ float2 dx, float2 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from texRef (TODO confirm).
+ texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data,  // Passes the (x, y) pair and the dx/dy pairs below; result lands in texel.f.
+ float2(dx.x, dx.y).data,
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_UINT_XY;  // Defined outside this view; presumably converts texel to uint2 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DGrad, texture-reference form, uint4 texels.
+__TEXTURE_FUNCTIONS_DECL__ uint4 tex2DGrad(texture<uint4, texType, mode> texRef, float x, float y,
+ float2 dx, float2 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from texRef (TODO confirm).
+ texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data,  // Passes the (x, y) pair and the dx/dy pairs below; result lands in texel.f.
+ float2(dx.x, dx.y).data,
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_UINT_XYZW;  // Defined outside this view; presumably converts texel to uint4 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DGrad, texture-reference form, float texels.
+__TEXTURE_FUNCTIONS_DECL__ float tex2DGrad(texture<float, texType, mode> texRef, float x, float y,
+ float2 dx, float2 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from texRef (TODO confirm).
+ texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data,  // Passes the (x, y) pair and the dx/dy pairs below; result lands in texel.f.
+ float2(dx.x, dx.y).data,
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_FLOAT;  // Defined outside this view; presumably converts texel to float and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DGrad, texture-reference form, float1 texels.
+__TEXTURE_FUNCTIONS_DECL__ float1 tex2DGrad(texture<float1, texType, mode> texRef, float x, float y,
+ float2 dx, float2 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from texRef (TODO confirm).
+ texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data,  // Passes the (x, y) pair and the dx/dy pairs below; result lands in texel.f.
+ float2(dx.x, dx.y).data,
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_FLOAT_X;  // Defined outside this view; presumably converts texel to float1 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DGrad, texture-reference form, float2 texels.
+__TEXTURE_FUNCTIONS_DECL__ float2 tex2DGrad(texture<float2, texType, mode> texRef, float x, float y,
+ float2 dx, float2 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from texRef (TODO confirm).
+ texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data,  // Passes the (x, y) pair and the dx/dy pairs below; result lands in texel.f.
+ float2(dx.x, dx.y).data,
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_FLOAT_XY;  // Defined outside this view; presumably converts texel to float2 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DGrad, texture-reference form, float4 texels.
+__TEXTURE_FUNCTIONS_DECL__ float4 tex2DGrad(texture<float4, texType, mode> texRef, float x, float y,
+ float2 dx, float2 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from texRef (TODO confirm).
+ texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data,  // Passes the (x, y) pair and the dx/dy pairs below; result lands in texel.f.
+ float2(dx.x, dx.y).data,
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_FLOAT_XYZW;  // Defined outside this view; presumably converts texel to float4 and returns it (TODO confirm).
+}
+
+////////////////////////////////////////////////////////////
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DGrad, explicit hipTextureObject_t form, char texels.
+__TEXTURE_FUNCTIONS_DECL__ char tex2DGrad(texture<char, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float2 dx, float2 dy) {
+ TEXTURE_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from textureObject (TODO confirm).
+ texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data,  // Passes the (x, y) pair and the dx/dy pairs below; result lands in texel.f.
+ float2(dx.x, dx.y).data,
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_CHAR;  // Defined outside this view; presumably converts texel to char and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DGrad, explicit hipTextureObject_t form, char1 texels.
+__TEXTURE_FUNCTIONS_DECL__ char1 tex2DGrad(texture<char1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float2 dx, float2 dy) {
+ TEXTURE_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from textureObject (TODO confirm).
+ texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data,  // Passes the (x, y) pair and the dx/dy pairs below; result lands in texel.f.
+ float2(dx.x, dx.y).data,
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_CHAR_X;  // Defined outside this view; presumably converts texel to char1 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DGrad, explicit hipTextureObject_t form, char2 texels.
+__TEXTURE_FUNCTIONS_DECL__ char2 tex2DGrad(texture<char2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float2 dx, float2 dy) {
+ TEXTURE_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from textureObject (TODO confirm).
+ texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data,  // Passes the (x, y) pair and the dx/dy pairs below; result lands in texel.f.
+ float2(dx.x, dx.y).data,
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_CHAR_XY;  // Defined outside this view; presumably converts texel to char2 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DGrad, explicit hipTextureObject_t form, char4 texels.
+__TEXTURE_FUNCTIONS_DECL__ char4 tex2DGrad(texture<char4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float2 dx, float2 dy) {
+ TEXTURE_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from textureObject (TODO confirm).
+ texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data,  // Passes the (x, y) pair and the dx/dy pairs below; result lands in texel.f.
+ float2(dx.x, dx.y).data,
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_CHAR_XYZW;  // Defined outside this view; presumably converts texel to char4 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DGrad, explicit hipTextureObject_t form, unsigned char texels.
+__TEXTURE_FUNCTIONS_DECL__ unsigned char tex2DGrad(texture<unsigned char, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ float y, float2 dx, float2 dy) {
+ TEXTURE_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from textureObject (TODO confirm).
+ texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data,  // Passes the (x, y) pair and the dx/dy pairs below; result lands in texel.f.
+ float2(dx.x, dx.y).data,
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_UCHAR;  // Defined outside this view; presumably converts texel to unsigned char and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DGrad, explicit hipTextureObject_t form, uchar1 texels.
+__TEXTURE_FUNCTIONS_DECL__ uchar1 tex2DGrad(texture<uchar1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float2 dx, float2 dy) {
+ TEXTURE_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from textureObject (TODO confirm).
+ texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data,  // Passes the (x, y) pair and the dx/dy pairs below; result lands in texel.f.
+ float2(dx.x, dx.y).data,
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_UCHAR_X;  // Defined outside this view; presumably converts texel to uchar1 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DGrad, explicit hipTextureObject_t form, uchar2 texels.
+__TEXTURE_FUNCTIONS_DECL__ uchar2 tex2DGrad(texture<uchar2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float2 dx, float2 dy) {
+ TEXTURE_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from textureObject (TODO confirm).
+ texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data,  // Passes the (x, y) pair and the dx/dy pairs below; result lands in texel.f.
+ float2(dx.x, dx.y).data,
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_UCHAR_XY;  // Defined outside this view; presumably converts texel to uchar2 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DGrad, explicit hipTextureObject_t form, uchar4 texels.
+__TEXTURE_FUNCTIONS_DECL__ uchar4 tex2DGrad(texture<uchar4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float2 dx, float2 dy) {
+ TEXTURE_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from textureObject (TODO confirm).
+ texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data,  // Passes the (x, y) pair and the dx/dy pairs below; result lands in texel.f.
+ float2(dx.x, dx.y).data,
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_UCHAR_XYZW;  // Defined outside this view; presumably converts texel to uchar4 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DGrad, explicit hipTextureObject_t form, short texels.
+__TEXTURE_FUNCTIONS_DECL__ short tex2DGrad(texture<short, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float2 dx, float2 dy) {
+ TEXTURE_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from textureObject (TODO confirm).
+ texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data,  // Passes the (x, y) pair and the dx/dy pairs below; result lands in texel.f.
+ float2(dx.x, dx.y).data,
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_SHORT;  // Defined outside this view; presumably converts texel to short and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DGrad, explicit hipTextureObject_t form, short1 texels.
+__TEXTURE_FUNCTIONS_DECL__ short1 tex2DGrad(texture<short1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float2 dx, float2 dy) {
+ TEXTURE_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from textureObject (TODO confirm).
+ texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data,  // Passes the (x, y) pair and the dx/dy pairs below; result lands in texel.f.
+ float2(dx.x, dx.y).data,
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_SHORT_X;  // Defined outside this view; presumably converts texel to short1 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DGrad, explicit hipTextureObject_t form, short2 texels.
+__TEXTURE_FUNCTIONS_DECL__ short2 tex2DGrad(texture<short2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float2 dx, float2 dy) {
+ TEXTURE_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from textureObject (TODO confirm).
+ texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data,  // Passes the (x, y) pair and the dx/dy pairs below; result lands in texel.f.
+ float2(dx.x, dx.y).data,
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_SHORT_XY;  // Defined outside this view; presumably converts texel to short2 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DGrad, explicit hipTextureObject_t form, short4 texels.
+__TEXTURE_FUNCTIONS_DECL__ short4 tex2DGrad(texture<short4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float2 dx, float2 dy) {
+ TEXTURE_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from textureObject (TODO confirm).
+ texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data,  // Passes the (x, y) pair and the dx/dy pairs below; result lands in texel.f.
+ float2(dx.x, dx.y).data,
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_SHORT_XYZW;  // Defined outside this view; presumably converts texel to short4 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DGrad, explicit hipTextureObject_t form, unsigned short texels.
+__TEXTURE_FUNCTIONS_DECL__ unsigned short tex2DGrad(texture<unsigned short, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ float y, float2 dx, float2 dy) {
+ TEXTURE_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from textureObject (TODO confirm).
+ texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data,  // Passes the (x, y) pair and the dx/dy pairs below; result lands in texel.f.
+ float2(dx.x, dx.y).data,
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_USHORT;  // Defined outside this view; presumably converts texel to unsigned short and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DGrad, explicit hipTextureObject_t form, ushort1 texels.
+__TEXTURE_FUNCTIONS_DECL__ ushort1 tex2DGrad(texture<ushort1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float2 dx, float2 dy) {
+ TEXTURE_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from textureObject (TODO confirm).
+ texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data,  // Passes the (x, y) pair and the dx/dy pairs below; result lands in texel.f.
+ float2(dx.x, dx.y).data,
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_USHORT_X;  // Defined outside this view; presumably converts texel to ushort1 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DGrad, explicit hipTextureObject_t form, ushort2 texels.
+__TEXTURE_FUNCTIONS_DECL__ ushort2 tex2DGrad(texture<ushort2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float2 dx, float2 dy) {
+ TEXTURE_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from textureObject (TODO confirm).
+ texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data,  // Passes the (x, y) pair and the dx/dy pairs below; result lands in texel.f.
+ float2(dx.x, dx.y).data,
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_USHORT_XY;  // Defined outside this view; presumably converts texel to ushort2 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DGrad, explicit hipTextureObject_t form, ushort4 texels.
+__TEXTURE_FUNCTIONS_DECL__ ushort4 tex2DGrad(texture<ushort4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float2 dx, float2 dy) {
+ TEXTURE_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from textureObject (TODO confirm).
+ texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data,  // Passes the (x, y) pair and the dx/dy pairs below; result lands in texel.f.
+ float2(dx.x, dx.y).data,
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_USHORT_XYZW;  // Defined outside this view; presumably converts texel to ushort4 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DGrad, explicit hipTextureObject_t form, int texels.
+__TEXTURE_FUNCTIONS_DECL__ int tex2DGrad(texture<int, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float2 dx, float2 dy) {
+ TEXTURE_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from textureObject (TODO confirm).
+ texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data,  // Passes the (x, y) pair and the dx/dy pairs below; result lands in texel.f.
+ float2(dx.x, dx.y).data,
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_INT;  // Defined outside this view; presumably converts texel to int and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DGrad, explicit hipTextureObject_t form, int1 texels.
+__TEXTURE_FUNCTIONS_DECL__ int1 tex2DGrad(texture<int1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float2 dx, float2 dy) {
+ TEXTURE_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from textureObject (TODO confirm).
+ texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data,  // Passes the (x, y) pair and the dx/dy pairs below; result lands in texel.f.
+ float2(dx.x, dx.y).data,
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_INT_X;  // Defined outside this view; presumably converts texel to int1 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DGrad, explicit hipTextureObject_t form, int2 texels.
+__TEXTURE_FUNCTIONS_DECL__ int2 tex2DGrad(texture<int2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float2 dx, float2 dy) {
+ TEXTURE_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from textureObject (TODO confirm).
+ texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data,  // Passes the (x, y) pair and the dx/dy pairs below; result lands in texel.f.
+ float2(dx.x, dx.y).data,
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_INT_XY;  // Defined outside this view; presumably converts texel to int2 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DGrad, explicit hipTextureObject_t form, int4 texels.
+__TEXTURE_FUNCTIONS_DECL__ int4 tex2DGrad(texture<int4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float2 dx, float2 dy) {
+ TEXTURE_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from textureObject (TODO confirm).
+ texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data,  // Passes the (x, y) pair and the dx/dy pairs below; result lands in texel.f.
+ float2(dx.x, dx.y).data,
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_INT_XYZW;  // Defined outside this view; presumably converts texel to int4 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DGrad, explicit hipTextureObject_t form, unsigned int texels.
+__TEXTURE_FUNCTIONS_DECL__ unsigned int tex2DGrad(texture<unsigned int, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ float y, float2 dx, float2 dy) {
+ TEXTURE_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from textureObject (TODO confirm).
+ texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data,  // Passes the (x, y) pair and the dx/dy pairs below; result lands in texel.f.
+ float2(dx.x, dx.y).data,
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_UINT;  // Defined outside this view; presumably converts texel to unsigned int and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DGrad, explicit hipTextureObject_t form, uint1 texels.
+__TEXTURE_FUNCTIONS_DECL__ uint1 tex2DGrad(texture<uint1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float2 dx, float2 dy) {
+ TEXTURE_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from textureObject (TODO confirm).
+ texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data,  // Passes the (x, y) pair and the dx/dy pairs below; result lands in texel.f.
+ float2(dx.x, dx.y).data,
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_UINT_X;  // Defined outside this view; presumably converts texel to uint1 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DGrad, explicit hipTextureObject_t form, uint2 texels.
+__TEXTURE_FUNCTIONS_DECL__ uint2 tex2DGrad(texture<uint2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float2 dx, float2 dy) {
+ TEXTURE_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from textureObject (TODO confirm).
+ texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data,  // Passes the (x, y) pair and the dx/dy pairs below; result lands in texel.f.
+ float2(dx.x, dx.y).data,
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_UINT_XY;  // Defined outside this view; presumably converts texel to uint2 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DGrad, explicit hipTextureObject_t form, uint4 texels.
+__TEXTURE_FUNCTIONS_DECL__ uint4 tex2DGrad(texture<uint4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float2 dx, float2 dy) {
+ TEXTURE_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from textureObject (TODO confirm).
+ texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data,  // Passes the (x, y) pair and the dx/dy pairs below; result lands in texel.f.
+ float2(dx.x, dx.y).data,
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_UINT_XYZW;  // Defined outside this view; presumably converts texel to uint4 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DGrad, explicit hipTextureObject_t form, float texels.
+__TEXTURE_FUNCTIONS_DECL__ float tex2DGrad(texture<float, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float2 dx, float2 dy) {
+ TEXTURE_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from textureObject (TODO confirm).
+ texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data,  // Passes the (x, y) pair and the dx/dy pairs below; result lands in texel.f.
+ float2(dx.x, dx.y).data,
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_FLOAT;  // Defined outside this view; presumably converts texel to float and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DGrad, explicit hipTextureObject_t form, float1 texels.
+__TEXTURE_FUNCTIONS_DECL__ float1 tex2DGrad(texture<float1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float2 dx, float2 dy) {
+ TEXTURE_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from textureObject (TODO confirm).
+ texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data,  // Passes the (x, y) pair and the dx/dy pairs below; result lands in texel.f.
+ float2(dx.x, dx.y).data,
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_FLOAT_X;  // Defined outside this view; presumably converts texel to float1 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DGrad, explicit hipTextureObject_t form, float2 texels.
+__TEXTURE_FUNCTIONS_DECL__ float2 tex2DGrad(texture<float2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float2 dx, float2 dy) {
+ TEXTURE_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from textureObject (TODO confirm).
+ texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data,  // Passes the (x, y) pair and the dx/dy pairs below; result lands in texel.f.
+ float2(dx.x, dx.y).data,
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_FLOAT_XY;  // Defined outside this view; presumably converts texel to float2 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DGrad, explicit hipTextureObject_t form, float4 texels.
+__TEXTURE_FUNCTIONS_DECL__ float4 tex2DGrad(texture<float4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float2 dx, float2 dy) {
+ TEXTURE_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from textureObject (TODO confirm).
+ texel.f = __ockl_image_sample_grad_2D(i, s, float2(x, y).data,  // Passes the (x, y) pair and the dx/dy pairs below; result lands in texel.f.
+ float2(dx.x, dx.y).data,
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_FLOAT_XYZW;  // Defined outside this view; presumably converts texel to float4 and returns it (TODO confirm).
+}
+
+////////////////////////////////////////////////////////////
+
+template <int texType, enum hipTextureReadMode mode>  // tex3D, texture-reference form, char texels.
+__TEXTURE_FUNCTIONS_DECL__ char tex3D(texture<char, texType, mode> texRef, float x, float y,
+ float z) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from texRef (TODO confirm).
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);  // (x, y, z) packed into a float4 whose fourth lane is 0.0f.
+ TEXTURE_RETURN_CHAR;  // Defined outside this view; presumably converts texel to char and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3D, texture-reference form, char1 texels.
+__TEXTURE_FUNCTIONS_DECL__ char1 tex3D(texture<char1, texType, mode> texRef, float x, float y,
+ float z) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from texRef (TODO confirm).
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);  // (x, y, z) packed into a float4 whose fourth lane is 0.0f.
+ TEXTURE_RETURN_CHAR_X;  // Defined outside this view; presumably converts texel to char1 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3D, texture-reference form, char2 texels.
+__TEXTURE_FUNCTIONS_DECL__ char2 tex3D(texture<char2, texType, mode> texRef, float x, float y,
+ float z) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from texRef (TODO confirm).
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);  // (x, y, z) packed into a float4 whose fourth lane is 0.0f.
+ TEXTURE_RETURN_CHAR_XY;  // Defined outside this view; presumably converts texel to char2 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3D, texture-reference form, char4 texels.
+__TEXTURE_FUNCTIONS_DECL__ char4 tex3D(texture<char4, texType, mode> texRef, float x, float y,
+ float z) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from texRef (TODO confirm).
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);  // (x, y, z) packed into a float4 whose fourth lane is 0.0f.
+ TEXTURE_RETURN_CHAR_XYZW;  // Defined outside this view; presumably converts texel to char4 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3D, texture-reference form, unsigned char texels.
+__TEXTURE_FUNCTIONS_DECL__ unsigned char tex3D(texture<unsigned char, texType, mode> texRef,
+ float x, float y, float z) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from texRef (TODO confirm).
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);  // (x, y, z) packed into a float4 whose fourth lane is 0.0f.
+ TEXTURE_RETURN_UCHAR;  // Defined outside this view; presumably converts texel to unsigned char and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3D, texture-reference form, uchar1 texels.
+__TEXTURE_FUNCTIONS_DECL__ uchar1 tex3D(texture<uchar1, texType, mode> texRef, float x, float y,
+ float z) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from texRef (TODO confirm).
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);  // (x, y, z) packed into a float4 whose fourth lane is 0.0f.
+ TEXTURE_RETURN_UCHAR_X;  // Defined outside this view; presumably converts texel to uchar1 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3D, texture-reference form, uchar2 texels.
+__TEXTURE_FUNCTIONS_DECL__ uchar2 tex3D(texture<uchar2, texType, mode> texRef, float x, float y,
+ float z) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from texRef (TODO confirm).
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);  // (x, y, z) packed into a float4 whose fourth lane is 0.0f.
+ TEXTURE_RETURN_UCHAR_XY;  // Defined outside this view; presumably converts texel to uchar2 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3D, texture-reference form, uchar4 texels.
+__TEXTURE_FUNCTIONS_DECL__ uchar4 tex3D(texture<uchar4, texType, mode> texRef, float x, float y,
+ float z) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from texRef (TODO confirm).
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);  // (x, y, z) packed into a float4 whose fourth lane is 0.0f.
+ TEXTURE_RETURN_UCHAR_XYZW;  // Defined outside this view; presumably converts texel to uchar4 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3D, texture-reference form, short texels.
+__TEXTURE_FUNCTIONS_DECL__ short tex3D(texture<short, texType, mode> texRef, float x, float y,
+ float z) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from texRef (TODO confirm).
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);  // (x, y, z) packed into a float4 whose fourth lane is 0.0f.
+ TEXTURE_RETURN_SHORT;  // Defined outside this view; presumably converts texel to short and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3D, texture-reference form, short1 texels.
+__TEXTURE_FUNCTIONS_DECL__ short1 tex3D(texture<short1, texType, mode> texRef, float x, float y,
+ float z) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from texRef (TODO confirm).
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);  // (x, y, z) packed into a float4 whose fourth lane is 0.0f.
+ TEXTURE_RETURN_SHORT_X;  // Defined outside this view; presumably converts texel to short1 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3D, texture-reference form, short2 texels.
+__TEXTURE_FUNCTIONS_DECL__ short2 tex3D(texture<short2, texType, mode> texRef, float x, float y,
+ float z) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from texRef (TODO confirm).
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);  // (x, y, z) packed into a float4 whose fourth lane is 0.0f.
+ TEXTURE_RETURN_SHORT_XY;  // Defined outside this view; presumably converts texel to short2 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3D, texture-reference form, short4 texels.
+__TEXTURE_FUNCTIONS_DECL__ short4 tex3D(texture<short4, texType, mode> texRef, float x, float y,
+ float z) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from texRef (TODO confirm).
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);  // (x, y, z) packed into a float4 whose fourth lane is 0.0f.
+ TEXTURE_RETURN_SHORT_XYZW;  // Defined outside this view; presumably converts texel to short4 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3D, texture-reference form, unsigned short texels.
+__TEXTURE_FUNCTIONS_DECL__ unsigned short tex3D(texture<unsigned short, texType, mode> texRef,
+ float x, float y, float z) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from texRef (TODO confirm).
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);  // (x, y, z) packed into a float4 whose fourth lane is 0.0f.
+ TEXTURE_RETURN_USHORT;  // Defined outside this view; presumably converts texel to unsigned short and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3D, texture-reference form, ushort1 texels.
+__TEXTURE_FUNCTIONS_DECL__ ushort1 tex3D(texture<ushort1, texType, mode> texRef, float x, float y,
+ float z) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from texRef (TODO confirm).
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);  // (x, y, z) packed into a float4 whose fourth lane is 0.0f.
+ TEXTURE_RETURN_USHORT_X;  // Defined outside this view; presumably converts texel to ushort1 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3D, texture-reference form, ushort2 texels.
+__TEXTURE_FUNCTIONS_DECL__ ushort2 tex3D(texture<ushort2, texType, mode> texRef, float x, float y,
+ float z) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from texRef (TODO confirm).
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);  // (x, y, z) packed into a float4 whose fourth lane is 0.0f.
+ TEXTURE_RETURN_USHORT_XY;  // Defined outside this view; presumably converts texel to ushort2 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3D, texture-reference form, ushort4 texels.
+__TEXTURE_FUNCTIONS_DECL__ ushort4 tex3D(texture<ushort4, texType, mode> texRef, float x, float y,
+ float z) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from texRef (TODO confirm).
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);  // (x, y, z) packed into a float4 whose fourth lane is 0.0f.
+ TEXTURE_RETURN_USHORT_XYZW;  // Defined outside this view; presumably converts texel to ushort4 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3D, texture-reference form, int texels.
+__TEXTURE_FUNCTIONS_DECL__ int tex3D(texture<int, texType, mode> texRef, float x, float y,
+ float z) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from texRef (TODO confirm).
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);  // (x, y, z) packed into a float4 whose fourth lane is 0.0f.
+ TEXTURE_RETURN_INT;  // Defined outside this view; presumably converts texel to int and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3D, texture-reference form, int1 texels.
+__TEXTURE_FUNCTIONS_DECL__ int1 tex3D(texture<int1, texType, mode> texRef, float x, float y,
+ float z) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from texRef (TODO confirm).
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);  // (x, y, z) packed into a float4 whose fourth lane is 0.0f.
+ TEXTURE_RETURN_INT_X;  // Defined outside this view; presumably converts texel to int1 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3D, texture-reference form, int2 texels.
+__TEXTURE_FUNCTIONS_DECL__ int2 tex3D(texture<int2, texType, mode> texRef, float x, float y,
+ float z) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from texRef (TODO confirm).
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);  // (x, y, z) packed into a float4 whose fourth lane is 0.0f.
+ TEXTURE_RETURN_INT_XY;  // Defined outside this view; presumably converts texel to int2 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3D, texture-reference form, int4 texels.
+__TEXTURE_FUNCTIONS_DECL__ int4 tex3D(texture<int4, texType, mode> texRef, float x, float y,
+ float z) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from texRef (TODO confirm).
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);  // (x, y, z) packed into a float4 whose fourth lane is 0.0f.
+ TEXTURE_RETURN_INT_XYZW;  // Defined outside this view; presumably converts texel to int4 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3D, texture-reference form, unsigned int texels.
+__TEXTURE_FUNCTIONS_DECL__ unsigned int tex3D(texture<unsigned int, texType, mode> texRef, float x,
+ float y, float z) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from texRef (TODO confirm).
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);  // (x, y, z) packed into a float4 whose fourth lane is 0.0f.
+ TEXTURE_RETURN_UINT;  // Defined outside this view; presumably converts texel to unsigned int and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3D, texture-reference form, uint1 texels.
+__TEXTURE_FUNCTIONS_DECL__ uint1 tex3D(texture<uint1, texType, mode> texRef, float x, float y,
+ float z) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from texRef (TODO confirm).
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);  // (x, y, z) packed into a float4 whose fourth lane is 0.0f.
+ TEXTURE_RETURN_UINT_X;  // Defined outside this view; presumably converts texel to uint1 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3D, texture-reference form, uint2 texels.
+__TEXTURE_FUNCTIONS_DECL__ uint2 tex3D(texture<uint2, texType, mode> texRef, float x, float y,
+ float z) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from texRef (TODO confirm).
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);  // (x, y, z) packed into a float4 whose fourth lane is 0.0f.
+ TEXTURE_RETURN_UINT_XY;  // Defined outside this view; presumably converts texel to uint2 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3D, texture-reference form, uint4 texels.
+__TEXTURE_FUNCTIONS_DECL__ uint4 tex3D(texture<uint4, texType, mode> texRef, float x, float y,
+ float z) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from texRef (TODO confirm).
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);  // (x, y, z) packed into a float4 whose fourth lane is 0.0f.
+ TEXTURE_RETURN_UINT_XYZW;  // Defined outside this view; presumably converts texel to uint4 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3D, texture-reference form, float texels.
+__TEXTURE_FUNCTIONS_DECL__ float tex3D(texture<float, texType, mode> texRef, float x, float y,
+ float z) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from texRef (TODO confirm).
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);  // (x, y, z) packed into a float4 whose fourth lane is 0.0f.
+ TEXTURE_RETURN_FLOAT;  // Defined outside this view; presumably converts texel to float and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3D, texture-reference form, float1 texels.
+__TEXTURE_FUNCTIONS_DECL__ float1 tex3D(texture<float1, texType, mode> texRef, float x, float y,
+ float z) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from texRef (TODO confirm).
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);  // (x, y, z) packed into a float4 whose fourth lane is 0.0f.
+ TEXTURE_RETURN_FLOAT_X;  // Defined outside this view; presumably converts texel to float1 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3D, texture-reference form, float2 texels.
+__TEXTURE_FUNCTIONS_DECL__ float2 tex3D(texture<float2, texType, mode> texRef, float x, float y,
+ float z) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from texRef (TODO confirm).
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);  // (x, y, z) packed into a float4 whose fourth lane is 0.0f.
+ TEXTURE_RETURN_FLOAT_XY;  // Defined outside this view; presumably converts texel to float2 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3D, texture-reference form, float4 texels.
+__TEXTURE_FUNCTIONS_DECL__ float4 tex3D(texture<float4, texType, mode> texRef, float x, float y,
+ float z) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from texRef (TODO confirm).
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);  // (x, y, z) packed into a float4 whose fourth lane is 0.0f.
+ TEXTURE_RETURN_FLOAT_XYZW;  // Defined outside this view; presumably converts texel to float4 and returns it (TODO confirm).
+}
+
+////////////////////////////////////////////////////////////
+
+template <int texType, enum hipTextureReadMode mode>  // tex3D, explicit hipTextureObject_t form, char texels.
+__TEXTURE_FUNCTIONS_DECL__ char tex3D(texture<char, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y, float z) {
+ TEXTURE_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from textureObject (TODO confirm).
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);  // (x, y, z) packed into a float4 whose fourth lane is 0.0f.
+ TEXTURE_RETURN_CHAR;  // Defined outside this view; presumably converts texel to char and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3D, explicit hipTextureObject_t form, char1 texels.
+__TEXTURE_FUNCTIONS_DECL__ char1 tex3D(texture<char1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z) {
+ TEXTURE_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from textureObject (TODO confirm).
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);  // (x, y, z) packed into a float4 whose fourth lane is 0.0f.
+ TEXTURE_RETURN_CHAR_X;  // Defined outside this view; presumably converts texel to char1 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3D, explicit hipTextureObject_t form, char2 texels.
+__TEXTURE_FUNCTIONS_DECL__ char2 tex3D(texture<char2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z) {
+ TEXTURE_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from textureObject (TODO confirm).
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);  // (x, y, z) packed into a float4 whose fourth lane is 0.0f.
+ TEXTURE_RETURN_CHAR_XY;  // Defined outside this view; presumably converts texel to char2 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3D, explicit hipTextureObject_t form, char4 texels.
+__TEXTURE_FUNCTIONS_DECL__ char4 tex3D(texture<char4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z) {
+ TEXTURE_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from textureObject (TODO confirm).
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);  // (x, y, z) packed into a float4 whose fourth lane is 0.0f.
+ TEXTURE_RETURN_CHAR_XYZW;  // Defined outside this view; presumably converts texel to char4 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3D, explicit hipTextureObject_t form, unsigned char texels.
+__TEXTURE_FUNCTIONS_DECL__ unsigned char tex3D(texture<unsigned char, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z) {
+ TEXTURE_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from textureObject (TODO confirm).
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);  // (x, y, z) packed into a float4 whose fourth lane is 0.0f.
+ TEXTURE_RETURN_UCHAR;  // Defined outside this view; presumably converts texel to unsigned char and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3D, explicit hipTextureObject_t form, uchar1 texels.
+__TEXTURE_FUNCTIONS_DECL__ uchar1 tex3D(texture<uchar1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z) {
+ TEXTURE_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from textureObject (TODO confirm).
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);  // (x, y, z) packed into a float4 whose fourth lane is 0.0f.
+ TEXTURE_RETURN_UCHAR_X;  // Defined outside this view; presumably converts texel to uchar1 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3D, explicit hipTextureObject_t form, uchar2 texels.
+__TEXTURE_FUNCTIONS_DECL__ uchar2 tex3D(texture<uchar2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z) {
+ TEXTURE_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from textureObject (TODO confirm).
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);  // (x, y, z) packed into a float4 whose fourth lane is 0.0f.
+ TEXTURE_RETURN_UCHAR_XY;  // Defined outside this view; presumably converts texel to uchar2 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3D, explicit hipTextureObject_t form, uchar4 texels.
+__TEXTURE_FUNCTIONS_DECL__ uchar4 tex3D(texture<uchar4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z) {
+ TEXTURE_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from textureObject (TODO confirm).
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);  // (x, y, z) packed into a float4 whose fourth lane is 0.0f.
+ TEXTURE_RETURN_UCHAR_XYZW;  // Defined outside this view; presumably converts texel to uchar4 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3D, explicit hipTextureObject_t form, short texels.
+__TEXTURE_FUNCTIONS_DECL__ short tex3D(texture<short, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z) {
+ TEXTURE_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from textureObject (TODO confirm).
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);  // (x, y, z) packed into a float4 whose fourth lane is 0.0f.
+ TEXTURE_RETURN_SHORT;  // Defined outside this view; presumably converts texel to short and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3D, explicit hipTextureObject_t form, short1 texels.
+__TEXTURE_FUNCTIONS_DECL__ short1 tex3D(texture<short1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z) {
+ TEXTURE_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from textureObject (TODO confirm).
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);  // (x, y, z) packed into a float4 whose fourth lane is 0.0f.
+ TEXTURE_RETURN_SHORT_X;  // Defined outside this view; presumably converts texel to short1 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3D, explicit hipTextureObject_t form, short2 texels.
+__TEXTURE_FUNCTIONS_DECL__ short2 tex3D(texture<short2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z) {
+ TEXTURE_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from textureObject (TODO confirm).
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);  // (x, y, z) packed into a float4 whose fourth lane is 0.0f.
+ TEXTURE_RETURN_SHORT_XY;  // Defined outside this view; presumably converts texel to short2 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3D, explicit hipTextureObject_t form, short4 texels.
+__TEXTURE_FUNCTIONS_DECL__ short4 tex3D(texture<short4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z) {
+ TEXTURE_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from textureObject (TODO confirm).
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);  // (x, y, z) packed into a float4 whose fourth lane is 0.0f.
+ TEXTURE_RETURN_SHORT_XYZW;  // Defined outside this view; presumably converts texel to short4 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3D, explicit hipTextureObject_t form, unsigned short texels.
+__TEXTURE_FUNCTIONS_DECL__ unsigned short tex3D(texture<unsigned short, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z) {
+ TEXTURE_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from textureObject (TODO confirm).
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);  // (x, y, z) packed into a float4 whose fourth lane is 0.0f.
+ TEXTURE_RETURN_USHORT;  // Defined outside this view; presumably converts texel to unsigned short and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3D, explicit hipTextureObject_t form, ushort1 texels.
+__TEXTURE_FUNCTIONS_DECL__ ushort1 tex3D(texture<ushort1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z) {
+ TEXTURE_PARAMETERS_INIT;  // Defined outside this view; presumably derives i, s and texel from textureObject (TODO confirm).
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);  // (x, y, z) packed into a float4 whose fourth lane is 0.0f.
+ TEXTURE_RETURN_USHORT_X;  // Defined outside this view; presumably converts texel to ushort1 and returns it (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3D: ushort2 texels; overload taking a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ ushort2 tex3D(texture<ushort2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z) {
+ TEXTURE_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);  // Coordinate is (x, y, z) padded with 0.0f.
+ TEXTURE_RETURN_USHORT_XY;  // Macro defined outside this view; presumably returns texel as ushort2 (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3D: ushort4 texels; overload taking a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ ushort4 tex3D(texture<ushort4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z) {
+ TEXTURE_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);  // Coordinate is (x, y, z) padded with 0.0f.
+ TEXTURE_RETURN_USHORT_XYZW;  // Macro defined outside this view; presumably returns texel as ushort4 (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3D: int texels; overload taking a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ int tex3D(texture<int, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y, float z) {
+ TEXTURE_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);  // Coordinate is (x, y, z) padded with 0.0f.
+ TEXTURE_RETURN_INT;  // Macro defined outside this view; presumably returns texel as int (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3D: int1 texels; overload taking a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ int1 tex3D(texture<int1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y, float z) {
+ TEXTURE_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);  // Coordinate is (x, y, z) padded with 0.0f.
+ TEXTURE_RETURN_INT_X;  // Macro defined outside this view; presumably returns texel as int1 (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3D: int2 texels; overload taking a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ int2 tex3D(texture<int2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y, float z) {
+ TEXTURE_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);  // Coordinate is (x, y, z) padded with 0.0f.
+ TEXTURE_RETURN_INT_XY;  // Macro defined outside this view; presumably returns texel as int2 (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3D: int4 texels; overload taking a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ int4 tex3D(texture<int4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y, float z) {
+ TEXTURE_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);  // Coordinate is (x, y, z) padded with 0.0f.
+ TEXTURE_RETURN_INT_XYZW;  // Macro defined outside this view; presumably returns texel as int4 (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3D: unsigned int texels; overload taking a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ unsigned int tex3D(texture<unsigned int, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z) {
+ TEXTURE_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);  // Coordinate is (x, y, z) padded with 0.0f.
+ TEXTURE_RETURN_UINT;  // Macro defined outside this view; presumably returns texel as unsigned int (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3D: uint1 texels; overload taking a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ uint1 tex3D(texture<uint1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z) {
+ TEXTURE_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);  // Coordinate is (x, y, z) padded with 0.0f.
+ TEXTURE_RETURN_UINT_X;  // Macro defined outside this view; presumably returns texel as uint1 (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3D: uint2 texels; overload taking a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ uint2 tex3D(texture<uint2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z) {
+ TEXTURE_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);  // Coordinate is (x, y, z) padded with 0.0f.
+ TEXTURE_RETURN_UINT_XY;  // Macro defined outside this view; presumably returns texel as uint2 (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3D: uint4 texels; overload taking a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ uint4 tex3D(texture<uint4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z) {
+ TEXTURE_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);  // Coordinate is (x, y, z) padded with 0.0f.
+ TEXTURE_RETURN_UINT_XYZW;  // Macro defined outside this view; presumably returns texel as uint4 (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3D: float texels; overload taking a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ float tex3D(texture<float, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z) {
+ TEXTURE_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);  // Coordinate is (x, y, z) padded with 0.0f.
+ TEXTURE_RETURN_FLOAT;  // Macro defined outside this view; presumably returns texel as float (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3D: float1 texels; overload taking a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ float1 tex3D(texture<float1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z) {
+ TEXTURE_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);  // Coordinate is (x, y, z) padded with 0.0f.
+ TEXTURE_RETURN_FLOAT_X;  // Macro defined outside this view; presumably returns texel as float1 (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3D: float2 texels; overload taking a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ float2 tex3D(texture<float2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z) {
+ TEXTURE_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);  // Coordinate is (x, y, z) padded with 0.0f.
+ TEXTURE_RETURN_FLOAT_XY;  // Macro defined outside this view; presumably returns texel as float2 (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3D: float4 texels; overload taking a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ float4 tex3D(texture<float4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z) {
+ TEXTURE_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);  // Coordinate is (x, y, z) padded with 0.0f.
+ TEXTURE_RETURN_FLOAT_XYZW;  // Macro defined outside this view; presumably returns texel as float4 (TODO confirm).
+}
+
+////////////////////////////////////////////////////////////
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DLod: char texels; overload without a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ char tex3DLod(texture<char, texType, mode> texRef, float x, float y,
+ float z, float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ level);  // level is forwarded unchanged.
+ TEXTURE_RETURN_CHAR;  // Macro defined outside this view; presumably returns texel as char (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DLod: char1 texels; overload without a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ char1 tex3DLod(texture<char1, texType, mode> texRef, float x, float y,
+ float z, float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ level);  // level is forwarded unchanged.
+ TEXTURE_RETURN_CHAR_X;  // Macro defined outside this view; presumably returns texel as char1 (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DLod: char2 texels; overload without a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ char2 tex3DLod(texture<char2, texType, mode> texRef, float x, float y,
+ float z, float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ level);  // level is forwarded unchanged.
+ TEXTURE_RETURN_CHAR_XY;  // Macro defined outside this view; presumably returns texel as char2 (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DLod: char4 texels; overload without a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ char4 tex3DLod(texture<char4, texType, mode> texRef, float x, float y,
+ float z, float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ level);  // level is forwarded unchanged.
+ TEXTURE_RETURN_CHAR_XYZW;  // Macro defined outside this view; presumably returns texel as char4 (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DLod: unsigned char texels; overload without a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ unsigned char tex3DLod(texture<unsigned char, texType, mode> texRef,
+ float x, float y, float z, float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ level);  // level is forwarded unchanged.
+ TEXTURE_RETURN_UCHAR;  // Macro defined outside this view; presumably returns texel as unsigned char (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DLod: uchar1 texels; overload without a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ uchar1 tex3DLod(texture<uchar1, texType, mode> texRef, float x, float y,
+ float z, float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ level);  // level is forwarded unchanged.
+ TEXTURE_RETURN_UCHAR_X;  // Macro defined outside this view; presumably returns texel as uchar1 (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DLod: uchar2 texels; overload without a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ uchar2 tex3DLod(texture<uchar2, texType, mode> texRef, float x, float y,
+ float z, float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ level);  // level is forwarded unchanged.
+ TEXTURE_RETURN_UCHAR_XY;  // Macro defined outside this view; presumably returns texel as uchar2 (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DLod: uchar4 texels; overload without a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ uchar4 tex3DLod(texture<uchar4, texType, mode> texRef, float x, float y,
+ float z, float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ level);  // level is forwarded unchanged.
+ TEXTURE_RETURN_UCHAR_XYZW;  // Macro defined outside this view; presumably returns texel as uchar4 (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DLod: int texels; overload without a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ int tex3DLod(texture<int, texType, mode> texRef, float x, float y,
+ float z, float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ level);  // level is forwarded unchanged.
+ TEXTURE_RETURN_INT;  // Macro defined outside this view; presumably returns texel as int (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DLod: int1 texels; overload without a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ int1 tex3DLod(texture<int1, texType, mode> texRef, float x, float y,
+ float z, float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ level);  // level is forwarded unchanged.
+ TEXTURE_RETURN_INT_X;  // Macro defined outside this view; presumably returns texel as int1 (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DLod: int2 texels; overload without a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ int2 tex3DLod(texture<int2, texType, mode> texRef, float x, float y,
+ float z, float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ level);  // level is forwarded unchanged.
+ TEXTURE_RETURN_INT_XY;  // Macro defined outside this view; presumably returns texel as int2 (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DLod: int4 texels; overload without a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ int4 tex3DLod(texture<int4, texType, mode> texRef, float x, float y,
+ float z, float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ level);  // level is forwarded unchanged.
+ TEXTURE_RETURN_INT_XYZW;  // Macro defined outside this view; presumably returns texel as int4 (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DLod: unsigned int texels; overload without a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ unsigned int tex3DLod(texture<unsigned int, texType, mode> texRef,
+ float x, float y, float z, float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ level);  // level is forwarded unchanged.
+ TEXTURE_RETURN_UINT;  // Macro defined outside this view; presumably returns texel as unsigned int (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DLod: uint1 texels; overload without a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ uint1 tex3DLod(texture<uint1, texType, mode> texRef, float x, float y,
+ float z, float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ level);  // level is forwarded unchanged.
+ TEXTURE_RETURN_UINT_X;  // Macro defined outside this view; presumably returns texel as uint1 (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DLod: uint2 texels; overload without a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ uint2 tex3DLod(texture<uint2, texType, mode> texRef, float x, float y,
+ float z, float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ level);  // level is forwarded unchanged.
+ TEXTURE_RETURN_UINT_XY;  // Macro defined outside this view; presumably returns texel as uint2 (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DLod: uint4 texels; overload without a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ uint4 tex3DLod(texture<uint4, texType, mode> texRef, float x, float y,
+ float z, float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ level);  // level is forwarded unchanged.
+ TEXTURE_RETURN_UINT_XYZW;  // Macro defined outside this view; presumably returns texel as uint4 (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DLod: float texels; overload without a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ float tex3DLod(texture<float, texType, mode> texRef, float x, float y,
+ float z, float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ level);  // level is forwarded unchanged.
+ TEXTURE_RETURN_FLOAT;  // Macro defined outside this view; presumably returns texel as float (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DLod: float1 texels; overload without a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ float1 tex3DLod(texture<float1, texType, mode> texRef, float x, float y,
+ float z, float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ level);  // level is forwarded unchanged.
+ TEXTURE_RETURN_FLOAT_X;  // Macro defined outside this view; presumably returns texel as float1 (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DLod: float2 texels; overload without a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ float2 tex3DLod(texture<float2, texType, mode> texRef, float x, float y,
+ float z, float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ level);  // level is forwarded unchanged.
+ TEXTURE_RETURN_FLOAT_XY;  // Macro defined outside this view; presumably returns texel as float2 (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DLod: float4 texels; overload without a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ float4 tex3DLod(texture<float4, texType, mode> texRef, float x, float y,
+ float z, float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ level);  // level is forwarded unchanged.
+ TEXTURE_RETURN_FLOAT_XYZW;  // Macro defined outside this view; presumably returns texel as float4 (TODO confirm).
+}
+
+////////////////////////////////////////////////////////////
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DLod: char texels; overload taking a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ char tex3DLod(texture<char, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z, float level) {
+ TEXTURE_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ level);  // level is forwarded unchanged.
+ TEXTURE_RETURN_CHAR;  // Macro defined outside this view; presumably returns texel as char (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DLod: char1 texels; overload taking a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ char1 tex3DLod(texture<char1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z, float level) {
+ TEXTURE_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ level);  // level is forwarded unchanged.
+ TEXTURE_RETURN_CHAR_X;  // Macro defined outside this view; presumably returns texel as char1 (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DLod: char2 texels; overload taking a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ char2 tex3DLod(texture<char2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z, float level) {
+ TEXTURE_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ level);  // level is forwarded unchanged.
+ TEXTURE_RETURN_CHAR_XY;  // Macro defined outside this view; presumably returns texel as char2 (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DLod: char4 texels; overload taking a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ char4 tex3DLod(texture<char4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z, float level) {
+ TEXTURE_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ level);  // level is forwarded unchanged.
+ TEXTURE_RETURN_CHAR_XYZW;  // Macro defined outside this view; presumably returns texel as char4 (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DLod: unsigned char texels; overload taking a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ unsigned char tex3DLod(texture<unsigned char, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ float y, float z, float level) {
+ TEXTURE_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ level);  // level is forwarded unchanged.
+ TEXTURE_RETURN_UCHAR;  // Macro defined outside this view; presumably returns texel as unsigned char (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DLod: uchar1 texels; overload taking a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ uchar1 tex3DLod(texture<uchar1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z, float level) {
+ TEXTURE_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ level);  // level is forwarded unchanged.
+ TEXTURE_RETURN_UCHAR_X;  // Macro defined outside this view; presumably returns texel as uchar1 (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DLod: uchar2 texels; overload taking a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ uchar2 tex3DLod(texture<uchar2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z, float level) {
+ TEXTURE_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ level);  // level is forwarded unchanged.
+ TEXTURE_RETURN_UCHAR_XY;  // Macro defined outside this view; presumably returns texel as uchar2 (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DLod: uchar4 texels; overload taking a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ uchar4 tex3DLod(texture<uchar4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z, float level) {
+ TEXTURE_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ level);  // level is forwarded unchanged.
+ TEXTURE_RETURN_UCHAR_XYZW;  // Macro defined outside this view; presumably returns texel as uchar4 (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DLod: int texels; overload taking a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ int tex3DLod(texture<int, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y, float z,
+ float level) {
+ TEXTURE_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ level);  // level is forwarded unchanged.
+ TEXTURE_RETURN_INT;  // Macro defined outside this view; presumably returns texel as int (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DLod: int1 texels; overload taking a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ int1 tex3DLod(texture<int1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z, float level) {
+ TEXTURE_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ level);  // level is forwarded unchanged.
+ TEXTURE_RETURN_INT_X;  // Macro defined outside this view; presumably returns texel as int1 (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DLod: int2 texels; overload taking a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ int2 tex3DLod(texture<int2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z, float level) {
+ TEXTURE_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ level);  // level is forwarded unchanged.
+ TEXTURE_RETURN_INT_XY;  // Macro defined outside this view; presumably returns texel as int2 (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DLod: int4 texels; overload taking a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ int4 tex3DLod(texture<int4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z, float level) {
+ TEXTURE_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ level);  // level is forwarded unchanged.
+ TEXTURE_RETURN_INT_XYZW;  // Macro defined outside this view; presumably returns texel as int4 (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DLod: unsigned int texels; overload taking a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ unsigned int tex3DLod(texture<unsigned int, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z, float level) {
+ TEXTURE_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ level);  // level is forwarded unchanged.
+ TEXTURE_RETURN_UINT;  // Macro defined outside this view; presumably returns texel as unsigned int (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DLod: uint1 texels; overload taking a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ uint1 tex3DLod(texture<uint1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z, float level) {
+ TEXTURE_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ level);  // level is forwarded unchanged.
+ TEXTURE_RETURN_UINT_X;  // Macro defined outside this view; presumably returns texel as uint1 (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DLod: uint2 texels; overload taking a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ uint2 tex3DLod(texture<uint2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z, float level) {
+ TEXTURE_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ level);  // level is forwarded unchanged.
+ TEXTURE_RETURN_UINT_XY;  // Macro defined outside this view; presumably returns texel as uint2 (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DLod: uint4 texels; overload taking a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ uint4 tex3DLod(texture<uint4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z, float level) {
+ TEXTURE_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ level);  // level is forwarded unchanged.
+ TEXTURE_RETURN_UINT_XYZW;  // Macro defined outside this view; presumably returns texel as uint4 (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DLod: float texels; overload taking a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ float tex3DLod(texture<float, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z, float level) {
+ TEXTURE_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ level);  // level is forwarded unchanged.
+ TEXTURE_RETURN_FLOAT;  // Macro defined outside this view; presumably returns texel as float (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DLod: float1 texels; overload taking a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ float1 tex3DLod(texture<float1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z, float level) {
+ TEXTURE_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ level);  // level is forwarded unchanged.
+ TEXTURE_RETURN_FLOAT_X;  // Macro defined outside this view; presumably returns texel as float1 (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DLod: float2 texels; overload taking a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ float2 tex3DLod(texture<float2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z, float level) {
+ TEXTURE_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ level);  // level is forwarded unchanged.
+ TEXTURE_RETURN_FLOAT_XY;  // Macro defined outside this view; presumably returns texel as float2 (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DLod: float4 texels; overload taking a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ float4 tex3DLod(texture<float4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z, float level) {
+ TEXTURE_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ level);  // level is forwarded unchanged.
+ TEXTURE_RETURN_FLOAT_XYZW;  // Macro defined outside this view; presumably returns texel as float4 (TODO confirm).
+}
+
+////////////////////////////////////////////////////////////
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DGrad: char texels; overload without a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ char tex3DGrad(texture<char, texType, mode> texRef, float x, float y,
+ float z, float4 dx, float4 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f =
+ __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ float4(dx.x, dx.y, dx.z, dx.w).data,  // dx copied component by component.
+ float4(dy.x, dy.y, dy.z, dy.w).data);  // dy copied component by component.
+ TEXTURE_RETURN_CHAR;  // Macro defined outside this view; presumably returns texel as char (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DGrad: char1 texels; overload without a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ char1 tex3DGrad(texture<char1, texType, mode> texRef, float x, float y,
+ float z, float4 dx, float4 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f =
+ __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ float4(dx.x, dx.y, dx.z, dx.w).data,  // dx copied component by component.
+ float4(dy.x, dy.y, dy.z, dy.w).data);  // dy copied component by component.
+ TEXTURE_RETURN_CHAR_X;  // Macro defined outside this view; presumably returns texel as char1 (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DGrad: char2 texels; overload without a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ char2 tex3DGrad(texture<char2, texType, mode> texRef, float x, float y,
+ float z, float4 dx, float4 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f =
+ __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ float4(dx.x, dx.y, dx.z, dx.w).data,  // dx copied component by component.
+ float4(dy.x, dy.y, dy.z, dy.w).data);  // dy copied component by component.
+ TEXTURE_RETURN_CHAR_XY;  // Macro defined outside this view; presumably returns texel as char2 (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DGrad: char4 texels; overload without a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ char4 tex3DGrad(texture<char4, texType, mode> texRef, float x, float y,
+ float z, float4 dx, float4 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f =
+ __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ float4(dx.x, dx.y, dx.z, dx.w).data,  // dx copied component by component.
+ float4(dy.x, dy.y, dy.z, dy.w).data);  // dy copied component by component.
+ TEXTURE_RETURN_CHAR_XYZW;  // Macro defined outside this view; presumably returns texel as char4 (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DGrad: unsigned char texels; overload without a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ unsigned char tex3DGrad(texture<unsigned char, texType, mode> texRef,
+ float x, float y, float z, float4 dx,
+ float4 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f =
+ __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ float4(dx.x, dx.y, dx.z, dx.w).data,  // dx copied component by component.
+ float4(dy.x, dy.y, dy.z, dy.w).data);  // dy copied component by component.
+ TEXTURE_RETURN_UCHAR;  // Macro defined outside this view; presumably returns texel as unsigned char (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DGrad: uchar1 texels; overload without a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ uchar1 tex3DGrad(texture<uchar1, texType, mode> texRef, float x, float y,
+ float z, float4 dx, float4 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f =
+ __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ float4(dx.x, dx.y, dx.z, dx.w).data,  // dx copied component by component.
+ float4(dy.x, dy.y, dy.z, dy.w).data);  // dy copied component by component.
+ TEXTURE_RETURN_UCHAR_X;  // Macro defined outside this view; presumably returns texel as uchar1 (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DGrad: uchar2 texels; overload without a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ uchar2 tex3DGrad(texture<uchar2, texType, mode> texRef, float x, float y,
+ float z, float4 dx, float4 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f =
+ __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ float4(dx.x, dx.y, dx.z, dx.w).data,  // dx copied component by component.
+ float4(dy.x, dy.y, dy.z, dy.w).data);  // dy copied component by component.
+ TEXTURE_RETURN_UCHAR_XY;  // Macro defined outside this view; presumably returns texel as uchar2 (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DGrad: uchar4 texels; overload without a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ uchar4 tex3DGrad(texture<uchar4, texType, mode> texRef, float x, float y,
+ float z, float4 dx, float4 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f =
+ __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ float4(dx.x, dx.y, dx.z, dx.w).data,  // dx copied component by component.
+ float4(dy.x, dy.y, dy.z, dy.w).data);  // dy copied component by component.
+ TEXTURE_RETURN_UCHAR_XYZW;  // Macro defined outside this view; presumably returns texel as uchar4 (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DGrad: short texels; overload without a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ short tex3DGrad(texture<short, texType, mode> texRef, float x, float y,
+ float z, float4 dx, float4 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f =
+ __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ float4(dx.x, dx.y, dx.z, dx.w).data,  // dx copied component by component.
+ float4(dy.x, dy.y, dy.z, dy.w).data);  // dy copied component by component.
+ TEXTURE_RETURN_SHORT;  // Macro defined outside this view; presumably returns texel as short (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DGrad: short1 texels; overload without a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ short1 tex3DGrad(texture<short1, texType, mode> texRef, float x, float y,
+ float z, float4 dx, float4 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f =
+ __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ float4(dx.x, dx.y, dx.z, dx.w).data,  // dx copied component by component.
+ float4(dy.x, dy.y, dy.z, dy.w).data);  // dy copied component by component.
+ TEXTURE_RETURN_SHORT_X;  // Macro defined outside this view; presumably returns texel as short1 (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DGrad: short2 texels; overload without a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ short2 tex3DGrad(texture<short2, texType, mode> texRef, float x, float y,
+ float z, float4 dx, float4 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f =
+ __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ float4(dx.x, dx.y, dx.z, dx.w).data,  // dx copied component by component.
+ float4(dy.x, dy.y, dy.z, dy.w).data);  // dy copied component by component.
+ TEXTURE_RETURN_SHORT_XY;  // Macro defined outside this view; presumably returns texel as short2 (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DGrad: short4 texels; overload without a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ short4 tex3DGrad(texture<short4, texType, mode> texRef, float x, float y,
+ float z, float4 dx, float4 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f =
+ __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ float4(dx.x, dx.y, dx.z, dx.w).data,  // dx copied component by component.
+ float4(dy.x, dy.y, dy.z, dy.w).data);  // dy copied component by component.
+ TEXTURE_RETURN_SHORT_XYZW;  // Macro defined outside this view; presumably returns texel as short4 (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DGrad: unsigned short texels; overload without a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ unsigned short tex3DGrad(texture<unsigned short, texType, mode> texRef,
+ float x, float y, float z, float4 dx,
+ float4 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f =
+ __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ float4(dx.x, dx.y, dx.z, dx.w).data,  // dx copied component by component.
+ float4(dy.x, dy.y, dy.z, dy.w).data);  // dy copied component by component.
+ TEXTURE_RETURN_USHORT;  // Macro defined outside this view; presumably returns texel as unsigned short (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DGrad: ushort1 texels; overload without a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ ushort1 tex3DGrad(texture<ushort1, texType, mode> texRef, float x,
+ float y, float z, float4 dx, float4 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f =
+ __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ float4(dx.x, dx.y, dx.z, dx.w).data,  // dx copied component by component.
+ float4(dy.x, dy.y, dy.z, dy.w).data);  // dy copied component by component.
+ TEXTURE_RETURN_USHORT_X;  // Macro defined outside this view; presumably returns texel as ushort1 (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DGrad: ushort2 texels; overload without a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ ushort2 tex3DGrad(texture<ushort2, texType, mode> texRef, float x,
+ float y, float z, float4 dx, float4 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f =
+ __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ float4(dx.x, dx.y, dx.z, dx.w).data,  // dx copied component by component.
+ float4(dy.x, dy.y, dy.z, dy.w).data);  // dy copied component by component.
+ TEXTURE_RETURN_USHORT_XY;  // Macro defined outside this view; presumably returns texel as ushort2 (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DGrad: ushort4 texels; overload without a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ ushort4 tex3DGrad(texture<ushort4, texType, mode> texRef, float x,
+ float y, float z, float4 dx, float4 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f =
+ __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ float4(dx.x, dx.y, dx.z, dx.w).data,  // dx copied component by component.
+ float4(dy.x, dy.y, dy.z, dy.w).data);  // dy copied component by component.
+ TEXTURE_RETURN_USHORT_XYZW;  // Macro defined outside this view; presumably returns texel as ushort4 (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DGrad: int texels; overload without a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ int tex3DGrad(texture<int, texType, mode> texRef, float x, float y,
+ float z, float4 dx, float4 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f =
+ __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ float4(dx.x, dx.y, dx.z, dx.w).data,  // dx copied component by component.
+ float4(dy.x, dy.y, dy.z, dy.w).data);  // dy copied component by component.
+ TEXTURE_RETURN_INT;  // Macro defined outside this view; presumably returns texel as int (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DGrad: int1 texels; overload without a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ int1 tex3DGrad(texture<int1, texType, mode> texRef, float x, float y,
+ float z, float4 dx, float4 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f =
+ __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ float4(dx.x, dx.y, dx.z, dx.w).data,  // dx copied component by component.
+ float4(dy.x, dy.y, dy.z, dy.w).data);  // dy copied component by component.
+ TEXTURE_RETURN_INT_X;  // Macro defined outside this view; presumably returns texel as int1 (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DGrad: int2 texels; overload without a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ int2 tex3DGrad(texture<int2, texType, mode> texRef, float x, float y,
+ float z, float4 dx, float4 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f =
+ __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ float4(dx.x, dx.y, dx.z, dx.w).data,  // dx copied component by component.
+ float4(dy.x, dy.y, dy.z, dy.w).data);  // dy copied component by component.
+ TEXTURE_RETURN_INT_XY;  // Macro defined outside this view; presumably returns texel as int2 (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DGrad: int4 texels; overload without a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ int4 tex3DGrad(texture<int4, texType, mode> texRef, float x, float y,
+ float z, float4 dx, float4 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f =
+ __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ float4(dx.x, dx.y, dx.z, dx.w).data,  // dx copied component by component.
+ float4(dy.x, dy.y, dy.z, dy.w).data);  // dy copied component by component.
+ TEXTURE_RETURN_INT_XYZW;  // Macro defined outside this view; presumably returns texel as int4 (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DGrad: unsigned int texels; overload without a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ unsigned int tex3DGrad(texture<unsigned int, texType, mode> texRef,
+ float x, float y, float z, float4 dx, float4 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f =
+ __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ float4(dx.x, dx.y, dx.z, dx.w).data,  // dx copied component by component.
+ float4(dy.x, dy.y, dy.z, dy.w).data);  // dy copied component by component.
+ TEXTURE_RETURN_UINT;  // Macro defined outside this view; presumably returns texel as unsigned int (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DGrad: uint1 texels; overload without a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ uint1 tex3DGrad(texture<uint1, texType, mode> texRef, float x, float y,
+ float z, float4 dx, float4 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f =
+ __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ float4(dx.x, dx.y, dx.z, dx.w).data,  // dx copied component by component.
+ float4(dy.x, dy.y, dy.z, dy.w).data);  // dy copied component by component.
+ TEXTURE_RETURN_UINT_X;  // Macro defined outside this view; presumably returns texel as uint1 (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DGrad: uint2 texels; overload without a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ uint2 tex3DGrad(texture<uint2, texType, mode> texRef, float x, float y,
+ float z, float4 dx, float4 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f =
+ __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ float4(dx.x, dx.y, dx.z, dx.w).data,  // dx copied component by component.
+ float4(dy.x, dy.y, dy.z, dy.w).data);  // dy copied component by component.
+ TEXTURE_RETURN_UINT_XY;  // Macro defined outside this view; presumably returns texel as uint2 (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DGrad: uint4 texels; overload without a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ uint4 tex3DGrad(texture<uint4, texType, mode> texRef, float x, float y,
+ float z, float4 dx, float4 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f =
+ __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ float4(dx.x, dx.y, dx.z, dx.w).data,  // dx copied component by component.
+ float4(dy.x, dy.y, dy.z, dy.w).data);  // dy copied component by component.
+ TEXTURE_RETURN_UINT_XYZW;  // Macro defined outside this view; presumably returns texel as uint4 (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DGrad: float texels; overload without a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ float tex3DGrad(texture<float, texType, mode> texRef, float x, float y,
+ float z, float4 dx, float4 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f =
+ __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ float4(dx.x, dx.y, dx.z, dx.w).data,  // dx copied component by component.
+ float4(dy.x, dy.y, dy.z, dy.w).data);  // dy copied component by component.
+ TEXTURE_RETURN_FLOAT;  // Macro defined outside this view; presumably returns texel as float (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DGrad: float1 texels; overload without a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ float1 tex3DGrad(texture<float1, texType, mode> texRef, float x, float y,
+ float z, float4 dx, float4 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f =
+ __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ float4(dx.x, dx.y, dx.z, dx.w).data,  // dx copied component by component.
+ float4(dy.x, dy.y, dy.z, dy.w).data);  // dy copied component by component.
+ TEXTURE_RETURN_FLOAT_X;  // Macro defined outside this view; presumably returns texel as float1 (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DGrad: float2 texels; overload without a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ float2 tex3DGrad(texture<float2, texType, mode> texRef, float x, float y,
+ float z, float4 dx, float4 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f =
+ __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ float4(dx.x, dx.y, dx.z, dx.w).data,  // dx copied component by component.
+ float4(dy.x, dy.y, dy.z, dy.w).data);  // dy copied component by component.
+ TEXTURE_RETURN_FLOAT_XY;  // Macro defined outside this view; presumably returns texel as float2 (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DGrad: float4 texels; overload without a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ float4 tex3DGrad(texture<float4, texType, mode> texRef, float x, float y,
+ float z, float4 dx, float4 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f =
+ __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ float4(dx.x, dx.y, dx.z, dx.w).data,  // dx copied component by component.
+ float4(dy.x, dy.y, dy.z, dy.w).data);  // dy copied component by component.
+ TEXTURE_RETURN_FLOAT_XYZW;  // Macro defined outside this view; presumably returns texel as float4 (TODO confirm).
+}
+
+////////////////////////////////////////////////////////////
+template <int texType, enum hipTextureReadMode mode>  // tex3DGrad: char texels; overload taking a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ char tex3DGrad(texture<char, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z, float4 dx, float4 dy) {
+ TEXTURE_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f =
+ __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ float4(dx.x, dx.y, dx.z, dx.w).data,  // dx copied component by component.
+ float4(dy.x, dy.y, dy.z, dy.w).data);  // dy copied component by component.
+ TEXTURE_RETURN_CHAR;  // Macro defined outside this view; presumably returns texel as char (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DGrad: char1 texels; overload taking a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ char1 tex3DGrad(texture<char1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z, float4 dx, float4 dy) {
+ TEXTURE_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f =
+ __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ float4(dx.x, dx.y, dx.z, dx.w).data,  // dx copied component by component.
+ float4(dy.x, dy.y, dy.z, dy.w).data);  // dy copied component by component.
+ TEXTURE_RETURN_CHAR_X;  // Macro defined outside this view; presumably returns texel as char1 (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DGrad: char2 texels; overload taking a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ char2 tex3DGrad(texture<char2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z, float4 dx, float4 dy) {
+ TEXTURE_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f =
+ __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ float4(dx.x, dx.y, dx.z, dx.w).data,  // dx copied component by component.
+ float4(dy.x, dy.y, dy.z, dy.w).data);  // dy copied component by component.
+ TEXTURE_RETURN_CHAR_XY;  // Macro defined outside this view; presumably returns texel as char2 (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DGrad: char4 texels; overload taking a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ char4 tex3DGrad(texture<char4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z, float4 dx, float4 dy) {
+ TEXTURE_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f =
+ __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ float4(dx.x, dx.y, dx.z, dx.w).data,  // dx copied component by component.
+ float4(dy.x, dy.y, dy.z, dy.w).data);  // dy copied component by component.
+ TEXTURE_RETURN_CHAR_XYZW;  // Macro defined outside this view; presumably returns texel as char4 (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DGrad: unsigned char texels; overload taking a hipTextureObject_t.
+__TEXTURE_FUNCTIONS_DECL__ unsigned char tex3DGrad(texture<unsigned char, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ float y, float z, float4 dx, float4 dy) {
+ TEXTURE_PARAMETERS_INIT;  // Macro defined outside this view; presumably sets up i, s and texel (TODO confirm).
+ texel.f =
+ __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data,  // Coordinate is (x, y, z) padded with 0.0f.
+ float4(dx.x, dx.y, dx.z, dx.w).data,  // dx copied component by component.
+ float4(dy.x, dy.y, dy.z, dy.w).data);  // dy copied component by component.
+ TEXTURE_RETURN_UCHAR;  // Macro defined outside this view; presumably returns texel as unsigned char (TODO confirm).
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DGrad overload for uchar1 textures taking an explicit textureObject.
+__TEXTURE_FUNCTIONS_DECL__ uchar1 tex3DGrad(texture<uchar1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z, float4 dx, float4 dy) {
+ TEXTURE_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as uchar1 (TODO confirm).
+ texel.f =
+ __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data,  // (x, y, z) coordinate; 4th lane is 0.
+ float4(dx.x, dx.y, dx.z, dx.w).data,
+ float4(dy.x, dy.y, dy.z, dy.w).data);
+ TEXTURE_RETURN_UCHAR_X;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DGrad overload for uchar2 textures taking an explicit textureObject.
+__TEXTURE_FUNCTIONS_DECL__ uchar2 tex3DGrad(texture<uchar2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z, float4 dx, float4 dy) {
+ TEXTURE_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as uchar2 (TODO confirm).
+ texel.f =
+ __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data,  // (x, y, z) coordinate; 4th lane is 0.
+ float4(dx.x, dx.y, dx.z, dx.w).data,
+ float4(dy.x, dy.y, dy.z, dy.w).data);
+ TEXTURE_RETURN_UCHAR_XY;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DGrad overload for uchar4 textures taking an explicit textureObject.
+__TEXTURE_FUNCTIONS_DECL__ uchar4 tex3DGrad(texture<uchar4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z, float4 dx, float4 dy) {
+ TEXTURE_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as uchar4 (TODO confirm).
+ texel.f =
+ __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data,  // (x, y, z) coordinate; 4th lane is 0.
+ float4(dx.x, dx.y, dx.z, dx.w).data,
+ float4(dy.x, dy.y, dy.z, dy.w).data);
+ TEXTURE_RETURN_UCHAR_XYZW;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DGrad overload for short textures taking an explicit textureObject.
+__TEXTURE_FUNCTIONS_DECL__ short tex3DGrad(texture<short, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z, float4 dx, float4 dy) {
+ TEXTURE_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as short (TODO confirm).
+ texel.f =
+ __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data,  // (x, y, z) coordinate; 4th lane is 0.
+ float4(dx.x, dx.y, dx.z, dx.w).data,
+ float4(dy.x, dy.y, dy.z, dy.w).data);
+ TEXTURE_RETURN_SHORT;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DGrad overload for short1 textures taking an explicit textureObject.
+__TEXTURE_FUNCTIONS_DECL__ short1 tex3DGrad(texture<short1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z, float4 dx, float4 dy) {
+ TEXTURE_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as short1 (TODO confirm).
+ texel.f =
+ __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data,  // (x, y, z) coordinate; 4th lane is 0.
+ float4(dx.x, dx.y, dx.z, dx.w).data,
+ float4(dy.x, dy.y, dy.z, dy.w).data);
+ TEXTURE_RETURN_SHORT_X;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DGrad overload for short2 textures taking an explicit textureObject.
+__TEXTURE_FUNCTIONS_DECL__ short2 tex3DGrad(texture<short2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z, float4 dx, float4 dy) {
+ TEXTURE_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as short2 (TODO confirm).
+ texel.f =
+ __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data,  // (x, y, z) coordinate; 4th lane is 0.
+ float4(dx.x, dx.y, dx.z, dx.w).data,
+ float4(dy.x, dy.y, dy.z, dy.w).data);
+ TEXTURE_RETURN_SHORT_XY;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DGrad overload for short4 textures taking an explicit textureObject.
+__TEXTURE_FUNCTIONS_DECL__ short4 tex3DGrad(texture<short4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z, float4 dx, float4 dy) {
+ TEXTURE_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as short4 (TODO confirm).
+ texel.f =
+ __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data,  // (x, y, z) coordinate; 4th lane is 0.
+ float4(dx.x, dx.y, dx.z, dx.w).data,
+ float4(dy.x, dy.y, dy.z, dy.w).data);
+ TEXTURE_RETURN_SHORT_XYZW;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DGrad overload for unsigned short textures taking an explicit textureObject.
+__TEXTURE_FUNCTIONS_DECL__ unsigned short tex3DGrad(texture<unsigned short, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ float y, float z, float4 dx, float4 dy) {
+ TEXTURE_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as unsigned short (TODO confirm).
+ texel.f =
+ __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data,  // (x, y, z) coordinate; 4th lane is 0.
+ float4(dx.x, dx.y, dx.z, dx.w).data,
+ float4(dy.x, dy.y, dy.z, dy.w).data);
+ TEXTURE_RETURN_USHORT;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DGrad overload for ushort1 textures taking an explicit textureObject.
+__TEXTURE_FUNCTIONS_DECL__ ushort1 tex3DGrad(texture<ushort1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z, float4 dx, float4 dy) {
+ TEXTURE_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as ushort1 (TODO confirm).
+ texel.f =
+ __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data,  // (x, y, z) coordinate; 4th lane is 0.
+ float4(dx.x, dx.y, dx.z, dx.w).data,
+ float4(dy.x, dy.y, dy.z, dy.w).data);
+ TEXTURE_RETURN_USHORT_X;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DGrad overload for ushort2 textures taking an explicit textureObject.
+__TEXTURE_FUNCTIONS_DECL__ ushort2 tex3DGrad(texture<ushort2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z, float4 dx, float4 dy) {
+ TEXTURE_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as ushort2 (TODO confirm).
+ texel.f =
+ __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data,  // (x, y, z) coordinate; 4th lane is 0.
+ float4(dx.x, dx.y, dx.z, dx.w).data,
+ float4(dy.x, dy.y, dy.z, dy.w).data);
+ TEXTURE_RETURN_USHORT_XY;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DGrad overload for ushort4 textures taking an explicit textureObject.
+__TEXTURE_FUNCTIONS_DECL__ ushort4 tex3DGrad(texture<ushort4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z, float4 dx, float4 dy) {
+ TEXTURE_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as ushort4 (TODO confirm).
+ texel.f =
+ __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data,  // (x, y, z) coordinate; 4th lane is 0.
+ float4(dx.x, dx.y, dx.z, dx.w).data,
+ float4(dy.x, dy.y, dy.z, dy.w).data);
+ TEXTURE_RETURN_USHORT_XYZW;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DGrad overload for int textures taking an explicit textureObject.
+__TEXTURE_FUNCTIONS_DECL__ int tex3DGrad(texture<int, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z, float4 dx, float4 dy) {
+ TEXTURE_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as int (TODO confirm).
+ texel.f =
+ __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data,  // (x, y, z) coordinate; 4th lane is 0.
+ float4(dx.x, dx.y, dx.z, dx.w).data,
+ float4(dy.x, dy.y, dy.z, dy.w).data);
+ TEXTURE_RETURN_INT;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DGrad overload for int1 textures taking an explicit textureObject.
+__TEXTURE_FUNCTIONS_DECL__ int1 tex3DGrad(texture<int1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z, float4 dx, float4 dy) {
+ TEXTURE_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as int1 (TODO confirm).
+ texel.f =
+ __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data,  // (x, y, z) coordinate; 4th lane is 0.
+ float4(dx.x, dx.y, dx.z, dx.w).data,
+ float4(dy.x, dy.y, dy.z, dy.w).data);
+ TEXTURE_RETURN_INT_X;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DGrad overload for int2 textures taking an explicit textureObject.
+__TEXTURE_FUNCTIONS_DECL__ int2 tex3DGrad(texture<int2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z, float4 dx, float4 dy) {
+ TEXTURE_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as int2 (TODO confirm).
+ texel.f =
+ __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data,  // (x, y, z) coordinate; 4th lane is 0.
+ float4(dx.x, dx.y, dx.z, dx.w).data,
+ float4(dy.x, dy.y, dy.z, dy.w).data);
+ TEXTURE_RETURN_INT_XY;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DGrad overload for int4 textures taking an explicit textureObject.
+__TEXTURE_FUNCTIONS_DECL__ int4 tex3DGrad(texture<int4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z, float4 dx, float4 dy) {
+ TEXTURE_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as int4 (TODO confirm).
+ texel.f =
+ __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data,  // (x, y, z) coordinate; 4th lane is 0.
+ float4(dx.x, dx.y, dx.z, dx.w).data,
+ float4(dy.x, dy.y, dy.z, dy.w).data);
+ TEXTURE_RETURN_INT_XYZW;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DGrad overload for unsigned int textures taking an explicit textureObject.
+__TEXTURE_FUNCTIONS_DECL__ unsigned int tex3DGrad(texture<unsigned int, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ float y, float z, float4 dx, float4 dy) {
+ TEXTURE_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as unsigned int (TODO confirm).
+ texel.f =
+ __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data,  // (x, y, z) coordinate; 4th lane is 0.
+ float4(dx.x, dx.y, dx.z, dx.w).data,
+ float4(dy.x, dy.y, dy.z, dy.w).data);
+ TEXTURE_RETURN_UINT;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DGrad overload for uint1 textures taking an explicit textureObject.
+__TEXTURE_FUNCTIONS_DECL__ uint1 tex3DGrad(texture<uint1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z, float4 dx, float4 dy) {
+ TEXTURE_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as uint1 (TODO confirm).
+ texel.f =
+ __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data,  // (x, y, z) coordinate; 4th lane is 0.
+ float4(dx.x, dx.y, dx.z, dx.w).data,
+ float4(dy.x, dy.y, dy.z, dy.w).data);
+ TEXTURE_RETURN_UINT_X;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DGrad overload for uint2 textures taking an explicit textureObject.
+__TEXTURE_FUNCTIONS_DECL__ uint2 tex3DGrad(texture<uint2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z, float4 dx, float4 dy) {
+ TEXTURE_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as uint2 (TODO confirm).
+ texel.f =
+ __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data,  // (x, y, z) coordinate; 4th lane is 0.
+ float4(dx.x, dx.y, dx.z, dx.w).data,
+ float4(dy.x, dy.y, dy.z, dy.w).data);
+ TEXTURE_RETURN_UINT_XY;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DGrad overload for uint4 textures taking an explicit textureObject.
+__TEXTURE_FUNCTIONS_DECL__ uint4 tex3DGrad(texture<uint4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z, float4 dx, float4 dy) {
+ TEXTURE_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as uint4 (TODO confirm).
+ texel.f =
+ __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data,  // (x, y, z) coordinate; 4th lane is 0.
+ float4(dx.x, dx.y, dx.z, dx.w).data,
+ float4(dy.x, dy.y, dy.z, dy.w).data);
+ TEXTURE_RETURN_UINT_XYZW;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DGrad overload for float textures taking an explicit textureObject.
+__TEXTURE_FUNCTIONS_DECL__ float tex3DGrad(texture<float, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z, float4 dx, float4 dy) {
+ TEXTURE_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as float (TODO confirm).
+ texel.f =
+ __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data,  // (x, y, z) coordinate; 4th lane is 0.
+ float4(dx.x, dx.y, dx.z, dx.w).data,
+ float4(dy.x, dy.y, dy.z, dy.w).data);
+ TEXTURE_RETURN_FLOAT;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DGrad overload for float1 textures taking an explicit textureObject.
+__TEXTURE_FUNCTIONS_DECL__ float1 tex3DGrad(texture<float1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z, float4 dx, float4 dy) {
+ TEXTURE_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as float1 (TODO confirm).
+ texel.f =
+ __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data,  // (x, y, z) coordinate; 4th lane is 0.
+ float4(dx.x, dx.y, dx.z, dx.w).data,
+ float4(dy.x, dy.y, dy.z, dy.w).data);
+ TEXTURE_RETURN_FLOAT_X;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DGrad overload for float2 textures taking an explicit textureObject.
+__TEXTURE_FUNCTIONS_DECL__ float2 tex3DGrad(texture<float2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z, float4 dx, float4 dy) {
+ TEXTURE_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as float2 (TODO confirm).
+ texel.f =
+ __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data,  // (x, y, z) coordinate; 4th lane is 0.
+ float4(dx.x, dx.y, dx.z, dx.w).data,
+ float4(dy.x, dy.y, dy.z, dy.w).data);
+ TEXTURE_RETURN_FLOAT_XY;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex3DGrad overload for float4 textures taking an explicit textureObject.
+__TEXTURE_FUNCTIONS_DECL__ float4 tex3DGrad(texture<float4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ float z, float4 dx, float4 dy) {
+ TEXTURE_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as float4 (TODO confirm).
+ texel.f =
+ __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data,  // (x, y, z) coordinate; 4th lane is 0.
+ float4(dx.x, dx.y, dx.z, dx.w).data,
+ float4(dy.x, dy.y, dy.z, dy.w).data);
+ TEXTURE_RETURN_FLOAT_XYZW;
+}
+
+////////////////////////////////////////////////////////////
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayered overload for char textures addressed through texRef only.
+__TEXTURE_FUNCTIONS_DECL__ char tex1DLayered(texture<char, texType, mode> texRef, float x,
+ int layer) {
+ TEXTURE_REF_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as char (TODO confirm).
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);  // float2 carries x and the layer index.
+ TEXTURE_RETURN_CHAR;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayered overload for char1 textures addressed through texRef only.
+__TEXTURE_FUNCTIONS_DECL__ char1 tex1DLayered(texture<char1, texType, mode> texRef, float x,
+ int layer) {
+ TEXTURE_REF_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as char1 (TODO confirm).
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);  // float2 carries x and the layer index.
+ TEXTURE_RETURN_CHAR_X;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayered overload for char2 textures addressed through texRef only.
+__TEXTURE_FUNCTIONS_DECL__ char2 tex1DLayered(texture<char2, texType, mode> texRef, float x,
+ int layer) {
+ TEXTURE_REF_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as char2 (TODO confirm).
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);  // float2 carries x and the layer index.
+ TEXTURE_RETURN_CHAR_XY;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayered overload for char4 textures addressed through texRef only.
+__TEXTURE_FUNCTIONS_DECL__ char4 tex1DLayered(texture<char4, texType, mode> texRef, float x,
+ int layer) {
+ TEXTURE_REF_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as char4 (TODO confirm).
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);  // float2 carries x and the layer index.
+ TEXTURE_RETURN_CHAR_XYZW;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayered overload for unsigned char textures addressed through texRef only.
+__TEXTURE_FUNCTIONS_DECL__ unsigned char tex1DLayered(texture<unsigned char, texType, mode> texRef,
+ float x, int layer) {
+ TEXTURE_REF_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as unsigned char (TODO confirm).
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);  // float2 carries x and the layer index.
+ TEXTURE_RETURN_UCHAR;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayered overload for uchar1 textures addressed through texRef only.
+__TEXTURE_FUNCTIONS_DECL__ uchar1 tex1DLayered(texture<uchar1, texType, mode> texRef, float x,
+ int layer) {
+ TEXTURE_REF_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as uchar1 (TODO confirm).
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);  // float2 carries x and the layer index.
+ TEXTURE_RETURN_UCHAR_X;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayered overload for uchar2 textures addressed through texRef only.
+__TEXTURE_FUNCTIONS_DECL__ uchar2 tex1DLayered(texture<uchar2, texType, mode> texRef, float x,
+ int layer) {
+ TEXTURE_REF_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as uchar2 (TODO confirm).
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);  // float2 carries x and the layer index.
+ TEXTURE_RETURN_UCHAR_XY;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayered overload for uchar4 textures addressed through texRef only.
+__TEXTURE_FUNCTIONS_DECL__ uchar4 tex1DLayered(texture<uchar4, texType, mode> texRef, float x,
+ int layer) {
+ TEXTURE_REF_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as uchar4 (TODO confirm).
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);  // float2 carries x and the layer index.
+ TEXTURE_RETURN_UCHAR_XYZW;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayered overload for short textures addressed through texRef only.
+__TEXTURE_FUNCTIONS_DECL__ short tex1DLayered(texture<short, texType, mode> texRef, float x,
+ int layer) {
+ TEXTURE_REF_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as short (TODO confirm).
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);  // float2 carries x and the layer index.
+ TEXTURE_RETURN_SHORT;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayered overload for short1 textures addressed through texRef only.
+__TEXTURE_FUNCTIONS_DECL__ short1 tex1DLayered(texture<short1, texType, mode> texRef, float x,
+ int layer) {
+ TEXTURE_REF_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as short1 (TODO confirm).
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);  // float2 carries x and the layer index.
+ TEXTURE_RETURN_SHORT_X;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayered overload for short2 textures addressed through texRef only.
+__TEXTURE_FUNCTIONS_DECL__ short2 tex1DLayered(texture<short2, texType, mode> texRef, float x,
+ int layer) {
+ TEXTURE_REF_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as short2 (TODO confirm).
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);  // float2 carries x and the layer index.
+ TEXTURE_RETURN_SHORT_XY;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayered overload for short4 textures addressed through texRef only.
+__TEXTURE_FUNCTIONS_DECL__ short4 tex1DLayered(texture<short4, texType, mode> texRef, float x,
+ int layer) {
+ TEXTURE_REF_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as short4 (TODO confirm).
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);  // float2 carries x and the layer index.
+ TEXTURE_RETURN_SHORT_XYZW;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayered overload for unsigned short textures addressed through texRef only.
+__TEXTURE_FUNCTIONS_DECL__ unsigned short tex1DLayered(
+ texture<unsigned short, texType, mode> texRef, float x, int layer) {
+ TEXTURE_REF_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as unsigned short (TODO confirm).
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);  // float2 carries x and the layer index.
+ TEXTURE_RETURN_USHORT;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayered overload for ushort1 textures addressed through texRef only.
+__TEXTURE_FUNCTIONS_DECL__ ushort1 tex1DLayered(texture<ushort1, texType, mode> texRef, float x,
+ int layer) {
+ TEXTURE_REF_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as ushort1 (TODO confirm).
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);  // float2 carries x and the layer index.
+ TEXTURE_RETURN_USHORT_X;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayered overload for ushort2 textures addressed through texRef only.
+__TEXTURE_FUNCTIONS_DECL__ ushort2 tex1DLayered(texture<ushort2, texType, mode> texRef, float x,
+ int layer) {
+ TEXTURE_REF_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as ushort2 (TODO confirm).
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);  // float2 carries x and the layer index.
+ TEXTURE_RETURN_USHORT_XY;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayered overload for ushort4 textures addressed through texRef only.
+__TEXTURE_FUNCTIONS_DECL__ ushort4 tex1DLayered(texture<ushort4, texType, mode> texRef, float x,
+ int layer) {
+ TEXTURE_REF_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as ushort4 (TODO confirm).
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);  // float2 carries x and the layer index.
+ TEXTURE_RETURN_USHORT_XYZW;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayered overload for int textures addressed through texRef only.
+__TEXTURE_FUNCTIONS_DECL__ int tex1DLayered(texture<int, texType, mode> texRef, float x,
+ int layer) {
+ TEXTURE_REF_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as int (TODO confirm).
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);  // float2 carries x and the layer index.
+ TEXTURE_RETURN_INT;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayered overload for int1 textures addressed through texRef only.
+__TEXTURE_FUNCTIONS_DECL__ int1 tex1DLayered(texture<int1, texType, mode> texRef, float x,
+ int layer) {
+ TEXTURE_REF_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as int1 (TODO confirm).
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);  // float2 carries x and the layer index.
+ TEXTURE_RETURN_INT_X;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayered overload for int2 textures addressed through texRef only.
+__TEXTURE_FUNCTIONS_DECL__ int2 tex1DLayered(texture<int2, texType, mode> texRef, float x,
+ int layer) {
+ TEXTURE_REF_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as int2 (TODO confirm).
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);  // float2 carries x and the layer index.
+ TEXTURE_RETURN_INT_XY;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayered overload for int4 textures addressed through texRef only.
+__TEXTURE_FUNCTIONS_DECL__ int4 tex1DLayered(texture<int4, texType, mode> texRef, float x,
+ int layer) {
+ TEXTURE_REF_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as int4 (TODO confirm).
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);  // float2 carries x and the layer index.
+ TEXTURE_RETURN_INT_XYZW;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayered overload for unsigned int textures addressed through texRef only.
+__TEXTURE_FUNCTIONS_DECL__ unsigned int tex1DLayered(texture<unsigned int, texType, mode> texRef,
+ float x, int layer) {
+ TEXTURE_REF_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as unsigned int (TODO confirm).
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);  // float2 carries x and the layer index.
+ TEXTURE_RETURN_UINT;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayered overload for uint1 textures addressed through texRef only.
+__TEXTURE_FUNCTIONS_DECL__ uint1 tex1DLayered(texture<uint1, texType, mode> texRef, float x,
+ int layer) {
+ TEXTURE_REF_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as uint1 (TODO confirm).
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);  // float2 carries x and the layer index.
+ TEXTURE_RETURN_UINT_X;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayered overload for uint2 textures addressed through texRef only.
+__TEXTURE_FUNCTIONS_DECL__ uint2 tex1DLayered(texture<uint2, texType, mode> texRef, float x,
+ int layer) {
+ TEXTURE_REF_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as uint2 (TODO confirm).
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);  // float2 carries x and the layer index.
+ TEXTURE_RETURN_UINT_XY;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayered overload for uint4 textures addressed through texRef only.
+__TEXTURE_FUNCTIONS_DECL__ uint4 tex1DLayered(texture<uint4, texType, mode> texRef, float x,
+ int layer) {
+ TEXTURE_REF_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as uint4 (TODO confirm).
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);  // float2 carries x and the layer index.
+ TEXTURE_RETURN_UINT_XYZW;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayered overload for float textures addressed through texRef only.
+__TEXTURE_FUNCTIONS_DECL__ float tex1DLayered(texture<float, texType, mode> texRef, float x,
+ int layer) {
+ TEXTURE_REF_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as float (TODO confirm).
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);  // float2 carries x and the layer index.
+ TEXTURE_RETURN_FLOAT;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayered overload for float1 textures addressed through texRef only.
+__TEXTURE_FUNCTIONS_DECL__ float1 tex1DLayered(texture<float1, texType, mode> texRef, float x,
+ int layer) {
+ TEXTURE_REF_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as float1 (TODO confirm).
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);  // float2 carries x and the layer index.
+ TEXTURE_RETURN_FLOAT_X;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayered overload for float2 textures addressed through texRef only.
+__TEXTURE_FUNCTIONS_DECL__ float2 tex1DLayered(texture<float2, texType, mode> texRef, float x,
+ int layer) {
+ TEXTURE_REF_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as float2 (TODO confirm).
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);  // float2 carries x and the layer index.
+ TEXTURE_RETURN_FLOAT_XY;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayered overload for float4 textures addressed through texRef only.
+__TEXTURE_FUNCTIONS_DECL__ float4 tex1DLayered(texture<float4, texType, mode> texRef, float x,
+ int layer) {
+ TEXTURE_REF_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as float4 (TODO confirm).
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);  // float2 carries x and the layer index.
+ TEXTURE_RETURN_FLOAT_XYZW;
+}
+
+////////////////////////////////////////////////////////////
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayered overload for char textures taking an explicit textureObject.
+__TEXTURE_FUNCTIONS_DECL__ char tex1DLayered(texture<char, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, int layer) {
+ TEXTURE_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as char (TODO confirm).
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);  // float2 carries x and the layer index.
+ TEXTURE_RETURN_CHAR;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayered overload for char1 textures taking an explicit textureObject.
+__TEXTURE_FUNCTIONS_DECL__ char1 tex1DLayered(texture<char1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ int layer) {
+ TEXTURE_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as char1 (TODO confirm).
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);  // float2 carries x and the layer index.
+ TEXTURE_RETURN_CHAR_X;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayered overload for char2 textures taking an explicit textureObject.
+__TEXTURE_FUNCTIONS_DECL__ char2 tex1DLayered(texture<char2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ int layer) {
+ TEXTURE_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as char2 (TODO confirm).
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);  // float2 carries x and the layer index.
+ TEXTURE_RETURN_CHAR_XY;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayered overload for char4 textures taking an explicit textureObject.
+__TEXTURE_FUNCTIONS_DECL__ char4 tex1DLayered(texture<char4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ int layer) {
+ TEXTURE_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as char4 (TODO confirm).
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);  // float2 carries x and the layer index.
+ TEXTURE_RETURN_CHAR_XYZW;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayered overload for unsigned char textures taking an explicit textureObject.
+__TEXTURE_FUNCTIONS_DECL__ unsigned char tex1DLayered(texture<unsigned char, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ int layer) {
+ TEXTURE_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as unsigned char (TODO confirm).
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);  // float2 carries x and the layer index.
+ TEXTURE_RETURN_UCHAR;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayered overload for uchar1 textures taking an explicit textureObject.
+__TEXTURE_FUNCTIONS_DECL__ uchar1 tex1DLayered(texture<uchar1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ int layer) {
+ TEXTURE_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as uchar1 (TODO confirm).
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);  // float2 carries x and the layer index.
+ TEXTURE_RETURN_UCHAR_X;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayered overload for uchar2 textures taking an explicit textureObject.
+__TEXTURE_FUNCTIONS_DECL__ uchar2 tex1DLayered(texture<uchar2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ int layer) {
+ TEXTURE_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as uchar2 (TODO confirm).
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);  // float2 carries x and the layer index.
+ TEXTURE_RETURN_UCHAR_XY;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayered overload for uchar4 textures taking an explicit textureObject.
+__TEXTURE_FUNCTIONS_DECL__ uchar4 tex1DLayered(texture<uchar4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ int layer) {
+ TEXTURE_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as uchar4 (TODO confirm).
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);  // float2 carries x and the layer index.
+ TEXTURE_RETURN_UCHAR_XYZW;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayered overload for short textures taking an explicit textureObject.
+__TEXTURE_FUNCTIONS_DECL__ short tex1DLayered(texture<short, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ int layer) {
+ TEXTURE_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as short (TODO confirm).
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);  // float2 carries x and the layer index.
+ TEXTURE_RETURN_SHORT;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayered overload for short1 textures taking an explicit textureObject.
+__TEXTURE_FUNCTIONS_DECL__ short1 tex1DLayered(texture<short1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ int layer) {
+ TEXTURE_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as short1 (TODO confirm).
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);  // float2 carries x and the layer index.
+ TEXTURE_RETURN_SHORT_X;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayered overload for short2 textures taking an explicit textureObject.
+__TEXTURE_FUNCTIONS_DECL__ short2 tex1DLayered(texture<short2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ int layer) {
+ TEXTURE_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as short2 (TODO confirm).
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);  // float2 carries x and the layer index.
+ TEXTURE_RETURN_SHORT_XY;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayered overload for short4 textures taking an explicit textureObject.
+__TEXTURE_FUNCTIONS_DECL__ short4 tex1DLayered(texture<short4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ int layer) {
+ TEXTURE_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as short4 (TODO confirm).
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);  // float2 carries x and the layer index.
+ TEXTURE_RETURN_SHORT_XYZW;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayered overload for unsigned short textures taking an explicit textureObject.
+__TEXTURE_FUNCTIONS_DECL__ unsigned short tex1DLayered(
+ texture<unsigned short, texType, mode> texRef, hipTextureObject_t textureObject, float x,
+ int layer) {
+ TEXTURE_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as unsigned short (TODO confirm).
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);  // float2 carries x and the layer index.
+ TEXTURE_RETURN_USHORT;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayered overload for ushort1 textures taking an explicit textureObject.
+__TEXTURE_FUNCTIONS_DECL__ ushort1 tex1DLayered(texture<ushort1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ int layer) {
+ TEXTURE_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as ushort1 (TODO confirm).
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);  // float2 carries x and the layer index.
+ TEXTURE_RETURN_USHORT_X;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayered overload for ushort2 textures taking an explicit textureObject.
+__TEXTURE_FUNCTIONS_DECL__ ushort2 tex1DLayered(texture<ushort2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ int layer) {
+ TEXTURE_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as ushort2 (TODO confirm).
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);  // float2 carries x and the layer index.
+ TEXTURE_RETURN_USHORT_XY;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayered overload for ushort4 textures taking an explicit textureObject.
+__TEXTURE_FUNCTIONS_DECL__ ushort4 tex1DLayered(texture<ushort4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ int layer) {
+ TEXTURE_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as ushort4 (TODO confirm).
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);  // float2 carries x and the layer index.
+ TEXTURE_RETURN_USHORT_XYZW;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayered overload for int textures taking an explicit textureObject.
+__TEXTURE_FUNCTIONS_DECL__ int tex1DLayered(texture<int, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, int layer) {
+ TEXTURE_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as int (TODO confirm).
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);  // float2 carries x and the layer index.
+ TEXTURE_RETURN_INT;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayered overload for int1 textures taking an explicit textureObject.
+__TEXTURE_FUNCTIONS_DECL__ int1 tex1DLayered(texture<int1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, int layer) {
+ TEXTURE_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as int1 (TODO confirm).
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);  // float2 carries x and the layer index.
+ TEXTURE_RETURN_INT_X;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayered overload for int2 textures taking an explicit textureObject.
+__TEXTURE_FUNCTIONS_DECL__ int2 tex1DLayered(texture<int2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, int layer) {
+ TEXTURE_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as int2 (TODO confirm).
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);  // float2 carries x and the layer index.
+ TEXTURE_RETURN_INT_XY;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayered overload for int4 textures taking an explicit textureObject.
+__TEXTURE_FUNCTIONS_DECL__ int4 tex1DLayered(texture<int4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, int layer) {
+ TEXTURE_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as int4 (TODO confirm).
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);  // float2 carries x and the layer index.
+ TEXTURE_RETURN_INT_XYZW;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayered overload for unsigned int textures taking an explicit textureObject.
+__TEXTURE_FUNCTIONS_DECL__ unsigned int tex1DLayered(texture<unsigned int, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ int layer) {
+ TEXTURE_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as unsigned int (TODO confirm).
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);  // float2 carries x and the layer index.
+ TEXTURE_RETURN_UINT;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayered overload for uint1 textures taking an explicit textureObject.
+__TEXTURE_FUNCTIONS_DECL__ uint1 tex1DLayered(texture<uint1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ int layer) {
+ TEXTURE_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as uint1 (TODO confirm).
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);  // float2 carries x and the layer index.
+ TEXTURE_RETURN_UINT_X;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayered overload for uint2 textures taking an explicit textureObject.
+__TEXTURE_FUNCTIONS_DECL__ uint2 tex1DLayered(texture<uint2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ int layer) {
+ TEXTURE_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as uint2 (TODO confirm).
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);  // float2 carries x and the layer index.
+ TEXTURE_RETURN_UINT_XY;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayered overload for uint4 textures taking an explicit textureObject.
+__TEXTURE_FUNCTIONS_DECL__ uint4 tex1DLayered(texture<uint4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ int layer) {
+ TEXTURE_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as uint4 (TODO confirm).
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);  // float2 carries x and the layer index.
+ TEXTURE_RETURN_UINT_XYZW;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayered overload for float textures taking an explicit textureObject.
+__TEXTURE_FUNCTIONS_DECL__ float tex1DLayered(texture<float, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ int layer) {
+ TEXTURE_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as float (TODO confirm).
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);  // float2 carries x and the layer index.
+ TEXTURE_RETURN_FLOAT;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayered overload for float1 textures taking an explicit textureObject.
+__TEXTURE_FUNCTIONS_DECL__ float1 tex1DLayered(texture<float1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ int layer) {
+ TEXTURE_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as float1 (TODO confirm).
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);  // float2 carries x and the layer index.
+ TEXTURE_RETURN_FLOAT_X;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayered overload for float2 textures taking an explicit textureObject.
+__TEXTURE_FUNCTIONS_DECL__ float2 tex1DLayered(texture<float2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ int layer) {
+ TEXTURE_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as float2 (TODO confirm).
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);  // float2 carries x and the layer index.
+ TEXTURE_RETURN_FLOAT_XY;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayered overload for float4 textures taking an explicit textureObject.
+__TEXTURE_FUNCTIONS_DECL__ float4 tex1DLayered(texture<float4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ int layer) {
+ TEXTURE_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as float4 (TODO confirm).
+ texel.f = __ockl_image_sample_1Da(i, s, float2(x, layer).data);  // float2 carries x and the layer index.
+ TEXTURE_RETURN_FLOAT_XYZW;
+}
+
+////////////////////////////////////////////////////////////
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayeredLod overload for char textures addressed through texRef only.
+__TEXTURE_FUNCTIONS_DECL__ char tex1DLayeredLod(texture<char, texType, mode> texRef, float x,
+ int layer, float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as char (TODO confirm).
+ texel.f =
+ __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level);  // float2 carries x and layer; level is a separate argument.
+ TEXTURE_RETURN_CHAR;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayeredLod overload for char1 textures addressed through texRef only.
+__TEXTURE_FUNCTIONS_DECL__ char1 tex1DLayeredLod(texture<char1, texType, mode> texRef, float x,
+ int layer, float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as char1 (TODO confirm).
+ texel.f =
+ __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level);  // float2 carries x and layer; level is a separate argument.
+ TEXTURE_RETURN_CHAR_X;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayeredLod overload for char2 textures addressed through texRef only.
+__TEXTURE_FUNCTIONS_DECL__ char2 tex1DLayeredLod(texture<char2, texType, mode> texRef, float x,
+ int layer, float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as char2 (TODO confirm).
+ texel.f =
+ __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level);  // float2 carries x and layer; level is a separate argument.
+ TEXTURE_RETURN_CHAR_XY;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayeredLod overload for char4 textures addressed through texRef only.
+__TEXTURE_FUNCTIONS_DECL__ char4 tex1DLayeredLod(texture<char4, texType, mode> texRef, float x,
+ int layer, float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as char4 (TODO confirm).
+ texel.f =
+ __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level);  // float2 carries x and layer; level is a separate argument.
+ TEXTURE_RETURN_CHAR_XYZW;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayeredLod overload for unsigned char textures addressed through texRef only.
+__TEXTURE_FUNCTIONS_DECL__ unsigned char tex1DLayeredLod(
+ texture<unsigned char, texType, mode> texRef, float x, int layer, float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as unsigned char (TODO confirm).
+ texel.f =
+ __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level);  // float2 carries x and layer; level is a separate argument.
+ TEXTURE_RETURN_UCHAR;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayeredLod overload for uchar1 textures addressed through texRef only.
+__TEXTURE_FUNCTIONS_DECL__ uchar1 tex1DLayeredLod(texture<uchar1, texType, mode> texRef, float x,
+ int layer, float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as uchar1 (TODO confirm).
+ texel.f =
+ __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level);  // float2 carries x and layer; level is a separate argument.
+ TEXTURE_RETURN_UCHAR_X;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayeredLod overload for uchar2 textures addressed through texRef only.
+__TEXTURE_FUNCTIONS_DECL__ uchar2 tex1DLayeredLod(texture<uchar2, texType, mode> texRef, float x,
+ int layer, float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as uchar2 (TODO confirm).
+ texel.f =
+ __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level);  // float2 carries x and layer; level is a separate argument.
+ TEXTURE_RETURN_UCHAR_XY;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayeredLod overload for uchar4 textures addressed through texRef only.
+__TEXTURE_FUNCTIONS_DECL__ uchar4 tex1DLayeredLod(texture<uchar4, texType, mode> texRef, float x,
+ int layer, float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as uchar4 (TODO confirm).
+ texel.f =
+ __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level);  // float2 carries x and layer; level is a separate argument.
+ TEXTURE_RETURN_UCHAR_XYZW;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayeredLod overload for short textures addressed through texRef only.
+__TEXTURE_FUNCTIONS_DECL__ short tex1DLayeredLod(texture<short, texType, mode> texRef, float x,
+ int layer, float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as short (TODO confirm).
+ texel.f =
+ __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level);  // float2 carries x and layer; level is a separate argument.
+ TEXTURE_RETURN_SHORT;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayeredLod overload for short1 textures addressed through texRef only.
+__TEXTURE_FUNCTIONS_DECL__ short1 tex1DLayeredLod(texture<short1, texType, mode> texRef, float x,
+ int layer, float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as short1 (TODO confirm).
+ texel.f =
+ __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level);  // float2 carries x and layer; level is a separate argument.
+ TEXTURE_RETURN_SHORT_X;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayeredLod overload for short2 textures addressed through texRef only.
+__TEXTURE_FUNCTIONS_DECL__ short2 tex1DLayeredLod(texture<short2, texType, mode> texRef, float x,
+ int layer, float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as short2 (TODO confirm).
+ texel.f =
+ __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level);  // float2 carries x and layer; level is a separate argument.
+ TEXTURE_RETURN_SHORT_XY;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayeredLod overload for short4 textures addressed through texRef only.
+__TEXTURE_FUNCTIONS_DECL__ short4 tex1DLayeredLod(texture<short4, texType, mode> texRef, float x,
+ int layer, float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as short4 (TODO confirm).
+ texel.f =
+ __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level);  // float2 carries x and layer; level is a separate argument.
+ TEXTURE_RETURN_SHORT_XYZW;
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex1DLayeredLod overload for unsigned short textures addressed through texRef only.
+__TEXTURE_FUNCTIONS_DECL__ unsigned short tex1DLayeredLod(
+ texture<unsigned short, texType, mode> texRef, float x, int layer, float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // INIT/RETURN macros live outside this chunk; presumably they set up i, s, texel and return texel as unsigned short (TODO confirm).
+ texel.f =
+ __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level);  // float2 carries x and layer; level is a separate argument.
+ TEXTURE_RETURN_USHORT;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ ushort1 tex1DLayeredLod(
+    texture<ushort1, texType, mode> texRef, float x, int layer, float level) {
+    TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s and texel from texRef
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_lod_1Da(i, s, layerCoord.data, level);
+    TEXTURE_RETURN_USHORT_X;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ ushort2 tex1DLayeredLod(
+    texture<ushort2, texType, mode> texRef, float x, int layer, float level) {
+    TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s and texel from texRef
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_lod_1Da(i, s, layerCoord.data, level);
+    TEXTURE_RETURN_USHORT_XY;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ ushort4 tex1DLayeredLod(
+    texture<ushort4, texType, mode> texRef, float x, int layer, float level) {
+    TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s and texel from texRef
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_lod_1Da(i, s, layerCoord.data, level);
+    TEXTURE_RETURN_USHORT_XYZW;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ int tex1DLayeredLod(
+    texture<int, texType, mode> texRef, float x, int layer, float level) {
+    TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s and texel from texRef
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_lod_1Da(i, s, layerCoord.data, level);
+    TEXTURE_RETURN_INT;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ int1 tex1DLayeredLod(
+    texture<int1, texType, mode> texRef, float x, int layer, float level) {
+    TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s and texel from texRef
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_lod_1Da(i, s, layerCoord.data, level);
+    TEXTURE_RETURN_INT_X;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ int2 tex1DLayeredLod(
+    texture<int2, texType, mode> texRef, float x, int layer, float level) {
+    TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s and texel from texRef
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_lod_1Da(i, s, layerCoord.data, level);
+    TEXTURE_RETURN_INT_XY;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ int4 tex1DLayeredLod(
+    texture<int4, texType, mode> texRef, float x, int layer, float level) {
+    TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s and texel from texRef
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_lod_1Da(i, s, layerCoord.data, level);
+    TEXTURE_RETURN_INT_XYZW;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ unsigned int tex1DLayeredLod(
+    texture<unsigned int, texType, mode> texRef, float x, int layer, float level) {
+    TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s and texel from texRef
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_lod_1Da(i, s, layerCoord.data, level);
+    TEXTURE_RETURN_UINT;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ uint1 tex1DLayeredLod(
+    texture<uint1, texType, mode> texRef, float x, int layer, float level) {
+    TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s and texel from texRef
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_lod_1Da(i, s, layerCoord.data, level);
+    TEXTURE_RETURN_UINT_X;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ uint2 tex1DLayeredLod(
+    texture<uint2, texType, mode> texRef, float x, int layer, float level) {
+    TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s and texel from texRef
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_lod_1Da(i, s, layerCoord.data, level);
+    TEXTURE_RETURN_UINT_XY;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ uint4 tex1DLayeredLod(
+    texture<uint4, texType, mode> texRef, float x, int layer, float level) {
+    TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s and texel from texRef
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_lod_1Da(i, s, layerCoord.data, level);
+    TEXTURE_RETURN_UINT_XYZW;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ float tex1DLayeredLod(
+    texture<float, texType, mode> texRef, float x, int layer, float level) {
+    TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s and texel from texRef
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_lod_1Da(i, s, layerCoord.data, level);
+    TEXTURE_RETURN_FLOAT;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ float1 tex1DLayeredLod(
+    texture<float1, texType, mode> texRef, float x, int layer, float level) {
+    TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s and texel from texRef
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_lod_1Da(i, s, layerCoord.data, level);
+    TEXTURE_RETURN_FLOAT_X;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ float2 tex1DLayeredLod(
+    texture<float2, texType, mode> texRef, float x, int layer, float level) {
+    TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s and texel from texRef
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_lod_1Da(i, s, layerCoord.data, level);
+    TEXTURE_RETURN_FLOAT_XY;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ float4 tex1DLayeredLod(
+    texture<float4, texType, mode> texRef, float x, int layer, float level) {
+    TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s and texel from texRef
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_lod_1Da(i, s, layerCoord.data, level);
+    TEXTURE_RETURN_FLOAT_XYZW;
+}
+
+////////////////////////////////////////////////////////////
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ char tex1DLayeredLod(
+    texture<char, texType, mode> texRef, hipTextureObject_t textureObject,
+    float x, int layer, float level) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel from textureObject
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_lod_1Da(i, s, layerCoord.data, level);
+    TEXTURE_RETURN_CHAR;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ char1 tex1DLayeredLod(
+    texture<char1, texType, mode> texRef, hipTextureObject_t textureObject,
+    float x, int layer, float level) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel from textureObject
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_lod_1Da(i, s, layerCoord.data, level);
+    TEXTURE_RETURN_CHAR_X;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ char2 tex1DLayeredLod(
+    texture<char2, texType, mode> texRef, hipTextureObject_t textureObject,
+    float x, int layer, float level) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel from textureObject
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_lod_1Da(i, s, layerCoord.data, level);
+    TEXTURE_RETURN_CHAR_XY;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ char4 tex1DLayeredLod(
+    texture<char4, texType, mode> texRef, hipTextureObject_t textureObject,
+    float x, int layer, float level) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel from textureObject
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_lod_1Da(i, s, layerCoord.data, level);
+    TEXTURE_RETURN_CHAR_XYZW;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ unsigned char tex1DLayeredLod(
+    texture<unsigned char, texType, mode> texRef, hipTextureObject_t textureObject,
+    float x, int layer, float level) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel from textureObject
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_lod_1Da(i, s, layerCoord.data, level);
+    TEXTURE_RETURN_UCHAR;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ uchar1 tex1DLayeredLod(
+    texture<uchar1, texType, mode> texRef, hipTextureObject_t textureObject,
+    float x, int layer, float level) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel from textureObject
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_lod_1Da(i, s, layerCoord.data, level);
+    TEXTURE_RETURN_UCHAR_X;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ uchar2 tex1DLayeredLod(
+    texture<uchar2, texType, mode> texRef, hipTextureObject_t textureObject,
+    float x, int layer, float level) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel from textureObject
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_lod_1Da(i, s, layerCoord.data, level);
+    TEXTURE_RETURN_UCHAR_XY;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ uchar4 tex1DLayeredLod(
+    texture<uchar4, texType, mode> texRef, hipTextureObject_t textureObject,
+    float x, int layer, float level) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel from textureObject
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_lod_1Da(i, s, layerCoord.data, level);
+    TEXTURE_RETURN_UCHAR_XYZW;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ short tex1DLayeredLod(
+    texture<short, texType, mode> texRef, hipTextureObject_t textureObject,
+    float x, int layer, float level) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel from textureObject
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_lod_1Da(i, s, layerCoord.data, level);
+    TEXTURE_RETURN_SHORT;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ short1 tex1DLayeredLod(
+    texture<short1, texType, mode> texRef, hipTextureObject_t textureObject,
+    float x, int layer, float level) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel from textureObject
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_lod_1Da(i, s, layerCoord.data, level);
+    TEXTURE_RETURN_SHORT_X;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ short2 tex1DLayeredLod(
+    texture<short2, texType, mode> texRef, hipTextureObject_t textureObject,
+    float x, int layer, float level) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel from textureObject
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_lod_1Da(i, s, layerCoord.data, level);
+    TEXTURE_RETURN_SHORT_XY;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ short4 tex1DLayeredLod(
+    texture<short4, texType, mode> texRef, hipTextureObject_t textureObject,
+    float x, int layer, float level) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel from textureObject
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_lod_1Da(i, s, layerCoord.data, level);
+    TEXTURE_RETURN_SHORT_XYZW;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ unsigned short tex1DLayeredLod(
+    texture<unsigned short, texType, mode> texRef, hipTextureObject_t textureObject,
+    float x, int layer, float level) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel from textureObject
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_lod_1Da(i, s, layerCoord.data, level);
+    TEXTURE_RETURN_USHORT;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ ushort1 tex1DLayeredLod(
+    texture<ushort1, texType, mode> texRef, hipTextureObject_t textureObject,
+    float x, int layer, float level) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel from textureObject
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_lod_1Da(i, s, layerCoord.data, level);
+    TEXTURE_RETURN_USHORT_X;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ ushort2 tex1DLayeredLod(
+    texture<ushort2, texType, mode> texRef, hipTextureObject_t textureObject,
+    float x, int layer, float level) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel from textureObject
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_lod_1Da(i, s, layerCoord.data, level);
+    TEXTURE_RETURN_USHORT_XY;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ ushort4 tex1DLayeredLod(
+    texture<ushort4, texType, mode> texRef, hipTextureObject_t textureObject,
+    float x, int layer, float level) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel from textureObject
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_lod_1Da(i, s, layerCoord.data, level);
+    TEXTURE_RETURN_USHORT_XYZW;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ int tex1DLayeredLod(
+    texture<int, texType, mode> texRef, hipTextureObject_t textureObject,
+    float x, int layer, float level) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel from textureObject
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_lod_1Da(i, s, layerCoord.data, level);
+    TEXTURE_RETURN_INT;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ int1 tex1DLayeredLod(
+    texture<int1, texType, mode> texRef, hipTextureObject_t textureObject,
+    float x, int layer, float level) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel from textureObject
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_lod_1Da(i, s, layerCoord.data, level);
+    TEXTURE_RETURN_INT_X;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ int2 tex1DLayeredLod(
+    texture<int2, texType, mode> texRef, hipTextureObject_t textureObject,
+    float x, int layer, float level) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel from textureObject
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_lod_1Da(i, s, layerCoord.data, level);
+    TEXTURE_RETURN_INT_XY;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ int4 tex1DLayeredLod(
+    texture<int4, texType, mode> texRef, hipTextureObject_t textureObject,
+    float x, int layer, float level) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel from textureObject
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_lod_1Da(i, s, layerCoord.data, level);
+    TEXTURE_RETURN_INT_XYZW;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ unsigned int tex1DLayeredLod(
+    texture<unsigned int, texType, mode> texRef, hipTextureObject_t textureObject,
+    float x, int layer, float level) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel from textureObject
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_lod_1Da(i, s, layerCoord.data, level);
+    TEXTURE_RETURN_UINT;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ uint1 tex1DLayeredLod(
+    texture<uint1, texType, mode> texRef, hipTextureObject_t textureObject,
+    float x, int layer, float level) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel from textureObject
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_lod_1Da(i, s, layerCoord.data, level);
+    TEXTURE_RETURN_UINT_X;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ uint2 tex1DLayeredLod(
+    texture<uint2, texType, mode> texRef, hipTextureObject_t textureObject,
+    float x, int layer, float level) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel from textureObject
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_lod_1Da(i, s, layerCoord.data, level);
+    TEXTURE_RETURN_UINT_XY;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ uint4 tex1DLayeredLod(
+    texture<uint4, texType, mode> texRef, hipTextureObject_t textureObject,
+    float x, int layer, float level) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel from textureObject
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_lod_1Da(i, s, layerCoord.data, level);
+    TEXTURE_RETURN_UINT_XYZW;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ float tex1DLayeredLod(
+    texture<float, texType, mode> texRef, hipTextureObject_t textureObject,
+    float x, int layer, float level) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel from textureObject
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_lod_1Da(i, s, layerCoord.data, level);
+    TEXTURE_RETURN_FLOAT;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ float1 tex1DLayeredLod(
+    texture<float1, texType, mode> texRef, hipTextureObject_t textureObject,
+    float x, int layer, float level) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel from textureObject
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_lod_1Da(i, s, layerCoord.data, level);
+    TEXTURE_RETURN_FLOAT_X;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ float2 tex1DLayeredLod(
+    texture<float2, texType, mode> texRef, hipTextureObject_t textureObject,
+    float x, int layer, float level) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel from textureObject
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_lod_1Da(i, s, layerCoord.data, level);
+    TEXTURE_RETURN_FLOAT_XY;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ float4 tex1DLayeredLod(
+    texture<float4, texType, mode> texRef, hipTextureObject_t textureObject,
+    float x, int layer, float level) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel from textureObject
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_lod_1Da(i, s, layerCoord.data, level);
+    TEXTURE_RETURN_FLOAT_XYZW;
+}
+
+////////////////////////////////////////////////////////////
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ char tex1DLayeredGrad(
+    texture<char, texType, mode> texRef, float x, int layer, float dx, float dy) {
+    TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s and texel from texRef
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_grad_1Da(i, s, layerCoord.data, dx, dy);
+    TEXTURE_RETURN_CHAR;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ char tex1DLayeredGrad(
+    texture<char, texType, mode> texRef, hipTextureObject_t textureObject,
+    float x, int layer, float dx, float dy) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel from textureObject
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_grad_1Da(i, s, layerCoord.data, dx, dy);
+    TEXTURE_RETURN_CHAR;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ char1 tex1DLayeredGrad(
+    texture<char1, texType, mode> texRef, float x, int layer, float dx, float dy) {
+    TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s and texel from texRef
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_grad_1Da(i, s, layerCoord.data, dx, dy);
+    TEXTURE_RETURN_CHAR_X;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ char1 tex1DLayeredGrad(
+    texture<char1, texType, mode> texRef, hipTextureObject_t textureObject,
+    float x, int layer, float dx, float dy) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel from textureObject
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_grad_1Da(i, s, layerCoord.data, dx, dy);
+    TEXTURE_RETURN_CHAR_X;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ char2 tex1DLayeredGrad(
+    texture<char2, texType, mode> texRef, float x, int layer, float dx, float dy) {
+    TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s and texel from texRef
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_grad_1Da(i, s, layerCoord.data, dx, dy);
+    TEXTURE_RETURN_CHAR_XY;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ char2 tex1DLayeredGrad(
+    texture<char2, texType, mode> texRef, hipTextureObject_t textureObject,
+    float x, int layer, float dx, float dy) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel from textureObject
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_grad_1Da(i, s, layerCoord.data, dx, dy);
+    TEXTURE_RETURN_CHAR_XY;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ char4 tex1DLayeredGrad(
+    texture<char4, texType, mode> texRef, float x, int layer, float dx, float dy) {
+    TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s and texel from texRef
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_grad_1Da(i, s, layerCoord.data, dx, dy);
+    TEXTURE_RETURN_CHAR_XYZW;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ char4 tex1DLayeredGrad(
+    texture<char4, texType, mode> texRef, hipTextureObject_t textureObject,
+    float x, int layer, float dx, float dy) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel from textureObject
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_grad_1Da(i, s, layerCoord.data, dx, dy);
+    TEXTURE_RETURN_CHAR_XYZW;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ unsigned char tex1DLayeredGrad(
+    texture<unsigned char, texType, mode> texRef, float x, int layer, float dx, float dy) {
+    TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s and texel from texRef
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_grad_1Da(i, s, layerCoord.data, dx, dy);
+    TEXTURE_RETURN_UCHAR;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ unsigned char tex1DLayeredGrad(
+    texture<unsigned char, texType, mode> texRef, hipTextureObject_t textureObject,
+    float x, int layer, float dx, float dy) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel from textureObject
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_grad_1Da(i, s, layerCoord.data, dx, dy);
+    TEXTURE_RETURN_UCHAR;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ uchar1 tex1DLayeredGrad(
+    texture<uchar1, texType, mode> texRef, float x, int layer, float dx, float dy) {
+    TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s and texel from texRef
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_grad_1Da(i, s, layerCoord.data, dx, dy);
+    TEXTURE_RETURN_UCHAR_X;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ uchar1 tex1DLayeredGrad(
+    texture<uchar1, texType, mode> texRef, hipTextureObject_t textureObject,
+    float x, int layer, float dx, float dy) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel from textureObject
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_grad_1Da(i, s, layerCoord.data, dx, dy);
+    TEXTURE_RETURN_UCHAR_X;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ uchar2 tex1DLayeredGrad(
+    texture<uchar2, texType, mode> texRef, float x, int layer, float dx, float dy) {
+    TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s and texel from texRef
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_grad_1Da(i, s, layerCoord.data, dx, dy);
+    TEXTURE_RETURN_UCHAR_XY;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ uchar2 tex1DLayeredGrad(
+    texture<uchar2, texType, mode> texRef, hipTextureObject_t textureObject,
+    float x, int layer, float dx, float dy) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel from textureObject
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_grad_1Da(i, s, layerCoord.data, dx, dy);
+    TEXTURE_RETURN_UCHAR_XY;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ uchar4 tex1DLayeredGrad(
+    texture<uchar4, texType, mode> texRef, float x, int layer, float dx, float dy) {
+    TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s and texel from texRef
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_grad_1Da(i, s, layerCoord.data, dx, dy);
+    TEXTURE_RETURN_UCHAR_XYZW;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ uchar4 tex1DLayeredGrad(
+    texture<uchar4, texType, mode> texRef, hipTextureObject_t textureObject,
+    float x, int layer, float dx, float dy) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel from textureObject
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_grad_1Da(i, s, layerCoord.data, dx, dy);
+    TEXTURE_RETURN_UCHAR_XYZW;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ short tex1DLayeredGrad(
+    texture<short, texType, mode> texRef, float x, int layer, float dx, float dy) {
+    TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s and texel from texRef
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_grad_1Da(i, s, layerCoord.data, dx, dy);
+    TEXTURE_RETURN_SHORT;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ short tex1DLayeredGrad(
+    texture<short, texType, mode> texRef, hipTextureObject_t textureObject,
+    float x, int layer, float dx, float dy) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel from textureObject
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_grad_1Da(i, s, layerCoord.data, dx, dy);
+    TEXTURE_RETURN_SHORT;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ short1 tex1DLayeredGrad(
+    texture<short1, texType, mode> texRef, float x, int layer, float dx, float dy) {
+    TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s and texel from texRef
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_grad_1Da(i, s, layerCoord.data, dx, dy);
+    TEXTURE_RETURN_SHORT_X;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ short1 tex1DLayeredGrad(
+    texture<short1, texType, mode> texRef, hipTextureObject_t textureObject,
+    float x, int layer, float dx, float dy) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel from textureObject
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_grad_1Da(i, s, layerCoord.data, dx, dy);
+    TEXTURE_RETURN_SHORT_X;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ short2 tex1DLayeredGrad(
+    texture<short2, texType, mode> texRef, float x, int layer, float dx, float dy) {
+    TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s and texel from texRef
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_grad_1Da(i, s, layerCoord.data, dx, dy);
+    TEXTURE_RETURN_SHORT_XY;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ short2 tex1DLayeredGrad(
+    texture<short2, texType, mode> texRef, hipTextureObject_t textureObject,
+    float x, int layer, float dx, float dy) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel from textureObject
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_grad_1Da(i, s, layerCoord.data, dx, dy);
+    TEXTURE_RETURN_SHORT_XY;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ short4 tex1DLayeredGrad(
+    texture<short4, texType, mode> texRef, float x, int layer, float dx, float dy) {
+    TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s and texel from texRef
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_grad_1Da(i, s, layerCoord.data, dx, dy);
+    TEXTURE_RETURN_SHORT_XYZW;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ short4 tex1DLayeredGrad(
+    texture<short4, texType, mode> texRef, hipTextureObject_t textureObject,
+    float x, int layer, float dx, float dy) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel from textureObject
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_grad_1Da(i, s, layerCoord.data, dx, dy);
+    TEXTURE_RETURN_SHORT_XYZW;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ unsigned short tex1DLayeredGrad(
+    texture<unsigned short, texType, mode> texRef, float x, int layer, float dx, float dy) {
+    TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s and texel from texRef
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_grad_1Da(i, s, layerCoord.data, dx, dy);
+    TEXTURE_RETURN_USHORT;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ unsigned short tex1DLayeredGrad(
+    texture<unsigned short, texType, mode> texRef, hipTextureObject_t textureObject,
+    float x, int layer, float dx, float dy) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel from textureObject
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_grad_1Da(i, s, layerCoord.data, dx, dy);
+    TEXTURE_RETURN_USHORT;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ ushort1 tex1DLayeredGrad(
+    texture<ushort1, texType, mode> texRef, float x, int layer, float dx, float dy) {
+    TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s and texel from texRef
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_grad_1Da(i, s, layerCoord.data, dx, dy);
+    TEXTURE_RETURN_USHORT_X;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ ushort1 tex1DLayeredGrad(
+    texture<ushort1, texType, mode> texRef, hipTextureObject_t textureObject,
+    float x, int layer, float dx, float dy) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel from textureObject
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_grad_1Da(i, s, layerCoord.data, dx, dy);
+    TEXTURE_RETURN_USHORT_X;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ ushort2 tex1DLayeredGrad(
+    texture<ushort2, texType, mode> texRef, float x, int layer, float dx, float dy) {
+    TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s and texel from texRef
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_grad_1Da(i, s, layerCoord.data, dx, dy);
+    TEXTURE_RETURN_USHORT_XY;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ ushort2 tex1DLayeredGrad(
+    texture<ushort2, texType, mode> texRef, hipTextureObject_t textureObject,
+    float x, int layer, float dx, float dy) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel from textureObject
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_grad_1Da(i, s, layerCoord.data, dx, dy);
+    TEXTURE_RETURN_USHORT_XY;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ ushort4 tex1DLayeredGrad(
+    texture<ushort4, texType, mode> texRef, float x, int layer, float dx, float dy) {
+    TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s and texel from texRef
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_grad_1Da(i, s, layerCoord.data, dx, dy);
+    TEXTURE_RETURN_USHORT_XYZW;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ ushort4 tex1DLayeredGrad(
+    texture<ushort4, texType, mode> texRef, hipTextureObject_t textureObject,
+    float x, int layer, float dx, float dy) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel from textureObject
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_grad_1Da(i, s, layerCoord.data, dx, dy);
+    TEXTURE_RETURN_USHORT_XYZW;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ int tex1DLayeredGrad(
+    texture<int, texType, mode> texRef, float x, int layer, float dx, float dy) {
+    TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s and texel from texRef
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_grad_1Da(i, s, layerCoord.data, dx, dy);
+    TEXTURE_RETURN_INT;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ int tex1DLayeredGrad(
+    texture<int, texType, mode> texRef, hipTextureObject_t textureObject,
+    float x, int layer, float dx, float dy) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel from textureObject
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_grad_1Da(i, s, layerCoord.data, dx, dy);
+    TEXTURE_RETURN_INT;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ int1 tex1DLayeredGrad(
+    texture<int1, texType, mode> texRef, float x, int layer, float dx, float dy) {
+    TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s and texel from texRef
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_grad_1Da(i, s, layerCoord.data, dx, dy);
+    TEXTURE_RETURN_INT_X;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ int1 tex1DLayeredGrad(
+    texture<int1, texType, mode> texRef, hipTextureObject_t textureObject,
+    float x, int layer, float dx, float dy) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel from textureObject
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_grad_1Da(i, s, layerCoord.data, dx, dy);
+    TEXTURE_RETURN_INT_X;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ int2 tex1DLayeredGrad(
+    texture<int2, texType, mode> texRef, float x, int layer, float dx, float dy) {
+    TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s and texel from texRef
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_grad_1Da(i, s, layerCoord.data, dx, dy);
+    TEXTURE_RETURN_INT_XY;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ int2 tex1DLayeredGrad(
+    texture<int2, texType, mode> texRef, hipTextureObject_t textureObject,
+    float x, int layer, float dx, float dy) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel from textureObject
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_grad_1Da(i, s, layerCoord.data, dx, dy);
+    TEXTURE_RETURN_INT_XY;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ int4 tex1DLayeredGrad(
+    texture<int4, texType, mode> texRef, float x, int layer, float dx, float dy) {
+    TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s and texel from texRef
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_grad_1Da(i, s, layerCoord.data, dx, dy);
+    TEXTURE_RETURN_INT_XYZW;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ int4 tex1DLayeredGrad(
+    texture<int4, texType, mode> texRef, hipTextureObject_t textureObject,
+    float x, int layer, float dx, float dy) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel from textureObject
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_grad_1Da(i, s, layerCoord.data, dx, dy);
+    TEXTURE_RETURN_INT_XYZW;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ unsigned int tex1DLayeredGrad(
+    texture<unsigned int, texType, mode> texRef, float x, int layer, float dx, float dy) {
+    TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s and texel from texRef
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_grad_1Da(i, s, layerCoord.data, dx, dy);
+    TEXTURE_RETURN_UINT;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ unsigned int tex1DLayeredGrad(
+    texture<unsigned int, texType, mode> texRef, hipTextureObject_t textureObject,
+    float x, int layer, float dx, float dy) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel from textureObject
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_grad_1Da(i, s, layerCoord.data, dx, dy);
+    TEXTURE_RETURN_UINT;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ uint1 tex1DLayeredGrad(
+    texture<uint1, texType, mode> texRef, float x, int layer, float dx, float dy) {
+    TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s and texel from texRef
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_grad_1Da(i, s, layerCoord.data, dx, dy);
+    TEXTURE_RETURN_UINT_X;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ uint1 tex1DLayeredGrad(
+    texture<uint1, texType, mode> texRef, hipTextureObject_t textureObject,
+    float x, int layer, float dx, float dy) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel from textureObject
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_grad_1Da(i, s, layerCoord.data, dx, dy);
+    TEXTURE_RETURN_UINT_X;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ uint2 tex1DLayeredGrad(
+    texture<uint2, texType, mode> texRef, float x, int layer, float dx, float dy) {
+    TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s and texel from texRef
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_grad_1Da(i, s, layerCoord.data, dx, dy);
+    TEXTURE_RETURN_UINT_XY;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ uint2 tex1DLayeredGrad(
+    texture<uint2, texType, mode> texRef, hipTextureObject_t textureObject,
+    float x, int layer, float dx, float dy) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel from textureObject
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_grad_1Da(i, s, layerCoord.data, dx, dy);
+    TEXTURE_RETURN_UINT_XY;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ uint4 tex1DLayeredGrad(
+    texture<uint4, texType, mode> texRef, float x, int layer, float dx, float dy) {
+    TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s and texel from texRef
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_grad_1Da(i, s, layerCoord.data, dx, dy);
+    TEXTURE_RETURN_UINT_XYZW;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ uint4 tex1DLayeredGrad(
+    texture<uint4, texType, mode> texRef, hipTextureObject_t textureObject,
+    float x, int layer, float dx, float dy) {
+    TEXTURE_PARAMETERS_INIT;  // presumably declares i, s and texel from textureObject
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_grad_1Da(i, s, layerCoord.data, dx, dy);
+    TEXTURE_RETURN_UINT_XYZW;
+}
+
+template <int texType, enum hipTextureReadMode mode>
+__TEXTURE_FUNCTIONS_DECL__ float tex1DLayeredGrad(
+    texture<float, texType, mode> texRef, float x, int layer, float dx, float dy) {
+    TEXTURE_REF_PARAMETERS_INIT;  // presumably declares i, s and texel from texRef
+    float2 layerCoord(x, layer);  // x and layer packed into one float2 for the 1Da sampler
+    texel.f = __ockl_image_sample_grad_1Da(i, s, layerCoord.data, dx, dy);
+    TEXTURE_RETURN_FLOAT;
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex1DLayeredGrad, float texel, texture-object overload
+__TEXTURE_FUNCTIONS_DECL__ float tex1DLayeredGrad(texture<float, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ int layer, float dx, float dy) {
+ TEXTURE_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from textureObject -- confirm
+ texel.f =
+ __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); // coords packed as (x, layer); dx, dy passed through as given
+ TEXTURE_RETURN_FLOAT; // macro defined elsewhere; presumably returns texel as float -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex1DLayeredGrad, float1 texel, texture-reference overload
+__TEXTURE_FUNCTIONS_DECL__ float1 tex1DLayeredGrad(texture<float1, texType, mode> texRef, float x,
+ int layer, float dx, float dy) {
+ TEXTURE_REF_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from texRef -- confirm
+ texel.f =
+ __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); // coords packed as (x, layer); dx, dy passed through as given
+ TEXTURE_RETURN_FLOAT_X; // macro defined elsewhere; presumably returns texel as float1 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex1DLayeredGrad, float1 texel, texture-object overload
+__TEXTURE_FUNCTIONS_DECL__ float1 tex1DLayeredGrad(texture<float1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ int layer, float dx, float dy) {
+ TEXTURE_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from textureObject -- confirm
+ texel.f =
+ __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); // coords packed as (x, layer); dx, dy passed through as given
+ TEXTURE_RETURN_FLOAT_X; // macro defined elsewhere; presumably returns texel as float1 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex1DLayeredGrad, float2 texel, texture-reference overload
+__TEXTURE_FUNCTIONS_DECL__ float2 tex1DLayeredGrad(texture<float2, texType, mode> texRef, float x,
+ int layer, float dx, float dy) {
+ TEXTURE_REF_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from texRef -- confirm
+ texel.f =
+ __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); // coords packed as (x, layer); dx, dy passed through as given
+ TEXTURE_RETURN_FLOAT_XY; // macro defined elsewhere; presumably returns texel as float2 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex1DLayeredGrad, float2 texel, texture-object overload
+__TEXTURE_FUNCTIONS_DECL__ float2 tex1DLayeredGrad(texture<float2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ int layer, float dx, float dy) {
+ TEXTURE_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from textureObject -- confirm
+ texel.f =
+ __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); // coords packed as (x, layer); dx, dy passed through as given
+ TEXTURE_RETURN_FLOAT_XY; // macro defined elsewhere; presumably returns texel as float2 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex1DLayeredGrad, float4 texel, texture-reference overload
+__TEXTURE_FUNCTIONS_DECL__ float4 tex1DLayeredGrad(texture<float4, texType, mode> texRef, float x,
+ int layer, float dx, float dy) {
+ TEXTURE_REF_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from texRef -- confirm
+ texel.f =
+ __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); // coords packed as (x, layer); dx, dy passed through as given
+ TEXTURE_RETURN_FLOAT_XYZW; // macro defined elsewhere; presumably returns texel as float4 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex1DLayeredGrad, float4 texel, texture-object overload
+__TEXTURE_FUNCTIONS_DECL__ float4 tex1DLayeredGrad(texture<float4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ int layer, float dx, float dy) {
+ TEXTURE_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from textureObject -- confirm
+ texel.f =
+ __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dx, dy); // coords packed as (x, layer); dx, dy passed through as given
+ TEXTURE_RETURN_FLOAT_XYZW; // macro defined elsewhere; presumably returns texel as float4 -- confirm
+}
+
+////////////////////////////////////////////////////////////
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayered, char texel, texture-reference overload
+__TEXTURE_FUNCTIONS_DECL__ char tex2DLayered(texture<char, texType, mode> texRef, float x, float y,
+ int layer) {
+ TEXTURE_REF_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from texRef -- confirm
+ texel.f =
+ __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); // coords packed as (x, y, layer, 0.0f)
+ TEXTURE_RETURN_CHAR; // macro defined elsewhere; presumably returns texel as char -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayered, char texel, texture-object overload
+__TEXTURE_FUNCTIONS_DECL__ char tex2DLayered(texture<char, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ int layer) {
+ TEXTURE_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from textureObject -- confirm
+ texel.f =
+ __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); // coords packed as (x, y, layer, 0.0f)
+ TEXTURE_RETURN_CHAR; // macro defined elsewhere; presumably returns texel as char -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayered, char1 texel, texture-reference overload
+__TEXTURE_FUNCTIONS_DECL__ char1 tex2DLayered(texture<char1, texType, mode> texRef, float x,
+ float y, int layer) {
+ TEXTURE_REF_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from texRef -- confirm
+ texel.f =
+ __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); // coords packed as (x, y, layer, 0.0f)
+ TEXTURE_RETURN_CHAR_X; // macro defined elsewhere; presumably returns texel as char1 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayered, char1 texel, texture-object overload
+__TEXTURE_FUNCTIONS_DECL__ char1 tex2DLayered(texture<char1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ int layer) {
+ TEXTURE_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from textureObject -- confirm
+ texel.f =
+ __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); // coords packed as (x, y, layer, 0.0f)
+ TEXTURE_RETURN_CHAR_X; // macro defined elsewhere; presumably returns texel as char1 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayered, char2 texel, texture-reference overload
+__TEXTURE_FUNCTIONS_DECL__ char2 tex2DLayered(texture<char2, texType, mode> texRef, float x,
+ float y, int layer) {
+ TEXTURE_REF_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from texRef -- confirm
+ texel.f =
+ __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); // coords packed as (x, y, layer, 0.0f)
+ TEXTURE_RETURN_CHAR_XY; // macro defined elsewhere; presumably returns texel as char2 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayered, char2 texel, texture-object overload
+__TEXTURE_FUNCTIONS_DECL__ char2 tex2DLayered(texture<char2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ int layer) {
+ TEXTURE_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from textureObject -- confirm
+ texel.f =
+ __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); // coords packed as (x, y, layer, 0.0f)
+ TEXTURE_RETURN_CHAR_XY; // macro defined elsewhere; presumably returns texel as char2 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayered, char4 texel, texture-reference overload
+__TEXTURE_FUNCTIONS_DECL__ char4 tex2DLayered(texture<char4, texType, mode> texRef, float x,
+ float y, int layer) {
+ TEXTURE_REF_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from texRef -- confirm
+ texel.f =
+ __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); // coords packed as (x, y, layer, 0.0f)
+ TEXTURE_RETURN_CHAR_XYZW; // macro defined elsewhere; presumably returns texel as char4 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayered, char4 texel, texture-object overload
+__TEXTURE_FUNCTIONS_DECL__ char4 tex2DLayered(texture<char4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ int layer) {
+ TEXTURE_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from textureObject -- confirm
+ texel.f =
+ __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); // coords packed as (x, y, layer, 0.0f)
+ TEXTURE_RETURN_CHAR_XYZW; // macro defined elsewhere; presumably returns texel as char4 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayered, unsigned char texel, texture-reference overload
+__TEXTURE_FUNCTIONS_DECL__ unsigned char tex2DLayered(texture<unsigned char, texType, mode> texRef,
+ float x, float y, int layer) {
+ TEXTURE_REF_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from texRef -- confirm
+ texel.f =
+ __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); // coords packed as (x, y, layer, 0.0f)
+ TEXTURE_RETURN_UCHAR; // macro defined elsewhere; presumably returns texel as unsigned char -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayered, unsigned char texel, texture-object overload
+__TEXTURE_FUNCTIONS_DECL__ unsigned char tex2DLayered(texture<unsigned char, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ float y, int layer) {
+ TEXTURE_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from textureObject -- confirm
+ texel.f =
+ __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); // coords packed as (x, y, layer, 0.0f)
+ TEXTURE_RETURN_UCHAR; // macro defined elsewhere; presumably returns texel as unsigned char -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayered, uchar1 texel, texture-reference overload
+__TEXTURE_FUNCTIONS_DECL__ uchar1 tex2DLayered(texture<uchar1, texType, mode> texRef, float x,
+ float y, int layer) {
+ TEXTURE_REF_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from texRef -- confirm
+ texel.f =
+ __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); // coords packed as (x, y, layer, 0.0f)
+ TEXTURE_RETURN_UCHAR_X; // macro defined elsewhere; presumably returns texel as uchar1 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayered, uchar1 texel, texture-object overload
+__TEXTURE_FUNCTIONS_DECL__ uchar1 tex2DLayered(texture<uchar1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ int layer) {
+ TEXTURE_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from textureObject -- confirm
+ texel.f =
+ __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); // coords packed as (x, y, layer, 0.0f)
+ TEXTURE_RETURN_UCHAR_X; // macro defined elsewhere; presumably returns texel as uchar1 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayered, uchar2 texel, texture-reference overload
+__TEXTURE_FUNCTIONS_DECL__ uchar2 tex2DLayered(texture<uchar2, texType, mode> texRef, float x,
+ float y, int layer) {
+ TEXTURE_REF_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from texRef -- confirm
+ texel.f =
+ __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); // coords packed as (x, y, layer, 0.0f)
+ TEXTURE_RETURN_UCHAR_XY; // macro defined elsewhere; presumably returns texel as uchar2 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayered, uchar2 texel, texture-object overload
+__TEXTURE_FUNCTIONS_DECL__ uchar2 tex2DLayered(texture<uchar2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ int layer) {
+ TEXTURE_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from textureObject -- confirm
+ texel.f =
+ __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); // coords packed as (x, y, layer, 0.0f)
+ TEXTURE_RETURN_UCHAR_XY; // macro defined elsewhere; presumably returns texel as uchar2 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayered, uchar4 texel, texture-reference overload
+__TEXTURE_FUNCTIONS_DECL__ uchar4 tex2DLayered(texture<uchar4, texType, mode> texRef, float x,
+ float y, int layer) {
+ TEXTURE_REF_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from texRef -- confirm
+ texel.f =
+ __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); // coords packed as (x, y, layer, 0.0f)
+ TEXTURE_RETURN_UCHAR_XYZW; // macro defined elsewhere; presumably returns texel as uchar4 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayered, uchar4 texel, texture-object overload
+__TEXTURE_FUNCTIONS_DECL__ uchar4 tex2DLayered(texture<uchar4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ int layer) {
+ TEXTURE_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from textureObject -- confirm
+ texel.f =
+ __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); // coords packed as (x, y, layer, 0.0f)
+ TEXTURE_RETURN_UCHAR_XYZW; // macro defined elsewhere; presumably returns texel as uchar4 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayered, short texel, texture-reference overload
+__TEXTURE_FUNCTIONS_DECL__ short tex2DLayered(texture<short, texType, mode> texRef, float x,
+ float y, int layer) {
+ TEXTURE_REF_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from texRef -- confirm
+ texel.f =
+ __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); // coords packed as (x, y, layer, 0.0f)
+ TEXTURE_RETURN_SHORT; // macro defined elsewhere; presumably returns texel as short -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayered, short texel, texture-object overload
+__TEXTURE_FUNCTIONS_DECL__ short tex2DLayered(texture<short, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ int layer) {
+ TEXTURE_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from textureObject -- confirm
+ texel.f =
+ __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); // coords packed as (x, y, layer, 0.0f)
+ TEXTURE_RETURN_SHORT; // macro defined elsewhere; presumably returns texel as short -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayered, short1 texel, texture-reference overload
+__TEXTURE_FUNCTIONS_DECL__ short1 tex2DLayered(texture<short1, texType, mode> texRef, float x,
+ float y, int layer) {
+ TEXTURE_REF_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from texRef -- confirm
+ texel.f =
+ __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); // coords packed as (x, y, layer, 0.0f)
+ TEXTURE_RETURN_SHORT_X; // macro defined elsewhere; presumably returns texel as short1 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayered, short1 texel, texture-object overload
+__TEXTURE_FUNCTIONS_DECL__ short1 tex2DLayered(texture<short1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ int layer) {
+ TEXTURE_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from textureObject -- confirm
+ texel.f =
+ __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); // coords packed as (x, y, layer, 0.0f)
+ TEXTURE_RETURN_SHORT_X; // macro defined elsewhere; presumably returns texel as short1 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayered, short2 texel, texture-reference overload
+__TEXTURE_FUNCTIONS_DECL__ short2 tex2DLayered(texture<short2, texType, mode> texRef, float x,
+ float y, int layer) {
+ TEXTURE_REF_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from texRef -- confirm
+ texel.f =
+ __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); // coords packed as (x, y, layer, 0.0f)
+ TEXTURE_RETURN_SHORT_XY; // macro defined elsewhere; presumably returns texel as short2 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayered, short2 texel, texture-object overload
+__TEXTURE_FUNCTIONS_DECL__ short2 tex2DLayered(texture<short2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ int layer) {
+ TEXTURE_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from textureObject -- confirm
+ texel.f =
+ __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); // coords packed as (x, y, layer, 0.0f)
+ TEXTURE_RETURN_SHORT_XY; // macro defined elsewhere; presumably returns texel as short2 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayered, short4 texel, texture-reference overload
+__TEXTURE_FUNCTIONS_DECL__ short4 tex2DLayered(texture<short4, texType, mode> texRef, float x,
+ float y, int layer) {
+ TEXTURE_REF_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from texRef -- confirm
+ texel.f =
+ __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); // coords packed as (x, y, layer, 0.0f)
+ TEXTURE_RETURN_SHORT_XYZW; // macro defined elsewhere; presumably returns texel as short4 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayered, short4 texel, texture-object overload
+__TEXTURE_FUNCTIONS_DECL__ short4 tex2DLayered(texture<short4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ int layer) {
+ TEXTURE_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from textureObject -- confirm
+ texel.f =
+ __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); // coords packed as (x, y, layer, 0.0f)
+ TEXTURE_RETURN_SHORT_XYZW; // macro defined elsewhere; presumably returns texel as short4 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayered, unsigned short texel, texture-reference overload
+__TEXTURE_FUNCTIONS_DECL__ unsigned short tex2DLayered(
+ texture<unsigned short, texType, mode> texRef, float x, float y, int layer) {
+ TEXTURE_REF_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from texRef -- confirm
+ texel.f =
+ __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); // coords packed as (x, y, layer, 0.0f)
+ TEXTURE_RETURN_USHORT; // macro defined elsewhere; presumably returns texel as unsigned short -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayered, unsigned short texel, texture-object overload
+__TEXTURE_FUNCTIONS_DECL__ unsigned short tex2DLayered(
+ texture<unsigned short, texType, mode> texRef, hipTextureObject_t textureObject, float x,
+ float y, int layer) {
+ TEXTURE_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from textureObject -- confirm
+ texel.f =
+ __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); // coords packed as (x, y, layer, 0.0f)
+ TEXTURE_RETURN_USHORT; // macro defined elsewhere; presumably returns texel as unsigned short -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayered, ushort1 texel, texture-reference overload
+__TEXTURE_FUNCTIONS_DECL__ ushort1 tex2DLayered(texture<ushort1, texType, mode> texRef, float x,
+ float y, int layer) {
+ TEXTURE_REF_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from texRef -- confirm
+ texel.f =
+ __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); // coords packed as (x, y, layer, 0.0f)
+ TEXTURE_RETURN_USHORT_X; // macro defined elsewhere; presumably returns texel as ushort1 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayered, ushort1 texel, texture-object overload
+__TEXTURE_FUNCTIONS_DECL__ ushort1 tex2DLayered(texture<ushort1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ int layer) {
+ TEXTURE_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from textureObject -- confirm
+ texel.f =
+ __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); // coords packed as (x, y, layer, 0.0f)
+ TEXTURE_RETURN_USHORT_X; // macro defined elsewhere; presumably returns texel as ushort1 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayered, ushort2 texel, texture-reference overload
+__TEXTURE_FUNCTIONS_DECL__ ushort2 tex2DLayered(texture<ushort2, texType, mode> texRef, float x,
+ float y, int layer) {
+ TEXTURE_REF_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from texRef -- confirm
+ texel.f =
+ __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); // coords packed as (x, y, layer, 0.0f)
+ TEXTURE_RETURN_USHORT_XY; // macro defined elsewhere; presumably returns texel as ushort2 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayered, ushort2 texel, texture-object overload
+__TEXTURE_FUNCTIONS_DECL__ ushort2 tex2DLayered(texture<ushort2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ int layer) {
+ TEXTURE_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from textureObject -- confirm
+ texel.f =
+ __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); // coords packed as (x, y, layer, 0.0f)
+ TEXTURE_RETURN_USHORT_XY; // macro defined elsewhere; presumably returns texel as ushort2 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayered, ushort4 texel, texture-reference overload
+__TEXTURE_FUNCTIONS_DECL__ ushort4 tex2DLayered(texture<ushort4, texType, mode> texRef, float x,
+ float y, int layer) {
+ TEXTURE_REF_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from texRef -- confirm
+ texel.f =
+ __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); // coords packed as (x, y, layer, 0.0f)
+ TEXTURE_RETURN_USHORT_XYZW; // macro defined elsewhere; presumably returns texel as ushort4 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayered, ushort4 texel, texture-object overload
+__TEXTURE_FUNCTIONS_DECL__ ushort4 tex2DLayered(texture<ushort4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ int layer) {
+ TEXTURE_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from textureObject -- confirm
+ texel.f =
+ __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); // coords packed as (x, y, layer, 0.0f)
+ TEXTURE_RETURN_USHORT_XYZW; // macro defined elsewhere; presumably returns texel as ushort4 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayered, int texel, texture-reference overload
+__TEXTURE_FUNCTIONS_DECL__ int tex2DLayered(texture<int, texType, mode> texRef, float x, float y,
+ int layer) {
+ TEXTURE_REF_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from texRef -- confirm
+ texel.f =
+ __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); // coords packed as (x, y, layer, 0.0f)
+ TEXTURE_RETURN_INT; // macro defined elsewhere; presumably returns texel as int -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayered, int texel, texture-object overload
+__TEXTURE_FUNCTIONS_DECL__ int tex2DLayered(texture<int, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ int layer) {
+ TEXTURE_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from textureObject -- confirm
+ texel.f =
+ __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); // coords packed as (x, y, layer, 0.0f)
+ TEXTURE_RETURN_INT; // macro defined elsewhere; presumably returns texel as int -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayered, int1 texel, texture-reference overload
+__TEXTURE_FUNCTIONS_DECL__ int1 tex2DLayered(texture<int1, texType, mode> texRef, float x, float y,
+ int layer) {
+ TEXTURE_REF_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from texRef -- confirm
+ texel.f =
+ __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); // coords packed as (x, y, layer, 0.0f)
+ TEXTURE_RETURN_INT_X; // macro defined elsewhere; presumably returns texel as int1 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayered, int1 texel, texture-object overload
+__TEXTURE_FUNCTIONS_DECL__ int1 tex2DLayered(texture<int1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ int layer) {
+ TEXTURE_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from textureObject -- confirm
+ texel.f =
+ __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); // coords packed as (x, y, layer, 0.0f)
+ TEXTURE_RETURN_INT_X; // macro defined elsewhere; presumably returns texel as int1 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayered, int2 texel, texture-reference overload
+__TEXTURE_FUNCTIONS_DECL__ int2 tex2DLayered(texture<int2, texType, mode> texRef, float x, float y,
+ int layer) {
+ TEXTURE_REF_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from texRef -- confirm
+ texel.f =
+ __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); // coords packed as (x, y, layer, 0.0f)
+ TEXTURE_RETURN_INT_XY; // macro defined elsewhere; presumably returns texel as int2 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayered, int2 texel, texture-object overload
+__TEXTURE_FUNCTIONS_DECL__ int2 tex2DLayered(texture<int2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ int layer) {
+ TEXTURE_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from textureObject -- confirm
+ texel.f =
+ __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); // coords packed as (x, y, layer, 0.0f)
+ TEXTURE_RETURN_INT_XY; // macro defined elsewhere; presumably returns texel as int2 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayered, int4 texel, texture-reference overload
+__TEXTURE_FUNCTIONS_DECL__ int4 tex2DLayered(texture<int4, texType, mode> texRef, float x, float y,
+ int layer) {
+ TEXTURE_REF_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from texRef -- confirm
+ texel.f =
+ __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); // coords packed as (x, y, layer, 0.0f)
+ TEXTURE_RETURN_INT_XYZW; // macro defined elsewhere; presumably returns texel as int4 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayered, int4 texel, texture-object overload
+__TEXTURE_FUNCTIONS_DECL__ int4 tex2DLayered(texture<int4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ int layer) {
+ TEXTURE_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from textureObject -- confirm
+ texel.f =
+ __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); // coords packed as (x, y, layer, 0.0f)
+ TEXTURE_RETURN_INT_XYZW; // macro defined elsewhere; presumably returns texel as int4 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayered, unsigned int texel, texture-reference overload
+__TEXTURE_FUNCTIONS_DECL__ unsigned int tex2DLayered(texture<unsigned int, texType, mode> texRef,
+ float x, float y, int layer) {
+ TEXTURE_REF_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from texRef -- confirm
+ texel.f =
+ __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); // coords packed as (x, y, layer, 0.0f)
+ TEXTURE_RETURN_UINT; // macro defined elsewhere; presumably returns texel as unsigned int -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayered, unsigned int texel, texture-object overload
+__TEXTURE_FUNCTIONS_DECL__ unsigned int tex2DLayered(texture<unsigned int, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ float y, int layer) {
+ TEXTURE_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from textureObject -- confirm
+ texel.f =
+ __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); // coords packed as (x, y, layer, 0.0f)
+ TEXTURE_RETURN_UINT; // macro defined elsewhere; presumably returns texel as unsigned int -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayered, uint1 texel, texture-reference overload
+__TEXTURE_FUNCTIONS_DECL__ uint1 tex2DLayered(texture<uint1, texType, mode> texRef, float x,
+ float y, int layer) {
+ TEXTURE_REF_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from texRef -- confirm
+ texel.f =
+ __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); // coords packed as (x, y, layer, 0.0f)
+ TEXTURE_RETURN_UINT_X; // macro defined elsewhere; presumably returns texel as uint1 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayered, uint1 texel, texture-object overload
+__TEXTURE_FUNCTIONS_DECL__ uint1 tex2DLayered(texture<uint1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ int layer) {
+ TEXTURE_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from textureObject -- confirm
+ texel.f =
+ __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); // coords packed as (x, y, layer, 0.0f)
+ TEXTURE_RETURN_UINT_X; // macro defined elsewhere; presumably returns texel as uint1 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayered, uint2 texel, texture-reference overload
+__TEXTURE_FUNCTIONS_DECL__ uint2 tex2DLayered(texture<uint2, texType, mode> texRef, float x,
+ float y, int layer) {
+ TEXTURE_REF_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from texRef -- confirm
+ texel.f =
+ __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); // coords packed as (x, y, layer, 0.0f)
+ TEXTURE_RETURN_UINT_XY; // macro defined elsewhere; presumably returns texel as uint2 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayered, uint2 texel, texture-object overload
+__TEXTURE_FUNCTIONS_DECL__ uint2 tex2DLayered(texture<uint2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ int layer) {
+ TEXTURE_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from textureObject -- confirm
+ texel.f =
+ __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); // coords packed as (x, y, layer, 0.0f)
+ TEXTURE_RETURN_UINT_XY; // macro defined elsewhere; presumably returns texel as uint2 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayered, uint4 texel, texture-reference overload
+__TEXTURE_FUNCTIONS_DECL__ uint4 tex2DLayered(texture<uint4, texType, mode> texRef, float x,
+ float y, int layer) {
+ TEXTURE_REF_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from texRef -- confirm
+ texel.f =
+ __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); // coords packed as (x, y, layer, 0.0f)
+ TEXTURE_RETURN_UINT_XYZW; // macro defined elsewhere; presumably returns texel as uint4 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayered, uint4 texel, texture-object overload
+__TEXTURE_FUNCTIONS_DECL__ uint4 tex2DLayered(texture<uint4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ int layer) {
+ TEXTURE_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from textureObject -- confirm
+ texel.f =
+ __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); // coords packed as (x, y, layer, 0.0f)
+ TEXTURE_RETURN_UINT_XYZW; // macro defined elsewhere; presumably returns texel as uint4 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayered, float texel, texture-reference overload
+__TEXTURE_FUNCTIONS_DECL__ float tex2DLayered(texture<float, texType, mode> texRef, float x,
+ float y, int layer) {
+ TEXTURE_REF_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from texRef -- confirm
+ texel.f =
+ __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); // coords packed as (x, y, layer, 0.0f)
+ TEXTURE_RETURN_FLOAT; // macro defined elsewhere; presumably returns texel as float -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayered, float texel, texture-object overload
+__TEXTURE_FUNCTIONS_DECL__ float tex2DLayered(texture<float, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ int layer) {
+ TEXTURE_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from textureObject -- confirm
+ texel.f =
+ __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); // coords packed as (x, y, layer, 0.0f)
+ TEXTURE_RETURN_FLOAT; // macro defined elsewhere; presumably returns texel as float -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayered, float1 texel, texture-reference overload
+__TEXTURE_FUNCTIONS_DECL__ float1 tex2DLayered(texture<float1, texType, mode> texRef, float x,
+ float y, int layer) {
+ TEXTURE_REF_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from texRef -- confirm
+ texel.f =
+ __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); // coords packed as (x, y, layer, 0.0f)
+ TEXTURE_RETURN_FLOAT_X; // macro defined elsewhere; presumably returns texel as float1 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayered, float1 texel, texture-object overload
+__TEXTURE_FUNCTIONS_DECL__ float1 tex2DLayered(texture<float1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ int layer) {
+ TEXTURE_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from textureObject -- confirm
+ texel.f =
+ __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); // coords packed as (x, y, layer, 0.0f)
+ TEXTURE_RETURN_FLOAT_X; // macro defined elsewhere; presumably returns texel as float1 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayered, float2 texel, texture-reference overload
+__TEXTURE_FUNCTIONS_DECL__ float2 tex2DLayered(texture<float2, texType, mode> texRef, float x,
+ float y, int layer) {
+ TEXTURE_REF_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from texRef -- confirm
+ texel.f =
+ __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); // coords packed as (x, y, layer, 0.0f)
+ TEXTURE_RETURN_FLOAT_XY; // macro defined elsewhere; presumably returns texel as float2 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayered, float2 texel, texture-object overload
+__TEXTURE_FUNCTIONS_DECL__ float2 tex2DLayered(texture<float2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ int layer) {
+ TEXTURE_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from textureObject -- confirm
+ texel.f =
+ __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); // coords packed as (x, y, layer, 0.0f)
+ TEXTURE_RETURN_FLOAT_XY; // macro defined elsewhere; presumably returns texel as float2 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayered, float4 texel, texture-reference overload
+__TEXTURE_FUNCTIONS_DECL__ float4 tex2DLayered(texture<float4, texType, mode> texRef, float x,
+ float y, int layer) {
+ TEXTURE_REF_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from texRef -- confirm
+ texel.f =
+ __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); // coords packed as (x, y, layer, 0.0f)
+ TEXTURE_RETURN_FLOAT_XYZW; // macro defined elsewhere; presumably returns texel as float4 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayered, float4 texel, texture-object overload
+__TEXTURE_FUNCTIONS_DECL__ float4 tex2DLayered(texture<float4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ int layer) {
+ TEXTURE_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from textureObject -- confirm
+ texel.f =
+ __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data); // coords packed as (x, y, layer, 0.0f)
+ TEXTURE_RETURN_FLOAT_XYZW; // macro defined elsewhere; presumably returns texel as float4 -- confirm
+}
+
+////////////////////////////////////////////////////////////
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayeredLod, char texel, texture-reference overload
+__TEXTURE_FUNCTIONS_DECL__ char tex2DLayeredLod(texture<char, texType, mode> texRef, float x,
+ float y, int layer, float level) {
+ TEXTURE_REF_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from texRef -- confirm
+ texel.f = __ockl_image_sample_lod_2Da(
+ i, s, float4(x, y, layer, 0.0f).data, level); // coords packed as (x, y, layer, 0.0f); level passed through as given
+ TEXTURE_RETURN_CHAR; // macro defined elsewhere; presumably returns texel as char -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayeredLod, char texel, texture-object overload
+__TEXTURE_FUNCTIONS_DECL__ char tex2DLayeredLod(texture<char, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ int layer, float level) {
+ TEXTURE_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from textureObject -- confirm
+ texel.f = __ockl_image_sample_lod_2Da(
+ i, s, float4(x, y, layer, 0.0f).data, level); // coords packed as (x, y, layer, 0.0f); level passed through as given
+ TEXTURE_RETURN_CHAR; // macro defined elsewhere; presumably returns texel as char -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayeredLod, char1 texel, texture-reference overload
+__TEXTURE_FUNCTIONS_DECL__ char1 tex2DLayeredLod(texture<char1, texType, mode> texRef, float x,
+ float y, int layer, float level) {
+ TEXTURE_REF_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from texRef -- confirm
+ texel.f = __ockl_image_sample_lod_2Da(
+ i, s, float4(x, y, layer, 0.0f).data, level); // coords packed as (x, y, layer, 0.0f); level passed through as given
+ TEXTURE_RETURN_CHAR_X; // macro defined elsewhere; presumably returns texel as char1 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayeredLod, char1 texel, texture-object overload
+__TEXTURE_FUNCTIONS_DECL__ char1 tex2DLayeredLod(texture<char1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ int layer, float level) {
+ TEXTURE_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from textureObject -- confirm
+ texel.f = __ockl_image_sample_lod_2Da(
+ i, s, float4(x, y, layer, 0.0f).data, level); // coords packed as (x, y, layer, 0.0f); level passed through as given
+ TEXTURE_RETURN_CHAR_X; // macro defined elsewhere; presumably returns texel as char1 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayeredLod, char2 texel, texture-reference overload
+__TEXTURE_FUNCTIONS_DECL__ char2 tex2DLayeredLod(texture<char2, texType, mode> texRef, float x,
+ float y, int layer, float level) {
+ TEXTURE_REF_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from texRef -- confirm
+ texel.f = __ockl_image_sample_lod_2Da(
+ i, s, float4(x, y, layer, 0.0f).data, level); // coords packed as (x, y, layer, 0.0f); level passed through as given
+ TEXTURE_RETURN_CHAR_XY; // macro defined elsewhere; presumably returns texel as char2 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayeredLod, char2 texel, texture-object overload
+__TEXTURE_FUNCTIONS_DECL__ char2 tex2DLayeredLod(texture<char2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ int layer, float level) {
+ TEXTURE_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from textureObject -- confirm
+ texel.f = __ockl_image_sample_lod_2Da(
+ i, s, float4(x, y, layer, 0.0f).data, level); // coords packed as (x, y, layer, 0.0f); level passed through as given
+ TEXTURE_RETURN_CHAR_XY; // macro defined elsewhere; presumably returns texel as char2 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayeredLod, char4 texel, texture-reference overload
+__TEXTURE_FUNCTIONS_DECL__ char4 tex2DLayeredLod(texture<char4, texType, mode> texRef, float x,
+ float y, int layer, float level) {
+ TEXTURE_REF_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from texRef -- confirm
+ texel.f = __ockl_image_sample_lod_2Da(
+ i, s, float4(x, y, layer, 0.0f).data, level); // coords packed as (x, y, layer, 0.0f); level passed through as given
+ TEXTURE_RETURN_CHAR_XYZW; // macro defined elsewhere; presumably returns texel as char4 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayeredLod, char4 texel, texture-object overload
+__TEXTURE_FUNCTIONS_DECL__ char4 tex2DLayeredLod(texture<char4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ int layer, float level) {
+ TEXTURE_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from textureObject -- confirm
+ texel.f = __ockl_image_sample_lod_2Da(
+ i, s, float4(x, y, layer, 0.0f).data, level); // coords packed as (x, y, layer, 0.0f); level passed through as given
+ TEXTURE_RETURN_CHAR_XYZW; // macro defined elsewhere; presumably returns texel as char4 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayeredLod, unsigned char texel, texture-reference overload
+__TEXTURE_FUNCTIONS_DECL__ unsigned char tex2DLayeredLod(
+ texture<unsigned char, texType, mode> texRef, float x, float y, int layer, float level) {
+ TEXTURE_REF_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from texRef -- confirm
+ texel.f = __ockl_image_sample_lod_2Da(
+ i, s, float4(x, y, layer, 0.0f).data, level); // coords packed as (x, y, layer, 0.0f); level passed through as given
+ TEXTURE_RETURN_UCHAR; // macro defined elsewhere; presumably returns texel as unsigned char -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayeredLod, unsigned char texel, texture-object overload
+__TEXTURE_FUNCTIONS_DECL__ unsigned char tex2DLayeredLod(
+ texture<unsigned char, texType, mode> texRef, hipTextureObject_t textureObject, float x,
+ float y, int layer, float level) {
+ TEXTURE_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from textureObject -- confirm
+ texel.f = __ockl_image_sample_lod_2Da(
+ i, s, float4(x, y, layer, 0.0f).data, level); // coords packed as (x, y, layer, 0.0f); level passed through as given
+ TEXTURE_RETURN_UCHAR; // macro defined elsewhere; presumably returns texel as unsigned char -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayeredLod, uchar1 texel, texture-reference overload
+__TEXTURE_FUNCTIONS_DECL__ uchar1 tex2DLayeredLod(texture<uchar1, texType, mode> texRef, float x,
+ float y, int layer, float level) {
+ TEXTURE_REF_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from texRef -- confirm
+ texel.f = __ockl_image_sample_lod_2Da(
+ i, s, float4(x, y, layer, 0.0f).data, level); // coords packed as (x, y, layer, 0.0f); level passed through as given
+ TEXTURE_RETURN_UCHAR_X; // macro defined elsewhere; presumably returns texel as uchar1 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayeredLod, uchar1 texel, texture-object overload
+__TEXTURE_FUNCTIONS_DECL__ uchar1 tex2DLayeredLod(texture<uchar1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ float y, int layer, float level) {
+ TEXTURE_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from textureObject -- confirm
+ texel.f = __ockl_image_sample_lod_2Da(
+ i, s, float4(x, y, layer, 0.0f).data, level); // coords packed as (x, y, layer, 0.0f); level passed through as given
+ TEXTURE_RETURN_UCHAR_X; // macro defined elsewhere; presumably returns texel as uchar1 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayeredLod, uchar2 texel, texture-reference overload
+__TEXTURE_FUNCTIONS_DECL__ uchar2 tex2DLayeredLod(texture<uchar2, texType, mode> texRef, float x,
+ float y, int layer, float level) {
+ TEXTURE_REF_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from texRef -- confirm
+ texel.f = __ockl_image_sample_lod_2Da(
+ i, s, float4(x, y, layer, 0.0f).data, level); // coords packed as (x, y, layer, 0.0f); level passed through as given
+ TEXTURE_RETURN_UCHAR_XY; // macro defined elsewhere; presumably returns texel as uchar2 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayeredLod, uchar2 texel, texture-object overload
+__TEXTURE_FUNCTIONS_DECL__ uchar2 tex2DLayeredLod(texture<uchar2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ float y, int layer, float level) {
+ TEXTURE_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from textureObject -- confirm
+ texel.f = __ockl_image_sample_lod_2Da(
+ i, s, float4(x, y, layer, 0.0f).data, level); // coords packed as (x, y, layer, 0.0f); level passed through as given
+ TEXTURE_RETURN_UCHAR_XY; // macro defined elsewhere; presumably returns texel as uchar2 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayeredLod, uchar4 texel, texture-reference overload
+__TEXTURE_FUNCTIONS_DECL__ uchar4 tex2DLayeredLod(texture<uchar4, texType, mode> texRef, float x,
+ float y, int layer, float level) {
+ TEXTURE_REF_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from texRef -- confirm
+ texel.f = __ockl_image_sample_lod_2Da(
+ i, s, float4(x, y, layer, 0.0f).data, level); // coords packed as (x, y, layer, 0.0f); level passed through as given
+ TEXTURE_RETURN_UCHAR_XYZW; // macro defined elsewhere; presumably returns texel as uchar4 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayeredLod, uchar4 texel, texture-object overload
+__TEXTURE_FUNCTIONS_DECL__ uchar4 tex2DLayeredLod(texture<uchar4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ float y, int layer, float level) {
+ TEXTURE_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from textureObject -- confirm
+ texel.f = __ockl_image_sample_lod_2Da(
+ i, s, float4(x, y, layer, 0.0f).data, level); // coords packed as (x, y, layer, 0.0f); level passed through as given
+ TEXTURE_RETURN_UCHAR_XYZW; // macro defined elsewhere; presumably returns texel as uchar4 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayeredLod, short texel, texture-reference overload
+__TEXTURE_FUNCTIONS_DECL__ short tex2DLayeredLod(texture<short, texType, mode> texRef, float x,
+ float y, int layer, float level) {
+ TEXTURE_REF_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from texRef -- confirm
+ texel.f = __ockl_image_sample_lod_2Da(
+ i, s, float4(x, y, layer, 0.0f).data, level); // coords packed as (x, y, layer, 0.0f); level passed through as given
+ TEXTURE_RETURN_SHORT; // macro defined elsewhere; presumably returns texel as short -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayeredLod, short texel, texture-object overload
+__TEXTURE_FUNCTIONS_DECL__ short tex2DLayeredLod(texture<short, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ int layer, float level) {
+ TEXTURE_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from textureObject -- confirm
+ texel.f = __ockl_image_sample_lod_2Da(
+ i, s, float4(x, y, layer, 0.0f).data, level); // coords packed as (x, y, layer, 0.0f); level passed through as given
+ TEXTURE_RETURN_SHORT; // macro defined elsewhere; presumably returns texel as short -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayeredLod, short1 texel, texture-reference overload
+__TEXTURE_FUNCTIONS_DECL__ short1 tex2DLayeredLod(texture<short1, texType, mode> texRef, float x,
+ float y, int layer, float level) {
+ TEXTURE_REF_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from texRef -- confirm
+ texel.f = __ockl_image_sample_lod_2Da(
+ i, s, float4(x, y, layer, 0.0f).data, level); // coords packed as (x, y, layer, 0.0f); level passed through as given
+ TEXTURE_RETURN_SHORT_X; // macro defined elsewhere; presumably returns texel as short1 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayeredLod, short1 texel, texture-object overload
+__TEXTURE_FUNCTIONS_DECL__ short1 tex2DLayeredLod(texture<short1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ float y, int layer, float level) {
+ TEXTURE_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from textureObject -- confirm
+ texel.f = __ockl_image_sample_lod_2Da(
+ i, s, float4(x, y, layer, 0.0f).data, level); // coords packed as (x, y, layer, 0.0f); level passed through as given
+ TEXTURE_RETURN_SHORT_X; // macro defined elsewhere; presumably returns texel as short1 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayeredLod, short2 texel, texture-reference overload
+__TEXTURE_FUNCTIONS_DECL__ short2 tex2DLayeredLod(texture<short2, texType, mode> texRef, float x,
+ float y, int layer, float level) {
+ TEXTURE_REF_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from texRef -- confirm
+ texel.f = __ockl_image_sample_lod_2Da(
+ i, s, float4(x, y, layer, 0.0f).data, level); // coords packed as (x, y, layer, 0.0f); level passed through as given
+ TEXTURE_RETURN_SHORT_XY; // macro defined elsewhere; presumably returns texel as short2 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayeredLod, short2 texel, texture-object overload
+__TEXTURE_FUNCTIONS_DECL__ short2 tex2DLayeredLod(texture<short2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ float y, int layer, float level) {
+ TEXTURE_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from textureObject -- confirm
+ texel.f = __ockl_image_sample_lod_2Da(
+ i, s, float4(x, y, layer, 0.0f).data, level); // coords packed as (x, y, layer, 0.0f); level passed through as given
+ TEXTURE_RETURN_SHORT_XY; // macro defined elsewhere; presumably returns texel as short2 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayeredLod, short4 texel, texture-reference overload
+__TEXTURE_FUNCTIONS_DECL__ short4 tex2DLayeredLod(texture<short4, texType, mode> texRef, float x,
+ float y, int layer, float level) {
+ TEXTURE_REF_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from texRef -- confirm
+ texel.f = __ockl_image_sample_lod_2Da(
+ i, s, float4(x, y, layer, 0.0f).data, level); // coords packed as (x, y, layer, 0.0f); level passed through as given
+ TEXTURE_RETURN_SHORT_XYZW; // macro defined elsewhere; presumably returns texel as short4 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayeredLod, short4 texel, texture-object overload
+__TEXTURE_FUNCTIONS_DECL__ short4 tex2DLayeredLod(texture<short4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ float y, int layer, float level) {
+ TEXTURE_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from textureObject -- confirm
+ texel.f = __ockl_image_sample_lod_2Da(
+ i, s, float4(x, y, layer, 0.0f).data, level); // coords packed as (x, y, layer, 0.0f); level passed through as given
+ TEXTURE_RETURN_SHORT_XYZW; // macro defined elsewhere; presumably returns texel as short4 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayeredLod, unsigned short texel, texture-reference overload
+__TEXTURE_FUNCTIONS_DECL__ unsigned short tex2DLayeredLod(
+ texture<unsigned short, texType, mode> texRef, float x, float y, int layer, float level) {
+ TEXTURE_REF_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from texRef -- confirm
+ texel.f = __ockl_image_sample_lod_2Da(
+ i, s, float4(x, y, layer, 0.0f).data, level); // coords packed as (x, y, layer, 0.0f); level passed through as given
+ TEXTURE_RETURN_USHORT; // macro defined elsewhere; presumably returns texel as unsigned short -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayeredLod, unsigned short texel, texture-object overload
+__TEXTURE_FUNCTIONS_DECL__ unsigned short tex2DLayeredLod(
+ texture<unsigned short, texType, mode> texRef, hipTextureObject_t textureObject, float x,
+ float y, int layer, float level) {
+ TEXTURE_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from textureObject -- confirm
+ texel.f = __ockl_image_sample_lod_2Da(
+ i, s, float4(x, y, layer, 0.0f).data, level); // coords packed as (x, y, layer, 0.0f); level passed through as given
+ TEXTURE_RETURN_USHORT; // macro defined elsewhere; presumably returns texel as unsigned short -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayeredLod, ushort1 texel, texture-reference overload
+__TEXTURE_FUNCTIONS_DECL__ ushort1 tex2DLayeredLod(texture<ushort1, texType, mode> texRef, float x,
+ float y, int layer, float level) {
+ TEXTURE_REF_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from texRef -- confirm
+ texel.f = __ockl_image_sample_lod_2Da(
+ i, s, float4(x, y, layer, 0.0f).data, level); // coords packed as (x, y, layer, 0.0f); level passed through as given
+ TEXTURE_RETURN_USHORT_X; // macro defined elsewhere; presumably returns texel as ushort1 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayeredLod, ushort1 texel, texture-object overload
+__TEXTURE_FUNCTIONS_DECL__ ushort1 tex2DLayeredLod(texture<ushort1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ float y, int layer, float level) {
+ TEXTURE_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from textureObject -- confirm
+ texel.f = __ockl_image_sample_lod_2Da(
+ i, s, float4(x, y, layer, 0.0f).data, level); // coords packed as (x, y, layer, 0.0f); level passed through as given
+ TEXTURE_RETURN_USHORT_X; // macro defined elsewhere; presumably returns texel as ushort1 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayeredLod, ushort2 texel, texture-reference overload
+__TEXTURE_FUNCTIONS_DECL__ ushort2 tex2DLayeredLod(texture<ushort2, texType, mode> texRef, float x,
+ float y, int layer, float level) {
+ TEXTURE_REF_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from texRef -- confirm
+ texel.f = __ockl_image_sample_lod_2Da(
+ i, s, float4(x, y, layer, 0.0f).data, level); // coords packed as (x, y, layer, 0.0f); level passed through as given
+ TEXTURE_RETURN_USHORT_XY; // macro defined elsewhere; presumably returns texel as ushort2 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayeredLod, ushort2 texel, texture-object overload
+__TEXTURE_FUNCTIONS_DECL__ ushort2 tex2DLayeredLod(texture<ushort2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ float y, int layer, float level) {
+ TEXTURE_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from textureObject -- confirm
+ texel.f = __ockl_image_sample_lod_2Da(
+ i, s, float4(x, y, layer, 0.0f).data, level); // coords packed as (x, y, layer, 0.0f); level passed through as given
+ TEXTURE_RETURN_USHORT_XY; // macro defined elsewhere; presumably returns texel as ushort2 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode> // tex2DLayeredLod, ushort4 texel, texture-reference overload
+__TEXTURE_FUNCTIONS_DECL__ ushort4 tex2DLayeredLod(texture<ushort4, texType, mode> texRef, float x,
+ float y, int layer, float level) {
+ TEXTURE_REF_PARAMETERS_INIT; // macro defined elsewhere; presumably sets up i, s, texel from texRef -- confirm
+ texel.f = __ockl_image_sample_lod_2Da(
+ i, s, float4(x, y, layer, 0.0f).data, level); // coords packed as (x, y, layer, 0.0f); level passed through as given
+ TEXTURE_RETURN_USHORT_XYZW; // macro defined elsewhere; presumably returns texel as ushort4 -- confirm
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredLod for texture<ushort4>: overload taking texRef and a textureObject.
+__TEXTURE_FUNCTIONS_DECL__ ushort4 tex2DLayeredLod(texture<ushort4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ float y, int layer, float level) {
+ TEXTURE_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f = __ockl_image_sample_lod_2Da(
+ i, s, float4(x, y, layer, 0.0f).data, level);  // coords packed as (x, y, layer, 0.0f), then level
+ TEXTURE_RETURN_USHORT_XYZW;  // macro not visible here; presumably returns texel as ushort4 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredLod for texture<int>: overload taking only texRef.
+__TEXTURE_FUNCTIONS_DECL__ int tex2DLayeredLod(texture<int, texType, mode> texRef, float x, float y,
+ int layer, float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f = __ockl_image_sample_lod_2Da(
+ i, s, float4(x, y, layer, 0.0f).data, level);  // coords packed as (x, y, layer, 0.0f), then level
+ TEXTURE_RETURN_INT;  // macro not visible here; presumably returns texel as int (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredLod for texture<int>: overload taking texRef and a textureObject.
+__TEXTURE_FUNCTIONS_DECL__ int tex2DLayeredLod(texture<int, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ int layer, float level) {
+ TEXTURE_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f = __ockl_image_sample_lod_2Da(
+ i, s, float4(x, y, layer, 0.0f).data, level);  // coords packed as (x, y, layer, 0.0f), then level
+ TEXTURE_RETURN_INT;  // macro not visible here; presumably returns texel as int (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredLod for texture<int1>: overload taking only texRef.
+__TEXTURE_FUNCTIONS_DECL__ int1 tex2DLayeredLod(texture<int1, texType, mode> texRef, float x,
+ float y, int layer, float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f = __ockl_image_sample_lod_2Da(
+ i, s, float4(x, y, layer, 0.0f).data, level);  // coords packed as (x, y, layer, 0.0f), then level
+ TEXTURE_RETURN_INT_X;  // macro not visible here; presumably returns texel as int1 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredLod for texture<int1>: overload taking texRef and a textureObject.
+__TEXTURE_FUNCTIONS_DECL__ int1 tex2DLayeredLod(texture<int1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ int layer, float level) {
+ TEXTURE_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f = __ockl_image_sample_lod_2Da(
+ i, s, float4(x, y, layer, 0.0f).data, level);  // coords packed as (x, y, layer, 0.0f), then level
+ TEXTURE_RETURN_INT_X;  // macro not visible here; presumably returns texel as int1 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredLod for texture<int2>: overload taking only texRef.
+__TEXTURE_FUNCTIONS_DECL__ int2 tex2DLayeredLod(texture<int2, texType, mode> texRef, float x,
+ float y, int layer, float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f = __ockl_image_sample_lod_2Da(
+ i, s, float4(x, y, layer, 0.0f).data, level);  // coords packed as (x, y, layer, 0.0f), then level
+ TEXTURE_RETURN_INT_XY;  // macro not visible here; presumably returns texel as int2 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredLod for texture<int2>: overload taking texRef and a textureObject.
+__TEXTURE_FUNCTIONS_DECL__ int2 tex2DLayeredLod(texture<int2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ int layer, float level) {
+ TEXTURE_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f = __ockl_image_sample_lod_2Da(
+ i, s, float4(x, y, layer, 0.0f).data, level);  // coords packed as (x, y, layer, 0.0f), then level
+ TEXTURE_RETURN_INT_XY;  // macro not visible here; presumably returns texel as int2 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredLod for texture<int4>: overload taking only texRef.
+__TEXTURE_FUNCTIONS_DECL__ int4 tex2DLayeredLod(texture<int4, texType, mode> texRef, float x,
+ float y, int layer, float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f = __ockl_image_sample_lod_2Da(
+ i, s, float4(x, y, layer, 0.0f).data, level);  // coords packed as (x, y, layer, 0.0f), then level
+ TEXTURE_RETURN_INT_XYZW;  // macro not visible here; presumably returns texel as int4 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredLod for texture<int4>: overload taking texRef and a textureObject.
+__TEXTURE_FUNCTIONS_DECL__ int4 tex2DLayeredLod(texture<int4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ int layer, float level) {
+ TEXTURE_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f = __ockl_image_sample_lod_2Da(
+ i, s, float4(x, y, layer, 0.0f).data, level);  // coords packed as (x, y, layer, 0.0f), then level
+ TEXTURE_RETURN_INT_XYZW;  // macro not visible here; presumably returns texel as int4 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredLod for texture<unsigned int>: overload taking only texRef.
+__TEXTURE_FUNCTIONS_DECL__ unsigned int tex2DLayeredLod(texture<unsigned int, texType, mode> texRef,
+ float x, float y, int layer, float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f = __ockl_image_sample_lod_2Da(
+ i, s, float4(x, y, layer, 0.0f).data, level);  // coords packed as (x, y, layer, 0.0f), then level
+ TEXTURE_RETURN_UINT;  // macro not visible here; presumably returns texel as unsigned int (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredLod for texture<unsigned int>: overload taking texRef and a textureObject.
+__TEXTURE_FUNCTIONS_DECL__ unsigned int tex2DLayeredLod(texture<unsigned int, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ float y, int layer, float level) {
+ TEXTURE_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f = __ockl_image_sample_lod_2Da(
+ i, s, float4(x, y, layer, 0.0f).data, level);  // coords packed as (x, y, layer, 0.0f), then level
+ TEXTURE_RETURN_UINT;  // macro not visible here; presumably returns texel as unsigned int (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredLod for texture<uint1>: overload taking only texRef.
+__TEXTURE_FUNCTIONS_DECL__ uint1 tex2DLayeredLod(texture<uint1, texType, mode> texRef, float x,
+ float y, int layer, float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f = __ockl_image_sample_lod_2Da(
+ i, s, float4(x, y, layer, 0.0f).data, level);  // coords packed as (x, y, layer, 0.0f), then level
+ TEXTURE_RETURN_UINT_X;  // macro not visible here; presumably returns texel as uint1 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredLod for texture<uint1>: overload taking texRef and a textureObject.
+__TEXTURE_FUNCTIONS_DECL__ uint1 tex2DLayeredLod(texture<uint1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ int layer, float level) {
+ TEXTURE_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f = __ockl_image_sample_lod_2Da(
+ i, s, float4(x, y, layer, 0.0f).data, level);  // coords packed as (x, y, layer, 0.0f), then level
+ TEXTURE_RETURN_UINT_X;  // macro not visible here; presumably returns texel as uint1 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredLod for texture<uint2>: overload taking only texRef.
+__TEXTURE_FUNCTIONS_DECL__ uint2 tex2DLayeredLod(texture<uint2, texType, mode> texRef, float x,
+ float y, int layer, float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f = __ockl_image_sample_lod_2Da(
+ i, s, float4(x, y, layer, 0.0f).data, level);  // coords packed as (x, y, layer, 0.0f), then level
+ TEXTURE_RETURN_UINT_XY;  // macro not visible here; presumably returns texel as uint2 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredLod for texture<uint2>: overload taking texRef and a textureObject.
+__TEXTURE_FUNCTIONS_DECL__ uint2 tex2DLayeredLod(texture<uint2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ int layer, float level) {
+ TEXTURE_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f = __ockl_image_sample_lod_2Da(
+ i, s, float4(x, y, layer, 0.0f).data, level);  // coords packed as (x, y, layer, 0.0f), then level
+ TEXTURE_RETURN_UINT_XY;  // macro not visible here; presumably returns texel as uint2 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredLod for texture<uint4>: overload taking only texRef.
+__TEXTURE_FUNCTIONS_DECL__ uint4 tex2DLayeredLod(texture<uint4, texType, mode> texRef, float x,
+ float y, int layer, float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f = __ockl_image_sample_lod_2Da(
+ i, s, float4(x, y, layer, 0.0f).data, level);  // coords packed as (x, y, layer, 0.0f), then level
+ TEXTURE_RETURN_UINT_XYZW;  // macro not visible here; presumably returns texel as uint4 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredLod for texture<uint4>: overload taking texRef and a textureObject.
+__TEXTURE_FUNCTIONS_DECL__ uint4 tex2DLayeredLod(texture<uint4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ int layer, float level) {
+ TEXTURE_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f = __ockl_image_sample_lod_2Da(
+ i, s, float4(x, y, layer, 0.0f).data, level);  // coords packed as (x, y, layer, 0.0f), then level
+ TEXTURE_RETURN_UINT_XYZW;  // macro not visible here; presumably returns texel as uint4 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredLod for texture<float>: overload taking only texRef.
+__TEXTURE_FUNCTIONS_DECL__ float tex2DLayeredLod(texture<float, texType, mode> texRef, float x,
+ float y, int layer, float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f = __ockl_image_sample_lod_2Da(
+ i, s, float4(x, y, layer, 0.0f).data, level);  // coords packed as (x, y, layer, 0.0f), then level
+ TEXTURE_RETURN_FLOAT;  // macro not visible here; presumably returns texel as float (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredLod for texture<float>: overload taking texRef and a textureObject.
+__TEXTURE_FUNCTIONS_DECL__ float tex2DLayeredLod(texture<float, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ int layer, float level) {
+ TEXTURE_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f = __ockl_image_sample_lod_2Da(
+ i, s, float4(x, y, layer, 0.0f).data, level);  // coords packed as (x, y, layer, 0.0f), then level
+ TEXTURE_RETURN_FLOAT;  // macro not visible here; presumably returns texel as float (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredLod for texture<float1>: overload taking only texRef.
+__TEXTURE_FUNCTIONS_DECL__ float1 tex2DLayeredLod(texture<float1, texType, mode> texRef, float x,
+ float y, int layer, float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f = __ockl_image_sample_lod_2Da(
+ i, s, float4(x, y, layer, 0.0f).data, level);  // coords packed as (x, y, layer, 0.0f), then level
+ TEXTURE_RETURN_FLOAT_X;  // macro not visible here; presumably returns texel as float1 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredLod for texture<float1>: overload taking texRef and a textureObject.
+__TEXTURE_FUNCTIONS_DECL__ float1 tex2DLayeredLod(texture<float1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ float y, int layer, float level) {
+ TEXTURE_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f = __ockl_image_sample_lod_2Da(
+ i, s, float4(x, y, layer, 0.0f).data, level);  // coords packed as (x, y, layer, 0.0f), then level
+ TEXTURE_RETURN_FLOAT_X;  // macro not visible here; presumably returns texel as float1 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredLod for texture<float2>: overload taking only texRef.
+__TEXTURE_FUNCTIONS_DECL__ float2 tex2DLayeredLod(texture<float2, texType, mode> texRef, float x,
+ float y, int layer, float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f = __ockl_image_sample_lod_2Da(
+ i, s, float4(x, y, layer, 0.0f).data, level);  // coords packed as (x, y, layer, 0.0f), then level
+ TEXTURE_RETURN_FLOAT_XY;  // macro not visible here; presumably returns texel as float2 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredLod for texture<float2>: overload taking texRef and a textureObject.
+__TEXTURE_FUNCTIONS_DECL__ float2 tex2DLayeredLod(texture<float2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ float y, int layer, float level) {
+ TEXTURE_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f = __ockl_image_sample_lod_2Da(
+ i, s, float4(x, y, layer, 0.0f).data, level);  // coords packed as (x, y, layer, 0.0f), then level
+ TEXTURE_RETURN_FLOAT_XY;  // macro not visible here; presumably returns texel as float2 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredLod for texture<float4>: overload taking only texRef.
+__TEXTURE_FUNCTIONS_DECL__ float4 tex2DLayeredLod(texture<float4, texType, mode> texRef, float x,
+ float y, int layer, float level) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f = __ockl_image_sample_lod_2Da(
+ i, s, float4(x, y, layer, 0.0f).data, level);  // coords packed as (x, y, layer, 0.0f), then level
+ TEXTURE_RETURN_FLOAT_XYZW;  // macro not visible here; presumably returns texel as float4 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredLod for texture<float4>: overload taking texRef and a textureObject.
+__TEXTURE_FUNCTIONS_DECL__ float4 tex2DLayeredLod(texture<float4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ float y, int layer, float level) {
+ TEXTURE_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f = __ockl_image_sample_lod_2Da(
+ i, s, float4(x, y, layer, 0.0f).data, level);  // coords packed as (x, y, layer, 0.0f), then level
+ TEXTURE_RETURN_FLOAT_XYZW;  // macro not visible here; presumably returns texel as float4 (TODO confirm)
+}
+
+////////////////////////////////////////////////////////////
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredGrad for texture<char>: overload taking only texRef.
+__TEXTURE_FUNCTIONS_DECL__ char tex2DLayeredGrad(texture<char, texType, mode> texRef, float x,
+ float y, int layer, float2 dx, float2 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f =
+ __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data,  // coords packed as (x, y, layer, 0.0f)
+ float2(dx.x, dx.y).data,  // dx and dy are each copied into a float2 temporary
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_CHAR;  // macro not visible here; presumably returns texel as char (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredGrad for texture<char>: overload taking texRef and a textureObject.
+__TEXTURE_FUNCTIONS_DECL__ char tex2DLayeredGrad(texture<char, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ int layer, float2 dx, float2 dy) {
+ TEXTURE_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f =
+ __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data,  // coords packed as (x, y, layer, 0.0f)
+ float2(dx.x, dx.y).data,  // dx and dy are each copied into a float2 temporary
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_CHAR;  // macro not visible here; presumably returns texel as char (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredGrad for texture<char1>: overload taking only texRef.
+__TEXTURE_FUNCTIONS_DECL__ char1 tex2DLayeredGrad(texture<char1, texType, mode> texRef, float x,
+ float y, int layer, float2 dx, float2 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f =
+ __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data,  // coords packed as (x, y, layer, 0.0f)
+ float2(dx.x, dx.y).data,  // dx and dy are each copied into a float2 temporary
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_CHAR_X;  // macro not visible here; presumably returns texel as char1 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredGrad for texture<char1>: overload taking texRef and a textureObject.
+__TEXTURE_FUNCTIONS_DECL__ char1 tex2DLayeredGrad(texture<char1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ float y, int layer, float2 dx, float2 dy) {
+ TEXTURE_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f =
+ __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data,  // coords packed as (x, y, layer, 0.0f)
+ float2(dx.x, dx.y).data,  // dx and dy are each copied into a float2 temporary
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_CHAR_X;  // macro not visible here; presumably returns texel as char1 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredGrad for texture<char2>: overload taking only texRef.
+__TEXTURE_FUNCTIONS_DECL__ char2 tex2DLayeredGrad(texture<char2, texType, mode> texRef, float x,
+ float y, int layer, float2 dx, float2 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f =
+ __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data,  // coords packed as (x, y, layer, 0.0f)
+ float2(dx.x, dx.y).data,  // dx and dy are each copied into a float2 temporary
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_CHAR_XY;  // macro not visible here; presumably returns texel as char2 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredGrad for texture<char2>: overload taking texRef and a textureObject.
+__TEXTURE_FUNCTIONS_DECL__ char2 tex2DLayeredGrad(texture<char2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ float y, int layer, float2 dx, float2 dy) {
+ TEXTURE_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f =
+ __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data,  // coords packed as (x, y, layer, 0.0f)
+ float2(dx.x, dx.y).data,  // dx and dy are each copied into a float2 temporary
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_CHAR_XY;  // macro not visible here; presumably returns texel as char2 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredGrad for texture<char4>: overload taking only texRef.
+__TEXTURE_FUNCTIONS_DECL__ char4 tex2DLayeredGrad(texture<char4, texType, mode> texRef, float x,
+ float y, int layer, float2 dx, float2 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f =
+ __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data,  // coords packed as (x, y, layer, 0.0f)
+ float2(dx.x, dx.y).data,  // dx and dy are each copied into a float2 temporary
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_CHAR_XYZW;  // macro not visible here; presumably returns texel as char4 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredGrad for texture<char4>: overload taking texRef and a textureObject.
+__TEXTURE_FUNCTIONS_DECL__ char4 tex2DLayeredGrad(texture<char4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ float y, int layer, float2 dx, float2 dy) {
+ TEXTURE_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f =
+ __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data,  // coords packed as (x, y, layer, 0.0f)
+ float2(dx.x, dx.y).data,  // dx and dy are each copied into a float2 temporary
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_CHAR_XYZW;  // macro not visible here; presumably returns texel as char4 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredGrad for texture<unsigned char>: overload taking only texRef.
+__TEXTURE_FUNCTIONS_DECL__ unsigned char tex2DLayeredGrad(
+ texture<unsigned char, texType, mode> texRef, float x, float y, int layer, float2 dx,
+ float2 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f =
+ __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data,  // coords packed as (x, y, layer, 0.0f)
+ float2(dx.x, dx.y).data,  // dx and dy are each copied into a float2 temporary
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_UCHAR;  // macro not visible here; presumably returns texel as unsigned char (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredGrad for texture<unsigned char>: overload taking texRef and a textureObject.
+__TEXTURE_FUNCTIONS_DECL__ unsigned char tex2DLayeredGrad(
+ texture<unsigned char, texType, mode> texRef, hipTextureObject_t textureObject, float x,
+ float y, int layer, float2 dx, float2 dy) {
+ TEXTURE_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f =
+ __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data,  // coords packed as (x, y, layer, 0.0f)
+ float2(dx.x, dx.y).data,  // dx and dy are each copied into a float2 temporary
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_UCHAR;  // macro not visible here; presumably returns texel as unsigned char (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredGrad for texture<uchar1>: overload taking only texRef.
+__TEXTURE_FUNCTIONS_DECL__ uchar1 tex2DLayeredGrad(texture<uchar1, texType, mode> texRef, float x,
+ float y, int layer, float2 dx, float2 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f =
+ __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data,  // coords packed as (x, y, layer, 0.0f)
+ float2(dx.x, dx.y).data,  // dx and dy are each copied into a float2 temporary
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_UCHAR_X;  // macro not visible here; presumably returns texel as uchar1 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredGrad for texture<uchar1>: overload taking texRef and a textureObject.
+__TEXTURE_FUNCTIONS_DECL__ uchar1 tex2DLayeredGrad(texture<uchar1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ float y, int layer, float2 dx, float2 dy) {
+ TEXTURE_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f =
+ __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data,  // coords packed as (x, y, layer, 0.0f)
+ float2(dx.x, dx.y).data,  // dx and dy are each copied into a float2 temporary
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_UCHAR_X;  // macro not visible here; presumably returns texel as uchar1 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredGrad for texture<uchar2>: overload taking only texRef.
+__TEXTURE_FUNCTIONS_DECL__ uchar2 tex2DLayeredGrad(texture<uchar2, texType, mode> texRef, float x,
+ float y, int layer, float2 dx, float2 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f =
+ __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data,  // coords packed as (x, y, layer, 0.0f)
+ float2(dx.x, dx.y).data,  // dx and dy are each copied into a float2 temporary
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_UCHAR_XY;  // macro not visible here; presumably returns texel as uchar2 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredGrad for texture<uchar2>: overload taking texRef and a textureObject.
+__TEXTURE_FUNCTIONS_DECL__ uchar2 tex2DLayeredGrad(texture<uchar2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ float y, int layer, float2 dx, float2 dy) {
+ TEXTURE_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f =
+ __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data,  // coords packed as (x, y, layer, 0.0f)
+ float2(dx.x, dx.y).data,  // dx and dy are each copied into a float2 temporary
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_UCHAR_XY;  // macro not visible here; presumably returns texel as uchar2 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredGrad for texture<uchar4>: overload taking only texRef.
+__TEXTURE_FUNCTIONS_DECL__ uchar4 tex2DLayeredGrad(texture<uchar4, texType, mode> texRef, float x,
+ float y, int layer, float2 dx, float2 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f =
+ __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data,  // coords packed as (x, y, layer, 0.0f)
+ float2(dx.x, dx.y).data,  // dx and dy are each copied into a float2 temporary
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_UCHAR_XYZW;  // macro not visible here; presumably returns texel as uchar4 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredGrad for texture<uchar4>: overload taking texRef and a textureObject.
+__TEXTURE_FUNCTIONS_DECL__ uchar4 tex2DLayeredGrad(texture<uchar4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ float y, int layer, float2 dx, float2 dy) {
+ TEXTURE_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f =
+ __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data,  // coords packed as (x, y, layer, 0.0f)
+ float2(dx.x, dx.y).data,  // dx and dy are each copied into a float2 temporary
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_UCHAR_XYZW;  // macro not visible here; presumably returns texel as uchar4 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredGrad for texture<short>: overload taking only texRef.
+__TEXTURE_FUNCTIONS_DECL__ short tex2DLayeredGrad(texture<short, texType, mode> texRef, float x,
+ float y, int layer, float2 dx, float2 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f =
+ __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data,  // coords packed as (x, y, layer, 0.0f)
+ float2(dx.x, dx.y).data,  // dx and dy are each copied into a float2 temporary
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_SHORT;  // macro not visible here; presumably returns texel as short (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredGrad for texture<short>: overload taking texRef and a textureObject.
+__TEXTURE_FUNCTIONS_DECL__ short tex2DLayeredGrad(texture<short, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ float y, int layer, float2 dx, float2 dy) {
+ TEXTURE_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f =
+ __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data,  // coords packed as (x, y, layer, 0.0f)
+ float2(dx.x, dx.y).data,  // dx and dy are each copied into a float2 temporary
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_SHORT;  // macro not visible here; presumably returns texel as short (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredGrad for texture<short1>: overload taking only texRef.
+__TEXTURE_FUNCTIONS_DECL__ short1 tex2DLayeredGrad(texture<short1, texType, mode> texRef, float x,
+ float y, int layer, float2 dx, float2 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f =
+ __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data,  // coords packed as (x, y, layer, 0.0f)
+ float2(dx.x, dx.y).data,  // dx and dy are each copied into a float2 temporary
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_SHORT_X;  // macro not visible here; presumably returns texel as short1 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredGrad for texture<short1>: overload taking texRef and a textureObject.
+__TEXTURE_FUNCTIONS_DECL__ short1 tex2DLayeredGrad(texture<short1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ float y, int layer, float2 dx, float2 dy) {
+ TEXTURE_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f =
+ __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data,  // coords packed as (x, y, layer, 0.0f)
+ float2(dx.x, dx.y).data,  // dx and dy are each copied into a float2 temporary
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_SHORT_X;  // macro not visible here; presumably returns texel as short1 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredGrad for texture<short2>: overload taking only texRef.
+__TEXTURE_FUNCTIONS_DECL__ short2 tex2DLayeredGrad(texture<short2, texType, mode> texRef, float x,
+ float y, int layer, float2 dx, float2 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f =
+ __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data,  // coords packed as (x, y, layer, 0.0f)
+ float2(dx.x, dx.y).data,  // dx and dy are each copied into a float2 temporary
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_SHORT_XY;  // macro not visible here; presumably returns texel as short2 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredGrad for texture<short2>: overload taking texRef and a textureObject.
+__TEXTURE_FUNCTIONS_DECL__ short2 tex2DLayeredGrad(texture<short2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ float y, int layer, float2 dx, float2 dy) {
+ TEXTURE_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f =
+ __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data,  // coords packed as (x, y, layer, 0.0f)
+ float2(dx.x, dx.y).data,  // dx and dy are each copied into a float2 temporary
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_SHORT_XY;  // macro not visible here; presumably returns texel as short2 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredGrad for texture<short4>: overload taking only texRef.
+__TEXTURE_FUNCTIONS_DECL__ short4 tex2DLayeredGrad(texture<short4, texType, mode> texRef, float x,
+ float y, int layer, float2 dx, float2 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f =
+ __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data,  // coords packed as (x, y, layer, 0.0f)
+ float2(dx.x, dx.y).data,  // dx and dy are each copied into a float2 temporary
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_SHORT_XYZW;  // macro not visible here; presumably returns texel as short4 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredGrad for texture<short4>: overload taking texRef and a textureObject.
+__TEXTURE_FUNCTIONS_DECL__ short4 tex2DLayeredGrad(texture<short4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ float y, int layer, float2 dx, float2 dy) {
+ TEXTURE_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f =
+ __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data,  // coords packed as (x, y, layer, 0.0f)
+ float2(dx.x, dx.y).data,  // dx and dy are each copied into a float2 temporary
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_SHORT_XYZW;  // macro not visible here; presumably returns texel as short4 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredGrad for texture<unsigned short>: overload taking only texRef.
+__TEXTURE_FUNCTIONS_DECL__ unsigned short tex2DLayeredGrad(
+ texture<unsigned short, texType, mode> texRef, float x, float y, int layer, float2 dx,
+ float2 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f =
+ __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data,  // coords packed as (x, y, layer, 0.0f)
+ float2(dx.x, dx.y).data,  // dx and dy are each copied into a float2 temporary
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_USHORT;  // macro not visible here; presumably returns texel as unsigned short (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredGrad for texture<unsigned short>: overload taking texRef and a textureObject.
+__TEXTURE_FUNCTIONS_DECL__ unsigned short tex2DLayeredGrad(
+ texture<unsigned short, texType, mode> texRef, hipTextureObject_t textureObject, float x,
+ float y, int layer, float2 dx, float2 dy) {
+ TEXTURE_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f =
+ __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data,  // coords packed as (x, y, layer, 0.0f)
+ float2(dx.x, dx.y).data,  // dx and dy are each copied into a float2 temporary
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_USHORT;  // macro not visible here; presumably returns texel as unsigned short (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredGrad for texture<ushort1>: overload taking only texRef.
+__TEXTURE_FUNCTIONS_DECL__ ushort1 tex2DLayeredGrad(texture<ushort1, texType, mode> texRef, float x,
+ float y, int layer, float2 dx, float2 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f =
+ __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data,  // coords packed as (x, y, layer, 0.0f)
+ float2(dx.x, dx.y).data,  // dx and dy are each copied into a float2 temporary
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_USHORT_X;  // macro not visible here; presumably returns texel as ushort1 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredGrad for texture<ushort1>: overload taking texRef and a textureObject.
+__TEXTURE_FUNCTIONS_DECL__ ushort1 tex2DLayeredGrad(texture<ushort1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ float y, int layer, float2 dx, float2 dy) {
+ TEXTURE_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f =
+ __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data,  // coords packed as (x, y, layer, 0.0f)
+ float2(dx.x, dx.y).data,  // dx and dy are each copied into a float2 temporary
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_USHORT_X;  // macro not visible here; presumably returns texel as ushort1 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredGrad for texture<ushort2>: overload taking only texRef.
+__TEXTURE_FUNCTIONS_DECL__ ushort2 tex2DLayeredGrad(texture<ushort2, texType, mode> texRef, float x,
+ float y, int layer, float2 dx, float2 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f =
+ __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data,  // coords packed as (x, y, layer, 0.0f)
+ float2(dx.x, dx.y).data,  // dx and dy are each copied into a float2 temporary
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_USHORT_XY;  // macro not visible here; presumably returns texel as ushort2 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredGrad for texture<ushort2>: overload taking texRef and a textureObject.
+__TEXTURE_FUNCTIONS_DECL__ ushort2 tex2DLayeredGrad(texture<ushort2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ float y, int layer, float2 dx, float2 dy) {
+ TEXTURE_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f =
+ __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data,  // coords packed as (x, y, layer, 0.0f)
+ float2(dx.x, dx.y).data,  // dx and dy are each copied into a float2 temporary
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_USHORT_XY;  // macro not visible here; presumably returns texel as ushort2 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredGrad for texture<ushort4>: overload taking only texRef.
+__TEXTURE_FUNCTIONS_DECL__ ushort4 tex2DLayeredGrad(texture<ushort4, texType, mode> texRef, float x,
+ float y, int layer, float2 dx, float2 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f =
+ __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data,  // coords packed as (x, y, layer, 0.0f)
+ float2(dx.x, dx.y).data,  // dx and dy are each copied into a float2 temporary
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_USHORT_XYZW;  // macro not visible here; presumably returns texel as ushort4 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredGrad for texture<ushort4>: overload taking texRef and a textureObject.
+__TEXTURE_FUNCTIONS_DECL__ ushort4 tex2DLayeredGrad(texture<ushort4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ float y, int layer, float2 dx, float2 dy) {
+ TEXTURE_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f =
+ __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data,  // coords packed as (x, y, layer, 0.0f)
+ float2(dx.x, dx.y).data,  // dx and dy are each copied into a float2 temporary
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_USHORT_XYZW;  // macro not visible here; presumably returns texel as ushort4 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredGrad for texture<int>: overload taking only texRef.
+__TEXTURE_FUNCTIONS_DECL__ int tex2DLayeredGrad(texture<int, texType, mode> texRef, float x,
+ float y, int layer, float2 dx, float2 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f =
+ __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data,  // coords packed as (x, y, layer, 0.0f)
+ float2(dx.x, dx.y).data,  // dx and dy are each copied into a float2 temporary
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_INT;  // macro not visible here; presumably returns texel as int (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredGrad for texture<int>: overload taking texRef and a textureObject.
+__TEXTURE_FUNCTIONS_DECL__ int tex2DLayeredGrad(texture<int, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ int layer, float2 dx, float2 dy) {
+ TEXTURE_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f =
+ __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data,  // coords packed as (x, y, layer, 0.0f)
+ float2(dx.x, dx.y).data,  // dx and dy are each copied into a float2 temporary
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_INT;  // macro not visible here; presumably returns texel as int (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredGrad for texture<int1>: overload taking only texRef.
+__TEXTURE_FUNCTIONS_DECL__ int1 tex2DLayeredGrad(texture<int1, texType, mode> texRef, float x,
+ float y, int layer, float2 dx, float2 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f =
+ __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data,  // coords packed as (x, y, layer, 0.0f)
+ float2(dx.x, dx.y).data,  // dx and dy are each copied into a float2 temporary
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_INT_X;  // macro not visible here; presumably returns texel as int1 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredGrad for texture<int1>: overload taking texRef and a textureObject.
+__TEXTURE_FUNCTIONS_DECL__ int1 tex2DLayeredGrad(texture<int1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ int layer, float2 dx, float2 dy) {
+ TEXTURE_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f =
+ __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data,  // coords packed as (x, y, layer, 0.0f)
+ float2(dx.x, dx.y).data,  // dx and dy are each copied into a float2 temporary
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_INT_X;  // macro not visible here; presumably returns texel as int1 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredGrad for texture<int2>: overload taking only texRef.
+__TEXTURE_FUNCTIONS_DECL__ int2 tex2DLayeredGrad(texture<int2, texType, mode> texRef, float x,
+ float y, int layer, float2 dx, float2 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f =
+ __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data,  // coords packed as (x, y, layer, 0.0f)
+ float2(dx.x, dx.y).data,  // dx and dy are each copied into a float2 temporary
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_INT_XY;  // macro not visible here; presumably returns texel as int2 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredGrad for texture<int2>: overload taking texRef and a textureObject.
+__TEXTURE_FUNCTIONS_DECL__ int2 tex2DLayeredGrad(texture<int2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ int layer, float2 dx, float2 dy) {
+ TEXTURE_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f =
+ __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data,  // coords packed as (x, y, layer, 0.0f)
+ float2(dx.x, dx.y).data,  // dx and dy are each copied into a float2 temporary
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_INT_XY;  // macro not visible here; presumably returns texel as int2 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredGrad for texture<int4>: overload taking only texRef.
+__TEXTURE_FUNCTIONS_DECL__ int4 tex2DLayeredGrad(texture<int4, texType, mode> texRef, float x,
+ float y, int layer, float2 dx, float2 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f =
+ __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data,  // coords packed as (x, y, layer, 0.0f)
+ float2(dx.x, dx.y).data,  // dx and dy are each copied into a float2 temporary
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_INT_XYZW;  // macro not visible here; presumably returns texel as int4 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredGrad for texture<int4>: overload taking texRef and a textureObject.
+__TEXTURE_FUNCTIONS_DECL__ int4 tex2DLayeredGrad(texture<int4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x, float y,
+ int layer, float2 dx, float2 dy) {
+ TEXTURE_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f =
+ __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data,  // coords packed as (x, y, layer, 0.0f)
+ float2(dx.x, dx.y).data,  // dx and dy are each copied into a float2 temporary
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_INT_XYZW;  // macro not visible here; presumably returns texel as int4 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredGrad for texture<unsigned int>: overload taking only texRef.
+__TEXTURE_FUNCTIONS_DECL__ unsigned int tex2DLayeredGrad(
+ texture<unsigned int, texType, mode> texRef, float x, float y, int layer, float2 dx,
+ float2 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f =
+ __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data,  // coords packed as (x, y, layer, 0.0f)
+ float2(dx.x, dx.y).data,  // dx and dy are each copied into a float2 temporary
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_UINT;  // macro not visible here; presumably returns texel as unsigned int (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredGrad for texture<unsigned int>: overload taking texRef and a textureObject.
+__TEXTURE_FUNCTIONS_DECL__ unsigned int tex2DLayeredGrad(
+ texture<unsigned int, texType, mode> texRef, hipTextureObject_t textureObject, float x, float y,
+ int layer, float2 dx, float2 dy) {
+ TEXTURE_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f =
+ __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data,  // coords packed as (x, y, layer, 0.0f)
+ float2(dx.x, dx.y).data,  // dx and dy are each copied into a float2 temporary
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_UINT;  // macro not visible here; presumably returns texel as unsigned int (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredGrad for texture<uint1>: overload taking only texRef.
+__TEXTURE_FUNCTIONS_DECL__ uint1 tex2DLayeredGrad(texture<uint1, texType, mode> texRef, float x,
+ float y, int layer, float2 dx, float2 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f =
+ __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data,  // coords packed as (x, y, layer, 0.0f)
+ float2(dx.x, dx.y).data,  // dx and dy are each copied into a float2 temporary
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_UINT_X;  // macro not visible here; presumably returns texel as uint1 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredGrad for texture<uint1>: overload taking texRef and a textureObject.
+__TEXTURE_FUNCTIONS_DECL__ uint1 tex2DLayeredGrad(texture<uint1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ float y, int layer, float2 dx, float2 dy) {
+ TEXTURE_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f =
+ __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data,  // coords packed as (x, y, layer, 0.0f)
+ float2(dx.x, dx.y).data,  // dx and dy are each copied into a float2 temporary
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_UINT_X;  // macro not visible here; presumably returns texel as uint1 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredGrad for texture<uint2>: overload taking only texRef.
+__TEXTURE_FUNCTIONS_DECL__ uint2 tex2DLayeredGrad(texture<uint2, texType, mode> texRef, float x,
+ float y, int layer, float2 dx, float2 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f =
+ __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data,  // coords packed as (x, y, layer, 0.0f)
+ float2(dx.x, dx.y).data,  // dx and dy are each copied into a float2 temporary
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_UINT_XY;  // macro not visible here; presumably returns texel as uint2 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredGrad for texture<uint2>: overload taking texRef and a textureObject.
+__TEXTURE_FUNCTIONS_DECL__ uint2 tex2DLayeredGrad(texture<uint2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ float y, int layer, float2 dx, float2 dy) {
+ TEXTURE_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f =
+ __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data,  // coords packed as (x, y, layer, 0.0f)
+ float2(dx.x, dx.y).data,  // dx and dy are each copied into a float2 temporary
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_UINT_XY;  // macro not visible here; presumably returns texel as uint2 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredGrad for texture<uint4>: overload taking only texRef.
+__TEXTURE_FUNCTIONS_DECL__ uint4 tex2DLayeredGrad(texture<uint4, texType, mode> texRef, float x,
+ float y, int layer, float2 dx, float2 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f =
+ __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data,  // coords packed as (x, y, layer, 0.0f)
+ float2(dx.x, dx.y).data,  // dx and dy are each copied into a float2 temporary
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_UINT_XYZW;  // macro not visible here; presumably returns texel as uint4 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredGrad for texture<uint4>: overload taking texRef and a textureObject.
+__TEXTURE_FUNCTIONS_DECL__ uint4 tex2DLayeredGrad(texture<uint4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ float y, int layer, float2 dx, float2 dy) {
+ TEXTURE_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f =
+ __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data,  // coords packed as (x, y, layer, 0.0f)
+ float2(dx.x, dx.y).data,  // dx and dy are each copied into a float2 temporary
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_UINT_XYZW;  // macro not visible here; presumably returns texel as uint4 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredGrad for texture<float>: overload taking only texRef.
+__TEXTURE_FUNCTIONS_DECL__ float tex2DLayeredGrad(texture<float, texType, mode> texRef, float x,
+ float y, int layer, float2 dx, float2 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f =
+ __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data,  // coords packed as (x, y, layer, 0.0f)
+ float2(dx.x, dx.y).data,  // dx and dy are each copied into a float2 temporary
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_FLOAT;  // macro not visible here; presumably returns texel as float (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredGrad for texture<float>: overload taking texRef and a textureObject.
+__TEXTURE_FUNCTIONS_DECL__ float tex2DLayeredGrad(texture<float, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ float y, int layer, float2 dx, float2 dy) {
+ TEXTURE_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f =
+ __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data,  // coords packed as (x, y, layer, 0.0f)
+ float2(dx.x, dx.y).data,  // dx and dy are each copied into a float2 temporary
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_FLOAT;  // macro not visible here; presumably returns texel as float (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredGrad for texture<float1>: overload taking only texRef.
+__TEXTURE_FUNCTIONS_DECL__ float1 tex2DLayeredGrad(texture<float1, texType, mode> texRef, float x,
+ float y, int layer, float2 dx, float2 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f =
+ __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data,  // coords packed as (x, y, layer, 0.0f)
+ float2(dx.x, dx.y).data,  // dx and dy are each copied into a float2 temporary
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_FLOAT_X;  // macro not visible here; presumably returns texel as float1 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredGrad for texture<float1>: overload taking texRef and a textureObject.
+__TEXTURE_FUNCTIONS_DECL__ float1 tex2DLayeredGrad(texture<float1, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ float y, int layer, float2 dx, float2 dy) {
+ TEXTURE_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f =
+ __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data,  // coords packed as (x, y, layer, 0.0f)
+ float2(dx.x, dx.y).data,  // dx and dy are each copied into a float2 temporary
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_FLOAT_X;  // macro not visible here; presumably returns texel as float1 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredGrad for texture<float2>: overload taking only texRef.
+__TEXTURE_FUNCTIONS_DECL__ float2 tex2DLayeredGrad(texture<float2, texType, mode> texRef, float x,
+ float y, int layer, float2 dx, float2 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f =
+ __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data,  // coords packed as (x, y, layer, 0.0f)
+ float2(dx.x, dx.y).data,  // dx and dy are each copied into a float2 temporary
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_FLOAT_XY;  // macro not visible here; presumably returns texel as float2 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredGrad for texture<float2>: overload taking texRef and a textureObject.
+__TEXTURE_FUNCTIONS_DECL__ float2 tex2DLayeredGrad(texture<float2, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ float y, int layer, float2 dx, float2 dy) {
+ TEXTURE_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f =
+ __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data,  // coords packed as (x, y, layer, 0.0f)
+ float2(dx.x, dx.y).data,  // dx and dy are each copied into a float2 temporary
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_FLOAT_XY;  // macro not visible here; presumably returns texel as float2 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredGrad for texture<float4>: overload taking only texRef.
+__TEXTURE_FUNCTIONS_DECL__ float4 tex2DLayeredGrad(texture<float4, texType, mode> texRef, float x,
+ float y, int layer, float2 dx, float2 dy) {
+ TEXTURE_REF_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f =
+ __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data,  // coords packed as (x, y, layer, 0.0f)
+ float2(dx.x, dx.y).data,  // dx and dy are each copied into a float2 temporary
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_FLOAT_XYZW;  // macro not visible here; presumably returns texel as float4 (TODO confirm)
+}
+
+template <int texType, enum hipTextureReadMode mode>  // tex2DLayeredGrad for texture<float4>: overload taking texRef and a textureObject.
+__TEXTURE_FUNCTIONS_DECL__ float4 tex2DLayeredGrad(texture<float4, texType, mode> texRef,
+ hipTextureObject_t textureObject, float x,
+ float y, int layer, float2 dx, float2 dy) {
+ TEXTURE_PARAMETERS_INIT;  // macro not visible here; presumably declares i, s, texel (TODO confirm)
+ texel.f =
+ __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data,  // coords packed as (x, y, layer, 0.0f)
+ float2(dx.x, dx.y).data,  // dx and dy are each copied into a float2 temporary
+ float2(dy.x, dy.y).data);
+ TEXTURE_RETURN_FLOAT_XYZW;  // macro not visible here; presumably returns texel as float4 (TODO confirm)
+}
+#endif
diff --git a/third_party/rocm/include/hip/hcc_detail/texture_indirect_functions.h b/third_party/rocm/include/hip/hcc_detail/texture_indirect_functions.h
new file mode 100644
index 0000000..2fe33f3
--- /dev/null
+++ b/third_party/rocm/include/hip/hcc_detail/texture_indirect_functions.h
@@ -0,0 +1,501 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#if defined(__cplusplus)
+
+#include <hip/hip_vector_types.h>
+#include <hip/hip_texture_types.h>
+#include <hip/hcc_detail/ockl_image.h>
+
+#include <type_traits>
+
+#define TEXTURE_OBJECT_PARAMETERS_INIT \
+ unsigned int ADDRESS_SPACE_CONSTANT* i = (unsigned int ADDRESS_SPACE_CONSTANT*)textureObject; \
+ unsigned int ADDRESS_SPACE_CONSTANT* s = i + HIP_SAMPLER_OBJECT_OFFSET_DWORD;
+
+template<typename T>
+struct __hip_is_itex_channel_type
+{
+ static constexpr bool value =
+ std::is_same<T, char>::value ||
+ std::is_same<T, unsigned char>::value ||
+ std::is_same<T, short>::value ||
+ std::is_same<T, unsigned short>::value ||
+ std::is_same<T, int>::value ||
+ std::is_same<T, unsigned int>::value ||
+ std::is_same<T, float>::value;
+};
+
+template<
+ typename T,
+ unsigned int rank>
+struct __hip_is_itex_channel_type<HIP_vector_type<T, rank>>
+{
+ static constexpr bool value =
+ __hip_is_itex_channel_type<T>::value &&
+ ((rank == 1) ||
+ (rank == 2) ||
+ (rank == 4));
+};
+
+template <
+ typename T,
+ typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ T tex1Dfetch(hipTextureObject_t textureObject, int x)
+{
+ TEXTURE_OBJECT_PARAMETERS_INIT
+ auto tmp = __ockl_image_load_1Db(i, x);
+ return *reinterpret_cast<T*>(&tmp);
+}
+
+template <
+ typename T,
+ typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ void tex1Dfetch(T *ptr, hipTextureObject_t textureObject, int x)
+{
+ *ptr = tex1Dfetch<T>(textureObject, x);
+}
+
+template <
+ typename T,
+ typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ T tex1D(hipTextureObject_t textureObject, float x)
+{
+ TEXTURE_OBJECT_PARAMETERS_INIT
+ auto tmp = __ockl_image_sample_1D(i, s, x);
+ return *reinterpret_cast<T*>(&tmp);
+}
+
+template <
+ typename T,
+ typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ void tex1D(T *ptr, hipTextureObject_t textureObject, float x)
+{
+ *ptr = tex1D<T>(textureObject, x);
+}
+
+template <
+ typename T,
+ typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ T tex2D(hipTextureObject_t textureObject, float x, float y)
+{
+ TEXTURE_OBJECT_PARAMETERS_INIT
+ auto tmp = __ockl_image_sample_2D(i, s, float2(x, y).data);
+ return *reinterpret_cast<T*>(&tmp);
+}
+
+template <
+ typename T,
+ typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ void tex2D(T *ptr, hipTextureObject_t textureObject, float x, float y)
+{
+ *ptr = tex2D<T>(textureObject, x, y);
+}
+
+template <
+ typename T,
+ typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ T tex3D(hipTextureObject_t textureObject, float x, float y, float z)
+{
+ TEXTURE_OBJECT_PARAMETERS_INIT
+ auto tmp = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);
+ return *reinterpret_cast<T*>(&tmp);
+}
+
+template <
+ typename T,
+ typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ void tex3D(T *ptr, hipTextureObject_t textureObject, float x, float y, float z)
+{
+ *ptr = tex3D<T>(textureObject, x, y, z);
+}
+
+template <
+ typename T,
+ typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ T tex1DLayered(hipTextureObject_t textureObject, float x, int layer)
+{
+ TEXTURE_OBJECT_PARAMETERS_INIT
+ auto tmp = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
+ return *reinterpret_cast<T*>(&tmp);
+}
+
+template <
+ typename T,
+ typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ void tex1DLayered(T *ptr, hipTextureObject_t textureObject, float x, int layer)
+{
+ *ptr = tex1DLayered<T>(textureObject, x, layer);
+}
+
+template <
+ typename T,
+ typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ T tex2DLayered(hipTextureObject_t textureObject, float x, float y, int layer)
+{
+ TEXTURE_OBJECT_PARAMETERS_INIT
+ auto tmp = __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data);
+ return *reinterpret_cast<T*>(&tmp);
+}
+
+template <
+ typename T,
+ typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ void tex2DLayered(T *ptr, hipTextureObject_t textureObject, float x, float y, int layer)
+{ // Pointer-output form: stores the texel sampled at (x, y) of `layer` into *ptr.
+ *ptr = tex2DLayered<T>(textureObject, x, y, layer); // forward to the value-returning 2D layered overload (was tex1DLayered, wrong arity)
+}
+
+template <
+ typename T,
+ typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ T texCubemap(hipTextureObject_t textureObject, float x, float y, float z)
+{
+ TEXTURE_OBJECT_PARAMETERS_INIT
+ auto tmp = __ockl_image_sample_CM(i, s, float4(x, y, z, 0.0f).data);
+ return *reinterpret_cast<T*>(&tmp);
+}
+
+template <
+ typename T,
+ typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ void texCubemap(T *ptr, hipTextureObject_t textureObject, float x, float y, float z)
+{
+ *ptr = texCubemap<T>(textureObject, x, y, z);
+}
+
+template <
+ typename T,
+ typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ T texCubemapLayered(hipTextureObject_t textureObject, float x, float y, float z, int layer)
+{
+ TEXTURE_OBJECT_PARAMETERS_INIT
+ auto tmp = __ockl_image_sample_CMa(i, s, float4(x, y, z, layer).data);
+ return *reinterpret_cast<T*>(&tmp);
+}
+
+template <
+ typename T,
+ typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ void texCubemapLayered(T *ptr, hipTextureObject_t textureObject, float x, float y, float z, int layer)
+{
+ *ptr = texCubemapLayered<T>(textureObject, x, y, z, layer);
+}
+
+template <
+ typename T,
+ typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ T tex2Dgather(hipTextureObject_t textureObject, float x, float y, int comp = 0)
+{ // Calls the OCKL gather4 builtin selected by `comp` at (x, y) and reinterprets its result as T.
+ TEXTURE_OBJECT_PARAMETERS_INIT
+ switch (comp) { // NOTE(review): comp 1/2/3 select r/g/b and every other value (incl. the default 0) selects a -- confirm this is the intended component order
+ case 1: {
+ auto tmp = __ockl_image_gather4r_2D(i, s, float2(x, y).data);
+ return *reinterpret_cast<T*>(&tmp);
+ break;
+ }
+ case 2: {
+ auto tmp = __ockl_image_gather4g_2D(i, s, float2(x, y).data);
+ return *reinterpret_cast<T*>(&tmp);
+ break;
+ }
+ case 3: {
+ auto tmp = __ockl_image_gather4b_2D(i, s, float2(x, y).data);
+ return *reinterpret_cast<T*>(&tmp);
+ break;
+ }
+ default: {
+ auto tmp = __ockl_image_gather4a_2D(i, s, float2(x, y).data);
+ return *reinterpret_cast<T*>(&tmp);
+ break;
+ }
+ };
+ return {}; // not reached: every switch arm above returns
+}
+
+template <
+ typename T,
+ typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ void tex2Dgather(T *ptr, hipTextureObject_t textureObject, float x, float y, int comp = 0)
+{ // Pointer-output form: stores the gather result for component `comp` at (x, y) into *ptr.
+ *ptr = tex2Dgather<T>(textureObject, x, y, comp); // forward to the value-returning gather overload (was texCubemapLayered, wrong arity)
+}
+
+template <
+ typename T,
+ typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ T tex1DLod(hipTextureObject_t textureObject, float x, float level)
+{
+ TEXTURE_OBJECT_PARAMETERS_INIT
+ auto tmp = __ockl_image_sample_lod_1D(i, s, x, level);
+ return *reinterpret_cast<T*>(&tmp);
+}
+
+template <
+ typename T,
+ typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ void tex1DLod(T *ptr, hipTextureObject_t textureObject, float x, float level)
+{
+ *ptr = tex1DLod<T>(textureObject, x, level);
+}
+
+template <
+ typename T,
+ typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ T tex2DLod(hipTextureObject_t textureObject, float x, float y, float level)
+{
+ TEXTURE_OBJECT_PARAMETERS_INIT
+ auto tmp = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
+ return *reinterpret_cast<T*>(&tmp);
+}
+
+template <
+ typename T,
+ typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ void tex2DLod(T *ptr, hipTextureObject_t textureObject, float x, float y, float level)
+{
+ *ptr = tex2DLod<T>(textureObject, x, y, level);
+}
+
+template <
+ typename T,
+ typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ T tex3DLod(hipTextureObject_t textureObject, float x, float y, float z, float level)
+{
+ TEXTURE_OBJECT_PARAMETERS_INIT
+ auto tmp = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, level);
+ return *reinterpret_cast<T*>(&tmp);
+}
+
+template <
+ typename T,
+ typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ void tex3DLod(T *ptr, hipTextureObject_t textureObject, float x, float y, float z, float level)
+{
+ *ptr = tex3DLod<T>(textureObject, x, y, z, level);
+}
+
+template <
+ typename T,
+ typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ T tex1DLayeredLod(hipTextureObject_t textureObject, float x, int layer, float level)
+{ // Samples the 1D layered texture at coordinate x of `layer`, using the explicit mip level `level`.
+ TEXTURE_OBJECT_PARAMETERS_INIT
+ auto tmp = __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level); // LOD variant so `level` is honoured (was the non-LOD sampler)
+ return *reinterpret_cast<T*>(&tmp);
+}
+
+template <
+ typename T,
+ typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ void tex1DLayeredLod(T *ptr, hipTextureObject_t textureObject, float x, int layer, float level)
+{
+ *ptr = tex1DLayeredLod<T>(textureObject, x, layer, level);
+}
+
+template <
+ typename T,
+ typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ T tex2DLayeredLod(hipTextureObject_t textureObject, float x, float y, int layer, float level)
+{ // Samples the 2D layered texture at (x, y) of `layer`, using the explicit mip level `level`.
+ TEXTURE_OBJECT_PARAMETERS_INIT
+ auto tmp = __ockl_image_sample_lod_2Da(i, s, float4(x, y, layer, 0.0f).data, level); // LOD variant so `level` is honoured (was the non-LOD sampler)
+ return *reinterpret_cast<T*>(&tmp);
+}
+
+template <
+ typename T,
+ typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ void tex2DLayeredLod(T *ptr, hipTextureObject_t textureObject, float x, float y, int layer, float level)
+{
+ *ptr = tex2DLayeredLod<T>(textureObject, x, y, layer, level);
+}
+
+template <
+ typename T,
+ typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ T texCubemapLod(hipTextureObject_t textureObject, float x, float y, float z, float level)
+{
+ TEXTURE_OBJECT_PARAMETERS_INIT
+ auto tmp = __ockl_image_sample_lod_CM(i, s, float4(x, y, z, 0.0f).data, level);
+ return *reinterpret_cast<T*>(&tmp);
+}
+
+template <
+ typename T,
+ typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ void texCubemapLod(T *ptr, hipTextureObject_t textureObject, float x, float y, float z, float level)
+{
+ *ptr = texCubemapLod<T>(textureObject, x, y, z, level);
+}
+
+template <
+ typename T,
+ typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ T texCubemapGrad(hipTextureObject_t textureObject, float x, float y, float z, float4 dPdx, float4 dPdy)
+{
+ TEXTURE_OBJECT_PARAMETERS_INIT
+ // TODO missing in device libs.
+ // auto tmp = __ockl_image_sample_grad_CM(i, s, float4(x, y, z, 0.0f).data, float4(dPdx.x, dPdx.y, dPdx.z, 0.0f).data, float4(dPdy.x, dPdy.y, dPdy.z, 0.0f).data);
+ // return *reinterpret_cast<T*>(&tmp);
+ return {};
+}
+
+template <
+ typename T,
+ typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ void texCubemapGrad(T *ptr, hipTextureObject_t textureObject, float x, float y, float z, float4 dPdx, float4 dPdy)
+{
+ *ptr = texCubemapGrad<T>(textureObject, x, y, z, dPdx, dPdy);
+}
+
+template <
+ typename T,
+ typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ T texCubemapLayeredLod(hipTextureObject_t textureObject, float x, float y, float z, int layer, float level)
+{
+ TEXTURE_OBJECT_PARAMETERS_INIT
+ auto tmp = __ockl_image_sample_lod_CMa(i, s, float4(x, y, z, layer).data, level);
+ return *reinterpret_cast<T*>(&tmp);
+}
+
+template <
+ typename T,
+ typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ void texCubemapLayeredLod(T *ptr, hipTextureObject_t textureObject, float x, float y, float z, int layer, float level)
+{
+ *ptr = texCubemapLayeredLod<T>(textureObject, x, y, z, layer, level);
+}
+
+template <
+ typename T,
+ typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ T tex1DGrad(hipTextureObject_t textureObject, float x, float dPdx, float dPdy)
+{
+ TEXTURE_OBJECT_PARAMETERS_INIT
+ auto tmp = __ockl_image_sample_grad_1D(i, s, x, dPdx, dPdy);
+ return *reinterpret_cast<T*>(&tmp);
+}
+
+template <
+ typename T,
+ typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ void tex1DGrad(T *ptr, hipTextureObject_t textureObject, float x, float dPdx, float dPdy)
+{
+ *ptr = tex1DGrad<T>(textureObject, x, dPdx, dPdy);
+}
+
+template <
+ typename T,
+ typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ T tex2DGrad(hipTextureObject_t textureObject, float x, float y, float2 dPdx, float2 dPdy)
+{
+ TEXTURE_OBJECT_PARAMETERS_INIT
+ auto tmp = __ockl_image_sample_grad_2D(i, s, float2(x, y).data, float2(dPdx.x, dPdx.y).data, float2(dPdy.x, dPdy.y).data);
+ return *reinterpret_cast<T*>(&tmp);
+}
+
+template <
+ typename T,
+ typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ void tex2DGrad(T *ptr, hipTextureObject_t textureObject, float x, float y, float2 dPdx, float2 dPdy)
+{
+ *ptr = tex2DGrad<T>(textureObject, x, y, dPdx, dPdy);
+}
+
+template <
+ typename T,
+ typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ T tex3DGrad(hipTextureObject_t textureObject, float x, float y, float z, float4 dPdx, float4 dPdy)
+{
+ TEXTURE_OBJECT_PARAMETERS_INIT
+ auto tmp = __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, float4(dPdx.x, dPdx.y, dPdx.z, 0.0f).data, float4(dPdy.x, dPdy.y, dPdy.z, 0.0f).data);
+ return *reinterpret_cast<T*>(&tmp);
+}
+
+template <
+ typename T,
+ typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ void tex3DGrad(T *ptr, hipTextureObject_t textureObject, float x, float y, float z, float4 dPdx, float4 dPdy)
+{
+ *ptr = tex3DGrad<T>(textureObject, x, y, z, dPdx, dPdy);
+}
+
+template <
+ typename T,
+ typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ T tex1DLayeredGrad(hipTextureObject_t textureObject, float x, int layer, float dPdx, float dPdy)
+{
+ TEXTURE_OBJECT_PARAMETERS_INIT
+ auto tmp = __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dPdx, dPdy);
+ return *reinterpret_cast<T*>(&tmp);
+}
+
+template <
+ typename T,
+ typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ void tex1DLayeredGrad(T *ptr, hipTextureObject_t textureObject, float x, int layer, float dPdx, float dPdy)
+{
+ *ptr = tex1DLayeredGrad<T>(textureObject, x, layer, dPdx, dPdy);
+}
+
+template <
+ typename T,
+ typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ T tex2DLayeredGrad(hipTextureObject_t textureObject, float x, float y, int layer, float2 dPdx, float2 dPdy)
+{
+ TEXTURE_OBJECT_PARAMETERS_INIT
+ auto tmp = __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, float2(dPdx.x, dPdx.y).data, float2(dPdy.x, dPdy.y).data);
+ return *reinterpret_cast<T*>(&tmp);
+}
+
+template <
+ typename T,
+ typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ void tex2DLayeredGrad(T *ptr, hipTextureObject_t textureObject, float x, float y, int layer, float2 dPdx, float2 dPdy)
+{
+ *ptr = tex2DLayeredGrad<T>(textureObject, x, y, layer, dPdx, dPdy);
+}
+
+template <
+ typename T,
+ typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ T texCubemapLayeredGrad(hipTextureObject_t textureObject, float x, float y, float z, int layer, float4 dPdx, float4 dPdy)
+{
+ TEXTURE_OBJECT_PARAMETERS_INIT
+ // TODO missing in device libs.
+ // auto tmp = __ockl_image_sample_grad_CMa(i, s, float4(x, y, z, layer).data, float4(dPdx.x, dPdx.y, dPdx.z, 0.0f).data, float4(dPdy.x, dPdy.y, dPdy.z, 0.0f).data);
+ // return *reinterpret_cast<T*>(&tmp);
+ return {};
+}
+
+template <
+ typename T,
+ typename std::enable_if<__hip_is_itex_channel_type<T>::value>::type* = nullptr>
+static __device__ void texCubemapLayeredGrad(T *ptr, hipTextureObject_t textureObject, float x, float y, float z, int layer, float4 dPdx, float4 dPdy)
+{
+ *ptr = texCubemapLayeredGrad<T>(textureObject, x, y, z, layer, dPdx, dPdy);
+}
+
+#endif
diff --git a/third_party/rocm/include/hip/hcc_detail/texture_types.h b/third_party/rocm/include/hip/hcc_detail/texture_types.h
new file mode 100644
index 0000000..832b909
--- /dev/null
+++ b/third_party/rocm/include/hip/hcc_detail/texture_types.h
@@ -0,0 +1,109 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+
+#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_TEXTURE_TYPES_H
+#define HIP_INCLUDE_HIP_HCC_DETAIL_TEXTURE_TYPES_H
+
+#include <hip/hcc_detail/driver_types.h>
+
+#define hipTextureType1D 0x01
+#define hipTextureType2D 0x02
+#define hipTextureType3D 0x03
+#define hipTextureTypeCubemap 0x0C
+#define hipTextureType1DLayered 0xF1
+#define hipTextureType2DLayered 0xF2
+#define hipTextureTypeCubemapLayered 0xFC
+
+/**
+ * Should be same as HSA_IMAGE_OBJECT_SIZE_DWORD/HSA_SAMPLER_OBJECT_SIZE_DWORD
+ */
+#define HIP_IMAGE_OBJECT_SIZE_DWORD 12
+#define HIP_SAMPLER_OBJECT_SIZE_DWORD 8
+#define HIP_SAMPLER_OBJECT_OFFSET_DWORD HIP_IMAGE_OBJECT_SIZE_DWORD
+#define HIP_TEXTURE_OBJECT_SIZE_DWORD (HIP_IMAGE_OBJECT_SIZE_DWORD + HIP_SAMPLER_OBJECT_SIZE_DWORD)
+
+/**
+ * An opaque value that represents a hip texture object
+ */
+struct __hip_texture;
+typedef struct __hip_texture* hipTextureObject_t;
+
+/**
+ * hip texture address modes
+ */
+enum hipTextureAddressMode {
+ hipAddressModeWrap = 0,
+ hipAddressModeClamp = 1,
+ hipAddressModeMirror = 2,
+ hipAddressModeBorder = 3
+};
+
+/**
+ * hip texture filter modes
+ */
+enum hipTextureFilterMode { hipFilterModePoint = 0, hipFilterModeLinear = 1 };
+
+/**
+ * hip texture read modes
+ */
+enum hipTextureReadMode { hipReadModeElementType = 0, hipReadModeNormalizedFloat = 1 };
+
+/**
+ * hip texture reference
+ */
+typedef struct textureReference {
+ int normalized;
+ enum hipTextureReadMode readMode;// used only for driver API's
+ enum hipTextureFilterMode filterMode;
+ enum hipTextureAddressMode addressMode[3]; // Texture address mode for up to 3 dimensions
+ struct hipChannelFormatDesc channelDesc;
+ int sRGB; // Perform sRGB->linear conversion during texture read
+ unsigned int maxAnisotropy; // Limit to the anisotropy ratio
+ enum hipTextureFilterMode mipmapFilterMode;
+ float mipmapLevelBias;
+ float minMipmapLevelClamp;
+ float maxMipmapLevelClamp;
+
+ hipTextureObject_t textureObject;
+ int numChannels;
+ enum hipArray_Format format;
+}textureReference;
+
+/**
+ * hip texture descriptor
+ */
+typedef struct hipTextureDesc {
+ enum hipTextureAddressMode addressMode[3]; // Texture address mode for up to 3 dimensions
+ enum hipTextureFilterMode filterMode;
+ enum hipTextureReadMode readMode;
+ int sRGB; // Perform sRGB->linear conversion during texture read
+ float borderColor[4];
+ int normalizedCoords;
+ unsigned int maxAnisotropy;
+ enum hipTextureFilterMode mipmapFilterMode;
+ float mipmapLevelBias;
+ float minMipmapLevelClamp;
+ float maxMipmapLevelClamp;
+}hipTextureDesc;
+
+#endif
diff --git a/third_party/rocm/include/hip/hip_bfloat16.h b/third_party/rocm/include/hip/hip_bfloat16.h
new file mode 100644
index 0000000..ef09cf0
--- /dev/null
+++ b/third_party/rocm/include/hip/hip_bfloat16.h
@@ -0,0 +1,280 @@
+/**
+ * MIT License
+ *
+ * Copyright 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*!\file
+ * \brief hip_bfloat16.h provides struct for hip_bfloat16 typedef
+ */
+
+#ifndef _HIP_BFLOAT16_H_
+#define _HIP_BFLOAT16_H_
+
+#if __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__))
+
+// If this is a C compiler, C++ compiler below C++11, or a host-only compiler, we only
+// include a minimal definition of hip_bfloat16
+
+#include <stdint.h>
+/*! \brief Struct to represent a 16 bit brain floating point number. */
+typedef struct
+{
+ uint16_t data;
+} hip_bfloat16;
+
+#else // __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__))
+
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <hip/hip_runtime.h>
+#include <ostream>
+#include <type_traits>
+
+struct hip_bfloat16
+{
+ uint16_t data;
+
+ enum truncate_t
+ {
+ truncate
+ };
+
+ __host__ __device__ hip_bfloat16() = default;
+
+ // round upper 16 bits of IEEE float to convert to bfloat16
+ explicit __host__ __device__ hip_bfloat16(float f)
+ : data(float_to_bfloat16(f))
+ {
+ }
+
+ explicit __host__ __device__ hip_bfloat16(float f, truncate_t)
+ : data(truncate_float_to_bfloat16(f))
+ {
+ }
+
+ // zero extend lower 16 bits of bfloat16 to convert to IEEE float
+ __host__ __device__ operator float() const
+ {
+ union
+ {
+ uint32_t int32;
+ float fp32;
+ } u = {uint32_t(data) << 16};
+ return u.fp32;
+ }
+
+ static __host__ __device__ hip_bfloat16 round_to_bfloat16(float f)
+ {
+ hip_bfloat16 output;
+ output.data = float_to_bfloat16(f);
+ return output;
+ }
+
+ static __host__ __device__ hip_bfloat16 round_to_bfloat16(float f, truncate_t)
+ {
+ hip_bfloat16 output;
+ output.data = truncate_float_to_bfloat16(f);
+ return output;
+ }
+
+private:
+ static __host__ __device__ uint16_t float_to_bfloat16(float f)
+ {
+ union
+ {
+ float fp32;
+ uint32_t int32;
+ } u = {f};
+ if(~u.int32 & 0x7f800000)
+ {
+ // When the exponent bits are not all 1s, then the value is zero, normal,
+ // or subnormal. We round the bfloat16 mantissa up by adding 0x7FFF, plus
+ // 1 if the least significant bit of the bfloat16 mantissa is 1 (odd).
+ // This causes the bfloat16's mantissa to be incremented by 1 if the 16
+ // least significant bits of the float mantissa are greater than 0x8000,
+ // or if they are equal to 0x8000 and the least significant bit of the
+ // bfloat16 mantissa is 1 (odd). This causes it to be rounded to even when
+ // the lower 16 bits are exactly 0x8000. If the bfloat16 mantissa already
+ // has the value 0x7f, then incrementing it causes it to become 0x00 and
+ // the exponent is incremented by one, which is the next higher FP value
+ // to the unrounded bfloat16 value. When the bfloat16 value is subnormal
+ // with an exponent of 0x00 and a mantissa of 0x7F, it may be rounded up
+ // to a normal value with an exponent of 0x01 and a mantissa of 0x00.
+ // When the bfloat16 value has an exponent of 0xFE and a mantissa of 0x7F,
+ // incrementing it causes it to become an exponent of 0xFF and a mantissa
+ // of 0x00, which is Inf, the next higher value to the unrounded value.
+ u.int32 += 0x7fff + ((u.int32 >> 16) & 1); // Round to nearest, round to even
+ }
+ else if(u.int32 & 0xffff)
+ {
+ // When all of the exponent bits are 1, the value is Inf or NaN.
+ // Inf is indicated by a zero mantissa. NaN is indicated by any nonzero
+ // mantissa bit. Quiet NaN is indicated by the most significant mantissa
+ // bit being 1. Signaling NaN is indicated by the most significant
+ // mantissa bit being 0 but some other bit(s) being 1. If any of the
+ // lower 16 bits of the mantissa are 1, we set the least significant bit
+ // of the bfloat16 mantissa, in order to preserve signaling NaN in case
+ // the bfloat16's mantissa bits are all 0.
+ u.int32 |= 0x10000; // Preserve signaling NaN
+ }
+ return uint16_t(u.int32 >> 16);
+ }
+
+ // Truncate instead of rounding, preserving SNaN
+ static __host__ __device__ uint16_t truncate_float_to_bfloat16(float f)
+ {
+ union
+ {
+ float fp32;
+ uint32_t int32;
+ } u = {f};
+ return uint16_t(u.int32 >> 16) | (!(~u.int32 & 0x7f800000) && (u.int32 & 0xffff));
+ }
+};
+
+typedef struct
+{
+ uint16_t data;
+} hip_bfloat16_public;
+
+static_assert(std::is_standard_layout<hip_bfloat16>{},
+ "hip_bfloat16 is not a standard layout type, and thus is "
+ "incompatible with C.");
+
+static_assert(std::is_trivial<hip_bfloat16>{},
+ "hip_bfloat16 is not a trivial type, and thus is "
+ "incompatible with C.");
+
+static_assert(sizeof(hip_bfloat16) == sizeof(hip_bfloat16_public)
+ && offsetof(hip_bfloat16, data) == offsetof(hip_bfloat16_public, data),
+ "internal hip_bfloat16 does not match public hip_bfloat16");
+
+inline std::ostream& operator<<(std::ostream& os, const hip_bfloat16& bf16)
+{
+ return os << float(bf16);
+}
+inline __host__ __device__ hip_bfloat16 operator+(hip_bfloat16 a)
+{
+ return a;
+}
+inline __host__ __device__ hip_bfloat16 operator-(hip_bfloat16 a)
+{
+ a.data ^= 0x8000;
+ return a;
+}
+inline __host__ __device__ hip_bfloat16 operator+(hip_bfloat16 a, hip_bfloat16 b)
+{
+ return hip_bfloat16(float(a) + float(b));
+}
+inline __host__ __device__ hip_bfloat16 operator-(hip_bfloat16 a, hip_bfloat16 b)
+{
+ return hip_bfloat16(float(a) - float(b));
+}
+inline __host__ __device__ hip_bfloat16 operator*(hip_bfloat16 a, hip_bfloat16 b)
+{
+ return hip_bfloat16(float(a) * float(b));
+}
+inline __host__ __device__ hip_bfloat16 operator/(hip_bfloat16 a, hip_bfloat16 b)
+{
+ return hip_bfloat16(float(a) / float(b));
+}
+inline __host__ __device__ bool operator<(hip_bfloat16 a, hip_bfloat16 b)
+{
+ return float(a) < float(b);
+}
+inline __host__ __device__ bool operator==(hip_bfloat16 a, hip_bfloat16 b)
+{
+ return float(a) == float(b);
+}
+inline __host__ __device__ bool operator>(hip_bfloat16 a, hip_bfloat16 b)
+{
+ return b < a;
+}
+inline __host__ __device__ bool operator<=(hip_bfloat16 a, hip_bfloat16 b)
+{ // Less-than-or-equal on the float values; false when either operand is NaN.
+ return float(a) <= float(b); // direct compare: !(a > b) wrongly yielded true for unordered (NaN) operands
+}
+inline __host__ __device__ bool operator!=(hip_bfloat16 a, hip_bfloat16 b)
+{
+ return !(a == b);
+}
+inline __host__ __device__ bool operator>=(hip_bfloat16 a, hip_bfloat16 b)
+{ // Greater-than-or-equal on the float values; false when either operand is NaN.
+ return float(a) >= float(b); // direct compare: !(a < b) wrongly yielded true for unordered (NaN) operands
+}
+inline __host__ __device__ hip_bfloat16& operator+=(hip_bfloat16& a, hip_bfloat16 b)
+{
+ return a = a + b;
+}
+inline __host__ __device__ hip_bfloat16& operator-=(hip_bfloat16& a, hip_bfloat16 b)
+{
+ return a = a - b;
+}
+inline __host__ __device__ hip_bfloat16& operator*=(hip_bfloat16& a, hip_bfloat16 b)
+{
+ return a = a * b;
+}
+inline __host__ __device__ hip_bfloat16& operator/=(hip_bfloat16& a, hip_bfloat16 b)
+{
+ return a = a / b;
+}
+inline __host__ __device__ hip_bfloat16& operator++(hip_bfloat16& a)
+{
+ return a += hip_bfloat16(1.0f);
+}
+inline __host__ __device__ hip_bfloat16& operator--(hip_bfloat16& a)
+{
+ return a -= hip_bfloat16(1.0f);
+}
+inline __host__ __device__ hip_bfloat16 operator++(hip_bfloat16& a, int)
+{
+ hip_bfloat16 orig = a;
+ ++a;
+ return orig;
+}
+inline __host__ __device__ hip_bfloat16 operator--(hip_bfloat16& a, int)
+{
+ hip_bfloat16 orig = a;
+ --a;
+ return orig;
+}
+
+namespace std
+{
+ constexpr __host__ __device__ bool isinf(hip_bfloat16 a)
+ {
+ return !(~a.data & 0x7f80) && !(a.data & 0x7f);
+ }
+ constexpr __host__ __device__ bool isnan(hip_bfloat16 a)
+ {
+ return !(~a.data & 0x7f80) && +(a.data & 0x7f);
+ }
+ constexpr __host__ __device__ bool iszero(hip_bfloat16 a)
+ {
+ return !(a.data & 0x7fff);
+ }
+}
+
+#endif // __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__))
+
+#endif // _HIP_BFLOAT16_H_
diff --git a/third_party/rocm/include/hip/hip_common.h b/third_party/rocm/include/hip/hip_common.h
new file mode 100644
index 0000000..79c787b
--- /dev/null
+++ b/third_party/rocm/include/hip/hip_common.h
@@ -0,0 +1,87 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_HIP_COMMON_H
+#define HIP_INCLUDE_HIP_HIP_COMMON_H
+
+// Common code included at start of every hip file.
+// Auto enable __HIP_PLATFORM_HCC__ if compiling with HCC, or with clang when __HIP__ is defined.
+// Other compilers (GCC, ICC, etc.) need to set one of these macros explicitly.
+#if defined(__HCC__) || (defined(__clang__) && defined(__HIP__))
+#define __HIP_PLATFORM_HCC__
+#endif // __HCC__ || (__clang__ && __HIP__)
+
+// Auto enable __HIP_PLATFORM_NVCC__ if compiling with NVCC, or with clang when __CUDA__ is defined and __HIP__ is not
+#if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__) && !defined(__HIP__))
+#define __HIP_PLATFORM_NVCC__
+#ifdef __CUDACC__
+#define __HIPCC__
+#endif
+
+#endif // __NVCC__ || (__clang__ && __CUDA__ && !__HIP__)
+
+// Auto enable __HIP_DEVICE_COMPILE__ if compiled in HCC or NVCC device path
+#if (defined(__HCC_ACCELERATOR__) && __HCC_ACCELERATOR__ != 0) || \
+ (defined(__CUDA_ARCH__) && __CUDA_ARCH__ != 0)
+#define __HIP_DEVICE_COMPILE__ 1
+#endif
+
+#ifdef __GNUC__ // GNU-compatible compilers: mark API symbols with default visibility
+#define HIP_PUBLIC_API __attribute__ ((visibility ("default")))
+#define HIP_INTERNAL_EXPORTED_API __attribute__ ((visibility ("default")))
+#else // other compilers: the annotations expand to nothing
+#define HIP_PUBLIC_API
+#define HIP_INTERNAL_EXPORTED_API
+#endif
+
+#if !defined(__HIP_DEVICE_COMPILE__) || __HIP_DEVICE_COMPILE__ == 0 // not a device pass: every arch feature reports 0
+// 32-bit Atomics
+#define __HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__ (0)
+#define __HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__ (0)
+#define __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__ (0)
+#define __HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__ (0)
+#define __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__ (0)
+
+// 64-bit Atomics
+#define __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__ (0)
+#define __HIP_ARCH_HAS_SHARED_INT64_ATOMICS__ (0)
+
+// Doubles
+#define __HIP_ARCH_HAS_DOUBLES__ (0)
+
+// Warp cross-lane operations
+#define __HIP_ARCH_HAS_WARP_VOTE__ (0)
+#define __HIP_ARCH_HAS_WARP_BALLOT__ (0)
+#define __HIP_ARCH_HAS_WARP_SHUFFLE__ (0)
+#define __HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__ (0)
+
+// Sync
+#define __HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__ (0)
+#define __HIP_ARCH_HAS_SYNC_THREAD_EXT__ (0)
+
+// Misc
+#define __HIP_ARCH_HAS_SURFACE_FUNCS__ (0)
+#define __HIP_ARCH_HAS_3DGRID__ (0)
+#define __HIP_ARCH_HAS_DYNAMIC_PARALLEL__ (0)
+#endif // !defined(__HIP_DEVICE_COMPILE__) || __HIP_DEVICE_COMPILE__ == 0
+
+#endif
diff --git a/third_party/rocm/include/hip/hip_complex.h b/third_party/rocm/include/hip/hip_complex.h
new file mode 100644
index 0000000..fb9cad5
--- /dev/null
+++ b/third_party/rocm/include/hip/hip_complex.h
@@ -0,0 +1,36 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_HIP_COMPLEX_H
+#define HIP_INCLUDE_HIP_HIP_COMPLEX_H
+
+#include <hip/hip_common.h>
+
+#if defined(__HIP_PLATFORM_HCC__) && !defined(__HIP_PLATFORM_NVCC__)
+#include <hip/hcc_detail/hip_complex.h>
+#elif defined(__HIP_PLATFORM_NVCC__) && !defined(__HIP_PLATFORM_HCC__)
+#include <hip/nvcc_detail/hip_complex.h>
+#else
+#error("Must define exactly one of __HIP_PLATFORM_HCC__ or __HIP_PLATFORM_NVCC__");
+#endif
+
+#endif
diff --git a/third_party/rocm/include/hip/hip_cooperative_groups.h b/third_party/rocm/include/hip/hip_cooperative_groups.h
new file mode 100644
index 0000000..41f3637
--- /dev/null
+++ b/third_party/rocm/include/hip/hip_cooperative_groups.h
@@ -0,0 +1,46 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/**
+ * @file hip_cooperative_groups.h
+ *
+ * @brief Defines new types and device API wrappers for `Cooperative Group`
+ * feature.
+ */
+
+#ifndef HIP_INCLUDE_HIP_HIP_COOPERATIVE_GROUP_H
+#define HIP_INCLUDE_HIP_HIP_COOPERATIVE_GROUP_H
+
+#include <hip/hip_version.h>
+#include <hip/hip_common.h>
+
+#if defined(__HIP_PLATFORM_HCC__) && !defined(__HIP_PLATFORM_NVCC__)
+#if __cplusplus && defined(__clang__) && defined(__HIP__)
+#include <hip/hcc_detail/hip_cooperative_groups.h>
+#endif
+#elif defined(__HIP_PLATFORM_NVCC__) && !defined(__HIP_PLATFORM_HCC__)
+#include <hip/nvcc_detail/hip_cooperative_groups.h>
+#else
+#error("Must define exactly one of __HIP_PLATFORM_HCC__ or __HIP_PLATFORM_NVCC__");
+#endif
+
+#endif // HIP_INCLUDE_HIP_HIP_COOPERATIVE_GROUP_H
diff --git a/third_party/rocm/include/hip/hip_ext.h b/third_party/rocm/include/hip/hip_ext.h
new file mode 100644
index 0000000..ef8f53b
--- /dev/null
+++ b/third_party/rocm/include/hip/hip_ext.h
@@ -0,0 +1,164 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_HIP_EXT_H
+#define HIP_INCLUDE_HIP_HIP_EXT_H
+#include "hip/hip_runtime.h"
+#if defined(__cplusplus)
+#include <tuple>
+#include <type_traits>
+#endif
+/** @addtogroup Module Module Management
+ * @{
+ */
+
+/**
+ * @brief Launches kernel f with launch parameters and shared memory on stream, with arguments
+ passed via kernelParams or extra.
+ *
+ * @param [in] f Kernel to launch.
+ * @param [in] globalWorkSizeX X grid dimension specified in work-items
+ * @param [in] globalWorkSizeY Y grid dimension specified in work-items
+ * @param [in] globalWorkSizeZ Z grid dimension specified in work-items
+ * @param [in] localWorkSizeX X block dimension specified in work-items
+ * @param [in] localWorkSizeY Y block dimension specified in work-items
+ * @param [in] localWorkSizeZ Z block dimension specified in work-items
+ * @param [in] sharedMemBytes Amount of dynamic shared memory to allocate for this kernel. The
+ kernel can access this with HIP_DYNAMIC_SHARED.
+ * @param [in] hStream Stream where the kernel should be dispatched. May be 0, in which case the
+ default stream is used with associated synchronization rules.
+ * @param [in] kernelParams Not yet implemented (see the warning in this comment); use extra.
+ * @param [in] extra Pointer to kernel arguments. These are passed directly to the kernel and
+ must be in the memory layout and alignment expected by the kernel.
+ * @param [in] startEvent If non-null, specified event will be updated to track the start time of
+ the kernel launch. The event must be created before calling this API.
+ * @param [in] stopEvent Like startEvent, but tracks the stop time of the kernel launch.
+ * @param [in] flags Launch flags, 0 by default (semantics not described here; TODO confirm).
+ *
+ * @returns hipSuccess, hipErrorInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue
+ *
+ * @warning kernelParams argument is not yet implemented in HIP. Please use extra instead. Please
+ refer to hip_porting_driver_api.md for sample usage.
+ * HIP/ROCm actually updates the start event when the associated kernel completes.
+ */
+HIP_PUBLIC_API
+hipError_t hipExtModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
+ uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ,
+ uint32_t localWorkSizeX, uint32_t localWorkSizeY,
+ uint32_t localWorkSizeZ, size_t sharedMemBytes,
+ hipStream_t hStream, void** kernelParams, void** extra,
+ hipEvent_t startEvent = nullptr,
+ hipEvent_t stopEvent = nullptr,
+ uint32_t flags = 0);
+
+HIP_PUBLIC_API // Deprecated spelling of hipExtModuleLaunchKernel; this declaration has no flags parameter.
+hipError_t hipHccModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX,
+ uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ,
+ uint32_t localWorkSizeX, uint32_t localWorkSizeY,
+ uint32_t localWorkSizeZ, size_t sharedMemBytes,
+ hipStream_t hStream, void** kernelParams, void** extra,
+ hipEvent_t startEvent = nullptr,
+ hipEvent_t stopEvent = nullptr)
+ __attribute__((deprecated("use hipExtModuleLaunchKernel instead")));
+
+#if defined(__HIP_ROCclr__) && defined(__cplusplus)
+
+extern "C" hipError_t hipExtLaunchKernel(const void* function_address, dim3 numBlocks,
+ dim3 dimBlocks, void** args, size_t sharedMemBytes,
+ hipStream_t stream, hipEvent_t startEvent,
+ hipEvent_t stopEvent, int flags); // C-linkage entry point called by hipExtLaunchKernelGGL
+
+template <typename... Args, typename F = void (*)(Args...)>
+inline void hipExtLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks,
+ std::uint32_t sharedMemBytes, hipStream_t stream,
+ hipEvent_t startEvent, hipEvent_t stopEvent, std::uint32_t flags,
+ Args... args) {
+ constexpr size_t count = sizeof...(Args);
+ auto tup_ = std::tuple<Args...>{args...};
+ auto tup = validateArgsCountType(kernel, tup_); // helper declared elsewhere; checks args against the kernel signature
+ void* kernelArgs[count]; // one slot per kernel argument (was `_Args`, a reserved identifier)
+ pArgs<0>(tup, kernelArgs);
+ // This wrapper returns void, so the launch status cannot be propagated; drop it explicitly
+ auto k = reinterpret_cast<void*>(kernel);
+ (void)hipExtLaunchKernel(k, numBlocks, dimBlocks, kernelArgs, sharedMemBytes, stream, startEvent,
+ stopEvent, (int)flags); // the cast avoids a [[nodiscard]] warning on hipError_t in C++17
+}
+#elif defined(__HIP_PLATFORM_HCC__) && GENERIC_GRID_LAUNCH == 1 && defined(__HCC__)
+//kernel_descriptor and hip_impl::make_kernarg are in "grid_launch_GGL.hpp"
+
+namespace hip_impl {
+inline
+__attribute__((visibility("hidden")))
+void hipExtLaunchKernelGGLImpl(
+ std::uintptr_t function_address,
+ const dim3& numBlocks,
+ const dim3& dimBlocks,
+ std::uint32_t sharedMemBytes,
+ hipStream_t stream,
+ hipEvent_t startEvent,
+ hipEvent_t stopEvent,
+ std::uint32_t flags,
+ void** kernarg) {
+ // Look up the kernel descriptor for this function address on the agent targeted by `stream`.
+ const auto& kd = hip_impl::get_program_state()
+ .kernel_descriptor(function_address, target_agent(stream));
+ // Global sizes are blocks * threads-per-block per axis; status dropped explicitly (void return).
+ (void)hipExtModuleLaunchKernel(kd, numBlocks.x * dimBlocks.x,
+ numBlocks.y * dimBlocks.y,
+ numBlocks.z * dimBlocks.z,
+ dimBlocks.x, dimBlocks.y, dimBlocks.z,
+ sharedMemBytes, stream, nullptr, kernarg,
+ startEvent, stopEvent, flags); // the cast avoids a [[nodiscard]] warning in C++17
+}
+} // namespace hip_impl
+
+template <typename... Args, typename F = void (*)(Args...)>
+inline
+void hipExtLaunchKernelGGL(F kernel, const dim3& numBlocks,
+ const dim3& dimBlocks, std::uint32_t sharedMemBytes,
+ hipStream_t stream, hipEvent_t startEvent,
+ hipEvent_t stopEvent, std::uint32_t flags,
+ Args... args) {
+ hip_impl::hip_init();
+ auto packedArgs =
+ hip_impl::make_kernarg(kernel, std::tuple<Args...>{std::move(args)...});
+ std::size_t packedSize = packedArgs.size();
+ // Launch configuration: (key, value) pairs for buffer pointer and size, then the end marker.
+ void* launchConfig[] = {
+ HIP_LAUNCH_PARAM_BUFFER_POINTER,
+ packedArgs.data(),
+ HIP_LAUNCH_PARAM_BUFFER_SIZE,
+ &packedSize,
+ HIP_LAUNCH_PARAM_END};
+ // Forward to the implementation with the kernel address as an integer.
+ hip_impl::hipExtLaunchKernelGGLImpl(reinterpret_cast<std::uintptr_t>(kernel),
+ numBlocks, dimBlocks, sharedMemBytes,
+ stream, startEvent, stopEvent, flags,
+ launchConfig);
+}
+#endif // !__HIP_ROCclr__ && defined(__cplusplus)
+
+// doxygen end AMD-specific features
+/**
+ * @}
+ */
+#endif // #ifndef HIP_INCLUDE_HIP_HIP_EXT_H
diff --git a/third_party/rocm/include/hip/hip_fp16.h b/third_party/rocm/include/hip/hip_fp16.h
new file mode 100644
index 0000000..994ce62
--- /dev/null
+++ b/third_party/rocm/include/hip/hip_fp16.h
@@ -0,0 +1,36 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_HIP_FP16_H
+#define HIP_INCLUDE_HIP_HIP_FP16_H
+
+#include <hip/hip_common.h>
+
+#if defined(__HIP_PLATFORM_HCC__) && !defined(__HIP_PLATFORM_NVCC__)
+#include <hip/hcc_detail/hip_fp16.h>
+#elif defined(__HIP_PLATFORM_NVCC__) && !defined(__HIP_PLATFORM_HCC__)
+#include "cuda_fp16.h"
+#else
+#error("Must define exactly one of __HIP_PLATFORM_HCC__ or __HIP_PLATFORM_NVCC__");
+#endif
+
+#endif
diff --git a/third_party/rocm/include/hip/hip_hcc.h b/third_party/rocm/include/hip/hip_hcc.h
new file mode 100644
index 0000000..e7e27fc
--- /dev/null
+++ b/third_party/rocm/include/hip/hip_hcc.h
@@ -0,0 +1,24 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_HIP_HCC_H
+#define HIP_INCLUDE_HIP_HIP_HCC_H
+#warning "hip/hip_hcc.h is deprecated, please use hip/hip_ext.h"
+#include "hip/hip_ext.h"
+#endif // #ifndef HIP_INCLUDE_HIP_HIP_HCC_H
diff --git a/third_party/rocm/include/hip/hip_profile.h b/third_party/rocm/include/hip/hip_profile.h
new file mode 100644
index 0000000..ff18239
--- /dev/null
+++ b/third_party/rocm/include/hip/hip_profile.h
@@ -0,0 +1,27 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_HIP_PROFILE_H
+#define HIP_INCLUDE_HIP_HIP_PROFILE_H
+
+#define HIP_SCOPED_MARKER(markerName, group) /* expands to nothing */
+#define HIP_BEGIN_MARKER(markerName, group) /* expands to nothing */
+#define HIP_END_MARKER() /* expands to nothing */
+
+#endif
diff --git a/third_party/rocm/include/hip/hip_runtime.h b/third_party/rocm/include/hip/hip_runtime.h
new file mode 100644
index 0000000..c785f8d
--- /dev/null
+++ b/third_party/rocm/include/hip/hip_runtime.h
@@ -0,0 +1,72 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+//! HIP = Heterogeneous-compute Interface for Portability
+//!
+//! Defines an extremely thin runtime layer that allows source code to be compiled unmodified
+//! through either AMD HCC or NVCC. Key features tend to be in the spirit
+//! and terminology of CUDA, but with a portable path to other accelerators as well:
+//
+//! Both paths support rich C++ features including classes, templates, lambdas, etc.
+//! Runtime API is C
+//! Memory management is based on pure pointers and resembles malloc/free/copy.
+//
+//! hip_runtime.h : includes everything in hip_api.h, plus math builtins and kernel launch
+//! macros. hip_runtime_api.h : Defines HIP API. This is a C header file and does not use any C++
+//! features.
+
+#ifndef HIP_INCLUDE_HIP_HIP_RUNTIME_H
+#define HIP_INCLUDE_HIP_HIP_RUNTIME_H
+
+#if (__gfx1010__ || __gfx1011__ || __gfx1012__ || __gfx1030__ || __gfx1031__) && __AMDGCN_WAVEFRONT_SIZE == 64
+#error HIP is not supported on GFX10 with wavefront size 64
+#endif
+
+// Some standard header files, these are included by hc.hpp and so want to make them avail on both
+// paths to provide a consistent include env and avoid "missing symbol" errors that only appears
+// on NVCC path:
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+
+#if __cplusplus > 199711L
+#include <thread>
+#endif
+
+#include <hip/hip_version.h>
+#include <hip/hip_common.h>
+
+#if defined(__HIP_PLATFORM_HCC__) && !defined(__HIP_PLATFORM_NVCC__)
+#include <hip/hcc_detail/hip_runtime.h>
+#elif defined(__HIP_PLATFORM_NVCC__) && !defined(__HIP_PLATFORM_HCC__)
+#include <hip/nvcc_detail/hip_runtime.h>
+#else
+#error("Must define exactly one of __HIP_PLATFORM_HCC__ or __HIP_PLATFORM_NVCC__");
+#endif
+
+
+#include <hip/hip_runtime_api.h>
+#include <hip/hip_vector_types.h>
+#include <hip/library_types.h>
+
+#endif
diff --git a/third_party/rocm/include/hip/hip_runtime_api.h b/third_party/rocm/include/hip/hip_runtime_api.h
new file mode 100644
index 0000000..ed9a288
--- /dev/null
+++ b/third_party/rocm/include/hip/hip_runtime_api.h
@@ -0,0 +1,423 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/**
+ * @file hip_runtime_api.h
+ *
+ * @brief Defines the API signatures for HIP runtime.
+ * This file can be compiled with a standard compiler.
+ */
+
+#ifndef HIP_INCLUDE_HIP_HIP_RUNTIME_API_H
+#define HIP_INCLUDE_HIP_HIP_RUNTIME_API_H
+
+
+#include <string.h> // for getDeviceProp
+#include <hip/hip_version.h>
+#include <hip/hip_common.h>
+
+enum { // unnamed enum of HIP_* status constants, numbered consecutively from 0
+ HIP_SUCCESS = 0,
+ HIP_ERROR_INVALID_VALUE = 1,
+ HIP_ERROR_NOT_INITIALIZED = 2,
+ HIP_ERROR_LAUNCH_OUT_OF_RESOURCES = 3
+};
+
+typedef struct { // One-bit capability flags; each field is 1 when the feature is available.
+ // 32-bit Atomics
+ unsigned hasGlobalInt32Atomics : 1; ///< 32-bit integer atomics for global memory.
+ unsigned hasGlobalFloatAtomicExch : 1; ///< 32-bit float atomic exch for global memory.
+ unsigned hasSharedInt32Atomics : 1; ///< 32-bit integer atomics for shared memory.
+ unsigned hasSharedFloatAtomicExch : 1; ///< 32-bit float atomic exch for shared memory.
+ unsigned hasFloatAtomicAdd : 1; ///< 32-bit float atomic add in global and shared memory.
+
+ // 64-bit Atomics
+ unsigned hasGlobalInt64Atomics : 1; ///< 64-bit integer atomics for global memory.
+ unsigned hasSharedInt64Atomics : 1; ///< 64-bit integer atomics for shared memory.
+
+ // Doubles
+ unsigned hasDoubles : 1; ///< Double-precision floating point.
+
+ // Warp cross-lane operations
+ unsigned hasWarpVote : 1; ///< Warp vote instructions (__any, __all).
+ unsigned hasWarpBallot : 1; ///< Warp ballot instructions (__ballot).
+ unsigned hasWarpShuffle : 1; ///< Warp shuffle operations. (__shfl_*).
+ unsigned hasFunnelShift : 1; ///< Funnel two words into one with shift&mask caps.
+
+ // Sync
+ unsigned hasThreadFenceSystem : 1; ///< __threadfence_system.
+ unsigned hasSyncThreadsExt : 1; ///< __syncthreads_count, syncthreads_and, syncthreads_or.
+
+ // Misc
+ unsigned hasSurfaceFuncs : 1; ///< Surface functions.
+ unsigned has3dGrid : 1; ///< Grid and group dims are 3D (rather than 2D).
+ unsigned hasDynamicParallelism : 1; ///< Dynamic parallelism.
+} hipDeviceArch_t;
+
+
+//---
+// Common headers for both NVCC and HCC paths:
+
+/**
+ * hipDeviceProp_t: per-device properties and capability fields.
+ *
+ */
+typedef struct hipDeviceProp_t {
+ char name[256]; ///< Device name.
+ size_t totalGlobalMem; ///< Size of global memory region (in bytes).
+ size_t sharedMemPerBlock; ///< Size of shared memory region (in bytes).
+ int regsPerBlock; ///< Registers per block.
+ int warpSize; ///< Warp size.
+ int maxThreadsPerBlock; ///< Max work items per work group or workgroup max size.
+ int maxThreadsDim[3]; ///< Max number of threads in each dimension (XYZ) of a block.
+ int maxGridSize[3]; ///< Max grid dimensions (XYZ).
+ int clockRate; ///< Max clock frequency of the multiProcessors in khz.
+ int memoryClockRate; ///< Max global memory clock frequency in khz.
+ int memoryBusWidth; ///< Global memory bus width in bits.
+ size_t totalConstMem; ///< Size of constant memory region (in bytes).
+ int major; ///< Major compute capability. On HCC, this is an approximation and features may
+ ///< differ from CUDA CC. See the arch feature flags for portable ways to query
+ ///< feature caps.
+ int minor; ///< Minor compute capability. On HCC, this is an approximation and features may
+ ///< differ from CUDA CC. See the arch feature flags for portable ways to query
+ ///< feature caps.
+ int multiProcessorCount; ///< Number of multi-processors (compute units).
+ int l2CacheSize; ///< L2 cache size.
+ int maxThreadsPerMultiProcessor; ///< Maximum resident threads per multi-processor.
+ int computeMode; ///< Compute mode.
+ int clockInstructionRate; ///< Frequency in khz of the timer used by the device-side "clock*"
+ ///< instructions. New for HIP.
+ hipDeviceArch_t arch; ///< Architectural feature flags. New for HIP.
+ int concurrentKernels; ///< Device can possibly execute multiple kernels concurrently.
+ int pciDomainID; ///< PCI Domain ID
+ int pciBusID; ///< PCI Bus ID.
+ int pciDeviceID; ///< PCI Device ID.
+ size_t maxSharedMemoryPerMultiProcessor; ///< Maximum Shared Memory Per Multiprocessor.
+ int isMultiGpuBoard; ///< 1 if device is on a multi-GPU board, 0 if not.
+ int canMapHostMemory; ///< Check whether HIP can map host memory
+ int gcnArch; ///< DEPRECATED: use gcnArchName instead
+ char gcnArchName[256]; ///< AMD GCN Arch Name.
+ int integrated; ///< APU vs dGPU
+ int cooperativeLaunch; ///< HIP device supports cooperative launch
+ int cooperativeMultiDeviceLaunch; ///< HIP device supports cooperative launch on multiple devices
+ int maxTexture1DLinear; ///< Maximum size for 1D textures bound to linear memory
+ int maxTexture1D; ///< Maximum number of elements in 1D images
+ int maxTexture2D[2]; ///< Maximum dimensions (width, height) of 2D images, in image elements
+ int maxTexture3D[3]; ///< Maximum dimensions (width, height, depth) of 3D images, in image elements
+ unsigned int* hdpMemFlushCntl; ///< Address of HDP_MEM_COHERENCY_FLUSH_CNTL register
+ unsigned int* hdpRegFlushCntl; ///< Address of HDP_REG_COHERENCY_FLUSH_CNTL register
+ size_t memPitch; ///< Maximum pitch in bytes allowed by memory copies
+ size_t textureAlignment; ///< Alignment requirement for textures
+ size_t texturePitchAlignment; ///< Pitch alignment requirement for texture references bound to pitched memory
+ int kernelExecTimeoutEnabled; ///< Run time limit for kernels executed on the device
+ int ECCEnabled; ///< Device has ECC support enabled
+ int tccDriver; ///< 1:If device is Tesla device using TCC driver, else 0
+ int cooperativeMultiDeviceUnmatchedFunc; ///< HIP device supports cooperative launch on multiple
+ ///< devices with unmatched functions
+ int cooperativeMultiDeviceUnmatchedGridDim; ///< HIP device supports cooperative launch on multiple
+ ///< devices with unmatched grid dimensions
+ int cooperativeMultiDeviceUnmatchedBlockDim; ///< HIP device supports cooperative launch on multiple
+ ///< devices with unmatched block dimensions
+ int cooperativeMultiDeviceUnmatchedSharedMem; ///< HIP device supports cooperative launch on multiple
+ ///< devices with unmatched shared memories
+ int isLargeBar; ///< 1: if it is a large PCI bar device, else 0
+ int asicRevision; ///< Revision of the GPU in this device
+ int managedMemory; ///< Device supports allocating managed memory on this system
+ int directManagedMemAccessFromHost; ///< Host can directly access managed memory on the device without migration
+ int concurrentManagedAccess; ///< Device can coherently access managed memory concurrently with the CPU
+ int pageableMemoryAccess; ///< Device supports coherently accessing pageable memory
+ ///< without calling hipHostRegister on it
+ int pageableMemoryAccessUsesHostPageTables; ///< Device accesses pageable memory via the host's page tables
+} hipDeviceProp_t;
+
+
+/**
+ * Kind of memory a pointer refers to (reported through pointer attributes).
+ */
+typedef enum hipMemoryType {
+ hipMemoryTypeHost = 0, ///< Memory is physically located on host
+ hipMemoryTypeDevice = 1, ///< Memory is physically located on device. (see deviceId for specific
+ ///< device)
+ hipMemoryTypeArray = 2, ///< Array memory, physically located on device. (see deviceId for specific
+ ///< device)
+ hipMemoryTypeUnified = 3 ///< Not used currently
+}hipMemoryType;
+
+
+/**
+ * Pointer attributes: describes the memory a pointer refers to.
+ */
+typedef struct hipPointerAttribute_t {
+ enum hipMemoryType memoryType; ///< Kind of memory (see hipMemoryType)
+ int device; ///< Presumably the device the memory belongs to - TODO confirm
+ void* devicePointer; ///< Presumably the device-side address - TODO confirm
+ void* hostPointer; ///< Presumably the host-side address - TODO confirm
+ int isManaged; ///< Presumably non-zero for managed memory - TODO confirm
+ unsigned allocationFlags; /* flags specified when memory was allocated */
+ /* peers? */
+} hipPointerAttribute_t;
+
+
+// hack to get these to show up in Doxygen:
+/**
+ * @defgroup GlobalDefs Global enum and defines
+ * @{
+ *
+ */
+
+// Ignoring error-code return values from hip APIs is discouraged. When compiled as C++17 or
+// later, __HIP_NODISCARD expands to [[nodiscard]] so that doing so yields a warning.
+#if __cplusplus >= 201703L
+#define __HIP_NODISCARD [[nodiscard]]
+#else // older C++ standards, or C where __cplusplus is undefined: expands to nothing
+#define __HIP_NODISCARD
+#endif
+
+/*
+ * @brief hipError_t
+ * @enum
+ * @ingroup Enumerations
+ */
+// Developer note - when updating these, update the hipErrorName and hipErrorString functions in
+// NVCC and HCC paths Also update the hipCUDAErrorTohipError function in NVCC path.
+
+typedef enum __HIP_NODISCARD hipError_t {
+ hipSuccess = 0, ///< Successful completion.
+ hipErrorInvalidValue = 1, ///< One or more of the parameters passed to the API call is NULL
+ ///< or not in an acceptable range.
+ hipErrorOutOfMemory = 2,
+ // Deprecated
+ hipErrorMemoryAllocation = 2, ///< Memory allocation error.
+ hipErrorNotInitialized = 3,
+ // Deprecated
+ hipErrorInitializationError = 3, ///< Deprecated alias; same value as hipErrorNotInitialized.
+ hipErrorDeinitialized = 4,
+ hipErrorProfilerDisabled = 5,
+ hipErrorProfilerNotInitialized = 6,
+ hipErrorProfilerAlreadyStarted = 7,
+ hipErrorProfilerAlreadyStopped = 8,
+ hipErrorInvalidConfiguration = 9,
+ hipErrorInvalidSymbol = 13,
+ hipErrorInvalidDevicePointer = 17, ///< Invalid Device Pointer
+ hipErrorInvalidMemcpyDirection = 21, ///< Invalid memory copy direction
+ hipErrorInsufficientDriver = 35,
+ hipErrorMissingConfiguration = 52,
+ hipErrorPriorLaunchFailure = 53,
+ hipErrorInvalidDeviceFunction = 98,
+ hipErrorNoDevice = 100, ///< Call to hipGetDeviceCount returned 0 devices
+ hipErrorInvalidDevice = 101, ///< DeviceID must be in range 0...#compute-devices.
+ hipErrorInvalidImage = 200,
+ hipErrorInvalidContext = 201, ///< Produced when input context is invalid.
+ hipErrorContextAlreadyCurrent = 202,
+ hipErrorMapFailed = 205,
+ // Deprecated
+ hipErrorMapBufferObjectFailed = 205, ///< Produced when the IPC memory attach failed from ROCr.
+ hipErrorUnmapFailed = 206,
+ hipErrorArrayIsMapped = 207,
+ hipErrorAlreadyMapped = 208,
+ hipErrorNoBinaryForGpu = 209,
+ hipErrorAlreadyAcquired = 210,
+ hipErrorNotMapped = 211,
+ hipErrorNotMappedAsArray = 212,
+ hipErrorNotMappedAsPointer = 213,
+ hipErrorECCNotCorrectable = 214,
+ hipErrorUnsupportedLimit = 215,
+ hipErrorContextAlreadyInUse = 216,
+ hipErrorPeerAccessUnsupported = 217,
+ hipErrorInvalidKernelFile = 218, ///< In CUDA DRV, it is CUDA_ERROR_INVALID_PTX
+ hipErrorInvalidGraphicsContext = 219,
+ hipErrorInvalidSource = 300,
+ hipErrorFileNotFound = 301,
+ hipErrorSharedObjectSymbolNotFound = 302,
+ hipErrorSharedObjectInitFailed = 303,
+ hipErrorOperatingSystem = 304,
+ hipErrorInvalidHandle = 400,
+ // Deprecated
+ hipErrorInvalidResourceHandle = 400, ///< Resource handle (hipEvent_t or hipStream_t) invalid.
+ hipErrorNotFound = 500,
+ hipErrorNotReady = 600, ///< Indicates that asynchronous operations enqueued earlier are not
+ ///< ready. This is not actually an error, but is used to distinguish
+ ///< from hipSuccess (which indicates completion). APIs that return
+ ///< this error include hipEventQuery and hipStreamQuery.
+ hipErrorIllegalAddress = 700,
+ hipErrorLaunchOutOfResources = 701, ///< Out of resources error.
+ hipErrorLaunchTimeOut = 702,
+ hipErrorPeerAccessAlreadyEnabled =
+ 704, ///< Peer access was already enabled from the current device.
+ hipErrorPeerAccessNotEnabled =
+ 705, ///< Peer access was never enabled from the current device.
+ hipErrorSetOnActiveProcess = 708,
+ hipErrorAssert = 710, ///< Produced when the kernel calls assert.
+ hipErrorHostMemoryAlreadyRegistered =
+ 712, ///< Produced when trying to lock a page-locked memory.
+ hipErrorHostMemoryNotRegistered =
+ 713, ///< Produced when trying to unlock a non-page-locked memory.
+ hipErrorLaunchFailure =
+ 719, ///< An exception occurred on the device while executing a kernel.
+ hipErrorCooperativeLaunchTooLarge =
+ 720, ///< This error indicates that the number of blocks launched per grid for a kernel
+ ///< that was launched via cooperative launch APIs exceeds the maximum number of
+ ///< allowed blocks for the current device
+ hipErrorNotSupported = 801, ///< Produced when the hip API is not supported/implemented
+ hipErrorUnknown = 999, ///< Unknown error.
+ // HSA Runtime Error Codes start here.
+ hipErrorRuntimeMemory = 1052, ///< HSA runtime memory call returned error. Typically not seen
+ ///< in production systems.
+ hipErrorRuntimeOther = 1053, ///< HSA runtime call other than memory returned error. Typically
+ ///< not seen in production systems.
+ hipErrorTbd ///< Marker that more error codes are needed (implicit value 1054).
+} hipError_t;
+
+#undef __HIP_NODISCARD
+
+/*
+ * @brief hipDeviceAttribute_t
+ * @enum
+ * @ingroup Enumerations
+ */
+typedef enum hipDeviceAttribute_t {
+ hipDeviceAttributeMaxThreadsPerBlock, ///< Maximum number of threads per block.
+ hipDeviceAttributeMaxBlockDimX, ///< Maximum x-dimension of a block.
+ hipDeviceAttributeMaxBlockDimY, ///< Maximum y-dimension of a block.
+ hipDeviceAttributeMaxBlockDimZ, ///< Maximum z-dimension of a block.
+ hipDeviceAttributeMaxGridDimX, ///< Maximum x-dimension of a grid.
+ hipDeviceAttributeMaxGridDimY, ///< Maximum y-dimension of a grid.
+ hipDeviceAttributeMaxGridDimZ, ///< Maximum z-dimension of a grid.
+ hipDeviceAttributeMaxSharedMemoryPerBlock, ///< Maximum shared memory available per block in
+ ///< bytes.
+ hipDeviceAttributeTotalConstantMemory, ///< Constant memory size in bytes.
+ hipDeviceAttributeWarpSize, ///< Warp size in threads.
+ hipDeviceAttributeMaxRegistersPerBlock, ///< Maximum number of 32-bit registers available to a
+ ///< thread block. This number is shared by all thread
+ ///< blocks simultaneously resident on a
+ ///< multiprocessor.
+ hipDeviceAttributeClockRate, ///< Peak clock frequency in kilohertz.
+ hipDeviceAttributeMemoryClockRate, ///< Peak memory clock frequency in kilohertz.
+ hipDeviceAttributeMemoryBusWidth, ///< Global memory bus width in bits.
+ hipDeviceAttributeMultiprocessorCount, ///< Number of multiprocessors on the device.
+ hipDeviceAttributeComputeMode, ///< Compute mode that device is currently in.
+ hipDeviceAttributeL2CacheSize, ///< Size of L2 cache in bytes. 0 if the device doesn't have L2
+ ///< cache.
+ hipDeviceAttributeMaxThreadsPerMultiProcessor, ///< Maximum resident threads per
+ ///< multiprocessor.
+ hipDeviceAttributeComputeCapabilityMajor, ///< Major compute capability version number.
+ hipDeviceAttributeComputeCapabilityMinor, ///< Minor compute capability version number.
+ hipDeviceAttributeConcurrentKernels, ///< Device can possibly execute multiple kernels
+ ///< concurrently.
+ hipDeviceAttributePciBusId, ///< PCI Bus ID.
+ hipDeviceAttributePciDeviceId, ///< PCI Device ID.
+ hipDeviceAttributeMaxSharedMemoryPerMultiprocessor, ///< Maximum Shared Memory Per
+ ///< Multiprocessor.
+ hipDeviceAttributeIsMultiGpuBoard, ///< Multiple GPU devices.
+ hipDeviceAttributeIntegrated, ///< iGPU
+ hipDeviceAttributeCooperativeLaunch, ///< Support cooperative launch
+ hipDeviceAttributeCooperativeMultiDeviceLaunch, ///< Support cooperative launch on multiple devices
+ hipDeviceAttributeMaxTexture1DWidth, ///< Maximum number of elements in 1D images
+ hipDeviceAttributeMaxTexture2DWidth, ///< Maximum dimension width of 2D images in image elements
+ hipDeviceAttributeMaxTexture2DHeight, ///< Maximum dimension height of 2D images in image elements
+ hipDeviceAttributeMaxTexture3DWidth, ///< Maximum dimension width of 3D images in image elements
+ hipDeviceAttributeMaxTexture3DHeight, ///< Maximum dimensions height of 3D images in image elements
+ hipDeviceAttributeMaxTexture3DDepth, ///< Maximum dimensions depth of 3D images in image elements
+
+ hipDeviceAttributeHdpMemFlushCntl, ///< Address of the HDP_MEM_COHERENCY_FLUSH_CNTL register
+ hipDeviceAttributeHdpRegFlushCntl, ///< Address of the HDP_REG_COHERENCY_FLUSH_CNTL register
+
+ hipDeviceAttributeMaxPitch, ///< Maximum pitch in bytes allowed by memory copies
+ hipDeviceAttributeTextureAlignment, ///< Alignment requirement for textures
+ hipDeviceAttributeTexturePitchAlignment, ///< Pitch alignment requirement for 2D texture references bound to pitched memory
+ hipDeviceAttributeKernelExecTimeout, ///< Run time limit for kernels executed on the device
+ hipDeviceAttributeCanMapHostMemory, ///< Device can map host memory into device address space
+ hipDeviceAttributeEccEnabled, ///< Device has ECC support enabled
+
+ hipDeviceAttributeCooperativeMultiDeviceUnmatchedFunc, ///< Supports cooperative launch on multiple
+ ///< devices with unmatched functions
+ hipDeviceAttributeCooperativeMultiDeviceUnmatchedGridDim, ///< Supports cooperative launch on multiple
+ ///< devices with unmatched grid dimensions
+ hipDeviceAttributeCooperativeMultiDeviceUnmatchedBlockDim, ///< Supports cooperative launch on multiple
+ ///< devices with unmatched block dimensions
+ hipDeviceAttributeCooperativeMultiDeviceUnmatchedSharedMem, ///< Supports cooperative launch on multiple
+ ///< devices with unmatched shared memories
+ hipDeviceAttributeAsicRevision, ///< Revision of the GPU in this device
+ hipDeviceAttributeManagedMemory, ///< Device supports allocating managed memory on this system
+ hipDeviceAttributeDirectManagedMemAccessFromHost, ///< Host can directly access managed memory on
+ ///< the device without migration
+ hipDeviceAttributeConcurrentManagedAccess, ///< Device can coherently access managed memory
+ ///< concurrently with the CPU
+ hipDeviceAttributePageableMemoryAccess, ///< Device supports coherently accessing pageable memory
+ ///< without calling hipHostRegister on it
+ hipDeviceAttributePageableMemoryAccessUsesHostPageTables, ///< Device accesses pageable memory via
+ ///< the host's page tables
+} hipDeviceAttribute_t;
+
+enum hipComputeMode {
+ hipComputeModeDefault = 0,
+ hipComputeModeExclusive = 1,
+ hipComputeModeProhibited = 2,
+ hipComputeModeExclusiveProcess = 3
+};
+
+/**
+ * @}
+ */
+
+#if defined(__HIP_PLATFORM_HCC__) && !defined(__HIP_PLATFORM_NVCC__)
+#include "hip/hcc_detail/hip_runtime_api.h"
+#elif defined(__HIP_PLATFORM_NVCC__) && !defined(__HIP_PLATFORM_HCC__)
+#include "hip/nvcc_detail/hip_runtime_api.h"
+#else
+#error("Must define exactly one of __HIP_PLATFORM_HCC__ or __HIP_PLATFORM_NVCC__");
+#endif
+
+
+/**
+ * @brief: C++ wrapper for hipMalloc
+ *
+ * Perform automatic type conversion to eliminate need for excessive typecasting (ie void**)
+ *
+ * __HIP_DISABLE_CPP_FUNCTIONS__ macro can be defined to suppress these
+ * wrappers. It is useful for applications which need to obtain decltypes of
+ * HIP runtime APIs.
+ *
+ * @see hipMalloc
+ */
+#if defined(__cplusplus) && !defined(__HIP_DISABLE_CPP_FUNCTIONS__)
+// Typed overloads: each one casts its T** argument and forwards to the void** API of the same name.
+template <typename T>
+static inline hipError_t hipMalloc(T** devPtr, size_t size) {
+    void** untyped = (void**)devPtr;
+    return hipMalloc(untyped, size);
+}
+
+template <typename T>
+static inline hipError_t hipHostMalloc(T** ptr, size_t size, unsigned int flags = hipHostMallocDefault) {
+    void** untyped = (void**)ptr;
+    return hipHostMalloc(untyped, size, flags);
+}
+
+template <typename T>
+static inline hipError_t hipMallocManaged(T** devPtr, size_t size, unsigned int flags = hipMemAttachGlobal) {
+    void** untyped = (void**)devPtr;
+    return hipMallocManaged(untyped, size, flags);
+}
+#endif
+
+#endif
diff --git a/third_party/rocm/include/hip/hip_texture_types.h b/third_party/rocm/include/hip/hip_texture_types.h
new file mode 100644
index 0000000..a7feab0
--- /dev/null
+++ b/third_party/rocm/include/hip/hip_texture_types.h
@@ -0,0 +1,36 @@
+/*
+Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+
+#ifndef HIP_INCLUDE_HIP_HIP_TEXTURE_TYPES_H
+#define HIP_INCLUDE_HIP_HIP_TEXTURE_TYPES_H
+
+#if defined(__HIP_PLATFORM_HCC__) && !defined(__HIP_PLATFORM_NVCC__)
+#include <hip/hcc_detail/hip_texture_types.h>
+#elif defined(__HIP_PLATFORM_NVCC__) && !defined(__HIP_PLATFORM_HCC__)
+#include <hip/nvcc_detail/hip_texture_types.h>
+#else
+#error("Must define exactly one of __HIP_PLATFORM_HCC__ or __HIP_PLATFORM_NVCC__");
+#endif
+
+
+#endif
diff --git a/third_party/rocm/include/hip/hip_vector_types.h b/third_party/rocm/include/hip/hip_vector_types.h
new file mode 100644
index 0000000..c1a0373
--- /dev/null
+++ b/third_party/rocm/include/hip/hip_vector_types.h
@@ -0,0 +1,41 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+//! hip_vector_types.h : Defines the HIP vector types.
+
+#ifndef HIP_INCLUDE_HIP_HIP_VECTOR_TYPES_H
+#define HIP_INCLUDE_HIP_HIP_VECTOR_TYPES_H
+
+#include <hip/hip_common.h>
+
+
+#if defined(__HIP_PLATFORM_HCC__) && !defined(__HIP_PLATFORM_NVCC__)
+#if __cplusplus
+#include <hip/hcc_detail/hip_vector_types.h>
+#endif
+#elif defined(__HIP_PLATFORM_NVCC__) && !defined(__HIP_PLATFORM_HCC__)
+#include <vector_types.h>
+#else
+#error("Must define exactly one of __HIP_PLATFORM_HCC__ or __HIP_PLATFORM_NVCC__");
+#endif
+
+#endif
diff --git a/third_party/rocm/include/hip/hip_version.h b/third_party/rocm/include/hip/hip_version.h
new file mode 100644
index 0000000..2fdb247
--- /dev/null
+++ b/third_party/rocm/include/hip/hip_version.h
@@ -0,0 +1,14 @@
+// Auto-generated by cmake
+
+#ifndef HIP_VERSION_H
+#define HIP_VERSION_H
+
+#define HIP_VERSION_MAJOR 4
+#define HIP_VERSION_MINOR 1
+#define HIP_VERSION_PATCH 21114
+#define HIP_VERSION (HIP_VERSION_MAJOR * 100 + HIP_VERSION_MINOR)
+
+#define __HIP_HAS_GET_PCH 1
+
+#endif
+
diff --git a/third_party/rocm/include/hip/hiprtc.h b/third_party/rocm/include/hip/hiprtc.h
new file mode 100644
index 0000000..22d78d2
--- /dev/null
+++ b/third_party/rocm/include/hip/hiprtc.h
@@ -0,0 +1,32 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#pragma once
+
+#include <hip/hip_common.h>
+
+#if defined(__HIP_PLATFORM_HCC__) && !defined(__HIP_PLATFORM_NVCC__)
+ #include <hip/hcc_detail/hiprtc.h>
+#elif defined(__HIP_PLATFORM_NVCC__) && !defined(__HIP_PLATFORM_HCC__)
+ #include <hip/nvcc_detail/nvrtc.h>
+#else
+ #error("Must define exactly one of __HIP_PLATFORM_HCC__ or __HIP_PLATFORM_NVCC__");
+#endif
\ No newline at end of file
diff --git a/third_party/rocm/include/hip/library_types.h b/third_party/rocm/include/hip/library_types.h
new file mode 100644
index 0000000..4a988df
--- /dev/null
+++ b/third_party/rocm/include/hip/library_types.h
@@ -0,0 +1,36 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_LIBRARY_TYPES_H
+#define HIP_INCLUDE_HIP_LIBRARY_TYPES_H
+
+#include <hip/hip_common.h>
+
+#if defined(__HIP_PLATFORM_HCC__) && !defined(__HIP_PLATFORM_NVCC__)
+#include <hip/hcc_detail/library_types.h>
+#elif defined(__HIP_PLATFORM_NVCC__) && !defined(__HIP_PLATFORM_HCC__)
+#include "library_types.h"
+#else
+#error("Must define exactly one of __HIP_PLATFORM_HCC__ or __HIP_PLATFORM_NVCC__");
+#endif
+
+#endif
diff --git a/third_party/rocm/include/hip/math_functions.h b/third_party/rocm/include/hip/math_functions.h
new file mode 100644
index 0000000..2dfec45
--- /dev/null
+++ b/third_party/rocm/include/hip/math_functions.h
@@ -0,0 +1,40 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_MATH_FUNCTIONS_H
+#define HIP_INCLUDE_HIP_MATH_FUNCTIONS_H
+
+// Some standard header files. These are included by hc.hpp, so we make them available on both
+// paths to provide a consistent include environment and avoid "missing symbol" errors that only
+// appear on the NVCC path:
+
+#include <hip/hip_common.h>
+
+#if defined(__HIP_PLATFORM_HCC__) && !defined(__HIP_PLATFORM_NVCC__)
+#include <hip/hcc_detail/math_functions.h>
+#elif defined(__HIP_PLATFORM_NVCC__) && !defined(__HIP_PLATFORM_HCC__)
+//#include <hip/nvcc_detail/math_functions.h>
+#else
+#error("Must define exactly one of __HIP_PLATFORM_HCC__ or __HIP_PLATFORM_NVCC__");
+#endif
+
+#endif
diff --git a/third_party/rocm/include/hip/nvcc_detail/channel_descriptor.h b/third_party/rocm/include/hip/nvcc_detail/channel_descriptor.h
new file mode 100644
index 0000000..c3e9dc1
--- /dev/null
+++ b/third_party/rocm/include/hip/nvcc_detail/channel_descriptor.h
@@ -0,0 +1,28 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_NVCC_DETAIL_CHANNEL_DESCRIPTOR_H
+#define HIP_INCLUDE_HIP_NVCC_DETAIL_CHANNEL_DESCRIPTOR_H
+
+#include "channel_descriptor.h"
+
+#endif
diff --git a/third_party/rocm/include/hip/nvcc_detail/hip_complex.h b/third_party/rocm/include/hip/nvcc_detail/hip_complex.h
new file mode 100644
index 0000000..d0e45d2
--- /dev/null
+++ b/third_party/rocm/include/hip/nvcc_detail/hip_complex.h
@@ -0,0 +1,119 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_NVCC_DETAIL_HIP_COMPLEX_H
+#define HIP_INCLUDE_HIP_NVCC_DETAIL_HIP_COMPLEX_H
+
+#include "cuComplex.h"
+
+typedef cuFloatComplex hipFloatComplex;
+
+__device__ __host__ static inline float hipCrealf(hipFloatComplex z) { return cuCrealf(z); }
+
+__device__ __host__ static inline float hipCimagf(hipFloatComplex z) { return cuCimagf(z); }
+
+__device__ __host__ static inline hipFloatComplex make_hipFloatComplex(float a, float b) {
+ return make_cuFloatComplex(a, b);
+}
+
+__device__ __host__ static inline hipFloatComplex hipConjf(hipFloatComplex z) { return cuConjf(z); }
+
+__device__ __host__ static inline float hipCsqabsf(hipFloatComplex z) {
+ return cuCabsf(z) * cuCabsf(z);
+}
+
+__device__ __host__ static inline hipFloatComplex hipCaddf(hipFloatComplex p, hipFloatComplex q) {
+ return cuCaddf(p, q);
+}
+
+__device__ __host__ static inline hipFloatComplex hipCsubf(hipFloatComplex p, hipFloatComplex q) {
+ return cuCsubf(p, q);
+}
+
+__device__ __host__ static inline hipFloatComplex hipCmulf(hipFloatComplex p, hipFloatComplex q) {
+ return cuCmulf(p, q);
+}
+
+__device__ __host__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatComplex q) {
+ return cuCdivf(p, q);
+}
+
+__device__ __host__ static inline float hipCabsf(hipFloatComplex z) { return cuCabsf(z); }
+
+typedef cuDoubleComplex hipDoubleComplex;
+
+__device__ __host__ static inline double hipCreal(hipDoubleComplex z) { return cuCreal(z); }
+
+__device__ __host__ static inline double hipCimag(hipDoubleComplex z) { return cuCimag(z); }
+
+__device__ __host__ static inline hipDoubleComplex make_hipDoubleComplex(double a, double b) {
+ return make_cuDoubleComplex(a, b);
+}
+
+__device__ __host__ static inline hipDoubleComplex hipConj(hipDoubleComplex z) { return cuConj(z); }
+
+__device__ __host__ static inline double hipCsqabs(hipDoubleComplex z) {
+ return cuCabs(z) * cuCabs(z);
+}
+
+__device__ __host__ static inline hipDoubleComplex hipCadd(hipDoubleComplex p, hipDoubleComplex q) {
+ return cuCadd(p, q);
+}
+
+__device__ __host__ static inline hipDoubleComplex hipCsub(hipDoubleComplex p, hipDoubleComplex q) {
+ return cuCsub(p, q);
+}
+
+__device__ __host__ static inline hipDoubleComplex hipCmul(hipDoubleComplex p, hipDoubleComplex q) {
+ return cuCmul(p, q);
+}
+
+__device__ __host__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleComplex q) {
+ return cuCdiv(p, q);
+}
+
+__device__ __host__ static inline double hipCabs(hipDoubleComplex z) { return cuCabs(z); }
+
+typedef cuFloatComplex hipComplex;
+
+__device__ __host__ static inline hipComplex make_Complex(float x, float y) {
+ return make_cuComplex(x, y);
+}
+
+__device__ __host__ static inline hipFloatComplex hipComplexDoubleToFloat(hipDoubleComplex z) {
+ return cuComplexDoubleToFloat(z);
+}
+
+__device__ __host__ static inline hipDoubleComplex hipComplexFloatToDouble(hipFloatComplex z) {
+ return cuComplexFloatToDouble(z);
+}
+
+__device__ __host__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q, hipComplex r) {
+ return cuCfmaf(p, q, r);
+}
+
+__device__ __host__ static inline hipDoubleComplex hipCfma(hipDoubleComplex p, hipDoubleComplex q,
+ hipDoubleComplex r) {
+ return cuCfma(p, q, r);
+}
+
+#endif
diff --git a/third_party/rocm/include/hip/nvcc_detail/hip_cooperative_groups.h b/third_party/rocm/include/hip/nvcc_detail/hip_cooperative_groups.h
new file mode 100644
index 0000000..113e600
--- /dev/null
+++ b/third_party/rocm/include/hip/nvcc_detail/hip_cooperative_groups.h
@@ -0,0 +1,12 @@
+#ifndef HIP_INCLUDE_HIP_NVCC_DETAIL_HIP_COOPERATIVE_GROUPS_H
+#define HIP_INCLUDE_HIP_NVCC_DETAIL_HIP_COOPERATIVE_GROUPS_H
+
+// Include CUDA headers
+#include <cuda_runtime.h>
+#include <cooperative_groups.h>
+
+// Include HIP wrapper headers around CUDA
+#include <hip/hip_runtime.h>
+#include <hip/hip_runtime_api.h>
+
+#endif // HIP_INCLUDE_HIP_NVCC_DETAIL_HIP_COOPERATIVE_GROUPS_H
diff --git a/third_party/rocm/include/hip/nvcc_detail/hip_runtime.h b/third_party/rocm/include/hip/nvcc_detail/hip_runtime.h
new file mode 100644
index 0000000..e7c3eaf
--- /dev/null
+++ b/third_party/rocm/include/hip/nvcc_detail/hip_runtime.h
@@ -0,0 +1,123 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_NVCC_DETAIL_HIP_RUNTIME_H
+#define HIP_INCLUDE_HIP_NVCC_DETAIL_HIP_RUNTIME_H
+
+#include <cuda_runtime.h>
+
+#include <hip/hip_runtime_api.h>
+
+#define HIP_KERNEL_NAME(...) __VA_ARGS__
+
+typedef int hipLaunchParm;
+
+#define hipLaunchKernelGGLInternal(kernelName, numBlocks, numThreads, memPerBlock, streamId, ...) \
+ do { \
+ kernelName<<<numBlocks, numThreads, memPerBlock, streamId>>>(__VA_ARGS__); \
+ } while (0)
+
+#define hipLaunchKernelGGL(kernelName, ...) hipLaunchKernelGGLInternal((kernelName), __VA_ARGS__)
+
+#define hipReadModeElementType cudaReadModeElementType
+
+#ifdef __CUDA_ARCH__
+
+
+// 32-bit Atomics:
+#define __HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__ (__CUDA_ARCH__ >= 110)
+#define __HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__ (__CUDA_ARCH__ >= 110)
+#define __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__ (__CUDA_ARCH__ >= 120)
+#define __HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__ (__CUDA_ARCH__ >= 120)
+#define __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__ (__CUDA_ARCH__ >= 200)
+
+// 64-bit Atomics:
+#define __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__ (__CUDA_ARCH__ >= 200)
+#define __HIP_ARCH_HAS_SHARED_INT64_ATOMICS__ (__CUDA_ARCH__ >= 120)
+
+// Doubles
+#define __HIP_ARCH_HAS_DOUBLES__ (__CUDA_ARCH__ >= 120)
+
+// warp cross-lane operations:
+#define __HIP_ARCH_HAS_WARP_VOTE__ (__CUDA_ARCH__ >= 120)
+#define __HIP_ARCH_HAS_WARP_BALLOT__ (__CUDA_ARCH__ >= 200)
+#define __HIP_ARCH_HAS_WARP_SHUFFLE__ (__CUDA_ARCH__ >= 300)
+#define __HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__ (__CUDA_ARCH__ >= 350)
+
+// sync
+#define __HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__ (__CUDA_ARCH__ >= 200)
+#define __HIP_ARCH_HAS_SYNC_THREAD_EXT__ (__CUDA_ARCH__ >= 200)
+
+// misc
+#define __HIP_ARCH_HAS_SURFACE_FUNCS__ (__CUDA_ARCH__ >= 200)
+#define __HIP_ARCH_HAS_3DGRID__ (__CUDA_ARCH__ >= 200)
+#define __HIP_ARCH_HAS_DYNAMIC_PARALLEL__ (__CUDA_ARCH__ >= 350)
+
+#endif
+
+#ifdef __CUDACC__
+
+
+#define hipThreadIdx_x threadIdx.x
+#define hipThreadIdx_y threadIdx.y
+#define hipThreadIdx_z threadIdx.z
+
+#define hipBlockIdx_x blockIdx.x
+#define hipBlockIdx_y blockIdx.y
+#define hipBlockIdx_z blockIdx.z
+
+#define hipBlockDim_x blockDim.x
+#define hipBlockDim_y blockDim.y
+#define hipBlockDim_z blockDim.z
+
+#define hipGridDim_x gridDim.x
+#define hipGridDim_y gridDim.y
+#define hipGridDim_z gridDim.z
+
+#define HIP_SYMBOL(X) &X
+
+/**
+ * extern __shared__
+ */
+
+#define HIP_DYNAMIC_SHARED(type, var) extern __shared__ type var[];
+
+#define HIP_DYNAMIC_SHARED_ATTRIBUTE
+
+#ifdef __HIP_DEVICE_COMPILE__
+// Device-side assert: runs the inline-asm `trap` when COND is false. COND is parenthesized so
+#define abort_() { asm("trap;"); }  // that e.g. assert(a == b) negates the whole expression.
+#undef assert
+#define assert(COND)                                                                               \
+    {                                                                                              \
+        if (!(COND)) {                                                                             \
+            abort_();                                                                              \
+        }                                                                                          \
+    }
+#endif
+
+#define __clock() clock()
+#define __clock64() clock64()
+
+#endif
+
+#endif
diff --git a/third_party/rocm/include/hip/nvcc_detail/hip_runtime_api.h b/third_party/rocm/include/hip/nvcc_detail/hip_runtime_api.h
new file mode 100644
index 0000000..257d795
--- /dev/null
+++ b/third_party/rocm/include/hip/nvcc_detail/hip_runtime_api.h
@@ -0,0 +1,2045 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_NVCC_DETAIL_HIP_RUNTIME_API_H
+#define HIP_INCLUDE_HIP_NVCC_DETAIL_HIP_RUNTIME_API_H
+
+#include <cuda_runtime_api.h>
+#include <cuda.h>
+#include <cuda_profiler_api.h>
+#include <cuda_fp16.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// __dparm(x) supplies `= x` as a default argument when compiled as C++ and
+// expands to nothing otherwise.
+#ifdef __cplusplus
+#define __dparm(x) = x
+#else
+#define __dparm(x)
+#endif
+
+// Add Deprecated Support for CUDA Mapped HIP APIs
+// __HIP_DEPRECATED is empty when __DOXYGEN_ONLY__ or HIP_ENABLE_DEPRECATED is
+// defined; otherwise it is the MSVC or GNU deprecation attribute, and empty
+// again for compilers that match neither check.
+#if defined(__DOXYGEN_ONLY__) || defined(HIP_ENABLE_DEPRECATED)
+#define __HIP_DEPRECATED
+#elif defined(_MSC_VER)
+#define __HIP_DEPRECATED __declspec(deprecated)
+#elif defined(__GNUC__)
+#define __HIP_DEPRECATED __attribute__((deprecated))
+#else
+#define __HIP_DEPRECATED
+#endif
+
+
+// TODO -move to include/hip_runtime_api.h as a common implementation.
+/**
+ * Direction of a memory copy.
+ *
+ * The enumerators are numbered consecutively from zero.
+ */
+typedef enum hipMemcpyKind {
+    hipMemcpyHostToHost = 0,
+    hipMemcpyHostToDevice = 1,
+    hipMemcpyDeviceToHost = 2,
+    hipMemcpyDeviceToDevice = 3,
+    hipMemcpyDefault = 4
+} hipMemcpyKind;
+
+// hipDataType: alias of cudaDataType and its real (R) / complex (C) values.
+#define hipDataType cudaDataType
+#define HIP_R_16F CUDA_R_16F
+#define HIP_R_32F CUDA_R_32F
+#define HIP_R_64F CUDA_R_64F
+#define HIP_C_16F CUDA_C_16F
+#define HIP_C_32F CUDA_C_32F
+#define HIP_C_64F CUDA_C_64F
+
+// hipLibraryPropertyType: alias of libraryPropertyType and its values.
+#define hipLibraryPropertyType libraryPropertyType
+#define HIP_LIBRARY_MAJOR_VERSION MAJOR_VERSION
+#define HIP_LIBRARY_MINOR_VERSION MINOR_VERSION
+#define HIP_LIBRARY_PATCH_LEVEL PATCH_LEVEL
+
+// Array descriptor struct alias.
+#define HIP_ARRAY_DESCRIPTOR CUDA_ARRAY_DESCRIPTOR
+
+// hipArray_Format enumerators, aliased to the CU_AD_FORMAT_* values.
+#define HIP_AD_FORMAT_UNSIGNED_INT8 CU_AD_FORMAT_UNSIGNED_INT8
+#define HIP_AD_FORMAT_UNSIGNED_INT16 CU_AD_FORMAT_UNSIGNED_INT16
+#define HIP_AD_FORMAT_UNSIGNED_INT32 CU_AD_FORMAT_UNSIGNED_INT32
+#define HIP_AD_FORMAT_SIGNED_INT8 CU_AD_FORMAT_SIGNED_INT8
+#define HIP_AD_FORMAT_SIGNED_INT16 CU_AD_FORMAT_SIGNED_INT16
+#define HIP_AD_FORMAT_SIGNED_INT32 CU_AD_FORMAT_SIGNED_INT32
+#define HIP_AD_FORMAT_HALF CU_AD_FORMAT_HALF
+#define HIP_AD_FORMAT_FLOAT CU_AD_FORMAT_FLOAT
+
+// hipArray_Format type, aliased to CUarray_format.
+#define hipArray_Format CUarray_format
+
+// Returns the CUarray_format matching `format`. Any value outside the eight
+// formats listed here yields CU_AD_FORMAT_UNSIGNED_INT8.
+inline static CUarray_format hipArray_FormatToCUarray_format(hipArray_Format format) {
+    switch (format) {
+        case HIP_AD_FORMAT_UNSIGNED_INT8: return CU_AD_FORMAT_UNSIGNED_INT8;
+        case HIP_AD_FORMAT_UNSIGNED_INT16: return CU_AD_FORMAT_UNSIGNED_INT16;
+        case HIP_AD_FORMAT_UNSIGNED_INT32: return CU_AD_FORMAT_UNSIGNED_INT32;
+        case HIP_AD_FORMAT_SIGNED_INT8: return CU_AD_FORMAT_SIGNED_INT8;
+        case HIP_AD_FORMAT_SIGNED_INT16: return CU_AD_FORMAT_SIGNED_INT16;
+        case HIP_AD_FORMAT_SIGNED_INT32: return CU_AD_FORMAT_SIGNED_INT32;
+        case HIP_AD_FORMAT_HALF: return CU_AD_FORMAT_HALF;
+        case HIP_AD_FORMAT_FLOAT: return CU_AD_FORMAT_FLOAT;
+        default: break;
+    }
+    return CU_AD_FORMAT_UNSIGNED_INT8;  // fallback for unlisted values
+}
+
+// hipAddress_mode enumerators, aliased to the CU_TR_ADDRESS_MODE_* values.
+#define HIP_TR_ADDRESS_MODE_WRAP CU_TR_ADDRESS_MODE_WRAP
+#define HIP_TR_ADDRESS_MODE_CLAMP CU_TR_ADDRESS_MODE_CLAMP
+#define HIP_TR_ADDRESS_MODE_MIRROR CU_TR_ADDRESS_MODE_MIRROR
+#define HIP_TR_ADDRESS_MODE_BORDER CU_TR_ADDRESS_MODE_BORDER
+
+// hipAddress_mode type, aliased to CUaddress_mode.
+#define hipAddress_mode CUaddress_mode
+
+// Returns the CUaddress_mode matching `mode`. Any value outside the four
+// modes listed here yields CU_TR_ADDRESS_MODE_WRAP.
+inline static CUaddress_mode hipAddress_modeToCUaddress_mode(hipAddress_mode mode) {
+    switch (mode) {
+        case HIP_TR_ADDRESS_MODE_WRAP: return CU_TR_ADDRESS_MODE_WRAP;
+        case HIP_TR_ADDRESS_MODE_CLAMP: return CU_TR_ADDRESS_MODE_CLAMP;
+        case HIP_TR_ADDRESS_MODE_MIRROR: return CU_TR_ADDRESS_MODE_MIRROR;
+        case HIP_TR_ADDRESS_MODE_BORDER: return CU_TR_ADDRESS_MODE_BORDER;
+        default: break;
+    }
+    return CU_TR_ADDRESS_MODE_WRAP;  // fallback for unlisted values
+}
+
+// hipFilter_mode enumerators, aliased to the CU_TR_FILTER_MODE_* values.
+#define HIP_TR_FILTER_MODE_POINT CU_TR_FILTER_MODE_POINT
+#define HIP_TR_FILTER_MODE_LINEAR CU_TR_FILTER_MODE_LINEAR
+
+// hipFilter_mode type, aliased to CUfilter_mode.
+#define hipFilter_mode CUfilter_mode
+
+// Returns the CUfilter_mode matching `mode`. Anything other than the linear
+// mode (including unlisted values) yields CU_TR_FILTER_MODE_POINT.
+inline static CUfilter_mode hipFilter_mode_enumToCUfilter_mode(hipFilter_mode mode) {
+    switch (mode) {
+        case HIP_TR_FILTER_MODE_POINT: return CU_TR_FILTER_MODE_POINT;
+        case HIP_TR_FILTER_MODE_LINEAR: return CU_TR_FILTER_MODE_LINEAR;
+        default: break;
+    }
+    return CU_TR_FILTER_MODE_POINT;  // fallback for unlisted values
+}
+
+// hipResourcetype enumerators, aliased to the CU_RESOURCE_TYPE_* values.
+#define HIP_RESOURCE_TYPE_ARRAY CU_RESOURCE_TYPE_ARRAY
+#define HIP_RESOURCE_TYPE_MIPMAPPED_ARRAY CU_RESOURCE_TYPE_MIPMAPPED_ARRAY
+#define HIP_RESOURCE_TYPE_LINEAR CU_RESOURCE_TYPE_LINEAR
+#define HIP_RESOURCE_TYPE_PITCH2D CU_RESOURCE_TYPE_PITCH2D
+
+// hipResourcetype type, aliased to CUresourcetype.
+#define hipResourcetype CUresourcetype
+
+// Returns the CUresourcetype matching `resType`. Any value outside the four
+// types listed here yields CU_RESOURCE_TYPE_ARRAY.
+inline static CUresourcetype hipResourcetype_enumToCUresourcetype(hipResourcetype resType) {
+    switch (resType) {
+        case HIP_RESOURCE_TYPE_ARRAY: return CU_RESOURCE_TYPE_ARRAY;
+        case HIP_RESOURCE_TYPE_MIPMAPPED_ARRAY: return CU_RESOURCE_TYPE_MIPMAPPED_ARRAY;
+        case HIP_RESOURCE_TYPE_LINEAR: return CU_RESOURCE_TYPE_LINEAR;
+        case HIP_RESOURCE_TYPE_PITCH2D: return CU_RESOURCE_TYPE_PITCH2D;
+        default: break;
+    }
+    return CU_RESOURCE_TYPE_ARRAY;  // fallback for unlisted values
+}
+
+// Texture reference and array handle aliases.
+#define hipTexRef CUtexref
+#define hiparray CUarray
+
+// hipTextureAddressMode
+typedef enum cudaTextureAddressMode hipTextureAddressMode;
+#define hipAddressModeWrap cudaAddressModeWrap
+#define hipAddressModeClamp cudaAddressModeClamp
+#define hipAddressModeMirror cudaAddressModeMirror
+#define hipAddressModeBorder cudaAddressModeBorder
+
+// hipTextureFilterMode
+typedef enum cudaTextureFilterMode hipTextureFilterMode;
+#define hipFilterModePoint cudaFilterModePoint
+#define hipFilterModeLinear cudaFilterModeLinear
+
+// hipTextureReadMode
+typedef enum cudaTextureReadMode hipTextureReadMode;
+#define hipReadModeElementType cudaReadModeElementType
+#define hipReadModeNormalizedFloat cudaReadModeNormalizedFloat
+
+// hipChannelFormatKind
+typedef enum cudaChannelFormatKind hipChannelFormatKind;
+#define hipChannelFormatKindSigned cudaChannelFormatKindSigned
+#define hipChannelFormatKindUnsigned cudaChannelFormatKindUnsigned
+#define hipChannelFormatKindFloat cudaChannelFormatKindFloat
+#define hipChannelFormatKindNone cudaChannelFormatKindNone
+
+// hipSurfaceBoundaryMode
+#define hipSurfaceBoundaryMode cudaSurfaceBoundaryMode
+#define hipBoundaryModeZero cudaBoundaryModeZero
+#define hipBoundaryModeTrap cudaBoundaryModeTrap
+#define hipBoundaryModeClamp cudaBoundaryModeClamp
+
+// hipFuncCache
+#define hipFuncCachePreferNone cudaFuncCachePreferNone
+#define hipFuncCachePreferShared cudaFuncCachePreferShared
+#define hipFuncCachePreferL1 cudaFuncCachePreferL1
+#define hipFuncCachePreferEqual cudaFuncCachePreferEqual
+
+// hipResourceType
+#define hipResourceType cudaResourceType
+#define hipResourceTypeArray cudaResourceTypeArray
+#define hipResourceTypeMipmappedArray cudaResourceTypeMipmappedArray
+#define hipResourceTypeLinear cudaResourceTypeLinear
+#define hipResourceTypePitch2D cudaResourceTypePitch2D
+//
+// hipErrorNoDevice.
+
+
+//! Flags that can be used with hipEventCreateWithFlags:
+#define hipEventDefault cudaEventDefault
+#define hipEventBlockingSync cudaEventBlockingSync
+#define hipEventDisableTiming cudaEventDisableTiming
+#define hipEventInterprocess cudaEventInterprocess
+#define hipEventReleaseToDevice 0 /* no-op on CUDA platform */
+#define hipEventReleaseToSystem 0 /* no-op on CUDA platform */
+
+
+// Host allocation flags. The Coherent/NonCoherent flags are both defined as
+// 0x0 in this header.
+#define hipHostMallocDefault cudaHostAllocDefault
+#define hipHostMallocPortable cudaHostAllocPortable
+#define hipHostMallocMapped cudaHostAllocMapped
+#define hipHostMallocWriteCombined cudaHostAllocWriteCombined
+#define hipHostMallocCoherent 0x0
+#define hipHostMallocNonCoherent 0x0
+
+// Managed-memory attach flags.
+#define hipMemAttachGlobal cudaMemAttachGlobal
+#define hipMemAttachHost cudaMemAttachHost
+
+// Host memory registration flags.
+#define hipHostRegisterDefault cudaHostRegisterDefault
+#define hipHostRegisterPortable cudaHostRegisterPortable
+#define hipHostRegisterMapped cudaHostRegisterMapped
+#define hipHostRegisterIoMemory cudaHostRegisterIoMemory
+
+// Launch-parameter markers, heap-size limit and IPC flag aliases.
+#define HIP_LAUNCH_PARAM_BUFFER_POINTER CU_LAUNCH_PARAM_BUFFER_POINTER
+#define HIP_LAUNCH_PARAM_BUFFER_SIZE CU_LAUNCH_PARAM_BUFFER_SIZE
+#define HIP_LAUNCH_PARAM_END CU_LAUNCH_PARAM_END
+#define hipLimitMallocHeapSize cudaLimitMallocHeapSize
+#define hipIpcMemLazyEnablePeerAccess cudaIpcMemLazyEnablePeerAccess
+
+#define hipOccupancyDefault cudaOccupancyDefault
+
+// Multi-device cooperative launch flags.
+#define hipCooperativeLaunchMultiDeviceNoPreSync \
+ cudaCooperativeLaunchMultiDeviceNoPreSync
+#define hipCooperativeLaunchMultiDeviceNoPostSync \
+ cudaCooperativeLaunchMultiDeviceNoPostSync
+
+
+// HIP names for the CUjit_option enumerators; each hipJitOption* macro
+// expands to the corresponding CU_JIT_* value.
+#define hipJitOptionMaxRegisters CU_JIT_MAX_REGISTERS
+#define hipJitOptionThreadsPerBlock CU_JIT_THREADS_PER_BLOCK
+#define hipJitOptionWallTime CU_JIT_WALL_TIME
+#define hipJitOptionInfoLogBuffer CU_JIT_INFO_LOG_BUFFER
+#define hipJitOptionInfoLogBufferSizeBytes CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES
+#define hipJitOptionErrorLogBuffer CU_JIT_ERROR_LOG_BUFFER
+#define hipJitOptionErrorLogBufferSizeBytes CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES
+#define hipJitOptionOptimizationLevel CU_JIT_OPTIMIZATION_LEVEL
+#define hipJitOptionTargetFromContext CU_JIT_TARGET_FROM_CUCONTEXT
+#define hipJitOptionTarget CU_JIT_TARGET
+#define hipJitOptionFallbackStrategy CU_JIT_FALLBACK_STRATEGY
+#define hipJitOptionGenerateDebugInfo CU_JIT_GENERATE_DEBUG_INFO
+#define hipJitOptionLogVerbose CU_JIT_LOG_VERBOSE
+#define hipJitOptionGenerateLineInfo CU_JIT_GENERATE_LINE_INFO
+#define hipJitOptionCacheMode CU_JIT_CACHE_MODE
+#define hipJitOptionSm3xOpt CU_JIT_NEW_SM3X_OPT
+#define hipJitOptionFastCompile CU_JIT_FAST_COMPILE
+#define hipJitOptionNumOptions CU_JIT_NUM_OPTIONS
+
+// HIP handle and enum type names, each a typedef of the CUDA type shown.
+typedef cudaEvent_t hipEvent_t;
+typedef cudaStream_t hipStream_t;
+typedef cudaIpcEventHandle_t hipIpcEventHandle_t;
+typedef cudaIpcMemHandle_t hipIpcMemHandle_t;
+typedef enum cudaLimit hipLimit_t;
+typedef enum cudaFuncAttribute hipFuncAttribute;
+typedef enum cudaFuncCache hipFuncCache_t;
+typedef CUcontext hipCtx_t;
+typedef enum cudaSharedMemConfig hipSharedMemConfig;
+typedef CUfunc_cache hipFuncCache;
+typedef CUjit_option hipJitOption;
+typedef CUdevice hipDevice_t;
+typedef enum cudaDeviceP2PAttr hipDeviceP2PAttr;
+// hipDeviceP2PAttr and hipFuncAttribute enumerator aliases.
+#define hipDevP2PAttrPerformanceRank cudaDevP2PAttrPerformanceRank
+#define hipDevP2PAttrAccessSupported cudaDevP2PAttrAccessSupported
+#define hipDevP2PAttrNativeAtomicSupported cudaDevP2PAttrNativeAtomicSupported
+#define hipDevP2PAttrHipArrayAccessSupported cudaDevP2PAttrCudaArrayAccessSupported
+#define hipFuncAttributeMaxDynamicSharedMemorySize cudaFuncAttributeMaxDynamicSharedMemorySize
+#define hipFuncAttributePreferredSharedMemoryCarveout cudaFuncAttributePreferredSharedMemoryCarveout
+
+// Module, function, device-pointer and array types.
+typedef CUmodule hipModule_t;
+typedef CUfunction hipFunction_t;
+typedef CUdeviceptr hipDeviceptr_t;
+typedef struct cudaArray hipArray;
+typedef struct cudaArray* hipArray_t;
+// NOTE(review): despite the name, this typedef is not const-qualified; it is
+// the same type as hipArray_t. Confirm whether that is intended.
+typedef struct cudaArray* hipArray_const_t;
+typedef struct cudaFuncAttributes hipFuncAttributes;
+typedef struct cudaLaunchParams hipLaunchParams;
+// Struct/enum aliases and array creation flags.
+#define hipFunction_attribute CUfunction_attribute
+#define hip_Memcpy2D CUDA_MEMCPY2D
+#define hipMemcpy3DParms cudaMemcpy3DParms
+#define hipArrayDefault cudaArrayDefault
+#define hipArrayLayered cudaArrayLayered
+#define hipArraySurfaceLoadStore cudaArraySurfaceLoadStore
+#define hipArrayCubemap cudaArrayCubemap
+#define hipArrayTextureGather cudaArrayTextureGather
+
+// Texture/surface object handles and texture type constants.
+typedef cudaTextureObject_t hipTextureObject_t;
+typedef cudaSurfaceObject_t hipSurfaceObject_t;
+#define hipTextureType1D cudaTextureType1D
+#define hipTextureType1DLayered cudaTextureType1DLayered
+#define hipTextureType2D cudaTextureType2D
+#define hipTextureType2DLayered cudaTextureType2DLayered
+#define hipTextureType3D cudaTextureType3D
+#define hipDeviceMapHost cudaDeviceMapHost
+
+// Extent / pitched-pointer types and their constructor helpers.
+typedef struct cudaExtent hipExtent;
+typedef struct cudaPitchedPtr hipPitchedPtr;
+#define make_hipExtent make_cudaExtent
+#define make_hipPos make_cudaPos
+#define make_hipPitchedPtr make_cudaPitchedPtr
+// Flags that can be used with hipStreamCreateWithFlags
+#define hipStreamDefault cudaStreamDefault
+#define hipStreamNonBlocking cudaStreamNonBlocking
+
+// Texture/resource descriptor structs, each a typedef of the CUDA struct.
+typedef struct cudaChannelFormatDesc hipChannelFormatDesc;
+typedef struct cudaResourceDesc hipResourceDesc;
+typedef struct cudaTextureDesc hipTextureDesc;
+typedef struct cudaResourceViewDesc hipResourceViewDesc;
+// Shared-memory bank size values, aliased to the cudaSharedMemBankSize*
+// enumerators.
+#define hipSharedMemBankSizeDefault cudaSharedMemBankSizeDefault
+#define hipSharedMemBankSizeFourByte cudaSharedMemBankSizeFourByte
+#define hipSharedMemBankSizeEightByte cudaSharedMemBankSizeEightByte
+
+// Function attributes: each HIP_FUNC_ATTRIBUTE_* macro expands to the
+// CU_FUNC_ATTRIBUTE_* enumerator of the same name.
+#define HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK
+#define HIP_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES
+#define HIP_FUNC_ATTRIBUTE_CONST_SIZE_BYTES CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES
+#define HIP_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES
+#define HIP_FUNC_ATTRIBUTE_NUM_REGS CU_FUNC_ATTRIBUTE_NUM_REGS
+#define HIP_FUNC_ATTRIBUTE_PTX_VERSION CU_FUNC_ATTRIBUTE_PTX_VERSION
+#define HIP_FUNC_ATTRIBUTE_BINARY_VERSION CU_FUNC_ATTRIBUTE_BINARY_VERSION
+#define HIP_FUNC_ATTRIBUTE_CACHE_MODE_CA CU_FUNC_ATTRIBUTE_CACHE_MODE_CA
+#define HIP_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES
+#define HIP_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT
+#define HIP_FUNC_ATTRIBUTE_MAX CU_FUNC_ATTRIBUTE_MAX
+
+// For CUDA_VERSION >= 9000 the __shfl* spellings are redirected to the
+// corresponding *_sync intrinsics, always passing 0xffffffff as the mask.
+#if CUDA_VERSION >= 9000
+#define __shfl(...) __shfl_sync(0xffffffff, __VA_ARGS__)
+#define __shfl_up(...) __shfl_up_sync(0xffffffff, __VA_ARGS__)
+#define __shfl_down(...) __shfl_down_sync(0xffffffff, __VA_ARGS__)
+#define __shfl_xor(...) __shfl_xor_sync(0xffffffff, __VA_ARGS__)
+#endif // CUDA_VERSION >= 9000
+
+/**
+ * Translates a CUDA runtime status (cudaError_t) into the matching hipError_t.
+ *
+ * Codes without an entry below, including cudaErrorUnknown, are reported as
+ * hipErrorUnknown. Some entries are only compiled for CUDA_VERSION >= 10010
+ * or >= 10020; with an older toolkit those codes also map to hipErrorUnknown.
+ */
+inline static hipError_t hipCUDAErrorTohipError(cudaError_t cuError) {
+    switch (cuError) {
+        case cudaSuccess: return hipSuccess;
+        case cudaErrorProfilerDisabled: return hipErrorProfilerDisabled;
+        case cudaErrorProfilerNotInitialized: return hipErrorProfilerNotInitialized;
+        case cudaErrorProfilerAlreadyStarted: return hipErrorProfilerAlreadyStarted;
+        case cudaErrorProfilerAlreadyStopped: return hipErrorProfilerAlreadyStopped;
+        case cudaErrorInsufficientDriver: return hipErrorInsufficientDriver;
+        case cudaErrorUnsupportedLimit: return hipErrorUnsupportedLimit;
+        case cudaErrorPeerAccessUnsupported: return hipErrorPeerAccessUnsupported;
+        case cudaErrorInvalidGraphicsContext: return hipErrorInvalidGraphicsContext;
+        case cudaErrorSharedObjectSymbolNotFound: return hipErrorSharedObjectSymbolNotFound;
+        case cudaErrorSharedObjectInitFailed: return hipErrorSharedObjectInitFailed;
+        case cudaErrorOperatingSystem: return hipErrorOperatingSystem;
+        case cudaErrorSetOnActiveProcess: return hipErrorSetOnActiveProcess;
+        case cudaErrorIllegalAddress: return hipErrorIllegalAddress;
+        case cudaErrorInvalidSymbol: return hipErrorInvalidSymbol;
+        case cudaErrorMissingConfiguration: return hipErrorMissingConfiguration;
+        case cudaErrorMemoryAllocation: return hipErrorOutOfMemory;
+        case cudaErrorInitializationError: return hipErrorNotInitialized;
+        case cudaErrorLaunchFailure: return hipErrorLaunchFailure;
+        case cudaErrorCooperativeLaunchTooLarge: return hipErrorCooperativeLaunchTooLarge;
+        case cudaErrorPriorLaunchFailure: return hipErrorPriorLaunchFailure;
+        case cudaErrorLaunchOutOfResources: return hipErrorLaunchOutOfResources;
+        case cudaErrorInvalidDeviceFunction: return hipErrorInvalidDeviceFunction;
+        case cudaErrorInvalidConfiguration: return hipErrorInvalidConfiguration;
+        case cudaErrorInvalidDevice: return hipErrorInvalidDevice;
+        case cudaErrorInvalidValue: return hipErrorInvalidValue;
+        case cudaErrorInvalidDevicePointer: return hipErrorInvalidDevicePointer;
+        case cudaErrorInvalidMemcpyDirection: return hipErrorInvalidMemcpyDirection;
+        case cudaErrorInvalidResourceHandle: return hipErrorInvalidHandle;
+        case cudaErrorNotReady: return hipErrorNotReady;
+        case cudaErrorNoDevice: return hipErrorNoDevice;
+        case cudaErrorPeerAccessAlreadyEnabled: return hipErrorPeerAccessAlreadyEnabled;
+        case cudaErrorPeerAccessNotEnabled: return hipErrorPeerAccessNotEnabled;
+        case cudaErrorHostMemoryAlreadyRegistered: return hipErrorHostMemoryAlreadyRegistered;
+        case cudaErrorHostMemoryNotRegistered: return hipErrorHostMemoryNotRegistered;
+        case cudaErrorMapBufferObjectFailed: return hipErrorMapFailed;
+        case cudaErrorAssert: return hipErrorAssert;
+        case cudaErrorNotSupported: return hipErrorNotSupported;
+        case cudaErrorCudartUnloading: return hipErrorDeinitialized;
+        case cudaErrorInvalidKernelImage: return hipErrorInvalidImage;
+        case cudaErrorUnmapBufferObjectFailed: return hipErrorUnmapFailed;
+        case cudaErrorNoKernelImageForDevice: return hipErrorNoBinaryForGpu;
+        case cudaErrorECCUncorrectable: return hipErrorECCNotCorrectable;
+        case cudaErrorDeviceAlreadyInUse: return hipErrorContextAlreadyInUse;
+        case cudaErrorInvalidPtx: return hipErrorInvalidKernelFile;
+        case cudaErrorLaunchTimeout: return hipErrorLaunchTimeOut;
+#if CUDA_VERSION >= 10010
+        case cudaErrorInvalidSource: return hipErrorInvalidSource;
+        case cudaErrorFileNotFound: return hipErrorFileNotFound;
+        case cudaErrorSymbolNotFound: return hipErrorNotFound;
+        case cudaErrorArrayIsMapped: return hipErrorArrayIsMapped;
+        case cudaErrorNotMappedAsPointer: return hipErrorNotMappedAsPointer;
+        case cudaErrorNotMappedAsArray: return hipErrorNotMappedAsArray;
+        case cudaErrorNotMapped: return hipErrorNotMapped;
+        case cudaErrorAlreadyAcquired: return hipErrorAlreadyAcquired;
+        case cudaErrorAlreadyMapped: return hipErrorAlreadyMapped;
+#endif
+#if CUDA_VERSION >= 10020
+        case cudaErrorDeviceUninitialized: return hipErrorInvalidContext;
+#endif
+        case cudaErrorUnknown:
+        default:
+            break;
+    }
+    return hipErrorUnknown;  // translated error: no specific HIP equivalent above
+}
+
+/**
+ * Translates a CUresult status into the matching hipError_t.
+ *
+ * Codes without an entry below, including CUDA_ERROR_UNKNOWN, are reported
+ * as hipErrorUnknown.
+ */
+inline static hipError_t hipCUResultTohipError(CUresult cuError) {
+    switch (cuError) {
+        case CUDA_SUCCESS: return hipSuccess;
+        case CUDA_ERROR_OUT_OF_MEMORY: return hipErrorOutOfMemory;
+        case CUDA_ERROR_INVALID_VALUE: return hipErrorInvalidValue;
+        case CUDA_ERROR_INVALID_DEVICE: return hipErrorInvalidDevice;
+        case CUDA_ERROR_DEINITIALIZED: return hipErrorDeinitialized;
+        case CUDA_ERROR_NO_DEVICE: return hipErrorNoDevice;
+        case CUDA_ERROR_INVALID_CONTEXT: return hipErrorInvalidContext;
+        case CUDA_ERROR_NOT_INITIALIZED: return hipErrorNotInitialized;
+        case CUDA_ERROR_INVALID_HANDLE: return hipErrorInvalidHandle;
+        case CUDA_ERROR_MAP_FAILED: return hipErrorMapFailed;
+        case CUDA_ERROR_PROFILER_DISABLED: return hipErrorProfilerDisabled;
+        case CUDA_ERROR_PROFILER_NOT_INITIALIZED: return hipErrorProfilerNotInitialized;
+        case CUDA_ERROR_PROFILER_ALREADY_STARTED: return hipErrorProfilerAlreadyStarted;
+        case CUDA_ERROR_PROFILER_ALREADY_STOPPED: return hipErrorProfilerAlreadyStopped;
+        case CUDA_ERROR_INVALID_IMAGE: return hipErrorInvalidImage;
+        case CUDA_ERROR_CONTEXT_ALREADY_CURRENT: return hipErrorContextAlreadyCurrent;
+        case CUDA_ERROR_UNMAP_FAILED: return hipErrorUnmapFailed;
+        case CUDA_ERROR_ARRAY_IS_MAPPED: return hipErrorArrayIsMapped;
+        case CUDA_ERROR_ALREADY_MAPPED: return hipErrorAlreadyMapped;
+        case CUDA_ERROR_NO_BINARY_FOR_GPU: return hipErrorNoBinaryForGpu;
+        case CUDA_ERROR_ALREADY_ACQUIRED: return hipErrorAlreadyAcquired;
+        case CUDA_ERROR_NOT_MAPPED: return hipErrorNotMapped;
+        case CUDA_ERROR_NOT_MAPPED_AS_ARRAY: return hipErrorNotMappedAsArray;
+        case CUDA_ERROR_NOT_MAPPED_AS_POINTER: return hipErrorNotMappedAsPointer;
+        case CUDA_ERROR_ECC_UNCORRECTABLE: return hipErrorECCNotCorrectable;
+        case CUDA_ERROR_UNSUPPORTED_LIMIT: return hipErrorUnsupportedLimit;
+        case CUDA_ERROR_CONTEXT_ALREADY_IN_USE: return hipErrorContextAlreadyInUse;
+        case CUDA_ERROR_PEER_ACCESS_UNSUPPORTED: return hipErrorPeerAccessUnsupported;
+        case CUDA_ERROR_INVALID_PTX: return hipErrorInvalidKernelFile;
+        case CUDA_ERROR_INVALID_GRAPHICS_CONTEXT: return hipErrorInvalidGraphicsContext;
+        case CUDA_ERROR_INVALID_SOURCE: return hipErrorInvalidSource;
+        case CUDA_ERROR_FILE_NOT_FOUND: return hipErrorFileNotFound;
+        case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: return hipErrorSharedObjectSymbolNotFound;
+        case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED: return hipErrorSharedObjectInitFailed;
+        case CUDA_ERROR_OPERATING_SYSTEM: return hipErrorOperatingSystem;
+        case CUDA_ERROR_NOT_FOUND: return hipErrorNotFound;
+        case CUDA_ERROR_NOT_READY: return hipErrorNotReady;
+        case CUDA_ERROR_ILLEGAL_ADDRESS: return hipErrorIllegalAddress;
+        case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: return hipErrorLaunchOutOfResources;
+        case CUDA_ERROR_LAUNCH_TIMEOUT: return hipErrorLaunchTimeOut;
+        case CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED: return hipErrorPeerAccessAlreadyEnabled;
+        case CUDA_ERROR_PEER_ACCESS_NOT_ENABLED: return hipErrorPeerAccessNotEnabled;
+        case CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE: return hipErrorSetOnActiveProcess;
+        case CUDA_ERROR_ASSERT: return hipErrorAssert;
+        case CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED: return hipErrorHostMemoryAlreadyRegistered;
+        case CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED: return hipErrorHostMemoryNotRegistered;
+        case CUDA_ERROR_LAUNCH_FAILED: return hipErrorLaunchFailure;
+        case CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE: return hipErrorCooperativeLaunchTooLarge;
+        case CUDA_ERROR_NOT_SUPPORTED: return hipErrorNotSupported;
+        case CUDA_ERROR_UNKNOWN:
+        default:
+            break;
+    }
+    return hipErrorUnknown;  // translated error: no specific HIP equivalent above
+}
+
+/**
+ * Translates a hipError_t into the matching CUDA runtime status.
+ *
+ * Codes without an entry below are reported as cudaErrorUnknown. The
+ * version-gated entries are only compiled when the toolkit provides the
+ * target enumerator; with an older CUDA_VERSION those HIP codes take the
+ * default path and therefore also yield cudaErrorUnknown.
+ */
+inline static cudaError_t hipErrorToCudaError(hipError_t hError) {
+    switch (hError) {
+        case hipSuccess: return cudaSuccess;
+        case hipErrorOutOfMemory: return cudaErrorMemoryAllocation;
+        case hipErrorProfilerDisabled: return cudaErrorProfilerDisabled;
+        case hipErrorProfilerNotInitialized: return cudaErrorProfilerNotInitialized;
+        case hipErrorProfilerAlreadyStarted: return cudaErrorProfilerAlreadyStarted;
+        case hipErrorProfilerAlreadyStopped: return cudaErrorProfilerAlreadyStopped;
+        case hipErrorInvalidConfiguration: return cudaErrorInvalidConfiguration;
+        case hipErrorLaunchOutOfResources: return cudaErrorLaunchOutOfResources;
+        case hipErrorInvalidValue: return cudaErrorInvalidValue;
+        case hipErrorInvalidHandle: return cudaErrorInvalidResourceHandle;
+        case hipErrorInvalidDevice: return cudaErrorInvalidDevice;
+        case hipErrorInvalidMemcpyDirection: return cudaErrorInvalidMemcpyDirection;
+        case hipErrorInvalidDevicePointer: return cudaErrorInvalidDevicePointer;
+        case hipErrorNotInitialized: return cudaErrorInitializationError;
+        case hipErrorNoDevice: return cudaErrorNoDevice;
+        case hipErrorNotReady: return cudaErrorNotReady;
+        case hipErrorPeerAccessNotEnabled: return cudaErrorPeerAccessNotEnabled;
+        case hipErrorPeerAccessAlreadyEnabled: return cudaErrorPeerAccessAlreadyEnabled;
+        case hipErrorHostMemoryAlreadyRegistered: return cudaErrorHostMemoryAlreadyRegistered;
+        case hipErrorHostMemoryNotRegistered: return cudaErrorHostMemoryNotRegistered;
+        case hipErrorDeinitialized: return cudaErrorCudartUnloading;
+        case hipErrorInvalidSymbol: return cudaErrorInvalidSymbol;
+        case hipErrorInsufficientDriver: return cudaErrorInsufficientDriver;
+        case hipErrorMissingConfiguration: return cudaErrorMissingConfiguration;
+        case hipErrorPriorLaunchFailure: return cudaErrorPriorLaunchFailure;
+        case hipErrorInvalidDeviceFunction: return cudaErrorInvalidDeviceFunction;
+        case hipErrorInvalidImage: return cudaErrorInvalidKernelImage;
+        case hipErrorMapFailed: return cudaErrorMapBufferObjectFailed;
+        case hipErrorUnmapFailed: return cudaErrorUnmapBufferObjectFailed;
+        case hipErrorNoBinaryForGpu: return cudaErrorNoKernelImageForDevice;
+        case hipErrorECCNotCorrectable: return cudaErrorECCUncorrectable;
+        case hipErrorUnsupportedLimit: return cudaErrorUnsupportedLimit;
+        case hipErrorContextAlreadyInUse: return cudaErrorDeviceAlreadyInUse;
+        case hipErrorPeerAccessUnsupported: return cudaErrorPeerAccessUnsupported;
+        case hipErrorInvalidKernelFile: return cudaErrorInvalidPtx;
+        case hipErrorInvalidGraphicsContext: return cudaErrorInvalidGraphicsContext;
+        case hipErrorSharedObjectSymbolNotFound: return cudaErrorSharedObjectSymbolNotFound;
+        case hipErrorSharedObjectInitFailed: return cudaErrorSharedObjectInitFailed;
+        case hipErrorOperatingSystem: return cudaErrorOperatingSystem;
+        case hipErrorIllegalAddress: return cudaErrorIllegalAddress;
+        case hipErrorLaunchTimeOut: return cudaErrorLaunchTimeout;
+        case hipErrorSetOnActiveProcess: return cudaErrorSetOnActiveProcess;
+        case hipErrorLaunchFailure: return cudaErrorLaunchFailure;
+        case hipErrorCooperativeLaunchTooLarge: return cudaErrorCooperativeLaunchTooLarge;
+        case hipErrorNotSupported: return cudaErrorNotSupported;
+#if CUDA_VERSION >= 10010
+        // Targets that only exist from CUDA 10.1 on.
+        case hipErrorArrayIsMapped: return cudaErrorArrayIsMapped;
+        case hipErrorAlreadyMapped: return cudaErrorAlreadyMapped;
+        case hipErrorAlreadyAcquired: return cudaErrorAlreadyAcquired;
+        case hipErrorNotMapped: return cudaErrorNotMapped;
+        case hipErrorNotMappedAsArray: return cudaErrorNotMappedAsArray;
+        case hipErrorNotMappedAsPointer: return cudaErrorNotMappedAsPointer;
+        case hipErrorInvalidSource: return cudaErrorInvalidSource;
+        case hipErrorFileNotFound: return cudaErrorFileNotFound;
+        case hipErrorNotFound: return cudaErrorSymbolNotFound;
+#endif
+#if CUDA_VERSION >= 10020
+        // Target that only exists from CUDA 10.2 on.
+        case hipErrorInvalidContext: return cudaErrorDeviceUninitialized;
+#endif
+        // HSA-specific codes that do not exist in CUDA, plus the generic ones.
+        case hipErrorRuntimeMemory:
+        case hipErrorRuntimeOther:
+        case hipErrorUnknown:
+        case hipErrorTbd:
+        default:
+            break;
+    }
+    return cudaErrorUnknown;  // translated error: no specific CUDA equivalent above
+}
+
+// Maps a hipMemcpyKind onto the cudaMemcpyKind of the same direction. Every
+// value other than the four explicit directions yields cudaMemcpyDefault.
+inline static enum cudaMemcpyKind hipMemcpyKindToCudaMemcpyKind(hipMemcpyKind kind) {
+    switch (kind) {
+        case hipMemcpyHostToHost: return cudaMemcpyHostToHost;
+        case hipMemcpyHostToDevice: return cudaMemcpyHostToDevice;
+        case hipMemcpyDeviceToHost: return cudaMemcpyDeviceToHost;
+        case hipMemcpyDeviceToDevice: return cudaMemcpyDeviceToDevice;
+        default: break;
+    }
+    return cudaMemcpyDefault;
+}
+
+// Maps a hipTextureAddressMode onto the cudaTextureAddressMode of the same
+// name. Values outside the four listed modes yield cudaAddressModeWrap.
+inline static enum cudaTextureAddressMode hipTextureAddressModeToCudaTextureAddressMode(
+    hipTextureAddressMode kind) {
+    switch (kind) {
+        case hipAddressModeWrap: return cudaAddressModeWrap;
+        case hipAddressModeClamp: return cudaAddressModeClamp;
+        case hipAddressModeMirror: return cudaAddressModeMirror;
+        case hipAddressModeBorder: return cudaAddressModeBorder;
+        default: break;
+    }
+    return cudaAddressModeWrap;
+}
+
+// Maps a hipTextureFilterMode onto the cudaTextureFilterMode of the same
+// name. Values other than the linear mode yield cudaFilterModePoint.
+inline static enum cudaTextureFilterMode hipTextureFilterModeToCudaTextureFilterMode(
+    hipTextureFilterMode kind) {
+    switch (kind) {
+        case hipFilterModePoint: return cudaFilterModePoint;
+        case hipFilterModeLinear: return cudaFilterModeLinear;
+        default: break;
+    }
+    return cudaFilterModePoint;
+}
+
+// Maps a hipTextureReadMode onto the cudaTextureReadMode of the same name.
+// Values other than the normalized-float mode yield cudaReadModeElementType.
+inline static enum cudaTextureReadMode hipTextureReadModeToCudaTextureReadMode(
+    hipTextureReadMode kind) {
+    switch (kind) {
+        case hipReadModeElementType: return cudaReadModeElementType;
+        case hipReadModeNormalizedFloat: return cudaReadModeNormalizedFloat;
+        default: break;
+    }
+    return cudaReadModeElementType;
+}
+
+// Maps a hipChannelFormatKind onto the cudaChannelFormatKind of the same
+// name. Values outside the listed kinds yield cudaChannelFormatKindNone.
+inline static enum cudaChannelFormatKind hipChannelFormatKindToCudaChannelFormatKind(
+    hipChannelFormatKind kind) {
+    switch (kind) {
+        case hipChannelFormatKindSigned: return cudaChannelFormatKindSigned;
+        case hipChannelFormatKindUnsigned: return cudaChannelFormatKindUnsigned;
+        case hipChannelFormatKindFloat: return cudaChannelFormatKindFloat;
+        case hipChannelFormatKindNone: return cudaChannelFormatKindNone;
+        default: break;
+    }
+    return cudaChannelFormatKindNone;
+}
+
+/**
+ * Stream callback support.
+ *
+ * HIPRT_CB expands to CUDART_CB. hipStreamCallback_t is a pointer to a
+ * function that takes the stream, a hipError_t status and a user data
+ * pointer, and returns nothing.
+ */
+#define HIPRT_CB CUDART_CB
+typedef void(HIPRT_CB* hipStreamCallback_t)(hipStream_t stream, hipError_t status, void* userData);
+// Calls cuInit(flags) and returns its result translated to a hipError_t.
+inline static hipError_t hipInit(unsigned int flags) {
+    const CUresult driverStatus = cuInit(flags);
+    return hipCUResultTohipError(driverStatus);
+}
+
+// Calls cudaDeviceReset() and returns its status translated to a hipError_t.
+inline static hipError_t hipDeviceReset() {
+    const cudaError_t status = cudaDeviceReset();
+    return hipCUDAErrorTohipError(status);
+}
+
+// Returns the result of cudaGetLastError() translated to a hipError_t.
+inline static hipError_t hipGetLastError() {
+    const cudaError_t status = cudaGetLastError();
+    return hipCUDAErrorTohipError(status);
+}
+
+// Returns the result of cudaPeekAtLastError() translated to a hipError_t.
+inline static hipError_t hipPeekAtLastError() {
+    const cudaError_t status = cudaPeekAtLastError();
+    return hipCUDAErrorTohipError(status);
+}
+
+// Forwards to cudaMalloc(ptr, size); the status is translated to a hipError_t.
+inline static hipError_t hipMalloc(void** ptr, size_t size) {
+    const cudaError_t status = cudaMalloc(ptr, size);
+    return hipCUDAErrorTohipError(status);
+}
+
+// Forwards to cudaMallocPitch(ptr, pitch, width, height); the status is
+// translated to a hipError_t.
+inline static hipError_t hipMallocPitch(void** ptr, size_t* pitch, size_t width, size_t height) {
+    const cudaError_t status = cudaMallocPitch(ptr, pitch, width, height);
+    return hipCUDAErrorTohipError(status);
+}
+
+// Forwards to cuMemAllocPitch with the same arguments; the CUresult is
+// translated to a hipError_t.
+inline static hipError_t hipMemAllocPitch(hipDeviceptr_t* dptr, size_t* pitch,
+                                          size_t widthInBytes, size_t height,
+                                          unsigned int elementSizeBytes) {
+    const CUresult driverStatus =
+        cuMemAllocPitch(dptr, pitch, widthInBytes, height, elementSizeBytes);
+    return hipCUResultTohipError(driverStatus);
+}
+
+// Forwards to cudaMalloc3D(pitchedDevPtr, extent); the status is translated
+// to a hipError_t.
+inline static hipError_t hipMalloc3D(hipPitchedPtr* pitchedDevPtr, hipExtent extent) {
+    const cudaError_t status = cudaMalloc3D(pitchedDevPtr, extent);
+    return hipCUDAErrorTohipError(status);
+}
+
+// Forwards to cudaFree(ptr); the status is translated to a hipError_t.
+inline static hipError_t hipFree(void* ptr) {
+    const cudaError_t status = cudaFree(ptr);
+    return hipCUDAErrorTohipError(status);
+}
+
+// Deprecated in favor of hipHostMalloc (see the attribute message).
+// Forwards to cudaMallocHost(ptr, size).
+inline static hipError_t hipMallocHost(void** ptr, size_t size)
+    __attribute__((deprecated("use hipHostMalloc instead")));
+inline static hipError_t hipMallocHost(void** ptr, size_t size) {
+    const cudaError_t status = cudaMallocHost(ptr, size);
+    return hipCUDAErrorTohipError(status);
+}
+
+// Deprecated in favor of hipHostMalloc (see the attribute message).
+// Forwards to cuMemAllocHost(ptr, size).
+inline static hipError_t hipMemAllocHost(void** ptr, size_t size)
+    __attribute__((deprecated("use hipHostMalloc instead")));
+inline static hipError_t hipMemAllocHost(void** ptr, size_t size) {
+    const CUresult driverStatus = cuMemAllocHost(ptr, size);
+    return hipCUResultTohipError(driverStatus);
+}
+
+// Deprecated in favor of hipHostMalloc (see the attribute message).
+// Forwards to cudaHostAlloc(ptr, size, flags).
+inline static hipError_t hipHostAlloc(void** ptr, size_t size, unsigned int flags)
+    __attribute__((deprecated("use hipHostMalloc instead")));
+inline static hipError_t hipHostAlloc(void** ptr, size_t size, unsigned int flags) {
+    const cudaError_t status = cudaHostAlloc(ptr, size, flags);
+    return hipCUDAErrorTohipError(status);
+}
+
+// Forwards to cudaHostAlloc(ptr, size, flags); the status is translated to a
+// hipError_t.
+inline static hipError_t hipHostMalloc(void** ptr, size_t size, unsigned int flags) {
+    const cudaError_t status = cudaHostAlloc(ptr, size, flags);
+    return hipCUDAErrorTohipError(status);
+}
+
+// Forwards to cudaMallocManaged(ptr, size, flags); the status is translated
+// to a hipError_t.
+inline static hipError_t hipMallocManaged(void** ptr, size_t size, unsigned int flags) {
+    const cudaError_t status = cudaMallocManaged(ptr, size, flags);
+    return hipCUDAErrorTohipError(status);
+}
+
+// Forwards to cudaMallocArray(array, desc, width, height, flags). When built
+// as C++, `flags` defaults to hipArrayDefault via __dparm.
+inline static hipError_t hipMallocArray(hipArray** array, const hipChannelFormatDesc* desc,
+                                        size_t width, size_t height,
+                                        unsigned int flags __dparm(hipArrayDefault)) {
+    const cudaError_t status = cudaMallocArray(array, desc, width, height, flags);
+    return hipCUDAErrorTohipError(status);
+}
+
+inline static hipError_t hipMalloc3DArray(hipArray** array, const hipChannelFormatDesc* desc,
+ hipExtent extent, unsigned int flags) {
+ return hipCUDAErrorTohipError(cudaMalloc3DArray(array, desc, extent, flags));
+}
+
+inline static hipError_t hipFreeArray(hipArray* array) {
+ return hipCUDAErrorTohipError(cudaFreeArray(array));
+}
+
+inline static hipError_t hipHostGetDevicePointer(void** devPtr, void* hostPtr, unsigned int flags) {
+ return hipCUDAErrorTohipError(cudaHostGetDevicePointer(devPtr, hostPtr, flags));
+}
+
+inline static hipError_t hipHostGetFlags(unsigned int* flagsPtr, void* hostPtr) {
+ return hipCUDAErrorTohipError(cudaHostGetFlags(flagsPtr, hostPtr));
+}
+
+inline static hipError_t hipHostRegister(void* ptr, size_t size, unsigned int flags) {
+ return hipCUDAErrorTohipError(cudaHostRegister(ptr, size, flags));
+}
+
+inline static hipError_t hipHostUnregister(void* ptr) {
+ return hipCUDAErrorTohipError(cudaHostUnregister(ptr));
+}
+
+// Deprecated host free entry point: forwards ptr to cudaFreeHost() and
+// translates the CUDA status. The forward declaration below carries the
+// deprecated attribute, whose message directs callers to hipHostFree.
+inline static hipError_t hipFreeHost(void* ptr)
+ __attribute__((deprecated("use hipHostFree instead")));
+inline static hipError_t hipFreeHost(void* ptr) {
+ return hipCUDAErrorTohipError(cudaFreeHost(ptr));
+}
+
+inline static hipError_t hipHostFree(void* ptr) {
+ return hipCUDAErrorTohipError(cudaFreeHost(ptr));
+}
+
+inline static hipError_t hipSetDevice(int device) {
+ return hipCUDAErrorTohipError(cudaSetDevice(device));
+}
+
+// Selects a device through cudaChooseDevice() using a subset of the requested
+// HIP properties.
+//
+// The fields copied below are placed into a zero-filled cudaDeviceProp; every
+// other cudaDeviceProp field stays zero.
+//
+// device: receives the chosen ordinal; passed straight through to CUDA.
+// prop:   requested properties. A NULL pointer is rejected with the translated
+//         cudaErrorInvalidValue instead of being dereferenced.
+// Returns the translated CUDA status.
+inline static hipError_t hipChooseDevice(int* device, const hipDeviceProp_t* prop) {
+    // Guard first: every field copy below reads through prop.
+    if (prop == NULL) {
+        return hipCUDAErrorTohipError(cudaErrorInvalidValue);
+    }
+
+    struct cudaDeviceProp cdprop;
+    memset(&cdprop, 0x0, sizeof(struct cudaDeviceProp));
+    cdprop.major = prop->major;
+    cdprop.minor = prop->minor;
+    cdprop.totalGlobalMem = prop->totalGlobalMem;
+    cdprop.sharedMemPerBlock = prop->sharedMemPerBlock;
+    cdprop.regsPerBlock = prop->regsPerBlock;
+    cdprop.warpSize = prop->warpSize;
+    cdprop.maxThreadsPerBlock = prop->maxThreadsPerBlock;
+    cdprop.clockRate = prop->clockRate;
+    cdprop.totalConstMem = prop->totalConstMem;
+    cdprop.multiProcessorCount = prop->multiProcessorCount;
+    cdprop.l2CacheSize = prop->l2CacheSize;
+    cdprop.maxThreadsPerMultiProcessor = prop->maxThreadsPerMultiProcessor;
+    cdprop.computeMode = prop->computeMode;
+    cdprop.canMapHostMemory = prop->canMapHostMemory;
+    cdprop.memoryClockRate = prop->memoryClockRate;
+    cdprop.memoryBusWidth = prop->memoryBusWidth;
+    return hipCUDAErrorTohipError(cudaChooseDevice(device, &cdprop));
+}
+
+inline static hipError_t hipMemcpyHtoD(hipDeviceptr_t dst, void* src, size_t size) {
+ return hipCUResultTohipError(cuMemcpyHtoD(dst, src, size));
+}
+
+inline static hipError_t hipMemcpyDtoH(void* dst, hipDeviceptr_t src, size_t size) {
+ return hipCUResultTohipError(cuMemcpyDtoH(dst, src, size));
+}
+
+inline static hipError_t hipMemcpyDtoD(hipDeviceptr_t dst, hipDeviceptr_t src, size_t size) {
+ return hipCUResultTohipError(cuMemcpyDtoD(dst, src, size));
+}
+
+inline static hipError_t hipMemcpyHtoDAsync(hipDeviceptr_t dst, void* src, size_t size,
+ hipStream_t stream) {
+ return hipCUResultTohipError(cuMemcpyHtoDAsync(dst, src, size, stream));
+}
+
+inline static hipError_t hipMemcpyDtoHAsync(void* dst, hipDeviceptr_t src, size_t size,
+ hipStream_t stream) {
+ return hipCUResultTohipError(cuMemcpyDtoHAsync(dst, src, size, stream));
+}
+
+inline static hipError_t hipMemcpyDtoDAsync(hipDeviceptr_t dst, hipDeviceptr_t src, size_t size,
+ hipStream_t stream) {
+ return hipCUResultTohipError(cuMemcpyDtoDAsync(dst, src, size, stream));
+}
+
+inline static hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes,
+ hipMemcpyKind copyKind) {
+ return hipCUDAErrorTohipError(
+ cudaMemcpy(dst, src, sizeBytes, hipMemcpyKindToCudaMemcpyKind(copyKind)));
+}
+
+
+// Copies sizeBytes from src to dst on `stream` and blocks until that stream has
+// drained. The copy is enqueued with cudaMemcpyAsync(); if enqueueing fails its
+// error is returned and no synchronization is attempted, otherwise the result
+// of cudaStreamSynchronize() is returned. Both are translated to hipError_t.
+inline static hipError_t hipMemcpyWithStream(void* dst, const void* src,
+                                             size_t sizeBytes, hipMemcpyKind copyKind,
+                                             hipStream_t stream) {
+    const cudaError_t enqueueStatus = cudaMemcpyAsync(
+        dst, src, sizeBytes, hipMemcpyKindToCudaMemcpyKind(copyKind), stream);
+
+    // Wait on the stream only when the copy was actually queued.
+    if (enqueueStatus == cudaSuccess) {
+        return hipCUDAErrorTohipError(cudaStreamSynchronize(stream));
+    }
+    return hipCUDAErrorTohipError(enqueueStatus);
+}
+
+inline static hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes,
+ hipMemcpyKind copyKind, hipStream_t stream __dparm(0)) {
+ return hipCUDAErrorTohipError(
+ cudaMemcpyAsync(dst, src, sizeBytes, hipMemcpyKindToCudaMemcpyKind(copyKind), stream));
+}
+
+inline static hipError_t hipMemcpyToSymbol(const void* symbol, const void* src, size_t sizeBytes,
+ size_t offset __dparm(0),
+ hipMemcpyKind copyType __dparm(hipMemcpyHostToDevice)) {
+ return hipCUDAErrorTohipError(cudaMemcpyToSymbol(symbol, src, sizeBytes, offset,
+ hipMemcpyKindToCudaMemcpyKind(copyType)));
+}
+
+inline static hipError_t hipMemcpyToSymbolAsync(const void* symbol, const void* src,
+ size_t sizeBytes, size_t offset,
+ hipMemcpyKind copyType,
+ hipStream_t stream __dparm(0)) {
+ return hipCUDAErrorTohipError(cudaMemcpyToSymbolAsync(
+ symbol, src, sizeBytes, offset, hipMemcpyKindToCudaMemcpyKind(copyType), stream));
+}
+
+inline static hipError_t hipMemcpyFromSymbol(void* dst, const void* symbolName, size_t sizeBytes,
+ size_t offset __dparm(0),
+ hipMemcpyKind kind __dparm(hipMemcpyDeviceToHost)) {
+ return hipCUDAErrorTohipError(cudaMemcpyFromSymbol(dst, symbolName, sizeBytes, offset,
+ hipMemcpyKindToCudaMemcpyKind(kind)));
+}
+
+inline static hipError_t hipMemcpyFromSymbolAsync(void* dst, const void* symbolName,
+ size_t sizeBytes, size_t offset,
+ hipMemcpyKind kind,
+ hipStream_t stream __dparm(0)) {
+ return hipCUDAErrorTohipError(cudaMemcpyFromSymbolAsync(
+ dst, symbolName, sizeBytes, offset, hipMemcpyKindToCudaMemcpyKind(kind), stream));
+}
+
+inline static hipError_t hipGetSymbolAddress(void** devPtr, const void* symbolName) {
+ return hipCUDAErrorTohipError(cudaGetSymbolAddress(devPtr, symbolName));
+}
+
+inline static hipError_t hipGetSymbolSize(size_t* size, const void* symbolName) {
+ return hipCUDAErrorTohipError(cudaGetSymbolSize(size, symbolName));
+}
+
+inline static hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch,
+ size_t width, size_t height, hipMemcpyKind kind) {
+ return hipCUDAErrorTohipError(
+ cudaMemcpy2D(dst, dpitch, src, spitch, width, height, hipMemcpyKindToCudaMemcpyKind(kind)));
+}
+
+inline static hipError_t hipMemcpyParam2D(const hip_Memcpy2D* pCopy) {
+ return hipCUResultTohipError(cuMemcpy2D(pCopy));
+}
+
+inline static hipError_t hipMemcpyParam2DAsync(const hip_Memcpy2D* pCopy, hipStream_t stream __dparm(0)) {
+ return hipCUResultTohipError(cuMemcpy2DAsync(pCopy, stream));
+}
+
+inline static hipError_t hipMemcpy3D(const struct hipMemcpy3DParms *p)
+{
+ return hipCUDAErrorTohipError(cudaMemcpy3D(p));
+}
+
+inline static hipError_t hipMemcpy3DAsync(const struct hipMemcpy3DParms *p, hipStream_t stream)
+{
+ return hipCUDAErrorTohipError(cudaMemcpy3DAsync(p, stream));
+}
+
+inline static hipError_t hipMemcpy2DAsync(void* dst, size_t dpitch, const void* src, size_t spitch,
+ size_t width, size_t height, hipMemcpyKind kind,
+ hipStream_t stream) {
+ return hipCUDAErrorTohipError(cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height,
+ hipMemcpyKindToCudaMemcpyKind(kind), stream));
+}
+
+inline static hipError_t hipMemcpy2DToArray(hipArray* dst, size_t wOffset, size_t hOffset,
+ const void* src, size_t spitch, size_t width,
+ size_t height, hipMemcpyKind kind) {
+ return hipCUDAErrorTohipError(cudaMemcpy2DToArray(dst, wOffset, hOffset, src, spitch, width,
+ height, hipMemcpyKindToCudaMemcpyKind(kind)));
+}
+
+__HIP_DEPRECATED inline static hipError_t hipMemcpyToArray(hipArray* dst, size_t wOffset,
+ size_t hOffset, const void* src,
+ size_t count, hipMemcpyKind kind) {
+ return hipCUDAErrorTohipError(
+ cudaMemcpyToArray(dst, wOffset, hOffset, src, count, hipMemcpyKindToCudaMemcpyKind(kind)));
+}
+
+__HIP_DEPRECATED inline static hipError_t hipMemcpyFromArray(void* dst, hipArray_const_t srcArray,
+ size_t wOffset, size_t hOffset,
+ size_t count, hipMemcpyKind kind) {
+ return hipCUDAErrorTohipError(cudaMemcpyFromArray(dst, srcArray, wOffset, hOffset, count,
+ hipMemcpyKindToCudaMemcpyKind(kind)));
+}
+
+inline static hipError_t hipMemcpyAtoH(void* dst, hipArray* srcArray, size_t srcOffset,
+ size_t count) {
+ return hipCUResultTohipError(cuMemcpyAtoH(dst, (CUarray)srcArray, srcOffset, count));
+}
+
+inline static hipError_t hipMemcpyHtoA(hipArray* dstArray, size_t dstOffset, const void* srcHost,
+ size_t count) {
+ return hipCUResultTohipError(cuMemcpyHtoA((CUarray)dstArray, dstOffset, srcHost, count));
+}
+
+inline static hipError_t hipDeviceSynchronize() {
+ return hipCUDAErrorTohipError(cudaDeviceSynchronize());
+}
+
+inline static hipError_t hipDeviceGetCacheConfig(hipFuncCache_t* pCacheConfig) {
+ return hipCUDAErrorTohipError(cudaDeviceGetCacheConfig(pCacheConfig));
+}
+
+inline static hipError_t hipFuncSetAttribute(const void* func, hipFuncAttribute attr, int value) {
+ return hipCUDAErrorTohipError(cudaFuncSetAttribute(func, attr, value));
+}
+
+inline static hipError_t hipDeviceSetCacheConfig(hipFuncCache_t cacheConfig) {
+ return hipCUDAErrorTohipError(cudaDeviceSetCacheConfig(cacheConfig));
+}
+
+inline static hipError_t hipFuncSetSharedMemConfig(const void* func, hipSharedMemConfig config) {
+ return hipCUDAErrorTohipError(cudaFuncSetSharedMemConfig(func, config));
+}
+
+inline static const char* hipGetErrorString(hipError_t error) {
+ return cudaGetErrorString(hipErrorToCudaError(error));
+}
+
+inline static const char* hipGetErrorName(hipError_t error) {
+ return cudaGetErrorName(hipErrorToCudaError(error));
+}
+
+inline static hipError_t hipGetDeviceCount(int* count) {
+ return hipCUDAErrorTohipError(cudaGetDeviceCount(count));
+}
+
+inline static hipError_t hipGetDevice(int* device) {
+ return hipCUDAErrorTohipError(cudaGetDevice(device));
+}
+
+inline static hipError_t hipIpcCloseMemHandle(void* devPtr) {
+ return hipCUDAErrorTohipError(cudaIpcCloseMemHandle(devPtr));
+}
+
+inline static hipError_t hipIpcGetEventHandle(hipIpcEventHandle_t* handle, hipEvent_t event) {
+ return hipCUDAErrorTohipError(cudaIpcGetEventHandle(handle, event));
+}
+
+inline static hipError_t hipIpcGetMemHandle(hipIpcMemHandle_t* handle, void* devPtr) {
+ return hipCUDAErrorTohipError(cudaIpcGetMemHandle(handle, devPtr));
+}
+
+inline static hipError_t hipIpcOpenEventHandle(hipEvent_t* event, hipIpcEventHandle_t handle) {
+ return hipCUDAErrorTohipError(cudaIpcOpenEventHandle(event, handle));
+}
+
+inline static hipError_t hipIpcOpenMemHandle(void** devPtr, hipIpcMemHandle_t handle,
+ unsigned int flags) {
+ return hipCUDAErrorTohipError(cudaIpcOpenMemHandle(devPtr, handle, flags));
+}
+
+inline static hipError_t hipMemset(void* devPtr, int value, size_t count) {
+ return hipCUDAErrorTohipError(cudaMemset(devPtr, value, count));
+}
+
+inline static hipError_t hipMemsetD32(hipDeviceptr_t devPtr, int value, size_t count) {
+ return hipCUResultTohipError(cuMemsetD32(devPtr, value, count));
+}
+
+inline static hipError_t hipMemsetAsync(void* devPtr, int value, size_t count,
+ hipStream_t stream __dparm(0)) {
+ return hipCUDAErrorTohipError(cudaMemsetAsync(devPtr, value, count, stream));
+}
+
+inline static hipError_t hipMemsetD32Async(hipDeviceptr_t devPtr, int value, size_t count,
+ hipStream_t stream __dparm(0)) {
+ return hipCUResultTohipError(cuMemsetD32Async(devPtr, value, count, stream));
+}
+
+inline static hipError_t hipMemsetD8(hipDeviceptr_t dest, unsigned char value, size_t sizeBytes) {
+ return hipCUResultTohipError(cuMemsetD8(dest, value, sizeBytes));
+}
+
+inline static hipError_t hipMemsetD8Async(hipDeviceptr_t dest, unsigned char value, size_t sizeBytes,
+ hipStream_t stream __dparm(0)) {
+ return hipCUResultTohipError(cuMemsetD8Async(dest, value, sizeBytes, stream));
+}
+
+inline static hipError_t hipMemsetD16(hipDeviceptr_t dest, unsigned short value, size_t sizeBytes) {
+ return hipCUResultTohipError(cuMemsetD16(dest, value, sizeBytes));
+}
+
+inline static hipError_t hipMemsetD16Async(hipDeviceptr_t dest, unsigned short value, size_t sizeBytes,
+ hipStream_t stream __dparm(0)) {
+ return hipCUResultTohipError(cuMemsetD16Async(dest, value, sizeBytes, stream));
+}
+
+inline static hipError_t hipMemset2D(void* dst, size_t pitch, int value, size_t width, size_t height) {
+ return hipCUDAErrorTohipError(cudaMemset2D(dst, pitch, value, width, height));
+}
+
+inline static hipError_t hipMemset2DAsync(void* dst, size_t pitch, int value, size_t width, size_t height, hipStream_t stream __dparm(0)) {
+ return hipCUDAErrorTohipError(cudaMemset2DAsync(dst, pitch, value, width, height, stream));
+}
+
+inline static hipError_t hipMemset3D(hipPitchedPtr pitchedDevPtr, int value, hipExtent extent ){
+ return hipCUDAErrorTohipError(cudaMemset3D(pitchedDevPtr, value, extent));
+}
+
+inline static hipError_t hipMemset3DAsync(hipPitchedPtr pitchedDevPtr, int value, hipExtent extent, hipStream_t stream __dparm(0) ){
+ return hipCUDAErrorTohipError(cudaMemset3DAsync(pitchedDevPtr, value, extent, stream));
+}
+
+// Fills *p_prop from the CUDA properties of `device`.
+//
+// Queries cudaGetDeviceProperties() and copies the fields listed below. The
+// arch.* feature bits are derived from the compute capability encoded as
+// major * 100 + minor * 10; gcnArch and the cooperativeMultiDeviceUnmatched*
+// fields are set to 0.
+//
+// p_prop: destination structure. A NULL pointer is rejected with the
+//         translated cudaErrorInvalidValue.
+// device: ordinal forwarded to cudaGetDeviceProperties().
+// Returns the translated CUDA status. When the CUDA query fails, its error is
+// returned and *p_prop is left untouched, so the local cudaDeviceProp is never
+// read while uninitialised.
+inline static hipError_t hipGetDeviceProperties(hipDeviceProp_t* p_prop, int device) {
+    if (p_prop == NULL) {
+        return hipCUDAErrorTohipError(cudaErrorInvalidValue);
+    }
+
+    struct cudaDeviceProp cdprop;
+    cudaError_t cerror = cudaGetDeviceProperties(&cdprop, device);
+    if (cerror != cudaSuccess) {
+        // cdprop holds no valid data on failure; do not copy from it.
+        return hipCUDAErrorTohipError(cerror);
+    }
+
+    strncpy(p_prop->name, cdprop.name, 256);
+    p_prop->totalGlobalMem = cdprop.totalGlobalMem;
+    p_prop->sharedMemPerBlock = cdprop.sharedMemPerBlock;
+    p_prop->regsPerBlock = cdprop.regsPerBlock;
+    p_prop->warpSize = cdprop.warpSize;
+    p_prop->maxThreadsPerBlock = cdprop.maxThreadsPerBlock;
+    for (int i = 0; i < 3; i++) {
+        p_prop->maxThreadsDim[i] = cdprop.maxThreadsDim[i];
+        p_prop->maxGridSize[i] = cdprop.maxGridSize[i];
+    }
+    p_prop->clockRate = cdprop.clockRate;
+    p_prop->memoryClockRate = cdprop.memoryClockRate;
+    p_prop->memoryBusWidth = cdprop.memoryBusWidth;
+    p_prop->totalConstMem = cdprop.totalConstMem;
+    p_prop->major = cdprop.major;
+    p_prop->minor = cdprop.minor;
+    p_prop->multiProcessorCount = cdprop.multiProcessorCount;
+    p_prop->l2CacheSize = cdprop.l2CacheSize;
+    p_prop->maxThreadsPerMultiProcessor = cdprop.maxThreadsPerMultiProcessor;
+    p_prop->computeMode = cdprop.computeMode;
+    p_prop->clockInstructionRate = cdprop.clockRate;  // Same as clock-rate:
+
+    // Compute capability folded into one integer, e.g. 3.5 -> 350.
+    int ccVers = p_prop->major * 100 + p_prop->minor * 10;
+    p_prop->arch.hasGlobalInt32Atomics = (ccVers >= 110);
+    p_prop->arch.hasGlobalFloatAtomicExch = (ccVers >= 110);
+    p_prop->arch.hasSharedInt32Atomics = (ccVers >= 120);
+    p_prop->arch.hasSharedFloatAtomicExch = (ccVers >= 120);
+    p_prop->arch.hasFloatAtomicAdd = (ccVers >= 200);
+    p_prop->arch.hasGlobalInt64Atomics = (ccVers >= 120);
+    // NOTE(review): this threshold is lower than the global 64-bit one above;
+    // confirm it is intended.
+    p_prop->arch.hasSharedInt64Atomics = (ccVers >= 110);
+    p_prop->arch.hasDoubles = (ccVers >= 130);
+    p_prop->arch.hasWarpVote = (ccVers >= 120);
+    p_prop->arch.hasWarpBallot = (ccVers >= 200);
+    p_prop->arch.hasWarpShuffle = (ccVers >= 300);
+    p_prop->arch.hasFunnelShift = (ccVers >= 350);
+    p_prop->arch.hasThreadFenceSystem = (ccVers >= 200);
+    p_prop->arch.hasSyncThreadsExt = (ccVers >= 200);
+    p_prop->arch.hasSurfaceFuncs = (ccVers >= 200);
+    p_prop->arch.has3dGrid = (ccVers >= 200);
+    p_prop->arch.hasDynamicParallelism = (ccVers >= 350);
+
+    p_prop->concurrentKernels = cdprop.concurrentKernels;
+    p_prop->pciDomainID = cdprop.pciDomainID;
+    p_prop->pciBusID = cdprop.pciBusID;
+    p_prop->pciDeviceID = cdprop.pciDeviceID;
+    p_prop->maxSharedMemoryPerMultiProcessor = cdprop.sharedMemPerMultiprocessor;
+    p_prop->isMultiGpuBoard = cdprop.isMultiGpuBoard;
+    p_prop->canMapHostMemory = cdprop.canMapHostMemory;
+    p_prop->gcnArch = 0;  // Not a GCN arch
+    p_prop->integrated = cdprop.integrated;
+    p_prop->cooperativeLaunch = cdprop.cooperativeLaunch;
+    p_prop->cooperativeMultiDeviceLaunch = cdprop.cooperativeMultiDeviceLaunch;
+    p_prop->cooperativeMultiDeviceUnmatchedFunc = 0;
+    p_prop->cooperativeMultiDeviceUnmatchedGridDim = 0;
+    p_prop->cooperativeMultiDeviceUnmatchedBlockDim = 0;
+    p_prop->cooperativeMultiDeviceUnmatchedSharedMem = 0;
+
+    p_prop->maxTexture1D = cdprop.maxTexture1D;
+    p_prop->maxTexture2D[0] = cdprop.maxTexture2D[0];
+    p_prop->maxTexture2D[1] = cdprop.maxTexture2D[1];
+    p_prop->maxTexture3D[0] = cdprop.maxTexture3D[0];
+    p_prop->maxTexture3D[1] = cdprop.maxTexture3D[1];
+    p_prop->maxTexture3D[2] = cdprop.maxTexture3D[2];
+
+    p_prop->memPitch = cdprop.memPitch;
+    p_prop->textureAlignment = cdprop.textureAlignment;
+    p_prop->texturePitchAlignment = cdprop.texturePitchAlignment;
+    p_prop->kernelExecTimeoutEnabled = cdprop.kernelExecTimeoutEnabled;
+    p_prop->ECCEnabled = cdprop.ECCEnabled;
+    p_prop->tccDriver = cdprop.tccDriver;
+
+    return hipCUDAErrorTohipError(cerror);
+}
+
+// Reads one integer device attribute through cudaDeviceGetAttribute().
+//
+// `attr` is mapped to its cudaDeviceAttr counterpart by the table below. A
+// value with no entry yields the translated cudaErrorInvalidValue without
+// calling into CUDA. Otherwise `pi` and `device` are forwarded as-is and the
+// CUDA status is translated to hipError_t.
+inline static hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int device) {
+    enum cudaDeviceAttr cudaAttr;
+
+    switch (attr) {
+        case hipDeviceAttributeMaxThreadsPerBlock: cudaAttr = cudaDevAttrMaxThreadsPerBlock; break;
+        case hipDeviceAttributeMaxBlockDimX: cudaAttr = cudaDevAttrMaxBlockDimX; break;
+        case hipDeviceAttributeMaxBlockDimY: cudaAttr = cudaDevAttrMaxBlockDimY; break;
+        case hipDeviceAttributeMaxBlockDimZ: cudaAttr = cudaDevAttrMaxBlockDimZ; break;
+        case hipDeviceAttributeMaxGridDimX: cudaAttr = cudaDevAttrMaxGridDimX; break;
+        case hipDeviceAttributeMaxGridDimY: cudaAttr = cudaDevAttrMaxGridDimY; break;
+        case hipDeviceAttributeMaxGridDimZ: cudaAttr = cudaDevAttrMaxGridDimZ; break;
+        case hipDeviceAttributeMaxSharedMemoryPerBlock: cudaAttr = cudaDevAttrMaxSharedMemoryPerBlock; break;
+        case hipDeviceAttributeTotalConstantMemory: cudaAttr = cudaDevAttrTotalConstantMemory; break;
+        case hipDeviceAttributeWarpSize: cudaAttr = cudaDevAttrWarpSize; break;
+        case hipDeviceAttributeMaxRegistersPerBlock: cudaAttr = cudaDevAttrMaxRegistersPerBlock; break;
+        case hipDeviceAttributeClockRate: cudaAttr = cudaDevAttrClockRate; break;
+        case hipDeviceAttributeMemoryClockRate: cudaAttr = cudaDevAttrMemoryClockRate; break;
+        case hipDeviceAttributeMemoryBusWidth: cudaAttr = cudaDevAttrGlobalMemoryBusWidth; break;
+        case hipDeviceAttributeMultiprocessorCount: cudaAttr = cudaDevAttrMultiProcessorCount; break;
+        case hipDeviceAttributeComputeMode: cudaAttr = cudaDevAttrComputeMode; break;
+        case hipDeviceAttributeL2CacheSize: cudaAttr = cudaDevAttrL2CacheSize; break;
+        case hipDeviceAttributeMaxThreadsPerMultiProcessor: cudaAttr = cudaDevAttrMaxThreadsPerMultiProcessor; break;
+        case hipDeviceAttributeComputeCapabilityMajor: cudaAttr = cudaDevAttrComputeCapabilityMajor; break;
+        case hipDeviceAttributeComputeCapabilityMinor: cudaAttr = cudaDevAttrComputeCapabilityMinor; break;
+        case hipDeviceAttributeConcurrentKernels: cudaAttr = cudaDevAttrConcurrentKernels; break;
+        case hipDeviceAttributePciBusId: cudaAttr = cudaDevAttrPciBusId; break;
+        case hipDeviceAttributePciDeviceId: cudaAttr = cudaDevAttrPciDeviceId; break;
+        case hipDeviceAttributeMaxSharedMemoryPerMultiprocessor: cudaAttr = cudaDevAttrMaxSharedMemoryPerMultiprocessor; break;
+        case hipDeviceAttributeIsMultiGpuBoard: cudaAttr = cudaDevAttrIsMultiGpuBoard; break;
+        case hipDeviceAttributeIntegrated: cudaAttr = cudaDevAttrIntegrated; break;
+        case hipDeviceAttributeMaxTexture1DWidth: cudaAttr = cudaDevAttrMaxTexture1DWidth; break;
+        case hipDeviceAttributeMaxTexture2DWidth: cudaAttr = cudaDevAttrMaxTexture2DWidth; break;
+        case hipDeviceAttributeMaxTexture2DHeight: cudaAttr = cudaDevAttrMaxTexture2DHeight; break;
+        case hipDeviceAttributeMaxTexture3DWidth: cudaAttr = cudaDevAttrMaxTexture3DWidth; break;
+        case hipDeviceAttributeMaxTexture3DHeight: cudaAttr = cudaDevAttrMaxTexture3DHeight; break;
+        case hipDeviceAttributeMaxTexture3DDepth: cudaAttr = cudaDevAttrMaxTexture3DDepth; break;
+        case hipDeviceAttributeMaxPitch: cudaAttr = cudaDevAttrMaxPitch; break;
+        case hipDeviceAttributeTextureAlignment: cudaAttr = cudaDevAttrTextureAlignment; break;
+        case hipDeviceAttributeTexturePitchAlignment: cudaAttr = cudaDevAttrTexturePitchAlignment; break;
+        case hipDeviceAttributeKernelExecTimeout: cudaAttr = cudaDevAttrKernelExecTimeout; break;
+        case hipDeviceAttributeCanMapHostMemory: cudaAttr = cudaDevAttrCanMapHostMemory; break;
+        case hipDeviceAttributeEccEnabled: cudaAttr = cudaDevAttrEccEnabled; break;
+        case hipDeviceAttributeCooperativeLaunch: cudaAttr = cudaDevAttrCooperativeLaunch; break;
+        case hipDeviceAttributeCooperativeMultiDeviceLaunch: cudaAttr = cudaDevAttrCooperativeMultiDeviceLaunch; break;
+        default:
+            // No entry in the table above for this attribute.
+            return hipCUDAErrorTohipError(cudaErrorInvalidValue);
+    }
+
+    return hipCUDAErrorTohipError(cudaDeviceGetAttribute(pi, cudaAttr, device));
+}
+
+inline static hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks,
+ const void* func,
+ int blockSize,
+ size_t dynamicSMemSize) {
+ return hipCUDAErrorTohipError(cudaOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, func,
+ blockSize, dynamicSMemSize));
+}
+
+inline static hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int* numBlocks,
+ const void* func,
+ int blockSize,
+ size_t dynamicSMemSize,
+ unsigned int flags) {
+ return hipCUDAErrorTohipError(cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(numBlocks, func,
+ blockSize, dynamicSMemSize, flags));
+}
+
+inline static hipError_t hipModuleOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks,
+ hipFunction_t f,
+ int blockSize,
+ size_t dynamicSMemSize ){
+ return hipCUResultTohipError(cuOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, f,
+ blockSize, dynamicSMemSize));
+}
+
+inline static hipError_t hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int* numBlocks,
+ hipFunction_t f,
+ int blockSize,
+ size_t dynamicSMemSize,
+ unsigned int flags ) {
+ return hipCUResultTohipError(cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(numBlocks,f,
+ blockSize, dynamicSMemSize, flags));
+}
+
+// Forwards to cuOccupancyMaxPotentialBlockSize() and translates the CUresult.
+// gridSize, blockSize, f, dynSharedMemPerBlk and blockSizeLimit are passed
+// through unchanged.
+// TODO: match CUoccupancyB2DSize - this wrapper has no parameter for it, so
+// NULL is passed as the fourth argument of the driver call.
+inline static hipError_t hipModuleOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize,
+ hipFunction_t f, size_t dynSharedMemPerBlk,
+ int blockSizeLimit){
+ return hipCUResultTohipError(cuOccupancyMaxPotentialBlockSize(gridSize, blockSize, f, NULL,
+ dynSharedMemPerBlk, blockSizeLimit));
+}
+
+// Forwards to cuOccupancyMaxPotentialBlockSizeWithFlags() and translates the
+// CUresult. gridSize, blockSize, f, dynSharedMemPerBlk, blockSizeLimit and
+// flags are passed through unchanged.
+// TODO: match CUoccupancyB2DSize - this wrapper has no parameter for it, so
+// NULL is passed as the fourth argument of the driver call.
+inline static hipError_t hipModuleOccupancyMaxPotentialBlockSizeWithFlags(int* gridSize, int* blockSize,
+ hipFunction_t f, size_t dynSharedMemPerBlk,
+ int blockSizeLimit, unsigned int flags){
+ return hipCUResultTohipError(cuOccupancyMaxPotentialBlockSizeWithFlags(gridSize, blockSize, f, NULL,
+ dynSharedMemPerBlk, blockSizeLimit, flags));
+}
+
+// Describes `ptr` by querying cudaPointerGetAttributes() and translating the
+// result into *attributes.
+//
+// On a successful query the CUDA memory type is mapped to hipMemoryTypeDevice
+// or hipMemoryTypeHost, and device, devicePointer and hostPointer are copied
+// across. isManaged and allocationFlags are always written as 0.
+// Returns the translated CUDA status when the query fails (nothing is written
+// to *attributes), or hipErrorUnknown when the memory type is neither device
+// nor host.
+inline static hipError_t hipPointerGetAttributes(hipPointerAttribute_t* attributes, const void* ptr) {
+ struct cudaPointerAttributes cPA;
+ hipError_t err = hipCUDAErrorTohipError(cudaPointerGetAttributes(&cPA, ptr));
+ if (err == hipSuccess) {
+// The memory-type field is read from `type` for CUDART_VERSION >= 11000 and
+// from `memoryType` otherwise.
+#if (CUDART_VERSION >= 11000)
+ auto memType = cPA.type;
+#else
+ unsigned memType = cPA.memoryType; // No auto because cuda 10.2 doesn't force c++11
+#endif
+ switch (memType) {
+ case cudaMemoryTypeDevice:
+ attributes->memoryType = hipMemoryTypeDevice;
+ break;
+ case cudaMemoryTypeHost:
+ attributes->memoryType = hipMemoryTypeHost;
+ break;
+ default:
+ // Early return: the field copies below are skipped for this case.
+ // NOTE(review): any other CUDA memory type (e.g. managed, if the runtime
+ // reports one) lands here - confirm that is the intended behavior.
+ return hipErrorUnknown;
+ }
+ attributes->device = cPA.device;
+ attributes->devicePointer = cPA.devicePointer;
+ attributes->hostPointer = cPA.hostPointer;
+ attributes->isManaged = 0;
+ attributes->allocationFlags = 0;
+ }
+ return err;
+}
+
+inline static hipError_t hipMemGetInfo(size_t* free, size_t* total) {
+ return hipCUDAErrorTohipError(cudaMemGetInfo(free, total));
+}
+
+inline static hipError_t hipEventCreate(hipEvent_t* event) {
+ return hipCUDAErrorTohipError(cudaEventCreate(event));
+}
+
+inline static hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream __dparm(NULL)) {
+ return hipCUDAErrorTohipError(cudaEventRecord(event, stream));
+}
+
+inline static hipError_t hipEventSynchronize(hipEvent_t event) {
+ return hipCUDAErrorTohipError(cudaEventSynchronize(event));
+}
+
+inline static hipError_t hipEventElapsedTime(float* ms, hipEvent_t start, hipEvent_t stop) {
+ return hipCUDAErrorTohipError(cudaEventElapsedTime(ms, start, stop));
+}
+
+inline static hipError_t hipEventDestroy(hipEvent_t event) {
+ return hipCUDAErrorTohipError(cudaEventDestroy(event));
+}
+
+inline static hipError_t hipStreamCreateWithFlags(hipStream_t* stream, unsigned int flags) {
+ return hipCUDAErrorTohipError(cudaStreamCreateWithFlags(stream, flags));
+}
+
+inline static hipError_t hipStreamCreateWithPriority(hipStream_t* stream, unsigned int flags, int priority) {
+ return hipCUDAErrorTohipError(cudaStreamCreateWithPriority(stream, flags, priority));
+}
+
+inline static hipError_t hipDeviceGetStreamPriorityRange(int* leastPriority, int* greatestPriority) {
+ return hipCUDAErrorTohipError(cudaDeviceGetStreamPriorityRange(leastPriority, greatestPriority));
+}
+
+inline static hipError_t hipStreamCreate(hipStream_t* stream) {
+ return hipCUDAErrorTohipError(cudaStreamCreate(stream));
+}
+
+inline static hipError_t hipStreamSynchronize(hipStream_t stream) {
+ return hipCUDAErrorTohipError(cudaStreamSynchronize(stream));
+}
+
+inline static hipError_t hipStreamDestroy(hipStream_t stream) {
+ return hipCUDAErrorTohipError(cudaStreamDestroy(stream));
+}
+
+inline static hipError_t hipStreamGetFlags(hipStream_t stream, unsigned int *flags) {
+ return hipCUDAErrorTohipError(cudaStreamGetFlags(stream, flags));
+}
+
+inline static hipError_t hipStreamGetPriority(hipStream_t stream, int *priority) {
+ return hipCUDAErrorTohipError(cudaStreamGetPriority(stream, priority));
+}
+
+inline static hipError_t hipStreamWaitEvent(hipStream_t stream, hipEvent_t event,
+ unsigned int flags) {
+ return hipCUDAErrorTohipError(cudaStreamWaitEvent(stream, event, flags));
+}
+
+inline static hipError_t hipStreamQuery(hipStream_t stream) {
+ return hipCUDAErrorTohipError(cudaStreamQuery(stream));
+}
+
+// Registers `callback` on `stream` through cudaStreamAddCallback() and
+// translates the CUDA status. The HIP callback pointer is cast to
+// cudaStreamCallback_t; userData and flags are forwarded unchanged.
+inline static hipError_t hipStreamAddCallback(hipStream_t stream, hipStreamCallback_t callback,
+ void* userData, unsigned int flags) {
+ return hipCUDAErrorTohipError(
+ cudaStreamAddCallback(stream, (cudaStreamCallback_t)callback, userData, flags));
+}
+
+// Reports the driver version.
+//
+// cudaDriverGetVersion() is still called so that its status can be returned,
+// but the value it stores is then overwritten with the fixed value 4.
+//
+// driverVersion: receives 4 whenever it is non-NULL, regardless of the CUDA
+//                status. A NULL pointer is no longer written through; the
+//                status from cudaDriverGetVersion() is returned for it.
+// Returns the translated status of cudaDriverGetVersion().
+inline static hipError_t hipDriverGetVersion(int* driverVersion) {
+    cudaError_t err = cudaDriverGetVersion(driverVersion);
+
+    // Override driver version to match version reported on HCC side.
+    if (driverVersion != NULL) {
+        *driverVersion = 4;
+    }
+
+    return hipCUDAErrorTohipError(err);
+}
+
+inline static hipError_t hipRuntimeGetVersion(int* runtimeVersion) {
+ return hipCUDAErrorTohipError(cudaRuntimeGetVersion(runtimeVersion));
+}
+
+inline static hipError_t hipDeviceCanAccessPeer(int* canAccessPeer, int device, int peerDevice) {
+ return hipCUDAErrorTohipError(cudaDeviceCanAccessPeer(canAccessPeer, device, peerDevice));
+}
+
+inline static hipError_t hipDeviceDisablePeerAccess(int peerDevice) {
+ return hipCUDAErrorTohipError(cudaDeviceDisablePeerAccess(peerDevice));
+}
+
+inline static hipError_t hipDeviceEnablePeerAccess(int peerDevice, unsigned int flags) {
+ return hipCUDAErrorTohipError(cudaDeviceEnablePeerAccess(peerDevice, flags));
+}
+
+inline static hipError_t hipCtxDisablePeerAccess(hipCtx_t peerCtx) {
+ return hipCUResultTohipError(cuCtxDisablePeerAccess(peerCtx));
+}
+
+inline static hipError_t hipCtxEnablePeerAccess(hipCtx_t peerCtx, unsigned int flags) {
+ return hipCUResultTohipError(cuCtxEnablePeerAccess(peerCtx, flags));
+}
+
+inline static hipError_t hipDevicePrimaryCtxGetState(hipDevice_t dev, unsigned int* flags,
+ int* active) {
+ return hipCUResultTohipError(cuDevicePrimaryCtxGetState(dev, flags, active));
+}
+
+inline static hipError_t hipDevicePrimaryCtxRelease(hipDevice_t dev) {
+ return hipCUResultTohipError(cuDevicePrimaryCtxRelease(dev));
+}
+
+inline static hipError_t hipDevicePrimaryCtxRetain(hipCtx_t* pctx, hipDevice_t dev) {
+ return hipCUResultTohipError(cuDevicePrimaryCtxRetain(pctx, dev));
+}
+
+inline static hipError_t hipDevicePrimaryCtxReset(hipDevice_t dev) {
+ return hipCUResultTohipError(cuDevicePrimaryCtxReset(dev));
+}
+
+inline static hipError_t hipDevicePrimaryCtxSetFlags(hipDevice_t dev, unsigned int flags) {
+ return hipCUResultTohipError(cuDevicePrimaryCtxSetFlags(dev, flags));
+}
+
+inline static hipError_t hipMemGetAddressRange(hipDeviceptr_t* pbase, size_t* psize,
+ hipDeviceptr_t dptr) {
+ return hipCUResultTohipError(cuMemGetAddressRange(pbase, psize, dptr));
+}
+
+inline static hipError_t hipMemcpyPeer(void* dst, int dstDevice, const void* src, int srcDevice,
+ size_t count) {
+ return hipCUDAErrorTohipError(cudaMemcpyPeer(dst, dstDevice, src, srcDevice, count));
+}
+
+inline static hipError_t hipMemcpyPeerAsync(void* dst, int dstDevice, const void* src,
+ int srcDevice, size_t count,
+ hipStream_t stream __dparm(0)) {
+ return hipCUDAErrorTohipError(
+ cudaMemcpyPeerAsync(dst, dstDevice, src, srcDevice, count, stream));
+}
+
+// Profile APIs:
+inline static hipError_t hipProfilerStart() { return hipCUDAErrorTohipError(cudaProfilerStart()); }
+
+inline static hipError_t hipProfilerStop() { return hipCUDAErrorTohipError(cudaProfilerStop()); }
+
+inline static hipError_t hipGetDeviceFlags(unsigned int* flags) {
+ return hipCUDAErrorTohipError(cudaGetDeviceFlags(flags));
+}
+
+inline static hipError_t hipSetDeviceFlags(unsigned int flags) {
+ return hipCUDAErrorTohipError(cudaSetDeviceFlags(flags));
+}
+
+inline static hipError_t hipEventCreateWithFlags(hipEvent_t* event, unsigned int flags) {
+ return hipCUDAErrorTohipError(cudaEventCreateWithFlags(event, flags));
+}
+
+inline static hipError_t hipEventQuery(hipEvent_t event) {
+ return hipCUDAErrorTohipError(cudaEventQuery(event));
+}
+
+inline static hipError_t hipCtxCreate(hipCtx_t* ctx, unsigned int flags, hipDevice_t device) {
+ return hipCUResultTohipError(cuCtxCreate(ctx, flags, device));
+}
+
+inline static hipError_t hipCtxDestroy(hipCtx_t ctx) {
+ return hipCUResultTohipError(cuCtxDestroy(ctx));
+}
+
+inline static hipError_t hipCtxPopCurrent(hipCtx_t* ctx) {
+ return hipCUResultTohipError(cuCtxPopCurrent(ctx));
+}
+
+inline static hipError_t hipCtxPushCurrent(hipCtx_t ctx) {
+ return hipCUResultTohipError(cuCtxPushCurrent(ctx));
+}
+
+inline static hipError_t hipCtxSetCurrent(hipCtx_t ctx) {
+ return hipCUResultTohipError(cuCtxSetCurrent(ctx));
+}
+
+inline static hipError_t hipCtxGetCurrent(hipCtx_t* ctx) {
+ return hipCUResultTohipError(cuCtxGetCurrent(ctx));
+}
+
+inline static hipError_t hipCtxGetDevice(hipDevice_t* device) {
+ return hipCUResultTohipError(cuCtxGetDevice(device));
+}
+
+// Forwards to cuCtxGetApiVersion() and translates the CUresult. The driver
+// call takes an unsigned int*, so apiVersion is cast from int* before being
+// passed through.
+inline static hipError_t hipCtxGetApiVersion(hipCtx_t ctx, int* apiVersion) {
+ return hipCUResultTohipError(cuCtxGetApiVersion(ctx, (unsigned int*)apiVersion));
+}
+
+inline static hipError_t hipCtxGetCacheConfig(hipFuncCache* cacheConfig) {
+ return hipCUResultTohipError(cuCtxGetCacheConfig(cacheConfig));
+}
+
+inline static hipError_t hipCtxSetCacheConfig(hipFuncCache cacheConfig) {
+ return hipCUResultTohipError(cuCtxSetCacheConfig(cacheConfig));
+}
+
+inline static hipError_t hipCtxSetSharedMemConfig(hipSharedMemConfig config) {
+ return hipCUResultTohipError(cuCtxSetSharedMemConfig((CUsharedconfig)config));
+}
+
+inline static hipError_t hipCtxGetSharedMemConfig(hipSharedMemConfig* pConfig) {
+ return hipCUResultTohipError(cuCtxGetSharedMemConfig((CUsharedconfig*)pConfig));
+}
+
+inline static hipError_t hipCtxSynchronize(void) {
+ return hipCUResultTohipError(cuCtxSynchronize());
+}
+
+inline static hipError_t hipCtxGetFlags(unsigned int* flags) {
+ return hipCUResultTohipError(cuCtxGetFlags(flags));
+}
+
+inline static hipError_t hipCtxDetach(hipCtx_t ctx) {
+ return hipCUResultTohipError(cuCtxDetach(ctx));
+}
+
+inline static hipError_t hipDeviceGet(hipDevice_t* device, int ordinal) {
+ return hipCUResultTohipError(cuDeviceGet(device, ordinal));
+}
+
+inline static hipError_t hipDeviceComputeCapability(int* major, int* minor, hipDevice_t device) {  // Forwards the two out-pointers and device to cuDeviceComputeCapability.
+ return hipCUResultTohipError(cuDeviceComputeCapability(major, minor, device));
+}
+
+inline static hipError_t hipDeviceGetName(char* name, int len, hipDevice_t device) {  // Forwards the buffer, its length and device to cuDeviceGetName.
+ return hipCUResultTohipError(cuDeviceGetName(name, len, device));
+}
+
+inline static hipError_t hipDeviceGetP2PAttribute(int* value, hipDeviceP2PAttr attr,
+ int srcDevice, int dstDevice) {  // Runtime-API shim: forwards to cudaDeviceGetP2PAttribute; cudaError_t is mapped by hipCUDAErrorTohipError.
+ return hipCUDAErrorTohipError(cudaDeviceGetP2PAttribute(value, attr, srcDevice, dstDevice));
+}
+
+inline static hipError_t hipDeviceGetPCIBusId(char* pciBusId, int len, hipDevice_t device) {  // Forwards the buffer, its length and device to cudaDeviceGetPCIBusId.
+ return hipCUDAErrorTohipError(cudaDeviceGetPCIBusId(pciBusId, len, device));
+}
+
+inline static hipError_t hipDeviceGetByPCIBusId(int* device, const char* pciBusId) {  // Forwards to cudaDeviceGetByPCIBusId; returns the converted runtime status.
+ return hipCUDAErrorTohipError(cudaDeviceGetByPCIBusId(device, pciBusId));
+}
+
+inline static hipError_t hipDeviceGetSharedMemConfig(hipSharedMemConfig* config) {  // Passes the out-pointer to cudaDeviceGetSharedMemConfig uncast; returns the converted runtime status.
+ return hipCUDAErrorTohipError(cudaDeviceGetSharedMemConfig(config));
+}
+
+inline static hipError_t hipDeviceSetSharedMemConfig(hipSharedMemConfig config) {  // Passes config to cudaDeviceSetSharedMemConfig uncast; returns the converted runtime status.
+ return hipCUDAErrorTohipError(cudaDeviceSetSharedMemConfig(config));
+}
+
+inline static hipError_t hipDeviceGetLimit(size_t* pValue, hipLimit_t limit) {  // Forwards (pValue, limit) to cudaDeviceGetLimit; returns the converted runtime status.
+ return hipCUDAErrorTohipError(cudaDeviceGetLimit(pValue, limit));
+}
+
+inline static hipError_t hipDeviceTotalMem(size_t* bytes, hipDevice_t device) {  // Forwards (bytes, device) to cuDeviceTotalMem; returns the converted driver status.
+ return hipCUResultTohipError(cuDeviceTotalMem(bytes, device));
+}
+
+inline static hipError_t hipModuleLoad(hipModule_t* module, const char* fname) {  // Forwards (module, fname) to cuModuleLoad; returns the converted driver status.
+ return hipCUResultTohipError(cuModuleLoad(module, fname));
+}
+
+inline static hipError_t hipModuleUnload(hipModule_t hmod) {  // Forwards hmod to cuModuleUnload; returns the converted driver status.
+ return hipCUResultTohipError(cuModuleUnload(hmod));
+}
+
+inline static hipError_t hipModuleGetFunction(hipFunction_t* function, hipModule_t module,
+ const char* kname) {  // Forwards (function, module, kname) to cuModuleGetFunction; returns the converted driver status.
+ return hipCUResultTohipError(cuModuleGetFunction(function, module, kname));
+}
+
+inline static hipError_t hipModuleGetTexRef(hipTexRef* pTexRef, hipModule_t hmod, const char* name){  // Forwards (pTexRef, hmod, name) to cuModuleGetTexRef; returns the converted driver status.
+ return hipCUResultTohipError(cuModuleGetTexRef(pTexRef, hmod, name));  // `return` was missing: the status was discarded and control fell off the end of a non-void function
+}
+
+inline static hipError_t hipFuncGetAttributes(hipFuncAttributes* attr, const void* func) {  // Runtime-API shim: forwards (attr, func) to cudaFuncGetAttributes.
+ return hipCUDAErrorTohipError(cudaFuncGetAttributes(attr, func));
+}
+
+inline static hipError_t hipFuncGetAttribute (int* value, hipFunction_attribute attrib, hipFunction_t hfunc) {  // Driver-API shim: forwards (value, attrib, hfunc) to cuFuncGetAttribute.
+ return hipCUResultTohipError(cuFuncGetAttribute(value, attrib, hfunc));
+}
+
+inline static hipError_t hipModuleGetGlobal(hipDeviceptr_t* dptr, size_t* bytes, hipModule_t hmod,
+ const char* name) {  // Forwards (dptr, bytes, hmod, name) to cuModuleGetGlobal; returns the converted driver status.
+ return hipCUResultTohipError(cuModuleGetGlobal(dptr, bytes, hmod, name));
+}
+
+inline static hipError_t hipModuleLoadData(hipModule_t* module, const void* image) {  // Forwards (module, image) to cuModuleLoadData; returns the converted driver status.
+ return hipCUResultTohipError(cuModuleLoadData(module, image));
+}
+
+inline static hipError_t hipModuleLoadDataEx(hipModule_t* module, const void* image,
+ unsigned int numOptions, hipJitOption* options,
+ void** optionValues) {  // Forwards all five arguments, in order, to cuModuleLoadDataEx.
+ return hipCUResultTohipError(
+ cuModuleLoadDataEx(module, image, numOptions, options, optionValues));
+}
+
+inline static hipError_t hipLaunchKernel(const void* function_address, dim3 numBlocks,
+ dim3 dimBlocks, void** args, size_t sharedMemBytes,
+ hipStream_t stream)
+{  // Runtime-API shim: forwards all six arguments, in order, to cudaLaunchKernel.
+ return hipCUDAErrorTohipError(cudaLaunchKernel(function_address,numBlocks,dimBlocks,args,sharedMemBytes,stream));
+}
+
+inline static hipError_t hipModuleLaunchKernel(hipFunction_t f, unsigned int gridDimX,
+ unsigned int gridDimY, unsigned int gridDimZ,
+ unsigned int blockDimX, unsigned int blockDimY,
+ unsigned int blockDimZ, unsigned int sharedMemBytes,
+ hipStream_t stream, void** kernelParams,
+ void** extra) {  // Driver-API shim: forwards all eleven arguments, in order, to cuLaunchKernel.
+ return hipCUResultTohipError(cuLaunchKernel(f, gridDimX, gridDimY, gridDimZ, blockDimX,
+ blockDimY, blockDimZ, sharedMemBytes, stream,
+ kernelParams, extra));
+}
+
+inline static hipError_t hipFuncSetCacheConfig(const void* func, hipFuncCache_t cacheConfig) {  // Runtime-API shim: forwards (func, cacheConfig) to cudaFuncSetCacheConfig.
+ return hipCUDAErrorTohipError(cudaFuncSetCacheConfig(func, cacheConfig));
+}
+
+__HIP_DEPRECATED inline static hipError_t hipBindTexture(size_t* offset,
+ struct textureReference* tex,
+ const void* devPtr,
+ const hipChannelFormatDesc* desc,
+ size_t size __dparm(UINT_MAX)) {  // Forwards to cudaBindTexture. NOTE(review): __dparm is defined outside this view; presumably it supplies UINT_MAX as a C++ default argument - confirm.
+ return hipCUDAErrorTohipError(cudaBindTexture(offset, tex, devPtr, desc, size));
+}
+
+__HIP_DEPRECATED inline static hipError_t hipBindTexture2D(
+ size_t* offset, struct textureReference* tex, const void* devPtr,
+ const hipChannelFormatDesc* desc, size_t width, size_t height, size_t pitch) {  // Forwards all seven arguments, in order, to cudaBindTexture2D.
+ return hipCUDAErrorTohipError(cudaBindTexture2D(offset, tex, devPtr, desc, width, height, pitch));
+}
+
+inline static hipChannelFormatDesc hipCreateChannelDesc(int x, int y, int z, int w,
+ hipChannelFormatKind f) {  // Returns the descriptor from cudaCreateChannelDesc directly (no error code); f is first translated by hipChannelFormatKindToCudaChannelFormatKind.
+ return cudaCreateChannelDesc(x, y, z, w, hipChannelFormatKindToCudaChannelFormatKind(f));
+}
+
+inline static hipError_t hipCreateTextureObject(hipTextureObject_t* pTexObject,
+ const hipResourceDesc* pResDesc,
+ const hipTextureDesc* pTexDesc,
+ const hipResourceViewDesc* pResViewDesc) {  // Forwards all four arguments, in order, to cudaCreateTextureObject.
+ return hipCUDAErrorTohipError(
+ cudaCreateTextureObject(pTexObject, pResDesc, pTexDesc, pResViewDesc));
+}
+
+inline static hipError_t hipDestroyTextureObject(hipTextureObject_t textureObject) {  // Forwards textureObject to cudaDestroyTextureObject; returns the converted runtime status.
+ return hipCUDAErrorTohipError(cudaDestroyTextureObject(textureObject));
+}
+
+inline static hipError_t hipCreateSurfaceObject(hipSurfaceObject_t* pSurfObject,
+ const hipResourceDesc* pResDesc) {  // Forwards (pSurfObject, pResDesc) to cudaCreateSurfaceObject; returns the converted runtime status.
+ return hipCUDAErrorTohipError(cudaCreateSurfaceObject(pSurfObject, pResDesc));
+}
+
+inline static hipError_t hipDestroySurfaceObject(hipSurfaceObject_t surfaceObject) {  // Forwards surfaceObject to cudaDestroySurfaceObject; returns the converted runtime status.
+ return hipCUDAErrorTohipError(cudaDestroySurfaceObject(surfaceObject));
+}
+
+inline static hipError_t hipGetTextureObjectResourceDesc(hipResourceDesc* pResDesc,
+ hipTextureObject_t textureObject) {  // Forwards (pResDesc, textureObject) to cudaGetTextureObjectResourceDesc.
+ return hipCUDAErrorTohipError(cudaGetTextureObjectResourceDesc( pResDesc, textureObject));
+}
+
+__HIP_DEPRECATED inline static hipError_t hipGetTextureAlignmentOffset(
+ size_t* offset, const struct textureReference* texref) {  // Forwards (offset, texref) to cudaGetTextureAlignmentOffset; returns the converted runtime status.
+ return hipCUDAErrorTohipError(cudaGetTextureAlignmentOffset(offset,texref));
+}
+
+inline static hipError_t hipGetChannelDesc(hipChannelFormatDesc* desc, hipArray_const_t array)
+{  // Forwards (desc, array) to cudaGetChannelDesc; returns the converted runtime status.
+ return hipCUDAErrorTohipError(cudaGetChannelDesc(desc,array));
+}
+
+inline static hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim, dim3 blockDim,
+ void** kernelParams, unsigned int sharedMemBytes,
+ hipStream_t stream) {  // Forwards all six arguments, in order, to cudaLaunchCooperativeKernel.
+ return hipCUDAErrorTohipError(
+ cudaLaunchCooperativeKernel(f, gridDim, blockDim, kernelParams, sharedMemBytes, stream));
+}
+
+inline static hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList,
+ int numDevices, unsigned int flags) {  // Forwards (launchParamsList, numDevices, flags) to cudaLaunchCooperativeKernelMultiDevice.
+ return hipCUDAErrorTohipError(cudaLaunchCooperativeKernelMultiDevice(launchParamsList, numDevices, flags));
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __CUDACC__
+
+template<class T>
+inline static hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks,
+ T func,
+ int blockSize,
+ size_t dynamicSMemSize) {  // Template shim (only compiled under __CUDACC__): forwards all four arguments to cudaOccupancyMaxActiveBlocksPerMultiprocessor.
+ return hipCUDAErrorTohipError(cudaOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, func,
+ blockSize, dynamicSMemSize));
+}
+
+template <class T>
+inline static hipError_t hipOccupancyMaxPotentialBlockSize(int* minGridSize, int* blockSize, T func,
+ size_t dynamicSMemSize = 0,
+ int blockSizeLimit = 0) {  // Forwards all five arguments to cudaOccupancyMaxPotentialBlockSize; the last two default to 0.
+ return hipCUDAErrorTohipError(cudaOccupancyMaxPotentialBlockSize(minGridSize, blockSize, func,
+ dynamicSMemSize, blockSizeLimit));
+}
+
+template <class T>
+inline static hipError_t hipOccupancyMaxPotentialBlockSizeWithFlags(int* minGridSize, int* blockSize, T func,
+ size_t dynamicSMemSize = 0,
+ int blockSizeLimit = 0, unsigned int flags = 0) {  // Forwards all six arguments to the CUDA *WithFlags entry point; the last three default to 0.
+ return hipCUDAErrorTohipError(cudaOccupancyMaxPotentialBlockSizeWithFlags(minGridSize, blockSize, func,  // was cudaOccupancyMaxPotentialBlockSize, which has no flags parameter
+ dynamicSMemSize, blockSizeLimit, flags));
+}
+
+template <class T>
+inline static hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( int* numBlocks, T func,
+ int blockSize, size_t dynamicSMemSize,unsigned int flags) {  // Forwards all five arguments to cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.
+ return hipCUDAErrorTohipError(cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(numBlocks, func,
+ blockSize, dynamicSMemSize, flags));
+}
+
+template <class T, int dim, enum cudaTextureReadMode readMode>
+inline static hipError_t hipBindTexture(size_t* offset, const struct texture<T, dim, readMode>& tex,
+ const void* devPtr, size_t size = UINT_MAX) {  // Typed-texture overload without an explicit channel descriptor; forwards to cudaBindTexture(offset, tex, devPtr, size).
+ return hipCUDAErrorTohipError(cudaBindTexture(offset, tex, devPtr, size));
+}
+
+template <class T, int dim, enum cudaTextureReadMode readMode>
+inline static hipError_t hipBindTexture(size_t* offset, struct texture<T, dim, readMode>& tex,
+ const void* devPtr, const hipChannelFormatDesc& desc,
+ size_t size = UINT_MAX) {  // Typed-texture overload taking desc by reference; forwards to cudaBindTexture(offset, tex, devPtr, desc, size).
+ return hipCUDAErrorTohipError(cudaBindTexture(offset, tex, devPtr, desc, size));
+}
+
+template <class T, int dim, enum cudaTextureReadMode readMode>
+__HIP_DEPRECATED inline static hipError_t hipUnbindTexture(struct texture<T, dim, readMode>* tex) {  // Pointer overload: forwards tex to cudaUnbindTexture.
+ return hipCUDAErrorTohipError(cudaUnbindTexture(tex));
+}
+
+template <class T, int dim, enum cudaTextureReadMode readMode>
+__HIP_DEPRECATED inline static hipError_t hipUnbindTexture(struct texture<T, dim, readMode>& tex) {  // Reference overload: forwards tex to cudaUnbindTexture.
+ return hipCUDAErrorTohipError(cudaUnbindTexture(tex));
+}
+
+template <class T, int dim, enum cudaTextureReadMode readMode>
+__HIP_DEPRECATED inline static hipError_t hipBindTextureToArray(
+ struct texture<T, dim, readMode>& tex, hipArray_const_t array,
+ const hipChannelFormatDesc& desc) {  // Reference overload with explicit desc: forwards (tex, array, desc) to cudaBindTextureToArray.
+ return hipCUDAErrorTohipError(cudaBindTextureToArray(tex, array, desc));
+}
+
+template <class T, int dim, enum cudaTextureReadMode readMode>
+__HIP_DEPRECATED inline static hipError_t hipBindTextureToArray(
+ struct texture<T, dim, readMode>* tex, hipArray_const_t array,
+ const hipChannelFormatDesc* desc) {  // Pointer overload: forwards (tex, array, desc) to cudaBindTextureToArray.
+ return hipCUDAErrorTohipError(cudaBindTextureToArray(tex, array, desc));
+}
+
+template <class T, int dim, enum cudaTextureReadMode readMode>
+__HIP_DEPRECATED inline static hipError_t hipBindTextureToArray(
+ struct texture<T, dim, readMode>& tex, hipArray_const_t array) {  // Reference overload without a descriptor: forwards (tex, array) to cudaBindTextureToArray.
+ return hipCUDAErrorTohipError(cudaBindTextureToArray(tex, array));
+}
+
+template <class T>
+inline static hipChannelFormatDesc hipCreateChannelDesc() {  // Returns cudaCreateChannelDesc<T>() directly; there is no error code to convert.
+ return cudaCreateChannelDesc<T>();
+}
+
+template <class T>
+inline static hipError_t hipLaunchCooperativeKernel(T f, dim3 gridDim, dim3 blockDim,
+ void** kernelParams, unsigned int sharedMemBytes, hipStream_t stream) {  // Typed overload: forwards to cudaLaunchCooperativeKernel with the same argument order.
+ return hipCUDAErrorTohipError(
+ cudaLaunchCooperativeKernel(reinterpret_cast<const void*>(f), gridDim, blockDim, kernelParams, sharedMemBytes, stream));  // f is reinterpret_cast to const void* for the runtime call
+}
+
+inline static hipError_t hipTexRefSetAddressMode(hipTexRef hTexRef, int dim, hipAddress_mode am){  // Driver-API shim: forwards (hTexRef, dim, am) to cuTexRefSetAddressMode.
+ return hipCUResultTohipError(cuTexRefSetAddressMode(hTexRef,dim,am));
+}
+
+inline static hipError_t hipTexRefSetFilterMode(hipTexRef hTexRef, hipFilter_mode fm){  // Driver-API shim: forwards (hTexRef, fm) to cuTexRefSetFilterMode.
+ return hipCUResultTohipError(cuTexRefSetFilterMode(hTexRef,fm));
+}
+
+inline static hipError_t hipTexRefSetAddress(size_t *ByteOffset, hipTexRef hTexRef, hipDeviceptr_t dptr, size_t bytes){  // Driver-API shim: forwards all four arguments to cuTexRefSetAddress.
+ return hipCUResultTohipError(cuTexRefSetAddress(ByteOffset,hTexRef,dptr,bytes));
+}
+
+inline static hipError_t hipTexRefSetAddress2D(hipTexRef hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, hipDeviceptr_t dptr, size_t Pitch){  // Driver-API shim: forwards all four arguments to cuTexRefSetAddress2D; desc uses the CUDA descriptor type directly.
+ return hipCUResultTohipError(cuTexRefSetAddress2D(hTexRef,desc,dptr,Pitch));
+}
+
+inline static hipError_t hipTexRefSetFormat(hipTexRef hTexRef, hipArray_Format fmt, int NumPackedComponents){  // Driver-API shim: forwards (hTexRef, fmt, NumPackedComponents) to cuTexRefSetFormat.
+ return hipCUResultTohipError(cuTexRefSetFormat(hTexRef,fmt,NumPackedComponents));
+}
+
+inline static hipError_t hipTexRefSetFlags(hipTexRef hTexRef, unsigned int Flags){  // Driver-API shim: forwards (hTexRef, Flags) to cuTexRefSetFlags.
+ return hipCUResultTohipError(cuTexRefSetFlags(hTexRef,Flags));
+}
+
+inline static hipError_t hipTexRefSetArray(hipTexRef hTexRef, hiparray hArray, unsigned int Flags){  // Driver-API shim: forwards (hTexRef, hArray, Flags) to cuTexRefSetArray.
+ return hipCUResultTohipError(cuTexRefSetArray(hTexRef,hArray,Flags));
+}
+
+inline static hipError_t hipArrayCreate(hiparray* pHandle, const HIP_ARRAY_DESCRIPTOR* pAllocateArray){  // Driver-API shim: forwards (pHandle, pAllocateArray) to cuArrayCreate.
+ return hipCUResultTohipError(cuArrayCreate(pHandle, pAllocateArray));
+}
+
+inline static hipError_t hipArrayDestroy(hiparray hArray){  // Driver-API shim: forwards hArray to cuArrayDestroy.
+ return hipCUResultTohipError(cuArrayDestroy(hArray));
+}
+
+#endif //__CUDACC__
+
+#endif // HIP_INCLUDE_HIP_NVCC_DETAIL_HIP_RUNTIME_API_H
diff --git a/third_party/rocm/include/hip/nvcc_detail/hip_texture_types.h b/third_party/rocm/include/hip/nvcc_detail/hip_texture_types.h
new file mode 100644
index 0000000..751dd8e
--- /dev/null
+++ b/third_party/rocm/include/hip/nvcc_detail/hip_texture_types.h
@@ -0,0 +1,6 @@
+#ifndef HIP_INCLUDE_HIP_NVCC_DETAIL_HIP_TEXTURE_TYPES_H
+#define HIP_INCLUDE_HIP_NVCC_DETAIL_HIP_TEXTURE_TYPES_H
+
+#include <texture_types.h>
+
+#endif
diff --git a/third_party/rocm/include/hip/texture_types.h b/third_party/rocm/include/hip/texture_types.h
new file mode 100644
index 0000000..7d78570
--- /dev/null
+++ b/third_party/rocm/include/hip/texture_types.h
@@ -0,0 +1,36 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_TEXTURE_TYPES_H
+#define HIP_INCLUDE_HIP_TEXTURE_TYPES_H
+
+#include <hip/hip_common.h>
+
+#if defined(__HIP_PLATFORM_HCC__) && !defined(__HIP_PLATFORM_NVCC__)  // exactly one platform macro must be set; pick the matching backend header
+#include <hip/hcc_detail/texture_types.h>
+#elif defined(__HIP_PLATFORM_NVCC__) && !defined(__HIP_PLATFORM_HCC__)
+#include <hip/nvcc_detail/hip_texture_types.h>  // was "texture_types.h": the quoted form searches this directory first and re-includes this guarded file, never reaching CUDA's header
+#else
+#error("Must define exactly one of __HIP_PLATFORM_HCC__ or __HIP_PLATFORM_NVCC__");
+#endif
+
+#endif
diff --git a/third_party/rocm/include/hsa/Brig.h b/third_party/rocm/include/hsa/Brig.h
new file mode 100644
index 0000000..4f34bd1
--- /dev/null
+++ b/third_party/rocm/include/hsa/Brig.h
@@ -0,0 +1,1131 @@
+// University of Illinois/NCSA
+// Open Source License
+//
+// Copyright (c) 2013-2015, Advanced Micro Devices, Inc.
+// All rights reserved.
+//
+// Developed by:
+//
+// HSA Team
+//
+// Advanced Micro Devices, Inc
+//
+// www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of
+// this software and associated documentation files (the "Software"), to deal with
+// the Software without restriction, including without limitation the rights to
+// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+// of the Software, and to permit persons to whom the Software is furnished to do
+// so, subject to the following conditions:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimers.
+//
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimers in the
+// documentation and/or other materials provided with the distribution.
+//
+// * Neither the names of the LLVM Team, University of Illinois at
+// Urbana-Champaign, nor the names of its contributors may be used to
+// endorse or promote products derived from this Software without specific
+// prior written permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+// FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+// SOFTWARE.
+
+#ifndef INCLUDED_BRIG_H
+#define INCLUDED_BRIG_H
+
+#include <stddef.h> /* size_t */
+#include <stdint.h> /* uintXX_t */
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+/*========================================================================================*/
+/* =======================================================================================*/
+/* =======================================================================================*/
+/* =======================================================================================*/
+
+typedef uint32_t BrigCodeOffset32_t; /* 32-bit unsigned offset; presumably into the code section (BRIG_SECTION_INDEX_CODE) - confirm against the HSA PRM */
+typedef uint32_t BrigOperandOffset32_t; /* 32-bit unsigned offset; presumably into the operand section - confirm */
+typedef uint32_t BrigDataOffset32_t; /* 32-bit unsigned offset; presumably into the data section - confirm */
+
+typedef BrigDataOffset32_t BrigDataOffsetCodeList32_t; /* the three aliases below are all BrigDataOffset32_t; the names suggest what the referenced data holds */
+typedef BrigDataOffset32_t BrigDataOffsetOperandList32_t;
+typedef BrigDataOffset32_t BrigDataOffsetString32_t;
+
+typedef uint32_t BrigVersion32_t;
+enum BrigVersion { /* major/minor constants: HSAIL 1.0 and BRIG 1.0; BrigVersion32_t above is the 32-bit type presumably used to store them */
+ BRIG_VERSION_HSAIL_MAJOR = 1,
+ BRIG_VERSION_HSAIL_MINOR = 0,
+ BRIG_VERSION_BRIG_MAJOR = 1,
+ BRIG_VERSION_BRIG_MINOR = 0
+};
+
+typedef uint16_t BrigKind16_t;
+enum BrigKind { /* kinds are grouped in ranges: directives 0x1000.., instructions 0x2000.., operands 0x3000..; each *_BEGIN aliases the first member and each *_END is one past the last */
+ BRIG_KIND_NONE = 0x0000,
+
+ BRIG_KIND_DIRECTIVE_BEGIN = 0x1000,
+ BRIG_KIND_DIRECTIVE_ARG_BLOCK_END = 0x1000,
+ BRIG_KIND_DIRECTIVE_ARG_BLOCK_START = 0x1001,
+ BRIG_KIND_DIRECTIVE_COMMENT = 0x1002,
+ BRIG_KIND_DIRECTIVE_CONTROL = 0x1003,
+ BRIG_KIND_DIRECTIVE_EXTENSION = 0x1004,
+ BRIG_KIND_DIRECTIVE_FBARRIER = 0x1005,
+ BRIG_KIND_DIRECTIVE_FUNCTION = 0x1006,
+ BRIG_KIND_DIRECTIVE_INDIRECT_FUNCTION = 0x1007,
+ BRIG_KIND_DIRECTIVE_KERNEL = 0x1008,
+ BRIG_KIND_DIRECTIVE_LABEL = 0x1009,
+ BRIG_KIND_DIRECTIVE_LOC = 0x100a,
+ BRIG_KIND_DIRECTIVE_MODULE = 0x100b,
+ BRIG_KIND_DIRECTIVE_PRAGMA = 0x100c,
+ BRIG_KIND_DIRECTIVE_SIGNATURE = 0x100d,
+ BRIG_KIND_DIRECTIVE_VARIABLE = 0x100e,
+ BRIG_KIND_DIRECTIVE_END = 0x100f,
+
+ BRIG_KIND_INST_BEGIN = 0x2000,
+ BRIG_KIND_INST_ADDR = 0x2000,
+ BRIG_KIND_INST_ATOMIC = 0x2001,
+ BRIG_KIND_INST_BASIC = 0x2002,
+ BRIG_KIND_INST_BR = 0x2003,
+ BRIG_KIND_INST_CMP = 0x2004,
+ BRIG_KIND_INST_CVT = 0x2005,
+ BRIG_KIND_INST_IMAGE = 0x2006,
+ BRIG_KIND_INST_LANE = 0x2007,
+ BRIG_KIND_INST_MEM = 0x2008,
+ BRIG_KIND_INST_MEM_FENCE = 0x2009,
+ BRIG_KIND_INST_MOD = 0x200a,
+ BRIG_KIND_INST_QUERY_IMAGE = 0x200b,
+ BRIG_KIND_INST_QUERY_SAMPLER = 0x200c,
+ BRIG_KIND_INST_QUEUE = 0x200d,
+ BRIG_KIND_INST_SEG = 0x200e,
+ BRIG_KIND_INST_SEG_CVT = 0x200f,
+ BRIG_KIND_INST_SIGNAL = 0x2010,
+ BRIG_KIND_INST_SOURCE_TYPE = 0x2011,
+ BRIG_KIND_INST_END = 0x2012,
+
+ BRIG_KIND_OPERAND_BEGIN = 0x3000,
+ BRIG_KIND_OPERAND_ADDRESS = 0x3000,
+ BRIG_KIND_OPERAND_ALIGN = 0x3001,
+ BRIG_KIND_OPERAND_CODE_LIST = 0x3002,
+ BRIG_KIND_OPERAND_CODE_REF = 0x3003,
+ BRIG_KIND_OPERAND_CONSTANT_BYTES = 0x3004,
+ BRIG_KIND_OPERAND_RESERVED = 0x3005,
+ BRIG_KIND_OPERAND_CONSTANT_IMAGE = 0x3006,
+ BRIG_KIND_OPERAND_CONSTANT_OPERAND_LIST = 0x3007,
+ BRIG_KIND_OPERAND_CONSTANT_SAMPLER = 0x3008,
+ BRIG_KIND_OPERAND_OPERAND_LIST = 0x3009,
+ BRIG_KIND_OPERAND_REGISTER = 0x300a,
+ BRIG_KIND_OPERAND_STRING = 0x300b,
+ BRIG_KIND_OPERAND_WAVESIZE = 0x300c,
+ BRIG_KIND_OPERAND_END = 0x300d
+};
+
+typedef uint8_t BrigAlignment8_t;
+enum BrigAlignment { /* not byte counts: 0 is NONE, and code k (1..9) names an alignment of 2^(k-1), i.e. 1 up to 256 */
+ BRIG_ALIGNMENT_NONE = 0,
+ BRIG_ALIGNMENT_1 = 1,
+ BRIG_ALIGNMENT_2 = 2,
+ BRIG_ALIGNMENT_4 = 3,
+ BRIG_ALIGNMENT_8 = 4,
+ BRIG_ALIGNMENT_16 = 5,
+ BRIG_ALIGNMENT_32 = 6,
+ BRIG_ALIGNMENT_64 = 7,
+ BRIG_ALIGNMENT_128 = 8,
+ BRIG_ALIGNMENT_256 = 9,
+ BRIG_ALIGNMENT_MAX = BRIG_ALIGNMENT_256 /* alias of the largest code (9) */
+};
+
+typedef uint8_t BrigAllocation8_t;
+enum BrigAllocation { /* allocation codes 0..3, with 0 meaning NONE; BrigAllocation8_t above is the 8-bit type presumably used to store them */
+ BRIG_ALLOCATION_NONE = 0,
+ BRIG_ALLOCATION_PROGRAM = 1,
+ BRIG_ALLOCATION_AGENT = 2,
+ BRIG_ALLOCATION_AUTOMATIC = 3
+};
+
+typedef uint8_t BrigAluModifier8_t;
+enum BrigAluModifierMask { /* bit mask; only bit 0 is defined here (FTZ presumably means flush-to-zero - confirm) */
+ BRIG_ALU_FTZ = 1
+};
+
+typedef uint8_t BrigAtomicOperation8_t;
+enum BrigAtomicOperation { /* atomic operation codes 0..20; 13..16 are WAIT_* and 17..20 the matching WAITTIMEOUT_* variants */
+ BRIG_ATOMIC_ADD = 0,
+ BRIG_ATOMIC_AND = 1,
+ BRIG_ATOMIC_CAS = 2,
+ BRIG_ATOMIC_EXCH = 3,
+ BRIG_ATOMIC_LD = 4,
+ BRIG_ATOMIC_MAX = 5,
+ BRIG_ATOMIC_MIN = 6,
+ BRIG_ATOMIC_OR = 7,
+ BRIG_ATOMIC_ST = 8,
+ BRIG_ATOMIC_SUB = 9,
+ BRIG_ATOMIC_WRAPDEC = 10,
+ BRIG_ATOMIC_WRAPINC = 11,
+ BRIG_ATOMIC_XOR = 12,
+ BRIG_ATOMIC_WAIT_EQ = 13,
+ BRIG_ATOMIC_WAIT_NE = 14,
+ BRIG_ATOMIC_WAIT_LT = 15,
+ BRIG_ATOMIC_WAIT_GTE = 16,
+ BRIG_ATOMIC_WAITTIMEOUT_EQ = 17,
+ BRIG_ATOMIC_WAITTIMEOUT_NE = 18,
+ BRIG_ATOMIC_WAITTIMEOUT_LT = 19,
+ BRIG_ATOMIC_WAITTIMEOUT_GTE = 20
+};
+
+typedef uint8_t BrigCompareOperation8_t;
+enum BrigCompareOperation { /* comparison codes 0..27; 14..27 are S-prefixed counterparts of 0..13. NOTE(review): the S* group is not in the same order as the base group (SGEU = 20 comes before SEQU = 21, SGTU = 27 is last); the numbering is presumably fixed by the BRIG spec - do not renumber */
+ BRIG_COMPARE_EQ = 0,
+ BRIG_COMPARE_NE = 1,
+ BRIG_COMPARE_LT = 2,
+ BRIG_COMPARE_LE = 3,
+ BRIG_COMPARE_GT = 4,
+ BRIG_COMPARE_GE = 5,
+ BRIG_COMPARE_EQU = 6,
+ BRIG_COMPARE_NEU = 7,
+ BRIG_COMPARE_LTU = 8,
+ BRIG_COMPARE_LEU = 9,
+ BRIG_COMPARE_GTU = 10,
+ BRIG_COMPARE_GEU = 11,
+ BRIG_COMPARE_NUM = 12,
+ BRIG_COMPARE_NAN = 13,
+ BRIG_COMPARE_SEQ = 14,
+ BRIG_COMPARE_SNE = 15,
+ BRIG_COMPARE_SLT = 16,
+ BRIG_COMPARE_SLE = 17,
+ BRIG_COMPARE_SGT = 18,
+ BRIG_COMPARE_SGE = 19,
+ BRIG_COMPARE_SGEU = 20,
+ BRIG_COMPARE_SEQU = 21,
+ BRIG_COMPARE_SNEU = 22,
+ BRIG_COMPARE_SLTU = 23,
+ BRIG_COMPARE_SLEU = 24,
+ BRIG_COMPARE_SNUM = 25,
+ BRIG_COMPARE_SNAN = 26,
+ BRIG_COMPARE_SGTU = 27
+};
+
+typedef uint16_t BrigControlDirective16_t;
+enum BrigControlDirective { /* control directive codes 0..9, with 0 meaning NONE; the paired typedef above is 16 bits wide */
+ BRIG_CONTROL_NONE = 0,
+ BRIG_CONTROL_ENABLEBREAKEXCEPTIONS = 1,
+ BRIG_CONTROL_ENABLEDETECTEXCEPTIONS = 2,
+ BRIG_CONTROL_MAXDYNAMICGROUPSIZE = 3,
+ BRIG_CONTROL_MAXFLATGRIDSIZE = 4,
+ BRIG_CONTROL_MAXFLATWORKGROUPSIZE = 5,
+ BRIG_CONTROL_REQUIREDDIM = 6,
+ BRIG_CONTROL_REQUIREDGRIDSIZE = 7,
+ BRIG_CONTROL_REQUIREDWORKGROUPSIZE = 8,
+ BRIG_CONTROL_REQUIRENOPARTIALWORKGROUPS = 9
+};
+
+typedef uint8_t BrigExecutableModifier8_t;
+enum BrigExecutableModifierMask { /* bit mask; only bit 0 (DEFINITION) is defined here */
+ BRIG_EXECUTABLE_DEFINITION = 1
+};
+
+typedef uint8_t BrigImageChannelOrder8_t;
+enum BrigImageChannelOrder { /* predefined channel orders are 0..19; FIRST_USER_DEFINED is 128 */
+ BRIG_CHANNEL_ORDER_A = 0,
+ BRIG_CHANNEL_ORDER_R = 1,
+ BRIG_CHANNEL_ORDER_RX = 2,
+ BRIG_CHANNEL_ORDER_RG = 3,
+ BRIG_CHANNEL_ORDER_RGX = 4,
+ BRIG_CHANNEL_ORDER_RA = 5,
+ BRIG_CHANNEL_ORDER_RGB = 6,
+ BRIG_CHANNEL_ORDER_RGBX = 7,
+ BRIG_CHANNEL_ORDER_RGBA = 8,
+ BRIG_CHANNEL_ORDER_BGRA = 9,
+ BRIG_CHANNEL_ORDER_ARGB = 10,
+ BRIG_CHANNEL_ORDER_ABGR = 11,
+ BRIG_CHANNEL_ORDER_SRGB = 12,
+ BRIG_CHANNEL_ORDER_SRGBX = 13,
+ BRIG_CHANNEL_ORDER_SRGBA = 14,
+ BRIG_CHANNEL_ORDER_SBGRA = 15,
+ BRIG_CHANNEL_ORDER_INTENSITY = 16,
+ BRIG_CHANNEL_ORDER_LUMINANCE = 17,
+ BRIG_CHANNEL_ORDER_DEPTH = 18,
+ BRIG_CHANNEL_ORDER_DEPTH_STENCIL = 19,
+
+ BRIG_CHANNEL_ORDER_FIRST_USER_DEFINED = 128
+};
+
+typedef uint8_t BrigImageChannelType8_t;
+enum BrigImageChannelType { /* predefined channel types are 0..15; FIRST_USER_DEFINED is 128 */
+ BRIG_CHANNEL_TYPE_SNORM_INT8 = 0,
+ BRIG_CHANNEL_TYPE_SNORM_INT16 = 1,
+ BRIG_CHANNEL_TYPE_UNORM_INT8 = 2,
+ BRIG_CHANNEL_TYPE_UNORM_INT16 = 3,
+ BRIG_CHANNEL_TYPE_UNORM_INT24 = 4,
+ BRIG_CHANNEL_TYPE_UNORM_SHORT_555 = 5,
+ BRIG_CHANNEL_TYPE_UNORM_SHORT_565 = 6,
+ BRIG_CHANNEL_TYPE_UNORM_INT_101010 = 7,
+ BRIG_CHANNEL_TYPE_SIGNED_INT8 = 8,
+ BRIG_CHANNEL_TYPE_SIGNED_INT16 = 9,
+ BRIG_CHANNEL_TYPE_SIGNED_INT32 = 10,
+ BRIG_CHANNEL_TYPE_UNSIGNED_INT8 = 11,
+ BRIG_CHANNEL_TYPE_UNSIGNED_INT16 = 12,
+ BRIG_CHANNEL_TYPE_UNSIGNED_INT32 = 13,
+ BRIG_CHANNEL_TYPE_HALF_FLOAT = 14,
+ BRIG_CHANNEL_TYPE_FLOAT = 15,
+
+ BRIG_CHANNEL_TYPE_FIRST_USER_DEFINED = 128
+};
+
+typedef uint8_t BrigImageGeometry8_t;
+enum BrigImageGeometry { /* predefined geometries are 0..7; FIRST_USER_DEFINED is 128 */
+ BRIG_GEOMETRY_1D = 0,
+ BRIG_GEOMETRY_2D = 1,
+ BRIG_GEOMETRY_3D = 2,
+ BRIG_GEOMETRY_1DA = 3,
+ BRIG_GEOMETRY_2DA = 4,
+ BRIG_GEOMETRY_1DB = 5,
+ BRIG_GEOMETRY_2DDEPTH = 6,
+ BRIG_GEOMETRY_2DADEPTH = 7,
+
+ BRIG_GEOMETRY_FIRST_USER_DEFINED = 128
+};
+
+typedef uint8_t BrigImageQuery8_t;
+enum BrigImageQuery { /* predefined queries are 0..5; here FIRST_USER_DEFINED follows immediately at 6 rather than at 128 as in the neighbouring image enums */
+ BRIG_IMAGE_QUERY_WIDTH = 0,
+ BRIG_IMAGE_QUERY_HEIGHT = 1,
+ BRIG_IMAGE_QUERY_DEPTH = 2,
+ BRIG_IMAGE_QUERY_ARRAY = 3,
+ BRIG_IMAGE_QUERY_CHANNELORDER = 4,
+ BRIG_IMAGE_QUERY_CHANNELTYPE = 5,
+
+ BRIG_IMAGE_QUERY_FIRST_USER_DEFINED = 6
+};
+
+typedef uint8_t BrigLinkage8_t;
+enum BrigLinkage { /* linkage codes 0..4, with 0 meaning NONE */
+ BRIG_LINKAGE_NONE = 0,
+ BRIG_LINKAGE_PROGRAM = 1,
+ BRIG_LINKAGE_MODULE = 2,
+ BRIG_LINKAGE_FUNCTION = 3,
+ BRIG_LINKAGE_ARG = 4
+};
+
+typedef uint8_t BrigMachineModel8_t;
+enum BrigMachineModel { /* two codes only: SMALL = 0 and LARGE = 1 */
+ BRIG_MACHINE_SMALL = 0,
+ BRIG_MACHINE_LARGE = 1,
+};
+
+typedef uint8_t BrigMemoryModifier8_t;
+enum BrigMemoryModifierMask { /* bit mask; only bit 0 (CONST) is defined here */
+ BRIG_MEMORY_CONST = 1
+};
+
+typedef uint8_t BrigMemoryOrder8_t;
+enum BrigMemoryOrder { /* memory order codes 0..4, with 0 meaning NONE */
+ BRIG_MEMORY_ORDER_NONE = 0,
+ BRIG_MEMORY_ORDER_RELAXED = 1,
+ BRIG_MEMORY_ORDER_SC_ACQUIRE = 2,
+ BRIG_MEMORY_ORDER_SC_RELEASE = 3,
+ BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE = 4,
+};
+
+typedef uint8_t BrigMemoryScope8_t;
+enum BrigMemoryScope { /* memory scope codes 0..5, from NONE through WORKITEM up to SYSTEM */
+ BRIG_MEMORY_SCOPE_NONE = 0,
+ BRIG_MEMORY_SCOPE_WORKITEM = 1,
+ BRIG_MEMORY_SCOPE_WAVEFRONT = 2,
+ BRIG_MEMORY_SCOPE_WORKGROUP = 3,
+ BRIG_MEMORY_SCOPE_AGENT = 4,
+ BRIG_MEMORY_SCOPE_SYSTEM = 5,
+};
+
+typedef uint16_t BrigOpcode16_t;
+enum BrigOpcode { /* predefined opcodes are 0..136 with no gaps; FIRST_USER_DEFINED is 32768 (0x8000), which still fits the 16-bit BrigOpcode16_t above */
+ BRIG_OPCODE_NOP = 0,
+ BRIG_OPCODE_ABS = 1,
+ BRIG_OPCODE_ADD = 2,
+ BRIG_OPCODE_BORROW = 3,
+ BRIG_OPCODE_CARRY = 4,
+ BRIG_OPCODE_CEIL = 5,
+ BRIG_OPCODE_COPYSIGN = 6,
+ BRIG_OPCODE_DIV = 7,
+ BRIG_OPCODE_FLOOR = 8,
+ BRIG_OPCODE_FMA = 9,
+ BRIG_OPCODE_FRACT = 10,
+ BRIG_OPCODE_MAD = 11,
+ BRIG_OPCODE_MAX = 12,
+ BRIG_OPCODE_MIN = 13,
+ BRIG_OPCODE_MUL = 14,
+ BRIG_OPCODE_MULHI = 15,
+ BRIG_OPCODE_NEG = 16,
+ BRIG_OPCODE_REM = 17,
+ BRIG_OPCODE_RINT = 18,
+ BRIG_OPCODE_SQRT = 19,
+ BRIG_OPCODE_SUB = 20,
+ BRIG_OPCODE_TRUNC = 21,
+ BRIG_OPCODE_MAD24 = 22,
+ BRIG_OPCODE_MAD24HI = 23,
+ BRIG_OPCODE_MUL24 = 24,
+ BRIG_OPCODE_MUL24HI = 25,
+ BRIG_OPCODE_SHL = 26,
+ BRIG_OPCODE_SHR = 27,
+ BRIG_OPCODE_AND = 28,
+ BRIG_OPCODE_NOT = 29,
+ BRIG_OPCODE_OR = 30,
+ BRIG_OPCODE_POPCOUNT = 31,
+ BRIG_OPCODE_XOR = 32,
+ BRIG_OPCODE_BITEXTRACT = 33,
+ BRIG_OPCODE_BITINSERT = 34,
+ BRIG_OPCODE_BITMASK = 35,
+ BRIG_OPCODE_BITREV = 36,
+ BRIG_OPCODE_BITSELECT = 37,
+ BRIG_OPCODE_FIRSTBIT = 38,
+ BRIG_OPCODE_LASTBIT = 39,
+ BRIG_OPCODE_COMBINE = 40,
+ BRIG_OPCODE_EXPAND = 41,
+ BRIG_OPCODE_LDA = 42,
+ BRIG_OPCODE_MOV = 43,
+ BRIG_OPCODE_SHUFFLE = 44,
+ BRIG_OPCODE_UNPACKHI = 45,
+ BRIG_OPCODE_UNPACKLO = 46,
+ BRIG_OPCODE_PACK = 47,
+ BRIG_OPCODE_UNPACK = 48,
+ BRIG_OPCODE_CMOV = 49,
+ BRIG_OPCODE_CLASS = 50,
+ BRIG_OPCODE_NCOS = 51,
+ BRIG_OPCODE_NEXP2 = 52,
+ BRIG_OPCODE_NFMA = 53,
+ BRIG_OPCODE_NLOG2 = 54,
+ BRIG_OPCODE_NRCP = 55,
+ BRIG_OPCODE_NRSQRT = 56,
+ BRIG_OPCODE_NSIN = 57,
+ BRIG_OPCODE_NSQRT = 58,
+ BRIG_OPCODE_BITALIGN = 59,
+ BRIG_OPCODE_BYTEALIGN = 60,
+ BRIG_OPCODE_PACKCVT = 61,
+ BRIG_OPCODE_UNPACKCVT = 62,
+ BRIG_OPCODE_LERP = 63,
+ BRIG_OPCODE_SAD = 64,
+ BRIG_OPCODE_SADHI = 65,
+ BRIG_OPCODE_SEGMENTP = 66,
+ BRIG_OPCODE_FTOS = 67,
+ BRIG_OPCODE_STOF = 68,
+ BRIG_OPCODE_CMP = 69,
+ BRIG_OPCODE_CVT = 70,
+ BRIG_OPCODE_LD = 71,
+ BRIG_OPCODE_ST = 72,
+ BRIG_OPCODE_ATOMIC = 73,
+ BRIG_OPCODE_ATOMICNORET = 74,
+ BRIG_OPCODE_SIGNAL = 75,
+ BRIG_OPCODE_SIGNALNORET = 76,
+ BRIG_OPCODE_MEMFENCE = 77,
+ BRIG_OPCODE_RDIMAGE = 78,
+ BRIG_OPCODE_LDIMAGE = 79,
+ BRIG_OPCODE_STIMAGE = 80,
+ BRIG_OPCODE_IMAGEFENCE = 81,
+ BRIG_OPCODE_QUERYIMAGE = 82,
+ BRIG_OPCODE_QUERYSAMPLER = 83,
+ BRIG_OPCODE_CBR = 84,
+ BRIG_OPCODE_BR = 85,
+ BRIG_OPCODE_SBR = 86,
+ BRIG_OPCODE_BARRIER = 87,
+ BRIG_OPCODE_WAVEBARRIER = 88,
+ BRIG_OPCODE_ARRIVEFBAR = 89,
+ BRIG_OPCODE_INITFBAR = 90,
+ BRIG_OPCODE_JOINFBAR = 91,
+ BRIG_OPCODE_LEAVEFBAR = 92,
+ BRIG_OPCODE_RELEASEFBAR = 93,
+ BRIG_OPCODE_WAITFBAR = 94,
+ BRIG_OPCODE_LDF = 95,
+ BRIG_OPCODE_ACTIVELANECOUNT = 96,
+ BRIG_OPCODE_ACTIVELANEID = 97,
+ BRIG_OPCODE_ACTIVELANEMASK = 98,
+ BRIG_OPCODE_ACTIVELANEPERMUTE = 99,
+ BRIG_OPCODE_CALL = 100,
+ BRIG_OPCODE_SCALL = 101,
+ BRIG_OPCODE_ICALL = 102,
+ BRIG_OPCODE_RET = 103,
+ BRIG_OPCODE_ALLOCA = 104,
+ BRIG_OPCODE_CURRENTWORKGROUPSIZE = 105,
+ BRIG_OPCODE_CURRENTWORKITEMFLATID = 106,
+ BRIG_OPCODE_DIM = 107,
+ BRIG_OPCODE_GRIDGROUPS = 108,
+ BRIG_OPCODE_GRIDSIZE = 109,
+ BRIG_OPCODE_PACKETCOMPLETIONSIG = 110,
+ BRIG_OPCODE_PACKETID = 111,
+ BRIG_OPCODE_WORKGROUPID = 112,
+ BRIG_OPCODE_WORKGROUPSIZE = 113,
+ BRIG_OPCODE_WORKITEMABSID = 114,
+ BRIG_OPCODE_WORKITEMFLATABSID = 115,
+ BRIG_OPCODE_WORKITEMFLATID = 116,
+ BRIG_OPCODE_WORKITEMID = 117,
+ BRIG_OPCODE_CLEARDETECTEXCEPT = 118,
+ BRIG_OPCODE_GETDETECTEXCEPT = 119,
+ BRIG_OPCODE_SETDETECTEXCEPT = 120,
+ BRIG_OPCODE_ADDQUEUEWRITEINDEX = 121,
+ BRIG_OPCODE_CASQUEUEWRITEINDEX = 122,
+ BRIG_OPCODE_LDQUEUEREADINDEX = 123,
+ BRIG_OPCODE_LDQUEUEWRITEINDEX = 124,
+ BRIG_OPCODE_STQUEUEREADINDEX = 125,
+ BRIG_OPCODE_STQUEUEWRITEINDEX = 126,
+ BRIG_OPCODE_CLOCK = 127,
+ BRIG_OPCODE_CUID = 128,
+ BRIG_OPCODE_DEBUGTRAP = 129,
+ BRIG_OPCODE_GROUPBASEPTR = 130,
+ BRIG_OPCODE_KERNARGBASEPTR = 131,
+ BRIG_OPCODE_LANEID = 132,
+ BRIG_OPCODE_MAXCUID = 133,
+ BRIG_OPCODE_MAXWAVEID = 134,
+ BRIG_OPCODE_NULLPTR = 135,
+ BRIG_OPCODE_WAVEID = 136,
+
+ BRIG_OPCODE_FIRST_USER_DEFINED = 32768,
+};
+
+typedef uint8_t BrigPack8_t;
+enum BrigPack { /* packing codes 0..12; each *SAT code 7..12 equals its non-SAT counterpart 1..6 plus 6 */
+ BRIG_PACK_NONE = 0,
+ BRIG_PACK_PP = 1,
+ BRIG_PACK_PS = 2,
+ BRIG_PACK_SP = 3,
+ BRIG_PACK_SS = 4,
+ BRIG_PACK_S = 5,
+ BRIG_PACK_P = 6,
+ BRIG_PACK_PPSAT = 7,
+ BRIG_PACK_PSSAT = 8,
+ BRIG_PACK_SPSAT = 9,
+ BRIG_PACK_SSSAT = 10,
+ BRIG_PACK_SSAT = 11,
+ BRIG_PACK_PSAT = 12
+};
+
+typedef uint8_t BrigProfile8_t;
+enum BrigProfile { /* two codes only: BASE = 0 and FULL = 1 */
+ BRIG_PROFILE_BASE = 0,
+ BRIG_PROFILE_FULL = 1,
+};
+
+typedef uint16_t BrigRegisterKind16_t;
+enum BrigRegisterKind { /* register kind codes 0..3; the paired typedef above is 16 bits wide */
+ BRIG_REGISTER_KIND_CONTROL = 0,
+ BRIG_REGISTER_KIND_SINGLE = 1,
+ BRIG_REGISTER_KIND_DOUBLE = 2,
+ BRIG_REGISTER_KIND_QUAD = 3
+};
+
+typedef uint8_t BrigRound8_t;
+enum BrigRound { /* rounding codes 0..21: NONE, then FLOAT_* 1..5, then INTEGER_* 6..21 (plain 6..9, _SAT 10..13, SIGNALING 14..17, SIGNALING _SAT 18..21) */
+ BRIG_ROUND_NONE = 0,
+ BRIG_ROUND_FLOAT_DEFAULT = 1,
+ BRIG_ROUND_FLOAT_NEAR_EVEN = 2,
+ BRIG_ROUND_FLOAT_ZERO = 3,
+ BRIG_ROUND_FLOAT_PLUS_INFINITY = 4,
+ BRIG_ROUND_FLOAT_MINUS_INFINITY = 5,
+ BRIG_ROUND_INTEGER_NEAR_EVEN = 6,
+ BRIG_ROUND_INTEGER_ZERO = 7,
+ BRIG_ROUND_INTEGER_PLUS_INFINITY = 8,
+ BRIG_ROUND_INTEGER_MINUS_INFINITY = 9,
+ BRIG_ROUND_INTEGER_NEAR_EVEN_SAT = 10,
+ BRIG_ROUND_INTEGER_ZERO_SAT = 11,
+ BRIG_ROUND_INTEGER_PLUS_INFINITY_SAT = 12,
+ BRIG_ROUND_INTEGER_MINUS_INFINITY_SAT = 13,
+ BRIG_ROUND_INTEGER_SIGNALING_NEAR_EVEN = 14,
+ BRIG_ROUND_INTEGER_SIGNALING_ZERO = 15,
+ BRIG_ROUND_INTEGER_SIGNALING_PLUS_INFINITY = 16,
+ BRIG_ROUND_INTEGER_SIGNALING_MINUS_INFINITY = 17,
+ BRIG_ROUND_INTEGER_SIGNALING_NEAR_EVEN_SAT = 18,
+ BRIG_ROUND_INTEGER_SIGNALING_ZERO_SAT = 19,
+ BRIG_ROUND_INTEGER_SIGNALING_PLUS_INFINITY_SAT = 20,
+ BRIG_ROUND_INTEGER_SIGNALING_MINUS_INFINITY_SAT = 21
+};
+
+typedef uint8_t BrigSamplerAddressing8_t;
+enum BrigSamplerAddressing { /* predefined addressing modes are 0..4, with 0 meaning UNDEFINED; FIRST_USER_DEFINED is 128 */
+ BRIG_ADDRESSING_UNDEFINED = 0,
+ BRIG_ADDRESSING_CLAMP_TO_EDGE = 1,
+ BRIG_ADDRESSING_CLAMP_TO_BORDER = 2,
+ BRIG_ADDRESSING_REPEAT = 3,
+ BRIG_ADDRESSING_MIRRORED_REPEAT = 4,
+
+ BRIG_ADDRESSING_FIRST_USER_DEFINED = 128
+};
+
+typedef uint8_t BrigSamplerCoordNormalization8_t;
+enum BrigSamplerCoordNormalization { /* two codes only: UNNORMALIZED = 0 and NORMALIZED = 1 */
+ BRIG_COORD_UNNORMALIZED = 0,
+ BRIG_COORD_NORMALIZED = 1
+};
+
+typedef uint8_t BrigSamplerFilter8_t;
+enum BrigSamplerFilter { /* predefined filters are NEAREST = 0 and LINEAR = 1; FIRST_USER_DEFINED is 128 */
+ BRIG_FILTER_NEAREST = 0,
+ BRIG_FILTER_LINEAR = 1,
+
+ BRIG_FILTER_FIRST_USER_DEFINED = 128
+};
+
+typedef uint8_t BrigSamplerQuery8_t;
+enum BrigSamplerQuery { /* sampler query codes 0..2; no user-defined range is declared here */
+ BRIG_SAMPLER_QUERY_ADDRESSING = 0,
+ BRIG_SAMPLER_QUERY_COORD = 1,
+ BRIG_SAMPLER_QUERY_FILTER = 2
+};
+
+typedef uint32_t BrigSectionIndex32_t;
+enum BrigSectionIndex { /* the three fixed sections are DATA = 0, CODE = 1, OPERAND = 2; BEGIN_IMPLEMENTATION_DEFINED is 3 */
+ BRIG_SECTION_INDEX_DATA = 0,
+ BRIG_SECTION_INDEX_CODE = 1,
+ BRIG_SECTION_INDEX_OPERAND = 2,
+
+ BRIG_SECTION_INDEX_BEGIN_IMPLEMENTATION_DEFINED = 3,
+};
+
+typedef uint8_t BrigSegCvtModifier8_t;
+enum BrigSegCvtModifierMask { /* bit mask; only bit 0 (NONULL) is defined here */
+ BRIG_SEG_CVT_NONULL = 1
+};
+
+typedef uint8_t BrigSegment8_t;
+enum BrigSegment { /* predefined segments are 0..8, with 0 meaning NONE; FIRST_USER_DEFINED is 128 */
+ BRIG_SEGMENT_NONE = 0,
+ BRIG_SEGMENT_FLAT = 1,
+ BRIG_SEGMENT_GLOBAL = 2,
+ BRIG_SEGMENT_READONLY = 3,
+ BRIG_SEGMENT_KERNARG = 4,
+ BRIG_SEGMENT_GROUP = 5,
+ BRIG_SEGMENT_PRIVATE = 6,
+ BRIG_SEGMENT_SPILL = 7,
+ BRIG_SEGMENT_ARG = 8,
+
+ BRIG_SEGMENT_FIRST_USER_DEFINED = 128
+};
+
+enum { // bit layout of a type code: base | pack | array
+ BRIG_TYPE_BASE_SIZE = 5, // field widths, in bits
+ BRIG_TYPE_PACK_SIZE = 2,
+ BRIG_TYPE_ARRAY_SIZE = 1,
+
+ BRIG_TYPE_BASE_SHIFT = 0, // base occupies bits 0-4
+ BRIG_TYPE_PACK_SHIFT = BRIG_TYPE_BASE_SHIFT + BRIG_TYPE_BASE_SIZE, // = 5, pack occupies bits 5-6
+ BRIG_TYPE_ARRAY_SHIFT = BRIG_TYPE_PACK_SHIFT + BRIG_TYPE_PACK_SIZE, // = 7, array flag is bit 7
+
+ BRIG_TYPE_BASE_MASK = ((1 << BRIG_TYPE_BASE_SIZE) - 1) << BRIG_TYPE_BASE_SHIFT, // 0x1f
+ BRIG_TYPE_PACK_MASK = ((1 << BRIG_TYPE_PACK_SIZE) - 1) << BRIG_TYPE_PACK_SHIFT, // 0x60
+ BRIG_TYPE_ARRAY_MASK = ((1 << BRIG_TYPE_ARRAY_SIZE) - 1) << BRIG_TYPE_ARRAY_SHIFT, // 0x80
+
+ BRIG_TYPE_PACK_NONE = 0 << BRIG_TYPE_PACK_SHIFT, // 0x00
+ BRIG_TYPE_PACK_32 = 1 << BRIG_TYPE_PACK_SHIFT, // 0x20
+ BRIG_TYPE_PACK_64 = 2 << BRIG_TYPE_PACK_SHIFT, // 0x40
+ BRIG_TYPE_PACK_128 = 3 << BRIG_TYPE_PACK_SHIFT, // 0x60
+
+ BRIG_TYPE_ARRAY = 1 << BRIG_TYPE_ARRAY_SHIFT // 0x80
+};
+
+typedef uint16_t BrigType16_t; // 16-bit field type used for the type members of the records below
+enum BrigType {
+ BRIG_TYPE_NONE = 0, // base type codes 0-23; all fit inside BRIG_TYPE_BASE_MASK
+ BRIG_TYPE_U8 = 1,
+ BRIG_TYPE_U16 = 2,
+ BRIG_TYPE_U32 = 3,
+ BRIG_TYPE_U64 = 4,
+ BRIG_TYPE_S8 = 5,
+ BRIG_TYPE_S16 = 6,
+ BRIG_TYPE_S32 = 7,
+ BRIG_TYPE_S64 = 8,
+ BRIG_TYPE_F16 = 9,
+ BRIG_TYPE_F32 = 10,
+ BRIG_TYPE_F64 = 11,
+ BRIG_TYPE_B1 = 12,
+ BRIG_TYPE_B8 = 13,
+ BRIG_TYPE_B16 = 14,
+ BRIG_TYPE_B32 = 15,
+ BRIG_TYPE_B64 = 16,
+ BRIG_TYPE_B128 = 17,
+ BRIG_TYPE_SAMP = 18,
+ BRIG_TYPE_ROIMG = 19,
+ BRIG_TYPE_WOIMG = 20,
+ BRIG_TYPE_RWIMG = 21,
+ BRIG_TYPE_SIG32 = 22,
+ BRIG_TYPE_SIG64 = 23,
+
+ BRIG_TYPE_U8X4 = BRIG_TYPE_U8 | BRIG_TYPE_PACK_32, // packed types: base code OR'ed with a BRIG_TYPE_PACK_* value
+ BRIG_TYPE_U8X8 = BRIG_TYPE_U8 | BRIG_TYPE_PACK_64,
+ BRIG_TYPE_U8X16 = BRIG_TYPE_U8 | BRIG_TYPE_PACK_128,
+ BRIG_TYPE_U16X2 = BRIG_TYPE_U16 | BRIG_TYPE_PACK_32,
+ BRIG_TYPE_U16X4 = BRIG_TYPE_U16 | BRIG_TYPE_PACK_64,
+ BRIG_TYPE_U16X8 = BRIG_TYPE_U16 | BRIG_TYPE_PACK_128,
+ BRIG_TYPE_U32X2 = BRIG_TYPE_U32 | BRIG_TYPE_PACK_64,
+ BRIG_TYPE_U32X4 = BRIG_TYPE_U32 | BRIG_TYPE_PACK_128,
+ BRIG_TYPE_U64X2 = BRIG_TYPE_U64 | BRIG_TYPE_PACK_128,
+ BRIG_TYPE_S8X4 = BRIG_TYPE_S8 | BRIG_TYPE_PACK_32,
+ BRIG_TYPE_S8X8 = BRIG_TYPE_S8 | BRIG_TYPE_PACK_64,
+ BRIG_TYPE_S8X16 = BRIG_TYPE_S8 | BRIG_TYPE_PACK_128,
+ BRIG_TYPE_S16X2 = BRIG_TYPE_S16 | BRIG_TYPE_PACK_32,
+ BRIG_TYPE_S16X4 = BRIG_TYPE_S16 | BRIG_TYPE_PACK_64,
+ BRIG_TYPE_S16X8 = BRIG_TYPE_S16 | BRIG_TYPE_PACK_128,
+ BRIG_TYPE_S32X2 = BRIG_TYPE_S32 | BRIG_TYPE_PACK_64,
+ BRIG_TYPE_S32X4 = BRIG_TYPE_S32 | BRIG_TYPE_PACK_128,
+ BRIG_TYPE_S64X2 = BRIG_TYPE_S64 | BRIG_TYPE_PACK_128,
+ BRIG_TYPE_F16X2 = BRIG_TYPE_F16 | BRIG_TYPE_PACK_32,
+ BRIG_TYPE_F16X4 = BRIG_TYPE_F16 | BRIG_TYPE_PACK_64,
+ BRIG_TYPE_F16X8 = BRIG_TYPE_F16 | BRIG_TYPE_PACK_128,
+ BRIG_TYPE_F32X2 = BRIG_TYPE_F32 | BRIG_TYPE_PACK_64,
+ BRIG_TYPE_F32X4 = BRIG_TYPE_F32 | BRIG_TYPE_PACK_128,
+ BRIG_TYPE_F64X2 = BRIG_TYPE_F64 | BRIG_TYPE_PACK_128,
+
+ BRIG_TYPE_U8_ARRAY = BRIG_TYPE_U8 | BRIG_TYPE_ARRAY, // array types: element type with the BRIG_TYPE_ARRAY bit set
+ BRIG_TYPE_U16_ARRAY = BRIG_TYPE_U16 | BRIG_TYPE_ARRAY,
+ BRIG_TYPE_U32_ARRAY = BRIG_TYPE_U32 | BRIG_TYPE_ARRAY,
+ BRIG_TYPE_U64_ARRAY = BRIG_TYPE_U64 | BRIG_TYPE_ARRAY,
+ BRIG_TYPE_S8_ARRAY = BRIG_TYPE_S8 | BRIG_TYPE_ARRAY,
+ BRIG_TYPE_S16_ARRAY = BRIG_TYPE_S16 | BRIG_TYPE_ARRAY,
+ BRIG_TYPE_S32_ARRAY = BRIG_TYPE_S32 | BRIG_TYPE_ARRAY,
+ BRIG_TYPE_S64_ARRAY = BRIG_TYPE_S64 | BRIG_TYPE_ARRAY,
+ BRIG_TYPE_F16_ARRAY = BRIG_TYPE_F16 | BRIG_TYPE_ARRAY,
+ BRIG_TYPE_F32_ARRAY = BRIG_TYPE_F32 | BRIG_TYPE_ARRAY,
+ BRIG_TYPE_F64_ARRAY = BRIG_TYPE_F64 | BRIG_TYPE_ARRAY,
+ BRIG_TYPE_B8_ARRAY = BRIG_TYPE_B8 | BRIG_TYPE_ARRAY, // note: no array form of BRIG_TYPE_B1 is declared
+ BRIG_TYPE_B16_ARRAY = BRIG_TYPE_B16 | BRIG_TYPE_ARRAY,
+ BRIG_TYPE_B32_ARRAY = BRIG_TYPE_B32 | BRIG_TYPE_ARRAY,
+ BRIG_TYPE_B64_ARRAY = BRIG_TYPE_B64 | BRIG_TYPE_ARRAY,
+ BRIG_TYPE_B128_ARRAY = BRIG_TYPE_B128 | BRIG_TYPE_ARRAY,
+ BRIG_TYPE_SAMP_ARRAY = BRIG_TYPE_SAMP | BRIG_TYPE_ARRAY,
+ BRIG_TYPE_ROIMG_ARRAY = BRIG_TYPE_ROIMG | BRIG_TYPE_ARRAY,
+ BRIG_TYPE_WOIMG_ARRAY = BRIG_TYPE_WOIMG | BRIG_TYPE_ARRAY,
+ BRIG_TYPE_RWIMG_ARRAY = BRIG_TYPE_RWIMG | BRIG_TYPE_ARRAY,
+ BRIG_TYPE_SIG32_ARRAY = BRIG_TYPE_SIG32 | BRIG_TYPE_ARRAY,
+ BRIG_TYPE_SIG64_ARRAY = BRIG_TYPE_SIG64 | BRIG_TYPE_ARRAY,
+ BRIG_TYPE_U8X4_ARRAY = BRIG_TYPE_U8X4 | BRIG_TYPE_ARRAY, // arrays of packed types: base, pack and array bits all set
+ BRIG_TYPE_U8X8_ARRAY = BRIG_TYPE_U8X8 | BRIG_TYPE_ARRAY,
+ BRIG_TYPE_U8X16_ARRAY = BRIG_TYPE_U8X16 | BRIG_TYPE_ARRAY,
+ BRIG_TYPE_U16X2_ARRAY = BRIG_TYPE_U16X2 | BRIG_TYPE_ARRAY,
+ BRIG_TYPE_U16X4_ARRAY = BRIG_TYPE_U16X4 | BRIG_TYPE_ARRAY,
+ BRIG_TYPE_U16X8_ARRAY = BRIG_TYPE_U16X8 | BRIG_TYPE_ARRAY,
+ BRIG_TYPE_U32X2_ARRAY = BRIG_TYPE_U32X2 | BRIG_TYPE_ARRAY,
+ BRIG_TYPE_U32X4_ARRAY = BRIG_TYPE_U32X4 | BRIG_TYPE_ARRAY,
+ BRIG_TYPE_U64X2_ARRAY = BRIG_TYPE_U64X2 | BRIG_TYPE_ARRAY,
+ BRIG_TYPE_S8X4_ARRAY = BRIG_TYPE_S8X4 | BRIG_TYPE_ARRAY,
+ BRIG_TYPE_S8X8_ARRAY = BRIG_TYPE_S8X8 | BRIG_TYPE_ARRAY,
+ BRIG_TYPE_S8X16_ARRAY = BRIG_TYPE_S8X16 | BRIG_TYPE_ARRAY,
+ BRIG_TYPE_S16X2_ARRAY = BRIG_TYPE_S16X2 | BRIG_TYPE_ARRAY,
+ BRIG_TYPE_S16X4_ARRAY = BRIG_TYPE_S16X4 | BRIG_TYPE_ARRAY,
+ BRIG_TYPE_S16X8_ARRAY = BRIG_TYPE_S16X8 | BRIG_TYPE_ARRAY,
+ BRIG_TYPE_S32X2_ARRAY = BRIG_TYPE_S32X2 | BRIG_TYPE_ARRAY,
+ BRIG_TYPE_S32X4_ARRAY = BRIG_TYPE_S32X4 | BRIG_TYPE_ARRAY,
+ BRIG_TYPE_S64X2_ARRAY = BRIG_TYPE_S64X2 | BRIG_TYPE_ARRAY,
+ BRIG_TYPE_F16X2_ARRAY = BRIG_TYPE_F16X2 | BRIG_TYPE_ARRAY,
+ BRIG_TYPE_F16X4_ARRAY = BRIG_TYPE_F16X4 | BRIG_TYPE_ARRAY,
+ BRIG_TYPE_F16X8_ARRAY = BRIG_TYPE_F16X8 | BRIG_TYPE_ARRAY,
+ BRIG_TYPE_F32X2_ARRAY = BRIG_TYPE_F32X2 | BRIG_TYPE_ARRAY,
+ BRIG_TYPE_F32X4_ARRAY = BRIG_TYPE_F32X4 | BRIG_TYPE_ARRAY,
+ BRIG_TYPE_F64X2_ARRAY = BRIG_TYPE_F64X2 | BRIG_TYPE_ARRAY,
+};
+
+typedef uint8_t BrigVariableModifier8_t; // field type of the `modifier` members of the Fbarrier and Variable directives
+enum BrigVariableModifierMask { // bit flags: bit 0 = definition, bit 1 = const
+ BRIG_VARIABLE_DEFINITION = 1,
+ BRIG_VARIABLE_CONST = 2
+};
+
+typedef uint8_t BrigWidth8_t; // one-byte field type used for the `width` members below
+enum BrigWidth { // for the numeric entries the code is log2(width) + 1, not the width itself
+ BRIG_WIDTH_NONE = 0,
+ BRIG_WIDTH_1 = 1,
+ BRIG_WIDTH_2 = 2,
+ BRIG_WIDTH_4 = 3,
+ BRIG_WIDTH_8 = 4,
+ BRIG_WIDTH_16 = 5,
+ BRIG_WIDTH_32 = 6,
+ BRIG_WIDTH_64 = 7,
+ BRIG_WIDTH_128 = 8,
+ BRIG_WIDTH_256 = 9,
+ BRIG_WIDTH_512 = 10,
+ BRIG_WIDTH_1024 = 11,
+ BRIG_WIDTH_2048 = 12,
+ BRIG_WIDTH_4096 = 13,
+ BRIG_WIDTH_8192 = 14,
+ BRIG_WIDTH_16384 = 15,
+ BRIG_WIDTH_32768 = 16,
+ BRIG_WIDTH_65536 = 17,
+ BRIG_WIDTH_131072 = 18,
+ BRIG_WIDTH_262144 = 19,
+ BRIG_WIDTH_524288 = 20,
+ BRIG_WIDTH_1048576 = 21,
+ BRIG_WIDTH_2097152 = 22,
+ BRIG_WIDTH_4194304 = 23,
+ BRIG_WIDTH_8388608 = 24,
+ BRIG_WIDTH_16777216 = 25,
+ BRIG_WIDTH_33554432 = 26,
+ BRIG_WIDTH_67108864 = 27,
+ BRIG_WIDTH_134217728 = 28,
+ BRIG_WIDTH_268435456 = 29,
+ BRIG_WIDTH_536870912 = 30,
+ BRIG_WIDTH_1073741824 = 31,
+ BRIG_WIDTH_2147483648 = 32, // last power-of-two entry (2^31)
+ BRIG_WIDTH_WAVESIZE = 33, // symbolic widths follow the numeric ones
+ BRIG_WIDTH_ALL = 34,
+};
+
+struct BrigUInt64 { // 64-bit quantity stored as two 32-bit halves
+ uint32_t lo; // presumably the low-order half — confirm against the BRIG spec
+ uint32_t hi; // presumably the high-order half — confirm against the BRIG spec
+};
+
+struct BrigBase { // common header embedded first in every directive, instruction and operand record below
+ uint16_t byteCount; // presumably the size of the whole record in bytes — confirm
+ BrigKind16_t kind; // tag identifying which record type follows
+};
+
+struct BrigData { // byte blob prefixed by its length
+ uint32_t byteCount;
+ uint8_t bytes[1]; // NOTE(review): looks like a variable-length tail of byteCount bytes — confirm
+};
+
+struct BrigDirectiveArgBlock { // directive that carries only the common header
+ BrigBase base;
+};
+
+struct BrigDirectiveComment { // comment directive: header plus one string reference
+ BrigBase base;
+ BrigDataOffsetString32_t name; // despite the field name, presumably the comment text — confirm
+};
+
+struct BrigDirectiveControl { // control directive: a control code plus its operand list
+ BrigBase base;
+ BrigControlDirective16_t control;
+ uint16_t reserved; // 16 unused bits between control and operands
+ BrigDataOffsetOperandList32_t operands;
+};
+
+struct BrigDirectiveExecutable { // executable directive: name, argument counts and code offsets
+ BrigBase base;
+ BrigDataOffsetString32_t name;
+ uint16_t outArgCount;
+ uint16_t inArgCount;
+ BrigCodeOffset32_t firstInArg;
+ BrigCodeOffset32_t firstCodeBlockEntry;
+ BrigCodeOffset32_t nextModuleEntry;
+ BrigExecutableModifier8_t modifier;
+ BrigLinkage8_t linkage;
+ uint16_t reserved; // trailing 16 unused bits after modifier and linkage
+};
+
+struct BrigDirectiveExtension { // extension directive: header plus the extension name
+ BrigBase base;
+ BrigDataOffsetString32_t name;
+};
+
+struct BrigDirectiveFbarrier { // fbarrier directive: name, modifier flags and linkage
+ BrigBase base;
+ BrigDataOffsetString32_t name;
+ BrigVariableModifier8_t modifier; // BrigVariableModifierMask bits
+ BrigLinkage8_t linkage;
+ uint16_t reserved; // trailing 16 unused bits
+};
+
+struct BrigDirectiveLabel { // label directive: header plus the label name
+ BrigBase base;
+ BrigDataOffsetString32_t name;
+};
+
+struct BrigDirectiveLoc { // source-location directive: file name, line and column
+ BrigBase base;
+ BrigDataOffsetString32_t filename;
+ uint32_t line;
+ uint32_t column;
+};
+
+struct BrigDirectiveNone { // directive that carries only the common header
+ BrigBase base;
+};
+
+struct BrigDirectivePragma { // pragma directive: header plus an operand list
+ BrigBase base;
+ BrigDataOffsetOperandList32_t operands;
+};
+
+struct BrigDirectiveVariable { // variable directive: name, initializer, type, placement and linkage
+ BrigBase base;
+ BrigDataOffsetString32_t name;
+ BrigOperandOffset32_t init;
+ BrigType16_t type; // a BrigType code
+ BrigSegment8_t segment; // a BrigSegment value
+ BrigAlignment8_t align;
+ BrigUInt64 dim; // 64-bit value held as a lo/hi pair
+ BrigVariableModifier8_t modifier; // BrigVariableModifierMask bits
+ BrigLinkage8_t linkage;
+ BrigAllocation8_t allocation;
+ uint8_t reserved; // trailing unused byte
+};
+
+struct BrigDirectiveModule { // module directive: name, HSAIL version and target settings
+ BrigBase base;
+ BrigDataOffsetString32_t name;
+ BrigVersion32_t hsailMajor;
+ BrigVersion32_t hsailMinor;
+ BrigProfile8_t profile;
+ BrigMachineModel8_t machineModel;
+ BrigRound8_t defaultFloatRound;
+ uint8_t reserved; // trailing unused byte
+};
+
+struct BrigInstBase { // common instruction header; first member of every BrigInst* record below
+ BrigBase base;
+ BrigOpcode16_t opcode;
+ BrigType16_t type; // a BrigType code
+ BrigDataOffsetOperandList32_t operands;
+};
+
+struct BrigInstAddr { // instruction header plus a segment
+ BrigInstBase base;
+ BrigSegment8_t segment;
+ uint8_t reserved[3]; // with the one-byte segment, fills out 4 bytes
+};
+
+struct BrigInstAtomic { // atomic instruction: segment, memory order/scope and the atomic operation
+ BrigInstBase base;
+ BrigSegment8_t segment;
+ BrigMemoryOrder8_t memoryOrder;
+ BrigMemoryScope8_t memoryScope;
+ BrigAtomicOperation8_t atomicOperation;
+ uint8_t equivClass;
+ uint8_t reserved[3]; // trailing unused bytes
+};
+
+struct BrigInstBasic { // instruction that needs nothing beyond the common instruction header
+ BrigInstBase base;
+};
+
+struct BrigInstBr { // branch instruction: instruction header plus a width
+ BrigInstBase base;
+ BrigWidth8_t width; // a BrigWidth code
+ uint8_t reserved[3]; // with the one-byte width, fills out 4 bytes
+};
+
+struct BrigInstCmp { // compare instruction: source type, modifier, comparison and packing
+ BrigInstBase base;
+ BrigType16_t sourceType; // a BrigType code
+ BrigAluModifier8_t modifier;
+ BrigCompareOperation8_t compare;
+ BrigPack8_t pack;
+ uint8_t reserved[3]; // trailing unused bytes
+};
+
+struct BrigInstCvt { // conversion instruction: source type, modifier and rounding
+ BrigInstBase base;
+ BrigType16_t sourceType; // a BrigType code
+ BrigAluModifier8_t modifier;
+ BrigRound8_t round;
+};
+
+struct BrigInstImage { // image instruction: image/coordinate types and geometry
+ BrigInstBase base;
+ BrigType16_t imageType; // a BrigType code
+ BrigType16_t coordType; // a BrigType code
+ BrigImageGeometry8_t geometry;
+ uint8_t equivClass;
+ uint16_t reserved; // trailing 16 unused bits
+};
+
+struct BrigInstLane { // lane instruction: source type and width
+ BrigInstBase base;
+ BrigType16_t sourceType; // a BrigType code
+ BrigWidth8_t width; // a BrigWidth code
+ uint8_t reserved; // with sourceType and width, fills out 4 bytes
+};
+
+struct BrigInstMem { // memory instruction: segment, alignment, width and modifier
+ BrigInstBase base;
+ BrigSegment8_t segment;
+ BrigAlignment8_t align;
+ uint8_t equivClass;
+ BrigWidth8_t width; // a BrigWidth code
+ BrigMemoryModifier8_t modifier;
+ uint8_t reserved[3]; // trailing unused bytes
+};
+
+struct BrigInstMemFence { // memory fence: one memory order plus a scope per segment kind
+ BrigInstBase base;
+ BrigMemoryOrder8_t memoryOrder;
+ BrigMemoryScope8_t globalSegmentMemoryScope;
+ BrigMemoryScope8_t groupSegmentMemoryScope;
+ BrigMemoryScope8_t imageSegmentMemoryScope;
+};
+
+struct BrigInstMod { // instruction with modifier, rounding and packing controls
+ BrigInstBase base;
+ BrigAluModifier8_t modifier;
+ BrigRound8_t round;
+ BrigPack8_t pack;
+ uint8_t reserved; // trailing unused byte
+};
+
+struct BrigInstQueryImage { // image query: image type, geometry and the property queried
+ BrigInstBase base;
+ BrigType16_t imageType; // a BrigType code
+ BrigImageGeometry8_t geometry;
+ BrigImageQuery8_t query;
+};
+
+struct BrigInstQuerySampler { // sampler query: which sampler property to read
+ BrigInstBase base;
+ BrigSamplerQuery8_t query; // a BrigSamplerQuery value
+ uint8_t reserved[3]; // with the one-byte query, fills out 4 bytes
+};
+
+struct BrigInstQueue { // queue instruction: segment and memory order
+ BrigInstBase base;
+ BrigSegment8_t segment;
+ BrigMemoryOrder8_t memoryOrder;
+ uint16_t reserved; // trailing 16 unused bits
+};
+
+struct BrigInstSeg { // instruction header plus a segment (same fields as BrigInstAddr)
+ BrigInstBase base;
+ BrigSegment8_t segment;
+ uint8_t reserved[3]; // with the one-byte segment, fills out 4 bytes
+};
+
+struct BrigInstSegCvt { // segment conversion: source type, segment and modifier flags
+ BrigInstBase base;
+ BrigType16_t sourceType; // a BrigType code
+ BrigSegment8_t segment;
+ BrigSegCvtModifier8_t modifier; // BrigSegCvtModifierMask bits
+};
+
+struct BrigInstSignal { // signal instruction: signal type, memory order and operation
+ BrigInstBase base;
+ BrigType16_t signalType; // a BrigType code
+ BrigMemoryOrder8_t memoryOrder;
+ BrigAtomicOperation8_t signalOperation; // reuses the atomic-operation field type
+};
+
+struct BrigInstSourceType { // instruction header plus a source type
+ BrigInstBase base;
+ BrigType16_t sourceType; // a BrigType code
+ uint16_t reserved; // with the 16-bit sourceType, fills out 4 bytes
+};
+
+struct BrigOperandAddress { // address operand: symbol, register and a 64-bit offset
+ BrigBase base;
+ BrigCodeOffset32_t symbol;
+ BrigOperandOffset32_t reg;
+ BrigUInt64 offset; // 64-bit value held as a lo/hi pair
+};
+
+struct BrigOperandAlign { // alignment operand
+ BrigBase base;
+ BrigAlignment8_t align;
+ uint8_t reserved[3]; // trailing unused bytes
+};
+
+struct BrigOperandCodeList { // operand referring to a list of code entries
+ BrigBase base;
+ BrigDataOffsetCodeList32_t elements;
+};
+
+struct BrigOperandCodeRef { // operand referring to a single code entry
+ BrigBase base;
+ BrigCodeOffset32_t ref;
+};
+
+struct BrigOperandConstantBytes { // constant operand: a type plus a reference to its bytes
+ BrigBase base;
+ BrigType16_t type; // a BrigType code
+ uint16_t reserved; // 16 unused bits between type and bytes
+ BrigDataOffsetString32_t bytes;
+};
+
+struct BrigOperandConstantOperandList { // constant operand built from a list of operands
+ BrigBase base;
+ BrigType16_t type; // a BrigType code
+ uint16_t reserved; // 16 unused bits between type and elements
+ BrigDataOffsetOperandList32_t elements;
+};
+
+struct BrigOperandConstantImage { // constant image operand: format fields, then four 64-bit extents
+ BrigBase base;
+ BrigType16_t type; // a BrigType code
+ BrigImageGeometry8_t geometry;
+ BrigImageChannelOrder8_t channelOrder;
+ BrigImageChannelType8_t channelType;
+ uint8_t reserved[3]; // unused bytes before the 64-bit fields
+ BrigUInt64 width; // each extent is a lo/hi pair
+ BrigUInt64 height;
+ BrigUInt64 depth;
+ BrigUInt64 array;
+};
+
+struct BrigOperandOperandList { // operand referring to a list of operands
+ BrigBase base;
+ BrigDataOffsetOperandList32_t elements;
+};
+
+struct BrigOperandRegister { // register operand: register kind plus register number
+ BrigBase base;
+ BrigRegisterKind16_t regKind;
+ uint16_t regNum;
+};
+
+struct BrigOperandConstantSampler { // constant sampler operand: coord, filter and addressing settings
+ BrigBase base;
+ BrigType16_t type; // a BrigType code
+ BrigSamplerCoordNormalization8_t coord; // a BrigSamplerCoordNormalization value
+ BrigSamplerFilter8_t filter; // a BrigSamplerFilter value
+ BrigSamplerAddressing8_t addressing; // a BrigSamplerAddressing value
+ uint8_t reserved[3]; // type + three setting bytes + these 3 bytes = 8 bytes
+};
+
+struct BrigOperandString { // string operand: header plus a string reference
+ BrigBase base;
+ BrigDataOffsetString32_t string;
+};
+
+struct BrigOperandWavesize { // wavesize operand; carries only the common header
+ BrigBase base;
+};
+
+typedef uint32_t BrigExceptions32_t; // 32-bit type paired with the BrigExceptionsMask bits
+enum BrigExceptionsMask { // one bit per exception kind, bits 0-4
+ BRIG_EXCEPTIONS_INVALID_OPERATION = 1 << 0,
+ BRIG_EXCEPTIONS_DIVIDE_BY_ZERO = 1 << 1,
+ BRIG_EXCEPTIONS_OVERFLOW = 1 << 2,
+ BRIG_EXCEPTIONS_UNDERFLOW = 1 << 3,
+ BRIG_EXCEPTIONS_INEXACT = 1 << 4,
+
+ BRIG_EXCEPTIONS_FIRST_USER_DEFINED = 1 << 16 // bits 5-15 are not assigned here
+};
+
+struct BrigSectionHeader { // header at the start of a section
+ uint64_t byteCount;
+ uint32_t headerByteCount;
+ uint32_t nameLength;
+ uint8_t name[1]; // NOTE(review): looks like a variable-length tail of nameLength bytes — confirm
+};
+
+struct BrigModuleHeader { // header at the start of a BRIG module
+ char identification[8]; // 8-byte identification field
+ BrigVersion32_t brigMajor;
+ BrigVersion32_t brigMinor;
+ uint64_t byteCount;
+ uint8_t hash[64]; // 64-byte hash field; the algorithm is not stated in this header
+ uint32_t reserved;
+ uint32_t sectionCount;
+ uint64_t sectionIndex;
+};
+
+typedef BrigModuleHeader* BrigModule_t; // module handle: a pointer to the module's BrigModuleHeader
+
+#ifdef __cplusplus
+}
+#endif /*__cplusplus*/
+
+#endif // defined(INCLUDED_BRIG_H)
diff --git a/third_party/rocm/include/hsa/amd_hsa_common.h b/third_party/rocm/include/hsa/amd_hsa_common.h
new file mode 100644
index 0000000..7c4ed3e
--- /dev/null
+++ b/third_party/rocm/include/hsa/amd_hsa_common.h
@@ -0,0 +1,91 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+// AMD Research and AMD HSA Software Development
+//
+// Advanced Micro Devices, Inc.
+//
+// www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimers.
+// - Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimers in
+// the documentation and/or other materials provided with the distribution.
+// - Neither the names of Advanced Micro Devices, Inc,
+// nor the names of its contributors may be used to endorse or promote
+// products derived from this Software without specific prior written
+// permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+// The following set of header files provides definitions for AMD GPU
+// Architecture:
+// - amd_hsa_common.h
+// - amd_hsa_elf.h
+// - amd_hsa_kernel_code.h
+// - amd_hsa_queue.h
+// - amd_hsa_signal.h
+//
+// Refer to "HSA Application Binary Interface: AMD GPU Architecture" for more
+// information.
+
+#ifndef AMD_HSA_COMMON_H
+#define AMD_HSA_COMMON_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+// Descriptive (human-readable string, not numeric) version of the HSA Application Binary Interface.
+#define AMD_HSA_ABI_VERSION "AMD GPU Architecture v0.35 (June 25, 2015)"
+
+// Alignment attribute that specifies a minimum alignment (in bytes) for
+// variables of the specified type; expands to nothing under the resource compiler.
+#if defined(__GNUC__)
+# define __ALIGNED__(x) __attribute__((aligned(x)))
+#elif defined(_MSC_VER)
+# define __ALIGNED__(x) __declspec(align(x))
+#elif defined(RC_INVOKED)
+# define __ALIGNED__(x)
+#else
+# error "__ALIGNED__: unsupported compiler (expected a GCC-compatible compiler, MSVC, or RC)"
+#endif
+
+// Creates enumeration entries for packed types: name##_SHIFT, name##_WIDTH and the
+// mask `name`. Computed in int, so a field reaching bit 31 overflows a 32-bit int.
+#define AMD_HSA_BITS_CREATE_ENUM_ENTRIES(name, shift, width) \
+ name##_SHIFT = (shift), \
+ name##_WIDTH = (width), \
+ name = (((1 << (width)) - 1) << (shift))
+
+// Gets the field selected by mask from packed value src; needs a matching mask##_SHIFT.
+#define AMD_HSA_BITS_GET(src, mask) \
+ (((src) & (mask)) >> mask##_SHIFT)
+
+// Sets val bits for mask in packed dst. One expression, so it is safe in an unbraced if/else.
+#define AMD_HSA_BITS_SET(dst, mask, val) \
+ ((dst) &= (~(1 << mask##_SHIFT) & ~(mask)), \
+ (dst) |= (((val) << mask##_SHIFT) & (mask)))
+
+#endif // AMD_HSA_COMMON_H
diff --git a/third_party/rocm/include/hsa/amd_hsa_elf.h b/third_party/rocm/include/hsa/amd_hsa_elf.h
new file mode 100644
index 0000000..adcdec4
--- /dev/null
+++ b/third_party/rocm/include/hsa/amd_hsa_elf.h
@@ -0,0 +1,416 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+// AMD Research and AMD HSA Software Development
+//
+// Advanced Micro Devices, Inc.
+//
+// www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimers.
+// - Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimers in
+// the documentation and/or other materials provided with the distribution.
+// - Neither the names of Advanced Micro Devices, Inc,
+// nor the names of its contributors may be used to endorse or promote
+// products derived from this Software without specific prior written
+// permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+// Undefine the macro in case it is defined in the system elf.h.
+#undef EM_AMDGPU
+
+#ifndef AMD_HSA_ELF_H
+#define AMD_HSA_ELF_H
+
+// AMD GPU Specific ELF Header Enumeration Values.
+//
+// Values are copied from LLVM BinaryFormat/ELF.h. This file also contains
+// code object V1 definitions which are not part of the LLVM header. Code object
+// V1 was only supported by the Finalizer which is now deprecated and removed.
+//
+// TODO: Deprecate and remove V1 support and replace this header with using the
+// LLVM header.
+namespace ELF {
+
+// Machine architectures
+// See current registered ELF machine architectures at:
+// http://www.uxsglobal.com/developers/gabi/latest/ch4.eheader.html
+enum {
+ EM_AMDGPU = 224, // AMD GPU architecture
+};
+
+// OS ABI identification.
+enum {
+ ELFOSABI_AMDGPU_HSA = 64, // AMD HSA runtime
+};
+
+// AMDGPU OS ABI Version identification.
+enum {
+ // ELFABIVERSION_AMDGPU_HSA_V1 does not exist because OS ABI identification
+ // was never defined for V1.
+ ELFABIVERSION_AMDGPU_HSA_V2 = 0,
+ ELFABIVERSION_AMDGPU_HSA_V3 = 1,
+ ELFABIVERSION_AMDGPU_HSA_V4 = 2
+};
+
+// AMDGPU specific e_flags.
+enum : unsigned {
+ // Processor selection mask for EF_AMDGPU_MACH_* values.
+ EF_AMDGPU_MACH = 0x0ff, // low 8 bits
+
+ // Processor not specified.
+ EF_AMDGPU_MACH_NONE = 0x000,
+
+ // AMDGCN-based processors.
+ EF_AMDGPU_MACH_AMDGCN_GFX600 = 0x020,
+ EF_AMDGPU_MACH_AMDGCN_GFX601 = 0x021,
+ EF_AMDGPU_MACH_AMDGCN_GFX700 = 0x022,
+ EF_AMDGPU_MACH_AMDGCN_GFX701 = 0x023,
+ EF_AMDGPU_MACH_AMDGCN_GFX702 = 0x024,
+ EF_AMDGPU_MACH_AMDGCN_GFX703 = 0x025,
+ EF_AMDGPU_MACH_AMDGCN_GFX704 = 0x026,
+ EF_AMDGPU_MACH_AMDGCN_RESERVED_0X27 = 0x027,
+ EF_AMDGPU_MACH_AMDGCN_GFX801 = 0x028,
+ EF_AMDGPU_MACH_AMDGCN_GFX802 = 0x029,
+ EF_AMDGPU_MACH_AMDGCN_GFX803 = 0x02a,
+ EF_AMDGPU_MACH_AMDGCN_GFX810 = 0x02b,
+ EF_AMDGPU_MACH_AMDGCN_GFX900 = 0x02c,
+ EF_AMDGPU_MACH_AMDGCN_GFX902 = 0x02d,
+ EF_AMDGPU_MACH_AMDGCN_GFX904 = 0x02e,
+ EF_AMDGPU_MACH_AMDGCN_GFX906 = 0x02f,
+ EF_AMDGPU_MACH_AMDGCN_GFX908 = 0x030,
+ EF_AMDGPU_MACH_AMDGCN_GFX909 = 0x031,
+ EF_AMDGPU_MACH_AMDGCN_GFX90C = 0x032,
+ EF_AMDGPU_MACH_AMDGCN_GFX1010 = 0x033,
+ EF_AMDGPU_MACH_AMDGCN_GFX1011 = 0x034,
+ EF_AMDGPU_MACH_AMDGCN_GFX1012 = 0x035,
+ EF_AMDGPU_MACH_AMDGCN_GFX1030 = 0x036,
+ EF_AMDGPU_MACH_AMDGCN_GFX1031 = 0x037,
+ EF_AMDGPU_MACH_AMDGCN_GFX1032 = 0x038,
+ EF_AMDGPU_MACH_AMDGCN_GFX1033 = 0x039,
+ EF_AMDGPU_MACH_AMDGCN_GFX602 = 0x03a, // numbered after GFX1033, not beside the other GFX6xx entries
+ EF_AMDGPU_MACH_AMDGCN_GFX705 = 0x03b,
+ EF_AMDGPU_MACH_AMDGCN_GFX805 = 0x03c,
+
+ // First/last AMDGCN-based processors.
+ EF_AMDGPU_MACH_AMDGCN_FIRST = EF_AMDGPU_MACH_AMDGCN_GFX600,
+ EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX805, // must track the highest-valued entry above
+
+ // Indicates if the "xnack" target feature is enabled for all code contained
+ // in the object.
+ //
+ // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V2.
+ EF_AMDGPU_FEATURE_XNACK_V2 = 0x01,
+ // Indicates if the trap handler is enabled for all code contained
+ // in the object.
+ //
+ // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V2.
+ EF_AMDGPU_FEATURE_TRAP_HANDLER_V2 = 0x02,
+
+ // Indicates if the "xnack" target feature is enabled for all code contained
+ // in the object.
+ //
+ // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V3.
+ EF_AMDGPU_FEATURE_XNACK_V3 = 0x100,
+ // Indicates if the "sramecc" target feature is enabled for all code
+ // contained in the object.
+ //
+ // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V3.
+ EF_AMDGPU_FEATURE_SRAMECC_V3 = 0x200,
+
+ // XNACK selection mask for EF_AMDGPU_FEATURE_XNACK_* values.
+ //
+ // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V4.
+ EF_AMDGPU_FEATURE_XNACK_V4 = 0x300, // two-bit field, bits 8-9
+ // XNACK is not supported.
+ EF_AMDGPU_FEATURE_XNACK_UNSUPPORTED_V4 = 0x000,
+ // XNACK is any/default/unspecified.
+ EF_AMDGPU_FEATURE_XNACK_ANY_V4 = 0x100,
+ // XNACK is off.
+ EF_AMDGPU_FEATURE_XNACK_OFF_V4 = 0x200,
+ // XNACK is on.
+ EF_AMDGPU_FEATURE_XNACK_ON_V4 = 0x300,
+
+ // SRAMECC selection mask for EF_AMDGPU_FEATURE_SRAMECC_* values.
+ //
+ // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V4.
+ EF_AMDGPU_FEATURE_SRAMECC_V4 = 0xc00, // two-bit field, bits 10-11
+ // SRAMECC is not supported.
+ EF_AMDGPU_FEATURE_SRAMECC_UNSUPPORTED_V4 = 0x000,
+ // SRAMECC is any/default/unspecified.
+ EF_AMDGPU_FEATURE_SRAMECC_ANY_V4 = 0x400,
+ // SRAMECC is off.
+ EF_AMDGPU_FEATURE_SRAMECC_OFF_V4 = 0x800,
+ // SRAMECC is on.
+ EF_AMDGPU_FEATURE_SRAMECC_ON_V4 = 0xc00,
+};
+
+} // end namespace ELF
+
+// ELF section header flags, each masked with SHF_MASKOS (not defined in this header).
+#define SHF_AMDGPU_HSA_GLOBAL (0x00100000 & SHF_MASKOS)
+#define SHF_AMDGPU_HSA_READONLY (0x00200000 & SHF_MASKOS)
+#define SHF_AMDGPU_HSA_CODE (0x00400000 & SHF_MASKOS)
+#define SHF_AMDGPU_HSA_AGENT (0x00800000 & SHF_MASKOS)
+
+// AMD GPU HSA segment kinds; added to PT_LOOS to form the PT_AMDGPU_HSA_LOAD_* types below.
+typedef enum {
+ AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM = 0,
+ AMDGPU_HSA_SEGMENT_GLOBAL_AGENT = 1,
+ AMDGPU_HSA_SEGMENT_READONLY_AGENT = 2,
+ AMDGPU_HSA_SEGMENT_CODE_AGENT = 3,
+ AMDGPU_HSA_SEGMENT_LAST, // = 4, one past the last segment kind
+} amdgpu_hsa_elf_segment_t;
+
+// ELF program header types: PT_LOOS (not defined in this header) plus a segment kind.
+#define PT_AMDGPU_HSA_LOAD_GLOBAL_PROGRAM (PT_LOOS + AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM)
+#define PT_AMDGPU_HSA_LOAD_GLOBAL_AGENT (PT_LOOS + AMDGPU_HSA_SEGMENT_GLOBAL_AGENT)
+#define PT_AMDGPU_HSA_LOAD_READONLY_AGENT (PT_LOOS + AMDGPU_HSA_SEGMENT_READONLY_AGENT)
+#define PT_AMDGPU_HSA_LOAD_CODE_AGENT (PT_LOOS + AMDGPU_HSA_SEGMENT_CODE_AGENT)
+
+// ELF symbol types, as offsets from STT_LOOS (not defined in this header).
+#define STT_AMDGPU_HSA_KERNEL (STT_LOOS + 0)
+#define STT_AMDGPU_HSA_INDIRECT_FUNCTION (STT_LOOS + 1)
+#define STT_AMDGPU_HSA_METADATA (STT_LOOS + 2)
+
+// ELF symbol binding, as an offset from STB_LOOS (not defined in this header).
+#define STB_AMDGPU_HSA_EXTERNAL (STB_LOOS + 0)
+
+// ELF symbol "other" value packing: bits 0-1 = v, bits 2-3 = allocation, bits 4 and up = flags.
+#define ELF64_ST_AMDGPU_ALLOCATION(o) (((o) >> 2) & 0x3)
+#define ELF64_ST_AMDGPU_FLAGS(o) ((o) >> 4)
+#define ELF64_ST_AMDGPU_OTHER(f, a, v) (((f) << 4) + (((a) & 0x3) << 2) + ((v) & 0x3))
+
+typedef enum { // symbol allocation kinds; values 0-3 fit the two-bit field read by ELF64_ST_AMDGPU_ALLOCATION
+ AMDGPU_HSA_SYMBOL_ALLOCATION_DEFAULT = 0,
+ AMDGPU_HSA_SYMBOL_ALLOCATION_GLOBAL_PROGRAM = 1,
+ AMDGPU_HSA_SYMBOL_ALLOCATION_GLOBAL_AGENT = 2,
+ AMDGPU_HSA_SYMBOL_ALLOCATION_READONLY_AGENT = 3,
+ AMDGPU_HSA_SYMBOL_ALLOCATION_LAST, // = 4, one past the last allocation kind
+} amdgpu_hsa_symbol_allocation_t;
+
+// ELF symbol allocation values: STA_* aliases for the amdgpu_hsa_symbol_allocation_t enumerators.
+#define STA_AMDGPU_HSA_DEFAULT AMDGPU_HSA_SYMBOL_ALLOCATION_DEFAULT
+#define STA_AMDGPU_HSA_GLOBAL_PROGRAM AMDGPU_HSA_SYMBOL_ALLOCATION_GLOBAL_PROGRAM
+#define STA_AMDGPU_HSA_GLOBAL_AGENT AMDGPU_HSA_SYMBOL_ALLOCATION_GLOBAL_AGENT
+#define STA_AMDGPU_HSA_READONLY_AGENT AMDGPU_HSA_SYMBOL_ALLOCATION_READONLY_AGENT
+
+typedef enum { // symbol flag values
+ AMDGPU_HSA_SYMBOL_FLAG_DEFAULT = 0,
+ AMDGPU_HSA_SYMBOL_FLAG_CONST = 1,
+ AMDGPU_HSA_SYMBOL_FLAG_LAST, // = 2, one past the last flag value
+} amdgpu_hsa_symbol_flag_t;
+
+// ELF symbol flag value: STF_* alias for AMDGPU_HSA_SYMBOL_FLAG_CONST.
+#define STF_AMDGPU_HSA_CONST AMDGPU_HSA_SYMBOL_FLAG_CONST
+
+// AMD GPU relocation type values (numbers 6-12 are not defined in this header).
+#define R_AMDGPU_NONE 0
+#define R_AMDGPU_32_LOW 1
+#define R_AMDGPU_32_HIGH 2
+#define R_AMDGPU_64 3
+#define R_AMDGPU_INIT_SAMPLER 4
+#define R_AMDGPU_INIT_IMAGE 5
+#define R_AMDGPU_RELATIVE64 13
+
+// AMD GPU note type values (numbers 7-10 and 12-100 are not defined in this header).
+#define NT_AMD_HSA_CODE_OBJECT_VERSION 1
+#define NT_AMD_HSA_HSAIL 2
+#define NT_AMD_HSA_ISA_VERSION 3
+#define NT_AMD_HSA_PRODUCER 4
+#define NT_AMD_HSA_PRODUCER_OPTIONS 5
+#define NT_AMD_HSA_EXTENSION 6
+#define NT_AMD_HSA_ISA_NAME 11
+#define NT_AMD_HSA_HLDEBUG_DEBUG 101
+#define NT_AMD_HSA_HLDEBUG_TARGET 102
+
+// AMD GPU Metadata Kind Enumeration Values.
+typedef uint16_t amdgpu_hsa_metadata_kind16_t; // field type of `kind` in the sampler and image descriptors
+typedef enum {
+ AMDGPU_HSA_METADATA_KIND_NONE = 0,
+ AMDGPU_HSA_METADATA_KIND_INIT_SAMP = 1,
+ AMDGPU_HSA_METADATA_KIND_INIT_ROIMG = 2,
+ AMDGPU_HSA_METADATA_KIND_INIT_WOIMG = 3,
+ AMDGPU_HSA_METADATA_KIND_INIT_RWIMG = 4
+} amdgpu_hsa_metadata_kind_t;
+
+// AMD GPU Sampler Coordinate Normalization Enumeration Values.
+typedef uint8_t amdgpu_hsa_sampler_coord8_t; // field type of amdgpu_hsa_sampler_descriptor_t::coord
+typedef enum {
+ AMDGPU_HSA_SAMPLER_COORD_UNNORMALIZED = 0,
+ AMDGPU_HSA_SAMPLER_COORD_NORMALIZED = 1
+} amdgpu_hsa_sampler_coord_t;
+
+// AMD GPU Sampler Filter Enumeration Values.
+typedef uint8_t amdgpu_hsa_sampler_filter8_t; // field type of amdgpu_hsa_sampler_descriptor_t::filter
+typedef enum {
+ AMDGPU_HSA_SAMPLER_FILTER_NEAREST = 0,
+ AMDGPU_HSA_SAMPLER_FILTER_LINEAR = 1
+} amdgpu_hsa_sampler_filter_t;
+
+// AMD GPU Sampler Addressing Enumeration Values.
+typedef uint8_t amdgpu_hsa_sampler_addressing8_t; // field type of amdgpu_hsa_sampler_descriptor_t::addressing
+typedef enum {
+ AMDGPU_HSA_SAMPLER_ADDRESSING_UNDEFINED = 0,
+ AMDGPU_HSA_SAMPLER_ADDRESSING_CLAMP_TO_EDGE = 1,
+ AMDGPU_HSA_SAMPLER_ADDRESSING_CLAMP_TO_BORDER = 2,
+ AMDGPU_HSA_SAMPLER_ADDRESSING_REPEAT = 3,
+ AMDGPU_HSA_SAMPLER_ADDRESSING_MIRRORED_REPEAT = 4
+} amdgpu_hsa_sampler_addressing_t;
+
+// AMD GPU Sampler Descriptor: 8 bytes of fields (2 + 2 + 1 + 1 + 1 + 1).
+typedef struct amdgpu_hsa_sampler_descriptor_s {
+ uint16_t size; // presumably the descriptor size in bytes — confirm
+ amdgpu_hsa_metadata_kind16_t kind; // an amdgpu_hsa_metadata_kind_t value
+ amdgpu_hsa_sampler_coord8_t coord;
+ amdgpu_hsa_sampler_filter8_t filter;
+ amdgpu_hsa_sampler_addressing8_t addressing;
+ uint8_t reserved1; // unused byte that rounds the fields up to 8 bytes
+} amdgpu_hsa_sampler_descriptor_t;
+
+// AMD GPU Image Geometry Enumeration Values.
+typedef uint8_t amdgpu_hsa_image_geometry8_t; // field type of amdgpu_hsa_image_descriptor_t::geometry
+typedef enum {
+ AMDGPU_HSA_IMAGE_GEOMETRY_1D = 0,
+ AMDGPU_HSA_IMAGE_GEOMETRY_2D = 1,
+ AMDGPU_HSA_IMAGE_GEOMETRY_3D = 2,
+ AMDGPU_HSA_IMAGE_GEOMETRY_1DA = 3,
+ AMDGPU_HSA_IMAGE_GEOMETRY_2DA = 4,
+ AMDGPU_HSA_IMAGE_GEOMETRY_1DB = 5,
+ AMDGPU_HSA_IMAGE_GEOMETRY_2DDEPTH = 6,
+ AMDGPU_HSA_IMAGE_GEOMETRY_2DADEPTH = 7
+} amdgpu_hsa_image_geometry_t;
+
+// AMD GPU Image Channel Order Enumeration Values.
+typedef uint8_t amdgpu_hsa_image_channel_order8_t; // field type of amdgpu_hsa_image_descriptor_t::channel_order
+typedef enum {
+ AMDGPU_HSA_IMAGE_CHANNEL_ORDER_A = 0,
+ AMDGPU_HSA_IMAGE_CHANNEL_ORDER_R = 1,
+ AMDGPU_HSA_IMAGE_CHANNEL_ORDER_RX = 2,
+ AMDGPU_HSA_IMAGE_CHANNEL_ORDER_RG = 3,
+ AMDGPU_HSA_IMAGE_CHANNEL_ORDER_RGX = 4,
+ AMDGPU_HSA_IMAGE_CHANNEL_ORDER_RA = 5,
+ AMDGPU_HSA_IMAGE_CHANNEL_ORDER_RGB = 6,
+ AMDGPU_HSA_IMAGE_CHANNEL_ORDER_RGBX = 7,
+ AMDGPU_HSA_IMAGE_CHANNEL_ORDER_RGBA = 8,
+ AMDGPU_HSA_IMAGE_CHANNEL_ORDER_BGRA = 9,
+ AMDGPU_HSA_IMAGE_CHANNEL_ORDER_ARGB = 10,
+ AMDGPU_HSA_IMAGE_CHANNEL_ORDER_ABGR = 11,
+ AMDGPU_HSA_IMAGE_CHANNEL_ORDER_SRGB = 12,
+ AMDGPU_HSA_IMAGE_CHANNEL_ORDER_SRGBX = 13,
+ AMDGPU_HSA_IMAGE_CHANNEL_ORDER_SRGBA = 14,
+ AMDGPU_HSA_IMAGE_CHANNEL_ORDER_SBGRA = 15,
+ AMDGPU_HSA_IMAGE_CHANNEL_ORDER_INTENSITY = 16,
+ AMDGPU_HSA_IMAGE_CHANNEL_ORDER_LUMINANCE = 17,
+ AMDGPU_HSA_IMAGE_CHANNEL_ORDER_DEPTH = 18,
+ AMDGPU_HSA_IMAGE_CHANNEL_ORDER_DEPTH_STENCIL = 19
+} amdgpu_hsa_image_channel_order_t;
+
+// AMD GPU Image Channel Type Enumeration Values.
+typedef uint8_t amdgpu_hsa_image_channel_type8_t; // field type of amdgpu_hsa_image_descriptor_t::channel_type
+typedef enum {
+ AMDGPU_HSA_IMAGE_CHANNEL_TYPE_SNORM_INT8 = 0,
+ AMDGPU_HSA_IMAGE_CHANNEL_TYPE_SNORM_INT16 = 1,
+ AMDGPU_HSA_IMAGE_CHANNEL_TYPE_UNORM_INT8 = 2,
+ AMDGPU_HSA_IMAGE_CHANNEL_TYPE_UNORM_INT16 = 3,
+ AMDGPU_HSA_IMAGE_CHANNEL_TYPE_UNORM_INT24 = 4,
+ AMDGPU_HSA_IMAGE_CHANNEL_TYPE_SHORT_555 = 5,
+ AMDGPU_HSA_IMAGE_CHANNEL_TYPE_SHORT_565 = 6,
+ AMDGPU_HSA_IMAGE_CHANNEL_TYPE_INT_101010 = 7,
+ AMDGPU_HSA_IMAGE_CHANNEL_TYPE_SIGNED_INT8 = 8,
+ AMDGPU_HSA_IMAGE_CHANNEL_TYPE_SIGNED_INT16 = 9,
+ AMDGPU_HSA_IMAGE_CHANNEL_TYPE_SIGNED_INT32 = 10,
+ AMDGPU_HSA_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8 = 11,
+ AMDGPU_HSA_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16 = 12,
+ AMDGPU_HSA_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32 = 13,
+ AMDGPU_HSA_IMAGE_CHANNEL_TYPE_HALF_FLOAT = 14,
+ AMDGPU_HSA_IMAGE_CHANNEL_TYPE_FLOAT = 15
+} amdgpu_hsa_image_channel_type_t;
+
+// AMD GPU Image Descriptor: 8 bytes of format fields followed by four 64-bit extents.
+typedef struct amdgpu_hsa_image_descriptor_s {
+ uint16_t size; // presumably the descriptor size in bytes — confirm
+ amdgpu_hsa_metadata_kind16_t kind; // an amdgpu_hsa_metadata_kind_t value
+ amdgpu_hsa_image_geometry8_t geometry;
+ amdgpu_hsa_image_channel_order8_t channel_order;
+ amdgpu_hsa_image_channel_type8_t channel_type;
+ uint8_t reserved1; // unused byte; brings the preceding fields to 8 bytes
+ uint64_t width;
+ uint64_t height;
+ uint64_t depth;
+ uint64_t array;
+} amdgpu_hsa_image_descriptor_t;
+
+typedef struct amdgpu_hsa_note_code_object_version_s { // code object version as a major/minor pair
+ uint32_t major_version;
+ uint32_t minor_version;
+} amdgpu_hsa_note_code_object_version_t;
+
+typedef struct amdgpu_hsa_note_hsail_s { // HSAIL version plus three one-byte settings
+ uint32_t hsail_major_version;
+ uint32_t hsail_minor_version;
+ uint8_t profile;
+ uint8_t machine_model;
+ uint8_t default_float_round; // fields total 11 bytes; no explicit padding member is declared
+} amdgpu_hsa_note_hsail_t;
+
+typedef struct amdgpu_hsa_note_isa_s { // ISA identification: name sizes, version triple and names
+ uint16_t vendor_name_size;
+ uint16_t architecture_name_size;
+ uint32_t major;
+ uint32_t minor;
+ uint32_t stepping;
+ char vendor_and_architecture_name[1]; // NOTE(review): looks like a variable-length tail holding both names — confirm
+} amdgpu_hsa_note_isa_t;
+
+typedef struct amdgpu_hsa_note_producer_s { // producer identification: name size, version and name
+ uint16_t producer_name_size;
+ uint16_t reserved; // 16 unused bits before the 32-bit version fields
+ uint32_t producer_major_version;
+ uint32_t producer_minor_version;
+ char producer_name[1]; // NOTE(review): looks like a variable-length tail of producer_name_size bytes — confirm
+} amdgpu_hsa_note_producer_t;
+
+typedef struct amdgpu_hsa_note_producer_options_s { // producer options: a size followed by the options text
+ uint16_t producer_options_size;
+ char producer_options[1]; // NOTE(review): looks like a variable-length tail of producer_options_size bytes — confirm
+} amdgpu_hsa_note_producer_options_t;
+
+typedef enum { // section kinds: {RODATA, DATA, BSS} x {GLOBAL_PROGRAM, GLOBAL_AGENT, READONLY_AGENT}
+ AMDGPU_HSA_RODATA_GLOBAL_PROGRAM = 0, // the entries below take consecutive values 1-8
+ AMDGPU_HSA_RODATA_GLOBAL_AGENT,
+ AMDGPU_HSA_RODATA_READONLY_AGENT,
+ AMDGPU_HSA_DATA_GLOBAL_PROGRAM,
+ AMDGPU_HSA_DATA_GLOBAL_AGENT,
+ AMDGPU_HSA_DATA_READONLY_AGENT,
+ AMDGPU_HSA_BSS_GLOBAL_PROGRAM,
+ AMDGPU_HSA_BSS_GLOBAL_AGENT,
+ AMDGPU_HSA_BSS_READONLY_AGENT,
+ AMDGPU_HSA_SECTION_LAST, // = 9, the number of section kinds
+} amdgpu_hsa_elf_section_t;
+
+#endif // AMD_HSA_ELF_H
diff --git a/third_party/rocm/include/hsa/amd_hsa_kernel_code.h b/third_party/rocm/include/hsa/amd_hsa_kernel_code.h
new file mode 100644
index 0000000..901e49c
--- /dev/null
+++ b/third_party/rocm/include/hsa/amd_hsa_kernel_code.h
@@ -0,0 +1,269 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+// AMD Research and AMD HSA Software Development
+//
+// Advanced Micro Devices, Inc.
+//
+// www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimers.
+// - Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimers in
+// the documentation and/or other materials provided with the distribution.
+// - Neither the names of Advanced Micro Devices, Inc,
+// nor the names of its contributors may be used to endorse or promote
+// products derived from this Software without specific prior written
+// permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef AMD_HSA_KERNEL_CODE_H
+#define AMD_HSA_KERNEL_CODE_H
+
+#include "amd_hsa_common.h"
+#include "hsa.h"
+
+// AMD Kernel Code Version Enumeration Values.
+typedef uint32_t amd_kernel_code_version32_t;
+enum amd_kernel_code_version_t {
+ AMD_KERNEL_CODE_VERSION_MAJOR = 1,
+ AMD_KERNEL_CODE_VERSION_MINOR = 1
+};
+
+// AMD Machine Kind Enumeration Values.
+typedef uint16_t amd_machine_kind16_t;
+enum amd_machine_kind_t {
+ AMD_MACHINE_KIND_UNDEFINED = 0,
+ AMD_MACHINE_KIND_AMDGPU = 1
+};
+
+// AMD Machine Version.
+typedef uint16_t amd_machine_version16_t;
+
+// AMD Float Round Mode Enumeration Values.
+enum amd_float_round_mode_t {
+ AMD_FLOAT_ROUND_MODE_NEAREST_EVEN = 0,
+ AMD_FLOAT_ROUND_MODE_PLUS_INFINITY = 1,
+ AMD_FLOAT_ROUND_MODE_MINUS_INFINITY = 2,
+ AMD_FLOAT_ROUND_MODE_ZERO = 3
+};
+
+// AMD Float Denorm Mode Enumeration Values.
+enum amd_float_denorm_mode_t {
+ AMD_FLOAT_DENORM_MODE_FLUSH_SOURCE_OUTPUT = 0,
+ AMD_FLOAT_DENORM_MODE_FLUSH_OUTPUT = 1,
+ AMD_FLOAT_DENORM_MODE_FLUSH_SOURCE = 2,
+ AMD_FLOAT_DENORM_MODE_NO_FLUSH = 3
+};
+
+// AMD Compute Program Resource Register One.
+typedef uint32_t amd_compute_pgm_rsrc_one32_t;
+enum amd_compute_pgm_rsrc_one_t {
+ AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WORKITEM_VGPR_COUNT, 0, 6),
+ AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WAVEFRONT_SGPR_COUNT, 6, 4),
+ AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_PRIORITY, 10, 2),
+ AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_ROUND_MODE_32, 12, 2),
+ AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_ROUND_MODE_16_64, 14, 2),
+ AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_32, 16, 2),
+ AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_16_64, 18, 2),
+ AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_PRIV, 20, 1),
+ AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_DX10_CLAMP, 21, 1),
+ AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_DEBUG_MODE, 22, 1),
+ AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_IEEE_MODE, 23, 1),
+ AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_BULKY, 24, 1),
+ AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_CDBG_USER, 25, 1),
+ AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_RESERVED1, 26, 6)
+};
+
+// AMD System VGPR Workitem ID Enumeration Values.
+enum amd_system_vgpr_workitem_id_t {
+ AMD_SYSTEM_VGPR_WORKITEM_ID_X = 0,
+ AMD_SYSTEM_VGPR_WORKITEM_ID_X_Y = 1,
+ AMD_SYSTEM_VGPR_WORKITEM_ID_X_Y_Z = 2,
+ AMD_SYSTEM_VGPR_WORKITEM_ID_UNDEFINED = 3
+};
+
+// AMD Compute Program Resource Register Two.
+typedef uint32_t amd_compute_pgm_rsrc_two32_t;
+enum amd_compute_pgm_rsrc_two_t {
+ AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_PRIVATE_SEGMENT_WAVE_BYTE_OFFSET, 0, 1),
+ AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_USER_SGPR_COUNT, 1, 5),
+ AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_TRAP_HANDLER, 6, 1),
+ AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_X, 7, 1),
+ AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_Y, 8, 1),
+ AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_Z, 9, 1),
+ AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_INFO, 10, 1),
+ AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_VGPR_WORKITEM_ID, 11, 2),
+ AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_ADDRESS_WATCH, 13, 1),
+ AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_MEMORY_VIOLATION, 14, 1),
+ AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_GRANULATED_LDS_SIZE, 15, 9),
+ AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION, 24, 1),
+ AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE, 25, 1),
+ AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO, 26, 1),
+ AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW, 27, 1),
+ AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW, 28, 1),
+ AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT, 29, 1),
+ AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_INT_DIVISION_BY_ZERO, 30, 1),
+ AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_RESERVED1, 31, 1)
+};
+
+// AMD Element Byte Size Enumeration Values. Enumerator value == log2(bytes) - 1 (2 bytes -> 0, 16 bytes -> 3).
+enum amd_element_byte_size_t {
+ AMD_ELEMENT_BYTE_SIZE_2 = 0,
+ AMD_ELEMENT_BYTE_SIZE_4 = 1,
+ AMD_ELEMENT_BYTE_SIZE_8 = 2,
+ AMD_ELEMENT_BYTE_SIZE_16 = 3
+};
+
+// AMD Kernel Code Properties.
+typedef uint32_t amd_kernel_code_properties32_t;
+enum amd_kernel_code_properties_t {
+ AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER, 0, 1),
+ AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_PTR, 1, 1),
+ AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR, 2, 1),
+ AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_KERNARG_SEGMENT_PTR, 3, 1),
+ AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_ID, 4, 1),
+ AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_FLAT_SCRATCH_INIT, 5, 1),
+ AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE, 6, 1),
+ AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X, 7, 1),
+ AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y, 8, 1),
+ AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z, 9, 1),
+ AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_RESERVED1, 10, 6),
+ AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_ORDERED_APPEND_GDS, 16, 1),
+ AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_PRIVATE_ELEMENT_SIZE, 17, 2),
+ AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_IS_PTR64, 19, 1),
+ AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_IS_DYNAMIC_CALLSTACK, 20, 1),
+ AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_IS_DEBUG_ENABLED, 21, 1),
+ AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_IS_XNACK_ENABLED, 22, 1),
+ AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_RESERVED2, 23, 9)
+};
+
+// AMD Power Of Two Enumeration Values. Enumerator value is log2 of the number in its name (AMD_POWERTWO_256 == 8).
+typedef uint8_t amd_powertwo8_t; // 8-bit storage type for amd_powertwo_t values.
+enum amd_powertwo_t {
+ AMD_POWERTWO_1 = 0,
+ AMD_POWERTWO_2 = 1,
+ AMD_POWERTWO_4 = 2,
+ AMD_POWERTWO_8 = 3,
+ AMD_POWERTWO_16 = 4,
+ AMD_POWERTWO_32 = 5,
+ AMD_POWERTWO_64 = 6,
+ AMD_POWERTWO_128 = 7,
+ AMD_POWERTWO_256 = 8
+};
+
+// AMD Enabled Control Directive Enumeration Values. Each enumerator is a distinct single-bit mask (1, 2, 4, ... 256).
+typedef uint64_t amd_enabled_control_directive64_t; // 64-bit storage type; presumably holds OR-ed enumerators - confirm.
+enum amd_enabled_control_directive_t {
+ AMD_ENABLED_CONTROL_DIRECTIVE_ENABLE_BREAK_EXCEPTIONS = 1,
+ AMD_ENABLED_CONTROL_DIRECTIVE_ENABLE_DETECT_EXCEPTIONS = 2,
+ AMD_ENABLED_CONTROL_DIRECTIVE_MAX_DYNAMIC_GROUP_SIZE = 4,
+ AMD_ENABLED_CONTROL_DIRECTIVE_MAX_FLAT_GRID_SIZE = 8,
+ AMD_ENABLED_CONTROL_DIRECTIVE_MAX_FLAT_WORKGROUP_SIZE = 16,
+ AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRED_DIM = 32,
+ AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRED_GRID_SIZE = 64,
+ AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRED_WORKGROUP_SIZE = 128,
+ AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRE_NO_PARTIAL_WORKGROUPS = 256
+};
+
+// AMD Exception Kind Enumeration Values. Each enumerator is a distinct single-bit mask (1, 2, 4, 8, 16).
+typedef uint16_t amd_exception_kind16_t; // 16-bit storage type; presumably holds OR-ed enumerators - confirm.
+enum amd_exception_kind_t {
+ AMD_EXCEPTION_KIND_INVALID_OPERATION = 1,
+ AMD_EXCEPTION_KIND_DIVISION_BY_ZERO = 2,
+ AMD_EXCEPTION_KIND_OVERFLOW = 4,
+ AMD_EXCEPTION_KIND_UNDERFLOW = 8,
+ AMD_EXCEPTION_KIND_INEXACT = 16
+};
+
+// AMD Control Directives. By field arithmetic the members total 128 bytes (natural alignment assumed - confirm).
+#define AMD_CONTROL_DIRECTIVES_ALIGN_BYTES 64
+#define AMD_CONTROL_DIRECTIVES_ALIGN __ALIGNED__(AMD_CONTROL_DIRECTIVES_ALIGN_BYTES) // __ALIGNED__ is defined outside this header.
+typedef AMD_CONTROL_DIRECTIVES_ALIGN struct amd_control_directives_s {
+ amd_enabled_control_directive64_t enabled_control_directives;
+ uint16_t enable_break_exceptions;
+ uint16_t enable_detect_exceptions;
+ uint32_t max_dynamic_group_size;
+ uint64_t max_flat_grid_size;
+ uint32_t max_flat_workgroup_size;
+ uint8_t required_dim;
+ uint8_t reserved1[3]; // Fills bytes 29..31 after required_dim so required_grid_size starts at offset 32.
+ uint64_t required_grid_size[3];
+ uint32_t required_workgroup_size[3];
+ uint8_t reserved2[60]; // Tail padding: members above end at offset 68, 68 + 60 = 128.
+} amd_control_directives_t;
+
+// AMD Kernel Code. By field arithmetic: 128 bytes of header fields + 128-byte control_directives = 256 (natural alignment assumed - confirm).
+#define AMD_ISA_ALIGN_BYTES 256
+#define AMD_KERNEL_CODE_ALIGN_BYTES 64
+#define AMD_KERNEL_CODE_ALIGN __ALIGNED__(AMD_KERNEL_CODE_ALIGN_BYTES) // __ALIGNED__ is defined outside this header.
+typedef AMD_KERNEL_CODE_ALIGN struct amd_kernel_code_s {
+ amd_kernel_code_version32_t amd_kernel_code_version_major;
+ amd_kernel_code_version32_t amd_kernel_code_version_minor;
+ amd_machine_kind16_t amd_machine_kind;
+ amd_machine_version16_t amd_machine_version_major;
+ amd_machine_version16_t amd_machine_version_minor;
+ amd_machine_version16_t amd_machine_version_stepping;
+ int64_t kernel_code_entry_byte_offset; // Signed, unlike the unsigned byte sizes below.
+ int64_t kernel_code_prefetch_byte_offset;
+ uint64_t kernel_code_prefetch_byte_size;
+ uint64_t max_scratch_backing_memory_byte_size;
+ amd_compute_pgm_rsrc_one32_t compute_pgm_rsrc1; // Bit fields enumerated by amd_compute_pgm_rsrc_one_t.
+ amd_compute_pgm_rsrc_two32_t compute_pgm_rsrc2; // Bit fields enumerated by amd_compute_pgm_rsrc_two_t.
+ amd_kernel_code_properties32_t kernel_code_properties; // Bit fields enumerated by amd_kernel_code_properties_t.
+ uint32_t workitem_private_segment_byte_size;
+ uint32_t workgroup_group_segment_byte_size;
+ uint32_t gds_segment_byte_size;
+ uint64_t kernarg_segment_byte_size;
+ uint32_t workgroup_fbarrier_count;
+ uint16_t wavefront_sgpr_count;
+ uint16_t workitem_vgpr_count;
+ uint16_t reserved_vgpr_first;
+ uint16_t reserved_vgpr_count;
+ uint16_t reserved_sgpr_first;
+ uint16_t reserved_sgpr_count;
+ uint16_t debug_wavefront_private_segment_offset_sgpr;
+ uint16_t debug_private_segment_buffer_sgpr;
+ amd_powertwo8_t kernarg_segment_alignment; // amd_powertwo8_t fields store amd_powertwo_t (log2) encodings.
+ amd_powertwo8_t group_segment_alignment;
+ amd_powertwo8_t private_segment_alignment;
+ amd_powertwo8_t wavefront_size;
+ int32_t call_convention;
+ uint8_t reserved1[12]; // Bytes 108..119, so runtime_loader_kernel_symbol lands at offset 120.
+ uint64_t runtime_loader_kernel_symbol;
+ amd_control_directives_t control_directives; // Starts at offset 128 by the arithmetic above.
+} amd_kernel_code_t;
+
+// TODO: this struct should be completely gone once debugger designs/implements
+// Debugger APIs.
+typedef struct amd_runtime_loader_debug_info_s {
+ const void* elf_raw;
+ size_t elf_size;
+ const char *kernel_name;
+ const void *owning_segment;
+} amd_runtime_loader_debug_info_t;
+
+#endif // AMD_HSA_KERNEL_CODE_H
diff --git a/third_party/rocm/include/hsa/amd_hsa_queue.h b/third_party/rocm/include/hsa/amd_hsa_queue.h
new file mode 100644
index 0000000..8675ec4
--- /dev/null
+++ b/third_party/rocm/include/hsa/amd_hsa_queue.h
@@ -0,0 +1,87 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+// AMD Research and AMD HSA Software Development
+//
+// Advanced Micro Devices, Inc.
+//
+// www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimers.
+// - Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimers in
+// the documentation and/or other materials provided with the distribution.
+// - Neither the names of Advanced Micro Devices, Inc,
+// nor the names of its contributors may be used to endorse or promote
+// products derived from this Software without specific prior written
+// permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef AMD_HSA_QUEUE_H
+#define AMD_HSA_QUEUE_H
+
+#include "amd_hsa_common.h"
+#include "hsa.h"
+
+// AMD Queue Properties.
+typedef uint32_t amd_queue_properties32_t;
+enum amd_queue_properties_t {
+ AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER, 0, 1),
+ AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_IS_PTR64, 1, 1),
+ AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_DEBUG_SGPRS, 2, 1),
+ AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_ENABLE_PROFILING, 3, 1),
+ AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_USE_SCRATCH_ONCE, 4, 1),
+ AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_RESERVED1, 5, 27)
+};
+
+// AMD Queue.
+#define AMD_QUEUE_ALIGN_BYTES 64
+#define AMD_QUEUE_ALIGN __ALIGNED__(AMD_QUEUE_ALIGN_BYTES)
+typedef struct AMD_QUEUE_ALIGN amd_queue_s {
+ hsa_queue_t hsa_queue;
+ uint32_t reserved1[4];
+ volatile uint64_t write_dispatch_id;
+ uint32_t group_segment_aperture_base_hi;
+ uint32_t private_segment_aperture_base_hi;
+ uint32_t max_cu_id;
+ uint32_t max_wave_id;
+ volatile uint64_t max_legacy_doorbell_dispatch_id_plus_1;
+ volatile uint32_t legacy_doorbell_lock;
+ uint32_t reserved2[9];
+ volatile uint64_t read_dispatch_id;
+ uint32_t read_dispatch_id_field_base_byte_offset;
+ uint32_t compute_tmpring_size;
+ uint32_t scratch_resource_descriptor[4];
+ uint64_t scratch_backing_memory_location;
+ uint64_t scratch_backing_memory_byte_size;
+ uint32_t scratch_wave64_lane_byte_size;
+ amd_queue_properties32_t queue_properties;
+ uint32_t reserved3[2];
+ hsa_signal_t queue_inactive_signal;
+ uint32_t reserved4[14];
+} amd_queue_t;
+
+#endif // AMD_HSA_QUEUE_H
diff --git a/third_party/rocm/include/hsa/amd_hsa_signal.h b/third_party/rocm/include/hsa/amd_hsa_signal.h
new file mode 100644
index 0000000..f9d721f
--- /dev/null
+++ b/third_party/rocm/include/hsa/amd_hsa_signal.h
@@ -0,0 +1,80 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+// AMD Research and AMD HSA Software Development
+//
+// Advanced Micro Devices, Inc.
+//
+// www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimers.
+// - Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimers in
+// the documentation and/or other materials provided with the distribution.
+// - Neither the names of Advanced Micro Devices, Inc,
+// nor the names of its contributors may be used to endorse or promote
+// products derived from this Software without specific prior written
+// permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef AMD_HSA_SIGNAL_H
+#define AMD_HSA_SIGNAL_H
+
+#include "amd_hsa_common.h"
+#include "amd_hsa_queue.h"
+
+// AMD Signal Kind Enumeration Values.
+typedef int64_t amd_signal_kind64_t;
+enum amd_signal_kind_t {
+ AMD_SIGNAL_KIND_INVALID = 0,
+ AMD_SIGNAL_KIND_USER = 1,
+ AMD_SIGNAL_KIND_DOORBELL = -1,
+ AMD_SIGNAL_KIND_LEGACY_DOORBELL = -2
+};
+
+// AMD Signal.
+#define AMD_SIGNAL_ALIGN_BYTES 64
+#define AMD_SIGNAL_ALIGN __ALIGNED__(AMD_SIGNAL_ALIGN_BYTES)
+typedef struct AMD_SIGNAL_ALIGN amd_signal_s {
+ amd_signal_kind64_t kind;
+ union {
+ volatile int64_t value;
+ volatile uint32_t* legacy_hardware_doorbell_ptr;
+ volatile uint64_t* hardware_doorbell_ptr;
+ };
+ uint64_t event_mailbox_ptr;
+ uint32_t event_id;
+ uint32_t reserved1;
+ uint64_t start_ts;
+ uint64_t end_ts;
+ union {
+ amd_queue_t* queue_ptr;
+ uint64_t reserved2;
+ };
+ uint32_t reserved3[2];
+} amd_signal_t;
+
+#endif // AMD_HSA_SIGNAL_H
diff --git a/third_party/rocm/include/hsa/hsa.h b/third_party/rocm/include/hsa/hsa.h
new file mode 100644
index 0000000..d8fdd47
--- /dev/null
+++ b/third_party/rocm/include/hsa/hsa.h
@@ -0,0 +1,5660 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+// AMD Research and AMD HSA Software Development
+//
+// Advanced Micro Devices, Inc.
+//
+// www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimers.
+// - Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimers in
+// the documentation and/or other materials provided with the distribution.
+// - Neither the names of Advanced Micro Devices, Inc,
+// nor the names of its contributors may be used to endorse or promote
+// products derived from this Software without specific prior written
+// permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef HSA_RUNTIME_INC_HSA_H_
+#define HSA_RUNTIME_INC_HSA_H_
+
+#include <stddef.h> /* size_t */
+#include <stdint.h> /* uintXX_t */
+
+#ifndef __cplusplus
+#include <stdbool.h> /* bool */
+#endif /* __cplusplus */
+
+// Placeholder for calling convention and import/export macros
+#ifndef HSA_CALL
+#define HSA_CALL
+#endif
+
+#ifndef HSA_EXPORT_DECORATOR
+#ifdef __GNUC__
+#define HSA_EXPORT_DECORATOR __attribute__ ((visibility ("default")))
+#else
+#define HSA_EXPORT_DECORATOR
+#endif
+#endif
+#define HSA_API_EXPORT HSA_EXPORT_DECORATOR HSA_CALL
+#define HSA_API_IMPORT HSA_CALL
+
+#if !defined(HSA_API) && defined(HSA_EXPORT)
+#define HSA_API HSA_API_EXPORT
+#else
+#define HSA_API HSA_API_IMPORT
+#endif
+
+// Detect and set large model builds.
+#undef HSA_LARGE_MODEL
+#if defined(__LP64__) || defined(_M_X64)
+#define HSA_LARGE_MODEL
+#endif
+
+// Try to detect CPU endianness (x86 arch macros, else the GCC/Clang __BYTE_ORDER__ predefined macro).
+#if !defined(LITTLEENDIAN_CPU) && !defined(BIGENDIAN_CPU)
+#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || \
+ defined(_M_X64) || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+#define LITTLEENDIAN_CPU
+#endif
+#endif
+
+#undef HSA_LITTLE_ENDIAN
+#if defined(LITTLEENDIAN_CPU)
+#define HSA_LITTLE_ENDIAN
+#elif defined(BIGENDIAN_CPU)
+#else
+#error "BIGENDIAN_CPU or LITTLEENDIAN_CPU must be defined"
+#endif
+
+#ifndef HSA_DEPRECATED
+#define HSA_DEPRECATED
+//#ifdef __GNUC__
+//#define HSA_DEPRECATED __attribute__((deprecated))
+//#else
+//#define HSA_DEPRECATED __declspec(deprecated)
+//#endif
+#endif
+
+#define HSA_VERSION_1_0 1
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+/** \defgroup status Runtime Notifications
+ * @{
+ */
+
+/**
+ * @brief Status codes.
+ */
+typedef enum {
+ /**
+ * The function has been executed successfully.
+ */
+ HSA_STATUS_SUCCESS = 0x0,
+ /**
+ * A traversal over a list of elements has been interrupted by the
+ * application before completing.
+ */
+ HSA_STATUS_INFO_BREAK = 0x1,
+ /**
+ * A generic error has occurred.
+ */
+ HSA_STATUS_ERROR = 0x1000,
+ /**
+ * One of the actual arguments does not meet a precondition stated in the
+ * documentation of the corresponding formal argument.
+ */
+ HSA_STATUS_ERROR_INVALID_ARGUMENT = 0x1001,
+ /**
+ * The requested queue creation is not valid.
+ */
+ HSA_STATUS_ERROR_INVALID_QUEUE_CREATION = 0x1002,
+ /**
+ * The requested allocation is not valid.
+ */
+ HSA_STATUS_ERROR_INVALID_ALLOCATION = 0x1003,
+ /**
+ * The agent is invalid.
+ */
+ HSA_STATUS_ERROR_INVALID_AGENT = 0x1004,
+ /**
+ * The memory region is invalid.
+ */
+ HSA_STATUS_ERROR_INVALID_REGION = 0x1005,
+ /**
+ * The signal is invalid.
+ */
+ HSA_STATUS_ERROR_INVALID_SIGNAL = 0x1006,
+ /**
+ * The queue is invalid.
+ */
+ HSA_STATUS_ERROR_INVALID_QUEUE = 0x1007,
+ /**
+ * The HSA runtime failed to allocate the necessary resources. This error
+ * may also occur when the HSA runtime needs to spawn threads or create
+ * internal OS-specific events.
+ */
+ HSA_STATUS_ERROR_OUT_OF_RESOURCES = 0x1008,
+ /**
+ * The AQL packet is malformed.
+ */
+ HSA_STATUS_ERROR_INVALID_PACKET_FORMAT = 0x1009,
+ /**
+ * An error has been detected while releasing a resource.
+ */
+ HSA_STATUS_ERROR_RESOURCE_FREE = 0x100A,
+ /**
+ * An API other than ::hsa_init has been invoked while the reference count
+ * of the HSA runtime is 0.
+ */
+ HSA_STATUS_ERROR_NOT_INITIALIZED = 0x100B,
+ /**
+ * The maximum reference count for the object has been reached.
+ */
+ HSA_STATUS_ERROR_REFCOUNT_OVERFLOW = 0x100C,
+ /**
+ * The arguments passed to a function are not compatible.
+ */
+ HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS = 0x100D,
+ /**
+ * The index is invalid.
+ */
+ HSA_STATUS_ERROR_INVALID_INDEX = 0x100E,
+ /**
+ * The instruction set architecture is invalid.
+ */
+ HSA_STATUS_ERROR_INVALID_ISA = 0x100F,
+ /**
+ * The instruction set architecture name is invalid.
+ */
+ HSA_STATUS_ERROR_INVALID_ISA_NAME = 0x1017,
+ /**
+ * The code object is invalid.
+ */
+ HSA_STATUS_ERROR_INVALID_CODE_OBJECT = 0x1010,
+ /**
+ * The executable is invalid.
+ */
+ HSA_STATUS_ERROR_INVALID_EXECUTABLE = 0x1011,
+ /**
+ * The executable is frozen.
+ */
+ HSA_STATUS_ERROR_FROZEN_EXECUTABLE = 0x1012,
+ /**
+ * There is no symbol with the given name.
+ */
+ HSA_STATUS_ERROR_INVALID_SYMBOL_NAME = 0x1013,
+ /**
+ * The variable is already defined.
+ */
+ HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED = 0x1014,
+ /**
+ * The variable is undefined.
+ */
+ HSA_STATUS_ERROR_VARIABLE_UNDEFINED = 0x1015,
+ /**
+ * An HSAIL operation resulted in a hardware exception.
+ */
+ HSA_STATUS_ERROR_EXCEPTION = 0x1016,
+ /**
+ * The code object symbol is invalid.
+ */
+ HSA_STATUS_ERROR_INVALID_CODE_SYMBOL = 0x1018,
+ /**
+ * The executable symbol is invalid.
+ */
+ HSA_STATUS_ERROR_INVALID_EXECUTABLE_SYMBOL = 0x1019,
+ /**
+ * The file descriptor is invalid.
+ */
+ HSA_STATUS_ERROR_INVALID_FILE = 0x1020,
+ /**
+ * The code object reader is invalid.
+ */
+ HSA_STATUS_ERROR_INVALID_CODE_OBJECT_READER = 0x1021,
+ /**
+ * The cache is invalid.
+ */
+ HSA_STATUS_ERROR_INVALID_CACHE = 0x1022,
+ /**
+ * The wavefront is invalid.
+ */
+ HSA_STATUS_ERROR_INVALID_WAVEFRONT = 0x1023,
+ /**
+ * The signal group is invalid.
+ */
+ HSA_STATUS_ERROR_INVALID_SIGNAL_GROUP = 0x1024,
+ /**
+ * The HSA runtime is not in the configuration state.
+ */
+ HSA_STATUS_ERROR_INVALID_RUNTIME_STATE = 0x1025,
+ /**
+ * The queue received an error that may require process termination.
+ */
+ HSA_STATUS_ERROR_FATAL = 0x1026
+} hsa_status_t;
+
+/**
+ * @brief Query additional information about a status code.
+ *
+ * @param[in] status Status code.
+ *
+ * @param[out] status_string A NUL-terminated string that describes the error
+ * status.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p status is an invalid
+ * status code, or @p status_string is NULL.
+ */
+hsa_status_t HSA_API hsa_status_string(
+ hsa_status_t status,
+ const char ** status_string);
+
+/** @} */
+
+/** \defgroup common Common Definitions
+ * @{
+ */
+
+/**
+ * @brief Three-dimensional coordinate.
+ */
+typedef struct hsa_dim3_s {
+ /**
+ * X dimension.
+ */
+ uint32_t x;
+
+ /**
+ * Y dimension.
+ */
+ uint32_t y;
+
+ /**
+ * Z dimension.
+ */
+ uint32_t z;
+} hsa_dim3_t;
+
+/**
+ * @brief Access permissions.
+ */
+typedef enum {
+ /**
+ * Read-only access.
+ */
+ HSA_ACCESS_PERMISSION_RO = 1,
+ /**
+ * Write-only access.
+ */
+ HSA_ACCESS_PERMISSION_WO = 2,
+ /**
+ * Read and write access.
+ */
+ HSA_ACCESS_PERMISSION_RW = 3
+} hsa_access_permission_t;
+
+/**
+ * @brief POSIX file descriptor.
+ */
+typedef int hsa_file_t;
+
+/** @} **/
+
+
+/** \defgroup initshutdown Initialization and Shut Down
+ * @{
+ */
+
+/**
+ * @brief Initialize the HSA runtime.
+ *
+ * @details Initializes the HSA runtime if it is not already initialized, and
+ * increases the reference counter associated with the HSA runtime for the
+ * current process. Invocation of any HSA function other than ::hsa_init results
+ * in undefined behavior if the current HSA runtime reference counter is less
+ * than one.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate
+ * the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_REFCOUNT_OVERFLOW The HSA runtime reference
+ * count reaches INT32_MAX.
+ */
+hsa_status_t HSA_API hsa_init(void);
+
+/**
+ * @brief Shut down the HSA runtime.
+ *
+ * @details Decreases the reference count of the HSA runtime instance. When the
+ * reference count reaches 0, the HSA runtime is no longer considered valid
+ * but the application might call ::hsa_init to initialize the HSA runtime
+ * again.
+ *
+ * Once the reference count of the HSA runtime reaches 0, all the resources
+ * associated with it (queues, signals, agent information, etc.) are
+ * considered invalid and any attempt to reference them in subsequent API calls
+ * results in undefined behavior. When the reference count reaches 0, the HSA
+ * runtime may release resources associated with it.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ */
+hsa_status_t HSA_API hsa_shut_down(void);
+
+/** @} **/
+
+/** \defgroup agentinfo System and Agent Information
+ * @{
+ */
+
+/**
+ * @brief Endianness. A convention used to interpret the bytes making up a data
+ * word.
+ */
+typedef enum {
+ /**
+ * The least significant byte is stored in the smallest address.
+ */
+ HSA_ENDIANNESS_LITTLE = 0,
+ /**
+ * The most significant byte is stored in the smallest address.
+ */
+ HSA_ENDIANNESS_BIG = 1
+} hsa_endianness_t;
+
+/**
+ * @brief Machine model. A machine model determines the size of certain data
+ * types in HSA runtime and an agent.
+ */
+typedef enum {
+ /**
+ * Small machine model. Addresses use 32 bits.
+ */
+ HSA_MACHINE_MODEL_SMALL = 0,
+ /**
+ * Large machine model. Addresses use 64 bits.
+ */
+ HSA_MACHINE_MODEL_LARGE = 1
+} hsa_machine_model_t;
+
+/**
+ * @brief Profile. A profile indicates a particular level of feature
+ * support. For example, in the base profile the application must use the HSA
+ * runtime allocator to reserve shared virtual memory, while in the full profile
+ * any host pointer can be shared across all the agents.
+ */
+typedef enum {
+ /**
+ * Base profile.
+ */
+ HSA_PROFILE_BASE = 0,
+ /**
+ * Full profile.
+ */
+ HSA_PROFILE_FULL = 1
+} hsa_profile_t;
+
+/**
+ * @brief System attributes.
+ */
+typedef enum {
+ /**
+ * Major version of the HSA runtime specification supported by the
+ * implementation. The type of this attribute is uint16_t.
+ */
+ HSA_SYSTEM_INFO_VERSION_MAJOR = 0,
+ /**
+ * Minor version of the HSA runtime specification supported by the
+ * implementation. The type of this attribute is uint16_t.
+ */
+ HSA_SYSTEM_INFO_VERSION_MINOR = 1,
+ /**
+ * Current timestamp. The value of this attribute monotonically increases at a
+ * constant rate. The type of this attribute is uint64_t.
+ */
+ HSA_SYSTEM_INFO_TIMESTAMP = 2,
+ /**
+ * Timestamp value increase rate, in Hz. The timestamp (clock) frequency is
+ * in the range 1-400MHz. The type of this attribute is uint64_t.
+ */
+ HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY = 3,
+ /**
+ * Maximum duration of a signal wait operation. Expressed as a count based on
+ * the timestamp frequency. The type of this attribute is uint64_t.
+ */
+ HSA_SYSTEM_INFO_SIGNAL_MAX_WAIT = 4,
+ /**
+ * Endianness of the system. The type of this attribute is ::hsa_endianness_t.
+ */
+ HSA_SYSTEM_INFO_ENDIANNESS = 5,
+ /**
+ * Machine model supported by the HSA runtime. The type of this attribute is
+ * ::hsa_machine_model_t.
+ */
+ HSA_SYSTEM_INFO_MACHINE_MODEL = 6,
+ /**
+ * Bit-mask indicating which extensions are supported by the
+ * implementation. An extension with an ID of @p i is supported if the bit at
+ * position @p i is set. The type of this attribute is uint8_t[128].
+ */
+ HSA_SYSTEM_INFO_EXTENSIONS = 7,
+ /**
+ * String containing the ROCr build identifier.
+ */
+ HSA_AMD_SYSTEM_INFO_BUILD_VERSION = 0x200
+} hsa_system_info_t;
+
+/**
+ * @brief Get the current value of a system attribute.
+ *
+ * @param[in] attribute Attribute to query.
+ *
+ * @param[out] value Pointer to an application-allocated buffer where to store
+ * the value of the attribute. If the buffer passed by the application is not
+ * large enough to hold the value of @p attribute, the behavior is undefined.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid
+ * system attribute, or @p value is NULL.
+ */
+hsa_status_t HSA_API hsa_system_get_info(
+ hsa_system_info_t attribute,
+ void* value);
+
+/**
+ * @brief HSA extensions.
+ */
+typedef enum {
+ /**
+ * Finalizer extension.
+ */
+ HSA_EXTENSION_FINALIZER = 0,
+ /**
+ * Images extension.
+ */
+ HSA_EXTENSION_IMAGES = 1,
+
+ /**
+ * Performance counter extension.
+ */
+ HSA_EXTENSION_PERFORMANCE_COUNTERS = 2,
+
+ /**
+ * Profiling events extension.
+ */
+ HSA_EXTENSION_PROFILING_EVENTS = 3,
+ /**
+ * Last standard extension number (equals ::HSA_EXTENSION_PROFILING_EVENTS).
+ */
+ HSA_EXTENSION_STD_LAST = 3,
+ /**
+ * First AMD extension number.
+ */
+ HSA_AMD_FIRST_EXTENSION = 0x200,
+ /**
+ * Profiler extension.
+ */
+ HSA_EXTENSION_AMD_PROFILER = 0x200,
+ /**
+ * Loader extension.
+ */
+ HSA_EXTENSION_AMD_LOADER = 0x201,
+ /**
+ * AqlProfile extension.
+ */
+ HSA_EXTENSION_AMD_AQLPROFILE = 0x202,
+ /**
+ * Last AMD extension.
+ */
+ HSA_AMD_LAST_EXTENSION = 0x202
+} hsa_extension_t;
+
+/**
+ * @brief Query the name of a given extension.
+ *
+ * @param[in] extension Extension identifier. If the extension is not supported
+ * by the implementation (see ::HSA_SYSTEM_INFO_EXTENSIONS), the behavior
+ * is undefined.
+ *
+ * @param[out] name Pointer to a memory location where the HSA runtime stores
+ * the extension name. The extension name is a NUL-terminated string.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p extension is not a valid
+ * extension, or @p name is NULL.
+ */
+hsa_status_t HSA_API hsa_extension_get_name(
+ uint16_t extension,
+ const char **name);
+
+/**
+ * @deprecated
+ *
+ * @brief Query if a given version of an extension is supported by the HSA
+ * implementation.
+ *
+ * @param[in] extension Extension identifier.
+ *
+ * @param[in] version_major Major version number.
+ *
+ * @param[in] version_minor Minor version number.
+ *
+ * @param[out] result Pointer to a memory location where the HSA runtime stores
+ * the result of the check. The result is true if the specified version of the
+ * extension is supported, and false otherwise.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p extension is not a valid
+ * extension, or @p result is NULL.
+ */
+hsa_status_t HSA_API HSA_DEPRECATED hsa_system_extension_supported(
+ uint16_t extension,
+ uint16_t version_major,
+ uint16_t version_minor,
+ bool* result);
+
+/**
+ * @brief Query if a given version of an extension is supported by the HSA
+ * implementation. All minor versions from 0 up to the returned @p version_minor
+ * must be supported by the implementation.
+ *
+ * @param[in] extension Extension identifier.
+ *
+ * @param[in] version_major Major version number.
+ *
+ * @param[out] version_minor Minor version number.
+ *
+ * @param[out] result Pointer to a memory location where the HSA runtime stores
+ * the result of the check. The result is true if the specified version of the
+ * extension is supported, and false otherwise.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p extension is not a valid
+ * extension, or @p version_minor is NULL, or @p result is NULL.
+ */
+hsa_status_t HSA_API hsa_system_major_extension_supported(
+ uint16_t extension,
+ uint16_t version_major,
+ uint16_t *version_minor,
+ bool* result);
+
+
+/**
+ * @deprecated
+ *
+ * @brief Retrieve the function pointers corresponding to a given version of an
+ * extension. Portable applications are expected to invoke the extension API
+ * using the returned function pointers.
+ *
+ * @details The application is responsible for verifying that the given version
+ * of the extension is supported by the HSA implementation (see
+ * ::hsa_system_extension_supported). If the given combination of extension,
+ * major version, and minor version is not supported by the implementation, the
+ * behavior is undefined.
+ *
+ * @param[in] extension Extension identifier.
+ *
+ * @param[in] version_major Major version number for which to retrieve the
+ * function pointer table.
+ *
+ * @param[in] version_minor Minor version number for which to retrieve the
+ * function pointer table.
+ *
+ * @param[out] table Pointer to an application-allocated function pointer table
+ * that is populated by the HSA runtime. Must not be NULL. The memory associated
+ * with table can be reused or freed after the function returns.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p extension is not a valid
+ * extension, or @p table is NULL.
+ */
+hsa_status_t HSA_API HSA_DEPRECATED hsa_system_get_extension_table(
+ uint16_t extension,
+ uint16_t version_major,
+ uint16_t version_minor,
+ void *table);
+
+/**
+ * @brief Retrieve the function pointers corresponding to a given major version
+ * of an extension. Portable applications are expected to invoke the extension
+ * API using the returned function pointers.
+ *
+ * @details The application is responsible for verifying that the given major
+ * version of the extension is supported by the HSA implementation (see
+ * ::hsa_system_major_extension_supported). If the given combination of extension
+ * and major version is not supported by the implementation, the behavior is
+ * undefined. Additionally if the length doesn't allow space for a full minor
+ * version, it is implementation defined if only some of the function pointers for
+ * that minor version get written.
+ *
+ * @param[in] extension Extension identifier.
+ *
+ * @param[in] version_major Major version number for which to retrieve the
+ * function pointer table.
+ *
+ * @param[in] table_length Size in bytes of the function pointer table to be
+ * populated. The implementation will not write more than this many bytes to the
+ * table.
+ *
+ * @param[out] table Pointer to an application-allocated function pointer table
+ * that is populated by the HSA runtime. Must not be NULL. The memory associated
+ * with table can be reused or freed after the function returns.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p extension is not a valid
+ * extension, or @p table is NULL.
+ */
+hsa_status_t HSA_API hsa_system_get_major_extension_table(
+ uint16_t extension,
+ uint16_t version_major,
+ size_t table_length,
+ void *table);
+
+/**
+ * @brief Struct containing an opaque handle to an agent, a device that participates in
+ * the HSA memory model. An agent can submit AQL packets for execution, and
+ * may also accept AQL packets for execution (agent dispatch packets or kernel
+ * dispatch packets launching HSAIL-derived binaries).
+ */
+typedef struct hsa_agent_s {
+ /**
+ * Opaque handle. Two handles reference the same object of the enclosing type
+ * if and only if they are equal.
+ */
+ uint64_t handle;
+} hsa_agent_t;
+
+/**
+ * @brief Agent features.
+ */
+typedef enum {
+ /**
+ * The agent supports AQL packets of kernel dispatch type. If this
+ * feature is enabled, the agent is also a kernel agent.
+ */
+ HSA_AGENT_FEATURE_KERNEL_DISPATCH = 1,
+ /**
+ * The agent supports AQL packets of agent dispatch type.
+ */
+ HSA_AGENT_FEATURE_AGENT_DISPATCH = 2
+} hsa_agent_feature_t;
+
+/**
+ * @brief Hardware device type.
+ */
+typedef enum {
+ /**
+ * CPU device.
+ */
+ HSA_DEVICE_TYPE_CPU = 0,
+ /**
+ * GPU device.
+ */
+ HSA_DEVICE_TYPE_GPU = 1,
+ /**
+ * DSP device.
+ */
+ HSA_DEVICE_TYPE_DSP = 2
+} hsa_device_type_t;
+
+/**
+ * @brief Default floating-point rounding mode.
+ */
+typedef enum {
+ /**
+ * Use a default floating-point rounding mode specified elsewhere.
+ */
+ HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT = 0,
+ /**
+ * Operations that specify the default floating-point mode are rounded to zero
+ * by default.
+ */
+ HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO = 1,
+ /**
+ * Operations that specify the default floating-point mode are rounded to the
+ * nearest representable number, with ties broken by selecting the value
+ * with an even least significant bit.
+ */
+ HSA_DEFAULT_FLOAT_ROUNDING_MODE_NEAR = 2
+} hsa_default_float_rounding_mode_t;
+
+/**
+ * @brief Agent attributes.
+ */
+typedef enum {
+ /**
+ * Agent name. The type of this attribute is a NUL-terminated char[64]. The
+ * name must be at most 63 characters long (not including the NUL terminator)
+ * and all array elements not used for the name must be NUL.
+ */
+ HSA_AGENT_INFO_NAME = 0,
+ /**
+ * Name of vendor. The type of this attribute is a NUL-terminated char[64].
+ * The name must be at most 63 characters long (not including the NUL
+ * terminator) and all array elements not used for the name must be NUL.
+ */
+ HSA_AGENT_INFO_VENDOR_NAME = 1,
+ /**
+ * Agent capability. The type of this attribute is ::hsa_agent_feature_t.
+ */
+ HSA_AGENT_INFO_FEATURE = 2,
+ /**
+ * @deprecated Query ::HSA_ISA_INFO_MACHINE_MODELS for a given instruction set
+ * architecture supported by the agent instead. If more than one ISA is
+ * supported by the agent, the returned value corresponds to the first ISA
+ * enumerated by ::hsa_agent_iterate_isas.
+ *
+ * Machine model supported by the agent. The type of this attribute is
+ * ::hsa_machine_model_t.
+ */
+ HSA_AGENT_INFO_MACHINE_MODEL = 3,
+ /**
+ * @deprecated Query ::HSA_ISA_INFO_PROFILES for a given instruction set
+ * architecture supported by the agent instead. If more than one ISA is
+ * supported by the agent, the returned value corresponds to the first ISA
+ * enumerated by ::hsa_agent_iterate_isas.
+ *
+ * Profile supported by the agent. The type of this attribute is
+ * ::hsa_profile_t.
+ */
+ HSA_AGENT_INFO_PROFILE = 4,
+ /**
+ * @deprecated Query ::HSA_ISA_INFO_DEFAULT_FLOAT_ROUNDING_MODES for a given
+ * instruction set architecture supported by the agent instead. If more than
+ * one ISA is supported by the agent, the returned value corresponds to the
+ * first ISA enumerated by ::hsa_agent_iterate_isas.
+ *
+ * Default floating-point rounding mode. The type of this attribute is
+ * ::hsa_default_float_rounding_mode_t, but the value
+ * ::HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT is not allowed.
+ */
+ HSA_AGENT_INFO_DEFAULT_FLOAT_ROUNDING_MODE = 5,
+ /**
+ * @deprecated Query ::HSA_ISA_INFO_BASE_PROFILE_DEFAULT_FLOAT_ROUNDING_MODES
+ * for a given instruction set architecture supported by the agent instead. If
+ * more than one ISA is supported by the agent, the returned value corresponds
+ * to the first ISA enumerated by ::hsa_agent_iterate_isas.
+ *
+ * A bit-mask of ::hsa_default_float_rounding_mode_t values, representing the
+ * default floating-point rounding modes supported by the agent in the Base
+ * profile. The type of this attribute is uint32_t. The default floating-point
+ * rounding mode (::HSA_AGENT_INFO_DEFAULT_FLOAT_ROUNDING_MODE) bit must not
+ * be set.
+ */
+ HSA_AGENT_INFO_BASE_PROFILE_DEFAULT_FLOAT_ROUNDING_MODES = 23,
+ /**
+ * @deprecated Query ::HSA_ISA_INFO_FAST_F16_OPERATION for a given instruction
+ * set architecture supported by the agent instead. If more than one ISA is
+ * supported by the agent, the returned value corresponds to the first ISA
+ * enumerated by ::hsa_agent_iterate_isas.
+ *
+ * Flag indicating that the f16 HSAIL operation is at least as fast as the
+ * f32 operation in the current agent. The value of this attribute is
+ * undefined if the agent is not a kernel agent. The type of this
+ * attribute is bool.
+ */
+ HSA_AGENT_INFO_FAST_F16_OPERATION = 24,
+ /**
+ * @deprecated Query ::HSA_WAVEFRONT_INFO_SIZE for a given wavefront and
+ * instruction set architecture supported by the agent instead. If more than
+ * one ISA is supported by the agent, the returned value corresponds to the
+ * first ISA enumerated by ::hsa_agent_iterate_isas and the first wavefront
+ * enumerated by ::hsa_isa_iterate_wavefronts for that ISA.
+ *
+ * Number of work-items in a wavefront. Must be a power of 2 in the range
+ * [1,256]. The value of this attribute is undefined if the agent is not
+ * a kernel agent. The type of this attribute is uint32_t.
+ */
+ HSA_AGENT_INFO_WAVEFRONT_SIZE = 6,
+ /**
+ * @deprecated Query ::HSA_ISA_INFO_WORKGROUP_MAX_DIM for a given instruction
+ * set architecture supported by the agent instead. If more than one ISA is
+ * supported by the agent, the returned value corresponds to the first ISA
+ * enumerated by ::hsa_agent_iterate_isas.
+ *
+ * Maximum number of work-items of each dimension of a work-group. Each
+ * maximum must be greater than 0. No maximum can exceed the value of
+ * ::HSA_AGENT_INFO_WORKGROUP_MAX_SIZE. The value of this attribute is
+ * undefined if the agent is not a kernel agent. The type of this
+ * attribute is uint16_t[3].
+ */
+ HSA_AGENT_INFO_WORKGROUP_MAX_DIM = 7,
+ /**
+ * @deprecated Query ::HSA_ISA_INFO_WORKGROUP_MAX_SIZE for a given instruction
+ * set architecture supported by the agent instead. If more than one ISA is
+ * supported by the agent, the returned value corresponds to the first ISA
+ * enumerated by ::hsa_agent_iterate_isas.
+ *
+ * Maximum total number of work-items in a work-group. The value of this
+ * attribute is undefined if the agent is not a kernel agent. The type
+ * of this attribute is uint32_t.
+ */
+ HSA_AGENT_INFO_WORKGROUP_MAX_SIZE = 8,
+ /**
+ * @deprecated Query ::HSA_ISA_INFO_GRID_MAX_DIM for a given instruction set
+ * architecture supported by the agent instead.
+ *
+ * Maximum number of work-items of each dimension of a grid. Each maximum must
+ * be greater than 0, and must not be smaller than the corresponding value in
+ * ::HSA_AGENT_INFO_WORKGROUP_MAX_DIM. No maximum can exceed the value of
+ * ::HSA_AGENT_INFO_GRID_MAX_SIZE. The value of this attribute is undefined
+ * if the agent is not a kernel agent. The type of this attribute is
+ * ::hsa_dim3_t.
+ */
+ HSA_AGENT_INFO_GRID_MAX_DIM = 9,
+ /**
+ * @deprecated Query ::HSA_ISA_INFO_GRID_MAX_SIZE for a given instruction set
+ * architecture supported by the agent instead. If more than one ISA is
+ * supported by the agent, the returned value corresponds to the first ISA
+ * enumerated by ::hsa_agent_iterate_isas.
+ *
+ * Maximum total number of work-items in a grid. The value of this attribute
+ * is undefined if the agent is not a kernel agent. The type of this
+ * attribute is uint32_t.
+ */
+ HSA_AGENT_INFO_GRID_MAX_SIZE = 10,
+ /**
+ * @deprecated Query ::HSA_ISA_INFO_FBARRIER_MAX_SIZE for a given instruction
+ * set architecture supported by the agent instead. If more than one ISA is
+ * supported by the agent, the returned value corresponds to the first ISA
+ * enumerated by ::hsa_agent_iterate_isas.
+ *
+ * Maximum number of fbarriers per work-group. Must be at least 32. The value
+ * of this attribute is undefined if the agent is not a kernel agent. The
+ * type of this attribute is uint32_t.
+ */
+ HSA_AGENT_INFO_FBARRIER_MAX_SIZE = 11,
+ /**
+ * @deprecated The maximum number of queues is not statically determined.
+ *
+ * Maximum number of queues that can be active (created but not destroyed) at
+ * one time in the agent. The type of this attribute is uint32_t.
+ */
+ HSA_AGENT_INFO_QUEUES_MAX = 12,
+ /**
+ * Minimum number of packets that a queue created in the agent
+ * can hold. Must be a power of 2 greater than 0. Must not exceed
+ * the value of ::HSA_AGENT_INFO_QUEUE_MAX_SIZE. The type of this
+ * attribute is uint32_t.
+ */
+ HSA_AGENT_INFO_QUEUE_MIN_SIZE = 13,
+ /**
+ * Maximum number of packets that a queue created in the agent can
+ * hold. Must be a power of 2 greater than 0. The type of this attribute
+ * is uint32_t.
+ */
+ HSA_AGENT_INFO_QUEUE_MAX_SIZE = 14,
+ /**
+ * Type of a queue created in the agent. The type of this attribute is
+ * ::hsa_queue_type32_t.
+ */
+ HSA_AGENT_INFO_QUEUE_TYPE = 15,
+ /**
+ * @deprecated NUMA information is not exposed anywhere else in the API.
+ *
+ * Identifier of the NUMA node associated with the agent. The type of this
+ * attribute is uint32_t.
+ */
+ HSA_AGENT_INFO_NODE = 16,
+ /**
+ * Type of hardware device associated with the agent. The type of this
+ * attribute is ::hsa_device_type_t.
+ */
+ HSA_AGENT_INFO_DEVICE = 17,
+ /**
+ * @deprecated Query ::hsa_agent_iterate_caches to retrieve information about
+ * the caches present in a given agent.
+ *
+ * Array of data cache sizes (L1..L4). Each size is expressed in bytes. A size
+ * of 0 for a particular level indicates that there is no cache information
+ * for that level. The type of this attribute is uint32_t[4].
+ */
+ HSA_AGENT_INFO_CACHE_SIZE = 18,
+ /**
+ * @deprecated An agent may support multiple instruction set
+ * architectures. See ::hsa_agent_iterate_isas. If more than one ISA is
+ * supported by the agent, the returned value corresponds to the first ISA
+ * enumerated by ::hsa_agent_iterate_isas.
+ *
+ * Instruction set architecture of the agent. The type of this attribute
+ * is ::hsa_isa_t.
+ */
+ HSA_AGENT_INFO_ISA = 19,
+ /**
+ * Bit-mask indicating which extensions are supported by the agent. An
+ * extension with an ID of @p i is supported if the bit at position @p i is
+ * set. The type of this attribute is uint8_t[128].
+ */
+ HSA_AGENT_INFO_EXTENSIONS = 20,
+ /**
+ * Major version of the HSA runtime specification supported by the
+ * agent. The type of this attribute is uint16_t.
+ */
+ HSA_AGENT_INFO_VERSION_MAJOR = 21,
+ /**
+ * Minor version of the HSA runtime specification supported by the
+ * agent. The type of this attribute is uint16_t.
+ */
+ HSA_AGENT_INFO_VERSION_MINOR = 22
+
+} hsa_agent_info_t;
+
+/**
+ * @brief Get the current value of an attribute for a given agent.
+ *
+ * @param[in] agent A valid agent.
+ *
+ * @param[in] attribute Attribute to query.
+ *
+ * @param[out] value Pointer to an application-allocated buffer where to store
+ * the value of the attribute. If the buffer passed by the application is not
+ * large enough to hold the value of @p attribute, the behavior is undefined.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid
+ * agent attribute, or @p value is NULL.
+ */
+hsa_status_t HSA_API hsa_agent_get_info(
+ hsa_agent_t agent,
+ hsa_agent_info_t attribute,
+ void* value);
+
+/**
+ * @brief Iterate over the available agents, and invoke an
+ * application-defined callback on every iteration.
+ *
+ * @param[in] callback Callback to be invoked once per agent. The HSA
+ * runtime passes two arguments to the callback: the agent and the
+ * application data. If @p callback returns a status other than
+ * ::HSA_STATUS_SUCCESS for a particular iteration, the traversal stops and
+ * ::hsa_iterate_agents returns that status value.
+ *
+ * @param[in] data Application data that is passed to @p callback on every
+ * iteration. May be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL.
+ */
+hsa_status_t HSA_API hsa_iterate_agents(
+ hsa_status_t (*callback)(hsa_agent_t agent, void* data),
+ void* data);
+
+/*
+
+// If we do not know the size of an attribute, we need to query it first
+// Note: this API will not be in the spec unless needed
+hsa_status_t HSA_API hsa_agent_get_info_size(
+ hsa_agent_t agent,
+ hsa_agent_info_t attribute,
+ size_t* size);
+
+// Set the value of an agent's attribute
+// Note: this API will not be in the spec unless needed
+hsa_status_t HSA_API hsa_agent_set_info(
+ hsa_agent_t agent,
+ hsa_agent_info_t attribute,
+ void* value);
+
+*/
+
+/**
+ * @brief Exception policies applied in the presence of hardware exceptions.
+ */
+typedef enum {
+ /**
+ * If a hardware exception is detected, a work-item signals an exception.
+ */
+ HSA_EXCEPTION_POLICY_BREAK = 1,
+ /**
+ * If a hardware exception is detected, a hardware status bit is set.
+ */
+ HSA_EXCEPTION_POLICY_DETECT = 2
+} hsa_exception_policy_t;
+
+/**
+ * @deprecated Use ::hsa_isa_get_exception_policies for a given instruction set
+ * architecture supported by the agent instead. If more than one ISA is
+ * supported by the agent, this function uses the first value returned by
+ * ::hsa_agent_iterate_isas.
+ *
+ * @brief Retrieve the exception policy support for a given combination of
+ * agent and profile.
+ *
+ * @param[in] agent Agent.
+ *
+ * @param[in] profile Profile.
+ *
+ * @param[out] mask Pointer to a memory location where the HSA runtime stores a
+ * mask of ::hsa_exception_policy_t values. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p profile is not a valid
+ * profile, or @p mask is NULL.
+ *
+ */
+hsa_status_t HSA_API HSA_DEPRECATED hsa_agent_get_exception_policies(
+ hsa_agent_t agent,
+ hsa_profile_t profile,
+ uint16_t *mask);
+
+/**
+ * @brief Cache handle.
+ */
+typedef struct hsa_cache_s {
+ /**
+ * Opaque handle. Two handles reference the same object of the enclosing type
+ * if and only if they are equal.
+ */
+ uint64_t handle;
+} hsa_cache_t;
+
+/**
+ * @brief Cache attributes.
+ */
+typedef enum {
+ /**
+ * The length of the cache name in bytes, not including the NUL terminator.
+ * The type of this attribute is uint32_t.
+ */
+ HSA_CACHE_INFO_NAME_LENGTH = 0,
+ /**
+ * Human-readable description. The type of this attribute is a NUL-terminated
+ * character array with the length equal to the value of
+ * ::HSA_CACHE_INFO_NAME_LENGTH attribute.
+ */
+ HSA_CACHE_INFO_NAME = 1,
+ /**
+ * Cache level. An L1 cache must return a value of 1, an L2 must return a value
+ * of 2, and so on. The type of this attribute is uint8_t.
+ */
+ HSA_CACHE_INFO_LEVEL = 2,
+ /**
+ * Cache size, in bytes. A value of 0 indicates that there is no size
+ * information available. The type of this attribute is uint32_t.
+ */
+ HSA_CACHE_INFO_SIZE = 3
+} hsa_cache_info_t;
+
+/**
+ * @brief Get the current value of an attribute for a given cache object.
+ *
+ * @param[in] cache Cache.
+ *
+ * @param[in] attribute Attribute to query.
+ *
+ * @param[out] value Pointer to an application-allocated buffer where to store
+ * the value of the attribute. If the buffer passed by the application is not
+ * large enough to hold the value of @p attribute, the behavior is undefined.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_CACHE The cache is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid
+ * cache attribute (see ::hsa_cache_info_t), or @p value is
+ * NULL.
+ */
+hsa_status_t HSA_API hsa_cache_get_info(
+ hsa_cache_t cache,
+ hsa_cache_info_t attribute,
+ void* value);
+
+/**
+ * @brief Iterate over the memory caches of a given agent, and
+ * invoke an application-defined callback on every iteration.
+ *
+ * @details Caches are visited in ascending order according to the value of the
+ * ::HSA_CACHE_INFO_LEVEL attribute.
+ *
+ * @param[in] agent A valid agent.
+ *
+ * @param[in] callback Callback to be invoked once per cache that is present in
+ * the agent. The HSA runtime passes two arguments to the callback: the cache
+ * and the application data. If @p callback returns a status other than
+ * ::HSA_STATUS_SUCCESS for a particular iteration, the traversal stops and
+ * that value is returned.
+ *
+ * @param[in] data Application data that is passed to @p callback on every
+ * iteration. May be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL.
+ */
+hsa_status_t HSA_API hsa_agent_iterate_caches(
+ hsa_agent_t agent,
+ hsa_status_t (*callback)(hsa_cache_t cache, void* data),
+ void* data);
+
+/**
+ * @deprecated
+ *
+ * @brief Query if a given version of an extension is supported by an agent.
+ *
+ * @param[in] extension Extension identifier.
+ *
+ * @param[in] agent Agent.
+ *
+ * @param[in] version_major Major version number.
+ *
+ * @param[in] version_minor Minor version number.
+ *
+ * @param[out] result Pointer to a memory location where the HSA runtime stores
+ * the result of the check. The result is true if the specified version of the
+ * extension is supported, and false otherwise. The result must be false if
+ * ::hsa_system_extension_supported returns false for the same extension
+ * version.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p extension is not a valid
+ * extension, or @p result is NULL.
+ */
+hsa_status_t HSA_API HSA_DEPRECATED hsa_agent_extension_supported(
+ uint16_t extension,
+ hsa_agent_t agent,
+ uint16_t version_major,
+ uint16_t version_minor,
+ bool* result);
+
+/**
+ * @brief Query if a given version of an extension is supported by an agent. All
+ * minor versions from 0 up to the returned @p version_minor must be supported.
+ *
+ * @param[in] extension Extension identifier.
+ *
+ * @param[in] agent Agent.
+ *
+ * @param[in] version_major Major version number.
+ *
+ * @param[out] version_minor Minor version number.
+ *
+ * @param[out] result Pointer to a memory location where the HSA runtime stores
+ * the result of the check. The result is true if the specified version of the
+ * extension is supported, and false otherwise. The result must be false if
+ * ::hsa_system_extension_supported returns false for the same extension
+ * version.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p extension is not a valid
+ * extension, or @p version_minor is NULL, or @p result is NULL.
+ */
+hsa_status_t HSA_API hsa_agent_major_extension_supported(
+ uint16_t extension,
+ hsa_agent_t agent,
+ uint16_t version_major,
+ uint16_t *version_minor,
+ bool* result);
+
+
+/** @} */
+
+
+/** \defgroup signals Signals
+ * @{
+ */
+
+/**
+ * @brief Signal handle.
+ */
+typedef struct hsa_signal_s {
+ /**
+ * Opaque handle. Two handles reference the same object of the enclosing type
+ * if and only if they are equal. The value 0 is reserved.
+ */
+ uint64_t handle;
+} hsa_signal_t;
+
+/**
+ * @brief Signal value. The value occupies 32 bits in small machine mode, and 64
+ * bits in large machine mode.
+ */
+#ifdef HSA_LARGE_MODEL
+ typedef int64_t hsa_signal_value_t;
+#else
+ typedef int32_t hsa_signal_value_t;
+#endif
+
+/**
+ * @brief Create a signal.
+ *
+ * @param[in] initial_value Initial value of the signal.
+ *
+ * @param[in] num_consumers Size of @p consumers. A value of 0 indicates that
+ * any agent might wait on the signal.
+ *
+ * @param[in] consumers List of agents that might consume (wait on) the
+ * signal. If @p num_consumers is 0, this argument is ignored; otherwise, the
+ * HSA runtime might use the list to optimize the handling of the signal
+ * object. If an agent not listed in @p consumers waits on the returned
+ * signal, the behavior is undefined. The memory associated with @p consumers
+ * can be reused or freed after the function returns.
+ *
+ * @param[out] signal Pointer to a memory location where the HSA runtime will
+ * store the newly created signal handle. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate
+ * the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p signal is NULL, @p
+ * num_consumers is greater than 0 but @p consumers is NULL, or @p consumers
+ * contains duplicates.
+ */
+hsa_status_t HSA_API hsa_signal_create(
+ hsa_signal_value_t initial_value,
+ uint32_t num_consumers,
+ const hsa_agent_t *consumers,
+ hsa_signal_t *signal);
+
+/**
+ * @brief Destroy a signal previously created by ::hsa_signal_create.
+ *
+ * @param[in] signal Signal.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL @p signal is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT The handle in @p signal is 0.
+ */
+hsa_status_t HSA_API hsa_signal_destroy(
+ hsa_signal_t signal);
+
+/**
+ * @brief Atomically read the current value of a signal.
+ *
+ * @param[in] signal Signal.
+ *
+ * @return Value of the signal.
+ */
+hsa_signal_value_t HSA_API hsa_signal_load_scacquire(
+ hsa_signal_t signal);
+
+/**
+ * @copydoc hsa_signal_load_scacquire
+ */
+hsa_signal_value_t HSA_API hsa_signal_load_relaxed(
+ hsa_signal_t signal);
+
+/**
+ * @deprecated Renamed as ::hsa_signal_load_scacquire.
+ *
+ * @copydoc hsa_signal_load_scacquire
+*/
+hsa_signal_value_t HSA_API HSA_DEPRECATED hsa_signal_load_acquire(
+ hsa_signal_t signal);
+
+/**
+ * @brief Atomically set the value of a signal.
+ *
+ * @details If the value of the signal is changed, all the agents waiting
+ * on @p signal for which @p value satisfies their wait condition are awakened.
+ *
+ * The memory order applied to the store is the one indicated in the name of
+ * the function variant being called.
+ *
+ * @param[in] signal Signal.
+ *
+ * @param[in] value New signal value.
+ */
+void HSA_API hsa_signal_store_relaxed(
+ hsa_signal_t signal,
+ hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_store_relaxed
+ */
+void HSA_API hsa_signal_store_screlease(
+ hsa_signal_t signal,
+ hsa_signal_value_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_signal_store_screlease.
+ *
+ * @copydoc hsa_signal_store_screlease
+ */
+void HSA_API HSA_DEPRECATED hsa_signal_store_release(
+ hsa_signal_t signal,
+ hsa_signal_value_t value);
+
+/**
+ * @brief Atomically set the value of a signal without necessarily notifying
+ * the agents waiting on it.
+ *
+ * @details The agents waiting on @p signal may not wake up even when the new
+ * value satisfies their wait condition. If the application wants to update the
+ * signal and there is no need to notify any agent, invoking this function can
+ * be more efficient than calling the non-silent counterpart.
+ *
+ * @param[in] signal Signal.
+ *
+ * @param[in] value New signal value.
+ */
+void HSA_API hsa_signal_silent_store_relaxed(
+ hsa_signal_t signal,
+ hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_silent_store_relaxed
+ */
+void HSA_API hsa_signal_silent_store_screlease(
+ hsa_signal_t signal,
+ hsa_signal_value_t value);
+
+/**
+ * @brief Atomically set the value of a signal and return its previous value.
+ *
+ * @details If the value of the signal is changed, all the agents waiting
+ * on @p signal for which @p value satisfies their wait condition are awakened.
+ *
+ * The memory order applied to the exchange is the one indicated in the name of
+ * the function variant being called.
+ *
+ * @param[in] signal Signal. If @p signal is a queue doorbell signal, the
+ * behavior is undefined.
+ *
+ * @param[in] value New value.
+ *
+ * @return Value of the signal prior to the exchange.
+ */
+hsa_signal_value_t HSA_API hsa_signal_exchange_scacq_screl(
+ hsa_signal_t signal,
+ hsa_signal_value_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_signal_exchange_scacq_screl.
+ *
+ * @copydoc hsa_signal_exchange_scacq_screl
+ */
+hsa_signal_value_t HSA_API HSA_DEPRECATED hsa_signal_exchange_acq_rel(
+ hsa_signal_t signal,
+ hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_exchange_scacq_screl
+ */
+hsa_signal_value_t HSA_API hsa_signal_exchange_scacquire(
+ hsa_signal_t signal,
+ hsa_signal_value_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_signal_exchange_scacquire.
+ *
+ * @copydoc hsa_signal_exchange_scacquire
+ */
+hsa_signal_value_t HSA_API HSA_DEPRECATED hsa_signal_exchange_acquire(
+ hsa_signal_t signal,
+ hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_exchange_scacq_screl
+ */
+hsa_signal_value_t HSA_API hsa_signal_exchange_relaxed(
+ hsa_signal_t signal,
+ hsa_signal_value_t value);
+/**
+ * @copydoc hsa_signal_exchange_scacq_screl
+ */
+hsa_signal_value_t HSA_API hsa_signal_exchange_screlease(
+ hsa_signal_t signal,
+ hsa_signal_value_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_signal_exchange_screlease.
+ *
+ * @copydoc hsa_signal_exchange_screlease
+ */
+hsa_signal_value_t HSA_API HSA_DEPRECATED hsa_signal_exchange_release(
+ hsa_signal_t signal,
+ hsa_signal_value_t value);
+
+/**
+ * @brief Atomically set the value of a signal if the observed value is equal to
+ * the expected value. The observed value is returned regardless of whether the
+ * replacement was done.
+ *
+ * @details If the value of the signal is changed, all the agents waiting
+ * on @p signal for which @p value satisfies their wait condition are awakened.
+ *
+ * The memory order applied to the operation is the one indicated in the name
+ * of the function variant being called.
+ *
+ * @param[in] signal Signal. If @p signal is a queue
+ * doorbell signal, the behavior is undefined.
+ *
+ * @param[in] expected Value to compare with.
+ *
+ * @param[in] value New value. Stored only if the observed value is equal to
+ * @p expected.
+ *
+ * @return Observed value of the signal.
+ */
+hsa_signal_value_t HSA_API hsa_signal_cas_scacq_screl(
+ hsa_signal_t signal,
+ hsa_signal_value_t expected,
+ hsa_signal_value_t value);
+
+
+/**
+ * @deprecated Renamed as ::hsa_signal_cas_scacq_screl.
+ *
+ * @copydoc hsa_signal_cas_scacq_screl
+ */
+hsa_signal_value_t HSA_API HSA_DEPRECATED hsa_signal_cas_acq_rel(
+ hsa_signal_t signal,
+ hsa_signal_value_t expected,
+ hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_cas_scacq_screl
+ */
+hsa_signal_value_t HSA_API hsa_signal_cas_scacquire(
+ hsa_signal_t signal,
+ hsa_signal_value_t expected,
+ hsa_signal_value_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_signal_cas_scacquire.
+ *
+ * @copydoc hsa_signal_cas_scacquire
+ */
+hsa_signal_value_t HSA_API HSA_DEPRECATED hsa_signal_cas_acquire(
+ hsa_signal_t signal,
+ hsa_signal_value_t expected,
+ hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_cas_scacq_screl
+ */
+hsa_signal_value_t HSA_API hsa_signal_cas_relaxed(
+ hsa_signal_t signal,
+ hsa_signal_value_t expected,
+ hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_cas_scacq_screl
+ */
+hsa_signal_value_t HSA_API hsa_signal_cas_screlease(
+ hsa_signal_t signal,
+ hsa_signal_value_t expected,
+ hsa_signal_value_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_signal_cas_screlease.
+ *
+ * @copydoc hsa_signal_cas_screlease
+ */
+hsa_signal_value_t HSA_API HSA_DEPRECATED hsa_signal_cas_release(
+ hsa_signal_t signal,
+ hsa_signal_value_t expected,
+ hsa_signal_value_t value);
+
+/**
+ * @brief Atomically increment the value of a signal by a given amount.
+ *
+ * @details If the value of the signal is changed, all the agents waiting on
+ * @p signal for which the resulting signal value satisfies their wait
+ * condition are awakened.
+ *
+ * @param[in] signal Signal. If @p signal is a queue doorbell signal, the
+ * behavior is undefined.
+ *
+ * @param[in] value Value to add to the value of the signal.
+ *
+ */
+void HSA_API hsa_signal_add_scacq_screl(
+ hsa_signal_t signal,
+ hsa_signal_value_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_signal_add_scacq_screl.
+ *
+ * @copydoc hsa_signal_add_scacq_screl
+ */
+void HSA_API HSA_DEPRECATED hsa_signal_add_acq_rel(
+ hsa_signal_t signal,
+ hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_add_scacq_screl
+ */
+void HSA_API hsa_signal_add_scacquire(
+ hsa_signal_t signal,
+ hsa_signal_value_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_signal_add_scacquire.
+ *
+ * @copydoc hsa_signal_add_scacquire
+ */
+void HSA_API HSA_DEPRECATED hsa_signal_add_acquire(
+ hsa_signal_t signal,
+ hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_add_scacq_screl
+ */
+void HSA_API hsa_signal_add_relaxed(
+ hsa_signal_t signal,
+ hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_add_scacq_screl
+ */
+void HSA_API hsa_signal_add_screlease(
+ hsa_signal_t signal,
+ hsa_signal_value_t value);
+
+
+/**
+ * @deprecated Renamed as ::hsa_signal_add_screlease.
+ *
+ * @copydoc hsa_signal_add_screlease
+ */
+void HSA_API HSA_DEPRECATED hsa_signal_add_release(
+ hsa_signal_t signal,
+ hsa_signal_value_t value);
+
+/**
+ * @brief Atomically decrement the value of a signal by a given amount.
+ *
+ * @details If the value of the signal is changed, all the agents waiting on
+ * @p signal for which the resulting signal value satisfies their wait
+ * condition are awakened.
+ *
+ * @param[in] signal Signal. If @p signal is a queue doorbell signal, the
+ * behavior is undefined.
+ *
+ * @param[in] value Value to subtract from the value of the signal.
+ *
+ */
+void HSA_API hsa_signal_subtract_scacq_screl(
+ hsa_signal_t signal,
+ hsa_signal_value_t value);
+
+
+/**
+ * @deprecated Renamed as ::hsa_signal_subtract_scacq_screl.
+ *
+ * @copydoc hsa_signal_subtract_scacq_screl
+ */
+void HSA_API HSA_DEPRECATED hsa_signal_subtract_acq_rel(
+ hsa_signal_t signal,
+ hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_subtract_scacq_screl
+ */
+void HSA_API hsa_signal_subtract_scacquire(
+ hsa_signal_t signal,
+ hsa_signal_value_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_signal_subtract_scacquire.
+ *
+ * @copydoc hsa_signal_subtract_scacquire
+ */
+void HSA_API HSA_DEPRECATED hsa_signal_subtract_acquire(
+ hsa_signal_t signal,
+ hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_subtract_scacq_screl
+ */
+void HSA_API hsa_signal_subtract_relaxed(
+ hsa_signal_t signal,
+ hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_subtract_scacq_screl
+ */
+void HSA_API hsa_signal_subtract_screlease(
+ hsa_signal_t signal,
+ hsa_signal_value_t value);
+
+
+/**
+ * @deprecated Renamed as ::hsa_signal_subtract_screlease.
+ *
+ * @copydoc hsa_signal_subtract_screlease
+ */
+void HSA_API HSA_DEPRECATED hsa_signal_subtract_release(
+ hsa_signal_t signal,
+ hsa_signal_value_t value);
+
+/**
+ * @brief Atomically perform a bitwise AND operation between the value of a
+ * signal and a given value.
+ *
+ * @details If the value of the signal is changed, all the agents waiting on
+ * @p signal for which the resulting signal value satisfies their wait
+ * condition are awakened.
+ *
+ * @param[in] signal Signal. If @p signal is a queue doorbell signal, the
+ * behavior is undefined.
+ *
+ * @param[in] value Value to AND with the value of the signal.
+ *
+ */
+void HSA_API hsa_signal_and_scacq_screl(
+ hsa_signal_t signal,
+ hsa_signal_value_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_signal_and_scacq_screl.
+ *
+ * @copydoc hsa_signal_and_scacq_screl
+ */
+void HSA_API HSA_DEPRECATED hsa_signal_and_acq_rel(
+ hsa_signal_t signal,
+ hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_and_scacq_screl
+ */
+void HSA_API hsa_signal_and_scacquire(
+ hsa_signal_t signal,
+ hsa_signal_value_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_signal_and_scacquire.
+ *
+ * @copydoc hsa_signal_and_scacquire
+ */
+void HSA_API HSA_DEPRECATED hsa_signal_and_acquire(
+ hsa_signal_t signal,
+ hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_and_scacq_screl
+ */
+void HSA_API hsa_signal_and_relaxed(
+ hsa_signal_t signal,
+ hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_and_scacq_screl
+ */
+void HSA_API hsa_signal_and_screlease(
+ hsa_signal_t signal,
+ hsa_signal_value_t value);
+
+
+/**
+ * @deprecated Renamed as ::hsa_signal_and_screlease.
+ *
+ * @copydoc hsa_signal_and_screlease
+ */
+void HSA_API HSA_DEPRECATED hsa_signal_and_release(
+ hsa_signal_t signal,
+ hsa_signal_value_t value);
+
+/**
+ * @brief Atomically perform a bitwise OR operation between the value of a
+ * signal and a given value.
+ *
+ * @details If the value of the signal is changed, all the agents waiting on
+ * @p signal for which the resulting signal value satisfies their wait
+ * condition are awakened.
+ *
+ * @param[in] signal Signal. If @p signal is a queue doorbell signal, the
+ * behavior is undefined.
+ *
+ * @param[in] value Value to OR with the value of the signal.
+ */
+void HSA_API hsa_signal_or_scacq_screl(
+ hsa_signal_t signal,
+ hsa_signal_value_t value);
+
+
+/**
+ * @deprecated Renamed as ::hsa_signal_or_scacq_screl.
+ *
+ * @copydoc hsa_signal_or_scacq_screl
+ */
+void HSA_API HSA_DEPRECATED hsa_signal_or_acq_rel(
+ hsa_signal_t signal,
+ hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_or_scacq_screl
+ */
+void HSA_API hsa_signal_or_scacquire(
+ hsa_signal_t signal,
+ hsa_signal_value_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_signal_or_scacquire.
+ *
+ * @copydoc hsa_signal_or_scacquire
+ */
+void HSA_API HSA_DEPRECATED hsa_signal_or_acquire(
+ hsa_signal_t signal,
+ hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_or_scacq_screl
+ */
+void HSA_API hsa_signal_or_relaxed(
+ hsa_signal_t signal,
+ hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_or_scacq_screl
+ */
+void HSA_API hsa_signal_or_screlease(
+ hsa_signal_t signal,
+ hsa_signal_value_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_signal_or_screlease.
+ *
+ * @copydoc hsa_signal_or_screlease
+ */
+void HSA_API HSA_DEPRECATED hsa_signal_or_release(
+ hsa_signal_t signal,
+ hsa_signal_value_t value);
+
+/**
+ * @brief Atomically perform a bitwise XOR operation between the value of a
+ * signal and a given value.
+ *
+ * @details If the value of the signal is changed, all the agents waiting on
+ * @p signal for which the resulting signal value satisfies their wait
+ * condition are awakened.
+ *
+ * @param[in] signal Signal. If @p signal is a queue doorbell signal, the
+ * behavior is undefined.
+ *
+ * @param[in] value Value to XOR with the value of the signal.
+ *
+ */
+void HSA_API hsa_signal_xor_scacq_screl(
+ hsa_signal_t signal,
+ hsa_signal_value_t value);
+
+
+/**
+ * @deprecated Renamed as ::hsa_signal_xor_scacq_screl.
+ *
+ * @copydoc hsa_signal_xor_scacq_screl
+ */
+void HSA_API HSA_DEPRECATED hsa_signal_xor_acq_rel(
+ hsa_signal_t signal,
+ hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_xor_scacq_screl
+ */
+void HSA_API hsa_signal_xor_scacquire(
+ hsa_signal_t signal,
+ hsa_signal_value_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_signal_xor_scacquire.
+ *
+ * @copydoc hsa_signal_xor_scacquire
+ */
+void HSA_API HSA_DEPRECATED hsa_signal_xor_acquire(
+ hsa_signal_t signal,
+ hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_xor_scacq_screl
+ */
+void HSA_API hsa_signal_xor_relaxed(
+ hsa_signal_t signal,
+ hsa_signal_value_t value);
+
+/**
+ * @copydoc hsa_signal_xor_scacq_screl
+ */
+void HSA_API hsa_signal_xor_screlease(
+ hsa_signal_t signal,
+ hsa_signal_value_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_signal_xor_screlease.
+ *
+ * @copydoc hsa_signal_xor_screlease
+ */
+void HSA_API HSA_DEPRECATED hsa_signal_xor_release(
+ hsa_signal_t signal,
+ hsa_signal_value_t value);
+
+/**
+ * @brief Wait condition operator. The operands being compared are the value of
+ * the signal and the comparison value passed to a wait operation such as
+ * ::hsa_signal_wait_scacquire.
+ */
+typedef enum {
+ /**
+ * The two operands are equal.
+ */
+ HSA_SIGNAL_CONDITION_EQ = 0,
+ /**
+ * The two operands are not equal.
+ */
+ HSA_SIGNAL_CONDITION_NE = 1,
+ /**
+ * The first operand is less than the second operand.
+ */
+ HSA_SIGNAL_CONDITION_LT = 2,
+ /**
+ * The first operand is greater than or equal to the second operand.
+ */
+ HSA_SIGNAL_CONDITION_GTE = 3
+} hsa_signal_condition_t;
+
+/**
+ * @brief State of the application thread during a signal wait. Passed to the
+ * wait functions as a hint; the actual waiting state is decided by the HSA
+ * runtime.
+ */
+typedef enum {
+ /**
+ * The application thread may be rescheduled while waiting on the signal.
+ */
+ HSA_WAIT_STATE_BLOCKED = 0,
+ /**
+ * The application thread stays active while waiting on a signal.
+ */
+ HSA_WAIT_STATE_ACTIVE = 1
+} hsa_wait_state_t;
+
+
+/**
+ * @brief Wait until a signal value satisfies a specified condition, or a
+ * certain amount of time has elapsed.
+ *
+ * @details A wait operation can spuriously resume at any time sooner than the
+ * timeout (for example, due to system or other external factors) even when the
+ * condition has not been met.
+ *
+ * The function is guaranteed to return if the signal value satisfies the
+ * condition at some point in time during the wait, but the value returned to
+ * the application might not satisfy the condition. The application must ensure
+ * that signals are used in such a way that wait wakeup conditions are not
+ * invalidated before dependent threads have woken up.
+ *
+ * When the wait operation internally loads the value of the passed signal, it
+ * uses the memory order indicated in the function name.
+ *
+ * @param[in] signal Signal.
+ *
+ * @param[in] condition Condition used to compare the signal value with @p
+ * compare_value.
+ *
+ * @param[in] compare_value Value to compare with.
+ *
+ * @param[in] timeout_hint Maximum duration of the wait. Specified in the same
+ * unit as the system timestamp. The operation might block for a shorter or
+ * longer time even if the condition is not met. A value of UINT64_MAX indicates
+ * no maximum.
+ *
+ * @param[in] wait_state_hint Hint used by the application to indicate the
+ * preferred waiting state. The actual waiting state is ultimately decided by
+ * HSA runtime and may not match the provided hint. A value of
+ * ::HSA_WAIT_STATE_ACTIVE may improve the latency of response to a signal
+ * update by avoiding rescheduling overhead.
+ *
+ * @return Observed value of the signal, which might not satisfy the specified
+ * condition.
+ *
+ */
+hsa_signal_value_t HSA_API hsa_signal_wait_scacquire(
+ hsa_signal_t signal,
+ hsa_signal_condition_t condition,
+ hsa_signal_value_t compare_value,
+ uint64_t timeout_hint,
+ hsa_wait_state_t wait_state_hint);
+
+/**
+ * @copydoc hsa_signal_wait_scacquire
+ */
+hsa_signal_value_t HSA_API hsa_signal_wait_relaxed(
+ hsa_signal_t signal,
+ hsa_signal_condition_t condition,
+ hsa_signal_value_t compare_value,
+ uint64_t timeout_hint,
+ hsa_wait_state_t wait_state_hint);
+
+/**
+ * @deprecated Renamed as ::hsa_signal_wait_scacquire.
+ *
+ * @copydoc hsa_signal_wait_scacquire
+ */
+hsa_signal_value_t HSA_API HSA_DEPRECATED hsa_signal_wait_acquire(
+ hsa_signal_t signal,
+ hsa_signal_condition_t condition,
+ hsa_signal_value_t compare_value,
+ uint64_t timeout_hint,
+ hsa_wait_state_t wait_state_hint);
+
+/**
+ * @brief Group of signals. Created with ::hsa_signal_group_create and
+ * destroyed with ::hsa_signal_group_destroy.
+ */
+typedef struct hsa_signal_group_s {
+ /**
+ * Opaque handle. Two handles reference the same object of the enclosing type
+ * if and only if they are equal.
+ */
+ uint64_t handle;
+} hsa_signal_group_t;
+
+/**
+ * @brief Create a signal group.
+ *
+ * @param[in] num_signals Number of elements in @p signals. Must not be 0.
+ *
+ * @param[in] signals List of signals in the group. The list must not contain
+ * any repeated elements. Must not be NULL.
+ *
+ * @param[in] num_consumers Number of elements in @p consumers. Must not be 0.
+ *
+ * @param[in] consumers List of agents that might consume (wait on) the signal
+ * group. The list must not contain repeated elements, and must be a subset of
+ * the set of agents that are allowed to wait on all the signals in the
+ * group. If an agent not listed in @p consumers waits on the returned group,
+ * the behavior is undefined. The memory associated with @p consumers can be
+ * reused or freed after the function returns. Must not be NULL.
+ *
+ * @param[out] signal_group Pointer to a memory location where the handle of
+ * the newly created signal group is stored. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate
+ * the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p num_signals is 0, @p signals
+ * is NULL, @p num_consumers is 0, @p consumers is NULL, or @p signal_group is
+ * NULL.
+ */
+hsa_status_t HSA_API hsa_signal_group_create(
+ uint32_t num_signals,
+ const hsa_signal_t *signals,
+ uint32_t num_consumers,
+ const hsa_agent_t *consumers,
+ hsa_signal_group_t *signal_group);
+
+/**
+ * @brief Destroy a signal group previously created by
+ * ::hsa_signal_group_create.
+ *
+ * @param[in] signal_group Signal group.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL_GROUP @p signal_group is invalid.
+ */
+hsa_status_t HSA_API hsa_signal_group_destroy(
+ hsa_signal_group_t signal_group);
+
+/**
+ * @brief Wait until the value of at least one of the signals in a signal group
+ * satisfies its associated condition.
+ *
+ * @details The function is guaranteed to return if the value of at least one of
+ * the signals in the group satisfies its associated condition at some point in
+ * time during the wait, but the signal value returned to the application may no
+ * longer satisfy the condition. The application must ensure that signals in the
+ * group are used in such a way that wait wakeup conditions are not invalidated
+ * before dependent threads have woken up.
+ *
+ * When this operation internally loads the value of the passed signal, it uses
+ * the memory order indicated in the function name.
+ *
+ * @param[in] signal_group Signal group.
+ *
+ * @param[in] conditions List of conditions. Each condition, and the value at
+ * the same index in @p compare_values, is used to compare the value of the
+ * signal at that index in @p signal_group (the signal passed by the application
+ * to ::hsa_signal_group_create at that particular index). The size of @p
+ * conditions must not be smaller than the number of signals in @p signal_group;
+ * any extra elements are ignored. Must not be NULL.
+ *
+ * @param[in] compare_values List of comparison values. The size of @p
+ * compare_values must not be smaller than the number of signals in @p
+ * signal_group; any extra elements are ignored. Must not be NULL.
+ *
+ * @param[in] wait_state_hint Hint used by the application to indicate the
+ * preferred waiting state. The actual waiting state is decided by the HSA runtime
+ * and may not match the provided hint. A value of ::HSA_WAIT_STATE_ACTIVE may
+ * improve the latency of response to a signal update by avoiding rescheduling
+ * overhead.
+ *
+ * @param[out] signal Signal in the group that satisfied the associated
+ * condition. If several signals satisfied their condition, the function can
+ * return any of those signals. Must not be NULL.
+ *
+ * @param[out] value Observed value for @p signal, which might no longer satisfy
+ * the specified condition. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL_GROUP @p signal_group is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p conditions is NULL, @p
+ * compare_values is NULL, @p signal is NULL, or @p value is NULL.
+ */
+hsa_status_t HSA_API hsa_signal_group_wait_any_scacquire(
+ hsa_signal_group_t signal_group,
+ const hsa_signal_condition_t *conditions,
+ const hsa_signal_value_t *compare_values,
+ hsa_wait_state_t wait_state_hint,
+ hsa_signal_t *signal,
+ hsa_signal_value_t *value);
+
+/**
+ * @copydoc hsa_signal_group_wait_any_scacquire
+ */
+hsa_status_t HSA_API hsa_signal_group_wait_any_relaxed(
+ hsa_signal_group_t signal_group,
+ const hsa_signal_condition_t *conditions,
+ const hsa_signal_value_t *compare_values,
+ hsa_wait_state_t wait_state_hint,
+ hsa_signal_t *signal,
+ hsa_signal_value_t *value);
+
+/** @} */
+
+/** \defgroup memory Memory
+ * @{
+ */
+
+/**
+ * @brief A memory region represents a block of virtual memory with certain
+ * properties. For example, the HSA runtime represents fine-grained memory in
+ * the global segment using a region. A region might be associated with more
+ * than one agent. A region is also what ::hsa_soft_queue_create takes to
+ * decide where the AQL packet buffer is allocated.
+ */
+typedef struct hsa_region_s {
+ /**
+ * Opaque handle. Two handles reference the same object of the enclosing type
+ * if and only if they are equal.
+ */
+ uint64_t handle;
+} hsa_region_t;
+
+/** @} */
+
+
+/** \defgroup queue Queues
+ * @{
+ */
+
+/**
+ * @brief Queue type. Intended to be used for dynamic queue protocol
+ * determination. Queue structures and queue creation functions carry these
+ * constants in the fixed-size ::hsa_queue_type32_t representation.
+ */
+typedef enum {
+ /**
+ * Queue supports multiple producers. Use of multiproducer queue mechanics is
+ * required.
+ */
+ HSA_QUEUE_TYPE_MULTI = 0,
+ /**
+ * Queue only supports a single producer. In some scenarios, the application
+ * may want to limit the submission of AQL packets to a single agent. Queues
+ * that support a single producer may be more efficient than queues supporting
+ * multiple producers. Use of multiproducer queue mechanics is not supported.
+ */
+ HSA_QUEUE_TYPE_SINGLE = 1,
+ /**
+ * Queue supports multiple producers and cooperative dispatches. Cooperative
+ * dispatches are able to use GWS synchronization. Queues of this type may be
+ * limited in number. The runtime may return the same queue to serve multiple
+ * ::hsa_queue_create calls when this type is given. Callers must inspect the
+ * returned queue to discover queue size. Queues of this type are reference
+ * counted and require a matching number of ::hsa_queue_destroy calls to
+ * release. Use of multiproducer queue mechanics is required. See
+ * ::HSA_AMD_AGENT_INFO_COOPERATIVE_QUEUES to query agent support for this
+ * type.
+ */
+ HSA_QUEUE_TYPE_COOPERATIVE = 2
+} hsa_queue_type_t;
+
+/**
+ * @brief A fixed-size type used to represent ::hsa_queue_type_t constants.
+ * Used for the @a type field of ::hsa_queue_t and for the @p type parameter of
+ * ::hsa_queue_create and ::hsa_soft_queue_create.
+ */
+typedef uint32_t hsa_queue_type32_t;
+
+/**
+ * @brief Queue features. The values are combined into a bit-field, such as the
+ * @a features field of ::hsa_queue_t.
+ */
+typedef enum {
+ /**
+ * Queue supports kernel dispatch packets.
+ */
+ HSA_QUEUE_FEATURE_KERNEL_DISPATCH = 1,
+
+ /**
+ * Queue supports agent dispatch packets.
+ */
+ HSA_QUEUE_FEATURE_AGENT_DISPATCH = 2
+} hsa_queue_feature_t;
+
+/**
+ * @brief User mode queue.
+ *
+ * @details The queue structure is read-only and allocated by the HSA runtime,
+ * but agents can directly modify the contents of the buffer pointed to by @a
+ * base_address, or use HSA runtime APIs to access the doorbell signal.
+ *
+ * The @a base_address field is declared under three preprocessor
+ * configurations: on its own when HSA_LARGE_MODEL is defined; followed by the
+ * 32-bit @a reserved0 field when HSA_LARGE_MODEL is not defined and
+ * HSA_LITTLE_ENDIAN is defined; and preceded by @a reserved0 otherwise.
+ */
+typedef struct hsa_queue_s {
+ /**
+ * Queue type.
+ */
+ hsa_queue_type32_t type;
+
+ /**
+ * Queue features mask. This is a bit-field of ::hsa_queue_feature_t
+ * values. Applications should ignore any unknown set bits.
+ */
+ uint32_t features;
+
+#ifdef HSA_LARGE_MODEL
+ void* base_address;
+#elif defined HSA_LITTLE_ENDIAN
+ /**
+ * Starting address of the HSA runtime-allocated buffer used to store the AQL
+ * packets. Must be aligned to the size of an AQL packet.
+ */
+ void* base_address;
+ /**
+ * Reserved. Must be 0.
+ */
+ uint32_t reserved0;
+#else
+ uint32_t reserved0;
+ void* base_address;
+#endif
+
+ /**
+ * Signal object used by the application to indicate the ID of a packet that
+ * is ready to be processed. The HSA runtime manages the doorbell signal. If
+ * the application tries to replace or destroy this signal, the behavior is
+ * undefined.
+ *
+ * If @a type is ::HSA_QUEUE_TYPE_SINGLE, the doorbell signal value must be
+ * updated in a monotonically increasing fashion. If @a type is
+ * ::HSA_QUEUE_TYPE_MULTI, the doorbell signal value can be updated with any
+ * value.
+ */
+ hsa_signal_t doorbell_signal;
+
+ /**
+ * Maximum number of packets the queue can hold. Must be a power of 2.
+ */
+ uint32_t size;
+ /**
+ * Reserved. Must be 0.
+ */
+ uint32_t reserved1;
+ /**
+ * Queue identifier, which is unique over the lifetime of the application.
+ */
+ uint64_t id;
+
+} hsa_queue_t;
+
+/**
+ * @brief Create a user mode queue.
+ *
+ * @details The HSA runtime creates the queue structure, the underlying packet
+ * buffer, the completion signal, and the write and read indexes. The initial
+ * value of the write and read indexes is 0. The type of every packet in the
+ * buffer is initialized to ::HSA_PACKET_TYPE_INVALID.
+ *
+ * The application should only rely on the error code returned to determine if
+ * the queue is valid.
+ *
+ * @param[in] agent Agent where to create the queue.
+ *
+ * @param[in] size Number of packets the queue is expected to
+ * hold. Must be a power of 2 between 1 and the value of
+ * ::HSA_AGENT_INFO_QUEUE_MAX_SIZE in @p agent. The size of the newly
+ * created queue is the maximum of @p size and the value of
+ * ::HSA_AGENT_INFO_QUEUE_MIN_SIZE in @p agent.
+ *
+ * @param[in] type Type of the queue, a bitwise OR of ::hsa_queue_type_t values.
+ * If the value of ::HSA_AGENT_INFO_QUEUE_TYPE in @p agent is ::HSA_QUEUE_TYPE_SINGLE,
+ * then @p type must also be ::HSA_QUEUE_TYPE_SINGLE.
+ *
+ * @param[in] callback Callback invoked by the HSA runtime for every
+ * asynchronous event related to the newly created queue. May be NULL. The HSA
+ * runtime passes three arguments to the callback: a code identifying the event
+ * that triggered the invocation, a pointer to the queue where the event
+ * originated, and the application data.
+ *
+ * @param[in] data Application data that is passed to @p callback on every
+ * invocation. May be NULL.
+ *
+ * @param[in] private_segment_size Hint indicating the maximum
+ * expected private segment usage per work-item, in bytes. There may
+ * be performance degradation if the application places a kernel
+ * dispatch packet in the queue and the corresponding private segment
+ * usage exceeds @p private_segment_size. If the application does not
+ * want to specify any particular value for this argument, @p
+ * private_segment_size must be UINT32_MAX. If the queue does not
+ * support kernel dispatch packets, this argument is ignored.
+ *
+ * @param[in] group_segment_size Hint indicating the maximum expected
+ * group segment usage per work-group, in bytes. There may be
+ * performance degradation if the application places a kernel dispatch
+ * packet in the queue and the corresponding group segment usage
+ * exceeds @p group_segment_size. If the application does not want to
+ * specify any particular value for this argument, @p
+ * group_segment_size must be UINT32_MAX. If the queue does not
+ * support kernel dispatch packets, this argument is ignored.
+ *
+ * @param[out] queue Memory location where the HSA runtime stores a pointer to
+ * the newly created queue.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate
+ * the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE_CREATION @p agent does not
+ * support queues of the given type.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p size is not a power of two,
+ * @p size is 0, @p type is an invalid queue type, or @p queue is NULL.
+ *
+ */
+hsa_status_t HSA_API hsa_queue_create(
+ hsa_agent_t agent,
+ uint32_t size,
+ hsa_queue_type32_t type,
+ void (*callback)(hsa_status_t status, hsa_queue_t *source, void *data),
+ void *data,
+ uint32_t private_segment_size,
+ uint32_t group_segment_size,
+ hsa_queue_t **queue);
+
+/**
+ * @brief Create a queue for which the application or a kernel is responsible
+ * for processing the AQL packets.
+ *
+ * @details The application can use this function to create queues where AQL
+ * packets are not parsed by the packet processor associated with an agent,
+ * but rather by a unit of execution running on that agent (for example, a
+ * thread in the host application).
+ *
+ * The application is responsible for ensuring that all the producers and
+ * consumers of the resulting queue can access the provided doorbell signal
+ * and memory region. The application is also responsible for ensuring that the
+ * unit of execution processing the queue packets supports the indicated
+ * features (AQL packet types).
+ *
+ * When the queue is created, the HSA runtime allocates the packet buffer using
+ * @p region, and the write and read indexes. The initial value of the write and
+ * read indexes is 0, and the type of every packet in the buffer is initialized
+ * to ::HSA_PACKET_TYPE_INVALID. The value of the @e size, @e type, @e features,
+ * and @e doorbell_signal fields in the returned queue match the values passed
+ * by the application.
+ *
+ * @param[in] region Memory region that the HSA runtime should use to allocate
+ * the AQL packet buffer and any other queue metadata.
+ *
+ * @param[in] size Number of packets the queue is expected to hold. Must be a
+ * power of 2 greater than 0.
+ *
+ * @param[in] type Queue type.
+ *
+ * @param[in] features Supported queue features. This is a bit-field of
+ * ::hsa_queue_feature_t values.
+ *
+ * @param[in] doorbell_signal Doorbell signal that the HSA runtime must
+ * associate with the returned queue. The signal handle must not be 0.
+ *
+ * @param[out] queue Memory location where the HSA runtime stores a pointer to
+ * the newly created queue. The application should not rely on the value
+ * returned for this argument but only on the status code to determine if the
+ * queue is valid. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate
+ * the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p size is not a power of two, @p
+ * size is 0, @p type is an invalid queue type, the doorbell signal handle is
+ * 0, or @p queue is NULL.
+ *
+ */
+hsa_status_t HSA_API hsa_soft_queue_create(
+ hsa_region_t region,
+ uint32_t size,
+ hsa_queue_type32_t type,
+ uint32_t features,
+ hsa_signal_t doorbell_signal,
+ hsa_queue_t **queue);
+
+/**
+ * @brief Destroy a user mode queue.
+ *
+ * @details When a queue is destroyed, the state of the AQL packets that have
+ * not yet been fully processed (their completion phase has not finished)
+ * becomes undefined. It is the responsibility of the application to ensure that
+ * all pending queue operations are finished if their results are required.
+ *
+ * The resources allocated by the HSA runtime during queue creation (queue
+ * structure, ring buffer, doorbell signal) are released. The queue should not
+ * be accessed after being destroyed.
+ *
+ * @param[in] queue Pointer to a queue created using ::hsa_queue_create.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE The queue is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p queue is NULL.
+ */
+hsa_status_t HSA_API hsa_queue_destroy(
+ hsa_queue_t *queue);
+
+/**
+ * @brief Inactivate a queue.
+ *
+ * @details Inactivating the queue aborts any pending executions and prevents
+ * any new packets from being processed. Any further packets written to the
+ * queue once it is inactivated will be ignored by the packet processor.
+ *
+ * @param[in] queue Pointer to a queue.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE The queue is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p queue is NULL.
+ */
+hsa_status_t HSA_API hsa_queue_inactivate(
+ hsa_queue_t *queue);
+
+/**
+ * @deprecated Renamed as ::hsa_queue_load_read_index_scacquire.
+ *
+ * @copydoc hsa_queue_load_read_index_scacquire
+ */
+uint64_t HSA_API HSA_DEPRECATED hsa_queue_load_read_index_acquire(
+ const hsa_queue_t *queue);
+
+/**
+ * @brief Atomically load the read index of a queue.
+ *
+ * @param[in] queue Pointer to a queue.
+ *
+ * @return Read index of the queue pointed to by @p queue.
+ */
+uint64_t HSA_API hsa_queue_load_read_index_scacquire(
+ const hsa_queue_t *queue);
+
+/**
+ * @copydoc hsa_queue_load_read_index_scacquire
+ */
+uint64_t HSA_API hsa_queue_load_read_index_relaxed(
+ const hsa_queue_t *queue);
+
+/**
+ * @deprecated Renamed as ::hsa_queue_load_write_index_scacquire.
+ *
+ * @copydoc hsa_queue_load_write_index_scacquire
+ */
+uint64_t HSA_API HSA_DEPRECATED hsa_queue_load_write_index_acquire(
+ const hsa_queue_t *queue);
+
+/**
+ * @brief Atomically load the write index of a queue.
+ *
+ * @param[in] queue Pointer to a queue.
+ *
+ * @return Write index of the queue pointed to by @p queue.
+ */
+uint64_t HSA_API hsa_queue_load_write_index_scacquire(
+ const hsa_queue_t *queue);
+
+/**
+ * @copydoc hsa_queue_load_write_index_scacquire
+ */
+uint64_t HSA_API hsa_queue_load_write_index_relaxed(
+ const hsa_queue_t *queue);
+
+/**
+ * @brief Atomically set the write index of a queue.
+ *
+ * @details It is recommended that the application uses this function to update
+ * the write index when there is a single agent submitting work to the queue
+ * (the queue type is ::HSA_QUEUE_TYPE_SINGLE).
+ *
+ * @param[in] queue Pointer to a queue.
+ *
+ * @param[in] value Value to assign to the write index.
+ *
+ */
+void HSA_API hsa_queue_store_write_index_relaxed(
+ const hsa_queue_t *queue,
+ uint64_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_queue_store_write_index_screlease.
+ *
+ * @copydoc hsa_queue_store_write_index_screlease
+ */
+void HSA_API HSA_DEPRECATED hsa_queue_store_write_index_release(
+ const hsa_queue_t *queue,
+ uint64_t value);
+
+/**
+ * @copydoc hsa_queue_store_write_index_relaxed
+ */
+void HSA_API hsa_queue_store_write_index_screlease(
+ const hsa_queue_t *queue,
+ uint64_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_queue_cas_write_index_scacq_screl.
+ *
+ * @copydoc hsa_queue_cas_write_index_scacq_screl
+ */
+uint64_t HSA_API HSA_DEPRECATED hsa_queue_cas_write_index_acq_rel(
+ const hsa_queue_t *queue,
+ uint64_t expected,
+ uint64_t value);
+
+/**
+ * @brief Atomically set the write index of a queue if the observed value is
+ * equal to the expected value. The application can inspect the returned value
+ * to determine if the replacement was done.
+ *
+ * @param[in] queue Pointer to a queue.
+ *
+ * @param[in] expected Expected value.
+ *
+ * @param[in] value Value to assign to the write index if @p expected matches
+ * the observed write index. Must be greater than @p expected.
+ *
+ * @return Previous value of the write index.
+ */
+uint64_t HSA_API hsa_queue_cas_write_index_scacq_screl(
+ const hsa_queue_t *queue,
+ uint64_t expected,
+ uint64_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_queue_cas_write_index_scacquire.
+ *
+ * @copydoc hsa_queue_cas_write_index_scacquire
+ */
+uint64_t HSA_API HSA_DEPRECATED hsa_queue_cas_write_index_acquire(
+ const hsa_queue_t *queue,
+ uint64_t expected,
+ uint64_t value);
+
+/**
+ * @copydoc hsa_queue_cas_write_index_scacq_screl
+ */
+uint64_t HSA_API hsa_queue_cas_write_index_scacquire(
+ const hsa_queue_t *queue,
+ uint64_t expected,
+ uint64_t value);
+
+/**
+ * @copydoc hsa_queue_cas_write_index_scacq_screl
+ */
+uint64_t HSA_API hsa_queue_cas_write_index_relaxed(
+ const hsa_queue_t *queue,
+ uint64_t expected,
+ uint64_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_queue_cas_write_index_screlease.
+ *
+ * @copydoc hsa_queue_cas_write_index_screlease
+ */
+uint64_t HSA_API HSA_DEPRECATED hsa_queue_cas_write_index_release(
+ const hsa_queue_t *queue,
+ uint64_t expected,
+ uint64_t value);
+
+/**
+ * @copydoc hsa_queue_cas_write_index_scacq_screl
+ */
+uint64_t HSA_API hsa_queue_cas_write_index_screlease(
+ const hsa_queue_t *queue,
+ uint64_t expected,
+ uint64_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_queue_add_write_index_scacq_screl.
+ *
+ * @copydoc hsa_queue_add_write_index_scacq_screl
+ */
+uint64_t HSA_API HSA_DEPRECATED hsa_queue_add_write_index_acq_rel(
+ const hsa_queue_t *queue,
+ uint64_t value);
+
+/**
+ * @brief Atomically increment the write index of a queue by an offset.
+ *
+ * @param[in] queue Pointer to a queue.
+ *
+ * @param[in] value Value to add to the write index.
+ *
+ * @return Previous value of the write index.
+ */
+uint64_t HSA_API hsa_queue_add_write_index_scacq_screl(
+ const hsa_queue_t *queue,
+ uint64_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_queue_add_write_index_scacquire.
+ *
+ * @copydoc hsa_queue_add_write_index_scacquire
+ */
+uint64_t HSA_API HSA_DEPRECATED hsa_queue_add_write_index_acquire(
+ const hsa_queue_t *queue,
+ uint64_t value);
+
+/**
+ * @copydoc hsa_queue_add_write_index_scacq_screl
+ */
+uint64_t HSA_API hsa_queue_add_write_index_scacquire(
+ const hsa_queue_t *queue,
+ uint64_t value);
+
+/**
+ * @copydoc hsa_queue_add_write_index_scacq_screl
+ */
+uint64_t HSA_API hsa_queue_add_write_index_relaxed(
+ const hsa_queue_t *queue,
+ uint64_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_queue_add_write_index_screlease.
+ *
+ * @copydoc hsa_queue_add_write_index_screlease
+ */
+uint64_t HSA_API HSA_DEPRECATED hsa_queue_add_write_index_release(
+ const hsa_queue_t *queue,
+ uint64_t value);
+
+/**
+ * @copydoc hsa_queue_add_write_index_scacq_screl
+ */
+uint64_t HSA_API hsa_queue_add_write_index_screlease(
+ const hsa_queue_t *queue,
+ uint64_t value);
+
+/**
+ * @brief Atomically set the read index of a queue.
+ *
+ * @details Modifications of the read index are not allowed and result in
+ * undefined behavior if the queue is associated with an agent for which
+ * only the corresponding packet processor is permitted to update the read
+ * index.
+ *
+ * @param[in] queue Pointer to a queue.
+ *
+ * @param[in] value Value to assign to the read index.
+ *
+ */
+void HSA_API hsa_queue_store_read_index_relaxed(
+ const hsa_queue_t *queue,
+ uint64_t value);
+
+/**
+ * @deprecated Renamed as ::hsa_queue_store_read_index_screlease.
+ *
+ * @copydoc hsa_queue_store_read_index_screlease
+ */
+void HSA_API HSA_DEPRECATED hsa_queue_store_read_index_release(
+ const hsa_queue_t *queue,
+ uint64_t value);
+
+/**
+ * @copydoc hsa_queue_store_read_index_relaxed
+ */
+void HSA_API hsa_queue_store_read_index_screlease(
+ const hsa_queue_t *queue,
+ uint64_t value);
+/** @} */
+
+
+/** \defgroup aql Architected Queuing Language
+ * @{
+ */
+
+/**
+ * @brief Packet type.
+ */
+typedef enum {
+ /**
+ * Vendor-specific packet.
+ */
+ HSA_PACKET_TYPE_VENDOR_SPECIFIC = 0,
+ /**
+ * The packet has been processed in the past, but has not been reassigned to
+ * the packet processor. A packet processor must not process a packet of this
+ * type. All queues support this packet type.
+ */
+ HSA_PACKET_TYPE_INVALID = 1,
+ /**
+ * Packet used by agents for dispatching jobs to kernel agents. Not all
+ * queues support packets of this type (see ::hsa_queue_feature_t).
+ */
+ HSA_PACKET_TYPE_KERNEL_DISPATCH = 2,
+ /**
+ * Packet used by agents to delay processing of subsequent packets, and to
+ * express complex dependencies between multiple packets. All queues support
+ * this packet type.
+ */
+ HSA_PACKET_TYPE_BARRIER_AND = 3,
+ /**
+ * Packet used by agents for dispatching jobs to agents. Not all
+ * queues support packets of this type (see ::hsa_queue_feature_t).
+ */
+ HSA_PACKET_TYPE_AGENT_DISPATCH = 4,
+ /**
+ * Packet used by agents to delay processing of subsequent packets, and to
+ * express complex dependencies between multiple packets. All queues support
+ * this packet type.
+ */
+ HSA_PACKET_TYPE_BARRIER_OR = 5
+} hsa_packet_type_t;
+
+/**
+ * @brief Scope of the memory fence operation associated with a packet.
+ */
+typedef enum {
+ /**
+ * No scope (no fence is applied). The packet relies on external fences to
+ * ensure visibility of memory updates.
+ */
+ HSA_FENCE_SCOPE_NONE = 0,
+ /**
+ * The fence is applied with agent scope for the global segment.
+ */
+ HSA_FENCE_SCOPE_AGENT = 1,
+ /**
+ * The fence is applied across both agent and system scope for the global
+ * segment.
+ */
+ HSA_FENCE_SCOPE_SYSTEM = 2
+} hsa_fence_scope_t;
+
+/**
+ * @brief Sub-fields of the @a header field that is present in any AQL
+ * packet. The offset (with respect to the address of @a header) of a sub-field
+ * is identical to its enumeration constant. The width of each sub-field is
+ * determined by the corresponding value in ::hsa_packet_header_width_t. The
+ * offset and the width are expressed in bits.
+ *
+ * Combining the offsets below with the widths in ::hsa_packet_header_width_t
+ * gives the following layout: packet type in bits 0-7, barrier in bit 8,
+ * acquire fence scope in bits 9-10, and release fence scope in bits 11-12.
+ */
+ typedef enum {
+ /**
+ * Packet type. The value of this sub-field must be one of
+ * ::hsa_packet_type_t. If the type is ::HSA_PACKET_TYPE_VENDOR_SPECIFIC, the
+ * packet layout is vendor-specific.
+ */
+ HSA_PACKET_HEADER_TYPE = 0,
+ /**
+ * Barrier bit. If the barrier bit is set, the processing of the current
+ * packet only launches when all preceding packets (within the same queue) are
+ * complete.
+ */
+ HSA_PACKET_HEADER_BARRIER = 8,
+ /**
+ * Acquire fence scope. The value of this sub-field determines the scope and
+ * type of the memory fence operation applied before the packet enters the
+ * active phase. An acquire fence ensures that any subsequent global segment
+ * or image loads by any unit of execution that belongs to a dispatch that has
+ * not yet entered the active phase on any queue of the same kernel agent,
+ * sees any data previously released at the scopes specified by the acquire
+ * fence. The value of this sub-field must be one of ::hsa_fence_scope_t.
+ */
+ HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE = 9,
+ /**
+ * @deprecated Renamed as ::HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE.
+ */
+ HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE = 9,
+ /**
+ * Release fence scope. The value of this sub-field determines the scope and
+ * type of the memory fence operation applied after kernel completion but
+ * before the packet is completed. A release fence makes any global segment or
+ * image data that was stored by any unit of execution that belonged to a
+ * dispatch that has completed the active phase on any queue of the same
+ * kernel agent visible in all the scopes specified by the release fence. The
+ * value of this sub-field must be one of ::hsa_fence_scope_t.
+ */
+ HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE = 11,
+ /**
+ * @deprecated Renamed as ::HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE.
+ */
+ HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE = 11
+ } hsa_packet_header_t;
+
+/**
+ * @brief Width (in bits) of the sub-fields in ::hsa_packet_header_t.
+ *
+ * Each enumerator gives the width of the ::hsa_packet_header_t sub-field with
+ * the matching name: 8 bits for the packet type, 1 bit for the barrier bit,
+ * and 2 bits for each of the acquire and release fence scopes.
+ */
+ typedef enum {
+ HSA_PACKET_HEADER_WIDTH_TYPE = 8,
+ HSA_PACKET_HEADER_WIDTH_BARRIER = 1,
+ HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE = 2,
+ /**
+ * @deprecated Use HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE.
+ */
+ HSA_PACKET_HEADER_WIDTH_ACQUIRE_FENCE_SCOPE = 2,
+ HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE = 2,
+ /**
+ * @deprecated Use HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE.
+ */
+ HSA_PACKET_HEADER_WIDTH_RELEASE_FENCE_SCOPE = 2
+ } hsa_packet_header_width_t;
+
+/**
+ * @brief Sub-fields of the kernel dispatch packet @a setup field. The offset
+ * (with respect to the address of @a setup) of a sub-field is identical to its
+ * enumeration constant. The width of each sub-field is determined by the
+ * corresponding value in ::hsa_kernel_dispatch_packet_setup_width_t. The
+ * offset and the width are expressed in bits.
+ */
+ typedef enum {
+ /**
+ * Number of dimensions of the grid. Valid values are 1, 2, or 3. The
+ * sub-field starts at bit 0 of @a setup and is
+ * ::HSA_KERNEL_DISPATCH_PACKET_SETUP_WIDTH_DIMENSIONS (2) bits wide.
+ */
+ HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS = 0
+ } hsa_kernel_dispatch_packet_setup_t;
+
+/**
+ * @brief Width (in bits) of the sub-fields in
+ * ::hsa_kernel_dispatch_packet_setup_t.
+ */
+ typedef enum {
+ HSA_KERNEL_DISPATCH_PACKET_SETUP_WIDTH_DIMENSIONS = 2
+ } hsa_kernel_dispatch_packet_setup_width_t;
+
+/**
+ * @brief AQL kernel dispatch packet
+ */
+typedef struct hsa_kernel_dispatch_packet_s {
+ /**
+ * Packet header. Used to configure multiple packet parameters such as the
+ * packet type. The parameters are described by ::hsa_packet_header_t.
+ */
+ uint16_t header;
+
+ /**
+ * Dispatch setup parameters. Used to configure kernel dispatch parameters
+ * such as the number of dimensions in the grid. The parameters are described
+ * by ::hsa_kernel_dispatch_packet_setup_t.
+ */
+ uint16_t setup;
+
+ /**
+ * X dimension of work-group, in work-items. Must be greater than 0.
+ */
+ uint16_t workgroup_size_x;
+
+ /**
+ * Y dimension of work-group, in work-items. Must be greater than
+ * 0. If the grid has 1 dimension, the only valid value is 1.
+ */
+ uint16_t workgroup_size_y;
+
+ /**
+ * Z dimension of work-group, in work-items. Must be greater than
+ * 0. If the grid has 1 or 2 dimensions, the only valid value is 1.
+ */
+ uint16_t workgroup_size_z;
+
+ /**
+ * Reserved. Must be 0.
+ */
+ uint16_t reserved0;
+
+ /**
+ * X dimension of grid, in work-items. Must be greater than 0. Must
+ * not be smaller than @a workgroup_size_x.
+ */
+ uint32_t grid_size_x;
+
+ /**
+ * Y dimension of grid, in work-items. Must be greater than 0. If the grid has
+ * 1 dimension, the only valid value is 1. Must not be smaller than @a
+ * workgroup_size_y.
+ */
+ uint32_t grid_size_y;
+
+ /**
+ * Z dimension of grid, in work-items. Must be greater than 0. If the grid has
+ * 1 or 2 dimensions, the only valid value is 1. Must not be smaller than @a
+ * workgroup_size_z.
+ */
+ uint32_t grid_size_z;
+
+ /**
+ * Size in bytes of private memory allocation request (per work-item).
+ */
+ uint32_t private_segment_size;
+
+ /**
+ * Size in bytes of group memory allocation request (per work-group). Must not
+ * be less than the sum of the group memory used by the kernel (and the
+ * functions it calls directly or indirectly) and the dynamically allocated
+ * group segment variables.
+ */
+ uint32_t group_segment_size;
+
+ /**
+ * Opaque handle to a code object that includes an implementation-defined
+ * executable code for the kernel.
+ */
+ uint64_t kernel_object;
+
+#ifdef HSA_LARGE_MODEL
+ void* kernarg_address;
+#elif defined HSA_LITTLE_ENDIAN
+ /**
+ * Pointer to a buffer containing the kernel arguments. May be NULL.
+ *
+ * The buffer must be allocated using ::hsa_memory_allocate, and must not be
+ * modified once the kernel dispatch packet is enqueued until the dispatch has
+ * completed execution.
+ *
+ * The declaration depends on the build configuration: with HSA_LARGE_MODEL
+ * defined the pointer is declared on its own; otherwise it is paired with
+ * the 32-bit @a reserved1 field, which follows the pointer when
+ * HSA_LITTLE_ENDIAN is defined and precedes it when it is not.
+ */
+ void* kernarg_address;
+ /**
+ * Reserved. Must be 0.
+ */
+ uint32_t reserved1;
+#else
+ uint32_t reserved1;
+ void* kernarg_address;
+#endif
+
+ /**
+ * Reserved. Must be 0.
+ */
+ uint64_t reserved2;
+
+ /**
+ * Signal used to indicate completion of the job. The application can use the
+ * special signal handle 0 to indicate that no signal is used.
+ */
+ hsa_signal_t completion_signal;
+
+} hsa_kernel_dispatch_packet_t;
+
+/**
+ * @brief Agent dispatch packet.
+ */
+typedef struct hsa_agent_dispatch_packet_s {
+ /**
+ * Packet header. Used to configure multiple packet parameters such as the
+ * packet type. The parameters are described by ::hsa_packet_header_t.
+ */
+ uint16_t header;
+
+ /**
+ * Application-defined function to be performed by the destination agent.
+ */
+ uint16_t type;
+
+ /**
+ * Reserved. Must be 0.
+ */
+ uint32_t reserved0;
+
+#ifdef HSA_LARGE_MODEL
+ void* return_address;
+#elif defined HSA_LITTLE_ENDIAN
+ /**
+ * Address where to store the function return values, if any.
+ *
+ * The declaration depends on the build configuration: with HSA_LARGE_MODEL
+ * defined the pointer is declared on its own; otherwise it is paired with
+ * the 32-bit @a reserved1 field, which follows the pointer when
+ * HSA_LITTLE_ENDIAN is defined and precedes it when it is not.
+ */
+ void* return_address;
+ /**
+ * Reserved. Must be 0.
+ */
+ uint32_t reserved1;
+#else
+ uint32_t reserved1;
+ void* return_address;
+#endif
+
+ /**
+ * Function arguments.
+ */
+ uint64_t arg[4];
+
+ /**
+ * Reserved. Must be 0.
+ */
+ uint64_t reserved2;
+
+ /**
+ * Signal used to indicate completion of the job. The application can use the
+ * special signal handle 0 to indicate that no signal is used.
+ */
+ hsa_signal_t completion_signal;
+
+} hsa_agent_dispatch_packet_t;
+
+/**
+ * @brief Barrier-AND packet.
+ */
+typedef struct hsa_barrier_and_packet_s {
+ /**
+ * Packet header. Used to configure multiple packet parameters such as the
+ * packet type. The parameters are described by ::hsa_packet_header_t.
+ */
+ uint16_t header;
+
+ /**
+ * Reserved. Must be 0.
+ */
+ uint16_t reserved0;
+
+ /**
+ * Reserved. Must be 0.
+ */
+ uint32_t reserved1;
+
+ /**
+ * Array of dependent signal objects. Signals with a handle value of 0 are
+ * allowed and are interpreted by the packet processor as satisfied
+ * dependencies.
+ */
+ hsa_signal_t dep_signal[5];
+
+ /**
+ * Reserved. Must be 0.
+ */
+ uint64_t reserved2;
+
+ /**
+ * Signal used to indicate completion of the job. The application can use the
+ * special signal handle 0 to indicate that no signal is used.
+ */
+ hsa_signal_t completion_signal;
+
+} hsa_barrier_and_packet_t;
+
+/**
+ * @brief Barrier-OR packet.
+ */
+typedef struct hsa_barrier_or_packet_s {
+ /**
+ * Packet header. Used to configure multiple packet parameters such as the
+ * packet type. The parameters are described by ::hsa_packet_header_t.
+ */
+ uint16_t header;
+
+ /**
+ * Reserved. Must be 0.
+ */
+ uint16_t reserved0;
+
+ /**
+ * Reserved. Must be 0.
+ */
+ uint32_t reserved1;
+
+ /**
+ * Array of dependent signal objects. Signals with a handle value of 0 are
+ * allowed and are interpreted by the packet processor as dependencies not
+ * satisfied.
+ */
+ hsa_signal_t dep_signal[5];
+
+ /**
+ * Reserved. Must be 0.
+ */
+ uint64_t reserved2;
+
+ /**
+ * Signal used to indicate completion of the job. The application can use the
+ * special signal handle 0 to indicate that no signal is used.
+ */
+ hsa_signal_t completion_signal;
+
+} hsa_barrier_or_packet_t;
+
+/** @} */
+
+/** \addtogroup memory Memory
+ * @{
+ */
+
+/**
+ * @brief Memory segments associated with a region.
+ */
+typedef enum {
+ /**
+ * Global segment. Used to hold data that is shared by all agents.
+ */
+ HSA_REGION_SEGMENT_GLOBAL = 0,
+ /**
+ * Read-only segment. Used to hold data that remains constant during the
+ * execution of a kernel.
+ */
+ HSA_REGION_SEGMENT_READONLY = 1,
+ /**
+ * Private segment. Used to hold data that is local to a single work-item.
+ */
+ HSA_REGION_SEGMENT_PRIVATE = 2,
+ /**
+ * Group segment. Used to hold data that is shared by the work-items of a
+ * work-group.
+ */
+ HSA_REGION_SEGMENT_GROUP = 3,
+ /**
+ * Kernarg segment. Used to store kernel arguments.
+ */
+ HSA_REGION_SEGMENT_KERNARG = 4
+} hsa_region_segment_t;
+
+/**
+ * @brief Global region flags.
+ */
+typedef enum {
+ /**
+ * The application can use memory in the region to store kernel arguments, and
+ * provide the values for the kernarg segment of a kernel dispatch. If this
+ * flag is set, then ::HSA_REGION_GLOBAL_FLAG_FINE_GRAINED must be set.
+ */
+ HSA_REGION_GLOBAL_FLAG_KERNARG = 1,
+ /**
+ * Updates to memory in this region are immediately visible to all the
+ * agents under the terms of the HSA memory model. If this
+ * flag is set, then ::HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED must not be set.
+ */
+ HSA_REGION_GLOBAL_FLAG_FINE_GRAINED = 2,
+ /**
+ * Updates to memory in this region can be performed by a single agent at
+ * a time. If a different agent in the system is allowed to access the
+ * region, the application must explicitly invoke ::hsa_memory_assign_agent
+ * in order to transfer ownership to that agent for a particular buffer.
+ */
+ HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED = 4
+} hsa_region_global_flag_t;
+
+/**
+ * @brief Attributes of a memory region.
+ *
+ * Note that the enumerator values do not follow declaration order:
+ * ::HSA_REGION_INFO_ALLOC_MAX_SIZE is 4 and
+ * ::HSA_REGION_INFO_ALLOC_MAX_PRIVATE_WORKGROUP_SIZE is 8, while the runtime
+ * allocation attributes declared after them use 5, 6 and 7. Always use the
+ * symbolic names rather than assuming sequential values.
+ */
+typedef enum {
+ /**
+ * Segment where memory in the region can be used. The type of this
+ * attribute is ::hsa_region_segment_t.
+ */
+ HSA_REGION_INFO_SEGMENT = 0,
+ /**
+ * Flag mask. The value of this attribute is undefined if the value of
+ * ::HSA_REGION_INFO_SEGMENT is not ::HSA_REGION_SEGMENT_GLOBAL. The type of
+ * this attribute is uint32_t, a bit-field of ::hsa_region_global_flag_t
+ * values.
+ */
+ HSA_REGION_INFO_GLOBAL_FLAGS = 1,
+ /**
+ * Size of this region, in bytes. The type of this attribute is size_t.
+ */
+ HSA_REGION_INFO_SIZE = 2,
+ /**
+ * Maximum allocation size in this region, in bytes. Must not exceed the value
+ * of ::HSA_REGION_INFO_SIZE. The type of this attribute is size_t.
+ *
+ * If the region is in the global or readonly segments, this is the maximum
+ * size that the application can pass to ::hsa_memory_allocate.
+ *
+ * If the region is in the group segment, this is the maximum size (per
+ * work-group) that can be requested for a given kernel dispatch. If the
+ * region is in the private segment, this is the maximum size (per work-item)
+ * that can be requested for a specific kernel dispatch, and must be at least
+ * 256 bytes.
+ */
+ HSA_REGION_INFO_ALLOC_MAX_SIZE = 4,
+ /**
+ * Maximum size (per work-group) of private memory that can be requested for a
+ * specific kernel dispatch. Must be at least 65536 bytes. The type of this
+ * attribute is uint32_t. The value of this attribute is undefined if the
+ * region is not in the private segment.
+ */
+ HSA_REGION_INFO_ALLOC_MAX_PRIVATE_WORKGROUP_SIZE = 8,
+ /**
+ * Indicates whether memory in this region can be allocated using
+ * ::hsa_memory_allocate. The type of this attribute is bool.
+ *
+ * The value of this flag is always false for regions in the group and private
+ * segments.
+ */
+ HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED = 5,
+ /**
+ * Allocation granularity of buffers allocated by ::hsa_memory_allocate in
+ * this region. The size of a buffer allocated in this region is a multiple of
+ * the value of this attribute. The value of this attribute is only defined if
+ * ::HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED is true for this region. The type
+ * of this attribute is size_t.
+ */
+ HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE = 6,
+ /**
+ * Alignment of buffers allocated by ::hsa_memory_allocate in this region. The
+ * value of this attribute is only defined if
+ * ::HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED is true for this region, and must be
+ * a power of 2. The type of this attribute is size_t.
+ */
+ HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT = 7
+} hsa_region_info_t;
+
+/**
+ * @brief Get the current value of an attribute of a region.
+ *
+ * @param[in] region A valid region.
+ *
+ * @param[in] attribute Attribute to query.
+ *
+ * @param[out] value Pointer to an application-allocated buffer where to store
+ * the value of the attribute. If the buffer passed by the application is not
+ * large enough to hold the value of @p attribute, the behavior is undefined.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_REGION The region is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid
+ * region attribute, or @p value is NULL.
+ */
+hsa_status_t HSA_API hsa_region_get_info(
+ hsa_region_t region,
+ hsa_region_info_t attribute,
+ void* value);
+
+/**
+ * @brief Iterate over the memory regions associated with a given agent, and
+ * invoke an application-defined callback on every iteration.
+ *
+ * @param[in] agent A valid agent.
+ *
+ * @param[in] callback Callback to be invoked once per region that is
+ * accessible from the agent. The HSA runtime passes two arguments to the
+ * callback, the region and the application data. If @p callback returns a
+ * status other than ::HSA_STATUS_SUCCESS for a particular iteration, the
+ * traversal stops and ::hsa_agent_iterate_regions returns that status value.
+ *
+ * @param[in] data Application data that is passed to @p callback on every
+ * iteration. May be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL.
+ */
+hsa_status_t HSA_API hsa_agent_iterate_regions(
+ hsa_agent_t agent,
+ hsa_status_t (*callback)(hsa_region_t region, void* data),
+ void* data);
+
+/**
+ * @brief Allocate a block of memory in a given region.
+ *
+ * @param[in] region Region where to allocate memory from. The region must have
+ * the ::HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED flag set.
+ *
+ * @param[in] size Allocation size, in bytes. Must not be zero. This value is
+ * rounded up to the nearest multiple of ::HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE
+ * in @p region.
+ *
+ * @param[out] ptr Pointer to the location where to store the base address of
+ * the allocated block. The returned base address is aligned to the value of
+ * ::HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT in @p region. If the allocation
+ * fails, the returned value is undefined.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate
+ * the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_REGION The region is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION The host is not allowed to
+ * allocate memory in @p region, or @p size is greater than the value of
+ * ::HSA_REGION_INFO_ALLOC_MAX_SIZE in @p region.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p ptr is NULL, or @p size is 0.
+ */
+hsa_status_t HSA_API hsa_memory_allocate(hsa_region_t region,
+ size_t size,
+ void** ptr);
+
+/**
+ * @brief Deallocate a block of memory previously allocated using
+ * ::hsa_memory_allocate.
+ *
+ * @param[in] ptr Pointer to a memory block. If @p ptr does not match a value
+ * previously returned by ::hsa_memory_allocate, the behavior is undefined.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ */
+hsa_status_t HSA_API hsa_memory_free(void* ptr);
+
+/**
+ * @brief Copy a block of memory from the location pointed to by @p src to the
+ * memory block pointed to by @p dst.
+ *
+ * @param[out] dst Buffer where the content is to be copied. If @p dst is in
+ * coarse-grained memory, the copied data is only visible to the agent currently
+ * assigned (::hsa_memory_assign_agent) to @p dst.
+ *
+ * @param[in] src A valid pointer to the source of data to be copied. The source
+ * buffer must not overlap with the destination buffer. If the source buffer is
+ * in coarse-grained memory then it must be assigned to an agent, from which the
+ * data will be retrieved.
+ *
+ * @param[in] size Number of bytes to copy. If @p size is 0, no copy is
+ * performed and the function returns success. Copying a number of bytes larger
+ * than the size of the buffers pointed to by @p dst or @p src results in
+ * undefined behavior.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT The source or destination
+ * pointers are NULL.
+ */
+hsa_status_t HSA_API hsa_memory_copy(
+ void *dst,
+ const void *src,
+ size_t size);
+
+/**
+ * @brief Change the ownership of a global, coarse-grained buffer.
+ *
+ * @details The contents of a coarse-grained buffer are visible to an agent
+ * only after ownership has been explicitly transferred to that agent. Once the
+ * operation completes, the previous owner can no longer access the data in the
+ * buffer.
+ *
+ * An implementation of the HSA runtime is allowed, but not required, to change
+ * the physical location of the buffer when ownership is transferred to a
+ * different agent. In general the application must not assume this
+ * behavior. The virtual location (address) of the passed buffer is never
+ * modified.
+ *
+ * @param[in] ptr Base address of a global buffer. The pointer must match an
+ * address previously returned by ::hsa_memory_allocate. The size of the buffer
+ * affected by the ownership change is identical to the size of that previous
+ * allocation. If @p ptr points to a fine-grained global buffer, no operation is
+ * performed and the function returns success. If @p ptr does not point to
+ * global memory, the behavior is undefined.
+ *
+ * @param[in] agent Agent that becomes the owner of the buffer. The
+ * application is responsible for ensuring that @p agent has access to the
+ * region that contains the buffer. It is allowed to change ownership to an
+ * agent that is already the owner of the buffer, with the same or different
+ * access permissions.
+ *
+ * @param[in] access Access permissions requested for the new owner.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate
+ * the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p ptr is NULL, or @p access is
+ * not a valid access value.
+ */
+hsa_status_t HSA_API hsa_memory_assign_agent(
+ void *ptr,
+ hsa_agent_t agent,
+ hsa_access_permission_t access);
+
+/**
+ *
+ * @brief Register a global, fine-grained buffer.
+ *
+ * @details Registering a buffer serves as an indication to the HSA runtime that
+ * the memory might be accessed from a kernel agent other than the
+ * host. Registration is a performance hint that allows the HSA runtime
+ * implementation to know which buffers will be accessed by some of the kernel
+ * agents ahead of time.
+ *
+ * Registration is only recommended for buffers in the global segment that have
+ * not been allocated using the HSA allocator (::hsa_memory_allocate), but an OS
+ * allocator instead. Registering an OS-allocated buffer in the base profile is
+ * equivalent to a no-op.
+ *
+ * Registrations should not overlap.
+ *
+ * @param[in] ptr A buffer in global, fine-grained memory. If a NULL pointer is
+ * passed, no operation is performed. If the buffer has been allocated using
+ * ::hsa_memory_allocate, or has already been registered, no operation is
+ * performed.
+ *
+ * @param[in] size Requested registration size in bytes. A size of 0 is
+ * only allowed if @p ptr is NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate
+ * the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p size is 0 but @p ptr
+ * is not NULL.
+ */
+hsa_status_t HSA_API hsa_memory_register(
+ void *ptr,
+ size_t size);
+
+/**
+ *
+ * @brief Deregister memory previously registered using ::hsa_memory_register.
+ *
+ * @details If the memory interval being deregistered does not match a previous
+ * registration (start and end addresses), the behavior is undefined.
+ *
+ * @param[in] ptr A pointer to the base of the buffer to be deregistered. If
+ * a NULL pointer is passed, no operation is performed.
+ *
+ * @param[in] size Size of the buffer to be deregistered.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ */
+hsa_status_t HSA_API hsa_memory_deregister(
+ void *ptr,
+ size_t size);
+
+/** @} */
+
+
+/** \defgroup instruction-set-architecture Instruction Set Architecture.
+ * @{
+ */
+
+/**
+ * @brief Instruction set architecture.
+ */
+typedef struct hsa_isa_s {
+ /**
+ * Opaque handle. Two handles reference the same object of the enclosing type
+ * if and only if they are equal.
+ */
+ uint64_t handle;
+} hsa_isa_t;
+
+/**
+ * @brief Retrieve a reference to an instruction set architecture handle out of
+ * a symbolic name.
+ *
+ * @param[in] name Vendor-specific name associated with a particular
+ * instruction set architecture. @p name must start with the vendor name and a
+ * colon (for example, "AMD:"). The rest of the name is vendor-specific. Must be
+ * a NUL-terminated string.
+ *
+ * @param[out] isa Memory location where the HSA runtime stores the ISA handle
+ * corresponding to the given name. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ISA_NAME The given name does not
+ * correspond to any instruction set architecture.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to
+ * allocate the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p name is NULL, or @p isa is
+ * NULL.
+ */
+hsa_status_t HSA_API hsa_isa_from_name(
+ const char *name,
+ hsa_isa_t *isa);
+
+/**
+ * @brief Iterate over the instruction sets supported by the given agent, and
+ * invoke an application-defined callback on every iteration. The iterator is
+ * deterministic: if an agent supports several instruction set architectures,
+ * they are traversed in the same order in every invocation of this function.
+ *
+ * @param[in] agent A valid agent.
+ *
+ * @param[in] callback Callback to be invoked once per instruction set
+ * architecture. The HSA runtime passes two arguments to the callback: the
+ * ISA and the application data. If @p callback returns a status other than
+ * ::HSA_STATUS_SUCCESS for a particular iteration, the traversal stops and
+ * that status value is returned.
+ *
+ * @param[in] data Application data that is passed to @p callback on every
+ * iteration. May be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL.
+ */
+hsa_status_t HSA_API hsa_agent_iterate_isas(
+ hsa_agent_t agent,
+ hsa_status_t (*callback)(hsa_isa_t isa, void *data),
+ void *data);
+
+/**
+ * @brief Instruction set architecture attributes.
+ */
+typedef enum {
+ /**
+ * The length of the ISA name in bytes, not including the NUL terminator. The
+ * type of this attribute is uint32_t.
+ */
+ HSA_ISA_INFO_NAME_LENGTH = 0,
+ /**
+ * Human-readable description. The type of this attribute is character array
+ * with the length equal to the value of ::HSA_ISA_INFO_NAME_LENGTH attribute.
+ */
+ HSA_ISA_INFO_NAME = 1,
+ /**
+ * @deprecated
+ *
+ * Number of call conventions supported by the instruction set architecture.
+ * Must be greater than zero. The type of this attribute is uint32_t.
+ */
+ HSA_ISA_INFO_CALL_CONVENTION_COUNT = 2,
+ /**
+ * @deprecated
+ *
+ * Number of work-items in a wavefront for a given call convention. Must be a
+ * power of 2 in the range [1,256]. The type of this attribute is uint32_t.
+ */
+ HSA_ISA_INFO_CALL_CONVENTION_INFO_WAVEFRONT_SIZE = 3,
+ /**
+ * @deprecated
+ *
+ * Number of wavefronts per compute unit for a given call convention. In
+ * practice, other factors (for example, the amount of group memory used by a
+ * work-group) may further limit the number of wavefronts per compute
+ * unit. The type of this attribute is uint32_t.
+ */
+ HSA_ISA_INFO_CALL_CONVENTION_INFO_WAVEFRONTS_PER_COMPUTE_UNIT = 4,
+ /**
+ * Machine models supported by the instruction set architecture. The type of
+ * this attribute is a bool[2]. If the ISA supports the small machine model,
+ * the element at index ::HSA_MACHINE_MODEL_SMALL is true. If the ISA supports
+ * the large model, the element at index ::HSA_MACHINE_MODEL_LARGE is true.
+ */
+ HSA_ISA_INFO_MACHINE_MODELS = 5,
+ /**
+ * Profiles supported by the instruction set architecture. The type of this
+ * attribute is a bool[2]. If the ISA supports the base profile, the element
+ * at index ::HSA_PROFILE_BASE is true. If the ISA supports the full profile,
+ * the element at index ::HSA_PROFILE_FULL is true.
+ */
+ HSA_ISA_INFO_PROFILES = 6,
+ /**
+ * Default floating-point rounding modes supported by the instruction set
+ * architecture. The type of this attribute is a bool[3]. The value at a given
+ * index is true if the corresponding rounding mode in
+ * ::hsa_default_float_rounding_mode_t is supported. At least one default mode
+ * has to be supported.
+ *
+ * If the default mode is supported, then
+ * ::HSA_ISA_INFO_BASE_PROFILE_DEFAULT_FLOAT_ROUNDING_MODES must report that
+ * both the zero and the near rounding modes are supported.
+ */
+ HSA_ISA_INFO_DEFAULT_FLOAT_ROUNDING_MODES = 7,
+ /**
+ * Default floating-point rounding modes supported by the instruction set
+ * architecture in the Base profile. The type of this attribute is a
+ * bool[3]. The value at a given index is true if the corresponding rounding
+ * mode in ::hsa_default_float_rounding_mode_t is supported. The value at
+ * index HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT must be false. At least one
+ * of the values at indexes ::HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO or
+ * HSA_DEFAULT_FLOAT_ROUNDING_MODE_NEAR must be true.
+ */
+ HSA_ISA_INFO_BASE_PROFILE_DEFAULT_FLOAT_ROUNDING_MODES = 8,
+ /**
+ * Flag indicating that the f16 HSAIL operation is at least as fast as the
+ * f32 operation in the instruction set architecture. The type of this
+ * attribute is bool.
+ */
+ HSA_ISA_INFO_FAST_F16_OPERATION = 9,
+ /**
+ * Maximum number of work-items of each dimension of a work-group. Each
+ * maximum must be greater than 0. No maximum can exceed the value of
+ * ::HSA_ISA_INFO_WORKGROUP_MAX_SIZE. The type of this attribute is
+ * uint16_t[3].
+ */
+ HSA_ISA_INFO_WORKGROUP_MAX_DIM = 12,
+ /**
+ * Maximum total number of work-items in a work-group. The type
+ * of this attribute is uint32_t.
+ */
+ HSA_ISA_INFO_WORKGROUP_MAX_SIZE = 13,
+ /**
+ * Maximum number of work-items of each dimension of a grid. Each maximum must
+ * be greater than 0, and must not be smaller than the corresponding value in
+ * ::HSA_ISA_INFO_WORKGROUP_MAX_DIM. No maximum can exceed the value of
+ * ::HSA_ISA_INFO_GRID_MAX_SIZE. The type of this attribute is
+ * ::hsa_dim3_t.
+ */
+ HSA_ISA_INFO_GRID_MAX_DIM = 14,
+ /**
+ * Maximum total number of work-items in a grid. The type of this
+ * attribute is uint64_t.
+ */
+ HSA_ISA_INFO_GRID_MAX_SIZE = 16,
+ /**
+ * Maximum number of fbarriers per work-group. Must be at least 32. The
+ * type of this attribute is uint32_t.
+ */
+ HSA_ISA_INFO_FBARRIER_MAX_SIZE = 17
+} hsa_isa_info_t;
+
+/**
+ * @deprecated The concept of call convention has been deprecated. If the
+ * application wants to query the value of an attribute for a given instruction
+ * set architecture, use ::hsa_isa_get_info_alt instead. If the application
+ * wants to query an attribute that is specific to a given combination of ISA
+ * and wavefront, use ::hsa_wavefront_get_info.
+ *
+ * @brief Get the current value of an attribute for a given instruction set
+ * architecture (ISA).
+ *
+ * @param[in] isa A valid instruction set architecture.
+ *
+ * @param[in] attribute Attribute to query.
+ *
+ * @param[in] index Call convention index. Used only for call convention
+ * attributes, otherwise ignored. Must have a value between 0 (inclusive) and
+ * the value of the attribute ::HSA_ISA_INFO_CALL_CONVENTION_COUNT (not
+ * inclusive) in @p isa.
+ *
+ * @param[out] value Pointer to an application-allocated buffer where to store
+ * the value of the attribute. If the buffer passed by the application is not
+ * large enough to hold the value of @p attribute, the behavior is undefined.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ISA The instruction set architecture is
+ * invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_INDEX The index is out of range.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid
+ * instruction set architecture attribute, or @p value is
+ * NULL.
+ */
+hsa_status_t HSA_API HSA_DEPRECATED hsa_isa_get_info(
+ hsa_isa_t isa,
+ hsa_isa_info_t attribute,
+ uint32_t index,
+ void *value);
+
+/**
+ * @brief Get the current value of an attribute for a given instruction set
+ * architecture (ISA).
+ *
+ * @param[in] isa A valid instruction set architecture.
+ *
+ * @param[in] attribute Attribute to query.
+ *
+ * @param[out] value Pointer to an application-allocated buffer where to store
+ * the value of the attribute. If the buffer passed by the application is not
+ * large enough to hold the value of @p attribute, the behavior is undefined.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ISA The instruction set architecture is
+ * invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid
+ * instruction set architecture attribute, or @p value is
+ * NULL.
+ */
+hsa_status_t HSA_API hsa_isa_get_info_alt(
+ hsa_isa_t isa,
+ hsa_isa_info_t attribute,
+ void *value);
+
+/**
+ * @brief Retrieve the exception policy support for a given combination of
+ * instruction set architecture and profile.
+ *
+ * @param[in] isa A valid instruction set architecture.
+ *
+ * @param[in] profile Profile.
+ *
+ * @param[out] mask Pointer to a memory location where the HSA runtime stores a
+ * mask of ::hsa_exception_policy_t values. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ISA The instruction set architecture is
+ * invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p profile is not a valid
+ * profile, or @p mask is NULL.
+ */
+hsa_status_t HSA_API hsa_isa_get_exception_policies(
+ hsa_isa_t isa,
+ hsa_profile_t profile,
+ uint16_t *mask);
+
+/**
+ * @brief Floating-point types.
+ */
+typedef enum {
+ /**
+ * 16-bit floating-point type.
+ */
+ HSA_FP_TYPE_16 = 1,
+ /**
+ * 32-bit floating-point type.
+ */
+ HSA_FP_TYPE_32 = 2,
+ /**
+ * 64-bit floating-point type.
+ */
+ HSA_FP_TYPE_64 = 4
+} hsa_fp_type_t;
+
+/**
+ * @brief Flush to zero modes.
+ */
+typedef enum {
+ /**
+ * Flush to zero.
+ */
+ HSA_FLUSH_MODE_FTZ = 1,
+ /**
+ * Do not flush to zero.
+ */
+ HSA_FLUSH_MODE_NON_FTZ = 2
+} hsa_flush_mode_t;
+
+/**
+ * @brief Round methods.
+ */
+typedef enum {
+ /**
+ * Single round method.
+ */
+ HSA_ROUND_METHOD_SINGLE = 1,
+ /**
+ * Double round method.
+ */
+ HSA_ROUND_METHOD_DOUBLE = 2
+} hsa_round_method_t;
+
+/**
+ * @brief Retrieve the round method (single or double) used to implement the
+ * floating-point multiply add instruction (mad) for a given combination of
+ * instruction set architecture, floating-point type, and flush to zero
+ * modifier.
+ *
+ * @param[in] isa Instruction set architecture.
+ *
+ * @param[in] fp_type Floating-point type.
+ *
+ * @param[in] flush_mode Flush to zero modifier.
+ *
+ * @param[out] round_method Pointer to a memory location where the HSA
+ * runtime stores the round method used by the implementation. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ISA The instruction set architecture is
+ * invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p fp_type is not a valid
+ * floating-point type, or @p flush_mode is not a valid flush to zero modifier,
+ * or @p round_method is NULL.
+ */
+hsa_status_t HSA_API hsa_isa_get_round_method(
+ hsa_isa_t isa,
+ hsa_fp_type_t fp_type,
+ hsa_flush_mode_t flush_mode,
+ hsa_round_method_t *round_method);
+
+/**
+ * @brief Wavefront handle.
+ */
+typedef struct hsa_wavefront_s {
+ /**
+ * Opaque handle. Two handles reference the same object of the enclosing type
+ * if and only if they are equal.
+ */
+ uint64_t handle;
+} hsa_wavefront_t;
+
+/**
+ * @brief Wavefront attributes.
+ */
+typedef enum {
+ /**
+ * Number of work-items in the wavefront. Must be a power of 2 in the range
+ * [1,256]. The type of this attribute is uint32_t.
+ */
+ HSA_WAVEFRONT_INFO_SIZE = 0
+} hsa_wavefront_info_t;
+
+/**
+ * @brief Get the current value of a wavefront attribute.
+ *
+ * @param[in] wavefront A wavefront.
+ *
+ * @param[in] attribute Attribute to query.
+ *
+ * @param[out] value Pointer to an application-allocated buffer where to store
+ * the value of the attribute. If the buffer passed by the application is not
+ * large enough to hold the value of @p attribute, the behavior is undefined.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_WAVEFRONT The wavefront is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid
+ * wavefront attribute, or @p value is NULL.
+ */
+hsa_status_t HSA_API hsa_wavefront_get_info(
+ hsa_wavefront_t wavefront,
+ hsa_wavefront_info_t attribute,
+ void *value);
+
+/**
+ * @brief Iterate over the different wavefronts supported by an instruction set
+ * architecture, and invoke an application-defined callback on every iteration.
+ *
+ * @param[in] isa Instruction set architecture.
+ *
+ * @param[in] callback Callback to be invoked once per wavefront that is
+ * supported by the instruction set architecture. The HSA runtime passes two
+ * arguments to the callback: the wavefront handle and the application data. If
+ * @p callback returns a status other than ::HSA_STATUS_SUCCESS for a particular
+ * iteration, the traversal stops and that value is returned.
+ *
+ * @param[in] data Application data that is passed to @p callback on every
+ * iteration. May be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ISA The instruction set architecture is
+ * invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL.
+ */
+hsa_status_t HSA_API hsa_isa_iterate_wavefronts(
+ hsa_isa_t isa,
+ hsa_status_t (*callback)(hsa_wavefront_t wavefront, void *data),
+ void *data);
+
+/**
+ * @deprecated Use ::hsa_agent_iterate_isas to query which instruction set
+ * architectures are supported by a given agent.
+ *
+ * @brief Check if the instruction set architecture of a code object can be
+ * executed on an agent associated with another architecture.
+ *
+ * @param[in] code_object_isa Instruction set architecture associated with a
+ * code object.
+ *
+ * @param[in] agent_isa Instruction set architecture associated with an agent.
+ *
+ * @param[out] result Pointer to a memory location where the HSA runtime stores
+ * the result of the check. If the two architectures are compatible, the result
+ * is true; if they are incompatible, the result is false.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ISA @p code_object_isa or @p agent_isa is
+ * invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p result is NULL.
+ */
+hsa_status_t HSA_API HSA_DEPRECATED hsa_isa_compatible(
+ hsa_isa_t code_object_isa,
+ hsa_isa_t agent_isa,
+ bool *result);
+
+/** @} */
+
+
+/** \defgroup executable Executable
+ * @{
+ */
+
+/**
+ * @brief Code object reader handle. A code object reader is used to
+ * load a code object from file (when created using
+ * ::hsa_code_object_reader_create_from_file), or from memory (if created using
+ * ::hsa_code_object_reader_create_from_memory).
+ */
+typedef struct hsa_code_object_reader_s {
+ /**
+ * Opaque handle. Two handles reference the same object of the enclosing type
+ * if and only if they are equal.
+ */
+ uint64_t handle;
+} hsa_code_object_reader_t;
+
+/**
+ * @brief Create a code object reader to operate on a file.
+ *
+ * @param[in] file File descriptor. The file must have been opened by the
+ * application with at least read permissions prior to calling this function.
+ * The file must contain a vendor-specific code object.
+ *
+ * The file is owned and managed by the application; the lifetime of the file
+ * descriptor must exceed that of any associated code object reader.
+ *
+ * @param[out] code_object_reader Memory location to store the newly created
+ * code object reader handle. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_FILE @p file is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to
+ * allocate the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p code_object_reader is NULL.
+ */
+hsa_status_t HSA_API hsa_code_object_reader_create_from_file(
+ hsa_file_t file,
+ hsa_code_object_reader_t *code_object_reader);
+
+/**
+ * @brief Create a code object reader to operate on memory.
+ *
+ * @param[in] code_object Memory buffer that contains a vendor-specific code
+ * object. The buffer is owned and managed by the application; the lifetime of
+ * the buffer must exceed that of any associated code object reader.
+ *
+ * @param[in] size Size of the buffer pointed to by @p code_object. Must not be
+ * 0.
+ *
+ * @param[out] code_object_reader Memory location to store the newly created
+ * code object reader handle. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to
+ * allocate the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p code_object is NULL, @p size
+ * is zero, or @p code_object_reader is NULL.
+ */
+hsa_status_t HSA_API hsa_code_object_reader_create_from_memory(
+ const void *code_object,
+ size_t size,
+ hsa_code_object_reader_t *code_object_reader);
+
+/**
+ * @brief Destroy a code object reader.
+ *
+ * @details The code object reader handle becomes invalid after completion of
+ * this function. Any file or memory used to create the code object reader is
+ * not closed, removed, or deallocated by this function.
+ *
+ * @param[in] code_object_reader Code object reader to destroy.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT_READER @p code_object_reader
+ * is invalid.
+ */
+hsa_status_t HSA_API hsa_code_object_reader_destroy(
+ hsa_code_object_reader_t code_object_reader);
+
+/**
+ * @brief Struct containing an opaque handle to an executable, which contains
+ * ISA for finalized kernels and indirect functions together with the allocated
+ * global or readonly segment variables they reference.
+ */
+typedef struct hsa_executable_s {
+ /**
+ * Opaque handle. Two handles reference the same object of the enclosing type
+ * if and only if they are equal.
+ */
+ uint64_t handle;
+} hsa_executable_t;
+
+/**
+ * @brief Executable state.
+ */
+typedef enum {
+ /**
+ * Executable state, which allows the user to load code objects and define
+ * external variables. Variable addresses, kernel code handles, and
+ * indirect function code handles are not available in query operations until
+ * the executable is frozen (zero always returned).
+ */
+ HSA_EXECUTABLE_STATE_UNFROZEN = 0,
+ /**
+ * Executable state, which allows the user to query variable addresses,
+ * kernel code handles, and indirect function code handles using query
+ * operations. Loading new code objects, as well as defining external
+ * variables, is not allowed in this state.
+ */
+ HSA_EXECUTABLE_STATE_FROZEN = 1
+} hsa_executable_state_t;
+
+/**
+ * @deprecated Use ::hsa_executable_create_alt instead, which allows the
+ * application to specify the default floating-point rounding mode of the
+ * executable and assumes an unfrozen initial state.
+ *
+ * @brief Create an empty executable.
+ *
+ * @param[in] profile Profile used in the executable.
+ *
+ * @param[in] executable_state Executable state. If the state is
+ * ::HSA_EXECUTABLE_STATE_FROZEN, the resulting executable is useless because no
+ * code objects can be loaded, and no variables can be defined.
+ *
+ * @param[in] options Standard and vendor-specific options. Unknown options are
+ * ignored. A standard option begins with the "-hsa_" prefix. Options beginning
+ * with the "-hsa_ext_<extension_name>_" prefix are reserved for extensions. A
+ * vendor-specific option begins with the "-<vendor_name>_" prefix. Must be a
+ * NUL-terminated string. May be NULL.
+ *
+ * @param[out] executable Memory location where the HSA runtime stores the newly
+ * created executable handle.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to
+ * allocate the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p profile is invalid, or
+ * @p executable is NULL.
+ */
+hsa_status_t HSA_API HSA_DEPRECATED hsa_executable_create(
+ hsa_profile_t profile,
+ hsa_executable_state_t executable_state,
+ const char *options,
+ hsa_executable_t *executable);
+
+/**
+ * @brief Create an empty executable.
+ *
+ * @param[in] profile Profile used in the executable.
+ *
+ * @param[in] default_float_rounding_mode Default floating-point rounding mode
+ * used in the executable. Allowed rounding modes are near and zero (default is
+ * not allowed).
+ *
+ * @param[in] options Standard and vendor-specific options. Unknown options are
+ * ignored. A standard option begins with the "-hsa_" prefix. Options beginning
+ * with the "-hsa_ext_<extension_name>_" prefix are reserved for extensions. A
+ * vendor-specific option begins with the "-<vendor_name>_" prefix. Must be a
+ * NUL-terminated string. May be NULL.
+ *
+ * @param[out] executable Memory location where the HSA runtime stores the newly
+ * created executable handle. The initial state of the executable is
+ * ::HSA_EXECUTABLE_STATE_UNFROZEN.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to
+ * allocate the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p profile is invalid, or
+ * @p executable is NULL.
+ */
+hsa_status_t HSA_API hsa_executable_create_alt(
+ hsa_profile_t profile,
+ hsa_default_float_rounding_mode_t default_float_rounding_mode,
+ const char *options,
+ hsa_executable_t *executable);
+
+/**
+ * @brief Destroy an executable.
+ *
+ * @details An executable handle becomes invalid after the executable has been
+ * destroyed. Code object handles that were loaded into this executable are
+ * still valid after the executable has been destroyed, and can be used as
+ * intended. Resources allocated outside and associated with this executable
+ * (such as external global or readonly variables) can be released after the
+ * executable has been destroyed.
+ *
+ * An executable should not be destroyed while kernels are in flight.
+ *
+ * @param[in] executable Executable.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid.
+ */
+hsa_status_t HSA_API hsa_executable_destroy(
+ hsa_executable_t executable);
+
+/**
+ * @brief Loaded code object handle.
+ */
+typedef struct hsa_loaded_code_object_s {
+ /**
+ * Opaque handle. Two handles reference the same object of the enclosing type
+ * if and only if they are equal.
+ */
+ uint64_t handle;
+} hsa_loaded_code_object_t;
+
+/**
+ * @brief Load a program code object into an executable.
+ *
+ * @details A program code object contains information about resources that are
+ * accessible by all kernel agents that run the executable, and can be loaded
+ * at most once into an executable.
+ *
+ * If the program code object uses extensions, the implementation must support
+ * them for this operation to return successfully.
+ *
+ * @param[in] executable Executable.
+ *
+ * @param[in] code_object_reader A code object reader that holds the program
+ * code object to load. If a code object reader is destroyed before all the
+ * associated executables are destroyed, the behavior is undefined.
+ *
+ * @param[in] options Standard and vendor-specific options. Unknown options are
+ * ignored. A standard option begins with the "-hsa_" prefix. Options beginning
+ * with the "-hsa_ext_<extension_name>_" prefix are reserved for extensions. A
+ * vendor-specific option begins with the "-<vendor_name>_" prefix. Must be a
+ * NUL-terminated string. May be NULL.
+ *
+ * @param[out] loaded_code_object Pointer to a memory location where the HSA
+ * runtime stores the loaded code object handle. May be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to
+ * allocate the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_FROZEN_EXECUTABLE The executable is frozen.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT_READER @p code_object_reader
+ * is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS The program code object is
+ * not compatible with the executable or the implementation (for example, the
+ * code object uses an extension that is not supported by the implementation).
+ */
+hsa_status_t HSA_API hsa_executable_load_program_code_object(
+ hsa_executable_t executable,
+ hsa_code_object_reader_t code_object_reader,
+ const char *options,
+ hsa_loaded_code_object_t *loaded_code_object);
+
+/**
+ * @brief Load an agent code object into an executable.
+ *
+ * @details The agent code object contains all defined agent
+ * allocation variables, functions, indirect functions, and kernels in a given
+ * program for a given instruction set architecture.
+ *
+ * Any module linkage declaration must have been defined either by a define
+ * variable or by loading a code object that has a symbol with module linkage
+ * definition.
+ *
+ * The default floating-point rounding mode of the code object associated with
+ * @p code_object_reader must match that of the executable
+ * (::HSA_EXECUTABLE_INFO_DEFAULT_FLOAT_ROUNDING_MODE), or be default (in which
+ * case the value of ::HSA_EXECUTABLE_INFO_DEFAULT_FLOAT_ROUNDING_MODE is used).
+ * If the agent code object uses extensions, the implementation and the agent
+ * must support them for this operation to return successfully.
+ *
+ * @param[in] executable Executable.
+ *
+ * @param[in] agent Agent to load code object for. A code object can be loaded
+ * into an executable at most once for a given agent. The instruction set
+ * architecture of the code object must be supported by the agent.
+ *
+ * @param[in] code_object_reader A code object reader that holds the code object
+ * to load. If a code object reader is destroyed before all the associated
+ * executables are destroyed, the behavior is undefined.
+ *
+ * @param[in] options Standard and vendor-specific options. Unknown options are
+ * ignored. A standard option begins with the "-hsa_" prefix. Options beginning
+ * with the "-hsa_ext_<extension_name>_" prefix are reserved for extensions. A
+ * vendor-specific option begins with the "-<vendor_name>_" prefix. Must be a
+ * NUL-terminated string. May be NULL.
+ *
+ * @param[out] loaded_code_object Pointer to a memory location where the HSA
+ * runtime stores the loaded code object handle. May be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to
+ * allocate the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_FROZEN_EXECUTABLE The executable is frozen.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT_READER @p code_object_reader
+ * is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS The code object read by @p
+ * code_object_reader is not compatible with the agent (for example, the agent
+ * does not support the instruction set architecture of the code object), the
+ * executable (for example, there is a default floating-point mode mismatch
+ * between the two), or the implementation.
+ */
+hsa_status_t HSA_API hsa_executable_load_agent_code_object(
+ hsa_executable_t executable,
+ hsa_agent_t agent,
+ hsa_code_object_reader_t code_object_reader,
+ const char *options,
+ hsa_loaded_code_object_t *loaded_code_object);
+
+/**
+ * @brief Freeze the executable.
+ *
+ * @details No modifications to the executable can be made after freezing: no
+ * code objects can be loaded into the executable, and no external variables
+ * can be defined. Freezing the executable does not prevent querying the
+ * executable's attributes. The application must define all the external
+ * variables in an executable before freezing it.
+ *
+ * @param[in] executable Executable.
+ *
+ * @param[in] options Standard and vendor-specific options. Unknown options are
+ * ignored. A standard option begins with the "-hsa_" prefix. Options beginning
+ * with the "-hsa_ext_<extension_name>_" prefix are reserved for extensions. A
+ * vendor-specific option begins with the "-<vendor_name>_" prefix. Must be a
+ * NUL-terminated string. May be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_VARIABLE_UNDEFINED One or more variables are
+ * undefined in the executable.
+ *
+ * @retval ::HSA_STATUS_ERROR_FROZEN_EXECUTABLE @p executable is already frozen.
+ */
+hsa_status_t HSA_API hsa_executable_freeze(
+ hsa_executable_t executable,
+ const char *options);
+
+/**
+ * @brief Executable attributes.
+ *
+ * @details Values of this type are passed as the @p attribute argument of
+ * ::hsa_executable_get_info. Each enumerator documents the type of the value
+ * that is written for it.
+ */
+typedef enum {
+ /**
+ * Profile this executable is created for. The type of this attribute is
+ * ::hsa_profile_t.
+ */
+ HSA_EXECUTABLE_INFO_PROFILE = 1,
+ /**
+ * Executable state. The type of this attribute is ::hsa_executable_state_t.
+ */
+ HSA_EXECUTABLE_INFO_STATE = 2,
+ /**
+ * Default floating-point rounding mode specified when executable was created.
+ * The type of this attribute is ::hsa_default_float_rounding_mode_t.
+ */
+ HSA_EXECUTABLE_INFO_DEFAULT_FLOAT_ROUNDING_MODE = 3
+} hsa_executable_info_t;
+
+/**
+ * @brief Get the current value of an attribute for a given executable.
+ *
+ * @param[in] executable Executable.
+ *
+ * @param[in] attribute Attribute to query. The type of the value associated
+ * with each attribute is documented in ::hsa_executable_info_t.
+ *
+ * @param[out] value Pointer to an application-allocated buffer in which to
+ * store the value of the attribute. If the buffer passed by the application is
+ * not large enough to hold the value of @p attribute, the behavior is
+ * undefined.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid
+ * executable attribute, or @p value is NULL.
+ */
+hsa_status_t HSA_API hsa_executable_get_info(
+ hsa_executable_t executable,
+ hsa_executable_info_t attribute,
+ void *value);
+
+/**
+ * @brief Define an external global variable with program allocation.
+ *
+ * @details This function allows the application to provide the definition
+ * of a variable in the global segment memory with program allocation. The
+ * variable must be defined before loading a code object into an executable.
+ * In addition, code objects loaded must not define the variable.
+ *
+ * @param[in] executable Executable. Must not be in frozen state.
+ *
+ * @param[in] variable_name Name of the variable. The Programmer's Reference
+ * Manual describes the standard name mangling scheme.
+ *
+ * @param[in] address Address where the variable is defined. This address must
+ * be in global memory and can be read and written by any agent in the
+ * system. The application cannot deallocate the buffer pointed to by
+ * @p address before @p executable is destroyed.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to
+ * allocate the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED The variable is
+ * already defined.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_SYMBOL_NAME There is no variable with the
+ * name @p variable_name.
+ *
+ * @retval ::HSA_STATUS_ERROR_FROZEN_EXECUTABLE @p executable is frozen.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p variable_name is NULL.
+ */
+hsa_status_t HSA_API hsa_executable_global_variable_define(
+ hsa_executable_t executable,
+ const char *variable_name,
+ void *address);
+
+/**
+ * @brief Define an external global variable with agent allocation.
+ *
+ * @details This function allows the application to provide the definition
+ * of a variable in the global segment memory with agent allocation. The
+ * variable must be defined before loading a code object into an executable.
+ * In addition, code objects loaded must not define the variable.
+ *
+ * @param[in] executable Executable. Must not be in frozen state.
+ *
+ * @param[in] agent Agent for which the variable is being defined.
+ *
+ * @param[in] variable_name Name of the variable. The Programmer's Reference
+ * Manual describes the standard name mangling scheme.
+ *
+ * @param[in] address Address where the variable is defined. This address must
+ * have been previously allocated using ::hsa_memory_allocate in a global region
+ * that is only visible to @p agent. The application cannot deallocate the
+ * buffer pointed to by @p address before @p executable is destroyed.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to
+ * allocate the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT @p agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED The variable is
+ * already defined.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_SYMBOL_NAME There is no variable with the
+ * name @p variable_name.
+ *
+ * @retval ::HSA_STATUS_ERROR_FROZEN_EXECUTABLE @p executable is frozen.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p variable_name is NULL.
+ */
+hsa_status_t HSA_API hsa_executable_agent_global_variable_define(
+ hsa_executable_t executable,
+ hsa_agent_t agent,
+ const char *variable_name,
+ void *address);
+
+/**
+ * @brief Define an external readonly variable.
+ *
+ * @details This function allows the application to provide the definition
+ * of a variable in the readonly segment memory. The variable must be defined
+ * before loading a code object into an executable. In addition, code objects
+ * loaded must not define the variable.
+ *
+ * @param[in] executable Executable. Must not be in frozen state.
+ *
+ * @param[in] agent Agent for which the variable is being defined.
+ *
+ * @param[in] variable_name Name of the variable. The Programmer's Reference
+ * Manual describes the standard name mangling scheme.
+ *
+ * @param[in] address Address where the variable is defined. This address must
+ * have been previously allocated using ::hsa_memory_allocate in a readonly
+ * region associated with @p agent. The buffer pointed to by @p address is
+ * owned by the application, and cannot be deallocated before @p executable is
+ * destroyed.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to
+ * allocate the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT @p agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED The variable is
+ * already defined.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_SYMBOL_NAME There is no variable with the
+ * name @p variable_name.
+ *
+ * @retval ::HSA_STATUS_ERROR_FROZEN_EXECUTABLE @p executable is frozen.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p variable_name is NULL.
+ */
+hsa_status_t HSA_API hsa_executable_readonly_variable_define(
+ hsa_executable_t executable,
+ hsa_agent_t agent,
+ const char *variable_name,
+ void *address);
+
+/**
+ * @brief Validate an executable.
+ *
+ * @details Checks that all code objects have matching machine model, profile,
+ * and default floating-point rounding mode. Checks that all declarations have
+ * definitions. Checks declaration-definition compatibility (see the HSA
+ * Programmer's Reference Manual for compatibility rules). Invoking this
+ * function is equivalent to invoking ::hsa_executable_validate_alt with no
+ * options.
+ *
+ * @param[in] executable Executable. Must be in frozen state.
+ *
+ * @param[out] result Memory location where the HSA runtime stores the
+ * validation result. If the executable passes validation, the result is 0.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE @p executable is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p result is NULL.
+ */
+hsa_status_t HSA_API hsa_executable_validate(
+ hsa_executable_t executable,
+ uint32_t *result);
+
+/**
+ * @brief Validate an executable.
+ *
+ * @details Checks that all code objects have matching machine model, profile,
+ * and default floating-point rounding mode. Checks that all declarations have
+ * definitions. Checks declaration-definition compatibility (see the HSA
+ * Programmer's Reference Manual for compatibility rules).
+ *
+ * @param[in] executable Executable. Must be in frozen state.
+ *
+ * @param[in] options Standard and vendor-specific options. Unknown options are
+ * ignored. A standard option begins with the "-hsa_" prefix. Options beginning
+ * with the "-hsa_ext_<extension_name>_" prefix are reserved for extensions. A
+ * vendor-specific option begins with the "-<vendor_name>_" prefix. Must be a
+ * NUL-terminated string. May be NULL.
+ *
+ * @param[out] result Memory location where the HSA runtime stores the
+ * validation result. If the executable passes validation, the result is 0.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE @p executable is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p result is NULL.
+ */
+hsa_status_t HSA_API hsa_executable_validate_alt(
+ hsa_executable_t executable,
+ const char *options,
+ uint32_t *result);
+
+/**
+ * @brief Executable symbol handle.
+ *
+ * @details The lifetime of an executable symbol matches that of the executable
+ * associated with it. An operation on a symbol whose associated executable has
+ * been destroyed results in undefined behavior. Handles are written by
+ * ::hsa_executable_get_symbol_by_name and queried with
+ * ::hsa_executable_symbol_get_info.
+ */
+typedef struct hsa_executable_symbol_s {
+ /**
+ * Opaque handle. Two handles reference the same object of the enclosing type
+ * if and only if they are equal.
+ */
+ uint64_t handle;
+} hsa_executable_symbol_t;
+
+/**
+ * @deprecated Use ::hsa_executable_get_symbol_by_name instead.
+ *
+ * @brief Get the symbol handle for a given symbol name.
+ *
+ * @param[in] executable Executable.
+ *
+ * @param[in] module_name Module name. Must be NULL if the symbol has
+ * program linkage.
+ *
+ * @param[in] symbol_name Symbol name.
+ *
+ * @param[in] agent Agent associated with the symbol. If the symbol is
+ * independent of any agent (for example, a variable with program
+ * allocation), this argument is ignored.
+ *
+ * @param[in] call_convention Call convention associated with the symbol. If the
+ * symbol does not correspond to an indirect function, this argument is ignored.
+ *
+ * @param[out] symbol Memory location where the HSA runtime stores the symbol
+ * handle.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_SYMBOL_NAME There is no symbol with a name
+ * that matches @p symbol_name.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p symbol_name is NULL, or
+ * @p symbol is NULL.
+ */
+hsa_status_t HSA_API HSA_DEPRECATED hsa_executable_get_symbol(
+ hsa_executable_t executable,
+ const char *module_name,
+ const char *symbol_name,
+ hsa_agent_t agent,
+ int32_t call_convention,
+ hsa_executable_symbol_t *symbol);
+
+/**
+ * @brief Retrieve the symbol handle corresponding to a given symbol name.
+ *
+ * @param[in] executable Executable.
+ *
+ * @param[in] symbol_name Symbol name. Must be a NUL-terminated character
+ * array. The Programmer's Reference Manual describes the standard name mangling
+ * scheme.
+ *
+ * @param[in] agent Pointer to the agent for which the symbol with the given
+ * name is defined. If the symbol corresponding to the given name has program
+ * allocation, @p agent must be NULL.
+ *
+ * @param[out] symbol Memory location where the HSA runtime stores the symbol
+ * handle. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_SYMBOL_NAME There is no symbol with a name
+ * that matches @p symbol_name.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p symbol_name is NULL, or @p
+ * symbol is NULL.
+ */
+hsa_status_t HSA_API hsa_executable_get_symbol_by_name(
+ hsa_executable_t executable,
+ const char *symbol_name,
+ const hsa_agent_t *agent,
+ hsa_executable_symbol_t *symbol);
+
+/**
+ * @brief Symbol type. This is the type of the value of the
+ * ::HSA_EXECUTABLE_SYMBOL_INFO_TYPE attribute.
+ */
+typedef enum {
+ /**
+ * Variable.
+ */
+ HSA_SYMBOL_KIND_VARIABLE = 0,
+ /**
+ * Kernel.
+ */
+ HSA_SYMBOL_KIND_KERNEL = 1,
+ /**
+ * Indirect function.
+ */
+ HSA_SYMBOL_KIND_INDIRECT_FUNCTION = 2
+} hsa_symbol_kind_t;
+
+/**
+ * @brief Linkage type of a symbol. This is the type of the value of the
+ * ::HSA_EXECUTABLE_SYMBOL_INFO_LINKAGE attribute.
+ */
+typedef enum {
+ /**
+ * Module linkage.
+ */
+ HSA_SYMBOL_LINKAGE_MODULE = 0,
+ /**
+ * Program linkage.
+ */
+ HSA_SYMBOL_LINKAGE_PROGRAM = 1
+} hsa_symbol_linkage_t;
+
+/**
+ * @brief Allocation type of a variable. This is the type of the value of the
+ * ::HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ALLOCATION attribute.
+ */
+typedef enum {
+ /**
+ * Agent allocation.
+ */
+ HSA_VARIABLE_ALLOCATION_AGENT = 0,
+ /**
+ * Program allocation.
+ */
+ HSA_VARIABLE_ALLOCATION_PROGRAM = 1
+} hsa_variable_allocation_t;
+
+/**
+ * @brief Memory segment associated with a variable. This is the type of the
+ * value of the ::HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SEGMENT attribute.
+ */
+typedef enum {
+ /**
+ * Global memory segment.
+ */
+ HSA_VARIABLE_SEGMENT_GLOBAL = 0,
+ /**
+ * Readonly memory segment.
+ */
+ HSA_VARIABLE_SEGMENT_READONLY = 1
+} hsa_variable_segment_t;
+
+/**
+ * @brief Executable symbol attributes.
+ *
+ * @details Values of this type are passed as the @p attribute argument of
+ * ::hsa_executable_symbol_get_info. The enumerator values are assigned
+ * explicitly and do not increase in declaration order (for example,
+ * ::HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS is 21 and is followed by
+ * ::HSA_EXECUTABLE_SYMBOL_INFO_LINKAGE, which is 5), so code must not rely on
+ * the textual order of the enumerators.
+ */
+typedef enum {
+ /**
+ * The kind of the symbol. The type of this attribute is ::hsa_symbol_kind_t.
+ */
+ HSA_EXECUTABLE_SYMBOL_INFO_TYPE = 0,
+ /**
+ * The length of the symbol name in bytes, not including the NUL terminator.
+ * The type of this attribute is uint32_t.
+ */
+ HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH = 1,
+ /**
+ * The name of the symbol. The type of this attribute is character array with
+ * the length equal to the value of ::HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH
+ * attribute.
+ */
+ HSA_EXECUTABLE_SYMBOL_INFO_NAME = 2,
+ /**
+ * @deprecated
+ *
+ * The length of the module name in bytes (not including the NUL terminator)
+ * to which this symbol belongs if this symbol has module linkage, otherwise 0
+ * is returned. The type of this attribute is uint32_t.
+ */
+ HSA_EXECUTABLE_SYMBOL_INFO_MODULE_NAME_LENGTH = 3,
+ /**
+ * @deprecated
+ *
+ * The module name to which this symbol belongs if this symbol has module
+ * linkage, otherwise an empty string is returned. The type of this attribute
+ * is character array with the length equal to the value of
+ * ::HSA_EXECUTABLE_SYMBOL_INFO_MODULE_NAME_LENGTH attribute.
+ */
+ HSA_EXECUTABLE_SYMBOL_INFO_MODULE_NAME = 4,
+ /**
+ * @deprecated
+ *
+ * Agent associated with this symbol. If the symbol is a variable, the
+ * value of this attribute is only defined if
+ * ::HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ALLOCATION is
+ * ::HSA_VARIABLE_ALLOCATION_AGENT. The type of this attribute is hsa_agent_t.
+ */
+ HSA_EXECUTABLE_SYMBOL_INFO_AGENT = 20,
+ /**
+ * The address of the variable. The value of this attribute is undefined if
+ * the symbol is not a variable. The type of this attribute is uint64_t.
+ *
+ * If executable's state is ::HSA_EXECUTABLE_STATE_UNFROZEN, then 0 is
+ * returned.
+ */
+ HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS = 21,
+ /**
+ * The linkage kind of the symbol. The type of this attribute is
+ * ::hsa_symbol_linkage_t.
+ */
+ HSA_EXECUTABLE_SYMBOL_INFO_LINKAGE = 5,
+ /**
+ * Indicates whether the symbol corresponds to a definition. The type of this
+ * attribute is bool.
+ */
+ HSA_EXECUTABLE_SYMBOL_INFO_IS_DEFINITION = 17,
+ /**
+ * @deprecated
+ *
+ * The allocation kind of the variable. The value of this attribute is
+ * undefined if the symbol is not a variable. The type of this attribute is
+ * ::hsa_variable_allocation_t.
+ */
+ HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ALLOCATION = 6,
+ /**
+ * @deprecated
+ *
+ * The segment kind of the variable. The value of this attribute is undefined
+ * if the symbol is not a variable. The type of this attribute is
+ * ::hsa_variable_segment_t.
+ */
+ HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SEGMENT = 7,
+ /**
+ * @deprecated
+ *
+ * Alignment of the symbol in memory. The value of this attribute is undefined
+ * if the symbol is not a variable. The type of this attribute is uint32_t.
+ *
+ * The current alignment of the variable in memory may be greater than the
+ * value specified in the source program variable declaration.
+ */
+ HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ALIGNMENT = 8,
+ /**
+ * @deprecated
+ *
+ * Size of the variable. The value of this attribute is undefined if
+ * the symbol is not a variable. The type of this attribute is uint32_t.
+ *
+ * A value of 0 is returned if the variable is an external variable and has an
+ * unknown dimension.
+ */
+ HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE = 9,
+ /**
+ * @deprecated
+ *
+ * Indicates whether the variable is constant. The value of this attribute is
+ * undefined if the symbol is not a variable. The type of this attribute is
+ * bool.
+ */
+ HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_IS_CONST = 10,
+ /**
+ * Kernel object handle, used in the kernel dispatch packet. The value of this
+ * attribute is undefined if the symbol is not a kernel. The type of this
+ * attribute is uint64_t.
+ *
+ * If the state of the executable is ::HSA_EXECUTABLE_STATE_UNFROZEN, then 0
+ * is returned.
+ */
+ HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT = 22,
+ /**
+ * Size of kernarg segment memory that is required to hold the values of the
+ * kernel arguments, in bytes. Must be a multiple of 16. The value of this
+ * attribute is undefined if the symbol is not a kernel. The type of this
+ * attribute is uint32_t.
+ */
+ HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE = 11,
+ /**
+ * Alignment (in bytes) of the buffer used to pass arguments to the kernel,
+ * which is the maximum of 16 and the maximum alignment of any of the kernel
+ * arguments. The value of this attribute is undefined if the symbol is not a
+ * kernel. The type of this attribute is uint32_t.
+ */
+ HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT = 12,
+ /**
+ * Size of static group segment memory required by the kernel (per
+ * work-group), in bytes. The value of this attribute is undefined
+ * if the symbol is not a kernel. The type of this attribute is uint32_t.
+ *
+ * The reported amount does not include any dynamically allocated group
+ * segment memory that may be requested by the application when a kernel is
+ * dispatched.
+ */
+ HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE = 13,
+ /**
+ * Size of static private, spill, and arg segment memory required by
+ * this kernel (per work-item), in bytes. The value of this attribute is
+ * undefined if the symbol is not a kernel. The type of this attribute is
+ * uint32_t.
+ *
+ * If the value of ::HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK is
+ * true, the kernel may use more private memory than the reported value, and
+ * the application must add the dynamic call stack usage to @a
+ * private_segment_size when populating a kernel dispatch packet.
+ */
+ HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE = 14,
+ /**
+ * Dynamic callstack flag. The value of this attribute is undefined if the
+ * symbol is not a kernel. The type of this attribute is bool.
+ *
+ * If this flag is set (the value is true), the kernel uses a dynamically
+ * sized call stack. This can happen if recursive calls, calls to indirect
+ * functions, or the HSAIL alloca instruction are present in the kernel.
+ */
+ HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK = 15,
+ /**
+ * @deprecated
+ *
+ * Call convention of the kernel. The value of this attribute is undefined if
+ * the symbol is not a kernel. The type of this attribute is uint32_t.
+ */
+ HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_CALL_CONVENTION = 18,
+ /**
+ * Indirect function object handle. The value of this attribute is undefined
+ * if the symbol is not an indirect function, or the associated agent does
+ * not support the Full Profile. The type of this attribute depends on the
+ * machine model: the type is uint32_t for small machine model, and uint64_t
+ * for large model.
+ *
+ * If the state of the executable is ::HSA_EXECUTABLE_STATE_UNFROZEN, then 0
+ * is returned.
+ */
+ HSA_EXECUTABLE_SYMBOL_INFO_INDIRECT_FUNCTION_OBJECT = 23,
+ /**
+ * @deprecated
+ *
+ * Call convention of the indirect function. The value of this attribute is
+ * undefined if the symbol is not an indirect function, or the associated
+ * agent does not support the Full Profile. The type of this attribute is
+ * uint32_t.
+ */
+ HSA_EXECUTABLE_SYMBOL_INFO_INDIRECT_FUNCTION_CALL_CONVENTION = 16
+} hsa_executable_symbol_info_t;
+
+/**
+ * @brief Get the current value of an attribute for a given executable symbol.
+ *
+ * @param[in] executable_symbol Executable symbol.
+ *
+ * @param[in] attribute Attribute to query. The type of the value associated
+ * with each attribute is documented in ::hsa_executable_symbol_info_t.
+ *
+ * @param[out] value Pointer to an application-allocated buffer in which to
+ * store the value of the attribute. If the buffer passed by the application is
+ * not large enough to hold the value of @p attribute, the behavior is
+ * undefined.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE_SYMBOL The executable symbol is
+ * invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid
+ * executable symbol attribute, or @p value is NULL.
+ */
+hsa_status_t HSA_API hsa_executable_symbol_get_info(
+ hsa_executable_symbol_t executable_symbol,
+ hsa_executable_symbol_info_t attribute,
+ void *value);
+
+/**
+ * @deprecated
+ *
+ * @brief Iterate over the symbols in an executable, and invoke an
+ * application-defined callback on every iteration.
+ *
+ * @param[in] executable Executable.
+ *
+ * @param[in] callback Callback to be invoked once per executable symbol. The
+ * HSA runtime passes three arguments to the callback: the executable, a symbol,
+ * and the application data. If @p callback returns a status other than
+ * ::HSA_STATUS_SUCCESS for a particular iteration, the traversal stops and
+ * ::hsa_executable_iterate_symbols returns that status value.
+ *
+ * @param[in] data Application data that is passed to @p callback on every
+ * iteration. May be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL.
+ */
+hsa_status_t HSA_API HSA_DEPRECATED hsa_executable_iterate_symbols(
+ hsa_executable_t executable,
+ hsa_status_t (*callback)(hsa_executable_t exec,
+ hsa_executable_symbol_t symbol,
+ void *data),
+ void *data);
+
+/**
+ * @brief Iterate over the kernels, indirect functions, and agent allocation
+ * variables in an executable for a given agent, and invoke an application-
+ * defined callback on every iteration.
+ *
+ * @param[in] executable Executable.
+ *
+ * @param[in] agent Agent.
+ *
+ * @param[in] callback Callback to be invoked once per executable symbol. The
+ * HSA runtime passes four arguments to the callback: the executable, the
+ * agent, a symbol, and the application data. If @p callback returns a status
+ * other than ::HSA_STATUS_SUCCESS for a particular iteration, the traversal
+ * stops and ::hsa_executable_iterate_agent_symbols returns that status value.
+ *
+ * @param[in] data Application data that is passed to @p callback on every
+ * iteration. May be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL.
+ */
+hsa_status_t HSA_API hsa_executable_iterate_agent_symbols(
+ hsa_executable_t executable,
+ hsa_agent_t agent,
+ hsa_status_t (*callback)(hsa_executable_t exec,
+ hsa_agent_t agent,
+ hsa_executable_symbol_t symbol,
+ void *data),
+ void *data);
+
+/**
+ * @brief Iterate over the program allocation variables in an executable, and
+ * invoke an application-defined callback on every iteration.
+ *
+ * @param[in] executable Executable.
+ *
+ * @param[in] callback Callback to be invoked once per executable symbol. The
+ * HSA runtime passes three arguments to the callback: the executable, a symbol,
+ * and the application data. If @p callback returns a status other than
+ * ::HSA_STATUS_SUCCESS for a particular iteration, the traversal stops and
+ * ::hsa_executable_iterate_program_symbols returns that status value.
+ *
+ * @param[in] data Application data that is passed to @p callback on every
+ * iteration. May be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL.
+ */
+hsa_status_t HSA_API hsa_executable_iterate_program_symbols(
+ hsa_executable_t executable,
+ hsa_status_t (*callback)(hsa_executable_t exec,
+ hsa_executable_symbol_t symbol,
+ void *data),
+ void *data);
+
+/** @} */
+
+
+/** \defgroup code-object Code Objects (deprecated).
+ * @{
+ */
+
+/**
+ * @deprecated
+ *
+ * @brief Struct containing an opaque handle to a code object, which contains
+ * ISA for finalized kernels and indirect functions together with information
+ * about the global or readonly segment variables they reference.
+ *
+ * @details Handles of this type are taken by ::hsa_code_object_serialize and
+ * ::hsa_code_object_destroy, and written by ::hsa_code_object_deserialize.
+ */
+typedef struct hsa_code_object_s {
+ /**
+ * Opaque handle. Two handles reference the same object of the enclosing type
+ * if and only if they are equal.
+ */
+ uint64_t handle;
+} hsa_code_object_t;
+
+/**
+ * @deprecated
+ *
+ * @brief Application data handle that is passed to the serialization
+ * and deserialization functions.
+ *
+ * @details ::hsa_code_object_serialize receives a value of this type as its
+ * @p callback_data argument and passes it on to the application-provided
+ * allocation callback.
+ */
+typedef struct hsa_callback_data_s {
+ /**
+ * Opaque handle.
+ */
+ uint64_t handle;
+} hsa_callback_data_t;
+
+/**
+ * @deprecated
+ *
+ * @brief Serialize a code object. Can be used for offline finalization,
+ * install-time finalization, disk code caching, etc.
+ *
+ * @param[in] code_object Code object.
+ *
+ * @param[in] alloc_callback Callback function for memory allocation. Must not
+ * be NULL. The HSA runtime passes three arguments to the callback: the
+ * allocation size, the application data, and a pointer to a memory location
+ * where the application stores the allocation result. The HSA runtime invokes
+ * @p alloc_callback once to allocate a buffer that contains the serialized
+ * version of @p code_object. If the callback returns a status code other than
+ * ::HSA_STATUS_SUCCESS, this function returns the same code.
+ *
+ * @param[in] callback_data Application data that is passed to @p
+ * alloc_callback. May be NULL.
+ *
+ * @param[in] options Standard and vendor-specific options. Unknown options are
+ * ignored. A standard option begins with the "-hsa_" prefix. Options beginning
+ * with the "-hsa_ext_<extension_name>_" prefix are reserved for extensions. A
+ * vendor-specific option begins with the "-<vendor_name>_" prefix. Must be a
+ * NUL-terminated string. May be NULL.
+ *
+ * @param[out] serialized_code_object Memory location where the HSA runtime
+ * stores a pointer to the serialized code object. Must not be NULL.
+ *
+ * @param[out] serialized_code_object_size Memory location where the HSA runtime
+ * stores the size (in bytes) of @p serialized_code_object. The returned value
+ * matches the allocation size passed by the HSA runtime to @p
+ * alloc_callback. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to
+ * allocate the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT @p code_object is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p alloc_callback, @p
+ * serialized_code_object, or @p serialized_code_object_size is NULL.
+ */
+hsa_status_t HSA_API HSA_DEPRECATED hsa_code_object_serialize(
+ hsa_code_object_t code_object,
+ hsa_status_t (*alloc_callback)(size_t size,
+ hsa_callback_data_t data,
+ void **address),
+ hsa_callback_data_t callback_data,
+ const char *options,
+ void **serialized_code_object,
+ size_t *serialized_code_object_size);
+
+/**
+ * @deprecated
+ *
+ * @brief Deserialize a code object.
+ *
+ * @param[in] serialized_code_object A serialized code object. Must not be NULL.
+ *
+ * @param[in] serialized_code_object_size The size (in bytes) of @p
+ * serialized_code_object. Must not be 0.
+ *
+ * @param[in] options Standard and vendor-specific options. Unknown options are
+ * ignored. A standard option begins with the "-hsa_" prefix. Options beginning
+ * with the "-hsa_ext_<extension_name>_" prefix are reserved for extensions. A
+ * vendor-specific option begins with the "-<vendor_name>_" prefix. Must be a
+ * NUL-terminated string. May be NULL.
+ *
+ * @param[out] code_object Memory location where the HSA runtime stores the
+ * deserialized code object. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to
+ * allocate the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p serialized_code_object or @p
+ * code_object is NULL, or @p serialized_code_object_size is 0.
+ */
+hsa_status_t HSA_API HSA_DEPRECATED hsa_code_object_deserialize(
+ void *serialized_code_object,
+ size_t serialized_code_object_size,
+ const char *options,
+ hsa_code_object_t *code_object);
+
+/**
+ * @deprecated
+ *
+ * @brief Destroy a code object.
+ *
+ * @details The lifetime of a code object must exceed that of any executable
+ * where it has been loaded. If an executable that loaded @p code_object has not
+ * been destroyed, the behavior is undefined.
+ *
+ * @param[in] code_object Code object. The handle becomes invalid after it has
+ * been destroyed.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT @p code_object is invalid.
+ */
+hsa_status_t HSA_API HSA_DEPRECATED hsa_code_object_destroy(
+ hsa_code_object_t code_object);
+
+/**
+ * @deprecated
+ *
+ * @brief Code object type.
+ */
+typedef enum {
+ /**
+ * Produces code object that contains ISA for all kernels and indirect
+ * functions in HSA source.
+ */
+ HSA_CODE_OBJECT_TYPE_PROGRAM = 0
+} hsa_code_object_type_t;
+
+/**
+ * @deprecated
+ *
+ * @brief Code object attributes.
+ */
+typedef enum {
+ /**
+ * The version of the code object. The type of this attribute is a
+ * NUL-terminated char[64]. The name must be at most 63 characters long (not
+ * including the NUL terminator) and all array elements not used for the name
+ * must be NUL.
+ */
+ HSA_CODE_OBJECT_INFO_VERSION = 0,
+ /**
+ * Type of code object. The type of this attribute is
+ * ::hsa_code_object_type_t.
+ */
+ HSA_CODE_OBJECT_INFO_TYPE = 1,
+ /**
+ * Instruction set architecture this code object is produced for. The type of
+ * this attribute is ::hsa_isa_t.
+ */
+ HSA_CODE_OBJECT_INFO_ISA = 2,
+ /**
+ * Machine model this code object is produced for. The type of this attribute
+ * is ::hsa_machine_model_t.
+ */
+ HSA_CODE_OBJECT_INFO_MACHINE_MODEL = 3,
+ /**
+ * Profile this code object is produced for. The type of this attribute is
+ * ::hsa_profile_t.
+ */
+ HSA_CODE_OBJECT_INFO_PROFILE = 4,
+ /**
+ * Default floating-point rounding mode used when the code object is
+ * produced. The type of this attribute is
+ * ::hsa_default_float_rounding_mode_t.
+ */
+ HSA_CODE_OBJECT_INFO_DEFAULT_FLOAT_ROUNDING_MODE = 5
+} hsa_code_object_info_t;
+
+/**
+ * @deprecated
+ *
+ * @brief Get the current value of an attribute for a given code object.
+ *
+ * @param[in] code_object Code object.
+ *
+ * @param[in] attribute Attribute to query.
+ *
+ * @param[out] value Pointer to an application-allocated buffer where to store
+ * the value of the attribute. If the buffer passed by the application is not
+ * large enough to hold the value of @p attribute, the behavior is undefined.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT @p code_object is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid
+ * code object attribute, or @p value is NULL.
+ */
+hsa_status_t HSA_API HSA_DEPRECATED hsa_code_object_get_info(
+ hsa_code_object_t code_object,
+ hsa_code_object_info_t attribute,
+ void *value);
+
+/**
+ * @deprecated
+ *
+ * @brief Load code object into the executable.
+ *
+ * @details Every global or readonly variable that is external must be defined
+ * before loading the code object. An internal global or readonly variable is
+ * allocated once the code object, that is being loaded, references this
+ * variable and this variable is not allocated.
+ *
+ * Any module linkage declaration must have been defined either by a define
+ * variable or by loading a code object that has a symbol with module linkage
+ * definition.
+ *
+ * @param[in] executable Executable.
+ *
+ * @param[in] agent Agent to load code object for. The agent must support the
+ * default floating-point rounding mode used by @p code_object.
+ *
+ * @param[in] code_object Code object to load. The lifetime of the code object
+ * must exceed that of the executable: if @p code_object is destroyed before @p
+ * executable, the behavior is undefined.
+ *
+ * @param[in] options Standard and vendor-specific options. Unknown options are
+ * ignored. A standard option begins with the "-hsa_" prefix. Options beginning
+ * with the "-hsa_ext_<extension_name>_" prefix are reserved for extensions. A
+ * vendor-specific option begins with the "-<vendor_name>_" prefix. Must be a
+ * NUL-terminated string. May be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to
+ * allocate the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT @p code_object is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS @p agent is not compatible
+ * with @p code_object (for example, @p agent does not support the default
+ * floating-point rounding mode specified by @p code_object), or @p code_object
+ * is not compatible with @p executable (for example, @p code_object and @p
+ * executable have different machine models or profiles).
+ *
+ * @retval ::HSA_STATUS_ERROR_FROZEN_EXECUTABLE @p executable is frozen.
+ */
+hsa_status_t HSA_API HSA_DEPRECATED hsa_executable_load_code_object(
+ hsa_executable_t executable,
+ hsa_agent_t agent,
+ hsa_code_object_t code_object,
+ const char *options);
+
+/**
+ * @deprecated
+ *
+ * @brief Code object symbol handle.
+ *
+ * The lifetime of a code object symbol matches that of the code object
+ * associated with it. An operation on a symbol whose associated code object has
+ * been destroyed results in undefined behavior.
+ */
+typedef struct hsa_code_symbol_s {
+ /**
+ * Opaque handle. Two handles reference the same object of the enclosing type
+ * if and only if they are equal.
+ */
+ uint64_t handle;
+} hsa_code_symbol_t;
+
+/**
+ * @deprecated
+ *
+ * @brief Get the symbol handle within a code object for a given symbol name.
+ *
+ * @param[in] code_object Code object.
+ *
+ * @param[in] symbol_name Symbol name.
+ *
+ * @param[out] symbol Memory location where the HSA runtime stores the symbol
+ * handle.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT @p code_object is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_SYMBOL_NAME There is no symbol with a name
+ * that matches @p symbol_name.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p symbol_name is NULL, or
+ * @p symbol is NULL.
+ */
+hsa_status_t HSA_API HSA_DEPRECATED hsa_code_object_get_symbol(
+ hsa_code_object_t code_object,
+ const char *symbol_name,
+ hsa_code_symbol_t *symbol);
+
+/**
+ * @deprecated
+ *
+ * @brief Get the symbol handle within a code object for a given symbol name.
+ *
+ * @param[in] code_object Code object.
+ *
+ * @param[in] module_name Module name. Must be NULL if the symbol has
+ * program linkage.
+ *
+ * @param[in] symbol_name Symbol name.
+ *
+ * @param[out] symbol Memory location where the HSA runtime stores the symbol
+ * handle.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT @p code_object is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_SYMBOL_NAME There is no symbol with a name
+ * that matches @p symbol_name.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p symbol_name is NULL, or
+ * @p symbol is NULL.
+ */
+hsa_status_t HSA_API HSA_DEPRECATED hsa_code_object_get_symbol_from_name(
+ hsa_code_object_t code_object,
+ const char *module_name,
+ const char *symbol_name,
+ hsa_code_symbol_t *symbol);
+
+/**
+ * @deprecated
+ *
+ * @brief Code object symbol attributes.
+ */
+typedef enum {
+ /**
+ * The type of the symbol. The type of this attribute is ::hsa_symbol_kind_t.
+ */
+ HSA_CODE_SYMBOL_INFO_TYPE = 0,
+ /**
+ * The length of the symbol name in bytes, not including the NUL terminator.
+ * The type of this attribute is uint32_t.
+ */
+ HSA_CODE_SYMBOL_INFO_NAME_LENGTH = 1,
+ /**
+ * The name of the symbol. The type of this attribute is character array with
+ * the length equal to the value of ::HSA_CODE_SYMBOL_INFO_NAME_LENGTH
+ * attribute.
+ */
+ HSA_CODE_SYMBOL_INFO_NAME = 2,
+ /**
+ * The length of the module name in bytes (not including the NUL terminator)
+ * to which this symbol belongs if this symbol has module linkage, otherwise 0
+ * is returned. The type of this attribute is uint32_t.
+ */
+ HSA_CODE_SYMBOL_INFO_MODULE_NAME_LENGTH = 3,
+ /**
+ * The module name to which this symbol belongs if this symbol has module
+ * linkage, otherwise an empty string is returned. The type of this attribute
+ * is character array with the length equal to the value of
+ * ::HSA_CODE_SYMBOL_INFO_MODULE_NAME_LENGTH attribute.
+ */
+ HSA_CODE_SYMBOL_INFO_MODULE_NAME = 4,
+ /**
+ * The linkage kind of the symbol. The type of this attribute is
+ * ::hsa_symbol_linkage_t.
+ */
+ HSA_CODE_SYMBOL_INFO_LINKAGE = 5,
+ /**
+ * Indicates whether the symbol corresponds to a definition. The type of this
+ * attribute is bool.
+ */
+ HSA_CODE_SYMBOL_INFO_IS_DEFINITION = 17,
+ /**
+ * The allocation kind of the variable. The value of this attribute is
+ * undefined if the symbol is not a variable. The type of this attribute is
+ * ::hsa_variable_allocation_t.
+ */
+ HSA_CODE_SYMBOL_INFO_VARIABLE_ALLOCATION = 6,
+ /**
+ * The segment kind of the variable. The value of this attribute is
+ * undefined if the symbol is not a variable. The type of this attribute is
+ * ::hsa_variable_segment_t.
+ */
+ HSA_CODE_SYMBOL_INFO_VARIABLE_SEGMENT = 7,
+ /**
+ * Alignment of the symbol in memory. The value of this attribute is undefined
+ * if the symbol is not a variable. The type of this attribute is uint32_t.
+ *
+ * The current alignment of the variable in memory may be greater than the
+ * value specified in the source program variable declaration.
+ */
+ HSA_CODE_SYMBOL_INFO_VARIABLE_ALIGNMENT = 8,
+ /**
+ * Size of the variable. The value of this attribute is undefined if the
+ * symbol is not a variable. The type of this attribute is uint32_t.
+ *
+ * A size of 0 is returned if the variable is an external variable and has an
+ * unknown dimension.
+ */
+ HSA_CODE_SYMBOL_INFO_VARIABLE_SIZE = 9,
+ /**
+ * Indicates whether the variable is constant. The value of this attribute is
+ * undefined if the symbol is not a variable. The type of this attribute is
+ * bool.
+ */
+ HSA_CODE_SYMBOL_INFO_VARIABLE_IS_CONST = 10,
+ /**
+ * Size of kernarg segment memory that is required to hold the values of the
+ * kernel arguments, in bytes. Must be a multiple of 16. The value of this
+ * attribute is undefined if the symbol is not a kernel. The type of this
+ * attribute is uint32_t.
+ */
+ HSA_CODE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE = 11,
+ /**
+ * Alignment (in bytes) of the buffer used to pass arguments to the kernel,
+ * which is the maximum of 16 and the maximum alignment of any of the kernel
+ * arguments. The value of this attribute is undefined if the symbol is not a
+ * kernel. The type of this attribute is uint32_t.
+ */
+ HSA_CODE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT = 12,
+ /**
+ * Size of static group segment memory required by the kernel (per
+ * work-group), in bytes. The value of this attribute is undefined
+ * if the symbol is not a kernel. The type of this attribute is uint32_t.
+ *
+ * The reported amount does not include any dynamically allocated group
+ * segment memory that may be requested by the application when a kernel is
+ * dispatched.
+ */
+ HSA_CODE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE = 13,
+ /**
+ * Size of static private, spill, and arg segment memory required by
+ * this kernel (per work-item), in bytes. The value of this attribute is
+ * undefined if the symbol is not a kernel. The type of this attribute is
+ * uint32_t.
+ *
+ * If the value of ::HSA_CODE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK is true,
+ * the kernel may use more private memory than the reported value, and the
+ * application must add the dynamic call stack usage to @a
+ * private_segment_size when populating a kernel dispatch packet.
+ */
+ HSA_CODE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE = 14,
+ /**
+ * Dynamic callstack flag. The value of this attribute is undefined if the
+ * symbol is not a kernel. The type of this attribute is bool.
+ *
+ * If this flag is set (the value is true), the kernel uses a dynamically
+ * sized call stack. This can happen if recursive calls, calls to indirect
+ * functions, or the HSAIL alloca instruction are present in the kernel.
+ */
+ HSA_CODE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK = 15,
+ /**
+ * Call convention of the kernel. The value of this attribute is undefined if
+ * the symbol is not a kernel. The type of this attribute is uint32_t.
+ */
+ HSA_CODE_SYMBOL_INFO_KERNEL_CALL_CONVENTION = 18,
+ /**
+ * Call convention of the indirect function. The value of this attribute is
+ * undefined if the symbol is not an indirect function. The type of this
+ * attribute is uint32_t.
+ */
+ HSA_CODE_SYMBOL_INFO_INDIRECT_FUNCTION_CALL_CONVENTION = 16
+} hsa_code_symbol_info_t;
+
+/**
+ * @deprecated
+ *
+ * @brief Get the current value of an attribute for a given code symbol.
+ *
+ * @param[in] code_symbol Code symbol.
+ *
+ * @param[in] attribute Attribute to query.
+ *
+ * @param[out] value Pointer to an application-allocated buffer where to store
+ * the value of the attribute. If the buffer passed by the application is not
+ * large enough to hold the value of @p attribute, the behavior is undefined.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_CODE_SYMBOL The code symbol is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid
+ * code symbol attribute, or @p value is NULL.
+ */
+hsa_status_t HSA_API HSA_DEPRECATED hsa_code_symbol_get_info(
+ hsa_code_symbol_t code_symbol,
+ hsa_code_symbol_info_t attribute,
+ void *value);
+
+/**
+ * @deprecated
+ *
+ * @brief Iterate over the symbols in a code object, and invoke an
+ * application-defined callback on every iteration.
+ *
+ * @param[in] code_object Code object.
+ *
+ * @param[in] callback Callback to be invoked once per code object symbol. The
+ * HSA runtime passes three arguments to the callback: the code object, a
+ * symbol, and the application data. If @p callback returns a status other than
+ * ::HSA_STATUS_SUCCESS for a particular iteration, the traversal stops and
+ * ::hsa_code_object_iterate_symbols returns that status value.
+ *
+ * @param[in] data Application data that is passed to @p callback on every
+ * iteration. May be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT @p code_object is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL.
+ */
+hsa_status_t HSA_API HSA_DEPRECATED hsa_code_object_iterate_symbols(
+ hsa_code_object_t code_object,
+ hsa_status_t (*callback)(hsa_code_object_t code_object,
+ hsa_code_symbol_t symbol,
+ void *data),
+ void *data);
+
+/** @} */
+
+#ifdef __cplusplus
+} // end extern "C" block
+#endif
+
+#endif // header guard
diff --git a/third_party/rocm/include/hsa/hsa_api_trace.h b/third_party/rocm/include/hsa/hsa_api_trace.h
new file mode 100644
index 0000000..5c33f07
--- /dev/null
+++ b/third_party/rocm/include/hsa/hsa_api_trace.h
@@ -0,0 +1,474 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+// AMD Research and AMD HSA Software Development
+//
+// Advanced Micro Devices, Inc.
+//
+// www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimers.
+// - Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimers in
+// the documentation and/or other materials provided with the distribution.
+// - Neither the names of Advanced Micro Devices, Inc,
+// nor the names of its contributors may be used to endorse or promote
+// products derived from this Software without specific prior written
+// permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef HSA_RUNTIME_INC_HSA_API_TRACE_H
+#define HSA_RUNTIME_INC_HSA_API_TRACE_H
+
+#include "hsa.h"
+#ifdef AMD_INTERNAL_BUILD
+#include "hsa_ext_image.h"
+#include "hsa_ext_amd.h"
+#include "hsa_ext_finalize.h"
+#else
+#include "inc/hsa_ext_image.h"
+#include "inc/hsa_ext_amd.h"
+#include "inc/hsa_ext_finalize.h"
+#endif
+
+#include <string.h>
+#include <assert.h>
+#include <stddef.h>
+
+// Major Ids of the Api tables exported by Hsa Core Runtime
+#define HSA_API_TABLE_MAJOR_VERSION 0x01
+#define HSA_CORE_API_TABLE_MAJOR_VERSION 0x01
+#define HSA_AMD_EXT_API_TABLE_MAJOR_VERSION 0x01
+#define HSA_FINALIZER_API_TABLE_MAJOR_VERSION 0x01
+#define HSA_IMAGE_API_TABLE_MAJOR_VERSION 0x01
+#define HSA_AQLPROFILE_API_TABLE_MAJOR_VERSION 0x01
+
+// Step Ids of the Api tables exported by Hsa Core Runtime
+#define HSA_API_TABLE_STEP_VERSION 0x00
+#define HSA_CORE_API_TABLE_STEP_VERSION 0x00
+#define HSA_AMD_EXT_API_TABLE_STEP_VERSION 0x00
+#define HSA_FINALIZER_API_TABLE_STEP_VERSION 0x00
+#define HSA_IMAGE_API_TABLE_STEP_VERSION 0x00
+#define HSA_AQLPROFILE_API_TABLE_STEP_VERSION 0x00
+
+// Returns the smaller of a and b; used to copy Api Tables.
+static inline uint32_t Min(const uint32_t a, const uint32_t b) {
+ return (a > b) ? b : a;
+}
+
+// Declarations of APIs intended for use only by tools.
+typedef void (*hsa_amd_queue_intercept_packet_writer)(const void* pkts, uint64_t pkt_count);
+typedef void (*hsa_amd_queue_intercept_handler)(const void* pkts, uint64_t pkt_count,
+ uint64_t user_pkt_index, void* data,
+ hsa_amd_queue_intercept_packet_writer writer);
+hsa_status_t hsa_amd_queue_intercept_register(hsa_queue_t* queue,
+ hsa_amd_queue_intercept_handler callback,
+ void* user_data);
+hsa_status_t hsa_amd_queue_intercept_create(
+ hsa_agent_t agent_handle, uint32_t size, hsa_queue_type32_t type,
+ void (*callback)(hsa_status_t status, hsa_queue_t* source, void* data), void* data,
+ uint32_t private_segment_size, uint32_t group_segment_size, hsa_queue_t** queue);
+
+typedef void (*hsa_amd_runtime_queue_notifier)(const hsa_queue_t* queue, hsa_agent_t agent,
+ void* data);
+hsa_status_t hsa_amd_runtime_queue_create_register(hsa_amd_runtime_queue_notifier callback,
+ void* user_data);
+
+// Version structure used to identify an instance of an Api table.
+// Must be the first member (offsetof == 0) of all API tables.
+// This is the root of the table passing ABI.
+struct ApiTableVersion {
+ uint32_t major_id;
+ uint32_t minor_id;
+ uint32_t step_id;
+ uint32_t reserved;
+};
+
+// Table to export HSA Finalizer Extension Apis
+struct FinalizerExtTable {
+ ApiTableVersion version;
+ decltype(hsa_ext_program_create)* hsa_ext_program_create_fn;
+ decltype(hsa_ext_program_destroy)* hsa_ext_program_destroy_fn;
+ decltype(hsa_ext_program_add_module)* hsa_ext_program_add_module_fn;
+ decltype(hsa_ext_program_iterate_modules)* hsa_ext_program_iterate_modules_fn;
+ decltype(hsa_ext_program_get_info)* hsa_ext_program_get_info_fn;
+ decltype(hsa_ext_program_finalize)* hsa_ext_program_finalize_fn;
+};
+
+// Table to export HSA Image Extension Apis
+struct ImageExtTable {
+ ApiTableVersion version;
+ decltype(hsa_ext_image_get_capability)* hsa_ext_image_get_capability_fn;
+ decltype(hsa_ext_image_data_get_info)* hsa_ext_image_data_get_info_fn;
+ decltype(hsa_ext_image_create)* hsa_ext_image_create_fn;
+ decltype(hsa_ext_image_import)* hsa_ext_image_import_fn;
+ decltype(hsa_ext_image_export)* hsa_ext_image_export_fn;
+ decltype(hsa_ext_image_copy)* hsa_ext_image_copy_fn;
+ decltype(hsa_ext_image_clear)* hsa_ext_image_clear_fn;
+ decltype(hsa_ext_image_destroy)* hsa_ext_image_destroy_fn;
+ decltype(hsa_ext_sampler_create)* hsa_ext_sampler_create_fn;
+ decltype(hsa_ext_sampler_destroy)* hsa_ext_sampler_destroy_fn;
+ decltype(hsa_ext_image_get_capability_with_layout)* hsa_ext_image_get_capability_with_layout_fn;
+ decltype(hsa_ext_image_data_get_info_with_layout)* hsa_ext_image_data_get_info_with_layout_fn;
+ decltype(hsa_ext_image_create_with_layout)* hsa_ext_image_create_with_layout_fn;
+};
+
+// Table to export AMD Extension Apis
+struct AmdExtTable {
+ ApiTableVersion version;
+ decltype(hsa_amd_coherency_get_type)* hsa_amd_coherency_get_type_fn;
+ decltype(hsa_amd_coherency_set_type)* hsa_amd_coherency_set_type_fn;
+ decltype(hsa_amd_profiling_set_profiler_enabled)* hsa_amd_profiling_set_profiler_enabled_fn;
+ decltype(hsa_amd_profiling_async_copy_enable) *hsa_amd_profiling_async_copy_enable_fn;
+ decltype(hsa_amd_profiling_get_dispatch_time)* hsa_amd_profiling_get_dispatch_time_fn;
+ decltype(hsa_amd_profiling_get_async_copy_time) *hsa_amd_profiling_get_async_copy_time_fn;
+ decltype(hsa_amd_profiling_convert_tick_to_system_domain)* hsa_amd_profiling_convert_tick_to_system_domain_fn;
+ decltype(hsa_amd_signal_async_handler)* hsa_amd_signal_async_handler_fn;
+ decltype(hsa_amd_async_function)* hsa_amd_async_function_fn;
+ decltype(hsa_amd_signal_wait_any)* hsa_amd_signal_wait_any_fn;
+ decltype(hsa_amd_queue_cu_set_mask)* hsa_amd_queue_cu_set_mask_fn;
+ decltype(hsa_amd_memory_pool_get_info)* hsa_amd_memory_pool_get_info_fn;
+ decltype(hsa_amd_agent_iterate_memory_pools)* hsa_amd_agent_iterate_memory_pools_fn;
+ decltype(hsa_amd_memory_pool_allocate)* hsa_amd_memory_pool_allocate_fn;
+ decltype(hsa_amd_memory_pool_free)* hsa_amd_memory_pool_free_fn;
+ decltype(hsa_amd_memory_async_copy)* hsa_amd_memory_async_copy_fn;
+ decltype(hsa_amd_agent_memory_pool_get_info)* hsa_amd_agent_memory_pool_get_info_fn;
+ decltype(hsa_amd_agents_allow_access)* hsa_amd_agents_allow_access_fn;
+ decltype(hsa_amd_memory_pool_can_migrate)* hsa_amd_memory_pool_can_migrate_fn;
+ decltype(hsa_amd_memory_migrate)* hsa_amd_memory_migrate_fn;
+ decltype(hsa_amd_memory_lock)* hsa_amd_memory_lock_fn;
+ decltype(hsa_amd_memory_unlock)* hsa_amd_memory_unlock_fn;
+ decltype(hsa_amd_memory_fill)* hsa_amd_memory_fill_fn;
+ decltype(hsa_amd_interop_map_buffer)* hsa_amd_interop_map_buffer_fn;
+ decltype(hsa_amd_interop_unmap_buffer)* hsa_amd_interop_unmap_buffer_fn;
+ decltype(hsa_amd_image_create)* hsa_amd_image_create_fn;
+ decltype(hsa_amd_pointer_info)* hsa_amd_pointer_info_fn;
+ decltype(hsa_amd_pointer_info_set_userdata)* hsa_amd_pointer_info_set_userdata_fn;
+ decltype(hsa_amd_ipc_memory_create)* hsa_amd_ipc_memory_create_fn;
+ decltype(hsa_amd_ipc_memory_attach)* hsa_amd_ipc_memory_attach_fn;
+ decltype(hsa_amd_ipc_memory_detach)* hsa_amd_ipc_memory_detach_fn;
+ decltype(hsa_amd_signal_create)* hsa_amd_signal_create_fn;
+ decltype(hsa_amd_ipc_signal_create)* hsa_amd_ipc_signal_create_fn;
+ decltype(hsa_amd_ipc_signal_attach)* hsa_amd_ipc_signal_attach_fn;
+ decltype(hsa_amd_register_system_event_handler)* hsa_amd_register_system_event_handler_fn;
+ decltype(hsa_amd_queue_intercept_create)* hsa_amd_queue_intercept_create_fn;
+ decltype(hsa_amd_queue_intercept_register)* hsa_amd_queue_intercept_register_fn;
+ decltype(hsa_amd_queue_set_priority)* hsa_amd_queue_set_priority_fn;
+ decltype(hsa_amd_memory_async_copy_rect)* hsa_amd_memory_async_copy_rect_fn;
+ decltype(hsa_amd_runtime_queue_create_register)* hsa_amd_runtime_queue_create_register_fn;
+ decltype(hsa_amd_memory_lock_to_pool)* hsa_amd_memory_lock_to_pool_fn;
+ decltype(hsa_amd_register_deallocation_callback)* hsa_amd_register_deallocation_callback_fn;
+ decltype(hsa_amd_deregister_deallocation_callback)* hsa_amd_deregister_deallocation_callback_fn;
+};
+
+// Table to export HSA Core Runtime Apis
+struct CoreApiTable {
+ ApiTableVersion version;
+ decltype(hsa_init)* hsa_init_fn;
+ decltype(hsa_shut_down)* hsa_shut_down_fn;
+ decltype(hsa_system_get_info)* hsa_system_get_info_fn;
+ decltype(hsa_system_extension_supported)* hsa_system_extension_supported_fn;
+ decltype(hsa_system_get_extension_table)* hsa_system_get_extension_table_fn;
+ decltype(hsa_iterate_agents)* hsa_iterate_agents_fn;
+ decltype(hsa_agent_get_info)* hsa_agent_get_info_fn;
+ decltype(hsa_queue_create)* hsa_queue_create_fn;
+ decltype(hsa_soft_queue_create)* hsa_soft_queue_create_fn;
+ decltype(hsa_queue_destroy)* hsa_queue_destroy_fn;
+ decltype(hsa_queue_inactivate)* hsa_queue_inactivate_fn;
+ decltype(hsa_queue_load_read_index_scacquire)* hsa_queue_load_read_index_scacquire_fn;
+ decltype(hsa_queue_load_read_index_relaxed)* hsa_queue_load_read_index_relaxed_fn;
+ decltype(hsa_queue_load_write_index_scacquire)* hsa_queue_load_write_index_scacquire_fn;
+ decltype(hsa_queue_load_write_index_relaxed)* hsa_queue_load_write_index_relaxed_fn;
+ decltype(hsa_queue_store_write_index_relaxed)* hsa_queue_store_write_index_relaxed_fn;
+ decltype(hsa_queue_store_write_index_screlease)* hsa_queue_store_write_index_screlease_fn;
+ decltype(hsa_queue_cas_write_index_scacq_screl)* hsa_queue_cas_write_index_scacq_screl_fn;
+ decltype(hsa_queue_cas_write_index_scacquire)* hsa_queue_cas_write_index_scacquire_fn;
+ decltype(hsa_queue_cas_write_index_relaxed)* hsa_queue_cas_write_index_relaxed_fn;
+ decltype(hsa_queue_cas_write_index_screlease)* hsa_queue_cas_write_index_screlease_fn;
+ decltype(hsa_queue_add_write_index_scacq_screl)* hsa_queue_add_write_index_scacq_screl_fn;
+ decltype(hsa_queue_add_write_index_scacquire)* hsa_queue_add_write_index_scacquire_fn;
+ decltype(hsa_queue_add_write_index_relaxed)* hsa_queue_add_write_index_relaxed_fn;
+ decltype(hsa_queue_add_write_index_screlease)* hsa_queue_add_write_index_screlease_fn;
+ decltype(hsa_queue_store_read_index_relaxed)* hsa_queue_store_read_index_relaxed_fn;
+ decltype(hsa_queue_store_read_index_screlease)* hsa_queue_store_read_index_screlease_fn;
+ decltype(hsa_agent_iterate_regions)* hsa_agent_iterate_regions_fn;
+ decltype(hsa_region_get_info)* hsa_region_get_info_fn;
+ decltype(hsa_agent_get_exception_policies)* hsa_agent_get_exception_policies_fn;
+ decltype(hsa_agent_extension_supported)* hsa_agent_extension_supported_fn;
+ decltype(hsa_memory_register)* hsa_memory_register_fn;
+ decltype(hsa_memory_deregister)* hsa_memory_deregister_fn;
+ decltype(hsa_memory_allocate)* hsa_memory_allocate_fn;
+ decltype(hsa_memory_free)* hsa_memory_free_fn;
+ decltype(hsa_memory_copy)* hsa_memory_copy_fn;
+ decltype(hsa_memory_assign_agent)* hsa_memory_assign_agent_fn;
+ decltype(hsa_signal_create)* hsa_signal_create_fn;
+ decltype(hsa_signal_destroy)* hsa_signal_destroy_fn;
+ decltype(hsa_signal_load_relaxed)* hsa_signal_load_relaxed_fn;
+ decltype(hsa_signal_load_scacquire)* hsa_signal_load_scacquire_fn;
+ decltype(hsa_signal_store_relaxed)* hsa_signal_store_relaxed_fn;
+ decltype(hsa_signal_store_screlease)* hsa_signal_store_screlease_fn;
+ decltype(hsa_signal_wait_relaxed)* hsa_signal_wait_relaxed_fn;
+ decltype(hsa_signal_wait_scacquire)* hsa_signal_wait_scacquire_fn;
+ decltype(hsa_signal_and_relaxed)* hsa_signal_and_relaxed_fn;
+ decltype(hsa_signal_and_scacquire)* hsa_signal_and_scacquire_fn;
+ decltype(hsa_signal_and_screlease)* hsa_signal_and_screlease_fn;
+ decltype(hsa_signal_and_scacq_screl)* hsa_signal_and_scacq_screl_fn;
+ decltype(hsa_signal_or_relaxed)* hsa_signal_or_relaxed_fn;
+ decltype(hsa_signal_or_scacquire)* hsa_signal_or_scacquire_fn;
+ decltype(hsa_signal_or_screlease)* hsa_signal_or_screlease_fn;
+ decltype(hsa_signal_or_scacq_screl)* hsa_signal_or_scacq_screl_fn;
+ decltype(hsa_signal_xor_relaxed)* hsa_signal_xor_relaxed_fn;
+ decltype(hsa_signal_xor_scacquire)* hsa_signal_xor_scacquire_fn;
+ decltype(hsa_signal_xor_screlease)* hsa_signal_xor_screlease_fn;
+ decltype(hsa_signal_xor_scacq_screl)* hsa_signal_xor_scacq_screl_fn;
+ decltype(hsa_signal_exchange_relaxed)* hsa_signal_exchange_relaxed_fn;
+ decltype(hsa_signal_exchange_scacquire)* hsa_signal_exchange_scacquire_fn;
+ decltype(hsa_signal_exchange_screlease)* hsa_signal_exchange_screlease_fn;
+ decltype(hsa_signal_exchange_scacq_screl)* hsa_signal_exchange_scacq_screl_fn;
+ decltype(hsa_signal_add_relaxed)* hsa_signal_add_relaxed_fn;
+ decltype(hsa_signal_add_scacquire)* hsa_signal_add_scacquire_fn;
+ decltype(hsa_signal_add_screlease)* hsa_signal_add_screlease_fn;
+ decltype(hsa_signal_add_scacq_screl)* hsa_signal_add_scacq_screl_fn;
+ decltype(hsa_signal_subtract_relaxed)* hsa_signal_subtract_relaxed_fn;
+ decltype(hsa_signal_subtract_scacquire)* hsa_signal_subtract_scacquire_fn;
+ decltype(hsa_signal_subtract_screlease)* hsa_signal_subtract_screlease_fn;
+ decltype(hsa_signal_subtract_scacq_screl)* hsa_signal_subtract_scacq_screl_fn;
+ decltype(hsa_signal_cas_relaxed)* hsa_signal_cas_relaxed_fn;
+ decltype(hsa_signal_cas_scacquire)* hsa_signal_cas_scacquire_fn;
+ decltype(hsa_signal_cas_screlease)* hsa_signal_cas_screlease_fn;
+ decltype(hsa_signal_cas_scacq_screl)* hsa_signal_cas_scacq_screl_fn;
+
+ //===--- Instruction Set Architecture -----------------------------------===//
+
+ decltype(hsa_isa_from_name)* hsa_isa_from_name_fn;
+ // Deprecated since v1.1.
+ decltype(hsa_isa_get_info)* hsa_isa_get_info_fn;
+ // Deprecated since v1.1.
+ decltype(hsa_isa_compatible)* hsa_isa_compatible_fn;
+
+ //===--- Code Objects (deprecated) --------------------------------------===//
+
+ // Deprecated since v1.1.
+ decltype(hsa_code_object_serialize)* hsa_code_object_serialize_fn;
+ // Deprecated since v1.1.
+ decltype(hsa_code_object_deserialize)* hsa_code_object_deserialize_fn;
+ // Deprecated since v1.1.
+ decltype(hsa_code_object_destroy)* hsa_code_object_destroy_fn;
+ // Deprecated since v1.1.
+ decltype(hsa_code_object_get_info)* hsa_code_object_get_info_fn;
+ // Deprecated since v1.1.
+ decltype(hsa_code_object_get_symbol)* hsa_code_object_get_symbol_fn;
+ // Deprecated since v1.1.
+ decltype(hsa_code_symbol_get_info)* hsa_code_symbol_get_info_fn;
+ // Deprecated since v1.1.
+ decltype(hsa_code_object_iterate_symbols)* hsa_code_object_iterate_symbols_fn;
+
+ //===--- Executable -----------------------------------------------------===//
+
+ // Deprecated since v1.1.
+ decltype(hsa_executable_create)* hsa_executable_create_fn;
+ decltype(hsa_executable_destroy)* hsa_executable_destroy_fn;
+ // Deprecated since v1.1.
+ decltype(hsa_executable_load_code_object)* hsa_executable_load_code_object_fn;
+ decltype(hsa_executable_freeze)* hsa_executable_freeze_fn;
+ decltype(hsa_executable_get_info)* hsa_executable_get_info_fn;
+ decltype(hsa_executable_global_variable_define)*
+ hsa_executable_global_variable_define_fn;
+ decltype(hsa_executable_agent_global_variable_define)*
+ hsa_executable_agent_global_variable_define_fn;
+ decltype(hsa_executable_readonly_variable_define)*
+ hsa_executable_readonly_variable_define_fn;
+ decltype(hsa_executable_validate)* hsa_executable_validate_fn;
+ // Deprecated since v1.1.
+ decltype(hsa_executable_get_symbol)* hsa_executable_get_symbol_fn;
+ decltype(hsa_executable_symbol_get_info)* hsa_executable_symbol_get_info_fn;
+ // Deprecated since v1.1.
+ decltype(hsa_executable_iterate_symbols)* hsa_executable_iterate_symbols_fn;
+
+ //===--- Runtime Notifications ------------------------------------------===//
+
+ decltype(hsa_status_string)* hsa_status_string_fn;
+
+ // Start HSA v1.1 additions
+ decltype(hsa_extension_get_name)* hsa_extension_get_name_fn;
+ decltype(hsa_system_major_extension_supported)* hsa_system_major_extension_supported_fn;
+ decltype(hsa_system_get_major_extension_table)* hsa_system_get_major_extension_table_fn;
+ decltype(hsa_agent_major_extension_supported)* hsa_agent_major_extension_supported_fn;
+ decltype(hsa_cache_get_info)* hsa_cache_get_info_fn;
+ decltype(hsa_agent_iterate_caches)* hsa_agent_iterate_caches_fn;
+ decltype(hsa_signal_silent_store_relaxed)* hsa_signal_silent_store_relaxed_fn;
+ decltype(hsa_signal_silent_store_screlease)* hsa_signal_silent_store_screlease_fn;
+ decltype(hsa_signal_group_create)* hsa_signal_group_create_fn;
+ decltype(hsa_signal_group_destroy)* hsa_signal_group_destroy_fn;
+ decltype(hsa_signal_group_wait_any_scacquire)* hsa_signal_group_wait_any_scacquire_fn;
+ decltype(hsa_signal_group_wait_any_relaxed)* hsa_signal_group_wait_any_relaxed_fn;
+
+ //===--- Instruction Set Architecture - HSA v1.1 additions --------------===//
+
+ decltype(hsa_agent_iterate_isas)* hsa_agent_iterate_isas_fn;
+ decltype(hsa_isa_get_info_alt)* hsa_isa_get_info_alt_fn;
+ decltype(hsa_isa_get_exception_policies)* hsa_isa_get_exception_policies_fn;
+ decltype(hsa_isa_get_round_method)* hsa_isa_get_round_method_fn;
+ decltype(hsa_wavefront_get_info)* hsa_wavefront_get_info_fn;
+ decltype(hsa_isa_iterate_wavefronts)* hsa_isa_iterate_wavefronts_fn;
+
+ //===--- Code Objects (deprecated) - HSA v1.1 additions -----------------===//
+
+ // Deprecated since v1.1.
+ decltype(hsa_code_object_get_symbol_from_name)*
+ hsa_code_object_get_symbol_from_name_fn;
+
+ //===--- Executable - HSA v1.1 additions --------------------------------===//
+
+ decltype(hsa_code_object_reader_create_from_file)*
+ hsa_code_object_reader_create_from_file_fn;
+ decltype(hsa_code_object_reader_create_from_memory)*
+ hsa_code_object_reader_create_from_memory_fn;
+ decltype(hsa_code_object_reader_destroy)* hsa_code_object_reader_destroy_fn;
+ decltype(hsa_executable_create_alt)* hsa_executable_create_alt_fn;
+ decltype(hsa_executable_load_program_code_object)*
+ hsa_executable_load_program_code_object_fn;
+ decltype(hsa_executable_load_agent_code_object)*
+ hsa_executable_load_agent_code_object_fn;
+ decltype(hsa_executable_validate_alt)* hsa_executable_validate_alt_fn;
+ decltype(hsa_executable_get_symbol_by_name)*
+ hsa_executable_get_symbol_by_name_fn;
+ decltype(hsa_executable_iterate_agent_symbols)*
+ hsa_executable_iterate_agent_symbols_fn;
+ decltype(hsa_executable_iterate_program_symbols)*
+ hsa_executable_iterate_program_symbols_fn;
+};
+
+// Root table that exports the HSA Apis of the Core Runtime, the Amd
+// Extensions, the Finalizer and Images through one pointer per child table.
+struct HsaApiTable {
+ // Layout note: copyTables compares offsetof() of each pointer below against
+ // version.minor_id, which HsaApiTableContainer sets to sizeof(HsaApiTable).
+ ApiTableVersion version;
+
+ // Table of function pointers to HSA Core Runtime
+ CoreApiTable* core_;
+
+ // Table of function pointers to AMD extensions
+ AmdExtTable* amd_ext_;
+
+ // Table of function pointers to HSA Finalizer Extension
+ FinalizerExtTable* finalizer_ext_;
+
+ // Table of function pointers to HSA Image Extension
+ ImageExtTable* image_ext_;
+};
+
+// Bundles one root table with an instance of every child table it points to.
+struct HsaApiTableContainer {
+  HsaApiTable root;
+  CoreApiTable core;
+  AmdExtTable amd_ext;
+  FinalizerExtTable finalizer_ext;
+  ImageExtTable image_ext;
+
+  // Links root to the sibling child tables, then stamps every table's version:
+  // the HSA_*_MAJOR_VERSION / HSA_*_STEP_VERSION constants, and sizeof() as minor id.
+  HsaApiTableContainer() {
+    root.core_ = &core;
+    root.amd_ext_ = &amd_ext;
+    root.finalizer_ext_ = &finalizer_ext;
+    root.image_ext_ = &image_ext;
+
+    root.version.major_id = HSA_API_TABLE_MAJOR_VERSION;
+    core.version.major_id = HSA_CORE_API_TABLE_MAJOR_VERSION;
+    amd_ext.version.major_id = HSA_AMD_EXT_API_TABLE_MAJOR_VERSION;
+    finalizer_ext.version.major_id = HSA_FINALIZER_API_TABLE_MAJOR_VERSION;
+    image_ext.version.major_id = HSA_IMAGE_API_TABLE_MAJOR_VERSION;
+
+    root.version.minor_id = sizeof(HsaApiTable);
+    core.version.minor_id = sizeof(CoreApiTable);
+    amd_ext.version.minor_id = sizeof(AmdExtTable);
+    finalizer_ext.version.minor_id = sizeof(FinalizerExtTable);
+    image_ext.version.minor_id = sizeof(ImageExtTable);
+
+    root.version.step_id = HSA_API_TABLE_STEP_VERSION;
+    core.version.step_id = HSA_CORE_API_TABLE_STEP_VERSION;
+    amd_ext.version.step_id = HSA_AMD_EXT_API_TABLE_STEP_VERSION;
+    finalizer_ext.version.step_id = HSA_FINALIZER_API_TABLE_STEP_VERSION;
+    image_ext.version.step_id = HSA_IMAGE_API_TABLE_STEP_VERSION;
+  }
+};
+
+// Api to copy function pointers of a table: skips the ApiTableVersion header
+// both tables are expected to start with and copies the rest of the size bytes
+// from src into dest. The destination comes first, as in copyElement's call.
+static void inline copyApi(void* dest, void* src, size_t size) {
+  assert(size >= sizeof(ApiTableVersion));
+  memcpy((char*)dest + sizeof(ApiTableVersion), (char*)src + sizeof(ApiTableVersion),
+         (size - sizeof(ApiTableVersion)));
+}
+
+// Copy one Api child table if src's major id is non-zero and matches dest's.
+static void inline copyElement(ApiTableVersion* dest, ApiTableVersion* src) {
+  if (!src->major_id || (dest->major_id != src->major_id)) {  // invalid: zero dest
+    dest->major_id = 0;
+    dest->minor_id = 0;
+    dest->step_id = 0;
+    return;
+  }
+  dest->minor_id = Min(dest->minor_id, src->minor_id);  // byte size handed to copyApi
+  dest->step_id = src->step_id;
+  copyApi(dest, src, dest->minor_id);
+}
+
+// Copies all Api tables from src into dest. The caller must already have
+// initialized dest's tables container with the Major, Minor and Stepping
+// Ids of the Root and Child Api tables. On a Major Id match the Root Minor
+// Id of dest becomes the minimum of both Minor Ids and its Stepping Id is
+// taken from src; on a mismatch the Root version of dest is zeroed and no
+// Child table is copied.
+static void inline copyTables(const HsaApiTable* src, HsaApiTable* dest) {
+  if (dest->version.major_id == src->version.major_id) {
+    // The minor id encodes the struct size, so keep the smaller of the two;
+    // the stepping id always comes from the source.
+    dest->version.minor_id = Min(dest->version.minor_id, src->version.minor_id);
+    dest->version.step_id = src->version.step_id;
+
+    // Copy each child table whose pointer member starts inside the root
+    // size that was just agreed on.
+    if (offsetof(HsaApiTable, core_) < dest->version.minor_id)
+      copyElement(&dest->core_->version, &src->core_->version);
+    if (offsetof(HsaApiTable, amd_ext_) < dest->version.minor_id)
+      copyElement(&dest->amd_ext_->version, &src->amd_ext_->version);
+    if (offsetof(HsaApiTable, finalizer_ext_) < dest->version.minor_id)
+      copyElement(&dest->finalizer_ext_->version, &src->finalizer_ext_->version);
+    if (offsetof(HsaApiTable, image_ext_) < dest->version.minor_id)
+      copyElement(&dest->image_ext_->version, &src->image_ext_->version);
+    return;
+  }
+
+  // Major Ids of source and destination differ: invalidate the destination.
+  dest->version.major_id = 0;
+  dest->version.minor_id = 0;
+  dest->version.step_id = 0;
+}
+#endif
diff --git a/third_party/rocm/include/hsa/hsa_ext_amd.h b/third_party/rocm/include/hsa/hsa_ext_amd.h
new file mode 100644
index 0000000..04a6e4d
--- /dev/null
+++ b/third_party/rocm/include/hsa/hsa_ext_amd.h
@@ -0,0 +1,1983 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+// AMD Research and AMD HSA Software Development
+//
+// Advanced Micro Devices, Inc.
+//
+// www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimers.
+// - Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimers in
+// the documentation and/or other materials provided with the distribution.
+// - Neither the names of Advanced Micro Devices, Inc,
+// nor the names of its contributors may be used to endorse or promote
+// products derived from this Software without specific prior written
+// permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+// HSA AMD extension.
+
+#ifndef HSA_RUNTIME_EXT_AMD_H_
+#define HSA_RUNTIME_EXT_AMD_H_
+
+#include "hsa.h"
+#include "hsa_ext_image.h"
+
+#define HSA_AMD_INTERFACE_VERSION_MAJOR 1
+#define HSA_AMD_INTERFACE_VERSION_MINOR 0
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @brief Enumeration constants added to ::hsa_status_t.
+ *
+ * @remark The enum below is anonymous, i.e. declared apart from ::hsa_status_t.
+ */
+enum {
+ /**
+ * The memory pool is invalid.
+ */
+ HSA_STATUS_ERROR_INVALID_MEMORY_POOL = 40,
+
+ /**
+ * Agent accessed memory beyond the maximum legal address.
+ */
+ HSA_STATUS_ERROR_MEMORY_APERTURE_VIOLATION = 41,
+
+ /**
+ * Agent executed an invalid shader instruction.
+ */
+ HSA_STATUS_ERROR_ILLEGAL_INSTRUCTION = 42,
+};
+
+/**
+ * @brief Agent attributes.
+ */
+typedef enum hsa_amd_agent_info_s {
+ /**
+ * Chip identifier. The type of this attribute is uint32_t.
+ */
+ HSA_AMD_AGENT_INFO_CHIP_ID = 0xA000,
+ /**
+ * Size of a cacheline in bytes. The type of this attribute is uint32_t.
+ */
+ HSA_AMD_AGENT_INFO_CACHELINE_SIZE = 0xA001,
+ /**
+ * The number of compute units available in the agent. The type of this
+ * attribute is uint32_t.
+ */
+ HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT = 0xA002,
+ /**
+ * The maximum clock frequency of the agent in MHz. The type of this
+ * attribute is uint32_t.
+ */
+ HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY = 0xA003,
+ /**
+ * Internal driver node identifier. The type of this attribute is uint32_t.
+ */
+ HSA_AMD_AGENT_INFO_DRIVER_NODE_ID = 0xA004,
+ /**
+ * Max number of watch points on memory address ranges to generate exception
+ * events when the watched addresses are accessed. The type of this
+ * attribute is uint32_t.
+ */
+ HSA_AMD_AGENT_INFO_MAX_ADDRESS_WATCH_POINTS = 0xA005,
+ /**
+ * Agent BDF_ID, named LocationID in thunk. The type of this attribute is
+ * uint32_t.
+ */
+ HSA_AMD_AGENT_INFO_BDFID = 0xA006,
+ /**
+ * Memory interface width. The type of this attribute is uint32_t.
+ * This attribute is deprecated.
+ */
+ HSA_AMD_AGENT_INFO_MEMORY_WIDTH = 0xA007,
+ /**
+ * Max memory clock. The type of this attribute is uint32_t.
+ */
+ HSA_AMD_AGENT_INFO_MEMORY_MAX_FREQUENCY = 0xA008,
+ /**
+ * Board name of Agent - populated from MarketingName of Kfd Node.
+ * The value is an Ascii string of 64 chars.
+ */
+ HSA_AMD_AGENT_INFO_PRODUCT_NAME = 0xA009,
+ /**
+ * Maximum number of waves possible in a Compute Unit.
+ * The type of this attribute is uint32_t.
+ */
+ HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU = 0xA00A,
+ /**
+ * Number of SIMDs per compute unit (CU).
+ * The type of this attribute is uint32_t.
+ */
+ HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU = 0xA00B,
+ /**
+ * Number of Shader Engines (SE) in the Gpu.
+ * The type of this attribute is uint32_t.
+ */
+ HSA_AMD_AGENT_INFO_NUM_SHADER_ENGINES = 0xA00C,
+ /**
+ * Number of Shader Arrays per Shader Engine in the Gpu.
+ * The type of this attribute is uint32_t.
+ */
+ HSA_AMD_AGENT_INFO_NUM_SHADER_ARRAYS_PER_SE = 0xA00D,
+ /**
+ * Address of the HDP flush registers. Use of these registers does not conform to the HSA memory
+ * model and should be treated with caution.
+ * The type of this attribute is hsa_amd_hdp_flush_t.
+ */
+ HSA_AMD_AGENT_INFO_HDP_FLUSH = 0xA00E,
+ /**
+ * PCIe domain for the agent. Pairs with HSA_AMD_AGENT_INFO_BDFID
+ * to give the full physical location of the Agent.
+ * The type of this attribute is uint32_t.
+ */
+ HSA_AMD_AGENT_INFO_DOMAIN = 0xA00F,
+ /**
+ * Queries for support of cooperative queues. See ::HSA_QUEUE_TYPE_COOPERATIVE.
+ * The type of this attribute is bool.
+ */
+ HSA_AMD_AGENT_INFO_COOPERATIVE_QUEUES = 0xA010,
+ /**
+ * Queries UUID of an agent. The value is an Ascii string with a maximum
+ * of 21 chars including NUL. The string value consists of two parts: header
+ * and body. The header identifies device type (GPU, CPU, DSP) while body
+ * encodes UUID as a 16 digit hex string.
+ *
+ * Agents that do not support UUID will return the string "GPU-XX" or
+ * "CPU-XX" or "DSP-XX" depending upon their device type ::hsa_device_type_t.
+ */
+ HSA_AMD_AGENT_INFO_UUID = 0xA011,
+ /**
+ * Queries for the ASIC revision of an agent. The value is an integer that
+ * increments for each revision. This can be used by user-level software to
+ * change how it operates, depending on the hardware version. This allows
+ * selective workarounds for hardware errata.
+ * The type of this attribute is uint32_t.
+ */
+ HSA_AMD_AGENT_INFO_ASIC_REVISION = 0xA012
+} hsa_amd_agent_info_t;
+
+typedef struct hsa_amd_hdp_flush_s {  // Value type of HSA_AMD_AGENT_INFO_HDP_FLUSH.
+ uint32_t* HDP_MEM_FLUSH_CNTL;  // HDP flush register address (memory flush, going by the name - confirm).
+ uint32_t* HDP_REG_FLUSH_CNTL;  // HDP flush register address (register flush, going by the name - confirm).
+} hsa_amd_hdp_flush_t;
+
+/**
+ * @brief Region attributes.
+ */
+typedef enum hsa_amd_region_info_s {
+ /**
+ * Determine if host can access the region. The type of this attribute
+ * is bool.
+ */
+ HSA_AMD_REGION_INFO_HOST_ACCESSIBLE = 0xA000,
+ /**
+ * Base address of the region in flat address space. (Value type not stated here.)
+ */
+ HSA_AMD_REGION_INFO_BASE = 0xA001,
+ /**
+ * Memory interface width. The type of this attribute is uint32_t. This attribute is
+ * deprecated. Use HSA_AMD_AGENT_INFO_MEMORY_WIDTH (itself documented as deprecated).
+ */
+ HSA_AMD_REGION_INFO_BUS_WIDTH = 0xA002,
+ /**
+ * Max memory clock. The type of this attribute is uint32_t.
+ * This attribute is deprecated. Use HSA_AMD_AGENT_INFO_MEMORY_MAX_FREQUENCY.
+ */
+ HSA_AMD_REGION_INFO_MAX_CLOCK_FREQUENCY = 0xA003
+} hsa_amd_region_info_t;
+
+/**
+ * @brief Coherency attributes of fine grain region (see hsa_amd_coherency_get_type).
+ */
+typedef enum hsa_amd_coherency_type_s {
+ /**
+ * Coherent region.
+ */
+ HSA_AMD_COHERENCY_TYPE_COHERENT = 0,
+ /**
+ * Non coherent region.
+ */
+ HSA_AMD_COHERENCY_TYPE_NONCOHERENT = 1
+} hsa_amd_coherency_type_t;
+
+/**
+ * @brief Get the coherency type of the fine grain region of an agent.
+ *
+ * @param[in] agent A valid agent.
+ *
+ * @param[out] type Pointer to a memory location where the HSA runtime will
+ * store the coherency type of the fine grain region.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p type is NULL.
+ */
+hsa_status_t HSA_API hsa_amd_coherency_get_type(hsa_agent_t agent,
+ hsa_amd_coherency_type_t* type);
+
+/**
+ * @brief Set the coherency type of the fine grain region of an agent.
+ * Deprecated. This is supported on KV platforms. For backward compatibility
+ * other platforms will spuriously succeed.
+ *
+ * @param[in] agent A valid agent.
+ *
+ * @param[in] type The coherency type to be set.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p type is invalid.
+ */
+hsa_status_t HSA_API hsa_amd_coherency_set_type(hsa_agent_t agent,
+ hsa_amd_coherency_type_t type);
+
+/**
+ * @brief Profiling dispatch time information (output of hsa_amd_profiling_get_dispatch_time).
+ *
+ * Times are reported as ticks in the domain of the HSA system clock.
+ * The HSA system clock tick and frequency are obtained via hsa_system_get_info.
+ */
+typedef struct hsa_amd_profiling_dispatch_time_s {
+ /**
+ * Dispatch packet processing start time.
+ */
+ uint64_t start;
+ /**
+ * Dispatch packet completion time.
+ */
+ uint64_t end;
+} hsa_amd_profiling_dispatch_time_t;
+
+/**
+ * @brief Profiling async copy time information (output of hsa_amd_profiling_get_async_copy_time).
+ *
+ * Times are reported as ticks in the domain of the HSA system clock.
+ * The HSA system clock tick and frequency are obtained via hsa_system_get_info.
+ */
+typedef struct hsa_amd_profiling_async_copy_time_s {
+ /**
+ * Async copy processing start time.
+ */
+ uint64_t start;
+ /**
+ * Async copy completion time.
+ */
+ uint64_t end;
+} hsa_amd_profiling_async_copy_time_t;
+
+/**
+ * @brief Enable or disable profiling capability of a queue.
+ *
+ * @param[in] queue A valid queue.
+ *
+ * @param[in] enable 1 to enable profiling. 0 to disable profiling.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE The queue is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p queue is NULL.
+ */
+hsa_status_t HSA_API
+ hsa_amd_profiling_set_profiler_enabled(hsa_queue_t* queue, int enable);
+
+/**
+ * @brief Enable or disable asynchronous memory copy profiling.
+ *
+ * @details The runtime will provide the copy processing start timestamp and
+ * completion timestamp of each call to hsa_amd_memory_async_copy if the
+ * async copy profiling is enabled prior to the call to
+ * hsa_amd_memory_async_copy. The completion signal object is used to
+ * hold the last async copy start and end timestamp. The client can retrieve
+ * these timestamps via call to hsa_amd_profiling_get_async_copy_time.
+ *
+ * @param[in] enable True to enable profiling. False to disable profiling.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Failed on allocating resources
+ * needed to profile the asynchronous copy.
+ */
+hsa_status_t HSA_API
+ hsa_amd_profiling_async_copy_enable(bool enable);
+
+/**
+ * @brief Retrieve packet processing time stamps.
+ *
+ * @param[in] agent The agent with which the signal was last used. For
+ * instance, if the profiled dispatch packet is dispatched onto queue Q,
+ * which was created on agent A, then this parameter must be A.
+ *
+ * @param[in] signal A signal used as the completion signal of the dispatch
+ * packet to retrieve time stamps from. This dispatch packet must have been
+ * issued to a queue with profiling enabled and have already completed. Also
+ * the signal must not have yet been used in any other packet following the
+ * completion of the profiled dispatch packet.
+ *
+ * @param[out] time Packet processing timestamps in the HSA system clock
+ * domain.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL The signal is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p time is NULL.
+ */
+hsa_status_t HSA_API hsa_amd_profiling_get_dispatch_time(
+ hsa_agent_t agent, hsa_signal_t signal,
+ hsa_amd_profiling_dispatch_time_t* time);
+
+/**
+ * @brief Retrieve asynchronous copy timestamps.
+ *
+ * @details Async copy profiling is enabled via call to
+ * hsa_amd_profiling_async_copy_enable.
+ *
+ * @param[in] signal A signal used as the completion signal of the call to
+ * hsa_amd_memory_async_copy.
+ *
+ * @param[out] time Async copy processing timestamps in the HSA system clock
+ * domain.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL The signal is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p time is NULL.
+ */
+hsa_status_t HSA_API hsa_amd_profiling_get_async_copy_time(
+ hsa_signal_t signal, hsa_amd_profiling_async_copy_time_t* time);
+
+/**
+ * @brief Computes the frequency ratio and offset between the agent clock and
+ * HSA system clock and converts the agent's tick to HSA system domain tick.
+ *
+ * @param[in] agent The agent used to retrieve the agent_tick. It is user's
+ * responsibility to make sure the tick number is from this agent, otherwise,
+ * the behavior is undefined.
+ *
+ * @param[in] agent_tick The tick count retrieved from the specified @p agent.
+ *
+ * @param[out] system_tick The translated HSA system domain clock counter tick.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p system_tick is NULL.
+ */
+hsa_status_t HSA_API
+ hsa_amd_profiling_convert_tick_to_system_domain(hsa_agent_t agent,
+ uint64_t agent_tick,
+ uint64_t* system_tick);
+
+/**
+ * @brief Signal attribute flags, OR-combined into hsa_amd_signal_create's @p attributes.
+ */
+typedef enum {
+ /**
+ * Signal will only be consumed by AMD GPUs. Limits signal consumption to
+ * AMD GPU agents only. Ignored if @p num_consumers is not zero (all agents).
+ */
+ HSA_AMD_SIGNAL_AMD_GPU_ONLY = 1,
+ /**
+ * Signal may be used for interprocess communication.
+ * IPC signals can be read, written, and waited on from any process.
+ * Profiling using an IPC enabled signal is only supported in a single process
+ * at a time. Producing profiling data in one process and consuming it in
+ * another process is undefined.
+ */
+ HSA_AMD_SIGNAL_IPC = 2,
+} hsa_amd_signal_attribute_t;
+
+/**
+ * @brief Create a signal with specific attributes.
+ *
+ * @param[in] initial_value Initial value of the signal.
+ *
+ * @param[in] num_consumers Size of @p consumers. A value of 0 indicates that
+ * any agent might wait on the signal.
+ *
+ * @param[in] consumers List of agents that might consume (wait on) the
+ * signal. If @p num_consumers is 0, this argument is ignored; otherwise, the
+ * HSA runtime might use the list to optimize the handling of the signal
+ * object. If an agent not listed in @p consumers waits on the returned
+ * signal, the behavior is undefined. The memory associated with @p consumers
+ * can be reused or freed after the function returns.
+ *
+ * @param[in] attributes Requested signal attributes. Multiple signal attributes
+ * may be requested by combining them with bitwise OR. Requesting no attributes
+ * (@p attributes == 0) results in the same signal as would have been obtained
+ * via hsa_signal_create.
+ *
+ * @param[out] signal Pointer to a memory location where the HSA runtime will
+ * store the newly created signal handle. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate
+ * the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p signal is NULL, @p
+ * num_consumers is greater than 0 but @p consumers is NULL, or @p consumers
+ * contains duplicates.
+ */
+hsa_status_t HSA_API hsa_amd_signal_create(hsa_signal_value_t initial_value, uint32_t num_consumers,
+ const hsa_agent_t* consumers, uint64_t attributes,
+ hsa_signal_t* signal);
+
+/**
+ * @brief Asynchronous signal handler function type.
+ *
+ * @details Type definition of callback function to be used with
+ * hsa_amd_signal_async_handler. This callback is invoked if the associated
+ * signal and condition are met. The callback receives the value of the signal
+ * which satisfied the associated wait condition and a user provided value. If
+ * the callback returns true then the callback will be called again if the
+ * associated signal and condition are satisfied again. If the callback returns
+ * false then it will not be called again.
+ *
+ * @param[in] value Contains the value of the signal observed by
+ * hsa_amd_signal_async_handler which caused the signal handler to be invoked.
+ *
+ * @param[in] arg Contains the user provided value given when the signal handler
+ * was registered with hsa_amd_signal_async_handler
+ *
+ * @retval true resumes monitoring the signal with this handler (as if calling
+ * hsa_amd_signal_async_handler again with identical parameters)
+ *
+ * @retval false stops monitoring the signal with this handler (handler will
+ * not be called again for this signal)
+ *
+ */
+typedef bool (*hsa_amd_signal_handler)(hsa_signal_value_t value, void* arg);
+
+/**
+ * @brief Register asynchronous signal handler function.
+ *
+ * @details Allows registering a callback function and user provided value with
+ * a signal and wait condition. The callback will be invoked if the associated
+ * signal and wait condition are satisfied. Callbacks will be invoked serially
+ * but in an arbitrary order so callbacks should be independent of each other.
+ * After being invoked a callback may continue to wait for its associated signal
+ * and condition and, possibly, be invoked again. Or the callback may stop
+ * waiting. If the callback returns true then it will continue waiting and may
+ * be called again. If false then the callback will not wait again and will not
+ * be called again for the associated signal and condition. It is possible to
+ * register the same callback multiple times with the same or different signals
+ * and/or conditions. Each registration of the callback will be treated entirely
+ * independently.
+ *
+ * @param[in] signal hsa signal to be asynchronously monitored
+ *
+ * @param[in] cond condition value to monitor for
+ *
+ * @param[in] value signal value used in condition expression
+ *
+ * @param[in] handler asynchronous signal handler invoked when signal's
+ * condition is met
+ *
+ * @param[in] arg user provided value which is provided to handler when handler
+ * is invoked
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL signal is not a valid hsa_signal_t
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT handler is invalid (NULL)
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime is out of
+ * resources or blocking signals are not supported by the HSA driver component.
+ *
+ */
+hsa_status_t HSA_API
+ hsa_amd_signal_async_handler(hsa_signal_t signal,
+ hsa_signal_condition_t cond,
+ hsa_signal_value_t value,
+ hsa_amd_signal_handler handler, void* arg);
+
+/**
+ * @brief Call a function asynchronously
+ *
+ * @details Provides access to the runtime's asynchronous event handling thread
+ * for general asynchronous functions. Functions queued this way are executed
+ * in the same manner as if they were a signal handler whose signal is
+ * satisfied.
+ *
+ * @param[in] callback asynchronous function to be invoked
+ *
+ * @param[in] arg user provided value which is passed to @p callback when it
+ * is invoked
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is invalid (NULL)
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime is out of
+ * resources or blocking signals are not supported by the HSA driver component.
+ *
+ */
+hsa_status_t HSA_API
+ hsa_amd_async_function(void (*callback)(void* arg), void* arg);
+
+/**
+ * @brief Wait for any signal-condition pair to be satisfied.
+ *
+ * @details Allows waiting for any of several signal and conditions pairs to be
+ * satisfied. The function returns the index into the list of signals of the
+ * first satisfying signal-condition pair. The value of the satisfying signal's
+ * value is returned in satisfying_value unless satisfying_value is NULL. This
+ * function provides only relaxed memory semantics.
+ */
+uint32_t HSA_API
+ hsa_amd_signal_wait_any(uint32_t signal_count, hsa_signal_t* signals,
+ hsa_signal_condition_t* conds,
+ hsa_signal_value_t* values, uint64_t timeout_hint,
+ hsa_wait_state_t wait_hint,
+ hsa_signal_value_t* satisfying_value);
+
+/**
+ * @brief Query image limits.
+ *
+ * @param[in] agent A valid agent.
+ *
+ * @param[in] attribute HSA image info attribute to query.
+ *
+ * @param[out] value Pointer to an application-allocated buffer where to store
+ * the value of the attribute. If the buffer passed by the application is not
+ * large enough to hold the value of @p attribute, the behavior is undefined.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p value is NULL or @p attribute <
+ * HSA_EXT_AGENT_INFO_IMAGE_1D_MAX_ELEMENTS or @p attribute >
+ * HSA_EXT_AGENT_INFO_IMAGE_ARRAY_MAX_LAYERS.
+ *
+ */
+hsa_status_t HSA_API hsa_amd_image_get_info_max_dim(hsa_agent_t agent,
+ hsa_agent_info_t attribute,
+ void* value);
+
+/**
+ * @brief Set a CU affinity for a specific queue within the process. This
+ * function call is "atomic".
+ *
+ * @param[in] queue A pointer to HSA queue.
+ *
+ * @param[in] num_cu_mask_count Size of CUMask bit array passed in.
+ *
+ * @param[in] cu_mask Bit-vector representing the CU mask.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE @p queue is NULL or invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p num_cu_mask_count is not a
+ * multiple of 32 or @p cu_mask is NULL.
+ *
+ * @retval ::HSA_STATUS_ERROR Failed to call the thunk API.
+ *
+ */
+hsa_status_t HSA_API hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue,
+ uint32_t num_cu_mask_count,
+ const uint32_t* cu_mask);
+
+/**
+ * @brief Memory segments associated with a memory pool.
+ *
+ * @details This is the type of the ::HSA_AMD_MEMORY_POOL_INFO_SEGMENT
+ * attribute.
+ */
+typedef enum {
+ /**
+ * Global segment. Used to hold data that is shared by all agents.
+ */
+ HSA_AMD_SEGMENT_GLOBAL = 0,
+ /**
+ * Read-only segment. Used to hold data that remains constant during the
+ * execution of a kernel.
+ */
+ HSA_AMD_SEGMENT_READONLY = 1,
+ /**
+ * Private segment. Used to hold data that is local to a single work-item.
+ */
+ HSA_AMD_SEGMENT_PRIVATE = 2,
+ /**
+ * Group segment. Used to hold data that is shared by the work-items of a
+ * work-group.
+ */
+ HSA_AMD_SEGMENT_GROUP = 3,
+} hsa_amd_segment_t;
+
+/**
+ * @brief A memory pool encapsulates physical storage on an agent
+ * along with a memory access model.
+ *
+ * @details A memory pool encapsulates a physical partition of an agent's
+ * memory system along with a memory access model. Division of a single
+ * memory system into separate pools allows querying each partition's access
+ * path properties (see ::hsa_amd_agent_memory_pool_get_info). Allocations
+ * from a pool are preferentially bound to that pool's physical partition.
+ * Binding to the pool's preferential physical partition may not be
+ * possible or persistent depending on the system's memory policy
+ * and/or state which is beyond the scope of HSA APIs.
+ *
+ * For example, a multi-node NUMA memory system may be represented by multiple
+ * pools, with each pool providing size and access path information for the
+ * partition it represents. Allocations from a pool are preferentially bound
+ * to the pool's partition (which in this example is a NUMA node) while
+ * following its memory access model. The actual placement may vary or migrate
+ * due to the system's NUMA policy and state, which is beyond the scope of
+ * HSA APIs.
+ */
+typedef struct hsa_amd_memory_pool_s {
+ /**
+ * Opaque handle.
+ */
+ uint64_t handle;
+} hsa_amd_memory_pool_t;
+
+/**
+ * @brief Flags of a memory pool in the global segment.
+ *
+ * @details The ::HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS attribute is a
+ * bit-field of these values.
+ */
+typedef enum hsa_amd_memory_pool_global_flag_s {
+ /**
+ * The application can use allocations in the memory pool to store kernel
+ * arguments, and provide the values for the kernarg segment of
+ * a kernel dispatch.
+ */
+ HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT = 1,
+ /**
+ * Updates to memory in this pool conform to HSA memory consistency model.
+ * If this flag is set, then ::HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED
+ * must not be set.
+ */
+ HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED = 2,
+ /**
+ * Writes to memory in this pool can be performed by a single agent at a time.
+ */
+ HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED = 4
+} hsa_amd_memory_pool_global_flag_t;
+
+/**
+ * @brief Memory pool attributes that can be queried with
+ * ::hsa_amd_memory_pool_get_info.
+ */
+typedef enum {
+ /**
+ * Segment where the memory pool resides. The type of this attribute is
+ * ::hsa_amd_segment_t.
+ */
+ HSA_AMD_MEMORY_POOL_INFO_SEGMENT = 0,
+ /**
+ * Flag mask. The value of this attribute is undefined if the value of
+ * ::HSA_AMD_MEMORY_POOL_INFO_SEGMENT is not ::HSA_AMD_SEGMENT_GLOBAL. The type
+ * of this attribute is uint32_t, a bit-field of
+ * ::hsa_amd_memory_pool_global_flag_t values.
+ */
+ HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS = 1,
+ /**
+ * Size of this pool, in bytes. The type of this attribute is size_t.
+ */
+ HSA_AMD_MEMORY_POOL_INFO_SIZE = 2,
+ /**
+ * Indicates whether memory in this pool can be allocated using
+ * ::hsa_amd_memory_pool_allocate. The type of this attribute is bool.
+ *
+ * The value of this flag is always false for memory pools in the group and
+ * private segments.
+ */
+ HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED = 5,
+ /**
+ * Allocation granularity of buffers allocated by
+ * ::hsa_amd_memory_pool_allocate in this memory pool. The size of a buffer
+ * allocated in this pool is a multiple of the value of this attribute. The
+ * value of this attribute is only defined if
+ * ::HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED is true for this pool. The
+ * type of this attribute is size_t.
+ */
+ HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE = 6,
+ /**
+ * Alignment of buffers allocated by ::hsa_amd_memory_pool_allocate in this
+ * pool. The value of this attribute is only defined if
+ * ::HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED is true for this pool, and
+ * must be a power of 2. The type of this attribute is size_t.
+ */
+ HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT = 7,
+ /**
+ * This memory_pool can be made directly accessible by all the agents in the
+ * system (::hsa_amd_agent_memory_pool_get_info does not return
+ * ::HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED for any agent). The type of this
+ * attribute is bool.
+ */
+ HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL = 15,
+ /**
+ * Maximum aggregate allocation size in bytes. The type of this attribute
+ * is size_t.
+ */
+ HSA_AMD_MEMORY_POOL_INFO_ALLOC_MAX_SIZE = 16,
+} hsa_amd_memory_pool_info_t;
+
+/**
+ * @brief Get the current value of an attribute of a memory pool.
+ *
+ * @param[in] memory_pool A valid memory pool.
+ *
+ * @param[in] attribute Attribute to query.
+ *
+ * @param[out] value Pointer to an application-allocated buffer where to store
+ * the value of the attribute. If the buffer passed by the application is not
+ * large enough to hold the value of @p attribute, the behavior is undefined.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ */
+hsa_status_t HSA_API
+ hsa_amd_memory_pool_get_info(hsa_amd_memory_pool_t memory_pool,
+ hsa_amd_memory_pool_info_t attribute,
+ void* value);
+
+/**
+ * @brief Iterate over the memory pools associated with a given agent, and
+ * invoke an application-defined callback on every iteration.
+ *
+ * @details An agent can directly access buffers located in some memory pool, or
+ * be enabled to access them by the application (see
+ * ::hsa_amd_agents_allow_access), yet that memory pool may not be returned by
+ * this function for that given agent.
+ *
+ * A memory pool of fine-grained type must be associated only with the host.
+ *
+ * @param[in] agent A valid agent.
+ *
+ * @param[in] callback Callback to be invoked on the same thread that called
+ * ::hsa_amd_agent_iterate_memory_pools, serially, once per memory pool that is
+ * associated with the agent. The HSA runtime passes two arguments to the
+ * callback: the memory pool, and the application data. If @p callback
+ * returns a status other than ::HSA_STATUS_SUCCESS for a particular iteration,
+ * the traversal stops and ::hsa_amd_agent_iterate_memory_pools returns that
+ * status value.
+ *
+ * @param[in] data Application data that is passed to @p callback on every
+ * iteration. May be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL.
+ */
+hsa_status_t HSA_API hsa_amd_agent_iterate_memory_pools(
+ hsa_agent_t agent,
+ hsa_status_t (*callback)(hsa_amd_memory_pool_t memory_pool, void* data),
+ void* data);
+
+/**
+ * @brief Allocate a block of memory (or buffer) in the specified pool.
+ *
+ * @param[in] memory_pool Memory pool where to allocate memory from. The memory
+ * pool must have the ::HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED flag set.
+ *
+ * @param[in] size Allocation size, in bytes. Must not be zero. This value is
+ * rounded up to the nearest multiple of
+ * ::HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE in @p memory_pool.
+ *
+ * @param[in] flags A bit-field that is used to specify allocation
+ * directives. Reserved parameter, must be 0.
+ *
+ * @param[out] ptr Pointer to the location where to store the base virtual
+ * address of the allocated block. The returned base address is aligned to the
+ * value of ::HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT in
+ * @p memory_pool. If the allocation fails, the returned value is undefined.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES No memory is available.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_MEMORY_POOL The memory pool is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION The host is not allowed to
+ * allocate memory in @p memory_pool, or @p size is greater than
+ * the value of ::HSA_AMD_MEMORY_POOL_INFO_ALLOC_MAX_SIZE in @p memory_pool.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p ptr is NULL, or @p size is 0,
+ * or @p flags is not 0.
+ *
+ */
+hsa_status_t HSA_API
+ hsa_amd_memory_pool_allocate(hsa_amd_memory_pool_t memory_pool, size_t size,
+ uint32_t flags, void** ptr);
+
+/**
+ * @brief Deallocate a block of memory previously allocated using
+ * ::hsa_amd_memory_pool_allocate.
+ *
+ * @param[in] ptr Pointer to a memory block. If @p ptr does not match a value
+ * previously returned by ::hsa_amd_memory_pool_allocate, the behavior is
+ * undefined.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ */
+hsa_status_t HSA_API hsa_amd_memory_pool_free(void* ptr);
+
+/**
+ * @brief Asynchronously copy a block of memory from the location pointed to by
+ * @p src on the @p src_agent to the memory block pointed to by @p dst on the @p
+ * dst_agent.
+ *
+ * @details Because the DMA engines used may not be in the same coherency
+ * domain, the caller must ensure that buffers are system-level coherent. In
+ * general this requires the sending device to have released the buffer to
+ * system scope prior to executing the copy API and the receiving device must
+ * execute a system scope acquire fence prior to use of the destination buffer.
+ *
+ * @param[out] dst Buffer where the content is to be copied.
+ *
+ * @param[in] dst_agent Agent associated with the @p dst. The agent must be able
+ * to directly access both the source and destination buffers in their current
+ * locations.
+ *
+ * @param[in] src A valid pointer to the source of data to be copied. The source
+ * buffer must not overlap with the destination buffer, otherwise the copy will
+ * succeed but the contents of @p dst are undefined.
+ *
+ * @param[in] src_agent Agent associated with the @p src. The agent must be able
+ * to directly access both the source and destination buffers in their current
+ * locations.
+ *
+ * @param[in] size Number of bytes to copy. If @p size is 0, no copy is
+ * performed and the function returns success. Copying a number of bytes larger
+ * than the size of the buffers pointed to by @p dst or @p src results in
+ * undefined behavior.
+ *
+ * @param[in] num_dep_signals Number of dependent signals. Can be 0.
+ *
+ * @param[in] dep_signals List of signals that must be waited on before the copy
+ * operation starts. The copy will start after every signal has been observed
+ * with the value 0. The dependent signals should not include the completion
+ * signal of an ::hsa_amd_memory_async_copy operation to be issued in the
+ * future, as that can result in a deadlock. If @p num_dep_signals is 0, this
+ * argument is ignored.
+ *
+ * @param[in] completion_signal Signal used to indicate completion of the copy
+ * operation. When the copy operation is finished, the value of the signal is
+ * decremented. The runtime indicates that an error has occurred during the copy
+ * operation by setting the value of the completion signal to a negative
+ * number. The signal handle must not be 0.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. The
+ * application is responsible for checking for asynchronous error conditions
+ * (see the description of @p completion_signal).
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL @p completion_signal is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT The source or destination
+ * pointers are NULL, or the completion signal is 0.
+ */
+hsa_status_t HSA_API
+ hsa_amd_memory_async_copy(void* dst, hsa_agent_t dst_agent, const void* src,
+ hsa_agent_t src_agent, size_t size,
+ uint32_t num_dep_signals,
+ const hsa_signal_t* dep_signals,
+ hsa_signal_t completion_signal);
+
+/**
+ * @brief [Provisional API] Pitched memory descriptor.
+ *
+ * @details All elements must be 4 byte aligned. Pitch and slice are in bytes.
+ */
+typedef struct hsa_pitched_ptr_s {
+ void* base;
+ size_t pitch;
+ size_t slice;
+} hsa_pitched_ptr_t;
+
+/**
+ * @brief [Provisional API] Copy direction flag.
+ *
+ * @details Passed as the @p dir argument of ::hsa_amd_memory_async_copy_rect.
+ */
+typedef enum {
+ hsaHostToHost = 0,
+ hsaHostToDevice = 1,
+ hsaDeviceToHost = 2,
+ hsaDeviceToDevice = 3
+} hsa_amd_copy_direction_t;
+
+/**
+ * @brief [Provisional API] SDMA 3D memory copy API.
+ *
+ * @details The same requirements must be met by @p src and @p dst as in
+ * ::hsa_amd_memory_async_copy. Both @p src and @p dst must be directly
+ * accessible to the @p copy_agent during the copy, and the @p src and @p dst
+ * rects must not overlap. CPU agents are not supported. The API requires SDMA
+ * and will return an error if SDMA is not available. Offsets and range carry x
+ * in bytes, y and z in rows and layers.
+ */
+hsa_status_t HSA_API hsa_amd_memory_async_copy_rect(
+ const hsa_pitched_ptr_t* dst, const hsa_dim3_t* dst_offset, const hsa_pitched_ptr_t* src,
+ const hsa_dim3_t* src_offset, const hsa_dim3_t* range, hsa_agent_t copy_agent,
+ hsa_amd_copy_direction_t dir, uint32_t num_dep_signals, const hsa_signal_t* dep_signals,
+ hsa_signal_t completion_signal);
+
+/**
+ * @brief Type of accesses to a memory pool from a given agent.
+ *
+ * @details This is the type of the ::HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS
+ * attribute.
+ */
+typedef enum {
+ /**
+ * The agent cannot directly access any buffer in the memory pool.
+ */
+ HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED = 0,
+ /**
+ * The agent can directly access a buffer located in the pool; the application
+ * does not need to invoke ::hsa_amd_agents_allow_access.
+ */
+ HSA_AMD_MEMORY_POOL_ACCESS_ALLOWED_BY_DEFAULT = 1,
+ /**
+ * The agent can directly access a buffer located in the pool, but only if the
+ * application has previously requested access to that buffer using
+ * ::hsa_amd_agents_allow_access.
+ */
+ HSA_AMD_MEMORY_POOL_ACCESS_DISALLOWED_BY_DEFAULT = 2
+} hsa_amd_memory_pool_access_t;
+
+/**
+ * @brief Type of the bus/link used when accessing a memory pool from an agent.
+ *
+ * @details This is the type of the @c link_type member of
+ * ::hsa_amd_memory_pool_link_info_t.
+ */
+typedef enum {
+ /**
+ * Hyper-transport bus type.
+ */
+ HSA_AMD_LINK_INFO_TYPE_HYPERTRANSPORT = 0,
+
+ /**
+ * QPI bus type.
+ */
+ HSA_AMD_LINK_INFO_TYPE_QPI = 1,
+
+ /**
+ * PCIe bus type.
+ */
+ HSA_AMD_LINK_INFO_TYPE_PCIE = 2,
+
+ /**
+ * InfiniBand bus type. Note that the enumerator name is spelled "INFINBAND";
+ * the identifier is kept as-is because it is part of the public interface.
+ */
+ HSA_AMD_LINK_INFO_TYPE_INFINBAND = 3,
+
+ /**
+ * xGMI link type.
+ */
+ HSA_AMD_LINK_INFO_TYPE_XGMI = 4
+
+} hsa_amd_link_info_type_t;
+
+/**
+ * @brief Link properties when accessing the memory pool from the specified
+ * agent.
+ *
+ * @details This is the element type of the array returned for the
+ * ::HSA_AMD_AGENT_MEMORY_POOL_INFO_LINK_INFO attribute.
+ */
+typedef struct hsa_amd_memory_pool_link_info_s {
+ /**
+ * Minimum transfer latency (rounded to ns).
+ */
+ uint32_t min_latency;
+
+ /**
+ * Maximum transfer latency (rounded to ns).
+ */
+ uint32_t max_latency;
+
+ /**
+ * Minimum link interface bandwidth in MB/s.
+ */
+ uint32_t min_bandwidth;
+
+ /**
+ * Maximum link interface bandwidth in MB/s.
+ */
+ uint32_t max_bandwidth;
+
+ /**
+ * Support for 32-bit atomic transactions.
+ */
+ bool atomic_support_32bit;
+
+ /**
+ * Support for 64-bit atomic transactions.
+ */
+ bool atomic_support_64bit;
+
+ /**
+ * Support for cache coherent transactions.
+ */
+ bool coherent_support;
+
+ /**
+ * The type of bus/link.
+ */
+ hsa_amd_link_info_type_t link_type;
+
+ /**
+ * NUMA distance of the memory pool relative to the querying agent.
+ */
+ uint32_t numa_distance;
+} hsa_amd_memory_pool_link_info_t;
+
+/**
+ * @brief Properties of the relationship between an agent and a memory pool.
+ */
+typedef enum {
+ /**
+ * Access to buffers located in the memory pool. The type of this attribute
+ * is ::hsa_amd_memory_pool_access_t.
+ *
+ * An agent can always directly access buffers currently located in a memory
+ * pool that is associated (the memory_pool is one of the values returned by
+ * ::hsa_amd_agent_iterate_memory_pools on the agent) with that agent. If the
+ * buffer is currently located in a memory pool that is not associated with
+ * the agent, and the value returned by this function for the given
+ * combination of agent and memory pool is not
+ * HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED, the application still needs to invoke
+ * ::hsa_amd_agents_allow_access in order to gain direct access to the buffer.
+ *
+ * If the given agent can directly access buffers in the pool, the result is
+ * not HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED. If the memory pool is
+ * associated with the agent, or it is of fine-grained type, the result must
+ * not be HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED. If the memory pool is not
+ * associated with the agent, and does not reside in the global segment, the
+ * result must be HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED.
+ */
+ HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS = 0,
+
+ /**
+ * Number of links to hop when accessing the memory pool from the specified
+ * agent. The value of this attribute is zero if the memory pool is associated
+ * with the agent, or if the access type is
+ * HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED. The type of this attribute is
+ * uint32_t.
+ */
+ HSA_AMD_AGENT_MEMORY_POOL_INFO_NUM_LINK_HOPS = 1,
+
+ /**
+ * Details of each link hop when accessing the memory pool starting from the
+ * specified agent. The type of this attribute is an array of
+ * HSA_AMD_AGENT_MEMORY_POOL_INFO_NUM_LINK_HOPS elements, each of type
+ * ::hsa_amd_memory_pool_link_info_t.
+ */
+ HSA_AMD_AGENT_MEMORY_POOL_INFO_LINK_INFO = 2
+
+} hsa_amd_agent_memory_pool_info_t;
+
+/**
+ * @brief Get the current value of an attribute of the relationship between an
+ * agent and a memory pool.
+ *
+ * @param[in] agent Agent.
+ *
+ * @param[in] memory_pool Memory pool.
+ *
+ * @param[in] attribute Attribute to query.
+ *
+ * @param[out] value Pointer to an application-allocated buffer where to store
+ * the value of the attribute. If the buffer passed by the application is not
+ * large enough to hold the value of @p attribute, the behavior is undefined.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ */
+hsa_status_t HSA_API hsa_amd_agent_memory_pool_get_info(
+ hsa_agent_t agent, hsa_amd_memory_pool_t memory_pool,
+ hsa_amd_agent_memory_pool_info_t attribute, void* value);
+
+/**
+ * @brief Enable direct access to a buffer from a given set of agents.
+ *
+ * @details
+ *
+ * Upon return, only the listed agents and the agent associated with the
+ * buffer's memory pool have direct access to the @p ptr.
+ *
+ * Any agent that has access to the buffer before and after the call to
+ * ::hsa_amd_agents_allow_access will also have access while
+ * ::hsa_amd_agents_allow_access is in progress.
+ *
+ * The caller is responsible for ensuring that each agent in the list
+ * is able to access the memory pool containing @p ptr (using
+ * ::hsa_amd_agent_memory_pool_get_info with the
+ * ::HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS attribute), otherwise an error code
+ * is returned.
+ *
+ * @param[in] num_agents Size of @p agents.
+ *
+ * @param[in] agents List of agents. If @p num_agents is 0, this argument is
+ * ignored.
+ * NOTE(review): this conflicts with the ::HSA_STATUS_ERROR_INVALID_ARGUMENT
+ * entry below, which lists @p num_agents being 0 as an error — confirm which
+ * statement matches the runtime.
+ *
+ * @param[in] flags A list of bit-fields used to specify access information on
+ * a per-agent basis. This is currently reserved and must be NULL.
+ *
+ * @param[in] ptr A buffer previously allocated using
+ * ::hsa_amd_memory_pool_allocate.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p num_agents is 0, or @p agents
+ * is NULL, @p flags is not NULL, or attempting to enable access to agent(s)
+ * because @p ptr is allocated from an inaccessible pool.
+ *
+ */
+hsa_status_t HSA_API
+ hsa_amd_agents_allow_access(uint32_t num_agents, const hsa_agent_t* agents,
+ const uint32_t* flags, const void* ptr);
+
+/**
+ * @brief Query if buffers currently located in some memory pool can be
+ * relocated to a destination memory pool.
+ *
+ * @details If the value stored in @p result is true, a migration of a buffer
+ * to @p dst_memory_pool using ::hsa_amd_memory_migrate may nevertheless fail
+ * due to resource limitations.
+ *
+ * @param[in] src_memory_pool Source memory pool.
+ *
+ * @param[in] dst_memory_pool Destination memory pool.
+ *
+ * @param[out] result Pointer to a memory location where the result of the query
+ * is stored. Must not be NULL. If buffers currently located in @p
+ * src_memory_pool can be relocated to @p dst_memory_pool, the result is
+ * true.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_MEMORY_POOL One of the memory pools is
+ * invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p result is NULL.
+ */
+hsa_status_t HSA_API
+ hsa_amd_memory_pool_can_migrate(hsa_amd_memory_pool_t src_memory_pool,
+ hsa_amd_memory_pool_t dst_memory_pool,
+ bool* result);
+
+/**
+ * @brief Relocate a buffer to a new memory pool.
+ *
+ * @details When a buffer is migrated, its virtual address remains the same but
+ * its physical contents are moved to the indicated memory pool.
+ *
+ * After migration, only the agent associated with the destination pool will
+ * have access.
+ *
+ * The caller is also responsible for ensuring that the allocation in the
+ * source memory pool where the buffer is currently located can be migrated to
+ * the specified destination memory pool (i.e.
+ * ::hsa_amd_memory_pool_can_migrate returns a value of true for the source and
+ * destination memory pools), otherwise behavior is undefined.
+ *
+ * The caller must ensure that the buffer is not accessed while it is migrated.
+ *
+ * @param[in] ptr Buffer to be relocated. The buffer must have been released to
+ * system prior to calling this API. The buffer will be released to system upon
+ * completion.
+ *
+ * @param[in] memory_pool Memory pool where to place the buffer.
+ *
+ * @param[in] flags A bit-field that is used to specify migration
+ * information. Must be zero.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_MEMORY_POOL The destination memory pool is
+ * invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is a failure in
+ * allocating the necessary resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p flags is not 0.
+ */
+hsa_status_t HSA_API hsa_amd_memory_migrate(const void* ptr,
+ hsa_amd_memory_pool_t memory_pool,
+ uint32_t flags);
+
+/**
+ *
+ * @brief Pin a host pointer allocated by C/C++ or OS allocator (i.e. ordinary
+ * system DRAM) and return a new pointer accessible by the @p agents.
+ *
+ * @details If the @p host_ptr overlaps with previously locked memory, then the
+ * overlap area is kept locked (i.e. multiple mappings are permitted). In this
+ * case, the same input @p host_ptr may give different locked @p agent_ptr and
+ * when it does, they are not necessarily coherent (i.e. accessing either
+ * @p agent_ptr is not equivalent). Accesses to @p agent_ptr are coarse grained.
+ *
+ * @param[in] host_ptr A buffer allocated by C/C++ or OS allocator.
+ *
+ * @param[in] size The size to be locked.
+ *
+ * @param[in] agents Array of agent handles to gain access to the @p host_ptr.
+ * If this parameter is NULL and the @p num_agent is 0, all agents
+ * in the platform will gain access to the @p host_ptr.
+ *
+ * @param[in] num_agent Number of agents in @p agents. Must be 0 if and only if
+ * @p agents is NULL (see ::HSA_STATUS_ERROR_INVALID_ARGUMENT below).
+ *
+ * @param[out] agent_ptr Pointer to the location where to store the new address.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is a failure in
+ * allocating the necessary resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT One or more agent in @p agents is
+ * invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p size is 0 or @p host_ptr or
+ * @p agent_ptr is NULL or @p agents not NULL but @p num_agent is 0 or @p agents
+ * is NULL but @p num_agent is not 0.
+ */
+hsa_status_t HSA_API hsa_amd_memory_lock(void* host_ptr, size_t size,
+ hsa_agent_t* agents, int num_agent,
+ void** agent_ptr);
+
+/**
+ *
+ * @brief Pin a host pointer allocated by C/C++ or OS allocator (i.e. ordinary
+ * system DRAM) and return a new pointer accessible by the @p agents.
+ *
+ * @details If the @p host_ptr overlaps with previously locked memory, then the
+ * overlap area is kept locked (i.e. multiple mappings are permitted). In this
+ * case, the same input @p host_ptr may give different locked @p agent_ptr and
+ * when it does, they are not necessarily coherent (i.e. accessing either
+ * @p agent_ptr is not equivalent). Accesses to the memory via @p agent_ptr
+ * have the same access properties as memory allocated from @p pool as
+ * determined by ::hsa_amd_memory_pool_get_info and
+ * ::hsa_amd_agent_memory_pool_get_info (ex. coarse/fine grain, platform atomic
+ * support, link info). Physical composition and placement of the memory (ex.
+ * page size, NUMA binding) is not changed.
+ *
+ * @param[in] host_ptr A buffer allocated by C/C++ or OS allocator.
+ *
+ * @param[in] size The size to be locked.
+ *
+ * @param[in] agents Array of agent handles to gain access to the @p host_ptr.
+ * If this parameter is NULL and the @p num_agent is 0, all agents
+ * in the platform will gain access to the @p host_ptr.
+ *
+ * @param[in] num_agent Number of agents in @p agents. Must be 0 if and only if
+ * @p agents is NULL (see ::HSA_STATUS_ERROR_INVALID_ARGUMENT below).
+ *
+ * @param[in] pool Global memory pool owned by a CPU agent.
+ *
+ * @param[in] flags A bit-field that is used to specify allocation
+ * directives. Reserved parameter, must be 0.
+ *
+ * @param[out] agent_ptr Pointer to the location where to store the new address.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is a failure in
+ * allocating the necessary resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT One or more agent in @p agents is
+ * invalid or can not access @p pool.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_MEMORY_POOL @p pool is invalid or not owned
+ * by a CPU agent.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p size is 0 or @p host_ptr or
+ * @p agent_ptr is NULL or @p agents not NULL but @p num_agent is 0 or @p agents
+ * is NULL but @p num_agent is not 0 or @p flags is not 0.
+ */
+hsa_status_t HSA_API hsa_amd_memory_lock_to_pool(void* host_ptr, size_t size, hsa_agent_t* agents,
+ int num_agent, hsa_amd_memory_pool_t pool,
+ uint32_t flags, void** agent_ptr);
+
+/**
+ *
+ * @brief Unpin the host pointer previously pinned via ::hsa_amd_memory_lock or
+ * ::hsa_amd_memory_lock_to_pool.
+ *
+ * @details The behavior is undefined if the host pointer being unpinned does
+ * not match a previously pinned address or if the host pointer was already
+ * deallocated.
+ *
+ * @param[in] host_ptr A buffer allocated by C/C++ or OS allocator that was
+ * pinned previously via ::hsa_amd_memory_lock or ::hsa_amd_memory_lock_to_pool.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ */
+hsa_status_t HSA_API hsa_amd_memory_unlock(void* host_ptr);
+
+/**
+ * @brief Sets the first @p count uint32_t elements of the block of memory
+ * pointed to by @p ptr to the specified @p value.
+ *
+ * @param[in] ptr Pointer to the block of memory to fill.
+ *
+ * @param[in] value Value to be set.
+ *
+ * @param[in] count Number of uint32_t elements to be set to the value.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p ptr is NULL or
+ * not 4 bytes aligned.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION The given memory
+ * region was not allocated with HSA runtime APIs.
+ *
+ */
+hsa_status_t HSA_API
+ hsa_amd_memory_fill(void* ptr, uint32_t value, size_t count);
+
+/**
+ * @brief Maps an interop object into the HSA flat address space and establishes
+ * memory residency.
+ *
+ * @details The metadata pointer is valid during the lifetime of the
+ * map (until ::hsa_amd_interop_unmap_buffer is called).
+ * Multiple calls to ::hsa_amd_interop_map_buffer with the same
+ * @p interop_handle result in multiple mappings with potentially different
+ * addresses and different metadata pointers. Concurrent operations on these
+ * addresses are not coherent. Memory must be fenced to system scope to ensure
+ * consistency, between mappings and with any views of this buffer in the
+ * originating software stack.
+ *
+ * @param[in] num_agents Number of agents which require access to the memory
+ *
+ * @param[in] agents List of accessing agents.
+ *
+ * @param[in] interop_handle Handle of interop buffer (dmabuf handle in Linux)
+ *
+ * @param[in] flags Reserved, must be 0
+ *
+ * @param[out] size Size in bytes of the mapped object
+ *
+ * @param[out] ptr Base address of the mapped object
+ *
+ * @param[out] metadata_size Size of metadata in bytes, may be NULL
+ *
+ * @param[out] metadata Pointer to metadata, may be NULL
+ *
+ * @retval ::HSA_STATUS_SUCCESS if successfully mapped
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED if HSA is not initialized
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES if there is a failure in
+ * allocating necessary resources
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT all other errors
+ */
+hsa_status_t HSA_API hsa_amd_interop_map_buffer(uint32_t num_agents,
+ hsa_agent_t* agents,
+ int interop_handle,
+ uint32_t flags,
+ size_t* size,
+ void** ptr,
+ size_t* metadata_size,
+ const void** metadata);
+
+/**
+ * @brief Removes a previously mapped interop object from HSA's flat address
+ * space. Ends lifetime for the mapping's associated metadata pointer.
+ *
+ * @param[in] ptr Address of the mapping to remove. NOTE(review): presumably
+ * the base address returned through @p ptr by ::hsa_amd_interop_map_buffer —
+ * confirm against the runtime.
+ */
+hsa_status_t HSA_API hsa_amd_interop_unmap_buffer(void* ptr);
+
+/**
+ * @brief Encodes an opaque vendor specific image format. The length of data
+ * depends on the underlying format. This structure must not be copied as its
+ * true length can not be determined.
+ */
+typedef struct hsa_amd_image_descriptor_s {
+ /**
+ * Version number of the descriptor.
+ */
+ uint32_t version;
+
+ /**
+ * Vendor and device PCI IDs for the format as VENDOR_ID<<16|DEVICE_ID.
+ */
+ uint32_t deviceID;
+
+ /**
+ * Start of vendor specific data. Declared with one element, but the real
+ * length depends on the underlying format (see the structure description).
+ */
+ uint32_t data[1];
+} hsa_amd_image_descriptor_t;
+
+/**
+ * @brief Creates an image from an opaque vendor specific image format.
+ * Does not modify data at @p image_data. Intended initially for
+ * accessing interop images.
+ *
+ * @param[in] agent Agent on which to create the image
+ *
+ * @param[in] image_descriptor Image descriptor of type
+ * ::hsa_ext_image_descriptor_t. NOTE(review): the original text described this
+ * parameter as the "vendor specific image format", which matches the type of
+ * @p image_layout instead — confirm the intended description.
+ *
+ * @param[in] image_layout Opaque vendor specific image format (see
+ * ::hsa_amd_image_descriptor_t)
+ *
+ * @param[in] image_data Pointer to image backing store
+ *
+ * @param[in] access_permission Access permissions for the image object
+ *
+ * @param[out] image Created image object.
+ *
+ * @retval ::HSA_STATUS_SUCCESS Image created successfully
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED if HSA is not initialized
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES if there is a failure in
+ * allocating necessary resources
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT Bad or mismatched descriptor,
+ * null image_data, or mismatched access_permission.
+ */
+hsa_status_t HSA_API hsa_amd_image_create(
+ hsa_agent_t agent,
+ const hsa_ext_image_descriptor_t *image_descriptor,
+ const hsa_amd_image_descriptor_t *image_layout,
+ const void *image_data,
+ hsa_access_permission_t access_permission,
+ hsa_ext_image_t *image
+);
+
+/**
+ * @brief Denotes the type of memory in a pointer info query.
+ */
+typedef enum {
+ /*
+ Memory is not known to the HSA driver. Unallocated or unlocked system memory.
+ */
+ HSA_EXT_POINTER_TYPE_UNKNOWN = 0,
+ /*
+ Memory was allocated with an HSA memory allocator.
+ */
+ HSA_EXT_POINTER_TYPE_HSA = 1,
+ /*
+ System memory which has been locked for use with an HSA agent.
+
+ Memory of this type is normal malloc'd memory and is always accessible to
+ the CPU. Pointer info queries may not include CPU agents in the accessible
+ agents list as the CPU has implicit access.
+ */
+ HSA_EXT_POINTER_TYPE_LOCKED = 2,
+ /*
+ Memory originated in a graphics component and is shared with ROCr.
+ */
+ HSA_EXT_POINTER_TYPE_GRAPHICS = 3,
+ /*
+ Memory has been shared with the local process via ROCr IPC APIs.
+ */
+ HSA_EXT_POINTER_TYPE_IPC = 4
+} hsa_amd_pointer_type_t;
+
+/**
+ * @brief Describes a memory allocation known to ROCr.
+ * Within a ROCr major version this structure can only grow.
+ */
+typedef struct hsa_amd_pointer_info_s {
+ /*
+ Size in bytes of this structure. Used for version control within a major ROCr
+ revision. Set to sizeof(hsa_amd_pointer_info_t) prior to calling
+ hsa_amd_pointer_info. If the runtime supports an older version of pointer
+ info then size will be smaller on return. Members starting after the return
+ value of size will not be updated by hsa_amd_pointer_info.
+ */
+ uint32_t size;
+ /*
+ The type of allocation referenced.
+ */
+ hsa_amd_pointer_type_t type;
+ /*
+ Base address at which non-host agents may access the allocation.
+ */
+ void* agentBaseAddress;
+ /*
+ Base address at which the host agent may access the allocation.
+ */
+ void* hostBaseAddress;
+ /*
+ Size of the allocation in bytes.
+ */
+ size_t sizeInBytes;
+ /*
+ Application provided value.
+ */
+ void* userData;
+ /*
+ Reports an agent which "owns" (i.e. has preferred access to) the pool in which the allocation
+ was made. When multiple agents share equal access to a pool (e.g. multiple CPU agents, or
+ multi-die GPU boards) any such agent may be returned.
+ */
+ hsa_agent_t agentOwner;
+} hsa_amd_pointer_info_t;
+
+/**
+ * @brief Retrieves information about the allocation referenced by the given
+ * pointer. Optionally returns the number and list of agents which can
+ * directly access the allocation.
+ *
+ * @param[in] ptr Pointer which references the allocation to retrieve info for.
+ *
+ * @param[in,out] info Pointer to structure to be filled with allocation info.
+ * Data member size must be set to the size of the structure prior to calling
+ * hsa_amd_pointer_info. On return size will be set to the size of the
+ * pointer info structure supported by the runtime, if smaller. Members
+ * beyond the returned value of size will not be updated by the API.
+ * Must not be NULL.
+ *
+ * @param[in] alloc Function pointer to an allocator used to allocate the
+ * @p accessible array. If NULL @p accessible will not be returned.
+ *
+ * @param[out] num_agents_accessible Receives the count of agents in
+ * @p accessible. If NULL @p accessible will not be returned.
+ *
+ * @param[out] accessible Receives a pointer to the array, allocated by @p alloc,
+ * holding the list of agents which may directly access the allocation.
+ * May be NULL.
+ *
+ * @retval HSA_STATUS_SUCCESS Info retrieved successfully
+ *
+ * @retval HSA_STATUS_ERROR_NOT_INITIALIZED if HSA is not initialized
+ *
+ * @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES if there is a failure in allocating
+ * necessary resources
+ *
+ * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT NULL in @p ptr or @p info.
+ */
+hsa_status_t HSA_API hsa_amd_pointer_info(void* ptr,
+ hsa_amd_pointer_info_t* info,
+ void* (*alloc)(size_t),
+ uint32_t* num_agents_accessible,
+ hsa_agent_t** accessible);
+
+/**
+ * @brief Associates an arbitrary pointer with an allocation known to ROCr.
+ * The pointer can be fetched by hsa_amd_pointer_info in the userData field.
+ *
+ * @param[in] ptr Pointer to the first byte of an allocation known to ROCr
+ * with which to associate @p userdata.
+ *
+ * @param[in] userdata Arbitrary pointer to associate with the allocation.
+ *
+ * @retval HSA_STATUS_SUCCESS @p userdata successfully stored.
+ *
+ * @retval HSA_STATUS_ERROR_NOT_INITIALIZED if HSA is not initialized
+ *
+ * @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES if there is a failure in allocating
+ * necessary resources
+ *
+ * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p ptr is not known to ROCr.
+ */
+hsa_status_t HSA_API hsa_amd_pointer_info_set_userdata(void* ptr,
+ void* userdata);
+
+/**
+ * @brief 256-bit process independent identifier for a ROCr shared memory
+ * allocation.
+ */
+typedef struct hsa_amd_ipc_memory_s {
+ uint32_t handle[8];
+} hsa_amd_ipc_memory_t;
+
+/**
+ * @brief Prepares an allocation for interprocess sharing and creates a
+ * handle of type hsa_amd_ipc_memory_t uniquely identifying the allocation. A
+ * handle is valid while the allocation it references remains accessible in
+ * any process. In general applications should confirm that a shared memory
+ * region has been attached (via hsa_amd_ipc_memory_attach) in the remote
+ * process prior to releasing that memory in the local process.
+ * Repeated calls for the same allocation may, but are not required to, return
+ * unique handles.
+ *
+ * @param[in] ptr Pointer to memory allocated via ROCr APIs to prepare for
+ * sharing.
+ *
+ * @param[in] len Length in bytes of the allocation to share.
+ *
+ * @param[out] handle Process independent identifier referencing the shared
+ * allocation.
+ *
+ * @retval HSA_STATUS_SUCCESS allocation is prepared for interprocess sharing.
+ *
+ * @retval HSA_STATUS_ERROR_NOT_INITIALIZED if HSA is not initialized
+ *
+ * @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES if there is a failure in allocating
+ * necessary resources
+ *
+ * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p ptr does not point to the
+ * first byte of an allocation made through ROCr, or len is not the full length
+ * of the allocation or handle is NULL.
+ */
+hsa_status_t HSA_API hsa_amd_ipc_memory_create(void* ptr, size_t len,
+ hsa_amd_ipc_memory_t* handle);
+
+/**
+ * @brief Imports shared memory into the local process and makes it accessible
+ * by the given agents. If a shared memory handle is attached multiple times
+ * in a process each attach may return a different address. Each returned
+ * address is refcounted and requires a matching number of calls to
+ * hsa_amd_ipc_memory_detach to release the shared memory mapping.
+ *
+ * @param[in] handle Pointer to the identifier for the shared memory.
+ *
+ * @param[in] len Length of the shared memory to import.
+ * Reserved. Must be the full length of the shared allocation in this version.
+ *
+ * @param[in] num_agents Count of agents in @p mapping_agents.
+ * May be zero if all agents are to be allowed access.
+ *
+ * @param[in] mapping_agents List of agents to access the shared memory.
+ * Ignored if @p num_agents is zero.
+ *
+ * @param[out] mapped_ptr Receives a process local pointer to the shared memory.
+ *
+ * @retval HSA_STATUS_SUCCESS if memory is successfully imported.
+ *
+ * @retval HSA_STATUS_ERROR_NOT_INITIALIZED if HSA is not initialized
+ *
+ * @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES if there is a failure in allocating
+ * necessary resources
+ *
+ * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p handle is not valid, @p len is
+ * incorrect, @p mapped_ptr is NULL, or some agent for which access was
+ * requested cannot access the shared memory.
+ */
+hsa_status_t HSA_API hsa_amd_ipc_memory_attach(
+ const hsa_amd_ipc_memory_t* handle, size_t len,
+ uint32_t num_agents,
+ const hsa_agent_t* mapping_agents,
+ void** mapped_ptr);
+
+/**
+ * @brief Decrements the reference count for the shared memory mapping and
+ * releases access to shared memory imported with hsa_amd_ipc_memory_attach.
+ *
+ * @param[in] mapped_ptr Pointer to the first byte of a shared allocation
+ * imported with hsa_amd_ipc_memory_attach.
+ *
+ * @retval HSA_STATUS_SUCCESS if @p mapped_ptr was imported with
+ * hsa_amd_ipc_memory_attach.
+ *
+ * @retval HSA_STATUS_ERROR_NOT_INITIALIZED if HSA is not initialized
+ *
+ * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p mapped_ptr was not imported
+ * with hsa_amd_ipc_memory_attach.
+ */
+hsa_status_t HSA_API hsa_amd_ipc_memory_detach(void* mapped_ptr);
+
+/**
+ * @brief 256-bit process independent identifier for a ROCr IPC signal.
+ */
+typedef hsa_amd_ipc_memory_t hsa_amd_ipc_signal_t;
+
+/**
+ * @brief Obtains an interprocess sharing handle for a signal. The handle is
+ * valid while the signal it references remains valid in any process. In
+ * general applications should confirm that the signal has been attached (via
+ * hsa_amd_ipc_signal_attach) in the remote process prior to destroying that
+ * signal in the local process.
+ * Repeated calls for the same signal may, but are not required to, return
+ * unique handles.
+ *
+ * @param[in] signal Signal created with attribute HSA_AMD_SIGNAL_IPC.
+ *
+ * @param[out] handle Process independent identifier referencing the shared
+ * signal.
+ *
+ * @retval HSA_STATUS_SUCCESS @p handle is ready to use for interprocess sharing.
+ *
+ * @retval HSA_STATUS_ERROR_NOT_INITIALIZED if HSA is not initialized
+ *
+ * @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES if there is a failure in allocating
+ * necessary resources
+ *
+ * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p signal is not a valid signal
+ * created with attribute HSA_AMD_SIGNAL_IPC or handle is NULL.
+ */
+hsa_status_t HSA_API hsa_amd_ipc_signal_create(hsa_signal_t signal, hsa_amd_ipc_signal_t* handle);
+
+/**
+ * @brief Imports an IPC capable signal into the local process. If an IPC
+ * signal handle is attached multiple times in a process each attach may return
+ * a different signal handle. Each returned signal handle is refcounted and
+ * requires a matching number of calls to hsa_signal_destroy to release the
+ * shared signal.
+ *
+ * @param[in] handle Pointer to the identifier for the shared signal.
+ *
+ * @param[out] signal Receives a process local signal handle to the shared signal.
+ *
+ * @retval HSA_STATUS_SUCCESS if the signal is successfully imported.
+ *
+ * @retval HSA_STATUS_ERROR_NOT_INITIALIZED if HSA is not initialized
+ *
+ * @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES if there is a failure in allocating
+ * necessary resources
+ *
+ * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p handle is not valid.
+ */
+hsa_status_t HSA_API hsa_amd_ipc_signal_attach(const hsa_amd_ipc_signal_t* handle,
+ hsa_signal_t* signal);
+
+/**
+ * @brief GPU system event type.
+ */
+typedef enum hsa_amd_event_type_s {
+ /*
+ AMD GPU memory fault.
+ */
+ HSA_AMD_GPU_MEMORY_FAULT_EVENT = 0,
+} hsa_amd_event_type_t;
+
+/**
+ * @brief Bit flags denoting the cause of a memory fault (see fault_reason_mask).
+ */
+typedef enum {
+ // Page not present or supervisor privilege.
+ HSA_AMD_MEMORY_FAULT_PAGE_NOT_PRESENT = 1 << 0,
+ // Write access to a read-only page.
+ HSA_AMD_MEMORY_FAULT_READ_ONLY = 1 << 1,
+ // Execute access to a page marked NX.
+ HSA_AMD_MEMORY_FAULT_NX = 1 << 2,
+ // GPU attempted access to a host only page.
+ HSA_AMD_MEMORY_FAULT_HOST_ONLY = 1 << 3,
+ // DRAM ECC failure.
+ HSA_AMD_MEMORY_FAULT_DRAMECC = 1 << 4,
+ // Can't determine the exact fault address.
+ HSA_AMD_MEMORY_FAULT_IMPRECISE = 1 << 5,
+ // SRAM ECC failure (i.e. registers, no fault address).
+ HSA_AMD_MEMORY_FAULT_SRAMECC = 1 << 6,
+ // GPU reset following unspecified hang. NOTE(review): 1 << 31 overflows a 32-bit signed int in C - confirm.
+ HSA_AMD_MEMORY_FAULT_HANG = 1 << 31
+} hsa_amd_memory_fault_reason_t;
+
+/**
+ * @brief AMD GPU memory fault event data.
+ */
+typedef struct hsa_amd_gpu_memory_fault_info_s {
+ /*
+ The agent where the memory fault occurred.
+ */
+ hsa_agent_t agent;
+ /*
+ Virtual address accessed.
+ */
+ uint64_t virtual_address;
+ /*
+ Bit field encoding the memory access failure reasons. There could be multiple bits set
+ for one fault. Bits are defined in hsa_amd_memory_fault_reason_t.
+ */
+ uint32_t fault_reason_mask;
+} hsa_amd_gpu_memory_fault_info_t;
+
+/**
+ * @brief AMD GPU event data passed to event handler.
+ */
+typedef struct hsa_amd_event_s {
+ /*
+ The event type.
+ */
+ hsa_amd_event_type_t event_type;
+ union {
+ /*
+ The memory fault info, only valid when @p event_type is HSA_AMD_GPU_MEMORY_FAULT_EVENT.
+ */
+ hsa_amd_gpu_memory_fault_info_t memory_fault;
+ };
+} hsa_amd_event_t;
+
+typedef hsa_status_t (*hsa_amd_system_event_callback_t)(const hsa_amd_event_t* event, void* data);
+
+/**
+ * @brief Register AMD GPU event handler.
+ *
+ * @param[in] callback Callback to be invoked when an event is triggered.
+ * The HSA runtime passes two arguments to the callback: @p event
+ * is defined per event by the HSA runtime, and @p data is the user data.
+ *
+ * @param[in] data User data that is passed to @p callback. May be NULL.
+ *
+ * @retval HSA_STATUS_SUCCESS The handler has been registered successfully.
+ *
+ * @retval HSA_STATUS_ERROR An event handler has already been registered.
+ *
+ * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is invalid (presumably NULL).
+ */
+hsa_status_t HSA_API hsa_amd_register_system_event_handler(hsa_amd_system_event_callback_t callback,
+ void* data);
+
+/**
+ * @brief Per-queue dispatch and wavefront scheduling priority.
+ */
+typedef enum hsa_amd_queue_priority_s {
+ /*
+ Below normal/high priority compute and all graphics
+ */
+ HSA_AMD_QUEUE_PRIORITY_LOW = 0,
+ /*
+ Above low priority compute, below high priority compute and all graphics
+ */
+ HSA_AMD_QUEUE_PRIORITY_NORMAL = 1,
+ /*
+ Above low/normal priority compute and all graphics
+ */
+ HSA_AMD_QUEUE_PRIORITY_HIGH = 2,
+} hsa_amd_queue_priority_t;
+
+/**
+ * @brief Modifies the dispatch and wavefront scheduling priority for a
+ * given compute queue. The default is HSA_AMD_QUEUE_PRIORITY_NORMAL.
+ *
+ * @param[in] queue Compute queue to apply new priority to.
+ *
+ * @param[in] priority Priority to associate with queue.
+ *
+ * @retval HSA_STATUS_SUCCESS if priority was changed successfully.
+ *
+ * @retval HSA_STATUS_ERROR_INVALID_QUEUE if queue is not a valid
+ * compute queue handle.
+ *
+ * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT if priority is not a valid
+ * value from hsa_amd_queue_priority_t.
+ */
+hsa_status_t HSA_API hsa_amd_queue_set_priority(hsa_queue_t* queue,
+ hsa_amd_queue_priority_t priority);
+
+/**
+ * @brief Deallocation notifier function type.
+ */
+typedef void (*hsa_amd_deallocation_callback_t)(void* ptr, void* user_data);
+
+/**
+ * @brief Registers a deallocation notifier monitoring for release of agent
+ * accessible address @p ptr. If successful, @p callback will be invoked when
+ * @p ptr is removed from accessibility from all agents.
+ *
+ * Notification callbacks are automatically deregistered when they are invoked.
+ *
+ * Note: The current version supports notifications of address release
+ * originating from ::hsa_amd_memory_pool_free. Support for other address
+ * release APIs will follow.
+ *
+ * @param[in] ptr Agent accessible address to monitor for deallocation. Passed
+ * to @p callback.
+ *
+ * @param[in] callback Notifier to be invoked when @p ptr is released from
+ * agent accessibility.
+ *
+ * @param[in] user_data User provided value passed to @p callback. May be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The notifier registered successfully
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION @p ptr does not refer to a valid agent accessible
+ * address.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL or @p ptr is NULL.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES if there is a failure in allocating
+ * necessary resources
+ */
+hsa_status_t HSA_API hsa_amd_register_deallocation_callback(void* ptr,
+ hsa_amd_deallocation_callback_t callback,
+ void* user_data);
+
+/**
+ * @brief Removes a deallocation notifier previously registered with
+ * ::hsa_amd_register_deallocation_callback. Arguments must be identical to
+ * those given in ::hsa_amd_register_deallocation_callback.
+ *
+ * @param[in] ptr Agent accessible address which was monitored for deallocation.
+ *
+ * @param[in] callback Notifier to be removed.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The notifier has been removed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT The given notifier was not registered.
+ */
+hsa_status_t HSA_API hsa_amd_deregister_deallocation_callback(void* ptr,
+ hsa_amd_deallocation_callback_t callback);
+
+#ifdef __cplusplus
+} // end extern "C" block
+#endif
+
+#endif // header guard
diff --git a/third_party/rocm/include/hsa/hsa_ext_finalize.h b/third_party/rocm/include/hsa/hsa_ext_finalize.h
new file mode 100644
index 0000000..94c4582
--- /dev/null
+++ b/third_party/rocm/include/hsa/hsa_ext_finalize.h
@@ -0,0 +1,531 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+// AMD Research and AMD HSA Software Development
+//
+// Advanced Micro Devices, Inc.
+//
+// www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimers.
+// - Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimers in
+// the documentation and/or other materials provided with the distribution.
+// - Neither the names of Advanced Micro Devices, Inc,
+// nor the names of its contributors may be used to endorse or promote
+// products derived from this Software without specific prior written
+// permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef HSA_RUNTIME_INC_HSA_EXT_FINALIZE_H_
+#define HSA_RUNTIME_INC_HSA_EXT_FINALIZE_H_
+
+#include "hsa.h"
+
+#undef HSA_API
+#ifdef HSA_EXPORT_FINALIZER
+#define HSA_API HSA_API_EXPORT
+#else
+#define HSA_API HSA_API_IMPORT
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+struct BrigModuleHeader;
+typedef struct BrigModuleHeader* BrigModule_t;
+
+/** \defgroup ext-alt-finalizer-extensions Finalization Extensions
+ * @{
+ */
+
+/**
+ * @brief Enumeration constants added to ::hsa_status_t by this extension.
+ */
+enum {
+ /**
+ * The HSAIL program is invalid.
+ */
+ HSA_EXT_STATUS_ERROR_INVALID_PROGRAM = 0x2000,
+ /**
+ * The HSAIL module is invalid.
+ */
+ HSA_EXT_STATUS_ERROR_INVALID_MODULE = 0x2001,
+ /**
+ * Machine model or profile of the HSAIL module do not match the machine model
+ * or profile of the HSAIL program.
+ */
+ HSA_EXT_STATUS_ERROR_INCOMPATIBLE_MODULE = 0x2002,
+ /**
+ * The HSAIL module is already a part of the HSAIL program.
+ */
+ HSA_EXT_STATUS_ERROR_MODULE_ALREADY_INCLUDED = 0x2003,
+ /**
+ * Compatibility mismatch between symbol declaration and symbol definition.
+ */
+ HSA_EXT_STATUS_ERROR_SYMBOL_MISMATCH = 0x2004,
+ /**
+ * The finalization encountered an error while finalizing a kernel or
+ * indirect function.
+ */
+ HSA_EXT_STATUS_ERROR_FINALIZATION_FAILED = 0x2005,
+ /**
+ * Mismatch between a directive in the control directive structure and in
+ * the HSAIL kernel.
+ */
+ HSA_EXT_STATUS_ERROR_DIRECTIVE_MISMATCH = 0x2006
+};
+
+/** @} */
+
+/** \defgroup ext-alt-finalizer-program Finalization Program
+ * @{
+ */
+
+/**
+ * @brief HSAIL (BRIG) module. The HSA Programmer's Reference Manual contains
+ * the definition of the BrigModule_t type.
+ */
+typedef BrigModule_t hsa_ext_module_t;
+
+/**
+ * @brief An opaque handle to a HSAIL program, which groups a set of HSAIL
+ * modules that collectively define functions and variables used by kernels and
+ * indirect functions.
+ */
+typedef struct hsa_ext_program_s {
+ /**
+ * Opaque handle.
+ */
+ uint64_t handle;
+} hsa_ext_program_t;
+
+/**
+ * @brief Create an empty HSAIL program.
+ *
+ * @param[in] machine_model Machine model used in the HSAIL program.
+ *
+ * @param[in] profile Profile used in the HSAIL program.
+ *
+ * @param[in] default_float_rounding_mode Default float rounding mode used in
+ * the HSAIL program.
+ *
+ * @param[in] options Vendor-specific options. May be NULL.
+ *
+ * @param[out] program Memory location where the HSA runtime stores the newly
+ * created HSAIL program handle.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is a failure to allocate
+ * resources required for the operation.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p machine_model is invalid,
+ * @p profile is invalid, @p default_float_rounding_mode is invalid, or
+ * @p program is NULL.
+ */
+hsa_status_t HSA_API hsa_ext_program_create(
+ hsa_machine_model_t machine_model,
+ hsa_profile_t profile,
+ hsa_default_float_rounding_mode_t default_float_rounding_mode,
+ const char *options,
+ hsa_ext_program_t *program);
+
+/**
+ * @brief Destroy a HSAIL program.
+ *
+ * @details The HSAIL program handle becomes invalid after it has been
+ * destroyed. Code object handles produced by ::hsa_ext_program_finalize are
+ * still valid after the HSAIL program has been destroyed, and can be used as
+ * intended. Resources allocated outside and associated with the HSAIL program
+ * (such as HSAIL modules that are added to the HSAIL program) can be released
+ * after the finalization program has been destroyed.
+ *
+ * @param[in] program HSAIL program.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_INVALID_PROGRAM The HSAIL program is
+ * invalid.
+ */
+hsa_status_t HSA_API hsa_ext_program_destroy(
+ hsa_ext_program_t program);
+
+/**
+ * @brief Add a HSAIL module to an existing HSAIL program.
+ *
+ * @details The HSA runtime does not perform a deep copy of the HSAIL module
+ * upon addition. Instead, it stores a pointer to the HSAIL module. The
+ * ownership of the HSAIL module belongs to the application, which must ensure
+ * that @p module is not released before destroying the HSAIL program.
+ *
+ * The HSAIL module is successfully added to the HSAIL program if @p module is
+ * valid, if all the declarations and definitions for the same symbol are
+ * compatible, and if @p module specifies a machine model and profile that match
+ * the HSAIL program.
+ *
+ * @param[in] program HSAIL program.
+ *
+ * @param[in] module HSAIL module. The application can add the same HSAIL module
+ * to @p program at most once. The HSAIL module must specify the same machine
+ * model and profile as @p program. If the default float rounding mode of @p
+ * module is not default, then it should match that of @p program.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is a failure to allocate
+ * resources required for the operation.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_INVALID_PROGRAM The HSAIL program is invalid.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_INVALID_MODULE The HSAIL module is invalid.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_INCOMPATIBLE_MODULE The machine model of @p
+ * module does not match machine model of @p program, or the profile of @p
+ * module does not match profile of @p program.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_MODULE_ALREADY_INCLUDED The HSAIL module is
+ * already a part of the HSAIL program.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_SYMBOL_MISMATCH Symbol declaration and symbol
+ * definition compatibility mismatch. See the symbol compatibility rules in the
+ * HSA Programming Reference Manual.
+ */
+hsa_status_t HSA_API hsa_ext_program_add_module(
+ hsa_ext_program_t program,
+ hsa_ext_module_t module);
+
+/**
+ * @brief Iterate over the HSAIL modules in a program, and invoke an
+ * application-defined callback on every iteration.
+ *
+ * @param[in] program HSAIL program.
+ *
+ * @param[in] callback Callback to be invoked once per HSAIL module in the
+ * program. The HSA runtime passes three arguments to the callback: the program,
+ * a HSAIL module, and the application data. If @p callback returns a status
+ * other than ::HSA_STATUS_SUCCESS for a particular iteration, the traversal
+ * stops and ::hsa_ext_program_iterate_modules returns that status value.
+ *
+ * @param[in] data Application data that is passed to @p callback on every
+ * iteration. May be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_INVALID_PROGRAM The program is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL.
+ */
+hsa_status_t HSA_API hsa_ext_program_iterate_modules(
+ hsa_ext_program_t program,
+ hsa_status_t (*callback)(hsa_ext_program_t program, hsa_ext_module_t module,
+ void* data),
+ void* data);
+
+/**
+ * @brief HSAIL program attributes.
+ */
+typedef enum {
+ /**
+ * Machine model specified when the HSAIL program was created. The type
+ * of this attribute is ::hsa_machine_model_t.
+ */
+ HSA_EXT_PROGRAM_INFO_MACHINE_MODEL = 0,
+ /**
+ * Profile specified when the HSAIL program was created. The type of
+ * this attribute is ::hsa_profile_t.
+ */
+ HSA_EXT_PROGRAM_INFO_PROFILE = 1,
+ /**
+ * Default float rounding mode specified when the HSAIL program was
+ * created. The type of this attribute is ::hsa_default_float_rounding_mode_t.
+ */
+ HSA_EXT_PROGRAM_INFO_DEFAULT_FLOAT_ROUNDING_MODE = 2
+} hsa_ext_program_info_t;
+
+/**
+ * @brief Get the current value of an attribute for a given HSAIL program.
+ *
+ * @param[in] program HSAIL program.
+ *
+ * @param[in] attribute Attribute to query.
+ *
+ * @param[out] value Pointer to an application-allocated buffer where to store
+ * the value of the attribute. If the buffer passed by the application is not
+ * large enough to hold the value of @p attribute, the behaviour is undefined.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_INVALID_PROGRAM The HSAIL program is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid
+ * HSAIL program attribute, or @p value is NULL.
+ */
+hsa_status_t HSA_API hsa_ext_program_get_info(
+ hsa_ext_program_t program,
+ hsa_ext_program_info_t attribute,
+ void *value);
+
+/**
+ * @brief Finalizer-determined call convention.
+ */
+typedef enum {
+ /**
+ * Finalizer-determined call convention.
+ */
+ HSA_EXT_FINALIZER_CALL_CONVENTION_AUTO = -1
+} hsa_ext_finalizer_call_convention_t;
+
+/**
+ * @brief Control directives specify low-level information about the
+ * finalization process.
+ */
+typedef struct hsa_ext_control_directives_s {
+ /**
+ * Bitset indicating which control directives are enabled. The bit assigned to
+ * a control directive is determined by the corresponding value in
+ * BrigControlDirective.
+ *
+ * If a control directive is disabled, its corresponding field value (if any)
+ * must be 0. Control directives that are only present or absent (such as
+ * partial workgroups) have no corresponding field as the presence of the bit
+ * in this mask is sufficient.
+ */
+ uint64_t control_directives_mask;
+ /**
+ * Bitset of HSAIL exceptions that must have the BREAK policy enabled. The bit
+ * assigned to an HSAIL exception is determined by the corresponding value
+ * in BrigExceptionsMask. If the kernel contains an enablebreakexceptions
+ * control directive, the finalizer uses the union of the two masks.
+ */
+ uint16_t break_exceptions_mask;
+ /**
+ * Bitset of HSAIL exceptions that must have the DETECT policy enabled. The
+ * bit assigned to an HSAIL exception is determined by the corresponding value
+ * in BrigExceptionsMask. If the kernel contains an enabledetectexceptions
+ * control directive, the finalizer uses the union of the two masks.
+ */
+ uint16_t detect_exceptions_mask;
+ /**
+ * Maximum size (in bytes) of dynamic group memory that will be allocated by
+ * the application for any dispatch of the kernel. If the kernel contains a
+ * maxdynamicsize control directive, the two values should match.
+ */
+ uint32_t max_dynamic_group_size;
+ /**
+ * Maximum number of grid work-items that will be used by the application to
+ * launch the kernel. If the kernel contains a maxflatgridsize control
+ * directive, the value of @a max_flat_grid_size must not be greater than the
+ * value of the directive, and takes precedence.
+ *
+ * The value specified for maximum absolute grid size must be greater than or
+ * equal to the product of the values specified by @a required_grid_size.
+ *
+ * If the bit at position BRIG_CONTROL_MAXFLATGRIDSIZE is set in @a
+ * control_directives_mask, this field must be greater than 0.
+ */
+ uint64_t max_flat_grid_size;
+ /**
+ * Maximum number of work-group work-items that will be used by the
+ * application to launch the kernel. If the kernel contains a
+ * maxflatworkgroupsize control directive, the value of @a
+ * max_flat_workgroup_size must not be greater than the value of the
+ * directive, and takes precedence.
+ *
+ * The value of @a max_flat_workgroup_size must be greater than or equal to
+ * the product of the values specified by @a required_workgroup_size.
+ *
+ * If the bit at position BRIG_CONTROL_MAXFLATWORKGROUPSIZE is set in @a
+ * control_directives_mask, this field must be greater than 0.
+ */
+ uint32_t max_flat_workgroup_size;
+ /**
+ * Reserved. Must be 0.
+ */
+ uint32_t reserved1;
+ /**
+ * Grid size that will be used by the application in any dispatch of the
+ * kernel. If the kernel contains a requiredgridsize control directive, the
+ * dimensions should match.
+ *
+ * The specified grid size must be consistent with @a required_workgroup_size
+ * and @a required_dim. Also, the product of the three dimensions must not
+ * exceed @a max_flat_grid_size. Note that the listed invariants must hold
+ * only if all the corresponding control directives are enabled.
+ *
+ * If the bit at position BRIG_CONTROL_REQUIREDGRIDSIZE is set in @a
+ * control_directives_mask, the three dimension values must be greater than 0.
+ */
+ uint64_t required_grid_size[3];
+ /**
+ * Work-group size that will be used by the application in any dispatch of the
+ * kernel. If the kernel contains a requiredworkgroupsize control directive,
+ * the dimensions should match.
+ *
+ * The specified work-group size must be consistent with @a required_grid_size
+ * and @a required_dim. Also, the product of the three dimensions must not
+ * exceed @a max_flat_workgroup_size. Note that the listed invariants must
+ * hold only if all the corresponding control directives are enabled.
+ *
+ * If the bit at position BRIG_CONTROL_REQUIREDWORKGROUPSIZE is set in @a
+ * control_directives_mask, the three dimension values must be greater than 0.
+ */
+ hsa_dim3_t required_workgroup_size;
+ /**
+ * Number of dimensions that will be used by the application to launch the
+ * kernel. If the kernel contains a requireddim control directive, the two
+ * values should match.
+ *
+ * The specified dimensions must be consistent with @a required_grid_size and
+ * @a required_workgroup_size. This invariant must hold only if all the
+ * corresponding control directives are enabled.
+ *
+ * If the bit at position BRIG_CONTROL_REQUIREDDIM is set in @a
+ * control_directives_mask, this field must be 1, 2, or 3.
+ */
+ uint8_t required_dim;
+ /**
+ * Reserved. Must be 0.
+ */
+ uint8_t reserved2[75];
+} hsa_ext_control_directives_t;
+
+/**
+ * @brief Finalize an HSAIL program for a given instruction set architecture.
+ *
+ * @details Finalize all of the kernels and indirect functions that belong to
+ * the same HSAIL program for a specific instruction set architecture (ISA). The
+ * transitive closure of all functions specified by call or scall must be
+ * defined. Kernels and indirect functions that are being finalized must be
+ * defined. Kernels and indirect functions that are referenced in kernels and
+ * indirect functions being finalized may or may not be defined, but must be
+ * declared. All the global/readonly segment variables that are referenced in
+ * kernels and indirect functions being finalized may or may not be defined, but
+ * must be declared.
+ *
+ * @param[in] program HSAIL program.
+ *
+ * @param[in] isa Instruction set architecture to finalize for.
+ *
+ * @param[in] call_convention A call convention used in a finalization. Must
+ * have a value between ::HSA_EXT_FINALIZER_CALL_CONVENTION_AUTO (inclusive)
+ * and the value of the attribute ::HSA_ISA_INFO_CALL_CONVENTION_COUNT in @p
+ * isa (not inclusive).
+ *
+ * @param[in] control_directives Low-level control directives that influence
+ * the finalization process.
+ *
+ * @param[in] options Vendor-specific options. May be NULL.
+ *
+ * @param[in] code_object_type Type of code object to produce.
+ *
+ * @param[out] code_object Code object generated by the Finalizer, which
+ * contains the machine code for the kernels and indirect functions in the HSAIL
+ * program. The code object is independent of the HSAIL module that was used to
+ * generate it.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is a failure to allocate
+ * resources required for the operation.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_INVALID_PROGRAM The HSAIL program is
+ * invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ISA @p isa is invalid.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_DIRECTIVE_MISMATCH A directive in
+ * the control directive structure does not match the one in the HSAIL kernel,
+ * or the same directive is used with a different value in one of the functions
+ * used by this kernel.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_FINALIZATION_FAILED The Finalizer
+ * encountered an error while compiling a kernel or an indirect function.
+ */
+hsa_status_t HSA_API hsa_ext_program_finalize(
+ hsa_ext_program_t program,
+ hsa_isa_t isa,
+ int32_t call_convention,
+ hsa_ext_control_directives_t control_directives,
+ const char *options,
+ hsa_code_object_type_t code_object_type,
+ hsa_code_object_t *code_object);
+
+/** @} */
+
+#define hsa_ext_finalizer_1_00
+
+typedef struct hsa_ext_finalizer_1_00_pfn_s {
+ hsa_status_t (*hsa_ext_program_create)(
+ hsa_machine_model_t machine_model, hsa_profile_t profile,
+ hsa_default_float_rounding_mode_t default_float_rounding_mode,
+ const char *options, hsa_ext_program_t *program);
+
+ hsa_status_t (*hsa_ext_program_destroy)(hsa_ext_program_t program);
+
+ hsa_status_t (*hsa_ext_program_add_module)(hsa_ext_program_t program,
+ hsa_ext_module_t module);
+
+ hsa_status_t (*hsa_ext_program_iterate_modules)(
+ hsa_ext_program_t program,
+ hsa_status_t (*callback)(hsa_ext_program_t program,
+ hsa_ext_module_t module, void *data),
+ void *data);
+
+ hsa_status_t (*hsa_ext_program_get_info)(
+ hsa_ext_program_t program, hsa_ext_program_info_t attribute,
+ void *value);
+
+ hsa_status_t (*hsa_ext_program_finalize)(
+ hsa_ext_program_t program, hsa_isa_t isa, int32_t call_convention,
+ hsa_ext_control_directives_t control_directives, const char *options,
+ hsa_code_object_type_t code_object_type, hsa_code_object_t *code_object);
+} hsa_ext_finalizer_1_00_pfn_t;
+
+#ifdef __cplusplus
+} // extern "C" block
+#endif // __cplusplus
+
+#endif // HSA_RUNTIME_INC_HSA_EXT_FINALIZE_H_
diff --git a/third_party/rocm/include/hsa/hsa_ext_image.h b/third_party/rocm/include/hsa/hsa_ext_image.h
new file mode 100644
index 0000000..b25f168
--- /dev/null
+++ b/third_party/rocm/include/hsa/hsa_ext_image.h
@@ -0,0 +1,1454 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+// AMD Research and AMD HSA Software Development
+//
+// Advanced Micro Devices, Inc.
+//
+// www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimers.
+// - Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimers in
+// the documentation and/or other materials provided with the distribution.
+// - Neither the names of Advanced Micro Devices, Inc,
+// nor the names of its contributors may be used to endorse or promote
+// products derived from this Software without specific prior written
+// permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef HSA_EXT_IMAGE_H
+#define HSA_EXT_IMAGE_H
+
+#include "hsa.h"
+
+#undef HSA_API
+#ifdef HSA_EXPORT_IMAGES
+#define HSA_API HSA_API_EXPORT
+#else
+#define HSA_API HSA_API_IMPORT
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif /*__cplusplus*/
+
+/** \defgroup ext-images Images and Samplers
+ * @{
+ */
+
+/**
+ * @brief Enumeration constants added to ::hsa_status_t by this extension.
+ *
+ * @remark Additions to hsa_status_t
+ */
+enum {
+ /**
+ * Image format is not supported.
+ */
+ HSA_EXT_STATUS_ERROR_IMAGE_FORMAT_UNSUPPORTED = 0x3000,
+ /**
+ * Image size is not supported.
+ */
+ HSA_EXT_STATUS_ERROR_IMAGE_SIZE_UNSUPPORTED = 0x3001,
+ /**
+ * Image pitch is not supported or invalid.
+ */
+ HSA_EXT_STATUS_ERROR_IMAGE_PITCH_UNSUPPORTED = 0x3002,
+ /**
+ * Sampler descriptor is not supported or invalid.
+ */
+ HSA_EXT_STATUS_ERROR_SAMPLER_DESCRIPTOR_UNSUPPORTED = 0x3003
+};
+
+/**
+ * @brief Enumeration constants added to ::hsa_agent_info_t by this
+ * extension.
+ *
+ * @remark Additions to hsa_agent_info_t
+ */
+enum {
+ /**
+ * Maximum number of elements in 1D images. Must be at least 16384. The type
+ * of this attribute is size_t.
+ */
+ HSA_EXT_AGENT_INFO_IMAGE_1D_MAX_ELEMENTS = 0x3000,
+ /**
+ * Maximum number of elements in 1DA images. Must be at least 16384. The type
+ * of this attribute is size_t.
+ */
+ HSA_EXT_AGENT_INFO_IMAGE_1DA_MAX_ELEMENTS = 0x3001,
+ /**
+ * Maximum number of elements in 1DB images. Must be at least 65536. The type
+ * of this attribute is size_t.
+ */
+ HSA_EXT_AGENT_INFO_IMAGE_1DB_MAX_ELEMENTS = 0x3002,
+ /**
+ * Maximum dimensions (width, height) of 2D images, in image elements. The X
+ * and Y maximums must be at least 16384. The type of this attribute is
+ * size_t[2].
+ */
+ HSA_EXT_AGENT_INFO_IMAGE_2D_MAX_ELEMENTS = 0x3003,
+ /**
+ * Maximum dimensions (width, height) of 2DA images, in image elements. The X
+ * and Y maximums must be at least 16384. The type of this attribute is
+ * size_t[2].
+ */
+ HSA_EXT_AGENT_INFO_IMAGE_2DA_MAX_ELEMENTS = 0x3004,
+ /**
+ * Maximum dimensions (width, height) of 2DDEPTH images, in image
+ * elements. The X and Y maximums must be at least 16384. The type of this
+ * attribute is size_t[2].
+ */
+ HSA_EXT_AGENT_INFO_IMAGE_2DDEPTH_MAX_ELEMENTS = 0x3005,
+ /**
+ * Maximum dimensions (width, height) of 2DADEPTH images, in image
+ * elements. The X and Y maximums must be at least 16384. The type of this
+ * attribute is size_t[2].
+ */
+ HSA_EXT_AGENT_INFO_IMAGE_2DADEPTH_MAX_ELEMENTS = 0x3006,
+ /**
+ * Maximum dimensions (width, height, depth) of 3D images, in image
+ * elements. The maximum along any dimension must be at least 2048. The type
+ * of this attribute is size_t[3].
+ */
+ HSA_EXT_AGENT_INFO_IMAGE_3D_MAX_ELEMENTS = 0x3007,
+ /**
+ * Maximum number of image layers in an image array. Must be at least 2048.
+ * The type of this attribute is size_t.
+ */
+ HSA_EXT_AGENT_INFO_IMAGE_ARRAY_MAX_LAYERS = 0x3008,
+ /**
+ * Maximum number of read-only image handles that can be created for an agent at any one
+ * time. Must be at least 128. The type of this attribute is size_t.
+ */
+ HSA_EXT_AGENT_INFO_MAX_IMAGE_RD_HANDLES = 0x3009,
+ /**
+ * Maximum number of write-only and read-write image handles (combined) that
+ * can be created for an agent at any one time. Must be at least 64. The type of this
+ * attribute is size_t.
+ */
+ HSA_EXT_AGENT_INFO_MAX_IMAGE_RORW_HANDLES = 0x300A,
+ /**
+ * Maximum number of sampler handlers that can be created for an agent at any one
+ * time. Must be at least 16. The type of this attribute is size_t.
+ */
+ HSA_EXT_AGENT_INFO_MAX_SAMPLER_HANDLERS = 0x300B,
+ /**
+ * Image pitch alignment. The agent only supports linear image data
+ * layouts with a row pitch that is a multiple of this value. Must be
+ * a power of 2. The type of this attribute is size_t.
+ */
+ HSA_EXT_AGENT_INFO_IMAGE_LINEAR_ROW_PITCH_ALIGNMENT = 0x300C
+};
+
+/**
+ * @brief Image handle, populated by ::hsa_ext_image_create or
+ * ::hsa_ext_image_create_with_layout. Image
+ * handles are only unique within an agent, not across agents.
+ *
+ */
+typedef struct hsa_ext_image_s {
+ /**
+ * Opaque handle. For a given agent, two handles reference the same object of
+ * the enclosing type if and only if they are equal.
+ */
+ uint64_t handle;
+
+} hsa_ext_image_t;
+
+/**
+ * @brief Geometry associated with the image. This specifies the
+ * number of image dimensions and whether the image is an image
+ * array. See the <em>Image Geometry</em> section in the <em>HSA
+ * Programming Reference Manual</em> for definitions on each
+ * geometry. The enumeration values match the BRIG type @p
+ * hsa_ext_brig_image_geometry_t.
+ */
+typedef enum {
+/**
+ * One-dimensional image addressed by width coordinate.
+ */
+ HSA_EXT_IMAGE_GEOMETRY_1D = 0,
+
+ /**
+ * Two-dimensional image addressed by width and height coordinates.
+ */
+ HSA_EXT_IMAGE_GEOMETRY_2D = 1,
+
+ /**
+ * Three-dimensional image addressed by width, height, and depth coordinates.
+ */
+ HSA_EXT_IMAGE_GEOMETRY_3D = 2,
+
+ /**
+ * Array of one-dimensional images with the same size and format. 1D arrays
+ * are addressed by width and index coordinate.
+ */
+ HSA_EXT_IMAGE_GEOMETRY_1DA = 3,
+
+ /**
+ * Array of two-dimensional images with the same size and format. 2D arrays
+ * are addressed by width, height, and index coordinates.
+ */
+ HSA_EXT_IMAGE_GEOMETRY_2DA = 4,
+
+ /**
+ * One-dimensional image addressed by width coordinate. It has
+ * specific restrictions compared to ::HSA_EXT_IMAGE_GEOMETRY_1D. An
+ * image with an opaque image data layout will always use a linear
+ * image data layout, and one with an explicit image data layout
+ * must specify ::HSA_EXT_IMAGE_DATA_LAYOUT_LINEAR.
+ */
+ HSA_EXT_IMAGE_GEOMETRY_1DB = 5,
+
+ /**
+ * Two-dimensional depth image addressed by width and height coordinates.
+ */
+ HSA_EXT_IMAGE_GEOMETRY_2DDEPTH = 6,
+
+ /**
+ * Array of two-dimensional depth images with the same size and format. 2D
+ * arrays are addressed by width, height, and index coordinates.
+ */
+ HSA_EXT_IMAGE_GEOMETRY_2DADEPTH = 7
+} hsa_ext_image_geometry_t;
+
+/**
+ * @brief Channel type associated with the elements of an image. See
+ * the <em>Channel Type</em> section in the <em>HSA Programming Reference
+ * Manual</em> for definitions on each channel type. The
+ * enumeration values and definition match the BRIG type @p
+ * hsa_ext_brig_image_channel_type_t.
+ */
+typedef enum {
+ HSA_EXT_IMAGE_CHANNEL_TYPE_SNORM_INT8 = 0,
+ HSA_EXT_IMAGE_CHANNEL_TYPE_SNORM_INT16 = 1,
+ HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT8 = 2,
+ HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT16 = 3,
+ HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT24 = 4,
+ HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555 = 5,
+ HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565 = 6,
+ HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_101010 = 7,
+ HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT8 = 8,
+ HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT16 = 9,
+ HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT32 = 10,
+ HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8 = 11,
+ HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16 = 12,
+ HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32 = 13,
+ HSA_EXT_IMAGE_CHANNEL_TYPE_HALF_FLOAT = 14,
+ HSA_EXT_IMAGE_CHANNEL_TYPE_FLOAT = 15
+} hsa_ext_image_channel_type_t;
+
+/**
+ * @brief A fixed-size type used to represent ::hsa_ext_image_channel_type_t constants.
+ */
+typedef uint32_t hsa_ext_image_channel_type32_t;
+
+/**
+ *
+ * @brief Channel order associated with the elements of an image. See
+ * the <em>Channel Order</em> section in the <em>HSA Programming Reference
+ * Manual</em> for definitions on each channel order. The
+ * enumeration values match the BRIG type @p
+ * hsa_ext_brig_image_channel_order_t.
+ */
+typedef enum {
+ HSA_EXT_IMAGE_CHANNEL_ORDER_A = 0,
+ HSA_EXT_IMAGE_CHANNEL_ORDER_R = 1,
+ HSA_EXT_IMAGE_CHANNEL_ORDER_RX = 2,
+ HSA_EXT_IMAGE_CHANNEL_ORDER_RG = 3,
+ HSA_EXT_IMAGE_CHANNEL_ORDER_RGX = 4,
+ HSA_EXT_IMAGE_CHANNEL_ORDER_RA = 5,
+ HSA_EXT_IMAGE_CHANNEL_ORDER_RGB = 6,
+ HSA_EXT_IMAGE_CHANNEL_ORDER_RGBX = 7,
+ HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA = 8,
+ HSA_EXT_IMAGE_CHANNEL_ORDER_BGRA = 9,
+ HSA_EXT_IMAGE_CHANNEL_ORDER_ARGB = 10,
+ HSA_EXT_IMAGE_CHANNEL_ORDER_ABGR = 11,
+ HSA_EXT_IMAGE_CHANNEL_ORDER_SRGB = 12,
+ HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBX = 13,
+ HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBA = 14,
+ HSA_EXT_IMAGE_CHANNEL_ORDER_SBGRA = 15,
+ HSA_EXT_IMAGE_CHANNEL_ORDER_INTENSITY = 16,
+ HSA_EXT_IMAGE_CHANNEL_ORDER_LUMINANCE = 17,
+ HSA_EXT_IMAGE_CHANNEL_ORDER_DEPTH = 18,
+ HSA_EXT_IMAGE_CHANNEL_ORDER_DEPTH_STENCIL = 19
+} hsa_ext_image_channel_order_t;
+
+/**
+ * @brief A fixed-size type used to represent ::hsa_ext_image_channel_order_t constants.
+ */
+typedef uint32_t hsa_ext_image_channel_order32_t;
+
+
+/**
+ * @brief Image format.
+ */
+typedef struct hsa_ext_image_format_s {
+ /**
+ * Channel type.
+ */
+ hsa_ext_image_channel_type32_t channel_type;
+
+ /**
+ * Channel order.
+ */
+ hsa_ext_image_channel_order32_t channel_order;
+} hsa_ext_image_format_t;
+
+/**
+ * @brief Implementation independent image descriptor.
+ */
+typedef struct hsa_ext_image_descriptor_s {
+ /**
+ * Image geometry.
+ */
+ hsa_ext_image_geometry_t geometry;
+ /**
+ * Width of the image, in components.
+ */
+ size_t width;
+ /**
+ * Height of the image, in components. Only used if the geometry is
+ * ::HSA_EXT_IMAGE_GEOMETRY_2D, ::HSA_EXT_IMAGE_GEOMETRY_3D,
+ * ::HSA_EXT_IMAGE_GEOMETRY_2DA, ::HSA_EXT_IMAGE_GEOMETRY_2DDEPTH, or
+ * ::HSA_EXT_IMAGE_GEOMETRY_2DADEPTH, otherwise must be 0.
+ */
+ size_t height;
+ /**
+ * Depth of the image, in components. Only used if the geometry is
+ * ::HSA_EXT_IMAGE_GEOMETRY_3D, otherwise must be 0.
+ */
+ size_t depth;
+ /**
+ * Number of image layers in the image array. Only used if the geometry is
+ * ::HSA_EXT_IMAGE_GEOMETRY_1DA, ::HSA_EXT_IMAGE_GEOMETRY_2DA, or
+ * ::HSA_EXT_IMAGE_GEOMETRY_2DADEPTH, otherwise must be 0.
+ */
+ size_t array_size;
+ /**
+ * Image format.
+ */
+ hsa_ext_image_format_t format;
+} hsa_ext_image_descriptor_t;
+
+/**
+ * @brief Image capability.
+ */
+typedef enum {
+ /**
+ * Images of this geometry, format, and layout are not supported by
+ * the agent.
+ */
+ HSA_EXT_IMAGE_CAPABILITY_NOT_SUPPORTED = 0x0,
+ /**
+ * Read-only images of this geometry, format, and layout are
+ * supported by the agent.
+ */
+ HSA_EXT_IMAGE_CAPABILITY_READ_ONLY = 0x1,
+ /**
+ * Write-only images of this geometry, format, and layout are
+ * supported by the agent.
+ */
+ HSA_EXT_IMAGE_CAPABILITY_WRITE_ONLY = 0x2,
+ /**
+ * Read-write images of this geometry, format, and layout are
+ * supported by the agent.
+ */
+ HSA_EXT_IMAGE_CAPABILITY_READ_WRITE = 0x4,
+ /**
+ * @deprecated Images of this geometry, format, and layout can be accessed from
+ * read-modify-write atomic operations in the agent.
+ */
+ HSA_EXT_IMAGE_CAPABILITY_READ_MODIFY_WRITE = 0x8,
+ /**
+ * Images of this geometry, format, and layout are guaranteed to
+ * have a consistent data layout regardless of how they are
+ * accessed by the associated agent.
+ */
+ HSA_EXT_IMAGE_CAPABILITY_ACCESS_INVARIANT_DATA_LAYOUT = 0x10
+} hsa_ext_image_capability_t;
+
+/**
+ * @brief Image data layout.
+ *
+ * @details An image data layout denotes such aspects of image data
+ * layout as tiling and organization of channels in memory. Some image
+ * data layouts may only apply to specific image geometries, formats,
+ * and access permissions. Different agents may support different
+ * image layout identifiers, including vendor specific layouts. Note
+ * that an agent may not support the same image data layout for
+ * different access permissions to images with the same image
+ * geometry, size, and format. If multiple agents support the same
+ * image data layout then it is possible to use separate image handles
+ * for each agent that references the same image data.
+ */
+
+typedef enum {
+ /**
+ * An implementation specific opaque image data layout which can
+ * vary depending on the agent, geometry, image format, image size,
+ * and access permissions.
+ */
+ HSA_EXT_IMAGE_DATA_LAYOUT_OPAQUE = 0x0,
+ /**
+ * The image data layout is specified by the following rules in
+ * ascending byte address order. For a 3D image, 2DA image array,
+ * or 1DA image array, the image data is stored as a linear sequence
+ * of adjacent 2D image slices, 2D images, or 1D images
+ * respectively, spaced according to the slice pitch. Each 2D image
+ * is stored as a linear sequence of adjacent image rows, spaced
+ * according to the row pitch. Each 1D or 1DB image is stored as a
+ * single image row. Each image row is stored as a linear sequence
+ * of image elements. Each image element is stored as a linear
+ * sequence of image components specified by the left to right
+ * channel order definition. Each image component is stored using
+ * the memory type specified by the channel type.
+ *
+ * The 1DB image geometry always uses the linear image data layout.
+ */
+ HSA_EXT_IMAGE_DATA_LAYOUT_LINEAR = 0x1
+} hsa_ext_image_data_layout_t;
+
+/**
+ * @brief Retrieve the supported image capabilities for a given combination of
+ * agent, geometry, and image format for an image created with an opaque image
+ * data layout.
+ *
+ * @param[in] agent Agent to be associated with the image handle.
+ *
+ * @param[in] geometry Geometry.
+ *
+ * @param[in] image_format Pointer to an image format. Must not be NULL.
+ *
+ * @param[out] capability_mask Pointer to a memory location where the HSA
+ * runtime stores a bit-mask of supported image capability
+ * (::hsa_ext_image_capability_t) values. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p image_format is
+ * NULL, or @p capability_mask is NULL.
+ */
+hsa_status_t HSA_API hsa_ext_image_get_capability(
+ hsa_agent_t agent,
+ hsa_ext_image_geometry_t geometry,
+ const hsa_ext_image_format_t *image_format,
+ uint32_t *capability_mask);
+
+/**
+ * @brief Retrieve the supported image capabilities for a given combination of
+ * agent, geometry, image format, and image layout for an image created with
+ * an explicit image data layout.
+ *
+ * @param[in] agent Agent to be associated with the image handle.
+ *
+ * @param[in] geometry Geometry.
+ *
+ * @param[in] image_format Pointer to an image format. Must not be NULL.
+ *
+ * @param[in] image_data_layout The image data layout.
+ * It is invalid to use ::HSA_EXT_IMAGE_DATA_LAYOUT_OPAQUE; use
+ * ::hsa_ext_image_get_capability instead.
+ *
+ * @param[out] capability_mask Pointer to a memory location where the HSA
+ * runtime stores a bit-mask of supported image capability
+ * (::hsa_ext_image_capability_t) values. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p image_format is
+ * NULL, @p image_data_layout is ::HSA_EXT_IMAGE_DATA_LAYOUT_OPAQUE,
+ * or @p capability_mask is NULL.
+ */
+hsa_status_t HSA_API hsa_ext_image_get_capability_with_layout(
+ hsa_agent_t agent,
+ hsa_ext_image_geometry_t geometry,
+ const hsa_ext_image_format_t *image_format,
+ hsa_ext_image_data_layout_t image_data_layout,
+ uint32_t *capability_mask);
+
+/**
+ * @brief Agent specific image size and alignment requirements, populated by
+ * ::hsa_ext_image_data_get_info and ::hsa_ext_image_data_get_info_with_layout.
+ */
+typedef struct hsa_ext_image_data_info_s {
+ /**
+ * Image data size, in bytes.
+ */
+ size_t size;
+
+ /**
+ * Image data alignment, in bytes. Must always be a power of 2.
+ */
+ size_t alignment;
+
+} hsa_ext_image_data_info_t;
+
+/**
+ * @brief Retrieve the image data requirements for a given combination of agent, image
+ * descriptor, and access permission for an image created with an opaque image
+ * data layout.
+ *
+ * @details The optimal image data size and alignment requirements may
+ * vary depending on the image attributes specified in @p
+ * image_descriptor, the @p access_permission, and the @p agent. Also,
+ * different implementations of the HSA runtime may return different
+ * requirements for the same input values.
+ *
+ * The implementation must return the same image data requirements for
+ * different access permissions with matching image descriptors as long
+ * as ::hsa_ext_image_get_capability reports
+ * ::HSA_EXT_IMAGE_CAPABILITY_ACCESS_INVARIANT_DATA_LAYOUT. Image
+ * descriptors match if they have the same values, with the exception
+ * that s-form channel orders match the corresponding non-s-form
+ * channel order and vice versa.
+ *
+ * @param[in] agent Agent to be associated with the image handle.
+ *
+ * @param[in] image_descriptor Pointer to an image descriptor. Must not be NULL.
+ *
+ * @param[in] access_permission Access permission of the image when
+ * accessed by @p agent. The access permission defines how the agent
+ * is allowed to access the image and must match the corresponding
+ * HSAIL image handle type. The @p agent must support the image format
+ * specified in @p image_descriptor for the given @p
+ * access_permission.
+ *
+ * @param[out] image_data_info Memory location where the runtime stores the
+ * size and alignment requirements. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_FORMAT_UNSUPPORTED The @p
+ * agent does not support the image format specified by @p
+ * image_descriptor with the specified @p access_permission.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_SIZE_UNSUPPORTED The agent
+ * does not support the image dimensions specified by @p
+ * image_descriptor with the specified @p access_permission.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p image_descriptor is NULL, @p
+ * access_permission is not a valid access permission value, or @p
+ * image_data_info is NULL.
+ */
+hsa_status_t HSA_API hsa_ext_image_data_get_info(
+ hsa_agent_t agent,
+ const hsa_ext_image_descriptor_t *image_descriptor,
+ hsa_access_permission_t access_permission,
+ hsa_ext_image_data_info_t *image_data_info);
+
+/**
+ * @brief Retrieve the image data requirements for a given combination of
+ * image descriptor, access permission, image data layout, image data row pitch,
+ * and image data slice pitch for an image created with an explicit image
+ * data layout.
+ *
+ * @details The image data size and alignment requirements may vary
+ * depending on the image attributes specified in @p image_descriptor,
+ * the @p access_permission, and the image layout. However, different
+ * implementations of the HSA runtime will return the same
+ * requirements for the same input values.
+ *
+ * The implementation must return the same image data requirements for different
+ * access permissions with matching image descriptors and matching image layouts
+ * as long as ::hsa_ext_image_get_capability reports
+ * ::HSA_EXT_IMAGE_CAPABILITY_ACCESS_INVARIANT_DATA_LAYOUT. Image descriptors
+ * match if they have the same values, with the exception that s-form channel
+ * orders match the corresponding non-s-form channel order and vice versa. Image
+ * layouts match if they are the same image data layout and use the same image
+ * row and slice pitch values.
+ *
+ * @param[in] agent Agent to be associated with the image handle.
+ *
+ * @param[in] image_descriptor Pointer to an image descriptor. Must not be NULL.
+ *
+ * @param[in] access_permission Access permission of the image when
+ * accessed by an agent. The access permission defines how the agent
+ * is allowed to access the image and must match the corresponding
+ * HSAIL image handle type.
+ *
+ * @param[in] image_data_layout The image data layout to use.
+ * It is invalid to use ::HSA_EXT_IMAGE_DATA_LAYOUT_OPAQUE; use
+ * ::hsa_ext_image_data_get_info instead.
+ *
+ * @param[in] image_data_row_pitch The size in bytes for a single row
+ * of the image in the image data. If 0 is specified then the default
+ * row pitch value is used: image width * image element byte size.
+ * The value used must be greater than or equal to the default row
+ * pitch, and be a multiple of the image element byte size. For the
+ * linear image layout it must also be a multiple of the image linear
+ * row pitch alignment for the agents that will access the image data
+ * using image instructions.
+ *
+ * @param[in] image_data_slice_pitch The size in bytes of a single
+ * slice of a 3D image, or the size in bytes of each image layer in an
+ * image array in the image data. If 0 is specified then the default
+ * slice pitch value is used: row pitch * height if geometry is
+ * ::HSA_EXT_IMAGE_GEOMETRY_3D, ::HSA_EXT_IMAGE_GEOMETRY_2DA, or
+ * ::HSA_EXT_IMAGE_GEOMETRY_2DADEPTH; row pitch if geometry is
+ * ::HSA_EXT_IMAGE_GEOMETRY_1DA; and 0 otherwise. The value used must
+ * be 0 if the default slice pitch is 0, be greater than or equal to
+ * the default slice pitch, and be a multiple of the row pitch.
+ *
+ * @param[out] image_data_info Memory location where the runtime stores the
+ * size and alignment requirements. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_FORMAT_UNSUPPORTED The image
+ * format specified by @p image_descriptor is not supported for the
+ * @p access_permission and @p image_data_layout specified.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_SIZE_UNSUPPORTED The image
+ * dimensions specified by @p image_descriptor are not supported for
+ * the @p access_permission and @p image_data_layout specified.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_PITCH_UNSUPPORTED The row and
+ * slice pitch specified by @p image_data_row_pitch and @p
+ * image_data_slice_pitch are invalid or not supported.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p image_descriptor is
+ * NULL, @p image_data_layout is ::HSA_EXT_IMAGE_DATA_LAYOUT_OPAQUE,
+ * or @p image_data_info is NULL.
+ */
+hsa_status_t HSA_API hsa_ext_image_data_get_info_with_layout(
+ hsa_agent_t agent,
+ const hsa_ext_image_descriptor_t *image_descriptor,
+ hsa_access_permission_t access_permission,
+ hsa_ext_image_data_layout_t image_data_layout,
+ size_t image_data_row_pitch,
+ size_t image_data_slice_pitch,
+ hsa_ext_image_data_info_t *image_data_info);
+
+/**
+ * @brief Creates an agent specific image handle to an image with an
+ * opaque image data layout.
+ *
+ * @details Images with an opaque image data layout created with
+ * different access permissions but matching image descriptors and
+ * same agent can share the same image data if
+ * ::HSA_EXT_IMAGE_CAPABILITY_ACCESS_INVARIANT_DATA_LAYOUT is reported
+ * by ::hsa_ext_image_get_capability for the image format specified in
+ * the image descriptor. Image descriptors match if they have the same
+ * values, with the exception that s-form channel orders match the
+ * corresponding non-s-form channel order and vice versa.
+ *
+ * If necessary, an application can use image operations (import,
+ * export, copy, clear) to prepare the image for the intended use
+ * regardless of the access permissions.
+ *
+ * @param[in] agent Agent to be associated with the image handle created.
+ *
+ * @param[in] image_descriptor Pointer to an image descriptor. Must not be NULL.
+ *
+ * @param[in] image_data Image data buffer that must have been allocated
+ * according to the size and alignment requirements dictated by
+ * ::hsa_ext_image_data_get_info. Must not be NULL.
+ *
+ * Any previous memory contents are preserved upon creation. The application is
+ * responsible for ensuring that the lifetime of the image data exceeds that of
+ * all the associated images.
+ *
+ * @param[in] access_permission Access permission of the image when
+ * accessed by agent. The access permission defines how the agent
+ * is allowed to access the image using the image handle created and
+ * must match the corresponding HSAIL image handle type. The agent
+ * must support the image format specified in @p image_descriptor for
+ * the given @p access_permission.
+ *
+ * @param[out] image Pointer to a memory location where the HSA runtime stores
+ * the newly created image handle. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_FORMAT_UNSUPPORTED The agent
+ * does not have the capability to support the image format contained
+ * in @p image_descriptor using the specified @p access_permission.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_SIZE_UNSUPPORTED The agent
+ * does not support the image dimensions specified by @p
+ * image_descriptor using the specified @p access_permission.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate
+ * the required resources.
+ *
+ * support the creation of more image handles with the given @p access_permission).
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p image_descriptor is NULL, @p
+ * image_data is NULL, @p image_data does not have a valid alignment,
+ * @p access_permission is not a valid access permission
+ * value, or @p image is NULL.
+ */
+hsa_status_t HSA_API hsa_ext_image_create(
+ hsa_agent_t agent,
+ const hsa_ext_image_descriptor_t *image_descriptor,
+ const void *image_data,
+ hsa_access_permission_t access_permission,
+ hsa_ext_image_t *image);
+
+/**
+ * @brief Creates an agent specific image handle to an image with an explicit
+ * image data layout.
+ *
+ * @details Images with an explicit image data layout created with
+ * different access permissions but matching image descriptors and
+ * matching image layout can share the same image data if
+ * ::HSA_EXT_IMAGE_CAPABILITY_ACCESS_INVARIANT_DATA_LAYOUT is reported
+ * by ::hsa_ext_image_get_capability_with_layout for the image format
+ * specified in the image descriptor and specified image data
+ * layout. Image descriptors match if they have the same values, with
+ * the exception that s-form channel orders match the corresponding
+ * non-s-form channel order and vice versa. Image layouts match if
+ * they are the same image data layout and use the same image row and
+ * slice values.
+ *
+ * If necessary, an application can use image operations (import, export, copy,
+ * clear) to prepare the image for the intended use regardless of the access
+ * permissions.
+ *
+ * @param[in] agent Agent to be associated with the image handle created.
+ *
+ * @param[in] image_descriptor Pointer to an image descriptor. Must not be NULL.
+ *
+ * @param[in] image_data Image data buffer that must have been allocated
+ * according to the size and alignment requirements dictated by
+ * ::hsa_ext_image_data_get_info_with_layout. Must not be NULL.
+ *
+ * Any previous memory contents are preserved upon creation. The application is
+ * responsible for ensuring that the lifetime of the image data exceeds that of
+ * all the associated images.
+ *
+ * @param[in] access_permission Access permission of the image when
+ * accessed by the agent. The access permission defines how the agent
+ * is allowed to access the image and must match the corresponding
+ * HSAIL image handle type. The agent must support the image format
+ * specified in @p image_descriptor for the given @p access_permission
+ * and @p image_data_layout.
+ *
+ * @param[in] image_data_layout The image data layout to use for the
+ * @p image_data. It is invalid to use
+ * ::HSA_EXT_IMAGE_DATA_LAYOUT_OPAQUE; use ::hsa_ext_image_create
+ * instead.
+ *
+ * @param[in] image_data_row_pitch The size in bytes for a single row
+ * of the image in the image data. If 0 is specified then the default
+ * row pitch value is used: image width * image element byte size.
+ * The value used must be greater than or equal to the default row
+ * pitch, and be a multiple of the image element byte size. For the
+ * linear image layout it must also be a multiple of the image linear
+ * row pitch alignment for the agents that will access the image data
+ * using image instructions.
+ *
+ * @param[in] image_data_slice_pitch The size in bytes of a single
+ * slice of a 3D image, or the size in bytes of each image layer in an
+ * image array in the image data. If 0 is specified then the default
+ * slice pitch value is used: row pitch * height if geometry is
+ * ::HSA_EXT_IMAGE_GEOMETRY_3D, ::HSA_EXT_IMAGE_GEOMETRY_2DA, or
+ * ::HSA_EXT_IMAGE_GEOMETRY_2DADEPTH; row pitch if geometry is
+ * ::HSA_EXT_IMAGE_GEOMETRY_1DA; and 0 otherwise. The value used must
+ * be 0 if the default slice pitch is 0, be greater than or equal to
+ * the default slice pitch, and be a multiple of the row pitch.
+ *
+ * @param[out] image Pointer to a memory location where the HSA runtime stores
+ * the newly created image handle. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_FORMAT_UNSUPPORTED The agent does
+ * not have the capability to support the image format contained in the image
+ * descriptor using the specified @p access_permission and @p image_data_layout.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_SIZE_UNSUPPORTED The agent
+ * does not support the image dimensions specified by @p
+ * image_descriptor using the specified @p access_permission and @p
+ * image_data_layout.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_PITCH_UNSUPPORTED The agent does
+ * not support the row and slice pitch specified by @p image_data_row_pitch
+ * and @p image_data_slice_pitch, or the values are invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate
+ * the required resources (for example, the agent does not
+ * support the creation of more image handles with the given @p
+ * access_permission).
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p image_descriptor is NULL, @p
+ * image_data is NULL, @p image_data does not have a valid alignment,
+ * @p image_data_layout is ::HSA_EXT_IMAGE_DATA_LAYOUT_OPAQUE,
+ * or @p image is NULL.
+ */
+hsa_status_t HSA_API hsa_ext_image_create_with_layout(
+ hsa_agent_t agent,
+ const hsa_ext_image_descriptor_t *image_descriptor,
+ const void *image_data,
+ hsa_access_permission_t access_permission,
+ hsa_ext_image_data_layout_t image_data_layout,
+ size_t image_data_row_pitch,
+ size_t image_data_slice_pitch,
+ hsa_ext_image_t *image);
+
+/**
+ * @brief Destroy an image handle previously created using ::hsa_ext_image_create or
+ * ::hsa_ext_image_create_with_layout.
+ *
+ * @details Destroying the image handle neither frees the associated image data
+ * nor modifies its contents. The application should not destroy an image handle
+ * while there are references to it queued for execution or currently being used
+ * in a kernel dispatch.
+ *
+ * @param[in] agent Agent associated with the image handle.
+ *
+ * @param[in] image Image handle to destroy.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ */
+hsa_status_t HSA_API hsa_ext_image_destroy(
+ hsa_agent_t agent,
+ hsa_ext_image_t image);
+
+/**
+ * @brief Copies a portion of one image (the source) to another image (the
+ * destination).
+ *
+ * @details The source and destination image formats should be the
+ * same, with the exception that s-form channel orders match the
+ * corresponding non-s-form channel order and vice versa. For example,
+ * it is allowed to copy a source image with a channel order of
+ * HSA_EXT_IMAGE_CHANNEL_ORDER_SRGB to a destination image with a
+ * channel order of HSA_EXT_IMAGE_CHANNEL_ORDER_RGB.
+ *
+ * The source and destination images do not have to be of the same geometry and
+ * appropriate scaling is performed by the HSA runtime. It is possible to copy
+ * subregions between any combinations of source and destination geometries, provided
+ * that the dimensions of the subregions are the same. For example, it is
+ * allowed to copy a rectangular region from a 2D image to a slice of a 3D
+ * image.
+ *
+ * If the source and destination image data overlap, or the combination of
+ * offset and range references an out-of-bounds element in any of the images,
+ * the behavior is undefined.
+ *
+ * @param[in] agent Agent associated with both the source and destination image handles.
+ *
+ * @param[in] src_image Image handle of source image. The agent associated with the source
+ * image handle must be identical to that of the destination image.
+ *
+ * @param[in] src_offset Pointer to the offset within the source image where to
+ * copy the data from. Must not be NULL.
+ *
+ * @param[in] dst_image Image handle of destination image.
+ *
+ * @param[in] dst_offset Pointer to the offset within the destination
+ * image where to copy the data. Must not be NULL.
+ *
+ * @param[in] range Dimensions of the image portion to be copied. The HSA
+ * runtime computes the size of the image data to be copied using this
+ * argument. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p src_offset is
+ * NULL, @p dst_offset is NULL, or @p range is NULL.
+ */
+hsa_status_t HSA_API hsa_ext_image_copy(
+ hsa_agent_t agent,
+ hsa_ext_image_t src_image,
+ const hsa_dim3_t* src_offset,
+ hsa_ext_image_t dst_image,
+ const hsa_dim3_t* dst_offset,
+ const hsa_dim3_t* range);
+
+/**
+ * @brief Image region: an offset into an image plus the extent starting there.
+ */
+typedef struct hsa_ext_image_region_s {
+ /**
+ * Offset within an image (in coordinates).
+ */
+ hsa_dim3_t offset;
+
+ /**
+ * Size of the image range in each dimension (in coordinates). The x, y, and z
+ * dimensions correspond to width, height, and depth or index respectively.
+ */
+ hsa_dim3_t range;
+} hsa_ext_image_region_t;
+
+/**
+ * @brief Import linearly organized image data from memory directly to an
+ * image handle.
+ *
+ * @details This operation updates the image data referenced by the image handle
+ * from the source memory. The size of the data imported from memory is
+ * implicitly derived from the image region.
+ *
+ * It is the application's responsibility to avoid out of bounds memory access.
+ *
+ * The source memory and the destination image data memory must not
+ * overlap. Overlapping of any of the source and destination image
+ * data memory within the import operation produces undefined results.
+ *
+ * @param[in] agent Agent associated with the image handle.
+ *
+ * @param[in] src_memory Source memory. Must not be NULL.
+ *
+ * @param[in] src_row_pitch The size in bytes of a single row of the image in the
+ * source memory. If the value is smaller than the destination image region
+ * width * image element byte size, then region width * image element byte
+ * size is used.
+ *
+ * @param[in] src_slice_pitch The size in bytes of a single 2D slice of a 3D image,
+ * or the size in bytes of each image layer in an image array in the source memory.
+ * If the geometry is ::HSA_EXT_IMAGE_GEOMETRY_1DA and the value is smaller than the
+ * value used for @p src_row_pitch, then the value used for @p src_row_pitch is used.
+ * If the geometry is ::HSA_EXT_IMAGE_GEOMETRY_3D, ::HSA_EXT_IMAGE_GEOMETRY_2DA, or
+ * ::HSA_EXT_IMAGE_GEOMETRY_2DADEPTH and the value is smaller than the value used for
+ * @p src_row_pitch * destination image region height, then the value used for
+ * @p src_row_pitch * destination image region height is used.
+ * Otherwise, the value is not used.
+ *
+ * @param[in] dst_image Image handle of destination image.
+ *
+ * @param[in] image_region Pointer to the image region to be updated. Must not
+ * be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p src_memory is NULL, or @p
+ * image_region is NULL.
+ *
+ */
+hsa_status_t HSA_API hsa_ext_image_import(
+ hsa_agent_t agent,
+ const void *src_memory,
+ size_t src_row_pitch,
+ size_t src_slice_pitch,
+ hsa_ext_image_t dst_image,
+ const hsa_ext_image_region_t *image_region);
+
+/**
+ * @brief Export the image data to linearly organized memory.
+ *
+ * @details The operation updates the destination memory with the image data of
+ * @p src_image. The size of the data exported to memory is implicitly derived
+ * from the image region.
+ *
+ * It is the application's responsibility to avoid out of bounds memory access.
+ *
+ * The destination memory and the source image data memory must not
+ * overlap. Overlapping of any of the source and destination image
+ * data memory within the export operation produces undefined results.
+ *
+ * @param[in] agent Agent associated with the image handle.
+ *
+ * @param[in] src_image Image handle of source image.
+ *
+ * @param[in] dst_memory Destination memory. Must not be NULL.
+ *
+ * @param[in] dst_row_pitch The size in bytes of a single row of the image in the
+ * destination memory. If the value is smaller than the source image region
+ * width * image element byte size, then region width * image element byte
+ * size is used.
+ *
+ * @param[in] dst_slice_pitch The size in bytes of a single 2D slice of a 3D image,
+ * or the size in bytes of each image layer in an image array in the destination memory.
+ * If the geometry is ::HSA_EXT_IMAGE_GEOMETRY_1DA and the value is smaller than the
+ * value used for @p dst_row_pitch, then the value used for @p dst_row_pitch is used.
+ * If the geometry is ::HSA_EXT_IMAGE_GEOMETRY_3D, ::HSA_EXT_IMAGE_GEOMETRY_2DA, or
+ * ::HSA_EXT_IMAGE_GEOMETRY_2DADEPTH and the value is smaller than the value used for
+ * @p dst_row_pitch * source image region height, then the value used for
+ * @p dst_row_pitch * source image region height is used.
+ * Otherwise, the value is not used.
+ *
+ * @param[in] image_region Pointer to the image region to be exported. Must not
+ * be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p dst_memory is NULL, or @p
+ * image_region is NULL.
+ */
+hsa_status_t HSA_API hsa_ext_image_export(
+ hsa_agent_t agent,
+ hsa_ext_image_t src_image,
+ void *dst_memory,
+ size_t dst_row_pitch,
+ size_t dst_slice_pitch,
+ const hsa_ext_image_region_t *image_region);
+
+/**
+ * @brief Clear a region of an image so that every image element has
+ * the specified value.
+ *
+ * @param[in] agent Agent associated with the image handle.
+ *
+ * @param[in] image Image handle for image to be cleared.
+ *
+ * @param[in] data The value to which to set each image element being
+ * cleared. It is specified as an array of image component values. The
+ * number of array elements must match the number of access components
+ * for the image channel order. The type of each array element must
+ * match the image access type of the image channel type. When the
+ * value is used to set the value of an image element, the conversion
+ * method corresponding to the image channel type is used. See the
+ * <em>Channel Order</em> section and <em>Channel Type</em> section in
+ * the <em>HSA Programming Reference Manual</em> for more
+ * information. Must not be NULL.
+ *
+ * @param[in] image_region Pointer to the image region to clear. Must not be
+ * NULL. If the region references an out-of-bounds element, the behavior is
+ * undefined.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p data is NULL, or @p
+ * image_region is NULL.
+ */
+hsa_status_t HSA_API hsa_ext_image_clear(
+ hsa_agent_t agent,
+ hsa_ext_image_t image,
+ const void* data,
+ const hsa_ext_image_region_t *image_region);
+
+/**
+ * @brief Sampler handle. Sampler handles are populated by
+ * ::hsa_ext_sampler_create and destroyed by ::hsa_ext_sampler_destroy.
+ * Sampler handles are only unique within an agent, not across agents.
+ */
+typedef struct hsa_ext_sampler_s {
+ /**
+ * Opaque handle. For a given agent, two handles reference the same object of
+ * the enclosing type if and only if they are equal.
+ */
+ uint64_t handle;
+} hsa_ext_sampler_t;
+
+/**
+ * @brief Sampler address modes. The sampler address mode describes
+ * the processing of out-of-range image coordinates. See the
+ * <em>Addressing Mode</em> section in the <em>HSA Programming Reference
+ * Manual</em> for definitions of each address mode. The values
+ * match the BRIG type @p hsa_ext_brig_sampler_addressing_t.
+ */
+typedef enum {
+ /**
+ * Out-of-range coordinates are not handled.
+ */
+ HSA_EXT_SAMPLER_ADDRESSING_MODE_UNDEFINED = 0,
+
+ /**
+ * Clamp out-of-range coordinates to the image edge.
+ */
+ HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE = 1,
+
+ /**
+ * Clamp out-of-range coordinates to the image border color.
+ */
+ HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_BORDER = 2,
+
+ /**
+ * Wrap out-of-range coordinates back into the valid coordinate
+ * range so the image appears as repeated tiles.
+ */
+ HSA_EXT_SAMPLER_ADDRESSING_MODE_REPEAT = 3,
+
+ /**
+ * Mirror out-of-range coordinates back into the valid coordinate
+ * range so the image appears as repeated tiles with every other
+ * tile a reflection.
+ */
+ HSA_EXT_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT = 4
+
+} hsa_ext_sampler_addressing_mode_t;
+
+/**
+ * @brief A fixed-size type used to represent ::hsa_ext_sampler_addressing_mode_t constants (the storage size of a C enum type is implementation-defined).
+ */
+typedef uint32_t hsa_ext_sampler_addressing_mode32_t;
+
+/**
+ * @brief Sampler coordinate normalization modes. See the
+ * <em>Coordinate Normalization Mode</em> section in the <em>HSA
+ * Programming Reference Manual</em> for definitions of each
+ * coordinate normalization mode. The values match the BRIG type @p
+ * hsa_ext_brig_sampler_coord_normalization_t.
+ */
+typedef enum {
+
+ /**
+ * Coordinates are used to directly address an image element.
+ */
+ HSA_EXT_SAMPLER_COORDINATE_MODE_UNNORMALIZED = 0,
+
+ /**
+ * Coordinates are scaled by the image dimension size before being
+ * used to address an image element.
+ */
+ HSA_EXT_SAMPLER_COORDINATE_MODE_NORMALIZED = 1
+
+} hsa_ext_sampler_coordinate_mode_t;
+
+/**
+ * @brief A fixed-size type used to represent ::hsa_ext_sampler_coordinate_mode_t constants (the storage size of a C enum type is implementation-defined).
+ */
+typedef uint32_t hsa_ext_sampler_coordinate_mode32_t;
+
+
+/**
+ * @brief Sampler filter modes. See the <em>Filter Mode</em> section
+ * in the <em>HSA Programming Reference Manual</em> for definitions
+ * of each filter mode. The enumeration values match the BRIG type @p
+ * hsa_ext_brig_sampler_filter_t.
+ */
+typedef enum {
+ /**
+ * Filter to the image element nearest (in Manhattan distance) to the
+ * specified coordinate.
+ */
+ HSA_EXT_SAMPLER_FILTER_MODE_NEAREST = 0,
+
+ /**
+ * Filter to the image element calculated by combining the elements in a 2x2
+ * square block or 2x2x2 cube block around the specified coordinate. The
+ * elements are combined using linear interpolation.
+ */
+ HSA_EXT_SAMPLER_FILTER_MODE_LINEAR = 1
+
+} hsa_ext_sampler_filter_mode_t;
+
+/**
+ * @brief A fixed-size type used to represent ::hsa_ext_sampler_filter_mode_t constants (the storage size of a C enum type is implementation-defined).
+ */
+typedef uint32_t hsa_ext_sampler_filter_mode32_t;
+
+/**
+ * @brief Implementation independent sampler descriptor.
+ */
+typedef struct hsa_ext_sampler_descriptor_s {
+ /**
+ * Sampler coordinate mode describes the normalization of image coordinates.
+ */
+ hsa_ext_sampler_coordinate_mode32_t coordinate_mode;
+
+ /**
+ * Sampler filter mode describes the type of sampling performed.
+ */
+ hsa_ext_sampler_filter_mode32_t filter_mode;
+
+ /**
+ * Sampler address mode describes the processing of out-of-range image
+ * coordinates.
+ */
+ hsa_ext_sampler_addressing_mode32_t address_mode;
+
+} hsa_ext_sampler_descriptor_t;
+
+/**
+ * @brief Create an agent-specific sampler handle for a given
+ * agent-independent sampler descriptor and agent.
+ *
+ * @param[in] agent Agent to be associated with the sampler handle created.
+ *
+ * @param[in] sampler_descriptor Pointer to a sampler descriptor. Must not be
+ * NULL.
+ *
+ * @param[out] sampler Memory location where the HSA runtime stores the newly
+ * created sampler handle. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ *
+ * @retval ::HSA_EXT_STATUS_ERROR_SAMPLER_DESCRIPTOR_UNSUPPORTED The
+ * @p agent does not have the capability to support the properties
+ * specified by @p sampler_descriptor or it is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate
+ * the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p sampler_descriptor is NULL, or
+ * @p sampler is NULL.
+ */
+hsa_status_t HSA_API hsa_ext_sampler_create(
+ hsa_agent_t agent,
+ const hsa_ext_sampler_descriptor_t *sampler_descriptor,
+ hsa_ext_sampler_t *sampler);
+
+/**
+ * @brief Destroy a sampler handle previously created using ::hsa_ext_sampler_create.
+ *
+ * @details The sampler handle should not be destroyed while there are
+ * references to it queued for execution or currently being used in a
+ * kernel dispatch.
+ *
+ * @param[in] agent Agent associated with the sampler handle.
+ *
+ * @param[in] sampler Sampler handle to destroy, as stored by ::hsa_ext_sampler_create.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid.
+ */
+hsa_status_t HSA_API hsa_ext_sampler_destroy(
+ hsa_agent_t agent,
+ hsa_ext_sampler_t sampler);
+
+
+#define hsa_ext_images_1_00
+
+/**
+ * @brief The function pointer table for the images v1.00 extension. Can be returned by ::hsa_system_get_extension_table or ::hsa_system_get_major_extension_table. Each member is named after an images extension API function; the with-layout variants are not part of this table (see ::hsa_ext_images_1_pfn_t).
+ */
+typedef struct hsa_ext_images_1_00_pfn_s {
+
+ hsa_status_t (*hsa_ext_image_get_capability)(
+ hsa_agent_t agent,
+ hsa_ext_image_geometry_t geometry,
+ const hsa_ext_image_format_t *image_format,
+ uint32_t *capability_mask);
+
+ hsa_status_t (*hsa_ext_image_data_get_info)(
+ hsa_agent_t agent,
+ const hsa_ext_image_descriptor_t *image_descriptor,
+ hsa_access_permission_t access_permission,
+ hsa_ext_image_data_info_t *image_data_info);
+
+ hsa_status_t (*hsa_ext_image_create)(
+ hsa_agent_t agent,
+ const hsa_ext_image_descriptor_t *image_descriptor,
+ const void *image_data,
+ hsa_access_permission_t access_permission,
+ hsa_ext_image_t *image);
+
+ hsa_status_t (*hsa_ext_image_destroy)(
+ hsa_agent_t agent,
+ hsa_ext_image_t image);
+
+ hsa_status_t (*hsa_ext_image_copy)(
+ hsa_agent_t agent,
+ hsa_ext_image_t src_image,
+ const hsa_dim3_t* src_offset,
+ hsa_ext_image_t dst_image,
+ const hsa_dim3_t* dst_offset,
+ const hsa_dim3_t* range);
+
+ hsa_status_t (*hsa_ext_image_import)(
+ hsa_agent_t agent,
+ const void *src_memory,
+ size_t src_row_pitch,
+ size_t src_slice_pitch,
+ hsa_ext_image_t dst_image,
+ const hsa_ext_image_region_t *image_region);
+
+ hsa_status_t (*hsa_ext_image_export)(
+ hsa_agent_t agent,
+ hsa_ext_image_t src_image,
+ void *dst_memory,
+ size_t dst_row_pitch,
+ size_t dst_slice_pitch,
+ const hsa_ext_image_region_t *image_region);
+
+ hsa_status_t (*hsa_ext_image_clear)(
+ hsa_agent_t agent,
+ hsa_ext_image_t image,
+ const void* data,
+ const hsa_ext_image_region_t *image_region);
+
+ hsa_status_t (*hsa_ext_sampler_create)(
+ hsa_agent_t agent,
+ const hsa_ext_sampler_descriptor_t *sampler_descriptor,
+ hsa_ext_sampler_t *sampler);
+
+ hsa_status_t (*hsa_ext_sampler_destroy)(
+ hsa_agent_t agent,
+ hsa_ext_sampler_t sampler);
+
+} hsa_ext_images_1_00_pfn_t;
+
+#define hsa_ext_images_1
+
+/**
+ * @brief The function pointer table for the images v1 extension. Can be returned by ::hsa_system_get_extension_table or ::hsa_system_get_major_extension_table. The first ten members are declared identically, and in the same order, as those of ::hsa_ext_images_1_00_pfn_t; the three with-layout entries are appended after them.
+ */
+typedef struct hsa_ext_images_1_pfn_s {
+
+ hsa_status_t (*hsa_ext_image_get_capability)(
+ hsa_agent_t agent,
+ hsa_ext_image_geometry_t geometry,
+ const hsa_ext_image_format_t *image_format,
+ uint32_t *capability_mask);
+
+ hsa_status_t (*hsa_ext_image_data_get_info)(
+ hsa_agent_t agent,
+ const hsa_ext_image_descriptor_t *image_descriptor,
+ hsa_access_permission_t access_permission,
+ hsa_ext_image_data_info_t *image_data_info);
+
+ hsa_status_t (*hsa_ext_image_create)(
+ hsa_agent_t agent,
+ const hsa_ext_image_descriptor_t *image_descriptor,
+ const void *image_data,
+ hsa_access_permission_t access_permission,
+ hsa_ext_image_t *image);
+
+ hsa_status_t (*hsa_ext_image_destroy)(
+ hsa_agent_t agent,
+ hsa_ext_image_t image);
+
+ hsa_status_t (*hsa_ext_image_copy)(
+ hsa_agent_t agent,
+ hsa_ext_image_t src_image,
+ const hsa_dim3_t* src_offset,
+ hsa_ext_image_t dst_image,
+ const hsa_dim3_t* dst_offset,
+ const hsa_dim3_t* range);
+
+ hsa_status_t (*hsa_ext_image_import)(
+ hsa_agent_t agent,
+ const void *src_memory,
+ size_t src_row_pitch,
+ size_t src_slice_pitch,
+ hsa_ext_image_t dst_image,
+ const hsa_ext_image_region_t *image_region);
+
+ hsa_status_t (*hsa_ext_image_export)(
+ hsa_agent_t agent,
+ hsa_ext_image_t src_image,
+ void *dst_memory,
+ size_t dst_row_pitch,
+ size_t dst_slice_pitch,
+ const hsa_ext_image_region_t *image_region);
+
+ hsa_status_t (*hsa_ext_image_clear)(
+ hsa_agent_t agent,
+ hsa_ext_image_t image,
+ const void* data,
+ const hsa_ext_image_region_t *image_region);
+
+ hsa_status_t (*hsa_ext_sampler_create)(
+ hsa_agent_t agent,
+ const hsa_ext_sampler_descriptor_t *sampler_descriptor,
+ hsa_ext_sampler_t *sampler);
+
+ hsa_status_t (*hsa_ext_sampler_destroy)(
+ hsa_agent_t agent,
+ hsa_ext_sampler_t sampler);
+
+ hsa_status_t (*hsa_ext_image_get_capability_with_layout)(
+ hsa_agent_t agent,
+ hsa_ext_image_geometry_t geometry,
+ const hsa_ext_image_format_t *image_format,
+ hsa_ext_image_data_layout_t image_data_layout,
+ uint32_t *capability_mask);
+
+ hsa_status_t (*hsa_ext_image_data_get_info_with_layout)(
+ hsa_agent_t agent,
+ const hsa_ext_image_descriptor_t *image_descriptor,
+ hsa_access_permission_t access_permission,
+ hsa_ext_image_data_layout_t image_data_layout,
+ size_t image_data_row_pitch,
+ size_t image_data_slice_pitch,
+ hsa_ext_image_data_info_t *image_data_info);
+
+ hsa_status_t (*hsa_ext_image_create_with_layout)(
+ hsa_agent_t agent,
+ const hsa_ext_image_descriptor_t *image_descriptor,
+ const void *image_data,
+ hsa_access_permission_t access_permission,
+ hsa_ext_image_data_layout_t image_data_layout,
+ size_t image_data_row_pitch,
+ size_t image_data_slice_pitch,
+ hsa_ext_image_t *image);
+
+} hsa_ext_images_1_pfn_t;
+/** @} */
+
+#ifdef __cplusplus
+} // end extern "C" block
+#endif /*__cplusplus*/
+
+#endif
diff --git a/third_party/rocm/include/hsa/hsa_ven_amd_aqlprofile.h b/third_party/rocm/include/hsa/hsa_ven_amd_aqlprofile.h
new file mode 100644
index 0000000..fb763c0
--- /dev/null
+++ b/third_party/rocm/include/hsa/hsa_ven_amd_aqlprofile.h
@@ -0,0 +1,355 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+// AMD Research and AMD HSA Software Development
+//
+// Advanced Micro Devices, Inc.
+//
+// www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimers.
+// - Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimers in
+// the documentation and/or other materials provided with the distribution.
+// - Neither the names of Advanced Micro Devices, Inc,
+// nor the names of its contributors may be used to endorse or promote
+// products derived from this Software without specific prior written
+// permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef OPENSRC_HSA_RUNTIME_INC_HSA_VEN_AMD_AQLPROFILE_H_
+#define OPENSRC_HSA_RUNTIME_INC_HSA_VEN_AMD_AQLPROFILE_H_
+
+#include <stdint.h>
+#include "hsa.h"
+
+#define HSA_AQLPROFILE_VERSION_MAJOR 2
+#define HSA_AQLPROFILE_VERSION_MINOR 0
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+////////////////////////////////////////////////////////////////////////////////
+// Library version queries; '(void)' makes these real prototypes in C as well as C++
+uint32_t hsa_ven_amd_aqlprofile_version_major(void);
+uint32_t hsa_ven_amd_aqlprofile_version_minor(void);
+
+///////////////////////////////////////////////////////////////////////
+// Library API:
+// The library provides helper methods for instantiating the
+// profile context object and for populating the start and stop
+// AQL packets. The profile object contains a list of profiling
+// events and the descriptors of the buffers needed for profiling:
+// a command buffer and an output data buffer. The library methods
+// return a status code that indicates whether an error occurred.
+// The library also provides methods for querying the required
+// buffer attributes, validating the event attributes and getting
+// the profiling output data.
+//
+// Returned status:
+// hsa_status_t - HSA status codes from the hsa.h header are used
+//
+// Supported profiling features:
+//
+// Supported profiling event types
+typedef enum {
+ HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_PMC = 0, // performance counters (PMC)
+ HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_TRACE = 1, // trace
+} hsa_ven_amd_aqlprofile_event_type_t;
+
+// Supported performance counters (PMC) blocks
+// The block ID is shared by all instances of a block set, for example
+// each block instance from the TCC block set, TCC0, TCC1, ..., TCCN
+// will have the same block ID HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TCC.
+typedef enum {
+ HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_CPC = 0,
+ HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_CPF = 1,
+ HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GDS = 2,
+ HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GRBM = 3,
+ HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GRBMSE = 4,
+ HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SPI = 5,
+ HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SQ = 6,
+ HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SQCS = 7,
+ HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SRBM = 8,
+ HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SX = 9,
+ HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TA = 10,
+ HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TCA = 11,
+ HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TCC = 12,
+ HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TCP = 13,
+ HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_TD = 14,
+ // Memory related blocks
+ HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCARB = 15,
+ HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCHUB = 16,
+ HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCMCBVM = 17,
+ HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCSEQ = 18,
+ HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCVML2 = 19,
+ HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MCXBAR = 20,
+ HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_ATC = 21,
+ HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_ATCL2 = 22,
+ HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GCEA = 23,
+ HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_RPB = 24,
+ // System blocks
+ HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SDMA = 25,
+
+ HSA_VEN_AMD_AQLPROFILE_BLOCKS_NUMBER // number of block IDs above; must stay the last enumerator
+} hsa_ven_amd_aqlprofile_block_name_t;
+
+// PMC event object structure
+// The 'counter_id' value is the counter select value specified in the
+// "Performance Counters Selection" chapter of the GFXIP perfcounter
+// user guides.
+typedef struct {
+ hsa_ven_amd_aqlprofile_block_name_t block_name; // block ID of the counter's block set
+ uint32_t block_index; // NOTE(review): presumably the instance index within the block set - confirm
+ uint32_t counter_id; // counter select value, see the note above
+} hsa_ven_amd_aqlprofile_event_t;
+
+// Check if the event is valid for the specific GPU
+hsa_status_t hsa_ven_amd_aqlprofile_validate_event(
+ hsa_agent_t agent, // [in] HSA handle for the profiling GPU
+ const hsa_ven_amd_aqlprofile_event_t* event, // [in] Pointer to the event to validate
+ bool* result); // [out] True if the event is valid, False otherwise
+
+// Profiling parameters
+// All parameters are generic; if a parameter is not applicable to a
+// specific profile configuration, an error status is returned.
+typedef enum {
+ // Trace applicable parameters
+ HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_COMPUTE_UNIT_TARGET = 0,
+ HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_VM_ID_MASK = 1,
+ HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_MASK = 2,
+ HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK = 3,
+ HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK2 = 4,
+ HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SE_MASK = 5,
+ HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SAMPLE_RATE = 6,
+ HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_K_CONCURRENT = 7,
+} hsa_ven_amd_aqlprofile_parameter_name_t;
+
+// Profile parameter object: a (parameter name, value) pair
+typedef struct {
+ hsa_ven_amd_aqlprofile_parameter_name_t parameter_name; // parameter identifier
+ uint32_t value; // value for that parameter
+} hsa_ven_amd_aqlprofile_parameter_t;
+
+//
+// Profile context object:
+// The library provides a profile object structure which contains
+// the events array, a buffer for the profiling start/stop commands
+// and a buffer for the output data.
+// The buffers are specified by the buffer descriptors and allocated
+// by the application. The buffer allocation attributes, the command
+// buffer size, the PMC output buffer size as well as the profiling output
+// data can be obtained using the generic get profile info helper _get_info.
+//
+// Buffer descriptor
+typedef struct {
+ void* ptr; // buffer address
+ uint32_t size; // buffer size - NOTE(review): presumably in bytes, confirm
+} hsa_ven_amd_aqlprofile_descriptor_t;
+
+// Profile context object structure: contains the profiling events list
+// and the descriptors of the buffers needed for profiling, i.e. a command
+// buffer and an output data buffer
+typedef struct {
+ hsa_agent_t agent; // GFXIP handle
+ hsa_ven_amd_aqlprofile_event_type_t type; // Events type
+ const hsa_ven_amd_aqlprofile_event_t* events; // Events array
+ uint32_t event_count; // Events count
+ const hsa_ven_amd_aqlprofile_parameter_t* parameters; // Parameters array
+ uint32_t parameter_count; // Parameters count
+ hsa_ven_amd_aqlprofile_descriptor_t output_buffer; // Output buffer
+ hsa_ven_amd_aqlprofile_descriptor_t command_buffer; // Command buffer (PM4 commands)
+} hsa_ven_amd_aqlprofile_profile_t;
+
+//
+// AQL packets populating methods:
+// Helper methods that populate the START and STOP AQL packets provided by
+// the application; the application is required to submit these packets
+// before and after the profiled GPU task packets respectively.
+//
+// AQL Vendor Specific packet which carries a PM4 command
+typedef struct {
+ uint16_t header; // packet header; the Vendor Specific type is set by the application
+ uint16_t pm4_command[27]; // PM4 command payload, filled in by the populating methods
+ hsa_signal_t completion_signal; // completion signal, set by the application
+} hsa_ext_amd_aql_pm4_packet_t;
+
+// Method to populate the provided AQL packet with profiling start commands
+// Only 'pm4_command' fields of the packet are set and the application
+// is responsible for setting the Vendor Specific header type and a completion signal
+hsa_status_t hsa_ven_amd_aqlprofile_start(
+ hsa_ven_amd_aqlprofile_profile_t* profile, // [in/out] profile context object
+ hsa_ext_amd_aql_pm4_packet_t* aql_start_packet); // [out] profile start AQL packet
+
+// Method to populate the provided AQL packet with profiling stop commands
+// Only 'pm4_command' fields of the packet are set and the application
+// is responsible for setting the Vendor Specific header type and a completion signal
+hsa_status_t hsa_ven_amd_aqlprofile_stop(
+ const hsa_ven_amd_aqlprofile_profile_t* profile, // [in] profile context object
+ hsa_ext_amd_aql_pm4_packet_t* aql_stop_packet); // [out] profile stop AQL packet
+
+// Method to populate the provided AQL packet with profiling read commands
+// Only 'pm4_command' fields of the packet are set and the application
+// is responsible for setting the Vendor Specific header type and a completion signal
+hsa_status_t hsa_ven_amd_aqlprofile_read(
+ const hsa_ven_amd_aqlprofile_profile_t* profile, // [in] profile context object
+ hsa_ext_amd_aql_pm4_packet_t* aql_read_packet); // [out] profile read AQL packet
+
+// Legacy devices, PM4 profiling packet size ('static' so every C file including this header gets its own copy)
+static const unsigned HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE = 192;
+// Legacy devices, converting the profiling AQL packet to a PM4 packet blob
+hsa_status_t hsa_ven_amd_aqlprofile_legacy_get_pm4(
+ const hsa_ext_amd_aql_pm4_packet_t* aql_packet, // [in] AQL packet
+ void* data); // [out] PM4 packet blob - NOTE(review): presumably HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE bytes, confirm
+
+//
+// Get profile info:
+// Generic method for getting various profile info including profile buffer
+// attributes like the command buffer size and the profiling PMC results.
+// It is implied that all counters are 64bit values.
+//
+// Profile generic output data:
+typedef struct {
+ uint32_t sample_id; // PMC sample or trace buffer index
+ union { // anonymous union: holds either pmc_data or trace_data
+ struct {
+ hsa_ven_amd_aqlprofile_event_t event; // PMC event
+ uint64_t result; // PMC result
+ } pmc_data;
+ hsa_ven_amd_aqlprofile_descriptor_t trace_data; // Trace output data descriptor
+ };
+} hsa_ven_amd_aqlprofile_info_data_t;
+
+// ID query type, used with HSA_VEN_AMD_AQLPROFILE_INFO_BLOCK_ID to look up a block by name
+typedef struct {
+ const char* name; // block name string to query by
+ uint32_t id; // block id
+ uint32_t instance_count; // number of block instances
+} hsa_ven_amd_aqlprofile_id_query_t;
+
+// Profile attributes that can be requested through hsa_ven_amd_aqlprofile_get_info
+typedef enum {
+ HSA_VEN_AMD_AQLPROFILE_INFO_COMMAND_BUFFER_SIZE = 0, // get_info returns uint32_t value
+ HSA_VEN_AMD_AQLPROFILE_INFO_PMC_DATA_SIZE = 1, // get_info returns uint32_t value
+ HSA_VEN_AMD_AQLPROFILE_INFO_PMC_DATA = 2, // get_info returns PMC uint64_t value
+ // in info_data object
+ HSA_VEN_AMD_AQLPROFILE_INFO_TRACE_DATA = 3, // get_info returns trace buffer ptr/size
+ // in info_data object
+ //
+ HSA_VEN_AMD_AQLPROFILE_INFO_BLOCK_COUNTERS = 4, // get_info returns number of block counters
+ HSA_VEN_AMD_AQLPROFILE_INFO_BLOCK_ID = 5, // get_info returns block id, instances
+ // by name string using _id_query_t
+ //
+ HSA_VEN_AMD_AQLPROFILE_INFO_ENABLE_CMD = 6, // get_info returns size/pointer for
+ // counters enable command buffer
+ HSA_VEN_AMD_AQLPROFILE_INFO_DISABLE_CMD = 7, // get_info returns size/pointer for
+ // counters disable command buffer
+} hsa_ven_amd_aqlprofile_info_type_t;
+
+// Definition of the output data iterator callback passed to hsa_ven_amd_aqlprofile_iterate_data
+typedef hsa_status_t (*hsa_ven_amd_aqlprofile_data_callback_t)(
+ hsa_ven_amd_aqlprofile_info_type_t info_type, // [in] data type, PMC or trace data
+ hsa_ven_amd_aqlprofile_info_data_t* info_data, // [in] info_data object
+ void* callback_data); // [in/out] data passed to the callback
+
+// Method for getting the profile info; what 'value' receives depends on 'attribute' (see hsa_ven_amd_aqlprofile_info_type_t)
+hsa_status_t hsa_ven_amd_aqlprofile_get_info(
+ const hsa_ven_amd_aqlprofile_profile_t* profile, // [in] profile context object
+ hsa_ven_amd_aqlprofile_info_type_t attribute, // [in] requested profile attribute
+ void* value); // [in/out] returned value
+
+// Method for iterating over the events output data, passing it to 'callback' together with 'data'
+hsa_status_t hsa_ven_amd_aqlprofile_iterate_data(
+ const hsa_ven_amd_aqlprofile_profile_t* profile, // [in] profile context object
+ hsa_ven_amd_aqlprofile_data_callback_t callback, // [in] callback to iterate the output data
+ void* data); // [in/out] data passed to the callback
+
+// Return the error string
+hsa_status_t hsa_ven_amd_aqlprofile_error_string(
+ const char** str); // [out] pointer to the error string
+
+/**
+ * @brief Extension version, and the extension library file name built as "libhsa-amd-aqlprofile" + suffix + ".so".
+ */
+#define hsa_ven_amd_aqlprofile_VERSION_MAJOR 1
+#define hsa_ven_amd_aqlprofile_LIB(suff) "libhsa-amd-aqlprofile" suff ".so"
+
+#ifdef HSA_LARGE_MODEL
+static const char kAqlProfileLib[] = hsa_ven_amd_aqlprofile_LIB("64"); // "libhsa-amd-aqlprofile64.so"
+#else
+static const char kAqlProfileLib[] = hsa_ven_amd_aqlprofile_LIB(""); // "libhsa-amd-aqlprofile.so"
+#endif
+
+/**
+ * @brief Extension function table: one pointer per API function declared above, under the same name.
+ */
+typedef struct hsa_ven_amd_aqlprofile_1_00_pfn_s {
+ uint32_t (*hsa_ven_amd_aqlprofile_version_major)(void); // '(void)': a real prototype in C, not an unspecified-argument declarator
+ uint32_t (*hsa_ven_amd_aqlprofile_version_minor)(void);
+
+ hsa_status_t (*hsa_ven_amd_aqlprofile_error_string)(
+ const char** str);
+
+ hsa_status_t (*hsa_ven_amd_aqlprofile_validate_event)(
+ hsa_agent_t agent,
+ const hsa_ven_amd_aqlprofile_event_t* event,
+ bool* result);
+
+ hsa_status_t (*hsa_ven_amd_aqlprofile_start)(
+ hsa_ven_amd_aqlprofile_profile_t* profile,
+ hsa_ext_amd_aql_pm4_packet_t* aql_start_packet);
+
+ hsa_status_t (*hsa_ven_amd_aqlprofile_stop)(
+ const hsa_ven_amd_aqlprofile_profile_t* profile,
+ hsa_ext_amd_aql_pm4_packet_t* aql_stop_packet);
+
+ hsa_status_t (*hsa_ven_amd_aqlprofile_read)(
+ const hsa_ven_amd_aqlprofile_profile_t* profile,
+ hsa_ext_amd_aql_pm4_packet_t* aql_read_packet);
+
+ hsa_status_t (*hsa_ven_amd_aqlprofile_legacy_get_pm4)(
+ const hsa_ext_amd_aql_pm4_packet_t* aql_packet,
+ void* data);
+
+ hsa_status_t (*hsa_ven_amd_aqlprofile_get_info)(
+ const hsa_ven_amd_aqlprofile_profile_t* profile,
+ hsa_ven_amd_aqlprofile_info_type_t attribute,
+ void* value);
+
+ hsa_status_t (*hsa_ven_amd_aqlprofile_iterate_data)(
+ const hsa_ven_amd_aqlprofile_profile_t* profile,
+ hsa_ven_amd_aqlprofile_data_callback_t callback,
+ void* data);
+} hsa_ven_amd_aqlprofile_1_00_pfn_t;
+
+typedef hsa_ven_amd_aqlprofile_1_00_pfn_t hsa_ven_amd_aqlprofile_pfn_t; // unversioned alias of the 1.00 table
+
+#ifdef __cplusplus
+}
+#endif // __cplusplus
+
+#endif // OPENSRC_HSA_RUNTIME_INC_HSA_VEN_AMD_AQLPROFILE_H_
diff --git a/third_party/rocm/include/hsa/hsa_ven_amd_loader.h b/third_party/rocm/include/hsa/hsa_ven_amd_loader.h
new file mode 100644
index 0000000..3ce8475
--- /dev/null
+++ b/third_party/rocm/include/hsa/hsa_ven_amd_loader.h
@@ -0,0 +1,589 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+// AMD Research and AMD HSA Software Development
+//
+// Advanced Micro Devices, Inc.
+//
+// www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimers.
+// - Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimers in
+// the documentation and/or other materials provided with the distribution.
+// - Neither the names of Advanced Micro Devices, Inc,
+// nor the names of its contributors may be used to endorse or promote
+// products derived from this Software without specific prior written
+// permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+// HSA AMD extension for additional loader functionality.
+
+#ifndef HSA_VEN_AMD_LOADER_H
+#define HSA_VEN_AMD_LOADER_H
+
+#include "hsa.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+/**
+ * @brief Queries equivalent host address for given @p device_address, and
+ * records it in @p host_address.
+ *
+ *
+ * @details Contents of memory pointed to by @p host_address would be identical
+ * to contents of memory pointed to by @p device_address. Only difference
+ * between the two is host accessibility: @p host_address is always accessible
+ * from host, @p device_address might not be accessible from host.
+ *
+ * If @p device_address already points to host accessible memory, then the value
+ * of @p device_address is simply copied into @p host_address.
+ *
+ * The lifetime of @p host_address is the same as the lifetime of @p
+ * device_address, and both lifetimes are limited by the lifetime of the
+ * executable that is managing these addresses.
+ *
+ *
+ * @param[in] device_address Device address to query equivalent host address
+ * for.
+ *
+ * @param[out] host_address Pointer to application-allocated buffer to record
+ * queried equivalent host address in.
+ *
+ *
+ * @retval HSA_STATUS_SUCCESS Function is executed successfully.
+ *
+ * @retval HSA_STATUS_ERROR_NOT_INITIALIZED Runtime is not initialized.
+ *
+ * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p device_address is invalid or
+ * null, or @p host_address is null.
+ */
+hsa_status_t hsa_ven_amd_loader_query_host_address(
+ const void *device_address,
+ const void **host_address);
+
+/**
+ * @brief The storage type of the code object that is backing loaded memory
+ * segment.
+ */
+typedef enum {
+ /**
+ * Loaded memory segment is not backed by any code object (anonymous), as the
+ * case would be with BSS (uninitialized data).
+ */
+ HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_NONE = 0,
+ /**
+ * Loaded memory segment is backed by the code object that is stored in the
+ * file.
+ */
+ HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_FILE = 1,
+ /**
+ * Loaded memory segment is backed by the code object that is stored in the
+ * memory.
+ */
+ HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_MEMORY = 2
+} hsa_ven_amd_loader_code_object_storage_type_t;
+
+/**
+ * @brief Loaded memory segment descriptor.
+ *
+ *
+ * @details Loaded memory segment descriptor describes underlying loaded memory
+ * segment. Loaded memory segment is created/allocated by the executable during
+ * the loading of the code object that is backing underlying memory segment.
+ *
+ * The lifetime of underlying memory segment is limited by the lifetime of the
+ * executable that is managing underlying memory segment.
+ */
+typedef struct hsa_ven_amd_loader_segment_descriptor_s {
+ /**
+ * Agent underlying memory segment is allocated on. If the code object that is
+ * backing underlying memory segment is program code object, then 0.
+ */
+ hsa_agent_t agent;
+ /**
+ * Executable that is managing this underlying memory segment.
+ */
+ hsa_executable_t executable;
+ /**
+ * Storage type of the code object that is backing underlying memory segment.
+ */
+ hsa_ven_amd_loader_code_object_storage_type_t code_object_storage_type;
+ /**
+ * If the storage type of the code object that is backing underlying memory
+ * segment is:
+ * - HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_NONE, then null;
+ * - HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_FILE, then null-terminated
+ * filepath to the code object;
+ * - HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_MEMORY, then host
+ * accessible pointer to the first byte of the code object.
+ */
+ const void *code_object_storage_base;
+ /**
+ * If the storage type of the code object that is backing underlying memory
+ * segment is:
+ * - HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_NONE, then 0;
+ * - HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_FILE, then the length of
+ * the filepath to the code object (including null-terminating character);
+ * - HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_MEMORY, then the size, in
+ * bytes, of the memory occupied by the code object.
+ */
+ size_t code_object_storage_size;
+ /**
+ * If the storage type of the code object that is backing underlying memory
+ * segment is:
+ * - HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_NONE, then 0;
+ * - other, then offset, in bytes, from the beginning of the code object to
+ * the first byte in the code object data is copied from.
+ */
+ size_t code_object_storage_offset;
+ /**
+ * Starting address of the underlying memory segment.
+ */
+ const void *segment_base;
+ /**
+ * Size, in bytes, of the underlying memory segment.
+ */
+ size_t segment_size;
+} hsa_ven_amd_loader_segment_descriptor_t;
+
+/**
+ * @brief Either queries loaded memory segment descriptors, or total number of
+ * loaded memory segment descriptors.
+ *
+ *
+ * @details If @p segment_descriptors is not null and @p num_segment_descriptors
+ * points to number that exactly matches total number of loaded memory segment
+ * descriptors, then queries loaded memory segment descriptors, and records them
+ * in @p segment_descriptors. If @p segment_descriptors is null and @p
+ * num_segment_descriptors points to zero, then queries total number of loaded
+ * memory segment descriptors, and records it in @p num_segment_descriptors. In
+ * all other cases returns appropriate error code (see below).
+ *
+ * The caller of this function is responsible for the allocation/deallocation
+ * and the lifetime of @p segment_descriptors and @p num_segment_descriptors.
+ *
+ * The lifetime of loaded memory segments that are described by queried loaded
+ * memory segment descriptors is limited by the lifetime of the executable that
+ * is managing loaded memory segments.
+ *
+ * Queried loaded memory segment descriptors are always self-consistent: they
+ * describe a complete set of loaded memory segments that are being backed by
+ * fully loaded code objects that are present at the time (i.e. this function
+ * is blocked until all executable manipulations are fully complete).
+ *
+ *
+ * @param[out] segment_descriptors Pointer to application-allocated buffer to
+ * record queried loaded memory segment descriptors in. Can be null if @p
+ * num_segment_descriptors points to zero.
+ *
+ * @param[in,out] num_segment_descriptors Pointer to application-allocated
+ * buffer that contains either total number of loaded memory segment descriptors
+ * or zero.
+ *
+ *
+ * @retval HSA_STATUS_SUCCESS Function is executed successfully.
+ *
+ * @retval HSA_STATUS_ERROR_NOT_INITIALIZED Runtime is not initialized.
+ *
+ * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p segment_descriptors is null
+ * while @p num_segment_descriptors points to non-zero number, @p
+ * segment_descriptors is not null while @p num_segment_descriptors points to
+ * zero, or @p num_segment_descriptors is null.
+ *
+ * @retval HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS @p num_segment_descriptors
+ * does not point to number that exactly matches total number of loaded memory
+ * segment descriptors.
+ */
+hsa_status_t hsa_ven_amd_loader_query_segment_descriptors(
+ hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors,
+ size_t *num_segment_descriptors);
+
+/**
+ * @brief Obtains the handle of the executable to which the device address belongs.
+ *
+ * @details This method should not be used to obtain an executable handle by using
+ * a host address. The executable returned is expected to be alive until it is
+ * destroyed by the user.
+ *
+ * @retval HSA_STATUS_SUCCESS Function is executed successfully.
+ *
+ * @retval HSA_STATUS_ERROR_NOT_INITIALIZED Runtime is not initialized.
+ *
+ * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT The input is invalid or there
+ * is no executable found for this kernel code object.
+ */
+hsa_status_t hsa_ven_amd_loader_query_executable(
+ const void *device_address,
+ hsa_executable_t *executable);
+
+//===----------------------------------------------------------------------===//
+
+/**
+ * @brief Iterate over the loaded code objects in an executable, and invoke
+ * an application-defined callback on every iteration.
+ *
+ * @param[in] executable Executable.
+ *
+ * @param[in] callback Callback to be invoked once per loaded code object. The
+ * HSA runtime passes three arguments to the callback: the executable, a
+ * loaded code object, and the application data. If @p callback returns a
+ * status other than ::HSA_STATUS_SUCCESS for a particular iteration, the
+ * traversal stops and
+ * ::hsa_ven_amd_loader_executable_iterate_loaded_code_objects returns that
+ * status value.
+ *
+ * @param[in] data Application data that is passed to @p callback on every
+ * iteration. May be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL.
+ */
+hsa_status_t hsa_ven_amd_loader_executable_iterate_loaded_code_objects(
+ hsa_executable_t executable,
+ hsa_status_t (*callback)(
+ hsa_executable_t executable,
+ hsa_loaded_code_object_t loaded_code_object,
+ void *data),
+ void *data);
+
+/**
+ * @brief Loaded code object kind.
+ */
+typedef enum {
+ /**
+ * Program code object.
+ */
+ HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_KIND_PROGRAM = 1,
+ /**
+ * Agent code object.
+ */
+ HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_KIND_AGENT = 2
+} hsa_ven_amd_loader_loaded_code_object_kind_t;
+
+/**
+ * @brief Loaded code object attributes.
+ */
+typedef enum hsa_ven_amd_loader_loaded_code_object_info_e {
+ /**
+ * The executable in which this loaded code object is loaded. The
+ * type of this attribute is ::hsa_executable_t.
+ */
+ HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_EXECUTABLE = 1,
+ /**
+ * The kind of this loaded code object. The type of this attribute is
+ * ::uint32_t interpreted as ::hsa_ven_amd_loader_loaded_code_object_kind_t.
+ */
+ HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_KIND = 2,
+ /**
+ * The agent on which this loaded code object is loaded. The
+ * value of this attribute is only defined if
+ * ::HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_KIND is
+ * ::HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_KIND_AGENT. The type of this
+ * attribute is ::hsa_agent_t.
+ */
+ HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_AGENT = 3,
+ /**
+ * The storage type of the code object reader used to load the loaded code object.
+ * The type of this attribute is ::uint32_t interpreted as a
+ * ::hsa_ven_amd_loader_code_object_storage_type_t.
+ */
+ HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_TYPE = 4,
+ /**
+ * The memory address of the first byte of the code object that was loaded.
+ * The value of this attribute is only defined if
+ * ::HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_TYPE is
+ * ::HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_MEMORY. The type of this
+ * attribute is ::uint64_t.
+ */
+ HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_MEMORY_BASE = 5,
+ /**
+ * The memory size in bytes of the code object that was loaded.
+ * The value of this attribute is only defined if
+ * ::HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_TYPE is
+ * ::HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_MEMORY. The type of this
+ * attribute is ::uint64_t.
+ */
+ HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_MEMORY_SIZE = 6,
+ /**
+ * The file descriptor of the code object that was loaded.
+ * The value of this attribute is only defined if
+ * ::HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_TYPE is
+ * ::HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_FILE. The type of this
+ * attribute is ::int.
+ */
+ HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_FILE = 7,
+ /**
+ * The signed byte address difference of the memory address at which the code
+ * object is loaded minus the virtual address specified in the code object
+ * that is loaded. The value of this attribute is only defined if the
+ * executable in which the code object is loaded is frozen. The type of this
+ * attribute is ::int64_t.
+ */
+ HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_DELTA = 8,
+ /**
+ * The base memory address at which the code object is loaded. This is the
+ * base address of the allocation for the lowest addressed segment of the code
+ * object that is loaded. Note that any non-loaded segments before the first
+ * loaded segment are ignored. The value of this attribute is only defined if
+ * the executable in which the code object is loaded is frozen. The type of
+ * this attribute is ::uint64_t.
+ */
+ HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_BASE = 9,
+ /**
+ * The byte size of the loaded code object's contiguous memory allocation. The
+ * value of this attribute is only defined if the executable in which the code
+ * object is loaded is frozen. The type of this attribute is ::uint64_t.
+ */
+ HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_SIZE = 10,
+ /**
+ * The length of the URI in bytes, not including the NUL terminator. The type
+ * of this attribute is uint32_t.
+ */
+ HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_URI_LENGTH = 11,
+ /**
+ * The URI name from which the code object was loaded. The type of this
+ * attribute is a NUL terminated \p char* with the length equal to the value
+ * of ::HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_URI_LENGTH attribute.
+ * The URI name syntax is defined by the following BNF syntax:
+ *
+ * code_object_uri ::== file_uri | memory_uri
+ * file_uri ::== "file://" file_path [ range_specifier ]
+ * memory_uri ::== "memory://" process_id range_specifier
+ * range_specifier ::== [ "#" | "?" ] "offset=" number "&" "size=" number
+ * file_path ::== URI_ENCODED_OS_FILE_PATH
+ * process_id ::== DECIMAL_NUMBER
+ * number ::== HEX_NUMBER | DECIMAL_NUMBER | OCTAL_NUMBER
+ *
+ * ``number`` is a C integral literal where hexadecimal values are prefixed by
+ * "0x" or "0X", and octal values by "0".
+ *
+ * ``file_path`` is the file's path specified as a URI encoded UTF-8 string.
+ * In URI encoding, every character that is not in the regular expression
+ * ``[a-zA-Z0-9/_.~-]`` is encoded as two uppercase hexadecimal digits
+ * preceded by "%". Directories in the path are separated by "/".
+ *
+ * ``offset`` is a 0-based byte offset to the start of the code object. For a
+ * file URI, it is from the start of the file specified by the ``file_path``,
+ * and if omitted defaults to 0. For a memory URI, it is the memory address
+ * and is required.
+ *
+ * ``size`` is the number of bytes in the code object. For a file URI, if
+ * omitted it defaults to the size of the file. It is required for a memory
+ * URI.
+ *
+ * ``process_id`` is the identity of the process owning the memory. For Linux
+ * it is the C unsigned integral decimal literal for the process ID (PID).
+ *
+ * For example:
+ *
+ * file:///dir1/dir2/file1
+ * file:///dir3/dir4/file2#offset=0x2000&size=3000
+ * memory://1234#offset=0x20000&size=3000
+ */
+ HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_URI = 12,
+} hsa_ven_amd_loader_loaded_code_object_info_t;
+
+/**
+ * @brief Get the current value of an attribute for a given loaded code
+ * object.
+ *
+ * @param[in] loaded_code_object Loaded code object to query.
+ *
+ * @param[in] attribute Attribute to query.
+ *
+ * @param[out] value Pointer to an application-allocated buffer in which the
+ * value of @p attribute is stored. If the buffer is not large enough to hold
+ * that value (see the attribute's documented type), behavior is undefined.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT The loaded code object is
+ * invalid.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid
+ * loaded code object attribute, or @p value is NULL.
+ */
+hsa_status_t hsa_ven_amd_loader_loaded_code_object_get_info(
+ hsa_loaded_code_object_t loaded_code_object,
+ hsa_ven_amd_loader_loaded_code_object_info_t attribute,
+ void *value);
+
+//===----------------------------------------------------------------------===//
+
+/**
+ * @brief Create a code object reader to operate on a file with offset and size.
+ *
+ * @param[in] file File descriptor. The file must have been opened by the
+ * application with at least read permissions prior to calling this function.
+ * The file must contain a vendor-specific code object.
+ *
+ * The file is owned and managed by the application; the lifetime of the file
+ * descriptor must exceed that of any associated code object reader.
+ *
+ * @param[in] offset 0-based offset relative to the beginning of the @p file
+ * that denotes the beginning of the code object embedded within the @p file.
+ *
+ * @param[in] size Size of the code object embedded in @p file.
+ *
+ * @param[out] code_object_reader Memory location to store the newly created
+ * code object reader handle. Must not be NULL.
+ *
+ * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ *
+ * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
+ * initialized.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_FILE @p file is not opened with at least
+ * read permissions. This condition may also be reported as
+ * ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT_READER by the
+ * ::hsa_executable_load_agent_code_object function.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT The bytes starting at
+ * @p offset do not form a valid code object, the file size is 0, or @p offset
+ * is greater than the file size. This condition may also be reported as
+ * ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT by the
+ * ::hsa_executable_load_agent_code_object function.
+ *
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to
+ * allocate the required resources.
+ *
+ * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p code_object_reader is NULL.
+ */
+hsa_status_t
+hsa_ven_amd_loader_code_object_reader_create_from_file_with_offset_size(
+ hsa_file_t file,
+ size_t offset,
+ size_t size,
+ hsa_code_object_reader_t *code_object_reader);
+
+//===----------------------------------------------------------------------===//
+
+/**
+ * @brief Extension version (note: leading 0 makes this a C octal literal).
+ */
+#define hsa_ven_amd_loader 001002
+
+/**
+ * @brief Extension function table version 1.00: three query entry points.
+ */
+typedef struct hsa_ven_amd_loader_1_00_pfn_s {
+ hsa_status_t (*hsa_ven_amd_loader_query_host_address)(
+ const void *device_address,
+ const void **host_address);
+
+ hsa_status_t (*hsa_ven_amd_loader_query_segment_descriptors)(
+ hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors,
+ size_t *num_segment_descriptors);
+
+ hsa_status_t (*hsa_ven_amd_loader_query_executable)(
+ const void *device_address,
+ hsa_executable_t *executable);
+} hsa_ven_amd_loader_1_00_pfn_t;
+
+/**
+ * @brief Extension function table version 1.01: 1.00 members plus two more.
+ */
+typedef struct hsa_ven_amd_loader_1_01_pfn_s {
+ hsa_status_t (*hsa_ven_amd_loader_query_host_address)(
+ const void *device_address,
+ const void **host_address);
+
+ hsa_status_t (*hsa_ven_amd_loader_query_segment_descriptors)(
+ hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors,
+ size_t *num_segment_descriptors);
+
+ hsa_status_t (*hsa_ven_amd_loader_query_executable)(
+ const void *device_address,
+ hsa_executable_t *executable);
+
+ hsa_status_t (*hsa_ven_amd_loader_executable_iterate_loaded_code_objects)(
+ hsa_executable_t executable,
+ hsa_status_t (*callback)(
+ hsa_executable_t executable,
+ hsa_loaded_code_object_t loaded_code_object,
+ void *data),
+ void *data);
+
+ hsa_status_t (*hsa_ven_amd_loader_loaded_code_object_get_info)(
+ hsa_loaded_code_object_t loaded_code_object,
+ hsa_ven_amd_loader_loaded_code_object_info_t attribute,
+ void *value);
+} hsa_ven_amd_loader_1_01_pfn_t;
+
+/**
+ * @brief Extension function table version 1.02: 1.01 members plus one more.
+ */
+typedef struct hsa_ven_amd_loader_1_02_pfn_s {
+ hsa_status_t (*hsa_ven_amd_loader_query_host_address)(
+ const void *device_address,
+ const void **host_address);
+
+ hsa_status_t (*hsa_ven_amd_loader_query_segment_descriptors)(
+ hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors,
+ size_t *num_segment_descriptors);
+
+ hsa_status_t (*hsa_ven_amd_loader_query_executable)(
+ const void *device_address,
+ hsa_executable_t *executable);
+
+ hsa_status_t (*hsa_ven_amd_loader_executable_iterate_loaded_code_objects)(
+ hsa_executable_t executable,
+ hsa_status_t (*callback)(
+ hsa_executable_t executable,
+ hsa_loaded_code_object_t loaded_code_object,
+ void *data),
+ void *data);
+
+ hsa_status_t (*hsa_ven_amd_loader_loaded_code_object_get_info)(
+ hsa_loaded_code_object_t loaded_code_object,
+ hsa_ven_amd_loader_loaded_code_object_info_t attribute,
+ void *value);
+
+ hsa_status_t
+ (*hsa_ven_amd_loader_code_object_reader_create_from_file_with_offset_size)(
+ hsa_file_t file,
+ size_t offset,
+ size_t size,
+ hsa_code_object_reader_t *code_object_reader);
+} hsa_ven_amd_loader_1_02_pfn_t;
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#endif /* HSA_VEN_AMD_LOADER_H */
diff --git a/third_party/rocm/version.txt b/third_party/rocm/version.txt
new file mode 100644
index 0000000..21016b3
--- /dev/null
+++ b/third_party/rocm/version.txt
@@ -0,0 +1 @@
+4.1.1-34