// Source blob: 3113643ca1d18c743e0959c0bd2612eff6bc1b30
// Copyright 2024 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// Local definitions, substitute for standard headers.
// Our microkernels are completely self-contained, not even including standard
// C library headers such as stdint.h, as that invariably runs into build
// breakage on some compilation hosts where these headers happen to include
// platform stuff that wasn't expecting to get compiled for AMDGPU.
// This code is only ever compiled by recent Clang, in C23 mode, for the
// AMDGPU target. This greatly simplifies this header; in particular, we
// can use any Clang built-in or predefined macro.
#ifndef COMPILER_PLUGINS_TARGET_ROCM_BUILTINS_UKERNEL_COMMON_H_
#define COMPILER_PLUGINS_TARGET_ROCM_BUILTINS_UKERNEL_COMMON_H_
//===----------------------------------------------------------------------===//
// Local replacements for stdint.h
//===----------------------------------------------------------------------===//
// Exact-width integer types, spelled through the compiler's predefined
// __INTn_TYPE__ / __UINTn_TYPE__ macros so that no system header is needed.
// Grouped by width: signed first, then unsigned.
typedef __INT8_TYPE__ int8_t;
typedef __UINT8_TYPE__ uint8_t;
typedef __INT16_TYPE__ int16_t;
typedef __UINT16_TYPE__ uint16_t;
typedef __INT32_TYPE__ int32_t;
typedef __UINT32_TYPE__ uint32_t;
typedef __INT64_TYPE__ int64_t;
typedef __UINT64_TYPE__ uint64_t;
// Limits of the exact-width integer types.
//
// The compiler predefines __INTn_MAX__ and __UINTn_MAX__, but there is no
// predefined __INTn_MIN__ family, so the signed minima are derived from the
// maxima instead (two's complement: MIN == -MAX - 1). Each expansion is
// parenthesized so it behaves as a single operand wherever it is used.
#define INT8_MIN (-__INT8_MAX__ - 1)
#define INT16_MIN (-__INT16_MAX__ - 1)
#define INT32_MIN (-__INT32_MAX__ - 1)
#define INT64_MIN (-__INT64_MAX__ - 1)
#define INT8_MAX __INT8_MAX__
#define INT16_MAX __INT16_MAX__
#define INT32_MAX __INT32_MAX__
#define INT64_MAX __INT64_MAX__
#define UINT8_MAX __UINT8_MAX__
#define UINT16_MAX __UINT16_MAX__
#define UINT32_MAX __UINT32_MAX__
#define UINT64_MAX __UINT64_MAX__
// Note: intentionally NO size_t and other address-space-size-dependent types.
// Since on AMDGPU we are dealing with different address spaces with different
// pointer sizes (64-bit global vs. 32-bit LDS), each function can use int64 or
// int32 explicitly.
//===----------------------------------------------------------------------===//
// Local replacements for float.h
//===----------------------------------------------------------------------===//
// Single-precision limits, forwarded to the compiler's predefined macros.
// FLT_MIN is the smallest positive normalized float, FLT_MAX the largest
// finite float, and FLT_EPSILON the gap between 1.0f and the next float.
#define FLT_MIN __FLT_MIN__
#define FLT_MAX __FLT_MAX__
#define FLT_EPSILON __FLT_EPSILON__
//===----------------------------------------------------------------------===//
// Vector typedefs
//===----------------------------------------------------------------------===//
typedef __attribute__((__vector_size__(4 * 4))) int32_t int32x4_t;
//===----------------------------------------------------------------------===//
// Declarations for Clangd, which may be slightly older than actual clang.
// Drop these as clangd versions used in practice gain these builtins.
// Unconditionally declaring these, regardless of clang version, ensures that
// any mistake in the declaration is caught by newer clangs.
//===----------------------------------------------------------------------===//
// Missing in clang 18. https://github.com/llvm/llvm-project/pull/80741.
// Takes no arguments and returns an unsigned int (the wavefront size, going
// by the builtin's name).
unsigned __builtin_amdgcn_wavefrontsize(void);
//===----------------------------------------------------------------------===//
// Local replacements for AMD device library headers
//===----------------------------------------------------------------------===//
// Prototypes only; the definitions are not in this header.
// NOTE(review): going by the `wfred_<op>_<type>` naming these are presumably
// wavefront-wide max/min reductions from the AMD device library — confirm
// against the device library sources.
_Float16 __ockl_wfred_max_f16(_Float16 value);
int64_t __ockl_wfred_min_i64(int64_t value);
int32_t __ockl_wfred_min_i32(int32_t value);
// Flag type accepted by __work_group_barrier(), plus the one flag value this
// header defines for it.
typedef unsigned int __cl_mem_fence_flags;
#define __CLK_LOCAL_MEM_FENCE 0x01
// Issues a sequentially-consistent memory fence at "workgroup" scope.
// This is a fence only: no barrier is executed here.
static inline void __threadfence_block(void) {
  __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup");
}
// Executes an s_barrier, optionally bracketed by workgroup-scope fences.
//
// flags: when nonzero, a release fence is issued before the barrier and an
//        acquire fence after it; when zero, only the barrier is executed.
//        Only zero vs. nonzero is tested, not individual flag bits.
static inline void __work_group_barrier(__cl_mem_fence_flags flags) {
  const int with_fences = (flags != 0);
  if (with_fences) {
    // Release fence issued ahead of the barrier.
    __builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup");
  }
  __builtin_amdgcn_s_barrier();
  if (with_fences) {
    // Acquire fence issued once the barrier has been passed.
    __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
  }
}
static inline void __barrier(int n) {
__work_group_barrier((__cl_mem_fence_flags)n);
}
// Barrier with fences: calls __barrier() with __CLK_LOCAL_MEM_FENCE, a
// nonzero flag, so the barrier is bracketed by release/acquire fences.
// Marked convergent so the compiler does not make the call depend on
// additional control flow.
[[clang::convergent]] static inline void __syncthreads(void) {
  __barrier(__CLK_LOCAL_MEM_FENCE);
}
//===----------------------------------------------------------------------===//
// Local replacements for HIP headers
//===----------------------------------------------------------------------===//
// Returns the index of the calling lane within its wavefront.
//
// mbcnt_lo / mbcnt_hi count the mask bits that belong to lanes below the
// current one (low and high 32 lanes respectively) and add the result to
// their second operand. With an all-ones mask, chaining the two gives the
// number of lanes below this one, which is the lane index.
static inline int __lane_id(void) {
  const unsigned int below_in_low_half = __builtin_amdgcn_mbcnt_lo(-1, 0);
  return __builtin_amdgcn_mbcnt_hi(-1, below_in_low_half);
}
// Exchanges an int between lanes of a wavefront: each lane receives `var`
// from the lane whose index is (own lane index XOR lane_mask).
//
// var:       value contributed by the calling lane.
// lane_mask: XOR-ed with the caller's lane index to pick the source lane.
// Returns the source lane's `var`. If the computed source index falls at or
// beyond the end of the wavefront-sized group containing the caller, the
// caller's own lane is used as the source instead.
static inline int __shfl_xor_i(int var, int lane_mask) {
  const int wave_size = __builtin_amdgcn_wavefrontsize();
  const int this_lane = __lane_id();
  // First lane index past the wave_size-aligned group holding this lane.
  const int group_end = (this_lane + wave_size) & ~(wave_size - 1);
  int src_lane = this_lane ^ lane_mask;
  if (src_lane >= group_end) {
    src_lane = this_lane;
  }
  // ds_bpermute addresses lanes in bytes, hence lane index * 4.
  return __builtin_amdgcn_ds_bpermute(src_lane << 2, var);
}
// Float flavour of __shfl_xor_i(): the float's bit pattern is reinterpreted
// as an int through a union, shuffled, and reinterpreted back. No numeric
// conversion takes place.
//
// var:       value contributed by the calling lane.
// lane_mask: forwarded unchanged to __shfl_xor_i().
static inline float __shfl_xor_f(float var, int lane_mask) {
  union {
    float as_float;
    int as_bits;
  } pun = {.as_float = var};
  pun.as_bits = __shfl_xor_i(pun.as_bits, lane_mask);
  return pun.as_float;
}
// Compares `predicate` against 0 with "not equal" on every lane and returns
// the per-lane results as a 64-bit mask (one bit per lane, set where the
// predicate is nonzero).
static inline uint64_t __ballot(int predicate) {
  // ICMP_NE from llvm/include/llvm/IR/InstrTypes.h
  enum { kIcmpNe = 33 };
  return __builtin_amdgcn_uicmp(predicate, 0, kIcmpNe);
}
#endif // COMPILER_PLUGINS_TARGET_ROCM_BUILTINS_UKERNEL_COMMON_H_