Fixes to enable clang-cl compilation of compiler/runtime. (#16299)
Also cleaning up some warnings found during the process.
Had to change the ukernel AVX detection to try compiling sources instead
of relying on the presence of compiler flags as on MSVC/clang-cl there's
only a single flag (`/arch:AVX512`) and its presence does not indicate
support for subfeatures.
diff --git a/build_tools/cmake/iree_copts.cmake b/build_tools/cmake/iree_copts.cmake
index 5b36433..10342b4 100644
--- a/build_tools/cmake/iree_copts.cmake
+++ b/build_tools/cmake/iree_copts.cmake
@@ -135,11 +135,12 @@
# https://docs.microsoft.com/en-us/cpp/build/reference/bigobj-increase-number-of-sections-in-dot-obj-file
"/bigobj"
- # Use the modern C preprocessor to more closely match standards/clang/gcc behavior.
- "/Zc:preprocessor"
-
# Enable C11 standards conforming mode.
"$<$<COMPILE_LANGUAGE:C>:/std:c11>"
+
+ MSVC
+ # Use the modern C preprocessor to more closely match standards/clang/gcc behavior.
+ "/Zc:preprocessor"
)
# Compiler diagnostics.
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistributionPatterns.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistributionPatterns.cpp
index 70529bf..351f22d 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistributionPatterns.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistributionPatterns.cpp
@@ -24,22 +24,6 @@
namespace {
-/// Given a LayoutAttr, find the shape of the given layout dimension. It is
-/// expected that the layout has at most one instance of the requested
-/// dimension. Example:
-/// LayoutAttr: <<BATCHX: 4>, <BATCHY: 4, LANEX: 4>>
-/// dim: BATCHX
-/// output: 4
-static std::optional<int64_t> findDimShape(LayoutAttr layout,
- LayoutDimension dim) {
- for (PerDimLayoutAttr dimLayout : layout.getLayouts()) {
- if (std::optional<int64_t> shape = dimLayout.getShape(dim)) {
- return shape;
- }
- }
- return std::nullopt;
-}
-
/// Given the state of the iterator, compute the indices of the original vector
/// that the current iterator state is iterating over. These indices are
/// parameterized by the thread grid.
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/ConvertToLLVM.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/ConvertToLLVM.cpp
index ab2f351..ac844fc 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/ConvertToLLVM.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/ConvertToLLVM.cpp
@@ -695,8 +695,7 @@
/// pattern.
struct RewriteFuncOpABI : public OpRewritePattern<LLVM::LLVMFuncOp> {
RewriteFuncOpABI(HALDispatchABI &abi, LLVMTypeConverter &typeConverter)
- : OpRewritePattern(&typeConverter.getContext()), abi(abi),
- typeConverter(typeConverter) {}
+ : OpRewritePattern(&typeConverter.getContext()), abi(abi) {}
LogicalResult matchAndRewrite(LLVM::LLVMFuncOp funcOp,
PatternRewriter &rewriter) const override {
@@ -742,7 +741,6 @@
private:
HALDispatchABI &abi;
- LLVMTypeConverter &typeConverter;
};
/// Lower call ops with specified ABI. The ABI to use is looked up from the
@@ -754,8 +752,7 @@
/// pattern.
struct RewriteCallOpABI : public OpRewritePattern<LLVM::CallOp> {
RewriteCallOpABI(HALDispatchABI &abi, LLVMTypeConverter &typeConverter)
- : OpRewritePattern(&typeConverter.getContext()), abi(abi),
- typeConverter(typeConverter) {}
+ : OpRewritePattern(&typeConverter.getContext()), abi(abi) {}
LogicalResult matchAndRewrite(LLVM::CallOp callOp,
PatternRewriter &rewriter) const override {
@@ -790,7 +787,6 @@
private:
HALDispatchABI &abi;
- LLVMTypeConverter &typeConverter;
};
/// Rewrites calls to extern functions to dynamic library import calls.
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/DispatchABI.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/DispatchABI.cpp
index f72e936..dd06283 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/DispatchABI.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/DispatchABI.cpp
@@ -72,7 +72,7 @@
}
ExecutableLibraryDI::ExecutableLibraryDI(const LLVMTypeConverter *typeConverter)
- : typeConverter(typeConverter), builder(&typeConverter->getContext()) {
+ : builder(&typeConverter->getContext()) {
auto *context = builder.getContext();
fileAttr = LLVM::DIFileAttr::get(
context, "runtime/src/iree/hal/local/executable_library.h", ".");
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/DispatchABI.h b/compiler/src/iree/compiler/Codegen/LLVMCPU/DispatchABI.h
index 73a4e8f..85297e5 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/DispatchABI.h
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/DispatchABI.h
@@ -95,7 +95,6 @@
LLVM::DIDerivedTypeAttr getWorkgroupStateV0T();
private:
- const LLVMTypeConverter *typeConverter;
Builder builder;
LLVM::DIFileAttr fileAttr;
unsigned ptrBitwidth;
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
index 79571b3..70186e1 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -182,14 +182,6 @@
[&](Range r) { return getStaticValue(r.size); });
}
-/// Returns true if all the input and output tensor operands of 'op' are fully
-/// dynamic.
-static bool isFullyDynamicOp(linalg::LinalgOp op) {
- SmallVector<int64_t> loopRanges = op.getStaticLoopRanges();
- return llvm::all_of(loopRanges,
- [](int64_t size) { return ShapedType::isDynamic(size); });
-}
-
/// Returns the vectorization pre-processing strategy (peeling, masking) for the
/// given LinalgOp. It is based on either (in the priority order):
/// * user-specified value, or
@@ -983,11 +975,6 @@
return inputAndOutputElementTypes[0];
}
-static void getFullRegisterHeuristicsMatmulVectorSizes(
- mlir::FunctionOpInterface entryPointFn, linalg::LinalgOp op,
- int64_t vectorSize, SmallVectorImpl<int64_t> &sizes,
- SmallVectorImpl<bool> &scalableSizeFlags) {}
-
/// Compute or adjust existing vector sizes using a generic heuristic that will
/// aim to fill at least one full vector register for all the element types of
/// the matmul. For now, the current heuristics only look at the N dimension but
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp
index b380043..028abe4 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp
@@ -63,22 +63,6 @@
};
} // namespace
-/// The pipeline parser doesnt like strings that have `'` or `"` in them. But it
-/// is needed for demarcating the option value. So just drop them before sending
-/// it one.
-static StringRef sanitizePipelineString(StringRef input) {
- if (input.empty())
- return input;
- // If first/last character is ' or ", drop them.
- if (input.front() == '\'' || input.front() == '"') {
- input = input.drop_front();
- }
- if (input.back() == '\'' || input.back() == '"') {
- input = input.drop_back();
- }
- return input;
-}
-
/// Verify that valid configuration is set for all ops within the compiled
/// module.
template <typename F>
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVTileAndPromote.cpp b/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVTileAndPromote.cpp
index 3259ec6..32c0663 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVTileAndPromote.cpp
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVTileAndPromote.cpp
@@ -35,8 +35,6 @@
#define DEBUG_TYPE "iree-spirv-tile-and-promote"
-constexpr int kMaxVectorNumBits = 128;
-
namespace mlir::iree_compiler {
//====---------------------------------------------------------------------===//
diff --git a/compiler/src/iree/compiler/Codegen/Utils/LinalgOpInfo.h b/compiler/src/iree/compiler/Codegen/Utils/LinalgOpInfo.h
index eecd1c6..1bfb661 100644
--- a/compiler/src/iree/compiler/Codegen/Utils/LinalgOpInfo.h
+++ b/compiler/src/iree/compiler/Codegen/Utils/LinalgOpInfo.h
@@ -38,7 +38,6 @@
void computeInfo(linalg::LinalgOp);
TransposeMapFilter transposeMapFilter;
- bool transposeTrait;
bool reductionTrait;
bool dynamicTrait;
SmallVector<OpOperand *> transposeOperands;
diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/DeduplicateExecutables.cpp b/compiler/src/iree/compiler/Dialect/Flow/Transforms/DeduplicateExecutables.cpp
index f35c477..f3376be 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/DeduplicateExecutables.cpp
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/DeduplicateExecutables.cpp
@@ -29,15 +29,6 @@
SymbolOpInterface leafOp) {
return nestSymbolRef(baseRefAttr, FlatSymbolRefAttr::get(leafOp));
}
-static SymbolRefAttr nestSymbolRef(SymbolOpInterface baseOp,
- FlatSymbolRefAttr leafRefAttr) {
- return nestSymbolRef(SymbolRefAttr::get(baseOp), leafRefAttr);
-}
-static SymbolRefAttr nestSymbolRef(SymbolOpInterface baseOp,
- SymbolOpInterface leafOp) {
- return nestSymbolRef(SymbolRefAttr::get(baseOp),
- FlatSymbolRefAttr::get(leafOp));
-}
// Recursively gathers symbol->symbol replacements from the old object table
// regions to the new object table regions into |symbolReplacements|.
diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/DumpDispatchGraph.cpp b/compiler/src/iree/compiler/Dialect/Flow/Transforms/DumpDispatchGraph.cpp
index 79185d3..01820cd 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/DumpDispatchGraph.cpp
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/DumpDispatchGraph.cpp
@@ -37,7 +37,6 @@
static const StringRef kLineStyleDataFlow = "solid";
static const StringRef kShapeNode = "box";
static const StringRef kShapeBox = "box";
-static const StringRef kShapeTab = "tab";
static const StringRef kShapeNone = "plain";
static const StringRef kShapeEllipse = "ellipse";
diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/FormDispatchRegions.cpp b/compiler/src/iree/compiler/Dialect/Flow/Transforms/FormDispatchRegions.cpp
index fadfa37..2ceda0d 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/FormDispatchRegions.cpp
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/FormDispatchRegions.cpp
@@ -240,22 +240,6 @@
return isa<IREE::LinalgExt::SetEncodingOp, tensor::PackOp>(op);
}
-/// Returns the source of the pack-like operation.
-// TODO(ravishankarm): This seems like a use case for an interface.
-static Value getSourceOfPackLikeOp(Operation *op) {
- return TypeSwitch<Operation *, Value>(op)
- .Case<tensor::PackOp>([](auto packOp) { return packOp.getSource(); })
- .Case<IREE::LinalgExt::SetEncodingOp>(
- [](auto setEncodingOp) { return setEncodingOp.getSource(); })
- .Default([](Operation *) { return nullptr; });
-}
-static RankedTensorType getSourceTypeOfPackLikeOp(Operation *op) {
- Value source = getSourceOfPackLikeOp(op);
- if (!source)
- return nullptr;
- return llvm::cast<RankedTensorType>(source.getType());
-}
-
/// Returns true if the operation is an `unpack` op or an `unset_encoding` op,
/// or an `extract_slice` op whose source operand matches those criteria,
/// recursively.
diff --git a/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOpFolders.cpp b/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOpFolders.cpp
index 1201b4e..81b9be9 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOpFolders.cpp
+++ b/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOpFolders.cpp
@@ -348,102 +348,6 @@
}
};
-// Materialize copy-on-write (🐄) ops where required for |rootValue|.
-// Only valid in tensor/async ops - don't use with stream.cmd.*.
-static bool materializeCOW(Location loc, Value rootValue, OpBuilder &builder) {
- auto valueType =
- llvm::dyn_cast<IREE::Stream::ResourceType>(rootValue.getType());
- if (!valueType)
- return false;
-
- // If our rootValue is a constant then we need to ensure that we aren't
- // tied to a constant operand. If we are we need to clone to a
- // non-constant value.
- bool forceClone = valueType.getLifetime() == IREE::Stream::Lifetime::Constant;
-
- // Identify if we need to insert a copy-on-write clone.
- // We do this per use as a single consuming op may use the result of this
- // multiple times - some tied and some not - and if it has it tied several
- // times each will need its own clone.
- struct TiedUse {
- Operation *user;
- unsigned operandIndex;
- Value value;
- };
- SmallVector<TiedUse> tiedUses;
- unsigned untiedUses = 0;
- for (auto &use : rootValue.getUses()) {
- if (isa<IREE::Stream::TimepointAwaitOp>(use.getOwner()))
- continue;
- auto tiedOp = dyn_cast<IREE::Util::TiedOpInterface>(use.getOwner());
- bool isTied = tiedOp && tiedOp.isOperandTied(use.getOperandNumber());
- if (isTied) {
- tiedUses.push_back({use.getOwner(), use.getOperandNumber(), rootValue});
- } else {
- ++untiedUses;
- }
- }
- if (tiedUses.empty()) {
- // All uses are as normal capturing SSA values.
- return false;
- } else if (tiedUses.size() == 1 && untiedUses == 0 && !forceClone) {
- // Only one use and it's tied - we've already reserved our results for it.
- return false;
- }
-
- // Mixed/multiple tied uses. Clone for each tied use but leave the untied
- // ones referencing us.
- IREE::Stream::AffinityAttr sourceAffinity;
- if (auto affinityOp = dyn_cast_or_null<IREE::Stream::AffinityOpInterface>(
- rootValue.getDefiningOp())) {
- sourceAffinity = affinityOp.getAffinity();
- }
- for (auto &tiedUse : tiedUses) {
- auto cloneLoc =
- FusedLoc::get(builder.getContext(), {loc, tiedUse.user->getLoc()});
-
- builder.setInsertionPoint(tiedUse.user);
-
- auto sizeAwareType =
- llvm::cast<IREE::Util::SizeAwareTypeInterface>(tiedUse.value.getType());
- auto targetSize =
- sizeAwareType.queryValueSize(cloneLoc, tiedUse.value, builder);
-
- IREE::Stream::AffinityAttr targetAffinity;
- if (auto affinityOp =
- dyn_cast<IREE::Stream::AffinityOpInterface>(tiedUse.user)) {
- targetAffinity = affinityOp.getAffinity();
- }
-
- auto cloneOp = builder.create<IREE::Stream::AsyncCloneOp>(
- cloneLoc, tiedUse.value.getType(), tiedUse.value, targetSize,
- targetSize, targetAffinity ? targetAffinity : sourceAffinity);
- tiedUse.user->setOperand(tiedUse.operandIndex, cloneOp.getResult());
- }
-
- return true;
-}
-
-// Materialize copy-on-write (🐄) ops where required.
-// This models what a runtime normally does with copy-on-write but uses the
-// information we have in the SSA use-def chain to identify ties that write and
-// covering reads.
-template <typename Op>
-struct MaterializeCOW : public OpRewritePattern<Op> {
- using OpRewritePattern<Op>::OpRewritePattern;
- LogicalResult matchAndRewrite(Op op,
- PatternRewriter &rewriter) const override {
- bool didChange = false;
-
- // Handle results of this op (primary use case).
- for (auto result : op->getResults()) {
- didChange = materializeCOW(op.getLoc(), result, rewriter) || didChange;
- }
-
- return didChange ? success() : failure();
- }
-};
-
// Ties the results of execution region to their operands when the region
// operations are tied throughout the entire body.
//
diff --git a/runtime/src/iree/base/internal/math.h b/runtime/src/iree/base/internal/math.h
index d801bda..c3c29f9 100644
--- a/runtime/src/iree/base/internal/math.h
+++ b/runtime/src/iree/base/internal/math.h
@@ -68,7 +68,7 @@
//==============================================================================
static inline int iree_math_count_leading_zeros_u32(const uint32_t n) {
-#if defined(IREE_COMPILER_MSVC)
+#if defined(IREE_COMPILER_MSVC_COMPAT)
unsigned long result = 0; // NOLINT(runtime/int)
if (_BitScanReverse(&result, n)) {
return (int)(31 - result);
@@ -97,7 +97,7 @@
}
static inline int iree_math_count_leading_zeros_u64(uint64_t n) {
-#if defined(IREE_COMPILER_MSVC) && \
+#if defined(IREE_COMPILER_MSVC_COMPAT) && \
(defined(IREE_ARCH_ARM_64) || defined(IREE_ARCH_X86_64))
// MSVC does not have __buitin_clzll. Use _BitScanReverse64.
unsigned long result = 0; // NOLINT(runtime/int)
@@ -105,7 +105,7 @@
return (int)(63 - result);
}
return 64;
-#elif defined(IREE_COMPILER_MSVC)
+#elif defined(IREE_COMPILER_MSVC_COMPAT)
// MSVC does not have __buitin_clzll. Compose two calls to _BitScanReverse
unsigned long result = 0; // NOLINT(runtime/int)
if ((n >> 32) && _BitScanReverse(&result, n >> 32)) {
@@ -141,7 +141,7 @@
}
static inline int iree_math_count_trailing_zeros_u32(uint32_t n) {
-#if defined(IREE_COMPILER_MSVC)
+#if defined(IREE_COMPILER_MSVC_COMPAT)
unsigned long result = 0; // NOLINT(runtime/int)
_BitScanForward(&result, n);
return (int)result;
@@ -160,11 +160,11 @@
}
static inline int iree_math_count_trailing_zeros_u64(uint64_t n) {
-#if defined(IREE_COMPILER_MSVC) && defined(IREE_PTR_SIZE_64)
+#if defined(IREE_COMPILER_MSVC_COMPAT) && defined(IREE_PTR_SIZE_64)
unsigned long result = 0; // NOLINT(runtime/int)
_BitScanForward64(&result, n);
return (int)result;
-#elif defined(IREE_COMPILER_MSVC) && defined(IREE_PTR_SIZE_32)
+#elif defined(IREE_COMPILER_MSVC_COMPAT) && defined(IREE_PTR_SIZE_32)
unsigned long result = 0; // NOLINT(runtime/int)
if ((uint32_t)(n) == 0) {
_BitScanForward(&result, n >> 32);
diff --git a/runtime/src/iree/base/internal/synchronization.c b/runtime/src/iree/base/internal/synchronization.c
index ccb42a4..cc484b4 100644
--- a/runtime/src/iree/base/internal/synchronization.c
+++ b/runtime/src/iree/base/internal/synchronization.c
@@ -61,7 +61,7 @@
// Cross-platform processor yield (where supported)
//==============================================================================
-#if defined(IREE_COMPILER_MSVC)
+#if defined(IREE_COMPILER_MSVC_COMPAT)
// MSVC uses architecture-specific intrinsics.
diff --git a/runtime/src/iree/base/target_platform.h b/runtime/src/iree/base/target_platform.h
index 08310f7..a2beb8c 100644
--- a/runtime/src/iree/base/target_platform.h
+++ b/runtime/src/iree/base/target_platform.h
@@ -34,6 +34,7 @@
// IREE_COMPILER_GCC
// IREE_COMPILER_GCC_COMPAT
// IREE_COMPILER_MSVC
+// IREE_COMPILER_MSVC_COMPAT
//
// IREE_SANITIZER_ADDRESS
// IREE_SANITIZER_MEMORY
@@ -197,12 +198,18 @@
#if defined(__clang__)
#define IREE_COMPILER_CLANG 1
+#if defined(_MSC_VER)
+// clang-cl is msvc-like (but also still clang).
+#define IREE_COMPILER_MSVC_COMPAT 1
+#else
#define IREE_COMPILER_GCC_COMPAT 1
+#endif // _MSC_VER
#elif defined(__GNUC__)
#define IREE_COMPILER_GCC 1
#define IREE_COMPILER_GCC_COMPAT 1
#elif defined(_MSC_VER)
#define IREE_COMPILER_MSVC 1
+#define IREE_COMPILER_MSVC_COMPAT 1
#else
#error Unrecognized compiler.
#endif // compiler versions
diff --git a/runtime/src/iree/builtins/ukernel/arch/x86_64/CMakeLists.txt b/runtime/src/iree/builtins/ukernel/arch/x86_64/CMakeLists.txt
index 18c8274..419afef 100644
--- a/runtime/src/iree/builtins/ukernel/arch/x86_64/CMakeLists.txt
+++ b/runtime/src/iree/builtins/ukernel/arch/x86_64/CMakeLists.txt
@@ -155,7 +155,7 @@
"-mavx2"
"-mfma"
"-mf16c"
- MSVC
+ MSVC_OR_CLANG_CL
"/arch:AVX2"
)
@@ -183,7 +183,7 @@
"-mavx512cd"
"-mavx512bw"
"-mavx512dq"
- MSVC
+ MSVC_OR_CLANG_CL
"/arch:AVX512"
)
@@ -192,7 +192,8 @@
iree_select_compiler_opts(IREE_UK_COPTS_X86_64_AVX512_VNNI_RELATIVE
CLANG_OR_GCC
"-mavx512vnni"
- MSVC
+ CLANG_CL
+ "/clang:-mavx512vnni"
)
set(IREE_UK_COPTS_X86_64_AVX512_VNNI
"${IREE_UK_COPTS_X86_64_AVX512_BASE}"
@@ -204,7 +205,8 @@
iree_select_compiler_opts(IREE_UK_COPTS_X86_64_AVX512_BF16_RELATIVE
CLANG_OR_GCC
"-mavx512bf16"
- MSVC
+ CLANG_CL
+ "/clang:-mavx512bf16"
)
set(IREE_UK_COPTS_X86_64_AVX512_BF16
"${IREE_UK_COPTS_X86_64_AVX512_BASE}"
@@ -224,8 +226,8 @@
# /arch: flag is often too coarse-grained to be meaningful.
# GCC version check
-if ((CMAKE_C_COMPILER_ID STREQUAL GNU) AND
- (CMAKE_C_COMPILER_VERSION VERSION_LESS 12))
+if((CMAKE_C_COMPILER_ID STREQUAL GNU) AND
+ (CMAKE_C_COMPILER_VERSION VERSION_LESS 12))
# Old GCC versions have incompatible x86 intrinsics. Supporting them
# is not considered worth it. By not defining these tokens, we leave these
# code paths out. At least GCC 9 is known to be problematic, while GCC 12 is
@@ -237,27 +239,79 @@
endif() # GCC version check
# MSVC version check for AVX-512-BF16
-if (MSVC_VERSION AND ("${MSVC_VERSION}" VERSION_LESS 1937))
+if(MSVC_VERSION AND ("${MSVC_VERSION}" VERSION_LESS 1937))
# Missing _mm512_cvtpbh_ps intrinsic at _MSC_VER=1930.
set(IREE_UK_TRY_X86_64_AVX512_BF16 OFF)
endif() # MSVC version check for AVX-512-BF16
+# clang-cl version check for vnni bf16 bug.
+# Version 16-17 crash compiling the file and in clang-cl we can't use the
+# inline asm workaround: https://github.com/llvm/llvm-project/issues/68117.
+if((CMAKE_C_COMPILER_ID MATCHES "Clang" AND
+ CMAKE_C_SIMULATE_ID MATCHES "MSVC") AND
+ (CMAKE_C_COMPILER_VERSION VERSION_LESS 18))
+ set(IREE_UK_TRY_X86_64_AVX512_BF16 OFF)
+endif()
+
# Now check compiler support for what we've decided to try.
+# Some instructions are not available with baseline arch flags and need to be
+# tested via compilation. We include the intrinsics used by the kernels whose
+# presence indicates availability of all required intrinsics (vs testing each).
-if (IREE_UK_TRY_X86_64_AVX2_FMA)
- check_cxx_compiler_flag("${IREE_UK_COPTS_X86_64_AVX2_FMA}" IREE_UK_BUILD_X86_64_AVX2_FMA)
+if(IREE_UK_TRY_X86_64_AVX2_FMA)
+ check_cxx_compiler_flag(
+ "${IREE_UK_COPTS_X86_64_AVX2_FMA}"
+ IREE_UK_BUILD_X86_64_AVX2_FMA
+ )
+else()
+ set(IREE_UK_BUILD_X86_64_AVX2_FMA OFF)
endif()
-if (IREE_UK_TRY_X86_64_AVX512_BASE)
- check_cxx_compiler_flag("${IREE_UK_COPTS_X86_64_AVX512_BASE}" IREE_UK_BUILD_X86_64_AVX512_BASE)
+if(IREE_UK_TRY_X86_64_AVX512_BASE)
+ check_cxx_compiler_flag(
+ "${IREE_UK_COPTS_X86_64_AVX512_BASE}"
+ IREE_UK_BUILD_X86_64_AVX512_BASE
+ )
+else()
+ set(IREE_UK_BUILD_X86_64_AVX512_BASE OFF)
endif()
-if (IREE_UK_TRY_X86_64_AVX512_VNNI)
- check_cxx_compiler_flag("${IREE_UK_COPTS_X86_64_AVX512_VNNI}" IREE_UK_BUILD_X86_64_AVX512_VNNI)
+if(IREE_UK_TRY_X86_64_AVX512_VNNI)
+ string(REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${IREE_UK_COPTS_X86_64_AVX512_VNNI}")
+ string(JOIN "\n" IREE_UK_BUILD_X86_64_AVX512_VNNI_TEST
+ "#include <immintrin.h>"
+ "int main() {"
+ " __m512i a, b, d;"
+ " _mm512_dpwssd_epi32(d, a, b);"
+ " return 0;"
+ "}"
+ )
+ check_c_source_compiles(
+ "${IREE_UK_BUILD_X86_64_AVX512_VNNI_TEST}"
+ IREE_UK_BUILD_X86_64_AVX512_VNNI
+ )
+ unset(CMAKE_REQUIRED_FLAGS)
+else()
+ set(IREE_UK_BUILD_X86_64_AVX512_VNNI OFF)
endif()
-if (IREE_UK_TRY_X86_64_AVX512_BF16)
- check_cxx_compiler_flag("${IREE_UK_COPTS_X86_64_AVX512_BF16}" IREE_UK_BUILD_X86_64_AVX512_BF16)
+if(IREE_UK_TRY_X86_64_AVX512_BF16)
+ string(REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${IREE_UK_COPTS_X86_64_AVX512_BF16}")
+ string(JOIN "\n" IREE_UK_BUILD_X86_64_AVX512_BF16_TEST
+ "#include <immintrin.h>"
+ "int main() {"
+ " __m256bh a;"
+ " _mm512_cvtneps_pbh(a);"
+ " return 0;"
+ "}"
+ )
+ check_c_source_compiles(
+ "${IREE_UK_BUILD_X86_64_AVX512_BF16_TEST}"
+ IREE_UK_BUILD_X86_64_AVX512_BF16
+ )
+ unset(CMAKE_REQUIRED_FLAGS)
+else()
+ set(IREE_UK_BUILD_X86_64_AVX512_BF16 OFF)
endif()
# Now generate the configured header. This needs to happen after all
diff --git a/runtime/src/iree/builtins/ukernel/arch/x86_64/mmt4d_x86_64_avx512_bf16.c b/runtime/src/iree/builtins/ukernel/arch/x86_64/mmt4d_x86_64_avx512_bf16.c
index 81ec2f7..f4f0fa4 100644
--- a/runtime/src/iree/builtins/ukernel/arch/x86_64/mmt4d_x86_64_avx512_bf16.c
+++ b/runtime/src/iree/builtins/ukernel/arch/x86_64/mmt4d_x86_64_avx512_bf16.c
@@ -7,7 +7,7 @@
#include "iree/builtins/ukernel/arch/x86_64/common_x86_64.h"
#include "iree/builtins/ukernel/arch/x86_64/mmt4d_x86_64_internal.h"
-#ifdef IREE_UK_COMPILER_CLANG
+#if defined(IREE_UK_COMPILER_CLANG) && !defined(IREE_UK_COMPILER_MSVC)
// This inline-asm function is a work-around for:
// 1. https://github.com/llvm/llvm-project/issues/68117
// Summary: LLVM crash affecting Clang 16-17. Fixed in Clang 18.
@@ -38,7 +38,7 @@
acc, bitcast_16xf32_to_32xbf16(lhs),
bitcast_16xf32_to_32xbf16(_mm512_set1_ps(*(const float*)rhs)));
}
-#endif
+#endif // IREE_UK_COMPILER_CLANG
static inline void
iree_uk_mmt4d_tile_bf16bf16fXX_1x16x2_to_16x16x2_x86_64_avx512_bf16(
diff --git a/runtime/src/iree/io/formats/irpa/irpa_parser.c b/runtime/src/iree/io/formats/irpa/irpa_parser.c
index 04fac9c..dd09b3a 100644
--- a/runtime/src/iree/io/formats/irpa/irpa_parser.c
+++ b/runtime/src/iree/io/formats/irpa/irpa_parser.c
@@ -40,8 +40,8 @@
}
iree_io_physical_offset_t view_offset =
base_offset + header->metadata_segment.offset + range.offset;
- *out_view =
- iree_make_string_view(file_contents.data + view_offset, range.length);
+ *out_view = iree_make_string_view(
+ (const char*)file_contents.data + view_offset, range.length);
return iree_ok_status();
}
diff --git a/runtime/src/iree/modules/hal/exports.inl b/runtime/src/iree/modules/hal/exports.inl
index e61638a..8100bd4 100644
--- a/runtime/src/iree/modules/hal/exports.inl
+++ b/runtime/src/iree/modules/hal/exports.inl
@@ -80,7 +80,7 @@
EXPORT_FN("fence.await", iree_hal_module_fence_await, iCrD, i)
EXPORT_FN("fence.create", iree_hal_module_fence_create, ri, r)
-EXPORT_FN("fence.fail", iree_hal_module_fence_signal, ri, v)
+EXPORT_FN("fence.fail", iree_hal_module_fence_fail, ri, v)
EXPORT_FN("fence.join", iree_hal_module_fence_join, CrD, r)
EXPORT_FN("fence.query", iree_hal_module_fence_query, r, i)
EXPORT_FN("fence.signal", iree_hal_module_fence_signal, r, v)
diff --git a/runtime/src/iree/task/topology_win32.c b/runtime/src/iree/task/topology_win32.c
index b74285c..50cf48c 100644
--- a/runtime/src/iree/task/topology_win32.c
+++ b/runtime/src/iree/task/topology_win32.c
@@ -165,13 +165,13 @@
return iree_make_status(
iree_status_code_from_win32_error(GetLastError()),
"failed to query logical processor information size (%08X)",
- GetLastError());
+ (unsigned)GetLastError());
}
if (cache_relationships_size > 64 * 1024) {
return iree_make_status(IREE_STATUS_RESOURCE_EXHAUSTED,
"logical processor information size overflow (got "
"%u which is large for a stack alloc)",
- cache_relationships_size);
+ (unsigned)cache_relationships_size);
}
SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX* cache_relationships =
(SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*)iree_alloca(
@@ -182,7 +182,8 @@
&cache_relationships_size)) {
return iree_make_status(
iree_status_code_from_win32_error(GetLastError()),
- "failed to query logical processor information (%08X)", GetLastError());
+ "failed to query logical processor information (%08X)",
+ (unsigned)GetLastError());
}
SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX* cache_relationships_end =
(SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*)((uintptr_t)
@@ -222,14 +223,14 @@
return iree_make_status(
iree_status_code_from_win32_error(GetLastError()),
"failed to query logical processor information size (%08X)",
- GetLastError());
+ (unsigned)GetLastError());
}
if (all_relationships_size > 64 * 1024) {
IREE_TRACE_ZONE_END(z0);
return iree_make_status(IREE_STATUS_RESOURCE_EXHAUSTED,
"logical processor information size overflow (got "
"%u which is large for a stack alloc)",
- all_relationships_size);
+ (unsigned)all_relationships_size);
}
SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX* all_relationships =
(SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*)iree_alloca(
@@ -241,7 +242,8 @@
IREE_TRACE_ZONE_END(z0);
return iree_make_status(
iree_status_code_from_win32_error(GetLastError()),
- "failed to query logical processor information (%08X)", GetLastError());
+ "failed to query logical processor information (%08X)",
+ (unsigned)GetLastError());
}
SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX* all_relationships_end =
(SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*)((uintptr_t)all_relationships +
@@ -278,8 +280,6 @@
// Build an on-stack table for random access into all logical processors.
// This isn't strictly required but makes it easier to walk the CPU table.
- PROCESSOR_RELATIONSHIP** all_processors =
- iree_alloca(sizeof(PROCESSOR_RELATIONSHIP*) * total_processor_count);
iree_host_size_t global_processor_count = 0;
for (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX* p = all_relationships;
p < all_relationships_end;
@@ -361,14 +361,14 @@
return iree_make_status(
iree_status_code_from_win32_error(GetLastError()),
"failed to query logical processor information size (%08X)",
- GetLastError());
+ (unsigned)GetLastError());
}
if (all_relationships_size > 64 * 1024) {
IREE_TRACE_ZONE_END(z0);
return iree_make_status(IREE_STATUS_RESOURCE_EXHAUSTED,
"logical processor information size overflow (got "
"%u which is large for a stack alloc)",
- all_relationships_size);
+ (unsigned)all_relationships_size);
}
SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX* all_relationships =
(SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*)iree_alloca(
@@ -380,7 +380,8 @@
IREE_TRACE_ZONE_END(z0);
return iree_make_status(
iree_status_code_from_win32_error(GetLastError()),
- "failed to query logical processor information (%08X)", GetLastError());
+ "failed to query logical processor information (%08X)",
+ (unsigned)GetLastError());
}
SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX* all_relationships_end =
(SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*)((uintptr_t)all_relationships +
diff --git a/runtime/src/iree/vm/bytecode/dispatch.c b/runtime/src/iree/vm/bytecode/dispatch.c
index 1c3d528..6ebbc38 100644
--- a/runtime/src/iree/vm/bytecode/dispatch.c
+++ b/runtime/src/iree/vm/bytecode/dispatch.c
@@ -923,7 +923,7 @@
if (IREE_UNLIKELY(!buffer)) {
return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, "buffer is null");
}
- uint64_t* result = VM_DecResultRegI64("result");
+ uint64_t* result = (uint64_t*)VM_DecResultRegI64("result");
*result = (uint64_t)iree_vm_buffer_length(buffer);
});
@@ -1051,7 +1051,7 @@
"source_buffer is null");
}
iree_host_size_t offset = VM_DecOperandRegI64HostSize("source_offset");
- uint32_t* result = VM_DecResultRegI32("result");
+ int32_t* result = VM_DecResultRegI32("result");
vm_buffer_load_i8u_inline(buffer, offset, result);
});
DISPATCH_OP(CORE, BufferLoadI8S, {
@@ -1064,7 +1064,7 @@
"source_buffer is null");
}
iree_host_size_t offset = VM_DecOperandRegI64HostSize("source_offset");
- uint32_t* result = VM_DecResultRegI32("result");
+ int32_t* result = VM_DecResultRegI32("result");
vm_buffer_load_i8s_inline(buffer, offset, result);
});
DISPATCH_OP(CORE, BufferLoadI16U, {
@@ -1077,7 +1077,7 @@
"source_buffer is null");
}
iree_host_size_t offset = VM_DecOperandRegI64HostSize("source_offset");
- uint32_t* result = VM_DecResultRegI32("result");
+ int32_t* result = VM_DecResultRegI32("result");
vm_buffer_load_i16u_inline(buffer, offset, result);
});
DISPATCH_OP(CORE, BufferLoadI16S, {
@@ -1090,7 +1090,7 @@
"source_buffer is null");
}
iree_host_size_t offset = VM_DecOperandRegI64HostSize("source_offset");
- uint32_t* result = VM_DecResultRegI32("result");
+ int32_t* result = VM_DecResultRegI32("result");
vm_buffer_load_i16s_inline(buffer, offset, result);
});
DISPATCH_OP(CORE, BufferLoadI32, {
@@ -1103,7 +1103,7 @@
"source_buffer is null");
}
iree_host_size_t offset = VM_DecOperandRegI64HostSize("source_offset");
- uint32_t* result = VM_DecResultRegI32("result");
+ int32_t* result = VM_DecResultRegI32("result");
vm_buffer_load_i32_inline(buffer, offset, result);
});
DISPATCH_OP(CORE, BufferLoadI64, {
@@ -1116,7 +1116,7 @@
"source_buffer is null");
}
iree_host_size_t offset = VM_DecOperandRegI64HostSize("source_offset");
- uint64_t* result = VM_DecResultRegI64("result");
+ int64_t* result = VM_DecResultRegI64("result");
vm_buffer_load_i64_inline(buffer, offset, result);
});
@@ -1189,7 +1189,7 @@
iree_host_size_t source_offset =
VM_DecOperandRegI64HostSize("source_offset");
iree_host_size_t length = VM_DecOperandRegI64HostSize("length");
- uint64_t* result = VM_DecResultRegI64("result");
+ int64_t* result = VM_DecResultRegI64("result");
IREE_RETURN_IF_ERROR(
iree_vm_buffer_hash(source_buffer, source_offset, length, result));
});