blob: bac6b383ccbcd79c3c80fcc4a158c3dc1b826c90 [file]
// RUN: iree-compile %s \
// RUN: --iree-hal-target-device=local \
// RUN: --iree-hal-local-target-device-backends=vmvx \
// RUN: --compile-to=stream | \
// RUN: FileCheck %s --check-prefix=CHECK-IR
// Ensure no allocations made it into the IR: in this example the output should
// be placed in the %output storage and the required transients in
// %transient_storage.
// CHECK-IR-LABEL: util.func public @in_place_computation
// CHECK-IR-NOT: stream.resource.alloc
// CHECK-IR-NOT: stream.resource.alloca
// CHECK-IR-NOT: stream.resource.dealloca
// RUN: (iree-compile %s \
// RUN: --iree-hal-target-device=local \
// RUN: --iree-hal-local-target-device-backends=vmvx | \
// RUN: iree-run-module \
// RUN: --device=local-task \
// RUN: --module=- \
// RUN: --function=in_place_computation \
// RUN: --input="64xf32=1.0" \
// RUN: --input="64xf32=0" \
// RUN: --input="&512xi8=0") | \
// RUN: FileCheck %s --check-prefix=CHECK-EXEC
// Ensure we produce the right value (the returned buffer view is aliasing the
// %output storage provided, but there's no way to test that here).
// CHECK-EXEC-LABEL: EXEC @in_place_computation
// CHECK-EXEC: 64xf32=5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
// Test for external transients feature: verifies that transient allocations
// can be emplaced into externally-provided storage.
//
// The test performs three sequential operations wrapped in dispatch regions
// to prevent fusion:
// 1. temp1 = input + 1.0 (reads input; writes transient storage)
// 2. temp2 = temp1 * 2.0 (reads transient; writes transient storage)
// 3. output = temp2 + input (reads transient and input; writes output buffer)
//
// With input=[1.0, 1.0, ...], the expected output is [5.0, 5.0, ...].
// Entry point under test: chains three elementwise dispatches over %input.
// %output is annotated with `iree.abi.output = 0` and %transient_storage with
// `iree.abi.transients`; neither value is referenced directly in the body.
util.func public @in_place_computation(
    %input: tensor<64xf32>,
    %output: tensor<64xf32> {iree.abi.output = 0 : index},
    %transient_storage: !hal.buffer {iree.abi.transients}
) -> tensor<64xf32> {
  // Stage 1 (own dispatch region): add 1.0 to every input element.
  %temp1 = flow.dispatch.region -> (tensor<64xf32>) {
    %ones = arith.constant dense<1.0> : tensor<64xf32>
    %incremented = arith.addf %input, %ones : tensor<64xf32>
    flow.return %incremented : tensor<64xf32>
  }
  // Stage 2 (own dispatch region): scale the stage-1 value by 2.0.
  %temp2 = flow.dispatch.region -> (tensor<64xf32>) {
    %twos = arith.constant dense<2.0> : tensor<64xf32>
    %doubled = arith.mulf %temp1, %twos : tensor<64xf32>
    flow.return %doubled : tensor<64xf32>
  }
  // Stage 3 (own dispatch region): add the original input to the stage-2
  // value; this is the value returned from the function.
  %computed = flow.dispatch.region -> (tensor<64xf32>) {
    %sum = arith.addf %temp2, %input : tensor<64xf32>
    flow.return %sum : tensor<64xf32>
  }
  util.return %computed : tensor<64xf32>
}