// tools/test/iree-run-module-in-place.mlir - 3p/openxla/iree - Git at Google

 // RUN: iree-compile %s \
 // RUN:    --iree-hal-target-device=local \
 // RUN:    --iree-hal-local-target-device-backends=vmvx \
 // RUN:    --compile-to=stream | \
 // RUN: FileCheck %s --check-prefix=CHECK-IR

 // Ensure no allocations made it into the IR - in this example the output should
 // be placed in the %output storage and the required transient in
 // %transient_storage.
 // CHECK-IR-LABEL: util.func public @in_place_computation
 // CHECK-IR-NOT: stream.resource.alloc
 // CHECK-IR-NOT: stream.resource.alloca
 // CHECK-IR-NOT: stream.resource.dealloca

 // RUN: (iree-compile %s \
 // RUN:    --iree-hal-target-device=local \
 // RUN:    --iree-hal-local-target-device-backends=vmvx | \
 // RUN:  iree-run-module \
 // RUN:    --device=local-task \
 // RUN:    --module=- \
 // RUN:    --function=in_place_computation \
 // RUN:    --input="64xf32=1.0" \
 // RUN:    --input="64xf32=0" \
 // RUN:    --input="&512xi8=0") | \
 // RUN:  FileCheck %s --check-prefix=CHECK-EXEC

 // Ensure we produce the right value (the returned buffer view is aliasing the
 // %output storage provided, but there's no way to test that here).
 // CHECK-EXEC-LABEL: EXEC @in_place_computation
 // CHECK-EXEC: 64xf32=5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5

 // Test for external transients feature: verifies that transient allocations
 // can be emplaced into externally-provided storage.
 //
 // The test performs three sequential operations wrapped in dispatch regions
 // to prevent fusion:
 // 1. temp1 = input + 1.0  (uses input + transient storage)
 // 2. temp2 = temp1 * 2.0  (uses transient + transient storage)
 // 3. output = temp2 + input (uses transient + output buffer)
 //
 // With input=[1.0, 1.0, ...], the expected output is [5.0, 5.0, ...].

 // Arguments:
 //   %input             - source values, read by the first and the last stage.
 //   %output            - caller-provided storage for result 0 (iree.abi.output).
 //   %transient_storage - caller-provided buffer for intermediate values
 //                        (iree.abi.transients).
 util.func public @in_place_computation(
   %input: tensor<64xf32>,
   %output: tensor<64xf32> {iree.abi.output = 0 : index},
   %transient_storage: !hal.buffer {iree.abi.transients}
 ) -> tensor<64xf32> {
   // Stage 1, in its own dispatch region: temp1 = input + 1.0
   %temp1 = flow.dispatch.region -> (tensor<64xf32>) {
     %ones = arith.constant dense<1.0> : tensor<64xf32>
     %incremented = arith.addf %input, %ones : tensor<64xf32>
     flow.return %incremented : tensor<64xf32>
   }

   // Stage 2, in its own dispatch region: temp2 = temp1 * 2.0
   %temp2 = flow.dispatch.region -> (tensor<64xf32>) {
     %twos = arith.constant dense<2.0> : tensor<64xf32>
     %doubled = arith.mulf %temp1, %twos : tensor<64xf32>
     flow.return %doubled : tensor<64xf32>
   }

   // Stage 3, in its own dispatch region: the returned value is temp2 + input.
   %combined = flow.dispatch.region -> (tensor<64xf32>) {
     %total = arith.addf %temp2, %input : tensor<64xf32>
     flow.return %total : tensor<64xf32>
   }

   util.return %combined : tensor<64xf32>
 }
	// RUN: iree-compile %s \
	// RUN: --iree-hal-target-device=local \
	// RUN: --iree-hal-local-target-device-backends=vmvx \
	// RUN: --compile-to=stream | \
	// RUN: FileCheck %s --check-prefix=CHECK-IR

	// Ensure no allocations made it into the IR - in this example the output should
	// be placed in the %output storage and the required transient in
	// %transient_storage.
	// CHECK-IR-LABEL: util.func public @in_place_computation
	// CHECK-IR-NOT: stream.resource.alloc
	// CHECK-IR-NOT: stream.resource.alloca
	// CHECK-IR-NOT: stream.resource.dealloca

	// RUN: (iree-compile %s \
	// RUN: --iree-hal-target-device=local \
	// RUN: --iree-hal-local-target-device-backends=vmvx | \
	// RUN: iree-run-module \
	// RUN: --device=local-task \
	// RUN: --module=- \
	// RUN: --function=in_place_computation \
	// RUN: --input="64xf32=1.0" \
	// RUN: --input="64xf32=0" \
	// RUN: --input="&512xi8=0") | \
	// RUN: FileCheck %s --check-prefix=CHECK-EXEC

	// Ensure we produce the right value (the returned buffer view is aliasing the
	// %output storage provided, but there's no way to test that here).
	// CHECK-EXEC-LABEL: EXEC @in_place_computation
	// CHECK-EXEC: 64xf32=5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5

	// Test for external transients feature: verifies that transient allocations
	// can be emplaced into externally-provided storage.
	//
	// The test performs three sequential operations wrapped in dispatch regions
	// to prevent fusion:
	// 1. temp1 = input + 1.0 (uses input + transient storage)
	// 2. temp2 = temp1 * 2.0 (uses transient + transient storage)
	// 3. output = temp2 + input (uses transient + output buffer)
	//
	// With input=[1.0, 1.0, ...], the expected output is [5.0, 5.0, ...].

	// NOTE(review): a function with this same symbol name is also defined
	// earlier in this file; MLIR symbol tables do not accept two definitions of
	// one symbol, so confirm which copy is meant to remain.
	util.func public @in_place_computation(
	  %input: tensor<64xf32>,
	  %output: tensor<64xf32> {iree.abi.output = 0 : index},
	  %transient_storage: !hal.buffer {iree.abi.transients}
	) -> tensor<64xf32> {
	  // First dispatch region: temp1 = input + 1.0
	  %temp1 = flow.dispatch.region -> (tensor<64xf32>) {
	    %splat_one = arith.constant dense<1.0> : tensor<64xf32>
	    %plus_one = arith.addf %input, %splat_one : tensor<64xf32>
	    flow.return %plus_one : tensor<64xf32>
	  }

	  // Second dispatch region: temp2 = temp1 * 2.0
	  %temp2 = flow.dispatch.region -> (tensor<64xf32>) {
	    %splat_two = arith.constant dense<2.0> : tensor<64xf32>
	    %times_two = arith.mulf %temp1, %splat_two : tensor<64xf32>
	    flow.return %times_two : tensor<64xf32>
	  }

	  // Third dispatch region: the returned value is temp2 + input.
	  %returned = flow.dispatch.region -> (tensor<64xf32>) {
	    %with_input = arith.addf %temp2, %input : tensor<64xf32>
	    flow.return %with_input : tensor<64xf32>
	  }

	  util.return %returned : tensor<64xf32>
	}