tests/e2e/linalg_ext_ops/scan_configured.mlir - 3p/openxla/iree - Git at Google

 #translation = #iree_codegen.translation_info<
     pipeline = #iree_gpu.pipeline<VectorDistribute> workgroup_size = [64, 1, 1] subgroup_size = 64>
 #config = #iree_gpu.lowering_config<{
     lane_basis = [[8], [0]],
     subgroup_basis = [[1], [0]],
     thread = [8]
 }>
 #compilation = #iree_codegen.compilation_info<
     lowering_config = #config,
     translation_info = #translation>

 func.func @scan_dim0_inclusive_sum_configured_vector_distribute() {
   %c1 = arith.constant 1 : index
   %input_empty = tensor.empty() : tensor<256xf32>
   %input_init = linalg.generic {
       indexing_maps = [affine_map<(d0) -> (d0)>],
       iterator_types = ["parallel"]}
       outs(%input_empty : tensor<256xf32>) {
     ^bb0(%out: f32):
       %idx = linalg.index 0 : index
       %idx1 = arith.addi %idx, %c1 : index
       %idx_i32 = arith.index_cast %idx1 : index to i32
       %value = arith.sitofp %idx_i32 : i32 to f32
       linalg.yield %value : f32
   } -> tensor<256xf32>
   %input = util.optimization_barrier %input_init : tensor<256xf32>

   %output = tensor.empty() : tensor<256xf32>
   %accum = util.unfoldable_constant dense<0.0> : tensor<f32>
   %0:2 = iree_linalg_ext.scan {compilation_info = #compilation}
          dimension(0) inclusive(true)
          ins(%input : tensor<256xf32>)
          outs(%output, %accum : tensor<256xf32>, tensor<f32>) {
            ^bb0(%lhs : f32, %rhs : f32):
              %sum = arith.addf %lhs, %rhs : f32
              iree_linalg_ext.yield %sum : f32
          } -> tensor<256xf32>, tensor<f32>

   %prefix = tensor.extract_slice %0#0[0] [4] [1]
       : tensor<256xf32> to tensor<4xf32>
   check.expect_almost_eq_const(
       %prefix,
       dense<[1.0, 3.0, 6.0, 10.0]> : tensor<4xf32>
   ) : tensor<4xf32>

   check.expect_almost_eq_const(
       %0#1,
       dense<32896.0> : tensor<f32>
   ) : tensor<f32>

   return
 }

 #translation_exclusive = #iree_codegen.translation_info<
     pipeline = #iree_gpu.pipeline<VectorDistribute> workgroup_size = [64, 1, 1] subgroup_size = 64>
 #config_exclusive = #iree_gpu.lowering_config<{
     lane_basis = [[16], [0]],
     subgroup_basis = [[1], [0]],
     thread = [16]
 }>
 #compilation_exclusive = #iree_codegen.compilation_info<
     lowering_config = #config_exclusive,
     translation_info = #translation_exclusive>

 func.func @scan_dim0_exclusive_sum_configured_vector_distribute() {
   %c1 = arith.constant 1 : index
   %input_empty = tensor.empty() : tensor<256xf32>
   %input_init = linalg.generic {
       indexing_maps = [affine_map<(d0) -> (d0)>],
       iterator_types = ["parallel"]}
       outs(%input_empty : tensor<256xf32>) {
     ^bb0(%out: f32):
       %idx = linalg.index 0 : index
       %idx1 = arith.addi %idx, %c1 : index
       %idx_i32 = arith.index_cast %idx1 : index to i32
       %value = arith.sitofp %idx_i32 : i32 to f32
       linalg.yield %value : f32
   } -> tensor<256xf32>
   %input = util.optimization_barrier %input_init : tensor<256xf32>

   %output = tensor.empty() : tensor<256xf32>
   %accum = util.unfoldable_constant dense<5.0> : tensor<f32>
   %0:2 = iree_linalg_ext.scan {compilation_info = #compilation_exclusive}
          dimension(0) inclusive(false)
          ins(%input : tensor<256xf32>)
          outs(%output, %accum : tensor<256xf32>, tensor<f32>) {
            ^bb0(%lhs : f32, %rhs : f32):
              %sum = arith.addf %lhs, %rhs : f32
              iree_linalg_ext.yield %sum : f32
          } -> tensor<256xf32>, tensor<f32>

   %prefix = tensor.extract_slice %0#0[0] [4] [1]
       : tensor<256xf32> to tensor<4xf32>
   check.expect_almost_eq_const(
       %prefix,
       dense<[5.0, 6.0, 8.0, 11.0]> : tensor<4xf32>
   ) : tensor<4xf32>

   %boundary = tensor.extract_slice %0#0[14] [4] [1]
       : tensor<256xf32> to tensor<4xf32>
   check.expect_almost_eq_const(
       %boundary,
       dense<[110.0, 125.0, 141.0, 158.0]> : tensor<4xf32>
   ) : tensor<4xf32>

   check.expect_almost_eq_const(
       %0#1,
       dense<32645.0> : tensor<f32>
   ) : tensor<f32>

   return
 }

 #translation_large = #iree_codegen.translation_info<
     pipeline = #iree_gpu.pipeline<VectorDistribute> workgroup_size = [64, 1, 1] subgroup_size = 64>
 #config_large = #iree_gpu.lowering_config<{
     lane_basis = [[2, 8], [0, 1]],
     subgroup_basis = [[1, 1], [0, 1]],
     thread = [2, 8]
 }>
 #compilation_large = #iree_codegen.compilation_info<
     lowering_config = #config_large,
     translation_info = #translation_large>

 func.func @scan_dim1_inclusive_sum_large_configured_vector_distribute() {
   %c1 = arith.constant 1 : index
   %c1000 = arith.constant 1000 : index
   %input_empty = tensor.empty() : tensor<64x256xf32>
   %input_init = linalg.generic {
       indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>],
       iterator_types = ["parallel", "parallel"]}
       outs(%input_empty : tensor<64x256xf32>) {
     ^bb0(%out: f32):
       %row = linalg.index 0 : index
       %col = linalg.index 1 : index
       %row_base = arith.muli %row, %c1000 : index
       %col1 = arith.addi %col, %c1 : index
       %idx = arith.addi %row_base, %col1 : index
       %idx_i32 = arith.index_cast %idx : index to i32
       %value = arith.sitofp %idx_i32 : i32 to f32
       linalg.yield %value : f32
   } -> tensor<64x256xf32>
   %input = util.optimization_barrier %input_init : tensor<64x256xf32>

   %output = tensor.empty() : tensor<64x256xf32>
   %accum = util.unfoldable_constant dense<0.0> : tensor<64xf32>
   %0:2 = iree_linalg_ext.scan {compilation_info = #compilation_large}
          dimension(1) inclusive(true)
          ins(%input : tensor<64x256xf32>)
          outs(%output, %accum : tensor<64x256xf32>, tensor<64xf32>) {
            ^bb0(%lhs : f32, %rhs : f32):
              %sum = arith.addf %lhs, %rhs : f32
              iree_linalg_ext.yield %sum : f32
          } -> tensor<64x256xf32>, tensor<64xf32>

   %prefix_row0 = tensor.extract_slice %0#0[0, 0] [1, 4] [1, 1]
       : tensor<64x256xf32> to tensor<4xf32>
   check.expect_almost_eq_const(
       %prefix_row0,
       dense<[1.0, 3.0, 6.0, 10.0]> : tensor<4xf32>
   ) : tensor<4xf32>

   %prefix_row1 = tensor.extract_slice %0#0[1, 0] [1, 4] [1, 1]
       : tensor<64x256xf32> to tensor<4xf32>
   check.expect_almost_eq_const(
       %prefix_row1,
       dense<[1001.0, 2003.0, 3006.0, 4010.0]> : tensor<4xf32>
   ) : tensor<4xf32>

   %boundary_row0 = tensor.extract_slice %0#0[0, 14] [1, 4] [1, 1]
       : tensor<64x256xf32> to tensor<4xf32>
   check.expect_almost_eq_const(
       %boundary_row0,
       dense<[120.0, 136.0, 153.0, 171.0]> : tensor<4xf32>
   ) : tensor<4xf32>

   %boundary_row1 = tensor.extract_slice %0#0[1, 14] [1, 4] [1, 1]
       : tensor<64x256xf32> to tensor<4xf32>
   check.expect_almost_eq_const(
       %boundary_row1,
       dense<[15120.0, 16136.0, 17153.0, 18171.0]> : tensor<4xf32>
   ) : tensor<4xf32>

   %acc_prefix = tensor.extract_slice %0#1[0] [2] [1]
       : tensor<64xf32> to tensor<2xf32>
   check.expect_almost_eq_const(
       %acc_prefix,
       dense<[32896.0, 288896.0]> : tensor<2xf32>
   ) : tensor<2xf32>

   return
 }

 #translation_cross_subgroup = #iree_codegen.translation_info<
     pipeline = #iree_gpu.pipeline<VectorDistribute> workgroup_size = [128, 1, 1] subgroup_size = 64>
 #config_cross_subgroup = #iree_gpu.lowering_config<{
     subgroup_basis = [[2], [0]],
     lane_basis = [[8], [0]],
     thread = [4]
 }>
 #compilation_cross_subgroup = #iree_codegen.compilation_info<
     lowering_config = #config_cross_subgroup,
     translation_info = #translation_cross_subgroup>

 func.func @scan_dim0_inclusive_sum_cross_subgroup_configured_vector_distribute() {
   %c1 = arith.constant 1 : index
   %input_empty = tensor.empty() : tensor<128xf32>
   %input_init = linalg.generic {
       indexing_maps = [affine_map<(d0) -> (d0)>],
       iterator_types = ["parallel"]}
       outs(%input_empty : tensor<128xf32>) {
     ^bb0(%out: f32):
       %idx = linalg.index 0 : index
       %idx1 = arith.addi %idx, %c1 : index
       %idx_i32 = arith.index_cast %idx1 : index to i32
       %value = arith.sitofp %idx_i32 : i32 to f32
       linalg.yield %value : f32
   } -> tensor<128xf32>
   %input = util.optimization_barrier %input_init : tensor<128xf32>

   %output = tensor.empty() : tensor<128xf32>
   %accum = util.unfoldable_constant dense<0.0> : tensor<f32>
   %0:2 = iree_linalg_ext.scan {compilation_info = #compilation_cross_subgroup}
          dimension(0) inclusive(true)
          ins(%input : tensor<128xf32>)
          outs(%output, %accum : tensor<128xf32>, tensor<f32>) {
            ^bb0(%lhs : f32, %rhs : f32):
              %sum = arith.addf %lhs, %rhs : f32
              iree_linalg_ext.yield %sum : f32
          } -> tensor<128xf32>, tensor<f32>

   %prefix = tensor.extract_slice %0#0[0] [4] [1]
       : tensor<128xf32> to tensor<4xf32>
   check.expect_almost_eq_const(
       %prefix,
       dense<[1.0, 3.0, 6.0, 10.0]> : tensor<4xf32>
   ) : tensor<4xf32>

   %boundary = tensor.extract_slice %0#0[62] [4] [1]
       : tensor<128xf32> to tensor<4xf32>
   check.expect_almost_eq_const(
       %boundary,
       dense<[2016.0, 2080.0, 2145.0, 2211.0]> : tensor<4xf32>
   ) : tensor<4xf32>

   check.expect_almost_eq_const(
       %0#1,
       dense<8256.0> : tensor<f32>
   ) : tensor<f32>

   return
 }

 func.func @scan_dim0_exclusive_sum_cross_subgroup_configured_vector_distribute() {
   %c1 = arith.constant 1 : index
   %input_empty = tensor.empty() : tensor<128xf32>
   %input_init = linalg.generic {
       indexing_maps = [affine_map<(d0) -> (d0)>],
       iterator_types = ["parallel"]}
       outs(%input_empty : tensor<128xf32>) {
     ^bb0(%out: f32):
       %idx = linalg.index 0 : index
       %idx1 = arith.addi %idx, %c1 : index
       %idx_i32 = arith.index_cast %idx1 : index to i32
       %value = arith.sitofp %idx_i32 : i32 to f32
       linalg.yield %value : f32
   } -> tensor<128xf32>
   %input = util.optimization_barrier %input_init : tensor<128xf32>

   %output = tensor.empty() : tensor<128xf32>
   %accum = util.unfoldable_constant dense<5.0> : tensor<f32>
   %0:2 = iree_linalg_ext.scan {compilation_info = #compilation_cross_subgroup}
          dimension(0) inclusive(false)
          ins(%input : tensor<128xf32>)
          outs(%output, %accum : tensor<128xf32>, tensor<f32>) {
            ^bb0(%lhs : f32, %rhs : f32):
              %sum = arith.addf %lhs, %rhs : f32
              iree_linalg_ext.yield %sum : f32
          } -> tensor<128xf32>, tensor<f32>

   %prefix = tensor.extract_slice %0#0[0] [4] [1]
       : tensor<128xf32> to tensor<4xf32>
   check.expect_almost_eq_const(
       %prefix,
       dense<[5.0, 6.0, 8.0, 11.0]> : tensor<4xf32>
   ) : tensor<4xf32>

   %boundary = tensor.extract_slice %0#0[62] [4] [1]
       : tensor<128xf32> to tensor<4xf32>
   check.expect_almost_eq_const(
       %boundary,
       dense<[1958.0, 2021.0, 2085.0, 2150.0]> : tensor<4xf32>
   ) : tensor<4xf32>

   check.expect_almost_eq_const(
       %0#1,
       dense<8133.0> : tensor<f32>
   ) : tensor<f32>

   return
 }
	#translation = #iree_codegen.translation_info<
	pipeline = #iree_gpu.pipeline<VectorDistribute> workgroup_size = [64, 1, 1] subgroup_size = 64>
	#config = #iree_gpu.lowering_config<{
	lane_basis = [[8], [0]],
	subgroup_basis = [[1], [0]],
	thread = [8]
	}>
	#compilation = #iree_codegen.compilation_info<
	lowering_config = #config,
	translation_info = #translation>

	func.func @scan_dim0_inclusive_sum_configured_vector_distribute() {
	%c1 = arith.constant 1 : index
	%input_empty = tensor.empty() : tensor<256xf32>
	%input_init = linalg.generic {
	indexing_maps = [affine_map<(d0) -> (d0)>],
	iterator_types = ["parallel"]}
	outs(%input_empty : tensor<256xf32>) {
	^bb0(%out: f32):
	%idx = linalg.index 0 : index
	%idx1 = arith.addi %idx, %c1 : index
	%idx_i32 = arith.index_cast %idx1 : index to i32
	%value = arith.sitofp %idx_i32 : i32 to f32
	linalg.yield %value : f32
	} -> tensor<256xf32>
	%input = util.optimization_barrier %input_init : tensor<256xf32>

	%output = tensor.empty() : tensor<256xf32>
	%accum = util.unfoldable_constant dense<0.0> : tensor<f32>
	%0:2 = iree_linalg_ext.scan {compilation_info = #compilation}
	dimension(0) inclusive(true)
	ins(%input : tensor<256xf32>)
	outs(%output, %accum : tensor<256xf32>, tensor<f32>) {
	^bb0(%lhs : f32, %rhs : f32):
	%sum = arith.addf %lhs, %rhs : f32
	iree_linalg_ext.yield %sum : f32
	} -> tensor<256xf32>, tensor<f32>

	%prefix = tensor.extract_slice %0#0[0] [4] [1]
	: tensor<256xf32> to tensor<4xf32>
	check.expect_almost_eq_const(
	%prefix,
	dense<[1.0, 3.0, 6.0, 10.0]> : tensor<4xf32>
	) : tensor<4xf32>

	check.expect_almost_eq_const(
	%0#1,
	dense<32896.0> : tensor<f32>
	) : tensor<f32>

	return
	}

	#translation_exclusive = #iree_codegen.translation_info<
	pipeline = #iree_gpu.pipeline<VectorDistribute> workgroup_size = [64, 1, 1] subgroup_size = 64>
	#config_exclusive = #iree_gpu.lowering_config<{
	lane_basis = [[16], [0]],
	subgroup_basis = [[1], [0]],
	thread = [16]
	}>
	#compilation_exclusive = #iree_codegen.compilation_info<
	lowering_config = #config_exclusive,
	translation_info = #translation_exclusive>

	func.func @scan_dim0_exclusive_sum_configured_vector_distribute() {
	%c1 = arith.constant 1 : index
	%input_empty = tensor.empty() : tensor<256xf32>
	%input_init = linalg.generic {
	indexing_maps = [affine_map<(d0) -> (d0)>],
	iterator_types = ["parallel"]}
	outs(%input_empty : tensor<256xf32>) {
	^bb0(%out: f32):
	%idx = linalg.index 0 : index
	%idx1 = arith.addi %idx, %c1 : index
	%idx_i32 = arith.index_cast %idx1 : index to i32
	%value = arith.sitofp %idx_i32 : i32 to f32
	linalg.yield %value : f32
	} -> tensor<256xf32>
	%input = util.optimization_barrier %input_init : tensor<256xf32>

	%output = tensor.empty() : tensor<256xf32>
	%accum = util.unfoldable_constant dense<5.0> : tensor<f32>
	%0:2 = iree_linalg_ext.scan {compilation_info = #compilation_exclusive}
	dimension(0) inclusive(false)
	ins(%input : tensor<256xf32>)
	outs(%output, %accum : tensor<256xf32>, tensor<f32>) {
	^bb0(%lhs : f32, %rhs : f32):
	%sum = arith.addf %lhs, %rhs : f32
	iree_linalg_ext.yield %sum : f32
	} -> tensor<256xf32>, tensor<f32>

	%prefix = tensor.extract_slice %0#0[0] [4] [1]
	: tensor<256xf32> to tensor<4xf32>
	check.expect_almost_eq_const(
	%prefix,
	dense<[5.0, 6.0, 8.0, 11.0]> : tensor<4xf32>
	) : tensor<4xf32>

	%boundary = tensor.extract_slice %0#0[14] [4] [1]
	: tensor<256xf32> to tensor<4xf32>
	check.expect_almost_eq_const(
	%boundary,
	dense<[110.0, 125.0, 141.0, 158.0]> : tensor<4xf32>
	) : tensor<4xf32>

	check.expect_almost_eq_const(
	%0#1,
	dense<32645.0> : tensor<f32>
	) : tensor<f32>

	return
	}

	#translation_large = #iree_codegen.translation_info<
	pipeline = #iree_gpu.pipeline<VectorDistribute> workgroup_size = [64, 1, 1] subgroup_size = 64>
	#config_large = #iree_gpu.lowering_config<{
	lane_basis = [[2, 8], [0, 1]],
	subgroup_basis = [[1, 1], [0, 1]],
	thread = [2, 8]
	}>
	#compilation_large = #iree_codegen.compilation_info<
	lowering_config = #config_large,
	translation_info = #translation_large>

	func.func @scan_dim1_inclusive_sum_large_configured_vector_distribute() {
	%c1 = arith.constant 1 : index
	%c1000 = arith.constant 1000 : index
	%input_empty = tensor.empty() : tensor<64x256xf32>
	%input_init = linalg.generic {
	indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>],
	iterator_types = ["parallel", "parallel"]}
	outs(%input_empty : tensor<64x256xf32>) {
	^bb0(%out: f32):
	%row = linalg.index 0 : index
	%col = linalg.index 1 : index
	%row_base = arith.muli %row, %c1000 : index
	%col1 = arith.addi %col, %c1 : index
	%idx = arith.addi %row_base, %col1 : index
	%idx_i32 = arith.index_cast %idx : index to i32
	%value = arith.sitofp %idx_i32 : i32 to f32
	linalg.yield %value : f32
	} -> tensor<64x256xf32>
	%input = util.optimization_barrier %input_init : tensor<64x256xf32>

	%output = tensor.empty() : tensor<64x256xf32>
	%accum = util.unfoldable_constant dense<0.0> : tensor<64xf32>
	%0:2 = iree_linalg_ext.scan {compilation_info = #compilation_large}
	dimension(1) inclusive(true)
	ins(%input : tensor<64x256xf32>)
	outs(%output, %accum : tensor<64x256xf32>, tensor<64xf32>) {
	^bb0(%lhs : f32, %rhs : f32):
	%sum = arith.addf %lhs, %rhs : f32
	iree_linalg_ext.yield %sum : f32
	} -> tensor<64x256xf32>, tensor<64xf32>

	%prefix_row0 = tensor.extract_slice %0#0[0, 0] [1, 4] [1, 1]
	: tensor<64x256xf32> to tensor<4xf32>
	check.expect_almost_eq_const(
	%prefix_row0,
	dense<[1.0, 3.0, 6.0, 10.0]> : tensor<4xf32>
	) : tensor<4xf32>

	%prefix_row1 = tensor.extract_slice %0#0[1, 0] [1, 4] [1, 1]
	: tensor<64x256xf32> to tensor<4xf32>
	check.expect_almost_eq_const(
	%prefix_row1,
	dense<[1001.0, 2003.0, 3006.0, 4010.0]> : tensor<4xf32>
	) : tensor<4xf32>

	%boundary_row0 = tensor.extract_slice %0#0[0, 14] [1, 4] [1, 1]
	: tensor<64x256xf32> to tensor<4xf32>
	check.expect_almost_eq_const(
	%boundary_row0,
	dense<[120.0, 136.0, 153.0, 171.0]> : tensor<4xf32>
	) : tensor<4xf32>

	%boundary_row1 = tensor.extract_slice %0#0[1, 14] [1, 4] [1, 1]
	: tensor<64x256xf32> to tensor<4xf32>
	check.expect_almost_eq_const(
	%boundary_row1,
	dense<[15120.0, 16136.0, 17153.0, 18171.0]> : tensor<4xf32>
	) : tensor<4xf32>

	%acc_prefix = tensor.extract_slice %0#1[0] [2] [1]
	: tensor<64xf32> to tensor<2xf32>
	check.expect_almost_eq_const(
	%acc_prefix,
	dense<[32896.0, 288896.0]> : tensor<2xf32>
	) : tensor<2xf32>

	return
	}

	#translation_cross_subgroup = #iree_codegen.translation_info<
	pipeline = #iree_gpu.pipeline<VectorDistribute> workgroup_size = [128, 1, 1] subgroup_size = 64>
	#config_cross_subgroup = #iree_gpu.lowering_config<{
	subgroup_basis = [[2], [0]],
	lane_basis = [[8], [0]],
	thread = [4]
	}>
	#compilation_cross_subgroup = #iree_codegen.compilation_info<
	lowering_config = #config_cross_subgroup,
	translation_info = #translation_cross_subgroup>

	func.func @scan_dim0_inclusive_sum_cross_subgroup_configured_vector_distribute() {
	%c1 = arith.constant 1 : index
	%input_empty = tensor.empty() : tensor<128xf32>
	%input_init = linalg.generic {
	indexing_maps = [affine_map<(d0) -> (d0)>],
	iterator_types = ["parallel"]}
	outs(%input_empty : tensor<128xf32>) {
	^bb0(%out: f32):
	%idx = linalg.index 0 : index
	%idx1 = arith.addi %idx, %c1 : index
	%idx_i32 = arith.index_cast %idx1 : index to i32
	%value = arith.sitofp %idx_i32 : i32 to f32
	linalg.yield %value : f32
	} -> tensor<128xf32>
	%input = util.optimization_barrier %input_init : tensor<128xf32>

	%output = tensor.empty() : tensor<128xf32>
	%accum = util.unfoldable_constant dense<0.0> : tensor<f32>
	%0:2 = iree_linalg_ext.scan {compilation_info = #compilation_cross_subgroup}
	dimension(0) inclusive(true)
	ins(%input : tensor<128xf32>)
	outs(%output, %accum : tensor<128xf32>, tensor<f32>) {
	^bb0(%lhs : f32, %rhs : f32):
	%sum = arith.addf %lhs, %rhs : f32
	iree_linalg_ext.yield %sum : f32
	} -> tensor<128xf32>, tensor<f32>

	%prefix = tensor.extract_slice %0#0[0] [4] [1]
	: tensor<128xf32> to tensor<4xf32>
	check.expect_almost_eq_const(
	%prefix,
	dense<[1.0, 3.0, 6.0, 10.0]> : tensor<4xf32>
	) : tensor<4xf32>

	%boundary = tensor.extract_slice %0#0[62] [4] [1]
	: tensor<128xf32> to tensor<4xf32>
	check.expect_almost_eq_const(
	%boundary,
	dense<[2016.0, 2080.0, 2145.0, 2211.0]> : tensor<4xf32>
	) : tensor<4xf32>

	check.expect_almost_eq_const(
	%0#1,
	dense<8256.0> : tensor<f32>
	) : tensor<f32>

	return
	}

	func.func @scan_dim0_exclusive_sum_cross_subgroup_configured_vector_distribute() {
	%c1 = arith.constant 1 : index
	%input_empty = tensor.empty() : tensor<128xf32>
	%input_init = linalg.generic {
	indexing_maps = [affine_map<(d0) -> (d0)>],
	iterator_types = ["parallel"]}
	outs(%input_empty : tensor<128xf32>) {
	^bb0(%out: f32):
	%idx = linalg.index 0 : index
	%idx1 = arith.addi %idx, %c1 : index
	%idx_i32 = arith.index_cast %idx1 : index to i32
	%value = arith.sitofp %idx_i32 : i32 to f32
	linalg.yield %value : f32
	} -> tensor<128xf32>
	%input = util.optimization_barrier %input_init : tensor<128xf32>

	%output = tensor.empty() : tensor<128xf32>
	%accum = util.unfoldable_constant dense<5.0> : tensor<f32>
	%0:2 = iree_linalg_ext.scan {compilation_info = #compilation_cross_subgroup}
	dimension(0) inclusive(false)
	ins(%input : tensor<128xf32>)
	outs(%output, %accum : tensor<128xf32>, tensor<f32>) {
	^bb0(%lhs : f32, %rhs : f32):
	%sum = arith.addf %lhs, %rhs : f32
	iree_linalg_ext.yield %sum : f32
	} -> tensor<128xf32>, tensor<f32>

	%prefix = tensor.extract_slice %0#0[0] [4] [1]
	: tensor<128xf32> to tensor<4xf32>
	check.expect_almost_eq_const(
	%prefix,
	dense<[5.0, 6.0, 8.0, 11.0]> : tensor<4xf32>
	) : tensor<4xf32>

	%boundary = tensor.extract_slice %0#0[62] [4] [1]
	: tensor<128xf32> to tensor<4xf32>
	check.expect_almost_eq_const(
	%boundary,
	dense<[1958.0, 2021.0, 2085.0, 2150.0]> : tensor<4xf32>
	) : tensor<4xf32>

	check.expect_almost_eq_const(
	%0#1,
	dense<8133.0> : tensor<f32>
	) : tensor<f32>

	return
	}