// Preprocessing with generalized packing.
//
// RUN: iree-opt %s --iree-transform-dialect-interpreter --transform-dialect-drop-schedule | \
// RUN: iree-opt --iree-hal-target-backends=llvm-cpu \
// RUN:     --iree-abi-transformation-pipeline \
// RUN:     --iree-flow-transformation-pipeline \
// RUN:     --iree-stream-transformation-pipeline \
// RUN:     --iree-hal-configuration-pipeline | \
// RUN: FileCheck %s

// Check that compilation runs all the way to the end.
// TODO: this currently fails with:
//   'memref.alloca' op all stack allocations need to be hoisted to the entry block of the function
//
// R-UN: iree-opt %s --iree-transform-dialect-interpreter --transform-dialect-drop-schedule | \
// R-UN: iree-compile --iree-hal-target-backends=llvm-cpu

!a_tensor_t = tensor<1234x567xf32>
!b_tensor_t = tensor<567x890xf32>
!c_tensor_t = tensor<1234x890xf32>

// Note: the normalization of these maps is undone by the InterchangeGenericOps
// pass. When using generalized packing, it would be better to drop that pass.

// CHECK-DAG: #[[$map_lhs:.*]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d4, d2, d5)>
// CHECK-DAG: #[[$map_rhs:.*]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d4, d1, d3, d5)>
// CHECK-DAG: #[[$map_res:.*]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>
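// A reading of the packed maps above (an interpretation, not part of the
// checked output): d0/d1 are the outer M/N tiles, d2/d3 the inner m = 8 and
// n = 16 tiles, and d4/d5 the outer and inner (= 32) K tiles, matching
// gemm_packed_sizes = [8, 16, 32] in the schedule at the bottom of the file.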

// CHECK-LABEL: func.func @matmul_dispatch_0
//       CHECK:   tensor.empty() : tensor<155x18x8x32xf32>
//       CHECK:   tensor.pack

// CHECK-LABEL: func.func @matmul_dispatch_1
//       CHECK:   arith.constant dense<1.000000e-01> : tensor<567x890xf32>
//       CHECK:   tensor.empty() : tensor<18x56x16x32xf32>
//       CHECK:   tensor.pack

// CHECK-LABEL: func.func @matmul_dispatch_2
//       CHECK:   tensor.empty() : tensor<155x56x8x16xf32>
//       CHECK:   tensor.pack
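// The packed shapes above follow from the 1234x567 * 567x890 problem and the
// packing sizes [8, 16, 32] (arithmetic shown for reference, assuming each
// dimension is padded up to the next multiple of its packing size):
//   M: ceil(1234/8) = 155,  K: ceil(567/32) = 18,  N: ceil(890/16) = 56
// i.e. LHS packs to 155x18x8x32, RHS to 18x56x16x32, the result to 155x56x8x16.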

// CHECK-LABEL: func.func @matmul_dispatch_3
func.func @matmul(%arg0: !a_tensor_t, %arg2: !c_tensor_t) -> !c_tensor_t {
  %c0 = arith.constant dense<0.1> : !b_tensor_t
//  CHECK-NOT: pack
//      CHECK: linalg.generic {indexing_maps = [#[[$map_lhs]], #[[$map_rhs]], #[[$map_res]]],
// CHECK-SAME:   iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction"]}
// CHECK-SAME:   ins(%{{.*}} : tensor<155x18x8x32xf32>, tensor<18x56x16x32xf32>)
// CHECK-SAME:  outs(%{{.*}} : tensor<155x56x8x16xf32>)

  %0 = linalg.matmul
      ins(%arg0, %c0: !a_tensor_t, !b_tensor_t)
      outs(%arg2: !c_tensor_t) -> !c_tensor_t
  return %0 : !c_tensor_t
}
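
// For reference, a rough sketch (not verified compiler output) of the packed
// op the directives above describe; %lhs, %rhs, %acc are hypothetical packed
// operands and #map_lhs/#map_rhs/#map_res stand for the captured affine maps:
//
//   %packed = linalg.generic
//       {indexing_maps = [#map_lhs, #map_rhs, #map_res],
//        iterator_types = ["parallel", "parallel", "parallel", "parallel",
//                          "reduction", "reduction"]}
//       ins(%lhs, %rhs : tensor<155x18x8x32xf32>, tensor<18x56x16x32xf32>)
//      outs(%acc : tensor<155x56x8x16xf32>) {
//   ^bb0(%a: f32, %b: f32, %c: f32):
//     %m = arith.mulf %a, %b : f32
//     %s = arith.addf %c, %m : f32
//     linalg.yield %s : f32
//   } -> tensor<155x56x8x16xf32>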

// CHECK-LABEL: func.func @matmul_dispatch_4
//       CHECK:   tensor.unpack

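// The schedule below greedily packs the matmul: gemm_packed_sizes = [8, 16, 32]
// packs the m, n and k dimensions with tile sizes 8, 16 and 32, and
// gemm_inner_dims_order = [0, 1, 2] keeps the natural m, n, k ordering of the
// inner tiles; the op yields a handle to the resulting linalg.generic. (This is
// a summary of the intent here, not normative documentation of pack_greedily.)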
transform.sequence failures(propagate) {
^bb1(%module_op: !pdl.operation):
  %matmul = transform.structured.match interface{LinalgOp} in %module_op
    : (!pdl.operation) -> (!pdl.operation)

  transform.structured.pack_greedily %matmul
      gemm_packed_sizes = [8, 16, 32] gemm_inner_dims_order = [0, 1, 2]
      : (!pdl.operation) -> !transform.op<"linalg.generic">
}