// blob: c40dbe495f6d2f70fbb93b8ef757a790b9024b2c [file] [log] [blame]
// RUN: iree-opt --split-input-file \
// RUN: --iree-util-optimize-int-arithmetic=narrow-to-i32=true --cse %s \
// RUN: | FileCheck %s
// We inherit a number of patterns from upstream for narrowing specific arith
// operations. Those are not the focus of testing, but we may test some of them
// here incidentally as part of verifying that the overall pass and local
// patterns are effective.
// CHECK-LABEL: @narrow_tid_computations
// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : i32
// CHECK-DAG: %[[C32:.+]] = arith.constant 32 : i32
// CHECK-DAG: %[[THREAD_ID_X:.+]] = gpu.thread_id x upper_bound 64
// CHECK-DAG: %[[TID_I32:.+]] = arith.index_castui %[[THREAD_ID_X]] : index to i32
// CHECK: %[[V0:.+]] = arith.divui %[[TID_I32]], %[[C16]] : i32
// CHECK-NEXT: %[[V1:.+]] = arith.remui %[[TID_I32]], %[[C16]] : i32
// CHECK-NEXT: %[[V2:.+]] = arith.muli %[[V0]], %[[C32]] : i32
// CHECK-NEXT: %[[V3:.+]] = arith.addi %[[V2]], %[[V1]] : i32
// CHECK-NEXT: %[[RET:.+]] = arith.index_castui %[[V3]] : i32 to index
// CHECK: return %[[RET]]
// Input: index arithmetic rooted at `gpu.thread_id x upper_bound 64`. The
// function returns (tid / 16) * 32 + (tid % 16), with every op written on
// `index`. The expectations preceding this function look for the same
// div/rem/mul/add chain on i32, bracketed by arith.index_castui ops.
util.func @narrow_tid_computations() -> index {
  %cst16 = arith.constant 16 : index
  %cst32 = arith.constant 32 : index
  %tid_x = gpu.thread_id x upper_bound 64
  // Unsigned quotient and remainder of the thread id by 16.
  %quot = arith.divui %tid_x, %cst16 : index
  %rem = arith.remui %tid_x, %cst16 : index
  // Scale the quotient by 32 and add the remainder back in.
  %scaled = arith.muli %quot, %cst32 : index
  %sum = arith.addi %scaled, %rem : index
  util.return %sum : index
}
// -----
// CHECK-LABEL: @narrow_assumes
// CHECK-SAME: (%[[ARG0:.+]]: i32)
// CHECK-NEXT: %[[ASSUME:.+]] = util.assume.int %[[ARG0]]<umin = 16, umax = 122, udiv = 16> : i32
// CHECK-NEXT: %[[AS_INDEX:.+]] = arith.index_castui %[[ASSUME]] : i32 to index
// CHECK-NEXT: util.return %[[ASSUME]], %[[AS_INDEX]]
// Input: an i32 argument is cast to `index`, annotated by util.assume.int
// (umin = 16, umax = 122, udiv = 16), then cast back to i32. Both the i32 and
// the index form of the assumed value are returned. The expectations preceding
// this function look for the assume applied directly to the i32 argument and
// a single remaining cast to index.
util.func @narrow_assumes(%arg0: i32) -> (i32, index) {
  %as_index = arith.index_castui %arg0 : i32 to index
  %assumed = util.assume.int %as_index<umin = 16, umax = 122, udiv = 16> : index
  %as_i32 = arith.index_castui %assumed : index to i32
  util.return %as_i32, %assumed : i32, index
}
// -----
// CHECK-LABEL: @narrow_scf_for
// CHECK-DAG: %[[C64:.+]] = arith.constant 64 : i32
// CHECK-DAG: %[[C96:.+]] = arith.constant 96 : i32
// CHECK-DAG: %[[C512:.+]] = arith.constant 512 : i32
// CHECK-DAG: %[[TID:.+]] = gpu.thread_id x upper_bound 64
// CHECK-DAG: %[[TID_I32:.+]] = arith.index_castui %[[TID]] : index to i32
// CHECK: scf.for %[[ARG1:.+]] = %[[TID_I32]] to %[[C96]] step %[[C64]]
// CHECK-NEXT: %[[V0:.+]] = arith.addi %[[ARG1]], %[[C512]]
// CHECK-NEXT: %[[V0_IDX:.+]] = arith.index_castui %[[V0]] : i32 to index
// CHECK-NEXT: memref.store {{.*}}[%[[V0_IDX]]]
// Input: an scf.for whose lower bound is `gpu.thread_id x upper_bound 64`,
// with constant upper bound 96 and step 64, all typed `index`. Each iteration
// stores 0.0 into %arg0 at (induction variable + 512). The expectations
// preceding this function look for the loop bounds and the add on i32, with a
// cast back to index only for the memref.store subscript.
util.func @narrow_scf_for(%arg0: memref<?xf32>) {
  %zero_f32 = arith.constant 0.0 : f32
  %step = arith.constant 64 : index
  %upper = arith.constant 96 : index
  %offset = arith.constant 512 : index
  %lower = gpu.thread_id x upper_bound 64
  scf.for %iv = %lower to %upper step %step {
    // Store position is the induction variable shifted by the constant 512.
    %pos = arith.addi %iv, %offset : index
    memref.store %zero_f32, %arg0[%pos] : memref<?xf32>
  }
  util.return
}