Merge pull request #4125 from hanhanW:main-to-google

PiperOrigin-RevId: 346420314
diff --git a/build_tools/docker/base/Dockerfile b/build_tools/docker/base/Dockerfile
index 4177601..7b75939 100644
--- a/build_tools/docker/base/Dockerfile
+++ b/build_tools/docker/base/Dockerfile
@@ -17,7 +17,6 @@
 # Environment variables for IREE.
 ENV CC /usr/bin/clang
 ENV CXX /usr/bin/clang++
-ENV IREE_LLVMAOT_LINKER_PATH /usr/bin/ld
 
 RUN apt-get update \
   && apt-get install -y \
diff --git a/build_tools/docker/bazel-python/Dockerfile b/build_tools/docker/bazel-python/Dockerfile
index 45e2314..137a474 100644
--- a/build_tools/docker/bazel-python/Dockerfile
+++ b/build_tools/docker/bazel-python/Dockerfile
@@ -14,7 +14,7 @@
 
 # An image for building IREE with Python bindings using Bazel.
 
-FROM gcr.io/iree-oss/bazel@sha256:066af7fcb39c13284ed47b2d6afe75f944c1d7415a21beaa5afd6319176654e8 AS final
+FROM gcr.io/iree-oss/bazel@sha256:a5c4e189f48e503276c1ba208fee8365b20df503a1b201cde6608dee5eeebadd AS final
 
 # Install python3 and numpy.
 RUN apt-get update \
diff --git a/build_tools/docker/bazel-tensorflow-swiftshader/Dockerfile b/build_tools/docker/bazel-tensorflow-swiftshader/Dockerfile
index dfad706..0581202 100644
--- a/build_tools/docker/bazel-tensorflow-swiftshader/Dockerfile
+++ b/build_tools/docker/bazel-tensorflow-swiftshader/Dockerfile
@@ -17,7 +17,7 @@
 
 FROM gcr.io/iree-oss/bazel-tensorflow-vulkan AS final
 
-COPY --from=gcr.io/iree-oss/swiftshader@sha256:883a33e4b9d33c6c7b73fc34319a7510f218c1c1598f5253dcb2f64c5aa263a5 swiftshader/ swiftshader/
+COPY --from=gcr.io/iree-oss/swiftshader@sha256:ccae32c83c89a31e8fc5542e480c29f28bbf4a3b3b80198c06b687a92c6813f3 swiftshader/ swiftshader/
 
 # Set VK_ICD_FILENAMES so Vulkan loader can find the SwiftShader ICD.
 ENV VK_ICD_FILENAMES /swiftshader/vk_swiftshader_icd.json
diff --git a/build_tools/docker/bazel/Dockerfile b/build_tools/docker/bazel/Dockerfile
index 1c6982a..f08d620 100644
--- a/build_tools/docker/bazel/Dockerfile
+++ b/build_tools/docker/bazel/Dockerfile
@@ -42,7 +42,7 @@
   # is effectively a noop.
   && apt-get install -y "bazel=${BAZEL_VERSION?}" "bazel-${NEW_BAZEL_VERSION?}"
 
-FROM gcr.io/iree-oss/base@sha256:1e57b0957f71cd1aa9d6e4838c51f40bdbb52dd1be0b4b6b14b337b36654cc63 AS final
+FROM gcr.io/iree-oss/base@sha256:9b73f4e2b1239f65a19f2022e54f4b15310b805570831fbe2cf8b4dc928f1d10 AS final
 ARG BAZEL_VERSION
 ARG NEW_BAZEL_VERSION
 COPY --from=install-bazel \
diff --git a/build_tools/docker/cmake-android/Dockerfile b/build_tools/docker/cmake-android/Dockerfile
index 03e50f9..813af72 100644
--- a/build_tools/docker/cmake-android/Dockerfile
+++ b/build_tools/docker/cmake-android/Dockerfile
@@ -24,7 +24,7 @@
 
 RUN unzip "android-ndk-${NDK_VERSION?}-linux-x86_64.zip" -d /usr/src/
 
-FROM gcr.io/iree-oss/cmake@sha256:644cc10ea5a33bd97be51a8f6fd6ee7e2ab3904f468873be0f71373b0ec48919 AS final
+FROM gcr.io/iree-oss/cmake@sha256:9d9953acf5ca0cf1ff3e8de32f10f24dfab1c4e8ec5d1fc047f556024ee4bed6 AS final
 ARG NDK_VERSION
 COPY --from=install-ndk "/usr/src/android-ndk-${NDK_VERSION}" "/usr/src/android-ndk-${NDK_VERSION}"
 ENV ANDROID_NDK "/usr/src/android-ndk-${NDK_VERSION}"
diff --git a/build_tools/docker/cmake-python-swiftshader/Dockerfile b/build_tools/docker/cmake-python-swiftshader/Dockerfile
index 61d3266..9c652b5 100644
--- a/build_tools/docker/cmake-python-swiftshader/Dockerfile
+++ b/build_tools/docker/cmake-python-swiftshader/Dockerfile
@@ -16,7 +16,7 @@
 # Vulkan implementation.
 
 FROM gcr.io/iree-oss/cmake-python-vulkan AS final
-COPY --from=gcr.io/iree-oss/swiftshader@sha256:883a33e4b9d33c6c7b73fc34319a7510f218c1c1598f5253dcb2f64c5aa263a5 /swiftshader /swiftshader
+COPY --from=gcr.io/iree-oss/swiftshader@sha256:ccae32c83c89a31e8fc5542e480c29f28bbf4a3b3b80198c06b687a92c6813f3 /swiftshader /swiftshader
 
 # Set VK_ICD_FILENAMES so Vulkan loader can find the SwiftShader ICD.
 ENV VK_ICD_FILENAMES /swiftshader/vk_swiftshader_icd.json
diff --git a/build_tools/docker/cmake-python/Dockerfile b/build_tools/docker/cmake-python/Dockerfile
index 5daa252..5018d6d 100644
--- a/build_tools/docker/cmake-python/Dockerfile
+++ b/build_tools/docker/cmake-python/Dockerfile
@@ -14,7 +14,7 @@
 
 # An image for building IREE and its Python bindings using CMake.
 
-FROM gcr.io/iree-oss/cmake@sha256:644cc10ea5a33bd97be51a8f6fd6ee7e2ab3904f468873be0f71373b0ec48919 AS final
+FROM gcr.io/iree-oss/cmake@sha256:9d9953acf5ca0cf1ff3e8de32f10f24dfab1c4e8ec5d1fc047f556024ee4bed6 AS final
 # Dependencies for the python bindings tests.
 RUN apt-get update \
   && apt-get install -y \
diff --git a/build_tools/docker/cmake/Dockerfile b/build_tools/docker/cmake/Dockerfile
index a0b4263..6b4c161 100644
--- a/build_tools/docker/cmake/Dockerfile
+++ b/build_tools/docker/cmake/Dockerfile
@@ -33,7 +33,7 @@
 RUN chmod +x "./cmake-${CMAKE_VERSION?}-Linux-x86_64.sh"
 RUN "./cmake-${CMAKE_VERSION?}-Linux-x86_64.sh" --skip-license --prefix=/usr/
 
-FROM gcr.io/iree-oss/base@sha256:1e57b0957f71cd1aa9d6e4838c51f40bdbb52dd1be0b4b6b14b337b36654cc63 AS final
+FROM gcr.io/iree-oss/base@sha256:9b73f4e2b1239f65a19f2022e54f4b15310b805570831fbe2cf8b4dc928f1d10 AS final
 ARG CMAKE_MAJOR_VERSION
 ARG CMAKE_MINOR_VERSION
 
diff --git a/build_tools/docker/prod_digests.txt b/build_tools/docker/prod_digests.txt
index 40cffcf..1945bec 100644
--- a/build_tools/docker/prod_digests.txt
+++ b/build_tools/docker/prod_digests.txt
@@ -1,17 +1,17 @@
-gcr.io/iree-oss/base@sha256:1e57b0957f71cd1aa9d6e4838c51f40bdbb52dd1be0b4b6b14b337b36654cc63
+gcr.io/iree-oss/base@sha256:9b73f4e2b1239f65a19f2022e54f4b15310b805570831fbe2cf8b4dc928f1d10
 gcr.io/iree-oss/util@sha256:40846b4aea5886af3250399d6adfdb3e1195a8b0177706bb0375e812d62dc49c
-gcr.io/iree-oss/cmake@sha256:644cc10ea5a33bd97be51a8f6fd6ee7e2ab3904f468873be0f71373b0ec48919
-gcr.io/iree-oss/swiftshader@sha256:883a33e4b9d33c6c7b73fc34319a7510f218c1c1598f5253dcb2f64c5aa263a5
-gcr.io/iree-oss/cmake-python@sha256:f90e72f8d01c53f462bef56d90a07fed833ff754637d324ad95d81c8699c1309
-gcr.io/iree-oss/cmake-android@sha256:78db00980309a0b52f8c877f8717b3d9ac3c35b619ae704e21f165345409685f
-gcr.io/iree-oss/bazel@sha256:066af7fcb39c13284ed47b2d6afe75f944c1d7415a21beaa5afd6319176654e8
-gcr.io/iree-oss/bazel-python@sha256:b9fc661cedcf3f5f0cce3f207640f79cb92ba72a9f850e1041312ec0ecdefa39
-gcr.io/iree-oss/bazel-tensorflow@sha256:4c2845e20e62f991e34a7cbe973a12ee824e9adc146fb86fdeee1c4e6b35cb12
+gcr.io/iree-oss/cmake@sha256:9d9953acf5ca0cf1ff3e8de32f10f24dfab1c4e8ec5d1fc047f556024ee4bed6
+gcr.io/iree-oss/swiftshader@sha256:ccae32c83c89a31e8fc5542e480c29f28bbf4a3b3b80198c06b687a92c6813f3
+gcr.io/iree-oss/cmake-python@sha256:2777aaf49a41669c6f0567f25dd8e940d4058df64f8a7a78af0fdcb8a80eea4f
+gcr.io/iree-oss/cmake-android@sha256:15d3266ae4865f7642a4ef4d76e5181f0dc3482a7cfba9021b6b55be524208ec
+gcr.io/iree-oss/bazel@sha256:a5c4e189f48e503276c1ba208fee8365b20df503a1b201cde6608dee5eeebadd
+gcr.io/iree-oss/bazel-python@sha256:6a1cee37fa2148a9c6c58273f6e02ca2ac89af0b4908962f1b8fe3ffbb6bd476
+gcr.io/iree-oss/bazel-tensorflow@sha256:d0aa0d31b1c6cc61148e6520077bb725cfee238bfe268c77414c5baabf7608ac
 gcr.io/iree-oss/vulkan@sha256:5812ee64806a7f3df0739ccf0930c27cabce346901488eceb1ee66c9c0a5ae96
 gcr.io/iree-oss/rbe-toolchain@sha256:d69c260b98a97ad430d34c4591fb2399e00888750f5d47ede00c1e6f3e774e5a
-gcr.io/iree-oss/cmake-python-vulkan@sha256:9a764e4944951a8717a4dfbfdcedb0ddd40f63ff681b2e2f24e34fe3e8bb85e7
-gcr.io/iree-oss/cmake-python-swiftshader@sha256:5885e2fb1fd8afdbed1cecc97eeeafeacbfa779b07a6536ecdc85f079dff0af7
-gcr.io/iree-oss/cmake-python-nvidia@sha256:bf6ce5a17c44b041d2fcc74018afd30b6ad35cb769d668f49e615085daddf8a7
-gcr.io/iree-oss/bazel-tensorflow-vulkan@sha256:a33217d03c1a1e96056c7ffa2c0c8857634a9cde23f5d346a58f5e266e3c011a
-gcr.io/iree-oss/bazel-tensorflow-swiftshader@sha256:7f697693448e3d6fe33a4f8f8386b014bb03a7147eef5928a74ef92a8aa0ddc4
-gcr.io/iree-oss/bazel-tensorflow-nvidia@sha256:575ba235ebbbcee5bc26f20c6362664a62113ac869c8868ad415c175fe9c08b0
+gcr.io/iree-oss/cmake-python-vulkan@sha256:f7695315d010a393f3669dace08c05d05735c6d8ce26d5fdda1795f338235f74
+gcr.io/iree-oss/cmake-python-swiftshader@sha256:68a757f54f8a494aee23d43305e3774344fc2607c6aafef33709a571d935bc11
+gcr.io/iree-oss/cmake-python-nvidia@sha256:3e29b42a0eb3bd32f71426b4b41068789a2848e1447467aa409af4109281f4cb
+gcr.io/iree-oss/bazel-tensorflow-vulkan@sha256:caa0c9699f4041406bf978ed4b1ce69b3b40af436d6e999fd3fac037cd4d4749
+gcr.io/iree-oss/bazel-tensorflow-swiftshader@sha256:0f60ce244cc6a2caa89915905baa92d1fae4f806d8c12901ba4b97a56e803a75
+gcr.io/iree-oss/bazel-tensorflow-nvidia@sha256:18f3c97bdac1c6705536efc5fb0f38eb541c723671fd451a257119feff2896b7
diff --git a/build_tools/docker/swiftshader/Dockerfile b/build_tools/docker/swiftshader/Dockerfile
index 65b960b..91d7936 100644
--- a/build_tools/docker/swiftshader/Dockerfile
+++ b/build_tools/docker/swiftshader/Dockerfile
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-FROM gcr.io/iree-oss/cmake@sha256:644cc10ea5a33bd97be51a8f6fd6ee7e2ab3904f468873be0f71373b0ec48919 AS install-swiftshader
+FROM gcr.io/iree-oss/cmake@sha256:9d9953acf5ca0cf1ff3e8de32f10f24dfab1c4e8ec5d1fc047f556024ee4bed6 AS install-swiftshader
 WORKDIR /install-swiftshader
 
 RUN apt-get update && apt-get install -y git
diff --git a/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86-swiftshader/bindings/build_kokoro.sh b/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86-swiftshader/bindings/build_kokoro.sh
index 256472e..fb86131 100755
--- a/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86-swiftshader/bindings/build_kokoro.sh
+++ b/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86-swiftshader/bindings/build_kokoro.sh
@@ -32,7 +32,7 @@
 docker_setup
 
 docker run "${DOCKER_RUN_ARGS[@]?}" \
-  gcr.io/iree-oss/bazel-python@sha256:b9fc661cedcf3f5f0cce3f207640f79cb92ba72a9f850e1041312ec0ecdefa39 \
+  gcr.io/iree-oss/bazel-python@sha256:6a1cee37fa2148a9c6c58273f6e02ca2ac89af0b4908962f1b8fe3ffbb6bd476 \
   build_tools/kokoro/gcp_ubuntu/bazel/linux/x86-swiftshader/bindings/build.sh
 
 # Kokoro will rsync this entire directory back to the executor orchestrating the
diff --git a/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86-swiftshader/core/build_kokoro.sh b/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86-swiftshader/core/build_kokoro.sh
index d910e2c..d17ba86 100755
--- a/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86-swiftshader/core/build_kokoro.sh
+++ b/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86-swiftshader/core/build_kokoro.sh
@@ -32,7 +32,7 @@
 docker_setup
 
 docker run "${DOCKER_RUN_ARGS[@]?}" \
-  gcr.io/iree-oss/bazel@sha256:066af7fcb39c13284ed47b2d6afe75f944c1d7415a21beaa5afd6319176654e8 \
+  gcr.io/iree-oss/bazel@sha256:a5c4e189f48e503276c1ba208fee8365b20df503a1b201cde6608dee5eeebadd \
   build_tools/kokoro/gcp_ubuntu/bazel/linux/x86-swiftshader/core/build.sh
 
 # Kokoro will rsync this entire directory back to the executor orchestrating the
diff --git a/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86-swiftshader/integrations/build_kokoro.sh b/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86-swiftshader/integrations/build_kokoro.sh
index 0f37cda..64435f0 100755
--- a/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86-swiftshader/integrations/build_kokoro.sh
+++ b/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86-swiftshader/integrations/build_kokoro.sh
@@ -32,7 +32,7 @@
 docker_setup
 
 docker run "${DOCKER_RUN_ARGS[@]?}" \
-  gcr.io/iree-oss/bazel-tensorflow-swiftshader@sha256:7f697693448e3d6fe33a4f8f8386b014bb03a7147eef5928a74ef92a8aa0ddc4 \
+  gcr.io/iree-oss/bazel-tensorflow-swiftshader@sha256:0f60ce244cc6a2caa89915905baa92d1fae4f806d8c12901ba4b97a56e803a75 \
   build_tools/kokoro/gcp_ubuntu/bazel/linux/x86-swiftshader/integrations/build.sh
 
 # Kokoro will rsync this entire directory back to the executor orchestrating the
diff --git a/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86-turing/integrations/build_kokoro.sh b/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86-turing/integrations/build_kokoro.sh
index ff47d21..cb660fa 100755
--- a/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86-turing/integrations/build_kokoro.sh
+++ b/build_tools/kokoro/gcp_ubuntu/bazel/linux/x86-turing/integrations/build_kokoro.sh
@@ -36,7 +36,7 @@
 # TODO(#3550): Allow this to follow the checked-in Docker hierarchy.
 docker run "${DOCKER_RUN_ARGS[@]?}" \
   --gpus all \
-  gcr.io/iree-oss/bazel-tensorflow-nvidia@sha256:575ba235ebbbcee5bc26f20c6362664a62113ac869c8868ad415c175fe9c08b0 \
+  gcr.io/iree-oss/bazel-tensorflow-nvidia@sha256:18f3c97bdac1c6705536efc5fb0f38eb541c723671fd451a257119feff2896b7 \
   build_tools/kokoro/gcp_ubuntu/bazel/linux/x86-turing/integrations/build.sh
 
 # Kokoro will rsync this entire directory back to the executor orchestrating the
diff --git a/build_tools/kokoro/gcp_ubuntu/cmake/android/arm64-v8a/build_kokoro.sh b/build_tools/kokoro/gcp_ubuntu/cmake/android/arm64-v8a/build_kokoro.sh
index 68f13a8..0f978ed 100755
--- a/build_tools/kokoro/gcp_ubuntu/cmake/android/arm64-v8a/build_kokoro.sh
+++ b/build_tools/kokoro/gcp_ubuntu/cmake/android/arm64-v8a/build_kokoro.sh
@@ -32,7 +32,7 @@
 docker_setup
 
 docker run "${DOCKER_RUN_ARGS[@]?}" \
-  gcr.io/iree-oss/cmake-android@sha256:78db00980309a0b52f8c877f8717b3d9ac3c35b619ae704e21f165345409685f \
+  gcr.io/iree-oss/cmake-android@sha256:15d3266ae4865f7642a4ef4d76e5181f0dc3482a7cfba9021b6b55be524208ec \
   build_tools/kokoro/gcp_ubuntu/cmake/android/build.sh arm64-v8a
 
 # Kokoro will rsync this entire directory back to the executor orchestrating the
diff --git a/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-swiftshader/build_kokoro.sh b/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-swiftshader/build_kokoro.sh
index e17205a..fc52b18 100755
--- a/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-swiftshader/build_kokoro.sh
+++ b/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-swiftshader/build_kokoro.sh
@@ -32,7 +32,7 @@
 docker_setup
 
 docker run "${DOCKER_RUN_ARGS[@]?}" \
-  gcr.io/iree-oss/cmake-python-swiftshader@sha256:5885e2fb1fd8afdbed1cecc97eeeafeacbfa779b07a6536ecdc85f079dff0af7 \
+  gcr.io/iree-oss/cmake-python-swiftshader@sha256:68a757f54f8a494aee23d43305e3774344fc2607c6aafef33709a571d935bc11 \
   build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-swiftshader/build.sh
 
 # Kokoro will rsync this entire directory back to the executor orchestrating the
diff --git a/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-turing/build_kokoro.sh b/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-turing/build_kokoro.sh
index 9452065..a919978 100755
--- a/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-turing/build_kokoro.sh
+++ b/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-turing/build_kokoro.sh
@@ -33,7 +33,7 @@
 
 docker run "${DOCKER_RUN_ARGS[@]?}" \
   --gpus all \
-  gcr.io/iree-oss/cmake-python-nvidia@sha256:bf6ce5a17c44b041d2fcc74018afd30b6ad35cb769d668f49e615085daddf8a7 \
+  gcr.io/iree-oss/cmake-python-nvidia@sha256:3e29b42a0eb3bd32f71426b4b41068789a2848e1447467aa409af4109281f4cb \
   build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-turing/build.sh
 
 # Kokoro will rsync this entire directory back to the executor orchestrating the
diff --git a/build_tools/third_party/pffft/BUILD.overlay b/build_tools/third_party/pffft/BUILD.overlay
index 96306ba..13eccec 100644
--- a/build_tools/third_party/pffft/BUILD.overlay
+++ b/build_tools/third_party/pffft/BUILD.overlay
@@ -12,6 +12,15 @@
     hdrs = [
         "pffft.h",
     ],
+    deps = [":pffft_internal"],
+    include_prefix = "third_party/pffft",
+)
+
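+# Exposes pffft.h without the "third_party/pffft" include prefix so sources
+# that include plain "pffft.h" still resolve it.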
+cc_library(
+    name = "pffft_internal",
+    hdrs = [
+        "pffft.h",
+    ],
 )
 
 cc_library(
diff --git a/integrations/tensorflow/e2e/BUILD b/integrations/tensorflow/e2e/BUILD
index 031658b..28f69d1 100644
--- a/integrations/tensorflow/e2e/BUILD
+++ b/integrations/tensorflow/e2e/BUILD
@@ -84,7 +84,6 @@
     "einsum_dynamic_test.py",
     "einsum_static_test.py",
     "einsum_vector_test.py",
-    "fft_test.py",  # TODO(natashaknk): Get this working after kernel is in.
     "mandelbrot_test.py",  # TODO(silvasean): Get this working on IREE.
     "ring_buffer_test.py",  # TODO(b/148747011)
     "strings_test.py",
diff --git a/integrations/tensorflow/e2e/fft_test.py b/integrations/tensorflow/e2e/fft_test.py
index 590bff2..59f6144 100644
--- a/integrations/tensorflow/e2e/fft_test.py
+++ b/integrations/tensorflow/e2e/fft_test.py
@@ -21,8 +21,8 @@
 class FftModule(tf.Module):
   # TODO(natashaknk) when multiple outputs are supported, make into one test.
   @tf.function(input_signature=[
-      tf.TensorSpec([4], tf.float32),
-      tf.TensorSpec([4], tf.float32)
+      tf.TensorSpec([16], tf.float32),
+      tf.TensorSpec([16], tf.float32)
   ])
   def fft_real(self, real_array, imag_array):
     complex_in = tf.complex(real_array, imag_array)
@@ -30,8 +30,8 @@
     return tf.math.real(complex_out)
 
   @tf.function(input_signature=[
-      tf.TensorSpec([4], tf.float32),
-      tf.TensorSpec([4], tf.float32)
+      tf.TensorSpec([16], tf.float32),
+      tf.TensorSpec([16], tf.float32)
   ])
   def fft_imag(self, real_array, imag_array):
     complex_in = tf.complex(real_array, imag_array)
@@ -48,18 +48,34 @@
   def test_fft_real(self):
 
     def fft_real(module):
-      real_array = np.array([9., 1., 4.5, -0.3], dtype=np.float32)
-      imag_array = np.array([0., -1., 17.7, 10.], dtype=np.float32)
-      module.fft_real(real_array, imag_array)
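+      # 16-element inputs satisfy pffft's minimum complex transform length;
+      # the looser rtol absorbs single-precision FFT rounding error.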
+      real_array = np.array([
+          9., 1., 4.5, -0.3, 10., -1., 5.5, 0.3, 299., 3.5, -0.777, 2., 1.7,
+          3.5, -4.5, 0.0
+      ], dtype=np.float32)
+      imag_array = np.array([
+          0., -1., 17.7, 10., 0., -11., 2763., 0., 0., -1.5, 16.8, 100., 0.,
+          -111., 2.3, 1.
+      ], dtype=np.float32)
+      module.fft_real(real_array, imag_array, rtol=1e-4)
 
     self.compare_backends(fft_real, self._modules)
 
   def test_fft_imag(self):
 
     def fft_imag(module):
-      real_array = np.array([9., 1., 4.5, -0.3], dtype=np.float32)
-      imag_array = np.array([0., -1., 17.7, 10.], dtype=np.float32)
-      module.fft_imag(real_array, imag_array)
+      real_array = np.array([
+          9., 1., 4.5, -0.3, 10., -1., 5.5, 0.3, 299., 3.5, -0.777, 2., 1.7,
+          3.5, -4.5, 0.0
+      ], dtype=np.float32)
+      imag_array = np.array([
+          0., -1., 17.7, 10., 0., -11., 2763., 0., 0., -1.5, 16.8, 100., 0.,
+          -111., 2.3, 1.
+      ], dtype=np.float32)
+      module.fft_imag(real_array, imag_array, rtol=1e-4)
 
     self.compare_backends(fft_imag, self._modules)
 
diff --git a/iree/compiler/Conversion/LinalgToLLVM/BUILD b/iree/compiler/Conversion/LinalgToLLVM/BUILD
index 0ef31cb..bcdfc24 100644
--- a/iree/compiler/Conversion/LinalgToLLVM/BUILD
+++ b/iree/compiler/Conversion/LinalgToLLVM/BUILD
@@ -25,6 +25,7 @@
         "ConvertToLLVM.cpp",
         "KernelDispatch.cpp",
         "LinalgRewriteDestructiveUpdatesPass.cpp",
+        "LinalgTileAndDistributeOnTensorsPass.cpp",
         "LinalgTileAndDistributePass.cpp",
         "LinalgTileAndVectorizePass.cpp",
         "Passes.cpp",
diff --git a/iree/compiler/Conversion/LinalgToLLVM/CMakeLists.txt b/iree/compiler/Conversion/LinalgToLLVM/CMakeLists.txt
index 46d3787..199d55e 100644
--- a/iree/compiler/Conversion/LinalgToLLVM/CMakeLists.txt
+++ b/iree/compiler/Conversion/LinalgToLLVM/CMakeLists.txt
@@ -25,6 +25,7 @@
     "ConvertToLLVM.cpp"
     "KernelDispatch.cpp"
     "LinalgRewriteDestructiveUpdatesPass.cpp"
+    "LinalgTileAndDistributeOnTensorsPass.cpp"
     "LinalgTileAndDistributePass.cpp"
     "LinalgTileAndVectorizePass.cpp"
     "Passes.cpp"
diff --git a/iree/compiler/Conversion/LinalgToLLVM/LinalgTileAndDistributeOnTensorsPass.cpp b/iree/compiler/Conversion/LinalgToLLVM/LinalgTileAndDistributeOnTensorsPass.cpp
new file mode 100644
index 0000000..33b05e1
--- /dev/null
+++ b/iree/compiler/Conversion/LinalgToLLVM/LinalgTileAndDistributeOnTensorsPass.cpp
@@ -0,0 +1,138 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "iree/compiler/Conversion/CodegenUtils/MarkerUtils.h"
+#include "iree/compiler/Conversion/CodegenUtils/MatmulCodegenStrategy.h"
+#include "iree/compiler/Dialect/IREE/IR/IREEDialect.h"
+#include "iree/compiler/Dialect/IREE/IR/IREEOps.h"
+#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/MLIRContext.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+
+#define DEBUG_TYPE "iree-linalg-tile-and-distribute-on-tensors"
+
+namespace mlir {
+namespace iree_compiler {
+
+struct LinalgTileAndDistributeOnTensorsPass
+    : public PassWrapper<LinalgTileAndDistributeOnTensorsPass,
+                         OperationPass<ModuleOp>> {
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<linalg::LinalgDialect, IREEDialect, AffineDialect,
+                    scf::SCFDialect>();
+  }
+  LinalgTileAndDistributeOnTensorsPass() = default;
+  LinalgTileAndDistributeOnTensorsPass(
+      const LinalgTileAndDistributeOnTensorsPass &pass) {}
+  void runOnOperation() override;
+
+ private:
+  ListOption<int64_t> tileSizes{
+      *this, "tile-sizes", llvm::cl::desc("Set tile sizes to use"),
+      llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated};
+};
+
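+// Returns the (workgroup id, workgroup size) op pair for the given dimension
+// name. The tiled loops below iterate from the id to the loop upper bound with
+// the size as the step, i.e. a cyclic distribution across workgroups.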
+static std::pair<Value, Value> buildWorkgroupOpPair(OpBuilder &b,
+                                                    StringRef dim) {
+  Type indexType = b.getIndexType();
+  StringAttr attr = b.getStringAttr(dim);
+  return {b.create<IREE::WorkgroupIdOp>(b.getInsertionPoint()->getLoc(),
+                                        indexType, attr),
+          b.create<IREE::WorkgroupSizeOp>(b.getInsertionPoint()->getLoc(),
+                                          indexType, attr)};
+}
+
+// Rewrite pattern to ensure only ops with tensor semantics are tiled.
+struct TileAndDistributeOnTensorsPattern
+    : public linalg::LinalgBaseTilingPattern {
+  using Base = linalg::LinalgBaseTilingPattern;
+  TileAndDistributeOnTensorsPattern(linalg::LinalgTilingOptions options,
+                                    linalg::LinalgMarker marker,
+                                    PatternBenefit benefit = 1)
+      : Base(options, marker, benefit) {}
+
+  LogicalResult matchAndRewrite(Operation *op,
+                                PatternRewriter &rewriter) const override {
+    auto linalgOp = dyn_cast<linalg::LinalgOp>(op);
+    if (!linalgOp || !linalgOp.hasTensorSemantics()) return failure();
+    SmallVector<Value, 4> tensorResults;
+    if (failed(Base::matchAndRewriteBase(op, rewriter, tensorResults)))
+      return failure();
+    // TODO: Wrap in sequentialized SPMD loops.
+    rewriter.replaceOp(op, tensorResults);
+    return success();
+  }
+};
+
+void LinalgTileAndDistributeOnTensorsPass::runOnOperation() {
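+  // Tile sizes must be provided explicitly via the tile-sizes option;
+  // otherwise this pass is a no-op.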
+  if (tileSizes.empty()) return;
+  ModuleOp module = getOperation();
+  MLIRContext *context = module->getContext();
+
+  // Distribute along at most 3 dimensions, with each WorkgroupIdOp ranging
+  // over [0, WorkgroupSizeOp).
+  static linalg::LinalgLoopDistributionOptions workgroupDistributionOptions = {
+      [](OpBuilder &builder, Location loc, ArrayRef<Range> parallelLoopRanges) {
+        // TODO: drop magic names.
+        std::array<StringRef, 3> dimStrs{"x", "y", "z"};
+        auto numParallelDims = parallelLoopRanges.size();
+        SmallVector<linalg::ProcInfo, 2> procInfo(numParallelDims);
+        for (unsigned dim = 0; dim < std::min<size_t>(numParallelDims, 3);
+             ++dim) {
+          auto p = buildWorkgroupOpPair(builder, dimStrs[dim]);
+          procInfo[dim] = {p.first, p.second};
+        }
+        return procInfo;
+      },
+      {linalg::DistributionMethod::Cyclic, linalg::DistributionMethod::Cyclic,
+       linalg::DistributionMethod::Cyclic}};
+
+  for (FuncOp funcOp : module.getOps<FuncOp>()) {
+    // TODO: maybe activate when put in a real pipeline.
+    // if (!isEntryPoint(funcOp)) continue;
+
+    OwningRewritePatternList patterns;
+    auto linalgTilingOptions =
+        linalg::LinalgTilingOptions()
+            .setDistributionOptions(workgroupDistributionOptions)
+            .setLoopType(linalg::LinalgTilingLoopType::Loops)
+            .setTileSizes(ArrayRef<int64_t>(tileSizes));
+    assert(linalgTilingOptions.distribution.hasValue());
+
+    // In the future, derive from LinalgTilingPattern to create sequentialized
+    // SPMD loops.
+    patterns.insert<TileAndDistributeOnTensorsPattern>(
+        linalgTilingOptions,
+        linalg::LinalgMarker(ArrayRef<Identifier>(),
+                             Identifier::get(getWorkgroupMarker(), context)));
+    // Add canonicalization patterns.
+    linalg::populateLinalgTilingCanonicalizationPatterns(patterns, context);
+    patterns.insert<AffineMinCanonicalizationPattern>(context);
+    applyPatternsAndFoldGreedily(funcOp, std::move(patterns));
+  }
+}
+
+std::unique_ptr<OperationPass<ModuleOp>>
+createLinalgTileAndDistributeOnTensorsPass() {
+  return std::make_unique<LinalgTileAndDistributeOnTensorsPass>();
+}
+
+static PassRegistration<LinalgTileAndDistributeOnTensorsPass> pass(
+    "iree-codegen-llvm-linalg-tile-and-distribute-on-tensors",
+    "Tile and distribute Linalg operations on tensors",
+    [] { return std::make_unique<LinalgTileAndDistributeOnTensorsPass>(); });
+
+}  // namespace iree_compiler
+}  // namespace mlir
diff --git a/iree/compiler/Conversion/LinalgToLLVM/Passes.h b/iree/compiler/Conversion/LinalgToLLVM/Passes.h
index 09edd29..42036b6 100644
--- a/iree/compiler/Conversion/LinalgToLLVM/Passes.h
+++ b/iree/compiler/Conversion/LinalgToLLVM/Passes.h
@@ -50,6 +50,11 @@
 std::unique_ptr<OperationPass<FuncOp>>
 createLinalgRewriteDestructiveUpdatesPass();
 
+/// Pass to perform tiling and distribution of Linalg ops with tensor semantics
+/// to sequentialized SPMD loops.
+std::unique_ptr<OperationPass<ModuleOp>>
+createLinalgTileAndDistributeOnTensorsPass();
+
 /// Populates passes needed to lower a XLA HLO op to LLVM dialect via the
 /// structured ops path. The pass manager `pm` in here should operate on the
 /// module within the IREE::HAL::ExecutableOp.
diff --git a/iree/compiler/Conversion/LinalgToLLVM/test/tile_and_distribute_on_tensors.mlir b/iree/compiler/Conversion/LinalgToLLVM/test/tile_and_distribute_on_tensors.mlir
new file mode 100644
index 0000000..b869782
--- /dev/null
+++ b/iree/compiler/Conversion/LinalgToLLVM/test/tile_and_distribute_on_tensors.mlir
@@ -0,0 +1,41 @@
+// RUN: iree-opt -split-input-file -verify-diagnostics -iree-codegen-llvm-linalg-tile-and-distribute-on-tensors=tile-sizes="1,2" %s | IreeFileCheck %s
+
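+// Tiling the 2x4 result with tile sizes 1x2 yields a 2x2 grid of tiles, one
+// per (x, y) iteration of the distributed scf.for nest checked below.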
+// CHECK-DAG: #[[$MAP:.*]] = affine_map<(d0) -> (2, -d0 + 4)>
+
+// CHECK-LABEL: func @tensor
+func @tensor() -> tensor<2x4xf32> {
+  %A = iree.unfoldable_constant dense<[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]> : tensor<2x3xf32>
+  %B = iree.unfoldable_constant dense<[[1.0, 2.0, 3.0, 4.0],
+                       [5.0, 6.0, 7.0, 8.0],
+                       [9.0, 10.0, 11.0, 12.0]]> : tensor<3x4xf32>
+  %C = iree.unfoldable_constant dense<1000.0> : tensor<2x4xf32>
+
+  //  CHECK-DAG: %[[C1:.*]] = constant 1 : index
+  //  CHECK-DAG: %[[C2:.*]] = constant 2 : index
+  //  CHECK-DAG: %[[C4:.*]] = constant 4 : index
+  //  CHECK-DAG: %[[bix:.*]] = iree.workgroup_id {dimension = "x"} : index
+  //  CHECK-DAG: %[[bdx:.*]] = iree.workgroup_size {dimension = "x"} : index
+  //  CHECK-DAG: %[[biy:.*]] = iree.workgroup_id {dimension = "y"} : index
+  //  CHECK-DAG: %[[bdy:.*]] = iree.workgroup_size {dimension = "y"} : index
+  //      CHECK: %{{.*}} = scf.for %[[I:.*]] = %[[bix]] to %[[C2]] step %[[bdx]] iter_args(%arg1 = %2) -> (tensor<2x4xf32>) {
+  // CHECK-NEXT:   %[[biy_scaled:.*]] = muli %[[biy]], %[[C2]] : index
+  // CHECK-NEXT:   %[[bdy_scaled:.*]] = muli %[[bdy]], %[[C2]] : index
+  // CHECK-NEXT:   %{{.*}} = scf.for %[[J:.*]] = %[[biy_scaled]] to %[[C4]] step %[[bdy_scaled]] iter_args(%arg3 = %arg1) -> (tensor<2x4xf32>) {
+  // CHECK-NEXT:     subtensor %{{.*}}[%[[I]], 0] [1, 3] [1, 1] : tensor<2x3xf32> to tensor<1x3xf32>
+  //
+  // Canonicalizations not yet powerful enough here.
+  // CHECK-NEXT:     %[[J_slice_1:.*]] = affine.min #[[$MAP]](%[[J]])
+  // CHECK-NEXT:     subtensor %1[0, %[[J]]] [3, %[[J_slice_1]]] [1, 1] : tensor<3x4xf32> to tensor<3x?xf32>
+  //
+  // Canonicalizations not yet powerful enough here.
+  // CHECK-NEXT:     %[[J_slice_2:.*]] = affine.min #[[$MAP]](%[[J]])
+  // CHECK-NEXT:     subtensor %arg3[%[[I]], %[[J]]] [1, %[[J_slice_2]]] [1, 1] : tensor<2x4xf32> to tensor<1x?xf32>
+  // CHECK-NEXT:     linalg.matmul
+  // CHECK-NEXT:     subtensor_insert {{.*}} : tensor<1x?xf32> into tensor<2x4xf32>
+  // CHECK-NEXT:     scf.yield %{{.*}} : tensor<2x4xf32>
+  // CHECK-NEXT:   }
+  // CHECK-NEXT:   scf.yield %{{.*}} : tensor<2x4xf32>
+  %E = linalg.matmul ins(%A, %B: tensor<2x3xf32>, tensor<3x4xf32>)
+                    init(%C: tensor<2x4xf32>) -> tensor<2x4xf32>
+  return %E : tensor<2x4xf32>
+}
diff --git a/iree/compiler/Conversion/init_conversions.h b/iree/compiler/Conversion/init_conversions.h
index 6157e00..e250550 100644
--- a/iree/compiler/Conversion/init_conversions.h
+++ b/iree/compiler/Conversion/init_conversions.h
@@ -66,6 +66,7 @@
     // LinalgToLLVM
     createConvImg2ColMatmulConversionPass();
     createLinalgTileAndDistributePass();
+    createLinalgTileAndDistributeOnTensorsPass();
     createLinalgTileAndVectorizeWorkgroupsPass();
     createLinalgRewriteDestructiveUpdatesPass();
     return true;
diff --git a/iree/compiler/Dialect/Shape/Conversion/ConvertShapeToShapex.cpp b/iree/compiler/Dialect/Shape/Conversion/ConvertShapeToShapex.cpp
index 12e6382..e838264 100644
--- a/iree/compiler/Dialect/Shape/Conversion/ConvertShapeToShapex.cpp
+++ b/iree/compiler/Dialect/Shape/Conversion/ConvertShapeToShapex.cpp
@@ -198,8 +198,11 @@
       ConversionPatternRewriter &rewriter) const override {
     Value lhs = operands[0];
     Value rhs = operands[1];
-    auto lhsType = lhs.getType().cast<RankedShapeType>();
-    auto rhsType = rhs.getType().cast<RankedShapeType>();
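+    // Use dyn_cast and bail out gracefully when an operand has not been
+    // converted to a ranked shape, instead of asserting via cast<>.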
+    auto lhsType = lhs.getType().dyn_cast<RankedShapeType>();
+    auto rhsType = rhs.getType().dyn_cast<RankedShapeType>();
+    if (!lhsType || !rhsType) {
+      return failure();
+    }
     // Establish invariant that rank(lhs) <= rank(rhs)
     if (lhsType.getRank() > rhsType.getRank()) {
       std::swap(lhsType, rhsType);
diff --git a/iree/hal/vmla/BUILD b/iree/hal/vmla/BUILD
index cc7204d..54a4d9e 100644
--- a/iree/hal/vmla/BUILD
+++ b/iree/hal/vmla/BUILD
@@ -37,6 +37,7 @@
         # TODO(benvanik): SIMD variants.
         "op_kernels_generic.h",
         "op_kernels_ruy.h",
+        "op_kernels_fft.h",
     ],
     deps = [
         "//iree/base:status",
@@ -49,6 +50,7 @@
         "@com_google_absl//absl/types:span",
         "@com_google_ruy//ruy",
         "@com_google_ruy//ruy:context",
+        "@pffft",
     ],
 )
 
diff --git a/iree/hal/vmla/CMakeLists.txt b/iree/hal/vmla/CMakeLists.txt
index b25a0dc..330da76 100644
--- a/iree/hal/vmla/CMakeLists.txt
+++ b/iree/hal/vmla/CMakeLists.txt
@@ -24,6 +24,7 @@
   HDRS
     "op_kernels.h"
   TEXTUAL_HDRS
+    "op_kernels_fft.h"
     "op_kernels_generic.h"
     "op_kernels_ruy.h"
   DEPS
@@ -35,6 +36,7 @@
     absl::span
     iree::base::status
     iree::base::tracing
+    pffft
     ruy
   PUBLIC
 )
diff --git a/iree/hal/vmla/op_kernels.h b/iree/hal/vmla/op_kernels.h
index a7a5cae..127a89f 100644
--- a/iree/hal/vmla/op_kernels.h
+++ b/iree/hal/vmla/op_kernels.h
@@ -174,15 +174,6 @@
                         absl::Span<int32_t> dst_buffer, ShapeSpan src_shape);
 };
 
-struct Fft {
-  template <typename T>
-  static Status Execute(absl::Span<const T> real_src_buffer,
-                        absl::Span<const T> imag_src_buffer,
-                        absl::Span<T> real_dst_buffer,
-                        absl::Span<T> imag_dst_buffer, ShapeSpan real_src_shape,
-                        ShapeSpan imag_src_shape);
-};
-
 struct Broadcast {
   template <typename T>
   static Status Execute(absl::Span<const T> src_buffer,
@@ -507,6 +498,7 @@
 // clang-format off
 #include "iree/hal/vmla/op_kernels_generic.h"  // IWYU pragma: export
 #include "iree/hal/vmla/op_kernels_ruy.h"  // IWYU pragma: export
+#include "iree/hal/vmla/op_kernels_fft.h"  // IWYU pragma: export
 // clang-format on
 
 #endif  // IREE_HAL_VMLA_OP_KERNELS_H_
diff --git a/iree/hal/vmla/op_kernels_fft.h b/iree/hal/vmla/op_kernels_fft.h
new file mode 100644
index 0000000..ab376d0
--- /dev/null
+++ b/iree/hal/vmla/op_kernels_fft.h
@@ -0,0 +1,89 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Defines kernel functions and provides their implementation via one (or more)
+// included files.
+//
+// Kernels should do the simplest possible operation. Buffer validation is
+// handled by the dispatch logic and need not be checked. Kernels may optionally
+// accept arguments beyond just the buffers, depending on the required state
+// and attributes.
+//
+// Kernels may optionally have runtime state. This is state that is allocated
+// once for the entire Runtime (and stored on RuntimeState) and shared across
+// all fibers. This enables kernels that may require thread pools or device
+// handles to be shared while kernels that require transient storage to be safe
+// to use from multiple fibers concurrently.
+//
+// All kernels are templated to enable specialization of particular types or
+// type combinations. By default the op_kernels_generic.h will provide C++
+// semantics as reference and platform-specific versions can be implemented
+// as needed.
+
+#ifndef IREE_HAL_VMLA_OP_KERNELS_FFT_H_
+#define IREE_HAL_VMLA_OP_KERNELS_FFT_H_
+
+#include "absl/types/span.h"
+#include "iree/base/logging.h"
+#include "iree/base/status.h"
+#include "third_party/pffft/pffft.h"
+
+namespace iree {
+namespace hal {
+namespace vmla {
+namespace kernels {
+
+using ShapeSpan = absl::Span<const int32_t>;
+
+struct Fft {
+  template <typename T>
+  static Status Execute(absl::Span<const T> real_src_buffer,
+                        absl::Span<const T> imag_src_buffer,
+                        absl::Span<T> real_dst_buffer,
+                        absl::Span<T> imag_dst_buffer, ShapeSpan real_src_shape,
+                        ShapeSpan imag_src_shape) {
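+    // Note: pffft operates on single-precision floats and only supports
+    // transform lengths of the form 2^a*3^b*5^c with a >= 4 for complex FFTs
+    // (i.e. at least 16 elements).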
+    PFFFT_Setup* fft_state =
+        pffft_new_setup(real_src_shape.back(), PFFFT_COMPLEX);
+    int element_count = real_src_buffer.size();
+    std::vector<T> complex_input;
+    complex_input.resize(element_count * 2);
+
+    // pffft requires the input as a single interleaved (real, imag) array.
+    for (int i = 0; i < element_count; i++) {
+      complex_input[i * 2] = real_src_buffer[i];
+      complex_input[i * 2 + 1] = imag_src_buffer[i];
+    }
+
+    std::vector<T> complex_output;
+    complex_output.resize(element_count * 2);
+
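+    // A NULL work buffer makes pffft use stack scratch space; the ordered
+    // transform writes its result in the same interleaved (re, im) layout.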
+    pffft_transform_ordered(fft_state, &complex_input[0], &complex_output[0],
+                            NULL, PFFFT_FORWARD);
+
+    // Split the interleaved result back into separate real and imag buffers.
+    for (int i = 0; i < element_count; i++) {
+      real_dst_buffer[i] = complex_output[i * 2];
+      imag_dst_buffer[i] = complex_output[i * 2 + 1];
+    }
+    pffft_destroy_setup(fft_state);
+    return OkStatus();
+  }
+};
+
+}  // namespace kernels
+}  // namespace vmla
+}  // namespace hal
+}  // namespace iree
+
+#endif  // IREE_HAL_VMLA_OP_KERNELS_FFT_H_
diff --git a/iree/hal/vmla/op_kernels_generic.h b/iree/hal/vmla/op_kernels_generic.h
index f3ec435..c033090 100644
--- a/iree/hal/vmla/op_kernels_generic.h
+++ b/iree/hal/vmla/op_kernels_generic.h
@@ -546,18 +546,6 @@
 }
 
 template <typename T>
-Status Fft::Execute(absl::Span<const T> real_src_buffer,
-                    absl::Span<const T> imag_src_buffer,
-                    absl::Span<T> real_dst_buffer,
-                    absl::Span<T> imag_dst_buffer, ShapeSpan real_src_shape,
-                    ShapeSpan imag_src_shape) {
-  // TODO (natashaknk): implement
-  std::fill(real_dst_buffer.begin(), real_dst_buffer.end(), 1);
-  std::fill(imag_dst_buffer.begin(), imag_dst_buffer.end(), 2);
-  return OkStatus();
-}
-
-template <typename T>
 Status Broadcast::Execute(absl::Span<const T> src_buffer,
                           absl::Span<T> dst_buffer) {
   for (size_t i = 0; i < dst_buffer.size(); ++i) {