Script to update to llvm syncpoint (#3155)

Rather than updating TF to HEAD and updating the LLVM commit to match,
this script uses the LLVM commit as a synchronization point, which is
more in keeping with our current setup (we've had the LLVM update
feature turned off for a while now anyway).

This will later expand to other repositories that synchronize with LLVM
(mlir-hlo, llvm-bazel, etc.). I'm holding off on generalizing the script
much until we have those use cases, so the TF stuff is pretty hardcoded.

Here's an example TF PR from the script:
https://github.com/google/iree/pull/3154
diff --git a/.github/workflows/update_tf.yml b/.github/workflows/update_tf.yml
index b116f99..75f6bbd 100644
--- a/.github/workflows/update_tf.yml
+++ b/.github/workflows/update_tf.yml
@@ -35,9 +35,11 @@
       - name: Initializing submodules
         run: ./scripts/git/submodule_versions.py init
       - name: Updating submodules
-        run: ./scripts/git/update_tf_llvm_submodules.py --llvm_commit=KEEP --update_build_files=true
-      - name: Calculating TF SHA
-        run: echo "::set-env name=TF_SHA::$(git submodule status third_party/tensorflow | awk '{print $1}' | cut -c -12)"
+        run: ./scripts/git/update_to_llvm_syncpoint.py
+      - name: Calculating SHAs
+        run: |
+          echo "::set-env name=TF_SHA::$(git submodule status third_party/tensorflow | awk '{print $1}' | cut -c -12)"
+          echo "::set-env name=LLVM_SHA::$(git submodule status third_party/llvm-project | awk '{print $1}' | cut -c -12)"
       - name: Creating Pull Request
         uses: peter-evans/create-pull-request@v2
         with:
@@ -48,9 +50,11 @@
           body: |
             Updates TF to
             [${{ env.TF_SHA }}](https://github.com/tensorflow/tensorflow/commit/${{ env.TF_SHA }})
+            matching
+            [${{ env.LLVM_SHA }}](https://github.com/llvm/llvm-project/commit/${{ env.LLVM_SHA }})
             and copies over the LLVM BUILD files.
 
-            `./scripts/git/update_tf_llvm_submodules.py --llvm_commit=KEEP --update_build_files=true`
+            `./scripts/git/update_to_llvm_syncpoint.py`
 
             Automated submodule bump from .github/workflows/update_tf.yml
           committer: "Submodule Update Action <iree-github-actions-bot@google.com>"
diff --git a/scripts/git/update_tf_llvm_submodules.py b/scripts/git/update_tf_llvm_submodules.py
deleted file mode 100755
index 9ca4e84..0000000
--- a/scripts/git/update_tf_llvm_submodules.py
+++ /dev/null
@@ -1,204 +0,0 @@
-#!/usr/bin/env python3
-# Lint as: python3
-# Copyright 2019 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      https://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# pylint: disable=missing-docstring
-"""update_tf_llvm_submodules.
-
-Updates the third_party/tensorflow and third_party/llvm-project submodules
-to new commits. We have special conditions around these submodules since
-upstream will only accept an llvm-project version that is sync'd with the
-corresponding version that tensorflow depends on. In addition, some BUILD
-files must be sync'd for the new version.
-
-Typical usage:
-  Syntax: ./scripts/git/update_tf_llvm_submodules.py
-
-  By default, this will update the tensorflow submodule to remote HEAD and
-  update the llvm-project submodule to the corresponding version. It will
-  also sync BUILD file changes as needed and export the version metadata.
-"""
-
-import argparse
-import re
-import os
-import sys
-
-import submodule_versions
-import utils
-
-
-def parse_arguments():
-  parser = argparse.ArgumentParser()
-  parser.add_argument("--repo", help="Repository root directory")
-  parser.add_argument(
-      "--tensorflow",
-      help="Path to the tensorflow sources "
-      "(default to third_party/tensorflow)",
-      default=None)
-  parser.add_argument(
-      "--llvm",
-      help="Path to the LLVM sources "
-      "(defaults to third_party/llvm-project)",
-      default=None)
-  parser.add_argument(
-      "--tensorflow_commit",
-      help="Update TensorFlow to this commit (or 'KEEP', 'REMOTE')",
-      default="REMOTE")
-  parser.add_argument(
-      "--llvm_commit",
-      help="Update LLVM to this commit (or 'KEEP', 'REMOTE', 'TENSORFLOW')",
-      default="TENSORFLOW")
-  parser.add_argument(
-      "--update_build_files",
-      help=("Updates the IREE LLVM build files from TensorFlow."
-            "Defaults to True iff llvm_commit==TENSORFLOW"),
-      type=utils.str2bool,
-      nargs="?",
-      default=None)
-  args = parser.parse_args()
-
-  # Default repo path.
-  if args.repo is None:
-    args.repo = utils.find_git_toplevel()
-
-  # Set some defaults.
-  if not args.tensorflow:
-    args.tensorflow = os.path.join(args.repo, "third_party", "tensorflow")
-  if not args.llvm:
-    args.llvm = os.path.join(args.repo, "third_party", "llvm-project")
-  return args
-
-
-def main(args):
-  print("IREE handy-dandy-LLVM-submodule-updater at your service...")
-  print("  IREE Path :", args.repo)
-  print("  LLVM Path :", args.llvm)
-  print("  TensorFlow Path :", args.tensorflow)
-  print("  Update Build files:", args.update_build_files)
-  current_llvm_commit = get_commit(args.llvm)
-  current_tensorflow_commit = get_commit(args.tensorflow)
-
-  print("Current Commits: llvm =", current_llvm_commit, "tensorflow =",
-        current_tensorflow_commit)
-
-  # Update TensorFlow
-  if args.tensorflow_commit == "KEEP":
-    print("Not updating TensorFlow (--tensorflow_commit == 'KEEP')")
-  else:
-    print("\n*** Updating TensorFlow to", args.tensorflow_commit, "***")
-    update_submodule(args.tensorflow, args.tensorflow_commit)
-    stage_path(args.repo, "third_party/tensorflow")
-
-  # Update LLVM.
-  if args.llvm_commit == "TENSORFLOW":
-    args.llvm_commit = find_tensorflow_llvm_commit(args.tensorflow)
-    print("Found TensorFlow's LLVM commit:", args.llvm_commit)
-    if args.update_build_files is None:
-      print("Will update build files from TensorFlow",
-            "because --update_build_files not specified")
-      args.update_build_files = True
-  if args.llvm_commit == "KEEP":
-    print("Not updating LLVM (--llvm_commit == 'KEEP')")
-  else:
-    print("\n*** Updating LLVM to", args.llvm_commit, "***")
-    update_submodule(args.llvm, args.llvm_commit)
-    stage_path(args.repo, "third_party/llvm-project")
-
-  # Update build files.
-  if not args.update_build_files:
-    print("Not updating build files (--update_build_files not specified)")
-  else:
-    print("\n*** Updating BUILD.bazel files ***")
-    update_build_files_from_tensorflow(args.repo, args.tensorflow)
-
-  # Export SUBMODULE_VERSIONS.
-  print()  # Add line break.
-  submodule_versions.export_versions(args.repo)
-
-
-def get_commit(path, rev="HEAD"):
-  return utils.execute(["git", "rev-parse", rev],
-                       cwd=path,
-                       silent=True,
-                       capture_output=True).decode("ISO-8859-1").strip()
-
-
-def update_submodule(path, commit, tracking="origin/master"):
-  # Fetch.
-  utils.execute(["git", "fetch"], cwd=path)
-  # Determine commit.
-  if commit == "REMOTE":
-    commit = get_commit(path, rev=tracking)
-    print("Resolved remote commit:", commit)
-
-  # Rebase to commit (will fail if not fast-forward).
-  utils.execute(["git", "checkout", commit], cwd=path)
-
-
-def find_tensorflow_llvm_commit(tensorflow_path):
-  # TensorFlow keeps its commit in workspace.bzl on a line like:
-  # LLVM_COMMIT = "..."
-  # Yeah. This is how we do it.
-  workspace_path = os.path.join(tensorflow_path, "tensorflow", "workspace.bzl")
-  pattern_text = r"""\s*LLVM_COMMIT\s*=\s*"(.+)"\s*"""
-  pattern = re.compile(pattern_text, flags=re.MULTILINE)
-  for line in open(workspace_path, "r", encoding="UTF-8"):
-    m = re.match(pattern, line)
-    if m:
-      return m.group(1)
-
-  print("ERROR: Could not find LLVM commit in %s." % workspace_path)
-  print("Request an explicit commit via --llvm_commit (and file a bug)")
-  print("Expected pattern match for:", pattern_text)
-  sys.exit(1)
-
-
-def update_build_files_from_tensorflow(repo_path, tensorflow_path):
-  src_llvm_build = os.path.join(tensorflow_path, "third_party", "llvm",
-                                "llvm.autogenerated.BUILD")
-  # NOTE(laurenzo): These will probably move upstream.
-  src_mlir_build = os.path.join(tensorflow_path, "third_party", "mlir", "BUILD")
-  src_mlir_test_build = os.path.join(tensorflow_path, "third_party", "mlir",
-                                     "test.BUILD")
-  overlay_path = os.path.join(repo_path, "build_tools", "bazel",
-                              "third_party_import", "llvm-project", "overlay")
-  copy_text_file(repo_path, src_llvm_build,
-                 os.path.join(overlay_path, "llvm", "BUILD.bazel"))
-  copy_text_file(repo_path, src_mlir_build,
-                 os.path.join(overlay_path, "mlir", "BUILD.bazel"))
-  copy_text_file(repo_path, src_mlir_test_build,
-                 os.path.join(overlay_path, "mlir", "test", "BUILD.bazel"))
-
-
-def copy_text_file(repo_path, src_file, dst_file):
-  print("+ cp %s %s" % (src_file, dst_file))
-  with open(src_file, "r", encoding="UTF-8") as f:
-    src_contents = f.read()
-
-  if not os.path.exists(dst_file):
-    print("WARNING: Destination file does not exist:", dst_file)
-  with open(dst_file, "w", encoding="UTF-8") as f:
-    f.write(src_contents)
-  stage_path(repo_path, dst_file)
-
-
-def stage_path(repo_path, to_stage):
-  # TODO(laurenzo): Move to utils.py.
-  utils.execute(["git", "add", to_stage], cwd=repo_path)
-
-
-if __name__ == "__main__":
-  main(parse_arguments())
diff --git a/scripts/git/update_tf_submodule.sh b/scripts/git/update_tf_submodule.sh
index 0f61a9c..0031cfa 100755
--- a/scripts/git/update_tf_submodule.sh
+++ b/scripts/git/update_tf_submodule.sh
@@ -33,30 +33,28 @@
 set -o pipefail
 
 export UPSTREAM_REMOTE="${UPSTREAM_REMOTE:-upstream}"
-TENSORFLOW_COMMIT="${1:-REMOTE}"
+TENSORFLOW_COMMIT="${1:-LATEST_MATCH}"
 PR_BRANCH="tf-submodule-update"
-BASE_BRANCH="${1:-google}"
+BASE_BRANCH="${BASE_BRANCH:-google}"
 FORK_REMOTE="${FORK_REMOTE:-origin}"
-TF_COMMIT_NICKNAME=""
 
 ./scripts/git/git_update.sh "${BASE_BRANCH?}"
 git checkout -B "${PR_BRANCH?}"
 
-CMD="./scripts/git/update_tf_llvm_submodules.py --llvm_commit=KEEP --update_build_files=true --tensorflow_commit=${TENSORFLOW_COMMIT?}"
+CMD="./scripts/git/update_to_llvm_syncpoint.py --tensorflow_commit=${TENSORFLOW_COMMIT?}"
 
 bash -c "${CMD?}"
 
 TF_SHA="$(git submodule status third_party/tensorflow | awk '{print $1}' | cut -c -12)"
 
-if [[ -z "${TF_COMMIT_NICKNAME?}" && "${TENSORFLOW_COMMIT?}" == "REMOTE" ]]; then
-  TF_COMMIT_NICKNAME="current HEAD"
-fi
-TF_COMMIT_NICKNAME="${TF_COMMIT_NICKNAME:-${TF_SHA?}}"
+LLVM_SHA="$(git submodule status third_party/llvm-project | awk '{print $1}' | cut -c -12)"
 
 TITLE="Integrate TF at tensorflow/tensorflow@${TF_SHA?}"
 BODY="$(cat <<-EOF
 Updates TF to
-[${TF_COMMIT_NICKNAME?}](https://github.com/tensorflow/tensorflow/commit/${TF_SHA?})
+[${TF_SHA?}](https://github.com/tensorflow/tensorflow/commit/${TF_SHA?})
+matching
+[${LLVM_SHA?}](https://github.com/llvm/llvm-project/commit/${LLVM_SHA?})
 and copies over the LLVM BUILD files.
 
 \`${CMD?}\`
diff --git a/scripts/git/update_to_llvm_syncpoint.py b/scripts/git/update_to_llvm_syncpoint.py
new file mode 100755
index 0000000..661a937
--- /dev/null
+++ b/scripts/git/update_to_llvm_syncpoint.py
@@ -0,0 +1,270 @@
+#!/usr/bin/env python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pylint: disable=missing-docstring
+"""Updates LLVM-dependent submodules based on the current LLVM commit.
+
+Updates the third_party/tensorflow submodule to a new commit based on the commit
+in the third_party/llvm-project submodule. We have special conditions around
+these submodules since they are synced as part of the integration of LLVM into
+Google's source repository. See
+https://google.github.io/iree/developing-iree/repository-management#the-special-relationship-with-llvm-and-tensorflow.
+
+In addition we currently copy LLVM Bazel BUILD files from TensorFlow.
+
+Typical usage:
+  Syntax: ./scripts/git/update_to_llvm_syncpoint.py
+
+  By default, this will update the TensorFlow submodule to the most recent
+  commit with an LLVM version that matches IREE's and copy over the LLVM
+  BUILD file changes as needed.
+"""
+
+import argparse
+import re
+import os
+import sys
+
+import submodule_versions
+import utils
+
+REMOTE_HEAD_COMMIT = "REMOTE"
+KEEP_COMMIT = "KEEP"
+INTEGRATE_COMMIT = "INTEGRATE"
+LATEST_MATCHING_COMMIT = "LATEST_MATCH"
+
+COMMIT_OPTIONS = {
+    REMOTE_HEAD_COMMIT:
+        "Update to the HEAD commit on the remote repository default branch",
+    KEEP_COMMIT:
+        "Do not modify the current commit",
+    INTEGRATE_COMMIT:
+        "Update to the commit where the current version of LLVM was first integrated",
+    LATEST_MATCHING_COMMIT:
+        "Update to the most recent commit with a matching version of LLVM",
+}
+
+
+def parse_arguments():
+  parser = argparse.ArgumentParser()
+  parser.add_argument("--repo", help="Repository root directory")
+  parser.add_argument("--tensorflow",
+                      help="Path to the tensorflow sources "
+                      "(default to third_party/tensorflow)",
+                      default=None)
+  parser.add_argument("--llvm",
+                      help="Path to the LLVM sources "
+                      "(defaults to third_party/llvm-project)",
+                      default=None)
+  parser.add_argument(
+      "--tensorflow_commit",
+      "--tf_commit",
+      help=
+      f"Update TensorFlow to this commit, or a named option: {COMMIT_OPTIONS}",
+      default=LATEST_MATCHING_COMMIT)
+  parser.add_argument(
+      "--validate",
+      help="Validate that the selected commits all match the LLVM commit",
+      type=utils.str2bool,
+      nargs="?",
+      default=True,
+  )
+
+  parser.add_argument("--update_build_files",
+                      help="Updates the IREE LLVM build files from TensorFlow.",
+                      type=utils.str2bool,
+                      nargs="?",
+                      default=True)
+  args = parser.parse_args()
+
+  # Default repo path.
+  if args.repo is None:
+    args.repo = utils.find_git_toplevel()
+
+  # Set some defaults.
+  if not args.tensorflow:
+    args.tensorflow = os.path.join(args.repo, "third_party", "tensorflow")
+  if not args.llvm:
+    args.llvm = os.path.join(args.repo, "third_party", "llvm-project")
+
+  return args
+
+
+def main(args):
+  print("IREE handy-dandy-LLVM-submodule-updater at your service...")
+  print("  IREE Path :", args.repo)
+  print("  LLVM Path :", args.llvm)
+  print("  TensorFlow Path :", args.tensorflow)
+  print("  Update Build files:", args.update_build_files)
+  current_llvm_commit = get_commit(args.llvm)
+  current_tensorflow_commit = get_commit(args.tensorflow)
+
+  print("Current Commits: llvm =", current_llvm_commit, "tensorflow =",
+        current_tensorflow_commit)
+
+  # Update TensorFlow
+  new_tf_commit = find_new_tf_commit(args.tensorflow, current_llvm_commit,
+                                     args.tensorflow_commit)
+  print("\n*** Updating TensorFlow to", new_tf_commit, "***")
+  utils.execute(["git", "checkout", new_tf_commit], cwd=args.tensorflow)
+  stage_path(args.repo, args.tensorflow)
+
+  validate_tf_commit(current_llvm_commit,
+                     args.tensorflow,
+                     exit_on_failure=args.validate)
+
+  if args.update_build_files:
+    print("\n*** Updating BUILD.bazel files ***")
+    update_build_files_from_tensorflow(args.repo, args.tensorflow)
+
+  # Export SUBMODULE_VERSIONS.
+  print()  # Add line break.
+  submodule_versions.export_versions(args.repo)
+
+
+def get_commit(path, rev="HEAD"):
+  return utils.execute(["git", "rev-parse", rev],
+                       cwd=path,
+                       silent=True,
+                       capture_output=True,
+                       universal_newlines=True).strip()
+
+
+def find_new_tf_commit(tensorflow_path, llvm_commit, tf_commit):
+  utils.execute(["git", "fetch"], cwd=tensorflow_path)
+
+  if tf_commit not in COMMIT_OPTIONS:
+    return get_commit(tensorflow_path, rev=tf_commit)
+
+  if tf_commit == KEEP_COMMIT:
+    return get_commit(tensorflow_path)
+
+  if tf_commit == REMOTE_HEAD_COMMIT:
+    return get_commit(tensorflow_path, "origin/master")
+
+  # Find commits where the number of occurences of the given LLVM commit hash
+  # changes. In normal cases, there should be at most two commits that match
+  # this:
+  # 1. The commit that first introduced the new hash in the TF workspace file.
+  # 2. The commit that changed it to a new hash afterwards.
+  tf_integrate_commits = utils.execute(
+      [
+          "git",
+          "log",
+          # Only follow the first parent of a merge commit. We don't want to go off
+          # to some random PR.
+          "--first-parent",
+          # Just print the commit hash
+          "--format=%H",
+          # Look for commits where the number of occurences of llvm_commit changed.
+          # https://git-scm.com/docs/git-log#Documentation/git-log.txt--Sltstringgt
+          "-S",
+          llvm_commit,
+          # Search along the master branch
+          "origin/master",
+          # Only look in the TF workspace file where the llvm_commit is recorded
+          "--",
+          "tensorflow/workspace.bzl"
+      ],
+      capture_output=True,
+      universal_newlines=True,
+      cwd=tensorflow_path).split()
+  if len(tf_integrate_commits) > 2:
+    raise RuntimeError(
+        f"Expected one or two TF commits to involve LLVM commit {llvm_commit},"
+        f" but got {len(tf_integrate_commits)}")
+
+  if not tf_integrate_commits:
+    raise RuntimeError(
+        f"TF does not have any references to LLVM commit {llvm_commit}."
+        " Maybe TF export is behind?")
+
+  if tf_commit == INTEGRATE_COMMIT:
+    return tf_integrate_commits[-1]
+
+  assert tf_commit == LATEST_MATCHING_COMMIT
+  if len(tf_integrate_commits) == 1:
+    # There hasn't been a subsequent integrate, use remote head.
+    return get_commit(tensorflow_path, "origin/master")
+
+  # Use the commit one before the one that changed away from this LLVM version.
+  return get_commit(tensorflow_path, rev=f"{tf_integrate_commits[0]}^")
+
+
+def validate_tf_commit(llvm_commit, tensorflow_path, exit_on_failure=True):
+  tf_llvm_commit = find_tensorflow_llvm_commit(tensorflow_path)
+
+  matches = tf_llvm_commit == llvm_commit
+  if not matches:
+    print("WARNING: LLVM commit in TF  does not match that in IREE"
+          f" ({tf_llvm_commit} vs {llvm_commit})")
+    if exit_on_failure:
+      sys.exit(1)
+
+
+def find_tensorflow_llvm_commit(tensorflow_path):
+  # TensorFlow keeps its commit in workspace.bzl on a line like:
+  # LLVM_COMMIT = "..."
+  # Yeah. This is how we do it.
+  workspace_path = os.path.join(tensorflow_path, "tensorflow", "workspace.bzl")
+  pattern_text = r"""\s*LLVM_COMMIT\s*=\s*"(.+)"\s*"""
+  pattern = re.compile(pattern_text, flags=re.MULTILINE)
+  for line in open(workspace_path, "r", encoding="UTF-8"):
+    m = re.match(pattern, line)
+    if m:
+      return m.group(1)
+
+  print(f"ERROR: Could not find LLVM commit in {workspace_path}.")
+  print("Please file a bug)")
+  print("Expected pattern match for:", pattern_text)
+  sys.exit(1)
+
+
+def update_build_files_from_tensorflow(repo_path, tensorflow_path):
+  src_llvm_build = os.path.join(tensorflow_path, "third_party", "llvm",
+                                "llvm.autogenerated.BUILD")
+  # NOTE(laurenzo): These will probably move upstream.
+  src_mlir_build = os.path.join(tensorflow_path, "third_party", "mlir", "BUILD")
+  src_mlir_test_build = os.path.join(tensorflow_path, "third_party", "mlir",
+                                     "test.BUILD")
+  overlay_path = os.path.join(repo_path, "build_tools", "bazel",
+                              "third_party_import", "llvm-project", "overlay")
+  copy_text_file(repo_path, src_llvm_build,
+                 os.path.join(overlay_path, "llvm", "BUILD.bazel"))
+  copy_text_file(repo_path, src_mlir_build,
+                 os.path.join(overlay_path, "mlir", "BUILD.bazel"))
+  copy_text_file(repo_path, src_mlir_test_build,
+                 os.path.join(overlay_path, "mlir", "test", "BUILD.bazel"))
+
+
+def copy_text_file(repo_path, src_file, dst_file):
+  print(f"+ cp {src_file} {dst_file}")
+  with open(src_file, "r", encoding="UTF-8") as f:
+    src_contents = f.read()
+
+  if not os.path.exists(dst_file):
+    print("WARNING: Destination file does not exist:", dst_file)
+  with open(dst_file, "w", encoding="UTF-8") as f:
+    f.write(src_contents)
+  stage_path(repo_path, dst_file)
+
+
+def stage_path(repo_path, to_stage):
+  # TODO(laurenzo): Move to utils.py.
+  utils.execute(["git", "add", to_stage], cwd=repo_path)
+
+
+if __name__ == "__main__":
+  main(parse_arguments())