Add a script to generate benchmarking artifacts. (#3288)

Additional changes:
- Makes our integration tests use Bazel's `TEST_UNDECLARED_OUTPUTS_DIR` to write to disk safely.
- Adds back the `saved_model` directory now that we can save it without artificially duplicating it.
- Adds `scripts/utils.py` to remove code duplication between `update_op_coverage.py`, `update_e2e_coverage.py` and `get_e2e_artifacts.py`.
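
A minimal usage sketch of the new script (flag and suite names as defined in
`scripts/get_e2e_artifacts.py` in the diff below):

```shell
# Compile and test the smaller `e2e_tests` suite and gather its benchmarking
# artifacts under /tmp/iree/modules/ (the default --artifacts_dir).
python3 ./scripts/get_e2e_artifacts.py --test_suites=e2e_tests

# Check for artifact path collisions without running tests or extracting files.
python3 ./scripts/get_e2e_artifacts.py --dry_run
```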
diff --git a/build_tools/docker/manage_images.py b/build_tools/docker/manage_images.py
index 468313d..7b8cb7e 100755
--- a/build_tools/docker/manage_images.py
+++ b/build_tools/docker/manage_images.py
@@ -19,12 +19,12 @@
 
 Example usage:
 
-Rebuild the cmake image and all images that transitiviely on depend on it,
+Rebuild the cmake image and all images that transitively depend on it,
 tagging them with `latest`:
   python3 build_tools/docker/manage_images.py --build --image cmake
 
 Print out output for rebuilding the cmake image and all images that
-transitiviely on depend on it, but don't take side-effecting actions:
+transitively depend on it, but don't take side-effecting actions:
   python3 build_tools/docker/manage_images.py --build --image cmake --dry-run
 
 Push all `prod` images to GCR:
diff --git a/docs/developing_iree/e2e_benchmarking.md b/docs/developing_iree/e2e_benchmarking.md
index 3bfb460..15e3f25 100644
--- a/docs/developing_iree/e2e_benchmarking.md
+++ b/docs/developing_iree/e2e_benchmarking.md
@@ -12,14 +12,17 @@
 
 ## 1. Run IREE's E2E TensorFlow tests to generate the benchmarking artifacts
 
-This command will compile and test all of our passing, non-manual targets.
+The `get_e2e_artifacts.py` script compiles and tests all of our integrated
+TensorFlow models, and gathers their compilation and benchmarking artifacts in
+`/tmp/iree/modules/`.
 
 ```shell
-bazel test //integrations/tensorflow/e2e/...
+# By default `get_e2e_artifacts.py` will run all of our test suites, including
+# those that take a long time to complete, so we specify
+# `--test_suites=e2e_tests` to only run the smaller tests.
+python3 ./scripts/get_e2e_artifacts.py --test_suites=e2e_tests
 ```
 
-Running the above command populates a directory `/tmp/iree/modules/` with the
-compilation artifacts needed to benchmark each TensorFlow model in our tests.
 Each test/module has a folder with the following artifacts (filtered to only
 include those relevant for benchmarking):
 
@@ -95,7 +98,7 @@
 wildcard expansion. They can be run by invoking the following test suite:
 
 ```shell
-bazel test //integrations/tensorflow/e2e/keras:vision_external_tests
+python3 ./scripts/get_e2e_artifacts.py --test_suites=vision_external_tests
 ```
 
 The previous command compiles `MobileNet`, `MobileNetV2` and `ResNet50` to run
@@ -104,6 +107,17 @@
 organized by `/tmp/iree/modules/ModelName/Dataset/backends` instead of just by
 `/tmp/iree/modules/ModelName/backends`.
 
+### Optional: Manually get the benchmarking artifacts for a specific test
+
+You can manually get the benchmarking artifacts for a specific test by using
+`bazel run` on the `_manual` binary target we create for each test. This will
+automatically store the benchmarking artifacts in `/tmp/iree/modules/`.
+
+```shell
+bazel run //integrations/tensorflow/e2e:matrix_ops_static_test_manual -- \
+  --target_backends=iree_vmla,tflite
+```
+
 ## 2. Benchmarking IREE on desktop
 
 ### 2.1 Optional: Build the `iree-benchmark-module`
diff --git a/integrations/tensorflow/bindings/python/pyiree/tf/support/tf_test_utils.py b/integrations/tensorflow/bindings/python/pyiree/tf/support/tf_test_utils.py
index 4d432d8..bcc629e 100644
--- a/integrations/tensorflow/bindings/python/pyiree/tf/support/tf_test_utils.py
+++ b/integrations/tensorflow/bindings/python/pyiree/tf/support/tf_test_utils.py
@@ -51,14 +51,27 @@
     "Summarize the inputs and outputs of each module trace logged to disk.")
 flags.DEFINE_bool("log_all_traces", False,
                   "Log all traces to logging.info, even if comparison passes.")
+flags.DEFINE_bool(
+    "get_saved_model", False,
+    "Creates and stores a SavedModel for the tf.Module class to be tested.")
 FLAGS = flags.FLAGS
 NUMPY_LINEWIDTH = 120
 
 
+def _get_from_environment_if_set(variable_name: str) -> Union[str, None]:
+  return os.environ[variable_name] if variable_name in os.environ else None
+
+
 def _setup_artifacts_dir(module_name: str) -> str:
-  parent_dir = FLAGS.artifacts_dir
-  if parent_dir is None:
-    parent_dir = os.path.join(tempfile.gettempdir(), "iree", "modules")
+  parent_dirs = [
+      FLAGS.artifacts_dir,
+      _get_from_environment_if_set('TEST_UNDECLARED_OUTPUTS_DIR'),
+      _get_from_environment_if_set('TEST_TMPDIR'),
+      os.path.join(tempfile.gettempdir(), "iree", "modules"),
+  ]
+  # Use the most preferred path in parent_dirs that isn't None.
+  parent_dir = next(parent for parent in parent_dirs if parent is not None)
+
   artifacts_dir = os.path.join(parent_dir, module_name)
   logging.info("Saving compilation artifacts and traces to '%s'", artifacts_dir)
   os.makedirs(artifacts_dir, exist_ok=True)
diff --git a/integrations/tensorflow/bindings/python/pyiree/tf/support/tf_utils.py b/integrations/tensorflow/bindings/python/pyiree/tf/support/tf_utils.py
index a4a731a..99573c5 100644
--- a/integrations/tensorflow/bindings/python/pyiree/tf/support/tf_utils.py
+++ b/integrations/tensorflow/bindings/python/pyiree/tf/support/tf_utils.py
@@ -23,15 +23,12 @@
 import tempfile
 from typing import Any, Callable, Dict, Sequence, Set, Tuple, Type, Union
 
-from absl import flags
 from absl import logging
 import numpy as np
 from pyiree import rt
 from pyiree.tf import compiler
 import tensorflow.compat.v2 as tf
 
-FLAGS = flags.FLAGS
-
 
 def set_random_seed(seed: int = 0) -> None:
   """Set random seed for tf, np and random."""
diff --git a/integrations/tensorflow/e2e/bool_test.py b/integrations/tensorflow/e2e/bool_test.py
index e086cff..dbc792a 100644
--- a/integrations/tensorflow/e2e/bool_test.py
+++ b/integrations/tensorflow/e2e/bool_test.py
@@ -20,7 +20,7 @@
 import tensorflow.compat.v2 as tf
 
 
-class MathModule(tf.Module):
+class BooleanModule(tf.Module):
 
   @tf.function(input_signature=[])
   def constant(self):
@@ -42,7 +42,7 @@
 
   def __init__(self, *args, **kwargs):
     super(BooleanTest, self).__init__(*args, **kwargs)
-    self._modules = tf_test_utils.compile_tf_module(MathModule)
+    self._modules = tf_test_utils.compile_tf_module(BooleanModule)
 
   def test_constant(self):
 
diff --git a/scripts/__init__.py b/scripts/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/scripts/__init__.py
diff --git a/scripts/get_e2e_artifacts.py b/scripts/get_e2e_artifacts.py
new file mode 100755
index 0000000..5c6f6f6
--- /dev/null
+++ b/scripts/get_e2e_artifacts.py
@@ -0,0 +1,162 @@
+#!/usr/bin/env python3
+
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Runs all E2E TensorFlow tests and extracts their benchmarking artifacts.
+
+Example usage:
+  python3 get_e2e_artifacts.py
+"""
+
+import fileinput
+import os
+import re
+import subprocess
+import tempfile
+from typing import Dict, Set
+import zipfile
+
+import utils
+
+from absl import app
+from absl import flags
+
+SUITE_NAME_TO_TARGET = {
+    'e2e_tests':
+        '//integrations/tensorflow/e2e:e2e_tests',
+    'mobile_bert_squad_tests':
+        '//integrations/tensorflow/e2e:mobile_bert_squad_tests',
+    'keras_tests':
+        '//integrations/tensorflow/e2e/keras:keras_tests',
+    'vision_external_tests':
+        '//integrations/tensorflow/e2e/keras:vision_external_tests',
+}
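+# Human-readable list of the suite names above, used in the --test_suites help.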
+SUITES_HELP = [f'`{name}`' for name in SUITE_NAME_TO_TARGET]
+SUITES_HELP = f'{", ".join(SUITES_HELP[:-1])} and {SUITES_HELP[-1]}'
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_bool(
+    'dry_run', False,
+    'Run without extracting files. Useful for quickly checking for artifact '
+    'collisions.')
+flags.DEFINE_string(
+    'artifacts_dir', os.path.join(tempfile.gettempdir(), 'iree', 'modules'),
+    'Directory to transfer the benchmarking artifacts to. Defaults to '
+    '/tmp/iree/modules/')
+flags.DEFINE_bool('run_test_suites', True, 'Run any specified test suites.')
+flags.DEFINE_list('test_suites', list(SUITE_NAME_TO_TARGET.keys()),
+                  f'Any combination of {SUITES_HELP}.')
+
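+# Paths that may legitimately be written by more than one test target.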
+EXPECTED_COLLISIONS = [
+    '/tf_ref/', 'tf_input.mlir', 'iree_input.mlir', '/saved_model/'
+]
+
+
+def _target_to_testlogs_path(target: str) -> str:
+  """Convert target into the path where Bazel stores the artifacts we want."""
+  return os.path.join('bazel-testlogs',
+                      target.replace('//', '').replace(':', os.sep))
+
+
+def _target_to_test_name(target: str, test_suite_path: str) -> str:
+  """Get test_name from `suite_name_test_name__tf__backend_name`."""
+  return target.split('__')[0].replace(f'{test_suite_path}_', '')
+
+
+def get_test_paths_and_names(test_suite_path: str):
+  """Get the paths Bazel stores test outputs in and the matching test names."""
+  targets = utils.get_test_targets(test_suite_path)
+  test_paths = [_target_to_testlogs_path(target) for target in targets]
+  test_names = [
+      _target_to_test_name(target, test_suite_path) for target in targets
+  ]
+  return test_paths, test_names
+
+
+def check_collision(filename: str, test_name: str, written_paths: Set[str],
+                    paths_to_tests: Dict[str, str]):
+  """Check that we aren't overwriting files unless we expect to."""
+  # Note: We can't use a check that the files have identical contents because
+  # tf_input.mlir can have random numbers appended to its function names.
+  # See https://github.com/google/iree/issues/3375
+
+  expected_collision = any([name in filename for name in EXPECTED_COLLISIONS])
+  if filename in written_paths and not expected_collision:
+    raise ValueError(f'Collision found on {filename} between {test_name}.py '
+                     f'and {paths_to_tests[filename]}.py')
+  else:
+    written_paths.add(filename)
+    paths_to_tests[filename] = test_name
+
+
+def update_path(archive_path: str):
+  """Update the --input_file flag with the new location of the compiled.vmfb"""
+  backend_path = archive_path.split('traces')[0]  # 'ModuleName/backend_name'.
+  compiled_path = os.path.join(FLAGS.artifacts_dir, backend_path,
+                               'compiled.vmfb')
+  flagfile_path = os.path.join(FLAGS.artifacts_dir, archive_path)
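+  # With inplace=True, fileinput redirects stdout into the file being
+  # processed, so the print calls below rewrite the flagfile in place.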
+  for line in fileinput.input(files=[flagfile_path], inplace=True):
+    if line.strip().startswith('--input_file'):
+      print(f'--input_file={compiled_path}\n', end='')
+    else:
+      print(line, end='')
+
+
+def extract_artifacts(test_path: str, test_name: str, written_paths: Set[str],
+                      paths_to_tests: Dict[str, str]):
+  """Unzips all of the benchmarking artifacts for a given test and backend."""
+  outputs = os.path.join(test_path, 'test.outputs', 'outputs.zip')
+  archive = zipfile.ZipFile(outputs)
+  # Filter out directory names.
+  filenames = [name for name in archive.namelist() if name[-1] != os.sep]
+
+  for filename in filenames:
+    # Check for collisions.
+    check_collision(filename, test_name, written_paths, paths_to_tests)
+
+    # Extract and update flagfile path.
+    if not FLAGS.dry_run:
+      archive.extract(filename, FLAGS.artifacts_dir)
+      if filename.endswith('flagfile'):
+        update_path(filename)
+
+
+def main(argv):
+  del argv  # Unused.
+
+  # Convert test suite shorthands to full test suite targets.
+  test_suites = [SUITE_NAME_TO_TARGET[suite] for suite in FLAGS.test_suites]
+
+  written_paths = set()
+  paths_to_tests = dict()
+
+  for test_suite in test_suites:
+    if FLAGS.run_test_suites and not FLAGS.dry_run:
+      subprocess.check_call([
+          'bazel', 'test', test_suite, '--color=yes',
+          '--test_arg=--get_saved_model'
+      ])
+      print()
+
+    # Extract all of the artifacts for this test suite.
+    test_paths, test_names = get_test_paths_and_names(test_suite)
+    for i, (test_path, test_name) in enumerate(zip(test_paths, test_names)):
+      print(f'\rTransferring {test_suite} {i + 1}/{len(test_paths)}', end='')
+      extract_artifacts(test_path, test_name, written_paths, paths_to_tests)
+    print('\n')
+
+
+if __name__ == '__main__':
+  app.run(main)
diff --git a/scripts/update_e2e_coverage.py b/scripts/update_e2e_coverage.py
index b048baa..baa8d41 100755
--- a/scripts/update_e2e_coverage.py
+++ b/scripts/update_e2e_coverage.py
@@ -24,6 +24,8 @@
 import re
 import subprocess
 
+import utils
+
 REFERENCE_BACKEND = 'tf'
 # Assumes that tests are expanded for the tf, iree_vmla, iree_llvmjit and
 # iree_vulkan backends.
@@ -87,8 +89,10 @@
   """Parses command-line options."""
   parser = argparse.ArgumentParser(
       description='Generates Markdown files for op coverage table')
-  parser.add_argument(
-      'build_dir', metavar='BUILD_PATH', type=str, help='Base build directory.')
+  parser.add_argument('build_dir',
+                      metavar='BUILD_PATH',
+                      type=str,
+                      help='Base build directory.')
 
   parsed_args = parser.parse_args()
   if not os.path.isdir(parsed_args.build_dir):
@@ -97,38 +101,22 @@
   return parsed_args
 
 
-def create_markdown_table(rows):
-  """Converts a 2D array to a Markdown table."""
-  return '\n'.join([' | '.join(row) for row in rows])
-
-
 def get_name_and_backend(test_string):
   """Splits a pathless test target into its name and comparison backend."""
   name, backend = test_string.split(f'__{REFERENCE_BACKEND}__')
   return name, backend
 
 
-def get_test_targets(test_suite_path):
-  """Returns a list of test targets stripped of paths and suite names."""
-  # Check if the suite exists (which may not be true for failing suites)
-  target_dir = test_suite.split(':')[0]
-  query = ['bazel', 'query', f'{target_dir}/...']
-  targets = subprocess.check_output(query)
-  if test_suite_path not in targets.decode('ascii'):
-    return []
-
-  query = ['bazel', 'query', f'tests({test_suite_path})']
-  tests = subprocess.check_output(query)
-  tests = tests.decode('ascii').split(os.linesep)
-  tests = list(filter(lambda s: s.startswith(f'{test_suite_path}_'), tests))
-  tests = [test.replace(f'{test_suite_path}_', '') for test in tests]
-  return tests
-
-
 def get_suite_metadata(test_suite):
   """Gets all test names, and passing and failing test-backend pairs."""
-  passing = get_test_targets(test_suite)
-  failing = get_test_targets(f'{test_suite}_failing')
+  passing = utils.get_test_targets(test_suite)
+  failing = utils.get_test_targets(f'{test_suite}_failing')
+
+  # Remove bazel path.
+  passing = [test.replace(f'{test_suite}_', '') for test in passing]
+  failing = [test.replace(f'{test_suite}_failing_', '') for test in failing]
+
+  # Split into (test_name, target_backend).
   passing = [get_name_and_backend(test) for test in passing]
   failing = [get_name_and_backend(test) for test in failing]
   passing_names = [test[0] for test in passing]
@@ -178,7 +166,7 @@
         SUCCESS_ELEMENT if backend else FAILURE_ELEMENT for backend in backends
     ])
     rows.append(row)
-  return create_markdown_table(rows)
+  return utils.create_markdown_table(rows)
 
 
 if __name__ == '__main__':
diff --git a/scripts/update_op_coverage.py b/scripts/update_op_coverage.py
index c2c6dd9..3a30a24 100755
--- a/scripts/update_op_coverage.py
+++ b/scripts/update_op_coverage.py
@@ -47,8 +47,10 @@
   """Parses command-line options."""
   parser = argparse.ArgumentParser(
       description='Generates Markdown files for op coverage table')
-  parser.add_argument(
-      'build_dir', metavar='BUILD_PATH', type=str, help='Base build directory.')
+  parser.add_argument('build_dir',
+                      metavar='BUILD_PATH',
+                      type=str,
+                      help='Base build directory.')
 
   parsed_args = parser.parse_args()
   if not os.path.isdir(parsed_args.build_dir):
@@ -87,11 +89,6 @@
   return res
 
 
-def create_markdown_table(rows):
-  """Converts a 2D array to a Markdown table."""
-  return '\n'.join([' | '.join(row) for row in rows])
-
-
 def generate_table(build_dir):
   """Generates an op coverage Markdown table for each backend."""
   backend_ops = get_tested_ops_for_backends(build_dir)
@@ -109,10 +106,10 @@
   for op in all_ops:
     row = [op]
     for backend in backends:
-      row.append(
-          SUCCESS_ELEMENT if (op in backend_ops[backend]) else FAILURE_ELEMENT)
+      row.append(SUCCESS_ELEMENT if (
+          op in backend_ops[backend]) else FAILURE_ELEMENT)
     rows.append(row)
-  return create_markdown_table(rows)
+  return utils.create_markdown_table(rows)
 
 
 if __name__ == '__main__':
diff --git a/scripts/utils.py b/scripts/utils.py
new file mode 100644
index 0000000..3b6f547
--- /dev/null
+++ b/scripts/utils.py
@@ -0,0 +1,75 @@
+# Lint as: python3
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pylint: disable=missing-docstring
+
+import argparse
+import os
+import re
+import subprocess
+from typing import Sequence
+
+BAZEL_FILTERS = [
+    r'Loading: [0-9]+ packages loaded',
+    r'.*Using python binary from PYTHON_BIN = .*'
+]
+
+
+def create_markdown_table(rows: Sequence[Sequence[str]]):
+  """Converts a 2D array to a Markdown table."""
+  return '\n'.join([' | '.join(row) for row in rows])
+
+
+def check_and_get_output(command: Sequence[str],
+                         dry_run: bool = False,
+                         log_stderr: bool = True,
+                         stderr_filters: Sequence[str] = ()):
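+  """Runs `command` and returns its stdout and stderr as lists of lines."""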
+  print(f'Running: `{" ".join(command)}`')
+  if dry_run:
+    return None, None
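+  # Line-buffered text mode so stdout and stderr can be read line by line.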
+  process = subprocess.Popen(command,
+                             bufsize=1,
+                             stderr=subprocess.PIPE,
+                             stdout=subprocess.PIPE,
+                             universal_newlines=True)
+  process.wait()
+  stdout = [line.strip(os.linesep) for line in process.stdout]
+  stderr = [line.strip(os.linesep) for line in process.stderr]
+
+  if log_stderr:
+    for line in stderr:
+      if not any(re.match(pattern, line) for pattern in stderr_filters):
+        print(line)
+
+  if process.returncode != 0:
+    raise subprocess.CalledProcessError(process.returncode, ' '.join(command))
+
+  return stdout, stderr
+
+
+def get_test_targets(test_suite_path: str):
+  """Returns a list of test targets for the given test suite."""
+  # Check if the suite exists (which may not be true for failing suites).
+  # We use two queries here because the return code for a failed query is
+  # unfortunately the same as the return code for a bazel configuration error.
+  target_dir = test_suite_path.split(':')[0]
+  query = ['bazel', 'query', f'{target_dir}/...']
+  targets, _ = check_and_get_output(query, stderr_filters=BAZEL_FILTERS)
+  if test_suite_path not in targets:
+    return []
+
+  query = ['bazel', 'query', f'tests({test_suite_path})']
+  tests, _ = check_and_get_output(query, stderr_filters=BAZEL_FILTERS)
+  return tests