|  | # Copyright lowRISC contributors. | 
|  | # Licensed under the Apache License, Version 2.0, see LICENSE for details. | 
|  | # SPDX-License-Identifier: Apache-2.0 | 
|  |  | 
|  | import collections | 
|  | import datetime | 
|  | import logging as log | 
|  | import os | 
|  | import re | 
|  | import sys | 
|  | from pathlib import Path | 
|  |  | 
|  | from utils import VERBOSE, clean_odirs, mk_symlink, rm_path | 
|  |  | 
|  |  | 
class LauncherError(Exception):
    """Raised for errors encountered while launching or managing a job.

    Attributes:
        msg: A human-readable description of the failure.
    """

    def __init__(self, msg):
        # Forward the message to the Exception base class so that str(exc),
        # traceback rendering and pickling carry the failure reason instead
        # of an empty string. The msg attribute is retained for existing
        # callers that read it directly.
        super().__init__(msg)
        self.msg = msg
|  |  | 
|  |  | 
class ErrorMessage(
        collections.namedtuple('ErrorMessage',
                               'line_number message context')):
    """Error-related information for a failed job.

    Failures are classified into buckets: the `message` field is used to
    generate the bucket, while `context` holds a list of lines from the
    failing log that help with quick diagnostics. `line_number` records
    where in the log the error was seen.
    """
|  |  | 
|  |  | 
class Launcher:
    """
    Abstraction for launching and maintaining a job.

    An abstract class that provides methods to prepare a job's environment,
    launch the job, poll for its completion and finally do some cleanup
    activities. This class is not meant to be instantiated directly. Each
    launcher object holds an instance of the deploy object.
    """

    # Type of launcher used, as a string (set by each subclass).
    variant = None

    # Max jobs running at one time.
    max_parallel = sys.maxsize

    # Max jobs polled at one time.
    max_poll = 10000

    # Poll job's completion status every this many seconds.
    poll_freq = 1

    # Points to the python virtual env area (resolved by set_pyvenv()).
    pyvenv = None

    # If a history of previous invocations is to be maintained, then keep no
    # more than this many directories.
    max_odirs = 5

    # Flags indicating the one-time workspace preparation steps are complete:
    # globally, and per flow configuration, respectively.
    workspace_prepared = False
    workspace_prepared_for_cfg = set()

    # Jobs that are not run when one of their dependent jobs fail are
    # considered killed. All non-passing jobs are required to have an
    # associated fail_msg attribute (an object of class ErrorMessage)
    # reflecting the appropriate message. This class attribute thus serves
    # as a catch-all for all those jobs that are not even run. If a job
    # instance runs and fails, the fail_msg attribute is overridden by
    # the instance with the correct message in _post_finish().
    fail_msg = ErrorMessage(
        line_number=None,
        message="Job killed most likely because its dependent job failed.",
        context=[])

    @staticmethod
    def set_pyvenv(project):
        '''Activate a python virtualenv if available.

        The env variable <PROJECT>_PYVENV_<VARIANT> (falling back to
        <PROJECT>_PYVENV), if set, points to the path containing the python
        virtualenv created specifically for this project. We can activate it
        if needed, before launching jobs using external compute machines.

        This is not applicable when running jobs locally on the user's machine.
        '''

        # Already resolved on a previous call - nothing to do.
        if Launcher.pyvenv is not None:
            return

        # If project-specific python virtualenv path is set, then activate it
        # before running downstream tools. This is more relevant when not
        # launching locally, but on external machines in a compute farm, which
        # may not have access to the default python installation area on the
        # host machine.
        #
        # The code below allows each launcher variant to set its own virtualenv
        # because the loading / activating mechanism could be different between
        # them.
        Launcher.pyvenv = os.environ.get("{}_PYVENV_{}".format(
            project.upper(), Launcher.variant.upper()))

        # Fall back to the variant-agnostic env variable.
        if not Launcher.pyvenv:
            Launcher.pyvenv = os.environ.get("{}_PYVENV".format(
                project.upper()))

    @staticmethod
    def prepare_workspace(project, repo_top, args):
        '''Prepare the workspace based on the chosen launcher's needs.

        This is done once for the entire duration of the flow run. The base
        implementation is a no-op; subclasses override it as needed.
        'project' is the name of the project.
        'repo_top' is the path to the repository.
        'args' are the command line args passed to dvsim.
        '''
        pass

    @staticmethod
    def prepare_workspace_for_cfg(cfg):
        '''Prepare the workspace for a cfg.

        This is invoked once for each cfg. The base implementation is a
        no-op; subclasses override it as needed.
        'cfg' is the flow configuration object.
        '''
        pass

    def __str__(self):
        # The job's fully-qualified name, tagged so log lines from the
        # launcher are distinguishable from those of the deploy object.
        return self.deploy.full_name + ":launcher"

    def __init__(self, deploy):
        cfg = deploy.sim_cfg

        # One-time preparation of the workspace.
        if not Launcher.workspace_prepared:
            self.prepare_workspace(cfg.project, cfg.proj_root, cfg.args)
            Launcher.workspace_prepared = True

        # One-time preparation of the workspace, specific to the cfg.
        if cfg not in Launcher.workspace_prepared_for_cfg:
            self.prepare_workspace_for_cfg(cfg)
            Launcher.workspace_prepared_for_cfg.add(cfg)

        # Store the deploy object handle.
        self.deploy = deploy

        # Status of the job. This is primarily determined by the
        # _check_status() method, but eventually updated by the _post_finish()
        # method, in case any of the cleanup tasks fails. This value is finally
        # returned to the Scheduler by the poll() method.
        self.status = None

        # Return status of the process running the job.
        self.exit_code = None

        # Flag to indicate whether to 'overwrite' if odir already exists,
        # or to backup the existing one and create a new one.
        # For builds, we want to overwrite existing to leverage the tools'
        # incremental / partition compile features. For runs, we may want to
        # create a new one.
        self.renew_odir = False

        # The actual job runtime computed by dvsim, in seconds.
        self.job_runtime_secs = 0

    def _make_odir(self):
        """Create the output directory."""

        # If the renew_odir flag is True, retire the existing directory first
        # (clean_odirs keeps no more than max_odirs previous copies).
        if self.renew_odir:
            clean_odirs(odir=self.deploy.odir, max_odirs=self.max_odirs)
        os.makedirs(self.deploy.odir, exist_ok=True)

    def _link_odir(self, status):
        """Soft-links the job's directory based on job's status.

        The dispatched, passed and failed directories in the scratch area
        provide a quick way to get to the job that was executed.
        """

        dest = Path(self.deploy.sim_cfg.links[status], self.deploy.qual_name)
        mk_symlink(self.deploy.odir, dest)

        # Delete the symlink from the dispatched ('D') directory if the job
        # has moved on to a terminal status.
        if status != "D":
            old = Path(self.deploy.sim_cfg.links['D'], self.deploy.qual_name)
            rm_path(old)

    def _dump_env_vars(self, exports):
        """Write env vars to a file for ease of debug.

        Each extended class computes the list of exports and invokes this
        method right before launching the job.
        """

        # surrogateescape tolerates env var values that are not valid UTF-8.
        with open(self.deploy.odir + "/env_vars",
                  "w",
                  encoding="UTF-8",
                  errors="surrogateescape") as f:
            for var in sorted(exports.keys()):
                f.write("{}={}\n".format(var, exports[var]))

    def _pre_launch(self):
        """Do pre-launch activities.

        Examples include such as preparing the job's environment, clearing
        old runs, creating the output directory, dumping all env variables
        etc. This method is already invoked by launch() as the first step.
        """

        self.deploy.pre_launch()
        self._make_odir()
        # Timestamp the launch; used by subclasses to compute job runtime.
        self.start_time = datetime.datetime.now()

    def _do_launch(self):
        """Launch the job (launcher-specific; must be overridden)."""

        raise NotImplementedError()

    def launch(self):
        """Launch the job: run the pre-launch steps, then dispatch it."""

        self._pre_launch()
        self._do_launch()

    def poll(self):
        """Poll the launched job for completion.

        Invokes _check_status() and _post_finish() when the job completes.
        Must be overridden by subclasses.
        """

        raise NotImplementedError()

    def kill(self):
        """Terminate the job (launcher-specific; must be overridden)."""

        raise NotImplementedError()

    def _check_status(self):
        """Determine the outcome of the job (P/F if it ran to completion).

        Returns (status, err_msg) extracted from the log, where the status is
        "P" if it passed, "F" otherwise. This is invoked by poll() just
        after the job finishes. err_msg is an instance of the named tuple
        ErrorMessage.
        """

        def _find_patterns(patterns, line):
            """Helper function that returns the pattern if any of the given
            patterns is found, else None."""

            assert patterns
            for pattern in patterns:
                match = re.search(r"{}".format(pattern), line)
                if match:
                    return pattern
            return None

        # Dry runs never parse a log; treat them as passing.
        if self.deploy.dry_run:
            return "P", None

        # Only one fail pattern needs to be seen.
        chk_failed = bool(self.deploy.fail_patterns)

        # All pass patterns need to be seen, so we replicate the list and remove
        # patterns as we encounter them.
        pass_patterns = self.deploy.pass_patterns.copy()
        chk_passed = bool(pass_patterns) and (self.exit_code == 0)

        try:
            # surrogateescape tolerates tool logs that are not valid UTF-8.
            with open(self.deploy.get_log_path(),
                      "r",
                      encoding="UTF-8",
                      errors="surrogateescape") as f:
                lines = f.readlines()
        except OSError as e:
            # An unreadable / missing log is itself a failure.
            return "F", ErrorMessage(
                line_number=None,
                message="Error opening file {}:\n{}".format(
                    self.deploy.get_log_path(), e),
                context=[],
            )

        # Since the log file is already opened and read to assess the job's
        # status, use this opportunity to also extract other pieces of
        # information.
        self.deploy.extract_info_from_log(lines)

        if chk_failed or chk_passed:
            for cnt, line in enumerate(lines):
                if chk_failed:
                    if _find_patterns(self.deploy.fail_patterns, line):
                        # If failed, then nothing else to do. Just return.
                        # Provide some extra lines for context.
                        return "F", ErrorMessage(line_number=cnt + 1,
                                                 message=line.strip(),
                                                 context=lines[cnt:cnt + 5])

                if chk_passed:
                    pattern = _find_patterns(pass_patterns, line)
                    if pattern:
                        pass_patterns.remove(pattern)
                        # Stop checking once every pass pattern has been seen.
                        chk_passed = bool(pass_patterns)

        # If no fail patterns were seen, but the job returned with non-zero
        # exit code for whatever reason, then show the last 10 lines of the log
        # as the failure message, which might help with the debug.
        if self.exit_code != 0:
            return "F", ErrorMessage(line_number=None,
                                     message="Job returned non-zero exit code",
                                     context=lines[-10:])
        # chk_passed still being True here means some pass patterns were
        # never matched in the log.
        if chk_passed:
            return "F", ErrorMessage(
                line_number=None,
                message=f"Some pass patterns missing: {pass_patterns}",
                context=lines[-10:],
            )
        return "P", None

    def _post_finish(self, status, err_msg):
        """Do post-completion activities, such as preparing the results.

        Must be invoked by poll(), after the job outcome is determined.

        status is the status of the job, either 'P', 'F' or 'K'.
        err_msg is an instance of the named tuple ErrorMessage.
        """

        assert status in ['P', 'F', 'K']
        self._link_odir(status)
        log.debug("Item %s has completed execution: %s", self, status)

        try:
            # Run the target-specific cleanup tasks regardless of the job's
            # outcome.
            self.deploy.post_finish(status)
        except Exception as e:
            # If the job had already failed, then don't do anything. If its
            # cleanup task failed, then mark the job as failed.
            if status == "P":
                status = "F"
                err_msg = ErrorMessage(line_number=None,
                                       message=f"{e}",
                                       context=[f"{e}"])

        self.status = status
        if self.status != "P":
            # Non-passing jobs must carry an ErrorMessage for bucketing.
            assert err_msg and isinstance(err_msg, ErrorMessage)
            self.fail_msg = err_msg
            log.log(VERBOSE, err_msg.message)