[dvsim] Remove "status" from Deploy items Before this patch, the scheduler code in dvsim had two notions of the status of a job: - Deploy.status (stored in each job) - TargetStatus.counters (a count of passes/fails) This patch removes the first of the two, to more cleanly split "run some stuff and collect the results" (done by the scheduler) from "how do I run this job?" (done by the Deploy object). There are some extra changes we have to make to get this to work. Firstly, we move the SIGINT handling code into Runner.py: this belongs around the code that actually runs something, not at the top of dvsim.py. That means that the "kill some stuff" logic can now be triggered in the main thread via a threading.Event, which avoids lots of code running in the signal handler and is probably a bit safer. Secondly, we have to explicitly return the results of running jobs from Scheduler.run() and thread them through FlowCfg.gen_results(). As it turns out, most of the subclasses of FlowCfg figure out success/failure by loading generated hjson files, so this actually only needs updating in SimCfg.py. Signed-off-by: Rupert Swarbrick <rswarbrick@lowrisc.org>

commit: a2892a50ed57bee2ff96c6aa5cbaec2d11bacfae [log] [tgz]
author: Rupert Swarbrick <rswarbrick@lowrisc.org> Mon Oct 12 08:50:08 2020 +0100
committer: Srikrishna Iyer <46467186+sriyerg@users.noreply.github.com> Mon Jan 25 16:42:49 2021 -0800
tree: c8b635c6f545632ff302441876234487eb39e3ac
parent: 381770d22de727628629322a66d83150c7f762c2 [diff] [blame]
diff --git a/util/dvsim/Deploy.py b/util/dvsim/Deploy.py
index a11e1a2..a513786 100644
--- a/util/dvsim/Deploy.py
+++ b/util/dvsim/Deploy.py

@@ -16,6 +16,11 @@
 from utils import VERBOSE, find_and_substitute_wildcards, run_cmd
 
 
+class DeployError(Exception):
+    def __init__(self, msg):
+        self.msg = msg
+
+
 class Deploy():
     """
     Abstraction for deploying builds and runs.
@@ -66,7 +71,6 @@
         # Process
         self.process = None
         self.log_fd = None
-        self.status = None
 
         # These are mandatory class attributes that need to be extracted and
         # set from the sim_cfg object. These are explicitly used to construct
@@ -268,13 +272,11 @@
                                             stderr=f,
                                             env=exports)
             self.log_fd = f
-            self.status = "D"
             Deploy.dispatch_counter += 1
         except IOError:
-            log.error('IO Error: See %s', self.log)
             if self.log_fd:
                 self.log_fd.close()
-            self.status = "K"
+            raise DeployError('IO Error: See {}'.format(self.log))
 
     def odir_limiter(self, odir, max_odirs=-1):
         '''Function to backup previously run output directory to maintain a
@@ -329,93 +331,120 @@
             log.error("Failed to delete old run directories!")
         return dirs
 
-    def set_status(self):
-        self.status = 'P'
-        if self.dry_run is False:
-            seen_fail_pattern = False
-            for fail_pattern in self.fail_patterns:
-                # Return error message with the following 4 lines.
-                grep_cmd = "grep -m 1 -A 4 -E \'" + fail_pattern + "\' " + self.log
-                (status, rslt) = subprocess.getstatusoutput(grep_cmd)
-                if rslt:
-                    msg = "```\n{}\n```\n".format(rslt)
-                    self.fail_msg += msg
-                    log.log(VERBOSE, msg)
-                    self.status = 'F'
-                    seen_fail_pattern = True
-                    break
+    def _test_passed(self):
+        '''Return True if the job passed, False otherwise
 
-            # If fail patterns were not encountered, but the job returned with non-zero exit code
-            # for whatever reason, then show the last 10 lines of the log as the failure message,
-            # which might help with the debug.
-            if self.process.returncode != 0 and not seen_fail_pattern:
-                msg = "Last 10 lines of the log:<br>\n"
+        This is called by poll() just after the job finishes.
+
+        '''
+        if self.dry_run:
+            return True
+
+        seen_fail_pattern = False
+        for fail_pattern in self.fail_patterns:
+            # Return error message with the following 4 lines.
+            grep_cmd = "grep -m 1 -A 4 -E \'" + fail_pattern + "\' " + self.log
+            (status, rslt) = subprocess.getstatusoutput(grep_cmd)
+            if rslt:
+                msg = "```\n{}\n```\n".format(rslt)
                 self.fail_msg += msg
                 log.log(VERBOSE, msg)
-                get_fail_msg_cmd = "tail -n 10 " + self.log
-                msg = run_cmd(get_fail_msg_cmd)
-                msg = "```\n{}\n```\n".format(msg)
+                seen_fail_pattern = True
+                break
+
+        if seen_fail_pattern:
+            return False
+
+        # If no fail patterns were seen, but the job returned with non-zero
+        # exit code for whatever reason, then show the last 10 lines of the log
+        # as the failure message, which might help with the debug.
+        if self.process.returncode != 0:
+            msg = "Last 10 lines of the log:<br>\n"
+            self.fail_msg += msg
+            log.log(VERBOSE, msg)
+            get_fail_msg_cmd = "tail -n 10 " + self.log
+            msg = run_cmd(get_fail_msg_cmd)
+            msg = "```\n{}\n```\n".format(msg)
+            self.fail_msg += msg
+            log.log(VERBOSE, msg)
+            return False
+
+        # If we get here, we've not seen anything explicitly wrong, but we
+        # might have "pass patterns": patterns that must occur in the log for
+        # the run to be considered successful.
+        for pass_pattern in self.pass_patterns:
+            grep_cmd = "grep -c -m 1 -E \'" + pass_pattern + "\' " + self.log
+            (status, rslt) = subprocess.getstatusoutput(grep_cmd)
+            if rslt == "0":
+                msg = "Pass pattern {!r} not found.<br>\n".format(pass_pattern)
                 self.fail_msg += msg
                 log.log(VERBOSE, msg)
-                self.status = "F"
+                return False
 
-            # Return if status is fail - no need to look for pass patterns.
-            if self.status == 'F':
-                return
+        return True
 
-            # If fail patterns were not found, ensure pass patterns indeed were.
-            for pass_pattern in self.pass_patterns:
-                grep_cmd = "grep -c -m 1 -E \'" + pass_pattern + "\' " + self.log
-                (status, rslt) = subprocess.getstatusoutput(grep_cmd)
-                if rslt == "0":
-                    msg = "Pass pattern \"{}\" not found.<br>\n".format(
-                        pass_pattern)
-                    self.fail_msg += msg
-                    log.log(VERBOSE, msg)
-                    self.status = 'F'
-                    break
+    def _link_odir(self, status):
+        old_link = self.sim_cfg.links['D'] + "/" + self.odir_ln
+        new_link = self.sim_cfg.links[status] + "/" + self.odir_ln
+        cmd = "ln -s " + self.odir + " " + new_link + "; "
+        cmd += "rm " + old_link
+        if os.system(cmd):
+            log.error("Cmd \"%s\" could not be run", cmd)
 
-    def link_odir(self):
-        if self.status == '.':
-            log.error("Method unexpectedly called!")
-        else:
-            old_link = self.sim_cfg.links['D'] + "/" + self.odir_ln
-            new_link = self.sim_cfg.links[self.status] + "/" + self.odir_ln
-            cmd = "ln -s " + self.odir + " " + new_link + "; "
-            cmd += "rm " + old_link
-            if os.system(cmd):
-                log.error("Cmd \"%s\" could not be run", cmd)
+    def _on_finish(self, status):
+        '''Called when the process finishes or is killed'''
+        assert status in ['P', 'F', 'K']
+        if status in ['P', 'F']:
+            self._link_odir(status)
 
-    def get_status(self):
-        if self.status != "D":
-            return
-        if self.process.poll() is not None:
-            self.log_fd.close()
-            self.set_status()
+    def poll(self):
+        '''Check status of the running process
 
-            log.debug("Item %s has completed execution: %s", self.name,
-                      self.status)
-            Deploy.dispatch_counter -= 1
-            self.link_odir()
-            del self.process
+        This returns 'D', 'P' or 'F'. If 'D', the job is still running. If 'P',
+        the job finished successfully. If 'F', the job finished with an error.
+
+        This function must only be called after running self.dispatch_cmd() and
+        must not be called again once it has returned 'P' or 'F'.
+
+        '''
+        assert self.process is not None
+        if self.process.poll() is None:
+            return 'D'
+        self.log_fd.close()
+
+        status = 'P' if self._test_passed() else 'F'
+
+        log.debug("Item %s has completed execution: %s", self.name, status)
+        Deploy.dispatch_counter -= 1
+        self._on_finish(status)
+
+        del self.process
+        self.process = None
+
+        return status
 
     def kill(self):
-        '''Kill running processes.
+        '''Kill the running process.
+
+        This must be called between dispatching and reaping the process (the
+        same window as poll()).
+
         '''
-        if self.status == "D" and self.process.poll() is None:
-            self.kill_remote_job()
+        assert self.process is not None
+        self.kill_remote_job()
 
-            # Try to kill the running process. Send SIGTERM first, wait a bit,
-            # and then send SIGKILL if it didn't work.
-            self.process.terminate()
-            try:
-                self.process.wait(timeout=2)
-            except subprocess.TimeoutExpired:
-                self.process.kill()
+        # Try to kill the running process. Send SIGTERM first, wait a bit,
+        # and then send SIGKILL if it didn't work.
+        self.process.terminate()
+        try:
+            self.process.wait(timeout=2)
+        except subprocess.TimeoutExpired:
+            self.process.kill()
 
-            if self.log_fd:
-                self.log_fd.close()
-            self.status = "K"
+        if self.log_fd:
+            self.log_fd.close()
+        self.process = None
+        self._on_finish('K')
 
     def kill_remote_job(self):
         '''
@@ -630,11 +659,9 @@
         # Set identifier.
         self.identifier = self.sim_cfg.name + ":" + self.run_dir_name
 
-    def get_status(self):
-        '''Override base class get_status implementation for additional post-status
-        actions.'''
-        super().get_status()
-        if self.status not in ["D", "P"]:
+    def _on_finish(self, status):
+        super()._on_finish(status)
+        if status != 'P':
             # Delete the coverage data if available.
             if os.path.exists(self.cov_db_test_dir):
                 log.log(VERBOSE, "Deleting coverage data of failing test:\n%s",
@@ -829,28 +856,31 @@
 
         CovReport.items.append(self)
 
-    def get_status(self):
-        super().get_status()
-        # Once passed, extract the cov results summary from the dashboard.
-        if self.status == "P":
-            results, self.cov_total, ex_msg = get_cov_summary_table(
-                self.cov_report_txt, self.sim_cfg.tool)
+    def _test_passed(self):
+        # Add an extra check to Deploy._test_passed where we extract the
+        # coverage results summary for the dashboard (and fail the job if
+        # something goes wrong).
+        if not super()._test_passed():
+            return False
 
-            if not ex_msg:
-                # Succeeded in obtaining the coverage data.
-                colalign = (("center", ) * len(results[0]))
-                self.cov_results = tabulate(results,
-                                            headers="firstrow",
-                                            tablefmt="pipe",
-                                            colalign=colalign)
-            else:
-                self.fail_msg += ex_msg
-                log.error(ex_msg)
-                self.status = "F"
+        results, self.cov_total, ex_msg = get_cov_summary_table(
+            self.cov_report_txt, self.sim_cfg.tool)
 
-        if self.status == "P":
-            # Delete the cov report - not needed.
-            os.system("rm -rf " + self.log)
+        if ex_msg:
+            self.fail_msg += ex_msg
+            log.error(ex_msg)
+            return False
+
+        # Succeeded in obtaining the coverage data.
+        colalign = (("center", ) * len(results[0]))
+        self.cov_results = tabulate(results,
+                                    headers="firstrow",
+                                    tablefmt="pipe",
+                                    colalign=colalign)
+
+        # Delete the cov report - not needed.
+        os.system("rm -rf " + self.log)
+        return True
 
 
 class CovAnalyze(Deploy):
commit	a2892a50ed57bee2ff96c6aa5cbaec2d11bacfae	[log] [tgz]
author	Rupert Swarbrick <rswarbrick@lowrisc.org>	Mon Oct 12 08:50:08 2020 +0100
committer	Srikrishna Iyer <46467186+sriyerg@users.noreply.github.com>	Mon Jan 25 16:42:49 2021 -0800
tree	c8b635c6f545632ff302441876234487eb39e3ac
parent	381770d22de727628629322a66d83150c7f762c2 [diff] [blame]