Blame - util/dvsim/LsfLauncher.py - 3p/lowrisc/opentitan

2021-03-02 00:15:51 -0800

[diff] [blame]

1

# Copyright lowRISC contributors.

2

# Licensed under the Apache License, Version 2.0, see LICENSE for details.

3

# SPDX-License-Identifier: Apache-2.0

4

5

import logging as log

6

import os

7

import re

Srikrishna Iyer

2021-03-02 00:15:51 -0800

[diff] [blame]

8

import subprocess

9

import tarfile

10

from pathlib import Path

11

Guillermo Maturana

2021-04-08 14:04:15 -0700

[diff] [blame]

12

from Launcher import ErrorMessage, Launcher, LauncherError

Srikrishna Iyer

2021-03-02 00:15:51 -0800

[diff] [blame]

13

from utils import VERBOSE, clean_odirs

14

15

16

class LsfLauncher(Launcher):

17

18

# A hidden directory specific to a cfg, where we put individual 'job'

# scripts.

jobs_dir = {}

# All launcher instances available for lookup.

23

jobs = {}

24

25

# When the job completes, we try to read the job script output to determine

26

# the outcome. It may not have been completely written the first time we

27

# read it so we retry on the next poll, no more than 10 times.

28

max_poll_retries = 10

29

30

# TODO: Add support for build/run/cov job specific resource requirements:

31

# cpu, mem, disk, stack.

32

# TODO: Allow site-specific job resource usage setting using

33

# `DVSIM_LSF_CFG` environment variable.

34

Srikrishna Iyer

2021-03-02 00:15:51 -0800

[diff] [blame]

35

@staticmethod
def prepare_workspace(project, repo_top, args):
    """Make the project's python virtualenv usable by remotely run jobs.

    Calls Launcher.set_pyvenv(project) and then inspects Launcher.pyvenv:

    - None: there is nothing to prepare.
    - An existing directory: it is used as is.
    - Anything else: it is opened as a tarball and extracted into
      args.scratch_root, unless the expected directory is already there.
      Launcher.pyvenv is then re-pointed at that directory.

    Args:
        project: Passed through to Launcher.set_pyvenv().
        repo_top: Not used by this launcher.
        args: Object providing the `scratch_root` attribute.
    """
    # Since we dispatch to remote machines, a project specific python
    # virtualenv, if it exists, needs to be activated when launching the job.
    Launcher.set_pyvenv(project)
    if Launcher.pyvenv is None:
        return

    # If it is already a dir, then nothing to be done.
    if os.path.isdir(Launcher.pyvenv):
        return

    # If not, then it needs to be a valid tarball. Extract it in the
    # scratch area if it does not exist.
    stem = Path(Launcher.pyvenv).stem

    # Path.stem only removes the final suffix ('venv.tar.gz' -> 'venv.tar'),
    # so drop a remaining '.tar' as well. Match on the full '.tar' suffix:
    # a name that merely ends in the letters 'tar' (e.g. 'mytar.tgz') must
    # not be truncated.
    if stem.endswith(".tar"):
        stem = stem[:-len(".tar")]

    # NOTE(review): this assumes the tarball unpacks into a single top-level
    # directory named after the tarball's stem - confirm.
    path = Path(args.scratch_root, stem)
    if not path.is_dir():
        log.info("[prepare_workspace]: [pyvenv]: Extracting %s",
                 Launcher.pyvenv)
        with tarfile.open(Launcher.pyvenv, mode='r') as tar:
            tar.extractall(args.scratch_root)
        log.info("[prepare_workspace]: [pyvenv]: Done: %s", path)
    Launcher.pyvenv = path

Srikrishna Iyer

2021-03-02 00:15:51 -0800

[diff] [blame]

60

61

@staticmethod
def prepare_workspace_for_cfg(cfg):
    """Create the directory that holds the LSF job scripts of `cfg`.

    The directory is <cfg.scratch_path>/lsf/<cfg.timestamp>; it is recorded
    in LsfLauncher.jobs_dir[cfg] for later lookup.
    """
    job_dir = Path(cfg.scratch_path, "lsf", cfg.timestamp)
    LsfLauncher.jobs_dir[cfg] = job_dir

    # clean_odirs() comes from utils; presumably it prunes older sibling
    # directories down to max_odirs - verify against its definition.
    clean_odirs(odir=job_dir, max_odirs=2)
    os.makedirs(job_dir, exist_ok=True)

68

69

@staticmethod
def make_job_script(cfg, job_name):
    """Write the bash script shared by all jobs of one job array.

    This is called once every job in the array has been launched. The
    generated script takes the job index as its only argument; the bsub
    command passes '$LSB_JOBINDEX' for it, which bsub sets to the actual
    index of the array element being run. The script body is one big
    `case` statement that maps each index to that job's command. A single
    script is preferred over one script per job, since the latter incurs
    additional file I/O overhead when the scratch area is on NFS.

    Returns the path to the job script.

    Raises LauncherError if the script cannot be written; in that case all
    jobs of the array are first passed to _post_finish_job_array().
    """
    script = ["#!/usr/bin/env bash\nset -e\n"]

    # Activate the python virtualenv if it exists.
    if Launcher.pyvenv:
        script.append("source {}/bin/activate\n".format(Launcher.pyvenv))

    script.append("case $1 in\n")
    for job in LsfLauncher.jobs[cfg][job_name]:
        # Each job's stdout and stderr go to its own log file.
        redirected = "{} > {} 2>&1".format(job.deploy.cmd,
                                           job.deploy.get_log_path())
        script.append(" {})\n".format(job.index))
        script.append(" {};;\n".format(redirected))

    # Sanity check: an index that matches no job is an error.
    script.extend([
        " *)\n",
        " echo \"ERROR: Illegal job index: $1\" 1>&2; exit 1;;\n",
        "esac\n",
    ])

    if Launcher.pyvenv:
        script.append("deactivate\n")

    job_script = Path(LsfLauncher.jobs_dir[cfg], job_name)
    try:
        with open(job_script, "w", encoding='utf-8') as script_file:
            script_file.write("".join(script))
    except OSError as e:
        err_msg = "ERROR: Failed to write {}:\n{}".format(job_script, e)
        LsfLauncher._post_finish_job_array(cfg, job_name, err_msg)
        raise LauncherError(err_msg)

    log.log(VERBOSE, "[job_script]: %s", job_script)
    return job_script

118

119

def __init__(self, deploy):
    """Initialize the polling state and register self in its job array."""
    super().__init__(deploy)

    # Path to this job's bsub output file; kept for polling and cleanup.
    self.bsub_out = None

    # Once the bsub output file has been opened, the handle is kept while
    # the outcome is still undetermined, rather than reopening the file and
    # starting all over again on the next poll.
    self.bsub_out_fd = None
    self.bsub_out_err_msg = []
    self.bsub_out_err_msg_found = False

    # LSF job id; stays None until the job array has been submitted.
    self.job_id = None

    # Number of polls that could not determine the outcome yet.
    self.num_poll_retries = 0

    # Register self under (sim_cfg, job_name), creating the containers on
    # first use.
    array_jobs = LsfLauncher.jobs.setdefault(deploy.sim_cfg, {}).setdefault(
        deploy.job_name, [])
    array_jobs.append(self)

    # Job's index in the array (1-based: length of the list after append).
    self.index = len(array_jobs)

146

147

def _do_launch(self):
    """Submit the job array to LSF once its last job has been dispatched.

    Jobs sharing a (sim_cfg, job_name) pair form one LSF job array. Calls
    for all but the last job of the array return immediately; the call for
    the last job writes the job script, runs `bsub` and records the
    resulting job id and bsub output path on every job of the array.

    Raises:
        LauncherError: If the job script cannot be written (raised by
            make_job_script), if bsub fails or does not return within the
            timeout, or if no job id can be found in bsub's output. In the
            latter three cases all jobs of the array are first passed to
            _post_finish_job_array().
    """
    job_name = self.deploy.job_name
    cfg = self.deploy.sim_cfg
    job_total = len(LsfLauncher.jobs[cfg][job_name])

    # The actual launching of the bsub command cannot happen until the
    # Scheduler has dispatched ALL jobs in the array.
    if self.index < job_total:
        return

    job_script = self.make_job_script(cfg, job_name)

    # Update the shell's env vars with self.exports. Values in exports must
    # replace the values in the shell's env vars if the keys match.
    exports = os.environ.copy()
    if self.deploy.exports:
        exports.update(self.deploy.exports)

    # Clear the magic MAKEFLAGS variable from exports if necessary. This
    # variable is used by recursive Make calls to pass variables from one
    # level to the next. Here, self.cmd is a call to Make but it's
    # logically a top-level invocation: we don't want to pollute the flow's
    # Makefile with Make variables from any wrapper that called dvsim.
    exports.pop('MAKEFLAGS', None)

    self._dump_env_vars(exports)

    # TODO: Arbitrarily set the max slot-limit to 100.
    job_array = "{}[1-{}]".format(job_name, job_total)
    if job_total > 100:
        job_array += "%100"

    # TODO: This needs to be moved to a HJson.
    # NOTE(review): the command below is run without a shell, so the single
    # quotes embedded in these strings reach bsub literally - confirm that
    # this is what bsub expects.
    if self.deploy.sim_cfg.tool == "vcs":
        job_rusage = "\'rusage[vcssim=1,vcssim_dynamic=1:duration=1]\'"
    elif self.deploy.sim_cfg.tool == "xcelium":
        job_rusage = "\'rusage[xcelium=1,xcelium_dynamic=1:duration=1]\'"
    else:
        job_rusage = None

    # Launch the job array. Both stdout (-oo) and stderr (-eo) of each
    # array element go to the same per-index file.
    bsub_out_pattern = "{}.%I.out".format(job_script)
    cmd = [
        "bsub",
        # TODO: LSF project name could be site specific!
        "-P",
        cfg.project,
        "-J",
        job_array,
        "-oo",
        bsub_out_pattern,
        "-eo",
        bsub_out_pattern,
    ]

    if job_rusage:
        cmd += ["-R", job_rusage]

    cmd += ["/usr/bin/bash {} $LSB_JOBINDEX".format(job_script)]

    try:
        p = subprocess.run(cmd,
                           check=True,
                           timeout=60,
                           stdout=subprocess.PIPE,
                           stderr=subprocess.PIPE,
                           env=exports)
    except subprocess.CalledProcessError as e:
        # Need to mark all jobs in this range with this fail pattern.
        err_msg = e.stderr.decode("utf-8").strip()
        self._post_finish_job_array(cfg, job_name, err_msg)
        raise LauncherError(err_msg) from e
    except subprocess.TimeoutExpired as e:
        # bsub did not return in time: without this, the jobs of the array
        # would never be marked as finished.
        err_msg = "ERROR: bsub timed out after {} seconds.".format(e.timeout)
        self._post_finish_job_array(cfg, job_name, err_msg)
        raise LauncherError(err_msg) from e

    # Extract the job ID, i.e. the text inside the first 'Job <...>'.
    result = p.stdout.decode("utf-8").strip()
    m = re.search(r"Job <([^>]*)>", result)
    job_id = m.group(1) if m else ""
    if not job_id:
        err_msg = "Job ID not found!"
        self._post_finish_job_array(cfg, job_name, err_msg)
        raise LauncherError(err_msg)

    for job in LsfLauncher.jobs[cfg][job_name]:
        job.bsub_out = Path("{}.{}.out".format(job_script, job.index))
        job.job_id = "{}[{}]".format(job_id, job.index)
        job._link_odir("D")

234

Srikrishna Iyer

2021-03-02 00:15:51 -0800

[diff] [blame]

235

def poll(self):
    """Determine the status of the job from its bsub output file.

    Returns:
        self.status if it has already been determined. Otherwise 'D' while
        the outcome is not known yet, 'F' if the bsub output file cannot
        be opened or the exit code is still not found after
        max_poll_retries polls, else the status returned by
        self._check_status().
    """
    # It is possible we may have determined the status already.
    if self.status:
        return self.status

    if not self.bsub_out_fd:
        # If job id is not set, the bsub command has not been sent yet.
        if not self.job_id:
            return 'D'

        # We redirect the job's output to the log file, so the job script
        # output remains empty until the point it finishes. This is a very
        # quick way to check if the job has completed. If nothing has been
        # written to the job script output yet (or if it is not yet
        # created), then the job is still running.
        try:
            if not self.bsub_out.stat().st_size:
                return "D"
        except FileNotFoundError:
            return "D"

        # If we got to this point, we can now open the job script output
        # file for reading.
        try:
            self.bsub_out_fd = open(self.bsub_out, "r")
        except IOError as e:
            self._post_finish(
                "F",
                ErrorMessage(
                    line_number=None,
                    message="ERROR: Failed to open {}\n{}.".format(
                        self.bsub_out, e),
                    context=[]))
            return "F"

    # Now that the job has completed, we need to determine its status.
    #
    # If the job successfully launched and it failed, the failure message
    # will appear in its log file (because of the stderr redirection).
    # But, in some cases, if there is something wrong with the command
    # itself, bsub might return immediately with an error message, which
    # will appear in the job script output file. We want to retrieve that
    # so that we can report the status accurately.
    #
    # At this point, we could run bjobs or bhist to determine the status,
    # but it has been found to be too slow, especially when running 1000s
    # of jobs. Plus, we have to read the job script output anyway to look
    # for those error messages.
    #
    # So we just read this file to determine both the status and the error
    # message, rather than running bjobs or bhist. But there is one more
    # complication to deal with - if we read the file now, it is possible
    # that it may not have been fully updated. We will try reading it
    # anyway. If we are unable to find what we are looking for, then we
    # will resume reading it again at the next poll. We will do this up to
    # max_poll_retries times before giving up and flagging an error.
    #
    # TODO: Consider using the IBM Platform LSF Python APIs instead.
    #       (deferred due to shortage of time / resources).
    # TODO: Parse job telemetry data for performance insights.
    exit_code = self._get_job_exit_code()
    if exit_code is not None:
        self.exit_code = exit_code
        status, err_msg = self._check_status()
        # Prioritize error messages from bsub over the job's log file.
        if self.bsub_out_err_msg:
            err_msg = ErrorMessage(line_number=None,
                                   message=self.bsub_out_err_msg,
                                   context=[])
        self._post_finish(status, err_msg)
        return status

    self.num_poll_retries += 1
    # Fail the test if we have reached the max polling retries.
    if self.num_poll_retries == LsfLauncher.max_poll_retries:
        # Wrapped in ErrorMessage, like every other failure reported by
        # this method.
        self._post_finish(
            "F",
            ErrorMessage(line_number=None,
                         message="ERROR: Reached max retries while "
                         "reading job script output {} to determine"
                         " the outcome.".format(self.bsub_out),
                         context=[]))
        return "F"

    return "D"

319

320

def _get_job_exit_code(self):

321

'''Read the job script output to retrieve the exit code.

322

323

Also read the error message if any, which will appear at the beginning

324

of the log file followed by bsub's standard 'email' format output. It

325

looks something like this:

326

327

328

------------------------------------------------------------

329

Sender: LSF System <...>

Subject: ...

...

Successfully completed.

334

<OR>

335

Exited with exit code 1.

...

The extracted stderr messages are logged to self.fail_msg. The line

340

indicating whether it was successful or it failed with an exit code

341

is used to return the exit code.

342

343

Returns the exit code if found, else None.

344

'''

345

346

# Job script output must have been opened already.

Srikrishna Iyer

2021-03-17 00:18:26 -0700

[diff] [blame]

347

assert self.bsub_out_fd

Srikrishna Iyer

2021-03-02 00:15:51 -0800

[diff] [blame]

348

Srikrishna Iyer

2021-03-17 00:18:26 -0700

[diff] [blame]

349

for line in self.bsub_out_fd:

350

if not self.bsub_out_err_msg_found:

Srikrishna Iyer

2021-03-02 00:15:51 -0800

[diff] [blame]

351

m = re.match("^Sender", line)

352

if m:

Srikrishna Iyer

2021-03-17 00:18:26 -0700

[diff] [blame]

353

# Pop the line before the sender line.

354

self.bsub_out_err_msg = "\n".join(

355

self.bsub_out_err_msg[:-1])

356

self.bsub_out_err_msg_found = True

Srikrishna Iyer

2021-03-02 00:15:51 -0800

[diff] [blame]

357

else:

Srikrishna Iyer

2021-03-17 00:18:26 -0700

[diff] [blame]

358

self.bsub_out_err_msg.append(line.strip())

Srikrishna Iyer

2021-03-02 00:15:51 -0800

[diff] [blame]

359

360

else:

361

m = re.match(r"^Exited with exit code (\d+).\n$", line)

362

if m:

Srikrishna Iyer

2021-03-02 00:15:51 -0800

[diff] [blame]

363

return m.group(1)

364

Srikrishna Iyer

2021-03-17 00:18:26 -0700

[diff] [blame]

365

if not self.bsub_out_err_msg:

Srikrishna Iyer

2021-03-02 00:15:51 -0800

[diff] [blame]

366

m = re.match(r"^Successfully completed.\n$", line)

if m:

return 0

return None

Srikrishna Iyer

2021-03-02 00:15:51 -0800

[diff] [blame]

def kill(self):
    """Ask LSF to terminate the job and mark it as killed.

    If the job has an id, `bkill -s SIGTERM <job_id>` is run; a failure of
    that command is logged, not raised. If the job has no id, that is
    logged instead. In all cases the job is finished with status 'K'.
    """
    if self.job_id:
        try:
            subprocess.run(["bkill", "-s", "SIGTERM", self.job_id],
                           check=True,
                           stdout=subprocess.PIPE,
                           stderr=subprocess.PIPE)
        except subprocess.CalledProcessError as e:
            log.error("Failed to kill job: %s",
                      e.stderr.decode("utf-8").strip())
    else:
        log.error("Job ID for %s not found", self.deploy.full_name)

    # Wrapped in ErrorMessage, consistent with how poll() reports failures.
    self._post_finish(
        'K',
        ErrorMessage(line_number=None, message="Job killed!", context=[]))

385

386

def _post_finish(self, status, err_msg):
    """Release the bsub output handle, settle exit_code, then defer to the
    parent class.

    If no exit code was recorded, it is set to 0 for status 'P' and to 1
    for any other status.
    """
    out_fd = self.bsub_out_fd
    if out_fd:
        out_fd.close()

    if self.exit_code is None:
        if status == 'P':
            self.exit_code = 0
        else:
            self.exit_code = 1

    super()._post_finish(status, err_msg)

392

393

@staticmethod

394

def _post_finish_job_array(cfg, job_name, err_msg):

395

'''On LSF error, mark all jobs in this array as killed.

396

397

err_msg is the error message indicating the cause of failure.'''

398

399

for job in LsfLauncher.jobs[cfg][job_name]:

Guillermo Maturana