slurm_batch.py @ 35d2300

Last change on this file since 35d2300 was 1d9998c, checked in by Paul Kienzle <pkienzle@…>, 6 years ago
update docs for the slurm batch program
Property mode set to `100755`
File size: 8.0 KB

Rev	Line
[58a34f1]	1	#!/usr/bin/env python
	2	"""
	3	Submit a batch fit job to the slurm cluster.
	4
	5	Given a model.py file defining a Bumps problem defined on a single data
	6	file, with the data file specified as a command line argument, run the
	7	bumps fit as a batch over a set of different datafiles independently.
[1d9998c]	8	An example model is given in model_ellipsoid_hayter_msa.py, which fits
	9	the data in 09319*.dat.
[58a34f1]	10
[1d9998c]	11	To run the fit, use::
[58a34f1]	12
	13	slurm_batch.py [--slurm_opts] model.py *.dat --store=T1 [--bumps_opt ...]
	14
[1d9998c]	15	For example::
	16
	17	slurm_batch.py model_ellipsoid_hayter_msa.py 09319*.dat --store=T1
	18
[58a34f1]	19	This creates the T1 subdirectory to hold the fit results and
	20	prints the real command that is submitted, as well as the job id.
	21
	22	The store directory T1 contains a copy of the model file and
	23	all the data files. The fit results for each file will be
	24	in T1/##/*. The file T1/files.dat contains the list
	25	of "subdirectory filename" pairs indicating which ## directory
[1d9998c]	26	contains the results for which file. Check for errors using::
[58a34f1]	27
	28	cat T1/slurm*_1.out
	29
[1d9998c]	30	The following slurm options are used::
[58a34f1]	31
	32	--array=1-#files batch size comes from the file list
	33	--gres=gpu:1 request a gpu for each fit
	34	--job-name=model.py use model file name for job name
	35	--output=... log into T1/slurm-job_##.out
	36	--chdir=... run fit from store directory
[1d9998c]	37	--time=2 time as number of hours (can override)
[58a34f1]	38
	39	To receive an email on job completion or failure, add the following
	40	slurm options before the model file::
	41
	42	--mail-type=END,FAIL --mail-user=user@mail.domain
	43
[1d9998c]	44	Bumps options are described at bumps.readthedocs.org, with the
	45	following set automatically::
	46
	47	--batch run in batch mode, without output to .mon
	48	--view=log SAS fits want log plots
	49	--time=2-0.1 slurm time minus 6 minutes for cleanup
	50
	51	The --store and --resume options indicate the parent directory for
	52	the output. These are modified to store the results in a separate
	53	subdirectory for each file. Keep in mind that the fit is run from
	54	the store directory, so any files or modules referenced from the
	55	model file will need to use a full path to the original location.
	56
[58a34f1]	57	After submitting the job a job id will be printed to the console.
	58	You can check the status of the job using the usual slurm commands
	59	such as::
	60
	61	squeue
	62
	63	or cancel the job using::
	64
	65	scancel jobid
	66
	67	The slurm_batch program runs directly from the source tree for sasmodels,
	68	and requires sasview, bumps and periodictable as sister directories
	69	accessible on the worker nodes. You can link it into your bin directory
	70	using::
	71
	72	mkdir ~/bin
	73	ln -s path/to/slurm_batch.py ~/bin
	74
	75	or if you are a cluster administrator, into /usr/local/bin.
	76	"""
	77
	78	# If called from command line, this submits a job to the slurm queue, with _this_ file
	79	# as the batch script. Before calling it on the worker node, slurm sets the
	80	# SLURM_ARRAY_TASK_ID to the current task so we can tell that we are running
	81	# as a worker and which file we should be working on.
	82
	83	## SBATCH options as comments do not seem to work. Maybe they need to be before
	84	## the doc string? For now they are hardcoded in the sbatch call in submit_job.
	85
	86	import sys
	87	import os
	88	import tempfile
	89	import shutil
	90
	91	DEFAULT_TIME_LIMIT = 2
	92
	93	def split_args():
	94	slurm_opts = []
	95	bumps_opts = []
	96	model_file = None
	97	store = None
	98	resume = None
	99	data_files = []
	100	time_limit = DEFAULT_TIME_LIMIT
	101
	102	# start with '-' arguments as slurm opts, then after
	103	# the model file any '-' arguments are bumps opts.
	104	opts = slurm_opts
	105	for v in sys.argv[1:]:
	106	if v.startswith('--store='):
	107	store = os.path.realpath(os.path.abspath(v[8:]))
	108	elif v.startswith('--resume='):
	109	resume = os.path.realpath(os.path.abspath(v[9:]))
	110	elif v.startswith('--time='):
	111	time_limit = float(v[7:])
	112	elif v[0] == '-':
	113	opts.append(v)
	114	elif model_file is None:
	115	model_file = v
	116	opts = bumps_opts
	117	else:
	118	data_files.append(v)
	119
	120
	121	s = time_limit*3600
	122	slurm_opts.append("--time=%d:%02d:%02d"%(s//3600, (s%3600)//60, s%60))
	123	bumps_opts.append('--time=%f'%(time_limit - 0.1)) # 6 min to stop cleanly
	124
	125	return {
	126	'slurm': slurm_opts,
	127	'model_file': model_file,
	128	'data_files': data_files,
	129	'store': store,
	130	'resume': resume,
	131	'bumps': bumps_opts,
	132	}
	133
	134	def dirn(path, n):
	135	path = os.path.realpath(os.path.abspath(path))
	136	for _ in range(n):
	137	path = os.path.dirname(path)
	138	return path
	139
	140	def submit_job():
	141	# sbatch --array=1-5 ./slurm_batch.py model_ellipsoid_hayter_msa.py 09*.dat --store=T1 --fit=dream
	142	opts = split_args()
	143	store = opts['store']
	144	model_file = opts['model_file']
	145	data_files = opts['data_files']
	146	bumps_opts = opts['bumps']
	147	slurm_opts = opts['slurm']
	148
	149	# make sure the store directory exists and save the order of the files, as well
	150	# as the model and the data files
	151	if store is not None:
	152	if not os.path.exists(store):
	153	os.makedirs(store)
	154
	155	# save file order
	156	with open(os.path.join(store, 'files.dat'), 'w') as fid:
	157	for k, f in enumerate(data_files):
	158	fid.write("%02d %s\n"%(k+1, f))
	159
	160	# Copy the model and data files to the root store directory
	161	# Since bumps changes into the model directory prior to loading
	162	# the datafiles, strip all leading paths from data and model and
	163	# set the working directory for the job to the store directory.
	164	model_copy = os.path.basename(model_file)
	165	shutil.copy(model_file, os.path.join(store, model_copy))
	166	data_copy = []
	167	for f in data_files:
	168	f_copy = os.path.basename(f)
	169	shutil.copy(f, os.path.join(store, f_copy))
	170	data_copy.append(f_copy)
	171
	172	model_file = model_copy
	173	data_files = data_copy
	174
	175
	176	# build and run the command
	177	SRC = dirn(__file__, 3) # __file__ is $SRC/sasmodels/example/slurm_batch.py
	178	parts = [
	179	"sbatch",
	180	"--array=1-%d"%len(data_files),
	181	"--gres=gpu:1",
	182	"--job-name="+model_file,
	183	## since we are setting the current working directory, we don't need
	184	## to fiddle the slurm output files
	185	"--output=%s/slurm-%%A_%%a.out"%store,
	186	"--chdir=%s"%store,
	187	]
	188	parts.extend(slurm_opts)
	189	parts.append(__file__)
	190	# Remember the source root so we can reconstruct the correct python path
	191	# This is done after the model file so that it doesn't get interpreted
	192	# as a slurm option.
	193	parts.append("--source_root=%s"%SRC)
	194	parts.append(model_file)
	195	parts.extend(data_files)
	196	parts.extend(bumps_opts)
	197	#if store is not None:
	198	# parts.append("--store=" + store)
	199	command = " ".join(parts)
	200
	201	print(command)
	202	os.system(command)
	203
	204	def run_task(task_id):
	205	opts = split_args()
	206
	207	# Set environment put compiled sasmodels in user-specific temporary cache
	208	# We need this because users don't have a home directory on the individual
	209	# cluster nodes.
	210	assert opts['slurm'][0].startswith('--source_root=')
	211	SRC = opts['slurm'][0][14:]
	212	PACKAGES = ("periodictable", "sasview/src", "bumps", "sasmodels")
	213	os.environ['PYTHONPATH'] = ":".join(SRC+"/"+v for v in PACKAGES)
	214	TMP = tempfile.gettempdir()
	215	cache_path = os.path.join(TMP, os.environ['USER'], '.cache')
	216	os.environ['SAS_DLL_PATH'] = cache_path
	217	os.environ['XDG_CACHE_HOME'] = cache_path
	218
	219	#task_store = "%s/%02d"%(opts['store'], task_id)
	220	task_store = "%02d"%task_id
	221	parts = [
	222	"python", os.path.join(SRC, "bumps", "run.py"), "--batch",
	223	"--view=log",
	224	opts['model_file'],
	225	opts['data_files'][task_id-1],
	226	]
	227	parts.extend(opts['bumps'])
	228	parts.append('--store='+task_store)
	229	if opts['resume'] is not None:
	230	parts.append('--resume='+os.path.join(opts['resume'], task_store))
	231	command = " ".join(parts)
	232	print(os.getcwd() + "$ " + command)
	233	os.system(command)
	234
	235
	236	task_id = int(os.environ.get('SLURM_ARRAY_TASK_ID', -1))
	237	if task_id == -1:
	238	submit_job()
	239	else:
	240	run_task(task_id)
	241

Note: See TracBrowser for help on using the repository browser.

SasView

source: sasmodels/example/slurm_batch.py @ 35d2300

Download in other formats: