slurm_batch.py @ 7609046

Last change on this file since 7609046 was 1d9998c, checked in by Paul Kienzle <pkienzle@…>, 6 years ago
update docs for the slurm batch program
Property mode set to `100755`
File size: 8.0 KB

Line
1	#!/usr/bin/env python
2	"""
3	Submit a batch fit job to the slurm cluster.
4
5	Given a model.py file defining a Bumps problem defined on a single data
6	file, with the data file specified as a command line argument, run the
7	bumps fit as a batch over a set of different datafiles independently.
8	An example model is given in model_ellipsoid_hayter_msa.py, which fits
9	the data in 09319*.dat.
10
11	To run the fit, use::
12
13	slurm_batch.py [--slurm_opts] model.py *.dat --store=T1 [--bumps_opt ...]
14
15	For example::
16
17	slurm_batch.py model_ellipsoid_hayter_msa.py 09319*.dat --store=T1
18
19	This creates the T1 subdirectory to hold the fit results and
20	prints the real command that is submitted, as well as the job id.
21
22	The store directory T1 contains a copy of the model file and
23	all the data files. The fit results for each file will be
24	in T1/##/*. The file T1/files.dat contains the list
25	of "subdirectory filename" pairs indicating which ## directory
26	contains the results for which file. Check for errors using::
27
28	cat T1/slurm*_1.out
29
30	The following slurm options are used::
31
32	--array=1-#files batch size comes from the file list
33	--gres=gpu:1 request a gpu for each fit
34	--job-name=model.py use model file name for job name
35	--output=... log into T1/slurm-job_##.out
36	--chdir=... run fit from store directory
37	--time=2 time as number of hours (can override)
38
39	To receive an email on job completion or failure, add the following
40	slurm options before the model file::
41
42	--mail-type=END,FAIL --mail-user=user@mail.domain
43
44	Bumps options are described at bumps.readthedocs.org, with the
45	following set automatically::
46
47	--batch run in batch mode, without output to .mon
48	--view=log SAS fits want log plots
49	--time=2-0.1 slurm time minus 6 minutes for cleanup
50
51	The --store and --resume options indicate the parent directory for
52	the output. These are modified to store the results in a separate
53	subdirectory for each file. Keep in mind that the fit is run from
54	the store directory, so any files or modules referenced from the
55	model file will need to use a full path to the original location.
56
57	After submitting the job a job id will be printed to the console.
58	You can check the status of the job using the usual slurm commands
59	such as::
60
61	squeue
62
63	or cancel the job using::
64
65	scancel jobid
66
67	The slurm_batch program runs directly from the source tree for sasmodels,
68	and requires sasview, bumps and periodictable as sister directories
69	accessible on the worker nodes. You can link it into your bin directory
70	using::
71
72	mkdir ~/bin
73	ln -s path/to/slurm_batch.py ~/bin
74
75	or if you are a cluster administrator, into /usr/local/bin.
76	"""
77
78	# If called from command line, this submits a job to the slurm queue, with _this_ file
79	# as the batch script. Before calling it on the worker node, slurm sets the
80	# SLURM_ARRAY_TASK_ID to the current task so we can tell that we are running
81	# as a worker and which file we should be working on.
82
83	## SBATCH options as comments do not seem to work. Maybe they need to be before
84	## the doc string? For now they are hardcoded in the sbatch call in submit_job.
85
86	import sys
87	import os
88	import tempfile
89	import shutil
90
DEFAULT_TIME_LIMIT = 2  # hours; used when no --time= option is given


def split_args():
    """
    Split ``sys.argv[1:]`` into slurm options, bumps options and file names.

    Arguments starting with '-' that appear before the model file are
    collected as slurm options; those appearing after it are collected as
    bumps options.  The first argument not starting with '-' is the model
    file and every later one is a data file.  The ``--store=``, ``--resume=``
    and ``--time=`` options are recognized anywhere on the command line and
    are not copied into either option list.  Empty arguments are skipped.

    The time limit (in hours) is appended to the slurm options as
    ``--time=H:MM:SS`` and to the bumps options as ``--time=hours``, with
    0.1 hours removed for bumps so that the fit can stop cleanly.

    Returns a dict with the keys 'slurm', 'model_file', 'data_files',
    'store', 'resume' and 'bumps'.  'store' and 'resume' are absolute,
    symlink-resolved paths, or None when the option was not given.

    Raises ValueError if the ``--time=`` value is not a number.
    """
    slurm_opts = []
    bumps_opts = []
    data_files = []
    model_file = None
    store = None
    resume = None
    time_limit = DEFAULT_TIME_LIMIT

    # start with '-' arguments as slurm opts, then after
    # the model file any '-' arguments are bumps opts.
    opts = slurm_opts
    for v in sys.argv[1:]:
        if not v:
            # An empty argument (e.g. from an unset shell variable) is neither
            # an option nor a usable file name.
            continue
        if v.startswith('--store='):
            store = os.path.realpath(os.path.abspath(v[len('--store='):]))
        elif v.startswith('--resume='):
            resume = os.path.realpath(os.path.abspath(v[len('--resume='):]))
        elif v.startswith('--time='):
            time_limit = float(v[len('--time='):])
        elif v.startswith('-'):
            opts.append(v)
        elif model_file is None:
            model_file = v
            opts = bumps_opts
        else:
            data_files.append(v)

    # Work in whole seconds so that H:MM:SS is not affected by floating
    # point residue in time_limit*3600.
    seconds = int(round(time_limit * 3600))
    hours, remainder = divmod(seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    slurm_opts.append("--time=%d:%02d:%02d" % (hours, minutes, seconds))
    # NOTE(review): submit_job forwards this bumps --time option to the worker
    # command line, where run_task parses it again with split_args, so the
    # worker subtracts 0.1 hours a second time -- confirm whether intended.
    bumps_opts.append('--time=%f' % (time_limit - 0.1))  # 6 min to stop cleanly

    return {
        'slurm': slurm_opts,
        'model_file': model_file,
        'data_files': data_files,
        'store': store,
        'resume': resume,
        'bumps': bumps_opts,
    }
133
def dirn(path, n):
    """
    Return the directory *n* levels above *path*.

    *path* is first made absolute and has its symbolic links resolved; the
    last path component is then removed *n* times.  With ``n == 0`` the
    resolved path itself is returned.
    """
    resolved = os.path.abspath(path)
    resolved = os.path.realpath(resolved)
    for _level in range(n):
        resolved = os.path.dirname(resolved)
    return resolved
139
def _copy_to_store(path, store):
    """Copy *path* into the *store* directory and return the copy's file name."""
    name = os.path.basename(path)
    target = os.path.join(store, name)
    # When resubmitting from inside the store directory the source and the
    # destination are the same file, which shutil.copy refuses to handle.
    if os.path.realpath(path) != os.path.realpath(target):
        shutil.copy(path, target)
    return name


def submit_job():
    """
    Stage the model and data files in the store directory and call sbatch.

    The command line is parsed with split_args.  The store directory is
    created if needed, the "index filename" pairs are written to files.dat
    in that directory, and the model file and data files are copied into it.
    This file is then submitted as the batch script of a slurm array job with
    one task per data file, running from the store directory.

    The submitted command is printed before it is run.  Exits with a usage
    message if the model file, the data files or --store is missing, and
    with the sbatch status if sbatch fails.
    """
    # Function-scope import: subprocess is not among the file-level imports.
    import subprocess

    # sbatch --array=1-5 ./slurm_batch.py model_ellipsoid_hayter_msa.py 09*.dat --store=T1 --fit=dream
    opts = split_args()
    store = opts['store']
    resume = opts['resume']
    model_file = opts['model_file']
    data_files = opts['data_files']
    bumps_opts = opts['bumps']
    slurm_opts = opts['slurm']

    # Without these the sbatch command below would contain "None" paths or
    # an empty "--array=1-0" range.
    if model_file is None or not data_files or store is None:
        raise SystemExit(
            "usage: slurm_batch.py [--slurm_opts] model.py *.dat"
            " --store=T1 [--bumps_opt ...]")

    # make sure the store directory exists and save the order of the files,
    # as well as the model and the data files
    if not os.path.exists(store):
        os.makedirs(store)

    # save file order
    with open(os.path.join(store, 'files.dat'), 'w') as fid:
        for k, f in enumerate(data_files):
            fid.write("%02d %s\n" % (k + 1, f))

    # Copy the model and data files to the root store directory.
    # Since bumps changes into the model directory prior to loading
    # the datafiles, strip all leading paths from data and model and
    # set the working directory for the job to the store directory.
    model_file = _copy_to_store(model_file, store)
    data_files = [_copy_to_store(f, store) for f in data_files]

    # build and run the command
    SRC = dirn(__file__, 3)  # __file__ is $SRC/sasmodels/example/slurm_batch.py
    parts = [
        "sbatch",
        "--array=1-%d" % len(data_files),
        "--gres=gpu:1",
        "--job-name=" + model_file,
        "--output=%s/slurm-%%A_%%a.out" % store,
        "--chdir=%s" % store,
    ]
    # User supplied slurm options come last so they can override the defaults.
    parts.extend(slurm_opts)
    parts.append(__file__)
    # Remember the source root so we can reconstruct the correct python path.
    # This is placed after the batch script so that sbatch does not interpret
    # it as a slurm option; run_task expects it as the first option.
    parts.append("--source_root=%s" % SRC)
    parts.append(model_file)
    parts.extend(data_files)
    parts.extend(bumps_opts)
    # split_args removes --resume from the option lists, so it has to be
    # forwarded explicitly or the worker would never see it.  --store is
    # not forwarded because the worker runs from the store directory.
    if resume is not None:
        parts.append("--resume=" + resume)

    print(" ".join(parts))
    sys.stdout.flush()
    # Pass the argument list directly rather than through a shell so that
    # file names containing spaces or shell metacharacters are preserved.
    status = subprocess.call(parts)
    if status != 0:
        sys.exit(status)
203
def run_task(task_id):
    """
    Run the bumps fit for a single task of the array job.

    *task_id* is the 1-based index into the list of data files given on
    the command line; the fit results are stored in the subdirectory
    "%02d" % task_id of the current working directory.

    The first option on the command line must be ``--source_root=path``,
    as added by submit_job.  PYTHONPATH is set to the periodictable,
    sasview/src, bumps and sasmodels directories below that root, and
    SAS_DLL_PATH and XDG_CACHE_HOME are pointed at a per-user directory
    below the system temporary directory.

    The fit command is printed before it is run.  Exits with a message if
    the source root is missing or *task_id* does not correspond to a data
    file, and with the status of the fit if the fit command fails.
    """
    # Function-scope import: subprocess is not among the file-level imports.
    import subprocess

    opts = split_args()
    data_files = opts['data_files']

    # Explicit check rather than assert, which is skipped under "python -O".
    prefix = '--source_root='
    slurm_opts = opts['slurm']
    if not slurm_opts or not slurm_opts[0].startswith(prefix):
        raise SystemExit("slurm_batch.py: worker called without --source_root")
    SRC = slurm_opts[0][len(prefix):]

    # A task id outside the file list would either fail with an IndexError
    # or (for 0) silently select the last file.
    if not 1 <= task_id <= len(data_files):
        raise SystemExit("slurm_batch.py: no data file for task %d" % task_id)

    # Set the environment to put compiled sasmodels in a user-specific
    # temporary cache.  We need this because users don't have a home
    # directory on the individual cluster nodes.
    PACKAGES = ("periodictable", "sasview/src", "bumps", "sasmodels")
    os.environ['PYTHONPATH'] = ":".join(SRC + "/" + v for v in PACKAGES)
    user = os.environ.get('USER')
    if not user:
        # USER may be missing from the batch environment.
        import getpass
        user = getpass.getuser()
    cache_path = os.path.join(tempfile.gettempdir(), user, '.cache')
    os.environ['SAS_DLL_PATH'] = cache_path
    os.environ['XDG_CACHE_HOME'] = cache_path

    # Relative to the current directory, which submit_job sets to the
    # store directory with --chdir.
    task_store = "%02d" % task_id
    parts = [
        "python", os.path.join(SRC, "bumps", "run.py"), "--batch",
        "--view=log",
        opts['model_file'],
        data_files[task_id - 1],
    ]
    parts.extend(opts['bumps'])
    parts.append('--store=' + task_store)
    if opts['resume'] is not None:
        parts.append('--resume=' + os.path.join(opts['resume'], task_store))

    print(os.getcwd() + "$ " + " ".join(parts))
    # Flush so the command appears in the log before the output of the fit.
    sys.stdout.flush()
    # Pass the argument list directly rather than through a shell so that
    # file names containing spaces or shell metacharacters are preserved.
    status = subprocess.call(parts)
    if status != 0:
        # Propagate the failure so that slurm records the task as failed.
        sys.exit(status)
234
235
# Slurm sets SLURM_ARRAY_TASK_ID before running this file on a worker node.
# When it is present run the fit for that task, otherwise we were called
# from the command line and need to submit the job.
task_id = int(os.environ.get('SLURM_ARRAY_TASK_ID', -1))
if task_id != -1:
    run_task(task_id)
else:
    submit_job()
241

Note: See TracBrowser for help on using the repository browser.

SasView

source: sasmodels/example/slurm_batch.py @ 7609046

Download in other formats: