source: sasmodels/example/slurm_batch.py @ 1a8b91c

Last change on this file since 1a8b91c was 1d9998c, checked in by Paul Kienzle <pkienzle@…>, 6 years ago

update docs for the slurm batch program

  • Property mode set to 100755
File size: 8.0 KB
Line 
1#!/usr/bin/env python
2"""
3Submit a batch fit job to the slurm cluster.
4
5Given a model.py file defining a Bumps problem defined on a single data
6file, with the data file specified as a command line argument, run the
7bumps fit as a batch over a set of different datafiles independently.
8An example model is given in model_ellipsoid_hayter_msa.py, which fits
9the data in 09319*.dat.
10
11To run the fit, use::
12
13    slurm_batch.py [--slurm_opts] model.py *.dat --store=T1 [--bumps_opt ...]
14
15For example::
16
17    slurm_batch.py model_ellipsoid_hayter_msa.py 09319*.dat --store=T1
18
19This creates the T1 subdirectory to hold the fit results and
20prints the real command that is submitted, as well as the job id.
21
22The store directory T1 contains a copy of the model file and
23all the data files.  The fit results for each file will be
24in T1/##/*.  The file T1/files.dat contains the list
25of "subdirectory filename" pairs indicating which ## directory
26contains the results for which file.  Check for errors using::
27
28    cat T1/slurm*_1.out
29
30The following slurm options are used::
31
32    --array=1-#files     batch size comes from the file list
33    --gres=gpu:1         request a gpu for each fit
34    --job-name=model.py  use model file name for job name
35    --output=...         log into T1/slurm-job_##.out
36    --chdir=...          run fit from store directory
37    --time=2             time as number of hours (can override)
38
39To receive an email on job completion or failure, add the following
40slurm options before the model file::
41
42    --mail-type=END,FAIL --mail-user=user@mail.domain
43
44Bumps options are described at bumps.readthedocs.org, with the
45following set automatically::
46
47    --batch              run in batch mode, without output to .mon
48    --view=log           SAS fits want log plots
49    --time=2-0.1         slurm time minus 6 minutes for cleanup
50
51The --store and --resume options indicate the parent directory for
52the output.  These are modified to store the results in a separate
53subdirectory for each file.  Keep in mind that the fit is run from
54the store directory, so any files or modules referenced from the
55model file will need to use a full path to the original location.
56
57After submitting the job a job id will be printed to the console.
58You can check the status of the job using the usual slurm commands
59such as::
60
61    squeue
62
63or cancel the job using::
64
65    scancel jobid
66
67The slurm_batch program runs directly from the source tree for sasmodels,
68and requires sasview, bumps and periodictable as sister directories
69accessible on the worker nodes.  You can link it into your bin directory
70using::
71
72     mkdir ~/bin
73     ln -s path/to/slurm_batch.py ~/bin
74
75or if you are a cluster administrator, into /usr/local/bin.
76"""
77
78# If called from command line, this submits a job to the slurm queue, with _this_ file
79# as the batch script. Before calling it on the worker node, slurm sets the
80# SLURM_ARRAY_TASK_ID to the current task so we can tell that we are running
81# as a worker and which file we should be working on.
82
83## SBATCH options as comments do not seem to work.  Maybe they need to be before
84## the doc string?  For now they are hardcoded in the sbatch call in submit_job.
85
86import sys
87import os
88import tempfile
89import shutil
90
# Default slurm time limit, in hours, used when no --time= option is given.
DEFAULT_TIME_LIMIT = 2

def split_args():
    """
    Split the command line into slurm options, bumps options and files.

    Arguments starting with '-' that appear before the model file are
    collected as slurm options; those after the model file are collected as
    bumps options.  The first argument that does not start with '-' is the
    model file and the remaining ones are the data files.  The options
    ``--store=``, ``--resume=`` and ``--time=`` are recognized anywhere on
    the command line and are removed from the option lists; the store and
    resume directories are converted to absolute paths with symbolic links
    resolved.

    ``--time=`` is the time limit in hours.  It is appended to the slurm
    options as ``--time=h:mm:ss`` and to the bumps options as a number of
    hours, reduced by 0.1 (6 minutes) so that the fit can stop cleanly.

    When the command line carries ``--source_root=`` as a slurm option then
    the arguments were produced by the submitter for a worker, and the
    ``--time=`` value is the already reduced bumps time.  In that case it
    is handed to bumps unchanged rather than being reduced a second time.

    Returns a dictionary with the keys 'slurm', 'model_file', 'data_files',
    'store', 'resume' and 'bumps'.  The values for 'model_file', 'store'
    and 'resume' are None if they are not on the command line.

    Raises ValueError if the ``--time=`` value is not a number.
    """
    slurm_opts = []
    bumps_opts = []
    model_file = None
    store = None
    resume = None
    data_files = []
    time_limit = DEFAULT_TIME_LIMIT

    # start with '-' arguments as slurm opts, then after
    # the model file any '-' arguments are bumps opts.
    opts = slurm_opts
    for v in sys.argv[1:]:
        if v.startswith('--store='):
            store = os.path.realpath(os.path.abspath(v[len('--store='):]))
        elif v.startswith('--resume='):
            resume = os.path.realpath(os.path.abspath(v[len('--resume='):]))
        elif v.startswith('--time='):
            time_limit = float(v[len('--time='):])
        elif v.startswith('-'):
            # startswith() rather than v[0] so that an empty argument is
            # treated as a file name instead of raising IndexError.
            opts.append(v)
        elif model_file is None:
            model_file = v
            opts = bumps_opts
        else:
            data_files.append(v)

    # On the worker the submitter has already removed the cleanup window
    # from the time limit, so do not remove it again.
    on_worker = any(v.startswith('--source_root=') for v in slurm_opts)
    bumps_time = time_limit if on_worker else time_limit - 0.1  # 6 min to stop cleanly

    # Round to whole seconds before splitting into fields so that floating
    # point error in hours*3600 cannot drop a second from the limit.
    total_seconds = int(round(time_limit*3600))
    hours, remainder = divmod(total_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    slurm_opts.append("--time=%d:%02d:%02d"%(hours, minutes, seconds))
    bumps_opts.append('--time=%f'%bumps_time)

    return {
        'slurm': slurm_opts,
        'model_file': model_file,
        'data_files': data_files,
        'store': store,
        'resume': resume,
        'bumps': bumps_opts,
    }
133
def dirn(path, n):
    """
    Return the directory *n* levels above *path*.

    The path is first made absolute and symbolic links are resolved, then
    the final component is removed *n* times.  With *n* equal to zero the
    resolved path itself is returned.
    """
    absolute = os.path.abspath(path)
    ancestor = os.path.realpath(absolute)
    for _level in range(n):
        ancestor = os.path.dirname(ancestor)
    return ancestor
139
def submit_job():
    """
    Submit the fit as a slurm array job with one task per data file.

    The command line is parsed with split_args().  The store directory is
    created if it does not exist, the order of the data files is saved in
    *files.dat* within it, and the model and data files are copied into it.
    The sbatch command uses this file as the batch script; the command is
    printed before it is run.

    Exits with a usage message if the model file, the data files or the
    --store option is missing, and with a non-zero status if sbatch cannot
    be started or reports a failure.
    """
    # Imported here so that the function does not depend on a change to the
    # imports at the top of the file.
    import subprocess

    # sbatch --array=1-5 ./slurm_batch.py model_ellipsoid_hayter_msa.py 09*.dat --store=T1 --fit=dream
    opts = split_args()
    store = opts['store']
    resume = opts['resume']
    model_file = opts['model_file']
    data_files = opts['data_files']
    bumps_opts = opts['bumps']
    slurm_opts = opts['slurm']

    # The job runs from the store directory and uses one array task per
    # data file, so none of these can be missing.
    if model_file is None or not data_files or store is None:
        sys.exit("usage: slurm_batch.py [--slurm_opts] model.py *.dat"
                 " --store=T1 [--bumps_opt ...]")

    # make sure the store directory exists and save the order of the files, as well
    # as the model and the data files
    if not os.path.isdir(store):
        os.makedirs(store)

    # save file order
    with open(os.path.join(store, 'files.dat'), 'w') as fid:
        for k, f in enumerate(data_files):
            fid.write("%02d %s\n"%(k+1, f))

    # Copy the model and data files to the root store directory
    # Since bumps changes into the model directory prior to loading
    # the datafiles, strip all leading paths from data and model and
    # set the working directory for the job to the store directory.
    model_copy = os.path.basename(model_file)
    shutil.copy(model_file, os.path.join(store, model_copy))
    data_copy = []
    for f in data_files:
        f_copy = os.path.basename(f)
        shutil.copy(f, os.path.join(store, f_copy))
        data_copy.append(f_copy)

    # build and run the command
    SRC = dirn(__file__, 3) # __file__ is $SRC/sasmodels/example/slurm_batch.py
    parts = [
        "sbatch",
        "--array=1-%d"%len(data_copy),
        "--gres=gpu:1",
        "--job-name="+model_copy,
        "--output=%s/slurm-%%A_%%a.out"%store,
        "--chdir=%s"%store,
        ]
    parts.extend(slurm_opts)
    parts.append(__file__)
    # Remember the source root so we can reconstruct the correct python path
    # This is done after the batch script name so that it doesn't get
    # interpreted as a slurm option.
    parts.append("--source_root=%s"%SRC)
    parts.append(model_copy)
    parts.extend(data_copy)
    parts.extend(bumps_opts)
    # split_args removes --resume from the option lists, so it has to be
    # passed on explicitly for the worker to see it.
    if resume is not None:
        parts.append("--resume=" + resume)

    print(" ".join(parts))
    sys.stdout.flush()
    # Run with an argument list rather than through the shell so that
    # arguments containing spaces or shell characters are passed unchanged.
    try:
        status = subprocess.call(parts)
    except OSError as exc:
        sys.exit("could not run sbatch: %s"%exc)
    if status != 0:
        sys.exit(status if status > 0 else 1)
203
def run_task(task_id):
    """
    Run the bumps fit for array task *task_id* on a worker node.

    *task_id* is the 1-origin index into the data files on the command
    line.  The command line is parsed with split_args(); its first option
    must be ``--source_root=`` giving the directory that contains the
    periodictable, sasview, bumps and sasmodels source trees.

    Sets PYTHONPATH, SAS_DLL_PATH and XDG_CACHE_HOME in the environment,
    then runs bumps on the selected data file, storing the results in the
    subdirectory named by the two digit task id.  The command is printed
    before it is run.

    Exits with an error message if the source root is missing or the task
    id does not correspond to a data file, and with a non-zero status if
    the fit command fails.
    """
    # Imported here so that the function does not depend on a change to the
    # imports at the top of the file.
    import getpass
    import subprocess

    opts = split_args()

    # The submitter puts --source_root first, ahead of the model file, so it
    # arrives as the first "slurm" option.
    prefix = '--source_root='
    slurm_opts = opts['slurm']
    if not slurm_opts or not slurm_opts[0].startswith(prefix):
        sys.exit("expected %s<path> as the first argument on the worker"%prefix)
    SRC = slurm_opts[0][len(prefix):]

    # Check the range explicitly; otherwise task id 0 would silently select
    # the last data file through index -1.
    data_files = opts['data_files']
    if not 1 <= task_id <= len(data_files):
        sys.exit("task id %d does not match any of the %d data files"
                 %(task_id, len(data_files)))

    # Set environment to put compiled sasmodels in user-specific temporary
    # cache.  We need this because users don't have a home directory on the
    # individual cluster nodes.
    PACKAGES = ("periodictable", "sasview/src", "bumps", "sasmodels")
    os.environ['PYTHONPATH'] = ":".join(SRC+"/"+v for v in PACKAGES)
    TMP = tempfile.gettempdir()
    # Fall back to getpass when USER is not set in the job environment.
    user = os.environ.get('USER') or getpass.getuser()
    cache_path = os.path.join(TMP, user, '.cache')
    os.environ['SAS_DLL_PATH'] = cache_path
    os.environ['XDG_CACHE_HOME'] = cache_path

    # The store is relative to the current directory.
    task_store = "%02d"%task_id
    parts = [
       "python", os.path.join(SRC, "bumps", "run.py"), "--batch",
       "--view=log",
       opts['model_file'],
       data_files[task_id-1],
       ]
    parts.extend(opts['bumps'])
    parts.append('--store='+task_store)
    if opts['resume'] is not None:
        parts.append('--resume='+os.path.join(opts['resume'], task_store))
    command = " ".join(parts)
    print(os.getcwd() + "$ " + command)
    # Flush so the command appears in the log ahead of the output of the fit.
    sys.stdout.flush()
    # Run with an argument list rather than through the shell so that
    # arguments containing spaces or shell characters are passed unchanged.
    status = subprocess.call(parts)
    # Pass a failure on so that slurm does not record the task as completed.
    if status != 0:
        sys.exit(status if status > 0 else 1)
234
235
# Slurm sets SLURM_ARRAY_TASK_ID before running this file on a worker node.
# Without it we are on the command line and need to submit the job; with it
# we are a worker and the value selects the data file to fit.
task_id = int(os.getenv('SLURM_ARRAY_TASK_ID', -1))
if task_id != -1:
    run_task(task_id)
else:
    submit_job()
241
Note: See TracBrowser for help on using the repository browser.