source: sasmodels/example/slurm_batch.py @ 58a34f1

Last changed in 58a34f1, checked in by Paul Kienzle <pkienzle@…>, 6 years ago

add batch fitting program which uses a slurm queue

#!/usr/bin/env python
"""
Submit a batch fit job to the slurm cluster.

Given a model.py file defining a Bumps problem on a single data file,
with the data file specified as a command line argument, run the bumps
fit as a batch over a set of different data files independently.
An example model is given in example/model_ellipsoid_hayter_msa.py,
which fits the data in example/09319*.dat.

To run the fit, use:

    slurm_batch.py [--slurm_opts] model.py *.dat --store=T1 [--bumps_opt ...]

This creates the T1 subdirectory to hold the fit results and
prints the real command that is submitted, as well as the job id.

The store directory T1 contains a copy of the model file and
all the data files.  The fit results for each file will be
in T1/##/*.  The file T1/files.dat contains the list
of "subdirectory filename" pairs indicating which ## directory
contains the results for which file.  Check for errors using

    cat T1/slurm*_1.out
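
For illustration, if the fit were run over two hypothetical data files
a.dat and b.dat (in that order), T1/files.dat would contain::

    01 a.dat
    02 b.dat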

Bumps options are described at bumps.readthedocs.org, with
running time (--time=T) defaulting to 2 hours maximum.  The
following slurm options are used::

    --array=1-#files     batch size comes from the file list
    --gres=gpu:1         request a gpu for each fit
    --job-name=model.py  use model file name for job name
    --output=...         log into T1/slurm-job_##.out
    --chdir=...          run fit from store directory

To receive an email on job completion or failure, add the following
slurm options before the model file::

    --mail-type=END,FAIL --mail-user=user@mail.domain

After submitting the job a job id will be printed to the console.
You can check the status of the job using the usual slurm commands
such as::

    squeue

or cancel the job using::

    scancel jobid

The slurm_batch program runs directly from the source tree for sasmodels,
and requires sasview, bumps and periodictable as sister directories
accessible on the worker nodes.  You can link it into your bin directory
using::

     mkdir ~/bin
     ln -s path/to/slurm_batch.py ~/bin

or if you are a cluster administrator, into /usr/local/bin.
"""

# If called from command line, this submits a job to the slurm queue, with _this_ file
# as the batch script. Before calling it on the worker node, slurm sets the
# SLURM_ARRAY_TASK_ID to the current task so we can tell that we are running
# as a worker and which file we should be working on.

## SBATCH options as comments do not seem to work.  Maybe they need to be before
## the doc string?  For now they are hardcoded in the sbatch call in submit_job.

import sys
import os
import tempfile
import shutil

DEFAULT_TIME_LIMIT = 2

def split_args():
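    """
    Parse sys.argv into slurm options, the model file, the data files and
    bumps options, returning them as a dict.  Arguments beginning with '-'
    before the model file are treated as slurm options, those after it as
    bumps options; --store, --resume and --time are intercepted and handled
    specially.
    """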
    slurm_opts = []
    bumps_opts = []
    model_file = None
    store = None
    resume = None
    data_files = []
    time_limit = DEFAULT_TIME_LIMIT

    # start with '-' arguments as slurm opts, then after
    # the model file any '-' arguments are bumps opts.
    opts = slurm_opts
    for v in sys.argv[1:]:
        if v.startswith('--store='):
            store = os.path.realpath(os.path.abspath(v[8:]))
        elif v.startswith('--resume='):
            resume = os.path.realpath(os.path.abspath(v[9:]))
        elif v.startswith('--time='):
            time_limit = float(v[7:])
        elif v[0] == '-':
            opts.append(v)
        elif model_file is None:
            model_file = v
            opts = bumps_opts
        else:
            data_files.append(v)

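    # Convert the time limit in hours to slurm's H:MM:SS format, and give
    # bumps a slightly shorter limit (0.1 h = 6 min less) so the fit can
    # wrap up and save results before slurm kills the job.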
    s = time_limit*3600
    slurm_opts.append("--time=%d:%02d:%02d"%(s//3600, (s%3600)//60, s%60))
    bumps_opts.append('--time=%f'%(time_limit - 0.1))  # 6 min to stop cleanly

    return {
        'slurm': slurm_opts,
        'model_file': model_file,
        'data_files': data_files,
        'store': store,
        'resume': resume,
        'bumps': bumps_opts,
    }

def dirn(path, n):
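    """Return the directory n levels above path."""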
    path = os.path.realpath(os.path.abspath(path))
    for _ in range(n):
        path = os.path.dirname(path)
    return path

def submit_job():
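    """
    Copy the model and data files into the store directory and submit a
    slurm array job with one task per data file, printing the sbatch
    command that is run and letting sbatch report the job id.
    """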
    # sbatch --array=1-5 ./slurm_batch.py model_ellipsoid_hayter_msa.py 09*.dat --store=T1 --fit=dream
    opts = split_args()
    store = opts['store']
    model_file = opts['model_file']
    data_files = opts['data_files']
    bumps_opts = opts['bumps']
    slurm_opts = opts['slurm']

    # make sure the store directory exists and save the order of the files, as well
    # as the model and the data files
    if store is not None:
        if not os.path.exists(store):
            os.makedirs(store)

        # save file order
        with open(os.path.join(store, 'files.dat'), 'w') as fid:
            for k, f in enumerate(data_files):
                fid.write("%02d %s\n"%(k+1, f))

        # Copy the model and data files to the root store directory.
        # Since bumps changes into the model directory prior to loading
        # the data files, strip all leading paths from data and model and
        # set the working directory for the job to the store directory.
        model_copy = os.path.basename(model_file)
        shutil.copy(model_file, os.path.join(store, model_copy))
        data_copy = []
        for f in data_files:
            f_copy = os.path.basename(f)
            shutil.copy(f, os.path.join(store, f_copy))
            data_copy.append(f_copy)

        model_file = model_copy
        data_files = data_copy

    # build and run the command
    SRC = dirn(__file__, 3)  # __file__ is $SRC/sasmodels/example/slurm_batch.py
    parts = [
        "sbatch",
        "--array=1-%d"%len(data_files),
        "--gres=gpu:1",
        "--job-name="+model_file,
        ## since we are setting the current working directory, we don't need
        ## to fiddle the slurm output files
        "--output=%s/slurm-%%A_%%a.out"%store,
        "--chdir=%s"%store,
        ]
    parts.extend(slurm_opts)
    parts.append(__file__)
    # Remember the source root so we can reconstruct the correct python path.
    # This is placed after the batch script name so that sbatch doesn't try to
    # interpret it as one of its own options; on the worker node split_args
    # sees it as the first slurm option.
    parts.append("--source_root=%s"%SRC)
    parts.append(model_file)
    parts.extend(data_files)
    parts.extend(bumps_opts)
    #if store is not None:
    #    parts.append("--store=" + store)
    command = " ".join(parts)

    print(command)
    os.system(command)

def run_task(task_id):
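    """
    Run a single fit on a worker node: task_id selects the data file from
    the command line, and the results go into the ## subdirectory of the
    current working directory (the store directory set via --chdir).
    """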
    opts = split_args()

    # Set environment to put compiled sasmodels in a user-specific temporary
    # cache.  We need this because users don't have a home directory on the
    # individual cluster nodes.
    assert opts['slurm'][0].startswith('--source_root=')
    SRC = opts['slurm'][0][14:]
    PACKAGES = ("periodictable", "sasview/src", "bumps", "sasmodels")
    os.environ['PYTHONPATH'] = ":".join(SRC+"/"+v for v in PACKAGES)
    TMP = tempfile.gettempdir()
    cache_path = os.path.join(TMP, os.environ['USER'], '.cache')
    os.environ['SAS_DLL_PATH'] = cache_path
    os.environ['XDG_CACHE_HOME'] = cache_path

    #task_store = "%s/%02d"%(opts['store'], task_id)
    task_store = "%02d"%task_id
    parts = [
        "python", os.path.join(SRC, "bumps", "run.py"), "--batch",
        "--view=log",
        opts['model_file'],
        opts['data_files'][task_id-1],
        ]
    parts.extend(opts['bumps'])
    parts.append('--store='+task_store)
    if opts['resume'] is not None:
        parts.append('--resume='+os.path.join(opts['resume'], task_store))
    command = " ".join(parts)
    print(os.getcwd() + "$ " + command)
    os.system(command)

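# When slurm runs this file as the batch script it sets SLURM_ARRAY_TASK_ID,
# so dispatch to run_task on the worker nodes and to submit_job otherwise.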
task_id = int(os.environ.get('SLURM_ARRAY_TASK_ID', -1))
if task_id == -1:
    submit_job()
else:
    run_task(task_id)