#!/usr/bin/env python
"""
Submit a batch fit job to the slurm cluster.

Given a model.py file defining a Bumps problem for a single data file, with
the data file specified as a command line argument, run the bumps fit as a
batch over a set of different data files independently.  An example model is
given in example/model_ellipsoid_hayter_msa.py, which fits the data in
example/09319*.dat.

To run the fit, use::

    slurm_batch.py [--slurm_opts] model.py *.dat --store=T1 [--bumps_opts ...]

This creates the T1 subdirectory to hold the fit results and prints the real
command that is submitted, as well as the job id.

The store directory T1 contains a copy of the model file and all the data
files.  The fit results for each file will be in T1/##/*.  The file
T1/files.dat contains the list of "subdirectory filename" pairs indicating
which ## directory contains the results for which file.  Check for errors
using::

    cat T1/slurm*_1.out

Bumps options are described at bumps.readthedocs.org, with running time
(--time=T, in hours) defaulting to a 2 hour maximum.

The following slurm options are used::

    --array=1-#files     batch size comes from the file list
    --gres=gpu:1         request a gpu for each fit
    --job-name=model.py  use model file name for job name
    --output=...         log into T1/slurm-%A_%a.out
    --chdir=...          run fit from store directory

To receive an email on job completion or failure, add the following slurm
options before the model file::

    --mail-type=END,FAIL --mail-user=user@mail.domain

After submitting the job a job id will be printed to the console.  You can
check the status of the job using the usual slurm commands such as::

    squeue

or cancel the job using::

    scancel jobid

The slurm_batch program runs directly from the source tree for sasmodels, and
requires sasview, bumps and periodictable as sister directories accessible on
the worker nodes.  You can link it into your bin directory using::

    mkdir ~/bin
    ln -s path/to/slurm_batch.py ~/bin

or if you are a cluster administrator, into /usr/local/bin.
"""

# If called from the command line, this submits a job to the slurm queue, with
# _this_ file as the batch script.  Before calling it on the worker node, slurm
# sets SLURM_ARRAY_TASK_ID to the current task so we can tell that we are
# running as a worker and which file we should be working on.

## SBATCH options as comments do not seem to work.  Maybe they need to be
## before the doc string?  For now they are hardcoded in the sbatch call in
## submit_job.

import sys
import os
import tempfile
import shutil

DEFAULT_TIME_LIMIT = 2  # hours

def split_args():
    slurm_opts = []
    bumps_opts = []
    model_file = None
    store = None
    resume = None
    data_files = []
    time_limit = DEFAULT_TIME_LIMIT

    # Start with '-' arguments as slurm opts; after the model file any
    # '-' arguments are bumps opts.
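    # For example, a hypothetical invocation such as
    #   slurm_batch.py --mail-type=END model.py a.dat b.dat --store=T1 --fit=dream
    # would give slurm_opts=['--mail-type=END', '--time=2:00:00'],
    # model_file='model.py', data_files=['a.dat', 'b.dat'],
    # store='/abs/path/to/T1' and bumps_opts=['--fit=dream', '--time=1.900000'].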
    opts = slurm_opts
    for v in sys.argv[1:]:
        if v.startswith('--store='):
            store = os.path.realpath(os.path.abspath(v[8:]))
        elif v.startswith('--resume='):
            resume = os.path.realpath(os.path.abspath(v[9:]))
        elif v.startswith('--time='):
            time_limit = float(v[7:])
        elif v[0] == '-':
            opts.append(v)
        elif model_file is None:
            model_file = v
            opts = bumps_opts
        else:
            data_files.append(v)

    s = time_limit*3600
    slurm_opts.append("--time=%d:%02d:%02d" % (s//3600, (s%3600)//60, s%60))
    bumps_opts.append('--time=%f' % (time_limit - 0.1))  # 6 min to stop cleanly

    return {
        'slurm': slurm_opts,
        'model_file': model_file,
        'data_files': data_files,
        'store': store,
        'resume': resume,
        'bumps': bumps_opts,
    }

def dirn(path, n):
    # Return the nth parent directory of path.
    path = os.path.realpath(os.path.abspath(path))
    for _ in range(n):
        path = os.path.dirname(path)
    return path

def submit_job():
    # sbatch --array=1-5 ./slurm_batch.py model_ellipsoid_hayter_msa.py 09*.dat --store=T1 --fit=dream
    opts = split_args()
    store = opts['store']
    model_file = opts['model_file']
    data_files = opts['data_files']
    bumps_opts = opts['bumps']
    slurm_opts = opts['slurm']

    # Make sure the store directory exists and save the order of the files,
    # as well as the model and the data files.
    if store is not None:
        if not os.path.exists(store):
            os.makedirs(store)

        # save file order
        with open(os.path.join(store, 'files.dat'), 'w') as fid:
            for k, f in enumerate(data_files):
                fid.write("%02d %s\n" % (k+1, f))

        # Copy the model and data files to the root store directory.
        # Since bumps changes into the model directory prior to loading
        # the datafiles, strip all leading paths from data and model and
        # set the working directory for the job to the store directory.
        model_copy = os.path.basename(model_file)
        shutil.copy(model_file, os.path.join(store, model_copy))
        data_copy = []
        for f in data_files:
            f_copy = os.path.basename(f)
            shutil.copy(f, os.path.join(store, f_copy))
            data_copy.append(f_copy)
        model_file = model_copy
        data_files = data_copy

    # build and run the command
    SRC = dirn(__file__, 3)  # __file__ is $SRC/sasmodels/example/slurm_batch.py
    parts = [
        "sbatch",
        "--array=1-%d" % len(data_files),
        "--gres=gpu:1",
        "--job-name=" + model_file,
        ## since we are setting the current working directory, we don't need
        ## to fiddle with the slurm output files
        "--output=%s/slurm-%%A_%%a.out" % store,
        "--chdir=%s" % store,
    ]
    parts.extend(slurm_opts)
    parts.append(__file__)
    # Remember the source root so we can reconstruct the correct python path.
    # This is appended after the batch script file so that sbatch doesn't
    # interpret it as one of its own options.
    parts.append("--source_root=%s" % SRC)
    parts.append(model_file)
    parts.extend(data_files)
    parts.extend(bumps_opts)
    #if store is not None:
    #    parts.append("--store=" + store)
    command = " ".join(parts)
    print(command)
    os.system(command)

def run_task(task_id):
    opts = split_args()

    # Set the environment to put compiled sasmodels in a user-specific
    # temporary cache.  We need this because users don't have a home
    # directory on the individual cluster nodes.
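    # On a typical Linux worker tempfile.gettempdir() is /tmp, so the cache
    # set below ends up as something like /tmp/$USER/.cache (an assumption
    # about the node configuration; adjust if the nodes use a different TMPDIR).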
    assert opts['slurm'][0].startswith('--source_root=')
    SRC = opts['slurm'][0][14:]
    PACKAGES = ("periodictable", "sasview/src", "bumps", "sasmodels")
    os.environ['PYTHONPATH'] = ":".join(SRC + "/" + v for v in PACKAGES)
    TMP = tempfile.gettempdir()
    cache_path = os.path.join(TMP, os.environ['USER'], '.cache')
    os.environ['SAS_DLL_PATH'] = cache_path
    os.environ['XDG_CACHE_HOME'] = cache_path

    #task_store = "%s/%02d" % (opts['store'], task_id)
    task_store = "%02d" % task_id
    parts = [
        "python", os.path.join(SRC, "bumps", "run.py"), "--batch",
        "--view=log",
        opts['model_file'],
        opts['data_files'][task_id-1],
    ]
    parts.extend(opts['bumps'])
    parts.append('--store=' + task_store)
    if opts['resume'] is not None:
        parts.append('--resume=' + os.path.join(opts['resume'], task_store))
    command = " ".join(parts)
    print(os.getcwd() + "$ " + command)
    os.system(command)

task_id = int(os.environ.get('SLURM_ARRAY_TASK_ID', -1))
if task_id == -1:
    submit_job()
else:
    run_task(task_id)
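
# On a worker node each array task prints the command it is about to run, so
# the slurm log shows something along these lines (hypothetical paths, and
# assuming --fit=dream was passed on the original command line):
#
#   /path/to/T1$ python /path/to/src/bumps/run.py --batch --view=log \
#       model_ellipsoid_hayter_msa.py 09319.dat --fit=dream --time=1.800000 --store=01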