source: sasmodels/sasmodels/kernelcl.py @ 0d26e91

core_shell_microgelsmagnetic_modelticket-1257-vesicle-productticket_1156ticket_1265_superballticket_822_more_unit_tests
Last change on this file since 0d26e91 was 0d26e91, checked in by GitHub <noreply@…>, 5 years ago

Merge branch 'beta_approx' into ticket-1015-gpu-mem-error

  • Property mode set to 100644
File size: 23.6 KB
Line 
1"""
2GPU driver for C kernels
3
4TODO: docs are out of date
5
6There should be a single GPU environment running on the system.  This
7environment is constructed on the first call to :func:`env`, and the
8same environment is returned on each call.
9
10After retrieving the environment, the next step is to create the kernel.
11This is done with a call to :meth:`GpuEnvironment.make_kernel`, which
12returns the type of data used by the kernel.
13
14Next a :class:`GpuData` object should be created with the correct kind
15of data.  This data object can be used by multiple kernels, for example,
16if the target model is a weighted sum of multiple kernels.  The data
17should include any extra evaluation points required to compute the proper
18data smearing.  This need not match the square grid for 2D data if there
19is an index saying which q points are active.
20
21Together the GpuData, the program, and a device form a :class:`GpuKernel`.
22This kernel is used during fitting, receiving new sets of parameters and
23evaluating them.  The output value is stored in an output buffer on the
24devices, where it can be combined with other structure factors and form
25factors and have instrumental resolution effects applied.
26
27In order to use OpenCL for your models, you will need OpenCL drivers for
28your machine.  These should be available from your graphics card vendor.
29Intel provides OpenCL drivers for CPUs as well as their integrated HD
30graphics chipsets.  AMD also provides drivers for Intel CPUs, but as of
31this writing the performance is lacking compared to the Intel drivers.
32NVidia combines drivers for CUDA and OpenCL in one package.  The result
33is a bit messy if you have multiple drivers installed.  You can see which
34drivers are available by starting python and running:
35
36    import pyopencl as cl
37    cl.create_some_context(interactive=True)
38
39Once you have done that, it will show the available drivers which you
40can select.  It will then tell you that you can use these drivers
41automatically by setting the SAS_OPENCL environment variable, which is
42PYOPENCL_CTX equivalent but not conflicting with other pyopnecl programs.
43
44Some graphics cards have multiple devices on the same card.  You cannot
45yet use both of them concurrently to evaluate models, but you can run
46the program twice using a different device for each session.
47
48OpenCL kernels are compiled when needed by the device driver.  Some
49drivers produce compiler output even when there is no error.  You
50can see the output by setting PYOPENCL_COMPILER_OUTPUT=1.  It should be
51harmless, albeit annoying.
52"""
53from __future__ import print_function
54
55import os
56import warnings
57import logging
58import time
59
60try:
61    from time import perf_counter as clock
62except ImportError: # CRUFT: python < 3.3
63    import sys
64    if sys.platform.count("darwin") > 0:
65        from time import time as clock
66    else:
67        from time import clock
68
69import numpy as np  # type: ignore
70
71# Attempt to setup OpenCL. This may fail if the pyopencl package is not
72# installed or if it is installed but there are no devices available.
73try:
74    import pyopencl as cl  # type: ignore
75    from pyopencl import mem_flags as mf
76    from pyopencl.characterize import get_fast_inaccurate_build_options
77    # Ask OpenCL for the default context so that we know that one exists.
78    cl.create_some_context(interactive=False)
79    HAVE_OPENCL = True
80    OPENCL_ERROR = ""
81except Exception as exc:
82    HAVE_OPENCL = False
83    OPENCL_ERROR = str(exc)
84
85from . import generate
86from .generate import F32, F64
87from .kernel import KernelModel, Kernel
88
89# pylint: disable=unused-import
90try:
91    from typing import Tuple, Callable, Any
92    from .modelinfo import ModelInfo
93    from .details import CallDetails
94except ImportError:
95    pass
96# pylint: enable=unused-import
97
98
99# CRUFT: pyopencl < 2017.1 (as of June 2016 needs quotes around include path).
100def quote_path(v):
101    """
102    Quote the path if it is not already quoted.
103
104    If v starts with '-', then assume that it is a -I option or similar
105    and do not quote it.  This is fragile:  -Ipath with space needs to
106    be quoted.
107    """
108    return '"'+v+'"' if v and ' ' in v and not v[0] in "\"'-" else v
109
110
111def fix_pyopencl_include():
112    """
113    Monkey patch pyopencl to allow spaces in include file path.
114    """
115    import pyopencl as cl
116    if hasattr(cl, '_DEFAULT_INCLUDE_OPTIONS'):
117        cl._DEFAULT_INCLUDE_OPTIONS = [
118            quote_path(v) for v in cl._DEFAULT_INCLUDE_OPTIONS
119            ]
120
121
122if HAVE_OPENCL:
123    fix_pyopencl_include()
124
125# The max loops number is limited by the amount of local memory available
126# on the device.  You don't want to make this value too big because it will
127# waste resources, nor too small because it may interfere with users trying
128# to do their polydispersity calculations.  A value of 1024 should be much
129# larger than necessary given that cost grows as npts^k where k is the number
130# of polydisperse parameters.
131MAX_LOOPS = 2048
132
133# Pragmas for enable OpenCL features.  Be sure to protect them so that they
134# still compile even if OpenCL is not present.
135_F16_PRAGMA = """\
136#if defined(__OPENCL_VERSION__) // && !defined(cl_khr_fp16)
137#  pragma OPENCL EXTENSION cl_khr_fp16: enable
138#endif
139"""
140
141_F64_PRAGMA = """\
142#if defined(__OPENCL_VERSION__) // && !defined(cl_khr_fp64)
143#  pragma OPENCL EXTENSION cl_khr_fp64: enable
144#endif
145"""
146
147
148def use_opencl():
149    sas_opencl = os.environ.get("SAS_OPENCL", "OpenCL").lower()
150    return HAVE_OPENCL and sas_opencl != "none" and not sas_opencl.startswith("cuda")
151
152
153ENV = None
154def reset_environment():
155    """
156    Call to create a new OpenCL context, such as after a change to SAS_OPENCL.
157    """
158    global ENV
159    ENV = GpuEnvironment() if use_opencl() else None
160
161
162def environment():
163    # type: () -> "GpuEnvironment"
164    """
165    Returns a singleton :class:`GpuEnvironment`.
166
167    This provides an OpenCL context and one queue per device.
168    """
169    if ENV is None:
170        if not HAVE_OPENCL:
171            raise RuntimeError("OpenCL startup failed with ***"
172                               + OPENCL_ERROR + "***; using C compiler instead")
173        reset_environment()
174        if ENV is None:
175            raise RuntimeError("SAS_OPENCL=None in environment")
176    return ENV
177
178
179def has_type(device, dtype):
180    # type: (cl.Device, np.dtype) -> bool
181    """
182    Return true if device supports the requested precision.
183    """
184    if dtype == F32:
185        return True
186    elif dtype == F64:
187        return "cl_khr_fp64" in device.extensions
188    else:
189        # Not supporting F16 type since it isn't accurate enough.
190        return False
191
192
193def get_warp(kernel, queue):
194    # type: (cl.Kernel, cl.CommandQueue) -> int
195    """
196    Return the size of an execution batch for *kernel* running on *queue*.
197    """
198    return kernel.get_work_group_info(
199        cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
200        queue.device)
201
202
203def compile_model(context, source, dtype, fast=False):
204    # type: (cl.Context, str, np.dtype, bool) -> cl.Program
205    """
206    Build a model to run on the gpu.
207
208    Returns the compiled program and its type.
209
210    Raises an error if the desired precision is not available.
211    """
212    dtype = np.dtype(dtype)
213    if not all(has_type(d, dtype) for d in context.devices):
214        raise RuntimeError("%s not supported for devices"%dtype)
215
216    source_list = [generate.convert_type(source, dtype)]
217
218    if dtype == generate.F16:
219        source_list.insert(0, _F16_PRAGMA)
220    elif dtype == generate.F64:
221        source_list.insert(0, _F64_PRAGMA)
222
223    # Note: USE_SINCOS makes the Intel CPU slower under OpenCL.
224    if context.devices[0].type == cl.device_type.GPU:
225        source_list.insert(0, "#define USE_SINCOS\n")
226    options = (get_fast_inaccurate_build_options(context.devices[0])
227               if fast else [])
228    source = "\n".join(source_list)
229    program = cl.Program(context, source).build(options=options)
230
231    #print("done with "+program)
232    return program
233
234
235# For now, this returns one device in the context.
236# TODO: Create a context that contains all devices on all platforms.
237class GpuEnvironment(object):
238    """
239    GPU context for OpenCL, with possibly many devices and one queue per device.
240    """
241    def __init__(self):
242        # type: () -> None
243        # Find gpu context.
244        context_list = _create_some_context()
245
246        # Find a context for F32 and for F64 (maybe the same one).
247        # F16 isn't good enough.
248        self.context = {}
249        for dtype in (F32, F64):
250            for context in context_list:
251                if has_type(context.devices[0], dtype):
252                    self.context[dtype] = context
253                    break
254            else:
255                self.context[dtype] = None
256
257        # Build a queue for each context.
258        self.queue = {}
259        context = self.context[F32]
260        self.queue[F32] = cl.CommandQueue(context, context.devices[0])
261        if self.context[F64] == self.context[F32]:
262            self.queue[F64] = self.queue[F32]
263        else:
264            context = self.context[F64]
265            self.queue[F64] = cl.CommandQueue(context, context.devices[0])
266
267        ## Byte boundary for data alignment.
268        #self.data_boundary = max(context.devices[0].min_data_type_align_size
269        #                         for context in self.context.values())
270
271        # Cache for compiled programs, and for items in context.
272        self.compiled = {}
273
274    def has_type(self, dtype):
275        # type: (np.dtype) -> bool
276        """
277        Return True if all devices support a given type.
278        """
279        return self.context.get(dtype, None) is not None
280
281    def compile_program(self, name, source, dtype, fast, timestamp):
282        # type: (str, str, np.dtype, bool, float) -> cl.Program
283        """
284        Compile the program for the device in the given context.
285        """
286        # Note: PyOpenCL caches based on md5 hash of source, options and device
287        # but I'll do so as well just to save some data munging time.
288        tag = generate.tag_source(source)
289        key = "%s-%s-%s%s"%(name, dtype, tag, ("-fast" if fast else ""))
290        # Check timestamp on program.
291        program, program_timestamp = self.compiled.get(key, (None, np.inf))
292        if program_timestamp < timestamp:
293            del self.compiled[key]
294        if key not in self.compiled:
295            context = self.context[dtype]
296            logging.info("building %s for OpenCL %s", key,
297                         context.devices[0].name.strip())
298            program = compile_model(self.context[dtype],
299                                    str(source), dtype, fast)
300            self.compiled[key] = (program, timestamp)
301        return program
302
303
304def _create_some_context():
305    # type: () -> cl.Context
306    """
307    Protected call to cl.create_some_context without interactivity.
308
309    Uses SAS_OPENCL or PYOPENCL_CTX if they are set in the environment,
310    otherwise scans for the most appropriate device using
311    :func:`_get_default_context`.  Ignore *SAS_OPENCL=OpenCL*, which
312    indicates that an OpenCL device should be used without specifying
313    which one (and not a CUDA device, or no GPU).
314    """
315    # Assume we do not get here if SAS_OPENCL is None or CUDA.
316    sas_opencl = os.environ.get('SAS_OPENCL', 'opencl')
317    if sas_opencl.lower() != 'opencl':
318        # Setting PYOPENCL_CTX as a SAS_OPENCL to create cl context.
319        os.environ["PYOPENCL_CTX"] = sas_opencl
320
321    if 'PYOPENCL_CTX' in os.environ:
322        try:
323            return [cl.create_some_context(interactive=False)]
324        except Exception as exc:
325            warnings.warn(str(exc))
326            warnings.warn("pyopencl.create_some_context() failed.  The "
327                "environment variable 'SAS_OPENCL' or 'PYOPENCL_CTX' might "
328                "not be set correctly")
329
330    return _get_default_context()
331
332
333def _get_default_context():
334    # type: () -> List[cl.Context]
335    """
336    Get an OpenCL context, preferring GPU over CPU, and preferring Intel
337    drivers over AMD drivers.
338    """
339    # Note: on mobile devices there is automatic clock scaling if either the
340    # CPU or the GPU is underutilized; probably doesn't affect us, but we if
341    # it did, it would mean that putting a busy loop on the CPU while the GPU
342    # is running may increase throughput.
343    #
344    # MacBook Pro, base install:
345    #     {'Apple': [Intel CPU, NVIDIA GPU]}
346    # MacBook Pro, base install:
347    #     {'Apple': [Intel CPU, Intel GPU]}
348    # 2 x NVIDIA 295 with Intel and NVIDIA opencl drivers install:
349    #     {'Intel': [CPU], 'NVIDIA': [GPU, GPU, GPU, GPU]}
350    gpu, cpu = None, None
351    for platform in cl.get_platforms():
352        # AMD provides a much weaker CPU driver than Intel/Apple, so avoid it.
353        # If someone has bothered to install the AMD/NVIDIA drivers, prefer
354        # them over the integrated graphics driver that may have been supplied
355        # with the CPU chipset.
356        preferred_cpu = (platform.vendor.startswith('Intel')
357                         or platform.vendor.startswith('Apple'))
358        preferred_gpu = (platform.vendor.startswith('Advanced')
359                         or platform.vendor.startswith('NVIDIA'))
360        for device in platform.get_devices():
361            if device.type == cl.device_type.GPU:
362                # If the existing type is not GPU then it will be CUSTOM
363                # or ACCELERATOR so don't override it.
364                if gpu is None or (preferred_gpu and gpu.type == cl.device_type.GPU):
365                    gpu = device
366            elif device.type == cl.device_type.CPU:
367                if cpu is None or preferred_cpu:
368                    cpu = device
369            else:
370                # System has cl.device_type.ACCELERATOR or cl.device_type.CUSTOM
371                # Intel Phi for example registers as an accelerator.
372                # Since the user installed a custom device on their system
373                # and went through the pain of sorting out OpenCL drivers for
374                # it, lets assume they really do want to use it as their
375                # primary compute device.
376                gpu = device
377
378    # Order the devices by gpu then by cpu; when searching for an available
379    # device by data type they will be checked in this order, which means
380    # that if the gpu supports double then the cpu will never be used (though
381    # we may make it possible to explicitly request the cpu at some point).
382    devices = []
383    if gpu is not None:
384        devices.append(gpu)
385    if cpu is not None:
386        devices.append(cpu)
387    return [cl.Context([d]) for d in devices]
388
389
390class GpuModel(KernelModel):
391    """
392    GPU wrapper for a single model.
393
394    *source* and *model_info* are the model source and interface as returned
395    from :func:`generate.make_source` and :func:`generate.make_model_info`.
396
397    *dtype* is the desired model precision.  Any numpy dtype for single
398    or double precision floats will do, such as 'f', 'float32' or 'single'
399    for single and 'd', 'float64' or 'double' for double.  Double precision
400    is an optional extension which may not be available on all devices.
401    Half precision ('float16','half') may be available on some devices.
402    Fast precision ('fast') is a loose version of single precision, indicating
403    that the compiler is allowed to take shortcuts.
404    """
405    info = None  # type: ModelInfo
406    source = ""  # type: str
407    dtype = None  # type: np.dtype
408    fast = False  # type: bool
409    _program = None  # type: cl.Program
410    _kernels = None  # type: Dict[str, cl.Kernel]
411
412    def __init__(self, source, model_info, dtype=generate.F32, fast=False):
413        # type: (Dict[str,str], ModelInfo, np.dtype, bool) -> None
414        self.info = model_info
415        self.source = source
416        self.dtype = dtype
417        self.fast = fast
418
419    def __getstate__(self):
420        # type: () -> Tuple[ModelInfo, str, np.dtype, bool]
421        return self.info, self.source, self.dtype, self.fast
422
423    def __setstate__(self, state):
424        # type: (Tuple[ModelInfo, str, np.dtype, bool]) -> None
425        self.info, self.source, self.dtype, self.fast = state
426        self._program = self._kernels = None
427
428    def make_kernel(self, q_vectors):
429        # type: (List[np.ndarray]) -> "GpuKernel"
430        return GpuKernel(self, q_vectors)
431
432    def get_function(self, name):
433        # type: (str) -> cl.Kernel
434        """
435        Fetch the kernel from the environment by name, compiling it if it
436        does not already exist.
437        """
438        if self._program is None:
439            self._prepare_program()
440        return self._kernels[name]
441
442    def _prepare_program(self):
443        # type: (str) -> None
444        env = environment()
445        timestamp = generate.ocl_timestamp(self.info)
446        program = env.compile_program(
447            self.info.name,
448            self.source['opencl'],
449            self.dtype,
450            self.fast,
451            timestamp)
452        variants = ['Iq', 'Iqxy', 'Imagnetic']
453        names = [generate.kernel_name(self.info, k) for k in variants]
454        functions = [getattr(program, k) for k in names]
455        self._kernels = {k: v for k, v in zip(variants, functions)}
456        # Keep a handle to program so GC doesn't collect.
457        self._program = program
458
459
460# TODO: Check that we don't need a destructor for buffers which go out of scope.
461class GpuInput(object):
462    """
463    Make q data available to the gpu.
464
465    *q_vectors* is a list of q vectors, which will be *[q]* for 1-D data,
466    and *[qx, qy]* for 2-D data.  Internally, the vectors will be reallocated
467    to get the best performance on OpenCL, which may involve shifting and
468    stretching the array to better match the memory architecture.  Additional
469    points will be evaluated with *q=1e-3*.
470
471    *dtype* is the data type for the q vectors. The data type should be
472    set to match that of the kernel, which is an attribute of
473    :class:`GpuProgram`.  Note that not all kernels support double
474    precision, so even if the program was created for double precision,
475    the *GpuProgram.dtype* may be single precision.
476
477    Call :meth:`release` when complete.  Even if not called directly, the
478    buffer will be released when the data object is freed.
479    """
480    def __init__(self, q_vectors, dtype=generate.F32):
481        # type: (List[np.ndarray], np.dtype) -> None
482        # TODO: Do we ever need double precision q?
483        self.nq = q_vectors[0].size
484        self.dtype = np.dtype(dtype)
485        self.is_2d = (len(q_vectors) == 2)
486        # TODO: Stretch input based on get_warp().
487        # Not doing it now since warp depends on kernel, which is not known
488        # at this point, so instead using 32, which is good on the set of
489        # architectures tested so far.
490        if self.is_2d:
491            width = ((self.nq+15)//16)*16
492            self.q = np.empty((width, 2), dtype=dtype)
493            self.q[:self.nq, 0] = q_vectors[0]
494            self.q[:self.nq, 1] = q_vectors[1]
495        else:
496            width = ((self.nq+31)//32)*32
497            self.q = np.empty(width, dtype=dtype)
498            self.q[:self.nq] = q_vectors[0]
499        self.global_size = [self.q.shape[0]]
500        #print("creating inputs of size", self.global_size)
501
502        # Transfer input value to GPU.
503        env = environment()
504        context = env.context[self.dtype]
505        self.q_b = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR,
506                             hostbuf=self.q)
507
508    def release(self):
509        # type: () -> None
510        """
511        Free the buffer associated with the q value.
512        """
513        if self.q_b is not None:
514            self.q_b.release()
515            self.q_b = None
516
517    def __del__(self):
518        # type: () -> None
519        self.release()
520
521
522class GpuKernel(Kernel):
523    """
524    Callable SAS kernel.
525
526    *model* is the GpuModel object to call
527
528    The kernel is derived from :class:`Kernel`, providing the
529    :meth:`call_kernel` method to evaluate the kernel for a given set of
530    parameters.  Because of the need to move the q values to the GPU before
531    evaluation, the kernel is instantiated for a particular set of q vectors,
532    and can be called many times without transfering q each time.
533
534    Call :meth:`release` when done with the kernel instance.
535    """
536    #: SAS model information structure.
537    info = None  # type: ModelInfo
538    #: Kernel precision.
539    dtype = None  # type: np.dtype
540    #: Kernel dimensions (1d or 2d).
541    dim = ""  # type: str
542    #: Calculation results, updated after each call to :meth:`_call_kernel`.
543    result = None  # type: np.ndarray
544
545    def __init__(self, model, q_vectors):
546        # type: (GpuModel, List[np.ndarray]) -> None
547        dtype = model.dtype
548        self.q_input = GpuInput(q_vectors, dtype)
549        self._model = model
550
551        # Attributes accessed from the outside.
552        self.dim = '2d' if self.q_input.is_2d else '1d'
553        self.info = model.info
554        self.dtype = dtype
555
556        # Converter to translate input to target type.
557        self._as_dtype = np.float64 if dtype == generate.F64 else np.float32
558
559        # Holding place for the returned value.
560        nout = 2 if self.info.have_Fq and self.dim == '1d' else 1
561        extra_q = 4  # Total weight, form volume, shell volume and R_eff.
562        self.result = np.empty(self.q_input.nq*nout + extra_q, dtype)
563
564        # Allocate result value on GPU.
565        env = environment()
566        context = env.context[self.dtype]
567        width = ((self.result.size+31)//32)*32 * self.dtype.itemsize
568        self._result_b = cl.Buffer(context, mf.READ_WRITE, width)
569
570    def _call_kernel(self, call_details, values, cutoff, magnetic,
571                     effective_radius_type):
572        # type: (CallDetails, np.ndarray, float, bool, int) -> np.ndarray
573        env = environment()
574        queue = env.queue[self._model.dtype]
575        context = queue.context
576
577        # Arrange data transfer to card.
578        details_b = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR,
579                              hostbuf=call_details.buffer)
580        values_b = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR,
581                             hostbuf=values)
582
583        # Setup kernel function and arguments.
584        name = 'Iq' if self.dim == '1d' else 'Imagnetic' if magnetic else 'Iqxy'
585        kernel = self._model.get_function(name)
586        kernel_args = [
587            np.uint32(self.q_input.nq),  # Number of inputs.
588            None,  # Placeholder for pd_start.
589            None,  # Placeholder for pd_stop.
590            details_b,  # Problem definition.
591            values_b,  # Parameter values.
592            self.q_input.q_b,  # Q values.
593            self._result_b,   # Result storage.
594            self._as_dtype(cutoff),  # Probability cutoff.
595            np.uint32(effective_radius_type),  # R_eff mode.
596        ]
597
598        # Call kernel and retrieve results.
599        #print("Calling OpenCL")
600        #call_details.show(values)
601        wait_for = None
602        last_nap = clock()
603        step = 1000000//self.q_input.nq + 1
604        for start in range(0, call_details.num_eval, step):
605            stop = min(start + step, call_details.num_eval)
606            #print("queuing",start,stop)
607            kernel_args[1:3] = [np.int32(start), np.int32(stop)]
608            wait_for = [kernel(queue, self.q_input.global_size, None,
609                               *kernel_args, wait_for=wait_for)]
610            if stop < call_details.num_eval:
611                # Allow other processes to run.
612                wait_for[0].wait()
613                current_time = clock()
614                if current_time - last_nap > 0.5:
615                    time.sleep(0.001)
616                    last_nap = current_time
617        cl.enqueue_copy(queue, self.result, self._result_b, wait_for=wait_for)
618        #print("result", self.result)
619
620        # Free buffers.
621        details_b.release()
622        values_b.release()
623
624    def release(self):
625        # type: () -> None
626        """
627        Release resources associated with the kernel.
628        """
629        self.q_input.release()
630        if self._result_b is not None:
631            self._result_b.release()
632            self._result_b = None
633
634    def __del__(self):
635        # type: () -> None
636        self.release()
Note: See TracBrowser for help on using the repository browser.