source: sasmodels/sasmodels/kernelcl.py @ b297ba9

Last change: b297ba9, checked in by Paul Kienzle <pkienzle@…>, 5 years ago ("lint")

1"""
2GPU driver for C kernels
3
4TODO: docs are out of date
5
6There should be a single GPU environment running on the system.  This
7environment is constructed on the first call to :func:`env`, and the
8same environment is returned on each call.
9
10After retrieving the environment, the next step is to create the kernel.
11This is done with a call to :meth:`GpuEnvironment.make_kernel`, which
12returns the type of data used by the kernel.
13
14Next a :class:`GpuData` object should be created with the correct kind
15of data.  This data object can be used by multiple kernels, for example,
16if the target model is a weighted sum of multiple kernels.  The data
17should include any extra evaluation points required to compute the proper
18data smearing.  This need not match the square grid for 2D data if there
19is an index saying which q points are active.
20
21Together the GpuData, the program, and a device form a :class:`GpuKernel`.
22This kernel is used during fitting, receiving new sets of parameters and
23evaluating them.  The output value is stored in an output buffer on the
24devices, where it can be combined with other structure factors and form
25factors and have instrumental resolution effects applied.
26
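A rough sketch of this flow using the current class names (the data object
described above is now :class:`GpuInput`, and a helper such as
*call_kernel* from *sasmodels.direct_model* drives the evaluation):

    model = GpuModel(source, model_info)   # program is compiled lazily
    kernel = model.make_kernel([qx, qy])   # q values move to the device
    Iq = call_kernel(kernel, pars)         # evaluate one parameter set
    kernel.release()
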
In order to use OpenCL for your models, you will need OpenCL drivers for
your machine.  These should be available from your graphics card vendor.
Intel provides OpenCL drivers for CPUs as well as their integrated HD
graphics chipsets.  AMD also provides drivers for Intel CPUs, but as of
this writing the performance is lacking compared to the Intel drivers.
NVidia combines drivers for CUDA and OpenCL in one package.  The result
is a bit messy if you have multiple drivers installed.  You can see which
drivers are available by starting python and running:

    import pyopencl as cl
    cl.create_some_context(interactive=True)

Once you have done that, it will show the available drivers, from which
you can select one.  It will then tell you that you can use these drivers
automatically by setting the SAS_OPENCL environment variable, which is
equivalent to PYOPENCL_CTX but does not conflict with other pyopencl
programs.

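For example, to select the first device on the second platform (sh syntax;
the platform:device numbers and *your_script.py* are placeholders that
depend on your machine):

    SAS_OPENCL=1:0 python your_script.py
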
Some graphics cards have multiple devices on the same card.  You cannot
yet use both of them concurrently to evaluate models, but you can run
the program twice using a different device for each session.

OpenCL kernels are compiled when needed by the device driver.  Some
drivers produce compiler output even when there is no error.  You
can see the output by setting PYOPENCL_COMPILER_OUTPUT=1.  It should be
harmless, albeit annoying.
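
For example (sh syntax, with *your_script.py* standing in for whatever
program drives sasmodels):

    PYOPENCL_COMPILER_OUTPUT=1 python your_script.py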
52"""
from __future__ import print_function

import sys
import os
import warnings
import logging
import time

try:
    from time import perf_counter as clock
except ImportError:  # CRUFT: python < 3.3
    if sys.platform.count("darwin") > 0:
        from time import time as clock
    else:
        from time import clock

import numpy as np  # type: ignore

# Attempt to setup OpenCL. This may fail if the pyopencl package is not
# installed or if it is installed but there are no devices available.
try:
    import pyopencl as cl  # type: ignore
    from pyopencl import mem_flags as mf
    from pyopencl.characterize import get_fast_inaccurate_build_options
    # Ask OpenCL for the default context so that we know that one exists.
    cl.create_some_context(interactive=False)
    HAVE_OPENCL = True
    OPENCL_ERROR = ""
except Exception as exc:
    HAVE_OPENCL = False
    OPENCL_ERROR = str(exc)

from . import generate
from .generate import F32, F64
from .kernel import KernelModel, Kernel

# pylint: disable=unused-import
try:
    from typing import Tuple, Callable, Any, List, Dict
    from .modelinfo import ModelInfo
    from .details import CallDetails
except ImportError:
    pass
# pylint: enable=unused-import


# CRUFT: pyopencl < 2017.1 (as of June 2016) needs quotes around the
# include path.
def quote_path(v):
    # type: (str) -> str
    """
    Quote the path if it is not already quoted.

    If v starts with '-', then assume that it is a -I option or similar
    and do not quote it.  This is fragile: a -Ipath containing spaces
    would still need to be quoted.
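
    Illustrative examples (doctest style)::

        >>> quote_path('C:/Program Files/include')
        '"C:/Program Files/include"'
        >>> quote_path('-Ifoo')
        '-Ifoo'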
108    """
109    return '"'+v+'"' if v and ' ' in v and not v[0] in "\"'-" else v
110
111
112def fix_pyopencl_include():
113    # type: (None) -> None
114    """
115    Monkey patch pyopencl to allow spaces in include file path.
116    """
117    import pyopencl
118    if hasattr(pyopencl, '_DEFAULT_INCLUDE_OPTIONS'):
119        pyopencl._DEFAULT_INCLUDE_OPTIONS = [
120            quote_path(v) for v in pyopencl._DEFAULT_INCLUDE_OPTIONS
121            ]
122
123
124if HAVE_OPENCL:
125    fix_pyopencl_include()
126
# The maximum number of loops is limited by the amount of local memory
# available on the device.  You don't want to make this value too big
# because it will waste resources, nor too small because it may limit
# the polydispersity calculations that users can run.  The current value
# of 2048 should be much larger than necessary given that cost grows as
# npts^k, where k is the number of polydisperse parameters.
MAX_LOOPS = 2048

# Pragmas to enable OpenCL features.  Be sure to protect them so that they
# still compile even if OpenCL is not present.
_F16_PRAGMA = """\
#if defined(__OPENCL_VERSION__) // && !defined(cl_khr_fp16)
#  pragma OPENCL EXTENSION cl_khr_fp16: enable
#endif
"""

_F64_PRAGMA = """\
#if defined(__OPENCL_VERSION__) // && !defined(cl_khr_fp64)
#  pragma OPENCL EXTENSION cl_khr_fp64: enable
#endif
"""


def use_opencl():
    # type: () -> bool
    """Return True if OpenCL is the default computational engine."""
    sas_opencl = os.environ.get("SAS_OPENCL", "OpenCL").lower()
    return (HAVE_OPENCL and sas_opencl != "none"
            and not sas_opencl.startswith("cuda"))


ENV = None
def reset_environment():
    # type: () -> None
    """
    Call to create a new OpenCL context, such as after a change to SAS_OPENCL.
    """
    global ENV
    ENV = GpuEnvironment() if use_opencl() else None
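
# For example, to switch devices at runtime (a sketch; the context string
# uses pyopencl's platform:device numbering):
#
#     os.environ["SAS_OPENCL"] = "1:0"
#     reset_environment()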


def environment():
    # type: () -> "GpuEnvironment"
    """
    Returns a singleton :class:`GpuEnvironment`.

    This provides an OpenCL context and one queue per device.
    """
    if ENV is None:
        if not HAVE_OPENCL:
            raise RuntimeError("OpenCL startup failed with ***"
                               + OPENCL_ERROR + "***; using C compiler instead")
        reset_environment()
        if ENV is None:
            raise RuntimeError("SAS_OPENCL=None in environment")
    return ENV
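
# Typical use (a sketch): fetch the singleton environment and check that
# the device supports the requested precision before compiling:
#
#     env = environment()
#     if env.has_type(F64):
#         program = env.compile_program(name, source, F64, fast, timestamp)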


def has_type(device, dtype):
    # type: (cl.Device, np.dtype) -> bool
    """
    Return true if device supports the requested precision.
    """
    if dtype == F32:
        return True
    elif dtype == F64:
        return "cl_khr_fp64" in device.extensions
    else:
        # Not supporting F16 type since it isn't accurate enough.
        return False


def get_warp(kernel, queue):
    # type: (cl.Kernel, cl.CommandQueue) -> int
    """
    Return the size of an execution batch for *kernel* running on *queue*.
    """
    return kernel.get_work_group_info(
        cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
        queue.device)


def compile_model(context, source, dtype, fast=False):
    # type: (cl.Context, str, np.dtype, bool) -> cl.Program
    """
    Build a model to run on the gpu.

    Returns the compiled program.

    Raises an error if the desired precision is not available.
    """
    dtype = np.dtype(dtype)
    if not all(has_type(d, dtype) for d in context.devices):
        raise RuntimeError("%s not supported for devices" % dtype)

    source_list = [generate.convert_type(source, dtype)]

    if dtype == generate.F16:
        source_list.insert(0, _F16_PRAGMA)
    elif dtype == generate.F64:
        source_list.insert(0, _F64_PRAGMA)

    # Note: USE_SINCOS makes the Intel CPU slower under OpenCL.
    if context.devices[0].type == cl.device_type.GPU:
        source_list.insert(0, "#define USE_SINCOS\n")
    options = (get_fast_inaccurate_build_options(context.devices[0])
               if fast else [])
    source = "\n".join(source_list)
    program = cl.Program(context, source).build(options=options)

    #print("done with "+program)
    return program


# For now, this returns one device in the context.
# TODO: Create a context that contains all devices on all platforms.
class GpuEnvironment(object):
    """
    GPU context for OpenCL, with possibly many devices and one queue per device.
    """
    def __init__(self):
        # type: () -> None
        # Find gpu context.
        context_list = _create_some_context()

        # Find a context for F32 and for F64 (maybe the same one).
        # F16 isn't good enough.
        self.context = {}
        for dtype in (F32, F64):
            for context in context_list:
                if has_type(context.devices[0], dtype):
                    self.context[dtype] = context
                    break
            else:
                self.context[dtype] = None

        # Build a queue for each context.
        self.queue = {}
        context = self.context[F32]
        self.queue[F32] = cl.CommandQueue(context, context.devices[0])
        if self.context[F64] == self.context[F32]:
            self.queue[F64] = self.queue[F32]
        else:
            context = self.context[F64]
            self.queue[F64] = cl.CommandQueue(context, context.devices[0])

        ## Byte boundary for data alignment.
        #self.data_boundary = max(context.devices[0].min_data_type_align_size
        #                         for context in self.context.values())

        # Cache for compiled programs, and for items in context.
        self.compiled = {}

    def has_type(self, dtype):
        # type: (np.dtype) -> bool
        """
        Return True if all devices support a given type.
        """
        return self.context.get(dtype, None) is not None

    def compile_program(self, name, source, dtype, fast, timestamp):
        # type: (str, str, np.dtype, bool, float) -> cl.Program
        """
        Compile the program for the device in the given context.
        """
        # Note: PyOpenCL already caches based on the md5 hash of source,
        # options and device, but we cache as well to save some data
        # munging time.
        tag = generate.tag_source(source)
        key = "%s-%s-%s%s" % (name, dtype, tag, ("-fast" if fast else ""))
        # Check timestamp on program.
        program, program_timestamp = self.compiled.get(key, (None, np.inf))
        if program_timestamp < timestamp:
            del self.compiled[key]
        if key not in self.compiled:
            context = self.context[dtype]
            logging.info("building %s for OpenCL %s", key,
                         context.devices[0].name.strip())
            program = compile_model(self.context[dtype],
                                    str(source), dtype, fast)
            self.compiled[key] = (program, timestamp)
        return program


def _create_some_context():
    # type: () -> List[cl.Context]
    """
    Protected call to cl.create_some_context without interactivity.

    Uses SAS_OPENCL or PYOPENCL_CTX if they are set in the environment,
    otherwise scans for the most appropriate device using
    :func:`_get_default_context`.  Ignores *SAS_OPENCL=OpenCL*, which
    indicates that an OpenCL device should be used without specifying
    which one (and not a CUDA device, or no GPU).
    """
    # Assume we do not get here if SAS_OPENCL is None or CUDA.
    sas_opencl = os.environ.get('SAS_OPENCL', 'opencl')
    if sas_opencl.lower() != 'opencl':
        # Copy the SAS_OPENCL value into PYOPENCL_CTX so that pyopencl
        # can use it to create the context.
        os.environ["PYOPENCL_CTX"] = sas_opencl

    if 'PYOPENCL_CTX' in os.environ:
        try:
            return [cl.create_some_context(interactive=False)]
        except Exception as exc:
            # TODO: Should warnings instead be put into logging.warn?
            warnings.warn(str(exc))
            warnings.warn(
                "pyopencl.create_some_context() failed.  The environment "
                "variable 'SAS_OPENCL' or 'PYOPENCL_CTX' might not be set "
                "correctly")

    return _get_default_context()


def _get_default_context():
    # type: () -> List[cl.Context]
    """
    Get an OpenCL context, preferring GPU over CPU, and preferring Intel
    drivers over AMD drivers.
    """
    # Note: on mobile devices there is automatic clock scaling if either the
    # CPU or the GPU is underutilized; this probably doesn't affect us, but
    # if it did, it would mean that putting a busy loop on the CPU while the
    # GPU is running may increase throughput.
    #
    # MacBook Pro, base install:
    #     {'Apple': [Intel CPU, NVIDIA GPU]}
    # MacBook Pro, base install:
    #     {'Apple': [Intel CPU, Intel GPU]}
    # 2 x NVIDIA 295 with Intel and NVIDIA opencl drivers installed:
    #     {'Intel': [CPU], 'NVIDIA': [GPU, GPU, GPU, GPU]}
    gpu, cpu = None, None
    for platform in cl.get_platforms():
        # AMD provides a much weaker CPU driver than Intel/Apple, so avoid it.
        # If someone has bothered to install the AMD/NVIDIA drivers, prefer
        # them over the integrated graphics driver that may have been supplied
        # with the CPU chipset.
        preferred_cpu = (platform.vendor.startswith('Intel')
                         or platform.vendor.startswith('Apple'))
        preferred_gpu = (platform.vendor.startswith('Advanced')
                         or platform.vendor.startswith('NVIDIA'))
        for device in platform.get_devices():
            if device.type == cl.device_type.GPU:
                # If the existing type is not GPU then it will be CUSTOM
                # or ACCELERATOR, so don't override it.
                if gpu is None or (preferred_gpu and gpu.type == cl.device_type.GPU):
                    gpu = device
            elif device.type == cl.device_type.CPU:
                if cpu is None or preferred_cpu:
                    cpu = device
            else:
                # System has cl.device_type.ACCELERATOR or cl.device_type.CUSTOM.
                # The Intel Phi, for example, registers as an accelerator.
                # Since the user installed a custom device on their system
                # and went through the pain of sorting out OpenCL drivers for
                # it, let's assume they really do want to use it as their
                # primary compute device.
                gpu = device

    # Order the devices by gpu then by cpu; when searching for an available
    # device by data type they will be checked in this order, which means
    # that if the gpu supports double then the cpu will never be used (though
    # we may make it possible to explicitly request the cpu at some point).
    devices = []
    if gpu is not None:
        devices.append(gpu)
    if cpu is not None:
        devices.append(cpu)
    return [cl.Context([d]) for d in devices]


class GpuModel(KernelModel):
    """
    GPU wrapper for a single model.

    *source* and *model_info* are the model source and interface as returned
    from :func:`generate.make_source` and :func:`generate.make_model_info`.

    *dtype* is the desired model precision.  Any numpy dtype for single
    or double precision floats will do, such as 'f', 'float32' or 'single'
    for single and 'd', 'float64' or 'double' for double.  Double precision
    is an optional extension which may not be available on all devices.
    Half precision ('float16', 'half') may be available on some devices.
    Fast precision ('fast') is a loose version of single precision, indicating
    that the compiler is allowed to take shortcuts.
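
    A minimal sketch of use, assuming *source* and *model_info* come from
    :func:`generate.make_source` and :func:`generate.make_model_info`::

        model = GpuModel(source, model_info, dtype=generate.F32)
        kernel = model.make_kernel([q])  # q is a 1-D numpy array
        # ... evaluate via the Kernel interface, then clean up ...
        kernel.release()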
411    """
412    info = None  # type: ModelInfo
413    source = ""  # type: str
414    dtype = None  # type: np.dtype
415    fast = False  # type: bool
416    _program = None  # type: cl.Program
417    _kernels = None  # type: Dict[str, cl.Kernel]
418
419    def __init__(self, source, model_info, dtype=generate.F32, fast=False):
420        # type: (Dict[str,str], ModelInfo, np.dtype, bool) -> None
421        self.info = model_info
422        self.source = source
423        self.dtype = dtype
424        self.fast = fast
425
426    def __getstate__(self):
427        # type: () -> Tuple[ModelInfo, str, np.dtype, bool]
428        return self.info, self.source, self.dtype, self.fast
429
430    def __setstate__(self, state):
431        # type: (Tuple[ModelInfo, str, np.dtype, bool]) -> None
432        self.info, self.source, self.dtype, self.fast = state
433        self._program = self._kernels = None
434
435    def make_kernel(self, q_vectors):
436        # type: (List[np.ndarray]) -> "GpuKernel"
437        return GpuKernel(self, q_vectors)
438
    def get_function(self, name):
        # type: (str) -> cl.Kernel
        """
        Fetch the kernel from the environment by name, compiling it if it
        does not already exist.
        """
        if self._program is None:
            self._prepare_program()
        return self._kernels[name]

    def _prepare_program(self):
        # type: () -> None
        env = environment()
        timestamp = generate.ocl_timestamp(self.info)
        program = env.compile_program(
            self.info.name,
            self.source['opencl'],
            self.dtype,
            self.fast,
            timestamp)
        variants = ['Iq', 'Iqxy', 'Imagnetic']
        names = [generate.kernel_name(self.info, k) for k in variants]
        functions = [getattr(program, k) for k in names]
        self._kernels = {k: v for k, v in zip(variants, functions)}
        # Keep a handle to program so GC doesn't collect.
        self._program = program


# TODO: Check that we don't need a destructor for buffers which go out of scope.
class GpuInput(object):
    """
    Make q data available to the gpu.

    *q_vectors* is a list of q vectors, which will be *[q]* for 1-D data,
    and *[qx, qy]* for 2-D data.  Internally, the vectors will be reallocated
    to get the best performance on OpenCL, which may involve shifting and
    stretching the array to better match the memory architecture.  Additional
    points will be evaluated with *q=1e-3*.

    *dtype* is the data type for the q vectors.  The data type should be
    set to match that of the kernel, which is an attribute of
    :class:`GpuModel`.  Note that not all kernels support double
    precision, so even if the program was created for double precision,
    the *GpuModel.dtype* may be single precision.

    Call :meth:`release` when complete.  Even if not called directly, the
    buffer will be released when the data object is freed.
    """
    def __init__(self, q_vectors, dtype=generate.F32):
        # type: (List[np.ndarray], np.dtype) -> None
        # TODO: Do we ever need double precision q?
        self.nq = q_vectors[0].size
        self.dtype = np.dtype(dtype)
        self.is_2d = (len(q_vectors) == 2)
        # TODO: Stretch input based on get_warp().
        # Not doing it now since warp depends on kernel, which is not known
        # at this point, so instead using 32, which is good on the set of
        # architectures tested so far.
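        # For example, nq=1000 pads the 2-D buffer to 1008 points (the
        # next multiple of 16) and the 1-D buffer to 1024 points (the
        # next multiple of 32).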
        if self.is_2d:
            width = ((self.nq+15)//16)*16
            self.q = np.empty((width, 2), dtype=dtype)
            self.q[:self.nq, 0] = q_vectors[0]
            self.q[:self.nq, 1] = q_vectors[1]
        else:
            width = ((self.nq+31)//32)*32
            self.q = np.empty(width, dtype=dtype)
            self.q[:self.nq] = q_vectors[0]
        self.global_size = [self.q.shape[0]]
        #print("creating inputs of size", self.global_size)

        # Transfer input value to GPU.
        env = environment()
        context = env.context[self.dtype]
        self.q_b = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR,
                             hostbuf=self.q)

    def release(self):
        # type: () -> None
        """
        Free the buffer associated with the q value.
        """
        if self.q_b is not None:
            self.q_b.release()
            self.q_b = None

    def __del__(self):
        # type: () -> None
        self.release()


class GpuKernel(Kernel):
    """
    Callable SAS kernel.

    *model* is the GpuModel object to call.

    The kernel is derived from :class:`Kernel`, providing the
    :meth:`call_kernel` method to evaluate the kernel for a given set of
    parameters.  Because of the need to move the q values to the GPU before
    evaluation, the kernel is instantiated for a particular set of q vectors,
    and can be called many times without transferring q each time.

    Call :meth:`release` when done with the kernel instance.
    """
    #: SAS model information structure.
    info = None  # type: ModelInfo
    #: Kernel precision.
    dtype = None  # type: np.dtype
    #: Kernel dimensions (1d or 2d).
    dim = ""  # type: str
    #: Calculation results, updated after each call to :meth:`_call_kernel`.
    result = None  # type: np.ndarray

    def __init__(self, model, q_vectors):
        # type: (GpuModel, List[np.ndarray]) -> None
        dtype = model.dtype
        self.q_input = GpuInput(q_vectors, dtype)
        self._model = model

        # Attributes accessed from the outside.
        self.dim = '2d' if self.q_input.is_2d else '1d'
        self.info = model.info
        self.dtype = dtype

        # Converter to translate input to target type.
        self._as_dtype = np.float64 if dtype == generate.F64 else np.float32

        # Holding place for the returned value.
        nout = 2 if self.info.have_Fq and self.dim == '1d' else 1
        extra_q = 4  # Total weight, form volume, shell volume and R_eff.
        self.result = np.empty(self.q_input.nq*nout + extra_q, dtype)

        # Allocate result value on GPU.
        env = environment()
        context = env.context[self.dtype]
        width = ((self.result.size+31)//32)*32 * self.dtype.itemsize
        self._result_b = cl.Buffer(context, mf.READ_WRITE, width)

    def _call_kernel(self, call_details, values, cutoff, magnetic,
                     effective_radius_type):
        # type: (CallDetails, np.ndarray, float, bool, int) -> None
        env = environment()
        queue = env.queue[self._model.dtype]
        context = queue.context

        # Arrange data transfer to card.
        details_b = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR,
                              hostbuf=call_details.buffer)
        values_b = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR,
                             hostbuf=values)

        # Setup kernel function and arguments.
        name = 'Iq' if self.dim == '1d' else 'Imagnetic' if magnetic else 'Iqxy'
        kernel = self._model.get_function(name)
        kernel_args = [
            np.uint32(self.q_input.nq),  # Number of inputs.
            None,  # Placeholder for pd_start.
            None,  # Placeholder for pd_stop.
            details_b,  # Problem definition.
            values_b,  # Parameter values.
            self.q_input.q_b,  # Q values.
            self._result_b,  # Result storage.
            self._as_dtype(cutoff),  # Probability cutoff.
            np.uint32(effective_radius_type),  # R_eff mode.
        ]

        # Call kernel and retrieve results.
        #print("Calling OpenCL")
        #call_details.show(values)
        wait_for = None
        last_nap = clock()
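        # Break the polydispersity loop into chunks of roughly one million
        # q-point evaluations so individual kernel launches stay short and
        # the GPU can service other processes between them.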
        step = 1000000//self.q_input.nq + 1
        for start in range(0, call_details.num_eval, step):
            stop = min(start + step, call_details.num_eval)
            #print("queuing",start,stop)
            kernel_args[1:3] = [np.int32(start), np.int32(stop)]
            wait_for = [kernel(queue, self.q_input.global_size, None,
                               *kernel_args, wait_for=wait_for)]
            if stop < call_details.num_eval:
                # Allow other processes to run.
                wait_for[0].wait()
                current_time = clock()
                if current_time - last_nap > 0.5:
                    time.sleep(0.001)
                    last_nap = current_time
        cl.enqueue_copy(queue, self.result, self._result_b, wait_for=wait_for)
        #print("result", self.result)

        # Free buffers.
        details_b.release()
        values_b.release()

    def release(self):
        # type: () -> None
        """
        Release resources associated with the kernel.
        """
        self.q_input.release()
        if self._result_b is not None:
            self._result_b.release()
            self._result_b = None

    def __del__(self):
        # type: () -> None
        self.release()