source: sasmodels/sasmodels/kernelcl.py @ a4280bd

Last change: a4280bd, checked in by Paul Kienzle <pkienzle@…>, 8 years ago

restructure magnetic models to use less code

1"""
2GPU driver for C kernels
3
4There should be a single GPU environment running on the system.  This
5environment is constructed on the first call to :func:`env`, and the
6same environment is returned on each call.
7
After retrieving the environment, the next step is to create the kernel.
This is done with a call to :meth:`GpuModel.make_kernel`, which returns
a kernel specialized to a particular set of q values.

Next a :class:`GpuInput` object should be created with the correct kind
of data.  This data object can be used by multiple kernels, for example,
if the target model is a weighted sum of multiple kernels.  The data
should include any extra evaluation points required to compute the proper
data smearing.  This need not match the square grid for 2D data if there
is an index saying which q points are active.

Together the GpuInput, the program, and a device form a :class:`GpuKernel`.
This kernel is used during fitting, receiving new sets of parameters and
evaluating them.  The output value is stored in an output buffer on the
devices, where it can be combined with other structure factors and form
factors and have instrumental resolution effects applied.

In order to use OpenCL for your models, you will need OpenCL drivers for
your machine.  These should be available from your graphics card vendor.
Intel provides OpenCL drivers for CPUs as well as their integrated HD
graphics chipsets.  AMD also provides drivers for Intel CPUs, but as of
this writing the performance is lacking compared to the Intel drivers.
NVidia combines drivers for CUDA and OpenCL in one package.  The result
is a bit messy if you have multiple drivers installed.  You can see which
drivers are available by starting python and running:

    import pyopencl as cl
    cl.create_some_context(interactive=True)

Once you have done that, it will show the available drivers which you
can select.  It will then tell you that you can use these drivers
automatically by setting the PYOPENCL_CTX environment variable.
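
For example, a script can preselect a device without the interactive
prompt (the device string '0:1' below is hypothetical; use whatever the
interactive prompt reports for your system):

    import os
    os.environ['PYOPENCL_CTX'] = '0:1'
    import pyopencl as cl
    cl.create_some_context(interactive=False)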

Some graphics cards have multiple devices on the same card.  You cannot
yet use both of them concurrently to evaluate models, but you can run
the program twice using a different device for each session.

OpenCL kernels are compiled when needed by the device driver.  Some
drivers produce compiler output even when there is no error.  You
can see the output by setting PYOPENCL_COMPILER_OUTPUT=1.  It should be
harmless, albeit annoying.
"""
from __future__ import print_function

import os
import warnings
import logging

import numpy as np  # type: ignore

try:
    #raise NotImplementedError("OpenCL not yet implemented for new kernel template")
    import pyopencl as cl  # type: ignore
    # Ask OpenCL for the default context so that we know that one exists
    cl.create_some_context(interactive=False)
except Exception as exc:
    warnings.warn("OpenCL startup failed with ***"+str(exc)+"***; using C compiler instead")
    raise RuntimeError("OpenCL not available")

from pyopencl import mem_flags as mf  # type: ignore
from pyopencl.characterize import get_fast_inaccurate_build_options  # type: ignore

# CRUFT: pyopencl < 2017.1  (as of June 2016 needs quotes around include path)
def _quote_path(v):
    """
    Quote the path if it is not already quoted.

    If v starts with '-', then assume that it is a -I option or similar
    and do not quote it.  This is fragile:  -Ipath with space needs to
    be quoted.
    """
    return '"'+v+'"' if v and ' ' in v and not v[0] in "\"'-" else v
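
# Usage sketch (illustrative paths):
#     _quote_path('/opt/my includes')   -> '"/opt/my includes"'
#     _quote_path('-I/opt/my includes') -> '-I/opt/my includes'  (left alone)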

if hasattr(cl, '_DEFAULT_INCLUDE_OPTIONS'):
    cl._DEFAULT_INCLUDE_OPTIONS = [_quote_path(v) for v in cl._DEFAULT_INCLUDE_OPTIONS]

from . import generate
from .kernel import KernelModel, Kernel

try:
    from typing import Tuple, List, Dict, Callable, Any
    from .modelinfo import ModelInfo
    from .details import CallDetails
except ImportError:
    pass

# The max loops number is limited by the amount of local memory available
# on the device.  You don't want to make this value too big because it will
# waste resources, nor too small because it may interfere with users trying
# to do their polydispersity calculations.  A value of 2048 should be much
# larger than necessary given that cost grows as npts^k where k is the number
# of polydisperse parameters.
MAX_LOOPS = 2048


# Pragmas to enable OpenCL features.  Be sure to protect them so that they
# still compile even if OpenCL is not present.
_F16_PRAGMA = """\
#if defined(__OPENCL_VERSION__) // && !defined(cl_khr_fp16)
#  pragma OPENCL EXTENSION cl_khr_fp16: enable
#endif
"""

_F64_PRAGMA = """\
#if defined(__OPENCL_VERSION__) // && !defined(cl_khr_fp64)
#  pragma OPENCL EXTENSION cl_khr_fp64: enable
#endif
"""


ENV = None
def environment():
    # type: () -> "GpuEnvironment"
    """
    Returns a singleton :class:`GpuEnvironment`.

    This provides an OpenCL context and one queue per device.
    """
    global ENV
    if ENV is None:
        ENV = GpuEnvironment()
    return ENV

def has_type(device, dtype):
    # type: (cl.Device, np.dtype) -> bool
    """
    Return true if device supports the requested precision.
    """
    if dtype == generate.F32:
        return True
    elif dtype == generate.F64:
        return "cl_khr_fp64" in device.extensions
    elif dtype == generate.F16:
        return "cl_khr_fp16" in device.extensions
    else:
        return False

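# Usage sketch (assumes pyopencl can find at least one device):
#
#     device = cl.create_some_context(interactive=False).devices[0]
#     print(has_type(device, generate.F64))  # True if doubles are supported
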
def get_warp(kernel, queue):
    # type: (cl.Kernel, cl.CommandQueue) -> int
    """
    Return the size of an execution batch for *kernel* running on *queue*.
    """
    return kernel.get_work_group_info(
        cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
        queue.device)

def _stretch_input(vector, dtype, extra=1e-3, boundary=32):
    # type: (np.ndarray, np.dtype, float, int) -> np.ndarray
    """
    Stretch an input vector to the correct boundary.

    Performance on the kernels can drop by a factor of two or more if the
    number of values to compute does not fall on a nice power of two
    boundary.  The trailing additional vector elements are given a
    value of *extra*, and so f(*extra*) will be computed for each of
    them.  The returned array will thus be a subset of the computed array.

    *boundary* should be a power of 2 which is at least 32 for good
    performance on current platforms (as of Jan 2015).  It should
    probably be the max of get_warp(kernel, queue) and
    device.min_data_type_align_size//4.
    """
    remainder = vector.size % boundary
    if remainder != 0:
        size = vector.size + (boundary - remainder)
        vector = np.hstack((vector, [extra] * (size - vector.size)))
    return np.ascontiguousarray(vector, dtype=dtype)
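
# Usage sketch: 100 q points are padded up to the next multiple of 32
# (128 here); the extra 28 entries are filled with q = 1e-3.
#
#     q = np.linspace(0.001, 0.5, 100)
#     padded = _stretch_input(q, np.float32)
#     assert padded.size == 128 and padded.dtype == np.float32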


def compile_model(context, source, dtype, fast=False):
    # type: (cl.Context, str, np.dtype, bool) -> cl.Program
    """
    Build a model to run on the gpu.

    Returns the compiled program.  Raises RuntimeError if any device in
    the context does not support the requested precision.
    """
    dtype = np.dtype(dtype)
    if not all(has_type(d, dtype) for d in context.devices):
        raise RuntimeError("%s not supported for devices"%dtype)

    source_list = [generate.convert_type(source, dtype)]

    if dtype == generate.F16:
        source_list.insert(0, _F16_PRAGMA)
    elif dtype == generate.F64:
        source_list.insert(0, _F64_PRAGMA)

    # Note: USE_SINCOS makes the intel cpu slower under opencl
    if context.devices[0].type == cl.device_type.GPU:
        source_list.insert(0, "#define USE_SINCOS\n")
    options = (get_fast_inaccurate_build_options(context.devices[0])
               if fast else [])
    source = "\n".join(source_list)
    program = cl.Program(context, source).build(options=options)
    #print("done with "+program)
    return program


# for now, this returns one device in the context
# TODO: create a context that contains all devices on all platforms
class GpuEnvironment(object):
    """
    GPU context manager, holding a list of single-device OpenCL contexts
    with one command queue per context.
    """
    def __init__(self):
        # type: () -> None
        # find gpu context
        #self.context = cl.create_some_context()

        self.context = None
        if 'PYOPENCL_CTX' in os.environ:
            self._create_some_context()

        if not self.context:
            self.context = _get_default_context()

        # Byte boundary for data alignment
        #self.data_boundary = max(d.min_data_type_align_size
        #                         for d in self.context.devices)
        self.queues = [cl.CommandQueue(context, context.devices[0])
                       for context in self.context]
        self.compiled = {}

    def has_type(self, dtype):
        # type: (np.dtype) -> bool
        """
        Return True if any of the contexts has a device supporting the
        given type.
        """
        return any(has_type(d, dtype)
                   for context in self.context
                   for d in context.devices)

    def get_queue(self, dtype):
        # type: (np.dtype) -> cl.CommandQueue
        """
        Return a command queue for the kernels of type dtype.
        """
        for context, queue in zip(self.context, self.queues):
            if all(has_type(d, dtype) for d in context.devices):
                return queue

    def get_context(self, dtype):
        # type: (np.dtype) -> cl.Context
        """
        Return an OpenCL context for the kernels of type dtype.
        """
        for context, queue in zip(self.context, self.queues):
            if all(has_type(d, dtype) for d in context.devices):
                return context

    def _create_some_context(self):
        # type: () -> cl.Context
        """
        Protected call to cl.create_some_context without interactivity.  Use
        this if PYOPENCL_CTX is set in the environment.  Sets the *context*
        attribute.
        """
        try:
            self.context = [cl.create_some_context(interactive=False)]
        except Exception as exc:
            warnings.warn(str(exc))
            warnings.warn("pyopencl.create_some_context() failed")
            warnings.warn("the environment variable 'PYOPENCL_CTX' might not be set correctly")

    def compile_program(self, name, source, dtype, fast=False):
        # type: (str, str, np.dtype, bool) -> cl.Program
        """
        Compile the program for the device in the given context.
        """
        key = "%s-%s%s"%(name, dtype, ("-fast" if fast else ""))
        if key not in self.compiled:
            context = self.get_context(dtype)
            logging.info("building %s for OpenCL %s"
                         % (key, context.devices[0].name.strip()))
            program = compile_model(context, str(source), dtype, fast)
            self.compiled[key] = program
        return self.compiled[key]
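
    # Usage sketch ('cylinder' and *source* are illustrative stand-ins):
    #
    #     env = environment()
    #     p1 = env.compile_program('cylinder', source, generate.F32)
    #     p2 = env.compile_program('cylinder', source, generate.F32)
    #     assert p1 is p2   # cached under the (name, dtype, fast) key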

    def release_program(self, name):
        # type: (str) -> None
        """
        Free memory associated with the program on the device.
        """
        if name in self.compiled:
            self.compiled[name].release()
            del self.compiled[name]

def _get_default_context():
    # type: () -> List[cl.Context]
    """
    Get OpenCL contexts, preferring GPU over CPU, and preferring Intel
    drivers over AMD drivers for the CPU.
    """
    # Note: on mobile devices there is automatic clock scaling if either the
    # CPU or the GPU is underutilized; probably doesn't affect us, but if
    # it did, it would mean that putting a busy loop on the CPU while the GPU
    # is running may increase throughput.
    #
    # Macbook pro, base install:
    #     {'Apple': [Intel CPU, NVIDIA GPU]}
    # Macbook pro, base install:
    #     {'Apple': [Intel CPU, Intel GPU]}
    # 2 x nvidia 295 with Intel and NVIDIA opencl drivers installed
    #     {'Intel': [CPU], 'NVIDIA': [GPU, GPU, GPU, GPU]}
    gpu, cpu = None, None
    for platform in cl.get_platforms():
        # AMD provides a much weaker CPU driver than Intel/Apple, so avoid it.
        # If someone has bothered to install the AMD/NVIDIA drivers, prefer
        # them over the integrated graphics driver that may have been supplied
        # with the CPU chipset.
        preferred_cpu = (platform.vendor.startswith('Intel')
                         or platform.vendor.startswith('Apple'))
        preferred_gpu = (platform.vendor.startswith('Advanced')
                         or platform.vendor.startswith('NVIDIA'))
        for device in platform.get_devices():
            if device.type == cl.device_type.GPU:
                # If the existing type is not GPU then it will be CUSTOM
                # or ACCELERATOR, so don't override it.
                if gpu is None or (preferred_gpu and gpu.type == cl.device_type.GPU):
                    gpu = device
            elif device.type == cl.device_type.CPU:
                if cpu is None or preferred_cpu:
                    cpu = device
            else:
                # System has cl.device_type.ACCELERATOR or cl.device_type.CUSTOM.
                # The Intel Phi, for example, registers as an accelerator.
                # Since the user installed a custom device on their system and
                # went through the pain of sorting out OpenCL drivers for it,
                # let's assume they really do want to use it as their primary
                # compute device.
                gpu = device

    # Order the devices gpu first, then cpu; when searching for an available
    # device by data type they will be checked in this order, which means
    # that if the gpu supports double then the cpu will never be used (though
    # we may make it possible to explicitly request the cpu at some point).
    devices = []
    if gpu is not None:
        devices.append(gpu)
    if cpu is not None:
        devices.append(cpu)
    return [cl.Context([d]) for d in devices]
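
# Usage sketch: show the preference order this helper produces:
#
#     for context in _get_default_context():
#         print(context.devices[0].name)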


class GpuModel(KernelModel):
    """
    GPU wrapper for a single model.

    *source* and *model_info* are the model source and interface as returned
    from :func:`generate.make_source` and :func:`generate.make_model_info`.

    *dtype* is the desired model precision.  Any numpy dtype for single
    or double precision floats will do, such as 'f', 'float32' or 'single'
    for single and 'd', 'float64' or 'double' for double.  Double precision
    is an optional extension which may not be available on all devices.
    Half precision ('float16', 'half') may be available on some devices.
    Fast precision ('fast') is a loose version of single precision, indicating
    that the compiler is allowed to take shortcuts.
    """
    def __init__(self, source, model_info, dtype=generate.F32, fast=False):
        # type: (Dict[str, str], ModelInfo, np.dtype, bool) -> None
        self.info = model_info
        self.source = source
        self.dtype = dtype
        self.fast = fast
        self.program = None  # delay program creation

    def __getstate__(self):
        # type: () -> Tuple[ModelInfo, Dict[str, str], np.dtype, bool]
        return self.info, self.source, self.dtype, self.fast

    def __setstate__(self, state):
        # type: (Tuple[ModelInfo, Dict[str, str], np.dtype, bool]) -> None
        self.info, self.source, self.dtype, self.fast = state
        self.program = None

    def make_kernel(self, q_vectors):
        # type: (List[np.ndarray]) -> "GpuKernel"
        if self.program is None:
            compile_program = environment().compile_program
            self.program = compile_program(
                self.info.name,
                self.source['opencl'],
                self.dtype,
                self.fast)
            variants = ['Iq', 'Iqxy', 'Imagnetic']
            names = [generate.kernel_name(self.info, k) for k in variants]
            kernels = [getattr(self.program, k) for k in names]
            self._kernels = dict(zip(variants, kernels))
        is_2d = len(q_vectors) == 2
        if is_2d:
            kernel = [self._kernels['Iqxy'], self._kernels['Imagnetic']]
        else:
            kernel = [self._kernels['Iq']]*2
        return GpuKernel(kernel, self.dtype, self.info, q_vectors)

    def release(self):
        # type: () -> None
        """
        Free the resources associated with the model.
        """
        if self.program is not None:
            environment().release_program(self.info.name)
            self.program = None

    def __del__(self):
        # type: () -> None
        self.release()

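# Usage sketch of the full workflow (the model-loading helpers come from
# sasmodels.core and sasmodels.generate and may differ between versions):
#
#     from sasmodels.core import load_model_info
#     model_info = load_model_info('cylinder')
#     source = generate.make_source(model_info)  # includes an 'opencl' entry
#     model = GpuModel(source, model_info, dtype=generate.F32)
#     kernel = model.make_kernel([np.linspace(0.001, 0.5, 200)])  # 1-D q
#     # ... evaluate via kernel(call_details, values, cutoff, magnetic) ...
#     kernel.release()
#     model.release()
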
# TODO: check that we don't need a destructor for buffers which go out of scope
class GpuInput(object):
    """
    Make q data available to the gpu.

    *q_vectors* is a list of q vectors, which will be *[q]* for 1-D data,
    and *[qx, qy]* for 2-D data.  Internally, the vectors will be reallocated
    to get the best performance on OpenCL, which may involve shifting and
    stretching the array to better match the memory architecture.  Additional
    points will be evaluated with *q=1e-3*.

    *dtype* is the data type for the q vectors. The data type should be
    set to match that of the kernel, which is an attribute of
    :class:`GpuModel`.  Note that not all kernels support double
    precision, so even if the program was created for double precision,
    the *GpuModel.dtype* may be single precision.

    Call :meth:`release` when complete.  Even if not called directly, the
    buffer will be released when the data object is freed.
    """
    def __init__(self, q_vectors, dtype=generate.F32):
        # type: (List[np.ndarray], np.dtype) -> None
        # TODO: do we ever need double precision q?
        env = environment()
        self.nq = q_vectors[0].size
        self.dtype = np.dtype(dtype)
        self.is_2d = (len(q_vectors) == 2)
        # TODO: stretch input based on get_warp()
        # not doing it now since warp depends on kernel, which is not known
        # at this point, so instead using 32, which is good on the set of
        # architectures tested so far.
        if self.is_2d:
            # Note: 17 rather than 15 because the result is 2 elements
            # longer than the input.
            width = ((self.nq+17)//16)*16
            self.q = np.empty((width, 2), dtype=dtype)
            self.q[:self.nq, 0] = q_vectors[0]
            self.q[:self.nq, 1] = q_vectors[1]
        else:
            # Note: 33 rather than 31 because the result is 2 elements
            # longer than the input.
            width = ((self.nq+33)//32)*32
            self.q = np.empty(width, dtype=dtype)
            self.q[:self.nq] = q_vectors[0]
        self.global_size = [self.q.shape[0]]
        context = env.get_context(self.dtype)
        #print("creating inputs of size", self.global_size)
        self.q_b = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR,
                             hostbuf=self.q)

    def release(self):
        # type: () -> None
        """
        Free the memory.
        """
        if self.q_b is not None:
            self.q_b.release()
            self.q_b = None

    def __del__(self):
        # type: () -> None
        self.release()

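# Usage sketch (grid size is illustrative; assumes a context exists):
#
#     qx, qy = np.meshgrid(np.linspace(-0.1, 0.1, 64),
#                          np.linspace(-0.1, 0.1, 64))
#     data = GpuInput([qx.flatten(), qy.flatten()], dtype=generate.F32)
#     print(data.nq, data.q.shape)   # 4096 points, width padded to 4112
#     data.release()
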
class GpuKernel(Kernel):
    """
    Callable SAS kernel.

    *kernel* is the pair of compiled OpenCL kernels to call: the plain
    calculator and the magnetic calculator (for 1-D data the plain kernel
    is used for both)

    *model_info* is the module information

    *q_vectors* is the q vectors at which the kernel should be evaluated

    *dtype* is the kernel precision

    The resulting call method takes *call_details* describing the
    polydispersity loops, the *values* vector of parameter values, and a
    *cutoff* giving the integration limits: any points with combined
    weight less than *cutoff* will not be calculated.

    Call :meth:`release` when done with the kernel instance.
    """
    def __init__(self, kernel, dtype, model_info, q_vectors):
        # type: (List[cl.Kernel], np.dtype, ModelInfo, List[np.ndarray]) -> None
        max_pd = model_info.parameters.max_pd
        npars = len(model_info.parameters.kernel_parameters)-2
        q_input = GpuInput(q_vectors, dtype)
        self.kernel = kernel
        self.info = model_info
        self.dtype = dtype
        self.dim = '2d' if q_input.is_2d else '1d'
        # plus three for the normalization values
        self.result = np.empty(q_input.nq+3, dtype)

        # Inputs and outputs for each kernel call
        # Note: res may be shorter than res_b if global_size != nq
        env = environment()
        self.queue = env.get_queue(dtype)

        self.result_b = cl.Buffer(self.queue.context, mf.READ_WRITE,
                                  q_input.global_size[0] * dtype.itemsize)
        self.q_input = q_input  # allocated by GpuInput above

        self._need_release = [self.result_b, self.q_input]
        self.real = (np.float32 if dtype == generate.F32
                     else np.float64 if dtype == generate.F64
                     else np.float16 if dtype == generate.F16
                     else np.float32)  # will never get here, so use np.float32

    def __call__(self, call_details, values, cutoff, magnetic):
        # type: (CallDetails, np.ndarray, float, bool) -> np.ndarray
        context = self.queue.context
        # Arrange data transfer to card
        details_b = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR,
                              hostbuf=call_details.buffer)
        values_b = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR,
                             hostbuf=values)

        kernel = self.kernel[1 if magnetic else 0]
        args = [
            np.uint32(self.q_input.nq), None, None,
            details_b, values_b, self.q_input.q_b, self.result_b,
            self.real(cutoff),
        ]
        #print("Calling OpenCL")
        #print("values",values)
        # Call kernel and retrieve results
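        # The polydispersity loop is evaluated in chunks of *step* points;
        # each launch waits for the previous one, so they run in order
        # without a single launch holding the device for too long.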
        last_call = None
        step = 100
        for start in range(0, call_details.pd_prod, step):
            stop = min(start+step, call_details.pd_prod)
            #print("queuing",start,stop)
            args[1:3] = [np.int32(start), np.int32(stop)]
            last_call = [kernel(self.queue, self.q_input.global_size,
                                None, *args, wait_for=last_call)]
        cl.enqueue_copy(self.queue, self.result, self.result_b)

        # Free buffers
        for v in (details_b, values_b):
            if v is not None:
                v.release()

        scale = values[0]/self.result[self.q_input.nq]
        background = values[1]
        #print("scale",scale,background)
        return scale*self.result[:self.q_input.nq] + background
        # return self.result[:self.q_input.nq]

    def release(self):
        # type: () -> None
        """
        Release resources associated with the kernel.
        """
        for v in self._need_release:
            v.release()
        self._need_release = []

    def __del__(self):
        # type: () -> None
        self.release()