source: sasmodels/sasmodels/kernelcl.py @ b297ba9

core_shell_microgelsmagnetic_modelticket-1257-vesicle-productticket_1156ticket_1265_superballticket_822_more_unit_tests
Last change on this file since b297ba9 was b297ba9, checked in by Paul Kienzle <pkienzle@…>, 5 years ago

lint

  • Property mode set to 100644
File size: 23.9 KB
RevLine 
[14de349]1"""
[eafc9fa]2GPU driver for C kernels
[14de349]3
[b0de252]4TODO: docs are out of date
5
[14de349]6There should be a single GPU environment running on the system.  This
7environment is constructed on the first call to :func:`env`, and the
8same environment is returned on each call.
9
10After retrieving the environment, the next step is to create the kernel.
11This is done with a call to :meth:`GpuEnvironment.make_kernel`, which
12returns the type of data used by the kernel.
13
14Next a :class:`GpuData` object should be created with the correct kind
15of data.  This data object can be used by multiple kernels, for example,
16if the target model is a weighted sum of multiple kernels.  The data
17should include any extra evaluation points required to compute the proper
18data smearing.  This need not match the square grid for 2D data if there
19is an index saying which q points are active.
20
21Together the GpuData, the program, and a device form a :class:`GpuKernel`.
22This kernel is used during fitting, receiving new sets of parameters and
23evaluating them.  The output value is stored in an output buffer on the
24devices, where it can be combined with other structure factors and form
25factors and have instrumental resolution effects applied.
[92da231]26
27In order to use OpenCL for your models, you will need OpenCL drivers for
28your machine.  These should be available from your graphics card vendor.
29Intel provides OpenCL drivers for CPUs as well as their integrated HD
30graphics chipsets.  AMD also provides drivers for Intel CPUs, but as of
31this writing the performance is lacking compared to the Intel drivers.
32NVidia combines drivers for CUDA and OpenCL in one package.  The result
33is a bit messy if you have multiple drivers installed.  You can see which
34drivers are available by starting python and running:
35
36    import pyopencl as cl
37    cl.create_some_context(interactive=True)
38
39Once you have done that, it will show the available drivers which you
40can select.  It will then tell you that you can use these drivers
[880a2ed]41automatically by setting the SAS_OPENCL environment variable, which is
42PYOPENCL_CTX equivalent but not conflicting with other pyopnecl programs.
[92da231]43
44Some graphics cards have multiple devices on the same card.  You cannot
45yet use both of them concurrently to evaluate models, but you can run
46the program twice using a different device for each session.
47
48OpenCL kernels are compiled when needed by the device driver.  Some
49drivers produce compiler output even when there is no error.  You
50can see the output by setting PYOPENCL_COMPILER_OUTPUT=1.  It should be
51harmless, albeit annoying.
[14de349]52"""
[ba32cdd]53from __future__ import print_function
[a5b8477]54
[b297ba9]55import sys
[250fa25]56import os
57import warnings
[821a9c6]58import logging
[6e5b2a7]59import time
[250fa25]60
[e2592f0]61try:
62    from time import perf_counter as clock
63except ImportError: # CRUFT: python < 3.3
64    if sys.platform.count("darwin") > 0:
65        from time import time as clock
66    else:
67        from time import clock
[b3f6bc3]68
[e2592f0]69import numpy as np  # type: ignore
[3221de0]70
[3199b17]71# Attempt to setup OpenCL. This may fail if the pyopencl package is not
[3221de0]72# installed or if it is installed but there are no devices available.
[250fa25]73try:
[3221de0]74    import pyopencl as cl  # type: ignore
75    from pyopencl import mem_flags as mf
76    from pyopencl.characterize import get_fast_inaccurate_build_options
[3199b17]77    # Ask OpenCL for the default context so that we know that one exists.
[3221de0]78    cl.create_some_context(interactive=False)
79    HAVE_OPENCL = True
80    OPENCL_ERROR = ""
[9404dd3]81except Exception as exc:
[6dba2f0]82    HAVE_OPENCL = False
[3221de0]83    OPENCL_ERROR = str(exc)
[14de349]84
[cb6ecf4]85from . import generate
[95f62aa]86from .generate import F32, F64
[f619de7]87from .kernel import KernelModel, Kernel
[14de349]88
[2d81cfe]89# pylint: disable=unused-import
[a5b8477]90try:
91    from typing import Tuple, Callable, Any
92    from .modelinfo import ModelInfo
93    from .details import CallDetails
94except ImportError:
95    pass
[2d81cfe]96# pylint: enable=unused-import
[a5b8477]97
[3199b17]98
99# CRUFT: pyopencl < 2017.1 (as of June 2016 needs quotes around include path).
[20317b3]100def quote_path(v):
[b297ba9]101    # type: (str) -> str
[20317b3]102    """
103    Quote the path if it is not already quoted.
104
105    If v starts with '-', then assume that it is a -I option or similar
106    and do not quote it.  This is fragile:  -Ipath with space needs to
107    be quoted.
108    """
109    return '"'+v+'"' if v and ' ' in v and not v[0] in "\"'-" else v
110
[3199b17]111
[20317b3]112def fix_pyopencl_include():
[b297ba9]113    # type: (None) -> None
[40a87fa]114    """
115    Monkey patch pyopencl to allow spaces in include file path.
116    """
[b297ba9]117    import pyopencl
118    if hasattr(pyopencl, '_DEFAULT_INCLUDE_OPTIONS'):
119        pyopencl._DEFAULT_INCLUDE_OPTIONS = [
120            quote_path(v) for v in pyopencl._DEFAULT_INCLUDE_OPTIONS
[3199b17]121            ]
122
[20317b3]123
[6dba2f0]124if HAVE_OPENCL:
125    fix_pyopencl_include()
[20317b3]126
[ce27e21]127# The max loops number is limited by the amount of local memory available
128# on the device.  You don't want to make this value too big because it will
129# waste resources, nor too small because it may interfere with users trying
130# to do their polydispersity calculations.  A value of 1024 should be much
131# larger than necessary given that cost grows as npts^k where k is the number
132# of polydisperse parameters.
[5d4777d]133MAX_LOOPS = 2048
134
[5464d68]135# Pragmas for enable OpenCL features.  Be sure to protect them so that they
136# still compile even if OpenCL is not present.
137_F16_PRAGMA = """\
138#if defined(__OPENCL_VERSION__) // && !defined(cl_khr_fp16)
139#  pragma OPENCL EXTENSION cl_khr_fp16: enable
140#endif
141"""
142
143_F64_PRAGMA = """\
144#if defined(__OPENCL_VERSION__) // && !defined(cl_khr_fp64)
145#  pragma OPENCL EXTENSION cl_khr_fp64: enable
146#endif
147"""
148
[3199b17]149
[3221de0]150def use_opencl():
[b297ba9]151    # type: () -> bool
152    """Return True if OpenCL is the default computational engine"""
[07646b6]153    sas_opencl = os.environ.get("SAS_OPENCL", "OpenCL").lower()
154    return HAVE_OPENCL and sas_opencl != "none" and not sas_opencl.startswith("cuda")
[5464d68]155
[3199b17]156
[14de349]157ENV = None
[3221de0]158def reset_environment():
[b297ba9]159    # type: () -> None
[3221de0]160    """
161    Call to create a new OpenCL context, such as after a change to SAS_OPENCL.
162    """
163    global ENV
164    ENV = GpuEnvironment() if use_opencl() else None
165
[3199b17]166
[14de349]167def environment():
[dd7fc12]168    # type: () -> "GpuEnvironment"
[14de349]169    """
170    Returns a singleton :class:`GpuEnvironment`.
171
172    This provides an OpenCL context and one queue per device.
173    """
[b4272a2]174    if ENV is None:
175        if not HAVE_OPENCL:
176            raise RuntimeError("OpenCL startup failed with ***"
[d86f0fc]177                               + OPENCL_ERROR + "***; using C compiler instead")
[3221de0]178        reset_environment()
[b4272a2]179        if ENV is None:
180            raise RuntimeError("SAS_OPENCL=None in environment")
[14de349]181    return ENV
182
[3199b17]183
[5d316e9]184def has_type(device, dtype):
[dd7fc12]185    # type: (cl.Device, np.dtype) -> bool
[14de349]186    """
[5d316e9]187    Return true if device supports the requested precision.
[14de349]188    """
[95f62aa]189    if dtype == F32:
[5d316e9]190        return True
[f872fd1]191    elif dtype == F64:
[5d316e9]192        return "cl_khr_fp64" in device.extensions
193    else:
[3199b17]194        # Not supporting F16 type since it isn't accurate enough.
[5d316e9]195        return False
[14de349]196
[3199b17]197
[f5b9a6b]198def get_warp(kernel, queue):
[dd7fc12]199    # type: (cl.Kernel, cl.CommandQueue) -> int
[f5b9a6b]200    """
201    Return the size of an execution batch for *kernel* running on *queue*.
202    """
[750ffa5]203    return kernel.get_work_group_info(
[63b32bb]204        cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
205        queue.device)
[14de349]206
[3199b17]207
[5d316e9]208def compile_model(context, source, dtype, fast=False):
[dd7fc12]209    # type: (cl.Context, str, np.dtype, bool) -> cl.Program
[14de349]210    """
211    Build a model to run on the gpu.
212
[6cbdcd4]213    Returns the compiled program and its type.
214
215    Raises an error if the desired precision is not available.
[14de349]216    """
217    dtype = np.dtype(dtype)
[5d316e9]218    if not all(has_type(d, dtype) for d in context.devices):
219        raise RuntimeError("%s not supported for devices"%dtype)
[14de349]220
[5464d68]221    source_list = [generate.convert_type(source, dtype)]
222
223    if dtype == generate.F16:
224        source_list.insert(0, _F16_PRAGMA)
225    elif dtype == generate.F64:
226        source_list.insert(0, _F64_PRAGMA)
227
[3199b17]228    # Note: USE_SINCOS makes the Intel CPU slower under OpenCL.
[14de349]229    if context.devices[0].type == cl.device_type.GPU:
[5464d68]230        source_list.insert(0, "#define USE_SINCOS\n")
[5d316e9]231    options = (get_fast_inaccurate_build_options(context.devices[0])
232               if fast else [])
[ba32cdd]233    source = "\n".join(source_list)
[5d316e9]234    program = cl.Program(context, source).build(options=options)
[3199b17]235
[821a9c6]236    #print("done with "+program)
[ce27e21]237    return program
[14de349]238
239
[3199b17]240# For now, this returns one device in the context.
241# TODO: Create a context that contains all devices on all platforms.
[14de349]242class GpuEnvironment(object):
243    """
[3199b17]244    GPU context for OpenCL, with possibly many devices and one queue per device.
[14de349]245    """
246    def __init__(self):
[dd7fc12]247        # type: () -> None
[3199b17]248        # Find gpu context.
[95f62aa]249        context_list = _create_some_context()
250
251        # Find a context for F32 and for F64 (maybe the same one).
252        # F16 isn't good enough.
253        self.context = {}
254        for dtype in (F32, F64):
255            for context in context_list:
256                if has_type(context.devices[0], dtype):
257                    self.context[dtype] = context
258                    break
259            else:
260                self.context[dtype] = None
261
[3199b17]262        # Build a queue for each context.
[95f62aa]263        self.queue = {}
264        context = self.context[F32]
265        self.queue[F32] = cl.CommandQueue(context, context.devices[0])
266        if self.context[F64] == self.context[F32]:
267            self.queue[F64] = self.queue[F32]
268        else:
269            context = self.context[F64]
270            self.queue[F64] = cl.CommandQueue(context, context.devices[0])
[250fa25]271
[3199b17]272        ## Byte boundary for data alignment.
[95f62aa]273        #self.data_boundary = max(context.devices[0].min_data_type_align_size
274        #                         for context in self.context.values())
275
[3199b17]276        # Cache for compiled programs, and for items in context.
[ce27e21]277        self.compiled = {}
278
[5d316e9]279    def has_type(self, dtype):
[dd7fc12]280        # type: (np.dtype) -> bool
[eafc9fa]281        """
282        Return True if all devices support a given type.
283        """
[95f62aa]284        return self.context.get(dtype, None) is not None
[250fa25]285
[300a2f7]286    def compile_program(self, name, source, dtype, fast, timestamp):
287        # type: (str, str, np.dtype, bool, float) -> cl.Program
[eafc9fa]288        """
289        Compile the program for the device in the given context.
290        """
[300a2f7]291        # Note: PyOpenCL caches based on md5 hash of source, options and device
[3199b17]292        # but I'll do so as well just to save some data munging time.
[7fcdc9f]293        tag = generate.tag_source(source)
294        key = "%s-%s-%s%s"%(name, dtype, tag, ("-fast" if fast else ""))
[3199b17]295        # Check timestamp on program.
[300a2f7]296        program, program_timestamp = self.compiled.get(key, (None, np.inf))
297        if program_timestamp < timestamp:
298            del self.compiled[key]
[cde11f0f]299        if key not in self.compiled:
[95f62aa]300            context = self.context[dtype]
[20317b3]301            logging.info("building %s for OpenCL %s", key,
302                         context.devices[0].name.strip())
[95f62aa]303            program = compile_model(self.context[dtype],
[fec69dd]304                                    str(source), dtype, fast)
[300a2f7]305            self.compiled[key] = (program, timestamp)
306        return program
[14de349]307
[3199b17]308
[95f62aa]309def _create_some_context():
310    # type: () -> cl.Context
311    """
312    Protected call to cl.create_some_context without interactivity.
313
314    Uses SAS_OPENCL or PYOPENCL_CTX if they are set in the environment,
315    otherwise scans for the most appropriate device using
[d5ce7fa]316    :func:`_get_default_context`.  Ignore *SAS_OPENCL=OpenCL*, which
317    indicates that an OpenCL device should be used without specifying
318    which one (and not a CUDA device, or no GPU).
[95f62aa]319    """
[3199b17]320    # Assume we do not get here if SAS_OPENCL is None or CUDA.
[07646b6]321    sas_opencl = os.environ.get('SAS_OPENCL', 'opencl')
322    if sas_opencl.lower() != 'opencl':
[3199b17]323        # Setting PYOPENCL_CTX as a SAS_OPENCL to create cl context.
[07646b6]324        os.environ["PYOPENCL_CTX"] = sas_opencl
[95f62aa]325
326    if 'PYOPENCL_CTX' in os.environ:
327        try:
328            return [cl.create_some_context(interactive=False)]
329        except Exception as exc:
[b297ba9]330            # TODO: Should warnings instead by put into logging.warn?
[95f62aa]331            warnings.warn(str(exc))
[b297ba9]332            warnings.warn(
333                "pyopencl.create_some_context() failed.  The environment "
334                "variable 'SAS_OPENCL' or 'PYOPENCL_CTX' might not be set "
335                "correctly")
[95f62aa]336
337    return _get_default_context()
338
[3199b17]339
[3c56da87]340def _get_default_context():
[20317b3]341    # type: () -> List[cl.Context]
[eafc9fa]342    """
[d18582e]343    Get an OpenCL context, preferring GPU over CPU, and preferring Intel
344    drivers over AMD drivers.
[eafc9fa]345    """
[d18582e]346    # Note: on mobile devices there is automatic clock scaling if either the
347    # CPU or the GPU is underutilized; probably doesn't affect us, but we if
348    # it did, it would mean that putting a busy loop on the CPU while the GPU
349    # is running may increase throughput.
350    #
[3199b17]351    # MacBook Pro, base install:
[d18582e]352    #     {'Apple': [Intel CPU, NVIDIA GPU]}
[3199b17]353    # MacBook Pro, base install:
[d18582e]354    #     {'Apple': [Intel CPU, Intel GPU]}
[3199b17]355    # 2 x NVIDIA 295 with Intel and NVIDIA opencl drivers install:
[d18582e]356    #     {'Intel': [CPU], 'NVIDIA': [GPU, GPU, GPU, GPU]}
357    gpu, cpu = None, None
[3c56da87]358    for platform in cl.get_platforms():
[e6a5556]359        # AMD provides a much weaker CPU driver than Intel/Apple, so avoid it.
[20317b3]360        # If someone has bothered to install the AMD/NVIDIA drivers, prefer
361        # them over the integrated graphics driver that may have been supplied
362        # with the CPU chipset.
363        preferred_cpu = (platform.vendor.startswith('Intel')
364                         or platform.vendor.startswith('Apple'))
365        preferred_gpu = (platform.vendor.startswith('Advanced')
366                         or platform.vendor.startswith('NVIDIA'))
[3c56da87]367        for device in platform.get_devices():
368            if device.type == cl.device_type.GPU:
[20317b3]369                # If the existing type is not GPU then it will be CUSTOM
370                # or ACCELERATOR so don't override it.
[e6a5556]371                if gpu is None or (preferred_gpu and gpu.type == cl.device_type.GPU):
372                    gpu = device
373            elif device.type == cl.device_type.CPU:
374                if cpu is None or preferred_cpu:
375                    cpu = device
[d18582e]376            else:
[e6a5556]377                # System has cl.device_type.ACCELERATOR or cl.device_type.CUSTOM
[3199b17]378                # Intel Phi for example registers as an accelerator.
[20317b3]379                # Since the user installed a custom device on their system
380                # and went through the pain of sorting out OpenCL drivers for
381                # it, lets assume they really do want to use it as their
382                # primary compute device.
[e6a5556]383                gpu = device
[199d40d]384
[3199b17]385    # Order the devices by gpu then by cpu; when searching for an available
[20317b3]386    # device by data type they will be checked in this order, which means
387    # that if the gpu supports double then the cpu will never be used (though
388    # we may make it possible to explicitly request the cpu at some point).
[e6a5556]389    devices = []
390    if gpu is not None:
391        devices.append(gpu)
392    if cpu is not None:
393        devices.append(cpu)
394    return [cl.Context([d]) for d in devices]
[3c56da87]395
[250fa25]396
[f619de7]397class GpuModel(KernelModel):
[14de349]398    """
399    GPU wrapper for a single model.
400
[17bbadd]401    *source* and *model_info* are the model source and interface as returned
402    from :func:`generate.make_source` and :func:`generate.make_model_info`.
[14de349]403
404    *dtype* is the desired model precision.  Any numpy dtype for single
405    or double precision floats will do, such as 'f', 'float32' or 'single'
406    for single and 'd', 'float64' or 'double' for double.  Double precision
407    is an optional extension which may not be available on all devices.
[cde11f0f]408    Half precision ('float16','half') may be available on some devices.
409    Fast precision ('fast') is a loose version of single precision, indicating
410    that the compiler is allowed to take shortcuts.
[14de349]411    """
[3199b17]412    info = None  # type: ModelInfo
413    source = ""  # type: str
414    dtype = None  # type: np.dtype
415    fast = False  # type: bool
416    _program = None  # type: cl.Program
417    _kernels = None  # type: Dict[str, cl.Kernel]
[7126c04]418
[dd7fc12]419    def __init__(self, source, model_info, dtype=generate.F32, fast=False):
[a4280bd]420        # type: (Dict[str,str], ModelInfo, np.dtype, bool) -> None
[17bbadd]421        self.info = model_info
[ce27e21]422        self.source = source
[dd7fc12]423        self.dtype = dtype
424        self.fast = fast
[14de349]425
[ce27e21]426    def __getstate__(self):
[dd7fc12]427        # type: () -> Tuple[ModelInfo, str, np.dtype, bool]
[eafc9fa]428        return self.info, self.source, self.dtype, self.fast
[14de349]429
[ce27e21]430    def __setstate__(self, state):
[dd7fc12]431        # type: (Tuple[ModelInfo, str, np.dtype, bool]) -> None
[eafc9fa]432        self.info, self.source, self.dtype, self.fast = state
[7126c04]433        self._program = self._kernels = None
[ce27e21]434
[9eb3632]435    def make_kernel(self, q_vectors):
[dd7fc12]436        # type: (List[np.ndarray]) -> "GpuKernel"
[95f62aa]437        return GpuKernel(self, q_vectors)
438
[7126c04]439    def get_function(self, name):
[95f62aa]440        # type: (str) -> cl.Kernel
441        """
442        Fetch the kernel from the environment by name, compiling it if it
443        does not already exist.
444        """
[7126c04]445        if self._program is None:
446            self._prepare_program()
447        return self._kernels[name]
448
449    def _prepare_program(self):
450        # type: (str) -> None
451        env = environment()
452        timestamp = generate.ocl_timestamp(self.info)
453        program = env.compile_program(
454            self.info.name,
455            self.source['opencl'],
456            self.dtype,
457            self.fast,
458            timestamp)
459        variants = ['Iq', 'Iqxy', 'Imagnetic']
460        names = [generate.kernel_name(self.info, k) for k in variants]
[00afc15]461        functions = [getattr(program, k) for k in names]
462        self._kernels = {k: v for k, v in zip(variants, functions)}
[3199b17]463        # Keep a handle to program so GC doesn't collect.
[7126c04]464        self._program = program
[14de349]465
[3199b17]466
467# TODO: Check that we don't need a destructor for buffers which go out of scope.
[14de349]468class GpuInput(object):
469    """
470    Make q data available to the gpu.
471
472    *q_vectors* is a list of q vectors, which will be *[q]* for 1-D data,
473    and *[qx, qy]* for 2-D data.  Internally, the vectors will be reallocated
474    to get the best performance on OpenCL, which may involve shifting and
475    stretching the array to better match the memory architecture.  Additional
476    points will be evaluated with *q=1e-3*.
477
478    *dtype* is the data type for the q vectors. The data type should be
479    set to match that of the kernel, which is an attribute of
480    :class:`GpuProgram`.  Note that not all kernels support double
481    precision, so even if the program was created for double precision,
482    the *GpuProgram.dtype* may be single precision.
483
484    Call :meth:`release` when complete.  Even if not called directly, the
485    buffer will be released when the data object is freed.
486    """
[cb6ecf4]487    def __init__(self, q_vectors, dtype=generate.F32):
[dd7fc12]488        # type: (List[np.ndarray], np.dtype) -> None
[3199b17]489        # TODO: Do we ever need double precision q?
[14de349]490        self.nq = q_vectors[0].size
491        self.dtype = np.dtype(dtype)
[eafc9fa]492        self.is_2d = (len(q_vectors) == 2)
[3199b17]493        # TODO: Stretch input based on get_warp().
494        # Not doing it now since warp depends on kernel, which is not known
[f5b9a6b]495        # at this point, so instead using 32, which is good on the set of
496        # architectures tested so far.
[c072f83]497        if self.is_2d:
[07646b6]498            width = ((self.nq+15)//16)*16
[c072f83]499            self.q = np.empty((width, 2), dtype=dtype)
500            self.q[:self.nq, 0] = q_vectors[0]
501            self.q[:self.nq, 1] = q_vectors[1]
502        else:
[07646b6]503            width = ((self.nq+31)//32)*32
[c072f83]504            self.q = np.empty(width, dtype=dtype)
505            self.q[:self.nq] = q_vectors[0]
506        self.global_size = [self.q.shape[0]]
[7126c04]507        #print("creating inputs of size", self.global_size)
[95f62aa]508
[3199b17]509        # Transfer input value to GPU.
[95f62aa]510        env = environment()
[7126c04]511        context = env.context[self.dtype]
512        self.q_b = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR,
513                             hostbuf=self.q)
[14de349]514
515    def release(self):
[dd7fc12]516        # type: () -> None
[eafc9fa]517        """
[3199b17]518        Free the buffer associated with the q value.
[eafc9fa]519        """
[7126c04]520        if self.q_b is not None:
521            self.q_b.release()
522            self.q_b = None
[14de349]523
[eafc9fa]524    def __del__(self):
[dd7fc12]525        # type: () -> None
[eafc9fa]526        self.release()
527
[3199b17]528
[f619de7]529class GpuKernel(Kernel):
[ff7119b]530    """
531    Callable SAS kernel.
532
[95f62aa]533    *model* is the GpuModel object to call
[ff7119b]534
[7126c04]535    The kernel is derived from :class:`Kernel`, providing the
536    :meth:`call_kernel` method to evaluate the kernel for a given set of
537    parameters.  Because of the need to move the q values to the GPU before
538    evaluation, the kernel is instantiated for a particular set of q vectors,
539    and can be called many times without transfering q each time.
[ff7119b]540
541    Call :meth:`release` when done with the kernel instance.
542    """
[3199b17]543    #: SAS model information structure.
544    info = None  # type: ModelInfo
545    #: Kernel precision.
546    dtype = None  # type: np.dtype
547    #: Kernel dimensions (1d or 2d).
548    dim = ""  # type: str
549    #: Calculation results, updated after each call to :meth:`_call_kernel`.
550    result = None  # type: np.ndarray
[7126c04]551
[95f62aa]552    def __init__(self, model, q_vectors):
[7126c04]553        # type: (GpuModel, List[np.ndarray]) -> None
[95f62aa]554        dtype = model.dtype
555        self.q_input = GpuInput(q_vectors, dtype)
556        self._model = model
557
[3199b17]558        # Attributes accessed from the outside.
[95f62aa]559        self.dim = '2d' if self.q_input.is_2d else '1d'
560        self.info = model.info
[3199b17]561        self.dtype = dtype
[95f62aa]562
[3199b17]563        # Converter to translate input to target type.
564        self._as_dtype = np.float64 if dtype == generate.F64 else np.float32
565
566        # Holding place for the returned value.
[5399809]567        nout = 2 if self.info.have_Fq and self.dim == '1d' else 1
[3199b17]568        extra_q = 4  # Total weight, form volume, shell volume and R_eff.
569        self.result = np.empty(self.q_input.nq*nout + extra_q, dtype)
[14de349]570
[3199b17]571        # Allocate result value on GPU.
[ce27e21]572        env = environment()
[7126c04]573        context = env.context[self.dtype]
574        width = ((self.result.size+31)//32)*32 * self.dtype.itemsize
575        self._result_b = cl.Buffer(context, mf.READ_WRITE, width)
[d18582e]576
[3199b17]577    def _call_kernel(self, call_details, values, cutoff, magnetic,
578                     effective_radius_type):
579        # type: (CallDetails, np.ndarray, float, bool, int) -> np.ndarray
[95f62aa]580        env = environment()
581        queue = env.queue[self._model.dtype]
582        context = queue.context
583
[3199b17]584        # Arrange data transfer to card.
[48fbd50]585        details_b = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR,
[8d62008]586                              hostbuf=call_details.buffer)
[48fbd50]587        values_b = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR,
588                             hostbuf=values)
589
[3199b17]590        # Setup kernel function and arguments.
[95f62aa]591        name = 'Iq' if self.dim == '1d' else 'Imagnetic' if magnetic else 'Iqxy'
[7126c04]592        kernel = self._model.get_function(name)
[95f62aa]593        kernel_args = [
[3199b17]594            np.uint32(self.q_input.nq),  # Number of inputs.
595            None,  # Placeholder for pd_start.
596            None,  # Placeholder for pd_stop.
597            details_b,  # Problem definition.
598            values_b,  # Parameter values.
599            self.q_input.q_b,  # Q values.
600            self._result_b,   # Result storage.
601            self._as_dtype(cutoff),  # Probability cutoff.
602            np.uint32(effective_radius_type),  # R_eff mode.
[9eb3632]603        ]
[3199b17]604
605        # Call kernel and retrieve results.
[9eb3632]606        #print("Calling OpenCL")
[c036ddb]607        #call_details.show(values)
[6e5b2a7]608        wait_for = None
[e2592f0]609        last_nap = clock()
[6e5b2a7]610        step = 1000000//self.q_input.nq + 1
[bde38b5]611        for start in range(0, call_details.num_eval, step):
612            stop = min(start + step, call_details.num_eval)
[9eb3632]613            #print("queuing",start,stop)
[95f62aa]614            kernel_args[1:3] = [np.int32(start), np.int32(stop)]
615            wait_for = [kernel(queue, self.q_input.global_size, None,
616                               *kernel_args, wait_for=wait_for)]
[6e5b2a7]617            if stop < call_details.num_eval:
[3199b17]618                # Allow other processes to run.
[6e5b2a7]619                wait_for[0].wait()
[e2592f0]620                current_time = clock()
[6e5b2a7]621                if current_time - last_nap > 0.5:
[8b31efa]622                    time.sleep(0.001)
[6e5b2a7]623                    last_nap = current_time
[7126c04]624        cl.enqueue_copy(queue, self.result, self._result_b, wait_for=wait_for)
[bde38b5]625        #print("result", self.result)
[ae2b6b5]626
[3199b17]627        # Free buffers.
[7126c04]628        details_b.release()
629        values_b.release()
[14de349]630
631    def release(self):
[dd7fc12]632        # type: () -> None
[eafc9fa]633        """
634        Release resources associated with the kernel.
635        """
[95f62aa]636        self.q_input.release()
[7126c04]637        if self._result_b is not None:
638            self._result_b.release()
639            self._result_b = None
[14de349]640
641    def __del__(self):
[dd7fc12]642        # type: () -> None
[14de349]643        self.release()
Note: See TracBrowser for help on using the repository browser.