source: sasmodels/sasmodels/kernelcuda.py @ 0db7dbd

Last change on this file was 0db7dbd, checked in by pkienzle, 6 years ago:
cuda support: allow cylinder model to run under CUDA as well as OpenCL

1"""
2GPU driver for C kernels
3
4There should be a single GPU environment running on the system.  This
5environment is constructed on the first call to :func:`env`, and the
6same environment is returned on each call.
7
8After retrieving the environment, the next step is to create the kernel.
9This is done with a call to :meth:`GpuEnvironment.make_kernel`, which
10returns the type of data used by the kernel.
11
12Next a :class:`GpuData` object should be created with the correct kind
13of data.  This data object can be used by multiple kernels, for example,
14if the target model is a weighted sum of multiple kernels.  The data
15should include any extra evaluation points required to compute the proper
16data smearing.  This need not match the square grid for 2D data if there
17is an index saying which q points are active.
18
19Together the GpuData, the program, and a device form a :class:`GpuKernel`.
20This kernel is used during fitting, receiving new sets of parameters and
21evaluating them.  The output value is stored in an output buffer on the
22devices, where it can be combined with other structure factors and form
23factors and have instrumental resolution effects applied.
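
For example, a rough sketch of that flow using the classes defined in this
module (*source*, *model_info*, *q*, *call_details* and *values* are assumed
to have been prepared elsewhere):

    model = GpuModel(source, model_info, dtype=generate.F32)
    kernel = model.make_kernel([q])       # use [qx, qy] for 2-D data
    Iq = kernel(call_details, values, cutoff=0., magnetic=False)
    kernel.release()
    model.release()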

In order to use OpenCL for your models, you will need OpenCL drivers for
your machine.  These should be available from your graphics card vendor.
Intel provides OpenCL drivers for CPUs as well as their integrated HD
graphics chipsets.  AMD also provides drivers for Intel CPUs, but as of
this writing the performance is lacking compared to the Intel drivers.
NVidia combines drivers for CUDA and OpenCL in one package.  The result
is a bit messy if you have multiple drivers installed.  You can see which
drivers are available by starting python and running:

    import pyopencl as cl
    cl.create_some_context(interactive=True)

Once you have done that, it will show the available drivers which you
can select.  It will then tell you that you can use these drivers
automatically by setting the SAS_OPENCL environment variable, which is
equivalent to PYOPENCL_CTX but does not conflict with other pyopencl programs.

Some graphics cards have multiple devices on the same card.  You cannot
yet use both of them concurrently to evaluate models, but you can run
the program twice using a different device for each session.

OpenCL kernels are compiled when needed by the device driver.  Some
drivers produce compiler output even when there is no error.  You
can see the output by setting PYOPENCL_COMPILER_OUTPUT=1.  It should be
harmless, albeit annoying.
"""
from __future__ import print_function

import os
import warnings
import logging
import time

import numpy as np  # type: ignore


# Attempt to set up CUDA.  This may fail if the pycuda package is not
# installed or if it is installed but there are no devices available.
try:
    import pycuda.autoinit
    import pycuda.driver as cuda  # type: ignore
    from pycuda.compiler import SourceModule
    HAVE_CUDA = True
    CUDA_ERROR = ""
except Exception as exc:
    HAVE_CUDA = False
    CUDA_ERROR = str(exc)

from . import generate
from .kernel import KernelModel, Kernel

# pylint: disable=unused-import
try:
    from typing import Tuple, Callable, Any, List, Dict
    from .modelinfo import ModelInfo
    from .details import CallDetails
except ImportError:
    pass
# pylint: enable=unused-import

# The max loops number is limited by the amount of local memory available
# on the device.  You don't want to make this value too big because it will
# waste resources, nor too small because it may interfere with users trying
# to do their polydispersity calculations.  A value of 2048 should be much
# larger than necessary given that cost grows as npts^k where k is the number
# of polydisperse parameters.
MAX_LOOPS = 2048


# Pragmas to enable OpenCL features.  Be sure to protect them so that they
# still compile even if OpenCL is not present.
_F16_PRAGMA = """\
#if defined(__OPENCL_VERSION__) // && !defined(cl_khr_fp16)
#  pragma OPENCL EXTENSION cl_khr_fp16: enable
#endif
"""

_F64_PRAGMA = """\
#if defined(__OPENCL_VERSION__) // && !defined(cl_khr_fp64)
#  pragma OPENCL EXTENSION cl_khr_fp64: enable
#endif
"""

def use_cuda():
    return HAVE_CUDA

ENV = None
def reset_environment():
    """
    Call to create a new CUDA context, such as after a change to SAS_OPENCL.
    """
    global ENV
    ENV = GpuEnvironment() if use_cuda() else None

def environment():
    # type: () -> "GpuEnvironment"
    """
    Returns a singleton :class:`GpuEnvironment`.

    This provides a CUDA context for compiling and running the models.
    """
    if ENV is None:
        if not HAVE_CUDA:
            raise RuntimeError("CUDA startup failed with ***"
                               + CUDA_ERROR + "***; using C compiler instead")
        reset_environment()
        if ENV is None:
            raise RuntimeError("SAS_OPENCL=None in environment")
    return ENV
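# Example (sketch): fetch the singleton environment and check double support.
#     env = environment()
#     assert env.has_type(generate.F64)   # always True for this CUDA driver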

def _stretch_input(vector, dtype, extra=1e-3, boundary=32):
    # type: (np.ndarray, np.dtype, float, int) -> np.ndarray
    """
    Stretch an input vector to the correct boundary.

    Performance on the kernels can drop by a factor of two or more if the
    number of values to compute does not fall on a nice power of two
    boundary.  The trailing additional vector elements are given a
    value of *extra*, and so f(*extra*) will be computed for each of
    them.  The values eventually returned to the caller will thus be a
    subset of the computed array.

    *boundary* should be a power of 2 which is at least 32 for good
    performance on current platforms (as of Jan 2015).  It should
    probably be the max of get_warp(kernel, queue) and
    device.min_data_type_align_size//4.
    """
    remainder = vector.size % boundary
    if remainder != 0:
        size = vector.size + (boundary - remainder)
        vector = np.hstack((vector, [extra] * (size - vector.size)))
    return np.ascontiguousarray(vector, dtype=dtype)
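# Example (sketch): a 40-element vector with the default boundary of 32 is
# padded with eight copies of *extra* to length 64; callers then use only
# the first 40 results.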


def compile_model(source, dtype, fast=False):
    # type: (str, np.dtype, bool) -> SourceModule
    """
    Build a model to run on the GPU.

    Returns the compiled program.  The requested *dtype* is substituted into
    the source via :func:`generate.convert_type` before compilation; no check
    is made here for double precision support on the device.
    """
    source_list = [generate.convert_type(source, dtype)]

    if dtype == generate.F16:
        source_list.insert(0, _F16_PRAGMA)
    elif dtype == generate.F64:
        source_list.insert(0, _F64_PRAGMA)

    source_list.insert(0, "#define USE_SINCOS\n")
    source = "\n".join(source_list)
    program = SourceModule(source)  # no_extern_c=True, include_dirs=[...]
    return program
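# Example (sketch): build a module and look up an entry point the same way
# GpuModel.make_kernel() does below; *model_source* and *model_info* are
# placeholders for the output of generate.make_source() and the model's
# ModelInfo.
#     module = compile_model(model_source, generate.F64)
#     Iq = module.get_function(generate.kernel_name(model_info, 'Iq'))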


# For now, this returns one device in the context.
# TODO: create a context that contains all devices on all platforms
class GpuEnvironment(object):
    """
    GPU context, with possibly many devices, and one queue per device.
    """
    def __init__(self, devnum=0):
        # type: (int) -> None
        # Byte boundary for data alignment.
        #self.data_boundary = max(d.min_data_type_align_size
        #                         for d in self.context.devices)
        self.compiled = {}
        #self.device = cuda.Device(devnum)
        #self.context = self.device.make_context()

    def has_type(self, dtype):
        # type: (np.dtype) -> bool
        """
        Return True if all devices support a given type.
        """
        return True

    def compile_program(self, name, source, dtype, fast, timestamp):
        # type: (str, str, np.dtype, bool, float) -> SourceModule
        """
        Compile the program for the device in the given context.
        """
        # Note: PyOpenCL caches based on md5 hash of source, options and device
        # so we don't really need to cache things for ourselves.  I'll do so
        # anyway just to save some data munging time.
        tag = generate.tag_source(source)
        key = "%s-%s-%s%s"%(name, dtype, tag, ("-fast" if fast else ""))
        # Check timestamp on program.
        program, program_timestamp = self.compiled.get(key, (None, np.inf))
        if program_timestamp < timestamp:
            del self.compiled[key]
        if key not in self.compiled:
            logging.info("building %s for CUDA", key)
            program = compile_model(str(source), dtype, fast)
            self.compiled[key] = (program, timestamp)
        return program
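
    # Example (sketch): repeated requests with the same name, source and
    # dtype reuse the cached module, while a newer *timestamp* forces a
    # rebuild.  *model_source* and *timestamp* are placeholders here.
    #     env = environment()
    #     module = env.compile_program("cylinder", model_source,
    #                                  generate.F32, False, timestamp)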

class GpuModel(KernelModel):
    """
    GPU wrapper for a single model.

    *source* and *model_info* are the model source and interface as returned
    from :func:`generate.make_source` and :func:`generate.make_model_info`.

    *dtype* is the desired model precision.  Any numpy dtype for single
    or double precision floats will do, such as 'f', 'float32' or 'single'
    for single and 'd', 'float64' or 'double' for double.  Double precision
    is an optional extension which may not be available on all devices.
    Half precision ('float16', 'half') may be available on some devices.
    Fast precision ('fast') is a loose version of single precision, indicating
    that the compiler is allowed to take shortcuts.
    """
    def __init__(self, source, model_info, dtype=generate.F32, fast=False):
        # type: (Dict[str, str], ModelInfo, np.dtype, bool) -> None
        self.info = model_info
        self.source = source
        self.dtype = dtype
        self.fast = fast
        self.program = None  # delay program creation
        self._kernels = None

    def __getstate__(self):
        # type: () -> Tuple[ModelInfo, str, np.dtype, bool]
        return self.info, self.source, self.dtype, self.fast

    def __setstate__(self, state):
        # type: (Tuple[ModelInfo, str, np.dtype, bool]) -> None
        self.info, self.source, self.dtype, self.fast = state
        self.program = None

    def make_kernel(self, q_vectors):
        # type: (List[np.ndarray]) -> "GpuKernel"
        if self.program is None:
            compile_program = environment().compile_program
            timestamp = generate.ocl_timestamp(self.info)
            self.program = compile_program(
                self.info.name,
                self.source['opencl'],
                self.dtype,
                self.fast,
                timestamp)
            variants = ['Iq', 'Iqxy', 'Imagnetic']
            names = [generate.kernel_name(self.info, k) for k in variants]
            kernels = [self.program.get_function(k) for k in names]
            self._kernels = dict((k, v) for k, v in zip(variants, kernels))
        is_2d = len(q_vectors) == 2
        if is_2d:
            kernel = [self._kernels['Iqxy'], self._kernels['Imagnetic']]
        else:
            kernel = [self._kernels['Iq']]*2
        return GpuKernel(kernel, self.dtype, self.info, q_vectors)

    def release(self):
        # type: () -> None
        """
        Free the resources associated with the model.
        """
        if self.program is not None:
            self.program = None

    def __del__(self):
        # type: () -> None
        self.release()

# TODO: check that we don't need a destructor for buffers which go out of scope
class GpuInput(object):
    """
    Make q data available to the GPU.

    *q_vectors* is a list of q vectors, which will be *[q]* for 1-D data,
    and *[qx, qy]* for 2-D data.  Internally, the vectors will be reallocated
    to get the best performance on the GPU, which may involve shifting and
    stretching the array to better match the memory architecture.  Additional
    points will be evaluated with *q=1e-3*.

    *dtype* is the data type for the q vectors.  The data type should be
    set to match that of the kernel, which is an attribute of
    :class:`GpuModel`.  Note that not all kernels support double
    precision, so even if the program was created for double precision,
    the *GpuModel.dtype* may be single precision.

    Call :meth:`release` when complete.  Even if not called directly, the
    buffer will be released when the data object is freed.
    """
    def __init__(self, q_vectors, dtype=generate.F32):
        # type: (List[np.ndarray], np.dtype) -> None
        # TODO: do we ever need double precision q?
        self.nq = q_vectors[0].size
        self.dtype = np.dtype(dtype)
        self.is_2d = (len(q_vectors) == 2)
        # TODO: stretch input based on get_warp()
        # not doing it now since warp depends on kernel, which is not known
        # at this point, so instead using 32, which is good on the set of
        # architectures tested so far.
        if self.is_2d:
            # Note: 16 rather than 15 because result is 1 longer than input.
            width = ((self.nq+16)//16)*16
            self.q = np.empty((width, 2), dtype=dtype)
            self.q[:self.nq, 0] = q_vectors[0]
            self.q[:self.nq, 1] = q_vectors[1]
        else:
            # Note: 32 rather than 31 because result is 1 longer than input.
            width = ((self.nq+32)//32)*32
            self.q = np.empty(width, dtype=dtype)
            self.q[:self.nq] = q_vectors[0]
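        # Example of the padding above (sketch): for 1-D data with nq = 100,
        # the allocation rounds up to width = 128; only the first nq entries
        # hold real q values and only the first nq results are used.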
        self.global_size = [self.q.shape[0]]
        #print("creating inputs of size", self.global_size)
        self.q_b = cuda.to_device(self.q)

    def release(self):
        # type: () -> None
        """
        Free the memory.
        """
        if self.q_b is not None:
            self.q_b.free()
            self.q_b = None

    def __del__(self):
        # type: () -> None
        self.release()

class GpuKernel(Kernel):
    """
    Callable SAS kernel.

    *kernel* is the pair of compiled GPU functions to call; the second
    entry is used when evaluating magnetic models.

    *model_info* is the module information

    *q_vectors* is the list of q vectors at which the kernel should be
    evaluated

    *dtype* is the kernel precision

    The resulting call method takes *call_details*, which describes the
    polydispersity loops, a *values* vector containing the parameter values
    along with the (value, weight) lists for the polydisperse parameters,
    and *cutoff*, which determines the integration limits: any points with
    combined weight less than *cutoff* will not be calculated.

    Call :meth:`release` when done with the kernel instance.
    """
    def __init__(self, kernel, dtype, model_info, q_vectors):
        # type: (List[cuda.Function], np.dtype, ModelInfo, List[np.ndarray]) -> None
        q_input = GpuInput(q_vectors, dtype)
        self.kernel = kernel
        self.info = model_info
        self.dtype = dtype
        self.dim = '2d' if q_input.is_2d else '1d'
        # Plus one for the normalization value.
        self.result = np.empty(q_input.nq+1, dtype)

        # Inputs and outputs for each kernel call.
        # Note: res may be shorter than res_b if global_size != nq.
        self.result_b = cuda.mem_alloc(q_input.global_size[0] * dtype.itemsize)
        self.q_input = q_input  # allocated by GpuInput above

        self._need_release = [self.result_b, self.q_input]
        self.real = (np.float32 if dtype == generate.F32
                     else np.float64 if dtype == generate.F64
                     else np.float16 if dtype == generate.F16
                     else np.float32)  # will never get here, so use np.float32

    def __call__(self, call_details, values, cutoff, magnetic):
        # type: (CallDetails, np.ndarray, float, bool) -> np.ndarray
        # Arrange data transfer to card.
        details_b = cuda.to_device(call_details.buffer)
        values_b = cuda.to_device(values)

        kernel = self.kernel[1 if magnetic else 0]
        args = [
            np.uint32(self.q_input.nq), None, None,
            details_b, values_b, self.q_input.q_b, self.result_b,
            self.real(cutoff),
        ]
        grid = partition(self.q_input.nq)
        #print("Calling CUDA")
        #call_details.show(values)
        # Call kernel and retrieve results.
        last_nap = time.perf_counter()
        step = 1000000//self.q_input.nq + 1
        #step = 1000000000
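        # Loop over the polydispersity integral in chunks of roughly one
        # million kernel evaluations per launch so a single call cannot tie
        # up the GPU; sync() between chunks and nap occasionally so the
        # display and other processes stay responsive.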
        for start in range(0, call_details.num_eval, step):
            stop = min(start + step, call_details.num_eval)
            #print("queuing", start, stop)
            args[1:3] = [np.int32(start), np.int32(stop)]
            kernel(*args, **grid)
            if stop < call_details.num_eval:
                sync()
                # Allow other processes to run.
                current_time = time.perf_counter()
                if current_time - last_nap > 0.5:
                    time.sleep(0.05)
                    last_nap = current_time
        sync()
        details_b.free()
        values_b.free()
        cuda.memcpy_dtoh(self.result, self.result_b)
        #print("result", self.result)

        pd_norm = self.result[self.q_input.nq]
        scale = values[0]/(pd_norm if pd_norm != 0.0 else 1.0)
        background = values[1]
        #print("scale", scale, values[0], self.result[self.q_input.nq], background)
        return scale*self.result[:self.q_input.nq] + background
        # return self.result[:self.q_input.nq]

    def release(self):
        # type: () -> None
        """
        Release resources associated with the kernel.
        """
        if self.result_b is not None:
            self.result_b.free()
            self.result_b = None

    def __del__(self):
        # type: () -> None
        self.release()


def sync():
    """
    Overview:
        Waits for operations in the current context to complete.

    Note: Maybe context.synchronize() is sufficient.
    """
    #return # The following works in C++; don't know what pycuda is doing.
    # Create an event with which to synchronize.
    done = cuda.Event()

    # Schedule an event trigger on the GPU.
    done.record()

    # Line added so we don't hog resources while waiting.
    while not done.query():
        time.sleep(0.01)

    # Block until the GPU executes the kernel.
    done.synchronize()
    # Clean up the event; I don't think they can be reused.
    del done


def partition(n):
    '''
    Overview:
        Automatically choose the thread block and grid dimensions to achieve
        a reasonable level of calculation efficiency.
    '''
    max_gx, max_gy = 65535, 65535
    blocksize = 32
    #max_gx, max_gy = 5, 65536
    #blocksize = 3
    block = (blocksize, 1, 1)
    num_blocks = (n + blocksize - 1)//blocksize
    if num_blocks < max_gx:
        grid = (num_blocks, 1)
    else:
        gx = max_gx
        gy = (num_blocks + max_gx - 1)//max_gx
        if gy >= max_gy:
            raise ValueError("vector is too large")
        grid = (gx, gy)
    #print("block", block, "grid", grid)
    #print("waste", block[0]*block[1]*block[2]*grid[0]*grid[1] - n)
    return dict(block=block, grid=grid)
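
# Worked example (sketch): n = 1000000 with blocksize = 32 gives
# num_blocks = 31250, which fits within a single grid row, so the kernel is
# launched with block=(32, 1, 1) and grid=(31250, 1).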