source: sasmodels/sasmodels/kernelcuda.py @ 3199b17

core_shell_microgelsmagnetic_modelticket-1257-vesicle-productticket_1156ticket_1265_superballticket_822_more_unit_tests
Last change on this file since 3199b17 was 3199b17, checked in by Paul Kienzle <pkienzle@…>, 8 months ago

PEP 8 changes and improved consistency between OpenCL/CUDA/DLL code

  • Property mode set to 100644
File size: 20.2 KB
Line 
1"""
2GPU driver for C kernels (with CUDA)
3
4To select cuda, use SAS_OPENCL=cuda, or SAS_OPENCL=cuda:n for a particular
5device number.  If no device number is specified, then look for CUDA_DEVICE=n
6or a file ~/.cuda-device containing n for the device number.  Otherwise, try
7all available device numbers.
8
9TODO: docs are out of date
10
11There should be a single GPU environment running on the system.  This
12environment is constructed on the first call to :func:`env`, and the
13same environment is returned on each call.
14
15After retrieving the environment, the next step is to create the kernel.
16This is done with a call to :meth:`GpuEnvironment.make_kernel`, which
17returns the type of data used by the kernel.
18
19Next a :class:`GpuData` object should be created with the correct kind
20of data.  This data object can be used by multiple kernels, for example,
21if the target model is a weighted sum of multiple kernels.  The data
22should include any extra evaluation points required to compute the proper
23data smearing.  This need not match the square grid for 2D data if there
24is an index saying which q points are active.
25
26Together the GpuData, the program, and a device form a :class:`GpuKernel`.
27This kernel is used during fitting, receiving new sets of parameters and
28evaluating them.  The output value is stored in an output buffer on the
29devices, where it can be combined with other structure factors and form
30factors and have instrumental resolution effects applied.
31
32In order to use OpenCL for your models, you will need OpenCL drivers for
33your machine.  These should be available from your graphics card vendor.
34Intel provides OpenCL drivers for CPUs as well as their integrated HD
35graphics chipsets.  AMD also provides drivers for Intel CPUs, but as of
36this writing the performance is lacking compared to the Intel drivers.
37NVidia combines drivers for CUDA and OpenCL in one package.  The result
38is a bit messy if you have multiple drivers installed.  You can see which
39drivers are available by starting python and running:
40
41    import pyopencl as cl
42    cl.create_some_context(interactive=True)
43
44Once you have done that, it will show the available drivers which you
45can select.  It will then tell you that you can use these drivers
46automatically by setting the SAS_OPENCL environment variable, which is
47PYOPENCL_CTX equivalent but not conflicting with other pyopnecl programs.
48
49Some graphics cards have multiple devices on the same card.  You cannot
50yet use both of them concurrently to evaluate models, but you can run
51the program twice using a different device for each session.
52
53OpenCL kernels are compiled when needed by the device driver.  Some
54drivers produce compiler output even when there is no error.  You
55can see the output by setting PYOPENCL_COMPILER_OUTPUT=1.  It should be
56harmless, albeit annoying.
57"""
58from __future__ import print_function
59
60import os
61import warnings
62import logging
63import time
64import re
65import atexit
66
67import numpy as np  # type: ignore
68
69
70# Attempt to setup CUDA. This may fail if the pycuda package is not
71# installed or if it is installed but there are no devices available.
72try:
73    import pycuda.driver as cuda  # type: ignore
74    from pycuda.compiler import SourceModule
75    from pycuda.tools import make_default_context, clear_context_caches
76    # Ask CUDA for the default context (so that we know that one exists)
77    # then immediately throw it away in case the user doesn't want it.
78    # Note: cribbed from pycuda.autoinit
79    cuda.init()
80    context = make_default_context()
81    context.pop()
82    clear_context_caches()
83    del context
84    HAVE_CUDA = True
85    CUDA_ERROR = ""
86except Exception as exc:
87    HAVE_CUDA = False
88    CUDA_ERROR = str(exc)
89
90from . import generate
91from .kernel import KernelModel, Kernel
92
93# pylint: disable=unused-import
94try:
95    from typing import Tuple, Callable, Any
96    from .modelinfo import ModelInfo
97    from .details import CallDetails
98except ImportError:
99    pass
100# pylint: enable=unused-import
101
102# The max loops number is limited by the amount of local memory available
103# on the device.  You don't want to make this value too big because it will
104# waste resources, nor too small because it may interfere with users trying
105# to do their polydispersity calculations.  A value of 1024 should be much
106# larger than necessary given that cost grows as npts^k where k is the number
107# of polydisperse parameters.
108MAX_LOOPS = 2048
109
110
111def use_cuda():
112    sas_opencl = os.environ.get("SAS_OPENCL", "CUDA").lower()
113    return HAVE_CUDA and sas_opencl.startswith("cuda")
114
115
116ENV = None
117def reset_environment():
118    """
119    Call to create a new OpenCL context, such as after a change to SAS_OPENCL.
120    """
121    global ENV
122    # Free any previous allocated context.
123    if ENV is not None and ENV.context is not None:
124        ENV.release()
125    ENV = GpuEnvironment() if use_cuda() else None
126
127
128def environment():
129    # type: () -> "GpuEnvironment"
130    """
131    Returns a singleton :class:`GpuEnvironment`.
132
133    This provides an OpenCL context and one queue per device.
134    """
135    if ENV is None:
136        if not HAVE_CUDA:
137            raise RuntimeError("CUDA startup failed with ***"
138                            + CUDA_ERROR + "***; using C compiler instead")
139        reset_environment()
140        if ENV is None:
141            raise RuntimeError("SAS_OPENCL=None in environment")
142    return ENV
143
144
145def has_type(dtype):
146    # type: (np.dtype) -> bool
147    """
148    Return true if device supports the requested precision.
149    """
150    # Assume the NVIDIA card supports 32-bit and 64-bit floats.
151    # TODO: Check if pycuda support F16.
152    return dtype in (generate.F32, generate.F64)
153
154
155FUNCTION_PATTERN = re.compile(r"""^
156  (?P<space>\s*)                       # Initial space.
157  (?P<qualifiers>^(?:\s*\b\w+\b\s*)+)  # One or more qualifiers before function.
158  (?P<function>\s*\b\w+\b\s*[(])       # Function name plus open parens.
159  """, re.VERBOSE|re.MULTILINE)
160
161MARKED_PATTERN = re.compile(r"""
162  \b(return|else|kernel|device|__device__)\b
163  """, re.VERBOSE|re.MULTILINE)
164
165
166def _add_device_tag(match):
167    # type: (None) -> str
168    # Note: Should be re.Match, but that isn't a simple type.
169    """
170    replace qualifiers with __device__ qualifiers if needed
171    """
172    qualifiers = match.group("qualifiers")
173    if MARKED_PATTERN.search(qualifiers):
174        start, end = match.span()
175        return match.string[start:end]
176    else:
177        function = match.group("function")
178        space = match.group("space")
179        return "".join((space, "__device__ ", qualifiers, function))
180
181
182def mark_device_functions(source):
183    # type: (str) -> str
184    """
185    Mark all function declarations as __device__ functions (except kernel).
186    """
187    return FUNCTION_PATTERN.sub(_add_device_tag, source)
188
189
190def show_device_functions(source):
191    # type: (str) -> str
192    """
193    Show all discovered function declarations, but don't change any.
194    """
195    for match in FUNCTION_PATTERN.finditer(source):
196        print(match.group('qualifiers').replace('\n',r'\n'), match.group('function'), '(')
197    return source
198
199
200def compile_model(source, dtype, fast=False):
201    # type: (str, np.dtype, bool) -> SourceModule
202    """
203    Build a model to run on the gpu.
204
205    Returns the compiled program and its type.  The returned type will
206    be float32 even if the desired type is float64 if any of the
207    devices in the context do not support the cl_khr_fp64 extension.
208    """
209    dtype = np.dtype(dtype)
210    if not has_type(dtype):
211        raise RuntimeError("%s not supported for devices"%dtype)
212
213    source_list = [generate.convert_type(source, dtype)]
214
215    source_list.insert(0, "#define USE_SINCOS\n")
216    source = "\n".join(source_list)
217    #source = show_device_functions(source)
218    source = mark_device_functions(source)
219    #with open('/tmp/kernel.cu', 'w') as fd: fd.write(source)
220    #print(source)
221    #options = ['--verbose', '-E']
222    options = ['--use_fast_math'] if fast else None
223    program = SourceModule(source, no_extern_c=True, options=options) #, include_dirs=[...])
224
225    #print("done with "+program)
226    return program
227
228
229# For now, this returns one device in the context.
230# TODO: Create a context that contains all devices on all platforms.
231class GpuEnvironment(object):
232    """
233    GPU context for CUDA.
234    """
235    context = None # type: cuda.Context
236    def __init__(self, devnum=None):
237        # type: (int) -> None
238        env = os.environ.get("SAS_OPENCL", "").lower()
239        if devnum is None and env.startswith("cuda:"):
240            devnum = int(env[5:])
241
242        # Set the global context to the particular device number if one is
243        # given, otherwise use the default context.  Perhaps this will be set
244        # by an environment variable within autoinit.
245        if devnum is not None:
246            self.context = cuda.Device(devnum).make_context()
247        else:
248            self.context = make_default_context()
249
250        ## Byte boundary for data alignment.
251        #self.data_boundary = max(d.min_data_type_align_size
252        #                         for d in self.context.devices)
253
254        # Cache for compiled programs, and for items in context.
255        self.compiled = {}
256
257    def release(self):
258        if self.context is not None:
259            self.context.pop()
260            self.context = None
261
262    def __del__(self):
263        self.release()
264
265    def has_type(self, dtype):
266        # type: (np.dtype) -> bool
267        """
268        Return True if all devices support a given type.
269        """
270        return has_type(dtype)
271
272    def compile_program(self, name, source, dtype, fast, timestamp):
273        # type: (str, str, np.dtype, bool, float) -> cl.Program
274        """
275        Compile the program for the device in the given context.
276        """
277        # Note: PyCuda (probably) caches but I'll do so as well just to
278        # save some data munging time.
279        tag = generate.tag_source(source)
280        key = "%s-%s-%s%s"%(name, dtype, tag, ("-fast" if fast else ""))
281        # Check timestamp on program.
282        program, program_timestamp = self.compiled.get(key, (None, np.inf))
283        if program_timestamp < timestamp:
284            del self.compiled[key]
285        if key not in self.compiled:
286            logging.info("building %s for CUDA", key)
287            program = compile_model(str(source), dtype, fast)
288            self.compiled[key] = (program, timestamp)
289        return program
290
291
292class GpuModel(KernelModel):
293    """
294    GPU wrapper for a single model.
295
296    *source* and *model_info* are the model source and interface as returned
297    from :func:`generate.make_source` and :func:`generate.make_model_info`.
298
299    *dtype* is the desired model precision.  Any numpy dtype for single
300    or double precision floats will do, such as 'f', 'float32' or 'single'
301    for single and 'd', 'float64' or 'double' for double.  Double precision
302    is an optional extension which may not be available on all devices.
303    Half precision ('float16','half') may be available on some devices.
304    Fast precision ('fast') is a loose version of single precision, indicating
305    that the compiler is allowed to take shortcuts.
306    """
307    info = None  # type: ModelInfo
308    source = ""  # type: str
309    dtype = None  # type: np.dtype
310    fast = False  # type: bool
311    _program = None  # type: SourceModule
312    _kernels = None  # type: Dict[str, cuda.Function]
313
314    def __init__(self, source, model_info, dtype=generate.F32, fast=False):
315        # type: (Dict[str,str], ModelInfo, np.dtype, bool) -> None
316        self.info = model_info
317        self.source = source
318        self.dtype = dtype
319        self.fast = fast
320
321    def __getstate__(self):
322        # type: () -> Tuple[ModelInfo, str, np.dtype, bool]
323        return self.info, self.source, self.dtype, self.fast
324
325    def __setstate__(self, state):
326        # type: (Tuple[ModelInfo, str, np.dtype, bool]) -> None
327        self.info, self.source, self.dtype, self.fast = state
328        self._program = self._kernels = None
329
330    def make_kernel(self, q_vectors):
331        # type: (List[np.ndarray]) -> "GpuKernel"
332        return GpuKernel(self, q_vectors)
333
334    def get_function(self, name):
335        # type: (str) -> cuda.Function
336        """
337        Fetch the kernel from the environment by name, compiling it if it
338        does not already exist.
339        """
340        if self._program is None:
341            self._prepare_program()
342        return self._kernels[name]
343
344    def _prepare_program(self):
345        # type: (str) -> None
346        env = environment()
347        timestamp = generate.ocl_timestamp(self.info)
348        program = env.compile_program(
349            self.info.name,
350            self.source['opencl'],
351            self.dtype,
352            self.fast,
353            timestamp)
354        variants = ['Iq', 'Iqxy', 'Imagnetic']
355        names = [generate.kernel_name(self.info, k) for k in variants]
356        functions = [program.get_function(k) for k in names]
357        self._kernels = {k: v for k, v in zip(variants, functions)}
358        # Keep a handle to program so GC doesn't collect.
359        self._program = program
360
361
362# TODO: Check that we don't need a destructor for buffers which go out of scope.
363class GpuInput(object):
364    """
365    Make q data available to the gpu.
366
367    *q_vectors* is a list of q vectors, which will be *[q]* for 1-D data,
368    and *[qx, qy]* for 2-D data.  Internally, the vectors will be reallocated
369    to get the best performance on OpenCL, which may involve shifting and
370    stretching the array to better match the memory architecture.  Additional
371    points will be evaluated with *q=1e-3*.
372
373    *dtype* is the data type for the q vectors. The data type should be
374    set to match that of the kernel, which is an attribute of
375    :class:`GpuProgram`.  Note that not all kernels support double
376    precision, so even if the program was created for double precision,
377    the *GpuProgram.dtype* may be single precision.
378
379    Call :meth:`release` when complete.  Even if not called directly, the
380    buffer will be released when the data object is freed.
381    """
382    def __init__(self, q_vectors, dtype=generate.F32):
383        # type: (List[np.ndarray], np.dtype) -> None
384        # TODO: Do we ever need double precision q?
385        self.nq = q_vectors[0].size
386        self.dtype = np.dtype(dtype)
387        self.is_2d = (len(q_vectors) == 2)
388        # TODO: Stretch input based on get_warp().
389        # Not doing it now since warp depends on kernel, which is not known
390        # at this point, so instead using 32, which is good on the set of
391        # architectures tested so far.
392        if self.is_2d:
393            width = ((self.nq+15)//16)*16
394            self.q = np.empty((width, 2), dtype=dtype)
395            self.q[:self.nq, 0] = q_vectors[0]
396            self.q[:self.nq, 1] = q_vectors[1]
397        else:
398            width = ((self.nq+31)//32)*32
399            self.q = np.empty(width, dtype=dtype)
400            self.q[:self.nq] = q_vectors[0]
401        self.global_size = [self.q.shape[0]]
402        #print("creating inputs of size", self.global_size)
403
404        # Transfer input value to GPU.
405        self.q_b = cuda.to_device(self.q)
406
407    def release(self):
408        # type: () -> None
409        """
410        Free the buffer associated with the q value.
411        """
412        if self.q_b is not None:
413            self.q_b.free()
414            self.q_b = None
415
416    def __del__(self):
417        # type: () -> None
418        self.release()
419
420
421class GpuKernel(Kernel):
422    """
423    Callable SAS kernel.
424
425    *model* is the GpuModel object to call
426
427    The kernel is derived from :class:`Kernel`, providing the
428    :meth:`call_kernel` method to evaluate the kernel for a given set of
429    parameters.  Because of the need to move the q values to the GPU before
430    evaluation, the kernel is instantiated for a particular set of q vectors,
431    and can be called many times without transfering q each time.
432
433    Call :meth:`release` when done with the kernel instance.
434    """
435    #: SAS model information structure.
436    info = None  # type: ModelInfo
437    #: Kernel precision.
438    dtype = None  # type: np.dtype
439    #: Kernel dimensions (1d or 2d).
440    dim = ""  # type: str
441    #: Calculation results, updated after each call to :meth:`_call_kernel`.
442    result = None  # type: np.ndarray
443
444    def __init__(self, model, q_vectors):
445        # type: (GpuModel, List[np.ndarray]) -> None
446        dtype = model.dtype
447        self.q_input = GpuInput(q_vectors, dtype)
448        self._model = model
449
450        # Attributes accessed from the outside.
451        self.dim = '2d' if self.q_input.is_2d else '1d'
452        self.info = model.info
453        self.dtype = dtype
454
455        # Converter to translate input to target type.
456        self._as_dtype = np.float64 if dtype == generate.F64 else np.float32
457
458        # Holding place for the returned value.
459        nout = 2 if self.info.have_Fq and self.dim == '1d' else 1
460        extra_q = 4  # Total weight, form volume, shell volume and R_eff.
461        self.result = np.empty(self.q_input.nq*nout + extra_q, dtype)
462
463        # Allocate result value on GPU.
464        width = ((self.result.size+31)//32)*32 * self.dtype.itemsize
465        self._result_b = cuda.mem_alloc(width)
466
467    def _call_kernel(self, call_details, values, cutoff, magnetic,
468                     effective_radius_type):
469        # type: (CallDetails, np.ndarray, float, bool, int) -> np.ndarray
470
471        # Arrange data transfer to card.
472        details_b = cuda.to_device(call_details.buffer)
473        values_b = cuda.to_device(values)
474
475        # Setup kernel function and arguments.
476        name = 'Iq' if self.dim == '1d' else 'Imagnetic' if magnetic else 'Iqxy'
477        kernel = self._model.get_function(name)
478        kernel_args = [
479            np.uint32(self.q_input.nq),  # Number of inputs.
480            None,  # Placeholder for pd_start.
481            None,  # Placeholder for pd_stop.
482            details_b,  # Problem definition.
483            values_b,  # Parameter values.
484            self.q_input.q_b,  # Q values.
485            self._result_b,   # Result storage.
486            self._as_dtype(cutoff),  # Probability cutoff.
487            np.uint32(effective_radius_type),  # R_eff mode.
488        ]
489        grid = partition(self.q_input.nq)
490
491        # Call kernel and retrieve results.
492        #print("Calling CUDA")
493        #call_details.show(values)
494        last_nap = time.clock()
495        step = 100000000//self.q_input.nq + 1
496        #step = 1000000000
497        for start in range(0, call_details.num_eval, step):
498            stop = min(start + step, call_details.num_eval)
499            #print("queuing",start,stop)
500            kernel_args[1:3] = [np.int32(start), np.int32(stop)]
501            kernel(*kernel_args, **grid)
502            if stop < call_details.num_eval:
503                sync()
504                # Allow other processes to run.
505                current_time = time.clock()
506                if current_time - last_nap > 0.5:
507                    time.sleep(0.001)
508                    last_nap = current_time
509        sync()
510        cuda.memcpy_dtoh(self.result, self._result_b)
511        #print("result", self.result)
512
513        details_b.free()
514        values_b.free()
515
516    def release(self):
517        # type: () -> None
518        """
519        Release resources associated with the kernel.
520        """
521        self.q_input.release()
522        if self._result_b is not None:
523            self._result_b.free()
524            self._result_b = None
525
526    def __del__(self):
527        # type: () -> None
528        self.release()
529
530
531def sync():
532    """
533    Overview:
534        Waits for operation in the current context to complete.
535
536    Note: Maybe context.synchronize() is sufficient.
537    """
538    # Create an event with which to synchronize.
539    done = cuda.Event()
540
541    # Schedule an event trigger on the GPU.
542    done.record()
543
544    # Make sure we don't hog resource while waiting to sync.
545    while not done.query():
546        time.sleep(0.01)
547
548    # Block until the GPU executes the kernel.
549    done.synchronize()
550
551    # Clean up the event; I don't think they can be reused.
552    del done
553
554
555def partition(n):
556    '''
557    Overview:
558        Auto grids the thread blocks to achieve some level of calculation
559    efficiency.
560    '''
561    max_gx, max_gy = 65535, 65535
562    blocksize = 32
563    #max_gx, max_gy = 5, 65536
564    #blocksize = 3
565    block = (blocksize, 1, 1)
566    num_blocks = int((n+blocksize-1)/blocksize)
567    if num_blocks < max_gx:
568        grid = (num_blocks, 1)
569    else:
570        gx = max_gx
571        gy = (num_blocks + max_gx - 1) / max_gx
572        if gy >= max_gy:
573            raise ValueError("vector is too large")
574        grid = (gx, gy)
575    #print("block", block, "grid", grid)
576    #print("waste", block[0]*block[1]*block[2]*grid[0]*grid[1] - n)
577    return dict(block=block, grid=grid)
Note: See TracBrowser for help on using the repository browser.