source: sasmodels/sasmodels/kernelcuda.py @ 00afc15

Last change on this file since 00afc15 was 00afc15, checked in by pkienzle, 5 years ago

fix cuda tests

1"""
2GPU driver for C kernels (with CUDA)
3
4To select cuda, use SAS_OPENCL=cuda, or SAS_OPENCL=cuda:n for a particular
5device number.  If no device number is specified, then look for CUDA_DEVICE=n
6or a file ~/.cuda-device containing n for the device number.  Otherwise, try
7all available device numbers.
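
For example, to force the second CUDA device (a minimal sketch; set the
variable before the GPU environment is first created):

    import os
    os.environ["SAS_OPENCL"] = "cuda:1"   # or just "cuda" for the default selection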

TODO: docs are out of date

There should be a single GPU environment running on the system.  This
environment is constructed on the first call to :func:`environment`, and the
same environment is returned on each call.

After retrieving the environment, the next step is to create the kernel.
This is done with a call to :meth:`GpuEnvironment.make_kernel`, which
returns the type of data used by the kernel.

Next a :class:`GpuData` object should be created with the correct kind
of data.  This data object can be used by multiple kernels, for example,
if the target model is a weighted sum of multiple kernels.  The data
should include any extra evaluation points required to compute the proper
data smearing.  This need not match the square grid for 2D data if there
is an index saying which q points are active.

Together the GpuData, the program, and a device form a :class:`GpuKernel`.
This kernel is used during fitting, receiving new sets of parameters and
evaluating them.  The output value is stored in an output buffer on the
device, where it can be combined with other structure factors and form
factors and have instrumental resolution effects applied.

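The documentation above predates the current class names.  As a rough
sketch using the classes defined in this module (here ``model_source`` and
``model_info`` stand in for values produced by the sasmodels generator):

    import numpy as np
    q = np.logspace(-3, -1, 200)                 # 1-D q values
    model = GpuModel(model_source, model_info)   # compiled lazily on first use
    kernel = model.make_kernel([q])              # use [qx, qy] for 2-D data
    # ... evaluate via the Kernel interface, then ...
    kernel.release()
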
In order to use OpenCL for your models, you will need OpenCL drivers for
your machine.  These should be available from your graphics card vendor.
Intel provides OpenCL drivers for CPUs as well as their integrated HD
graphics chipsets.  AMD also provides drivers for Intel CPUs, but as of
this writing the performance is lacking compared to the Intel drivers.
NVidia combines drivers for CUDA and OpenCL in one package.  The result
is a bit messy if you have multiple drivers installed.  You can see which
drivers are available by starting python and running:

    import pyopencl as cl
    cl.create_some_context(interactive=True)

Once you have done that, it will show the available drivers which you
can select.  It will then tell you that you can use these drivers
automatically by setting the SAS_OPENCL environment variable, which is
equivalent to PYOPENCL_CTX but does not conflict with other pyopencl
programs.

Some graphics cards have multiple devices on the same card.  You cannot
yet use both of them concurrently to evaluate models, but you can run
the program twice using a different device for each session.

OpenCL kernels are compiled when needed by the device driver.  Some
drivers produce compiler output even when there is no error.  You
can see the output by setting PYOPENCL_COMPILER_OUTPUT=1.  It should be
harmless, albeit annoying.
"""
from __future__ import print_function

import os
import warnings
import logging
import time
import re
import atexit

import numpy as np  # type: ignore


# Attempt to setup cuda. This may fail if the pycuda package is not
# installed or if it is installed but there are no devices available.
try:
    import pycuda.driver as cuda  # type: ignore
    from pycuda.compiler import SourceModule
    from pycuda.tools import make_default_context, clear_context_caches
    # Ask CUDA for the default context (so that we know that one exists)
    # then immediately throw it away in case the user doesn't want it.
    # Note: cribbed from pycuda.autoinit
    cuda.init()
    context = make_default_context()
    context.pop()
    clear_context_caches()
    del context
    HAVE_CUDA = True
    CUDA_ERROR = ""
except Exception as exc:
    HAVE_CUDA = False
    CUDA_ERROR = str(exc)
from . import generate
from .kernel import KernelModel, Kernel

# pylint: disable=unused-import
try:
    from typing import Tuple, Callable, Any, Dict, List
    from .modelinfo import ModelInfo
    from .details import CallDetails
except ImportError:
    pass
# pylint: enable=unused-import

# The max loops number is limited by the amount of local memory available
# on the device.  You don't want to make this value too big because it will
# waste resources, nor too small because it may interfere with users trying
# to do their polydispersity calculations.  The value of 2048 used here should
# be much larger than necessary given that cost grows as npts^k where k is
# the number of polydisperse parameters.
MAX_LOOPS = 2048

def use_cuda():
    env = os.environ.get("SAS_OPENCL", "").lower()
    return HAVE_CUDA and (env == "" or env.startswith("cuda"))

ENV = None
def reset_environment():
    """
    Call to create a new CUDA context, such as after a change to SAS_OPENCL.
    """
    global ENV
    # Free any previous allocated context.
    if ENV is not None and ENV.context is not None:
        ENV.release()
    ENV = GpuEnvironment() if use_cuda() else None

def environment():
    # type: () -> "GpuEnvironment"
    """
    Returns a singleton :class:`GpuEnvironment`.

    This provides a CUDA context for the selected device.
    """
    if ENV is None:
        if not HAVE_CUDA:
            raise RuntimeError("CUDA startup failed with ***"
                            + CUDA_ERROR + "***; using C compiler instead")
        reset_environment()
        if ENV is None:
            raise RuntimeError("SAS_OPENCL=None in environment")
    return ENV

def free_context():
    global ENV
    if ENV is not None:
        ENV.release()
        ENV = None

atexit.register(free_context)

def has_type(dtype):
    # type: (np.dtype) -> bool
    """
    Return true if device supports the requested precision.
    """
    # Assume the nvidia card supports 32-bit and 64-bit floats.
    # TODO: check if pycuda support F16
    return dtype in (generate.F32, generate.F64)


FUNCTION_PATTERN = re.compile(r"""^
  (?P<space>\s*)                   # initial space
  (?P<qualifiers>^(?:\s*\b\w+\b\s*)+) # one or more qualifiers before function
  (?P<function>\s*\b\w+\b\s*[(])      # function name plus open parens
  """, re.VERBOSE|re.MULTILINE)

MARKED_PATTERN = re.compile(r"""
  \b(return|else|kernel|device|__device__)\b
  """, re.VERBOSE|re.MULTILINE)

def _add_device_tag(match):
    # type: (None) -> str
    # Note: should be re.Match, but that isn't a simple type
    """
    replace qualifiers with __device__ qualifiers if needed
    """
    qualifiers = match.group("qualifiers")
    if MARKED_PATTERN.search(qualifiers):
        start, end = match.span()
        return match.string[start:end]
    else:
        function = match.group("function")
        space = match.group("space")
        return "".join((space, "__device__ ", qualifiers, function))

def mark_device_functions(source):
    # type: (str) -> str
    """
    Mark all function declarations as __device__ functions (except kernel).
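
    For example (illustrative), ``double form_volume(double radius) {``
    becomes ``__device__ double form_volume(double radius) {``, while
    declarations already containing ``__device__`` or the ``kernel``
    qualifier are left untouched.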
188    """
189    return FUNCTION_PATTERN.sub(_add_device_tag, source)
190
191def show_device_functions(source):
192    # type: (str) -> str
193    """
194    Show all discovered function declarations, but don't change any.
195    """
196    for match in FUNCTION_PATTERN.finditer(source):
197        print(match.group('qualifiers').replace('\n',r'\n'), match.group('function'), '(')
198    return source
199
def compile_model(source, dtype, fast=False):
    # type: (str, np.dtype, bool) -> SourceModule
    """
    Build a model to run on the GPU.

    Returns the compiled program (a pycuda ``SourceModule``).  A RuntimeError
    is raised if *dtype* is not a supported precision (single or double).
    """
    dtype = np.dtype(dtype)
    if not has_type(dtype):
        raise RuntimeError("%s not supported for devices"%dtype)

    source_list = [generate.convert_type(source, dtype)]

    source_list.insert(0, "#define USE_SINCOS\n")
    source = "\n".join(source_list)
    #source = show_device_functions(source)
    source = mark_device_functions(source)
    #with open('/tmp/kernel.cu', 'w') as fd: fd.write(source)
    #print(source)
    #options = ['--verbose', '-E']
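    # --use_fast_math is the nvcc analogue of the loose "fast" single
    # precision option described in the GpuModel docstring.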
    options = ['--use_fast_math'] if fast else None
    program = SourceModule(source, no_extern_c=True, options=options) # include_dirs=[...]

    #print("done with "+program)
    return program


# for now, this returns one device in the context
# TODO: create a context that contains all devices on all platforms
class GpuEnvironment(object):
    """
    GPU context for a single CUDA device, plus a cache of compiled programs.
    """
    context = None # type: cuda.Context
    def __init__(self, devnum=None):
        # type: (int) -> None
        # Byte boundary for data alignment
        #self.data_boundary = max(d.min_data_type_align_size
        #                         for d in self.context.devices)
        self.compiled = {}
        env = os.environ.get("SAS_OPENCL", "").lower()
        if devnum is None and env.startswith("cuda:"):
            devnum = int(env[5:])
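            # e.g. SAS_OPENCL="cuda:2" selects CUDA device number 2.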
        # Set the global context to the particular device number if one is
        # given, otherwise use the default context.  Perhaps this will be set
        # by an environment variable within autoinit.
        if devnum is not None:
            self.context = cuda.Device(devnum).make_context()
        else:
            self.context = make_default_context()

    def release(self):
        if self.context is not None:
            self.context.pop()
            self.context = None

    def __del__(self):
        self.release()

    def has_type(self, dtype):
        # type: (np.dtype) -> bool
        """
        Return True if all devices support a given type.
        """
        return has_type(dtype)

    def compile_program(self, name, source, dtype, fast, timestamp):
        # type: (str, str, np.dtype, bool, float) -> SourceModule
        """
        Compile the program for the device in the given context.
        """
        # Note: PyOpenCL caches based on md5 hash of source, options and device
        # so we don't really need to cache things for ourselves.  I'll do so
        # anyway just to save some data munging time.
        tag = generate.tag_source(source)
        key = "%s-%s-%s%s"%(name, dtype, tag, ("-fast" if fast else ""))
        # Check timestamp on program
        program, program_timestamp = self.compiled.get(key, (None, np.inf))
        if program_timestamp < timestamp:
            del self.compiled[key]
        if key not in self.compiled:
            logging.info("building %s for CUDA", key)
            program = compile_model(str(source), dtype, fast)
            self.compiled[key] = (program, timestamp)
        return program

class GpuModel(KernelModel):
    """
    GPU wrapper for a single model.

    *source* and *model_info* are the model source and interface as returned
    from :func:`generate.make_source` and :func:`generate.make_model_info`.

    *dtype* is the desired model precision.  Any numpy dtype for single
    or double precision floats will do, such as 'f', 'float32' or 'single'
    for single and 'd', 'float64' or 'double' for double.  Double precision
    is an optional extension which may not be available on all devices.
    Half precision ('float16','half') is not currently supported by this
    driver.  Fast precision ('fast') is a loose version of single precision,
    indicating that the compiler is allowed to take shortcuts.
    """
    info = None # type: ModelInfo
    source = "" # type: str
    dtype = None # type: np.dtype
    fast = False # type: bool
    _program = None # type: SourceModule
    _kernels = None # type: Dict[str, cuda.Function]

    def __init__(self, source, model_info, dtype=generate.F32, fast=False):
        # type: (Dict[str,str], ModelInfo, np.dtype, bool) -> None
        self.info = model_info
        self.source = source
        self.dtype = dtype
        self.fast = fast

    def __getstate__(self):
        # type: () -> Tuple[ModelInfo, str, np.dtype, bool]
        return self.info, self.source, self.dtype, self.fast

    def __setstate__(self, state):
        # type: (Tuple[ModelInfo, str, np.dtype, bool]) -> None
        self.info, self.source, self.dtype, self.fast = state
        self._program = self._kernels = None

    def make_kernel(self, q_vectors):
        # type: (List[np.ndarray]) -> "GpuKernel"
        return GpuKernel(self, q_vectors)

    def get_function(self, name):
        # type: (str) -> cuda.Function
        """
        Fetch the kernel from the environment by name, compiling it if it
        does not already exist.
        """
        if self._program is None:
            self._prepare_program()
        return self._kernels[name]

    def _prepare_program(self):
        # type: () -> None
        env = environment()
        timestamp = generate.ocl_timestamp(self.info)
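        # Note: the CUDA build reuses the 'opencl' variant of the generated
        # source; compile_model() rewrites it for nvcc (e.g. adding __device__).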
        program = env.compile_program(
            self.info.name,
            self.source['opencl'],
            self.dtype,
            self.fast,
            timestamp)
        variants = ['Iq', 'Iqxy', 'Imagnetic']
        names = [generate.kernel_name(self.info, k) for k in variants]
        functions = [program.get_function(k) for k in names]
        self._kernels = {k: v for k, v in zip(variants, functions)}
        # keep a handle to program so GC doesn't collect
        self._program = program

# TODO: check that we don't need a destructor for buffers which go out of scope
class GpuInput(object):
    """
    Make q data available to the gpu.

    *q_vectors* is a list of q vectors, which will be *[q]* for 1-D data,
    and *[qx, qy]* for 2-D data.  Internally, the vectors will be reallocated
    to get the best performance on the GPU, which may involve shifting and
    stretching the array to better match the memory architecture.  Additional
    points will be evaluated with *q=1e-3*.
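
    For example (illustrative), a 1-D data set with 100 q values is stored in
    a buffer of length 128: the size is rounded up to the next multiple of 32,
    leaving at least one spare slot because the result is one entry longer
    than the input.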

    *dtype* is the data type for the q vectors. The data type should be
    set to match that of the kernel, which is an attribute of
    :class:`GpuModel`.  Note that not all kernels support double
    precision, so even if the program was created for double precision,
    the *GpuModel.dtype* may be single precision.

    Call :meth:`release` when complete.  Even if not called directly, the
    buffer will be released when the data object is freed.
    """
    def __init__(self, q_vectors, dtype=generate.F32):
        # type: (List[np.ndarray], np.dtype) -> None
        # TODO: do we ever need double precision q?
        self.nq = q_vectors[0].size
        self.dtype = np.dtype(dtype)
        self.is_2d = (len(q_vectors) == 2)
        # TODO: stretch input based on get_warp()
        # not doing it now since warp depends on kernel, which is not known
        # at this point, so instead using 32, which is good on the set of
        # architectures tested so far.
        if self.is_2d:
            # Note: 16 rather than 15 because result is 1 longer than input.
            width = ((self.nq+16)//16)*16
            self.q = np.empty((width, 2), dtype=dtype)
            self.q[:self.nq, 0] = q_vectors[0]
            self.q[:self.nq, 1] = q_vectors[1]
        else:
            # Note: 32 rather than 31 because result is 1 longer than input.
            width = ((self.nq+32)//32)*32
            self.q = np.empty(width, dtype=dtype)
            self.q[:self.nq] = q_vectors[0]
        self.global_size = [self.q.shape[0]]
        #print("creating inputs of size", self.global_size)

        # transfer input value to gpu
        self.q_b = cuda.to_device(self.q)

    def release(self):
        # type: () -> None
        """
        Free the memory.
        """
        if self.q_b is not None:
            self.q_b.free()
            self.q_b = None

    def __del__(self):
        # type: () -> None
        self.release()

class GpuKernel(Kernel):
    """
    Callable SAS kernel.

    *model* is the GpuModel object to call.

    The kernel is derived from :class:`Kernel`, providing the
    :meth:`call_kernel` method to evaluate the kernel for a given set of
    parameters.  Because of the need to move the q values to the GPU before
    evaluation, the kernel is instantiated for a particular set of q vectors,
    and can be called many times without transferring q each time.

    Call :meth:`release` when done with the kernel instance.
    """
    #: SAS model information structure
    info = None # type: ModelInfo
    #: kernel precision
    dtype = None # type: np.dtype
    #: kernel dimensions (1d or 2d)
    dim = "" # type: str
    #: calculation results, updated after each call to :meth:`_call_kernel`
    result = None # type: np.ndarray

    def __init__(self, model, q_vectors):
        # type: (GpuModel, List[np.ndarray]) -> None
        dtype = model.dtype
        self.q_input = GpuInput(q_vectors, dtype)
        self._model = model
        # F16 isn't sufficient, so don't support it
        self._as_dtype = np.float64 if dtype == generate.F64 else np.float32

        # attributes accessed from the outside
        self.dim = '2d' if self.q_input.is_2d else '1d'
        self.info = model.info
        self.dtype = model.dtype

        # holding place for the returned value
        nout = 2 if self.info.have_Fq and self.dim == '1d' else 1
        extra_q = 4  # total weight, form volume, shell volume and R_eff
        self.result = np.empty(self.q_input.nq*nout+extra_q, dtype)

        # allocate result value on gpu
        width = ((self.result.size+31)//32)*32 * self.dtype.itemsize
        self._result_b = cuda.mem_alloc(width)

    def _call_kernel(self, call_details, values, cutoff, magnetic, effective_radius_type):
        # type: (CallDetails, np.ndarray, float, bool, int) -> None
        # Arrange data transfer to card
        details_b = cuda.to_device(call_details.buffer)
        values_b = cuda.to_device(values)

        name = 'Iq' if self.dim == '1d' else 'Imagnetic' if magnetic else 'Iqxy'
        kernel = self._model.get_function(name)
        kernel_args = [
            np.uint32(self.q_input.nq), None, None,
            details_b, values_b, self.q_input.q_b, self._result_b,
            self._as_dtype(cutoff),
            np.uint32(effective_radius_type),
        ]
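        # The two None entries (kernel_args[1:3]) are placeholders for the
        # polydispersity loop bounds (start, stop), filled in for each chunk
        # in the loop below.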
        grid = partition(self.q_input.nq)
        #print("Calling OpenCL")
        #call_details.show(values)
        # Call kernel and retrieve results
        last_nap = time.perf_counter()
        step = 100000000//self.q_input.nq + 1
        #step = 1000000000
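        # Break the polydispersity evaluation into chunks of roughly 10^8
        # q-point evaluations per launch so that a long calculation still
        # yields the GPU periodically (see the sleep below).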
        for start in range(0, call_details.num_eval, step):
            stop = min(start + step, call_details.num_eval)
            #print("queuing",start,stop)
            kernel_args[1:3] = [np.int32(start), np.int32(stop)]
            kernel(*kernel_args, **grid)
            if stop < call_details.num_eval:
                sync()
                # Allow other processes to run
                current_time = time.perf_counter()
                if current_time - last_nap > 0.5:
                    time.sleep(0.001)
                    last_nap = current_time
        sync()
        cuda.memcpy_dtoh(self.result, self._result_b)
        #print("result", self.result)

        details_b.free()
        values_b.free()

    def release(self):
        # type: () -> None
        """
        Release resources associated with the kernel.
        """
        self.q_input.release()
        if self._result_b is not None:
            self._result_b.free()
            self._result_b = None

    def __del__(self):
        # type: () -> None
        self.release()


def sync():
    """
    Overview:
        Waits for operations in the current context to complete.

    Note: Maybe context.synchronize() is sufficient.
    """
    #return # The following works in C++; don't know what pycuda is doing
    # Create an event with which to synchronize
    done = cuda.Event()

    # Schedule an event trigger on the GPU.
    done.record()

    # Poll with a short sleep so that waiting does not hog resources.
    while not done.query():
        time.sleep(0.01)

    # Block until the GPU executes the kernel.
    done.synchronize()
    # Clean up the event; I don't think they can be reused.
    del done


def partition(n):
    '''
    Overview:
        Auto grids the thread blocks to achieve some level of calculation
        efficiency.
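
    For example (illustrative), partition(1000) returns
    ``dict(block=(32, 1, 1), grid=(32, 1))``: 32 blocks of 32 threads each,
    covering the 1000 points with 24 threads left idle.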
    '''
    max_gx, max_gy = 65535, 65535
    blocksize = 32
    #max_gx, max_gy = 5, 65536
    #blocksize = 3
    block = (blocksize, 1, 1)
    num_blocks = int((n+blocksize-1)//blocksize)
    if num_blocks < max_gx:
        grid = (num_blocks, 1)
    else:
        gx = max_gx
        gy = (num_blocks + max_gx - 1) // max_gx
        if gy >= max_gy:
            raise ValueError("vector is too large")
        grid = (gx, gy)
    #print("block", block, "grid", grid)
    #print("waste", block[0]*block[1]*block[2]*grid[0]*grid[1] - n)
    return dict(block=block, grid=grid)