source: sasmodels/sasmodels/kernelcuda.py @ fa26e78

core_shell_microgelsmagnetic_modelticket-1257-vesicle-productticket_1156ticket_1265_superballticket_822_more_unit_tests
Last change on this file since fa26e78 was fa26e78, checked in by Paul Kienzle <pkienzle@…>, 8 months ago

make sure cuda environment gets freed

  • Property mode set to 100644
File size: 20.3 KB
Line 
1"""
2GPU driver for C kernels (with CUDA)
3
4To select cuda, use SAS_OPENCL=cuda, or SAS_OPENCL=cuda:n for a particular
5device number.  If no device number is specified, then look for CUDA_DEVICE=n
6or a file ~/.cuda-device containing n for the device number.  Otherwise, try
7all available device numbers.
8
9TODO: docs are out of date
10
11There should be a single GPU environment running on the system.  This
12environment is constructed on the first call to :func:`env`, and the
13same environment is returned on each call.
14
15After retrieving the environment, the next step is to create the kernel.
16This is done with a call to :meth:`GpuEnvironment.make_kernel`, which
17returns the type of data used by the kernel.
18
19Next a :class:`GpuData` object should be created with the correct kind
20of data.  This data object can be used by multiple kernels, for example,
21if the target model is a weighted sum of multiple kernels.  The data
22should include any extra evaluation points required to compute the proper
23data smearing.  This need not match the square grid for 2D data if there
24is an index saying which q points are active.
25
26Together the GpuData, the program, and a device form a :class:`GpuKernel`.
27This kernel is used during fitting, receiving new sets of parameters and
28evaluating them.  The output value is stored in an output buffer on the
29devices, where it can be combined with other structure factors and form
30factors and have instrumental resolution effects applied.
31
32In order to use OpenCL for your models, you will need OpenCL drivers for
33your machine.  These should be available from your graphics card vendor.
34Intel provides OpenCL drivers for CPUs as well as their integrated HD
35graphics chipsets.  AMD also provides drivers for Intel CPUs, but as of
36this writing the performance is lacking compared to the Intel drivers.
37NVidia combines drivers for CUDA and OpenCL in one package.  The result
38is a bit messy if you have multiple drivers installed.  You can see which
39drivers are available by starting python and running:
40
41    import pyopencl as cl
42    cl.create_some_context(interactive=True)
43
44Once you have done that, it will show the available drivers which you
45can select.  It will then tell you that you can use these drivers
46automatically by setting the SAS_OPENCL environment variable, which is
47PYOPENCL_CTX equivalent but not conflicting with other pyopnecl programs.
48
49Some graphics cards have multiple devices on the same card.  You cannot
50yet use both of them concurrently to evaluate models, but you can run
51the program twice using a different device for each session.
52
53OpenCL kernels are compiled when needed by the device driver.  Some
54drivers produce compiler output even when there is no error.  You
55can see the output by setting PYOPENCL_COMPILER_OUTPUT=1.  It should be
56harmless, albeit annoying.
57"""
58from __future__ import print_function
59
60import os
61import warnings
62import logging
63import time
64import re
65import atexit
66
67import numpy as np  # type: ignore
68
69
70# Attempt to setup CUDA. This may fail if the pycuda package is not
71# installed or if it is installed but there are no devices available.
72try:
73    import pycuda.driver as cuda  # type: ignore
74    from pycuda.compiler import SourceModule
75    from pycuda.tools import make_default_context, clear_context_caches
76    # Ask CUDA for the default context (so that we know that one exists)
77    # then immediately throw it away in case the user doesn't want it.
78    # Note: cribbed from pycuda.autoinit
79    cuda.init()
80    context = make_default_context()
81    context.pop()
82    clear_context_caches()
83    del context
84    HAVE_CUDA = True
85    CUDA_ERROR = ""
86except Exception as exc:
87    HAVE_CUDA = False
88    CUDA_ERROR = str(exc)
89
90from . import generate
91from .kernel import KernelModel, Kernel
92
93# pylint: disable=unused-import
94try:
95    from typing import Tuple, Callable, Any
96    from .modelinfo import ModelInfo
97    from .details import CallDetails
98except ImportError:
99    pass
100# pylint: enable=unused-import
101
102# The max loops number is limited by the amount of local memory available
103# on the device.  You don't want to make this value too big because it will
104# waste resources, nor too small because it may interfere with users trying
105# to do their polydispersity calculations.  A value of 1024 should be much
106# larger than necessary given that cost grows as npts^k where k is the number
107# of polydisperse parameters.
108MAX_LOOPS = 2048
109
110
111def use_cuda():
112    sas_opencl = os.environ.get("SAS_OPENCL", "CUDA").lower()
113    return HAVE_CUDA and sas_opencl.startswith("cuda")
114
115
116ENV = None
117def reset_environment():
118    """
119    Call to create a new OpenCL context, such as after a change to SAS_OPENCL.
120    """
121    global ENV
122    # Free any previous allocated context.
123    if ENV is not None and ENV.context is not None:
124        ENV.release()
125    ENV = GpuEnvironment() if use_cuda() else None
126
127
128def environment():
129    # type: () -> "GpuEnvironment"
130    """
131    Returns a singleton :class:`GpuEnvironment`.
132
133    This provides an OpenCL context and one queue per device.
134    """
135    if ENV is None:
136        if not HAVE_CUDA:
137            raise RuntimeError("CUDA startup failed with ***"
138                            + CUDA_ERROR + "***; using C compiler instead")
139        reset_environment()
140        if ENV is None:
141            raise RuntimeError("SAS_OPENCL=None in environment")
142    return ENV
143
144
145# PyTest is not freeing ENV, so make sure it gets freed.
146atexit.register(lambda: ENV.release() if ENV is not None else None)
147
148
149def has_type(dtype):
150    # type: (np.dtype) -> bool
151    """
152    Return true if device supports the requested precision.
153    """
154    # Assume the NVIDIA card supports 32-bit and 64-bit floats.
155    # TODO: Check if pycuda support F16.
156    return dtype in (generate.F32, generate.F64)
157
158
159FUNCTION_PATTERN = re.compile(r"""^
160  (?P<space>\s*)                       # Initial space.
161  (?P<qualifiers>^(?:\s*\b\w+\b\s*)+)  # One or more qualifiers before function.
162  (?P<function>\s*\b\w+\b\s*[(])       # Function name plus open parens.
163  """, re.VERBOSE|re.MULTILINE)
164
165MARKED_PATTERN = re.compile(r"""
166  \b(return|else|kernel|device|__device__)\b
167  """, re.VERBOSE|re.MULTILINE)
168
169
170def _add_device_tag(match):
171    # type: (None) -> str
172    # Note: Should be re.Match, but that isn't a simple type.
173    """
174    replace qualifiers with __device__ qualifiers if needed
175    """
176    qualifiers = match.group("qualifiers")
177    if MARKED_PATTERN.search(qualifiers):
178        start, end = match.span()
179        return match.string[start:end]
180    else:
181        function = match.group("function")
182        space = match.group("space")
183        return "".join((space, "__device__ ", qualifiers, function))
184
185
186def mark_device_functions(source):
187    # type: (str) -> str
188    """
189    Mark all function declarations as __device__ functions (except kernel).
190    """
191    return FUNCTION_PATTERN.sub(_add_device_tag, source)
192
193
194def show_device_functions(source):
195    # type: (str) -> str
196    """
197    Show all discovered function declarations, but don't change any.
198    """
199    for match in FUNCTION_PATTERN.finditer(source):
200        print(match.group('qualifiers').replace('\n',r'\n'), match.group('function'), '(')
201    return source
202
203
204def compile_model(source, dtype, fast=False):
205    # type: (str, np.dtype, bool) -> SourceModule
206    """
207    Build a model to run on the gpu.
208
209    Returns the compiled program and its type.  The returned type will
210    be float32 even if the desired type is float64 if any of the
211    devices in the context do not support the cl_khr_fp64 extension.
212    """
213    dtype = np.dtype(dtype)
214    if not has_type(dtype):
215        raise RuntimeError("%s not supported for devices"%dtype)
216
217    source_list = [generate.convert_type(source, dtype)]
218
219    source_list.insert(0, "#define USE_SINCOS\n")
220    source = "\n".join(source_list)
221    #source = show_device_functions(source)
222    source = mark_device_functions(source)
223    #with open('/tmp/kernel.cu', 'w') as fd: fd.write(source)
224    #print(source)
225    #options = ['--verbose', '-E']
226    options = ['--use_fast_math'] if fast else None
227    program = SourceModule(source, no_extern_c=True, options=options) #, include_dirs=[...])
228
229    #print("done with "+program)
230    return program
231
232
233# For now, this returns one device in the context.
234# TODO: Create a context that contains all devices on all platforms.
235class GpuEnvironment(object):
236    """
237    GPU context for CUDA.
238    """
239    context = None # type: cuda.Context
240    def __init__(self, devnum=None):
241        # type: (int) -> None
242        env = os.environ.get("SAS_OPENCL", "").lower()
243        if devnum is None and env.startswith("cuda:"):
244            devnum = int(env[5:])
245
246        # Set the global context to the particular device number if one is
247        # given, otherwise use the default context.  Perhaps this will be set
248        # by an environment variable within autoinit.
249        if devnum is not None:
250            self.context = cuda.Device(devnum).make_context()
251        else:
252            self.context = make_default_context()
253
254        ## Byte boundary for data alignment.
255        #self.data_boundary = max(d.min_data_type_align_size
256        #                         for d in self.context.devices)
257
258        # Cache for compiled programs, and for items in context.
259        self.compiled = {}
260
261    def release(self):
262        if self.context is not None:
263            self.context.pop()
264            self.context = None
265
266    def __del__(self):
267        self.release()
268
269    def has_type(self, dtype):
270        # type: (np.dtype) -> bool
271        """
272        Return True if all devices support a given type.
273        """
274        return has_type(dtype)
275
276    def compile_program(self, name, source, dtype, fast, timestamp):
277        # type: (str, str, np.dtype, bool, float) -> cl.Program
278        """
279        Compile the program for the device in the given context.
280        """
281        # Note: PyCuda (probably) caches but I'll do so as well just to
282        # save some data munging time.
283        tag = generate.tag_source(source)
284        key = "%s-%s-%s%s"%(name, dtype, tag, ("-fast" if fast else ""))
285        # Check timestamp on program.
286        program, program_timestamp = self.compiled.get(key, (None, np.inf))
287        if program_timestamp < timestamp:
288            del self.compiled[key]
289        if key not in self.compiled:
290            logging.info("building %s for CUDA", key)
291            program = compile_model(str(source), dtype, fast)
292            self.compiled[key] = (program, timestamp)
293        return program
294
295
296class GpuModel(KernelModel):
297    """
298    GPU wrapper for a single model.
299
300    *source* and *model_info* are the model source and interface as returned
301    from :func:`generate.make_source` and :func:`generate.make_model_info`.
302
303    *dtype* is the desired model precision.  Any numpy dtype for single
304    or double precision floats will do, such as 'f', 'float32' or 'single'
305    for single and 'd', 'float64' or 'double' for double.  Double precision
306    is an optional extension which may not be available on all devices.
307    Half precision ('float16','half') may be available on some devices.
308    Fast precision ('fast') is a loose version of single precision, indicating
309    that the compiler is allowed to take shortcuts.
310    """
311    info = None  # type: ModelInfo
312    source = ""  # type: str
313    dtype = None  # type: np.dtype
314    fast = False  # type: bool
315    _program = None  # type: SourceModule
316    _kernels = None  # type: Dict[str, cuda.Function]
317
318    def __init__(self, source, model_info, dtype=generate.F32, fast=False):
319        # type: (Dict[str,str], ModelInfo, np.dtype, bool) -> None
320        self.info = model_info
321        self.source = source
322        self.dtype = dtype
323        self.fast = fast
324
325    def __getstate__(self):
326        # type: () -> Tuple[ModelInfo, str, np.dtype, bool]
327        return self.info, self.source, self.dtype, self.fast
328
329    def __setstate__(self, state):
330        # type: (Tuple[ModelInfo, str, np.dtype, bool]) -> None
331        self.info, self.source, self.dtype, self.fast = state
332        self._program = self._kernels = None
333
334    def make_kernel(self, q_vectors):
335        # type: (List[np.ndarray]) -> "GpuKernel"
336        return GpuKernel(self, q_vectors)
337
338    def get_function(self, name):
339        # type: (str) -> cuda.Function
340        """
341        Fetch the kernel from the environment by name, compiling it if it
342        does not already exist.
343        """
344        if self._program is None:
345            self._prepare_program()
346        return self._kernels[name]
347
348    def _prepare_program(self):
349        # type: (str) -> None
350        env = environment()
351        timestamp = generate.ocl_timestamp(self.info)
352        program = env.compile_program(
353            self.info.name,
354            self.source['opencl'],
355            self.dtype,
356            self.fast,
357            timestamp)
358        variants = ['Iq', 'Iqxy', 'Imagnetic']
359        names = [generate.kernel_name(self.info, k) for k in variants]
360        functions = [program.get_function(k) for k in names]
361        self._kernels = {k: v for k, v in zip(variants, functions)}
362        # Keep a handle to program so GC doesn't collect.
363        self._program = program
364
365
366# TODO: Check that we don't need a destructor for buffers which go out of scope.
367class GpuInput(object):
368    """
369    Make q data available to the gpu.
370
371    *q_vectors* is a list of q vectors, which will be *[q]* for 1-D data,
372    and *[qx, qy]* for 2-D data.  Internally, the vectors will be reallocated
373    to get the best performance on OpenCL, which may involve shifting and
374    stretching the array to better match the memory architecture.  Additional
375    points will be evaluated with *q=1e-3*.
376
377    *dtype* is the data type for the q vectors. The data type should be
378    set to match that of the kernel, which is an attribute of
379    :class:`GpuProgram`.  Note that not all kernels support double
380    precision, so even if the program was created for double precision,
381    the *GpuProgram.dtype* may be single precision.
382
383    Call :meth:`release` when complete.  Even if not called directly, the
384    buffer will be released when the data object is freed.
385    """
386    def __init__(self, q_vectors, dtype=generate.F32):
387        # type: (List[np.ndarray], np.dtype) -> None
388        # TODO: Do we ever need double precision q?
389        self.nq = q_vectors[0].size
390        self.dtype = np.dtype(dtype)
391        self.is_2d = (len(q_vectors) == 2)
392        # TODO: Stretch input based on get_warp().
393        # Not doing it now since warp depends on kernel, which is not known
394        # at this point, so instead using 32, which is good on the set of
395        # architectures tested so far.
396        if self.is_2d:
397            width = ((self.nq+15)//16)*16
398            self.q = np.empty((width, 2), dtype=dtype)
399            self.q[:self.nq, 0] = q_vectors[0]
400            self.q[:self.nq, 1] = q_vectors[1]
401        else:
402            width = ((self.nq+31)//32)*32
403            self.q = np.empty(width, dtype=dtype)
404            self.q[:self.nq] = q_vectors[0]
405        self.global_size = [self.q.shape[0]]
406        #print("creating inputs of size", self.global_size)
407
408        # Transfer input value to GPU.
409        self.q_b = cuda.to_device(self.q)
410
411    def release(self):
412        # type: () -> None
413        """
414        Free the buffer associated with the q value.
415        """
416        if self.q_b is not None:
417            self.q_b.free()
418            self.q_b = None
419
420    def __del__(self):
421        # type: () -> None
422        self.release()
423
424
425class GpuKernel(Kernel):
426    """
427    Callable SAS kernel.
428
429    *model* is the GpuModel object to call
430
431    The kernel is derived from :class:`Kernel`, providing the
432    :meth:`call_kernel` method to evaluate the kernel for a given set of
433    parameters.  Because of the need to move the q values to the GPU before
434    evaluation, the kernel is instantiated for a particular set of q vectors,
435    and can be called many times without transfering q each time.
436
437    Call :meth:`release` when done with the kernel instance.
438    """
439    #: SAS model information structure.
440    info = None  # type: ModelInfo
441    #: Kernel precision.
442    dtype = None  # type: np.dtype
443    #: Kernel dimensions (1d or 2d).
444    dim = ""  # type: str
445    #: Calculation results, updated after each call to :meth:`_call_kernel`.
446    result = None  # type: np.ndarray
447
448    def __init__(self, model, q_vectors):
449        # type: (GpuModel, List[np.ndarray]) -> None
450        dtype = model.dtype
451        self.q_input = GpuInput(q_vectors, dtype)
452        self._model = model
453
454        # Attributes accessed from the outside.
455        self.dim = '2d' if self.q_input.is_2d else '1d'
456        self.info = model.info
457        self.dtype = dtype
458
459        # Converter to translate input to target type.
460        self._as_dtype = np.float64 if dtype == generate.F64 else np.float32
461
462        # Holding place for the returned value.
463        nout = 2 if self.info.have_Fq and self.dim == '1d' else 1
464        extra_q = 4  # Total weight, form volume, shell volume and R_eff.
465        self.result = np.empty(self.q_input.nq*nout + extra_q, dtype)
466
467        # Allocate result value on GPU.
468        width = ((self.result.size+31)//32)*32 * self.dtype.itemsize
469        self._result_b = cuda.mem_alloc(width)
470
471    def _call_kernel(self, call_details, values, cutoff, magnetic,
472                     effective_radius_type):
473        # type: (CallDetails, np.ndarray, float, bool, int) -> np.ndarray
474
475        # Arrange data transfer to card.
476        details_b = cuda.to_device(call_details.buffer)
477        values_b = cuda.to_device(values)
478
479        # Setup kernel function and arguments.
480        name = 'Iq' if self.dim == '1d' else 'Imagnetic' if magnetic else 'Iqxy'
481        kernel = self._model.get_function(name)
482        kernel_args = [
483            np.uint32(self.q_input.nq),  # Number of inputs.
484            None,  # Placeholder for pd_start.
485            None,  # Placeholder for pd_stop.
486            details_b,  # Problem definition.
487            values_b,  # Parameter values.
488            self.q_input.q_b,  # Q values.
489            self._result_b,   # Result storage.
490            self._as_dtype(cutoff),  # Probability cutoff.
491            np.uint32(effective_radius_type),  # R_eff mode.
492        ]
493        grid = partition(self.q_input.nq)
494
495        # Call kernel and retrieve results.
496        #print("Calling CUDA")
497        #call_details.show(values)
498        last_nap = time.clock()
499        step = 100000000//self.q_input.nq + 1
500        #step = 1000000000
501        for start in range(0, call_details.num_eval, step):
502            stop = min(start + step, call_details.num_eval)
503            #print("queuing",start,stop)
504            kernel_args[1:3] = [np.int32(start), np.int32(stop)]
505            kernel(*kernel_args, **grid)
506            if stop < call_details.num_eval:
507                sync()
508                # Allow other processes to run.
509                current_time = time.clock()
510                if current_time - last_nap > 0.5:
511                    time.sleep(0.001)
512                    last_nap = current_time
513        sync()
514        cuda.memcpy_dtoh(self.result, self._result_b)
515        #print("result", self.result)
516
517        details_b.free()
518        values_b.free()
519
520    def release(self):
521        # type: () -> None
522        """
523        Release resources associated with the kernel.
524        """
525        self.q_input.release()
526        if self._result_b is not None:
527            self._result_b.free()
528            self._result_b = None
529
530    def __del__(self):
531        # type: () -> None
532        self.release()
533
534
535def sync():
536    """
537    Overview:
538        Waits for operation in the current context to complete.
539
540    Note: Maybe context.synchronize() is sufficient.
541    """
542    # Create an event with which to synchronize.
543    done = cuda.Event()
544
545    # Schedule an event trigger on the GPU.
546    done.record()
547
548    # Make sure we don't hog resource while waiting to sync.
549    while not done.query():
550        time.sleep(0.01)
551
552    # Block until the GPU executes the kernel.
553    done.synchronize()
554
555    # Clean up the event; I don't think they can be reused.
556    del done
557
558
559def partition(n):
560    '''
561    Overview:
562        Auto grids the thread blocks to achieve some level of calculation
563    efficiency.
564    '''
565    max_gx, max_gy = 65535, 65535
566    blocksize = 32
567    #max_gx, max_gy = 5, 65536
568    #blocksize = 3
569    block = (blocksize, 1, 1)
570    num_blocks = int((n+blocksize-1)/blocksize)
571    if num_blocks < max_gx:
572        grid = (num_blocks, 1)
573    else:
574        gx = max_gx
575        gy = (num_blocks + max_gx - 1) / max_gx
576        if gy >= max_gy:
577            raise ValueError("vector is too large")
578        grid = (gx, gy)
579    #print("block", block, "grid", grid)
580    #print("waste", block[0]*block[1]*block[2]*grid[0]*grid[1] - n)
581    return dict(block=block, grid=grid)
Note: See TracBrowser for help on using the repository browser.