source: sasmodels/sasmodels/kernelcuda.py @ 0be86aa

Last change on this file since 0be86aa was 7126c04, checked in by Paul Kienzle <pkienzle@…>, 5 years ago

use similar code for cuda and opencl

  • Property mode set to 100644
File size: 20.3 KB
1"""
2GPU driver for C kernels (with CUDA)
3
4To select cuda, use SAS_OPENCL=cuda, or SAS_OPENCL=cuda:n for a particular
5device number.  If no device number is specified, then look for CUDA_DEVICE=n
6or a file ~/.cuda-device containing n for the device number.  Otherwise, try
7all available device numbers.
8
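For example, to select a specific CUDA device from within python before the
GPU environment is built (a sketch; assumes pycuda is installed and device 1
exists):

    import os
    os.environ["SAS_OPENCL"] = "cuda:1"
    from sasmodels import kernelcuda
    kernelcuda.reset_environment()  # rebuild the GPU environment after the change
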
TODO: docs are out of date

There should be a single GPU environment running on the system.  This
environment is constructed on the first call to :func:`environment`, and the
same environment is returned on each call.

After retrieving the environment, the next step is to create the kernel.
This is done with a call to :meth:`GpuModel.make_kernel`, which binds the
compiled program to a particular set of q vectors.

Next a :class:`GpuInput` object should be created with the correct kind
of data.  This data object can be used by multiple kernels, for example,
if the target model is a weighted sum of multiple kernels.  The data
should include any extra evaluation points required to compute the proper
data smearing.  This need not match the square grid for 2D data if there
is an index saying which q points are active.

Together the GpuInput, the program, and a device form a :class:`GpuKernel`.
This kernel is used during fitting, receiving new sets of parameters and
evaluating them.  The output value is stored in an output buffer on the
device, where it can be combined with other structure factors and form
factors and have instrumental resolution effects applied.

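A minimal usage sketch (assuming a model source produced by
:func:`generate.make_source`; error handling and resolution smearing omitted):

    from sasmodels import core, generate
    from sasmodels.kernelcuda import GpuModel

    model_info = core.load_model_info("sphere")
    source = generate.make_source(model_info)
    model = GpuModel(source, model_info, dtype=generate.F32)
    kernel = model.make_kernel([q])  # q is a 1-D numpy array of |q| values
    # ... evaluate via the Kernel interface, e.g. through sasmodels.direct_model ...
    kernel.release()
    model.release()
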
In order to use OpenCL for your models, you will need OpenCL drivers for
your machine.  These should be available from your graphics card vendor.
Intel provides OpenCL drivers for CPUs as well as their integrated HD
graphics chipsets.  AMD also provides drivers for Intel CPUs, but as of
this writing the performance is lacking compared to the Intel drivers.
NVidia combines drivers for CUDA and OpenCL in one package.  The result
is a bit messy if you have multiple drivers installed.  You can see which
drivers are available by starting python and running:

    import pyopencl as cl
    cl.create_some_context(interactive=True)

Once you have done that, it will show the available drivers from which you
can select.  It will then tell you that you can select a driver
automatically by setting the SAS_OPENCL environment variable, which is
equivalent to PYOPENCL_CTX but does not conflict with other pyopencl programs.

Some graphics cards have multiple devices on the same card.  You cannot
yet use both of them concurrently to evaluate models, but you can run
the program twice using a different device for each session.

OpenCL kernels are compiled when needed by the device driver.  Some
drivers produce compiler output even when there is no error.  You
can see the output by setting PYOPENCL_COMPILER_OUTPUT=1.  It should be
harmless, albeit annoying.
"""
from __future__ import print_function

import os
import warnings
import logging
import time
import re

import numpy as np  # type: ignore


# Attempt to setup cuda. This may fail if the pycuda package is not
# installed or if it is installed but there are no devices available.
try:
    import pycuda.driver as cuda  # type: ignore
    from pycuda.compiler import SourceModule
    from pycuda.tools import make_default_context, clear_context_caches
    # Ask CUDA for the default context (so that we know that one exists)
    # then immediately throw it away in case the user doesn't want it.
    # Note: cribbed from pycuda.autoinit
    cuda.init()
    context = make_default_context()
    context.pop()
    clear_context_caches()
    del context
    HAVE_CUDA = True
    CUDA_ERROR = ""
except Exception as exc:
    HAVE_CUDA = False
    CUDA_ERROR = str(exc)

from . import generate
from .kernel import KernelModel, Kernel

# pylint: disable=unused-import
try:
    from typing import Tuple, Callable, Any, Dict, List
    from .modelinfo import ModelInfo
    from .details import CallDetails
except ImportError:
    pass
# pylint: enable=unused-import

# The max loops number is limited by the amount of local memory available
# on the device.  You don't want to make this value too big because it will
# waste resources, nor too small because it may interfere with users trying
# to do their polydispersity calculations.  A value of 2048 should be much
# larger than necessary given that cost grows as npts^k where k is the number
# of polydisperse parameters.
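# For example (illustrative numbers only), three polydisperse parameters with
# 35 points each already imply 35**3 = 42875 kernel evaluations per q value.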
MAX_LOOPS = 2048

def use_cuda():
    # type: () -> bool
    """
    Return True if CUDA is available and SAS_OPENCL is unset or selects cuda.
    """
    env = os.environ.get("SAS_OPENCL", "").lower()
    return HAVE_CUDA and (env == "" or env.startswith("cuda"))

ENV = None
def reset_environment():
    """
    Call to create a new CUDA context, such as after a change to SAS_OPENCL.
    """
    global ENV
    # Free any previously allocated context.
    if ENV is not None and ENV.context is not None:
        ENV.release()
    ENV = GpuEnvironment() if use_cuda() else None

def environment():
    # type: () -> "GpuEnvironment"
    """
    Returns a singleton :class:`GpuEnvironment`.

    This provides a CUDA context for the selected device.
    """
    if ENV is None:
        if not HAVE_CUDA:
            raise RuntimeError("CUDA startup failed with ***"
                               + CUDA_ERROR + "***; using C compiler instead")
        reset_environment()
        if ENV is None:
            raise RuntimeError("SAS_OPENCL=None in environment")
    return ENV

def has_type(dtype):
    # type: (np.dtype) -> bool
    """
    Return True if the device supports the requested precision.
    """
    # Assume the nvidia card supports 32-bit and 64-bit floats.
    # TODO: check if pycuda supports F16
    return dtype in (generate.F32, generate.F64)


FUNCTION_PATTERN = re.compile(r"""^
  (?P<space>\s*)                   # initial space
  (?P<qualifiers>^(?:\s*\b\w+\b\s*)+) # one or more qualifiers before function
  (?P<function>\s*\b\w+\b\s*[(])      # function name plus open parens
  """, re.VERBOSE|re.MULTILINE)

MARKED_PATTERN = re.compile(r"""
  \b(return|else|kernel|device|__device__)\b
  """, re.VERBOSE|re.MULTILINE)

def _add_device_tag(match):
    # type: (None) -> str
    # Note: should be re.Match, but that isn't a simple type
    """
    Replace qualifiers with __device__ qualifiers if needed.
    """
    qualifiers = match.group("qualifiers")
    if MARKED_PATTERN.search(qualifiers):
        start, end = match.span()
        return match.string[start:end]
    else:
        function = match.group("function")
        space = match.group("space")
        return "".join((space, "__device__ ", qualifiers, function))

def mark_device_functions(source):
    # type: (str) -> str
    """
    Mark all function declarations as __device__ functions (except kernel).
    """
    return FUNCTION_PATTERN.sub(_add_device_tag, source)

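# Illustrative example (not from the original source): mark_device_functions
# turns a C declaration such as
#     static double square(double x) {
# into
#     __device__ static double square(double x) {
# while declarations whose qualifiers already match MARKED_PATTERN (return,
# else, kernel, device, __device__) are left untouched.
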
def show_device_functions(source):
    # type: (str) -> str
    """
    Show all discovered function declarations, but don't change any.
    """
    for match in FUNCTION_PATTERN.finditer(source):
        print(match.group('qualifiers').replace('\n',r'\n'), match.group('function'), '(')
    return source

def compile_model(source, dtype, fast=False):
    # type: (str, np.dtype, bool) -> SourceModule
    """
    Build a model to run on the GPU.

    Returns the compiled program.  A RuntimeError is raised if the requested
    *dtype* is not supported by the device (see :func:`has_type`).
    """
    dtype = np.dtype(dtype)
    if not has_type(dtype):
        raise RuntimeError("%s not supported for devices"%dtype)

    source_list = [generate.convert_type(source, dtype)]

    source_list.insert(0, "#define USE_SINCOS\n")
    source = "\n".join(source_list)
    #source = show_device_functions(source)
    source = mark_device_functions(source)
    #with open('/tmp/kernel.cu', 'w') as fd: fd.write(source)
    #print(source)
    #options = ['--verbose', '-E']
    options = ['--use_fast_math'] if fast else None
    program = SourceModule(source, no_extern_c=True, options=options) # include_dirs=[...]

    #print("done with "+program)
    return program

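# Illustrative use of compile_model (a sketch; model_info and source would come
# from sasmodels.generate as in GpuModel below):
#     program = compile_model(source['opencl'], dtype=generate.F64)
#     Iq = program.get_function(generate.kernel_name(model_info, 'Iq'))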

# For now, this returns one device in the context.
# TODO: create a context that contains all devices on all platforms
class GpuEnvironment(object):
    """
    GPU context for a single CUDA device.
    """
    context = None # type: cuda.Context
    def __init__(self, devnum=None):
        # type: (int) -> None
        # Byte boundary for data alignment
        #self.data_boundary = max(d.min_data_type_align_size
        #                         for d in self.context.devices)
        self.compiled = {}
        env = os.environ.get("SAS_OPENCL", "").lower()
        if devnum is None and env.startswith("cuda:"):
            devnum = int(env[5:])
        # Set the global context to the particular device number if one is
        # given, otherwise use the default context.  Perhaps this will be set
        # by an environment variable within autoinit.
        if devnum is not None:
            self.context = cuda.Device(devnum).make_context()
        else:
            self.context = make_default_context()

    def release(self):
        if self.context is not None:
            self.context.pop()
            self.context = None

    def __del__(self):
        self.release()

    def has_type(self, dtype):
        # type: (np.dtype) -> bool
        """
        Return True if the device supports the given type.
        """
        return has_type(dtype)

    def compile_program(self, name, source, dtype, fast, timestamp):
        # type: (str, str, np.dtype, bool, float) -> SourceModule
        """
        Compile the program for the device in the given context.
        """
        # Note: PyOpenCL caches based on md5 hash of source, options and device
        # so we don't really need to cache things for ourselves.  I'll do so
        # anyway just to save some data munging time.
        tag = generate.tag_source(source)
        key = "%s-%s-%s%s"%(name, dtype, tag, ("-fast" if fast else ""))
        # Check timestamp on program
        program, program_timestamp = self.compiled.get(key, (None, np.inf))
        if program_timestamp < timestamp:
            del self.compiled[key]
        if key not in self.compiled:
            logging.info("building %s for CUDA", key)
            program = compile_model(str(source), dtype, fast)
            self.compiled[key] = (program, timestamp)
        return program

class GpuModel(KernelModel):
    """
    GPU wrapper for a single model.

    *source* and *model_info* are the model source and interface as returned
    from :func:`generate.make_source` and :func:`generate.make_model_info`.

    *dtype* is the desired model precision.  Any numpy dtype for single
    or double precision floats will do, such as 'f', 'float32' or 'single'
    for single and 'd', 'float64' or 'double' for double.  Double precision
    is an optional extension which may not be available on all devices.
    Half precision ('float16','half') may be available on some devices.
    Fast precision ('fast') is a loose version of single precision, indicating
    that the compiler is allowed to take shortcuts.
    """
    info = None # type: ModelInfo
    source = "" # type: str
    dtype = None # type: np.dtype
    fast = False # type: bool
    _program = None # type: SourceModule
    _kernels = None # type: Dict[str, cuda.Function]

    def __init__(self, source, model_info, dtype=generate.F32, fast=False):
        # type: (Dict[str,str], ModelInfo, np.dtype, bool) -> None
        self.info = model_info
        self.source = source
        self.dtype = dtype
        self.fast = fast

    def __getstate__(self):
        # type: () -> Tuple[ModelInfo, str, np.dtype, bool]
        return self.info, self.source, self.dtype, self.fast

    def __setstate__(self, state):
        # type: (Tuple[ModelInfo, str, np.dtype, bool]) -> None
        self.info, self.source, self.dtype, self.fast = state
        self._program = self._kernels = None

    def make_kernel(self, q_vectors):
        # type: (List[np.ndarray]) -> "GpuKernel"
        return GpuKernel(self, q_vectors)

    def get_function(self, name):
        # type: (str) -> cuda.Function
        """
        Fetch the kernel from the environment by name, compiling it if it
        does not already exist.
        """
        if self._program is None:
            self._prepare_program()
        return self._kernels[name]

    def _prepare_program(self):
        # type: () -> None
        env = environment()
        timestamp = generate.ocl_timestamp(self.info)
        program = env.compile_program(
            self.info.name,
            self.source['opencl'],
            self.dtype,
            self.fast,
            timestamp)
        variants = ['Iq', 'Iqxy', 'Imagnetic']
        names = [generate.kernel_name(self.info, k) for k in variants]
        handles = [program.get_function(k) for k in names]
        self._kernels = {k: v for k, v in zip(variants, handles)}
        # keep a handle to program so GC doesn't collect
        self._program = program

    def release(self):
        # type: () -> None
        """
        Free the resources associated with the model.
        """
        if self._program is not None:
            self._program = None

    def __del__(self):
        # type: () -> None
        self.release()

# TODO: check that we don't need a destructor for buffers which go out of scope
class GpuInput(object):
    """
    Make q data available to the gpu.

    *q_vectors* is a list of q vectors, which will be *[q]* for 1-D data,
    and *[qx, qy]* for 2-D data.  Internally, the vectors will be reallocated
    to get the best performance on the GPU, which may involve shifting and
    stretching the array to better match the memory architecture.  Additional
    points will be evaluated with *q=1e-3*.

    *dtype* is the data type for the q vectors. The data type should be
    set to match that of the kernel, which is an attribute of
    :class:`GpuModel`.  Note that not all kernels support double
    precision, so even if the program was created for double precision,
    the *GpuModel.dtype* may be single precision.

    Call :meth:`release` when complete.  Even if not called directly, the
    buffer will be released when the data object is freed.
    """
    def __init__(self, q_vectors, dtype=generate.F32):
        # type: (List[np.ndarray], np.dtype) -> None
        # TODO: do we ever need double precision q?
        self.nq = q_vectors[0].size
        self.dtype = np.dtype(dtype)
        self.is_2d = (len(q_vectors) == 2)
        # TODO: stretch input based on get_warp()
        # not doing it now since warp depends on kernel, which is not known
        # at this point, so instead using 32, which is good on the set of
        # architectures tested so far.
        if self.is_2d:
            # Note: 16 rather than 15 because result is 1 longer than input.
            width = ((self.nq+16)//16)*16
            self.q = np.empty((width, 2), dtype=dtype)
            self.q[:self.nq, 0] = q_vectors[0]
            self.q[:self.nq, 1] = q_vectors[1]
        else:
            # Note: 32 rather than 31 because result is 1 longer than input.
            width = ((self.nq+32)//32)*32
            self.q = np.empty(width, dtype=dtype)
            self.q[:self.nq] = q_vectors[0]
        self.global_size = [self.q.shape[0]]
        #print("creating inputs of size", self.global_size)

        # transfer input value to gpu
        self.q_b = cuda.to_device(self.q)

    def release(self):
        # type: () -> None
        """
        Free the memory.
        """
        if self.q_b is not None:
            self.q_b.free()
            self.q_b = None

    def __del__(self):
        # type: () -> None
        self.release()

class GpuKernel(Kernel):
    """
    Callable SAS kernel.

    *model* is the GpuModel object to call.

    The kernel is derived from :class:`Kernel`, providing the
    :meth:`call_kernel` method to evaluate the kernel for a given set of
    parameters.  Because of the need to move the q values to the GPU before
    evaluation, the kernel is instantiated for a particular set of q vectors,
    and can be called many times without transferring q each time.

    Call :meth:`release` when done with the kernel instance.
    """
    #: SAS model information structure
    info = None # type: ModelInfo
    #: kernel precision
    dtype = None # type: np.dtype
    #: kernel dimensions (1d or 2d)
    dim = "" # type: str
    #: calculation results, updated after each call to :meth:`_call_kernel`
    result = None # type: np.ndarray

    def __init__(self, model, q_vectors):
        # type: (GpuModel, List[np.ndarray]) -> None
        dtype = model.dtype
        self.q_input = GpuInput(q_vectors, dtype)
        self._model = model
        # F16 isn't sufficient, so don't support it
        self._as_dtype = np.float64 if dtype == generate.F64 else np.float32

        # attributes accessed from the outside
        self.dim = '2d' if self.q_input.is_2d else '1d'
        self.info = model.info
        self.dtype = model.dtype

        # holding place for the returned value
        nout = 2 if self.info.have_Fq and self.dim == '1d' else 1
        extra_q = 4  # total weight, form volume, shell volume and R_eff
        self.result = np.empty(self.q_input.nq*nout+extra_q, dtype)

        # allocate result value on gpu
        width = ((self.result.size+31)//32)*32 * self.dtype.itemsize
        self._result_b = cuda.mem_alloc(width)

    def _call_kernel(self, call_details, values, cutoff, magnetic, effective_radius_type):
        # type: (CallDetails, np.ndarray, float, bool, int) -> None
        # Arrange data transfer to card
        details_b = cuda.to_device(call_details.buffer)
        values_b = cuda.to_device(values)

        name = 'Iq' if self.dim == '1d' else 'Imagnetic' if magnetic else 'Iqxy'
        kernel = self._model.get_function(name)
        kernel_args = [
            np.uint32(self.q_input.nq), None, None,
            details_b, values_b, self.q_input.q_b, self._result_b,
            self._as_dtype(cutoff),
            np.uint32(effective_radius_type),
        ]
        grid = partition(self.q_input.nq)
        #print("Calling CUDA")
        #call_details.show(values)
        # Call kernel and retrieve results
        last_nap = time.perf_counter()
        step = 100000000//self.q_input.nq + 1
        #step = 1000000000
        for start in range(0, call_details.num_eval, step):
            stop = min(start + step, call_details.num_eval)
            #print("queuing",start,stop)
            kernel_args[1:3] = [np.int32(start), np.int32(stop)]
            kernel(*kernel_args, **grid)
            if stop < call_details.num_eval:
                sync()
                # Allow other processes to run
                current_time = time.perf_counter()
                if current_time - last_nap > 0.5:
                    time.sleep(0.001)
                    last_nap = current_time
        sync()
        cuda.memcpy_dtoh(self.result, self._result_b)
        #print("result", self.result)

        details_b.free()
        values_b.free()

    def release(self):
        # type: () -> None
        """
        Release resources associated with the kernel.
        """
        self.q_input.release()
        if self._result_b is not None:
            self._result_b.free()
            self._result_b = None

    def __del__(self):
        # type: () -> None
        self.release()


def sync():
    """
    Overview:
        Waits for operations in the current context to complete.

    Note: Maybe context.synchronize() is sufficient.
    """
    #return # The following works in C++; don't know what pycuda is doing
    # Create an event with which to synchronize.
    done = cuda.Event()

    # Schedule an event trigger on the GPU.
    done.record()

    # Poll the event rather than blocking immediately so we don't hog resources.
    while not done.query():
        time.sleep(0.01)

    # Block until the GPU executes the kernel.
    done.synchronize()
    # Clean up the event; I don't think they can be reused.
    del done


def partition(n):
    '''
    Overview:
        Auto-grid the thread blocks to achieve some level of calculation
        efficiency.
    '''
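    # Illustrative example (not in the original source): partition(1000) with
    # blocksize 32 gives num_blocks = 32, so the call returns
    # dict(block=(32, 1, 1), grid=(32, 1)).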
    max_gx, max_gy = 65535, 65535
    blocksize = 32
    #max_gx, max_gy = 5, 65536
    #blocksize = 3
    block = (blocksize, 1, 1)
    num_blocks = int((n+blocksize-1)/blocksize)
    if num_blocks < max_gx:
        grid = (num_blocks, 1)
    else:
        gx = max_gx
        gy = (num_blocks + max_gx - 1) // max_gx
        if gy >= max_gy:
            raise ValueError("vector is too large")
        grid = (gx, gy)
    #print("block", block, "grid", grid)
    #print("waste", block[0]*block[1]*block[2]*grid[0]*grid[1] - n)
    return dict(block=block, grid=grid)