source: sasmodels/sasmodels/kernelcuda.py @ 869fd7b

Last change on this file since 869fd7b was 869fd7b, checked in by Paul Kienzle <pkienzle@…>, 5 years ago

update cuda kernel to support Fq interface

1"""
2GPU driver for C kernels (with CUDA)
3
4To select cuda, use SAS_OPENCL=cuda, or SAS_OPENCL=cuda:n for a particular
5device number.  If no device number is specified, then look for CUDA_DEVICE=n
6or a file ~/.cuda-device containing n for the device number.  Otherwise, try
7all available device numbers.
8
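For example, a quick way to force a particular device from a script (a
sketch; :func:`reset_environment` is only needed if SAS_OPENCL changes
after the environment has already been created)::

    import os
    os.environ["SAS_OPENCL"] = "cuda:1"   # use CUDA device 1

    from sasmodels import kernelcuda
    kernelcuda.reset_environment()        # pick up the new SAS_OPENCL value
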
TODO: docs are out of date

There should be a single GPU environment running on the system.  This
environment is constructed on the first call to :func:`environment`, and
the same environment is returned on each call.

After retrieving the environment, the next step is to create the kernel.
This is done by wrapping the generated model source in a :class:`GpuModel`
and calling :meth:`GpuModel.make_kernel` with the q vectors at which the
model will be evaluated.

The q vectors are wrapped in a :class:`GpuInput` object.  The same data can
be used by multiple kernels, for example, if the target model is a weighted
sum of multiple kernels.  The data should include any extra evaluation
points required to compute the proper data smearing.  This need not match
the square grid for 2D data if there is an index saying which q points are
active.

Together the GpuInput, the compiled program, and a device form a
:class:`GpuKernel`.  This kernel is used during fitting, receiving new sets
of parameters and evaluating them.  The output value is stored in an output
buffer on the device, where it can be combined with other structure factors
and form factors and have instrumental resolution effects applied.

In order to use OpenCL for your models, you will need OpenCL drivers for
your machine.  These should be available from your graphics card vendor.
Intel provides OpenCL drivers for CPUs as well as their integrated HD
graphics chipsets.  AMD also provides drivers for Intel CPUs, but as of
this writing the performance is lacking compared to the Intel drivers.
NVidia combines drivers for CUDA and OpenCL in one package.  The result
is a bit messy if you have multiple drivers installed.  You can see which
drivers are available by starting python and running:

    import pyopencl as cl
    cl.create_some_context(interactive=True)

Once you have done that, it will show the available drivers which you
can select.  It will then tell you that you can use these drivers
automatically by setting the SAS_OPENCL environment variable, which is
equivalent to PYOPENCL_CTX but does not conflict with other pyopencl
programs.

Some graphics cards have multiple devices on the same card.  You cannot
yet use both of them concurrently to evaluate models, but you can run
the program twice using a different device for each session.

OpenCL kernels are compiled when needed by the device driver.  Some
drivers produce compiler output even when there is no error.  You
can see the output by setting PYOPENCL_COMPILER_OUTPUT=1.  It should be
harmless, albeit annoying.
"""
from __future__ import print_function

import os
import warnings
import logging
import time
import re

import numpy as np  # type: ignore


# Attempt to setup cuda. This may fail if the pycuda package is not
# installed or if it is installed but there are no devices available.
try:
    import pycuda.driver as cuda  # type: ignore
    from pycuda.compiler import SourceModule
    from pycuda.tools import make_default_context, clear_context_caches
    # Ask CUDA for the default context (so that we know that one exists)
    # then immediately throw it away in case the user doesn't want it.
    # Note: cribbed from pycuda.autoinit
    cuda.init()
    context = make_default_context()
    context.pop()
    clear_context_caches()
    del context
    HAVE_CUDA = True
    CUDA_ERROR = ""
except Exception as exc:
    HAVE_CUDA = False
    CUDA_ERROR = str(exc)

from . import generate
from .kernel import KernelModel, Kernel

# pylint: disable=unused-import
try:
    from typing import Tuple, Callable, Any, List, Dict
    from .modelinfo import ModelInfo
    from .details import CallDetails
except ImportError:
    pass
# pylint: enable=unused-import

# The max loops number is limited by the amount of local memory available
# on the device.  You don't want to make this value too big because it will
# waste resources, nor too small because it may interfere with users trying
# to do their polydispersity calculations.  The current value of 2048 should
# be much larger than necessary given that cost grows as npts^k where k is
# the number of polydisperse parameters.
MAX_LOOPS = 2048

def use_cuda():
    env = os.environ.get("SAS_OPENCL", "").lower()
    return HAVE_CUDA and (env == "" or env.startswith("cuda"))

ENV = None
def reset_environment():
    """
    Call to create a new CUDA context, such as after a change to SAS_OPENCL.
    """
    global ENV
    # Free any previously allocated context.
    if ENV is not None and ENV.context is not None:
        ENV.release()
    ENV = GpuEnvironment() if use_cuda() else None

def environment():
    # type: () -> "GpuEnvironment"
    """
    Returns a singleton :class:`GpuEnvironment`.

    This provides a CUDA context for the selected device.
    """
    if ENV is None:
        if not HAVE_CUDA:
            raise RuntimeError("CUDA startup failed with ***"
                            + CUDA_ERROR + "***; using C compiler instead")
        reset_environment()
        if ENV is None:
            raise RuntimeError("SAS_OPENCL=None in environment")
    return ENV

def has_type(dtype):
    # type: (np.dtype) -> bool
    """
    Return true if device supports the requested precision.
    """
    # Assume the nvidia card supports 32-bit and 64-bit floats.
    # TODO: check if pycuda supports F16
    return dtype in (generate.F32, generate.F64)


FUNCTION_PATTERN = re.compile(r"""^
  (?P<space>\s*)                   # initial space
  (?P<qualifiers>^(?:\s*\b\w+\b\s*)+) # one or more qualifiers before function
  (?P<function>\s*\b\w+\b\s*[(])      # function name plus open parens
  """, re.VERBOSE|re.MULTILINE)

MARKED_PATTERN = re.compile(r"""
  \b(return|else|kernel|device|__device__)\b
  """, re.VERBOSE|re.MULTILINE)

def _add_device_tag(match):
    # type: (None) -> str
    # Note: should be re.Match, but that isn't a simple type
    """
    Prepend __device__ to the qualifiers if the declaration is not already marked.
    """
    qualifiers = match.group("qualifiers")
    if MARKED_PATTERN.search(qualifiers):
        start, end = match.span()
        return match.string[start:end]
    else:
        function = match.group("function")
        space = match.group("space")
        return "".join((space, "__device__ ", qualifiers, function))

def mark_device_functions(source):
    # type: (str) -> str
    """
    Mark all function declarations as __device__ functions (except kernel).
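
    For example (an illustrative sketch, not output from a real model), a
    declaration such as::

        static double form_volume(double radius)

    becomes::

        __device__ static double form_volume(double radius)

    Qualifiers already containing kernel, device or __device__ are left
    untouched; return and else appear in MARKED_PATTERN so that statements
    such as ``return f(x)`` are not mistaken for declarations.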
179    """
180    return FUNCTION_PATTERN.sub(_add_device_tag, source)
181
182def show_device_functions(source):
183    # type: (str) -> str
184    """
185    Show all discovered function declarations, but don't change any.
186    """
187    for match in FUNCTION_PATTERN.finditer(source):
188        print(match.group('qualifiers').replace('\n',r'\n'), match.group('function'), '(')
189    return source
190
def compile_model(source, dtype, fast=False):
    # type: (str, np.dtype, bool) -> SourceModule
    """
    Build a model to run on the gpu.

    Returns the compiled program.  A :class:`RuntimeError` is raised if the
    requested *dtype* is not supported by the device; use :func:`has_type`
    to check before calling.
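
    A minimal sketch of direct use (normally this is called through
    :meth:`GpuEnvironment.compile_program`; the kernel name below is
    hypothetical, real names come from :func:`generate.kernel_name`)::

        program = compile_model(model_source, generate.F64)
        Iq = program.get_function("cylinder_Iq")   # hypothetical name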
199    """
200    dtype = np.dtype(dtype)
201    if not has_type(dtype):
202        raise RuntimeError("%s not supported for devices"%dtype)
203
204    source_list = [generate.convert_type(source, dtype)]
205
206    source_list.insert(0, "#define USE_SINCOS\n")
207    source = "\n".join(source_list)
208    #source = show_device_functions(source)
209    source = mark_device_functions(source)
210    #with open('/tmp/kernel.cu', 'w') as fd: fd.write(source)
211    #print(source)
212    #options = ['--verbose', '-E']
213    options = ['--use_fast_math'] if fast else None
214    program = SourceModule(source, no_extern_c=True, options=options) # include_dirs=[...]
215
216    #print("done with "+program)
217    return program
218
219
# for now, this returns one device in the context
# TODO: create a context that contains all devices on all platforms
class GpuEnvironment(object):
    """
    GPU context for CUDA, using a single device taken from *devnum*,
    SAS_OPENCL=cuda:n, or the pycuda default.
    """
    context = None # type: cuda.Context
    def __init__(self, devnum=None):
        # type: (int) -> None
        # Byte boundary for data alignment
        #self.data_boundary = max(d.min_data_type_align_size
        #                         for d in self.context.devices)
        self.compiled = {}
        env = os.environ.get("SAS_OPENCL", "").lower()
        if devnum is None and env.startswith("cuda:"):
            devnum = int(env[5:])
        # Set the global context to the particular device number if one is
        # given, otherwise use the default context.  Perhaps this will be set
        # by an environment variable within autoinit.
        if devnum is not None:
            self.context = cuda.Device(devnum).make_context()
        else:
            self.context = make_default_context()

    def release(self):
        if self.context is not None:
            self.context.pop()
            self.context = None

    def __del__(self):
        self.release()

    def has_type(self, dtype):
        # type: (np.dtype) -> bool
        """
        Return True if all devices support a given type.
        """
        return has_type(dtype)

    def compile_program(self, name, source, dtype, fast, timestamp):
        # type: (str, str, np.dtype, bool, float) -> SourceModule
        """
        Compile the program for the device in the given context.
        """
        # Note: PyCUDA, like PyOpenCL, caches compiled kernels based on a
        # hash of the source and options, so we don't really need to cache
        # things for ourselves.  I'll do so anyway just to save some data
        # munging time.
        tag = generate.tag_source(source)
        key = "%s-%s-%s%s"%(name, dtype, tag, ("-fast" if fast else ""))
        # Check timestamp on program
        program, program_timestamp = self.compiled.get(key, (None, np.inf))
        if program_timestamp < timestamp:
            del self.compiled[key]
        if key not in self.compiled:
            logging.info("building %s for CUDA", key)
            program = compile_model(str(source), dtype, fast)
            self.compiled[key] = (program, timestamp)
        return program

class GpuModel(KernelModel):
    """
    GPU wrapper for a single model.

    *source* and *model_info* are the model source and interface as returned
    from :func:`generate.make_source` and :func:`generate.make_model_info`.

    *dtype* is the desired model precision.  Any numpy dtype for single
    or double precision floats will do, such as 'f', 'float32' or 'single'
    for single and 'd', 'float64' or 'double' for double.  Double precision
    is an optional extension which may not be available on all devices.
    Half precision ('float16','half') may be available on some devices.
    Fast precision ('fast') is a loose version of single precision, indicating
    that the compiler is allowed to take shortcuts.
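
    A minimal usage sketch (assuming the usual sasmodels helpers
    :func:`sasmodels.core.load_model_info` and :func:`generate.make_source`;
    evaluation itself goes through the :class:`Kernel` interface, e.g. via
    :func:`sasmodels.direct_model.call_kernel`)::

        import numpy as np
        from sasmodels.core import load_model_info
        from sasmodels import generate

        q = np.linspace(0.001, 0.5, 200)
        model_info = load_model_info("cylinder")
        source = generate.make_source(model_info)
        model = GpuModel(source, model_info, dtype=generate.F32)
        kernel = model.make_kernel([q])
        ...                            # evaluate via the Kernel interface
        kernel.release()
        model.release()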
293    """
294    info = None # type: ModelInfo
295    source = "" # type: str
296    dtype = None # type: np.dtype
297    fast = False # type: bool
298    program = None # type: SourceModule
299    _kernels = None # type: List[cuda.Function]
300
301    def __init__(self, source, model_info, dtype=generate.F32, fast=False):
302        # type: (Dict[str,str], ModelInfo, np.dtype, bool) -> None
303        self.info = model_info
304        self.source = source
305        self.dtype = dtype
306        self.fast = fast
307        self.program = None # delay program creation
308        self._kernels = None
309
310    def __getstate__(self):
311        # type: () -> Tuple[ModelInfo, str, np.dtype, bool]
312        return self.info, self.source, self.dtype, self.fast
313
314    def __setstate__(self, state):
315        # type: (Tuple[ModelInfo, str, np.dtype, bool]) -> None
316        self.info, self.source, self.dtype, self.fast = state
317        self.program = None
318
    def make_kernel(self, q_vectors):
        # type: (List[np.ndarray]) -> "GpuKernel"
        if self.program is None:
            compile_program = environment().compile_program
            timestamp = generate.ocl_timestamp(self.info)
            self.program = compile_program(
                self.info.name,
                self.source['opencl'],
                self.dtype,
                self.fast,
                timestamp)
            variants = ['Iq', 'Iqxy', 'Imagnetic']
            names = [generate.kernel_name(self.info, k) for k in variants]
            kernels = [self.program.get_function(k) for k in names]
            self._kernels = dict((k, v) for k, v in zip(variants, kernels))
        is_2d = len(q_vectors) == 2
        if is_2d:
            kernel = [self._kernels['Iqxy'], self._kernels['Imagnetic']]
        else:
            kernel = [self._kernels['Iq']]*2
        return GpuKernel(kernel, self.dtype, self.info, q_vectors)

    def release(self):
        # type: () -> None
        """
        Free the resources associated with the model.
        """
        if self.program is not None:
            self.program = None

    def __del__(self):
        # type: () -> None
        self.release()

# TODO: check that we don't need a destructor for buffers which go out of scope
class GpuInput(object):
    """
    Make q data available to the gpu.

    *q_vectors* is a list of q vectors, which will be *[q]* for 1-D data,
    and *[qx, qy]* for 2-D data.  Internally, the vectors will be reallocated
    to get the best performance on the GPU, which may involve shifting and
    stretching the array to better match the memory architecture.  Additional
    points will be evaluated with *q=1e-3*.

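    For example, 1000 1-D q points are stored in a padded buffer of
    ``((1000+32)//32)*32 = 1024`` values: a multiple of 32 chosen so there is
    always at least one extra slot, since the result is one entry longer than
    the input.
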
    *dtype* is the data type for the q vectors. The data type should be
    set to match that of the kernel, which is an attribute of
    :class:`GpuModel`.  Note that not all kernels support double
    precision, so even if the program was created for double precision,
    the *GpuModel.dtype* may be single precision.

    Call :meth:`release` when complete.  Even if not called directly, the
    buffer will be released when the data object is freed.
    """
    def __init__(self, q_vectors, dtype=generate.F32):
        # type: (List[np.ndarray], np.dtype) -> None
        # TODO: do we ever need double precision q?
        self.nq = q_vectors[0].size
        self.dtype = np.dtype(dtype)
        self.is_2d = (len(q_vectors) == 2)
        # TODO: stretch input based on get_warp()
        # not doing it now since warp depends on kernel, which is not known
        # at this point, so instead using 32, which is good on the set of
        # architectures tested so far.
        if self.is_2d:
            # Note: 16 rather than 15 because result is 1 longer than input.
            width = ((self.nq+16)//16)*16
            self.q = np.empty((width, 2), dtype=dtype)
            self.q[:self.nq, 0] = q_vectors[0]
            self.q[:self.nq, 1] = q_vectors[1]
        else:
            # Note: 32 rather than 31 because result is 1 longer than input.
            width = ((self.nq+32)//32)*32
            self.q = np.empty(width, dtype=dtype)
            self.q[:self.nq] = q_vectors[0]
        self.global_size = [self.q.shape[0]]
        #print("creating inputs of size", self.global_size)
        self.q_b = cuda.to_device(self.q)

    def release(self):
        # type: () -> None
        """
        Free the memory.
        """
        if self.q_b is not None:
            self.q_b.free()
            self.q_b = None

    def __del__(self):
        # type: () -> None
        self.release()

class GpuKernel(Kernel):
    """
    Callable SAS kernel.

    *kernel* is the pair of compiled CUDA functions to call (the plain and
    magnetic variants for 2-D data, or the plain kernel twice for 1-D data)

    *model_info* is the module information

    *q_vectors* is the q vectors at which the kernel should be evaluated

    *dtype* is the kernel precision

    The resulting call method takes the *pars*, a list of values for
    the fixed parameters to the kernel, and *pd_pars*, a list of (value,weight)
    vectors for the polydisperse parameters.  *cutoff* determines the
    integration limits: any points with combined weight less than *cutoff*
    will not be calculated.

    Call :meth:`release` when done with the kernel instance.
    """
    def __init__(self, kernel, dtype, model_info, q_vectors):
        # type: (List[cuda.Function], np.dtype, ModelInfo, List[np.ndarray]) -> None
        self.q_input = GpuInput(q_vectors, dtype)
        self.kernel = kernel
        self._as_dtype = (np.float32 if dtype == generate.F32
                          else np.float64 if dtype == generate.F64
                          else np.float16 if dtype == generate.F16
                          else np.float32)  # will never get here, so use np.float32

        # attributes accessed from the outside
        self.dim = '2d' if self.q_input.is_2d else '1d'
        self.info = model_info
        self.dtype = dtype

        # holding place for the returned value
        nout = 2 if self.info.have_Fq and self.dim == '1d' else 1
        extra_q = 4  # total weight, form volume, shell volume and R_eff
        self.result = np.empty(self.q_input.nq*nout+extra_q, dtype)

        # Inputs and outputs for each kernel call
        # Note: res may be shorter than res_b if global_size != nq
        width = ((self.result.size+31)//32)*32 * self.dtype.itemsize
        self.result_b = cuda.mem_alloc(width)
        self._need_release = [self.result_b]

    def _call_kernel(self, call_details, values, cutoff, magnetic, effective_radius_type):
        # type: (CallDetails, np.ndarray, float, bool, int) -> None
        # Arrange data transfer to card
        details_b = cuda.to_device(call_details.buffer)
        values_b = cuda.to_device(values)

        kernel = self.kernel[1 if magnetic else 0]
        args = [
            np.uint32(self.q_input.nq), None, None,
            details_b, values_b, self.q_input.q_b, self.result_b,
            self._as_dtype(cutoff),
            np.uint32(effective_radius_type),
        ]
        grid = partition(self.q_input.nq)
        #print("Calling OpenCL")
        #call_details.show(values)
        # Call kernel and retrieve results
        last_nap = time.perf_counter()
        step = 100000000//self.q_input.nq + 1
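        # The step size caps each launch at roughly 10^8 q-point evaluations,
        # so a single kernel call never runs for too long and the loop below
        # can pause between launches to let other processes use the GPU.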
        #step = 1000000000
        for start in range(0, call_details.num_eval, step):
            stop = min(start + step, call_details.num_eval)
            #print("queuing",start,stop)
            args[1:3] = [np.int32(start), np.int32(stop)]
            kernel(*args, **grid)
            if stop < call_details.num_eval:
                sync()
                # Allow other processes to run
                current_time = time.perf_counter()
                if current_time - last_nap > 0.5:
                    time.sleep(0.001)
                    last_nap = current_time
        sync()
        cuda.memcpy_dtoh(self.result, self.result_b)
        #print("result", self.result)

        details_b.free()
        values_b.free()

    def release(self):
        # type: () -> None
        """
        Release resources associated with the kernel.
        """
        for p in self._need_release:
            p.free()
        self._need_release = []

    def __del__(self):
        # type: () -> None
        self.release()


def sync():
    """
    Overview:
        Waits for operations in the current context to complete.

    Note: Maybe context.synchronize() is sufficient.
    """
    #return # The following works in C++; don't know what pycuda is doing
    # Create an event with which to synchronize
    done = cuda.Event()

    # Schedule an event trigger on the GPU.
    done.record()

    #line added to not hog resources
    while not done.query():
        time.sleep(0.01)

    # Block until the GPU executes the kernel.
    done.synchronize()
    # Clean up the event; I don't think they can be reused.
    del done


def partition(n):
    '''
    Overview:
        Auto grids the thread blocks to achieve some level of calculation
        efficiency.
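
    For example, ``partition(1000)`` returns ``dict(block=(32, 1, 1),
    grid=(32, 1))``: 32 blocks of 32 threads each, with the last block only
    partially used (32*32 = 1024 >= 1000).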
    '''
    max_gx, max_gy = 65535, 65535
    blocksize = 32
    #max_gx, max_gy = 5, 65536
    #blocksize = 3
    block = (blocksize, 1, 1)
    num_blocks = int((n+blocksize-1)/blocksize)
    if num_blocks < max_gx:
        grid = (num_blocks, 1)
    else:
        gx = max_gx
        gy = (num_blocks + max_gx - 1) // max_gx
        if gy >= max_gy:
            raise ValueError("vector is too large")
        grid = (gx, gy)
    #print("block", block, "grid", grid)
    #print("waste", block[0]*block[1]*block[2]*grid[0]*grid[1] - n)
    return dict(block=block, grid=grid)