source: sasmodels/sasmodels/kernelcl.py @ 9404dd3

"""
GPU support through OpenCL

There should be a single GPU environment running on the system.  This
environment is constructed on the first call to :func:`environment`, and
the same environment is returned on each call.

After retrieving the environment, the next step is to create the model.
This is done by wrapping the generated model source in a :class:`GpuModel`,
which compiles the OpenCL program the first time the model is called.

Next a :class:`GpuInput` object should be created with the correct kind
of data.  This data object can be used by multiple kernels, for example,
if the target model is a weighted sum of multiple kernels.  The data
should include any extra evaluation points required to compute the proper
data smearing.  This need not match the square grid for 2D data if there
is an index saying which q points are active.

Together the GpuInput, the program, and a device form a :class:`GpuKernel`.
This kernel is used during fitting, receiving new sets of parameters and
evaluating them.  The output value is stored in an output buffer on the
devices, where it can be combined with other structure factors and form
factors and have instrumental resolution effects applied.

In order to use OpenCL for your models, you will need OpenCL drivers for
your machine.  These should be available from your graphics card vendor.
Intel provides OpenCL drivers for CPUs as well as their integrated HD
graphics chipsets.  AMD also provides drivers for Intel CPUs, but as of
this writing the performance is lacking compared to the Intel drivers.
NVidia combines drivers for CUDA and OpenCL in one package.  The result
is a bit messy if you have multiple drivers installed.  You can see which
drivers are available by starting python and running:

    import pyopencl as cl
    cl.create_some_context(interactive=True)

Once you have done that, it will show the available drivers from which
you can select.  It will then tell you how to set the PYOPENCL_CTX
environment variable so that the same driver is used automatically.
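
For example, if the interactive chooser lists the driver you want at
index 1 (a hypothetical value; the numbering varies from system to
system), you can skip the prompt in later sessions by setting the
variable before the context is created:

    import os
    os.environ['PYOPENCL_CTX'] = '1'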

Some graphics cards have multiple devices on the same card.  You cannot
yet use both of them concurrently to evaluate models, but you can run
the program twice using a different device for each session.

OpenCL kernels are compiled when needed by the device driver.  Some
drivers produce compiler output even when there is no error.  You
can see the output by setting PYOPENCL_COMPILER_OUTPUT=1.  It should be
harmless, albeit annoying.
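
For example, to turn the compiler output on for a session:

    import os
    os.environ['PYOPENCL_COMPILER_OUTPUT'] = '1'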
"""
import os
import warnings

import numpy as np

try:
    import pyopencl as cl
    # Ask OpenCL for the default context so that we know that one exists
    cl.create_some_context(interactive=False)
except Exception as exc:
    warnings.warn(str(exc))
    raise RuntimeError("OpenCL not available")

from pyopencl import mem_flags as mf

from . import generate

F64_DEFS = """\
#ifdef cl_khr_fp64
#  pragma OPENCL EXTENSION cl_khr_fp64: enable
#endif
"""

# The max loops number is limited by the amount of local memory available
# on the device.  You don't want to make this value too big because it will
# waste resources, nor too small because it may interfere with users trying
# to do their polydispersity calculations.  The value of 2048 should be much
# larger than necessary given that cost grows as npts^k where k is the number
# of polydisperse parameters.
MAX_LOOPS = 2048


ENV = None
def environment():
    """
    Returns a singleton :class:`GpuEnvironment`.

    This provides an OpenCL context and one queue per device.
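
    For example, with model *source* produced by the :mod:`generate`
    module (the model name here is illustrative):

        env = environment()
        program = env.compile_program("cylinder", source, np.float32)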
    """
    global ENV
    if ENV is None:
        ENV = GpuEnvironment()
    return ENV

def has_double(device):
    """
    Return true if device supports double precision.
    """
    return "cl_khr_fp64" in device.extensions

def get_warp(kernel, queue):
    """
    Return the size of an execution batch for *kernel* running on *queue*.
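
    This is usually the hardware SIMD width: commonly 32 on NVidia
    devices and 64 on AMD, though the driver may report other values.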
    """
    return kernel.get_work_group_info(
        cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
        queue.device)

def _stretch_input(vector, dtype, extra=1e-3, boundary=32):
    """
    Stretch an input vector to the correct boundary.

    Performance on the kernels can drop by a factor of two or more if the
    number of values to compute does not fall on a nice power of two
    boundary.  The trailing additional vector elements are given a
    value of *extra*, and so f(*extra*) will be computed for each of
    them.  The returned array will thus be a subset of the computed array.

    *boundary* should be a power of 2 which is at least 32 for good
    performance on current platforms (as of Jan 2015).  It should
    probably be the max of get_warp(kernel, queue) and
    device.min_data_type_align_size//4.
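
    For example, a 1000 point vector padded to a 32 element boundary
    grows to 1024 points, with the final 24 points set to *extra*:

        q = np.linspace(0.001, 0.5, 1000)
        padded = _stretch_input(q, np.float32)  # padded.size == 1024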
    """
    remainder = vector.size % boundary
    if remainder != 0:
        size = vector.size + (boundary - remainder)
        vector = np.hstack((vector, [extra] * (size - vector.size)))
    return np.ascontiguousarray(vector, dtype=dtype)


def compile_model(context, source, dtype):
    """
    Build a model to run on the gpu.

    Returns the compiled program.  Raises RuntimeError if the desired
    *dtype* is float64 and any device in the context does not support
    the cl_khr_fp64 extension.
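
    A sketch of typical use, with *source* coming from :mod:`generate`:

        program = compile_model(environment().context, source, generate.F32)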
    """
    dtype = np.dtype(dtype)
    if dtype == generate.F64 and not all(has_double(d) for d in context.devices):
        raise RuntimeError("Double precision not supported for devices")

    header = F64_DEFS if dtype == generate.F64 else ""
    if dtype == generate.F32:
        source = generate.use_single(source)
    # Note: USE_SINCOS makes the intel cpu slower under opencl
    if context.devices[0].type == cl.device_type.GPU:
        header += "#define USE_SINCOS\n"
    program = cl.Program(context, header + source).build()
    return program


def make_result(self, size):
    self.res = np.empty(size, dtype=self.dtype)
    self.res_b = cl.Buffer(self.program.context, mf.READ_WRITE, self.res.nbytes)
    return self.res, self.res_b


# for now, this returns one device in the context
# TODO: create a context that contains all devices on all platforms
class GpuEnvironment(object):
    """
    GPU context, with possibly many devices, and one queue per device.
    """
    def __init__(self):
        # find gpu context
        #self.context = cl.create_some_context()

        self.context = None
        if 'PYOPENCL_CTX' in os.environ:
            self._create_some_context()

        if not self.context:
            self.context = _get_default_context()

        # Byte boundary for data alignment
        #self.data_boundary = max(d.min_data_type_align_size
        #                         for d in self.context.devices)
        self.queues = [cl.CommandQueue(self.context, d)
                       for d in self.context.devices]
        self.has_double = all(has_double(d) for d in self.context.devices)
        self.compiled = {}

    def _create_some_context(self):
        try:
            self.context = cl.create_some_context(interactive=False)
        except Exception as exc:
            warnings.warn(str(exc))
            warnings.warn("pyopencl.create_some_context() failed")
            warnings.warn("the environment variable 'PYOPENCL_CTX' might not be set correctly")

    def compile_program(self, name, source, dtype):
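        """
        Compile *source* for this context, caching the compiled program
        under *name* so that repeated requests do not rebuild it.
        """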
        if name not in self.compiled:
            #print("compiling",name)
            self.compiled[name] = compile_model(self.context, source, dtype)
        return self.compiled[name]

    def release_program(self, name):
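        """
        Free the compiled program cached under *name*, if present.
        """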
        if name in self.compiled:
            self.compiled[name].release()
            del self.compiled[name]

def _get_default_context():
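    """
    Return a context on the first GPU found, or on the first device of
    any type if there is no GPU.
    """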
    default = None
    for platform in cl.get_platforms():
        for device in platform.get_devices():
            if device.type == cl.device_type.GPU:
                return cl.Context([device])
            if default is None:
                default = device

    if not default:
        raise RuntimeError("OpenCL device not found")

    return cl.Context([default])


class GpuModel(object):
    """
    GPU wrapper for a single model.

    *source* and *info* are the model source and interface as returned
    from :func:`generate.make`.

    *dtype* is the desired model precision.  Any numpy dtype for single
    or double precision floats will do, such as 'f', 'float32' or 'single'
    for single and 'd', 'float64' or 'double' for double.  Double precision
    is an optional extension which may not be available on all devices.
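
    A sketch of the intended use (the names here are illustrative):

        model = GpuModel(source, info, dtype='f')
        q_input = model.make_input([q])
        kernel = model(q_input)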
    """
    def __init__(self, source, info, dtype=generate.F32):
        self.info = info
        self.source = source
        self.dtype = np.dtype(dtype)
        self.program = None # delay program creation

    def __getstate__(self):
        state = self.__dict__.copy()
        state['program'] = None
        return state

    def __setstate__(self, state):
        self.__dict__ = state.copy()

    def __call__(self, q_input):
        if self.dtype != q_input.dtype:
            raise TypeError("data is %s, kernel is %s" % (q_input.dtype, self.dtype))
        if self.program is None:
            compiler = environment().compile_program
            self.program = compiler(self.info['name'], self.source, self.dtype)
        kernel_name = generate.kernel_name(self.info, q_input.is_2D)
        kernel = getattr(self.program, kernel_name)
        return GpuKernel(kernel, self.info, q_input)

    def release(self):
        if self.program is not None:
            environment().release_program(self.info['name'])
            self.program = None

    def make_input(self, q_vectors):
        """
        Make q input vectors available to the model.

        Note that each model needs its own q vector even in the case of
        mixture models, because some models may be OpenCL, some may be
        ctypes, and some may be pure python.
        """
        return GpuInput(q_vectors, dtype=self.dtype)

# TODO: check that we don't need a destructor for buffers which go out of scope
class GpuInput(object):
    """
    Make q data available to the gpu.

    *q_vectors* is a list of q vectors, which will be *[q]* for 1-D data,
    and *[qx, qy]* for 2-D data.  Internally, the vectors will be reallocated
    to get the best performance on OpenCL, which may involve shifting and
    stretching the array to better match the memory architecture.  Additional
    points will be evaluated with *q=1e-3*.

    *dtype* is the data type for the q vectors.  The data type should be
    set to match that of the kernel, which is an attribute of
    :class:`GpuModel`.  Note that not all devices support double
    precision, so even if the program was created for double precision,
    the *GpuModel.dtype* may be single precision.

    Call :meth:`release` when complete.  Even if not called directly, the
    buffer will be released when the data object is freed.
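
    For example, wrapping a 1-D q vector (a sketch; *model* is an
    existing :class:`GpuModel`):

        q = np.logspace(-3, -1, 200)
        q_input = model.make_input([q])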
    """
    def __init__(self, q_vectors, dtype=generate.F32):
        env = environment()
        self.nq = q_vectors[0].size
        self.dtype = np.dtype(dtype)
        self.is_2D = (len(q_vectors) == 2)
        # TODO: stretch input based on get_warp()
        # not doing it now since warp depends on kernel, which is not known
        # at this point, so instead using 32, which is good on the set of
        # architectures tested so far.
        self.q_vectors = [_stretch_input(q, self.dtype, boundary=32) for q in q_vectors]
        self.q_buffers = [
            cl.Buffer(env.context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=q)
            for q in self.q_vectors
        ]
        self.global_size = [self.q_vectors[0].size]

    def release(self):
        for b in self.q_buffers:
            b.release()
        self.q_buffers = []

class GpuKernel(object):
    """
    Callable SAS kernel.

    *kernel* is the compiled OpenCL kernel to call.

    *info* is the module information.

    *q_input* is the :class:`GpuInput` q vectors at which the kernel
    should be evaluated.

    The resulting call method takes *fixed_pars*, a list of values for
    the fixed parameters to the kernel, and *pd_pars*, a list of
    (value, weight) vectors for the polydisperse parameters.  *cutoff*
    determines the integration limits: any points with combined weight
    less than *cutoff* will not be calculated.

    Call :meth:`release` when done with the kernel instance.
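
    A sketch of a call with one polydisperse parameter (the particular
    values and weights here are illustrative):

        radius = np.linspace(40., 60., 11)
        weight = np.ones(11) / 11.
        Iq = kernel(fixed_pars, [(radius, weight)], cutoff=1e-5)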
    """
    def __init__(self, kernel, info, q_input):
        self.q_input = q_input
        self.kernel = kernel
        self.info = info
        self.res = np.empty(q_input.nq, q_input.dtype)
        dim = '2d' if q_input.is_2D else '1d'
        self.fixed_pars = info['partype']['fixed-' + dim]
        self.pd_pars = info['partype']['pd-' + dim]

        # Inputs and outputs for each kernel call
        # Note: res may be shorter than res_b if global_size != nq
        env = environment()
        self.loops_b = [cl.Buffer(env.context, mf.READ_WRITE,
                                  2 * MAX_LOOPS * q_input.dtype.itemsize)
                        for _ in env.queues]
        self.res_b = [cl.Buffer(env.context, mf.READ_WRITE,
                                q_input.global_size[0] * q_input.dtype.itemsize)
                      for _ in env.queues]


    def __call__(self, fixed_pars, pd_pars, cutoff=1e-5):
        real = np.float32 if self.q_input.dtype == generate.F32 else np.float64

        device_num = 0
        queuei = environment().queues[device_num]
        res_bi = self.res_b[device_num]
        nq = np.uint32(self.q_input.nq)
        if pd_pars:
            cutoff = real(cutoff)
            loops_N = [np.uint32(len(p[0])) for p in pd_pars]
            loops = np.hstack(pd_pars)
            loops = np.ascontiguousarray(loops.T, self.q_input.dtype).flatten()
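            # After the transpose and flatten, *loops* holds interleaved
            # (value, weight) pairs: [v0, w0, v1, w1, ...] for the first
            # polydisperse parameter, followed by the pairs for the next.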
            #print("loops", loops_N, loops)

            #import sys; print("opencl eval", fixed_pars)
            #print("opencl eval", fixed_pars)
            if len(loops) > 2 * MAX_LOOPS:
                raise ValueError("too many polydispersity points")

            loops_bi = self.loops_b[device_num]
            cl.enqueue_copy(queuei, loops_bi, loops)
            # LocalMemory takes a size in bytes, so use nbytes rather than
            # len(loops.data), which counts elements under python 3
            loops_l = cl.LocalMemory(loops.nbytes)
            #ctx = environment().context
            #loops_bi = cl.Buffer(ctx, mf.READ_ONLY|mf.COPY_HOST_PTR, hostbuf=loops)
            dispersed = [loops_bi, loops_l, cutoff] + loops_N
        else:
            dispersed = []
        fixed = [real(p) for p in fixed_pars]
        args = self.q_input.q_buffers + [res_bi, nq] + dispersed + fixed
        self.kernel(queuei, self.q_input.global_size, None, *args)
        cl.enqueue_copy(queuei, self.res, res_bi)

        return self.res

    def release(self):
        for b in self.loops_b:
            b.release()
        self.loops_b = []
        for b in self.res_b:
            b.release()
        self.res_b = []

    def __del__(self):
        self.release()