source: sasmodels/sasmodels/kernelcl.py @ af1d68c

core_shell_microgelscostrafo411magnetic_modelrelease_v0.94release_v0.95ticket-1257-vesicle-productticket_1156ticket_1265_superballticket_822_more_unit_tests
Last change on this file since af1d68c was af1d68c, checked in by Paul Kienzle <pkienzle@…>, 9 years ago

remove obsolete code

  • Property mode set to 100644
File size: 13.3 KB
Line 
1"""
2GPU support through OpenCL
3
4There should be a single GPU environment running on the system.  This
5environment is constructed on the first call to :func:`env`, and the
6same environment is returned on each call.
7
8After retrieving the environment, the next step is to create the kernel.
9This is done with a call to :meth:`GpuEnvironment.make_kernel`, which
10returns the type of data used by the kernel.
11
12Next a :class:`GpuData` object should be created with the correct kind
13of data.  This data object can be used by multiple kernels, for example,
14if the target model is a weighted sum of multiple kernels.  The data
15should include any extra evaluation points required to compute the proper
16data smearing.  This need not match the square grid for 2D data if there
17is an index saying which q points are active.
18
19Together the GpuData, the program, and a device form a :class:`GpuKernel`.
20This kernel is used during fitting, receiving new sets of parameters and
21evaluating them.  The output value is stored in an output buffer on the
22devices, where it can be combined with other structure factors and form
23factors and have instrumental resolution effects applied.
24"""
25import os
26import warnings
27
28import numpy as np
29
30try:
31    import pyopencl as cl
32    # Ask OpenCL for the default context so that we know that one exists
33    cl.create_some_context(interactive=False)
34except Exception, exc:
35    warnings.warn(str(exc))
36    raise RuntimeError("OpenCL not available")
37
38from pyopencl import mem_flags as mf
39
40from . import generate
41from .kernelpy import PyModel
42
43F64_DEFS = """\
44#ifdef cl_khr_fp64
45#  pragma OPENCL EXTENSION cl_khr_fp64: enable
46#endif
47"""
48
49# The max loops number is limited by the amount of local memory available
50# on the device.  You don't want to make this value too big because it will
51# waste resources, nor too small because it may interfere with users trying
52# to do their polydispersity calculations.  A value of 1024 should be much
53# larger than necessary given that cost grows as npts^k where k is the number
54# of polydisperse parameters.
55MAX_LOOPS = 2048
56
57
58ENV = None
59def environment():
60    """
61    Returns a singleton :class:`GpuEnvironment`.
62
63    This provides an OpenCL context and one queue per device.
64    """
65    global ENV
66    if ENV is None:
67        ENV = GpuEnvironment()
68    return ENV
69
70def has_double(device):
71    """
72    Return true if device supports double precision.
73    """
74    return "cl_khr_fp64" in device.extensions
75
76def get_warp(kernel, queue):
77    """
78    Return the size of an execution batch for *kernel* running on *queue*.
79    """
80    return kernel.get_work_group_info(
81        cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
82        queue.device)
83
84def _stretch_input(vector, dtype, extra=1e-3, boundary=32):
85    """
86    Stretch an input vector to the correct boundary.
87
88    Performance on the kernels can drop by a factor of two or more if the
89    number of values to compute does not fall on a nice power of two
90    boundary.   The trailing additional vector elements are given a
91    value of *extra*, and so f(*extra*) will be computed for each of
92    them.  The returned array will thus be a subset of the computed array.
93
94    *boundary* should be a power of 2 which is at least 32 for good
95    performance on current platforms (as of Jan 2015).  It should
96    probably be the max of get_warp(kernel,queue) and
97    device.min_data_type_align_size//4.
98    """
99    remainder = vector.size % boundary
100    if remainder != 0:
101        size = vector.size + (boundary - remainder)
102        vector = np.hstack((vector, [extra] * (size - vector.size)))
103    return np.ascontiguousarray(vector, dtype=dtype)
104
105
106def compile_model(context, source, dtype):
107    """
108    Build a model to run on the gpu.
109
110    Returns the compiled program and its type.  The returned type will
111    be float32 even if the desired type is float64 if any of the
112    devices in the context do not support the cl_khr_fp64 extension.
113    """
114    dtype = np.dtype(dtype)
115    if dtype == generate.F64 and not all(has_double(d) for d in context.devices):
116        raise RuntimeError("Double precision not supported for devices")
117
118    header = F64_DEFS if dtype == generate.F64 else ""
119    if dtype == generate.F32:
120        source = generate.use_single(source)
121    # Note: USE_SINCOS makes the intel cpu slower under opencl
122    if context.devices[0].type == cl.device_type.GPU:
123        header += "#define USE_SINCOS\n"
124    program = cl.Program(context, header + source).build()
125    return program
126
127
128def make_result(self, size):
129    self.res = np.empty(size, dtype=self.dtype)
130    self.res_b = cl.Buffer(self.program.context, mf.READ_WRITE, self.res.nbytes)
131    return self.res, self.res_b
132
133
134# for now, this returns one device in the context
135# TODO: create a context that contains all devices on all platforms
136class GpuEnvironment(object):
137    """
138    GPU context, with possibly many devices, and one queue per device.
139    """
140    def __init__(self):
141        # find gpu context
142        #self.context = cl.create_some_context()
143
144        self.context = None
145        if 'PYOPENCL_CTX' in os.environ:
146            self._create_some_context()
147
148        if not self.context:
149            self.context = _get_default_context()
150
151        # Byte boundary for data alignment
152        #self.data_boundary = max(d.min_data_type_align_size
153        #                         for d in self.context.devices)
154        self.queues = [cl.CommandQueue(self.context, d)
155                       for d in self.context.devices]
156        self.has_double = all(has_double(d) for d in self.context.devices)
157        self.compiled = {}
158
159    def _create_some_context(self):
160        try:
161            self.context = cl.create_some_context(interactive=False)
162        except Exception, exc:
163            warnings.warn(str(exc))
164            warnings.warn("pyopencl.create_some_context() failed")
165            warnings.warn("the environment variable 'PYOPENCL_CTX' might not be set correctly")
166
167    def compile_program(self, name, source, dtype):
168        if name not in self.compiled:
169            #print "compiling",name
170            self.compiled[name] = compile_model(self.context, source, dtype)
171        return self.compiled[name]
172
173    def release_program(self, name):
174        if name in self.compiled:
175            self.compiled[name].release()
176            del self.compiled[name]
177
178def _get_default_context():
179    default = None
180    for platform in cl.get_platforms():
181        for device in platform.get_devices():
182            if device.type == cl.device_type.GPU:
183                return cl.Context([device])
184            if default is None:
185                default = device
186
187    if not default:
188        raise RuntimeError("OpenCL device not found")
189
190    return cl.Context([default])
191
192
193class GpuModel(object):
194    """
195    GPU wrapper for a single model.
196
197    *source* and *info* are the model source and interface as returned
198    from :func:`gen.make`.
199
200    *dtype* is the desired model precision.  Any numpy dtype for single
201    or double precision floats will do, such as 'f', 'float32' or 'single'
202    for single and 'd', 'float64' or 'double' for double.  Double precision
203    is an optional extension which may not be available on all devices.
204    """
205    def __init__(self, source, info, dtype=generate.F32):
206        self.info = info
207        self.source = source
208        self.dtype = np.dtype(dtype)
209        self.program = None # delay program creation
210
211    def __getstate__(self):
212        state = self.__dict__.copy()
213        state['program'] = None
214        return state
215
216    def __setstate__(self, state):
217        self.__dict__ = state.copy()
218
219    def __call__(self, q_input):
220        if self.dtype != q_input.dtype:
221            raise TypeError("data is %s kernel is %s" % (q_input.dtype, self.dtype))
222        if self.program is None:
223            compiler = environment().compile_program
224            self.program = compiler(self.info['name'], self.source, self.dtype)
225        kernel_name = generate.kernel_name(self.info, q_input.is_2D)
226        kernel = getattr(self.program, kernel_name)
227        return GpuKernel(kernel, self.info, q_input)
228
229    def release(self):
230        if self.program is not None:
231            environment().release_program(self.info['name'])
232            self.program = None
233
234    def make_input(self, q_vectors):
235        """
236        Make q input vectors available to the model.
237
238        Note that each model needs its own q vector even if the case of
239        mixture models because some models may be OpenCL, some may be
240        ctypes and some may be pure python.
241        """
242        return GpuInput(q_vectors, dtype=self.dtype)
243
244# TODO: check that we don't need a destructor for buffers which go out of scope
245class GpuInput(object):
246    """
247    Make q data available to the gpu.
248
249    *q_vectors* is a list of q vectors, which will be *[q]* for 1-D data,
250    and *[qx, qy]* for 2-D data.  Internally, the vectors will be reallocated
251    to get the best performance on OpenCL, which may involve shifting and
252    stretching the array to better match the memory architecture.  Additional
253    points will be evaluated with *q=1e-3*.
254
255    *dtype* is the data type for the q vectors. The data type should be
256    set to match that of the kernel, which is an attribute of
257    :class:`GpuProgram`.  Note that not all kernels support double
258    precision, so even if the program was created for double precision,
259    the *GpuProgram.dtype* may be single precision.
260
261    Call :meth:`release` when complete.  Even if not called directly, the
262    buffer will be released when the data object is freed.
263    """
264    def __init__(self, q_vectors, dtype=generate.F32):
265        env = environment()
266        self.nq = q_vectors[0].size
267        self.dtype = np.dtype(dtype)
268        self.is_2D = (len(q_vectors) == 2)
269        # TODO: stretch input based on get_warp()
270        # not doing it now since warp depends on kernel, which is not known
271        # at this point, so instead using 32, which is good on the set of
272        # architectures tested so far.
273        self.q_vectors = [_stretch_input(q, self.dtype, 32) for q in q_vectors]
274        self.q_buffers = [
275            cl.Buffer(env.context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=q)
276            for q in self.q_vectors
277        ]
278        self.global_size = [self.q_vectors[0].size]
279
280    def release(self):
281        for b in self.q_buffers:
282            b.release()
283        self.q_buffers = []
284
285class GpuKernel(object):
286    """
287    Callable SAS kernel.
288
289    *kernel* is the GpuKernel object to call.
290
291    *info* is the module information
292
293    *q_input* is the DllInput q vectors at which the kernel should be
294    evaluated.
295
296    The resulting call method takes the *pars*, a list of values for
297    the fixed parameters to the kernel, and *pd_pars*, a list of (value,weight)
298    vectors for the polydisperse parameters.  *cutoff* determines the
299    integration limits: any points with combined weight less than *cutoff*
300    will not be calculated.
301
302    Call :meth:`release` when done with the kernel instance.
303    """
304    def __init__(self, kernel, info, q_input):
305        self.q_input = q_input
306        self.kernel = kernel
307        self.info = info
308        self.res = np.empty(q_input.nq, q_input.dtype)
309        dim = '2d' if q_input.is_2D else '1d'
310        self.fixed_pars = info['partype']['fixed-' + dim]
311        self.pd_pars = info['partype']['pd-' + dim]
312
313        # Inputs and outputs for each kernel call
314        # Note: res may be shorter than res_b if global_size != nq
315        env = environment()
316        self.loops_b = [cl.Buffer(env.context, mf.READ_WRITE,
317                                  2 * MAX_LOOPS * q_input.dtype.itemsize)
318                        for _ in env.queues]
319        self.res_b = [cl.Buffer(env.context, mf.READ_WRITE,
320                                q_input.global_size[0] * q_input.dtype.itemsize)
321                      for _ in env.queues]
322
323
324    def __call__(self, fixed_pars, pd_pars, cutoff=1e-5):
325        real = np.float32 if self.q_input.dtype == generate.F32 else np.float64
326
327        device_num = 0
328        queuei = environment().queues[device_num]
329        res_bi = self.res_b[device_num]
330        nq = np.uint32(self.q_input.nq)
331        if pd_pars:
332            cutoff = real(cutoff)
333            loops_N = [np.uint32(len(p[0])) for p in pd_pars]
334            loops = np.hstack(pd_pars) \
335                if pd_pars else np.empty(0, dtype=self.q_input.dtype)
336            loops = np.ascontiguousarray(loops.T, self.q_input.dtype).flatten()
337            #print "loops",Nloops, loops
338
339            #import sys; print >>sys.stderr,"opencl eval",pars
340            #print "opencl eval",pars
341            if len(loops) > 2 * MAX_LOOPS:
342                raise ValueError("too many polydispersity points")
343
344            loops_bi = self.loops_b[device_num]
345            cl.enqueue_copy(queuei, loops_bi, loops)
346            loops_l = cl.LocalMemory(len(loops.data))
347            #ctx = environment().context
348            #loops_bi = cl.Buffer(ctx, mf.READ_ONLY|mf.COPY_HOST_PTR, hostbuf=loops)
349            dispersed = [loops_bi, loops_l, cutoff] + loops_N
350        else:
351            dispersed = []
352        fixed = [real(p) for p in fixed_pars]
353        args = self.q_input.q_buffers + [res_bi, nq] + dispersed + fixed
354        self.kernel(queuei, self.q_input.global_size, None, *args)
355        cl.enqueue_copy(queuei, self.res, res_bi)
356
357        return self.res
358
359    def release(self):
360        for b in self.loops_b:
361            b.release()
362        self.loops_b = []
363        for b in self.res_b:
364            b.release()
365        self.res_b = []
366
367    def __del__(self):
368        self.release()
Note: See TracBrowser for help on using the repository browser.