source: sasmodels/sasmodels/kernelcl.py @ 750ffa5

core_shell_microgelscostrafo411magnetic_modelrelease_v0.94release_v0.95ticket-1257-vesicle-productticket_1156ticket_1265_superballticket_822_more_unit_tests
Last change on this file since 750ffa5 was 750ffa5, checked in by Paul Kienzle <pkienzle@…>, 9 years ago

allow test of dll using single precision

  • Property mode set to 100644
File size: 13.9 KB
Line 
1"""
2GPU support through OpenCL
3
4There should be a single GPU environment running on the system.  This
5environment is constructed on the first call to :func:`env`, and the
6same environment is returned on each call.
7
8After retrieving the environment, the next step is to create the kernel.
9This is done with a call to :meth:`GpuEnvironment.make_kernel`, which
10returns the type of data used by the kernel.
11
12Next a :class:`GpuData` object should be created with the correct kind
13of data.  This data object can be used by multiple kernels, for example,
14if the target model is a weighted sum of multiple kernels.  The data
15should include any extra evaluation points required to compute the proper
16data smearing.  This need not match the square grid for 2D data if there
17is an index saying which q points are active.
18
19Together the GpuData, the program, and a device form a :class:`GpuKernel`.
20This kernel is used during fitting, receiving new sets of parameters and
21evaluating them.  The output value is stored in an output buffer on the
22devices, where it can be combined with other structure factors and form
23factors and have instrumental resolution effects applied.
24"""
25import os
26import warnings
27
28import numpy as np
29
30try:
31    import pyopencl as cl
32    # Ask OpenCL for the default context so that we know that one exists
33    cl.create_some_context(interactive=False)
34except Exception, exc:
35    warnings.warn(str(exc))
36    raise RuntimeError("OpenCL not available")
37
38from pyopencl import mem_flags as mf
39
40from . import generate
41from .kernelpy import PyModel
42
43F64_DEFS = """\
44#ifdef cl_khr_fp64
45#  pragma OPENCL EXTENSION cl_khr_fp64: enable
46#endif
47"""
48
49# The max loops number is limited by the amount of local memory available
50# on the device.  You don't want to make this value too big because it will
51# waste resources, nor too small because it may interfere with users trying
52# to do their polydispersity calculations.  A value of 1024 should be much
53# larger than necessary given that cost grows as npts^k where k is the number
54# of polydisperse parameters.
55MAX_LOOPS = 2048
56
57def load_model(kernel_module, dtype="single"):
58    """
59    Load the OpenCL model defined by *kernel_module*.
60
61    Access to the OpenCL device is delayed until the kernel is called
62    so models can be defined without using too many resources.
63    """
64    source, info = generate.make(kernel_module)
65    if callable(info.get('Iq', None)):
66        return PyModel(info)
67    ## for debugging, save source to a .cl file, edit it, and reload as model
68    #open(info['name']+'.cl','w').write(source)
69    #source = open(info['name']+'.cl','r').read()
70    return GpuModel(source, info, np.dtype(dtype))
71
72ENV = None
73def environment():
74    """
75    Returns a singleton :class:`GpuEnvironment`.
76
77    This provides an OpenCL context and one queue per device.
78    """
79    global ENV
80    if ENV is None:
81        ENV = GpuEnvironment()
82    return ENV
83
84def has_double(device):
85    """
86    Return true if device supports double precision.
87    """
88    return "cl_khr_fp64" in device.extensions
89
90def get_warp(kernel, queue):
91    """
92    Return the size of an execution batch for *kernel* running on *queue*.
93    """
94    return kernel.get_work_group_info(
95            cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
96            queue.device)
97
98def _stretch_input(vector, dtype, extra=1e-3, boundary=32):
99    """
100    Stretch an input vector to the correct boundary.
101
102    Performance on the kernels can drop by a factor of two or more if the
103    number of values to compute does not fall on a nice power of two
104    boundary.   The trailing additional vector elements are given a
105    value of *extra*, and so f(*extra*) will be computed for each of
106    them.  The returned array will thus be a subset of the computed array.
107
108    *boundary* should be a power of 2 which is at least 32 for good
109    performance on current platforms (as of Jan 2015).  It should
110    probably be the max of get_warp(kernel,queue) and
111    device.min_data_type_align_size//4.
112    """
113    remainder = vector.size % boundary
114    if remainder != 0:
115        size = vector.size + (boundary - remainder)
116        vector = np.hstack((vector, [extra] * (size - vector.size)))
117    return np.ascontiguousarray(vector, dtype=dtype)
118
119
120def compile_model(context, source, dtype):
121    """
122    Build a model to run on the gpu.
123
124    Returns the compiled program and its type.  The returned type will
125    be float32 even if the desired type is float64 if any of the
126    devices in the context do not support the cl_khr_fp64 extension.
127    """
128    dtype = np.dtype(dtype)
129    if dtype == generate.F64 and not all(has_double(d) for d in context.devices):
130        raise RuntimeError("Double precision not supported for devices")
131
132    header = F64_DEFS if dtype == generate.F64 else ""
133    if dtype == generate.F32:
134        source = generate.use_single(source)
135    # Note: USE_SINCOS makes the intel cpu slower under opencl
136    if context.devices[0].type == cl.device_type.GPU:
137        header += "#define USE_SINCOS\n"
138    program = cl.Program(context, header + source).build()
139    return program
140
141
142def make_result(self, size):
143    self.res = np.empty(size, dtype=self.dtype)
144    self.res_b = cl.Buffer(self.program.context, mf.READ_WRITE, self.res.nbytes)
145    return self.res, self.res_b
146
147
148# for now, this returns one device in the context
149# TODO: create a context that contains all devices on all platforms
150class GpuEnvironment(object):
151    """
152    GPU context, with possibly many devices, and one queue per device.
153    """
154    def __init__(self):
155        # find gpu context
156        #self.context = cl.create_some_context()
157
158        self.context = None
159        if 'PYOPENCL_CTX' in os.environ:
160            self._create_some_context()
161
162        if not self.context:
163            self.context = _get_default_context()
164
165        # Byte boundary for data alignment
166        #self.data_boundary = max(d.min_data_type_align_size
167        #                         for d in self.context.devices)
168        self.queues = [cl.CommandQueue(self.context, d)
169                       for d in self.context.devices]
170        self.has_double = all(has_double(d) for d in self.context.devices)
171        self.compiled = {}
172
173    def _create_some_context(self):
174        try:
175            self.context = cl.create_some_context(interactive=False)
176        except Exception, exc:
177            warnings.warn(str(exc))
178            warnings.warn("pyopencl.create_some_context() failed")
179            warnings.warn("the environment variable 'PYOPENCL_CTX' might not be set correctly")
180
181    def compile_program(self, name, source, dtype):
182        if name not in self.compiled:
183            #print "compiling",name
184            self.compiled[name] = compile_model(self.context, source, dtype)
185        return self.compiled[name]
186
187    def release_program(self, name):
188        if name in self.compiled:
189            self.compiled[name].release()
190            del self.compiled[name]
191
192def _get_default_context():
193    default = None
194    for platform in cl.get_platforms():
195        for device in platform.get_devices():
196            if device.type == cl.device_type.GPU:
197                return cl.Context([device])
198            if default is None:
199                default = device
200
201    if not default:
202        raise RuntimeError("OpenCL device not found")
203
204    return cl.Context([default])
205
206
207class GpuModel(object):
208    """
209    GPU wrapper for a single model.
210
211    *source* and *info* are the model source and interface as returned
212    from :func:`gen.make`.
213
214    *dtype* is the desired model precision.  Any numpy dtype for single
215    or double precision floats will do, such as 'f', 'float32' or 'single'
216    for single and 'd', 'float64' or 'double' for double.  Double precision
217    is an optional extension which may not be available on all devices.
218    """
219    def __init__(self, source, info, dtype=generate.F32):
220        self.info = info
221        self.source = source
222        self.dtype = np.dtype(dtype)
223        self.program = None # delay program creation
224
225    def __getstate__(self):
226        state = self.__dict__.copy()
227        state['program'] = None
228        return state
229
230    def __setstate__(self, state):
231        self.__dict__ = state.copy()
232
233    def __call__(self, q_input):
234        if self.dtype != q_input.dtype:
235            raise TypeError("data is %s kernel is %s" % (q_input.dtype, self.dtype))
236        if self.program is None:
237            compiler = environment().compile_program
238            self.program = compiler(self.info['name'], self.source, self.dtype)
239        kernel_name = generate.kernel_name(self.info, q_input.is_2D)
240        kernel = getattr(self.program, kernel_name)
241        return GpuKernel(kernel, self.info, q_input)
242
243    def release(self):
244        if self.program is not None:
245            environment().release_program(self.info['name'])
246            self.program = None
247
248    def make_input(self, q_vectors):
249        """
250        Make q input vectors available to the model.
251
252        Note that each model needs its own q vector even if the case of
253        mixture models because some models may be OpenCL, some may be
254        ctypes and some may be pure python.
255        """
256        return GpuInput(q_vectors, dtype=self.dtype)
257
258# TODO: check that we don't need a destructor for buffers which go out of scope
259class GpuInput(object):
260    """
261    Make q data available to the gpu.
262
263    *q_vectors* is a list of q vectors, which will be *[q]* for 1-D data,
264    and *[qx, qy]* for 2-D data.  Internally, the vectors will be reallocated
265    to get the best performance on OpenCL, which may involve shifting and
266    stretching the array to better match the memory architecture.  Additional
267    points will be evaluated with *q=1e-3*.
268
269    *dtype* is the data type for the q vectors. The data type should be
270    set to match that of the kernel, which is an attribute of
271    :class:`GpuProgram`.  Note that not all kernels support double
272    precision, so even if the program was created for double precision,
273    the *GpuProgram.dtype* may be single precision.
274
275    Call :meth:`release` when complete.  Even if not called directly, the
276    buffer will be released when the data object is freed.
277    """
278    def __init__(self, q_vectors, dtype=generate.F32):
279        env = environment()
280        self.nq = q_vectors[0].size
281        self.dtype = np.dtype(dtype)
282        self.is_2D = (len(q_vectors) == 2)
283        # TODO: stretch input based on get_warp()
284        # not doing it now since warp depends on kernel, which is not known
285        # at this point, so instead using 32, which is good on the set of
286        # architectures tested so far.
287        self.q_vectors = [_stretch_input(q, self.dtype, 32) for q in q_vectors]
288        self.q_buffers = [
289            cl.Buffer(env.context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=q)
290            for q in self.q_vectors
291        ]
292        self.global_size = [self.q_vectors[0].size]
293
294    def release(self):
295        for b in self.q_buffers:
296            b.release()
297        self.q_buffers = []
298
299class GpuKernel(object):
300    """
301    Callable SAS kernel.
302
303    *kernel* is the GpuKernel object to call.
304
305    *info* is the module information
306
307    *q_input* is the DllInput q vectors at which the kernel should be
308    evaluated.
309
310    The resulting call method takes the *pars*, a list of values for
311    the fixed parameters to the kernel, and *pd_pars*, a list of (value,weight)
312    vectors for the polydisperse parameters.  *cutoff* determines the
313    integration limits: any points with combined weight less than *cutoff*
314    will not be calculated.
315
316    Call :meth:`release` when done with the kernel instance.
317    """
318    def __init__(self, kernel, info, q_input):
319        self.q_input = q_input
320        self.kernel = kernel
321        self.info = info
322        self.res = np.empty(q_input.nq, q_input.dtype)
323        dim = '2d' if q_input.is_2D else '1d'
324        self.fixed_pars = info['partype']['fixed-' + dim]
325        self.pd_pars = info['partype']['pd-' + dim]
326
327        # Inputs and outputs for each kernel call
328        # Note: res may be shorter than res_b if global_size != nq
329        env = environment()
330        self.loops_b = [cl.Buffer(env.context, mf.READ_WRITE,
331                                  2 * MAX_LOOPS * q_input.dtype.itemsize)
332                        for _ in env.queues]
333        self.res_b = [cl.Buffer(env.context, mf.READ_WRITE,
334                                q_input.global_size[0] * q_input.dtype.itemsize)
335                      for _ in env.queues]
336
337
338    def __call__(self, fixed_pars, pd_pars, cutoff=1e-5):
339        real = np.float32 if self.q_input.dtype == generate.F32 else np.float64
340
341        device_num = 0
342        queuei = environment().queues[device_num]
343        res_bi = self.res_b[device_num]
344        nq = np.uint32(self.q_input.nq)
345        if pd_pars:
346            cutoff = real(cutoff)
347            loops_N = [np.uint32(len(p[0])) for p in pd_pars]
348            loops = np.hstack(pd_pars) \
349                if pd_pars else np.empty(0, dtype=self.q_input.dtype)
350            loops = np.ascontiguousarray(loops.T, self.q_input.dtype).flatten()
351            #print "loops",Nloops, loops
352
353            #import sys; print >>sys.stderr,"opencl eval",pars
354            #print "opencl eval",pars
355            if len(loops) > 2 * MAX_LOOPS:
356                raise ValueError("too many polydispersity points")
357
358            loops_bi = self.loops_b[device_num]
359            cl.enqueue_copy(queuei, loops_bi, loops)
360            loops_l = cl.LocalMemory(len(loops.data))
361            #ctx = environment().context
362            #loops_bi = cl.Buffer(ctx, mf.READ_ONLY|mf.COPY_HOST_PTR, hostbuf=loops)
363            dispersed = [loops_bi, loops_l, cutoff] + loops_N
364        else:
365            dispersed = []
366        fixed = [real(p) for p in fixed_pars]
367        args = self.q_input.q_buffers + [res_bi, nq] + dispersed + fixed
368        self.kernel(queuei, self.q_input.global_size, None, *args)
369        cl.enqueue_copy(queuei, self.res, res_bi)
370
371        return self.res
372
373    def release(self):
374        for b in self.loops_b:
375            b.release()
376        self.loops_b = []
377        for b in self.res_b:
378            b.release()
379        self.res_b = []
380
381    def __del__(self):
382        self.release()
Note: See TracBrowser for help on using the repository browser.