source: sasmodels/sasmodels/kernelcl.py @ aa4946b

core_shell_microgelscostrafo411magnetic_modelrelease_v0.94release_v0.95ticket-1257-vesicle-productticket_1156ticket_1265_superballticket_822_more_unit_tests
Last change on this file since aa4946b was aa4946b, checked in by Paul Kienzle <pkienzle@…>, 9 years ago

refactor so kernels are loaded via core.load_model

  • Property mode set to 100644
File size: 13.9 KB
Line 
1"""
2GPU support through OpenCL
3
4There should be a single GPU environment running on the system.  This
5environment is constructed on the first call to :func:`env`, and the
6same environment is returned on each call.
7
8After retrieving the environment, the next step is to create the kernel.
9This is done with a call to :meth:`GpuEnvironment.make_kernel`, which
10returns the type of data used by the kernel.
11
12Next a :class:`GpuData` object should be created with the correct kind
13of data.  This data object can be used by multiple kernels, for example,
14if the target model is a weighted sum of multiple kernels.  The data
15should include any extra evaluation points required to compute the proper
16data smearing.  This need not match the square grid for 2D data if there
17is an index saying which q points are active.
18
19Together the GpuData, the program, and a device form a :class:`GpuKernel`.
20This kernel is used during fitting, receiving new sets of parameters and
21evaluating them.  The output value is stored in an output buffer on the
22devices, where it can be combined with other structure factors and form
23factors and have instrumental resolution effects applied.
24"""
25import os
26import warnings
27
28import numpy as np
29
30try:
31    import pyopencl as cl
32    # Ask OpenCL for the default context so that we know that one exists
33    cl.create_some_context(interactive=False)
34except Exception, exc:
35    warnings.warn(str(exc))
36    raise RuntimeError("OpenCL not available")
37
38from pyopencl import mem_flags as mf
39
40from . import generate
41from .kernelpy import PyModel
42
43F64_DEFS = """\
44#ifdef cl_khr_fp64
45#  pragma OPENCL EXTENSION cl_khr_fp64: enable
46#endif
47"""
48
49# The max loops number is limited by the amount of local memory available
50# on the device.  You don't want to make this value too big because it will
51# waste resources, nor too small because it may interfere with users trying
52# to do their polydispersity calculations.  A value of 1024 should be much
53# larger than necessary given that cost grows as npts^k where k is the number
54# of polydisperse parameters.
55MAX_LOOPS = 2048
56
57def _load_model(kernel_module, dtype="single"):
58    """
59    Load the OpenCL model defined by *kernel_module*.
60
61    Access to the OpenCL device is delayed until the kernel is called
62    so models can be defined without using too many resources.
63    """
64    raise DeprecationWarning
65    source, info = generate.make(kernel_module)
66    if callable(info.get('Iq', None)):
67        return PyModel(info)
68    ## for debugging, save source to a .cl file, edit it, and reload as model
69    #open(info['name']+'.cl','w').write(source)
70    #source = open(info['name']+'.cl','r').read()
71    return GpuModel(source, info, np.dtype(dtype))
72
73ENV = None
74def environment():
75    """
76    Returns a singleton :class:`GpuEnvironment`.
77
78    This provides an OpenCL context and one queue per device.
79    """
80    global ENV
81    if ENV is None:
82        ENV = GpuEnvironment()
83    return ENV
84
85def has_double(device):
86    """
87    Return true if device supports double precision.
88    """
89    return "cl_khr_fp64" in device.extensions
90
91def get_warp(kernel, queue):
92    """
93    Return the size of an execution batch for *kernel* running on *queue*.
94    """
95    return kernel.get_work_group_info(
96        cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
97        queue.device)
98
99def _stretch_input(vector, dtype, extra=1e-3, boundary=32):
100    """
101    Stretch an input vector to the correct boundary.
102
103    Performance on the kernels can drop by a factor of two or more if the
104    number of values to compute does not fall on a nice power of two
105    boundary.   The trailing additional vector elements are given a
106    value of *extra*, and so f(*extra*) will be computed for each of
107    them.  The returned array will thus be a subset of the computed array.
108
109    *boundary* should be a power of 2 which is at least 32 for good
110    performance on current platforms (as of Jan 2015).  It should
111    probably be the max of get_warp(kernel,queue) and
112    device.min_data_type_align_size//4.
113    """
114    remainder = vector.size % boundary
115    if remainder != 0:
116        size = vector.size + (boundary - remainder)
117        vector = np.hstack((vector, [extra] * (size - vector.size)))
118    return np.ascontiguousarray(vector, dtype=dtype)
119
120
121def compile_model(context, source, dtype):
122    """
123    Build a model to run on the gpu.
124
125    Returns the compiled program and its type.  The returned type will
126    be float32 even if the desired type is float64 if any of the
127    devices in the context do not support the cl_khr_fp64 extension.
128    """
129    dtype = np.dtype(dtype)
130    if dtype == generate.F64 and not all(has_double(d) for d in context.devices):
131        raise RuntimeError("Double precision not supported for devices")
132
133    header = F64_DEFS if dtype == generate.F64 else ""
134    if dtype == generate.F32:
135        source = generate.use_single(source)
136    # Note: USE_SINCOS makes the intel cpu slower under opencl
137    if context.devices[0].type == cl.device_type.GPU:
138        header += "#define USE_SINCOS\n"
139    program = cl.Program(context, header + source).build()
140    return program
141
142
143def make_result(self, size):
144    self.res = np.empty(size, dtype=self.dtype)
145    self.res_b = cl.Buffer(self.program.context, mf.READ_WRITE, self.res.nbytes)
146    return self.res, self.res_b
147
148
149# for now, this returns one device in the context
150# TODO: create a context that contains all devices on all platforms
151class GpuEnvironment(object):
152    """
153    GPU context, with possibly many devices, and one queue per device.
154    """
155    def __init__(self):
156        # find gpu context
157        #self.context = cl.create_some_context()
158
159        self.context = None
160        if 'PYOPENCL_CTX' in os.environ:
161            self._create_some_context()
162
163        if not self.context:
164            self.context = _get_default_context()
165
166        # Byte boundary for data alignment
167        #self.data_boundary = max(d.min_data_type_align_size
168        #                         for d in self.context.devices)
169        self.queues = [cl.CommandQueue(self.context, d)
170                       for d in self.context.devices]
171        self.has_double = all(has_double(d) for d in self.context.devices)
172        self.compiled = {}
173
174    def _create_some_context(self):
175        try:
176            self.context = cl.create_some_context(interactive=False)
177        except Exception, exc:
178            warnings.warn(str(exc))
179            warnings.warn("pyopencl.create_some_context() failed")
180            warnings.warn("the environment variable 'PYOPENCL_CTX' might not be set correctly")
181
182    def compile_program(self, name, source, dtype):
183        if name not in self.compiled:
184            #print "compiling",name
185            self.compiled[name] = compile_model(self.context, source, dtype)
186        return self.compiled[name]
187
188    def release_program(self, name):
189        if name in self.compiled:
190            self.compiled[name].release()
191            del self.compiled[name]
192
193def _get_default_context():
194    default = None
195    for platform in cl.get_platforms():
196        for device in platform.get_devices():
197            if device.type == cl.device_type.GPU:
198                return cl.Context([device])
199            if default is None:
200                default = device
201
202    if not default:
203        raise RuntimeError("OpenCL device not found")
204
205    return cl.Context([default])
206
207
208class GpuModel(object):
209    """
210    GPU wrapper for a single model.
211
212    *source* and *info* are the model source and interface as returned
213    from :func:`gen.make`.
214
215    *dtype* is the desired model precision.  Any numpy dtype for single
216    or double precision floats will do, such as 'f', 'float32' or 'single'
217    for single and 'd', 'float64' or 'double' for double.  Double precision
218    is an optional extension which may not be available on all devices.
219    """
220    def __init__(self, source, info, dtype=generate.F32):
221        self.info = info
222        self.source = source
223        self.dtype = np.dtype(dtype)
224        self.program = None # delay program creation
225
226    def __getstate__(self):
227        state = self.__dict__.copy()
228        state['program'] = None
229        return state
230
231    def __setstate__(self, state):
232        self.__dict__ = state.copy()
233
234    def __call__(self, q_input):
235        if self.dtype != q_input.dtype:
236            raise TypeError("data is %s kernel is %s" % (q_input.dtype, self.dtype))
237        if self.program is None:
238            compiler = environment().compile_program
239            self.program = compiler(self.info['name'], self.source, self.dtype)
240        kernel_name = generate.kernel_name(self.info, q_input.is_2D)
241        kernel = getattr(self.program, kernel_name)
242        return GpuKernel(kernel, self.info, q_input)
243
244    def release(self):
245        if self.program is not None:
246            environment().release_program(self.info['name'])
247            self.program = None
248
249    def make_input(self, q_vectors):
250        """
251        Make q input vectors available to the model.
252
253        Note that each model needs its own q vector even if the case of
254        mixture models because some models may be OpenCL, some may be
255        ctypes and some may be pure python.
256        """
257        return GpuInput(q_vectors, dtype=self.dtype)
258
259# TODO: check that we don't need a destructor for buffers which go out of scope
260class GpuInput(object):
261    """
262    Make q data available to the gpu.
263
264    *q_vectors* is a list of q vectors, which will be *[q]* for 1-D data,
265    and *[qx, qy]* for 2-D data.  Internally, the vectors will be reallocated
266    to get the best performance on OpenCL, which may involve shifting and
267    stretching the array to better match the memory architecture.  Additional
268    points will be evaluated with *q=1e-3*.
269
270    *dtype* is the data type for the q vectors. The data type should be
271    set to match that of the kernel, which is an attribute of
272    :class:`GpuProgram`.  Note that not all kernels support double
273    precision, so even if the program was created for double precision,
274    the *GpuProgram.dtype* may be single precision.
275
276    Call :meth:`release` when complete.  Even if not called directly, the
277    buffer will be released when the data object is freed.
278    """
279    def __init__(self, q_vectors, dtype=generate.F32):
280        env = environment()
281        self.nq = q_vectors[0].size
282        self.dtype = np.dtype(dtype)
283        self.is_2D = (len(q_vectors) == 2)
284        # TODO: stretch input based on get_warp()
285        # not doing it now since warp depends on kernel, which is not known
286        # at this point, so instead using 32, which is good on the set of
287        # architectures tested so far.
288        self.q_vectors = [_stretch_input(q, self.dtype, 32) for q in q_vectors]
289        self.q_buffers = [
290            cl.Buffer(env.context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=q)
291            for q in self.q_vectors
292        ]
293        self.global_size = [self.q_vectors[0].size]
294
295    def release(self):
296        for b in self.q_buffers:
297            b.release()
298        self.q_buffers = []
299
300class GpuKernel(object):
301    """
302    Callable SAS kernel.
303
304    *kernel* is the GpuKernel object to call.
305
306    *info* is the module information
307
308    *q_input* is the DllInput q vectors at which the kernel should be
309    evaluated.
310
311    The resulting call method takes the *pars*, a list of values for
312    the fixed parameters to the kernel, and *pd_pars*, a list of (value,weight)
313    vectors for the polydisperse parameters.  *cutoff* determines the
314    integration limits: any points with combined weight less than *cutoff*
315    will not be calculated.
316
317    Call :meth:`release` when done with the kernel instance.
318    """
319    def __init__(self, kernel, info, q_input):
320        self.q_input = q_input
321        self.kernel = kernel
322        self.info = info
323        self.res = np.empty(q_input.nq, q_input.dtype)
324        dim = '2d' if q_input.is_2D else '1d'
325        self.fixed_pars = info['partype']['fixed-' + dim]
326        self.pd_pars = info['partype']['pd-' + dim]
327
328        # Inputs and outputs for each kernel call
329        # Note: res may be shorter than res_b if global_size != nq
330        env = environment()
331        self.loops_b = [cl.Buffer(env.context, mf.READ_WRITE,
332                                  2 * MAX_LOOPS * q_input.dtype.itemsize)
333                        for _ in env.queues]
334        self.res_b = [cl.Buffer(env.context, mf.READ_WRITE,
335                                q_input.global_size[0] * q_input.dtype.itemsize)
336                      for _ in env.queues]
337
338
339    def __call__(self, fixed_pars, pd_pars, cutoff=1e-5):
340        real = np.float32 if self.q_input.dtype == generate.F32 else np.float64
341
342        device_num = 0
343        queuei = environment().queues[device_num]
344        res_bi = self.res_b[device_num]
345        nq = np.uint32(self.q_input.nq)
346        if pd_pars:
347            cutoff = real(cutoff)
348            loops_N = [np.uint32(len(p[0])) for p in pd_pars]
349            loops = np.hstack(pd_pars) \
350                if pd_pars else np.empty(0, dtype=self.q_input.dtype)
351            loops = np.ascontiguousarray(loops.T, self.q_input.dtype).flatten()
352            #print "loops",Nloops, loops
353
354            #import sys; print >>sys.stderr,"opencl eval",pars
355            #print "opencl eval",pars
356            if len(loops) > 2 * MAX_LOOPS:
357                raise ValueError("too many polydispersity points")
358
359            loops_bi = self.loops_b[device_num]
360            cl.enqueue_copy(queuei, loops_bi, loops)
361            loops_l = cl.LocalMemory(len(loops.data))
362            #ctx = environment().context
363            #loops_bi = cl.Buffer(ctx, mf.READ_ONLY|mf.COPY_HOST_PTR, hostbuf=loops)
364            dispersed = [loops_bi, loops_l, cutoff] + loops_N
365        else:
366            dispersed = []
367        fixed = [real(p) for p in fixed_pars]
368        args = self.q_input.q_buffers + [res_bi, nq] + dispersed + fixed
369        self.kernel(queuei, self.q_input.global_size, None, *args)
370        cl.enqueue_copy(queuei, self.res, res_bi)
371
372        return self.res
373
374    def release(self):
375        for b in self.loops_b:
376            b.release()
377        self.loops_b = []
378        for b in self.res_b:
379            b.release()
380        self.res_b = []
381
382    def __del__(self):
383        self.release()
Note: See TracBrowser for help on using the repository browser.