source: sasmodels/sasmodels/kernelcl.py @ 3c56da87

core_shell_microgelscostrafo411magnetic_modelrelease_v0.94release_v0.95ticket-1257-vesicle-productticket_1156ticket_1265_superballticket_822_more_unit_tests
Last change on this file since 3c56da87 was 3c56da87, checked in by Paul Kienzle <pkienzle@…>, 9 years ago

lint cleanup

  • Property mode set to 100644
File size: 13.9 KB
Line 
1"""
2GPU support through OpenCL
3
4There should be a single GPU environment running on the system.  This
5environment is constructed on the first call to :func:`env`, and the
6same environment is returned on each call.
7
8After retrieving the environment, the next step is to create the kernel.
9This is done with a call to :meth:`GpuEnvironment.make_kernel`, which
10returns the type of data used by the kernel.
11
12Next a :class:`GpuData` object should be created with the correct kind
13of data.  This data object can be used by multiple kernels, for example,
14if the target model is a weighted sum of multiple kernels.  The data
15should include any extra evaluation points required to compute the proper
16data smearing.  This need not match the square grid for 2D data if there
17is an index saying which q points are active.
18
19Together the GpuData, the program, and a device form a :class:`GpuKernel`.
20This kernel is used during fitting, receiving new sets of parameters and
21evaluating them.  The output value is stored in an output buffer on the
22devices, where it can be combined with other structure factors and form
23factors and have instrumental resolution effects applied.
24"""
25import os
26import warnings
27
28import numpy as np
29
30try:
31    import pyopencl as cl
32    # Ask OpenCL for the default context so that we know that one exists
33    cl.create_some_context(interactive=False)
34except Exception, exc:
35    warnings.warn(str(exc))
36    raise RuntimeError("OpenCL not available")
37
38from pyopencl import mem_flags as mf
39
40from . import generate
41from .kernelpy import PyModel
42
43F64_DEFS = """\
44#ifdef cl_khr_fp64
45#  pragma OPENCL EXTENSION cl_khr_fp64: enable
46#endif
47"""
48
49# The max loops number is limited by the amount of local memory available
50# on the device.  You don't want to make this value too big because it will
51# waste resources, nor too small because it may interfere with users trying
52# to do their polydispersity calculations.  A value of 1024 should be much
53# larger than necessary given that cost grows as npts^k where k is the number
54# of polydisperse parameters.
55MAX_LOOPS = 2048
56
57def load_model(kernel_module, dtype="single"):
58    """
59    Load the OpenCL model defined by *kernel_module*.
60
61    Access to the OpenCL device is delayed until the kernel is called
62    so models can be defined without using too many resources.
63    """
64    source, info = generate.make(kernel_module)
65    if callable(info.get('Iq', None)):
66        return PyModel(info)
67    ## for debugging, save source to a .cl file, edit it, and reload as model
68    #open(info['name']+'.cl','w').write(source)
69    #source = open(info['name']+'.cl','r').read()
70    return GpuModel(source, info, dtype)
71
72ENV = None
73def environment():
74    """
75    Returns a singleton :class:`GpuEnvironment`.
76
77    This provides an OpenCL context and one queue per device.
78    """
79    global ENV
80    if ENV is None:
81        ENV = GpuEnvironment()
82    return ENV
83
84def has_double(device):
85    """
86    Return true if device supports double precision.
87    """
88    return "cl_khr_fp64" in device.extensions
89
90def get_warp(kernel, queue):
91    """
92    Return the size of an execution batch for *kernel* running on *queue*.
93    """
94    return kernel.get_work_group_info(cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
95                                      queue.device)
96
97def _stretch_input(vector, dtype, extra=1e-3, boundary=32):
98    """
99    Stretch an input vector to the correct boundary.
100
101    Performance on the kernels can drop by a factor of two or more if the
102    number of values to compute does not fall on a nice power of two
103    boundary.   The trailing additional vector elements are given a
104    value of *extra*, and so f(*extra*) will be computed for each of
105    them.  The returned array will thus be a subset of the computed array.
106
107    *boundary* should be a power of 2 which is at least 32 for good
108    performance on current platforms (as of Jan 2015).  It should
109    probably be the max of get_warp(kernel,queue) and
110    device.min_data_type_align_size//4.
111    """
112    remainder = vector.size % boundary
113    if remainder != 0:
114        size = vector.size + (boundary - remainder)
115        vector = np.hstack((vector, [extra] * (size - vector.size)))
116    return np.ascontiguousarray(vector, dtype=dtype)
117
118
119def compile_model(context, source, dtype):
120    """
121    Build a model to run on the gpu.
122
123    Returns the compiled program and its type.  The returned type will
124    be float32 even if the desired type is float64 if any of the
125    devices in the context do not support the cl_khr_fp64 extension.
126    """
127    dtype = np.dtype(dtype)
128    if dtype == generate.F64 and not all(has_double(d) for d in context.devices):
129        raise RuntimeError("Double precision not supported for devices")
130
131    header = F64_DEFS if dtype == generate.F64 else ""
132    if dtype == generate.F32:
133        source = generate.use_single(source)
134    # Note: USE_SINCOS makes the intel cpu slower under opencl
135    if context.devices[0].type == cl.device_type.GPU:
136        header += "#define USE_SINCOS\n"
137    program = cl.Program(context, header + source).build()
138    return program
139
140
141def make_result(self, size):
142    self.res = np.empty(size, dtype=self.dtype)
143    self.res_b = cl.Buffer(self.program.context, mf.READ_WRITE, self.res.nbytes)
144    return self.res, self.res_b
145
146
147# for now, this returns one device in the context
148# TODO: create a context that contains all devices on all platforms
149class GpuEnvironment(object):
150    """
151    GPU context, with possibly many devices, and one queue per device.
152    """
153    def __init__(self):
154        # find gpu context
155        #self.context = cl.create_some_context()
156
157        self.context = None
158        if 'PYOPENCL_CTX' in os.environ:
159            self._create_some_context()
160
161        if not self.context:
162            self.context = _get_default_context()
163
164        # Byte boundary for data alignment
165        #self.data_boundary = max(d.min_data_type_align_size
166        #                         for d in self.context.devices)
167        self.queues = [cl.CommandQueue(self.context, d)
168                       for d in self.context.devices]
169        self.has_double = all(has_double(d) for d in self.context.devices)
170        self.compiled = {}
171
172    def _create_some_context(self):
173        try:
174            self.context = cl.create_some_context(interactive=False)
175        except Exception, exc:
176            warnings.warn(str(exc))
177            warnings.warn("pyopencl.create_some_context() failed")
178            warnings.warn("the environment variable 'PYOPENCL_CTX' might not be set correctly")
179
180    def compile_program(self, name, source, dtype):
181        if name not in self.compiled:
182            #print "compiling",name
183            self.compiled[name] = compile_model(self.context, source, dtype)
184        return self.compiled[name]
185
186    def release_program(self, name):
187        if name in self.compiled:
188            self.compiled[name].release()
189            del self.compiled[name]
190
191def _get_default_context():
192    default = None
193    for platform in cl.get_platforms():
194        for device in platform.get_devices():
195            if device.type == cl.device_type.GPU:
196                return cl.Context([device])
197            if default is None:
198                default = device
199
200    if not default:
201        raise RuntimeError("OpenCL device not found")
202
203    return cl.Context([default])
204
205
206class GpuModel(object):
207    """
208    GPU wrapper for a single model.
209
210    *source* and *info* are the model source and interface as returned
211    from :func:`gen.make`.
212
213    *dtype* is the desired model precision.  Any numpy dtype for single
214    or double precision floats will do, such as 'f', 'float32' or 'single'
215    for single and 'd', 'float64' or 'double' for double.  Double precision
216    is an optional extension which may not be available on all devices.
217    """
218    def __init__(self, source, info, dtype=generate.F32):
219        self.info = info
220        self.source = source
221        self.dtype = dtype
222        self.program = None # delay program creation
223
224    def __getstate__(self):
225        state = self.__dict__.copy()
226        state['program'] = None
227        return state
228
229    def __setstate__(self, state):
230        self.__dict__ = state.copy()
231
232    def __call__(self, input_value):
233        if self.dtype != input_value.dtype:
234            raise TypeError("data and kernel have different types")
235        if self.program is None:
236            compiler = environment().compile_program
237            self.program = compiler(self.info['name'], self.source, self.dtype)
238        kernel_name = generate.kernel_name(self.info, input_value.is_2D)
239        kernel = getattr(self.program, kernel_name)
240        return GpuKernel(kernel, self.info, input_value)
241
242    def release(self):
243        if self.program is not None:
244            environment().release_program(self.info['name'])
245            self.program = None
246
247    def make_input(self, q_vectors):
248        """
249        Make q input vectors available to the model.
250
251        Note that each model needs its own q vector even if the case of
252        mixture models because some models may be OpenCL, some may be
253        ctypes and some may be pure python.
254        """
255        return GpuInput(q_vectors, dtype=self.dtype)
256
257# TODO: check that we don't need a destructor for buffers which go out of scope
258class GpuInput(object):
259    """
260    Make q data available to the gpu.
261
262    *q_vectors* is a list of q vectors, which will be *[q]* for 1-D data,
263    and *[qx, qy]* for 2-D data.  Internally, the vectors will be reallocated
264    to get the best performance on OpenCL, which may involve shifting and
265    stretching the array to better match the memory architecture.  Additional
266    points will be evaluated with *q=1e-3*.
267
268    *dtype* is the data type for the q vectors. The data type should be
269    set to match that of the kernel, which is an attribute of
270    :class:`GpuProgram`.  Note that not all kernels support double
271    precision, so even if the program was created for double precision,
272    the *GpuProgram.dtype* may be single precision.
273
274    Call :meth:`release` when complete.  Even if not called directly, the
275    buffer will be released when the data object is freed.
276    """
277    def __init__(self, q_vectors, dtype=generate.F32):
278        env = environment()
279        self.nq = q_vectors[0].size
280        self.dtype = np.dtype(dtype)
281        self.is_2D = (len(q_vectors) == 2)
282        # TODO: stretch input based on get_warp()
283        # not doing it now since warp depends on kernel, which is not known
284        # at this point, so instead using 32, which is good on the set of
285        # architectures tested so far.
286        self.q_vectors = [_stretch_input(q, self.dtype, 32) for q in q_vectors]
287        self.q_buffers = [
288            cl.Buffer(env.context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=q)
289            for q in self.q_vectors
290        ]
291        self.global_size = [self.q_vectors[0].size]
292
293    def release(self):
294        for b in self.q_buffers:
295            b.release()
296        self.q_buffers = []
297
298class GpuKernel(object):
299    """
300    Callable SAS kernel.
301
302    *kernel* is the GpuKernel object to call.
303
304    *info* is the module information
305
306    *q_input* is the DllInput q vectors at which the kernel should be
307    evaluated.
308
309    The resulting call method takes the *pars*, a list of values for
310    the fixed parameters to the kernel, and *pd_pars*, a list of (value,weight)
311    vectors for the polydisperse parameters.  *cutoff* determines the
312    integration limits: any points with combined weight less than *cutoff*
313    will not be calculated.
314
315    Call :meth:`release` when done with the kernel instance.
316    """
317    def __init__(self, kernel, info, q_input):
318        self.q_input = q_input
319        self.kernel = kernel
320        self.info = info
321        self.res = np.empty(q_input.nq, q_input.dtype)
322        dim = '2d' if q_input.is_2D else '1d'
323        self.fixed_pars = info['partype']['fixed-' + dim]
324        self.pd_pars = info['partype']['pd-' + dim]
325
326        # Inputs and outputs for each kernel call
327        # Note: res may be shorter than res_b if global_size != nq
328        env = environment()
329        self.loops_b = [cl.Buffer(env.context, mf.READ_WRITE,
330                                  2 * MAX_LOOPS * q_input.dtype.itemsize)
331                        for _ in env.queues]
332        self.res_b = [cl.Buffer(env.context, mf.READ_WRITE,
333                                q_input.global_size[0] * q_input.dtype.itemsize)
334                      for _ in env.queues]
335
336
337    def __call__(self, fixed_pars, pd_pars, cutoff=1e-5):
338        real = np.float32 if self.q_input.dtype == generate.F32 else np.float64
339
340        device_num = 0
341        queuei = environment().queues[device_num]
342        res_bi = self.res_b[device_num]
343        nq = np.uint32(self.q_input.nq)
344        if pd_pars:
345            cutoff = real(cutoff)
346            loops_N = [np.uint32(len(p[0])) for p in pd_pars]
347            loops = np.hstack(pd_pars) \
348                if pd_pars else np.empty(0, dtype=self.q_input.dtype)
349            loops = np.ascontiguousarray(loops.T, self.q_input.dtype).flatten()
350            #print "loops",Nloops, loops
351
352            #import sys; print >>sys.stderr,"opencl eval",pars
353            #print "opencl eval",pars
354            if len(loops) > 2 * MAX_LOOPS:
355                raise ValueError("too many polydispersity points")
356
357            loops_bi = self.loops_b[device_num]
358            cl.enqueue_copy(queuei, loops_bi, loops)
359            loops_l = cl.LocalMemory(len(loops.data))
360            #ctx = environment().context
361            #loops_bi = cl.Buffer(ctx, mf.READ_ONLY|mf.COPY_HOST_PTR, hostbuf=loops)
362            dispersed = [loops_bi, loops_l, cutoff] + loops_N
363        else:
364            dispersed = []
365        fixed = [real(p) for p in fixed_pars]
366        args = self.q_input.q_buffers + [res_bi, nq] + dispersed + fixed
367        self.kernel(queuei, self.q_input.global_size, None, *args)
368        cl.enqueue_copy(queuei, self.res, res_bi)
369
370        return self.res
371
372    def release(self):
373        for b in self.loops_b:
374            b.release()
375        self.loops_b = []
376        for b in self.res_b:
377            b.release()
378        self.res_b = []
379
380    def __del__(self):
381        self.release()
Note: See TracBrowser for help on using the repository browser.