source: sasmodels/sasmodels/kernelcl.py @ f734e7d

core_shell_microgelscostrafo411magnetic_modelrelease_v0.94release_v0.95ticket-1257-vesicle-productticket_1156ticket_1265_superballticket_822_more_unit_tests
Last change on this file since f734e7d was f734e7d, checked in by pkienzle, 9 years ago

restructure c code generation for maintainability; extend test harness to allow opencl and ctypes tests

  • Property mode set to 100644
File size: 13.9 KB
Line 
1"""
2GPU support through OpenCL
3
4There should be a single GPU environment running on the system.  This
5environment is constructed on the first call to :func:`env`, and the
6same environment is returned on each call.
7
8After retrieving the environment, the next step is to create the kernel.
9This is done with a call to :meth:`GpuEnvironment.make_kernel`, which
10returns the type of data used by the kernel.
11
12Next a :class:`GpuData` object should be created with the correct kind
13of data.  This data object can be used by multiple kernels, for example,
14if the target model is a weighted sum of multiple kernels.  The data
15should include any extra evaluation points required to compute the proper
16data smearing.  This need not match the square grid for 2D data if there
17is an index saying which q points are active.
18
19Together the GpuData, the program, and a device form a :class:`GpuKernel`.
20This kernel is used during fitting, receiving new sets of parameters and
21evaluating them.  The output value is stored in an output buffer on the
22devices, where it can be combined with other structure factors and form
23factors and have instrumental resolution effects applied.
24"""
25import os
26import warnings
27
28import numpy as np
29
30try:
31    import pyopencl as cl
32except ImportError,exc:
33    warnings.warn(str(exc))
34    raise RuntimeError("OpenCL not available")
35
36try:
37    context = cl.create_some_context(interactive=False)
38    del context
39except cl.RuntimeError, exc:
40    warnings.warn(str(exc))
41    raise RuntimeError("OpenCl not available")
42
43from pyopencl import mem_flags as mf
44
45from . import generate
46from .kernelpy import PyInput, PyModel
47
48F64_DEFS = """\
49#ifdef cl_khr_fp64
50#  pragma OPENCL EXTENSION cl_khr_fp64: enable
51#endif
52"""
53
54# The max loops number is limited by the amount of local memory available
55# on the device.  You don't want to make this value too big because it will
56# waste resources, nor too small because it may interfere with users trying
57# to do their polydispersity calculations.  A value of 1024 should be much
58# larger than necessary given that cost grows as npts^k where k is the number
59# of polydisperse parameters.
60MAX_LOOPS = 2048
61
62def load_model(kernel_module, dtype="single"):
63    """
64    Load the OpenCL model defined by *kernel_module*.
65
66    Access to the OpenCL device is delayed until the kernel is called
67    so models can be defined without using too many resources.
68    """
69    source, info = generate.make(kernel_module)
70    if callable(info.get('Iq',None)):
71        return PyModel(info)
72    ## for debugging, save source to a .cl file, edit it, and reload as model
73    #open(info['name']+'.cl','w').write(source)
74    #source = open(info['name']+'.cl','r').read()
75    return GpuModel(source, info, dtype)
76
77ENV = None
78def environment():
79    """
80    Returns a singleton :class:`GpuEnvironment`.
81
82    This provides an OpenCL context and one queue per device.
83    """
84    global ENV
85    if ENV is None:
86        ENV = GpuEnvironment()
87    return ENV
88
89def has_double(device):
90    """
91    Return true if device supports double precision.
92    """
93    return "cl_khr_fp64" in device.extensions
94
95def get_warp(kernel, queue):
96    """
97    Return the size of an execution batch for *kernel* running on *queue*.
98    """
99    return kernel.get_work_group_info(cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
100                                      queue.device)
101
102def _stretch_input(vector, dtype, extra=1e-3, boundary=32):
103    """
104    Stretch an input vector to the correct boundary.
105
106    Performance on the kernels can drop by a factor of two or more if the
107    number of values to compute does not fall on a nice power of two
108    boundary.   The trailing additional vector elements are given a
109    value of *extra*, and so f(*extra*) will be computed for each of
110    them.  The returned array will thus be a subset of the computed array.
111
112    *boundary* should be a power of 2 which is at least 32 for good
113    performance on current platforms (as of Jan 2015).  It should
114    probably be the max of get_warp(kernel,queue) and
115    device.min_data_type_align_size//4.
116    """
117    remainder = vector.size%boundary
118    if remainder != 0:
119        size = vector.size + (boundary - remainder)
120        vector = np.hstack((vector, [extra]*(size-vector.size)))
121    return np.ascontiguousarray(vector, dtype=dtype)
122
123
124def compile_model(context, source, dtype):
125    """
126    Build a model to run on the gpu.
127
128    Returns the compiled program and its type.  The returned type will
129    be float32 even if the desired type is float64 if any of the
130    devices in the context do not support the cl_khr_fp64 extension.
131    """
132    dtype = np.dtype(dtype)
133    if dtype==generate.F64 and not all(has_double(d) for d in context.devices):
134        raise RuntimeError("Double precision not supported for devices")
135
136    header = F64_DEFS if dtype == generate.F64 else ""
137    if dtype == generate.F32:
138        source = generate.use_single(source)
139    # Note: USE_SINCOS makes the intel cpu slower under opencl
140    if context.devices[0].type == cl.device_type.GPU:
141        header += "#define USE_SINCOS\n"
142    program  = cl.Program(context, header+source).build()
143    return program
144
145
146def make_result(self, size):
147    self.res = np.empty(size, dtype=self.dtype)
148    self.res_b = cl.Buffer(self.program.context, mf.READ_WRITE, self.res.nbytes)
149    return self.res, self.res_b
150
151
152# for now, this returns one device in the context
153# TODO: create a context that contains all devices on all platforms
154class GpuEnvironment(object):
155    """
156    GPU context, with possibly many devices, and one queue per device.
157    """
158    def __init__(self):
159        # find gpu context
160        #self.context = cl.create_some_context()
161
162        self.context = None
163        if 'PYOPENCL_CTX' in os.environ:
164            self._create_some_context()
165
166        if not self.context:
167            self.context = self._find_context()
168
169        # Byte boundary for data alignment
170        #self.data_boundary = max(d.min_data_type_align_size
171        #                         for d in self.context.devices)
172        self.queues = [cl.CommandQueue(self.context, d)
173                       for d in self.context.devices]
174        self.has_double = all(has_double(d) for d in self.context.devices)
175        self.compiled = {}
176
177    def _create_some_context(self):
178        try:
179            self.context = cl.create_some_context(interactive=False)
180        except Exception,exc:
181            warnings.warn(str(exc))
182            warnings.warn("pyopencl.create_some_context() failed")
183            warnings.warn("the environment variable 'PYOPENCL_CTX' might not be set correctly")
184
185    def _find_context(self):
186        default = None
187        for platform in cl.get_platforms():
188            for device in platform.get_devices():
189                if device.type == cl.device_type.GPU:
190                    return cl.Context([device])
191                if default is None:
192                    default = device
193
194        if not default:
195            raise RuntimeError("OpenCL device not found")
196
197        return cl.Context([default])
198
199    def compile_program(self, name, source, dtype):
200        if name not in self.compiled:
201            #print "compiling",name
202            self.compiled[name] = compile_model(self.context, source, dtype)
203        return self.compiled[name]
204
205    def release_program(self, name):
206        if name in self.compiled:
207            self.compiled[name].release()
208            del self.compiled[name]
209
210
211class GpuModel(object):
212    """
213    GPU wrapper for a single model.
214
215    *source* and *info* are the model source and interface as returned
216    from :func:`gen.make`.
217
218    *dtype* is the desired model precision.  Any numpy dtype for single
219    or double precision floats will do, such as 'f', 'float32' or 'single'
220    for single and 'd', 'float64' or 'double' for double.  Double precision
221    is an optional extension which may not be available on all devices.
222    """
223    def __init__(self, source, info, dtype=generate.F32):
224        self.info = info
225        self.source = source
226        self.dtype = dtype
227        self.program = None # delay program creation
228
229    def __getstate__(self):
230        state = self.__dict__.copy()
231        state['program'] = None
232        return state
233
234    def __setstate__(self, state):
235        self.__dict__ = state.copy()
236
237    def __call__(self, input):
238        if self.dtype != input.dtype:
239            raise TypeError("data and kernel have different types")
240        if self.program is None:
241            self.program = environment().compile_program(self.info['name'],self.source, self.dtype)
242        kernel_name = generate.kernel_name(self.info, input.is_2D)
243        kernel = getattr(self.program, kernel_name)
244        return GpuKernel(kernel, self.info, input)
245
246    def release(self):
247        if self.program is not None:
248            environment().release_program(self.info['name'])
249            self.program = None
250
251    def make_input(self, q_vectors):
252        """
253        Make q input vectors available to the model.
254
255        Note that each model needs its own q vector even if the case of
256        mixture models because some models may be OpenCL, some may be
257        ctypes and some may be pure python.
258        """
259        return GpuInput(q_vectors, dtype=self.dtype)
260
261# TODO: check that we don't need a destructor for buffers which go out of scope
262class GpuInput(object):
263    """
264    Make q data available to the gpu.
265
266    *q_vectors* is a list of q vectors, which will be *[q]* for 1-D data,
267    and *[qx, qy]* for 2-D data.  Internally, the vectors will be reallocated
268    to get the best performance on OpenCL, which may involve shifting and
269    stretching the array to better match the memory architecture.  Additional
270    points will be evaluated with *q=1e-3*.
271
272    *dtype* is the data type for the q vectors. The data type should be
273    set to match that of the kernel, which is an attribute of
274    :class:`GpuProgram`.  Note that not all kernels support double
275    precision, so even if the program was created for double precision,
276    the *GpuProgram.dtype* may be single precision.
277
278    Call :meth:`release` when complete.  Even if not called directly, the
279    buffer will be released when the data object is freed.
280    """
281    def __init__(self, q_vectors, dtype=generate.F32):
282        env = environment()
283        self.nq = q_vectors[0].size
284        self.dtype = np.dtype(dtype)
285        self.is_2D = (len(q_vectors) == 2)
286        # TODO: stretch input based on get_warp()
287        # not doing it now since warp depends on kernel, which is not known
288        # at this point, so instead using 32, which is good on the set of
289        # architectures tested so far.
290        self.q_vectors = [_stretch_input(q, self.dtype, 32) for q in q_vectors]
291        self.q_buffers = [
292            cl.Buffer(env.context,  mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=q)
293            for q in self.q_vectors
294        ]
295        self.global_size = [self.q_vectors[0].size]
296
297    def release(self):
298        for b in self.q_buffers:
299            b.release()
300        self.q_buffers = []
301
302class GpuKernel(object):
303    """
304    Callable SAS kernel.
305
306    *kernel* is the GpuKernel object to call.
307
308    *info* is the module information
309
310    *input* is the DllInput q vectors at which the kernel should be
311    evaluated.
312
313    The resulting call method takes the *pars*, a list of values for
314    the fixed parameters to the kernel, and *pd_pars*, a list of (value,weight)
315    vectors for the polydisperse parameters.  *cutoff* determines the
316    integration limits: any points with combined weight less than *cutoff*
317    will not be calculated.
318
319    Call :meth:`release` when done with the kernel instance.
320    """
321    def __init__(self, kernel, info, input):
322        self.input = input
323        self.kernel = kernel
324        self.info = info
325        self.res = np.empty(input.nq, input.dtype)
326        dim = '2d' if input.is_2D else '1d'
327        self.fixed_pars = info['partype']['fixed-'+dim]
328        self.pd_pars = info['partype']['pd-'+dim]
329
330        # Inputs and outputs for each kernel call
331        # Note: res may be shorter than res_b if global_size != nq
332        env = environment()
333        self.loops_b = [cl.Buffer(env.context, mf.READ_WRITE,
334                                  2*MAX_LOOPS*input.dtype.itemsize)
335                        for _ in env.queues]
336        self.res_b = [cl.Buffer(env.context, mf.READ_WRITE,
337                                input.global_size[0]*input.dtype.itemsize)
338                      for _ in env.queues]
339
340
341    def __call__(self, fixed_pars, pd_pars, cutoff=1e-5):
342        real = np.float32 if self.input.dtype == generate.F32 else np.float64
343
344        device_num = 0
345        queuei = environment().queues[device_num]
346        res_bi = self.res_b[device_num]
347        nq = np.uint32(self.input.nq)
348        if pd_pars:
349            cutoff = real(cutoff)
350            loops_N = [np.uint32(len(p[0])) for p in pd_pars]
351            loops = np.hstack(pd_pars) if pd_pars else np.empty(0,dtype=self.input.dtype)
352            loops = np.ascontiguousarray(loops.T, self.input.dtype).flatten()
353            #print "loops",Nloops, loops
354
355            #import sys; print >>sys.stderr,"opencl eval",pars
356            #print "opencl eval",pars
357            if len(loops) > 2*MAX_LOOPS:
358                raise ValueError("too many polydispersity points")
359
360            loops_bi = self.loops_b[device_num]
361            cl.enqueue_copy(queuei, loops_bi, loops)
362            loops_l = cl.LocalMemory(len(loops.data))
363            #ctx = environment().context
364            #loops_bi = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=loops)
365            dispersed = [loops_bi, loops_l, cutoff] + loops_N
366        else:
367            dispersed = []
368        fixed = [real(p) for p in fixed_pars]
369        args = self.input.q_buffers + [res_bi, nq] + dispersed + fixed
370        self.kernel(queuei, self.input.global_size, None, *args)
371        cl.enqueue_copy(queuei, self.res, res_bi)
372
373        return self.res
374
375    def release(self):
376        for b in self.loops_b:
377            b.release()
378        self.loops_b = []
379        for b in self.res_b:
380            b.release()
381        self.res_b = []
382
383    def __del__(self):
384        self.release()
Note: See TracBrowser for help on using the repository browser.