source: sasmodels/sasmodels/kernelcl.py @ 676351f

core_shell_microgelscostrafo411magnetic_modelrelease_v0.94release_v0.95ticket-1257-vesicle-productticket_1156ticket_1265_superballticket_822_more_unit_tests
Last change on this file since 676351f was 676351f, checked in by ajj, 9 years ago

Merge branch 'master' of https://github.com/SasView/sasmodels

  • Property mode set to 100644
File size: 14.1 KB
Line 
1"""
2GPU support through OpenCL
3
4There should be a single GPU environment running on the system.  This
5environment is constructed on the first call to :func:`env`, and the
6same environment is returned on each call.
7
8After retrieving the environment, the next step is to create the kernel.
9This is done with a call to :meth:`GpuEnvironment.make_kernel`, which
10returns the type of data used by the kernel.
11
12Next a :class:`GpuData` object should be created with the correct kind
13of data.  This data object can be used by multiple kernels, for example,
14if the target model is a weighted sum of multiple kernels.  The data
15should include any extra evaluation points required to compute the proper
16data smearing.  This need not match the square grid for 2D data if there
17is an index saying which q points are active.
18
19Together the GpuData, the program, and a device form a :class:`GpuKernel`.
20This kernel is used during fitting, receiving new sets of parameters and
21evaluating them.  The output value is stored in an output buffer on the
22devices, where it can be combined with other structure factors and form
23factors and have instrumental resolution effects applied.
24"""
25import os
26import warnings
27
28import numpy as np
29
30try:
31    import pyopencl as cl
32except ImportError,exc:
33    warnings.warn(str(exc))
34    raise RuntimeError("OpenCL not available")
35
36try:
37    context = cl.create_some_context(interactive=False)
38    del context
39except cl.RuntimeError, exc:
40    warnings.warn(str(exc))
41    raise RuntimeError("OpenCl not available")
42
43from pyopencl import mem_flags as mf
44
45from . import generate
46from .kernelpy import PyInput, PyKernel
47
48F64_DEFS = """\
49#ifdef cl_khr_fp64
50#  pragma OPENCL EXTENSION cl_khr_fp64: enable
51#endif
52"""
53
54# The max loops number is limited by the amount of local memory available
55# on the device.  You don't want to make this value too big because it will
56# waste resources, nor too small because it may interfere with users trying
57# to do their polydispersity calculations.  A value of 1024 should be much
58# larger than necessary given that cost grows as npts^k where k is the number
59# of polydisperse parameters.
60MAX_LOOPS = 2048
61
62def load_model(kernel_module, dtype="single"):
63    """
64    Load the OpenCL model defined by *kernel_module*.
65
66    Access to the OpenCL device is delayed until the kernel is called
67    so models can be defined without using too many resources.
68    """
69    source, info = generate.make(kernel_module)
70    ## for debugging, save source to a .cl file, edit it, and reload as model
71    #open(info['name']+'.cl','w').write(source)
72    #source = open(info['name']+'.cl','r').read()
73    return GpuModel(source, info, dtype)
74
75ENV = None
76def environment():
77    """
78    Returns a singleton :class:`GpuEnvironment`.
79
80    This provides an OpenCL context and one queue per device.
81    """
82    global ENV
83    if ENV is None:
84        ENV = GpuEnvironment()
85    return ENV
86
87def has_double(device):
88    """
89    Return true if device supports double precision.
90    """
91    return "cl_khr_fp64" in device.extensions
92
93def get_warp(kernel, queue):
94    """
95    Return the size of an execution batch for *kernel* running on *queue*.
96    """
97    return kernel.get_work_group_info(cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
98                                      queue.device)
99
100def _stretch_input(vector, dtype, extra=1e-3, boundary=32):
101    """
102    Stretch an input vector to the correct boundary.
103
104    Performance on the kernels can drop by a factor of two or more if the
105    number of values to compute does not fall on a nice power of two
106    boundary.   The trailing additional vector elements are given a
107    value of *extra*, and so f(*extra*) will be computed for each of
108    them.  The returned array will thus be a subset of the computed array.
109
110    *boundary* should be a power of 2 which is at least 32 for good
111    performance on current platforms (as of Jan 2015).  It should
112    probably be the max of get_warp(kernel,queue) and
113    device.min_data_type_align_size//4.
114    """
115    remainder = vector.size%boundary
116    if remainder != 0:
117        size = vector.size + (boundary - remainder)
118        vector = np.hstack((vector, [extra]*(size-vector.size)))
119    return np.ascontiguousarray(vector, dtype=dtype)
120
121
122def compile_model(context, source, dtype):
123    """
124    Build a model to run on the gpu.
125
126    Returns the compiled program and its type.  The returned type will
127    be float32 even if the desired type is float64 if any of the
128    devices in the context do not support the cl_khr_fp64 extension.
129    """
130    dtype = np.dtype(dtype)
131    if dtype==generate.F64 and not all(has_double(d) for d in context.devices):
132        raise RuntimeError("Double precision not supported for devices")
133
134    header = F64_DEFS if dtype == generate.F64 else ""
135    if dtype == generate.F32:
136        source = generate.use_single(source)
137    # Note: USE_SINCOS makes the intel cpu slower under opencl
138    if context.devices[0].type == cl.device_type.GPU:
139        header += "#define USE_SINCOS\n"
140    program  = cl.Program(context, header+source).build()
141    return program
142
143
144def make_result(self, size):
145    self.res = np.empty(size, dtype=self.dtype)
146    self.res_b = cl.Buffer(self.program.context, mf.READ_WRITE, self.res.nbytes)
147    return self.res, self.res_b
148
149
150# for now, this returns one device in the context
151# TODO: create a context that contains all devices on all platforms
152class GpuEnvironment(object):
153    """
154    GPU context, with possibly many devices, and one queue per device.
155    """
156    def __init__(self):
157        # find gpu context
158        #self.context = cl.create_some_context()
159
160        self.context = None
161        if 'PYOPENCL_CTX' in os.environ:
162            self._create_some_context()
163
164        if not self.context:
165            self.context = self._find_context()
166
167        # Byte boundary for data alignment
168        #self.data_boundary = max(d.min_data_type_align_size
169        #                         for d in self.context.devices)
170        self.queues = [cl.CommandQueue(self.context, d)
171                       for d in self.context.devices]
172        self.has_double = all(has_double(d) for d in self.context.devices)
173        self.compiled = {}
174
175    def _create_some_context(self):
176        try:
177            self.context = cl.create_some_context(interactive=False)
178        except Exception,exc:
179            warnings.warn(str(exc))
180            warnings.warn("pyopencl.create_some_context() failed")
181            warnings.warn("the environment variable 'PYOPENCL_CTX' might not be set correctly")
182
183    def _find_context(self):
184        default = None
185        for platform in cl.get_platforms():
186            for device in platform.get_devices():
187                if device.type == cl.device_type.GPU:
188                    return cl.Context([device])
189                if default is None:
190                    default = device
191
192        if not default:
193            raise RuntimeError("OpenCL device not found")
194
195        return cl.Context([default])
196
197    def compile_program(self, name, source, dtype):
198        if name not in self.compiled:
199            #print "compiling",name
200            self.compiled[name] = compile_model(self.context, source, dtype)
201        return self.compiled[name]
202
203    def release_program(self, name):
204        if name in self.compiled:
205            self.compiled[name].release()
206            del self.compiled[name]
207
208
209class GpuModel(object):
210    """
211    GPU wrapper for a single model.
212
213    *source* and *info* are the model source and interface as returned
214    from :func:`gen.make`.
215
216    *dtype* is the desired model precision.  Any numpy dtype for single
217    or double precision floats will do, such as 'f', 'float32' or 'single'
218    for single and 'd', 'float64' or 'double' for double.  Double precision
219    is an optional extension which may not be available on all devices.
220    """
221    def __init__(self, source, info, dtype=generate.F32):
222        self.info = info
223        self.source = source
224        self.dtype = dtype
225        self.program = None # delay program creation
226
227    def __getstate__(self):
228        state = self.__dict__.copy()
229        state['program'] = None
230        return state
231
232    def __setstate__(self, state):
233        self.__dict__ = state.copy()
234
235    def __call__(self, input):
236        # Support pure python kernel call
237        if input.is_2D and callable(self.info['Iqxy']):
238            return PyKernel(self.info['Iqxy'], self.info, input)
239        elif not input.is_2D and callable(self.info['Iq']):
240            return PyKernel(self.info['Iq'], self.info, input)
241
242        if self.dtype != input.dtype:
243            raise TypeError("data and kernel have different types")
244        if self.program is None:
245            self.program = environment().compile_program(self.info['name'],self.source, self.dtype)
246        kernel_name = generate.kernel_name(self.info, input.is_2D)
247        kernel = getattr(self.program, kernel_name)
248        return GpuKernel(kernel, self.info, input)
249
250    def release(self):
251        if self.program is not None:
252            environment().release_program(self.info['name'])
253            self.program = None
254
255    def make_input(self, q_vectors):
256        """
257        Make q input vectors available to the model.
258
259        Note that each model needs its own q vector even if the case of
260        mixture models because some models may be OpenCL, some may be
261        ctypes and some may be pure python.
262        """
263        # Support pure python kernel call
264        if len(q_vectors) == 1 and callable(self.info['Iq']):
265            return PyInput(q_vectors, dtype=self.dtype)
266        elif callable(self.info['Iqxy']):
267            return PyInput(q_vectors, dtype=self.dtype)
268        else:
269            return GpuInput(q_vectors, dtype=self.dtype)
270
271# TODO: check that we don't need a destructor for buffers which go out of scope
272class GpuInput(object):
273    """
274    Make q data available to the gpu.
275
276    *q_vectors* is a list of q vectors, which will be *[q]* for 1-D data,
277    and *[qx, qy]* for 2-D data.  Internally, the vectors will be reallocated
278    to get the best performance on OpenCL, which may involve shifting and
279    stretching the array to better match the memory architecture.  Additional
280    points will be evaluated with *q=1e-3*.
281
282    *dtype* is the data type for the q vectors. The data type should be
283    set to match that of the kernel, which is an attribute of
284    :class:`GpuProgram`.  Note that not all kernels support double
285    precision, so even if the program was created for double precision,
286    the *GpuProgram.dtype* may be single precision.
287
288    Call :meth:`release` when complete.  Even if not called directly, the
289    buffer will be released when the data object is freed.
290    """
291    def __init__(self, q_vectors, dtype=generate.F32):
292        env = environment()
293        self.nq = q_vectors[0].size
294        self.dtype = np.dtype(dtype)
295        self.is_2D = (len(q_vectors) == 2)
296        # TODO: stretch input based on get_warp()
297        # not doing it now since warp depends on kernel, which is not known
298        # at this point, so instead using 32, which is good on the set of
299        # architectures tested so far.
300        self.q_vectors = [_stretch_input(q, self.dtype, 32) for q in q_vectors]
301        self.q_buffers = [
302            cl.Buffer(env.context,  mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=q)
303            for q in self.q_vectors
304        ]
305        self.global_size = [self.q_vectors[0].size]
306
307    def release(self):
308        for b in self.q_buffers:
309            b.release()
310        self.q_buffers = []
311
312class GpuKernel(object):
313    """
314    Callable SAS kernel.
315
316    *kernel* is the GpuKernel object to call.
317
318    *info* is the module information
319
320    *input* is the DllInput q vectors at which the kernel should be
321    evaluated.
322
323    The resulting call method takes the *pars*, a list of values for
324    the fixed parameters to the kernel, and *pd_pars*, a list of (value,weight)
325    vectors for the polydisperse parameters.  *cutoff* determines the
326    integration limits: any points with combined weight less than *cutoff*
327    will not be calculated.
328
329    Call :meth:`release` when done with the kernel instance.
330    """
331    def __init__(self, kernel, info, input):
332        self.input = input
333        self.kernel = kernel
334        self.info = info
335        self.res = np.empty(input.nq, input.dtype)
336        dim = '2d' if input.is_2D else '1d'
337        self.fixed_pars = info['partype']['fixed-'+dim]
338        self.pd_pars = info['partype']['pd-'+dim]
339
340        # Inputs and outputs for each kernel call
341        # Note: res may be shorter than res_b if global_size != nq
342        env = environment()
343        self.loops_b = [cl.Buffer(env.context, mf.READ_WRITE,
344                                  2*MAX_LOOPS*input.dtype.itemsize)
345                        for _ in env.queues]
346        self.res_b = [cl.Buffer(env.context, mf.READ_WRITE,
347                                input.global_size[0]*input.dtype.itemsize)
348                      for _ in env.queues]
349
350
351    def __call__(self, pars, pd_pars, cutoff=1e-5):
352        real = np.float32 if self.input.dtype == generate.F32 else np.float64
353        fixed = [real(p) for p in pars]
354        cutoff = real(cutoff)
355        loops = np.hstack(pd_pars)
356        loops = np.ascontiguousarray(loops.T, self.input.dtype).flatten()
357        Nloops = [np.uint32(len(p[0])) for p in pd_pars]
358        #print "loops",Nloops, loops
359
360        #import sys; print >>sys.stderr,"opencl eval",pars
361        #print "opencl eval",pars
362        if len(loops) > 2*MAX_LOOPS:
363            raise ValueError("too many polydispersity points")
364        device_num = 0
365        res_bi = self.res_b[device_num]
366        queuei = environment().queues[device_num]
367        loops_bi = self.loops_b[device_num]
368        loops_l = cl.LocalMemory(len(loops.data))
369        cl.enqueue_copy(queuei, loops_bi, loops)
370        #ctx = environment().context
371        #loops_bi = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=loops)
372        args = self.input.q_buffers + [res_bi,loops_bi,loops_l,cutoff] + fixed + Nloops
373        self.kernel(queuei, self.input.global_size, None, *args)
374        cl.enqueue_copy(queuei, self.res, res_bi)
375
376        return self.res
377
378    def release(self):
379        for b in self.loops_b:
380            b.release()
381        self.loops_b = []
382        for b in self.res_b:
383            b.release()
384        self.res_b = []
385
386    def __del__(self):
387        self.release()
Note: See TracBrowser for help on using the repository browser.