source: sasmodels/sasmodels/kernelcl.py @ da32ec3

core_shell_microgelscostrafo411magnetic_modelrelease_v0.94release_v0.95ticket-1257-vesicle-productticket_1156ticket_1265_superballticket_822_more_unit_tests
Last change on this file since da32ec3 was da32ec3, checked in by ajj, 9 years ago

fix importerror

  • Property mode set to 100644
File size: 13.9 KB
Line 
1"""
2GPU support through OpenCL
3
4There should be a single GPU environment running on the system.  This
5environment is constructed on the first call to :func:`env`, and the
6same environment is returned on each call.
7
8After retrieving the environment, the next step is to create the kernel.
9This is done with a call to :meth:`GpuEnvironment.make_kernel`, which
10returns the type of data used by the kernel.
11
12Next a :class:`GpuData` object should be created with the correct kind
13of data.  This data object can be used by multiple kernels, for example,
14if the target model is a weighted sum of multiple kernels.  The data
15should include any extra evaluation points required to compute the proper
16data smearing.  This need not match the square grid for 2D data if there
17is an index saying which q points are active.
18
19Together the GpuData, the program, and a device form a :class:`GpuKernel`.
20This kernel is used during fitting, receiving new sets of parameters and
21evaluating them.  The output value is stored in an output buffer on the
22devices, where it can be combined with other structure factors and form
23factors and have instrumental resolution effects applied.
24"""
25import os
26import warnings
27
28import numpy as np
29
30try:
31    import pyopencl as cl
32except ImportError,exc:
33    warnings.warn(str(exc))
34    raise RuntimeError("OpenCL not available")
35
36from pyopencl import mem_flags as mf
37
38from . import generate
39from .kernelpy import PyInput, PyKernel
40
41F64_DEFS = """\
42#ifdef cl_khr_fp64
43#  pragma OPENCL EXTENSION cl_khr_fp64: enable
44#endif
45"""
46
47# The max loops number is limited by the amount of local memory available
48# on the device.  You don't want to make this value too big because it will
49# waste resources, nor too small because it may interfere with users trying
50# to do their polydispersity calculations.  A value of 1024 should be much
51# larger than necessary given that cost grows as npts^k where k is the number
52# of polydisperse parameters.
53MAX_LOOPS = 2048
54
55def load_model(kernel_module, dtype="single"):
56    """
57    Load the OpenCL model defined by *kernel_module*.
58
59    Access to the OpenCL device is delayed until the kernel is called
60    so models can be defined without using too many resources.
61    """
62    source, info = generate.make(kernel_module)
63    ## for debugging, save source to a .cl file, edit it, and reload as model
64    #open(info['name']+'.cl','w').write(source)
65    #source = open(info['name']+'.cl','r').read()
66    return GpuModel(source, info, dtype)
67
68ENV = None
69def environment():
70    """
71    Returns a singleton :class:`GpuEnvironment`.
72
73    This provides an OpenCL context and one queue per device.
74    """
75    global ENV
76    if ENV is None:
77        ENV = GpuEnvironment()
78    return ENV
79
80def has_double(device):
81    """
82    Return true if device supports double precision.
83    """
84    return "cl_khr_fp64" in device.extensions
85
86def get_warp(kernel, queue):
87    """
88    Return the size of an execution batch for *kernel* running on *queue*.
89    """
90    return kernel.get_work_group_info(cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
91                                      queue.device)
92
93def _stretch_input(vector, dtype, extra=1e-3, boundary=32):
94    """
95    Stretch an input vector to the correct boundary.
96
97    Performance on the kernels can drop by a factor of two or more if the
98    number of values to compute does not fall on a nice power of two
99    boundary.   The trailing additional vector elements are given a
100    value of *extra*, and so f(*extra*) will be computed for each of
101    them.  The returned array will thus be a subset of the computed array.
102
103    *boundary* should be a power of 2 which is at least 32 for good
104    performance on current platforms (as of Jan 2015).  It should
105    probably be the max of get_warp(kernel,queue) and
106    device.min_data_type_align_size//4.
107    """
108    remainder = vector.size%boundary
109    if remainder != 0:
110        size = vector.size + (boundary - remainder)
111        vector = np.hstack((vector, [extra]*(size-vector.size)))
112    return np.ascontiguousarray(vector, dtype=dtype)
113
114
115def compile_model(context, source, dtype):
116    """
117    Build a model to run on the gpu.
118
119    Returns the compiled program and its type.  The returned type will
120    be float32 even if the desired type is float64 if any of the
121    devices in the context do not support the cl_khr_fp64 extension.
122    """
123    dtype = np.dtype(dtype)
124    if dtype==generate.F64 and not all(has_double(d) for d in context.devices):
125        raise RuntimeError("Double precision not supported for devices")
126
127    header = F64_DEFS if dtype == generate.F64 else ""
128    if dtype == generate.F32:
129        source = generate.use_single(source)
130    # Note: USE_SINCOS makes the intel cpu slower under opencl
131    if context.devices[0].type == cl.device_type.GPU:
132        header += "#define USE_SINCOS\n"
133    program  = cl.Program(context, header+source).build()
134    return program
135
136
137def make_result(self, size):
138    self.res = np.empty(size, dtype=self.dtype)
139    self.res_b = cl.Buffer(self.program.context, mf.READ_WRITE, self.res.nbytes)
140    return self.res, self.res_b
141
142
143# for now, this returns one device in the context
144# TODO: create a context that contains all devices on all platforms
145class GpuEnvironment(object):
146    """
147    GPU context, with possibly many devices, and one queue per device.
148    """
149    def __init__(self):
150        # find gpu context
151        #self.context = cl.create_some_context()
152
153        self.context = None
154        if 'PYOPENCL_CTX' in os.environ:
155            self._create_some_context()
156
157        if not self.context:
158            self.context = self._find_context()
159
160        # Byte boundary for data alignment
161        #self.data_boundary = max(d.min_data_type_align_size
162        #                         for d in self.context.devices)
163        self.queues = [cl.CommandQueue(self.context, d)
164                       for d in self.context.devices]
165        self.has_double = all(has_double(d) for d in self.context.devices)
166        self.compiled = {}
167
168    def _create_some_context(self):
169        try:
170            self.context = cl.create_some_context(interactive=False)
171        except Exception,exc:
172            warnings.warn(str(exc))
173            warnings.warn("pyopencl.create_some_context() failed")
174            warnings.warn("the environment variable 'PYOPENCL_CTX' might not be set correctly")
175
176    def _find_context(self):
177        default = None
178        for platform in cl.get_platforms():
179            for device in platform.get_devices():
180                if device.type == cl.device_type.GPU:
181                    return cl.Context([device])
182                if default is None:
183                    default = device
184
185        if not default:
186            raise RuntimeError("OpenCL device not found")
187
188        return cl.Context([default])
189
190    def compile_program(self, name, source, dtype):
191        if name not in self.compiled:
192            #print "compiling",name
193            self.compiled[name] = compile_model(self.context, source, dtype)
194        return self.compiled[name]
195
196    def release_program(self, name):
197        if name in self.compiled:
198            self.compiled[name].release()
199            del self.compiled[name]
200
201
202class GpuModel(object):
203    """
204    GPU wrapper for a single model.
205
206    *source* and *info* are the model source and interface as returned
207    from :func:`gen.make`.
208
209    *dtype* is the desired model precision.  Any numpy dtype for single
210    or double precision floats will do, such as 'f', 'float32' or 'single'
211    for single and 'd', 'float64' or 'double' for double.  Double precision
212    is an optional extension which may not be available on all devices.
213    """
214    def __init__(self, source, info, dtype=generate.F32):
215        self.info = info
216        self.source = source
217        self.dtype = dtype
218        self.program = None # delay program creation
219
220    def __getstate__(self):
221        state = self.__dict__.copy()
222        state['program'] = None
223        return state
224
225    def __setstate__(self, state):
226        self.__dict__ = state.copy()
227
228    def __call__(self, input):
229        # Support pure python kernel call
230        if input.is_2D and callable(self.info['Iqxy']):
231            return PyKernel(self.info['Iqxy'], self.info, input)
232        elif not input.is_2D and callable(self.info['Iq']):
233            return PyKernel(self.info['Iq'], self.info, input)
234
235        if self.dtype != input.dtype:
236            raise TypeError("data and kernel have different types")
237        if self.program is None:
238            self.program = environment().compile_program(self.info['name'],self.source, self.dtype)
239        kernel_name = generate.kernel_name(self.info, input.is_2D)
240        kernel = getattr(self.program, kernel_name)
241        return GpuKernel(kernel, self.info, input)
242
243    def release(self):
244        if self.program is not None:
245            environment().release_program(self.info['name'])
246            self.program = None
247
248    def make_input(self, q_vectors):
249        """
250        Make q input vectors available to the model.
251
252        Note that each model needs its own q vector even if the case of
253        mixture models because some models may be OpenCL, some may be
254        ctypes and some may be pure python.
255        """
256        # Support pure python kernel call
257        if len(q_vectors) == 1 and callable(self.info['Iq']):
258            return PyInput(q_vectors, dtype=self.dtype)
259        elif callable(self.info['Iqxy']):
260            return PyInput(q_vectors, dtype=self.dtype)
261        else:
262            return GpuInput(q_vectors, dtype=self.dtype)
263
264# TODO: check that we don't need a destructor for buffers which go out of scope
265class GpuInput(object):
266    """
267    Make q data available to the gpu.
268
269    *q_vectors* is a list of q vectors, which will be *[q]* for 1-D data,
270    and *[qx, qy]* for 2-D data.  Internally, the vectors will be reallocated
271    to get the best performance on OpenCL, which may involve shifting and
272    stretching the array to better match the memory architecture.  Additional
273    points will be evaluated with *q=1e-3*.
274
275    *dtype* is the data type for the q vectors. The data type should be
276    set to match that of the kernel, which is an attribute of
277    :class:`GpuProgram`.  Note that not all kernels support double
278    precision, so even if the program was created for double precision,
279    the *GpuProgram.dtype* may be single precision.
280
281    Call :meth:`release` when complete.  Even if not called directly, the
282    buffer will be released when the data object is freed.
283    """
284    def __init__(self, q_vectors, dtype=generate.F32):
285        env = environment()
286        self.nq = q_vectors[0].size
287        self.dtype = np.dtype(dtype)
288        self.is_2D = (len(q_vectors) == 2)
289        # TODO: stretch input based on get_warp()
290        # not doing it now since warp depends on kernel, which is not known
291        # at this point, so instead using 32, which is good on the set of
292        # architectures tested so far.
293        self.q_vectors = [_stretch_input(q, self.dtype, 32) for q in q_vectors]
294        self.q_buffers = [
295            cl.Buffer(env.context,  mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=q)
296            for q in self.q_vectors
297        ]
298        self.global_size = [self.q_vectors[0].size]
299
300    def release(self):
301        for b in self.q_buffers:
302            b.release()
303        self.q_buffers = []
304
305class GpuKernel(object):
306    """
307    Callable SAS kernel.
308
309    *kernel* is the GpuKernel object to call.
310
311    *info* is the module information
312
313    *input* is the DllInput q vectors at which the kernel should be
314    evaluated.
315
316    The resulting call method takes the *pars*, a list of values for
317    the fixed parameters to the kernel, and *pd_pars*, a list of (value,weight)
318    vectors for the polydisperse parameters.  *cutoff* determines the
319    integration limits: any points with combined weight less than *cutoff*
320    will not be calculated.
321
322    Call :meth:`release` when done with the kernel instance.
323    """
324    def __init__(self, kernel, info, input):
325        self.input = input
326        self.kernel = kernel
327        self.info = info
328        self.res = np.empty(input.nq, input.dtype)
329        dim = '2d' if input.is_2D else '1d'
330        self.fixed_pars = info['partype']['fixed-'+dim]
331        self.pd_pars = info['partype']['pd-'+dim]
332
333        # Inputs and outputs for each kernel call
334        # Note: res may be shorter than res_b if global_size != nq
335        env = environment()
336        self.loops_b = [cl.Buffer(env.context, mf.READ_WRITE,
337                                  2*MAX_LOOPS*input.dtype.itemsize)
338                        for _ in env.queues]
339        self.res_b = [cl.Buffer(env.context, mf.READ_WRITE,
340                                input.global_size[0]*input.dtype.itemsize)
341                      for _ in env.queues]
342
343
344    def __call__(self, pars, pd_pars, cutoff=1e-5):
345        real = np.float32 if self.input.dtype == generate.F32 else np.float64
346        fixed = [real(p) for p in pars]
347        cutoff = real(cutoff)
348        loops = np.hstack(pd_pars)
349        loops = np.ascontiguousarray(loops.T, self.input.dtype).flatten()
350        Nloops = [np.uint32(len(p[0])) for p in pd_pars]
351        #print "loops",Nloops, loops
352
353        #import sys; print >>sys.stderr,"opencl eval",pars
354        #print "opencl eval",pars
355        if len(loops) > 2*MAX_LOOPS:
356            raise ValueError("too many polydispersity points")
357        device_num = 0
358        res_bi = self.res_b[device_num]
359        queuei = environment().queues[device_num]
360        loops_bi = self.loops_b[device_num]
361        loops_l = cl.LocalMemory(len(loops.data))
362        cl.enqueue_copy(queuei, loops_bi, loops)
363        #ctx = environment().context
364        #loops_bi = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=loops)
365        args = self.input.q_buffers + [res_bi,loops_bi,loops_l,cutoff] + fixed + Nloops
366        self.kernel(queuei, self.input.global_size, None, *args)
367        cl.enqueue_copy(queuei, self.res, res_bi)
368
369        return self.res
370
371    def release(self):
372        for b in self.loops_b:
373            b.release()
374        self.loops_b = []
375        for b in self.res_b:
376            b.release()
377        self.res_b = []
378
379    def __del__(self):
380        self.release()
Note: See TracBrowser for help on using the repository browser.