source: sasmodels/sasmodels/gpu.py @ b3f6bc3

core_shell_microgelscostrafo411magnetic_modelrelease_v0.94release_v0.95ticket-1257-vesicle-productticket_1156ticket_1265_superballticket_822_more_unit_tests
Last change on this file since b3f6bc3 was b3f6bc3, checked in by pkienzle, 9 years ago

support pure python model Iq/Iqxy?

  • Property mode set to 100644
File size: 12.8 KB
Line 
1"""
2GPU support through OpenCL
3
4There should be a single GPU environment running on the system.  This
5environment is constructed on the first call to :func:`env`, and the
6same environment is returned on each call.
7
8After retrieving the environment, the next step is to create the kernel.
9This is done with a call to :meth:`GpuEnvironment.make_kernel`, which
10returns the type of data used by the kernel.
11
12Next a :class:`GpuData` object should be created with the correct kind
13of data.  This data object can be used by multiple kernels, for example,
14if the target model is a weighted sum of multiple kernels.  The data
15should include any extra evaluation points required to compute the proper
16data smearing.  This need not match the square grid for 2D data if there
17is an index saying which q points are active.
18
19Together the GpuData, the program, and a device form a :class:`GpuKernel`.
20This kernel is used during fitting, receiving new sets of parameters and
21evaluating them.  The output value is stored in an output buffer on the
22devices, where it can be combined with other structure factors and form
23factors and have instrumental resolution effects applied.
24"""
25import numpy as np
26
27import pyopencl as cl
28from pyopencl import mem_flags as mf
29
30from . import generate
31from .pykernel import PyInput, PyKernel
32
33F64_DEFS = """\
34#ifdef cl_khr_fp64
35#  pragma OPENCL EXTENSION cl_khr_fp64: enable
36#endif
37"""
38
39# The max loops number is limited by the amount of local memory available
40# on the device.  You don't want to make this value too big because it will
41# waste resources, nor too small because it may interfere with users trying
42# to do their polydispersity calculations.  A value of 1024 should be much
43# larger than necessary given that cost grows as npts^k where k is the number
44# of polydisperse parameters.
45MAX_LOOPS = 2048
46
47def load_model(kernel_module, dtype="single"):
48    """
49    Load the OpenCL model defined by *kernel_module*.
50
51    Access to the OpenCL device is delayed until the kernel is called
52    so models can be defined without using too many resources.
53    """
54    source, info = generate.make(kernel_module)
55    ## for debugging, save source to a .cl file, edit it, and reload as model
56    #open(info['name']+'.cl','w').write(source)
57    #source = open(info['name']+'.cl','r').read()
58    return GpuModel(source, info, dtype)
59
60ENV = None
61def environment():
62    """
63    Returns a singleton :class:`GpuEnvironment`.
64
65    This provides an OpenCL context and one queue per device.
66    """
67    global ENV
68    if ENV is None:
69        ENV = GpuEnvironment()
70    return ENV
71
72def has_double(device):
73    """
74    Return true if device supports double precision.
75    """
76    return "cl_khr_fp64" in device.extensions
77
78def get_warp(kernel, queue):
79    """
80    Return the size of an execution batch for *kernel* running on *queue*.
81    """
82    return kernel.get_work_group_info(cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
83                                      queue.device)
84
85def _stretch_input(vector, dtype, extra=1e-3, boundary=32):
86    """
87    Stretch an input vector to the correct boundary.
88
89    Performance on the kernels can drop by a factor of two or more if the
90    number of values to compute does not fall on a nice power of two
91    boundary.   The trailing additional vector elements are given a
92    value of *extra*, and so f(*extra*) will be computed for each of
93    them.  The returned array will thus be a subset of the computed array.
94
95    *boundary* should be a power of 2 which is at least 32 for good
96    performance on current platforms (as of Jan 2015).  It should
97    probably be the max of get_warp(kernel,queue) and
98    device.min_data_type_align_size//4.
99    """
100    remainder = vector.size%boundary
101    if remainder != 0:
102        size = vector.size + (boundary - remainder)
103        vector = np.hstack((vector, [extra]*(size-vector.size)))
104    return np.ascontiguousarray(vector, dtype=dtype)
105
106
107def compile_model(context, source, dtype):
108    """
109    Build a model to run on the gpu.
110
111    Returns the compiled program and its type.  The returned type will
112    be float32 even if the desired type is float64 if any of the
113    devices in the context do not support the cl_khr_fp64 extension.
114    """
115    dtype = np.dtype(dtype)
116    if dtype==generate.F64 and not all(has_double(d) for d in context.devices):
117        raise RuntimeError("Double precision not supported for devices")
118
119    header = F64_DEFS if dtype == generate.F64 else ""
120    if dtype == generate.F32:
121        source = generate.use_single(source)
122    # Note: USE_SINCOS makes the intel cpu slower under opencl
123    if context.devices[0].type == cl.device_type.GPU:
124        header += "#define USE_SINCOS\n"
125    program  = cl.Program(context, header+source).build()
126    return program
127
128
129def make_result(self, size):
130    self.res = np.empty(size, dtype=self.dtype)
131    self.res_b = cl.Buffer(self.program.context, mf.READ_WRITE, self.res.nbytes)
132    return self.res, self.res_b
133
134
135# for now, this returns one device in the context
136# TODO: create a context that contains all devices on all platforms
137class GpuEnvironment(object):
138    """
139    GPU context, with possibly many devices, and one queue per device.
140    """
141    def __init__(self):
142        self.context = cl.create_some_context()
143        self.queues = [cl.CommandQueue(self.context, d)
144                       for d in self.context.devices]
145        # Byte boundary for data alignment
146        #self.data_boundary = max(d.min_data_type_align_size
147        #                         for d in self.context.devices)
148        self.has_double = all(has_double(d) for d in self.context.devices)
149        self.compiled = {}
150
151    def compile_program(self, name, source, dtype):
152        if name not in self.compiled:
153            #print "compiling",name
154            self.compiled[name] = compile_model(self.context, source, dtype)
155        return self.compiled[name]
156
157    def release_program(self, name):
158        if name in self.compiled:
159            self.compiled[name].release()
160            del self.compiled[name]
161
162class GpuModel(object):
163    """
164    GPU wrapper for a single model.
165
166    *source* and *info* are the model source and interface as returned
167    from :func:`gen.make`.
168
169    *dtype* is the desired model precision.  Any numpy dtype for single
170    or double precision floats will do, such as 'f', 'float32' or 'single'
171    for single and 'd', 'float64' or 'double' for double.  Double precision
172    is an optional extension which may not be available on all devices.
173    """
174    def __init__(self, source, info, dtype=generate.F32):
175        self.info = info
176        self.source = source
177        self.dtype = dtype
178        self.program = None # delay program creation
179
180    def __getstate__(self):
181        state = self.__dict__.copy()
182        state['program'] = None
183        return state
184
185    def __setstate__(self, state):
186        self.__dict__ = state.copy()
187
188    def __call__(self, input):
189        # Support pure python kernel call
190        if input.is_2D and callable(self.info['Iqxy']):
191            return PyKernel(self.info['Iqxy'], self.info, input)
192        elif not input.is_2D and callable(self.info['Iq']):
193            return PyKernel(self.info['Iq'], self.info, input)
194
195        if self.dtype != input.dtype:
196            raise TypeError("data and kernel have different types")
197        if self.program is None:
198            self.program = environment().compile_program(self.info['name'],self.source, self.dtype)
199        kernel_name = generate.kernel_name(self.info, input.is_2D)
200        kernel = getattr(self.program, kernel_name)
201        return GpuKernel(kernel, self.info, input)
202
203    def release(self):
204        if self.program is not None:
205            environment().release_program(self.info['name'])
206            self.program = None
207
208    def make_input(self, q_vectors):
209        """
210        Make q input vectors available to the model.
211
212        Note that each model needs its own q vector even if the case of
213        mixture models because some models may be OpenCL, some may be
214        ctypes and some may be pure python.
215        """
216        # Support pure python kernel call
217        if len(q_vectors) == 1 and callable(self.info['Iq']):
218            return PyInput(q_vectors, dtype=self.dtype)
219        elif callable(self.info['Iqxy']):
220            return PyInput(q_vectors, dtype=self.dtype)
221        else:
222            return GpuInput(q_vectors, dtype=self.dtype)
223
224# TODO: check that we don't need a destructor for buffers which go out of scope
225class GpuInput(object):
226    """
227    Make q data available to the gpu.
228
229    *q_vectors* is a list of q vectors, which will be *[q]* for 1-D data,
230    and *[qx, qy]* for 2-D data.  Internally, the vectors will be reallocated
231    to get the best performance on OpenCL, which may involve shifting and
232    stretching the array to better match the memory architecture.  Additional
233    points will be evaluated with *q=1e-3*.
234
235    *dtype* is the data type for the q vectors. The data type should be
236    set to match that of the kernel, which is an attribute of
237    :class:`GpuProgram`.  Note that not all kernels support double
238    precision, so even if the program was created for double precision,
239    the *GpuProgram.dtype* may be single precision.
240
241    Call :meth:`release` when complete.  Even if not called directly, the
242    buffer will be released when the data object is freed.
243    """
244    def __init__(self, q_vectors, dtype=generate.F32):
245        env = environment()
246        self.nq = q_vectors[0].size
247        self.dtype = np.dtype(dtype)
248        self.is_2D = (len(q_vectors) == 2)
249        # TODO: stretch input based on get_warp()
250        # not doing it now since warp depends on kernel, which is not known
251        # at this point, so instead using 32, which is good on the set of
252        # architectures tested so far.
253        self.q_vectors = [_stretch_input(q, self.dtype, 32) for q in q_vectors]
254        self.q_buffers = [
255            cl.Buffer(env.context,  mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=q)
256            for q in self.q_vectors
257        ]
258        self.global_size = [self.q_vectors[0].size]
259
260    def release(self):
261        for b in self.q_buffers:
262            b.release()
263        self.q_buffers = []
264
265class GpuKernel(object):
266    """
267    Callable SAS kernel.
268
269    *kernel* is the GpuKernel object to call.
270
271    *info* is the module information
272
273    *input* is the DllInput q vectors at which the kernel should be
274    evaluated.
275
276    The resulting call method takes the *pars*, a list of values for
277    the fixed parameters to the kernel, and *pd_pars*, a list of (value,weight)
278    vectors for the polydisperse parameters.  *cutoff* determines the
279    integration limits: any points with combined weight less than *cutoff*
280    will not be calculated.
281
282    Call :meth:`release` when done with the kernel instance.
283    """
284    def __init__(self, kernel, info, input):
285        self.input = input
286        self.kernel = kernel
287        self.info = info
288        self.res = np.empty(input.nq, input.dtype)
289        dim = '2d' if input.is_2D else '1d'
290        self.fixed_pars = info['partype']['fixed-'+dim]
291        self.pd_pars = info['partype']['pd-'+dim]
292
293        # Inputs and outputs for each kernel call
294        # Note: res may be shorter than res_b if global_size != nq
295        env = environment()
296        self.loops_b = [cl.Buffer(env.context, mf.READ_WRITE,
297                                  2*MAX_LOOPS*input.dtype.itemsize)
298                        for _ in env.queues]
299        self.res_b = [cl.Buffer(env.context, mf.READ_WRITE,
300                                input.global_size[0]*input.dtype.itemsize)
301                      for _ in env.queues]
302
303
304    def __call__(self, pars, pd_pars, cutoff=1e-5):
305        real = np.float32 if self.input.dtype == generate.F32 else np.float64
306        fixed = [real(p) for p in pars]
307        cutoff = real(cutoff)
308        loops = np.hstack(pd_pars)
309        loops = np.ascontiguousarray(loops.T, self.input.dtype).flatten()
310        Nloops = [np.uint32(len(p[0])) for p in pd_pars]
311        #print "loops",Nloops, loops
312
313        #import sys; print >>sys.stderr,"opencl eval",pars
314        #print "opencl eval",pars
315        if len(loops) > 2*MAX_LOOPS:
316            raise ValueError("too many polydispersity points")
317        device_num = 0
318        res_bi = self.res_b[device_num]
319        queuei = environment().queues[device_num]
320        loops_bi = self.loops_b[device_num]
321        loops_l = cl.LocalMemory(len(loops.data))
322        cl.enqueue_copy(queuei, loops_bi, loops)
323        #ctx = environment().context
324        #loops_bi = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=loops)
325        args = self.input.q_buffers + [res_bi,loops_bi,loops_l,cutoff] + fixed + Nloops
326        self.kernel(queuei, self.input.global_size, None, *args)
327        cl.enqueue_copy(queuei, self.res, res_bi)
328
329        return self.res
330
331    def release(self):
332        for b in self.loops_b:
333            b.release()
334        self.loops_b = []
335        for b in self.res_b:
336            b.release()
337        self.res_b = []
338
339    def __del__(self):
340        self.release()
Note: See TracBrowser for help on using the repository browser.