source: sasmodels/sasmodels/gpu.py @ 5d4777d

Last change on this file since 5d4777d was 5d4777d, checked in by Paul Kienzle <pkienzle@…>, 10 years ago

reorganize, check and update models

"""
GPU support through OpenCL

There should be a single GPU environment running on the system.  This
environment is constructed on the first call to :func:`environment`, and
the same environment is returned on each call.

After retrieving the environment, the next step is to create the model.
This is done with a call to :func:`load_model`, which returns a
:class:`GpuModel` of the requested precision.

Next a :class:`GpuInput` object should be created with the correct kind
of data, using :meth:`GpuModel.make_input`.  This data object can be used
by multiple kernels, for example, if the target model is a weighted sum
of multiple kernels.  The data should include any extra evaluation points
required to compute the proper data smearing.  This need not match the
square grid for 2D data if there is an index saying which q points are
active.

Together the input data, the program, and a device form a
:class:`GpuKernel`.  This kernel is used during fitting, receiving new
sets of parameters and evaluating them.  The output value is stored in an
output buffer on the device, where it can be combined with other
structure factors and form factors and have instrumental resolution
effects applied.

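A minimal sketch of that flow (*kernel_module*, *q*, *fixed_pars* and
*pd_pars* are illustrative placeholders, not names defined here)::

    from sasmodels import gpu

    model = gpu.load_model(kernel_module, dtype='single')
    data = model.make_input([q])        # q is a 1-D numpy array
    kernel = model(data)                # compile and bind to the q data
    Iq = kernel(fixed_pars, pd_pars)    # evaluate at the q points
    kernel.release()
    data.release()
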
"""
import numpy as np
import pyopencl as cl
from pyopencl import mem_flags as mf

from . import gen

F32_DEFS = """\
#define REAL(x) (x##f)
#define real float
"""

F64_DEFS = """\
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#define REAL(x) (x)
#define real double
"""
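
# With these definitions a kernel literal written as REAL(1.5) expands to
# 1.5f under F32_DEFS and to 1.5 under F64_DEFS, so the same kernel source
# can be compiled for either precision.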

# The maximum number of loop values is limited by the amount of local memory
# available on the device.  You don't want to make this value too big because
# it will waste resources, nor too small because it may interfere with users
# trying to do their polydispersity calculations.  A value of 2048 should be
# much larger than necessary given that cost grows as npts^k where k is the
# number of polydisperse parameters.
MAX_LOOPS = 2048
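# For example (illustrative numbers): k=3 polydisperse parameters sampled at
# npts=10 points each store 3*10 (value, weight) pairs, or 60 floats, well
# under the 2*MAX_LOOPS limit, even though the kernel itself performs
# 10^3 = 1000 weighted evaluations per q point.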

def load_model(kernel_module, dtype="single"):
    """
    Load the OpenCL model defined by *kernel_module*.

    Access to the OpenCL device is delayed until the kernel is called
    so models can be defined without using too many resources.
    """
    source, info = gen.make(kernel_module)
    ## for debugging, save source to a .cl file, edit it, and reload as model
    open(info['name']+'.cl', 'w').write(source)
    #source = open(info['name']+'.cl', 'r').read()
    return GpuModel(source, info, dtype)

ENV = None
def environment():
    """
    Returns a singleton :class:`GpuEnvironment`.

    This provides an OpenCL context and one queue per device.
    """
    global ENV
    if ENV is None:
        ENV = GpuEnvironment()
    return ENV

def has_double(device):
    """
    Return true if device supports double precision.
    """
    return "cl_khr_fp64" in device.extensions


def _stretch_input(vector, dtype, extra=1e-3, boundary=128):
    """
    Stretch an input vector to the correct boundary.

    Performance on the kernels can drop by a factor of two or more if the
    number of values to compute does not fall on a nice power of two
    boundary.  A good choice for the boundary value is the
    min_data_type_align_size property of the OpenCL device.  The usual
    value of 128 gives a working size as a multiple of 32.  The trailing
    additional vector elements are given a value of *extra*, and so
    f(*extra*) will be computed for each of them.  The returned array
    will thus be a subset of the computed array.
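
    For example (numbers are illustrative): with a float32 *dtype* (4-byte
    items) and the default *boundary* of 128 bytes, vectors are padded to a
    multiple of 32 elements, so a 100-point q vector comes back with 128
    entries, the last 28 of which hold *extra*::

        padded = _stretch_input(np.linspace(0.001, 0.5, 100),
                                np.dtype('float32'))
        assert padded.size == 128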
    """
    # Convert the boundary from bytes to a number of elements of this dtype.
    boundary = boundary // dtype.itemsize
    remainder = vector.size % boundary
    size = vector.size + (boundary - remainder if remainder != 0 else 0)
    if size != vector.size:
        vector = np.hstack((vector, [extra]*(size - vector.size)))
    return np.ascontiguousarray(vector, dtype=dtype)


def compile_model(context, source, dtype):
    """
    Build a model to run on the gpu.

    Returns the compiled program.  Raises RuntimeError if float64 is
    requested but any device in the context does not support the
    cl_khr_fp64 extension.
    """
    dtype = np.dtype(dtype)
    if dtype == gen.F64 and not all(has_double(d) for d in context.devices):
        raise RuntimeError("Double precision not supported for devices")

    header = F64_DEFS if dtype == gen.F64 else F32_DEFS
    # Note: USE_SINCOS makes the intel cpu slower under opencl
    if context.devices[0].type == cl.device_type.GPU:
        header += "#define USE_SINCOS\n"
    program = cl.Program(context, header+source).build()
    return program


def make_result(self, size):
    """
    Allocate a host result array of *size* elements and a matching
    read/write buffer on the device, storing both on *self*.
    """
    self.res = np.empty(size, dtype=self.dtype)
    self.res_b = cl.Buffer(self.program.context, mf.READ_WRITE, self.res.nbytes)
    return self.res, self.res_b


# for now, this returns one device in the context
# TODO: create a context that contains all devices on all platforms
class GpuEnvironment(object):
    """
    GPU context, with possibly many devices, and one queue per device.
    """
    def __init__(self):
        self.context = cl.create_some_context()
        self.queues = [cl.CommandQueue(self.context, d)
                       for d in self.context.devices]
        self.boundary = max(d.min_data_type_align_size
                            for d in self.context.devices)
        self.has_double = all(has_double(d) for d in self.context.devices)
        self.compiled = {}

    def compile_program(self, name, source, dtype):
        if name not in self.compiled:
            #print "compiling",name
            self.compiled[name] = compile_model(self.context, source, dtype)
        return self.compiled[name]

    def release_program(self, name):
        if name in self.compiled:
            self.compiled[name].release()
            del self.compiled[name]

class GpuModel(object):
    """
    GPU wrapper for a single model.

    *source* and *info* are the model source and interface as returned
    from :func:`gen.make`.

    *dtype* is the desired model precision.  Any numpy dtype for single
    or double precision floats will do, such as 'f', 'float32' or 'single'
    for single and 'd', 'float64' or 'double' for double.  Double precision
    is an optional extension which may not be available on all devices.
    """
    def __init__(self, source, info, dtype=gen.F32):
        self.info = info
        self.source = source
        self.dtype = dtype
        self.program = None # delay program creation

    def __getstate__(self):
        state = self.__dict__.copy()
        state['program'] = None
        return state

    def __setstate__(self, state):
        self.__dict__ = state.copy()

    def __call__(self, input):
        if self.dtype != input.dtype:
            raise TypeError("data and kernel have different types")
        if self.program is None:
            self.program = environment().compile_program(
                self.info['name'], self.source, self.dtype)
        kernel_name = gen.kernel_name(self.info, input.is_2D)
        kernel = getattr(self.program, kernel_name)
        return GpuKernel(kernel, self.info, input)

    def release(self):
        if self.program is not None:
            environment().release_program(self.info['name'])
            self.program = None

    def make_input(self, q_vectors):
        """
        Make q input vectors available to the model.

        This only needs to be done once for all models that operate on the
        same input.  So for example, if you are adding two different models
        together to compare to a data set, then only one model needs to
        call make_input, so long as the models have the same dtype.
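
        A sketch of such sharing (the model and array names are
        illustrative)::

            data = model_a.make_input([qx, qy])
            kernel_a = model_a(data)
            kernel_b = model_b(data)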
        """
        # Note: the weird interface, where make_input doesn't care which
        # model calls it, allows us to ask the model to define the data
        # and the caller does not need to know if it is opencl or ctypes.
        # The returned data object is opaque.
        return GpuInput(q_vectors, dtype=self.dtype)

# TODO: check that we don't need a destructor for buffers which go out of scope
class GpuInput(object):
    """
    Make q data available to the gpu.

    *q_vectors* is a list of q vectors, which will be *[q]* for 1-D data,
    and *[qx, qy]* for 2-D data.  Internally, the vectors will be reallocated
    to get the best performance on OpenCL, which may involve shifting and
    stretching the array to better match the memory architecture.  Additional
    points will be evaluated with *q=1e-3*.

    *dtype* is the data type for the q vectors.  The data type should be
    set to match that of the kernel, which is an attribute of
    :class:`GpuModel`.  Note that not all devices support double
    precision, so even if the model was created for double precision,
    the *GpuModel.dtype* may be single precision.

    Call :meth:`release` when complete.  Even if not called directly, the
    buffer will be released when the data object is freed.
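
    A sketch of direct construction (normally :meth:`GpuModel.make_input`
    builds this for you; the q arrays here are illustrative)::

        data_1d = GpuInput([q], dtype=gen.F32)
        data_2d = GpuInput([qx, qy], dtype=gen.F32)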
    """
    def __init__(self, q_vectors, dtype=gen.F32):
        env = environment()
        self.nq = q_vectors[0].size
        self.dtype = np.dtype(dtype)
        self.is_2D = (len(q_vectors) == 2)
        self.q_vectors = [
            _stretch_input(q, self.dtype, boundary=env.boundary)
            for q in q_vectors
        ]
        self.q_buffers = [
            cl.Buffer(env.context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=q)
            for q in self.q_vectors
        ]
        self.global_size = [self.q_vectors[0].size]

    def release(self):
        for b in self.q_buffers:
            b.release()
        self.q_buffers = []

class GpuKernel(object):
    """
    Callable SAS kernel.

    *kernel* is the compiled OpenCL kernel to call.

    *info* is the module information.

    *input* is the :class:`GpuInput` q vectors at which the kernel should
    be evaluated.

    The resulting call method takes the *pars*, a list of values for
    the fixed parameters to the kernel, and *pd_pars*, a list of (value, weight)
    vectors for the polydisperse parameters.  *cutoff* determines the
    integration limits: any points with combined weight less than *cutoff*
    will not be calculated.

    Call :meth:`release` when done with the kernel instance.
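
    A sketch of a call (parameter names and values are illustrative; the
    fixed-parameter order comes from the model's *info['partype']*)::

        kernel = model(data)                      # model is a GpuModel
        Iq = kernel([scale, background],          # fixed parameter values
                    [(radius_vals, radius_wts)],  # (value, weight) vectors
                    cutoff=1e-5)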
    """
    def __init__(self, kernel, info, input):
        self.input = input
        self.kernel = kernel
        self.info = info
        self.res = np.empty(input.nq, input.dtype)
        dim = '2d' if input.is_2D else '1d'
        self.fixed_pars = info['partype']['fixed-'+dim]
        self.pd_pars = info['partype']['pd-'+dim]

        # Inputs and outputs for each kernel call
        # Note: res may be shorter than res_b if global_size != nq
        env = environment()
        self.loops_b = [cl.Buffer(env.context, mf.READ_WRITE,
                                  2*MAX_LOOPS*input.dtype.itemsize)
                        for _ in env.queues]
        self.res_b = [cl.Buffer(env.context, mf.READ_WRITE,
                                input.global_size[0]*input.dtype.itemsize)
                      for _ in env.queues]


    def __call__(self, pars, pd_pars, cutoff=1e-5):
        real = np.float32 if self.input.dtype == gen.F32 else np.float64
        fixed = [real(p) for p in pars]
        cutoff = real(cutoff)
        loops = np.hstack(pd_pars)
        loops = np.ascontiguousarray(loops.T, self.input.dtype).flatten()
        Nloops = [np.uint32(len(p[0])) for p in pd_pars]
        #print "loops",Nloops, loops

        #import sys; print >>sys.stderr,"opencl eval",pars
        #print "opencl eval",pars
        if len(loops) > 2*MAX_LOOPS:
            raise ValueError("too many polydispersity points")
        device_num = 0
        res_bi = self.res_b[device_num]
        queuei = environment().queues[device_num]
        loops_bi = self.loops_b[device_num]
        loops_l = cl.LocalMemory(len(loops.data))
        cl.enqueue_copy(queuei, loops_bi, loops)
        #ctx = environment().context
        #loops_bi = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=loops)
        args = self.input.q_buffers + [res_bi, loops_bi, loops_l, cutoff] + fixed + Nloops
        self.kernel(queuei, self.input.global_size, None, *args)
        cl.enqueue_copy(queuei, self.res, res_bi)

        return self.res

    def release(self):
        for b in self.loops_b:
            b.release()
        self.loops_b = []
        for b in self.res_b:
            b.release()
        self.res_b = []

    def __del__(self):
        self.release()