source: sasmodels/sasmodels/gpu.py @ 1780d59

Last change: 1780d59, checked in by Paul Kienzle <pkienzle@…>, 10 years ago

hack sasview model polydispersity

1"""
2GPU support through OpenCL
3
4There should be a single GPU environment running on the system.  This
5environment is constructed on the first call to :func:`env`, and the
6same environment is returned on each call.
7
8After retrieving the environment, the next step is to create the kernel.
9This is done with a call to :meth:`GpuEnvironment.make_kernel`, which
10returns the type of data used by the kernel.
11
12Next a :class:`GpuData` object should be created with the correct kind
13of data.  This data object can be used by multiple kernels, for example,
14if the target model is a weighted sum of multiple kernels.  The data
15should include any extra evaluation points required to compute the proper
16data smearing.  This need not match the square grid for 2D data if there
17is an index saying which q points are active.
18
19Together the GpuData, the program, and a device form a :class:`GpuKernel`.
20This kernel is used during fitting, receiving new sets of parameters and
21evaluating them.  The output value is stored in an output buffer on the
22devices, where it can be combined with other structure factors and form
23factors and have instrumental resolution effects applied.
24
25
26"""
import warnings

import numpy as np
import pyopencl as cl
from pyopencl import mem_flags as mf

from . import gen

from .gen import F32, F64

F32_DEFS = """\
#define REAL(x) (x##f)
#define real float
"""

F64_DEFS = """\
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#define REAL(x) (x)
#define real double
"""

# The max loops number is limited by the amount of local memory available
# on the device.  You don't want to make this value too big because it will
# waste resources, nor too small because it may interfere with users trying
# to do their polydispersity calculations.  A value of 1024 should be much
# larger than necessary given that cost grows as npts^k where k is the number
# of polydisperse parameters.
MAX_LOOPS = 1024
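# For example, assuming each polydisperse parameter packs into a (value,
# weight) pair of vectors as GpuKernel.__call__ below suggests, three
# polydisperse parameters with 35 points each occupy 3*2*35 = 210 of the
# 1024 available loop entries.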

ENV = None
def environment():
    """
    Returns a singleton :class:`GpuEnvironment`.

    This provides an OpenCL context and one queue per device.
    """
    global ENV
    if ENV is None:
        ENV = GpuEnvironment()
    return ENV

def has_double(device):
    """
    Return true if device supports double precision.
    """
    return "cl_khr_fp64" in device.extensions

def _stretch_input(vector, dtype, extra=1e-3, boundary=128):
    """
    Stretch an input vector to the correct boundary.

    Performance on the kernels can drop by a factor of two or more if the
    number of values to compute does not fall on a nice power of two
    boundary.  A good choice for the boundary value is the
    min_data_type_align_size property of the OpenCL device.  The usual
    value of 128 gives a working size as a multiple of 32.  The trailing
    additional vector elements are given a value of *extra*, and so
    f(*extra*) will be computed for each of them.  The returned array
    will thus be a subset of the computed array.
    """
    # Convert the boundary from bytes to a number of elements.
    boundary = boundary // dtype.itemsize
    remainder = vector.size % boundary
    size = vector.size + (boundary - remainder if remainder != 0 else 0)
    if size != vector.size:
        vector = np.hstack((vector, [extra]*(size-vector.size)))
    return np.ascontiguousarray(vector, dtype=dtype)
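
# For example, 1000 float32 values with the default 128-byte boundary give
# an element boundary of 128//4 == 32; 1000 % 32 == 8, so the vector is
# padded with 24 copies of *extra* to a length of 1024.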

def compile_model(context, source, dtype):
    """
    Build a model to run on the gpu.

    Returns the compiled program.  A RuntimeError is raised if the desired
    dtype is float64 but one of the devices in the context does not support
    the cl_khr_fp64 extension.
    """
    dtype = np.dtype(dtype)
    if dtype == F64 and not all(has_double(d) for d in context.devices):
        raise RuntimeError("Double precision not supported for devices")

    header = F64_DEFS if dtype == F64 else F32_DEFS
    # Note: USE_SINCOS makes the intel cpu slower under opencl
    if context.devices[0].type == cl.device_type.GPU:
        header += "#define USE_SINCOS\n"
    program = cl.Program(context, header + source).build()
    return program

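# Note: make_result below takes *self* though it is defined at module level;
# it is not referenced elsewhere in this module and appears to be a leftover
# helper.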
def make_result(self, size):
    self.res = np.empty(size, dtype=self.dtype)
    self.res_b = cl.Buffer(self.program.context, mf.READ_WRITE, self.res.nbytes)
    return self.res, self.res_b

# for now, this returns one device in the context
# TODO: create a context that contains all devices on all platforms
class GpuEnvironment(object):
    """
    GPU context, with possibly many devices, and one queue per device.
    """
    def __init__(self):
        self.context = cl.create_some_context()
        self.queues = [cl.CommandQueue(self.context, d)
                       for d in self.context.devices]
        self.boundary = max(d.min_data_type_align_size
                            for d in self.context.devices)
        self.has_double = all(has_double(d) for d in self.context.devices)
        self.compiled = {}

    def compile_program(self, name, source, dtype):
        if name not in self.compiled:
            #print "compiling",name
            self.compiled[name] = compile_model(self.context, source, dtype)
        return self.compiled[name]

    def release_program(self, name):
        if name in self.compiled:
            self.compiled[name].release()
            del self.compiled[name]

class GpuModel(object):
    """
    GPU wrapper for a single model.

    *source* and *info* are the model source and interface as returned
    from :func:`gen.make`.

    *dtype* is the desired model precision.  Any numpy dtype for single
    or double precision floats will do, such as 'f', 'float32' or 'single'
    for single and 'd', 'float64' or 'double' for double.  Double precision
    is an optional extension which may not be available on all devices.
    """
    def __init__(self, source, info, dtype=F32):
        self.info = info
        self.source = source
        self.dtype = dtype
        self.program = None # delay program creation

    def __getstate__(self):
        state = self.__dict__.copy()
        state['program'] = None
        return state

    def __setstate__(self, state):
        self.__dict__ = state.copy()

    def __call__(self, input):
        if self.dtype != input.dtype:
            raise TypeError("data and kernel have different types")
        if self.program is None:
            self.program = environment().compile_program(
                self.info['name'], self.source, self.dtype)
        kernel_name = gen.kernel_name(self.info, input.is_2D)
        kernel = getattr(self.program, kernel_name)
        return GpuKernel(kernel, self.info, input)

    def release(self):
        if self.program is not None:
            environment().release_program(self.info['name'])
            self.program = None

    def make_input(self, q_vectors):
        """
        Make q input vectors available to the model.

        This only needs to be done once for all models that operate on the
        same input.  So for example, if you are adding two different models
        together to compare to a data set, then only one model needs to
        call make_input, so long as the models have the same dtype.
        """
        # Note: the weird interface, where make_input doesn't care which
        # model calls it, allows us to ask the model to define the data
        # and the caller does not need to know if it is opencl or ctypes.
        # The returned data object is opaque.
        return GpuInput(q_vectors, dtype=self.dtype)
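
# For example, two models can share a single input as described in
# make_input above (a sketch; *model_a* and *model_b* are GpuModel
# instances with the same dtype):
#
#     data = model_a.make_input([q])
#     kernel_a, kernel_b = model_a(data), model_b(data)
#     combined = kernel_a(pars_a, pd_a) + kernel_b(pars_b, pd_b)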

# TODO: check that we don't need a destructor for buffers which go out of scope
class GpuInput(object):
    """
    Make q data available to the gpu.

    *q_vectors* is a list of q vectors, which will be *[q]* for 1-D data,
    and *[qx, qy]* for 2-D data.  Internally, the vectors will be reallocated
    to get the best performance on OpenCL, which may involve shifting and
    stretching the array to better match the memory architecture.  Additional
    points will be evaluated with *q=1e-3*.

    *dtype* is the data type for the q vectors. The data type should be
    set to match that of the kernel, which is an attribute of
    :class:`GpuModel`.  Note that not all kernels support double
    precision, so even if the program was created for double precision,
    the *GpuModel.dtype* may be single precision.

    Call :meth:`release` when complete.  Even if not called directly, the
    buffer will be released when the data object is freed.
    """
    def __init__(self, q_vectors, dtype=F32):
        env = environment()
        self.nq = q_vectors[0].size
        self.dtype = np.dtype(dtype)
        self.is_2D = (len(q_vectors) == 2)
        self.q_vectors = [
            _stretch_input(q, self.dtype, boundary=env.boundary)
            for q in q_vectors
        ]
        self.q_buffers = [
            cl.Buffer(env.context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=q)
            for q in self.q_vectors
        ]
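        # The kernel launch size is the padded vector length, which may be
        # larger than nq; the extra points evaluate f(extra) and are simply
        # ignored when the nq results are read back.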
        self.global_size = [self.q_vectors[0].size]

    def release(self):
        for b in self.q_buffers:
            b.release()
        self.q_buffers = []

class GpuKernel(object):
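    """
    Callable kernel for evaluating a model at the given q points.

    *kernel* is the compiled OpenCL kernel for the model, as looked up on
    the program by :meth:`GpuModel.__call__`.

    *info* is the model interface as returned from :func:`gen.make`.

    *input* is the :class:`GpuInput` holding the q vectors at which to
    evaluate the model.

    Call the resulting object with the fixed parameter values and the
    polydispersity (value, weight) vectors to compute the results.
    """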
    def __init__(self, kernel, info, input):
        self.input = input
        self.kernel = kernel
        self.info = info
        self.res = np.empty(input.nq, input.dtype)
        dim = '2d' if input.is_2D else '1d'
        self.fixed_pars = info['partype']['fixed-'+dim]
        self.pd_pars = info['partype']['pd-'+dim]

        # Inputs and outputs for each kernel call
        # Note: res may be shorter than res_b if global_size != nq
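        # One loops buffer and one result buffer are allocated per device
        # queue, though __call__ below currently always submits to device 0.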
        env = environment()
        self.loops_b = [cl.Buffer(env.context, mf.READ_WRITE,
                                  MAX_LOOPS*input.dtype.itemsize)
                        for _ in env.queues]
        self.res_b = [cl.Buffer(env.context, mf.READ_WRITE,
                                input.global_size[0]*input.dtype.itemsize)
                      for _ in env.queues]

    def __call__(self, pars, pd_pars, cutoff=1e-5):
        real = np.float32 if self.input.dtype == F32 else np.float64
        fixed = [real(p) for p in pars]
        cutoff = real(cutoff)
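        # Pack the polydispersity loops.  Each entry of pd_pars is assumed to
        # be a (value, weight) pair of vectors; the hstack/transpose/flatten
        # sequence interleaves them as v0,w0,v1,w1,... in one contiguous
        # vector, with loops_N recording the number of points per parameter.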
        loops = np.hstack(pd_pars)
        loops = np.ascontiguousarray(loops.T, self.input.dtype).flatten()
        loops_N = [np.uint32(len(p[0])) for p in pd_pars]

        #import sys; print >>sys.stderr,"opencl eval",pars
        #print "opencl eval",pars
        if len(loops) > MAX_LOOPS:
            raise ValueError("too many polydispersity points")
        device_num = 0
        res_bi = self.res_b[device_num]
        queuei = environment().queues[device_num]
        loops_bi = self.loops_b[device_num]
        loops_l = cl.LocalMemory(len(loops.data))
        cl.enqueue_copy(queuei, loops_bi, loops)
        #ctx = environment().context
        #loops_bi = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=loops)
        args = (self.input.q_buffers
                + [res_bi, loops_bi, loops_l, cutoff] + fixed + loops_N)
        self.kernel(queuei, self.input.global_size, None, *args)
        cl.enqueue_copy(queuei, self.res, res_bi)

        return self.res

    def release(self):
        for b in self.loops_b:
            b.release()
        self.loops_b = []
        for b in self.res_b:
            b.release()
        self.res_b = []

    def __del__(self):
        self.release()