source: sasmodels/sasmodels/gpu.py @ 14de349

core_shell_microgelscostrafo411magnetic_modelrelease_v0.94release_v0.95ticket-1257-vesicle-productticket_1156ticket_1265_superballticket_822_more_unit_tests
Last change on this file since 14de349 was 14de349, checked in by Paul Kienzle <pkienzle@…>, 10 years ago

add autogenerated polydispersity loops

  • Property mode set to 100644
File size: 9.7 KB
Line 
1"""
2GPU support through OpenCL
3
4There should be a single GPU environment running on the system.  This
5environment is constructed on the first call to :func:`env`, and the
6same environment is returned on each call.
7
8After retrieving the environment, the next step is to create the kernel.
9This is done with a call to :meth:`GpuEnvironment.make_kernel`, which
10returns the type of data used by the kernel.
11
12Next a :class:`GpuData` object should be created with the correct kind
13of data.  This data object can be used by multiple kernels, for example,
14if the target model is a weighted sum of multiple kernels.  The data
15should include any extra evaluation points required to compute the proper
16data smearing.  This need not match the square grid for 2D data if there
17is an index saying which q points are active.
18
19Together the GpuData, the program, and a device form a :class:`GpuKernel`.
20This kernel is used during fitting, receiving new sets of parameters and
21evaluating them.  The output value is stored in an output buffer on the
22devices, where it can be combined with other structure factors and form
23factors and have instrumental resolution effects applied.
24
25
26"""
27import warnings
28
29import numpy as np
30import pyopencl as cl
31from pyopencl import mem_flags as mf
32
33from . import gen
34
35from .gen import F32, F64
36
37F32_DEFS = """\
38#define REAL(x) (x##f)
39#define real float
40"""
41
42F64_DEFS = """\
43#pragma OPENCL EXTENSION cl_khr_fp64: enable
44#define REAL(x) (x)
45#define real double
46"""
47
48ENV = None
49def environment():
50    """
51    Returns a singleton :class:`GpuEnvironment`.
52
53    This provides an OpenCL context and one queue per device.
54    """
55    global ENV
56    if ENV is None:
57        ENV = GpuEnvironment()
58    return ENV
59
60def has_double(device):
61    """
62    Return true if device supports double precision.
63    """
64    return "cl_khr_fp64" in device.extensions
65
66
67def _stretch_input(vector, dtype, extra=1e-3, boundary=128):
68    """
69    Stretch an input vector to the correct boundary.
70
71    Performance on the kernels can drop by a factor of two or more if the
72    number of values to compute does not fall on a nice power of two
73    boundary.  A good choice for the boundary value is the
74    min_data_type_align_size property of the OpenCL device.  The usual
75    value of 128 gives a working size as a multiple of 32.  The trailing
76    additional vector elements are given a value of *extra*, and so
77    f(*extra*) will be computed for each of them.  The returned array
78    will thus be a subset of the computed array.
79    """
80    boundary // dtype.itemsize
81    remainder = vector.size%boundary
82    size = vector.size + (boundary - remainder if remainder != 0 else 0)
83    if size != vector.size:
84        vector = np.hstack((vector, [extra]*(size-vector.size)))
85    return np.ascontiguousarray(vector, dtype=dtype)
86
87
88def compile_model(context, source, dtype):
89    """
90    Build a model to run on the gpu.
91
92    Returns the compiled program and its type.  The returned type will
93    be float32 even if the desired type is float64 if any of the
94    devices in the context do not support the cl_khr_fp64 extension.
95    """
96    dtype = np.dtype(dtype)
97    if dtype==F64 and not all(has_double(d) for d in context.devices):
98        warnings.warn(RuntimeWarning("Double precision not support; using single precision instead"))
99        dtype = F32
100
101    header = F64_DEFS if dtype == F64 else F32_DEFS
102    # Note: USE_SINCOS makes the intel cpu slower under opencl
103    if context.devices[0].type == cl.device_type.GPU:
104        header += "#define USE_SINCOS\n"
105    program  = cl.Program(context, header+source).build()
106    return program, dtype
107
108
109def make_result(self, size):
110    self.res = np.empty(size, dtype=self.dtype)
111    self.res_b = cl.Buffer(self.program.context, mf.READ_WRITE, self.res.nbytes)
112    return self.res, self.res_b
113
114
115# for now, this returns one device in the context
116# TODO: create a context that contains all devices on all platforms
117class GpuEnvironment(object):
118    """
119    GPU context, with possibly many devices, and one queue per device.
120    """
121    def __init__(self):
122        self.context = cl.create_some_context()
123        self.queues = [cl.CommandQueue(self.context, d)
124                       for d in self.context.devices]
125        self.boundary = max(d.min_data_type_align_size
126                            for d in self.context.devices)
127
128class GpuModel(object):
129    """
130    GPU wrapper for a single model.
131
132    *source* and *meta* are the model source and interface as returned
133    from :func:`gen.make`.
134
135    *dtype* is the desired model precision.  Any numpy dtype for single
136    or double precision floats will do, such as 'f', 'float32' or 'single'
137    for single and 'd', 'float64' or 'double' for double.  Double precision
138    is an optional extension which may not be available on all devices.
139    """
140    def __init__(self, source, meta, dtype=F32):
141        context = environment().context
142        self.meta = meta
143        self.program, self.dtype = compile_model(context, source, dtype)
144
145        #TODO: need better interface
146        self.PARS = dict((p[0],p[2]) for p in meta['parameters'])
147        self.PD_PARS = [p[0] for p in meta['parameters'] if p[4] != ""]
148
149    def __call__(self, input, cutoff=1e-5):
150        if self.dtype != input.dtype:
151            raise TypeError("data and kernel have different types")
152        kernel_name = gen.kernel_name(self.meta, input.is_2D)
153        kernel = getattr(self.program, kernel_name)
154        return GpuKernel(kernel, self.meta, input, cutoff)
155
156    def make_input(self, q_vectors):
157        """
158        Make q input vectors available to the model.
159
160        This only needs to be done once for all models that operate on the
161        same input.  So for example, if you are adding two different models
162        together to compare to a data set, then only one model needs to
163        needs to call make_input, so long as the models have the same dtype.
164        """
165        # Note: the weird interface, where make_input doesn't care which
166        # model calls it, allows us to ask the model to define the data
167        # and the caller does not need to know if it is opencl or ctypes.
168        # The returned data object is opaque.
169        return GpuInput(q_vectors, dtype=self.dtype)
170
171# TODO: check that we don't need a destructor for buffers which go out of scope
172class GpuInput(object):
173    """
174    Make q data available to the gpu.
175
176    *q_vectors* is a list of q vectors, which will be *[q]* for 1-D data,
177    and *[qx, qy]* for 2-D data.  Internally, the vectors will be reallocated
178    to get the best performance on OpenCL, which may involve shifting and
179    stretching the array to better match the memory architecture.  Additional
180    points will be evaluated with *q=1e-3*.
181
182    *dtype* is the data type for the q vectors. The data type should be
183    set to match that of the kernel, which is an attribute of
184    :class:`GpuProgram`.  Note that not all kernels support double
185    precision, so even if the program was created for double precision,
186    the *GpuProgram.dtype* may be single precision.
187
188    Call :meth:`release` when complete.  Even if not called directly, the
189    buffer will be released when the data object is freed.
190    """
191    def __init__(self, q_vectors, dtype=F32):
192        env = environment()
193        self.nq = q_vectors[0].size
194        self.dtype = np.dtype(dtype)
195        self.is_2D = (len(q_vectors) == 2)
196        self.q_vectors = [
197            _stretch_input(q, self.dtype, boundary=env.boundary)
198            for q in q_vectors
199        ]
200        self.q_buffers = [
201            cl.Buffer(env.context,  mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=q)
202            for q in self.q_vectors
203        ]
204        self.global_size = [self.q_vectors[0].size]
205
206    def release(self):
207        for b in self.q_buffers:
208            b.release()
209        self.q_buffers = []
210
211class GpuKernel(object):
212    def __init__(self, kernel, meta, input, cutoff):
213        env = environment()
214
215        self.cutoff = cutoff
216        self.input = input
217        self.kernel = kernel
218        self.meta = meta
219
220        # Inputs and outputs for each kernel call
221        self.loops_b = [cl.Buffer(env.context, mf.READ_WRITE,
222                                  1024*input.dtype.itemsize)
223                        for _ in env.queues]
224        self.res_b = [cl.Buffer(env.context, mf.READ_WRITE,
225                                input.global_size[0]*input.dtype.itemsize)
226                      for _ in env.queues]
227
228        # Note: may be shorter than res_b if global_size != nq
229        self.res = np.empty(input.nq, input.dtype)
230
231        # Determine the set of fixed and polydisperse parameters
232        self.fixed_pars = [p[0] for p in meta['parameters'] if p[4] == ""]
233        self.pd_pars = [p for p in meta['parameters']
234               if p[4]=="volume" or (p[4]=="orientation" and input.is_2D)]
235
236    def eval(self, pars):
237        device_num = 0
238        res_bi = self.res_b[device_num]
239        queuei = environment().queues[device_num]
240        fixed, loops, loop_n = \
241            gen.kernel_pars(pars, self.meta, self.input.is_2D, dtype=self.input.dtype)
242        loops_l = cl.LocalMemory(len(loops.data))
243        real = np.float32 if self.input.dtype == F32 else np.float64
244        cutoff = real(self.cutoff)
245
246        loops_bi = self.loops_b[device_num]
247        cl.enqueue_copy(queuei, loops_bi, loops)
248        #ctx = environment().context
249        #loops_bi = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=loops)
250        pars = self.input.q_buffers + [res_bi,loops_bi,loops_l,cutoff] + fixed + loop_n
251        self.kernel(queuei, self.input.global_size, None, *pars)
252        cl.enqueue_copy(queuei, self.res, res_bi)
253
254        return self.res
255
256    def release(self):
257        for b in self.loops_b:
258            b.release()
259        self.loops_b = []
260        for b in self.res_b:
261            b.release()
262        self.res_b = []
263
264    def __del__(self):
265        self.release()
Note: See TracBrowser for help on using the repository browser.