source: sasmodels/sasmodels/gpu.py @ d087487b

core_shell_microgelscostrafo411magnetic_modelrelease_v0.94release_v0.95ticket-1257-vesicle-productticket_1156ticket_1265_superballticket_822_more_unit_tests
Last change on this file since d087487b was d087487b, checked in by Paul Kienzle <pkienzle@…>, 9 years ago

remove warning from radeon drivers that fp64 is already enabled

  • Property mode set to 100644
File size: 12.1 KB
Line 
1"""
2GPU support through OpenCL
3
4There should be a single GPU environment running on the system.  This
5environment is constructed on the first call to :func:`env`, and the
6same environment is returned on each call.
7
8After retrieving the environment, the next step is to create the kernel.
9This is done with a call to :meth:`GpuEnvironment.make_kernel`, which
10returns the type of data used by the kernel.
11
12Next a :class:`GpuData` object should be created with the correct kind
13of data.  This data object can be used by multiple kernels, for example,
14if the target model is a weighted sum of multiple kernels.  The data
15should include any extra evaluation points required to compute the proper
16data smearing.  This need not match the square grid for 2D data if there
17is an index saying which q points are active.
18
19Together the GpuData, the program, and a device form a :class:`GpuKernel`.
20This kernel is used during fitting, receiving new sets of parameters and
21evaluating them.  The output value is stored in an output buffer on the
22devices, where it can be combined with other structure factors and form
23factors and have instrumental resolution effects applied.
24"""
25import numpy as np
26import pyopencl as cl
27from pyopencl import mem_flags as mf
28
29from . import gen
30
31F32_DEFS = """\
32#define REAL(x) (x##f)
33#define real float
34"""
35
36
37F64_DEFS = """\
38#ifdef cl_khr_fp64
39#  pragma OPENCL EXTENSION cl_khr_fp64: enable
40#endif
41#define REAL(x) (x)
42#define real double
43"""
44
45# The max loops number is limited by the amount of local memory available
46# on the device.  You don't want to make this value too big because it will
47# waste resources, nor too small because it may interfere with users trying
48# to do their polydispersity calculations.  A value of 1024 should be much
49# larger than necessary given that cost grows as npts^k where k is the number
50# of polydisperse parameters.
51MAX_LOOPS = 2048
52
53def load_model(kernel_module, dtype="single"):
54    """
55    Load the OpenCL model defined by *kernel_module*.
56
57    Access to the OpenCL device is delayed until the kernel is called
58    so models can be defined without using too many resources.
59    """
60    source, info = gen.make(kernel_module)
61    ## for debugging, save source to a .cl file, edit it, and reload as model
62    #open(info['name']+'.cl','w').write(source)
63    #source = open(info['name']+'.cl','r').read()
64    return GpuModel(source, info, dtype)
65
66ENV = None
67def environment():
68    """
69    Returns a singleton :class:`GpuEnvironment`.
70
71    This provides an OpenCL context and one queue per device.
72    """
73    global ENV
74    if ENV is None:
75        ENV = GpuEnvironment()
76    return ENV
77
78def has_double(device):
79    """
80    Return true if device supports double precision.
81    """
82    return "cl_khr_fp64" in device.extensions
83
84
85def _stretch_input(vector, dtype, extra=1e-3, boundary=128):
86    """
87    Stretch an input vector to the correct boundary.
88
89    Performance on the kernels can drop by a factor of two or more if the
90    number of values to compute does not fall on a nice power of two
91    boundary.  A good choice for the boundary value is the
92    min_data_type_align_size property of the OpenCL device.  The usual
93    value of 128 gives a working size as a multiple of 32.  The trailing
94    additional vector elements are given a value of *extra*, and so
95    f(*extra*) will be computed for each of them.  The returned array
96    will thus be a subset of the computed array.
97    """
98    boundary // dtype.itemsize
99    remainder = vector.size%boundary
100    size = vector.size + (boundary - remainder if remainder != 0 else 0)
101    if size != vector.size:
102        vector = np.hstack((vector, [extra]*(size-vector.size)))
103    return np.ascontiguousarray(vector, dtype=dtype)
104
105
106def compile_model(context, source, dtype):
107    """
108    Build a model to run on the gpu.
109
110    Returns the compiled program and its type.  The returned type will
111    be float32 even if the desired type is float64 if any of the
112    devices in the context do not support the cl_khr_fp64 extension.
113    """
114    dtype = np.dtype(dtype)
115    if dtype==gen.F64 and not all(has_double(d) for d in context.devices):
116        raise RuntimeError("Double precision not supported for devices")
117
118    header = F64_DEFS if dtype == gen.F64 else F32_DEFS
119    # Note: USE_SINCOS makes the intel cpu slower under opencl
120    if context.devices[0].type == cl.device_type.GPU:
121        header += "#define USE_SINCOS\n"
122    program  = cl.Program(context, header+source).build()
123    return program
124
125
126def make_result(self, size):
127    self.res = np.empty(size, dtype=self.dtype)
128    self.res_b = cl.Buffer(self.program.context, mf.READ_WRITE, self.res.nbytes)
129    return self.res, self.res_b
130
131
132# for now, this returns one device in the context
133# TODO: create a context that contains all devices on all platforms
134class GpuEnvironment(object):
135    """
136    GPU context, with possibly many devices, and one queue per device.
137    """
138    def __init__(self):
139        self.context = cl.create_some_context()
140        self.queues = [cl.CommandQueue(self.context, d)
141                       for d in self.context.devices]
142        self.boundary = max(d.min_data_type_align_size
143                            for d in self.context.devices)
144        self.has_double = all(has_double(d) for d in self.context.devices)
145        self.compiled = {}
146
147    def compile_program(self, name, source, dtype):
148        if name not in self.compiled:
149            #print "compiling",name
150            self.compiled[name] = compile_model(self.context, source, dtype)
151        return self.compiled[name]
152
153    def release_program(self, name):
154        if name in self.compiled:
155            self.compiled[name].release()
156            del self.compiled[name]
157
158class GpuModel(object):
159    """
160    GPU wrapper for a single model.
161
162    *source* and *info* are the model source and interface as returned
163    from :func:`gen.make`.
164
165    *dtype* is the desired model precision.  Any numpy dtype for single
166    or double precision floats will do, such as 'f', 'float32' or 'single'
167    for single and 'd', 'float64' or 'double' for double.  Double precision
168    is an optional extension which may not be available on all devices.
169    """
170    def __init__(self, source, info, dtype=gen.F32):
171        self.info = info
172        self.source = source
173        self.dtype = dtype
174        self.program = None # delay program creation
175
176    def __getstate__(self):
177        state = self.__dict__.copy()
178        state['program'] = None
179        return state
180
181    def __setstate__(self, state):
182        self.__dict__ = state.copy()
183
184    def __call__(self, input):
185        if self.dtype != input.dtype:
186            raise TypeError("data and kernel have different types")
187        if self.program is None:
188            self.program = environment().compile_program(self.info['name'],self.source, self.dtype)
189        kernel_name = gen.kernel_name(self.info, input.is_2D)
190        kernel = getattr(self.program, kernel_name)
191        return GpuKernel(kernel, self.info, input)
192
193    def release(self):
194        if self.program is not None:
195            environment().release_program(self.info['name'])
196            self.program = None
197
198    def make_input(self, q_vectors):
199        """
200        Make q input vectors available to the model.
201
202        This only needs to be done once for all models that operate on the
203        same input.  So for example, if you are adding two different models
204        together to compare to a data set, then only one model needs to
205        needs to call make_input, so long as the models have the same dtype.
206        """
207        # Note: the weird interface, where make_input doesn't care which
208        # model calls it, allows us to ask the model to define the data
209        # and the caller does not need to know if it is opencl or ctypes.
210        # The returned data object is opaque.
211        return GpuInput(q_vectors, dtype=self.dtype)
212
213# TODO: check that we don't need a destructor for buffers which go out of scope
214class GpuInput(object):
215    """
216    Make q data available to the gpu.
217
218    *q_vectors* is a list of q vectors, which will be *[q]* for 1-D data,
219    and *[qx, qy]* for 2-D data.  Internally, the vectors will be reallocated
220    to get the best performance on OpenCL, which may involve shifting and
221    stretching the array to better match the memory architecture.  Additional
222    points will be evaluated with *q=1e-3*.
223
224    *dtype* is the data type for the q vectors. The data type should be
225    set to match that of the kernel, which is an attribute of
226    :class:`GpuProgram`.  Note that not all kernels support double
227    precision, so even if the program was created for double precision,
228    the *GpuProgram.dtype* may be single precision.
229
230    Call :meth:`release` when complete.  Even if not called directly, the
231    buffer will be released when the data object is freed.
232    """
233    def __init__(self, q_vectors, dtype=gen.F32):
234        env = environment()
235        self.nq = q_vectors[0].size
236        self.dtype = np.dtype(dtype)
237        self.is_2D = (len(q_vectors) == 2)
238        self.q_vectors = [
239            _stretch_input(q, self.dtype, boundary=env.boundary)
240            for q in q_vectors
241        ]
242        self.q_buffers = [
243            cl.Buffer(env.context,  mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=q)
244            for q in self.q_vectors
245        ]
246        self.global_size = [self.q_vectors[0].size]
247
248    def release(self):
249        for b in self.q_buffers:
250            b.release()
251        self.q_buffers = []
252
253class GpuKernel(object):
254    """
255    Callable SAS kernel.
256
257    *kernel* is the GpuKernel object to call.
258
259    *info* is the module information
260
261    *input* is the DllInput q vectors at which the kernel should be
262    evaluated.
263
264    The resulting call method takes the *pars*, a list of values for
265    the fixed parameters to the kernel, and *pd_pars*, a list of (value,weight)
266    vectors for the polydisperse parameters.  *cutoff* determines the
267    integration limits: any points with combined weight less than *cutoff*
268    will not be calculated.
269
270    Call :meth:`release` when done with the kernel instance.
271    """
272    def __init__(self, kernel, info, input):
273        self.input = input
274        self.kernel = kernel
275        self.info = info
276        self.res = np.empty(input.nq, input.dtype)
277        dim = '2d' if input.is_2D else '1d'
278        self.fixed_pars = info['partype']['fixed-'+dim]
279        self.pd_pars = info['partype']['pd-'+dim]
280
281        # Inputs and outputs for each kernel call
282        # Note: res may be shorter than res_b if global_size != nq
283        env = environment()
284        self.loops_b = [cl.Buffer(env.context, mf.READ_WRITE,
285                                  2*MAX_LOOPS*input.dtype.itemsize)
286                        for _ in env.queues]
287        self.res_b = [cl.Buffer(env.context, mf.READ_WRITE,
288                                input.global_size[0]*input.dtype.itemsize)
289                      for _ in env.queues]
290
291
292    def __call__(self, pars, pd_pars, cutoff=1e-5):
293        real = np.float32 if self.input.dtype == gen.F32 else np.float64
294        fixed = [real(p) for p in pars]
295        cutoff = real(cutoff)
296        loops = np.hstack(pd_pars)
297        loops = np.ascontiguousarray(loops.T, self.input.dtype).flatten()
298        Nloops = [np.uint32(len(p[0])) for p in pd_pars]
299        #print "loops",Nloops, loops
300
301        #import sys; print >>sys.stderr,"opencl eval",pars
302        #print "opencl eval",pars
303        if len(loops) > 2*MAX_LOOPS:
304            raise ValueError("too many polydispersity points")
305        device_num = 0
306        res_bi = self.res_b[device_num]
307        queuei = environment().queues[device_num]
308        loops_bi = self.loops_b[device_num]
309        loops_l = cl.LocalMemory(len(loops.data))
310        cl.enqueue_copy(queuei, loops_bi, loops)
311        #ctx = environment().context
312        #loops_bi = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=loops)
313        args = self.input.q_buffers + [res_bi,loops_bi,loops_l,cutoff] + fixed + Nloops
314        self.kernel(queuei, self.input.global_size, None, *args)
315        cl.enqueue_copy(queuei, self.res, res_bi)
316
317        return self.res
318
319    def release(self):
320        for b in self.loops_b:
321            b.release()
322        self.loops_b = []
323        for b in self.res_b:
324            b.release()
325        self.res_b = []
326
327    def __del__(self):
328        self.release()
Note: See TracBrowser for help on using the repository browser.