Changes in sasmodels/kernelcl.py [da32ec3:3c56da87] in sasmodels
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
sasmodels/kernelcl.py
rda32ec3 r3c56da87 30 30 try: 31 31 import pyopencl as cl 32 except ImportError,exc: 32 # Ask OpenCL for the default context so that we know that one exists 33 cl.create_some_context(interactive=False) 34 except Exception, exc: 33 35 warnings.warn(str(exc)) 34 36 raise RuntimeError("OpenCL not available") … … 37 39 38 40 from . import generate 39 from .kernelpy import Py Input, PyKernel41 from .kernelpy import PyModel 40 42 41 43 F64_DEFS = """\ … … 61 63 """ 62 64 source, info = generate.make(kernel_module) 65 if callable(info.get('Iq', None)): 66 return PyModel(info) 63 67 ## for debugging, save source to a .cl file, edit it, and reload as model 64 68 #open(info['name']+'.cl','w').write(source) … … 106 110 device.min_data_type_align_size//4. 107 111 """ 108 remainder = vector.size %boundary112 remainder = vector.size % boundary 109 113 if remainder != 0: 110 114 size = vector.size + (boundary - remainder) 111 vector = np.hstack((vector, [extra] *(size-vector.size)))115 vector = np.hstack((vector, [extra] * (size - vector.size))) 112 116 return np.ascontiguousarray(vector, dtype=dtype) 113 117 … … 122 126 """ 123 127 dtype = np.dtype(dtype) 124 if dtype ==generate.F64 and not all(has_double(d) for d in context.devices):128 if dtype == generate.F64 and not all(has_double(d) for d in context.devices): 125 129 raise RuntimeError("Double precision not supported for devices") 126 130 … … 131 135 if context.devices[0].type == cl.device_type.GPU: 132 136 header += "#define USE_SINCOS\n" 133 program = cl.Program(context, header+source).build()137 program = cl.Program(context, header + source).build() 134 138 return program 135 139 … … 156 160 157 161 if not self.context: 158 self.context = self._find_context()162 self.context = _get_default_context() 159 163 160 164 # Byte boundary for data alignment … … 169 173 try: 170 174 self.context = cl.create_some_context(interactive=False) 171 except Exception, exc:175 except Exception, exc: 172 176 warnings.warn(str(exc)) 173 177 warnings.warn("pyopencl.create_some_context() failed") 174 178 warnings.warn("the environment variable 'PYOPENCL_CTX' might not be set correctly") 175 176 def _find_context(self):177 default = None178 for platform in cl.get_platforms():179 for device in platform.get_devices():180 if device.type == cl.device_type.GPU:181 return cl.Context([device])182 if default is None:183 default = device184 185 if not default:186 raise RuntimeError("OpenCL device not found")187 188 return cl.Context([default])189 179 190 180 def compile_program(self, name, source, dtype): … … 199 189 del self.compiled[name] 200 190 191 def _get_default_context(): 192 default = None 193 for platform in cl.get_platforms(): 194 for device in platform.get_devices(): 195 if device.type == cl.device_type.GPU: 196 return cl.Context([device]) 197 if default is None: 198 default = device 199 200 if not default: 201 raise RuntimeError("OpenCL device not found") 202 203 return cl.Context([default]) 204 201 205 202 206 class GpuModel(object): … … 226 230 self.__dict__ = state.copy() 227 231 228 def __call__(self, input): 229 # Support pure python kernel call 230 if input.is_2D and callable(self.info['Iqxy']): 231 return PyKernel(self.info['Iqxy'], self.info, input) 232 elif not input.is_2D and callable(self.info['Iq']): 233 return PyKernel(self.info['Iq'], self.info, input) 234 235 if self.dtype != input.dtype: 232 def __call__(self, input_value): 233 if self.dtype != input_value.dtype: 236 234 raise TypeError("data and kernel have different types") 237 235 if self.program is None: 238 self.program = environment().compile_program(self.info['name'],self.source, self.dtype) 239 kernel_name = generate.kernel_name(self.info, input.is_2D) 236 compiler = environment().compile_program 237 self.program = compiler(self.info['name'], self.source, self.dtype) 238 kernel_name = generate.kernel_name(self.info, input_value.is_2D) 240 239 kernel = getattr(self.program, kernel_name) 241 return GpuKernel(kernel, self.info, input )240 return GpuKernel(kernel, self.info, input_value) 242 241 243 242 def release(self): … … 254 253 ctypes and some may be pure python. 255 254 """ 256 # Support pure python kernel call 257 if len(q_vectors) == 1 and callable(self.info['Iq']): 258 return PyInput(q_vectors, dtype=self.dtype) 259 elif callable(self.info['Iqxy']): 260 return PyInput(q_vectors, dtype=self.dtype) 261 else: 262 return GpuInput(q_vectors, dtype=self.dtype) 255 return GpuInput(q_vectors, dtype=self.dtype) 263 256 264 257 # TODO: check that we don't need a destructor for buffers which go out of scope … … 293 286 self.q_vectors = [_stretch_input(q, self.dtype, 32) for q in q_vectors] 294 287 self.q_buffers = [ 295 cl.Buffer(env.context, 288 cl.Buffer(env.context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=q) 296 289 for q in self.q_vectors 297 290 ] … … 311 304 *info* is the module information 312 305 313 * input* is the DllInput q vectors at which the kernel should be306 *q_input* is the DllInput q vectors at which the kernel should be 314 307 evaluated. 315 308 … … 322 315 Call :meth:`release` when done with the kernel instance. 323 316 """ 324 def __init__(self, kernel, info, input):325 self. input =input317 def __init__(self, kernel, info, q_input): 318 self.q_input = q_input 326 319 self.kernel = kernel 327 320 self.info = info 328 self.res = np.empty( input.nq,input.dtype)329 dim = '2d' if input.is_2D else '1d'330 self.fixed_pars = info['partype']['fixed-' +dim]331 self.pd_pars = info['partype']['pd-' +dim]321 self.res = np.empty(q_input.nq, q_input.dtype) 322 dim = '2d' if q_input.is_2D else '1d' 323 self.fixed_pars = info['partype']['fixed-' + dim] 324 self.pd_pars = info['partype']['pd-' + dim] 332 325 333 326 # Inputs and outputs for each kernel call … … 335 328 env = environment() 336 329 self.loops_b = [cl.Buffer(env.context, mf.READ_WRITE, 337 2 *MAX_LOOPS*input.dtype.itemsize)330 2 * MAX_LOOPS * q_input.dtype.itemsize) 338 331 for _ in env.queues] 339 332 self.res_b = [cl.Buffer(env.context, mf.READ_WRITE, 340 input.global_size[0]*input.dtype.itemsize)333 q_input.global_size[0] * q_input.dtype.itemsize) 341 334 for _ in env.queues] 342 335 343 336 344 def __call__(self, pars, pd_pars, cutoff=1e-5): 345 real = np.float32 if self.input.dtype == generate.F32 else np.float64 346 fixed = [real(p) for p in pars] 347 cutoff = real(cutoff) 348 loops = np.hstack(pd_pars) 349 loops = np.ascontiguousarray(loops.T, self.input.dtype).flatten() 350 Nloops = [np.uint32(len(p[0])) for p in pd_pars] 351 #print "loops",Nloops, loops 352 353 #import sys; print >>sys.stderr,"opencl eval",pars 354 #print "opencl eval",pars 355 if len(loops) > 2*MAX_LOOPS: 356 raise ValueError("too many polydispersity points") 337 def __call__(self, fixed_pars, pd_pars, cutoff=1e-5): 338 real = np.float32 if self.q_input.dtype == generate.F32 else np.float64 339 357 340 device_num = 0 341 queuei = environment().queues[device_num] 358 342 res_bi = self.res_b[device_num] 359 queuei = environment().queues[device_num] 360 loops_bi = self.loops_b[device_num] 361 loops_l = cl.LocalMemory(len(loops.data)) 362 cl.enqueue_copy(queuei, loops_bi, loops) 363 #ctx = environment().context 364 #loops_bi = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=loops) 365 args = self.input.q_buffers + [res_bi,loops_bi,loops_l,cutoff] + fixed + Nloops 366 self.kernel(queuei, self.input.global_size, None, *args) 343 nq = np.uint32(self.q_input.nq) 344 if pd_pars: 345 cutoff = real(cutoff) 346 loops_N = [np.uint32(len(p[0])) for p in pd_pars] 347 loops = np.hstack(pd_pars) \ 348 if pd_pars else np.empty(0, dtype=self.q_input.dtype) 349 loops = np.ascontiguousarray(loops.T, self.q_input.dtype).flatten() 350 #print "loops",Nloops, loops 351 352 #import sys; print >>sys.stderr,"opencl eval",pars 353 #print "opencl eval",pars 354 if len(loops) > 2 * MAX_LOOPS: 355 raise ValueError("too many polydispersity points") 356 357 loops_bi = self.loops_b[device_num] 358 cl.enqueue_copy(queuei, loops_bi, loops) 359 loops_l = cl.LocalMemory(len(loops.data)) 360 #ctx = environment().context 361 #loops_bi = cl.Buffer(ctx, mf.READ_ONLY|mf.COPY_HOST_PTR, hostbuf=loops) 362 dispersed = [loops_bi, loops_l, cutoff] + loops_N 363 else: 364 dispersed = [] 365 fixed = [real(p) for p in fixed_pars] 366 args = self.q_input.q_buffers + [res_bi, nq] + dispersed + fixed 367 self.kernel(queuei, self.q_input.global_size, None, *args) 367 368 cl.enqueue_copy(queuei, self.res, res_bi) 368 369
Note: See TracChangeset
for help on using the changeset viewer.