Changeset d18582e in sasmodels for sasmodels/kernelcl.py
- Timestamp:
- Feb 4, 2016 12:44:23 PM (8 years ago)
- Branches:
- master, core_shell_microgels, costrafo411, magnetic_model, release_v0.94, release_v0.95, ticket-1257-vesicle-product, ticket_1156, ticket_1265_superball, ticket_822_more_unit_tests
- Children:
- 5054e80
- Parents:
- bb6f0f3
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
sasmodels/kernelcl.py
reafc9fa rd18582e 172 172 #self.data_boundary = max(d.min_data_type_align_size 173 173 # for d in self.context.devices) 174 self.queues = [cl.CommandQueue( self.context, d)175 for d in self.context.devices]174 self.queues = [cl.CommandQueue(context, context.devices[0]) 175 for context in self.context] 176 176 self.compiled = {} 177 177 … … 181 181 """ 182 182 dtype = generate.F32 if dtype == 'fast' else np.dtype(dtype) 183 return all(has_type(d, dtype) for d in self.context.devices) 183 return any(has_type(d, dtype) 184 for context in self.context 185 for d in context.devices) 186 187 def get_queue(self, dtype): 188 """ 189 Return a command queue for the kernels of type dtype. 190 """ 191 for context, queue in zip(self.context, self.queues): 192 if all(has_type(d, dtype) for d in context.devices): 193 return queue 194 195 def get_context(self, dtype): 196 """ 197 Return a OpenCL context for the kernels of type dtype. 198 """ 199 for context, queue in zip(self.context, self.queues): 200 if all(has_type(d, dtype) for d in context.devices): 201 return context 184 202 185 203 def _create_some_context(self): … … 190 208 """ 191 209 try: 192 self.context = cl.create_some_context(interactive=False)210 self.context = [cl.create_some_context(interactive=False)] 193 211 except Exception as exc: 194 212 warnings.warn(str(exc)) … … 204 222 #print("compiling",name) 205 223 dtype = np.dtype(dtype) 206 program = compile_model(self. context, source, dtype, fast)224 program = compile_model(self.get_context(dtype), source, dtype, fast) 207 225 self.compiled[key] = program 208 226 return self.compiled[key] … … 218 236 def _get_default_context(): 219 237 """ 220 Get an OpenCL context, preferring GPU over CPU. 221 """ 222 default = None 238 Get an OpenCL context, preferring GPU over CPU, and preferring Intel 239 drivers over AMD drivers. 
240 """ 241 # Note: on mobile devices there is automatic clock scaling if either the 242 # CPU or the GPU is underutilized; probably doesn't affect us, but if 243 # it did, it would mean that putting a busy loop on the CPU while the GPU 244 # is running may increase throughput. 245 # 246 # Macbook pro, base install: 247 # {'Apple': [Intel CPU, NVIDIA GPU]} 248 # Macbook pro, base install: 249 # {'Apple': [Intel CPU, Intel GPU]} 250 # 2 x nvidia 295 with Intel and NVIDIA opencl drivers installed 251 # {'Intel': [CPU], 'NVIDIA': [GPU, GPU, GPU, GPU]} 252 gpu, cpu = None, None 223 253 for platform in cl.get_platforms(): 224 254 for device in platform.get_devices(): 225 255 if device.type == cl.device_type.GPU: 226 return cl.Context([device])227 if default is None:228 default= device229 230 if not default:231 raise RuntimeError("OpenCL device not found")232 233 return cl.Context([default])256 gpu = device 257 else: 258 cpu = device 259 single = gpu if gpu is not None else cpu 260 double = gpu if has_type(gpu, np.dtype('double')) else cpu 261 single_context = cl.Context([single]) 262 double_context = cl.Context([double]) if single != double else single_context 263 return single_context, double_context 234 264 235 265 … … 314 344 # architectures tested so far. 
315 345 self.q_vectors = [_stretch_input(q, self.dtype, 32) for q in q_vectors] 346 context = env.get_context(self.dtype) 316 347 self.q_buffers = [ 317 cl.Buffer( env.context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=q)348 cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=q) 318 349 for q in self.q_vectors 319 350 ] … … 363 394 # Note: res may be shorter than res_b if global_size != nq 364 395 env = environment() 365 self.loops_b = [cl.Buffer(env.context, mf.READ_WRITE, 366 2 * MAX_LOOPS * q_input.dtype.itemsize) 367 for _ in env.queues] 368 self.res_b = [cl.Buffer(env.context, mf.READ_WRITE, 369 q_input.global_size[0] * q_input.dtype.itemsize) 370 for _ in env.queues] 396 self.queue = env.get_queue(dtype) 397 self.loops_b = cl.Buffer(self.queue.context, mf.READ_WRITE, 398 2 * MAX_LOOPS * q_input.dtype.itemsize) 399 self.res_b = cl.Buffer(self.queue.context, mf.READ_WRITE, 400 q_input.global_size[0] * q_input.dtype.itemsize) 371 401 self.q_input = q_input 402 403 self._need_release = [self.loops_b, self.res_b, self.q_input] 372 404 373 405 def __call__(self, fixed_pars, pd_pars, cutoff=1e-5): … … 377 409 else np.float32) # will never get here, so use np.float32 378 410 379 device_num = 0 380 queuei = environment().queues[device_num] 381 res_bi = self.res_b[device_num] 411 res_bi = self.res_b 382 412 nq = np.uint32(self.q_input.nq) 383 413 if pd_pars: … … 394 424 raise ValueError("too many polydispersity points") 395 425 396 loops_bi = self.loops_b [device_num]397 cl.enqueue_copy( queuei, loops_bi, loops)426 loops_bi = self.loops_b 427 cl.enqueue_copy(self.queue, loops_bi, loops) 398 428 loops_l = cl.LocalMemory(len(loops.data)) 399 429 #ctx = environment().context … … 404 434 fixed = [real(p) for p in fixed_pars] 405 435 args = self.q_input.q_buffers + [res_bi, nq] + dispersed + fixed 406 self.kernel( queuei, self.q_input.global_size, None, *args)407 cl.enqueue_copy( queuei, self.res, res_bi)436 self.kernel(self.queue, self.q_input.global_size, None, 
*args) 437 cl.enqueue_copy(self.queue, self.res, res_bi) 408 438 409 439 return self.res … … 413 443 Release resources associated with the kernel. 414 444 """ 415 for b in self.loops_b: 416 b.release() 417 self.loops_b = [] 418 for b in self.res_b: 419 b.release() 420 self.res_b = [] 421 self.q_input.release() 445 for v in self._need_release: 446 v.release() 447 self._need_release = [] 422 448 423 449 def __del__(self):
Note: See TracChangeset
for help on using the changeset viewer.