Changeset 8064d5e in sasmodels for sasmodels/kernelcl.py
- Timestamp:
- Nov 9, 2018 2:23:18 PM (5 years ago)
- Branches:
- master, core_shell_microgels, magnetic_model, ticket-1257-vesicle-product, ticket_1156, ticket_1265_superball, ticket_822_more_unit_tests
- Children:
- c11d09f
- Parents:
- b093c1c (diff), cf3d0ce (diff)
Note: this is a merge changeset, the changes displayed below correspond to the merge itself.
Use the (diff) links above to see all the changes relative to each parent. - File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
sasmodels/kernelcl.py
rf31815d r8064d5e 1 1 """ 2 2 GPU driver for C kernels 3 4 TODO: docs are out of date 3 5 4 6 There should be a single GPU environment running on the system. This … … 67 69 import numpy as np # type: ignore 68 70 69 # Attempt to setup opencl. This may fail if the opencl package is not71 # Attempt to setup opencl. This may fail if the pyopencl package is not 70 72 # installed or if it is installed but there are no devices available. 71 73 try: … … 82 84 83 85 from . import generate 86 from .generate import F32, F64 84 87 from .kernel import KernelModel, Kernel 85 88 … … 139 142 140 143 def use_opencl(): 141 return HAVE_OPENCL and os.environ.get("SAS_OPENCL", "").lower() != "none" 144 sas_opencl = os.environ.get("SAS_OPENCL", "OpenCL").lower() 145 return HAVE_OPENCL and sas_opencl != "none" and not sas_opencl.startswith("cuda") 142 146 143 147 ENV = None … … 170 174 Return true if device supports the requested precision. 171 175 """ 172 if dtype == generate.F32:176 if dtype == F32: 173 177 return True 174 elif dtype == generate.F64:178 elif dtype == F64: 175 179 return "cl_khr_fp64" in device.extensions 176 elif dtype == generate.F16:177 return "cl_khr_fp16" in device.extensions178 180 else: 181 # Not supporting F16 type since it isn't accurate enough 179 182 return False 180 183 … … 187 190 cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE, 188 191 queue.device) 189 190 def _stretch_input(vector, dtype, extra=1e-3, boundary=32):191 # type: (np.ndarray, np.dtype, float, int) -> np.ndarray192 """193 Stretch an input vector to the correct boundary.194 195 Performance on the kernels can drop by a factor of two or more if the196 number of values to compute does not fall on a nice power of two197 boundary. The trailing additional vector elements are given a198 value of *extra*, and so f(*extra*) will be computed for each of199 them. The returned array will thus be a subset of the computed array.200 201 *boundary* should be a power of 2 which is at least 32 for good202 performance on current platforms (as of Jan 2015). It should203 probably be the max of get_warp(kernel,queue) and204 device.min_data_type_align_size//4.205 """206 remainder = vector.size % boundary207 if remainder != 0:208 size = vector.size + (boundary - remainder)209 vector = np.hstack((vector, [extra] * (size - vector.size)))210 return np.ascontiguousarray(vector, dtype=dtype)211 212 192 213 193 def compile_model(context, source, dtype, fast=False): … … 247 227 """ 248 228 GPU context, with possibly many devices, and one queue per device. 229 230 Because the environment can be reset during a live program (e.g., if the 231 user changes the active GPU device in the GUI), everything associated 232 with the device context must be cached in the environment and recreated 233 if the environment changes. The *cache* attribute is a simple dictionary 234 which holds keys and references to objects, such as compiled kernels and 235 allocated buffers. The running program should check in the cache for 236 long lived objects and create them if they are not there. The program 237 should not hold onto cached objects, but instead only keep them active 238 for the duration of a function call. When the environment is destroyed 239 then the *release* method for each active cache item is called before 240 the environment is freed. This means that each cl buffer should be 241 in its own cache entry. 249 242 """ 250 243 def __init__(self): 251 244 # type: () -> None 252 245 # find gpu context 253 #self.context = cl.create_some_context() 254 255 self.context = None 256 if 'SAS_OPENCL' in os.environ: 257 #Setting PYOPENCL_CTX as a SAS_OPENCL to create cl context 258 os.environ["PYOPENCL_CTX"] = os.environ["SAS_OPENCL"] 259 if 'PYOPENCL_CTX' in os.environ: 260 self._create_some_context() 261 262 if not self.context: 263 self.context = _get_default_context() 246 context_list = _create_some_context() 247 248 # Find a context for F32 and for F64 (maybe the same one). 249 # F16 isn't good enough. 250 self.context = {} 251 for dtype in (F32, F64): 252 for context in context_list: 253 if has_type(context.devices[0], dtype): 254 self.context[dtype] = context 255 break 256 else: 257 self.context[dtype] = None 258 259 # Build a queue for each context 260 self.queue = {} 261 context = self.context[F32] 262 self.queue[F32] = cl.CommandQueue(context, context.devices[0]) 263 if self.context[F64] == self.context[F32]: 264 self.queue[F64] = self.queue[F32] 265 else: 266 context = self.context[F64] 267 self.queue[F64] = cl.CommandQueue(context, context.devices[0]) 264 268 265 269 # Byte boundary for data alignment 266 #self.data_boundary = max( d.min_data_type_align_size267 # for d in self.context.devices)268 self.queues = [cl.CommandQueue(context, context.devices[0]) 269 for context in self.context]270 #self.data_boundary = max(context.devices[0].min_data_type_align_size 271 # for context in self.context.values()) 272 273 # Cache for compiled programs, and for items in context 270 274 self.compiled = {} 275 self.cache = {} 271 276 272 277 def has_type(self, dtype): … … 275 280 Return True if all devices support a given type. 276 281 """ 277 return any(has_type(d, dtype) 278 for context in self.context 279 for d in context.devices) 280 281 def get_queue(self, dtype): 282 # type: (np.dtype) -> cl.CommandQueue 283 """ 284 Return a command queue for the kernels of type dtype. 285 """ 286 for context, queue in zip(self.context, self.queues): 287 if all(has_type(d, dtype) for d in context.devices): 288 return queue 289 290 def get_context(self, dtype): 291 # type: (np.dtype) -> cl.Context 292 """ 293 Return a OpenCL context for the kernels of type dtype. 294 """ 295 for context in self.context: 296 if all(has_type(d, dtype) for d in context.devices): 297 return context 298 299 def _create_some_context(self): 300 # type: () -> cl.Context 301 """ 302 Protected call to cl.create_some_context without interactivity. Use 303 this if SAS_OPENCL is set in the environment. Sets the *context* 304 attribute. 305 """ 306 try: 307 self.context = [cl.create_some_context(interactive=False)] 308 except Exception as exc: 309 warnings.warn(str(exc)) 310 warnings.warn("pyopencl.create_some_context() failed") 311 warnings.warn("the environment variable 'SAS_OPENCL' might not be set correctly") 282 return self.context.get(dtype, None) is not None 312 283 313 284 def compile_program(self, name, source, dtype, fast, timestamp): … … 326 297 del self.compiled[key] 327 298 if key not in self.compiled: 328 context = self. get_context(dtype)299 context = self.context[dtype] 329 300 logging.info("building %s for OpenCL %s", key, 330 301 context.devices[0].name.strip()) 331 program = compile_model(self. get_context(dtype),302 program = compile_model(self.context[dtype], 332 303 str(source), dtype, fast) 333 304 self.compiled[key] = (program, timestamp) 334 305 return program 306 307 def free_buffer(self, key): 308 if key in self.cache: 309 self.cache[key].release() 310 del self.cache[key] 311 312 def __del__(self): 313 for v in self.cache.values(): 314 release = getattr(v, 'release', lambda: None) 315 release() 316 self.cache = {} 317 318 _CURRENT_ID = 0 319 def unique_id(): 320 global _CURRENT_ID 321 _CURRENT_ID += 1 322 return _CURRENT_ID 323 324 def _create_some_context(): 325 # type: () -> cl.Context 326 """ 327 Protected call to cl.create_some_context without interactivity. 328 329 Uses SAS_OPENCL or PYOPENCL_CTX if they are set in the environment, 330 otherwise scans for the most appropriate device using 331 :func:`_get_default_context`. Ignore *SAS_OPENCL=OpenCL*, which 332 indicates that an OpenCL device should be used without specifying 333 which one (and not a CUDA device, or no GPU). 334 """ 335 # Assume we do not get here if SAS_OPENCL is None or CUDA 336 sas_opencl = os.environ.get('SAS_OPENCL', 'opencl') 337 if sas_opencl.lower() != 'opencl': 338 # Setting PYOPENCL_CTX as a SAS_OPENCL to create cl context 339 os.environ["PYOPENCL_CTX"] = sas_opencl 340 341 if 'PYOPENCL_CTX' in os.environ: 342 try: 343 return [cl.create_some_context(interactive=False)] 344 except Exception as exc: 345 warnings.warn(str(exc)) 346 warnings.warn("pyopencl.create_some_context() failed") 347 warnings.warn("the environment variable 'SAS_OPENCL' or 'PYOPENCL_CTX' might not be set correctly") 348 349 return _get_default_context() 335 350 336 351 def _get_default_context(): … … 412 427 self.dtype = dtype 413 428 self.fast = fast 414 self. program = None # delay program creation415 self._ kernels = None429 self.timestamp = generate.ocl_timestamp(self.info) 430 self._cache_key = unique_id() 416 431 417 432 def __getstate__(self): … … 422 437 # type: (Tuple[ModelInfo, str, np.dtype, bool]) -> None 423 438 self.info, self.source, self.dtype, self.fast = state 424 self.program = None425 439 426 440 def make_kernel(self, q_vectors): 427 441 # type: (List[np.ndarray]) -> "GpuKernel" 428 if self.program is None: 429 compile_program = environment().compile_program 430 timestamp = generate.ocl_timestamp(self.info) 431 self.program = compile_program( 442 return GpuKernel(self, q_vectors) 443 444 @property 445 def Iq(self): 446 return self._fetch_kernel('Iq') 447 448 def fetch_kernel(self, name): 449 # type: (str) -> cl.Kernel 450 """ 451 Fetch the kernel from the environment by name, compiling it if it 452 does not already exist. 453 """ 454 gpu = environment() 455 key = self._cache_key 456 if key not in gpu.cache: 457 program = gpu.compile_program( 432 458 self.info.name, 433 459 self.source['opencl'], 434 460 self.dtype, 435 461 self.fast, 436 timestamp)462 self.timestamp) 437 463 variants = ['Iq', 'Iqxy', 'Imagnetic'] 438 464 names = [generate.kernel_name(self.info, k) for k in variants] 439 kernels = [getattr( self.program, k) for k in names]440 self._kernels= dict((k, v) for k, v in zip(variants, kernels))441 is_2d = len(q_vectors) == 2442 if is_2d:443 kernel = [self._kernels['Iqxy'], self._kernels['Imagnetic']]465 kernels = [getattr(program, k) for k in names] 466 data = dict((k, v) for k, v in zip(variants, kernels)) 467 # keep a handle to program so GC doesn't collect 468 data['program'] = program 469 gpu.cache[key] = data 444 470 else: 445 kernel = [self._kernels['Iq']]*2 446 return GpuKernel(kernel, self.dtype, self.info, q_vectors) 447 448 def release(self): 449 # type: () -> None 450 """ 451 Free the resources associated with the model. 452 """ 453 if self.program is not None: 454 self.program = None 455 456 def __del__(self): 457 # type: () -> None 458 self.release() 471 data = gpu.cache[key] 472 return data[name] 459 473 460 474 # TODO: check that we don't need a destructor for buffers which go out of scope … … 481 495 # type: (List[np.ndarray], np.dtype) -> None 482 496 # TODO: do we ever need double precision q? 483 env = environment()484 497 self.nq = q_vectors[0].size 485 498 self.dtype = np.dtype(dtype) … … 490 503 # architectures tested so far. 491 504 if self.is_2d: 492 # Note: 16 rather than 15 because result is 1 longer than input. 493 width = ((self.nq+16)//16)*16 505 width = ((self.nq+15)//16)*16 494 506 self.q = np.empty((width, 2), dtype=dtype) 495 507 self.q[:self.nq, 0] = q_vectors[0] 496 508 self.q[:self.nq, 1] = q_vectors[1] 497 509 else: 498 # Note: 32 rather than 31 because result is 1 longer than input. 499 width = ((self.nq+32)//32)*32 510 width = ((self.nq+31)//32)*32 500 511 self.q = np.empty(width, dtype=dtype) 501 512 self.q[:self.nq] = q_vectors[0] 502 513 self.global_size = [self.q.shape[0]] 503 context = env.get_context(self.dtype) 504 #print("creating inputs of size", self.global_size) 505 self.q_b = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, 506 hostbuf=self.q) 514 self._cache_key = unique_id() 515 516 @property 517 def q_b(self): 518 """Lazy creation of q buffer so it can survive context reset""" 519 env = environment() 520 key = self._cache_key 521 if key not in env.cache: 522 context = env.context[self.dtype] 523 #print("creating inputs of size", self.global_size) 524 buffer = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, 525 hostbuf=self.q) 526 env.cache[key] = buffer 527 return env.cache[key] 507 528 508 529 def release(self): 509 530 # type: () -> None 510 531 """ 511 Free the memory. 512 """ 513 if self.q_b is not None: 514 self.q_b.release() 515 self.q_b = None 532 Free the buffer associated with the q value 533 """ 534 environment().free_buffer(id(self)) 516 535 517 536 def __del__(self): … … 523 542 Callable SAS kernel. 524 543 525 * kernel* is the GpuKernel object to call526 527 *model_info* is the module information528 529 * q_vectors* is the q vectors at which the kernel should be evaluated544 *model* is the GpuModel object to call 545 546 The following attributes are defined: 547 548 *info* is the module information 530 549 531 550 *dtype* is the kernel precision 551 552 *dim* is '1d' or '2d' 553 554 *result* is a vector to contain the results of the call 532 555 533 556 The resulting call method takes the *pars*, a list of values for … … 539 562 Call :meth:`release` when done with the kernel instance. 540 563 """ 541 def __init__(self, kernel, dtype, model_info, q_vectors):564 def __init__(self, model, q_vectors): 542 565 # type: (cl.Kernel, np.dtype, ModelInfo, List[np.ndarray]) -> None 543 q_input = GpuInput(q_vectors, dtype) 544 self.kernel = kernel 545 self.info = model_info 546 self.dtype = dtype 547 self.dim = '2d' if q_input.is_2d else '1d' 548 # plus three for the normalization values 549 self.result = np.empty(q_input.nq+1, dtype) 550 551 # Inputs and outputs for each kernel call 552 # Note: res may be shorter than res_b if global_size != nq 566 dtype = model.dtype 567 self.q_input = GpuInput(q_vectors, dtype) 568 self._model = model 569 # F16 isn't sufficient, so don't support it 570 self._as_dtype = np.float64 if dtype == generate.F64 else np.float32 571 self._cache_key = unique_id() 572 573 # attributes accessed from the outside 574 self.dim = '2d' if self.q_input.is_2d else '1d' 575 self.info = model.info 576 self.dtype = model.dtype 577 578 # holding place for the returned value 579 nout = 2 if self.info.have_Fq and self.dim == '1d' else 1 580 extra_q = 4 # total weight, form volume, shell volume and R_eff 581 self.result = np.empty(self.q_input.nq*nout+extra_q, dtype) 582 583 @property 584 def _result_b(self): 585 """Lazy creation of result buffer so it can survive context reset""" 553 586 env = environment() 554 self.queue = env.get_queue(dtype) 555 556 self.result_b = cl.Buffer(self.queue.context, mf.READ_WRITE, 557 q_input.global_size[0] * dtype.itemsize) 558 self.q_input = q_input # allocated by GpuInput above 559 560 self._need_release = [self.result_b, self.q_input] 561 self.real = (np.float32 if dtype == generate.F32 562 else np.float64 if dtype == generate.F64 563 else np.float16 if dtype == generate.F16 564 else np.float32) # will never get here, so use np.float32 565 566 def __call__(self, call_details, values, cutoff, magnetic): 587 key = self._cache_key 588 if key not in env.cache: 589 context = env.context[self.dtype] 590 width = ((self.result.size+31)//32)*32 * self.dtype.itemsize 591 buffer = cl.Buffer(context, mf.READ_WRITE, width) 592 env.cache[key] = buffer 593 return env.cache[key] 594 595 def _call_kernel(self, call_details, values, cutoff, magnetic, effective_radius_type): 567 596 # type: (CallDetails, np.ndarray, np.ndarray, float, bool) -> np.ndarray 568 context = self.queue.context 569 # Arrange data transfer to card 597 env = environment() 598 queue = env.queue[self._model.dtype] 599 context = queue.context 600 601 # Arrange data transfer to/from card 602 q_b = self.q_input.q_b 603 result_b = self._result_b 570 604 details_b = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, 571 605 hostbuf=call_details.buffer) … … 573 607 hostbuf=values) 574 608 575 kernel = self.kernel[1 if magnetic else 0] 576 args = [ 609 name = 'Iq' if self.dim == '1d' else 'Imagnetic' if magnetic else 'Iqxy' 610 kernel = self._model.fetch_kernel(name) 611 kernel_args = [ 577 612 np.uint32(self.q_input.nq), None, None, 578 details_b, values_b, self.q_input.q_b, self.result_b, 579 self.real(cutoff), 613 details_b, values_b, q_b, result_b, 614 self._as_dtype(cutoff), 615 np.uint32(effective_radius_type), 580 616 ] 581 617 #print("Calling OpenCL") 582 618 #call_details.show(values) 583 # 619 #Call kernel and retrieve results 584 620 wait_for = None 585 621 last_nap = clock() … … 588 624 stop = min(start + step, call_details.num_eval) 589 625 #print("queuing",start,stop) 590 args[1:3] = [np.int32(start), np.int32(stop)]591 wait_for = [kernel( self.queue, self.q_input.global_size, None,592 * args, wait_for=wait_for)]626 kernel_args[1:3] = [np.int32(start), np.int32(stop)] 627 wait_for = [kernel(queue, self.q_input.global_size, None, 628 *kernel_args, wait_for=wait_for)] 593 629 if stop < call_details.num_eval: 594 630 # Allow other processes to run … … 596 632 current_time = clock() 597 633 if current_time - last_nap > 0.5: 598 time.sleep(0.0 5)634 time.sleep(0.001) 599 635 last_nap = current_time 600 cl.enqueue_copy( self.queue, self.result, self.result_b)636 cl.enqueue_copy(queue, self.result, result_b, wait_for=wait_for) 601 637 #print("result", self.result) 602 638 … … 606 642 v.release() 607 643 608 pd_norm = self.result[self.q_input.nq]609 scale = values[0]/(pd_norm if pd_norm != 0.0 else 1.0)610 background = values[1]611 #print("scale",scale,values[0],self.result[self.q_input.nq],background)612 return scale*self.result[:self.q_input.nq] + background613 # return self.result[:self.q_input.nq]614 615 644 def release(self): 616 645 # type: () -> None … … 618 647 Release resources associated with the kernel. 619 648 """ 620 for v in self._need_release: 621 v.release() 622 self._need_release = [] 649 environment().free_buffer(id(self)) 650 self.q_input.release() 623 651 624 652 def __del__(self):
Note: See TracChangeset
for help on using the changeset viewer.