Changes in sasmodels/kernelcl.py [f872fd1:3199b17] in sasmodels
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
sasmodels/kernelcl.py
rf872fd1 r3199b17 61 61 62 62 63 # Attempt to setup opencl. This may fail if the pyopencl package is not63 # Attempt to setup OpenCL. This may fail if the pyopencl package is not 64 64 # installed or if it is installed but there are no devices available. 65 65 try: … … 67 67 from pyopencl import mem_flags as mf 68 68 from pyopencl.characterize import get_fast_inaccurate_build_options 69 # Ask OpenCL for the default context so that we know that one exists 69 # Ask OpenCL for the default context so that we know that one exists. 70 70 cl.create_some_context(interactive=False) 71 71 HAVE_OPENCL = True … … 88 88 # pylint: enable=unused-import 89 89 90 # CRUFT: pyopencl < 2017.1 (as of June 2016 needs quotes around include path) 90 91 # CRUFT: pyopencl < 2017.1 (as of June 2016 needs quotes around include path). 91 92 def quote_path(v): 92 93 """ … … 99 100 return '"'+v+'"' if v and ' ' in v and not v[0] in "\"'-" else v 100 101 102 101 103 def fix_pyopencl_include(): 102 104 """ … … 105 107 import pyopencl as cl 106 108 if hasattr(cl, '_DEFAULT_INCLUDE_OPTIONS'): 107 cl._DEFAULT_INCLUDE_OPTIONS = [quote_path(v) for v in cl._DEFAULT_INCLUDE_OPTIONS] 109 cl._DEFAULT_INCLUDE_OPTIONS = [ 110 quote_path(v) for v in cl._DEFAULT_INCLUDE_OPTIONS 111 ] 112 108 113 109 114 if HAVE_OPENCL: … … 118 123 MAX_LOOPS = 2048 119 124 120 121 125 # Pragmas for enable OpenCL features. Be sure to protect them so that they 122 126 # still compile even if OpenCL is not present. 
… … 133 137 """ 134 138 139 135 140 def use_opencl(): 136 141 sas_opencl = os.environ.get("SAS_OPENCL", "OpenCL").lower() 137 142 return HAVE_OPENCL and sas_opencl != "none" and not sas_opencl.startswith("cuda") 138 143 144 139 145 ENV = None 140 146 def reset_environment(): … … 144 150 global ENV 145 151 ENV = GpuEnvironment() if use_opencl() else None 152 146 153 147 154 def environment(): … … 161 168 return ENV 162 169 170 163 171 def has_type(device, dtype): 164 172 # type: (cl.Device, np.dtype) -> bool … … 171 179 return "cl_khr_fp64" in device.extensions 172 180 else: 173 # Not supporting F16 type since it isn't accurate enough 181 # Not supporting F16 type since it isn't accurate enough. 174 182 return False 183 175 184 176 185 def get_warp(kernel, queue): … … 182 191 cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE, 183 192 queue.device) 193 184 194 185 195 def compile_model(context, source, dtype, fast=False): … … 203 213 source_list.insert(0, _F64_PRAGMA) 204 214 205 # Note: USE_SINCOS makes the intel cpu slower under opencl215 # Note: USE_SINCOS makes the Intel CPU slower under OpenCL. 206 216 if context.devices[0].type == cl.device_type.GPU: 207 217 source_list.insert(0, "#define USE_SINCOS\n") … … 210 220 source = "\n".join(source_list) 211 221 program = cl.Program(context, source).build(options=options) 222 212 223 #print("done with "+program) 213 224 return program 214 225 215 226 216 # for now, this returns one device in the context217 # TODO: create a context that contains all devices on all platforms227 # For now, this returns one device in the context. 228 # TODO: Create a context that contains all devices on all platforms. 218 229 class GpuEnvironment(object): 219 230 """ 220 GPU context, with possibly many devices, and one queue per device. 
221 222 Because the environment can be reset during a live program (e.g., if the 223 user changes the active GPU device in the GUI), everything associated 224 with the device context must be cached in the environment and recreated 225 if the environment changes. The *cache* attribute is a simple dictionary 226 which holds keys and references to objects, such as compiled kernels and 227 allocated buffers. The running program should check in the cache for 228 long lived objects and create them if they are not there. The program 229 should not hold onto cached objects, but instead only keep them active 230 for the duration of a function call. When the environment is destroyed 231 then the *release* method for each active cache item is called before 232 the environment is freed. This means that each cl buffer should be 233 in its own cache entry. 231 GPU context for OpenCL, with possibly many devices and one queue per device. 234 232 """ 235 233 def __init__(self): 236 234 # type: () -> None 237 # find gpu context235 # Find gpu context. 238 236 context_list = _create_some_context() 239 237 … … 249 247 self.context[dtype] = None 250 248 251 # Build a queue for each context 249 # Build a queue for each context. 252 250 self.queue = {} 253 251 context = self.context[F32] … … 259 257 self.queue[F64] = cl.CommandQueue(context, context.devices[0]) 260 258 261 # Byte boundary for data alignment259 ## Byte boundary for data alignment. 262 260 #self.data_boundary = max(context.devices[0].min_data_type_align_size 263 261 # for context in self.context.values()) 264 262 265 # Cache for compiled programs, and for items in context 263 # Cache for compiled programs, and for items in context. 266 264 self.compiled = {} 267 self.cache = {}268 265 269 266 def has_type(self, dtype): … … 280 277 """ 281 278 # Note: PyOpenCL caches based on md5 hash of source, options and device 282 # so we don't really need to cache things for ourselves. 
I'll do so 283 # anyway just to save some data munging time. 279 # but I'll do so as well just to save some data munging time. 284 280 tag = generate.tag_source(source) 285 281 key = "%s-%s-%s%s"%(name, dtype, tag, ("-fast" if fast else "")) 286 # Check timestamp on program 282 # Check timestamp on program. 287 283 program, program_timestamp = self.compiled.get(key, (None, np.inf)) 288 284 if program_timestamp < timestamp: … … 297 293 return program 298 294 299 def free_buffer(self, key):300 if key in self.cache:301 self.cache[key].release()302 del self.cache[key]303 304 def __del__(self):305 for v in self.cache.values():306 release = getattr(v, 'release', lambda: None)307 release()308 self.cache = {}309 310 _CURRENT_ID = 0311 def unique_id():312 global _CURRENT_ID313 _CURRENT_ID += 1314 return _CURRENT_ID315 295 316 296 def _create_some_context(): … … 325 305 which one (and not a CUDA device, or no GPU). 326 306 """ 327 # Assume we do not get here if SAS_OPENCL is None or CUDA 307 # Assume we do not get here if SAS_OPENCL is None or CUDA. 328 308 sas_opencl = os.environ.get('SAS_OPENCL', 'opencl') 329 309 if sas_opencl.lower() != 'opencl': 330 # Setting PYOPENCL_CTX as a SAS_OPENCL to create cl context 310 # Setting PYOPENCL_CTX as a SAS_OPENCL to create cl context. 331 311 os.environ["PYOPENCL_CTX"] = sas_opencl 332 312 … … 336 316 except Exception as exc: 337 317 warnings.warn(str(exc)) 338 warnings.warn("pyopencl.create_some_context() failed") 339 warnings.warn("the environment variable 'SAS_OPENCL' or 'PYOPENCL_CTX' might not be set correctly") 318 warnings.warn("pyopencl.create_some_context() failed. The " 319 "environment variable 'SAS_OPENCL' or 'PYOPENCL_CTX' might " 320 "not be set correctly") 340 321 341 322 return _get_default_context() 323 342 324 343 325 def _get_default_context(): … … 352 334 # is running may increase throughput. 
353 335 # 354 # Mac book pro, base install:336 # MacBook Pro, base install: 355 337 # {'Apple': [Intel CPU, NVIDIA GPU]} 356 # Mac book pro, base install:338 # MacBook Pro, base install: 357 339 # {'Apple': [Intel CPU, Intel GPU]} 358 # 2 x nvidia 295 with Intel and NVIDIA opencl drivers installed340 # 2 x NVIDIA 295 with Intel and NVIDIA OpenCL drivers installed: 359 341 # {'Intel': [CPU], 'NVIDIA': [GPU, GPU, GPU, GPU]} 360 342 gpu, cpu = None, None … … 379 361 else: 380 362 # System has cl.device_type.ACCELERATOR or cl.device_type.CUSTOM 381 # Intel Phi for example registers as an accelerator 363 # Intel Phi for example registers as an accelerator. 382 364 # Since the user installed a custom device on their system 383 365 # and went through the pain of sorting out OpenCL drivers for … … 386 368 gpu = device 387 369 388 # order the devices by gpu then by cpu; when searching for an available370 # Order the devices by gpu then by cpu; when searching for an available 389 371 # device by data type they will be checked in this order, which means 390 372 # that if the gpu supports double then the cpu will never be used (though … … 413 395 that the compiler is allowed to take shortcuts. 
414 396 """ 397 info = None # type: ModelInfo 398 source = "" # type: str 399 dtype = None # type: np.dtype 400 fast = False # type: bool 401 _program = None # type: cl.Program 402 _kernels = None # type: Dict[str, cl.Kernel] 403 415 404 def __init__(self, source, model_info, dtype=generate.F32, fast=False): 416 405 # type: (Dict[str,str], ModelInfo, np.dtype, bool) -> None … … 419 408 self.dtype = dtype 420 409 self.fast = fast 421 self.timestamp = generate.ocl_timestamp(self.info)422 self._cache_key = unique_id()423 410 424 411 def __getstate__(self): … … 429 416 # type: (Tuple[ModelInfo, str, np.dtype, bool]) -> None 430 417 self.info, self.source, self.dtype, self.fast = state 418 self._program = self._kernels = None 431 419 432 420 def make_kernel(self, q_vectors): … … 434 422 return GpuKernel(self, q_vectors) 435 423 436 @property 437 def Iq(self): 438 return self._fetch_kernel('Iq') 439 440 def fetch_kernel(self, name): 424 def get_function(self, name): 441 425 # type: (str) -> cl.Kernel 442 426 """ … … 444 428 does not already exist. 
445 429 """ 446 gpu = environment() 447 key = self._cache_key 448 if key not in gpu.cache: 449 program = gpu.compile_program( 450 self.info.name, 451 self.source['opencl'], 452 self.dtype, 453 self.fast, 454 self.timestamp) 455 variants = ['Iq', 'Iqxy', 'Imagnetic'] 456 names = [generate.kernel_name(self.info, k) for k in variants] 457 kernels = [getattr(program, k) for k in names] 458 data = dict((k, v) for k, v in zip(variants, kernels)) 459 # keep a handle to program so GC doesn't collect 460 data['program'] = program 461 gpu.cache[key] = data 462 else: 463 data = gpu.cache[key] 464 return data[name] 465 466 # TODO: check that we don't need a destructor for buffers which go out of scope 430 if self._program is None: 431 self._prepare_program() 432 return self._kernels[name] 433 434 def _prepare_program(self): 435 # type: (str) -> None 436 env = environment() 437 timestamp = generate.ocl_timestamp(self.info) 438 program = env.compile_program( 439 self.info.name, 440 self.source['opencl'], 441 self.dtype, 442 self.fast, 443 timestamp) 444 variants = ['Iq', 'Iqxy', 'Imagnetic'] 445 names = [generate.kernel_name(self.info, k) for k in variants] 446 functions = [getattr(program, k) for k in names] 447 self._kernels = {k: v for k, v in zip(variants, functions)} 448 # Keep a handle to program so GC doesn't collect. 449 self._program = program 450 451 452 # TODO: Check that we don't need a destructor for buffers which go out of scope. 467 453 class GpuInput(object): 468 454 """ … … 486 472 def __init__(self, q_vectors, dtype=generate.F32): 487 473 # type: (List[np.ndarray], np.dtype) -> None 488 # TODO: do we ever need double precision q?474 # TODO: Do we ever need double precision q? 489 475 self.nq = q_vectors[0].size 490 476 self.dtype = np.dtype(dtype) 491 477 self.is_2d = (len(q_vectors) == 2) 492 # TODO: stretch input based on get_warp()493 # not doing it now since warp depends on kernel, which is not known478 # TODO: Stretch input based on get_warp(). 
479 # Not doing it now since warp depends on kernel, which is not known 494 480 # at this point, so instead using 32, which is good on the set of 495 481 # architectures tested so far. … … 504 490 self.q[:self.nq] = q_vectors[0] 505 491 self.global_size = [self.q.shape[0]] 506 self._cache_key = unique_id() 507 508 @property 509 def q_b(self): 510 """Lazy creation of q buffer so it can survive context reset""" 492 #print("creating inputs of size", self.global_size) 493 494 # Transfer input value to GPU. 511 495 env = environment() 512 key = self._cache_key 513 if key not in env.cache: 514 context = env.context[self.dtype] 515 #print("creating inputs of size", self.global_size) 516 buffer = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, 517 hostbuf=self.q) 518 env.cache[key] = buffer 519 return env.cache[key] 496 context = env.context[self.dtype] 497 self.q_b = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, 498 hostbuf=self.q) 520 499 521 500 def release(self): 522 501 # type: () -> None 523 502 """ 524 Free the buffer associated with the q value 525 """ 526 environment().free_buffer(id(self)) 503 Free the buffer associated with the q value. 504 """ 505 if self.q_b is not None: 506 self.q_b.release() 507 self.q_b = None 527 508 528 509 def __del__(self): … … 530 511 self.release() 531 512 513 532 514 class GpuKernel(Kernel): 533 515 """ … … 536 518 *model* is the GpuModel object to call 537 519 538 The following attributes are defined: 539 540 *info* is the module information 541 542 *dtype* is the kernel precision 543 544 *dim* is '1d' or '2d' 545 546 *result* is a vector to contain the results of the call 547 548 The resulting call method takes the *pars*, a list of values for 549 the fixed parameters to the kernel, and *pd_pars*, a list of (value,weight) 550 vectors for the polydisperse parameters. *cutoff* determines the 551 integration limits: any points with combined weight less than *cutoff* 552 will not be calculated. 
520 The kernel is derived from :class:`Kernel`, providing the 521 :meth:`call_kernel` method to evaluate the kernel for a given set of 522 parameters. Because of the need to move the q values to the GPU before 523 evaluation, the kernel is instantiated for a particular set of q vectors, 524 and can be called many times without transferring q each time. 553 525 554 526 Call :meth:`release` when done with the kernel instance. 555 527 """ 528 #: SAS model information structure. 529 info = None # type: ModelInfo 530 #: Kernel precision. 531 dtype = None # type: np.dtype 532 #: Kernel dimensions (1d or 2d). 533 dim = "" # type: str 534 #: Calculation results, updated after each call to :meth:`_call_kernel`. 535 result = None # type: np.ndarray 536 556 537 def __init__(self, model, q_vectors): 557 # type: ( cl.Kernel, np.dtype, ModelInfo, List[np.ndarray]) -> None538 # type: (GpuModel, List[np.ndarray]) -> None 559 539 dtype = model.dtype 560 540 self.q_input = GpuInput(q_vectors, dtype) 561 541 self._model = model 562 # F16 isn't sufficient, so don't support it 563 self._as_dtype = np.float64 if dtype == generate.F64 else np.float32 564 self._cache_key = unique_id() 565 566 # attributes accessed from the outside 542 543 # Attributes accessed from the outside. 567 544 self.dim = '2d' if self.q_input.is_2d else '1d' 568 545 self.info = model.info 569 self.dtype = model.dtype 570 571 # holding place for the returned value 546 self.dtype = dtype 547 548 # Converter to translate input to target type. 549 self._as_dtype = np.float64 if dtype == generate.F64 else np.float32 550 551 # Holding place for the returned value. 
571 552 nout = 2 if self.info.have_Fq and self.dim == '1d' else 1 572 extra_q = 4 # total weight, form volume, shell volume and R_eff 573 self.result = np.empty(self.q_input.nq*nout+extra_q, dtype) 574 575 @property 576 def _result_b(self): 577 """Lazy creation of result buffer so it can survive context reset""" 553 extra_q = 4 # Total weight, form volume, shell volume and R_eff. 554 self.result = np.empty(self.q_input.nq*nout + extra_q, dtype) 555 556 # Allocate result value on GPU. 578 557 env = environment() 579 key = self._cache_key 580 if key not in env.cache: 581 context = env.context[self.dtype] 582 width = ((self.result.size+31)//32)*32 * self.dtype.itemsize 583 buffer = cl.Buffer(context, mf.READ_WRITE, width) 584 env.cache[key] = buffer 585 return env.cache[key] 586 587 def _call_kernel(self, call_details, values, cutoff, magnetic, effective_radius_type): 588 # type: (CallDetails, np.ndarray, np.ndarray, float, bool) -> np.ndarray 558 context = env.context[self.dtype] 559 width = ((self.result.size+31)//32)*32 * self.dtype.itemsize 560 self._result_b = cl.Buffer(context, mf.READ_WRITE, width) 561 562 def _call_kernel(self, call_details, values, cutoff, magnetic, 563 effective_radius_type): 564 # type: (CallDetails, np.ndarray, float, bool, int) -> np.ndarray 589 565 env = environment() 590 566 queue = env.queue[self._model.dtype] 591 567 context = queue.context 592 568 593 # Arrange data transfer to/from card 594 q_b = self.q_input.q_b 595 result_b = self._result_b 569 # Arrange data transfer to card. 596 570 details_b = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, 597 571 hostbuf=call_details.buffer) … … 599 573 hostbuf=values) 600 574 575 # Setup kernel function and arguments. 601 576 name = 'Iq' if self.dim == '1d' else 'Imagnetic' if magnetic else 'Iqxy' 602 kernel = self._model. 
fetch_kernel(name)577 kernel = self._model.get_function(name) 603 578 kernel_args = [ 604 np.uint32(self.q_input.nq), None, None, 605 details_b, values_b, q_b, result_b, 606 self._as_dtype(cutoff), 607 np.uint32(effective_radius_type), 579 np.uint32(self.q_input.nq), # Number of inputs. 580 None, # Placeholder for pd_start. 581 None, # Placeholder for pd_stop. 582 details_b, # Problem definition. 583 values_b, # Parameter values. 584 self.q_input.q_b, # Q values. 585 self._result_b, # Result storage. 586 self._as_dtype(cutoff), # Probability cutoff. 587 np.uint32(effective_radius_type), # R_eff mode. 608 588 ] 589 590 # Call kernel and retrieve results. 609 591 #print("Calling OpenCL") 610 592 #call_details.show(values) 611 #Call kernel and retrieve results612 593 wait_for = None 613 594 last_nap = time.clock() … … 620 601 *kernel_args, wait_for=wait_for)] 621 602 if stop < call_details.num_eval: 622 # Allow other processes to run 603 # Allow other processes to run. 623 604 wait_for[0].wait() 624 605 current_time = time.clock() … … 626 607 time.sleep(0.001) 627 608 last_nap = current_time 628 cl.enqueue_copy(queue, self.result, result_b, wait_for=wait_for)609 cl.enqueue_copy(queue, self.result, self._result_b, wait_for=wait_for) 629 610 #print("result", self.result) 630 611 631 # Free buffers 632 for v in (details_b, values_b): 633 if v is not None: 634 v.release() 612 # Free buffers. 613 details_b.release() 614 values_b.release() 635 615 636 616 def release(self): … … 639 619 Release resources associated with the kernel. 640 620 """ 641 environment().free_buffer(id(self))642 621 self.q_input.release() 622 if self._result_b is not None: 623 self._result_b.release() 624 self._result_b = None 643 625 644 626 def __del__(self):
Note: See TracChangeset
for help on using the changeset viewer.