Changeset 3199b17 in sasmodels for sasmodels/kernelcl.py
- Timestamp:
- Mar 6, 2019 2:24:03 PM (5 years ago)
- Branches:
- master, core_shell_microgels, magnetic_model, ticket-1257-vesicle-product, ticket_1156, ticket_1265_superball, ticket_822_more_unit_tests
- Children:
- 4453136
- Parents:
- 00afc15
- File:
- 1 edited
Legend:
- Unmodified
- Added
- Removed
sasmodels/kernelcl.py
r00afc15 r3199b17 61 61 62 62 63 # Attempt to setup opencl. This may fail if the pyopencl package is not63 # Attempt to setup OpenCL. This may fail if the pyopencl package is not 64 64 # installed or if it is installed but there are no devices available. 65 65 try: … … 67 67 from pyopencl import mem_flags as mf 68 68 from pyopencl.characterize import get_fast_inaccurate_build_options 69 # Ask OpenCL for the default context so that we know that one exists 69 # Ask OpenCL for the default context so that we know that one exists. 70 70 cl.create_some_context(interactive=False) 71 71 HAVE_OPENCL = True … … 88 88 # pylint: enable=unused-import 89 89 90 # CRUFT: pyopencl < 2017.1 (as of June 2016 needs quotes around include path) 90 91 # CRUFT: pyopencl < 2017.1 (as of June 2016 needs quotes around include path). 91 92 def quote_path(v): 92 93 """ … … 99 100 return '"'+v+'"' if v and ' ' in v and not v[0] in "\"'-" else v 100 101 102 101 103 def fix_pyopencl_include(): 102 104 """ … … 105 107 import pyopencl as cl 106 108 if hasattr(cl, '_DEFAULT_INCLUDE_OPTIONS'): 107 cl._DEFAULT_INCLUDE_OPTIONS = [quote_path(v) for v in cl._DEFAULT_INCLUDE_OPTIONS] 109 cl._DEFAULT_INCLUDE_OPTIONS = [ 110 quote_path(v) for v in cl._DEFAULT_INCLUDE_OPTIONS 111 ] 112 108 113 109 114 if HAVE_OPENCL: … … 118 123 MAX_LOOPS = 2048 119 124 120 121 125 # Pragmas for enable OpenCL features. Be sure to protect them so that they 122 126 # still compile even if OpenCL is not present. 
… … 133 137 """ 134 138 139 135 140 def use_opencl(): 136 141 sas_opencl = os.environ.get("SAS_OPENCL", "OpenCL").lower() 137 142 return HAVE_OPENCL and sas_opencl != "none" and not sas_opencl.startswith("cuda") 138 143 144 139 145 ENV = None 140 146 def reset_environment(): … … 144 150 global ENV 145 151 ENV = GpuEnvironment() if use_opencl() else None 152 146 153 147 154 def environment(): … … 161 168 return ENV 162 169 170 163 171 def has_type(device, dtype): 164 172 # type: (cl.Device, np.dtype) -> bool … … 171 179 return "cl_khr_fp64" in device.extensions 172 180 else: 173 # Not supporting F16 type since it isn't accurate enough 181 # Not supporting F16 type since it isn't accurate enough. 174 182 return False 183 175 184 176 185 def get_warp(kernel, queue): … … 182 191 cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE, 183 192 queue.device) 193 184 194 185 195 def compile_model(context, source, dtype, fast=False): … … 203 213 source_list.insert(0, _F64_PRAGMA) 204 214 205 # Note: USE_SINCOS makes the intel cpu slower under opencl215 # Note: USE_SINCOS makes the Intel CPU slower under OpenCL. 206 216 if context.devices[0].type == cl.device_type.GPU: 207 217 source_list.insert(0, "#define USE_SINCOS\n") … … 210 220 source = "\n".join(source_list) 211 221 program = cl.Program(context, source).build(options=options) 222 212 223 #print("done with "+program) 213 224 return program 214 225 215 226 216 # for now, this returns one device in the context217 # TODO: create a context that contains all devices on all platforms227 # For now, this returns one device in the context. 228 # TODO: Create a context that contains all devices on all platforms. 218 229 class GpuEnvironment(object): 219 230 """ 220 GPU context, with possibly many devices, and one queue per device. 
221 222 Because the environment can be reset during a live program (e.g., if the 223 user changes the active GPU device in the GUI), everything associated 224 with the device context must be cached in the environment and recreated 225 if the environment changes. The *cache* attribute is a simple dictionary 226 which holds keys and references to objects, such as compiled kernels and 227 allocated buffers. The running program should check in the cache for 228 long lived objects and create them if they are not there. The program 229 should not hold onto cached objects, but instead only keep them active 230 for the duration of a function call. When the environment is destroyed 231 then the *release* method for each active cache item is called before 232 the environment is freed. This means that each cl buffer should be 233 in its own cache entry. 231 GPU context for OpenCL, with possibly many devices and one queue per device. 234 232 """ 235 233 def __init__(self): 236 234 # type: () -> None 237 # find gpu context235 # Find gpu context. 238 236 context_list = _create_some_context() 239 237 … … 249 247 self.context[dtype] = None 250 248 251 # Build a queue for each context 249 # Build a queue for each context. 252 250 self.queue = {} 253 251 context = self.context[F32] … … 259 257 self.queue[F64] = cl.CommandQueue(context, context.devices[0]) 260 258 261 # Byte boundary for data alignment259 ## Byte boundary for data alignment. 262 260 #self.data_boundary = max(context.devices[0].min_data_type_align_size 263 261 # for context in self.context.values()) 264 262 265 # Cache for compiled programs, and for items in context 263 # Cache for compiled programs, and for items in context. 266 264 self.compiled = {} 267 265 … … 279 277 """ 280 278 # Note: PyOpenCL caches based on md5 hash of source, options and device 281 # so we don't really need to cache things for ourselves. I'll do so 282 # anyway just to save some data munging time. 
279 # but I'll do so as well just to save some data munging time. 283 280 tag = generate.tag_source(source) 284 281 key = "%s-%s-%s%s"%(name, dtype, tag, ("-fast" if fast else "")) 285 # Check timestamp on program 282 # Check timestamp on program. 286 283 program, program_timestamp = self.compiled.get(key, (None, np.inf)) 287 284 if program_timestamp < timestamp: … … 296 293 return program 297 294 295 298 296 def _create_some_context(): 299 297 # type: () -> cl.Context … … 307 305 which one (and not a CUDA device, or no GPU). 308 306 """ 309 # Assume we do not get here if SAS_OPENCL is None or CUDA 307 # Assume we do not get here if SAS_OPENCL is None or CUDA. 310 308 sas_opencl = os.environ.get('SAS_OPENCL', 'opencl') 311 309 if sas_opencl.lower() != 'opencl': 312 # Setting PYOPENCL_CTX as a SAS_OPENCL to create cl context 310 # Setting PYOPENCL_CTX as a SAS_OPENCL to create cl context. 313 311 os.environ["PYOPENCL_CTX"] = sas_opencl 314 312 … … 318 316 except Exception as exc: 319 317 warnings.warn(str(exc)) 320 warnings.warn("pyopencl.create_some_context() failed") 321 warnings.warn("the environment variable 'SAS_OPENCL' or 'PYOPENCL_CTX' might not be set correctly") 318 warnings.warn("pyopencl.create_some_context() failed. The " 319 "environment variable 'SAS_OPENCL' or 'PYOPENCL_CTX' might " 320 "not be set correctly") 322 321 323 322 return _get_default_context() 323 324 324 325 325 def _get_default_context(): … … 334 334 # is running may increase throughput. 
335 335 # 336 # Mac book pro, base install:336 # MacBook Pro, base install: 337 337 # {'Apple': [Intel CPU, NVIDIA GPU]} 338 # Mac book pro, base install:338 # MacBook Pro, base install: 339 339 # {'Apple': [Intel CPU, Intel GPU]} 340 # 2 x nvidia 295 with Intel and NVIDIA opencl drivers installed340 # 2 x NVIDIA 295 with Intel and NVIDIA opencl drivers install: 341 341 # {'Intel': [CPU], 'NVIDIA': [GPU, GPU, GPU, GPU]} 342 342 gpu, cpu = None, None … … 361 361 else: 362 362 # System has cl.device_type.ACCELERATOR or cl.device_type.CUSTOM 363 # Intel Phi for example registers as an accelerator 363 # Intel Phi for example registers as an accelerator. 364 364 # Since the user installed a custom device on their system 365 365 # and went through the pain of sorting out OpenCL drivers for … … 368 368 gpu = device 369 369 370 # order the devices by gpu then by cpu; when searching for an available370 # Order the devices by gpu then by cpu; when searching for an available 371 371 # device by data type they will be checked in this order, which means 372 372 # that if the gpu supports double then the cpu will never be used (though … … 395 395 that the compiler is allowed to take shortcuts. 396 396 """ 397 info = None # type: ModelInfo398 source = "" # type: str399 dtype = None # type: np.dtype400 fast = False # type: bool401 _program = None # type: cl.Program402 _kernels = None # type: Dict[str, cl.Kernel]397 info = None # type: ModelInfo 398 source = "" # type: str 399 dtype = None # type: np.dtype 400 fast = False # type: bool 401 _program = None # type: cl.Program 402 _kernels = None # type: Dict[str, cl.Kernel] 403 403 404 404 def __init__(self, source, model_info, dtype=generate.F32, fast=False): … … 446 446 functions = [getattr(program, k) for k in names] 447 447 self._kernels = {k: v for k, v in zip(variants, functions)} 448 # keep a handle to program so GC doesn't collect448 # Keep a handle to program so GC doesn't collect. 
449 449 self._program = program 450 450 451 # TODO: check that we don't need a destructor for buffers which go out of scope 451 452 # TODO: Check that we don't need a destructor for buffers which go out of scope. 452 453 class GpuInput(object): 453 454 """ … … 471 472 def __init__(self, q_vectors, dtype=generate.F32): 472 473 # type: (List[np.ndarray], np.dtype) -> None 473 # TODO: do we ever need double precision q?474 # TODO: Do we ever need double precision q? 474 475 self.nq = q_vectors[0].size 475 476 self.dtype = np.dtype(dtype) 476 477 self.is_2d = (len(q_vectors) == 2) 477 # TODO: stretch input based on get_warp()478 # not doing it now since warp depends on kernel, which is not known478 # TODO: Stretch input based on get_warp(). 479 # Not doing it now since warp depends on kernel, which is not known 479 480 # at this point, so instead using 32, which is good on the set of 480 481 # architectures tested so far. … … 491 492 #print("creating inputs of size", self.global_size) 492 493 493 # transfer input value to gpu494 # Transfer input value to GPU. 494 495 env = environment() 495 496 context = env.context[self.dtype] … … 500 501 # type: () -> None 501 502 """ 502 Free the buffer associated with the q value 503 Free the buffer associated with the q value. 503 504 """ 504 505 if self.q_b is not None: … … 509 510 # type: () -> None 510 511 self.release() 512 511 513 512 514 class GpuKernel(Kernel): … … 524 526 Call :meth:`release` when done with the kernel instance. 525 527 """ 526 #: SAS model information structure 527 info = None # type: ModelInfo528 #: kernel precision529 dtype = None # type: np.dtype530 #: kernel dimensions (1d or 2d)531 dim = "" # type: str532 #: calculation results, updated after each call to :meth:`_call_kernel`533 result = None # type: np.ndarray528 #: SAS model information structure. 529 info = None # type: ModelInfo 530 #: Kernel precision. 531 dtype = None # type: np.dtype 532 #: Kernel dimensions (1d or 2d). 
533 dim = "" # type: str 534 #: Calculation results, updated after each call to :meth:`_call_kernel`. 535 result = None # type: np.ndarray 534 536 535 537 def __init__(self, model, q_vectors): … … 538 540 self.q_input = GpuInput(q_vectors, dtype) 539 541 self._model = model 540 # F16 isn't sufficient, so don't support it 541 self._as_dtype = np.float64 if dtype == generate.F64 else np.float32 542 543 # attributes accessed from the outside 542 543 # Attributes accessed from the outside. 544 544 self.dim = '2d' if self.q_input.is_2d else '1d' 545 545 self.info = model.info 546 self.dtype = model.dtype 547 548 # holding place for the returned value 546 self.dtype = dtype 547 548 # Converter to translate input to target type. 549 self._as_dtype = np.float64 if dtype == generate.F64 else np.float32 550 551 # Holding place for the returned value. 549 552 nout = 2 if self.info.have_Fq and self.dim == '1d' else 1 550 extra_q = 4 # total weight, form volume, shell volume and R_eff551 self.result = np.empty(self.q_input.nq*nout +extra_q, dtype)552 553 # allocate result value on gpu553 extra_q = 4 # Total weight, form volume, shell volume and R_eff. 554 self.result = np.empty(self.q_input.nq*nout + extra_q, dtype) 555 556 # Allocate result value on GPU. 554 557 env = environment() 555 558 context = env.context[self.dtype] … … 557 560 self._result_b = cl.Buffer(context, mf.READ_WRITE, width) 558 561 559 def _call_kernel(self, call_details, values, cutoff, magnetic, effective_radius_type): 560 # type: (CallDetails, np.ndarray, np.ndarray, float, bool) -> np.ndarray 562 def _call_kernel(self, call_details, values, cutoff, magnetic, 563 effective_radius_type): 564 # type: (CallDetails, np.ndarray, float, bool, int) -> np.ndarray 561 565 env = environment() 562 566 queue = env.queue[self._model.dtype] 563 567 context = queue.context 564 568 565 # Arrange data transfer to /from card569 # Arrange data transfer to card. 
566 570 details_b = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, 567 571 hostbuf=call_details.buffer) … … 569 573 hostbuf=values) 570 574 575 # Setup kernel function and arguments. 571 576 name = 'Iq' if self.dim == '1d' else 'Imagnetic' if magnetic else 'Iqxy' 572 577 kernel = self._model.get_function(name) 573 578 kernel_args = [ 574 np.uint32(self.q_input.nq), None, None, 575 details_b, values_b, self.q_input.q_b, self._result_b, 576 self._as_dtype(cutoff), 577 np.uint32(effective_radius_type), 579 np.uint32(self.q_input.nq), # Number of inputs. 580 None, # Placeholder for pd_start. 581 None, # Placeholder for pd_stop. 582 details_b, # Problem definition. 583 values_b, # Parameter values. 584 self.q_input.q_b, # Q values. 585 self._result_b, # Result storage. 586 self._as_dtype(cutoff), # Probability cutoff. 587 np.uint32(effective_radius_type), # R_eff mode. 578 588 ] 589 590 # Call kernel and retrieve results. 579 591 #print("Calling OpenCL") 580 592 #call_details.show(values) 581 #Call kernel and retrieve results582 593 wait_for = None 583 594 last_nap = time.clock() … … 590 601 *kernel_args, wait_for=wait_for)] 591 602 if stop < call_details.num_eval: 592 # Allow other processes to run 603 # Allow other processes to run. 593 604 wait_for[0].wait() 594 605 current_time = time.clock() … … 599 610 #print("result", self.result) 600 611 601 # Free buffers 612 # Free buffers. 602 613 details_b.release() 603 614 values_b.release()
Note: See TracChangeset
for help on using the changeset viewer.