Changeset 3199b17 in sasmodels
- Timestamp:
- Mar 6, 2019 2:24:03 PM (6 years ago)
- Branches:
- master, core_shell_microgels, magnetic_model, ticket-1257-vesicle-product, ticket_1156, ticket_1265_superball, ticket_822_more_unit_tests
- Children:
- 4453136
- Parents:
- 00afc15
- Location:
- sasmodels
- Files:
-
- 5 edited
Legend:
- Unmodified
- Added
- Removed
-
sasmodels/kernel.py
re44432d r3199b17 23 23 # pylint: enable=unused-import 24 24 25 25 26 class KernelModel(object): 26 27 info = None # type: ModelInfo … … 33 34 # type: () -> None 34 35 pass 36 35 37 36 38 class Kernel(object): -
sasmodels/kernelcl.py
r00afc15 r3199b17 61 61 62 62 63 # Attempt to setup opencl. This may fail if the pyopencl package is not63 # Attempt to setup OpenCL. This may fail if the pyopencl package is not 64 64 # installed or if it is installed but there are no devices available. 65 65 try: … … 67 67 from pyopencl import mem_flags as mf 68 68 from pyopencl.characterize import get_fast_inaccurate_build_options 69 # Ask OpenCL for the default context so that we know that one exists 69 # Ask OpenCL for the default context so that we know that one exists. 70 70 cl.create_some_context(interactive=False) 71 71 HAVE_OPENCL = True … … 88 88 # pylint: enable=unused-import 89 89 90 # CRUFT: pyopencl < 2017.1 (as of June 2016 needs quotes around include path) 90 91 # CRUFT: pyopencl < 2017.1 (as of June 2016 needs quotes around include path). 91 92 def quote_path(v): 92 93 """ … … 99 100 return '"'+v+'"' if v and ' ' in v and not v[0] in "\"'-" else v 100 101 102 101 103 def fix_pyopencl_include(): 102 104 """ … … 105 107 import pyopencl as cl 106 108 if hasattr(cl, '_DEFAULT_INCLUDE_OPTIONS'): 107 cl._DEFAULT_INCLUDE_OPTIONS = [quote_path(v) for v in cl._DEFAULT_INCLUDE_OPTIONS] 109 cl._DEFAULT_INCLUDE_OPTIONS = [ 110 quote_path(v) for v in cl._DEFAULT_INCLUDE_OPTIONS 111 ] 112 108 113 109 114 if HAVE_OPENCL: … … 118 123 MAX_LOOPS = 2048 119 124 120 121 125 # Pragmas for enable OpenCL features. Be sure to protect them so that they 122 126 # still compile even if OpenCL is not present. … … 133 137 """ 134 138 139 135 140 def use_opencl(): 136 141 sas_opencl = os.environ.get("SAS_OPENCL", "OpenCL").lower() 137 142 return HAVE_OPENCL and sas_opencl != "none" and not sas_opencl.startswith("cuda") 138 143 144 139 145 ENV = None 140 146 def reset_environment(): … … 144 150 global ENV 145 151 ENV = GpuEnvironment() if use_opencl() else None 152 146 153 147 154 def environment(): … … 161 168 return ENV 162 169 170 163 171 def has_type(device, dtype): 164 172 # type: (cl.Device, np.dtype) -> bool … … 171 179 return "cl_khr_fp64" in device.extensions 172 180 else: 173 # Not supporting F16 type since it isn't accurate enough 181 # Not supporting F16 type since it isn't accurate enough. 174 182 return False 183 175 184 176 185 def get_warp(kernel, queue): … … 182 191 cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE, 183 192 queue.device) 193 184 194 185 195 def compile_model(context, source, dtype, fast=False): … … 203 213 source_list.insert(0, _F64_PRAGMA) 204 214 205 # Note: USE_SINCOS makes the intel cpu slower under opencl215 # Note: USE_SINCOS makes the Intel CPU slower under OpenCL. 206 216 if context.devices[0].type == cl.device_type.GPU: 207 217 source_list.insert(0, "#define USE_SINCOS\n") … … 210 220 source = "\n".join(source_list) 211 221 program = cl.Program(context, source).build(options=options) 222 212 223 #print("done with "+program) 213 224 return program 214 225 215 226 216 # for now, this returns one device in the context217 # TODO: create a context that contains all devices on all platforms227 # For now, this returns one device in the context. 228 # TODO: Create a context that contains all devices on all platforms. 218 229 class GpuEnvironment(object): 219 230 """ 220 GPU context, with possibly many devices, and one queue per device. 221 222 Because the environment can be reset during a live program (e.g., if the 223 user changes the active GPU device in the GUI), everything associated 224 with the device context must be cached in the environment and recreated 225 if the environment changes. The *cache* attribute is a simple dictionary 226 which holds keys and references to objects, such as compiled kernels and 227 allocated buffers. The running program should check in the cache for 228 long lived objects and create them if they are not there. The program 229 should not hold onto cached objects, but instead only keep them active 230 for the duration of a function call. When the environment is destroyed 231 then the *release* method for each active cache item is called before 232 the environment is freed. This means that each cl buffer should be 233 in its own cache entry. 231 GPU context for OpenCL, with possibly many devices and one queue per device. 234 232 """ 235 233 def __init__(self): 236 234 # type: () -> None 237 # find gpu context235 # Find gpu context. 238 236 context_list = _create_some_context() 239 237 … … 249 247 self.context[dtype] = None 250 248 251 # Build a queue for each context 249 # Build a queue for each context. 252 250 self.queue = {} 253 251 context = self.context[F32] … … 259 257 self.queue[F64] = cl.CommandQueue(context, context.devices[0]) 260 258 261 # Byte boundary for data alignment259 ## Byte boundary for data alignment. 262 260 #self.data_boundary = max(context.devices[0].min_data_type_align_size 263 261 # for context in self.context.values()) 264 262 265 # Cache for compiled programs, and for items in context 263 # Cache for compiled programs, and for items in context. 266 264 self.compiled = {} 267 265 … … 279 277 """ 280 278 # Note: PyOpenCL caches based on md5 hash of source, options and device 281 # so we don't really need to cache things for ourselves. I'll do so 282 # anyway just to save some data munging time. 279 # but I'll do so as well just to save some data munging time. 283 280 tag = generate.tag_source(source) 284 281 key = "%s-%s-%s%s"%(name, dtype, tag, ("-fast" if fast else "")) 285 # Check timestamp on program 282 # Check timestamp on program. 286 283 program, program_timestamp = self.compiled.get(key, (None, np.inf)) 287 284 if program_timestamp < timestamp: … … 296 293 return program 297 294 295 298 296 def _create_some_context(): 299 297 # type: () -> cl.Context … … 307 305 which one (and not a CUDA device, or no GPU). 308 306 """ 309 # Assume we do not get here if SAS_OPENCL is None or CUDA 307 # Assume we do not get here if SAS_OPENCL is None or CUDA. 310 308 sas_opencl = os.environ.get('SAS_OPENCL', 'opencl') 311 309 if sas_opencl.lower() != 'opencl': 312 # Setting PYOPENCL_CTX as a SAS_OPENCL to create cl context 310 # Setting PYOPENCL_CTX as a SAS_OPENCL to create cl context. 313 311 os.environ["PYOPENCL_CTX"] = sas_opencl 314 312 … … 318 316 except Exception as exc: 319 317 warnings.warn(str(exc)) 320 warnings.warn("pyopencl.create_some_context() failed") 321 warnings.warn("the environment variable 'SAS_OPENCL' or 'PYOPENCL_CTX' might not be set correctly") 318 warnings.warn("pyopencl.create_some_context() failed. The " 319 "environment variable 'SAS_OPENCL' or 'PYOPENCL_CTX' might " 320 "not be set correctly") 322 321 323 322 return _get_default_context() 323 324 324 325 325 def _get_default_context(): … … 334 334 # is running may increase throughput. 335 335 # 336 # Mac book pro, base install:336 # MacBook Pro, base install: 337 337 # {'Apple': [Intel CPU, NVIDIA GPU]} 338 # Mac book pro, base install:338 # MacBook Pro, base install: 339 339 # {'Apple': [Intel CPU, Intel GPU]} 340 # 2 x nvidia 295 with Intel and NVIDIA opencl drivers installed340 # 2 x NVIDIA 295 with Intel and NVIDIA opencl drivers install: 341 341 # {'Intel': [CPU], 'NVIDIA': [GPU, GPU, GPU, GPU]} 342 342 gpu, cpu = None, None … … 361 361 else: 362 362 # System has cl.device_type.ACCELERATOR or cl.device_type.CUSTOM 363 # Intel Phi for example registers as an accelerator 363 # Intel Phi for example registers as an accelerator. 364 364 # Since the user installed a custom device on their system 365 365 # and went through the pain of sorting out OpenCL drivers for … … 368 368 gpu = device 369 369 370 # order the devices by gpu then by cpu; when searching for an available370 # Order the devices by gpu then by cpu; when searching for an available 371 371 # device by data type they will be checked in this order, which means 372 372 # that if the gpu supports double then the cpu will never be used (though … … 395 395 that the compiler is allowed to take shortcuts. 396 396 """ 397 info = None # type: ModelInfo398 source = "" # type: str399 dtype = None # type: np.dtype400 fast = False # type: bool401 _program = None # type: cl.Program402 _kernels = None # type: Dict[str, cl.Kernel]397 info = None # type: ModelInfo 398 source = "" # type: str 399 dtype = None # type: np.dtype 400 fast = False # type: bool 401 _program = None # type: cl.Program 402 _kernels = None # type: Dict[str, cl.Kernel] 403 403 404 404 def __init__(self, source, model_info, dtype=generate.F32, fast=False): … … 446 446 functions = [getattr(program, k) for k in names] 447 447 self._kernels = {k: v for k, v in zip(variants, functions)} 448 # keep a handle to program so GC doesn't collect448 # Keep a handle to program so GC doesn't collect. 449 449 self._program = program 450 450 451 # TODO: check that we don't need a destructor for buffers which go out of scope 451 452 # TODO: Check that we don't need a destructor for buffers which go out of scope. 452 453 class GpuInput(object): 453 454 """ … … 471 472 def __init__(self, q_vectors, dtype=generate.F32): 472 473 # type: (List[np.ndarray], np.dtype) -> None 473 # TODO: do we ever need double precision q?474 # TODO: Do we ever need double precision q? 474 475 self.nq = q_vectors[0].size 475 476 self.dtype = np.dtype(dtype) 476 477 self.is_2d = (len(q_vectors) == 2) 477 # TODO: stretch input based on get_warp()478 # not doing it now since warp depends on kernel, which is not known478 # TODO: Stretch input based on get_warp(). 479 # Not doing it now since warp depends on kernel, which is not known 479 480 # at this point, so instead using 32, which is good on the set of 480 481 # architectures tested so far. … … 491 492 #print("creating inputs of size", self.global_size) 492 493 493 # transfer input value to gpu494 # Transfer input value to GPU. 494 495 env = environment() 495 496 context = env.context[self.dtype] … … 500 501 # type: () -> None 501 502 """ 502 Free the buffer associated with the q value 503 Free the buffer associated with the q value. 503 504 """ 504 505 if self.q_b is not None: … … 509 510 # type: () -> None 510 511 self.release() 512 511 513 512 514 class GpuKernel(Kernel): … … 524 526 Call :meth:`release` when done with the kernel instance. 525 527 """ 526 #: SAS model information structure 527 info = None # type: ModelInfo528 #: kernel precision529 dtype = None # type: np.dtype530 #: kernel dimensions (1d or 2d)531 dim = "" # type: str532 #: calculation results, updated after each call to :meth:`_call_kernel`533 result = None # type: np.ndarray528 #: SAS model information structure. 529 info = None # type: ModelInfo 530 #: Kernel precision. 531 dtype = None # type: np.dtype 532 #: Kernel dimensions (1d or 2d). 533 dim = "" # type: str 534 #: Calculation results, updated after each call to :meth:`_call_kernel`. 535 result = None # type: np.ndarray 534 536 535 537 def __init__(self, model, q_vectors): … … 538 540 self.q_input = GpuInput(q_vectors, dtype) 539 541 self._model = model 540 # F16 isn't sufficient, so don't support it 541 self._as_dtype = np.float64 if dtype == generate.F64 else np.float32 542 543 # attributes accessed from the outside 542 543 # Attributes accessed from the outside. 544 544 self.dim = '2d' if self.q_input.is_2d else '1d' 545 545 self.info = model.info 546 self.dtype = model.dtype 547 548 # holding place for the returned value 546 self.dtype = dtype 547 548 # Converter to translate input to target type. 549 self._as_dtype = np.float64 if dtype == generate.F64 else np.float32 550 551 # Holding place for the returned value. 549 552 nout = 2 if self.info.have_Fq and self.dim == '1d' else 1 550 extra_q = 4 # total weight, form volume, shell volume and R_eff551 self.result = np.empty(self.q_input.nq*nout +extra_q, dtype)552 553 # allocate result value on gpu553 extra_q = 4 # Total weight, form volume, shell volume and R_eff. 554 self.result = np.empty(self.q_input.nq*nout + extra_q, dtype) 555 556 # Allocate result value on GPU. 554 557 env = environment() 555 558 context = env.context[self.dtype] … … 557 560 self._result_b = cl.Buffer(context, mf.READ_WRITE, width) 558 561 559 def _call_kernel(self, call_details, values, cutoff, magnetic, effective_radius_type): 560 # type: (CallDetails, np.ndarray, np.ndarray, float, bool) -> np.ndarray 562 def _call_kernel(self, call_details, values, cutoff, magnetic, 563 effective_radius_type): 564 # type: (CallDetails, np.ndarray, float, bool, int) -> np.ndarray 561 565 env = environment() 562 566 queue = env.queue[self._model.dtype] 563 567 context = queue.context 564 568 565 # Arrange data transfer to /from card569 # Arrange data transfer to card. 566 570 details_b = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, 567 571 hostbuf=call_details.buffer) … … 569 573 hostbuf=values) 570 574 575 # Setup kernel function and arguments. 571 576 name = 'Iq' if self.dim == '1d' else 'Imagnetic' if magnetic else 'Iqxy' 572 577 kernel = self._model.get_function(name) 573 578 kernel_args = [ 574 np.uint32(self.q_input.nq), None, None, 575 details_b, values_b, self.q_input.q_b, self._result_b, 576 self._as_dtype(cutoff), 577 np.uint32(effective_radius_type), 579 np.uint32(self.q_input.nq), # Number of inputs. 580 None, # Placeholder for pd_start. 581 None, # Placeholder for pd_stop. 582 details_b, # Problem definition. 583 values_b, # Parameter values. 584 self.q_input.q_b, # Q values. 585 self._result_b, # Result storage. 586 self._as_dtype(cutoff), # Probability cutoff. 587 np.uint32(effective_radius_type), # R_eff mode. 578 588 ] 589 590 # Call kernel and retrieve results. 579 591 #print("Calling OpenCL") 580 592 #call_details.show(values) 581 #Call kernel and retrieve results582 593 wait_for = None 583 594 last_nap = time.clock() … … 590 601 *kernel_args, wait_for=wait_for)] 591 602 if stop < call_details.num_eval: 592 # Allow other processes to run 603 # Allow other processes to run. 593 604 wait_for[0].wait() 594 605 current_time = time.clock() … … 599 610 #print("result", self.result) 600 611 601 # Free buffers 612 # Free buffers. 602 613 details_b.release() 603 614 values_b.release() -
sasmodels/kernelcuda.py
r00afc15 r3199b17 68 68 69 69 70 # Attempt to setup cuda. This may fail if the pycuda package is not70 # Attempt to setup CUDA. This may fail if the pycuda package is not 71 71 # installed or if it is installed but there are no devices available. 72 72 try: … … 108 108 MAX_LOOPS = 2048 109 109 110 110 111 def use_cuda(): 111 env = os.environ.get("SAS_OPENCL", "").lower() 112 return HAVE_CUDA and (env == "" or env.startswith("cuda")) 112 sas_opencl = os.environ.get("SAS_OPENCL", "CUDA").lower() 113 return HAVE_CUDA and sas_opencl.startswith("cuda") 114 113 115 114 116 ENV = None … … 122 124 ENV.release() 123 125 ENV = GpuEnvironment() if use_cuda() else None 126 124 127 125 128 def environment(): … … 139 142 return ENV 140 143 141 def free_context():142 global ENV143 if ENV is not None:144 ENV.release()145 ENV = None146 147 atexit.register(free_context)148 144 149 145 def has_type(dtype): … … 152 148 Return true if device supports the requested precision. 153 149 """ 154 # Assume the nvidiacard supports 32-bit and 64-bit floats.155 # TODO: check if pycuda support F16150 # Assume the NVIDIA card supports 32-bit and 64-bit floats. 151 # TODO: Check if pycuda support F16. 156 152 return dtype in (generate.F32, generate.F64) 157 153 158 154 159 155 FUNCTION_PATTERN = re.compile(r"""^ 160 (?P<space>\s*) # initial space161 (?P<qualifiers>^(?:\s*\b\w+\b\s*)+) # one or more qualifiers before function162 (?P<function>\s*\b\w+\b\s*[(]) # function name plus open parens156 (?P<space>\s*) # Initial space. 157 (?P<qualifiers>^(?:\s*\b\w+\b\s*)+) # One or more qualifiers before function. 158 (?P<function>\s*\b\w+\b\s*[(]) # Function name plus open parens. 163 159 """, re.VERBOSE|re.MULTILINE) 164 160 … … 167 163 """, re.VERBOSE|re.MULTILINE) 168 164 165 169 166 def _add_device_tag(match): 170 167 # type: (None) -> str 171 # Note: should be re.Match, but that isn't a simple type168 # Note: Should be re.Match, but that isn't a simple type. 172 169 """ 173 170 replace qualifiers with __device__ qualifiers if needed … … 182 179 return "".join((space, "__device__ ", qualifiers, function)) 183 180 181 184 182 def mark_device_functions(source): 185 183 # type: (str) -> str … … 188 186 """ 189 187 return FUNCTION_PATTERN.sub(_add_device_tag, source) 188 190 189 191 190 def show_device_functions(source): … … 197 196 print(match.group('qualifiers').replace('\n',r'\n'), match.group('function'), '(') 198 197 return source 198 199 199 200 200 def compile_model(source, dtype, fast=False): … … 221 221 #options = ['--verbose', '-E'] 222 222 options = ['--use_fast_math'] if fast else None 223 program = SourceModule(source, no_extern_c=True, options=options) # include_dirs=[...]223 program = SourceModule(source, no_extern_c=True, options=options) #, include_dirs=[...]) 224 224 225 225 #print("done with "+program) … … 227 227 228 228 229 # for now, this returns one device in the context230 # TODO: create a context that contains all devices on all platforms229 # For now, this returns one device in the context. 230 # TODO: Create a context that contains all devices on all platforms. 231 231 class GpuEnvironment(object): 232 232 """ 233 GPU context , with possibly many devices, and one queue per device.233 GPU context for CUDA. 234 234 """ 235 235 context = None # type: cuda.Context 236 236 def __init__(self, devnum=None): 237 237 # type: (int) -> None 238 # Byte boundary for data alignment239 #self.data_boundary = max(d.min_data_type_align_size240 # for d in self.context.devices)241 self.compiled = {}242 238 env = os.environ.get("SAS_OPENCL", "").lower() 243 239 if devnum is None and env.startswith("cuda:"): 244 240 devnum = int(env[5:]) 241 245 242 # Set the global context to the particular device number if one is 246 243 # given, otherwise use the default context. Perhaps this will be set … … 251 248 self.context = make_default_context() 252 249 250 ## Byte boundary for data alignment. 251 #self.data_boundary = max(d.min_data_type_align_size 252 # for d in self.context.devices) 253 254 # Cache for compiled programs, and for items in context. 255 self.compiled = {} 256 253 257 def release(self): 254 258 if self.context is not None: … … 271 275 Compile the program for the device in the given context. 272 276 """ 273 # Note: PyOpenCL caches based on md5 hash of source, options and device 274 # so we don't really need to cache things for ourselves. I'll do so 275 # anyway just to save some data munging time. 277 # Note: PyCuda (probably) caches but I'll do so as well just to 278 # save some data munging time. 276 279 tag = generate.tag_source(source) 277 280 key = "%s-%s-%s%s"%(name, dtype, tag, ("-fast" if fast else "")) 278 # Check timestamp on program 281 # Check timestamp on program. 279 282 program, program_timestamp = self.compiled.get(key, (None, np.inf)) 280 283 if program_timestamp < timestamp: … … 286 289 return program 287 290 291 288 292 class GpuModel(KernelModel): 289 293 """ … … 301 305 that the compiler is allowed to take shortcuts. 302 306 """ 303 info = None # type: ModelInfo304 source = "" # type: str305 dtype = None # type: np.dtype306 fast = False # type: bool307 _program = None # type: SourceModule308 _kernels = None # type: Dict[str, cuda.Function]307 info = None # type: ModelInfo 308 source = "" # type: str 309 dtype = None # type: np.dtype 310 fast = False # type: bool 311 _program = None # type: SourceModule 312 _kernels = None # type: Dict[str, cuda.Function] 309 313 310 314 def __init__(self, source, model_info, dtype=generate.F32, fast=False): … … 352 356 functions = [program.get_function(k) for k in names] 353 357 self._kernels = {k: v for k, v in zip(variants, functions)} 354 # keep a handle to program so GC doesn't collect358 # Keep a handle to program so GC doesn't collect. 355 359 self._program = program 356 360 357 # TODO: check that we don't need a destructor for buffers which go out of scope 361 362 # TODO: Check that we don't need a destructor for buffers which go out of scope. 358 363 class GpuInput(object): 359 364 """ … … 377 382 def __init__(self, q_vectors, dtype=generate.F32): 378 383 # type: (List[np.ndarray], np.dtype) -> None 379 # TODO: do we ever need double precision q?384 # TODO: Do we ever need double precision q? 380 385 self.nq = q_vectors[0].size 381 386 self.dtype = np.dtype(dtype) 382 387 self.is_2d = (len(q_vectors) == 2) 383 # TODO: stretch input based on get_warp()384 # not doing it now since warp depends on kernel, which is not known388 # TODO: Stretch input based on get_warp(). 389 # Not doing it now since warp depends on kernel, which is not known 385 390 # at this point, so instead using 32, which is good on the set of 386 391 # architectures tested so far. 387 392 if self.is_2d: 388 # Note: 16 rather than 15 because result is 1 longer than input. 389 width = ((self.nq+16)//16)*16 393 width = ((self.nq+15)//16)*16 390 394 self.q = np.empty((width, 2), dtype=dtype) 391 395 self.q[:self.nq, 0] = q_vectors[0] 392 396 self.q[:self.nq, 1] = q_vectors[1] 393 397 else: 394 # Note: 32 rather than 31 because result is 1 longer than input. 395 width = ((self.nq+32)//32)*32 398 width = ((self.nq+31)//32)*32 396 399 self.q = np.empty(width, dtype=dtype) 397 400 self.q[:self.nq] = q_vectors[0] … … 399 402 #print("creating inputs of size", self.global_size) 400 403 401 # transfer input value to gpu404 # Transfer input value to GPU. 402 405 self.q_b = cuda.to_device(self.q) 403 406 … … 405 408 # type: () -> None 406 409 """ 407 Free the memory.410 Free the buffer associated with the q value. 408 411 """ 409 412 if self.q_b is not None: … … 414 417 # type: () -> None 415 418 self.release() 419 416 420 417 421 class GpuKernel(Kernel): … … 429 433 Call :meth:`release` when done with the kernel instance. 430 434 """ 431 #: SAS model information structure 432 info = None # type: ModelInfo433 #: kernel precision434 dtype = None # type: np.dtype435 #: kernel dimensions (1d or 2d)436 dim = "" # type: str437 #: calculation results, updated after each call to :meth:`_call_kernel`438 result = None # type: np.ndarray435 #: SAS model information structure. 436 info = None # type: ModelInfo 437 #: Kernel precision. 438 dtype = None # type: np.dtype 439 #: Kernel dimensions (1d or 2d). 440 dim = "" # type: str 441 #: Calculation results, updated after each call to :meth:`_call_kernel`. 442 result = None # type: np.ndarray 439 443 440 444 def __init__(self, model, q_vectors): … … 443 447 self.q_input = GpuInput(q_vectors, dtype) 444 448 self._model = model 445 # F16 isn't sufficient, so don't support it 446 self._as_dtype = np.float64 if dtype == generate.F64 else np.float32 447 448 # attributes accessed from the outside 449 450 # Attributes accessed from the outside. 449 451 self.dim = '2d' if self.q_input.is_2d else '1d' 450 452 self.info = model.info 451 self.dtype = model.dtype 452 453 # holding place for the returned value 453 self.dtype = dtype 454 455 # Converter to translate input to target type. 456 self._as_dtype = np.float64 if dtype == generate.F64 else np.float32 457 458 # Holding place for the returned value. 454 459 nout = 2 if self.info.have_Fq and self.dim == '1d' else 1 455 extra_q = 4 # total weight, form volume, shell volume and R_eff456 self.result = np.empty(self.q_input.nq*nout +extra_q, dtype)457 458 # allocate result value on gpu460 extra_q = 4 # Total weight, form volume, shell volume and R_eff. 461 self.result = np.empty(self.q_input.nq*nout + extra_q, dtype) 462 463 # Allocate result value on GPU. 459 464 width = ((self.result.size+31)//32)*32 * self.dtype.itemsize 460 465 self._result_b = cuda.mem_alloc(width) 461 466 462 def _call_kernel(self, call_details, values, cutoff, magnetic, effective_radius_type): 463 # type: (CallDetails, np.ndarray, np.ndarray, float, bool) -> np.ndarray 464 # Arrange data transfer to card 467 def _call_kernel(self, call_details, values, cutoff, magnetic, 468 effective_radius_type): 469 # type: (CallDetails, np.ndarray, float, bool, int) -> np.ndarray 470 471 # Arrange data transfer to card. 465 472 details_b = cuda.to_device(call_details.buffer) 466 473 values_b = cuda.to_device(values) 467 474 475 # Setup kernel function and arguments. 468 476 name = 'Iq' if self.dim == '1d' else 'Imagnetic' if magnetic else 'Iqxy' 469 477 kernel = self._model.get_function(name) 470 478 kernel_args = [ 471 np.uint32(self.q_input.nq), None, None, 472 details_b, values_b, self.q_input.q_b, self._result_b, 473 self._as_dtype(cutoff), 474 np.uint32(effective_radius_type), 479 np.uint32(self.q_input.nq), # Number of inputs. 480 None, # Placeholder for pd_start. 481 None, # Placeholder for pd_stop. 482 details_b, # Problem definition. 483 values_b, # Parameter values. 484 self.q_input.q_b, # Q values. 485 self._result_b, # Result storage. 486 self._as_dtype(cutoff), # Probability cutoff. 487 np.uint32(effective_radius_type), # R_eff mode. 475 488 ] 476 489 grid = partition(self.q_input.nq) 477 #print("Calling OpenCL") 490 491 # Call kernel and retrieve results. 492 #print("Calling CUDA") 478 493 #call_details.show(values) 479 # Call kernel and retrieve results480 494 last_nap = time.clock() 481 495 step = 100000000//self.q_input.nq + 1 … … 488 502 if stop < call_details.num_eval: 489 503 sync() 490 # Allow other processes to run 504 # Allow other processes to run. 491 505 current_time = time.clock() 492 506 if current_time - last_nap > 0.5: … … 522 536 Note: Maybe context.synchronize() is sufficient. 523 537 """ 524 #return # The following works in C++; don't know what pycuda is doing 525 # Create an event with which to synchronize 538 # Create an event with which to synchronize. 526 539 done = cuda.Event() 527 540 … … 529 542 done.record() 530 543 531 # line added to not hog resources544 # Make sure we don't hog resource while waiting to sync. 532 545 while not done.query(): 533 546 time.sleep(0.01) … … 535 548 # Block until the GPU executes the kernel. 536 549 done.synchronize() 550 537 551 # Clean up the event; I don't think they can be reused. 538 552 del done -
sasmodels/kerneldll.py
re44432d r3199b17 100 100 # pylint: enable=unused-import 101 101 102 # Compiler output is a byte stream that needs to be decode in python 3 102 # Compiler output is a byte stream that needs to be decode in python 3. 103 103 decode = (lambda s: s) if sys.version_info[0] < 3 else (lambda s: s.decode('utf8')) 104 104 … … 115 115 COMPILER = "tinycc" 116 116 elif "VCINSTALLDIR" in os.environ: 117 # If vcvarsall.bat has been called, then VCINSTALLDIR is in the environment 118 # and we can use the MSVC compiler. Otherwise, if tinycc is available 119 # the use it. Otherwise, hope that mingw is available. 117 # If vcvarsall.bat has been called, then VCINSTALLDIR is in the 118 # environment and we can use the MSVC compiler. Otherwise, if 119 # tinycc is available then use it. Otherwise, hope that mingw 120 # is available. 120 121 COMPILER = "msvc" 121 122 else: … … 124 125 COMPILER = "unix" 125 126 126 ARCH = "" if ct.sizeof(ct.c_void_p) > 4 else "x86" # 4 byte pointers on x86 127 ARCH = "" if ct.sizeof(ct.c_void_p) > 4 else "x86" # 4 byte pointers on x86. 127 128 if COMPILER == "unix": 128 # Generic unix compile 129 # On mac users will need the X code command line tools installed129 # Generic unix compile. 130 # On Mac users will need the X code command line tools installed. 130 131 #COMPILE = "gcc-mp-4.7 -shared -fPIC -std=c99 -fopenmp -O2 -Wall %s -o %s -lm -lgomp" 131 132 CC = "cc -shared -fPIC -std=c99 -O2 -Wall".split() 132 # add openmp support if not running on a mac133 # Add OpenMP support if not running on a Mac. 133 134 if sys.platform != "darwin": 134 # OpenMP seems to be broken on gcc 5.4.0 (ubuntu 16.04.9) 135 # OpenMP seems to be broken on gcc 5.4.0 (ubuntu 16.04.9). 135 136 # Shut it off for all unix until we can investigate. 136 137 #CC.append("-fopenmp") … … 144 145 # vcomp90.dll on the path. One may be found here: 145 146 # C:/Windows/winsxs/x86_microsoft.vc90.openmp*/vcomp90.dll 146 # Copy this to the python directory and uncomment the OpenMP COMPILE 147 # TODO: remove intermediate OBJ file created in the directory148 # TODO: maybe don't use randomized name for the c file149 # TODO: maybe ask distutils to find MSVC147 # Copy this to the python directory and uncomment the OpenMP COMPILE. 148 # TODO: Remove intermediate OBJ file created in the directory. 149 # TODO: Maybe don't use randomized name for the c file. 150 # TODO: Maybe ask distutils to find MSVC. 150 151 CC = "cl /nologo /Ox /MD /W3 /GS- /DNDEBUG".split() 151 152 if "SAS_OPENMP" in os.environ: … … 172 173 ALLOW_SINGLE_PRECISION_DLLS = True 173 174 175 174 176 def compile(source, output): 175 177 # type: (str, str) -> None … … 183 185 logging.info(command_str) 184 186 try: 185 # need shell=True on windows to keep console box from popping up187 # Need shell=True on windows to keep console box from popping up. 186 188 shell = (os.name == 'nt') 187 189 subprocess.check_output(command, shell=shell, stderr=subprocess.STDOUT) … … 192 194 raise RuntimeError("compile failed. File is in %r"%source) 193 195 196 194 197 def dll_name(model_info, dtype): 195 198 # type: (ModelInfo, np.dtype) -> str … … 202 205 basename += ARCH + ".so" 203 206 204 # Hack to find precompiled dlls 207 # Hack to find precompiled dlls. 205 208 path = joinpath(generate.DATA_PATH, '..', 'compiled_models', basename) 206 209 if os.path.exists(path): … … 242 245 raise ValueError("16 bit floats not supported") 243 246 if dtype == F32 and not ALLOW_SINGLE_PRECISION_DLLS: 244 dtype = F64 # Force 64-bit dll 245 # Note: dtype may be F128 for long double precision 247 dtype = F64 # Force 64-bit dll. 248 # Note: dtype may be F128 for long double precision. 246 249 247 250 dll = dll_path(model_info, dtype) … … 254 257 need_recompile = dll_time < newest_source 255 258 if need_recompile: 256 # Make sure the DLL path exists 259 # Make sure the DLL path exists. 257 260 if not os.path.exists(SAS_DLL_PATH): 258 261 os.makedirs(SAS_DLL_PATH) … … 263 266 file_handle.write(source) 264 267 compile(source=filename, output=dll) 265 # comment the following to keep the generated c file266 # Note: if there is a syntax error then compile raises an error268 # Comment the following to keep the generated C file. 269 # Note: If there is a syntax error then compile raises an error 267 270 # and the source file will not be deleted. 268 271 os.unlink(filename) … … 303 306 self.dllpath = dllpath 304 307 self._dll = None # type: ct.CDLL 305 self._kernels = None # type: List[Callable, Callable]308 self._kernels = None # type: List[Callable, Callable] 306 309 self.dtype = np.dtype(dtype) 307 310 … … 338 341 # type: (List[np.ndarray]) -> DllKernel 339 342 q_input = PyInput(q_vectors, self.dtype) 340 # Note: pickle not supported for DllKernel343 # Note: DLL is lazy loaded. 341 344 if self._dll is None: 342 345 self._load_dll() … … 358 361 self._dll = None 359 362 363 360 364 class DllKernel(Kernel): 361 365 """ … … 379 383 def __init__(self, kernel, model_info, q_input): 380 384 # type: (Callable[[], np.ndarray], ModelInfo, PyInput) -> None 381 #,model_info,q_input) 385 dtype = q_input.dtype 386 self.q_input = q_input 382 387 self.kernel = kernel 388 389 # Attributes accessed from the outside. 390 self.dim = '2d' if q_input.is_2d else '1d' 383 391 self.info = model_info 384 self.q_input = q_input 385 self.dtype = q_input.dtype 386 self.dim = '2d' if q_input.is_2d else '1d' 387 # leave room for f1/f2 results in case we need to compute beta for 1d models 392 self.dtype = dtype 393 394 # Converter to translate input to target type. 395 self._as_dtype = (np.float32 if dtype == generate.F32 396 else np.float64 if dtype == generate.F64 397 else np.float128) 398 399 # Holding place for the returned value. 388 400 nout = 2 if self.info.have_Fq else 1 389 # +4 for total weight, shell volume, effective radius, form volume390 self.result = np.empty( q_input.nq*nout + 4, self.dtype)391 self.real = (np.float32 if self.q_input.dtype == generate.F32 392 else np.float64 if self.q_input.dtype == generate.F64393 e lse np.float128)394 395 def _call_kernel(self, call_details, values, cutoff, magnetic, effective_radius_type): 396 # type: (CallDetails, np.ndarray, np.ndarray, float, bool, int) -> np.ndarray401 extra_q = 4 # Total weight, form volume, shell volume and R_eff. 402 self.result = np.empty(self.q_input.nq*nout + extra_q, dtype) 403 404 def _call_kernel(self, call_details, values, cutoff, magnetic, 405 effective_radius_type): 406 # type: (CallDetails, np.ndarray, float, bool, int) -> np.ndarray 407 408 # Setup kernel function and arguments. 397 409 kernel = self.kernel[1 if magnetic else 0] 398 args = [399 self.q_input.nq, # nq400 None, # pd_start401 None, # pd_stop pd_stride[MAX_PD]402 call_details.buffer.ctypes.data, # problem403 values.ctypes.data, # pars404 self.q_input.q.ctypes.data, # q405 self.result.ctypes.data, # results406 self. real(cutoff), # cutoff407 effective_radius_type, # cutoff410 kernel_args = [ 411 self.q_input.nq, # Number of inputs. 412 None, # Placeholder for pd_start. 413 None, # Placeholder for pd_stop. 414 call_details.buffer.ctypes.data, # Problem definition. 415 values.ctypes.data, # Parameter values. 416 self.q_input.q.ctypes.data, # Q values. 417 self.result.ctypes.data, # Result storage. 418 self._as_dtype(cutoff), # Probability cutoff. 419 effective_radius_type, # R_eff mode. 408 420 ] 421 422 # Call kernel and retrieve results. 409 423 #print("Calling DLL") 410 424 #call_details.show(values) 411 425 step = 100 426 # TODO: Do we need the explicit sleep like the OpenCL and CUDA loops? 412 427 for start in range(0, call_details.num_eval, step): 413 428 stop = min(start + step, call_details.num_eval) 414 args[1:3] = [start, stop]415 kernel(* args) # type: ignore429 kernel_args[1:3] = [start, stop] 430 kernel(*kernel_args) # type: ignore 416 431 417 432 def release(self): 418 433 # type: () -> None 419 434 """ 420 Release anyresources associated with the kernel.435 Release resources associated with the kernel. 421 436 """ 422 self.q_input.release() 437 # TODO: OpenCL/CUDA allocate q_input in __init__ and free it in release. 438 # Should we be doing the same for DLL? 439 #self.q_input.release() 440 pass 441 442 def __del__(self): 443 # type: () -> None 444 self.release() -
sasmodels/kernelpy.py
raa8c6e0 r3199b17 33 33 logger = logging.getLogger(__name__) 34 34 35 35 36 class PyModel(KernelModel): 36 37 """ … … 38 39 """ 39 40 def __init__(self, model_info): 40 # Make sure Iq is available and vectorized 41 # Make sure Iq is available and vectorized. 41 42 _create_default_functions(model_info) 42 43 self.info = model_info … … 53 54 """ 54 55 pass 56 55 57 56 58 class PyInput(object): … … 91 93 self.q = None 92 94 95 93 96 class PyKernel(Kernel): 94 97 """ … … 131 134 parameter_vector = np.empty(len(partable.call_parameters)-2, 'd') 132 135 133 # Create views into the array to hold the arguments 136 # Create views into the array to hold the arguments. 134 137 offset = 0 135 138 kernel_args, volume_args = [], [] … … 174 177 else (lambda mode: 1.0)) 175 178 176 177 178 179 def _call_kernel(self, call_details, values, cutoff, magnetic, effective_radius_type): 179 180 # type: (CallDetails, np.ndarray, np.ndarray, float, bool) -> np.ndarray … … 195 196 self.q_input.release() 196 197 self.q_input = None 198 197 199 198 200 def _loops(parameters, # type: np.ndarray … … 254 256 total = np.zeros(nq, 'd') 255 257 for loop_index in range(call_details.num_eval): 256 # update polydispersity parameter values258 # Update polydispersity parameter values. 257 259 if p0_index == p0_length: 258 260 pd_index = (loop_index//pd_stride)%pd_length … … 265 267 p0_index += 1 266 268 if weight > cutoff: 267 # Call the scattering function 269 # Call the scattering function. 268 270 # Assume that NaNs are only generated if the parameters are bad; 269 271 # exclude all q for that NaN. Even better would be to have an … … 273 275 continue 274 276 275 # update value and norm277 # Update value and norm. 276 278 total += weight * Iq 277 279 weight_norm += weight … … 293 295 any functions that are not already marked as vectorized. 294 296 """ 295 # Note: must call create_vector_Iq before create_vector_Iqxy297 # Note: Must call create_vector_Iq before create_vector_Iqxy. 296 298 _create_vector_Iq(model_info) 297 299 _create_vector_Iqxy(model_info)
Note: See TracChangeset
for help on using the changeset viewer.