Changes in sasmodels/kernelcuda.py [fa26e78:f872fd1] in sasmodels
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
sasmodels/kernelcuda.py
rfa26e78 rf872fd1 63 63 import time 64 64 import re 65 import atexit66 65 67 66 import numpy as np # type: ignore 68 67 69 68 70 # Attempt to setup CUDA. This may fail if the pycuda package is not69 # Attempt to setup cuda. This may fail if the pycuda package is not 71 70 # installed or if it is installed but there are no devices available. 72 71 try: … … 108 107 MAX_LOOPS = 2048 109 108 110 111 109 def use_cuda(): 112 sas_opencl = os.environ.get("SAS_OPENCL", "CUDA").lower() 113 return HAVE_CUDA and sas_opencl.startswith("cuda") 114 110 env = os.environ.get("SAS_OPENCL", "").lower() 111 return HAVE_CUDA and (env == "" or env.startswith("cuda")) 115 112 116 113 ENV = None … … 124 121 ENV.release() 125 122 ENV = GpuEnvironment() if use_cuda() else None 126 127 123 128 124 def environment(): … … 142 138 return ENV 143 139 144 145 # PyTest is not freeing ENV, so make sure it gets freed.146 atexit.register(lambda: ENV.release() if ENV is not None else None)147 148 149 140 def has_type(dtype): 150 141 # type: (np.dtype) -> bool … … 152 143 Return true if device supports the requested precision. 153 144 """ 154 # Assume the NVIDIAcard supports 32-bit and 64-bit floats.155 # TODO: Check if pycuda support F16.145 # Assume the nvidia card supports 32-bit and 64-bit floats. 146 # TODO: check if pycuda support F16 156 147 return dtype in (generate.F32, generate.F64) 157 148 158 149 159 150 FUNCTION_PATTERN = re.compile(r"""^ 160 (?P<space>\s*) # Initial space.161 (?P<qualifiers>^(?:\s*\b\w+\b\s*)+) # One or more qualifiers before function.162 (?P<function>\s*\b\w+\b\s*[(]) # Function name plus open parens.151 (?P<space>\s*) # initial space 152 (?P<qualifiers>^(?:\s*\b\w+\b\s*)+) # one or more qualifiers before function 153 (?P<function>\s*\b\w+\b\s*[(]) # function name plus open parens 163 154 """, re.VERBOSE|re.MULTILINE) 164 155 … … 167 158 """, re.VERBOSE|re.MULTILINE) 168 159 169 170 160 def _add_device_tag(match): 171 161 # type: (None) -> str 172 # Note: Should be re.Match, but that isn't a simple type.162 # Note: should be re.Match, but that isn't a simple type 173 163 """ 174 164 replace qualifiers with __device__ qualifiers if needed … … 183 173 return "".join((space, "__device__ ", qualifiers, function)) 184 174 185 186 175 def mark_device_functions(source): 187 176 # type: (str) -> str … … 190 179 """ 191 180 return FUNCTION_PATTERN.sub(_add_device_tag, source) 192 193 181 194 182 def show_device_functions(source): … … 200 188 print(match.group('qualifiers').replace('\n',r'\n'), match.group('function'), '(') 201 189 return source 202 203 190 204 191 def compile_model(source, dtype, fast=False): … … 225 212 #options = ['--verbose', '-E'] 226 213 options = ['--use_fast_math'] if fast else None 227 program = SourceModule(source, no_extern_c=True, options=options) # , include_dirs=[...])214 program = SourceModule(source, no_extern_c=True, options=options) # include_dirs=[...] 228 215 229 216 #print("done with "+program) … … 231 218 232 219 233 # For now, this returns one device in the context.234 # TODO: Create a context that contains all devices on all platforms.220 # for now, this returns one device in the context 221 # TODO: create a context that contains all devices on all platforms 235 222 class GpuEnvironment(object): 236 223 """ 237 GPU context for CUDA.224 GPU context, with possibly many devices, and one queue per device. 238 225 """ 239 226 context = None # type: cuda.Context 240 227 def __init__(self, devnum=None): 241 228 # type: (int) -> None 229 # Byte boundary for data alignment 230 #self.data_boundary = max(d.min_data_type_align_size 231 # for d in self.context.devices) 232 self.compiled = {} 242 233 env = os.environ.get("SAS_OPENCL", "").lower() 243 234 if devnum is None and env.startswith("cuda:"): 244 235 devnum = int(env[5:]) 245 246 236 # Set the global context to the particular device number if one is 247 237 # given, otherwise use the default context. Perhaps this will be set … … 252 242 self.context = make_default_context() 253 243 254 ## Byte boundary for data alignment.255 #self.data_boundary = max(d.min_data_type_align_size256 # for d in self.context.devices)257 258 # Cache for compiled programs, and for items in context.259 self.compiled = {}260 261 244 def release(self): 262 245 if self.context is not None: … … 279 262 Compile the program for the device in the given context. 280 263 """ 281 # Note: PyCuda (probably) caches but I'll do so as well just to 282 # save some data munging time. 264 # Note: PyOpenCL caches based on md5 hash of source, options and device 265 # so we don't really need to cache things for ourselves. I'll do so 266 # anyway just to save some data munging time. 283 267 tag = generate.tag_source(source) 284 268 key = "%s-%s-%s%s"%(name, dtype, tag, ("-fast" if fast else "")) 285 # Check timestamp on program .269 # Check timestamp on program 286 270 program, program_timestamp = self.compiled.get(key, (None, np.inf)) 287 271 if program_timestamp < timestamp: … … 293 277 return program 294 278 295 296 279 class GpuModel(KernelModel): 297 280 """ … … 309 292 that the compiler is allowed to take shortcuts. 310 293 """ 311 info = None 312 source = "" 313 dtype = None 314 fast = False 315 _program = None# type: SourceModule316 _kernels = None # type: Dict[str,cuda.Function]294 info = None # type: ModelInfo 295 source = "" # type: str 296 dtype = None # type: np.dtype 297 fast = False # type: bool 298 program = None # type: SourceModule 299 _kernels = None # type: List[cuda.Function] 317 300 318 301 def __init__(self, source, model_info, dtype=generate.F32, fast=False): … … 322 305 self.dtype = dtype 323 306 self.fast = fast 307 self.program = None # delay program creation 308 self._kernels = None 324 309 325 310 def __getstate__(self): … … 330 315 # type: (Tuple[ModelInfo, str, np.dtype, bool]) -> None 331 316 self.info, self.source, self.dtype, self.fast = state 332 self. _program = self._kernels= None317 self.program = None 333 318 334 319 def make_kernel(self, q_vectors): 335 320 # type: (List[np.ndarray]) -> "GpuKernel" 336 return GpuKernel(self, q_vectors) 337 338 def get_function(self, name): 339 # type: (str) -> cuda.Function 340 """ 341 Fetch the kernel from the environment by name, compiling it if it 342 does not already exist. 343 """ 344 if self._program is None: 345 self._prepare_program() 346 return self._kernels[name] 347 348 def _prepare_program(self): 349 # type: (str) -> None 350 env = environment() 351 timestamp = generate.ocl_timestamp(self.info) 352 program = env.compile_program( 353 self.info.name, 354 self.source['opencl'], 355 self.dtype, 356 self.fast, 357 timestamp) 358 variants = ['Iq', 'Iqxy', 'Imagnetic'] 359 names = [generate.kernel_name(self.info, k) for k in variants] 360 functions = [program.get_function(k) for k in names] 361 self._kernels = {k: v for k, v in zip(variants, functions)} 362 # Keep a handle to program so GC doesn't collect. 363 self._program = program 364 365 366 # TODO: Check that we don't need a destructor for buffers which go out of scope. 321 if self.program is None: 322 compile_program = environment().compile_program 323 timestamp = generate.ocl_timestamp(self.info) 324 self.program = compile_program( 325 self.info.name, 326 self.source['opencl'], 327 self.dtype, 328 self.fast, 329 timestamp) 330 variants = ['Iq', 'Iqxy', 'Imagnetic'] 331 names = [generate.kernel_name(self.info, k) for k in variants] 332 kernels = [self.program.get_function(k) for k in names] 333 self._kernels = dict((k, v) for k, v in zip(variants, kernels)) 334 is_2d = len(q_vectors) == 2 335 if is_2d: 336 kernel = [self._kernels['Iqxy'], self._kernels['Imagnetic']] 337 else: 338 kernel = [self._kernels['Iq']]*2 339 return GpuKernel(kernel, self.dtype, self.info, q_vectors) 340 341 def release(self): 342 # type: () -> None 343 """ 344 Free the resources associated with the model. 345 """ 346 if self.program is not None: 347 self.program = None 348 349 def __del__(self): 350 # type: () -> None 351 self.release() 352 353 # TODO: check that we don't need a destructor for buffers which go out of scope 367 354 class GpuInput(object): 368 355 """ … … 386 373 def __init__(self, q_vectors, dtype=generate.F32): 387 374 # type: (List[np.ndarray], np.dtype) -> None 388 # TODO: Do we ever need double precision q?375 # TODO: do we ever need double precision q? 389 376 self.nq = q_vectors[0].size 390 377 self.dtype = np.dtype(dtype) 391 378 self.is_2d = (len(q_vectors) == 2) 392 # TODO: Stretch input based on get_warp().393 # Not doing it now since warp depends on kernel, which is not known379 # TODO: stretch input based on get_warp() 380 # not doing it now since warp depends on kernel, which is not known 394 381 # at this point, so instead using 32, which is good on the set of 395 382 # architectures tested so far. 396 383 if self.is_2d: 397 width = ((self.nq+15)//16)*16 384 # Note: 16 rather than 15 because result is 1 longer than input. 385 width = ((self.nq+16)//16)*16 398 386 self.q = np.empty((width, 2), dtype=dtype) 399 387 self.q[:self.nq, 0] = q_vectors[0] 400 388 self.q[:self.nq, 1] = q_vectors[1] 401 389 else: 402 width = ((self.nq+31)//32)*32 390 # Note: 32 rather than 31 because result is 1 longer than input. 391 width = ((self.nq+32)//32)*32 403 392 self.q = np.empty(width, dtype=dtype) 404 393 self.q[:self.nq] = q_vectors[0] 405 394 self.global_size = [self.q.shape[0]] 406 395 #print("creating inputs of size", self.global_size) 407 408 # Transfer input value to GPU.409 396 self.q_b = cuda.to_device(self.q) 410 397 … … 412 399 # type: () -> None 413 400 """ 414 Free the buffer associated with the q value.401 Free the memory. 415 402 """ 416 403 if self.q_b is not None: … … 422 409 self.release() 423 410 424 425 411 class GpuKernel(Kernel): 426 412 """ 427 413 Callable SAS kernel. 428 414 429 *model* is the GpuModel object to call 430 431 The kernel is derived from :class:`Kernel`, providing the 432 :meth:`call_kernel` method to evaluate the kernel for a given set of 433 parameters. Because of the need to move the q values to the GPU before 434 evaluation, the kernel is instantiated for a particular set of q vectors, 435 and can be called many times without transfering q each time. 415 *kernel* is the GpuKernel object to call 416 417 *model_info* is the module information 418 419 *q_vectors* is the q vectors at which the kernel should be evaluated 420 421 *dtype* is the kernel precision 422 423 The resulting call method takes the *pars*, a list of values for 424 the fixed parameters to the kernel, and *pd_pars*, a list of (value,weight) 425 vectors for the polydisperse parameters. *cutoff* determines the 426 integration limits: any points with combined weight less than *cutoff* 427 will not be calculated. 436 428 437 429 Call :meth:`release` when done with the kernel instance. 438 430 """ 439 #: SAS model information structure. 440 info = None # type: ModelInfo 441 #: Kernel precision. 442 dtype = None # type: np.dtype 443 #: Kernel dimensions (1d or 2d). 444 dim = "" # type: str 445 #: Calculation results, updated after each call to :meth:`_call_kernel`. 446 result = None # type: np.ndarray 447 448 def __init__(self, model, q_vectors): 449 # type: (GpuModel, List[np.ndarray]) -> None 450 dtype = model.dtype 431 def __init__(self, kernel, dtype, model_info, q_vectors): 432 # type: (cl.Kernel, np.dtype, ModelInfo, List[np.ndarray]) -> None 451 433 self.q_input = GpuInput(q_vectors, dtype) 452 self._model = model 453 454 # Attributes accessed from the outside. 434 self.kernel = kernel 435 # F16 isn't sufficient, so don't support it 436 self._as_dtype = np.float64 if dtype == generate.F64 else np.float32 437 438 # attributes accessed from the outside 455 439 self.dim = '2d' if self.q_input.is_2d else '1d' 456 self.info = model .info440 self.info = model_info 457 441 self.dtype = dtype 458 442 459 # Converter to translate input to target type. 460 self._as_dtype = np.float64 if dtype == generate.F64 else np.float32 461 462 # Holding place for the returned value. 443 # holding place for the returned value 463 444 nout = 2 if self.info.have_Fq and self.dim == '1d' else 1 464 extra_q = 4 # Total weight, form volume, shell volume and R_eff. 465 self.result = np.empty(self.q_input.nq*nout + extra_q, dtype) 466 467 # Allocate result value on GPU. 445 extra_q = 4 # total weight, form volume, shell volume and R_eff 446 self.result = np.empty(self.q_input.nq*nout+extra_q, dtype) 447 448 # Inputs and outputs for each kernel call 449 # Note: res may be shorter than res_b if global_size != nq 468 450 width = ((self.result.size+31)//32)*32 * self.dtype.itemsize 469 self._result_b = cuda.mem_alloc(width) 470 471 def _call_kernel(self, call_details, values, cutoff, magnetic, 472 effective_radius_type): 473 # type: (CallDetails, np.ndarray, float, bool, int) -> np.ndarray 474 475 # Arrange data transfer to card. 451 self.result_b = cuda.mem_alloc(width) 452 self._need_release = [self.result_b] 453 454 def _call_kernel(self, call_details, values, cutoff, magnetic, effective_radius_type): 455 # type: (CallDetails, np.ndarray, np.ndarray, float, bool) -> np.ndarray 456 # Arrange data transfer to card 476 457 details_b = cuda.to_device(call_details.buffer) 477 458 values_b = cuda.to_device(values) 478 459 479 # Setup kernel function and arguments. 480 name = 'Iq' if self.dim == '1d' else 'Imagnetic' if magnetic else 'Iqxy' 481 kernel = self._model.get_function(name) 482 kernel_args = [ 483 np.uint32(self.q_input.nq), # Number of inputs. 484 None, # Placeholder for pd_start. 485 None, # Placeholder for pd_stop. 486 details_b, # Problem definition. 487 values_b, # Parameter values. 488 self.q_input.q_b, # Q values. 489 self._result_b, # Result storage. 490 self._as_dtype(cutoff), # Probability cutoff. 491 np.uint32(effective_radius_type), # R_eff mode. 460 kernel = self.kernel[1 if magnetic else 0] 461 args = [ 462 np.uint32(self.q_input.nq), None, None, 463 details_b, values_b, self.q_input.q_b, self.result_b, 464 self._as_dtype(cutoff), 465 np.uint32(effective_radius_type), 492 466 ] 493 467 grid = partition(self.q_input.nq) 494 495 # Call kernel and retrieve results. 496 #print("Calling CUDA") 468 #print("Calling OpenCL") 497 469 #call_details.show(values) 470 # Call kernel and retrieve results 498 471 last_nap = time.clock() 499 472 step = 100000000//self.q_input.nq + 1 … … 502 475 stop = min(start + step, call_details.num_eval) 503 476 #print("queuing",start,stop) 504 kernel_args[1:3] = [np.int32(start), np.int32(stop)]505 kernel(* kernel_args, **grid)477 args[1:3] = [np.int32(start), np.int32(stop)] 478 kernel(*args, **grid) 506 479 if stop < call_details.num_eval: 507 480 sync() 508 # Allow other processes to run .481 # Allow other processes to run 509 482 current_time = time.clock() 510 483 if current_time - last_nap > 0.5: … … 512 485 last_nap = current_time 513 486 sync() 514 cuda.memcpy_dtoh(self.result, self. _result_b)487 cuda.memcpy_dtoh(self.result, self.result_b) 515 488 #print("result", self.result) 516 489 … … 523 496 Release resources associated with the kernel. 524 497 """ 525 self.q_input.release() 526 if self._result_b is not None: 527 self._result_b.free() 528 self._result_b = None 498 for p in self._need_release: 499 p.free() 500 self._need_release = [] 529 501 530 502 def __del__(self): … … 540 512 Note: Maybe context.synchronize() is sufficient. 541 513 """ 542 # Create an event with which to synchronize. 514 #return # The following works in C++; don't know what pycuda is doing 515 # Create an event with which to synchronize 543 516 done = cuda.Event() 544 517 … … 546 519 done.record() 547 520 548 # Make sure we don't hog resource while waiting to sync.521 #line added to not hog resources 549 522 while not done.query(): 550 523 time.sleep(0.01) … … 552 525 # Block until the GPU executes the kernel. 553 526 done.synchronize() 554 555 527 # Clean up the event; I don't think they can be reused. 556 528 del done
Note: See TracChangeset
for help on using the changeset viewer.