Changes in sasmodels/kernelcuda.py [f872fd1:fa26e78] in sasmodels
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
sasmodels/kernelcuda.py
rf872fd1 rfa26e78 63 63 import time 64 64 import re 65 import atexit 65 66 66 67 import numpy as np # type: ignore 67 68 68 69 69 # Attempt to setup cuda. This may fail if the pycuda package is not70 # Attempt to setup CUDA. This may fail if the pycuda package is not 70 71 # installed or if it is installed but there are no devices available. 71 72 try: … … 107 108 MAX_LOOPS = 2048 108 109 110 109 111 def use_cuda(): 110 env = os.environ.get("SAS_OPENCL", "").lower() 111 return HAVE_CUDA and (env == "" or env.startswith("cuda")) 112 sas_opencl = os.environ.get("SAS_OPENCL", "CUDA").lower() 113 return HAVE_CUDA and sas_opencl.startswith("cuda") 114 112 115 113 116 ENV = None … … 121 124 ENV.release() 122 125 ENV = GpuEnvironment() if use_cuda() else None 126 123 127 124 128 def environment(): … … 138 142 return ENV 139 143 144 145 # PyTest is not freeing ENV, so make sure it gets freed. 146 atexit.register(lambda: ENV.release() if ENV is not None else None) 147 148 140 149 def has_type(dtype): 141 150 # type: (np.dtype) -> bool … … 143 152 Return true if device supports the requested precision. 144 153 """ 145 # Assume the nvidiacard supports 32-bit and 64-bit floats.146 # TODO: check if pycuda support F16154 # Assume the NVIDIA card supports 32-bit and 64-bit floats. 155 # TODO: Check if pycuda support F16. 147 156 return dtype in (generate.F32, generate.F64) 148 157 149 158 150 159 FUNCTION_PATTERN = re.compile(r"""^ 151 (?P<space>\s*) # initial space152 (?P<qualifiers>^(?:\s*\b\w+\b\s*)+) # one or more qualifiers before function153 (?P<function>\s*\b\w+\b\s*[(]) # function name plus open parens160 (?P<space>\s*) # Initial space. 161 (?P<qualifiers>^(?:\s*\b\w+\b\s*)+) # One or more qualifiers before function. 162 (?P<function>\s*\b\w+\b\s*[(]) # Function name plus open parens. 154 163 """, re.VERBOSE|re.MULTILINE) 155 164 … … 158 167 """, re.VERBOSE|re.MULTILINE) 159 168 169 160 170 def _add_device_tag(match): 161 171 # type: (None) -> str 162 # Note: should be re.Match, but that isn't a simple type172 # Note: Should be re.Match, but that isn't a simple type. 163 173 """ 164 174 replace qualifiers with __device__ qualifiers if needed … … 173 183 return "".join((space, "__device__ ", qualifiers, function)) 174 184 185 175 186 def mark_device_functions(source): 176 187 # type: (str) -> str … … 179 190 """ 180 191 return FUNCTION_PATTERN.sub(_add_device_tag, source) 192 181 193 182 194 def show_device_functions(source): … … 188 200 print(match.group('qualifiers').replace('\n',r'\n'), match.group('function'), '(') 189 201 return source 202 190 203 191 204 def compile_model(source, dtype, fast=False): … … 212 225 #options = ['--verbose', '-E'] 213 226 options = ['--use_fast_math'] if fast else None 214 program = SourceModule(source, no_extern_c=True, options=options) # include_dirs=[...]227 program = SourceModule(source, no_extern_c=True, options=options) #, include_dirs=[...]) 215 228 216 229 #print("done with "+program) … … 218 231 219 232 220 # for now, this returns one device in the context221 # TODO: create a context that contains all devices on all platforms233 # For now, this returns one device in the context. 234 # TODO: Create a context that contains all devices on all platforms. 222 235 class GpuEnvironment(object): 223 236 """ 224 GPU context , with possibly many devices, and one queue per device.237 GPU context for CUDA. 225 238 """ 226 239 context = None # type: cuda.Context 227 240 def __init__(self, devnum=None): 228 241 # type: (int) -> None 229 # Byte boundary for data alignment230 #self.data_boundary = max(d.min_data_type_align_size231 # for d in self.context.devices)232 self.compiled = {}233 242 env = os.environ.get("SAS_OPENCL", "").lower() 234 243 if devnum is None and env.startswith("cuda:"): 235 244 devnum = int(env[5:]) 245 236 246 # Set the global context to the particular device number if one is 237 247 # given, otherwise use the default context. Perhaps this will be set … … 242 252 self.context = make_default_context() 243 253 254 ## Byte boundary for data alignment. 255 #self.data_boundary = max(d.min_data_type_align_size 256 # for d in self.context.devices) 257 258 # Cache for compiled programs, and for items in context. 259 self.compiled = {} 260 244 261 def release(self): 245 262 if self.context is not None: … … 262 279 Compile the program for the device in the given context. 263 280 """ 264 # Note: PyOpenCL caches based on md5 hash of source, options and device 265 # so we don't really need to cache things for ourselves. I'll do so 266 # anyway just to save some data munging time. 281 # Note: PyCuda (probably) caches but I'll do so as well just to 282 # save some data munging time. 267 283 tag = generate.tag_source(source) 268 284 key = "%s-%s-%s%s"%(name, dtype, tag, ("-fast" if fast else "")) 269 # Check timestamp on program 285 # Check timestamp on program. 270 286 program, program_timestamp = self.compiled.get(key, (None, np.inf)) 271 287 if program_timestamp < timestamp: … … 277 293 return program 278 294 295 279 296 class GpuModel(KernelModel): 280 297 """ … … 292 309 that the compiler is allowed to take shortcuts. 293 310 """ 294 info = None # type: ModelInfo295 source = "" # type: str296 dtype = None # type: np.dtype297 fast = False # type: bool298 program = None# type: SourceModule299 _kernels = None # type: List[cuda.Function]311 info = None # type: ModelInfo 312 source = "" # type: str 313 dtype = None # type: np.dtype 314 fast = False # type: bool 315 _program = None # type: SourceModule 316 _kernels = None # type: Dict[str, cuda.Function] 300 317 301 318 def __init__(self, source, model_info, dtype=generate.F32, fast=False): … … 305 322 self.dtype = dtype 306 323 self.fast = fast 307 self.program = None # delay program creation308 self._kernels = None309 324 310 325 def __getstate__(self): … … 315 330 # type: (Tuple[ModelInfo, str, np.dtype, bool]) -> None 316 331 self.info, self.source, self.dtype, self.fast = state 317 self. program= None332 self._program = self._kernels = None 318 333 319 334 def make_kernel(self, q_vectors): 320 335 # type: (List[np.ndarray]) -> "GpuKernel" 321 if self.program is None: 322 compile_program = environment().compile_program 323 timestamp = generate.ocl_timestamp(self.info) 324 self.program = compile_program( 325 self.info.name, 326 self.source['opencl'], 327 self.dtype, 328 self.fast, 329 timestamp) 330 variants = ['Iq', 'Iqxy', 'Imagnetic'] 331 names = [generate.kernel_name(self.info, k) for k in variants] 332 kernels = [self.program.get_function(k) for k in names] 333 self._kernels = dict((k, v) for k, v in zip(variants, kernels)) 334 is_2d = len(q_vectors) == 2 335 if is_2d: 336 kernel = [self._kernels['Iqxy'], self._kernels['Imagnetic']] 337 else: 338 kernel = [self._kernels['Iq']]*2 339 return GpuKernel(kernel, self.dtype, self.info, q_vectors) 340 341 def release(self): 342 # type: () -> None 343 """ 344 Free the resources associated with the model. 345 """ 346 if self.program is not None: 347 self.program = None 348 349 def __del__(self): 350 # type: () -> None 351 self.release() 352 353 # TODO: check that we don't need a destructor for buffers which go out of scope 336 return GpuKernel(self, q_vectors) 337 338 def get_function(self, name): 339 # type: (str) -> cuda.Function 340 """ 341 Fetch the kernel from the environment by name, compiling it if it 342 does not already exist. 343 """ 344 if self._program is None: 345 self._prepare_program() 346 return self._kernels[name] 347 348 def _prepare_program(self): 349 # type: (str) -> None 350 env = environment() 351 timestamp = generate.ocl_timestamp(self.info) 352 program = env.compile_program( 353 self.info.name, 354 self.source['opencl'], 355 self.dtype, 356 self.fast, 357 timestamp) 358 variants = ['Iq', 'Iqxy', 'Imagnetic'] 359 names = [generate.kernel_name(self.info, k) for k in variants] 360 functions = [program.get_function(k) for k in names] 361 self._kernels = {k: v for k, v in zip(variants, functions)} 362 # Keep a handle to program so GC doesn't collect. 363 self._program = program 364 365 366 # TODO: Check that we don't need a destructor for buffers which go out of scope. 354 367 class GpuInput(object): 355 368 """ … … 373 386 def __init__(self, q_vectors, dtype=generate.F32): 374 387 # type: (List[np.ndarray], np.dtype) -> None 375 # TODO: do we ever need double precision q?388 # TODO: Do we ever need double precision q? 376 389 self.nq = q_vectors[0].size 377 390 self.dtype = np.dtype(dtype) 378 391 self.is_2d = (len(q_vectors) == 2) 379 # TODO: stretch input based on get_warp()380 # not doing it now since warp depends on kernel, which is not known392 # TODO: Stretch input based on get_warp(). 393 # Not doing it now since warp depends on kernel, which is not known 381 394 # at this point, so instead using 32, which is good on the set of 382 395 # architectures tested so far. 383 396 if self.is_2d: 384 # Note: 16 rather than 15 because result is 1 longer than input. 385 width = ((self.nq+16)//16)*16 397 width = ((self.nq+15)//16)*16 386 398 self.q = np.empty((width, 2), dtype=dtype) 387 399 self.q[:self.nq, 0] = q_vectors[0] 388 400 self.q[:self.nq, 1] = q_vectors[1] 389 401 else: 390 # Note: 32 rather than 31 because result is 1 longer than input. 391 width = ((self.nq+32)//32)*32 402 width = ((self.nq+31)//32)*32 392 403 self.q = np.empty(width, dtype=dtype) 393 404 self.q[:self.nq] = q_vectors[0] 394 405 self.global_size = [self.q.shape[0]] 395 406 #print("creating inputs of size", self.global_size) 407 408 # Transfer input value to GPU. 396 409 self.q_b = cuda.to_device(self.q) 397 410 … … 399 412 # type: () -> None 400 413 """ 401 Free the memory.414 Free the buffer associated with the q value. 402 415 """ 403 416 if self.q_b is not None: … … 409 422 self.release() 410 423 424 411 425 class GpuKernel(Kernel): 412 426 """ 413 427 Callable SAS kernel. 414 428 415 *kernel* is the GpuKernel object to call 416 417 *model_info* is the module information 418 419 *q_vectors* is the q vectors at which the kernel should be evaluated 420 421 *dtype* is the kernel precision 422 423 The resulting call method takes the *pars*, a list of values for 424 the fixed parameters to the kernel, and *pd_pars*, a list of (value,weight) 425 vectors for the polydisperse parameters. *cutoff* determines the 426 integration limits: any points with combined weight less than *cutoff* 427 will not be calculated. 429 *model* is the GpuModel object to call 430 431 The kernel is derived from :class:`Kernel`, providing the 432 :meth:`call_kernel` method to evaluate the kernel for a given set of 433 parameters. Because of the need to move the q values to the GPU before 434 evaluation, the kernel is instantiated for a particular set of q vectors, 435 and can be called many times without transfering q each time. 428 436 429 437 Call :meth:`release` when done with the kernel instance. 430 438 """ 431 def __init__(self, kernel, dtype, model_info, q_vectors): 432 # type: (cl.Kernel, np.dtype, ModelInfo, List[np.ndarray]) -> None 439 #: SAS model information structure. 440 info = None # type: ModelInfo 441 #: Kernel precision. 442 dtype = None # type: np.dtype 443 #: Kernel dimensions (1d or 2d). 444 dim = "" # type: str 445 #: Calculation results, updated after each call to :meth:`_call_kernel`. 446 result = None # type: np.ndarray 447 448 def __init__(self, model, q_vectors): 449 # type: (GpuModel, List[np.ndarray]) -> None 450 dtype = model.dtype 433 451 self.q_input = GpuInput(q_vectors, dtype) 434 self.kernel = kernel 435 # F16 isn't sufficient, so don't support it 452 self._model = model 453 454 # Attributes accessed from the outside. 455 self.dim = '2d' if self.q_input.is_2d else '1d' 456 self.info = model.info 457 self.dtype = dtype 458 459 # Converter to translate input to target type. 436 460 self._as_dtype = np.float64 if dtype == generate.F64 else np.float32 437 461 438 # attributes accessed from the outside 439 self.dim = '2d' if self.q_input.is_2d else '1d' 440 self.info = model_info 441 self.dtype = dtype 442 443 # holding place for the returned value 462 # Holding place for the returned value. 444 463 nout = 2 if self.info.have_Fq and self.dim == '1d' else 1 445 extra_q = 4 # total weight, form volume, shell volume and R_eff 446 self.result = np.empty(self.q_input.nq*nout+extra_q, dtype) 447 448 # Inputs and outputs for each kernel call 449 # Note: res may be shorter than res_b if global_size != nq 464 extra_q = 4 # Total weight, form volume, shell volume and R_eff. 465 self.result = np.empty(self.q_input.nq*nout + extra_q, dtype) 466 467 # Allocate result value on GPU. 450 468 width = ((self.result.size+31)//32)*32 * self.dtype.itemsize 451 self.result_b = cuda.mem_alloc(width) 452 self._need_release = [self.result_b] 453 454 def _call_kernel(self, call_details, values, cutoff, magnetic, effective_radius_type): 455 # type: (CallDetails, np.ndarray, np.ndarray, float, bool) -> np.ndarray 456 # Arrange data transfer to card 469 self._result_b = cuda.mem_alloc(width) 470 471 def _call_kernel(self, call_details, values, cutoff, magnetic, 472 effective_radius_type): 473 # type: (CallDetails, np.ndarray, float, bool, int) -> np.ndarray 474 475 # Arrange data transfer to card. 457 476 details_b = cuda.to_device(call_details.buffer) 458 477 values_b = cuda.to_device(values) 459 478 460 kernel = self.kernel[1 if magnetic else 0] 461 args = [ 462 np.uint32(self.q_input.nq), None, None, 463 details_b, values_b, self.q_input.q_b, self.result_b, 464 self._as_dtype(cutoff), 465 np.uint32(effective_radius_type), 479 # Setup kernel function and arguments. 480 name = 'Iq' if self.dim == '1d' else 'Imagnetic' if magnetic else 'Iqxy' 481 kernel = self._model.get_function(name) 482 kernel_args = [ 483 np.uint32(self.q_input.nq), # Number of inputs. 484 None, # Placeholder for pd_start. 485 None, # Placeholder for pd_stop. 486 details_b, # Problem definition. 487 values_b, # Parameter values. 488 self.q_input.q_b, # Q values. 489 self._result_b, # Result storage. 490 self._as_dtype(cutoff), # Probability cutoff. 491 np.uint32(effective_radius_type), # R_eff mode. 466 492 ] 467 493 grid = partition(self.q_input.nq) 468 #print("Calling OpenCL") 494 495 # Call kernel and retrieve results. 496 #print("Calling CUDA") 469 497 #call_details.show(values) 470 # Call kernel and retrieve results471 498 last_nap = time.clock() 472 499 step = 100000000//self.q_input.nq + 1 … … 475 502 stop = min(start + step, call_details.num_eval) 476 503 #print("queuing",start,stop) 477 args[1:3] = [np.int32(start), np.int32(stop)]478 kernel(* args, **grid)504 kernel_args[1:3] = [np.int32(start), np.int32(stop)] 505 kernel(*kernel_args, **grid) 479 506 if stop < call_details.num_eval: 480 507 sync() 481 # Allow other processes to run 508 # Allow other processes to run. 482 509 current_time = time.clock() 483 510 if current_time - last_nap > 0.5: … … 485 512 last_nap = current_time 486 513 sync() 487 cuda.memcpy_dtoh(self.result, self. result_b)514 cuda.memcpy_dtoh(self.result, self._result_b) 488 515 #print("result", self.result) 489 516 … … 496 523 Release resources associated with the kernel. 497 524 """ 498 for p in self._need_release: 499 p.free() 500 self._need_release = [] 525 self.q_input.release() 526 if self._result_b is not None: 527 self._result_b.free() 528 self._result_b = None 501 529 502 530 def __del__(self): … … 512 540 Note: Maybe context.synchronize() is sufficient. 513 541 """ 514 #return # The following works in C++; don't know what pycuda is doing 515 # Create an event with which to synchronize 542 # Create an event with which to synchronize. 516 543 done = cuda.Event() 517 544 … … 519 546 done.record() 520 547 521 # line added to not hog resources548 # Make sure we don't hog resource while waiting to sync. 522 549 while not done.query(): 523 550 time.sleep(0.01) … … 525 552 # Block until the GPU executes the kernel. 526 553 done.synchronize() 554 527 555 # Clean up the event; I don't think they can be reused. 528 556 del done
Note: See TracChangeset
for help on using the changeset viewer.