Changeset 3199b17 in sasmodels for sasmodels/kernelcuda.py
- Timestamp: Mar 6, 2019 12:24:03 PM
- Branches: master, core_shell_microgels, magnetic_model, ticket-1257-vesicle-product, ticket_1156, ticket_1265_superball, ticket_822_more_unit_tests
- Children: 4453136
- Parents: 00afc15
- File: sasmodels/kernelcuda.py (1 edited)
Legend:
- Unmodified (context lines, shown with a leading space)
- Added (lines prefixed with "+")
- Removed (lines prefixed with "-")
sasmodels/kernelcuda.py
--- sasmodels/kernelcuda.py (r00afc15)
+++ sasmodels/kernelcuda.py (r3199b17)
@@ -68,5 +68,5 @@
 
 
-# Attempt to setup cuda. This may fail if the pycuda package is not
+# Attempt to setup CUDA. This may fail if the pycuda package is not
 # installed or if it is installed but there are no devices available.
 try:
@@ -108,7 +108,9 @@
 MAX_LOOPS = 2048
 
+
 def use_cuda():
-    env = os.environ.get("SAS_OPENCL", "").lower()
-    return HAVE_CUDA and (env == "" or env.startswith("cuda"))
+    sas_opencl = os.environ.get("SAS_OPENCL", "CUDA").lower()
+    return HAVE_CUDA and sas_opencl.startswith("cuda")
+
 
 ENV = None
@@ -122,4 +124,5 @@
         ENV.release()
     ENV = GpuEnvironment() if use_cuda() else None
+
 
 def environment():
@@ -139,11 +142,4 @@
     return ENV
 
-def free_context():
-    global ENV
-    if ENV is not None:
-        ENV.release()
-        ENV = None
-
-atexit.register(free_context)
 
 def has_type(dtype):
@@ -152,13 +148,13 @@
     Return true if device supports the requested precision.
     """
-    # Assume the nvidia card supports 32-bit and 64-bit floats.
-    # TODO: check if pycuda support F16
+    # Assume the NVIDIA card supports 32-bit and 64-bit floats.
+    # TODO: Check if pycuda support F16.
     return dtype in (generate.F32, generate.F64)
 
 
 FUNCTION_PATTERN = re.compile(r"""^
-    (?P<space>\s*)                      # initial space
-    (?P<qualifiers>^(?:\s*\b\w+\b\s*)+) # one or more qualifiers before function
-    (?P<function>\s*\b\w+\b\s*[(])      # function name plus open parens
+    (?P<space>\s*)                      # Initial space.
+    (?P<qualifiers>^(?:\s*\b\w+\b\s*)+) # One or more qualifiers before function.
+    (?P<function>\s*\b\w+\b\s*[(])      # Function name plus open parens.
     """, re.VERBOSE|re.MULTILINE)
 
@@ -167,7 +163,8 @@
     """, re.VERBOSE|re.MULTILINE)
 
+
 def _add_device_tag(match):
     # type: (None) -> str
-    # Note: should be re.Match, but that isn't a simple type
+    # Note: Should be re.Match, but that isn't a simple type.
     """
     replace qualifiers with __device__ qualifiers if needed
@@ -182,4 +179,5 @@
     return "".join((space, "__device__ ", qualifiers, function))
 
+
 def mark_device_functions(source):
     # type: (str) -> str
@@ -188,4 +186,5 @@
     """
     return FUNCTION_PATTERN.sub(_add_device_tag, source)
+
 
 def show_device_functions(source):
@@ -197,4 +196,5 @@
         print(match.group('qualifiers').replace('\n',r'\n'), match.group('function'), '(')
     return source
+
 
 def compile_model(source, dtype, fast=False):
@@ -221,5 +221,5 @@
     #options = ['--verbose', '-E']
     options = ['--use_fast_math'] if fast else None
-    program = SourceModule(source, no_extern_c=True, options=options) # include_dirs=[...]
+    program = SourceModule(source, no_extern_c=True, options=options) #, include_dirs=[...])
 
     #print("done with "+program)
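The use_cuda() change above makes CUDA the default compute target: when SAS_OPENCL is unset, the default value "CUDA" is used, so CUDA is selected whenever a device is available. A minimal standalone sketch of the new behaviour (HAVE_CUDA here is a stand-in for the import/device check done earlier in the module):

    import os

    HAVE_CUDA = True  # Stand-in: True when pycuda imported and a device was found.

    def use_cuda():
        sas_opencl = os.environ.get("SAS_OPENCL", "CUDA").lower()
        return HAVE_CUDA and sas_opencl.startswith("cuda")

    # SAS_OPENCL unset    -> "cuda" by default, so CUDA is used if available.
    # SAS_OPENCL=cuda:1   -> still starts with "cuda", so CUDA is used.
    # SAS_OPENCL=opencl   -> CUDA is not used.

Note that an explicitly empty SAS_OPENCL now disables CUDA, whereas the old test treated an empty string the same as an unset variable.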
@@ -227,20 +227,17 @@
 
 
-# for now, this returns one device in the context
-# TODO: create a context that contains all devices on all platforms
+# For now, this returns one device in the context.
+# TODO: Create a context that contains all devices on all platforms.
 class GpuEnvironment(object):
     """
-    GPU context, with possibly many devices, and one queue per device.
+    GPU context for CUDA.
     """
     context = None # type: cuda.Context
     def __init__(self, devnum=None):
         # type: (int) -> None
-        # Byte boundary for data alignment
-        #self.data_boundary = max(d.min_data_type_align_size
-        #                         for d in self.context.devices)
-        self.compiled = {}
         env = os.environ.get("SAS_OPENCL", "").lower()
         if devnum is None and env.startswith("cuda:"):
             devnum = int(env[5:])
+
         # Set the global context to the particular device number if one is
         # given, otherwise use the default context. Perhaps this will be set
@@ -251,4 +248,11 @@
             self.context = make_default_context()
 
+        ## Byte boundary for data alignment.
+        #self.data_boundary = max(d.min_data_type_align_size
+        #                         for d in self.context.devices)
+
+        # Cache for compiled programs, and for items in context.
+        self.compiled = {}
+
     def release(self):
         if self.context is not None:
@@ -271,10 +275,9 @@
         Compile the program for the device in the given context.
         """
-        # Note: PyOpenCL caches based on md5 hash of source, options and device
-        # so we don't really need to cache things for ourselves. I'll do so
-        # anyway just to save some data munging time.
+        # Note: PyCuda (probably) caches but I'll do so as well just to
+        # save some data munging time.
         tag = generate.tag_source(source)
         key = "%s-%s-%s%s"%(name, dtype, tag, ("-fast" if fast else ""))
-        # Check timestamp on program
+        # Check timestamp on program.
         program, program_timestamp = self.compiled.get(key, (None, np.inf))
         if program_timestamp < timestamp:
@@ -286,4 +289,5 @@
         return program
 
+
 class GpuModel(KernelModel):
     """
@@ -301,10 +305,10 @@
     that the compiler is allowed to take shortcuts.
     """
-    info = None # type: ModelInfo
-    source = "" # type: str
-    dtype = None # type: np.dtype
-    fast = False # type: bool
-    _program = None # type: SourceModule
-    _kernels = None # type: Dict[str, cuda.Function]
+    info = None      # type: ModelInfo
+    source = ""      # type: str
+    dtype = None     # type: np.dtype
+    fast = False     # type: bool
+    _program = None  # type: SourceModule
+    _kernels = None  # type: Dict[str, cuda.Function]
 
     def __init__(self, source, model_info, dtype=generate.F32, fast=False):
@@ -352,8 +356,9 @@
         functions = [program.get_function(k) for k in names]
         self._kernels = {k: v for k, v in zip(variants, functions)}
-        # keep a handle to program so GC doesn't collect
+        # Keep a handle to program so GC doesn't collect.
         self._program = program
 
-# TODO: check that we don't need a destructor for buffers which go out of scope
+
+# TODO: Check that we don't need a destructor for buffers which go out of scope.
 class GpuInput(object):
     """
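The GpuEnvironment constructor above also lets SAS_OPENCL name a specific device with a "cuda:N" suffix, falling back to pycuda's default context otherwise. A small sketch of the parsing convention (the environment value shown is hypothetical):

    import os

    os.environ["SAS_OPENCL"] = "cuda:1"  # Hypothetical: ask for CUDA device 1.

    env = os.environ.get("SAS_OPENCL", "").lower()
    devnum = int(env[5:]) if env.startswith("cuda:") else None
    print(devnum)  # 1; None means "let make_default_context() choose".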
@@ -377,21 +382,19 @@
     def __init__(self, q_vectors, dtype=generate.F32):
         # type: (List[np.ndarray], np.dtype) -> None
-        # TODO: do we ever need double precision q?
+        # TODO: Do we ever need double precision q?
         self.nq = q_vectors[0].size
         self.dtype = np.dtype(dtype)
         self.is_2d = (len(q_vectors) == 2)
-        # TODO: stretch input based on get_warp()
-        # not doing it now since warp depends on kernel, which is not known
+        # TODO: Stretch input based on get_warp().
+        # Not doing it now since warp depends on kernel, which is not known
         # at this point, so instead using 32, which is good on the set of
         # architectures tested so far.
         if self.is_2d:
-            # Note: 16 rather than 15 because result is 1 longer than input.
-            width = ((self.nq+16)//16)*16
+            width = ((self.nq+15)//16)*16
             self.q = np.empty((width, 2), dtype=dtype)
             self.q[:self.nq, 0] = q_vectors[0]
             self.q[:self.nq, 1] = q_vectors[1]
         else:
-            # Note: 32 rather than 31 because result is 1 longer than input.
-            width = ((self.nq+32)//32)*32
+            width = ((self.nq+31)//32)*32
             self.q = np.empty(width, dtype=dtype)
             self.q[:self.nq] = q_vectors[0]
@@ -399,5 +402,5 @@
         #print("creating inputs of size", self.global_size)
 
-        # transfer input value to gpu
+        # Transfer input value to GPU.
         self.q_b = cuda.to_device(self.q)
 
@@ -405,5 +408,5 @@
         # type: () -> None
         """
-        Free the memory.
+        Free the buffer associated with the q value.
         """
         if self.q_b is not None:
@@ -414,4 +417,5 @@
         # type: () -> None
         self.release()
+
 
 class GpuKernel(Kernel):
@@ -429,12 +433,12 @@
     Call :meth:`release` when done with the kernel instance.
     """
-    #: SAS model information structure
-    info = None # type: ModelInfo
-    #: kernel precision
-    dtype = None # type: np.dtype
-    #: kernel dimensions (1d or 2d)
-    dim = "" # type: str
-    #: calculation results, updated after each call to :meth:`_call_kernel`
-    result = None # type: np.ndarray
+    #: SAS model information structure.
+    info = None  # type: ModelInfo
+    #: Kernel precision.
+    dtype = None  # type: np.dtype
+    #: Kernel dimensions (1d or 2d).
+    dim = ""  # type: str
+    #: Calculation results, updated after each call to :meth:`_call_kernel`.
+    result = None  # type: np.ndarray
 
     def __init__(self, model, q_vectors):
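The width computation in GpuInput above switches from always adding a full block of padding (the removed notes explain that the result is one element longer than the input) to the usual round-up to the next block multiple: 16 q values per row in 2-D, 32 in 1-D. The two formulas only differ when nq is already a multiple of the block size, for example:

    nq = 64
    old_width = ((nq + 32)//32)*32   # 96: always leaves at least one extra slot
    new_width = ((nq + 31)//32)*32   # 64: round up to the next multiple of 32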
@@ -443,39 +447,49 @@
         self.q_input = GpuInput(q_vectors, dtype)
         self._model = model
-        # F16 isn't sufficient, so don't support it
-        self._as_dtype = np.float64 if dtype == generate.F64 else np.float32
-
-        # attributes accessed from the outside
+
+        # Attributes accessed from the outside.
         self.dim = '2d' if self.q_input.is_2d else '1d'
         self.info = model.info
-        self.dtype = model.dtype
-
-        # holding place for the returned value
+        self.dtype = dtype
+
+        # Converter to translate input to target type.
+        self._as_dtype = np.float64 if dtype == generate.F64 else np.float32
+
+        # Holding place for the returned value.
         nout = 2 if self.info.have_Fq and self.dim == '1d' else 1
-        extra_q = 4  # total weight, form volume, shell volume and R_eff
-        self.result = np.empty(self.q_input.nq*nout+extra_q, dtype)
-
-        # allocate result value on gpu
+        extra_q = 4  # Total weight, form volume, shell volume and R_eff.
+        self.result = np.empty(self.q_input.nq*nout + extra_q, dtype)
+
+        # Allocate result value on GPU.
         width = ((self.result.size+31)//32)*32 * self.dtype.itemsize
         self._result_b = cuda.mem_alloc(width)
 
-    def _call_kernel(self, call_details, values, cutoff, magnetic, effective_radius_type):
-        # type: (CallDetails, np.ndarray, np.ndarray, float, bool) -> np.ndarray
-        # Arrange data transfer to card
+    def _call_kernel(self, call_details, values, cutoff, magnetic,
+                     effective_radius_type):
+        # type: (CallDetails, np.ndarray, float, bool, int) -> np.ndarray
+
+        # Arrange data transfer to card.
         details_b = cuda.to_device(call_details.buffer)
         values_b = cuda.to_device(values)
 
+        # Setup kernel function and arguments.
         name = 'Iq' if self.dim == '1d' else 'Imagnetic' if magnetic else 'Iqxy'
        kernel = self._model.get_function(name)
         kernel_args = [
-            np.uint32(self.q_input.nq), None, None,
-            details_b, values_b, self.q_input.q_b, self._result_b,
-            self._as_dtype(cutoff),
-            np.uint32(effective_radius_type),
+            np.uint32(self.q_input.nq),  # Number of inputs.
+            None,  # Placeholder for pd_start.
+            None,  # Placeholder for pd_stop.
+            details_b,  # Problem definition.
+            values_b,  # Parameter values.
+            self.q_input.q_b,  # Q values.
+            self._result_b,  # Result storage.
+            self._as_dtype(cutoff),  # Probability cutoff.
+            np.uint32(effective_radius_type),  # R_eff mode.
         ]
         grid = partition(self.q_input.nq)
-        #print("Calling OpenCL")
+
+        # Call kernel and retrieve results.
+        #print("Calling CUDA")
         #call_details.show(values)
-        # Call kernel and retrieve results
         last_nap = time.clock()
         step = 100000000//self.q_input.nq + 1
@@ -488,5 +502,5 @@
             if stop < call_details.num_eval:
                 sync()
-                # Allow other processes to run
+                # Allow other processes to run.
                 current_time = time.clock()
                 if current_time - last_nap > 0.5:
@@ -522,6 +536,5 @@
     Note: Maybe context.synchronize() is sufficient.
     """
-    #return # The following works in C++; don't know what pycuda is doing
-    # Create an event with which to synchronize
+    # Create an event with which to synchronize.
     done = cuda.Event()
 
@@ -529,5 +542,5 @@
     done.record()
 
-    # line added to not hog resources
+    # Make sure we don't hog resource while waiting to sync.
     while not done.query():
         time.sleep(0.01)
@@ -535,4 +548,5 @@
     # Block until the GPU executes the kernel.
     done.synchronize()
+
     # Clean up the event; I don't think they can be reused.
     del done
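The sync() edits at the end keep the existing event-based wait but document each step: record an event on the stream, poll it from Python so the process does not hog the CPU, then block until the GPU reaches the event. Roughly, as a standalone sketch (hypothetical helper name; assumes pycuda is installed and a context is active, as arranged by GpuEnvironment):

    import time
    import pycuda.driver as cuda

    def wait_for_gpu():
        done = cuda.Event()        # Event marking the end of the queued work.
        done.record()              # Schedule the event after pending operations.
        while not done.query():    # Poll cheaply instead of spinning in the driver.
            time.sleep(0.01)
        done.synchronize()         # Block until the GPU has reached the event.
        del done                   # Per the comment above, events are not reused.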