Changeset b0de252 in sasmodels for sasmodels/kernelcuda.py
- Timestamp:
- Oct 12, 2018 7:31:24 PM (6 years ago)
- Branches:
- master, core_shell_microgels, magnetic_model, ticket-1257-vesicle-product, ticket_1156, ticket_1265_superball, ticket_822_more_unit_tests
- Children:
- 74e9b5f
- Parents:
- 47fb816
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
sasmodels/kernelcuda.py
r0db7dbd rb0de252 1 1 """ 2 GPU driver for C kernels 2 GPU driver for C kernels (with CUDA) 3 4 To select cuda, use SAS_OPENCL=cuda, or SAS_OPENCL=cuda:n for a particular 5 device number. If no device number is specified, then look for CUDA_DEVICE=n 6 or a file ~/.cuda-device containing n for the device number. Otherwise, try 7 all available device numbers. 8 9 TODO: docs are out of date 3 10 4 11 There should be a single GPU environment running on the system. This … … 59 66 60 67 61 # Attempt to setup opencl. This may fail if the openclpackage is not68 # Attempt to setup cuda. This may fail if the pycuda package is not 62 69 # installed or if it is installed but there are no devices available. 63 70 try: 64 import pycuda.autoinit65 71 import pycuda.driver as cuda # type: ignore 66 72 from pycuda.compiler import SourceModule 73 from pycuda.tools import make_default_context, clear_context_caches 74 # Ask CUDA for the default context (so that we know that one exists) 75 # then immediately throw it away in case the user doesn't want it. 76 # Note: cribbed from pycuda.autoinit 77 cuda.init() 78 context = make_default_context() 79 context.pop() 80 clear_context_caches() 81 del context 67 82 HAVE_CUDA = True 68 83 CUDA_ERROR = "" … … 91 106 MAX_LOOPS = 2048 92 107 93 94 # Pragmas for enable OpenCL features. Be sure to protect them so that they95 # still compile even if OpenCL is not present.96 _F16_PRAGMA = """\97 #if defined(__OPENCL_VERSION__) // && !defined(cl_khr_fp16)98 # pragma OPENCL EXTENSION cl_khr_fp16: enable99 #endif100 """101 102 _F64_PRAGMA = """\103 #if defined(__OPENCL_VERSION__) // && !defined(cl_khr_fp64)104 # pragma OPENCL EXTENSION cl_khr_fp64: enable105 #endif106 """107 108 108 def use_cuda(): 109 return HAVE_CUDA 109 env = os.environ.get("SAS_OPENCL", "").lower() 110 return HAVE_CUDA and (env == "" or env.startswith("cuda")) 110 111 111 112 ENV = None … … 115 116 """ 116 117 global ENV 118 # Free any previous allocated context. 
119 if ENV is not None and ENV.context is not None: 120 ENV.release() 117 121 ENV = GpuEnvironment() if use_cuda() else None 118 122 … … 126 130 if ENV is None: 127 131 if not HAVE_CUDA: 128 raise RuntimeError(" OpenCLstartup failed with ***"132 raise RuntimeError("CUDA startup failed with ***" 129 133 + CUDA_ERROR + "***; using C compiler instead") 130 134 reset_environment() … … 133 137 return ENV 134 138 135 def _stretch_input(vector, dtype, extra=1e-3, boundary=32): 136 # type: (np.ndarray, np.dtype, float, int) -> np.ndarray 137 """ 138 Stretch an input vector to the correct boundary. 139 140 Performance on the kernels can drop by a factor of two or more if the 141 number of values to compute does not fall on a nice power of two 142 boundary. The trailing additional vector elements are given a 143 value of *extra*, and so f(*extra*) will be computed for each of 144 them. The returned array will thus be a subset of the computed array. 145 146 *boundary* should be a power of 2 which is at least 32 for good 147 performance on current platforms (as of Jan 2015). It should 148 probably be the max of get_warp(kernel,queue) and 149 device.min_data_type_align_size//4. 150 """ 151 remainder = vector.size % boundary 152 if remainder != 0: 153 size = vector.size + (boundary - remainder) 154 vector = np.hstack((vector, [extra] * (size - vector.size))) 155 return np.ascontiguousarray(vector, dtype=dtype) 156 139 def has_type(dtype): 140 # type: (np.dtype) -> bool 141 """ 142 Return true if device supports the requested precision. 143 """ 144 # Assume the nvidia card supports 32-bit and 64-bit floats. 145 # TODO: check if pycuda support F16 146 return dtype in (generate.F32, generate.F64) 157 147 158 148 def compile_model(source, dtype, fast=False): 159 # type: (str, np.dtype, bool) -> cl.Program149 # type: (str, np.dtype, bool) -> SourceModule 160 150 """ 161 151 Build a model to run on the gpu. … … 165 155 devices in the context do not support the cl_khr_fp64 extension. 
166 156 """ 157 dtype = np.dtype(dtype) 158 if not has_type(dtype): 159 raise RuntimeError("%s not supported for devices"%dtype) 160 167 161 source_list = [generate.convert_type(source, dtype)] 168 169 if dtype == generate.F16:170 source_list.insert(0, _F16_PRAGMA)171 elif dtype == generate.F64:172 source_list.insert(0, _F64_PRAGMA)173 162 174 163 source_list.insert(0, "#define USE_SINCOS\n") 175 164 source = "\n".join(source_list) 176 program = SourceModule(source) # no_extern_c=True, include_dirs=[...] 165 options = '-use_fast_math' if fast else None 166 program = SourceModule(source, no_extern_c=True, options=options) # include_dirs=[...] 167 #print("done with "+program) 177 168 return program 178 169 … … 184 175 GPU context, with possibly many devices, and one queue per device. 185 176 """ 186 def __init__(self, devnum=0): 187 # type: () -> None 177 context = None # type: cuda.Context 178 def __init__(self, devnum=None): 179 # type: (int) -> None 188 180 # Byte boundary for data alignment 189 181 #self.data_boundary = max(d.min_data_type_align_size 190 182 # for d in self.context.devices) 191 183 self.compiled = {} 192 #self.device = cuda.Device(devnum) 193 #self.context = self.device.make_context() 184 env = os.environ.get("SAS_OPENCL", "").lower() 185 if devnum is None and env.startswith("cuda:"): 186 devnum = int(env[5:]) 187 # Set the global context to the particular device number if one is 188 # given, otherwise use the default context. Perhaps this will be set 189 # by an environment variable within autoinit. 190 if devnum is not None: 191 self.context = cuda.Device(devnum).make_context() 192 else: 193 self.context = make_default_context() 194 195 def release(self): 196 if self.context is not None: 197 self.context.pop() 198 self.context = None 199 200 def __del__(self): 201 self.release() 194 202 195 203 def has_type(self, dtype): … … 198 206 Return True if all devices support a given type. 
199 207 """ 200 return True208 return has_type(dtype) 201 209 202 210 def compile_program(self, name, source, dtype, fast, timestamp): … … 235 243 that the compiler is allowed to take shortcuts. 236 244 """ 245 info = None # type: ModelInfo 246 source = "" # type: str 247 dtype = None # type: np.dtype 248 fast = False # type: bool 249 program = None # type: SourceModule 250 _kernels = None # type: List[cuda.Function] 251 237 252 def __init__(self, source, model_info, dtype=generate.F32, fast=False): 238 253 # type: (Dict[str,str], ModelInfo, np.dtype, bool) -> None … … 418 433 last_nap = current_time 419 434 sync() 435 cuda.memcpy_dtoh(self.result, self.result_b) 436 #print("result", self.result) 437 420 438 details_b.free() 421 439 values_b.free() 422 cuda.memcpy_dtoh(self.result, self.result_b)423 #print("result", self.result)424 440 425 441 pd_norm = self.result[self.q_input.nq] … … 459 475 460 476 #line added to not hog resources 461 while not done.query(): time.sleep(0.01) 477 while not done.query(): 478 time.sleep(0.01) 462 479 463 480 # Block until the GPU executes the kernel. … … 473 490 efficiency. 
474 491 ''' 475 max_gx, max_gy = 65535,65535492 max_gx, max_gy = 65535, 65535 476 493 blocksize = 32 477 #max_gx, max_gy = 5,65536494 #max_gx, max_gy = 5, 65536 478 495 #blocksize = 3 479 block = (blocksize, 1,1)496 block = (blocksize, 1, 1) 480 497 num_blocks = int((n+blocksize-1)/blocksize) 481 498 if num_blocks < max_gx: 482 grid = (num_blocks, 1)499 grid = (num_blocks, 1) 483 500 else: 484 501 gx = max_gx 485 502 gy = (num_blocks + max_gx - 1) / max_gx 486 if gy >= max_gy: raise ValueError("vector is too large")487 grid = (gx,gy)488 #print "block",block,"grid",grid489 #print "waste",block[0]*block[1]*block[2]*grid[0]*grid[1] - n490 return dict(block=block,grid=grid)491 503 if gy >= max_gy: 504 raise ValueError("vector is too large") 505 grid = (gx, gy) 506 #print("block", block, "grid", grid) 507 #print("waste", block[0]*block[1]*block[2]*grid[0]*grid[1] - n) 508 return dict(block=block, grid=grid)
Note: See TracChangeset for help on using the changeset viewer.