Changeset b0de252 in sasmodels
- Timestamp: Oct 12, 2018 7:31:24 PM
- Branches: master, core_shell_microgels, magnetic_model, ticket-1257-vesicle-product, ticket_1156, ticket_1265_superball, ticket_822_more_unit_tests
- Children: 74e9b5f
- Parents: 47fb816
- Location: sasmodels
- Files: 6 edited
sasmodels/compare.py (r610ef23 → rb0de252)

```diff
@@ -115,5 +115,5 @@
     === environment variables ===
     -DSAS_MODELPATH=path sets directory containing custom models
-    -DSAS_OPENCL=vendor:device|none sets the target OpenCL device
+    -DSAS_OPENCL=vendor:device|cuda:device|none sets the target GPU device
     -DXDG_CACHE_HOME=~/.cache sets the pyopencl cache root (linux only)
     -DSAS_COMPILER=tinycc|msvc|mingw|unix sets the DLL compiler
```
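The SAS_OPENCL values documented above select the GPU backend when a model is built. A minimal sketch of setting it from Python (not part of the changeset; the model name is only illustrative):

```python
# Sketch only: choose the GPU backend before building a model.
import os

os.environ["SAS_OPENCL"] = "cuda"     # default CUDA device
#os.environ["SAS_OPENCL"] = "cuda:1"  # a specific CUDA device number
#os.environ["SAS_OPENCL"] = "none"    # skip the GPU and use the compiled DLL

from sasmodels.core import load_model
model = load_model("cylinder")        # "cylinder" is just an example model
```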
sasmodels/core.py (r47fb816 → rb0de252)

```diff
@@ -13,7 +13,4 @@
 from glob import glob
 import re
-
-# Set "SAS_OPENCL=cuda" in the environment to use the CUDA rather than OpenCL
-USE_CUDA = os.environ.get("SAS_OPENCL", "") == "cuda"
 
 import numpy as np  # type: ignore
@@ -24,8 +21,6 @@
 from . import mixture
 from . import kernelpy
-if USE_CUDA:
-    from . import kernelcuda
-else:
-    from . import kernelcl
+from . import kernelcuda
+from . import kernelcl
 from . import kerneldll
 from . import custom
@@ -216,6 +211,5 @@
         #print("building dll", numpy_dtype)
         return kerneldll.load_dll(source['dll'], model_info, numpy_dtype)
-    elif USE_CUDA:
-        #print("building cuda", numpy_dtype)
+    elif platform == "cuda":
         return kernelcuda.GpuModel(source, model_info, numpy_dtype, fast=fast)
     else:
@@ -254,5 +248,5 @@
     # type: (ModelInfo, str, str) -> (np.dtype, bool, str)
     """
-    Interpret dtype string, returning np.dtype and fast flag.
+    Interpret dtype string, returning np.dtype, fast flag and platform.
 
     Possible types include 'half', 'single', 'double' and 'quad'. If the
@@ -262,7 +256,10 @@
     default for the model and platform.
 
-    Platform preference can be specfied ("ocl" vs "dll"), with the default
-    being OpenCL if it is availabe. If the dtype name ends with '!' then
-    platform is forced to be DLL rather than OpenCL.
+    Platform preference can be specfied ("ocl", "cuda", "dll"), with the
+    default being OpenCL or CUDA if available, otherwise DLL. If the dtype
+    name ends with '!' then platform is forced to be DLL rather than GPU.
+    The default platform is set by the environment variable SAS_OPENCL,
+    SAS_OPENCL=driver:device for OpenCL, SAS_OPENCL=cuda:device for CUDA
+    or SAS_OPENCL=none for DLL.
 
     This routine ignores the preferences within the model definition. This
@@ -277,12 +274,4 @@
     if platform is None:
         platform = "ocl"
-    if not model_info.opencl:
-        platform = "dll"
-    elif USE_CUDA:
-        if not kernelcuda.use_cuda():
-            platform = "dll"
-    else:
-        if not kernelcl.use_opencl():
-            platform = "dll"
 
     # Check if type indicates dll regardless of which platform is given
@@ -290,3 +279,11 @@
         platform = "dll"
         dtype = dtype[:-1]
+
+    # Make sure model allows opencl/gpu
+    if not model_info.opencl:
+        platform = "dll"
+
+    # Make sure opencl is available, or fallback to cuda then to dll
+    if platform == "ocl" and not kernelcl.use_opencl():
+        platform = "cuda" if kernelcuda.use_cuda() else "dll"
 
@@ -300,21 +297,23 @@
         dtype = "float16"
 
-    # Convert dtype string to numpy dtype.
+    # Convert dtype string to numpy dtype. Use single precision for GPU
+    # if model allows it, otherwise use double precision.
     if dtype is None or dtype == "default":
-        numpy_dtype = (generate.F32 if platform == "ocl" and model_info.single
+        numpy_dtype = (generate.F32 if model_info.single and platform in ("ocl", "cuda")
                        else generate.F64)
     else:
         numpy_dtype = np.dtype(dtype)
 
-    # Make sure that the type is supported by opencl, otherwise use dll
+    # Make sure that the type is supported by GPU, otherwise use dll
     if platform == "ocl":
-        if USE_CUDA:
-            env = kernelcuda.environment()
-        else:
-            env = kernelcl.environment()
-        if not env.has_type(numpy_dtype):
-            platform = "dll"
-            if dtype is None:
-                numpy_dtype = generate.F64
+        env = kernelcl.environment()
+    elif platform == "cuda":
+        env = kernelcuda.environment()
+    else:
+        env = None
+    if env is not None and not env.has_type(numpy_dtype):
+        platform = "dll"
+        if dtype is None:
+            numpy_dtype = generate.F64
 
     return numpy_dtype, fast, platform
```
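For reference, the dtype conventions handled by parse_dtype above can be exercised through load_model; a hedged sketch (not part of the changeset; the model name is only illustrative):

```python
# Sketch only: dtype strings accepted by parse_dtype via load_model.
from sasmodels.core import load_model

gpu_model = load_model("sphere", dtype="single")    # OpenCL/CUDA if available
dll_model = load_model("sphere", dtype="double!")   # trailing '!' forces the C DLL
```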
sasmodels/kernel_header.c (r0db7dbd → rb0de252)

```diff
@@ -5,5 +5,10 @@
 #elif defined(_OPENMP)
 # define USE_OPENMP
-#endif
+#elif defined(__CUDACC__)
+# define USE_CUDA
+#endif
+
+// Use SAS_DOUBLE to force the use of double even for float kernels
+#define SAS_DOUBLE dou ## ble
 
 // If opencl is not available, then we are compiling a C function
@@ -127,7 +132,4 @@
 #endif // !USE_OPENCL
 
-// Use SAS_DOUBLE to force the use of double even for float kernels
-#define SAS_DOUBLE dou ## ble
-
 #if defined(NEED_EXPM1)
 // TODO: precision is a half digit lower than numpy on mac in [1e-7, 0.5]
```
sasmodels/kernel_iq.c (r47fb816 → rb0de252)

```diff
@@ -80,4 +80,5 @@
 // du * (m_sigma_y + 1j*m_sigma_z);
 // weights for spin crosssections: dd du real, ud real, uu, du imag, ud imag
+__device__
 static void set_spin_weights(double in_spin, double out_spin, double weight[6])
 {
```
sasmodels/kernelcl.py (rd86f0fc → rb0de252)

```diff
@@ -1,4 +1,6 @@
 """
 GPU driver for C kernels
+
+TODO: docs are out of date
 
 There should be a single GPU environment running on the system. This
@@ -59,5 +61,5 @@
 
 
-# Attempt to setup opencl. This may fail if the opencl package is not
+# Attempt to setup opencl. This may fail if the pyopencl package is not
 # installed or if it is installed but there are no devices available.
 try:
@@ -131,5 +133,6 @@
 
 def use_opencl():
-    return HAVE_OPENCL and os.environ.get("SAS_OPENCL", "").lower() != "none"
+    env = os.environ.get("SAS_OPENCL", "").lower()
+    return HAVE_OPENCL and env != "none" and not env.startswith("cuda")
 
 ENV = None
@@ -179,27 +182,4 @@
         cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
         queue.device)
-
-def _stretch_input(vector, dtype, extra=1e-3, boundary=32):
-    # type: (np.ndarray, np.dtype, float, int) -> np.ndarray
-    """
-    Stretch an input vector to the correct boundary.
-
-    Performance on the kernels can drop by a factor of two or more if the
-    number of values to compute does not fall on a nice power of two
-    boundary. The trailing additional vector elements are given a
-    value of *extra*, and so f(*extra*) will be computed for each of
-    them. The returned array will thus be a subset of the computed array.
-
-    *boundary* should be a power of 2 which is at least 32 for good
-    performance on current platforms (as of Jan 2015). It should
-    probably be the max of get_warp(kernel,queue) and
-    device.min_data_type_align_size//4.
-    """
-    remainder = vector.size % boundary
-    if remainder != 0:
-        size = vector.size + (boundary - remainder)
-        vector = np.hstack((vector, [extra] * (size - vector.size)))
-    return np.ascontiguousarray(vector, dtype=dtype)
-
 
 def compile_model(context, source, dtype, fast=False):
```
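With this change the OpenCL driver defers to CUDA whenever SAS_OPENCL starts with "cuda". A small sketch of the resulting behaviour (not part of the changeset; it assumes both pyopencl and pycuda import cleanly):

```python
# Sketch only: how the two drivers split the SAS_OPENCL setting.
import os
from sasmodels import kernelcl, kernelcuda

os.environ["SAS_OPENCL"] = "cuda:0"
print(kernelcl.use_opencl())   # False -- value starts with "cuda"
print(kernelcuda.use_cuda())   # True

os.environ["SAS_OPENCL"] = "none"
print(kernelcl.use_opencl())   # False
print(kernelcuda.use_cuda())   # False -- neither empty nor "cuda..."
```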
sasmodels/kernelcuda.py (r0db7dbd → rb0de252)

```diff
@@ -1,4 +1,11 @@
 """
-GPU driver for C kernels
+GPU driver for C kernels (with CUDA)
+
+To select cuda, use SAS_OPENCL=cuda, or SAS_OPENCL=cuda:n for a particular
+device number. If no device number is specified, then look for CUDA_DEVICE=n
+or a file ~/.cuda-device containing n for the device number. Otherwise, try
+all available device numbers.
+
+TODO: docs are out of date
 
 There should be a single GPU environment running on the system. This
@@ -59,10 +66,18 @@
 
 
-# Attempt to setup opencl. This may fail if the opencl package is not
+# Attempt to setup cuda. This may fail if the pycuda package is not
 # installed or if it is installed but there are no devices available.
 try:
-    import pycuda.autoinit
     import pycuda.driver as cuda  # type: ignore
     from pycuda.compiler import SourceModule
+    from pycuda.tools import make_default_context, clear_context_caches
+    # Ask CUDA for the default context (so that we know that one exists)
+    # then immediately throw it away in case the user doesn't want it.
+    # Note: cribbed from pycuda.autoinit
+    cuda.init()
+    context = make_default_context()
+    context.pop()
+    clear_context_caches()
+    del context
     HAVE_CUDA = True
     CUDA_ERROR = ""
@@ -91,21 +106,7 @@
 MAX_LOOPS = 2048
 
-
-# Pragmas for enable OpenCL features. Be sure to protect them so that they
-# still compile even if OpenCL is not present.
-_F16_PRAGMA = """\
-#if defined(__OPENCL_VERSION__) // && !defined(cl_khr_fp16)
-# pragma OPENCL EXTENSION cl_khr_fp16: enable
-#endif
-"""
-
-_F64_PRAGMA = """\
-#if defined(__OPENCL_VERSION__) // && !defined(cl_khr_fp64)
-# pragma OPENCL EXTENSION cl_khr_fp64: enable
-#endif
-"""
-
 def use_cuda():
-    return HAVE_CUDA
+    env = os.environ.get("SAS_OPENCL", "").lower()
+    return HAVE_CUDA and (env == "" or env.startswith("cuda"))
 
 ENV = None
@@ -115,3 +116,6 @@
     """
     global ENV
+    # Free any previous allocated context.
+    if ENV is not None and ENV.context is not None:
+        ENV.release()
     ENV = GpuEnvironment() if use_cuda() else None
@@ -126,5 +130,5 @@
     if ENV is None:
         if not HAVE_CUDA:
-            raise RuntimeError("OpenCL startup failed with ***"
+            raise RuntimeError("CUDA startup failed with ***"
                                + CUDA_ERROR + "***; using C compiler instead")
         reset_environment()
@@ -133,26 +137,12 @@
     return ENV
 
-def _stretch_input(vector, dtype, extra=1e-3, boundary=32):
-    # type: (np.ndarray, np.dtype, float, int) -> np.ndarray
-    """
-    Stretch an input vector to the correct boundary.
-
-    Performance on the kernels can drop by a factor of two or more if the
-    number of values to compute does not fall on a nice power of two
-    boundary. The trailing additional vector elements are given a
-    value of *extra*, and so f(*extra*) will be computed for each of
-    them. The returned array will thus be a subset of the computed array.
-
-    *boundary* should be a power of 2 which is at least 32 for good
-    performance on current platforms (as of Jan 2015). It should
-    probably be the max of get_warp(kernel,queue) and
-    device.min_data_type_align_size//4.
-    """
-    remainder = vector.size % boundary
-    if remainder != 0:
-        size = vector.size + (boundary - remainder)
-        vector = np.hstack((vector, [extra] * (size - vector.size)))
-    return np.ascontiguousarray(vector, dtype=dtype)
-
+def has_type(dtype):
+    # type: (np.dtype) -> bool
+    """
+    Return true if device supports the requested precision.
+    """
+    # Assume the nvidia card supports 32-bit and 64-bit floats.
+    # TODO: check if pycuda support F16
+    return dtype in (generate.F32, generate.F64)
 
 def compile_model(source, dtype, fast=False):
@@ -159,3 +149,3 @@
-    # type: (str, np.dtype, bool) -> cl.Program
+    # type: (str, np.dtype, bool) -> SourceModule
     """
     Build a model to run on the gpu.
@@ -165,13 +155,14 @@
     devices in the context do not support the cl_khr_fp64 extension.
     """
+    dtype = np.dtype(dtype)
+    if not has_type(dtype):
+        raise RuntimeError("%s not supported for devices"%dtype)
+
     source_list = [generate.convert_type(source, dtype)]
-
-    if dtype == generate.F16:
-        source_list.insert(0, _F16_PRAGMA)
-    elif dtype == generate.F64:
-        source_list.insert(0, _F64_PRAGMA)
 
     source_list.insert(0, "#define USE_SINCOS\n")
     source = "\n".join(source_list)
-    program = SourceModule(source) # no_extern_c=True, include_dirs=[...]
+    options = '-use_fast_math' if fast else None
+    program = SourceModule(source, no_extern_c=True, options=options) # include_dirs=[...]
+    #print("done with "+program)
     return program
@@ -184,12 +175,29 @@
     GPU context, with possibly many devices, and one queue per device.
     """
-    def __init__(self, devnum=0):
-        # type: () -> None
+    context = None # type: cuda.Context
+    def __init__(self, devnum=None):
+        # type: (int) -> None
         # Byte boundary for data alignment
         #self.data_boundary = max(d.min_data_type_align_size
         #                          for d in self.context.devices)
         self.compiled = {}
-        #self.device = cuda.Device(devnum)
-        #self.context = self.device.make_context()
+        env = os.environ.get("SAS_OPENCL", "").lower()
+        if devnum is None and env.startswith("cuda:"):
+            devnum = int(env[5:])
+        # Set the global context to the particular device number if one is
+        # given, otherwise use the default context. Perhaps this will be set
+        # by an environment variable within autoinit.
+        if devnum is not None:
+            self.context = cuda.Device(devnum).make_context()
+        else:
+            self.context = make_default_context()
+
+    def release(self):
+        if self.context is not None:
+            self.context.pop()
+            self.context = None
+
+    def __del__(self):
+        self.release()
 
     def has_type(self, dtype):
@@ -198,5 +206,5 @@
         Return True if all devices support a given type.
         """
-        return True
+        return has_type(dtype)
 
     def compile_program(self, name, source, dtype, fast, timestamp):
@@ -235,4 +243,11 @@
     that the compiler is allowed to take shortcuts.
     """
+    info = None # type: ModelInfo
+    source = "" # type: str
+    dtype = None # type: np.dtype
+    fast = False # type: bool
+    program = None # type: SourceModule
+    _kernels = None # type: List[cuda.Function]
+
     def __init__(self, source, model_info, dtype=generate.F32, fast=False):
         # type: (Dict[str,str], ModelInfo, np.dtype, bool) -> None
@@ -418,8 +433,9 @@
                 last_nap = current_time
         sync()
+        cuda.memcpy_dtoh(self.result, self.result_b)
+        #print("result", self.result)
+
         details_b.free()
         values_b.free()
-        cuda.memcpy_dtoh(self.result, self.result_b)
-        #print("result", self.result)
 
         pd_norm = self.result[self.q_input.nq]
@@ -459,5 +475,6 @@
 
     #line added to not hog resources
-    while not done.query(): time.sleep(0.01)
+    while not done.query():
+        time.sleep(0.01)
 
     # Block until the GPU executes the kernel.
@@ -473,18 +490,19 @@
     efficiency.
     '''
-    max_gx, max_gy = 65535,65535
+    max_gx, max_gy = 65535, 65535
     blocksize = 32
-    #max_gx, max_gy = 5,65536
+    #max_gx, max_gy = 5, 65536
     #blocksize = 3
-    block = (blocksize, 1,1)
+    block = (blocksize, 1, 1)
     num_blocks = int((n+blocksize-1)/blocksize)
     if num_blocks < max_gx:
-        grid = (num_blocks, 1)
+        grid = (num_blocks, 1)
     else:
         gx = max_gx
         gy = (num_blocks + max_gx - 1) / max_gx
-        if gy >= max_gy: raise ValueError("vector is too large")
-        grid = (gx,gy)
-    #print "block",block,"grid",grid
-    #print "waste",block[0]*block[1]*block[2]*grid[0]*grid[1] - n
-    return dict(block=block,grid=grid)
+        if gy >= max_gy:
+            raise ValueError("vector is too large")
+        grid = (gx, gy)
+    #print("block", block, "grid", grid)
+    #print("waste", block[0]*block[1]*block[2]*grid[0]*grid[1] - n)
+    return dict(block=block, grid=grid)
```
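The block/grid partitioning at the end of the diff can be followed with a standalone sketch (not part of the changeset; pure Python, no pycuda required; the 65535 limits come from the code above):

```python
# Sketch only: mirror of the CUDA block/grid partitioning logic.
def partition_sketch(n, blocksize=32, max_gx=65535, max_gy=65535):
    num_blocks = (n + blocksize - 1) // blocksize
    if num_blocks < max_gx:
        grid = (num_blocks, 1)
    else:
        gx = max_gx
        gy = (num_blocks + max_gx - 1) // max_gx
        if gy >= max_gy:
            raise ValueError("vector is too large")
        grid = (gx, gy)
    return dict(block=(blocksize, 1, 1), grid=grid)

print(partition_sketch(1000))       # {'block': (32, 1, 1), 'grid': (32, 1)}
print(partition_sketch(3_000_000))  # grid spills into a second dimension
```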