Changeset b0de252 in sasmodels


Timestamp:
Oct 12, 2018 7:31:24 PM
Author:
pkienzle
Branches:
beta_approx, cuda-test, py3, ticket-1015-gpu-mem-error, ticket-1157, ticket-608-user-defined-weights, ticket_1156
Children:
74e9b5f
Parents:
47fb816
Message:

improve control over cuda context

Location:
sasmodels
Files:
6 edited

  • sasmodels/compare.py

    diff from r610ef23 to rb0de252

    @@ -115,5 +115,5 @@
         === environment variables ===
         -DSAS_MODELPATH=path sets directory containing custom models
    -    -DSAS_OPENCL=vendor:device|none sets the target OpenCL device
    +    -DSAS_OPENCL=vendor:device|cuda:device|none sets the target GPU device
         -DXDG_CACHE_HOME=~/.cache sets the pyopencl cache root (linux only)
         -DSAS_COMPILER=tinycc|msvc|mingw|unix sets the DLL compiler
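    The new flag mirrors the SAS_OPENCL environment variable. As a minimal
    sketch (the "cylinder" model is just an arbitrary example), the variable
    must be set before a model is loaded so the kernel modules can pick the
    platform:

        import os

        # Route kernels to the second CUDA device.  Other accepted values:
        # "vendor:device" for an OpenCL device, plain "cuda" for the default
        # CUDA device, or "none" to fall back to the compiled C DLL path.
        os.environ["SAS_OPENCL"] = "cuda:1"

        from sasmodels.core import load_model
        model = load_model("cylinder")  # compiled for the device chosen above
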
  • sasmodels/core.py

    diff from r47fb816 to rb0de252

    @@ -13,7 +13,4 @@
     from glob import glob
     import re
    -
    -# Set "SAS_OPENCL=cuda" in the environment to use the CUDA rather than OpenCL
    -USE_CUDA = os.environ.get("SAS_OPENCL", "") == "cuda"
     
     import numpy as np # type: ignore

    @@ -24,8 +21,6 @@
     from . import mixture
     from . import kernelpy
    -if USE_CUDA:
    -    from . import kernelcuda
    -else:
    -    from . import kernelcl
    +from . import kernelcuda
    +from . import kernelcl
     from . import kerneldll
     from . import custom

    @@ -216,6 +211,5 @@
             #print("building dll", numpy_dtype)
             return kerneldll.load_dll(source['dll'], model_info, numpy_dtype)
    -    elif USE_CUDA:
    -        #print("building cuda", numpy_dtype)
    +    elif platform == "cuda":
             return kernelcuda.GpuModel(source, model_info, numpy_dtype, fast=fast)
         else:

    @@ -254,5 +248,5 @@
         # type: (ModelInfo, str, str) -> (np.dtype, bool, str)
         """
    -    Interpret dtype string, returning np.dtype and fast flag.
    +    Interpret dtype string, returning np.dtype, fast flag and platform.
     
         Possible types include 'half', 'single', 'double' and 'quad'.  If the

    @@ -262,7 +256,10 @@
         default for the model and platform.
     
    -    Platform preference can be specfied ("ocl" vs "dll"), with the default
    -    being OpenCL if it is availabe.  If the dtype name ends with '!' then
    -    platform is forced to be DLL rather than OpenCL.
    +    Platform preference can be specfied ("ocl", "cuda", "dll"), with the
    +    default being OpenCL or CUDA if available, otherwise DLL.  If the dtype
    +    name ends with '!' then platform is forced to be DLL rather than GPU.
    +    The default platform is set by the environment variable SAS_OPENCL,
    +    SAS_OPENCL=driver:device for OpenCL, SAS_OPENCL=cuda:device for CUDA
    +    or SAS_OPENCL=none for DLL.
     
         This routine ignores the preferences within the model definition.  This

    @@ -277,12 +274,4 @@
         if platform is None:
             platform = "ocl"
    -    if not model_info.opencl:
    -        platform = "dll"
    -    elif USE_CUDA:
    -        if not kernelcuda.use_cuda():
    -            platform = "dll"
    -    else:
    -        if not kernelcl.use_opencl():
    -            platform = "dll"
     
         # Check if type indicates dll regardless of which platform is given

    @@ -290,4 +279,12 @@
             platform = "dll"
             dtype = dtype[:-1]
    +
    +    # Make sure model allows opencl/gpu
    +    if not model_info.opencl:
    +        platform = "dll"
    +
    +    # Make sure opencl is available, or fallback to cuda then to dll
    +    if platform == "ocl" and not kernelcl.use_opencl():
    +        platform = "cuda" if kernelcuda.use_cuda() else "dll"
     
         # Convert special type names "half", "fast", and "quad"

    @@ -300,21 +297,23 @@
             dtype = "float16"
     
    -    # Convert dtype string to numpy dtype.
    +    # Convert dtype string to numpy dtype.  Use single precision for GPU
    +    # if model allows it, otherwise use double precision.
         if dtype is None or dtype == "default":
    -        numpy_dtype = (generate.F32 if platform == "ocl" and model_info.single
    +        numpy_dtype = (generate.F32 if model_info.single and platform in ("ocl", "cuda")
                            else generate.F64)
         else:
             numpy_dtype = np.dtype(dtype)
     
    -    # Make sure that the type is supported by opencl, otherwise use dll
    +    # Make sure that the type is supported by GPU, otherwise use dll
         if platform == "ocl":
    -        if USE_CUDA:
    -            env = kernelcuda.environment()
    -        else:
    -            env = kernelcl.environment()
    -        if not env.has_type(numpy_dtype):
    -            platform = "dll"
    -            if dtype is None:
    -                numpy_dtype = generate.F64
    +        env = kernelcl.environment()
    +    elif platform == "cuda":
    +        env = kernelcuda.environment()
    +    else:
    +        env = None
    +    if env is not None and not env.has_type(numpy_dtype):
    +        platform = "dll"
    +        if dtype is None:
    +            numpy_dtype = generate.F64
     
         return numpy_dtype, fast, platform
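    Taken together, the parse_dtype changes reduce platform selection to one
    fallback chain. A standalone sketch of that chain (pick_platform is a
    hypothetical name; in the changeset the logic is inline in parse_dtype):

        from sasmodels import kernelcl, kernelcuda

        def pick_platform(platform, model_info, dtype_name):
            # A trailing "!" on the dtype string forces the C DLL path.
            if dtype_name is not None and dtype_name.endswith("!"):
                return "dll"
            # Models that opt out of GPU execution always take the DLL path.
            if not model_info.opencl:
                return "dll"
            # Prefer OpenCL, then CUDA, then the C compiler.
            if platform == "ocl" and not kernelcl.use_opencl():
                return "cuda" if kernelcuda.use_cuda() else "dll"
            return platform
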
  • sasmodels/kernel_header.c

    diff from r0db7dbd to rb0de252

    @@ -5,5 +5,10 @@
     #elif defined(_OPENMP)
     # define USE_OPENMP
    -#endif
    +#elif defined(__CUDACC__)
    +# define USE_CUDA
    +#endif
    +
    +// Use SAS_DOUBLE to force the use of double even for float kernels
    +#define SAS_DOUBLE dou ## ble
     
     // If opencl is not available, then we are compiling a C function

    @@ -127,7 +132,4 @@
     #endif // !USE_OPENCL
     
    -// Use SAS_DOUBLE to force the use of double even for float kernels
    -#define SAS_DOUBLE dou ## ble
    -
     #if defined(NEED_EXPM1)
        // TODO: precision is a half digit lower than numpy on mac in [1e-7, 0.5]
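    Moving SAS_DOUBLE above the OpenCL-only block makes the macro visible to
    the new __CUDACC__ branch as well. The token pasting is what lets it
    survive single-precision conversion: generate.convert_type rewrites
    "double" tokens to "float" in the kernel source, and "dou ## ble"
    contains no such token. A toy sketch of that substitution (to_single is
    a stand-in, not the real converter):

        import re

        def to_single(source):
            # Rough stand-in for the double-to-float rewrite performed when
            # building single-precision kernels.
            return re.sub(r"\bdouble\b", "float", source)

        kernel = "#define SAS_DOUBLE dou ## ble\ndouble q; SAS_DOUBLE norm;"
        print(to_single(kernel))
        # "double q" becomes "float q", but SAS_DOUBLE still expands to
        # double when the C preprocessor pastes "dou" and "ble" together.
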
  • sasmodels/kernel_iq.c

    diff from r47fb816 to rb0de252

    @@ -80,4 +80,5 @@
     //     du * (m_sigma_y + 1j*m_sigma_z);
     // weights for spin crosssections: dd du real, ud real, uu, du imag, ud imag
    +__device__
     static void set_spin_weights(double in_spin, double out_spin, double weight[6])
     {
  • sasmodels/kernelcl.py

    diff from rd86f0fc to rb0de252

    @@ -1,4 +1,6 @@
     """
     GPU driver for C kernels
    +
    +TODO: docs are out of date
     
     There should be a single GPU environment running on the system.  This

    @@ -59,5 +61,5 @@
     
     
    -# Attempt to setup opencl. This may fail if the opencl package is not
    +# Attempt to setup opencl. This may fail if the pyopencl package is not
     # installed or if it is installed but there are no devices available.
     try:

    @@ -131,5 +133,6 @@
     
     def use_opencl():
    -    return HAVE_OPENCL and os.environ.get("SAS_OPENCL", "").lower() != "none"
    +    env = os.environ.get("SAS_OPENCL", "").lower()
    +    return HAVE_OPENCL and env != "none" and not env.startswith("cuda")
     
     ENV = None

    @@ -179,27 +182,4 @@
             cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
             queue.device)
    -
    -def _stretch_input(vector, dtype, extra=1e-3, boundary=32):
    -    # type: (np.ndarray, np.dtype, float, int) -> np.ndarray
    -    """
    -    Stretch an input vector to the correct boundary.
    -
    -    Performance on the kernels can drop by a factor of two or more if the
    -    number of values to compute does not fall on a nice power of two
    -    boundary.   The trailing additional vector elements are given a
    -    value of *extra*, and so f(*extra*) will be computed for each of
    -    them.  The returned array will thus be a subset of the computed array.
    -
    -    *boundary* should be a power of 2 which is at least 32 for good
    -    performance on current platforms (as of Jan 2015).  It should
    -    probably be the max of get_warp(kernel,queue) and
    -    device.min_data_type_align_size//4.
    -    """
    -    remainder = vector.size % boundary
    -    if remainder != 0:
    -        size = vector.size + (boundary - remainder)
    -        vector = np.hstack((vector, [extra] * (size - vector.size)))
    -    return np.ascontiguousarray(vector, dtype=dtype)
     
     def compile_model(context, source, dtype, fast=False):
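    The reworked use_opencl() and the use_cuda() gate in kernelcuda.py
    (below) split the SAS_OPENCL values between them. A small demo of the
    gating, assuming both pyopencl and pycuda imported cleanly:

        def gates(env):
            # Mirror the two gates with HAVE_OPENCL = HAVE_CUDA = True.
            env = env.lower()
            use_opencl = env != "none" and not env.startswith("cuda")
            use_cuda = env == "" or env.startswith("cuda")
            return use_opencl, use_cuda

        for env in ("", "cuda", "cuda:1", "intel:0", "none"):
            print("SAS_OPENCL=%r -> opencl=%s, cuda=%s" % ((env,) + gates(env)))

    When SAS_OPENCL is unset both gates are open; parse_dtype still prefers
    OpenCL and only falls through to CUDA when use_opencl() returns False.
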
  • sasmodels/kernelcuda.py

    diff from r0db7dbd to rb0de252

    @@ -1,4 +1,11 @@
     """
    -GPU driver for C kernels
    +GPU driver for C kernels (with CUDA)
    +
    +To select cuda, use SAS_OPENCL=cuda, or SAS_OPENCL=cuda:n for a particular
    +device number.  If no device number is specified, then look for CUDA_DEVICE=n
    +or a file ~/.cuda-device containing n for the device number.  Otherwise, try
    +all available device numbers.
    +
    +TODO: docs are out of date
     
     There should be a single GPU environment running on the system.  This

    @@ -59,10 +66,18 @@
     
     
    -# Attempt to setup opencl. This may fail if the opencl package is not
    +# Attempt to setup cuda. This may fail if the pycuda package is not
     # installed or if it is installed but there are no devices available.
     try:
    -    import pycuda.autoinit
         import pycuda.driver as cuda  # type: ignore
         from pycuda.compiler import SourceModule
    +    from pycuda.tools import make_default_context, clear_context_caches
    +    # Ask CUDA for the default context (so that we know that one exists)
    +    # then immediately throw it away in case the user doesn't want it.
    +    # Note: cribbed from pycuda.autoinit
    +    cuda.init()
    +    context = make_default_context()
    +    context.pop()
    +    clear_context_caches()
    +    del context
         HAVE_CUDA = True
         CUDA_ERROR = ""

    @@ -91,21 +106,7 @@
     MAX_LOOPS = 2048
     
    -
    -# Pragmas for enable OpenCL features.  Be sure to protect them so that they
    -# still compile even if OpenCL is not present.
    -_F16_PRAGMA = """\
    -#if defined(__OPENCL_VERSION__) // && !defined(cl_khr_fp16)
    -#  pragma OPENCL EXTENSION cl_khr_fp16: enable
    -#endif
    -"""
    -
    -_F64_PRAGMA = """\
    -#if defined(__OPENCL_VERSION__) // && !defined(cl_khr_fp64)
    -#  pragma OPENCL EXTENSION cl_khr_fp64: enable
    -#endif
    -"""
    -
     def use_cuda():
    -    return HAVE_CUDA
    +    env = os.environ.get("SAS_OPENCL", "").lower()
    +    return HAVE_CUDA and (env == "" or env.startswith("cuda"))
     
     ENV = None

    @@ -115,4 +116,7 @@
         """
         global ENV
    +    # Free any previous allocated context.
    +    if ENV is not None and ENV.context is not None:
    +        ENV.release()
         ENV = GpuEnvironment() if use_cuda() else None
     

    @@ -126,5 +130,5 @@
         if ENV is None:
             if not HAVE_CUDA:
    -            raise RuntimeError("OpenCL startup failed with ***"
    +            raise RuntimeError("CUDA startup failed with ***"
                                 + CUDA_ERROR + "***; using C compiler instead")
             reset_environment()

    @@ -133,29 +137,15 @@
         return ENV
     
    -def _stretch_input(vector, dtype, extra=1e-3, boundary=32):
    -    # type: (np.ndarray, np.dtype, float, int) -> np.ndarray
    -    """
    -    Stretch an input vector to the correct boundary.
    -
    -    Performance on the kernels can drop by a factor of two or more if the
    -    number of values to compute does not fall on a nice power of two
    -    boundary.   The trailing additional vector elements are given a
    -    value of *extra*, and so f(*extra*) will be computed for each of
    -    them.  The returned array will thus be a subset of the computed array.
    -
    -    *boundary* should be a power of 2 which is at least 32 for good
    -    performance on current platforms (as of Jan 2015).  It should
    -    probably be the max of get_warp(kernel,queue) and
    -    device.min_data_type_align_size//4.
    -    """
    -    remainder = vector.size % boundary
    -    if remainder != 0:
    -        size = vector.size + (boundary - remainder)
    -        vector = np.hstack((vector, [extra] * (size - vector.size)))
    -    return np.ascontiguousarray(vector, dtype=dtype)
    -
    +def has_type(dtype):
    +    # type: (np.dtype) -> bool
    +    """
    +    Return true if device supports the requested precision.
    +    """
    +    # Assume the nvidia card supports 32-bit and 64-bit floats.
    +    # TODO: check if pycuda support F16
    +    return dtype in (generate.F32, generate.F64)
     
     def compile_model(source, dtype, fast=False):
    -    # type: (str, np.dtype, bool) -> cl.Program
    +    # type: (str, np.dtype, bool) -> SourceModule
         """
         Build a model to run on the gpu.

    @@ -165,14 +155,15 @@
         devices in the context do not support the cl_khr_fp64 extension.
         """
    +    dtype = np.dtype(dtype)
    +    if not has_type(dtype):
    +        raise RuntimeError("%s not supported for devices"%dtype)
    +
         source_list = [generate.convert_type(source, dtype)]
    -
    -    if dtype == generate.F16:
    -        source_list.insert(0, _F16_PRAGMA)
    -    elif dtype == generate.F64:
    -        source_list.insert(0, _F64_PRAGMA)
     
         source_list.insert(0, "#define USE_SINCOS\n")
         source = "\n".join(source_list)
    -    program = SourceModule(source) # no_extern_c=True, include_dirs=[...]
    +    options = '-use_fast_math' if fast else None
    +    program = SourceModule(source, no_extern_c=True, options=options) # include_dirs=[...]
    +    #print("done with "+program)
         return program
     

    @@ -184,12 +175,29 @@
         GPU context, with possibly many devices, and one queue per device.
         """
    -    def __init__(self, devnum=0):
    -        # type: () -> None
    +    context = None # type: cuda.Context
    +    def __init__(self, devnum=None):
    +        # type: (int) -> None
             # Byte boundary for data alignment
             #self.data_boundary = max(d.min_data_type_align_size
             #                         for d in self.context.devices)
             self.compiled = {}
    -        #self.device = cuda.Device(devnum)
    -        #self.context = self.device.make_context()
    +        env = os.environ.get("SAS_OPENCL", "").lower()
    +        if devnum is None and env.startswith("cuda:"):
    +            devnum = int(env[5:])
    +        # Set the global context to the particular device number if one is
    +        # given, otherwise use the default context.  Perhaps this will be set
    +        # by an environment variable within autoinit.
    +        if devnum is not None:
    +            self.context = cuda.Device(devnum).make_context()
    +        else:
    +            self.context = make_default_context()
    +
    +    def release(self):
    +        if self.context is not None:
    +            self.context.pop()
    +            self.context = None
    +
    +    def __del__(self):
    +        self.release()
     
         def has_type(self, dtype):

    @@ -198,5 +206,5 @@
             Return True if all devices support a given type.
             """
    -        return True
    +        return has_type(dtype)
     
         def compile_program(self, name, source, dtype, fast, timestamp):

    @@ -235,4 +243,11 @@
         that the compiler is allowed to take shortcuts.
         """
    +    info = None # type: ModelInfo
    +    source = "" # type: str
    +    dtype = None # type: np.dtype
    +    fast = False # type: bool
    +    program = None # type: SourceModule
    +    _kernels = None # type: List[cuda.Function]
    +
         def __init__(self, source, model_info, dtype=generate.F32, fast=False):
             # type: (Dict[str,str], ModelInfo, np.dtype, bool) -> None

    @@ -418,8 +433,9 @@
                         last_nap = current_time
             sync()
    +        cuda.memcpy_dtoh(self.result, self.result_b)
    +        #print("result", self.result)
    +
             details_b.free()
             values_b.free()
    -        cuda.memcpy_dtoh(self.result, self.result_b)
    -        #print("result", self.result)
     
             pd_norm = self.result[self.q_input.nq]

    @@ -459,5 +475,6 @@
     
         #line added to not hog resources
    -    while not done.query(): time.sleep(0.01)
    +    while not done.query():
    +        time.sleep(0.01)
     
         # Block until the GPU executes the kernel.

    @@ -473,19 +490,19 @@
         efficiency.
         '''
    -    max_gx,max_gy = 65535,65535
    +    max_gx, max_gy = 65535, 65535
         blocksize = 32
    -    #max_gx,max_gy = 5,65536
    +    #max_gx, max_gy = 5, 65536
         #blocksize = 3
    -    block = (blocksize,1,1)
    +    block = (blocksize, 1, 1)
         num_blocks = int((n+blocksize-1)/blocksize)
         if num_blocks < max_gx:
    -        grid = (num_blocks,1)
    +        grid = (num_blocks, 1)
         else:
             gx = max_gx
             gy = (num_blocks + max_gx - 1) / max_gx
    -        if gy >= max_gy: raise ValueError("vector is too large")
    -        grid = (gx,gy)
    -    #print "block",block,"grid",grid
    -    #print "waste",block[0]*block[1]*block[2]*grid[0]*grid[1] - n
    -    return dict(block=block,grid=grid)
    -
    +        if gy >= max_gy:
    +            raise ValueError("vector is too large")
    +        grid = (gx, gy)
    +    #print("block", block, "grid", grid)
    +    #print("waste", block[0]*block[1]*block[2]*grid[0]*grid[1] - n)
    +    return dict(block=block, grid=grid)
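    The reworked partition() maps n points onto a CUDA launch configuration,
    32 threads per block, folding the block count into a second grid
    dimension once it passes the 65535-blocks-per-dimension limit. The same
    arithmetic as a standalone sketch (using // for integer division so the
    grid entries stay ints):

        def partition(n, blocksize=32, max_gx=65535, max_gy=65535):
            # One thread per point, rounded up to whole blocks.
            block = (blocksize, 1, 1)
            num_blocks = (n + blocksize - 1) // blocksize
            if num_blocks < max_gx:
                grid = (num_blocks, 1)
            else:
                # Fold the 1-D block count into a 2-D grid.
                gx = max_gx
                gy = (num_blocks + max_gx - 1) // max_gx
                if gy >= max_gy:
                    raise ValueError("vector is too large")
                grid = (gx, gy)
            return dict(block=block, grid=grid)

        print(partition(1000))     # {'block': (32, 1, 1), 'grid': (32, 1)}
        print(partition(3000000))  # 93750 blocks -> grid (65535, 2)
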