Diff [508475acdd4b81a0a845a0320a972480faef7a39:2a12d8d8fd9623118d498493994f27e450e2da04] for / – SasView

doc/guide/gpu_setup.rst

-                      r8b31efa
+                      r63602b1
 Device Selection
 ================
-**OpenCL drivers**
 If you have multiple GPU devices you can tell the program which device to use.
 By default, the program looks for one GPU and one CPU device from available
 …
 was used to run the model.
+If you want to use a specific driver and devices, you can run the following
+**If you don't want to use OpenCL, you can set** *SAS_OPENCL=None*
+**in your environment settings, and it will only use normal programs.**
+If you want to use one of the other devices, you can run the following
 from the python console::
 …
 This will provide a menu of different OpenCL drivers available.
 When one is selected, it will say "set PYOPENCL_CTX=..."
+Use that value as the value of *SAS_OPENCL=driver:device*.
+To use the default OpenCL device (rather than CUDA or None),
+set *SAS_OPENCL=opencl*.
+In batch queues, you may need to set *XDG_CACHE_HOME=~/.cache*
+(Linux only) to a different directory, depending on how the filesystem
+is configured.  You should also set *SAS_DLL_PATH* for CPU-only modules.
+    -DSAS_MODELPATH=path sets directory containing custom models
+    -DSAS_OPENCL=vendor:device|cuda:device|none sets the target GPU device
+    -DXDG_CACHE_HOME=~/.cache sets the pyopencl cache root (linux only)
+    -DSAS_COMPILER=tinycc|msvc|mingw|unix sets the DLL compiler
+    -DSAS_OPENMP=1 turns on OpenMP for the DLLs
+    -DSAS_DLL_PATH=path sets the path to the compiled modules
+**CUDA drivers**
+If OpenCL drivers are not available on your system, but NVidia CUDA
+drivers are available, then set *SAS_OPENCL=cuda* or
+*SAS_OPENCL=cuda:n* for a particular device number *n*.  If no device
+number is specified, then the CUDA drivers look for
+*CUDA_DEVICE=n* or a file ~/.cuda-device containing n for the device number.
+In batch queues, the SLURM command *sbatch --gres=gpu:1 ...* will set
+*CUDA_VISIBLE_DEVICES=n*, which ought to set the correct device
+number for *SAS_OPENCL=cuda*.  If not, then set
+*CUDA_DEVICE=$CUDA_VISIBLE_DEVICES* within the batch script.  You may
+need to set the CUDA cache directory to a folder accessible across the
+cluster with *PYCUDA_CACHE_DIR* (or *PYCUDA_DISABLE_CACHE* to disable
+caching), and you may need to set environment specific compiler flags
+with *PYCUDA_DEFAULT_NVCC_FLAGS*.  You should also set *SAS_DLL_PATH*
+for CPU-only modules.
+**No GPU support**
+If you don't want to use OpenCL or CUDA, you can set *SAS_OPENCL=None*
+in your environment settings, and it will only use normal programs.
+In batch queues, you may need to set *SAS_DLL_PATH* to a directory
+accessible on the compute node.
+Use that value as the value of *SAS_OPENCL*.
 Device Testing
 …
 *Document History*
 | 2018-10-15 Paul Kienzle
+| 2017-09-27 Paul Kienzle

sasmodels/compare.py

-                      r4de14584
+                      r610ef23
 from . import kerneldll
 from . import kernelcl
-from . import kernelcuda
 from .data import plot_theory, empty_data1D, empty_data2D, load_data
 from .direct_model import DirectModel, get_mesh
 …
     === environment variables ===
     -DSAS_MODELPATH=path sets directory containing custom models
     -DSAS_OPENCL=vendor:device|cuda:device|none sets the target GPU device
+    -DSAS_OPENCL=vendor:device|none sets the target OpenCL device
     -DXDG_CACHE_HOME=~/.cache sets the pyopencl cache root (linux only)
     -DSAS_COMPILER=tinycc|msvc|mingw|unix sets the DLL compiler
 …
         set_integration_size(model_info, ngauss)
+    if (dtype != "default" and not dtype.endswith('!')
+            and not (kernelcl.use_opencl() or kernelcuda.use_cuda())):
+    if dtype != "default" and not dtype.endswith('!') and not kernelcl.use_opencl():
         raise RuntimeError("OpenCL not available " + kernelcl.OPENCL_ERROR)

sasmodels/core.py

-                      rb0de252
+                      r2dcd6e7
 from . import mixture
 from . import kernelpy
-from . import kernelcuda
 from . import kernelcl
 from . import kerneldll
 …
         #print("building dll", numpy_dtype)
         return kerneldll.load_dll(source['dll'], model_info, numpy_dtype)
-    elif platform == "cuda":
-        return kernelcuda.GpuModel(source, model_info, numpy_dtype, fast=fast)
     else:
         #print("building ocl", numpy_dtype)
 …
     # type: (ModelInfo, str, str) -> (np.dtype, bool, str)
     """
     Interpret dtype string, returning np.dtype, fast flag and platform.
+    Interpret dtype string, returning np.dtype and fast flag.
     Possible types include 'half', 'single', 'double' and 'quad'.  If the
 …
     default for the model and platform.
+    Platform preference can be specified ("ocl", "cuda", "dll"), with the
+    default being OpenCL or CUDA if available, otherwise DLL.  If the dtype
+    name ends with '!' then platform is forced to be DLL rather than GPU.
+    The default platform is set by the environment variable SAS_OPENCL,
+    SAS_OPENCL=driver:device for OpenCL, SAS_OPENCL=cuda:device for CUDA
+    or SAS_OPENCL=none for DLL.
+    Platform preference can be specified ("ocl" vs "dll"), with the default
+    being OpenCL if it is available.  If the dtype name ends with '!' then
+    platform is forced to be DLL rather than OpenCL.
     This routine ignores the preferences within the model definition.  This
 …
     if platform is None:
         platform = "ocl"
+    if not kernelcl.use_opencl() or not model_info.opencl:
+        platform = "dll"
     # Check if type indicates dll regardless of which platform is given
 …
         platform = "dll"
         dtype = dtype[:-1]
-    # Make sure model allows opencl/gpu
-    if not model_info.opencl:
-        platform = "dll"
-    # Make sure opencl is available, or fallback to cuda then to dll
-    if platform == "ocl" and not kernelcl.use_opencl():
-        platform = "cuda" if kernelcuda.use_cuda() else "dll"
     # Convert special type names "half", "fast", and "quad"
 …
         dtype = "float16"
+    # Convert dtype string to numpy dtype.  Use single precision for GPU
+    # if model allows it, otherwise use double precision.
+    # Convert dtype string to numpy dtype.
     if dtype is None or dtype == "default":
         numpy_dtype = (generate.F32 if model_info.single and platform in ("ocl", "cuda")
+        numpy_dtype = (generate.F32 if platform == "ocl" and model_info.single
                        else generate.F64)
     else:
         numpy_dtype = np.dtype(dtype)
     # Make sure that the type is supported by GPU, otherwise use dll
+    # Make sure that the type is supported by opencl, otherwise use dll
     if platform == "ocl":
         env = kernelcl.environment()
+    elif platform == "cuda":
+        env = kernelcuda.environment()
+    else:
+        env = None
+    if env is not None and not env.has_type(numpy_dtype):
+        platform = "dll"
+        if dtype is None:
+            numpy_dtype = generate.F64
+        if not env.has_type(numpy_dtype):
+            platform = "dll"
+            if dtype is None:
+                numpy_dtype = generate.F64
     return numpy_dtype, fast, platform

sasmodels/kernel_header.c

-                      r74e9b5f
+                      r108e70e
 #ifdef __OPENCL_VERSION__
 # define USE_OPENCL
-#elif defined(__CUDACC__)
-# define USE_CUDA
 #elif defined(_OPENMP)
 # define USE_OPENMP
 #endif
-// Use SAS_DOUBLE to force the use of double even for float kernels
-#define SAS_DOUBLE dou ## ble
 // If opencl is not available, then we are compiling a C function
 // Note: if using a C++ compiler, then define kernel as extern "C"
 #ifdef USE_OPENCL
-   #define USE_GPU
-   #define pglobal global
-   #define pconstant constant
    typedef int int32_t;
+   #if defined(USE_SINCOS)
+   #  define SINCOS(angle,svar,cvar) svar=sincos(angle,&cvar)
+   #else
+   #  define SINCOS(angle,svar,cvar) do {const double _t_=angle; svar=sin(_t_);cvar=cos(_t_);} while (0)
+   #endif
+#  if defined(USE_SINCOS)
+#    define SINCOS(angle,svar,cvar) svar=sincos(angle,&cvar)
+#  else
+#    define SINCOS(angle,svar,cvar) do {const double _t_=angle; svar=sin(_t_);cvar=cos(_t_);} while (0)
+#  endif
    // Intel CPU on Mac gives strange values for erf(); on the verified
    // platforms (intel, nvidia, amd), the cephes erf() is significantly
 …
    #  define erfcf erfc
    #endif
+#elif defined(USE_CUDA)
+   #define USE_GPU
+   #define local __shared__
+   #define pglobal
+   #define constant __constant__
+   #define pconstant const
+   #define kernel extern "C" __global__
+   // OpenCL powr(a,b) = C99 pow(a,b), b >= 0
+   // OpenCL pown(a,b) = C99 pow(a,b), b integer
+   #define powr(a,b) pow(a,b)
+   #define pown(a,b) pow(a,b)
+   //typedef int int32_t;
+   #if defined(USE_SINCOS)
+   #  define SINCOS(angle,svar,cvar) sincos(angle,&svar,&cvar)
+   #else
+   #  define SINCOS(angle,svar,cvar) do {const double _t_=angle; svar=sin(_t_);cvar=cos(_t_);} while (0)
+   #endif
+#else // !USE_OPENCL && !USE_CUDA
+   #define local
+   #define pglobal
+   #define constant const
+   #define pconstant const
+   #ifdef __cplusplus
+#else // !USE_OPENCL
+// Use SAS_DOUBLE to force the use of double even for float kernels
+#  define SAS_DOUBLE dou ## ble
+#  ifdef __cplusplus
       #include <cstdio>
       #include <cmath>
 …
      #endif
      inline void SINCOS(double angle, double &svar, double &cvar) { svar=sin(angle); cvar=cos(angle); }
    #else // !__cplusplus
+#  else // !__cplusplus
      #include <inttypes.h>  // C99 guarantees that int32_t types is here
      #include <stdio.h>
 …
      #define kernel
      #define SINCOS(angle,svar,cvar) do {const double _t_=angle; svar=sin(_t_);cvar=cos(_t_);} while (0)
+   #endif  // !__cplusplus
+   // OpenCL powr(a,b) = C99 pow(a,b), b >= 0
+   // OpenCL pown(a,b) = C99 pow(a,b), b integer
+   #define powr(a,b) pow(a,b)
+   #define pown(a,b) pow(a,b)
+#  endif  // !__cplusplus
+#  define global
+#  define local
+#  define constant const
+// OpenCL powr(a,b) = C99 pow(a,b), b >= 0
+// OpenCL pown(a,b) = C99 pow(a,b), b integer
+#  define powr(a,b) pow(a,b)
+#  define pown(a,b) pow(a,b)
 #endif // !USE_OPENCL

sasmodels/kernel_iq.c

-                      r74e9b5f
+                      r70530778
     const int32_t pd_start,     // where we are in the dispersity loop
     const int32_t pd_stop,      // where we are stopping in the dispersity loop
     pglobal const ProblemDetails *details,
     pglobal const double *values,
     pglobal const double *q, // nq q values, with padding to boundary
     pglobal double *result,  // nq+1 return values, again with padding
+    global const ProblemDetails *details,
+    global const double *values,
+    global const double *q, // nq q values, with padding to boundary
+    global double *result,  // nq+1 return values, again with padding
     const double cutoff     // cutoff in the dispersity weight product
+    )
+{
 #if defined(USE_GPU)
+#ifdef USE_OPENCL
   // who we are and what element we are working with
-  #if defined(USE_OPENCL)
   const int q_index = get_global_id(0);
-  #else // USE_CUDA
-  const int q_index = threadIdx.x + blockIdx.x * blockDim.x;
-  #endif
   if (q_index >= nq) return;
 #else
 …
   // seeing one q value (stored in the variable "this_result") while the dll
   // version must loop over all q.
   #if defined(USE_GPU)
+  #ifdef USE_OPENCL
     double pd_norm = (pd_start == 0 ? 0.0 : result[nq]);
     double this_result = (pd_start == 0 ? 0.0 : result[q_index]);
   #else // !USE_GPU
+  #else // !USE_OPENCL
     double pd_norm = (pd_start == 0 ? 0.0 : result[nq]);
     if (pd_start == 0) {
 …
+    }
     //if (q_index==0) printf("start %d %g %g\n", pd_start, pd_norm, result[0]);
 #endif // !USE_GPU
+#endif // !USE_OPENCL
 …
   const int n4 = pd_length[4];
   const int p4 = pd_par[4];
   pglobal const double *v4 = pd_value + pd_offset[4];
   pglobal const double *w4 = pd_weight + pd_offset[4];
+  global const double *v4 = pd_value + pd_offset[4];
+  global const double *w4 = pd_weight + pd_offset[4];
   int i4 = (pd_start/pd_stride[4])%n4;  // position in level 4 at pd_start
 …
   const int n##_LOOP = details->pd_length[_LOOP]; \
   const int p##_LOOP = details->pd_par[_LOOP]; \
   pglobal const double *v##_LOOP = pd_value + details->pd_offset[_LOOP]; \
   pglobal const double *w##_LOOP = pd_weight + details->pd_offset[_LOOP]; \
+  global const double *v##_LOOP = pd_value + details->pd_offset[_LOOP]; \
+  global const double *w##_LOOP = pd_weight + details->pd_offset[_LOOP]; \
   int i##_LOOP = (pd_start/details->pd_stride[_LOOP])%n##_LOOP;
 …
 // Pointers to the start of the dispersity and weight vectors, if needed.
 #if MAX_PD>0
   pglobal const double *pd_value = values + NUM_VALUES;
   pglobal const double *pd_weight = pd_value + details->num_weights;
+  global const double *pd_value = values + NUM_VALUES;
+  global const double *pd_weight = pd_value + details->num_weights;
 #endif
 …
       BUILD_ROTATION();
 #if !defined(USE_GPU)
+#ifndef USE_OPENCL
       // DLL needs to explicitly loop over the q values.
       #ifdef USE_OPENMP
 …
       #endif
       for (q_index=0; q_index<nq; q_index++)
 #endif // !USE_GPU
+#endif // !USE_OPENCL
+      {
 …
 //printf("q_index:%d %g %g %g %g\n", q_index, scattering, weight0);
         #if defined(USE_GPU)
+        #ifdef USE_OPENCL
           this_result += weight * scattering;
         #else // !USE_GPU
+        #else // !USE_OPENCL
           result[q_index] += weight * scattering;
         #endif // !USE_GPU
+        #endif // !USE_OPENCL
+      }
+    }
 …
 // Remember the current result and the updated norm.
 #if defined(USE_GPU)
+#ifdef USE_OPENCL
   result[q_index] = this_result;
   if (q_index == 0) result[nq] = pd_norm;
 //if (q_index == 0) printf("res: %g/%g\n", result[0], pd_norm);
 #else // !USE_GPU
+#else // !USE_OPENCL
   result[nq] = pd_norm;
 //printf("res: %g/%g\n", result[0], pd_norm);
 #endif // !USE_GPU
+#endif // !USE_OPENCL
 // ** clear the macros in preparation for the next kernel **

sasmodels/kernelcl.py

                       r95f62aa
 """
 GPU driver for C kernels
-TODO: docs are out of date
 There should be a single GPU environment running on the system.  This
 …
 # Attempt to setup opencl. This may fail if the pyopencl package is not
+# Attempt to setup opencl. This may fail if the opencl package is not
 # installed or if it is installed but there are no devices available.
 try:
 …
 def use_opencl():
+    sas_opencl = os.environ.get("SAS_OPENCL", "OpenCL").lower()
+    return HAVE_OPENCL and sas_opencl != "none" and not sas_opencl.startswith("cuda")
+    return HAVE_OPENCL and os.environ.get("SAS_OPENCL", "").lower() != "none"
 ENV = None
 …
         cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
         queue.device)
+def _stretch_input(vector, dtype, extra=1e-3, boundary=32):
+    # type: (np.ndarray, np.dtype, float, int) -> np.ndarray
+    """
+    Stretch an input vector to the correct boundary.
+    Performance on the kernels can drop by a factor of two or more if the
+    number of values to compute does not fall on a nice power of two
+    boundary.   The trailing additional vector elements are given a
+    value of *extra*, and so f(*extra*) will be computed for each of
+    them.  The returned array will thus be a subset of the computed array.
+    *boundary* should be a power of 2 which is at least 32 for good
+    performance on current platforms (as of Jan 2015).  It should
+    probably be the max of get_warp(kernel,queue) and
+    device.min_data_type_align_size//4.
+    """
+    remainder = vector.size % boundary
+    if remainder != 0:
+        size = vector.size + (boundary - remainder)
+        vector = np.hstack((vector, [extra] * (size - vector.size)))
+    return np.ascontiguousarray(vector, dtype=dtype)
 def compile_model(context, source, dtype, fast=False):
 …
     Uses SAS_OPENCL or PYOPENCL_CTX if they are set in the environment,
     otherwise scans for the most appropriate device using
+    :func:`_get_default_context`.  Ignore *SAS_OPENCL=OpenCL*, which
+    indicates that an OpenCL device should be used without specifying
+    which one (and not a CUDA device, or no GPU).
+    """
+    # Assume we do not get here if SAS_OPENCL is None or CUDA
+    sas_opencl = os.environ.get('SAS_OPENCL', 'opencl')
+    if sas_opencl.lower() != 'opencl':
+        # Setting PYOPENCL_CTX as a SAS_OPENCL to create cl context
+        os.environ["PYOPENCL_CTX"] = sas_opencl
+    :func:`_get_default_context`
+    """
+    if 'SAS_OPENCL' in os.environ:
+        #Setting PYOPENCL_CTX as a SAS_OPENCL to create cl context
+        os.environ["PYOPENCL_CTX"] = os.environ["SAS_OPENCL"]
     if 'PYOPENCL_CTX' in os.environ:
 …
                 current_time = time.clock()
                 if current_time - last_nap > 0.5:
                     time.sleep(0.001)
+                    time.sleep(0.05)
                     last_nap = current_time
         cl.enqueue_copy(queue, self.result, result_b, wait_for=wait_for)

sasmodels/model_test.py

-                      r74e9b5f
+                      r012cd34
 Usage::
     python -m sasmodels.model_test [opencl|cuda|dll] model1 model2 ...
+    python -m sasmodels.model_test [opencl|dll|opencl_and_dll] model1 model2 ...
     if model1 is 'all', then all except the remaining models will be tested
 …
 from .modelinfo import expand_pars
 from .kernelcl import use_opencl
-from .kernelcuda import use_cuda
 # pylint: disable=unused-import
 …
     Construct the pyunit test suite.
+    *loaders* is the list of kernel drivers to use (dll, opencl or cuda).
+    For python models, the python driver is always used.
+    *loaders* is the list of kernel drivers to use, which is one of
+    *["dll", "opencl"]*, *["dll"]* or *["opencl"]*.  For python models,
+    the python driver is always used.
     *models* is the list of models to test, or *["all"]* to test all models.
 …
             # test using dll if desired
             if 'dll' in loaders:
+            if 'dll' in loaders or not use_opencl():
                 test_name = "%s-dll"%model_name
                 test_method_name = "test_%s_dll" % model_info.id
 …
                                      test_method_name,
                                      platform="ocl", dtype=None,
-                                     stash=stash)
-                #print("defining", test_name)
-                suite.addTest(test)
-            # test using cuda if desired and available
-            if 'cuda' in loaders and use_cuda():
-                test_name = "%s-cuda"%model_name
-                test_method_name = "test_%s_cuda" % model_info.id
-                # Using dtype=None so that the models that are only
-                # correct for double precision are not tested using
-                # single precision.  The choice is determined by the
-                # presence of *single=False* in the model file.
-                test = ModelTestCase(test_name, model_info,
-                                     test_method_name,
-                                     platform="cuda", dtype=None,
                                      stash=stash)
                 #print("defining", test_name)
 …
                 # Check for missing tests.  Only do so for the "dll" tests
                 # to reduce noise from both opencl and cuda, and because
+                # to reduce noise from both opencl and dll, and because
                 # python kernels use platform="dll".
                 if self.platform == "dll":
 …
     # Build a test suite containing just the model
     loader = 'opencl' if use_opencl() else 'cuda' if use_cuda() else 'dll'
+    loaders = ['opencl'] if use_opencl() else ['dll']
     models = [model]
     try:
         suite = make_suite([loader], models)
+        suite = make_suite(loaders, models)
     except Exception:
         import traceback
 …
         loaders = ['opencl']
         models = models[1:]
-    elif models and models[0] == 'cuda':
-        if not use_cuda():
-            print("cuda is not available")
-            return 1
-        loaders = ['cuda']
-        models = models[1:]
     elif models and models[0] == 'dll':
         # TODO: test if compiler is available?
         loaders = ['dll']
         models = models[1:]
+    elif models and models[0] == 'opencl_and_dll':
+        loaders = ['opencl', 'dll'] if use_opencl() else ['dll']
+        models = models[1:]
     else:
+        loaders = ['dll']
+        if use_opencl():
+            loaders.append('opencl')
+        if use_cuda():
+            loaders.append('cuda')
+        loaders = ['opencl', 'dll'] if use_opencl() else ['dll']
     if not models:
         print("""\
 usage:
   python -m sasmodels.model_test [-v] [opencl|cuda|dll] model1 model2 ...
+  python -m sasmodels.model_test [-v] [opencl|dll] model1 model2 ...
 If -v is included on the command line, then use verbose output.
 If no platform is specified, then models will be tested with dll, and
 if available, OpenCL and CUDA; the compute target is ignored for pure python models.
+If neither opencl nor dll is specified, then models will be tested with
+both OpenCL and dll; the compute target is ignored for pure python models.
 If model1 is 'all', then all except the remaining models will be tested.
 …
     Run "nosetests sasmodels" on the command line to invoke it.
     """
+    loaders = ['dll']
+    if use_opencl():
+        loaders.append('opencl')
+    if use_cuda():
+        loaders.append('cuda')
+    loaders = ['opencl', 'dll'] if use_opencl() else ['dll']
     tests = make_suite(loaders, ['all'])
     def build_test(test):

sasmodels/models/lib/gauss76.c

-                      r74e9b5f
+                      r99b84ec
 // Gaussians
 constant double Gauss76Wt[76] = {
+constant double Gauss76Wt[76]={
         .00126779163408536,             //0
         .00294910295364247,
 …
 };
 constant double Gauss76Z[76] = {
+constant double Gauss76Z[76]={
         -.999505948362153,              //0
         -.997397786355355,

sasmodels/models/lib/polevl.c

-                      r74e9b5f
+                      r447e9aa
 */
+static
 double polevl( double x, pconstant double *coef, int N )
+double polevl( double x, constant double *coef, int N );
+double polevl( double x, constant double *coef, int N )
+{
 …
  */
+static
 double p1evl( double x, pconstant double *coef, int N )
+double p1evl( double x, constant double *coef, int N );
+double p1evl( double x, constant double *coef, int N )
+{
     int i=0;

sasmodels/models/lib/sas_J1.c

-                      r74e9b5f
+                      r5181ccc
 #if FLOAT_SIZE>4
 //Cephes double precision function
+double cephes_j1(double x);
 constant double RPJ1[8] = {
 …
 .0 };
-static
 double cephes_j1(double x)
+{
 …
 #else
 //Single precision version of cephes
+float cephes_j1f(float x);
 constant float JPJ1[8] = {
     -4.878788132172128E-009,
 …
     };
-static
 float cephes_j1f(float xx)
+{
 …
 //Finally J1c function that equals 2*J1(x)/x
+static
+double sas_2J1x_x(double x);
 double sas_2J1x_x(double x)
+{

SasView

Changes in / [508475a:2a12d8d8] in sasmodels

Legend:

doc/guide/gpu_setup.rst

sasmodels/compare.py

sasmodels/core.py

sasmodels/kernel_header.c

sasmodels/kernel_iq.c

sasmodels/kernelcl.py

sasmodels/model_test.py

sasmodels/models/lib/gauss76.c

sasmodels/models/lib/polevl.c

sasmodels/models/lib/sas_J1.c

Download in other formats: