Changes in / [31fc4ad:07646b6] in sasmodels


Files: 1 added, 13 edited

  • doc/guide/gpu_setup.rst

    r63602b1 r8b31efa  
    9494Device Selection 
    9595================ 
     96**OpenCL drivers** 
     97 
    9698If you have multiple GPU devices you can tell the program which device to use. 
    9799By default, the program looks for one GPU and one CPU device from available 
     
    104106was used to run the model. 
    105107 
    106 **If you don't want to use OpenCL, you can set** *SAS_OPENCL=None* 
    107 **in your environment settings, and it will only use normal programs.** 
    108  
    109 If you want to use one of the other devices, you can run the following 
     108If you want to use a specific driver and devices, you can run the following 
    110109from the python console:: 
    111110 
     
    115114This will provide a menu of different OpenCL drivers available. 
    116115When one is selected, it will say "set PYOPENCL_CTX=..." 
    117 Use that value as the value of *SAS_OPENCL*. 
     116Use that value for *SAS_OPENCL* (it has the form *driver:device*). 
     117 
     118To use the default OpenCL device (rather than CUDA or None), 
     119set *SAS_OPENCL=opencl*. 
     120 
     121In batch queues you may need to set *XDG_CACHE_HOME* (Linux only; it 
     122defaults to *~/.cache*) to a different directory, depending on how the 
     123filesystem is configured.  You should also set *SAS_DLL_PATH* for CPU-only modules. 
     124 
     125    -DSAS_MODELPATH=path sets directory containing custom models 
     126    -DSAS_OPENCL=vendor:device|cuda:device|none sets the target GPU device 
     127    -DXDG_CACHE_HOME=~/.cache sets the pyopencl cache root (linux only) 
     128    -DSAS_COMPILER=tinycc|msvc|mingw|unix sets the DLL compiler 
     129    -DSAS_OPENMP=1 turns on OpenMP for the DLLs 
     130    -DSAS_DLL_PATH=path sets the path to the compiled modules 
     131 
     132 
     133**CUDA drivers** 
     134 
     135If OpenCL drivers are not available on your system, but NVidia CUDA 
     136drivers are available, then set *SAS_OPENCL=cuda* or 
     137*SAS_OPENCL=cuda:n* for a particular device number *n*.  If no device 
     138number is specified, then the CUDA driver looks for 
     139*CUDA_DEVICE=n* or a file ~/.cuda-device containing n for the device number. 
     140 
     141In batch queues, the SLURM command *sbatch --gres=gpu:1 ...* will set 
     142*CUDA_VISIBLE_DEVICES=n*, which ought to set the correct device 
     143number for *SAS_OPENCL=cuda*.  If not, then set 
     144*CUDA_DEVICE=$CUDA_VISIBLE_DEVICES* within the batch script.  You may 
     145need to set the CUDA cache directory to a folder accessible across the 
     146cluster with *PYCUDA_CACHE_DIR* (or *PYCUDA_DISABLE_CACHE* to disable 
     147caching), and you may need to set environment specific compiler flags 
     148with *PYCUDA_DEFAULT_NVCC_FLAGS*.  You should also set *SAS_DLL_PATH*  
     149for CPU-only modules. 
     150 
     151**No GPU support** 
     152 
     153If you don't want to use OpenCL or CUDA, you can set *SAS_OPENCL=None* 
     154in your environment settings; the models will then be compiled and run on the CPU only. 
     155 
     156In batch queues, you may need to set *SAS_DLL_PATH* to a directory 
     157accessible on the compute node. 
     158 
    118159 
    119160Device Testing 
     
    154195*Document History* 
    155196 
    156 | 2017-09-27 Paul Kienzle 
     197| 2018-10-15 Paul Kienzle 
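
    As a quick illustration of the device-selection workflow described above
    (a minimal sketch; the exact prompt text depends on the installed pyopencl,
    and the driver:device strings below are placeholders)::

        # List the available OpenCL drivers/devices and note the
        # "PYOPENCL_CTX=..." value that pyopencl prints for your choice.
        import pyopencl
        pyopencl.create_some_context(interactive=True)

        # Reuse that value for SAS_OPENCL in the environment that runs
        # sasmodels (set it before sasmodels is imported).
        import os
        os.environ["SAS_OPENCL"] = "0:1"       # hypothetical driver:device
        #os.environ["SAS_OPENCL"] = "cuda:0"   # a specific CUDA device
        #os.environ["SAS_OPENCL"] = "none"     # CPU-only (DLL) models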
  • doc/guide/magnetism/magnetism.rst

    rbefe905 rdf87acf  
    8989 
    9090===========   ================================================================ 
    91  M0:sld       $D_M M_0$ 
    92  mtheta:sld   $\theta_M$ 
    93  mphi:sld     $\phi_M$ 
    94  up:angle     $\theta_\mathrm{up}$ 
    95  up:frac_i    $u_i$ = (spin up)/(spin up + spin down) *before* the sample 
    96  up:frac_f    $u_f$ = (spin up)/(spin up + spin down) *after* the sample 
     91 sld_M0       $D_M M_0$ 
     92 sld_mtheta   $\theta_M$ 
     93 sld_mphi     $\phi_M$ 
     94 up_frac_i    $u_i$ = (spin up)/(spin up + spin down) *before* the sample 
     95 up_frac_f    $u_f$ = (spin up)/(spin up + spin down) *after* the sample 
     96 up_angle     $\theta_\mathrm{up}$ 
    9797===========   ================================================================ 
    9898 
    9999.. note:: 
    100     The values of the 'up:frac_i' and 'up:frac_f' must be in the range 0 to 1. 
     100    The values of the 'up_frac_i' and 'up_frac_f' must be in the range 0 to 1. 
    101101 
    102102*Document History* 
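
    The table above renames the magnetic parameters from colon-separated to
    underscore-separated names.  A minimal sketch of a parameter set using the
    new names, for a model whose scattering-length-density parameter is called
    *sld* (the numeric values are made up for illustration)::

        pars = {
            "sld_M0": 8.0,        # D_M M_0
            "sld_mtheta": 0.0,    # theta_M
            "sld_mphi": 0.0,      # phi_M
            "up_frac_i": 0.5,     # spin-up fraction before the sample, 0 to 1
            "up_frac_f": 0.5,     # spin-up fraction after the sample, 0 to 1
            "up_angle": 0.0,      # theta_up
        }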
  • sasmodels/compare.py

    r610ef23 r4de14584  
    4141from . import kerneldll 
    4242from . import kernelcl 
     43from . import kernelcuda 
    4344from .data import plot_theory, empty_data1D, empty_data2D, load_data 
    4445from .direct_model import DirectModel, get_mesh 
     
    115116    === environment variables === 
    116117    -DSAS_MODELPATH=path sets directory containing custom models 
    117     -DSAS_OPENCL=vendor:device|none sets the target OpenCL device 
     118    -DSAS_OPENCL=vendor:device|cuda:device|none sets the target GPU device 
    118119    -DXDG_CACHE_HOME=~/.cache sets the pyopencl cache root (linux only) 
    119120    -DSAS_COMPILER=tinycc|msvc|mingw|unix sets the DLL compiler 
     
    724725        set_integration_size(model_info, ngauss) 
    725726 
    726     if dtype != "default" and not dtype.endswith('!') and not kernelcl.use_opencl(): 
     727    if (dtype != "default" and not dtype.endswith('!')  
     728            and not (kernelcl.use_opencl() or kernelcuda.use_cuda())): 
    727729        raise RuntimeError("OpenCL not available " + kernelcl.OPENCL_ERROR) 
    728730 
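
    The updated check in compare.py treats CUDA as an acceptable GPU target
    alongside OpenCL.  A minimal sketch of the same test, usable on its own
    (the helper name is ours, not part of sasmodels)::

        from sasmodels import kernelcl, kernelcuda

        def gpu_available():
            # True if either the OpenCL or the CUDA driver can be used,
            # mirroring the fallback test added to compare.py above.
            return kernelcl.use_opencl() or kernelcuda.use_cuda()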
  • sasmodels/core.py

    ree60aa7 rb0de252  
    2121from . import mixture 
    2222from . import kernelpy 
     23from . import kernelcuda 
    2324from . import kernelcl 
    2425from . import kerneldll 
     
    209210        #print("building dll", numpy_dtype) 
    210211        return kerneldll.load_dll(source['dll'], model_info, numpy_dtype) 
     212    elif platform == "cuda": 
     213        return kernelcuda.GpuModel(source, model_info, numpy_dtype, fast=fast) 
    211214    else: 
    212215        #print("building ocl", numpy_dtype) 
     
    244247    # type: (ModelInfo, str, str) -> (np.dtype, bool, str) 
    245248    """ 
    246     Interpret dtype string, returning np.dtype and fast flag. 
     249    Interpret dtype string, returning np.dtype, fast flag and platform. 
    247250 
    248251    Possible types include 'half', 'single', 'double' and 'quad'.  If the 
     
    252255    default for the model and platform. 
    253256 
    254     Platform preference can be specfied ("ocl" vs "dll"), with the default 
    255     being OpenCL if it is availabe.  If the dtype name ends with '!' then 
    256     platform is forced to be DLL rather than OpenCL. 
     257    Platform preference can be specified ("ocl", "cuda", "dll"), with the 
     258    default being OpenCL or CUDA if available, otherwise DLL.  If the dtype 
     259    name ends with '!' then platform is forced to be DLL rather than GPU. 
     260    The default platform is set by the environment variable SAS_OPENCL, 
     261    SAS_OPENCL=driver:device for OpenCL, SAS_OPENCL=cuda:device for CUDA 
     262    or SAS_OPENCL=none for DLL. 
    257263 
    258264    This routine ignores the preferences within the model definition.  This 
     
    266272    if platform is None: 
    267273        platform = "ocl" 
    268     if not kernelcl.use_opencl() or not model_info.opencl: 
    269         platform = "dll" 
    270274 
    271275    # Check if type indicates dll regardless of which platform is given 
     
    273277        platform = "dll" 
    274278        dtype = dtype[:-1] 
     279 
     280    # Make sure model allows opencl/gpu 
     281    if not model_info.opencl: 
     282        platform = "dll" 
     283 
     284    # Make sure opencl is available, or fallback to cuda then to dll 
     285    if platform == "ocl" and not kernelcl.use_opencl(): 
     286        platform = "cuda" if kernelcuda.use_cuda() else "dll" 
    275287 
    276288    # Convert special type names "half", "fast", and "quad" 
     
    283295        dtype = "float16" 
    284296 
    285     # Convert dtype string to numpy dtype. 
     297    # Convert dtype string to numpy dtype.  Use single precision for GPU 
     298    # if model allows it, otherwise use double precision. 
    286299    if dtype is None or dtype == "default": 
    287         numpy_dtype = (generate.F32 if platform == "ocl" and model_info.single 
     300        numpy_dtype = (generate.F32 if model_info.single and platform in ("ocl", "cuda") 
    288301                       else generate.F64) 
    289302    else: 
    290303        numpy_dtype = np.dtype(dtype) 
    291304 
    292     # Make sure that the type is supported by opencl, otherwise use dll 
     305    # Make sure that the type is supported by GPU, otherwise use dll 
    293306    if platform == "ocl": 
    294307        env = kernelcl.environment() 
    295         if not env.has_type(numpy_dtype): 
    296             platform = "dll" 
    297             if dtype is None: 
    298                 numpy_dtype = generate.F64 
     308    elif platform == "cuda": 
     309        env = kernelcuda.environment() 
     310    else: 
     311        env = None 
     312    if env is not None and not env.has_type(numpy_dtype): 
     313        platform = "dll" 
     314        if dtype is None: 
     315            numpy_dtype = generate.F64 
    299316 
    300317    return numpy_dtype, fast, platform 
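
    A rough sketch of the platform fallback that parse_dtype() now applies
    (simplified; names follow the diff above, and the helper itself is not
    part of sasmodels)::

        from sasmodels import kernelcl, kernelcuda

        def choose_platform(dtype, model_allows_gpu, platform="ocl"):
            if dtype.endswith('!'):        # "single!" etc. force the DLL path
                return "dll"
            if not model_allows_gpu:       # model_info.opencl is False
                return "dll"
            if platform == "ocl" and not kernelcl.use_opencl():
                return "cuda" if kernelcuda.use_cuda() else "dll"
            return platform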
  • sasmodels/kernel_header.c

    r296c52b r74e9b5f  
    11#ifdef __OPENCL_VERSION__ 
    22# define USE_OPENCL 
     3#elif defined(__CUDACC__) 
     4# define USE_CUDA 
    35#elif defined(_OPENMP) 
    46# define USE_OPENMP 
    57#endif 
     8 
     9// Use SAS_DOUBLE to force the use of double even for float kernels 
     10#define SAS_DOUBLE dou ## ble 
    611 
    712// If opencl is not available, then we are compiling a C function 
    813// Note: if using a C++ compiler, then define kernel as extern "C" 
    914#ifdef USE_OPENCL 
     15 
     16   #define USE_GPU 
     17   #define pglobal global 
     18   #define pconstant constant 
     19 
    1020   typedef int int32_t; 
    11 #  if defined(USE_SINCOS) 
    12 #    define SINCOS(angle,svar,cvar) svar=sincos(angle,&cvar) 
    13 #  else 
    14 #    define SINCOS(angle,svar,cvar) do {const double _t_=angle; svar=sin(_t_);cvar=cos(_t_);} while (0) 
    15 #  endif 
     21 
     22   #if defined(USE_SINCOS) 
     23   #  define SINCOS(angle,svar,cvar) svar=sincos(angle,&cvar) 
     24   #else 
     25   #  define SINCOS(angle,svar,cvar) do {const double _t_=angle; svar=sin(_t_);cvar=cos(_t_);} while (0) 
     26   #endif 
    1627   // Intel CPU on Mac gives strange values for erf(); on the verified 
    1728   // platforms (intel, nvidia, amd), the cephes erf() is significantly 
     
    2435   #  define erfcf erfc 
    2536   #endif 
    26 #else // !USE_OPENCL 
    27 // Use SAS_DOUBLE to force the use of double even for float kernels 
    28 #  define SAS_DOUBLE dou ## ble 
    29 #  ifdef __cplusplus 
     37 
     38#elif defined(USE_CUDA) 
     39 
     40   #define USE_GPU 
     41   #define local __shared__ 
     42   #define pglobal 
     43   #define constant __constant__ 
     44   #define pconstant const 
     45   #define kernel extern "C" __global__ 
     46 
     47   // OpenCL powr(a,b) = C99 pow(a,b), b >= 0 
     48   // OpenCL pown(a,b) = C99 pow(a,b), b integer 
     49   #define powr(a,b) pow(a,b) 
     50   #define pown(a,b) pow(a,b) 
     51   //typedef int int32_t; 
     52   #if defined(USE_SINCOS) 
     53   #  define SINCOS(angle,svar,cvar) sincos(angle,&svar,&cvar) 
     54   #else 
     55   #  define SINCOS(angle,svar,cvar) do {const double _t_=angle; svar=sin(_t_);cvar=cos(_t_);} while (0) 
     56   #endif 
     57 
     58#else // !USE_OPENCL && !USE_CUDA 
     59 
     60   #define local 
     61   #define pglobal 
     62   #define constant const 
     63   #define pconstant const 
     64 
     65   #ifdef __cplusplus 
    3066      #include <cstdio> 
    3167      #include <cmath> 
     
    5187     #endif 
    5288     inline void SINCOS(double angle, double &svar, double &cvar) { svar=sin(angle); cvar=cos(angle); } 
    53 else // !__cplusplus 
     89   #else // !__cplusplus 
    5490     #include <inttypes.h>  // C99 guarantees that int32_t types is here 
    5591     #include <stdio.h> 
     
    77113     #define kernel 
    78114     #define SINCOS(angle,svar,cvar) do {const double _t_=angle; svar=sin(_t_);cvar=cos(_t_);} while (0) 
    79 #  endif  // !__cplusplus 
    80 #  define global 
    81 #  define local 
    82 #  define constant const 
    83 // OpenCL powr(a,b) = C99 pow(a,b), b >= 0 
    84 // OpenCL pown(a,b) = C99 pow(a,b), b integer 
    85 #  define powr(a,b) pow(a,b) 
    86 #  define pown(a,b) pow(a,b) 
     115   #endif  // !__cplusplus 
     116   // OpenCL powr(a,b) = C99 pow(a,b), b >= 0 
     117   // OpenCL pown(a,b) = C99 pow(a,b), b integer 
     118   #define powr(a,b) pow(a,b) 
     119   #define pown(a,b) pow(a,b) 
     120 
    87121#endif // !USE_OPENCL 
    88122 
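
    When reading the kernels it may help to see, in one place, how the new
    portable qualifiers expand for each target.  The summary below is assembled
    from the diff above (plain-C column shown; the C++ branch differs slightly)::

        # pglobal, pconstant, local and kernel per compile target.
        QUALIFIERS = {
            "opencl": ("global", "constant", "local", "kernel"),
            "cuda":   ("", "const", "__shared__", 'extern "C" __global__'),
            "dll":    ("", "const", "", ""),
        }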
  • sasmodels/kernel_iq.c

    re44432d re44432d  
    277277kernel 
    278278void KERNEL_NAME( 
    279     int32_t nq,                 // number of q values 
    280     const int32_t pd_start,     // where we are in the dispersity loop 
    281     const int32_t pd_stop,      // where we are stopping in the dispersity loop 
    282     global const ProblemDetails *details, 
    283     global const double *values, 
    284     global const double *q, // nq q values, with padding to boundary 
    285     global double *result,  // nq+1 return values, again with padding 
    286     const double cutoff,     // cutoff in the dispersity weight product 
     279    int32_t nq,                   // number of q values 
     280    const int32_t pd_start,       // where we are in the dispersity loop 
     281    const int32_t pd_stop,        // where we are stopping in the dispersity loop 
     282    pglobal const ProblemDetails *details, 
     283    pglobal const double *values, // parameter values and distributions 
     284    pglobal const double *q,      // nq q values, with padding to boundary 
     285    pglobal double *result,       // nq+1 return values, again with padding 
     286    const double cutoff,          // cutoff in the dispersity weight product 
    287287    int32_t effective_radius_type // which effective radius to compute 
    288288    ) 
    289289{ 
    290 #ifdef USE_OPENCL 
     290#if defined(USE_GPU) 
    291291  // who we are and what element we are working with 
     292  #if defined(USE_OPENCL) 
    292293  const int q_index = get_global_id(0); 
     294  #else // USE_CUDA 
     295  const int q_index = threadIdx.x + blockIdx.x * blockDim.x; 
     296  #endif 
    293297  if (q_index >= nq) return; 
    294298#else 
     
    341345  // 
    342346  // The code differs slightly between opencl and dll since opencl is only 
    343   // seeing one q value (stored in the variable "this_result") while the dll 
     347  // seeing one q value (stored in the variable "this_F2") while the dll 
    344348  // version must loop over all q. 
    345   #ifdef USE_OPENCL 
     349  #if defined(CALL_FQ) 
     350    double weight_norm = (pd_start == 0 ? 0.0 : result[2*nq]); 
     351    double weighted_form = (pd_start == 0 ? 0.0 : result[2*nq+1]); 
     352    double weighted_shell = (pd_start == 0 ? 0.0 : result[2*nq+2]); 
     353    double weighted_radius = (pd_start == 0 ? 0.0 : result[2*nq+3]); 
     354  #else 
     355    double weight_norm = (pd_start == 0 ? 0.0 : result[nq]); 
     356    double weighted_form = (pd_start == 0 ? 0.0 : result[nq+1]); 
     357    double weighted_shell = (pd_start == 0 ? 0.0 : result[nq+2]); 
     358    double weighted_radius = (pd_start == 0 ? 0.0 : result[nq+3]); 
     359  #endif 
     360  #if defined(USE_GPU) 
    346361    #if defined(CALL_FQ) 
    347       double weight_norm = (pd_start == 0 ? 0.0 : result[2*nq]); 
    348       double weighted_form = (pd_start == 0 ? 0.0 : result[2*nq+1]); 
    349       double weighted_shell = (pd_start == 0 ? 0.0 : result[2*nq+2]); 
    350       double weighted_radius = (pd_start == 0 ? 0.0 : result[2*nq+3]); 
    351362      double this_F2 = (pd_start == 0 ? 0.0 : result[2*q_index+0]); 
    352363      double this_F1 = (pd_start == 0 ? 0.0 : result[2*q_index+1]); 
    353364    #else 
    354       double weight_norm = (pd_start == 0 ? 0.0 : result[nq]); 
    355       double weighted_form = (pd_start == 0 ? 0.0 : result[nq+1]); 
    356       double weighted_shell = (pd_start == 0 ? 0.0 : result[nq+2]); 
    357       double weighted_radius = (pd_start == 0 ? 0.0 : result[nq+3]); 
    358       double this_result = (pd_start == 0 ? 0.0 : result[q_index]); 
     365      double this_F2 = (pd_start == 0 ? 0.0 : result[q_index]); 
    359366    #endif 
    360   #else // !USE_OPENCL 
    361     #if defined(CALL_FQ) 
    362       double weight_norm = (pd_start == 0 ? 0.0 : result[2*nq]); 
    363       double weighted_form = (pd_start == 0 ? 0.0 : result[2*nq+1]); 
    364       double weighted_shell = (pd_start == 0 ? 0.0 : result[2*nq+2]); 
    365       double weighted_radius = (pd_start == 0 ? 0.0 : result[2*nq+3]); 
    366     #else 
    367       double weight_norm = (pd_start == 0 ? 0.0 : result[nq]); 
    368       double weighted_form = (pd_start == 0 ? 0.0 : result[nq+1]); 
    369       double weighted_shell = (pd_start == 0 ? 0.0 : result[nq+2]); 
    370       double weighted_radius = (pd_start == 0 ? 0.0 : result[nq+3]); 
    371     #endif 
     367  #else // !USE_GPU 
    372368    if (pd_start == 0) { 
    373369      #ifdef USE_OPENMP 
     
    381377      #endif 
    382378    } 
    383     //if (q_index==0) printf("start %d %g %g\n", pd_start, weighted_shell, result[0]); 
    384 #endif // !USE_OPENCL 
     379    //if (q_index==0) printf("start %d %g %g\n", pd_start, pd_norm, result[0]); 
     380#endif // !USE_GPU 
    385381 
    386382 
     
    405401  const int n4 = pd_length[4]; 
    406402  const int p4 = pd_par[4]; 
    407   global const double *v4 = pd_value + pd_offset[4]; 
    408   global const double *w4 = pd_weight + pd_offset[4]; 
     403  pglobal const double *v4 = pd_value + pd_offset[4]; 
     404  pglobal const double *w4 = pd_weight + pd_offset[4]; 
    409405  int i4 = (pd_start/pd_stride[4])%n4;  // position in level 4 at pd_start 
    410406 
     
    441437          FETCH_Q         // set qx,qy from the q input vector 
    442438          APPLY_ROTATION  // convert qx,qy to qa,qb,qc 
    443           CALL_KERNEL     // scattering = Iqxy(qa, qb, qc, p1, p2, ...) 
     439          CALL_KERNEL     // F2 = Iqxy(qa, qb, qc, p1, p2, ...) 
    444440 
    445441      ++step;  // increment counter representing position in dispersity mesh 
     
    613609  const int n##_LOOP = details->pd_length[_LOOP]; \ 
    614610  const int p##_LOOP = details->pd_par[_LOOP]; \ 
    615   global const double *v##_LOOP = pd_value + details->pd_offset[_LOOP]; \ 
    616   global const double *w##_LOOP = pd_weight + details->pd_offset[_LOOP]; \ 
     611  pglobal const double *v##_LOOP = pd_value + details->pd_offset[_LOOP]; \ 
     612  pglobal const double *w##_LOOP = pd_weight + details->pd_offset[_LOOP]; \ 
    617613  int i##_LOOP = (pd_start/details->pd_stride[_LOOP])%n##_LOOP; 
    618614 
     
    638634// Pointers to the start of the dispersity and weight vectors, if needed. 
    639635#if MAX_PD>0 
    640   global const double *pd_value = values + NUM_VALUES; 
    641   global const double *pd_weight = pd_value + details->num_weights; 
     636  pglobal const double *pd_value = values + NUM_VALUES; 
     637  pglobal const double *pd_weight = pd_value + details->num_weights; 
    642638#endif 
    643639 
     
    706702      BUILD_ROTATION(); 
    707703 
    708 #ifndef USE_OPENCL 
     704#if !defined(USE_GPU) 
    709705      // DLL needs to explicitly loop over the q values. 
    710706      #ifdef USE_OPENMP 
     
    712708      #endif 
    713709      for (q_index=0; q_index<nq; q_index++) 
    714 #endif // !USE_OPENCL 
     710#endif // !USE_GPU 
    715711      { 
    716712 
     
    721717        #if defined(MAGNETIC) && NUM_MAGNETIC > 0 
    722718          // Compute the scattering from the magnetic cross sections. 
    723           double scattering = 0.0; 
     719          double F2 = 0.0; 
    724720          const double qsq = qx*qx + qy*qy; 
    725721          if (qsq > 1.e-16) { 
     
    746742//  q_index, qx, qy, xs, sk, local_values.vector[sld_index], px, py, mx, my, mz); 
    747743                } 
    748                 scattering += xs_weight * CALL_KERNEL(); 
     744                F2 += xs_weight * CALL_KERNEL(); 
    749745              } 
    750746            } 
     
    754750            CALL_KERNEL(); // sets F1 and F2 by reference 
    755751          #else 
    756             const double scattering = CALL_KERNEL(); 
     752            const double F2 = CALL_KERNEL(); 
    757753          #endif 
    758754        #endif // !MAGNETIC 
    759 //printf("q_index:%d %g %g %g %g\n", q_index, scattering, weight0); 
    760  
    761         #ifdef USE_OPENCL 
     755//printf("q_index:%d %g %g %g %g\n", q_index, F2, weight0); 
     756 
     757        #if defined(USE_GPU) 
    762758          #if defined(CALL_FQ) 
    763759            this_F2 += weight * F2; 
    764760            this_F1 += weight * F1; 
    765761          #else 
    766             this_result += weight * scattering; 
     762            this_F2 += weight * F2; 
    767763          #endif 
    768764        #else // !USE_OPENCL 
     
    771767            result[2*q_index+1] += weight * F1; 
    772768          #else 
    773             result[q_index] += weight * scattering; 
     769            result[q_index] += weight * F2; 
    774770          #endif 
    775771        #endif // !USE_OPENCL 
     
    795791#endif 
    796792 
    797 // Remember the current result and the updated norm. 
    798 #ifdef USE_OPENCL 
     793// Remember the results and the updated norm. 
     794#if defined(USE_GPU) 
    799795  #if defined(CALL_FQ) 
    800     result[2*q_index+0] = this_F2; 
    801     result[2*q_index+1] = this_F1; 
    802     if (q_index == 0) { 
    803       result[2*nq+0] = weight_norm; 
    804       result[2*nq+1] = weighted_form; 
    805       result[2*nq+3] = weighted_shell; 
    806       result[2*nq+3] = weighted_radius; 
    807     } 
     796  result[2*q_index+0] = this_F2; 
     797  result[2*q_index+1] = this_F1; 
    808798  #else 
    809     result[q_index] = this_result; 
    810     if (q_index == 0) { 
    811       result[nq+0] = weight_norm; 
    812       result[nq+1] = weighted_form; 
    813       result[nq+2] = weighted_shell; 
    814       result[nq+3] = weighted_radius; 
    815     } 
     799  result[q_index] = this_F2; 
    816800  #endif 
    817  
    818 //if (q_index == 0) printf("res: %g/%g\n", result[0], weighted_shell); 
    819 #else // !USE_OPENCL 
    820   #if defined(CALL_FQ) 
     801  if (q_index == 0) 
     802#endif 
     803  { 
     804#if defined(CALL_FQ) 
    821805    result[2*nq] = weight_norm; 
    822806    result[2*nq+1] = weighted_form; 
    823807    result[2*nq+2] = weighted_shell; 
    824808    result[2*nq+3] = weighted_radius; 
    825   #else 
     809#else 
    826810    result[nq] = weight_norm; 
    827811    result[nq+1] = weighted_form; 
    828812    result[nq+2] = weighted_shell; 
    829813    result[nq+3] = weighted_radius; 
    830   #endif 
    831 //printf("res: %g/%g\n", result[0], weighted_shell); 
    832 #endif // !USE_OPENCL 
     814#endif 
     815  } 
    833816 
    834817// ** clear the macros in preparation for the next kernel ** 
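
    The result vector written by the kernel carries the accumulated weights
    after the per-q values.  A sketch of how a caller might unpack it (layout
    inferred from the diff above; the function is illustrative, not part of
    sasmodels)::

        def unpack_result(result, nq, have_fq):
            if have_fq:                    # CALL_FQ kernels store F^2 and F for each q
                F2, F1 = result[0:2*nq:2], result[1:2*nq:2]
                extras = result[2*nq:2*nq+4]
                values = (F1, F2)
            else:                          # plain I(q) accumulator
                values = (result[:nq],)
                extras = result[nq:nq+4]
            weight_norm, weighted_form, weighted_shell, weighted_radius = extras
            return values, weight_norm, weighted_form, weighted_shell, weighted_radius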
  • sasmodels/kernelcl.py

    re44432d r95f62aa  
    11""" 
    22GPU driver for C kernels 
     3 
     4TODO: docs are out of date 
    35 
    46There should be a single GPU environment running on the system.  This 
     
    5961 
    6062 
    61 # Attempt to setup opencl. This may fail if the opencl package is not 
     63# Attempt to setup opencl. This may fail if the pyopencl package is not 
    6264# installed or if it is installed but there are no devices available. 
    6365try: 
     
    7476 
    7577from . import generate 
     78from .generate import F32, F64 
    7679from .kernel import KernelModel, Kernel 
    7780 
     
    131134 
    132135def use_opencl(): 
    133     return HAVE_OPENCL and os.environ.get("SAS_OPENCL", "").lower() != "none" 
     136    sas_opencl = os.environ.get("SAS_OPENCL", "OpenCL").lower() 
     137    return HAVE_OPENCL and sas_opencl != "none" and not sas_opencl.startswith("cuda") 
    134138 
    135139ENV = None 
     
    162166    Return true if device supports the requested precision. 
    163167    """ 
    164     if dtype == generate.F32: 
     168    if dtype == F32: 
    165169        return True 
    166170    elif dtype == generate.F64: 
     
    179183        cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE, 
    180184        queue.device) 
    181  
    182 def _stretch_input(vector, dtype, extra=1e-3, boundary=32): 
    183     # type: (np.ndarray, np.dtype, float, int) -> np.ndarray 
    184     """ 
    185     Stretch an input vector to the correct boundary. 
    186  
    187     Performance on the kernels can drop by a factor of two or more if the 
    188     number of values to compute does not fall on a nice power of two 
    189     boundary.   The trailing additional vector elements are given a 
    190     value of *extra*, and so f(*extra*) will be computed for each of 
    191     them.  The returned array will thus be a subset of the computed array. 
    192  
    193     *boundary* should be a power of 2 which is at least 32 for good 
    194     performance on current platforms (as of Jan 2015).  It should 
    195     probably be the max of get_warp(kernel,queue) and 
    196     device.min_data_type_align_size//4. 
    197     """ 
    198     remainder = vector.size % boundary 
    199     if remainder != 0: 
    200         size = vector.size + (boundary - remainder) 
    201         vector = np.hstack((vector, [extra] * (size - vector.size))) 
    202     return np.ascontiguousarray(vector, dtype=dtype) 
    203  
    204185 
    205186def compile_model(context, source, dtype, fast=False): 
     
    239220    """ 
    240221    GPU context, with possibly many devices, and one queue per device. 
     222 
     223    Because the environment can be reset during a live program (e.g., if the 
     224    user changes the active GPU device in the GUI), everything associated 
     225    with the device context must be cached in the environment and recreated 
     226    if the environment changes.  The *cache* attribute is a simple dictionary 
     227    which holds keys and references to objects, such as compiled kernels and 
     228    allocated buffers.  The running program should check in the cache for 
     229    long lived objects and create them if they are not there.  The program 
     230    should not hold onto cached objects, but instead only keep them active 
     231    for the duration of a function call.  When the environment is destroyed 
     232    then the *release* method for each active cache item is called before 
     233    the environment is freed.  This means that each cl buffer should be 
     234    in its own cache entry. 
    241235    """ 
    242236    def __init__(self): 
    243237        # type: () -> None 
    244238        # find gpu context 
    245         #self.context = cl.create_some_context() 
    246  
    247         self.context = None 
    248         if 'SAS_OPENCL' in os.environ: 
    249             #Setting PYOPENCL_CTX as a SAS_OPENCL to create cl context 
    250             os.environ["PYOPENCL_CTX"] = os.environ["SAS_OPENCL"] 
    251         if 'PYOPENCL_CTX' in os.environ: 
    252             self._create_some_context() 
    253  
    254         if not self.context: 
    255             self.context = _get_default_context() 
     239        context_list = _create_some_context() 
     240 
     241        # Find a context for F32 and for F64 (maybe the same one). 
     242        # F16 isn't good enough. 
     243        self.context = {} 
     244        for dtype in (F32, F64): 
     245            for context in context_list: 
     246                if has_type(context.devices[0], dtype): 
     247                    self.context[dtype] = context 
     248                    break 
     249            else: 
     250                self.context[dtype] = None 
     251 
     252        # Build a queue for each context 
     253        self.queue = {} 
     254        context = self.context[F32] 
     255        self.queue[F32] = cl.CommandQueue(context, context.devices[0]) 
     256        if self.context[F64] == self.context[F32]: 
     257            self.queue[F64] = self.queue[F32] 
     258        else: 
     259            context = self.context[F64] 
     260            self.queue[F64] = cl.CommandQueue(context, context.devices[0]) 
    256261 
    257262        # Byte boundary for data alignment 
    258         #self.data_boundary = max(d.min_data_type_align_size 
    259         #                         for d in self.context.devices) 
    260         self.queues = [cl.CommandQueue(context, context.devices[0]) 
    261                        for context in self.context] 
     263        #self.data_boundary = max(context.devices[0].min_data_type_align_size 
     264        #                         for context in self.context.values()) 
     265 
     266        # Cache for compiled programs, and for items in context 
    262267        self.compiled = {} 
     268        self.cache = {} 
    263269 
    264270    def has_type(self, dtype): 
     
    267273        Return True if all devices support a given type. 
    268274        """ 
    269         return any(has_type(d, dtype) 
    270                    for context in self.context 
    271                    for d in context.devices) 
    272  
    273     def get_queue(self, dtype): 
    274         # type: (np.dtype) -> cl.CommandQueue 
    275         """ 
    276         Return a command queue for the kernels of type dtype. 
    277         """ 
    278         for context, queue in zip(self.context, self.queues): 
    279             if all(has_type(d, dtype) for d in context.devices): 
    280                 return queue 
    281  
    282     def get_context(self, dtype): 
    283         # type: (np.dtype) -> cl.Context 
    284         """ 
    285         Return a OpenCL context for the kernels of type dtype. 
    286         """ 
    287         for context in self.context: 
    288             if all(has_type(d, dtype) for d in context.devices): 
    289                 return context 
    290  
    291     def _create_some_context(self): 
    292         # type: () -> cl.Context 
    293         """ 
    294         Protected call to cl.create_some_context without interactivity.  Use 
    295         this if SAS_OPENCL is set in the environment.  Sets the *context* 
    296         attribute. 
    297         """ 
    298         try: 
    299             self.context = [cl.create_some_context(interactive=False)] 
    300         except Exception as exc: 
    301             warnings.warn(str(exc)) 
    302             warnings.warn("pyopencl.create_some_context() failed") 
    303             warnings.warn("the environment variable 'SAS_OPENCL' might not be set correctly") 
     275        return self.context.get(dtype, None) is not None 
    304276 
    305277    def compile_program(self, name, source, dtype, fast, timestamp): 
     
    318290            del self.compiled[key] 
    319291        if key not in self.compiled: 
    320             context = self.get_context(dtype) 
     292            context = self.context[dtype] 
    321293            logging.info("building %s for OpenCL %s", key, 
    322294                         context.devices[0].name.strip()) 
    323             program = compile_model(self.get_context(dtype), 
     295            program = compile_model(self.context[dtype], 
    324296                                    str(source), dtype, fast) 
    325297            self.compiled[key] = (program, timestamp) 
    326298        return program 
     299 
     300    def free_buffer(self, key): 
     301        if key in self.cache: 
     302            self.cache[key].release() 
     303            del self.cache[key] 
     304 
     305    def __del__(self): 
     306        for v in self.cache.values(): 
     307            release = getattr(v, 'release', lambda: None) 
     308            release() 
     309        self.cache = {} 
     310 
     311_CURRENT_ID = 0 
     312def unique_id(): 
     313    global _CURRENT_ID 
     314    _CURRENT_ID += 1 
     315    return _CURRENT_ID 
     316 
     317def _create_some_context(): 
     318    # type: () -> cl.Context 
     319    """ 
     320    Protected call to cl.create_some_context without interactivity. 
     321 
     322    Uses SAS_OPENCL or PYOPENCL_CTX if they are set in the environment, 
     323    otherwise scans for the most appropriate device using 
     324    :func:`_get_default_context`.  Ignore *SAS_OPENCL=OpenCL*, which 
     325    indicates that an OpenCL device should be used without specifying 
     326    which one (and not a CUDA device, or no GPU). 
     327    """ 
     328    # Assume we do not get here if SAS_OPENCL is None or CUDA 
     329    sas_opencl = os.environ.get('SAS_OPENCL', 'opencl') 
     330    if sas_opencl.lower() != 'opencl': 
     331        # Setting PYOPENCL_CTX as a SAS_OPENCL to create cl context 
     332        os.environ["PYOPENCL_CTX"] = sas_opencl 
     333 
     334    if 'PYOPENCL_CTX' in os.environ: 
     335        try: 
     336            return [cl.create_some_context(interactive=False)] 
     337        except Exception as exc: 
     338            warnings.warn(str(exc)) 
     339            warnings.warn("pyopencl.create_some_context() failed") 
     340            warnings.warn("the environment variable 'SAS_OPENCL' or 'PYOPENCL_CTX' might not be set correctly") 
     341 
     342    return _get_default_context() 
    327343 
    328344def _get_default_context(): 
     
    404420        self.dtype = dtype 
    405421        self.fast = fast 
    406         self.program = None # delay program creation 
    407         self._kernels = None 
     422        self.timestamp = generate.ocl_timestamp(self.info) 
     423        self._cache_key = unique_id() 
    408424 
    409425    def __getstate__(self): 
     
    414430        # type: (Tuple[ModelInfo, str, np.dtype, bool]) -> None 
    415431        self.info, self.source, self.dtype, self.fast = state 
    416         self.program = None 
    417432 
    418433    def make_kernel(self, q_vectors): 
    419434        # type: (List[np.ndarray]) -> "GpuKernel" 
    420         if self.program is None: 
    421             compile_program = environment().compile_program 
    422             timestamp = generate.ocl_timestamp(self.info) 
    423             self.program = compile_program( 
     435        return GpuKernel(self, q_vectors) 
     436 
     437    @property 
     438    def Iq(self): 
     439        return self._fetch_kernel('Iq') 
     440 
     441    def fetch_kernel(self, name): 
     442        # type: (str) -> cl.Kernel 
     443        """ 
     444        Fetch the kernel from the environment by name, compiling it if it 
     445        does not already exist. 
     446        """ 
     447        gpu = environment() 
     448        key = self._cache_key 
     449        if key not in gpu.cache: 
     450            program = gpu.compile_program( 
    424451                self.info.name, 
    425452                self.source['opencl'], 
    426453                self.dtype, 
    427454                self.fast, 
    428                 timestamp) 
     455                self.timestamp) 
    429456            variants = ['Iq', 'Iqxy', 'Imagnetic'] 
    430457            names = [generate.kernel_name(self.info, k) for k in variants] 
    431             kernels = [getattr(self.program, k) for k in names] 
    432             self._kernels = dict((k, v) for k, v in zip(variants, kernels)) 
    433         is_2d = len(q_vectors) == 2 
    434         if is_2d: 
    435             kernel = [self._kernels['Iqxy'], self._kernels['Imagnetic']] 
     458            kernels = [getattr(program, k) for k in names] 
     459            data = dict((k, v) for k, v in zip(variants, kernels)) 
     460            # keep a handle to program so GC doesn't collect 
     461            data['program'] = program 
     462            gpu.cache[key] = data 
    436463        else: 
    437             kernel = [self._kernels['Iq']]*2 
    438         return GpuKernel(kernel, self.dtype, self.info, q_vectors) 
    439  
    440     def release(self): 
    441         # type: () -> None 
    442         """ 
    443         Free the resources associated with the model. 
    444         """ 
    445         if self.program is not None: 
    446             self.program = None 
    447  
    448     def __del__(self): 
    449         # type: () -> None 
    450         self.release() 
     464            data = gpu.cache[key] 
     465        return data[name] 
    451466 
    452467# TODO: check that we don't need a destructor for buffers which go out of scope 
     
    473488        # type: (List[np.ndarray], np.dtype) -> None 
    474489        # TODO: do we ever need double precision q? 
    475         env = environment() 
    476490        self.nq = q_vectors[0].size 
    477491        self.dtype = np.dtype(dtype) 
     
    481495        # at this point, so instead using 32, which is good on the set of 
    482496        # architectures tested so far. 
    483         extra_q = 3  # total weight, weighted volume and weighted radius 
    484497        if self.is_2d: 
    485             width = ((self.nq+15+extra_q)//16)*16 
     498            width = ((self.nq+15)//16)*16 
    486499            self.q = np.empty((width, 2), dtype=dtype) 
    487500            self.q[:self.nq, 0] = q_vectors[0] 
    488501            self.q[:self.nq, 1] = q_vectors[1] 
    489502        else: 
    490             width = ((self.nq+31+extra_q)//32)*32 
     503            width = ((self.nq+31)//32)*32 
    491504            self.q = np.empty(width, dtype=dtype) 
    492505            self.q[:self.nq] = q_vectors[0] 
    493506        self.global_size = [self.q.shape[0]] 
    494         context = env.get_context(self.dtype) 
    495         #print("creating inputs of size", self.global_size) 
    496         self.q_b = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, 
    497                              hostbuf=self.q) 
     507        self._cache_key = unique_id() 
     508 
     509    @property 
     510    def q_b(self): 
     511        """Lazy creation of q buffer so it can survive context reset""" 
     512        env = environment() 
     513        key = self._cache_key 
     514        if key not in env.cache: 
     515            context = env.context[self.dtype] 
     516            #print("creating inputs of size", self.global_size) 
     517            buffer = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, 
     518                               hostbuf=self.q) 
     519            env.cache[key] = buffer 
     520        return env.cache[key] 
    498521 
    499522    def release(self): 
    500523        # type: () -> None 
    501524        """ 
    502         Free the memory. 
    503         """ 
    504         if self.q_b is not None: 
    505             self.q_b.release() 
    506             self.q_b = None 
     525        Free the buffer associated with the q value 
     526        """ 
     527        environment().free_buffer(id(self)) 
    507528 
    508529    def __del__(self): 
     
    514535    Callable SAS kernel. 
    515536 
    516     *kernel* is the GpuKernel object to call 
    517  
    518     *model_info* is the module information 
    519  
    520     *q_vectors* is the q vectors at which the kernel should be evaluated 
     537    *model* is the GpuModel object to call 
     538 
     539    The following attributes are defined: 
     540 
     541    *info* is the module information 
    521542 
    522543    *dtype* is the kernel precision 
     544 
     545    *dim* is '1d' or '2d' 
     546 
     547    *result* is a vector to contain the results of the call 
    523548 
    524549    The resulting call method takes the *pars*, a list of values for 
     
    530555    Call :meth:`release` when done with the kernel instance. 
    531556    """ 
    532     def __init__(self, kernel, dtype, model_info, q_vectors): 
     557    def __init__(self, model, q_vectors): 
    533558        # type: (cl.Kernel, np.dtype, ModelInfo, List[np.ndarray]) -> None 
    534         q_input = GpuInput(q_vectors, dtype) 
    535         self.kernel = kernel 
    536         self.info = model_info 
    537         self.dtype = dtype 
    538         self.dim = '2d' if q_input.is_2d else '1d' 
    539         # leave room for f1/f2 results in case we need to compute beta for 1d models 
     559        dtype = model.dtype 
     560        self.q_input = GpuInput(q_vectors, dtype) 
     561        self._model = model 
     562        self._as_dtype = (np.float32 if dtype == generate.F32 
     563                          else np.float64 if dtype == generate.F64 
     564                          else np.float16 if dtype == generate.F16 
     565                          else np.float32)  # will never get here, so use np.float32 
     566        self._cache_key = unique_id() 
     567 
     568        # attributes accessed from the outside 
     569        self.dim = '2d' if self.q_input.is_2d else '1d' 
     570        self.info = model.info 
     571        self.dtype = model.dtype 
     572 
     573        # holding place for the returned value 
    540574        nout = 2 if self.info.have_Fq and self.dim == '1d' else 1 
    541         # +4 for total weight, shell volume, effective radius, form volume 
    542         self.result = np.empty(q_input.nq*nout + 4, self.dtype) 
    543  
    544         # Inputs and outputs for each kernel call 
    545         # Note: res may be shorter than res_b if global_size != nq 
     575        extra_q = 4  # total weight, form volume, shell volume and R_eff 
     576        self.result = np.empty(self.q_input.nq*nout+extra_q, dtype) 
     577 
     578    @property 
     579    def _result_b(self): 
     580        """Lazy creation of result buffer so it can survive context reset""" 
    546581        env = environment() 
    547         self.queue = env.get_queue(dtype) 
    548  
    549         self.result_b = cl.Buffer(self.queue.context, mf.READ_WRITE, 
    550                                   q_input.global_size[0] * nout * dtype.itemsize) 
    551         self.q_input = q_input # allocated by GpuInput above 
    552  
    553         self._need_release = [self.result_b, self.q_input] 
    554         self.real = (np.float32 if dtype == generate.F32 
    555                      else np.float64 if dtype == generate.F64 
    556                      else np.float16 if dtype == generate.F16 
    557                      else np.float32)  # will never get here, so use np.float32 
     582        key = self._cache_key 
     583        if key not in env.cache: 
     584            context = env.context[self.dtype] 
     585            width = ((self.result.size+31)//32)*32 * self.dtype.itemsize 
     586            buffer = cl.Buffer(context, mf.READ_WRITE, width) 
     587            env.cache[key] = buffer 
     588        return env.cache[key] 
    558589 
    559590    def _call_kernel(self, call_details, values, cutoff, magnetic, effective_radius_type): 
    560591        # type: (CallDetails, np.ndarray, np.ndarray, float, bool) -> np.ndarray 
    561         context = self.queue.context 
    562         # Arrange data transfer to card 
     592        env = environment() 
     593        queue = env.queue[self._model.dtype] 
     594        context = queue.context 
     595 
     596        # Arrange data transfer to/from card 
     597        q_b = self.q_input.q_b 
     598        result_b = self._result_b 
    563599        details_b = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, 
    564600                              hostbuf=call_details.buffer) 
     
    566602                             hostbuf=values) 
    567603 
    568         kernel = self.kernel[1 if magnetic else 0] 
    569         args = [ 
     604        name = 'Iq' if self.dim == '1d' else 'Imagnetic' if magnetic else 'Iqxy' 
     605        kernel = self._model.fetch_kernel(name) 
     606        kernel_args = [ 
    570607            np.uint32(self.q_input.nq), None, None, 
    571             details_b, values_b, self.q_input.q_b, self.result_b, 
    572             self.real(cutoff), 
     608            details_b, values_b, q_b, result_b, 
     609            self._as_dtype(cutoff), 
    573610            np.uint32(effective_radius_type), 
    574611        ] 
     
    582619            stop = min(start + step, call_details.num_eval) 
    583620            #print("queuing",start,stop) 
    584             args[1:3] = [np.int32(start), np.int32(stop)] 
    585             wait_for = [kernel(self.queue, self.q_input.global_size, None, 
    586                                *args, wait_for=wait_for)] 
     621            kernel_args[1:3] = [np.int32(start), np.int32(stop)] 
     622            wait_for = [kernel(queue, self.q_input.global_size, None, 
     623                               *kernel_args, wait_for=wait_for)] 
    587624            if stop < call_details.num_eval: 
    588625                # Allow other processes to run 
     
    590627                current_time = time.clock() 
    591628                if current_time - last_nap > 0.5: 
    592                     time.sleep(0.05) 
     629                    time.sleep(0.001) 
    593630                    last_nap = current_time 
    594         cl.enqueue_copy(self.queue, self.result, self.result_b) 
     631        cl.enqueue_copy(queue, self.result, result_b, wait_for=wait_for) 
    595632        #print("result", self.result) 
    596633 
     
    605642        Release resources associated with the kernel. 
    606643        """ 
    607         for v in self._need_release: 
    608             v.release() 
    609         self._need_release = [] 
     644        environment().free_buffer(id(self)) 
     645        self.q_input.release() 
    610646 
    611647    def __del__(self): 
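
    The GpuModel/GpuInput/GpuKernel changes all follow the same pattern:
    device objects are kept in the environment cache under a unique key so
    they can be rebuilt if the OpenCL context is reset.  A minimal sketch of
    that pattern (not the actual classes)::

        _CURRENT_ID = 0
        def unique_id():
            global _CURRENT_ID
            _CURRENT_ID += 1
            return _CURRENT_ID

        class CachedBuffer:
            def __init__(self, env, make_buffer):
                self._key = unique_id()
                self._env = env           # has a .cache dict and .free_buffer()
                self._make = make_buffer  # callable returning e.g. a cl.Buffer
            @property
            def buffer(self):
                if self._key not in self._env.cache:
                    self._env.cache[self._key] = self._make()
                return self._env.cache[self._key]
            def release(self):
                self._env.free_buffer(self._key)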
  • sasmodels/model_test.py

    r012cd34 r74e9b5f  
    55Usage:: 
    66 
    7     python -m sasmodels.model_test [opencl|dll|opencl_and_dll] model1 model2 ... 
     7    python -m sasmodels.model_test [opencl|cuda|dll] model1 model2 ... 
    88 
    99    if model1 is 'all', then all except the remaining models will be tested 
     
    6363from .modelinfo import expand_pars 
    6464from .kernelcl import use_opencl 
     65from .kernelcuda import use_cuda 
    6566 
    6667# pylint: disable=unused-import 
     
    8081    Construct the pyunit test suite. 
    8182 
    82     *loaders* is the list of kernel drivers to use, which is one of 
    83     *["dll", "opencl"]*, *["dll"]* or *["opencl"]*.  For python models, 
    84     the python driver is always used. 
     83    *loaders* is the list of kernel drivers to use (dll, opencl or cuda). 
      83    For python models the python driver is always used. 
    8585 
    8686    *models* is the list of models to test, or *["all"]* to test all models. 
     
    135135 
    136136            # test using dll if desired 
    137             if 'dll' in loaders or not use_opencl(): 
     137            if 'dll' in loaders: 
    138138                test_name = "%s-dll"%model_name 
    139139                test_method_name = "test_%s_dll" % model_info.id 
     
    156156                                     test_method_name, 
    157157                                     platform="ocl", dtype=None, 
     158                                     stash=stash) 
     159                #print("defining", test_name) 
     160                suite.addTest(test) 
     161 
     162            # test using cuda if desired and available 
     163            if 'cuda' in loaders and use_cuda(): 
     164                test_name = "%s-cuda"%model_name 
     165                test_method_name = "test_%s_cuda" % model_info.id 
     166                # Using dtype=None so that the models that are only 
     167                # correct for double precision are not tested using 
     168                # single precision.  The choice is determined by the 
     169                # presence of *single=False* in the model file. 
     170                test = ModelTestCase(test_name, model_info, 
     171                                     test_method_name, 
     172                                     platform="cuda", dtype=None, 
    158173                                     stash=stash) 
    159174                #print("defining", test_name) 
     
    220235 
    221236                # Check for missing tests.  Only do so for the "dll" tests 
    222                 # to reduce noise from both opencl and dll, and because 
     237                # to reduce noise from both opencl and cuda, and because 
    223238                # python kernels use platform="dll". 
    224239                if self.platform == "dll": 
     
    368383 
    369384    # Build a test suite containing just the model 
    370     loaders = ['opencl'] if use_opencl() else ['dll'] 
     385    loader = 'opencl' if use_opencl() else 'cuda' if use_cuda() else 'dll' 
    371386    models = [model] 
    372387    try: 
    373         suite = make_suite(loaders, models) 
     388        suite = make_suite([loader], models) 
    374389    except Exception: 
    375390        import traceback 
     
    434449        loaders = ['opencl'] 
    435450        models = models[1:] 
     451    elif models and models[0] == 'cuda': 
     452        if not use_cuda(): 
     453            print("cuda is not available") 
     454            return 1 
     455        loaders = ['cuda'] 
     456        models = models[1:] 
    436457    elif models and models[0] == 'dll': 
    437458        # TODO: test if compiler is available? 
    438459        loaders = ['dll'] 
    439460        models = models[1:] 
    440     elif models and models[0] == 'opencl_and_dll': 
    441         loaders = ['opencl', 'dll'] if use_opencl() else ['dll'] 
    442         models = models[1:] 
    443461    else: 
    444         loaders = ['opencl', 'dll'] if use_opencl() else ['dll'] 
     462        loaders = ['dll'] 
     463        if use_opencl(): 
     464            loaders.append('opencl') 
     465        if use_cuda(): 
     466            loaders.append('cuda') 
    445467    if not models: 
    446468        print("""\ 
    447469usage: 
    448   python -m sasmodels.model_test [-v] [opencl|dll] model1 model2 ... 
     470  python -m sasmodels.model_test [-v] [opencl|cuda|dll] model1 model2 ... 
    449471 
    450472If -v is included on the command line, then use verbose output. 
    451473 
    452 If neither opencl nor dll is specified, then models will be tested with 
    453 both OpenCL and dll; the compute target is ignored for pure python models. 
     474If no platform is specified, then models will be tested with dll, and 
     475if available, OpenCL and CUDA; the compute target is ignored for pure python models. 
    454476 
    455477If model1 is 'all', then all except the remaining models will be tested. 
     
    471493    Run "nosetests sasmodels" on the command line to invoke it. 
    472494    """ 
    473     loaders = ['opencl', 'dll'] if use_opencl() else ['dll'] 
     495    loaders = ['dll'] 
     496    if use_opencl(): 
     497        loaders.append('opencl') 
     498    if use_cuda(): 
     499        loaders.append('cuda') 
    474500    tests = make_suite(loaders, ['all']) 
    475501    def build_test(test): 
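
    The default loader list is now built incrementally: the DLL path is always
    tested, with OpenCL and CUDA added when their drivers are usable.  A short
    sketch of driving the suite directly from Python::

        from sasmodels.model_test import make_suite
        from sasmodels.kernelcl import use_opencl
        from sasmodels.kernelcuda import use_cuda

        loaders = ['dll']
        if use_opencl():
            loaders.append('opencl')
        if use_cuda():
            loaders.append('cuda')
        suite = make_suite(loaders, ['all'])   # or a list of model names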
  • sasmodels/models/lib/gauss76.c

    r99b84ec r74e9b5f  
    1111 
    1212// Gaussians 
    13 constant double Gauss76Wt[76]={ 
     13constant double Gauss76Wt[76] = { 
    1414        .00126779163408536,             //0 
    1515        .00294910295364247, 
     
    9090}; 
    9191 
    92 constant double Gauss76Z[76]={ 
     92constant double Gauss76Z[76] = { 
    9393        -.999505948362153,              //0 
    9494        -.997397786355355, 
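
    The Gauss76Wt/Gauss76Z tables appear to be the standard 76-point
    Gauss-Legendre weights and abscissae; if needed they can be cross-checked
    against numpy (ordering and rounding may differ slightly)::

        import numpy as np
        z, wt = np.polynomial.legendre.leggauss(76)   # abscissae, weights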
  • sasmodels/models/lib/polevl.c

    r447e9aa r74e9b5f  
    5151*/ 
    5252 
    53 double polevl( double x, constant double *coef, int N ); 
    54 double polevl( double x, constant double *coef, int N ) 
     53static 
     54double polevl( double x, pconstant double *coef, int N ) 
    5555{ 
    5656 
     
    7272 */ 
    7373 
    74 double p1evl( double x, constant double *coef, int N ); 
    75 double p1evl( double x, constant double *coef, int N ) 
     74static 
     75double p1evl( double x, pconstant double *coef, int N ) 
    7676{ 
    7777    int i=0; 
  • sasmodels/models/lib/sas_J1.c

    r5181ccc r74e9b5f  
    4242#if FLOAT_SIZE>4 
    4343//Cephes double pression function 
    44 double cephes_j1(double x); 
    4544 
    4645constant double RPJ1[8] = { 
     
    106105    0.0 }; 
    107106 
     107static 
    108108double cephes_j1(double x) 
    109109{ 
     
    155155#else 
    156156//Single precission version of cephes 
    157 float cephes_j1f(float x); 
    158  
    159157constant float JPJ1[8] = { 
    160158    -4.878788132172128E-009, 
     
    190188    }; 
    191189 
     190static 
    192191float cephes_j1f(float xx) 
    193192{ 
     
    240239 
    241240//Finally J1c function that equals 2*J1(x)/x 
    242 double sas_2J1x_x(double x); 
     241static 
    243242double sas_2J1x_x(double x) 
    244243{ 
  • sasmodels/models/spinodal.py

    r475ff58 r93fe8a1  
    1212where $x=q/q_0$, $q_0$ is the peak position, $I_{max}$ is the intensity  
    1313at $q_0$ (parameterised as the $scale$ parameter), and $B$ is a flat  
    14 background. The spinodal wavelength is given by $2\pi/q_0$.  
     14background. The spinodal wavelength, $\Lambda$, is given by $2\pi/q_0$.  
     15 
     16The definition of $I_{max}$ in the literature varies. Hashimoto *et al* (1991)  
     17define it as  
     18 
     19.. math:: 
     20    I_{max} = \Lambda^3\Delta\rho^2 
     21     
     22whereas Meier & Strobl (1987) give  
     23 
     24.. math:: 
     25    I_{max} = V_z\Delta\rho^2 
     26     
     27where $V_z$ is the volume per monomer unit. 
    1528 
    1629The exponent $\gamma$ is equal to $d+1$ for off-critical concentration  
     
    2841 
    2942H. Furukawa. Dynamics-scaling theory for phase-separating unmixing mixtures: 
    30 Growth rates of droplets and scaling properties of autocorrelation functions. 
    31 Physica A 123,497 (1984). 
     43Growth rates of droplets and scaling properties of autocorrelation functions.  
     44Physica A 123, 497 (1984). 
     45 
     46H. Meier & G. Strobl. Small-Angle X-ray Scattering Study of Spinodal  
     47Decomposition in Polystyrene/Poly(styrene-co-bromostyrene) Blends.  
     48Macromolecules 20, 649-654 (1987). 
     49 
     50T. Hashimoto, M. Takenaka & H. Jinnai. Scattering Studies of Self-Assembling  
     51Processes of Polymer Blends in Spinodal Decomposition.  
     52J. Appl. Cryst. 24, 457-466 (1991). 
    3253 
    3354Revision History 
     
    3556 
    3657* **Author:**  Dirk Honecker **Date:** Oct 7, 2016 
    37 * **Revised:** Steve King    **Date:** Sep 7, 2018 
     58* **Revised:** Steve King    **Date:** Oct 25, 2018 
    3859""" 
    3960 
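
    A quick numerical illustration of the two I_max conventions now cited in
    the docstring (all values below are made up; keep the units consistent)::

        import numpy as np
        q0 = 0.01                              # peak position
        Lambda = 2*np.pi/q0                    # spinodal wavelength
        drho = 1e-6                            # contrast, Delta rho
        Imax_hashimoto = Lambda**3 * drho**2   # Hashimoto et al. (1991)
        Vz = 100.0                             # hypothetical volume per monomer unit
        Imax_meier_strobl = Vz * drho**2       # Meier & Strobl (1987)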
  • setup.py

    r1f991d6 r783e76f  
    2929                return version[1:-1] 
    3030    raise RuntimeError("Could not read version from %s/__init__.py"%package) 
     31 
     32install_requires = ['numpy', 'scipy'] 
     33 
     34if sys.platform=='win32' or sys.platform=='cygwin': 
     35    install_requires.append('tinycc') 
    3136 
    3237setup( 
     
    6166        'sasmodels': ['*.c', '*.cl'], 
    6267    }, 
    63     install_requires=[ 
    64     ], 
     68    install_requires=install_requires, 
    6569    extras_require={ 
     70        'full': ['docutils', 'bumps', 'matplotlib'], 
     71        'server': ['bumps'], 
    6672        'OpenCL': ["pyopencl"], 
    67         'Bumps': ["bumps"], 
    68         'TinyCC': ["tinycc"], 
    6973    }, 
    7074    build_requires=['setuptools'], 