← Previous Changeset
Next Changeset →

Changeset 0db7dbd in sasmodels

Timestamp:

Feb 16, 2018 7:10:04 PM (7 years ago)

Author:

pkienzle

Branches:

master, core_shell_microgels, magnetic_model, ticket-1257-vesicle-product, ticket_1156, ticket_1265_superball, ticket_822_more_unit_tests

Children:

Parents:

Message:

cuda support: allow cylinder model to run under CUDA as well as OpenCL

Location:

Files:

: 1 added
: 7 edited

core.py (modified) (5 diffs)
kernel_header.c (modified) (6 diffs)
kernel_iq.c (modified) (17 diffs)
kernelcuda.py (added)
models/cylinder.c (modified) (5 diffs)
models/lib/gauss76.c (modified) (2 diffs)
models/lib/polevl.c (modified) (2 diffs)
models/lib/sas_J1.c (modified) (12 diffs)

Legend:

: Unmodified
: Added
: Removed

sasmodels/core.py

-                      r3221de0
+                      r0db7dbd
 from glob import glob
 import re
+# Set "SAS_OPENCL=cuda" in the environment to use CUDA rather than OpenCL
+USE_CUDA = os.environ.get("SAS_OPENCL", "") == "cuda"
 import numpy as np # type: ignore
 …
 from . import mixture
 from . import kernelpy
+from . import kernelcl
+if USE_CUDA:
+    from . import kernelcuda
+else:
+    from . import kernelcl
 from . import kerneldll
 from . import custom
 …
         #print("building dll", numpy_dtype)
         return kerneldll.load_dll(source['dll'], model_info, numpy_dtype)
+    elif USE_CUDA:
+        #print("building cuda", numpy_dtype)
+        return kernelcuda.GpuModel(source, model_info, numpy_dtype, fast=fast)
     else:
         #print("building ocl", numpy_dtype)
 …
     if platform is None:
         platform = "ocl"
     if not kernelcl.use_opencl() or not model_info.opencl:
+    if not model_info.opencl:
         platform = "dll"
+    elif USE_CUDA:
+        if not kernelcuda.use_cuda():
+            platform = "dll"
+    else:
+        if not kernelcl.use_opencl():
+            platform = "dll"
     # Check if type indicates dll regardless of which platform is given
 …
     # Make sure that the type is supported by opencl, otherwise use dll
     if platform == "ocl":
+        env = kernelcl.environment()
+        if USE_CUDA:
+            env = kernelcuda.environment()
+        else:
+            env = kernelcl.environment()
         if not env.has_type(numpy_dtype):
             platform = "dll"

sasmodels/kernel_header.c

-                      r108e70e
+                      r0db7dbd
 #ifdef __OPENCL_VERSION__
 # define USE_OPENCL
+#elif defined(__CUDACC__)
+# define USE_CUDA
 #elif defined(_OPENMP)
 # define USE_OPENMP
 …
 // Note: if using a C++ compiler, then define kernel as extern "C"
 #ifdef USE_OPENCL
+   #define USE_GPU
    typedef int int32_t;
+#  if defined(USE_SINCOS)
+#    define SINCOS(angle,svar,cvar) svar=sincos(angle,&cvar)
+#  else
+#    define SINCOS(angle,svar,cvar) do {const double _t_=angle; svar=sin(_t_);cvar=cos(_t_);} while (0)
+#  endif
+   #define global_par global
+   #define local_par local
+   #define constant_par constant
+   #define global_var global
+   #define local_var local
+   #define constant_var constant
+   #define __device__
+   #if defined(USE_SINCOS)
+   #  define SINCOS(angle,svar,cvar) svar=sincos(angle,&cvar)
+   #else
+   #  define SINCOS(angle,svar,cvar) do {const double _t_=angle; svar=sin(_t_);cvar=cos(_t_);} while (0)
+   #endif
    // Intel CPU on Mac gives strange values for erf(); on the verified
    // platforms (intel, nvidia, amd), the cephes erf() is significantly
 …
    #  define erfcf erfc
    #endif
+#else // !USE_OPENCL
+// Use SAS_DOUBLE to force the use of double even for float kernels
+#  define SAS_DOUBLE dou ## ble
+#  ifdef __cplusplus
+#elif defined(USE_CUDA)
+   #define USE_GPU
+   #define global_par
+   #define local_par
+   #define constant_par const
+   #define global_var
+   #define local_var __shared__
+   #define constant_var __constant__
+   #define kernel extern "C" __global__
+   // OpenCL powr(a,b) = C99 pow(a,b), b >= 0
+   // OpenCL pown(a,b) = C99 pow(a,b), b integer
+   #define powr(a,b) pow(a,b)
+   #define pown(a,b) pow(a,b)
+   //typedef int int32_t;
+   #if defined(USE_SINCOS)
+   #  define SINCOS(angle,svar,cvar) sincos(angle,&svar,&cvar)
+   #else
+   #  define SINCOS(angle,svar,cvar) do {const double _t_=angle; svar=sin(_t_);cvar=cos(_t_);} while (0)
+   #endif
+#else // !USE_OPENCL && !USE_CUDA
+   #define global_par
+   #define local_par
+   #define constant_par const
+   #define global_var
+   #define local_var
+   #define constant_var const
+   #define __device__
+   #ifdef __cplusplus
       #include <cstdio>
       #include <cmath>
 …
      #endif
      inline void SINCOS(double angle, double &svar, double &cvar) { svar=sin(angle); cvar=cos(angle); }
 #  else // !__cplusplus
+   #else // !__cplusplus
      #include <inttypes.h>  // C99 guarantees that int32_t types is here
      #include <stdio.h>
 …
      #define kernel
      #define SINCOS(angle,svar,cvar) do {const double _t_=angle; svar=sin(_t_);cvar=cos(_t_);} while (0)
+#  endif  // !__cplusplus
+#  define global
+#  define local
+#  define constant const
+// OpenCL powr(a,b) = C99 pow(a,b), b >= 0
+// OpenCL pown(a,b) = C99 pow(a,b), b integer
+#  define powr(a,b) pow(a,b)
+#  define pown(a,b) pow(a,b)
+   #endif  // !__cplusplus
+   // OpenCL powr(a,b) = C99 pow(a,b), b >= 0
+   // OpenCL pown(a,b) = C99 pow(a,b), b integer
+   #define powr(a,b) pow(a,b)
+   #define pown(a,b) pow(a,b)
 #endif // !USE_OPENCL
+// Use SAS_DOUBLE to force the use of double even for float kernels
+#define SAS_DOUBLE dou ## ble
 #if defined(NEED_EXPM1)
 …
 #  define M_4PI_3 4.18879020478639
 #endif
+__device__
 inline double square(double x) { return x*x; }
+__device__
 inline double cube(double x) { return x*x*x; }
+__device__
 inline double sas_sinx_x(double x) { return x==0 ? 1.0 : sin(x)/x; }

sasmodels/kernel_iq.c

-                      raadec17
+                      r0db7dbd
 // Return value restricted between low and high
+__device__
 static double clip(double value, double low, double high)
+{
 …
 //     du * (m_sigma_y + 1j*m_sigma_z);
 // weights for spin cross sections: dd, du real, ud real, uu, du imag, ud imag
+__device__
 static void set_spin_weights(double in_spin, double out_spin, double spins[4])
+{
 …
 // Compute the magnetic sld
+__device__
 static double mag_sld(
   const unsigned int xs, // 0=dd, 1=du real, 2=ud real, 3=uu, 4=du imag, 5=up imag
 …
 // jitter angles (dtheta, dphi).  This matrix can be applied to all of the
 // (qx, qy) points in the image to produce R*[qx,qy]' = [qa,qc]'
+__device__
 static void
 qac_rotation(
 …
 // Apply the rotation matrix returned from qac_rotation to the point (qx,qy),
 // returning R*[qx,qy]' = [qa,qc]'
+static double
+__device__
+static void
 qac_apply(
     QACRotation *rotation,
 …
 // jitter angles (dtheta, dphi, dpsi).  This matrix can be applied to all of the
 // (qx, qy) points in the image to produce R*[qx,qy]' = [qa,qb,qc]'
+__device__
 static void
 qabc_rotation(
 …
 // Apply the rotation matrix returned from qabc_rotation to the point (qx,qy),
 // returning R*[qx,qy]' = [qa,qb,qc]'
+static double
+__device__
+static void
 qabc_apply(
     QABCRotation *rotation,
 …
     const int32_t pd_start,     // where we are in the dispersity loop
     const int32_t pd_stop,      // where we are stopping in the dispersity loop
     global const ProblemDetails *details,
     global const double *values,
     global const double *q, // nq q values, with padding to boundary
     global double *result,  // nq+1 return values, again with padding
+    global_par const ProblemDetails *details,
+    global_par const double *values,
+    global_par const double *q, // nq q values, with padding to boundary
+    global_par double *result,  // nq+1 return values, again with padding
     const double cutoff     // cutoff in the dispersity weight product
+    )
+{
 #ifdef USE_OPENCL
+#if defined(USE_GPU)
   // who we are and what element we are working with
+  #if defined(USE_OPENCL)
   const int q_index = get_global_id(0);
+  #else // USE_CUDA
+  const int q_index = threadIdx.x + blockIdx.x * blockDim.x;
+  #endif
   if (q_index >= nq) return;
 #else
 …
   // seeing one q value (stored in the variable "this_result") while the dll
   // version must loop over all q.
   #ifdef USE_OPENCL
+  #if defined(USE_GPU)
     double pd_norm = (pd_start == 0 ? 0.0 : result[nq]);
     double this_result = (pd_start == 0 ? 0.0 : result[q_index]);
   #else // !USE_OPENCL
+  #else // !USE_GPU
     double pd_norm = (pd_start == 0 ? 0.0 : result[nq]);
     if (pd_start == 0) {
 …
+    }
     //if (q_index==0) printf("start %d %g %g\n", pd_start, pd_norm, result[0]);
 #endif // !USE_OPENCL
+#endif // !USE_GPU
 …
   const int n4 = pd_length[4];
   const int p4 = pd_par[4];
   global const double *v4 = pd_value + pd_offset[4];
   global const double *w4 = pd_weight + pd_offset[4];
+  global_var const double *v4 = pd_value + pd_offset[4];
+  global_var const double *w4 = pd_weight + pd_offset[4];
   int i4 = (pd_start/pd_stride[4])%n4;  // position in level 4 at pd_start
 …
   const int n##_LOOP = details->pd_length[_LOOP]; \
   const int p##_LOOP = details->pd_par[_LOOP]; \
   global const double *v##_LOOP = pd_value + details->pd_offset[_LOOP]; \
   global const double *w##_LOOP = pd_weight + details->pd_offset[_LOOP]; \
+  global_var const double *v##_LOOP = pd_value + details->pd_offset[_LOOP]; \
+  global_var const double *w##_LOOP = pd_weight + details->pd_offset[_LOOP]; \
   int i##_LOOP = (pd_start/details->pd_stride[_LOOP])%n##_LOOP;
 …
 // Pointers to the start of the dispersity and weight vectors, if needed.
 #if MAX_PD>0
   global const double *pd_value = values + NUM_VALUES;
   global const double *pd_weight = pd_value + details->num_weights;
+  global_var const double *pd_value = values + NUM_VALUES;
+  global_var const double *pd_weight = pd_value + details->num_weights;
 #endif
 …
       BUILD_ROTATION();
 #ifndef USE_OPENCL
+#if !defined(USE_GPU)
       // DLL needs to explicitly loop over the q values.
       #ifdef USE_OPENMP
 …
       #endif
       for (q_index=0; q_index<nq; q_index++)
 #endif // !USE_OPENCL
+#endif // !USE_GPU
+      {
 …
 //printf("q_index:%d %g %g %g %g\n", q_index, scattering, weight0);
         #ifdef USE_OPENCL
+        #if defined(USE_GPU)
           this_result += weight * scattering;
         #else // !USE_OPENCL
+        #else // !USE_GPU
           result[q_index] += weight * scattering;
         #endif // !USE_OPENCL
+        #endif // !USE_GPU
+      }
+    }
 …
 // Remember the current result and the updated norm.
 #ifdef USE_OPENCL
+#if defined(USE_GPU)
   result[q_index] = this_result;
   if (q_index == 0) result[nq] = pd_norm;
 //if (q_index == 0) printf("res: %g/%g\n", result[0], pd_norm);
 #else // !USE_OPENCL
+#else // !USE_GPU
   result[nq] = pd_norm;
 //printf("res: %g/%g\n", result[0], pd_norm);
 #endif // !USE_OPENCL
+#endif // !USE_GPU
 // ** clear the macros in preparation for the next kernel **

sasmodels/models/cylinder.c

-                      r108e70e
+                      r0db7dbd
 #define INVALID(v) (v.radius<0 || v.length<0)
+__device__
 static double
 form_volume(double radius, double length)
 …
+}
+__device__
 static double
 fq(double qab, double qc, double radius, double length)
 …
+}
+__device__
 static double
 orient_avg_1D(double q, double radius, double length)
 …
+}
+__device__
 static double
 Iq(double q,
 …
+}
+__device__
 static double
 Iqac(double qab, double qc,

sasmodels/models/lib/gauss76.c

-                      r99b84ec
+                      r0db7dbd
 // Gaussians
 constant double Gauss76Wt[76]={
+constant_var double Gauss76Wt[76] = {
         .00126779163408536,             //0
         .00294910295364247,
 …
 };
 constant double Gauss76Z[76]={
+constant_var double Gauss76Z[76] = {
         -.999505948362153,              //0
         -.997397786355355,

sasmodels/models/lib/polevl.c

-                      r447e9aa
+                      r0db7dbd
 */
+double polevl( double x, constant double *coef, int N );
 double polevl( double x, constant double *coef, int N )
+__device__ static
+double polevl( double x, constant_par double *coef, int N )
+{
 …
  */
+double p1evl( double x, constant double *coef, int N );
 double p1evl( double x, constant double *coef, int N )
+__device__ static
+double p1evl( double x, constant_par double *coef, int N )
+{
     int i=0;

sasmodels/models/lib/sas_J1.c

-                      r5181ccc
+                      r0db7dbd
 #if FLOAT_SIZE>4
 //Cephes double precision function
+double cephes_j1(double x);
+constant double RPJ1[8] = {
+constant_var double RPJ1[8] = {
     -8.99971225705559398224E8,
 .52228297998194034323E11,
 …
 .0 };
 constant double RQJ1[8] = {
+constant_var double RQJ1[8] = {
 .20836478118054335476E2,
 .56987256757748830383E5,
 …
     };
 constant double PPJ1[8] = {
+constant_var double PPJ1[8] = {
 .62125616208173112003E-4,
 .31397056940917570436E-2,
 …
 constant double PQJ1[8] = {
+constant_var double PQJ1[8] = {
 .71323128072548699714E-4,
 .88455908754495404082E-2,
 …
 .0 };
 constant double QPJ1[8] = {
+constant_var double QPJ1[8] = {
 .10862594750176621635E-2,
 .98213872951233449420E0,
 …
 .52070205858023719784E1 };
 constant double QQJ1[8] = {
+constant_var double QQJ1[8] = {
 .42373277035675149943E1,
 .05644886038262816351E3,
 …
 .0 };
+__device__ static
 double cephes_j1(double x)
+{
 …
 #else
 //Single precision version of cephes
+float cephes_j1f(float x);
+constant float JPJ1[8] = {
+constant_var float JPJ1[8] = {
     -4.878788132172128E-009,
 .009061827883699E-007,
 …
     };
 constant float MO1J1[8] = {
+constant_var float MO1J1[8] = {
 .913942741265801E-002,
     -2.284801500053359E-001,
 …
     };
 constant float PH1J1[8] = {
+constant_var float PH1J1[8] = {
     -4.497014141919556E+001,
 .073465654089319E+001,
 …
     };
+__device__ static
 float cephes_j1f(float xx)
+{
 …
 //Finally J1c function that equals 2*J1(x)/x
+double sas_2J1x_x(double x);
+__device__ static
 double sas_2J1x_x(double x)
+{

Note: See TracChangeset for help on using the changeset viewer.

Download in other formats: