Changeset 0db7dbd in sasmodels


Ignore:
Timestamp:
Feb 16, 2018 5:10:04 PM (6 years ago)
Author:
pkienzle
Branches:
master, core_shell_microgels, magnetic_model, ticket-1257-vesicle-product, ticket_1156, ticket_1265_superball, ticket_822_more_unit_tests
Children:
47fb816
Parents:
aa90015
Message:

cuda support: allow cylinder model to run under CUDA as well as OpenCL

Location:
sasmodels
Files:
1 added
7 edited

Legend:

Unmodified
Added
Removed
  • sasmodels/core.py

    r3221de0 r0db7dbd  
    1313from glob import glob 
    1414import re 
     15 
     16# Set "SAS_OPENCL=cuda" in the environment to use the CUDA rather than OpenCL 
     17USE_CUDA = os.environ.get("SAS_OPENCL", "") == "cuda" 
    1518 
    1619import numpy as np # type: ignore 
     
    2124from . import mixture 
    2225from . import kernelpy 
    23 from . import kernelcl 
     26if USE_CUDA: 
     27    from . import kernelcuda 
     28else: 
     29    from . import kernelcl 
    2430from . import kerneldll 
    2531from . import custom 
     
    210216        #print("building dll", numpy_dtype) 
    211217        return kerneldll.load_dll(source['dll'], model_info, numpy_dtype) 
     218    elif USE_CUDA: 
     219        #print("building cuda", numpy_dtype) 
     220        return kernelcuda.GpuModel(source, model_info, numpy_dtype, fast=fast) 
    212221    else: 
    213222        #print("building ocl", numpy_dtype) 
     
    268277    if platform is None: 
    269278        platform = "ocl" 
    270     if not kernelcl.use_opencl() or not model_info.opencl: 
     279    if not model_info.opencl: 
    271280        platform = "dll" 
     281    elif USE_CUDA: 
     282        if not kernelcuda.use_cuda(): 
     283            platform = "dll" 
     284    else: 
     285        if not kernelcl.use_opencl(): 
     286            platform = "dll" 
    272287 
    273288    # Check if type indicates dll regardless of which platform is given 
     
    294309    # Make sure that the type is supported by opencl, otherwise use dll 
    295310    if platform == "ocl": 
    296         env = kernelcl.environment() 
     311        if USE_CUDA: 
     312            env = kernelcuda.environment() 
     313        else: 
     314            env = kernelcl.environment() 
    297315        if not env.has_type(numpy_dtype): 
    298316            platform = "dll" 
  • sasmodels/kernel_header.c

    r108e70e r0db7dbd  
    11#ifdef __OPENCL_VERSION__ 
    22# define USE_OPENCL 
     3#elif defined(__CUDACC__) 
     4# define USE_CUDA 
    35#elif defined(_OPENMP) 
    46# define USE_OPENMP 
     
    810// Note: if using a C++ compiler, then define kernel as extern "C" 
    911#ifdef USE_OPENCL 
     12 
     13   #define USE_GPU 
    1014   typedef int int32_t; 
    11 #  if defined(USE_SINCOS) 
    12 #    define SINCOS(angle,svar,cvar) svar=sincos(angle,&cvar) 
    13 #  else 
    14 #    define SINCOS(angle,svar,cvar) do {const double _t_=angle; svar=sin(_t_);cvar=cos(_t_);} while (0) 
    15 #  endif 
     15   #define global_par global 
     16   #define local_par local 
     17   #define constant_par constant 
     18   #define global_var global 
     19   #define local_var local 
     20   #define constant_var constant 
     21   #define __device__ 
     22 
     23   #if defined(USE_SINCOS) 
     24   #  define SINCOS(angle,svar,cvar) svar=sincos(angle,&cvar) 
     25   #else 
     26   #  define SINCOS(angle,svar,cvar) do {const double _t_=angle; svar=sin(_t_);cvar=cos(_t_);} while (0) 
     27   #endif 
    1628   // Intel CPU on Mac gives strange values for erf(); on the verified 
    1729   // platforms (intel, nvidia, amd), the cephes erf() is significantly 
     
    2436   #  define erfcf erfc 
    2537   #endif 
    26 #else // !USE_OPENCL 
    27 // Use SAS_DOUBLE to force the use of double even for float kernels 
    28 #  define SAS_DOUBLE dou ## ble 
    29 #  ifdef __cplusplus 
     38 
     39#elif defined(USE_CUDA) 
     40 
     41   #define USE_GPU 
     42   #define global_par 
     43   #define local_par 
     44   #define constant_par const 
     45   #define global_var 
     46   #define local_var __shared__ 
     47   #define constant_var __constant__ 
     48 
     49   #define kernel extern "C" __global__ 
     50 
     51   // OpenCL powr(a,b) = C99 pow(a,b), b >= 0 
     52   // OpenCL pown(a,b) = C99 pow(a,b), b integer 
     53   #define powr(a,b) pow(a,b) 
     54   #define pown(a,b) pow(a,b) 
     55   //typedef int int32_t; 
     56   #if defined(USE_SINCOS) 
     57   #  define SINCOS(angle,svar,cvar) sincos(angle,&svar,&cvar) 
     58   #else 
     59   #  define SINCOS(angle,svar,cvar) do {const double _t_=angle; svar=sin(_t_);cvar=cos(_t_);} while (0) 
     60   #endif 
     61 
     62#else // !USE_OPENCL && !USE_CUDA 
     63 
     64   #define global_par 
     65   #define local_par 
     66   #define constant_par const 
     67   #define global_var 
     68   #define local_var 
     69   #define constant_var const 
     70   #define __device__ 
     71 
     72   #ifdef __cplusplus 
    3073      #include <cstdio> 
    3174      #include <cmath> 
     
    5194     #endif 
    5295     inline void SINCOS(double angle, double &svar, double &cvar) { svar=sin(angle); cvar=cos(angle); } 
    53 else // !__cplusplus 
     96   #else // !__cplusplus 
    5497     #include <inttypes.h>  // C99 guarantees that int32_t types is here 
    5598     #include <stdio.h> 
     
    76119     #define kernel 
    77120     #define SINCOS(angle,svar,cvar) do {const double _t_=angle; svar=sin(_t_);cvar=cos(_t_);} while (0) 
    78 #  endif  // !__cplusplus 
    79 #  define global 
    80 #  define local 
    81 #  define constant const 
    82 // OpenCL powr(a,b) = C99 pow(a,b), b >= 0 
    83 // OpenCL pown(a,b) = C99 pow(a,b), b integer 
    84 #  define powr(a,b) pow(a,b) 
    85 #  define pown(a,b) pow(a,b) 
     121   #endif  // !__cplusplus 
     122   // OpenCL powr(a,b) = C99 pow(a,b), b >= 0 
     123   // OpenCL pown(a,b) = C99 pow(a,b), b integer 
     124   #define powr(a,b) pow(a,b) 
     125   #define pown(a,b) pow(a,b) 
     126 
    86127#endif // !USE_OPENCL 
     128 
     129// Use SAS_DOUBLE to force the use of double even for float kernels 
     130#define SAS_DOUBLE dou ## ble 
    87131 
    88132#if defined(NEED_EXPM1) 
     
    147191#  define M_4PI_3 4.18879020478639 
    148192#endif 
     193__device__ 
    149194inline double square(double x) { return x*x; } 
     195__device__ 
    150196inline double cube(double x) { return x*x*x; } 
     197__device__ 
    151198inline double sas_sinx_x(double x) { return x==0 ? 1.0 : sin(x)/x; } 
    152199 
  • sasmodels/kernel_iq.c

    raadec17 r0db7dbd  
    6767 
    6868// Return value restricted between low and high 
     69__device__ 
    6970static double clip(double value, double low, double high) 
    7071{ 
     
    7980//     du * (m_sigma_y + 1j*m_sigma_z); 
    8081// weights for spin crosssections: dd du real, ud real, uu, du imag, ud imag 
     82__device__ 
    8183static void set_spin_weights(double in_spin, double out_spin, double spins[4]) 
    8284{ 
     
    9294 
    9395// Compute the magnetic sld 
     96__device__ 
    9497static double mag_sld( 
    9598  const unsigned int xs, // 0=dd, 1=du real, 2=ud real, 3=uu, 4=du imag, 5=up imag 
     
    140143// jitter angles (dtheta, dphi).  This matrix can be applied to all of the 
    141144// (qx, qy) points in the image to produce R*[qx,qy]' = [qa,qc]' 
     145__device__ 
    142146static void 
    143147qac_rotation( 
     
    173177// Apply the rotation matrix returned from qac_rotation to the point (qx,qy), 
    174178// returning R*[qx,qy]' = [qa,qc]' 
    175 static double 
     179__device__ 
     180static void 
    176181qac_apply( 
    177182    QACRotation *rotation, 
     
    200205// jitter angles (dtheta, dphi, dpsi).  This matrix can be applied to all of the 
    201206// (qx, qy) points in the image to produce R*[qx,qy]' = [qa,qb,qc]' 
     207__device__ 
    202208static void 
    203209qabc_rotation( 
     
    246252// Apply the rotation matrix returned from qabc_rotation to the point (qx,qy), 
    247253// returning R*[qx,qy]' = [qa,qb,qc]' 
    248 static double 
     254__device__ 
     255static void 
    249256qabc_apply( 
    250257    QABCRotation *rotation, 
     
    267274    const int32_t pd_start,     // where we are in the dispersity loop 
    268275    const int32_t pd_stop,      // where we are stopping in the dispersity loop 
    269     global const ProblemDetails *details, 
    270     global const double *values, 
    271     global const double *q, // nq q values, with padding to boundary 
    272     global double *result,  // nq+1 return values, again with padding 
     276    global_par const ProblemDetails *details, 
     277    global_par const double *values, 
     278    global_par const double *q, // nq q values, with padding to boundary 
     279    global_par double *result,  // nq+1 return values, again with padding 
    273280    const double cutoff     // cutoff in the dispersity weight product 
    274281    ) 
    275282{ 
    276 #ifdef USE_OPENCL 
     283#if defined(USE_GPU) 
    277284  // who we are and what element we are working with 
     285  #if defined(USE_OPENCL) 
    278286  const int q_index = get_global_id(0); 
     287  #else // USE_CUDA 
     288  const int q_index = threadIdx.x + blockIdx.x * blockDim.x; 
     289  #endif 
    279290  if (q_index >= nq) return; 
    280291#else 
     
    329340  // seeing one q value (stored in the variable "this_result") while the dll 
    330341  // version must loop over all q. 
    331   #ifdef USE_OPENCL 
     342  #if defined(USE_GPU) 
    332343    double pd_norm = (pd_start == 0 ? 0.0 : result[nq]); 
    333344    double this_result = (pd_start == 0 ? 0.0 : result[q_index]); 
    334   #else // !USE_OPENCL 
     345  #else // !USE_GPU 
    335346    double pd_norm = (pd_start == 0 ? 0.0 : result[nq]); 
    336347    if (pd_start == 0) { 
     
    341352    } 
    342353    //if (q_index==0) printf("start %d %g %g\n", pd_start, pd_norm, result[0]); 
    343 #endif // !USE_OPENCL 
     354#endif // !USE_GPU 
    344355 
    345356 
     
    364375  const int n4 = pd_length[4]; 
    365376  const int p4 = pd_par[4]; 
    366   global const double *v4 = pd_value + pd_offset[4]; 
    367   global const double *w4 = pd_weight + pd_offset[4]; 
     377  global_var const double *v4 = pd_value + pd_offset[4]; 
     378  global_var const double *w4 = pd_weight + pd_offset[4]; 
    368379  int i4 = (pd_start/pd_stride[4])%n4;  // position in level 4 at pd_start 
    369380 
     
    551562  const int n##_LOOP = details->pd_length[_LOOP]; \ 
    552563  const int p##_LOOP = details->pd_par[_LOOP]; \ 
    553   global const double *v##_LOOP = pd_value + details->pd_offset[_LOOP]; \ 
    554   global const double *w##_LOOP = pd_weight + details->pd_offset[_LOOP]; \ 
     564  global_var const double *v##_LOOP = pd_value + details->pd_offset[_LOOP]; \ 
     565  global_var const double *w##_LOOP = pd_weight + details->pd_offset[_LOOP]; \ 
    555566  int i##_LOOP = (pd_start/details->pd_stride[_LOOP])%n##_LOOP; 
    556567 
     
    576587// Pointers to the start of the dispersity and weight vectors, if needed. 
    577588#if MAX_PD>0 
    578   global const double *pd_value = values + NUM_VALUES; 
    579   global const double *pd_weight = pd_value + details->num_weights; 
     589  global_var const double *pd_value = values + NUM_VALUES; 
     590  global_var const double *pd_weight = pd_value + details->num_weights; 
    580591#endif 
    581592 
     
    637648      BUILD_ROTATION(); 
    638649 
    639 #ifndef USE_OPENCL 
     650#if !defined(USE_GPU) 
    640651      // DLL needs to explicitly loop over the q values. 
    641652      #ifdef USE_OPENMP 
     
    643654      #endif 
    644655      for (q_index=0; q_index<nq; q_index++) 
    645 #endif // !USE_OPENCL 
     656#endif // !USE_GPU 
    646657      { 
    647658 
     
    684695//printf("q_index:%d %g %g %g %g\n", q_index, scattering, weight0); 
    685696 
    686         #ifdef USE_OPENCL 
     697        #if defined(USE_GPU) 
    687698          this_result += weight * scattering; 
    688         #else // !USE_OPENCL 
     699        #else // !USE_GPU 
    689700          result[q_index] += weight * scattering; 
    690         #endif // !USE_OPENCL 
     701        #endif // !USE_GPU 
    691702      } 
    692703    } 
     
    712723 
    713724// Remember the current result and the updated norm. 
    714 #ifdef USE_OPENCL 
     725#if defined(USE_GPU) 
    715726  result[q_index] = this_result; 
    716727  if (q_index == 0) result[nq] = pd_norm; 
    717728//if (q_index == 0) printf("res: %g/%g\n", result[0], pd_norm); 
    718 #else // !USE_OPENCL 
     729#else // !USE_GPU 
    719730  result[nq] = pd_norm; 
    720731//printf("res: %g/%g\n", result[0], pd_norm); 
    721 #endif // !USE_OPENCL 
     732#endif // !USE_GPU 
    722733 
    723734// ** clear the macros in preparation for the next kernel ** 
  • sasmodels/models/cylinder.c

    r108e70e r0db7dbd  
    11#define INVALID(v) (v.radius<0 || v.length<0) 
    22 
     3__device__ 
    34static double 
    45form_volume(double radius, double length) 
     
    78} 
    89 
     10__device__ 
    911static double 
    1012fq(double qab, double qc, double radius, double length) 
     
    1315} 
    1416 
     17__device__ 
    1518static double 
    1619orient_avg_1D(double q, double radius, double length) 
     
    3336} 
    3437 
     38__device__ 
    3539static double 
    3640Iq(double q, 
     
    4448} 
    4549 
     50__device__ 
    4651static double 
    4752Iqac(double qab, double qc, 
  • sasmodels/models/lib/gauss76.c

    r99b84ec r0db7dbd  
    1111 
    1212// Gaussians 
    13 constant double Gauss76Wt[76]={ 
     13constant_var double Gauss76Wt[76] = { 
    1414        .00126779163408536,             //0 
    1515        .00294910295364247, 
     
    9090}; 
    9191 
    92 constant double Gauss76Z[76]={ 
     92constant_var double Gauss76Z[76] = { 
    9393        -.999505948362153,              //0 
    9494        -.997397786355355, 
  • sasmodels/models/lib/polevl.c

    r447e9aa r0db7dbd  
    5151*/ 
    5252 
    53 double polevl( double x, constant double *coef, int N ); 
    54 double polevl( double x, constant double *coef, int N ) 
     53__device__ static 
     54double polevl( double x, constant_par double *coef, int N ) 
    5555{ 
    5656 
     
    7272 */ 
    7373 
    74 double p1evl( double x, constant double *coef, int N ); 
    75 double p1evl( double x, constant double *coef, int N ) 
     74__device__ static 
     75double p1evl( double x, constant_par double *coef, int N ) 
    7676{ 
    7777    int i=0; 
  • sasmodels/models/lib/sas_J1.c

    r5181ccc r0db7dbd  
    4242#if FLOAT_SIZE>4 
    4343//Cephes double pression function 
    44 double cephes_j1(double x); 
    45  
    46 constant double RPJ1[8] = { 
     44 
     45constant_var double RPJ1[8] = { 
    4746    -8.99971225705559398224E8, 
    4847    4.52228297998194034323E11, 
     
    5453    0.0 }; 
    5554 
    56 constant double RQJ1[8] = { 
     55constant_var double RQJ1[8] = { 
    5756    6.20836478118054335476E2, 
    5857    2.56987256757748830383E5, 
     
    6564    }; 
    6665 
    67 constant double PPJ1[8] = { 
     66constant_var double PPJ1[8] = { 
    6867    7.62125616208173112003E-4, 
    6968    7.31397056940917570436E-2, 
     
    7675 
    7776 
    78 constant double PQJ1[8] = { 
     77constant_var double PQJ1[8] = { 
    7978    5.71323128072548699714E-4, 
    8079    6.88455908754495404082E-2, 
     
    8685    0.0 }; 
    8786 
    88 constant double QPJ1[8] = { 
     87constant_var double QPJ1[8] = { 
    8988    5.10862594750176621635E-2, 
    9089    4.98213872951233449420E0, 
     
    9695    2.52070205858023719784E1 }; 
    9796 
    98 constant double QQJ1[8] = { 
     97constant_var double QQJ1[8] = { 
    9998    7.42373277035675149943E1, 
    10099    1.05644886038262816351E3, 
     
    106105    0.0 }; 
    107106 
     107__device__ static 
    108108double cephes_j1(double x) 
    109109{ 
     
    155155#else 
    156156//Single precission version of cephes 
    157 float cephes_j1f(float x); 
    158  
    159 constant float JPJ1[8] = { 
     157constant_var float JPJ1[8] = { 
    160158    -4.878788132172128E-009, 
    161159    6.009061827883699E-007, 
     
    168166    }; 
    169167 
    170 constant float MO1J1[8] = { 
     168constant_var float MO1J1[8] = { 
    171169    6.913942741265801E-002, 
    172170    -2.284801500053359E-001, 
     
    179177    }; 
    180178 
    181 constant float PH1J1[8] = { 
     179constant_var float PH1J1[8] = { 
    182180    -4.497014141919556E+001, 
    183181    5.073465654089319E+001, 
     
    190188    }; 
    191189 
     190__device__ static 
    192191float cephes_j1f(float xx) 
    193192{ 
     
    240239 
    241240//Finally J1c function that equals 2*J1(x)/x 
    242 double sas_2J1x_x(double x); 
     241__device__ static 
    243242double sas_2J1x_x(double x) 
    244243{ 
Note: See TracChangeset for help on using the changeset viewer.