Changeset 74e9b5f in sasmodels


Ignore:
Timestamp:
Oct 12, 2018 10:52:48 PM (5 weeks ago)
Author:
pkienzle
Branches:
beta_approx, cuda-test, py3, ticket-1015-gpu-mem-error, ticket-1157, ticket-608-user-defined-weights, ticket_1156
Children:
4de14584
Parents:
b0de252
Message:

autotag functions as device functions for cuda. Refs #1076.

Location:
sasmodels
Files:
8 edited

Legend:

Unmodified
Added
Removed
  • sasmodels/kernel_header.c

    rb0de252 r74e9b5f  
    55#elif defined(_OPENMP) 
    66# define USE_OPENMP 
    7 #elif defined(__CUDACC__) 
    8 # define USE_CUDA 
    97#endif 
    108 
     
    1715 
    1816   #define USE_GPU 
     17   #define pglobal global 
     18   #define pconstant constant 
     19 
    1920   typedef int int32_t; 
    20    #define global_par global 
    21    #define local_par local 
    22    #define constant_par constant 
    23    #define global_var global 
    24    #define local_var local 
    25    #define constant_var constant 
    26    #define __device__ 
    2721 
    2822   #if defined(USE_SINCOS) 
     
    4539 
    4640   #define USE_GPU 
    47    #define global_par 
    48    #define local_par 
    49    #define constant_par const 
    50    #define global_var 
    51    #define local_var __shared__ 
    52    #define constant_var __constant__ 
    53  
     41   #define local __shared__ 
     42   #define pglobal 
     43   #define constant __constant__ 
     44   #define pconstant const 
    5445   #define kernel extern "C" __global__ 
    5546 
     
    6758#else // !USE_OPENCL && !USE_CUDA 
    6859 
    69    #define global_par 
    70    #define local_par 
    71    #define constant_par const 
    72    #define global_var 
    73    #define local_var 
    74    #define constant_var const 
    75    #define __device__ 
     60   #define local 
     61   #define pglobal 
     62   #define constant const 
     63   #define pconstant const 
    7664 
    7765   #ifdef __cplusplus 
     
    193181#  define M_4PI_3 4.18879020478639 
    194182#endif 
    195 __device__ 
    196183inline double square(double x) { return x*x; } 
    197 __device__ 
    198184inline double cube(double x) { return x*x*x; } 
    199 __device__ 
    200185inline double sas_sinx_x(double x) { return x==0 ? 1.0 : sin(x)/x; } 
    201186 
  • sasmodels/kernel_iq.c

    rb0de252 r74e9b5f  
    6767 
    6868// Return value restricted between low and high 
    69 __device__ 
    7069static double clip(double value, double low, double high) 
    7170{ 
     
    8079//     du * (m_sigma_y + 1j*m_sigma_z); 
    8180// weights for spin crosssections: dd du real, ud real, uu, du imag, ud imag 
    82 __device__ 
    8381static void set_spin_weights(double in_spin, double out_spin, double weight[6]) 
    8482{ 
     
    105103 
    106104// Compute the magnetic sld 
    107 __device__ 
    108105static double mag_sld( 
    109106  const unsigned int xs, // 0=dd, 1=du.real, 2=ud.real, 3=uu, 4=du.imag, 5=ud.imag 
     
    154151// jitter angles (dtheta, dphi).  This matrix can be applied to all of the 
    155152// (qx, qy) points in the image to produce R*[qx,qy]' = [qa,qc]' 
    156 __device__ 
    157153static void 
    158154qac_rotation( 
     
    188184// Apply the rotation matrix returned from qac_rotation to the point (qx,qy), 
    189185// returning R*[qx,qy]' = [qa,qc]' 
    190 __device__ 
    191186static void 
    192187qac_apply( 
     
    216211// jitter angles (dtheta, dphi, dpsi).  This matrix can be applied to all of the 
    217212// (qx, qy) points in the image to produce R*[qx,qy]' = [qa,qb,qc]' 
    218 __device__ 
    219213static void 
    220214qabc_rotation( 
     
    263257// Apply the rotation matrix returned from qabc_rotation to the point (qx,qy), 
    264258// returning R*[qx,qy]' = [qa,qb,qc]' 
    265 __device__ 
    266259static void 
    267260qabc_apply( 
     
    285278    const int32_t pd_start,     // where we are in the dispersity loop 
    286279    const int32_t pd_stop,      // where we are stopping in the dispersity loop 
    287     global_par const ProblemDetails *details, 
    288     global_par const double *values, 
    289     global_par const double *q, // nq q values, with padding to boundary 
    290     global_par double *result,  // nq+1 return values, again with padding 
     280    pglobal const ProblemDetails *details, 
     281    pglobal const double *values, 
     282    pglobal const double *q, // nq q values, with padding to boundary 
     283    pglobal double *result,  // nq+1 return values, again with padding 
    291284    const double cutoff     // cutoff in the dispersity weight product 
    292285    ) 
     
    386379  const int n4 = pd_length[4]; 
    387380  const int p4 = pd_par[4]; 
    388   global_var const double *v4 = pd_value + pd_offset[4]; 
    389   global_var const double *w4 = pd_weight + pd_offset[4]; 
     381  pglobal const double *v4 = pd_value + pd_offset[4]; 
     382  pglobal const double *w4 = pd_weight + pd_offset[4]; 
    390383  int i4 = (pd_start/pd_stride[4])%n4;  // position in level 4 at pd_start 
    391384 
     
    573566  const int n##_LOOP = details->pd_length[_LOOP]; \ 
    574567  const int p##_LOOP = details->pd_par[_LOOP]; \ 
    575   global_var const double *v##_LOOP = pd_value + details->pd_offset[_LOOP]; \ 
    576   global_var const double *w##_LOOP = pd_weight + details->pd_offset[_LOOP]; \ 
     568  pglobal const double *v##_LOOP = pd_value + details->pd_offset[_LOOP]; \ 
     569  pglobal const double *w##_LOOP = pd_weight + details->pd_offset[_LOOP]; \ 
    577570  int i##_LOOP = (pd_start/details->pd_stride[_LOOP])%n##_LOOP; 
    578571 
     
    598591// Pointers to the start of the dispersity and weight vectors, if needed. 
    599592#if MAX_PD>0 
    600   global_var const double *pd_value = values + NUM_VALUES; 
    601   global_var const double *pd_weight = pd_value + details->num_weights; 
     593  pglobal const double *pd_value = values + NUM_VALUES; 
     594  pglobal const double *pd_weight = pd_value + details->num_weights; 
    602595#endif 
    603596 
  • sasmodels/kernelcuda.py

    rb0de252 r74e9b5f  
    6262import logging 
    6363import time 
     64import re 
    6465 
    6566import numpy as np  # type: ignore 
     
    146147    return dtype in (generate.F32, generate.F64) 
    147148 
     149 
     150FUNCTION_PATTERN = re.compile(r"""^ 
     151  (?P<space>\s*)                   # initial space 
     152  (?P<qualifiers>^(?:\s*\b\w+\b\s*)+) # one or more qualifiers before function 
     153  (?P<function>\s*\b\w+\b\s*[(])      # function name plus open parens 
     154  """, re.VERBOSE|re.MULTILINE) 
     155 
     156MARKED_PATTERN = re.compile(r""" 
     157  \b(return|else|kernel|device|__device__)\b 
     158  """, re.VERBOSE|re.MULTILINE) 
     159 
     160def _add_device_tag(match): 
     161    # type: (None) -> str 
     162    # Note: should be re.Match, but that isn't a simple type 
     163    """ 
     164    replace qualifiers with __device__ qualifiers if needed 
     165    """ 
     166    qualifiers = match.group("qualifiers") 
     167    if MARKED_PATTERN.search(qualifiers): 
     168        start, end = match.span() 
     169        return match.string[start:end] 
     170    else: 
     171        function = match.group("function") 
     172        space = match.group("space") 
     173        return "".join((space, "__device__ ", qualifiers, function)) 
     174 
     175def mark_device_functions(source): 
     176    # type: (str) -> str 
     177    """ 
     178    Mark all function declarations as __device__ functions (except kernel). 
     179    """ 
     180    return FUNCTION_PATTERN.sub(_add_device_tag, source) 
     181 
     182def show_device_functions(source): 
     183    # type: (str) -> str 
     184    """ 
     185    Show all discovered function declarations, but don't change any. 
     186    """ 
     187    for match in FUNCTION_PATTERN.finditer(source): 
     188        print(match.group('qualifiers').replace('\n',r'\n'), match.group('function'), '(') 
     189    return source 
     190 
    148191def compile_model(source, dtype, fast=False): 
    149192    # type: (str, np.dtype, bool) -> SourceModule 
     
    163206    source_list.insert(0, "#define USE_SINCOS\n") 
    164207    source = "\n".join(source_list) 
    165     options = '-use_fast_math' if fast else None 
     208    #source = show_device_functions(source) 
     209    source = mark_device_functions(source) 
     210    #with open('/tmp/kernel.cu', 'w') as fd: fd.write(source) 
     211    #print(source) 
     212    #options = ['--verbose', '-E'] 
     213    options = ['--use_fast_math'] if fast else None 
    166214    program = SourceModule(source, no_extern_c=True, options=options) # include_dirs=[...] 
     215 
    167216    #print("done with "+program) 
    168217    return program 
  • sasmodels/model_test.py

    r012cd34 r74e9b5f  
    55Usage:: 
    66 
    7     python -m sasmodels.model_test [opencl|dll|opencl_and_dll] model1 model2 ... 
     7    python -m sasmodels.model_test [opencl|cuda|dll] model1 model2 ... 
    88 
    99    if model1 is 'all', then all except the remaining models will be tested 
     
    6363from .modelinfo import expand_pars 
    6464from .kernelcl import use_opencl 
     65from .kernelcuda import use_cuda 
    6566 
    6667# pylint: disable=unused-import 
     
    8081    Construct the pyunit test suite. 
    8182 
    82     *loaders* is the list of kernel drivers to use, which is one of 
    83     *["dll", "opencl"]*, *["dll"]* or *["opencl"]*.  For python models, 
    84     the python driver is always used. 
     83    *loaders* is the list of kernel drivers to use (dll, opencl or cuda). 
     84    For python model the python driver is always used. 
    8585 
    8686    *models* is the list of models to test, or *["all"]* to test all models. 
     
    135135 
    136136            # test using dll if desired 
    137             if 'dll' in loaders or not use_opencl(): 
     137            if 'dll' in loaders: 
    138138                test_name = "%s-dll"%model_name 
    139139                test_method_name = "test_%s_dll" % model_info.id 
     
    156156                                     test_method_name, 
    157157                                     platform="ocl", dtype=None, 
     158                                     stash=stash) 
     159                #print("defining", test_name) 
     160                suite.addTest(test) 
     161 
     162            # test using cuda if desired and available 
     163            if 'cuda' in loaders and use_cuda(): 
     164                test_name = "%s-cuda"%model_name 
     165                test_method_name = "test_%s_cuda" % model_info.id 
     166                # Using dtype=None so that the models that are only 
     167                # correct for double precision are not tested using 
     168                # single precision.  The choice is determined by the 
     169                # presence of *single=False* in the model file. 
     170                test = ModelTestCase(test_name, model_info, 
     171                                     test_method_name, 
     172                                     platform="cuda", dtype=None, 
    158173                                     stash=stash) 
    159174                #print("defining", test_name) 
     
    220235 
    221236                # Check for missing tests.  Only do so for the "dll" tests 
    222                 # to reduce noise from both opencl and dll, and because 
     237                # to reduce noise from both opencl and cuda, and because 
    223238                # python kernels use platform="dll". 
    224239                if self.platform == "dll": 
     
    368383 
    369384    # Build a test suite containing just the model 
    370     loaders = ['opencl'] if use_opencl() else ['dll'] 
     385    loader = 'opencl' if use_opencl() else 'cuda' if use_cuda() else 'dll' 
    371386    models = [model] 
    372387    try: 
    373         suite = make_suite(loaders, models) 
     388        suite = make_suite([loader], models) 
    374389    except Exception: 
    375390        import traceback 
     
    434449        loaders = ['opencl'] 
    435450        models = models[1:] 
     451    elif models and models[0] == 'cuda': 
     452        if not use_cuda(): 
     453            print("cuda is not available") 
     454            return 1 
     455        loaders = ['cuda'] 
     456        models = models[1:] 
    436457    elif models and models[0] == 'dll': 
    437458        # TODO: test if compiler is available? 
    438459        loaders = ['dll'] 
    439460        models = models[1:] 
    440     elif models and models[0] == 'opencl_and_dll': 
    441         loaders = ['opencl', 'dll'] if use_opencl() else ['dll'] 
    442         models = models[1:] 
    443461    else: 
    444         loaders = ['opencl', 'dll'] if use_opencl() else ['dll'] 
     462        loaders = ['dll'] 
     463        if use_opencl(): 
     464            loaders.append('opencl') 
     465        if use_cuda(): 
     466            loaders.append('cuda') 
    445467    if not models: 
    446468        print("""\ 
    447469usage: 
    448   python -m sasmodels.model_test [-v] [opencl|dll] model1 model2 ... 
     470  python -m sasmodels.model_test [-v] [opencl|cuda|dll] model1 model2 ... 
    449471 
    450472If -v is included on the command line, then use verbose output. 
    451473 
    452 If neither opencl nor dll is specified, then models will be tested with 
    453 both OpenCL and dll; the compute target is ignored for pure python models. 
     474If no platform is specified, then models will be tested with dll, and 
     475if available, OpenCL and CUDA; the compute target is ignored for pure python models. 
    454476 
    455477If model1 is 'all', then all except the remaining models will be tested. 
     
    471493    Run "nosetests sasmodels" on the command line to invoke it. 
    472494    """ 
    473     loaders = ['opencl', 'dll'] if use_opencl() else ['dll'] 
     495    loaders = ['dll'] 
     496    if use_opencl(): 
     497        loaders.append('opencl') 
     498    if use_cuda(): 
     499        loaders.append('cuda') 
    474500    tests = make_suite(loaders, ['all']) 
    475501    def build_test(test): 
  • sasmodels/models/cylinder.c

    r0db7dbd r74e9b5f  
    11#define INVALID(v) (v.radius<0 || v.length<0) 
    22 
    3 __device__ 
    43static double 
    54form_volume(double radius, double length) 
     
    87} 
    98 
    10 __device__ 
    119static double 
    1210fq(double qab, double qc, double radius, double length) 
     
    1513} 
    1614 
    17 __device__ 
    1815static double 
    1916orient_avg_1D(double q, double radius, double length) 
     
    3633} 
    3734 
    38 __device__ 
    3935static double 
    4036Iq(double q, 
     
    4844} 
    4945 
    50 __device__ 
    5146static double 
    5247Iqac(double qab, double qc, 
  • sasmodels/models/lib/gauss76.c

    r0db7dbd r74e9b5f  
    1111 
    1212// Gaussians 
    13 constant_var double Gauss76Wt[76] = { 
     13constant double Gauss76Wt[76] = { 
    1414        .00126779163408536,             //0 
    1515        .00294910295364247, 
     
    9090}; 
    9191 
    92 constant_var double Gauss76Z[76] = { 
     92constant double Gauss76Z[76] = { 
    9393        -.999505948362153,              //0 
    9494        -.997397786355355, 
  • sasmodels/models/lib/polevl.c

    r0db7dbd r74e9b5f  
    5151*/ 
    5252 
    53 __device__ static 
    54 double polevl( double x, constant_par double *coef, int N ) 
     53static 
     54double polevl( double x, pconstant double *coef, int N ) 
    5555{ 
    5656 
     
    7272 */ 
    7373 
    74 __device__ static 
    75 double p1evl( double x, constant_par double *coef, int N ) 
     74static 
     75double p1evl( double x, pconstant double *coef, int N ) 
    7676{ 
    7777    int i=0; 
  • sasmodels/models/lib/sas_J1.c

    r0db7dbd r74e9b5f  
    4343//Cephes double pression function 
    4444 
    45 constant_var double RPJ1[8] = { 
     45constant double RPJ1[8] = { 
    4646    -8.99971225705559398224E8, 
    4747    4.52228297998194034323E11, 
     
    5353    0.0 }; 
    5454 
    55 constant_var double RQJ1[8] = { 
     55constant double RQJ1[8] = { 
    5656    6.20836478118054335476E2, 
    5757    2.56987256757748830383E5, 
     
    6464    }; 
    6565 
    66 constant_var double PPJ1[8] = { 
     66constant double PPJ1[8] = { 
    6767    7.62125616208173112003E-4, 
    6868    7.31397056940917570436E-2, 
     
    7575 
    7676 
    77 constant_var double PQJ1[8] = { 
     77constant double PQJ1[8] = { 
    7878    5.71323128072548699714E-4, 
    7979    6.88455908754495404082E-2, 
     
    8585    0.0 }; 
    8686 
    87 constant_var double QPJ1[8] = { 
     87constant double QPJ1[8] = { 
    8888    5.10862594750176621635E-2, 
    8989    4.98213872951233449420E0, 
     
    9595    2.52070205858023719784E1 }; 
    9696 
    97 constant_var double QQJ1[8] = { 
     97constant double QQJ1[8] = { 
    9898    7.42373277035675149943E1, 
    9999    1.05644886038262816351E3, 
     
    105105    0.0 }; 
    106106 
    107 __device__ static 
     107static 
    108108double cephes_j1(double x) 
    109109{ 
     
    155155#else 
    156156//Single precission version of cephes 
    157 constant_var float JPJ1[8] = { 
     157constant float JPJ1[8] = { 
    158158    -4.878788132172128E-009, 
    159159    6.009061827883699E-007, 
     
    166166    }; 
    167167 
    168 constant_var float MO1J1[8] = { 
     168constant float MO1J1[8] = { 
    169169    6.913942741265801E-002, 
    170170    -2.284801500053359E-001, 
     
    177177    }; 
    178178 
    179 constant_var float PH1J1[8] = { 
     179constant float PH1J1[8] = { 
    180180    -4.497014141919556E+001, 
    181181    5.073465654089319E+001, 
     
    188188    }; 
    189189 
    190 __device__ static 
     190static 
    191191float cephes_j1f(float xx) 
    192192{ 
     
    239239 
    240240//Finally J1c function that equals 2*J1(x)/x 
    241 __device__ static 
     241static 
    242242double sas_2J1x_x(double x) 
    243243{ 
Note: See TracChangeset for help on using the changeset viewer.