Changeset 508475a in sasmodels


Ignore:
Timestamp:
Oct 25, 2018 2:50:35 PM (4 weeks ago)
Author:
Paul Kienzle <pkienzle@…>
Branches:
beta_approx, cuda-test, py3, ticket-1015-gpu-mem-error, ticket-1157, ticket-608-user-defined-weights, ticket_1156
Children:
ac97d66, cc8b183
Parents:
8b31efa (diff), 2a12d8d8 (diff)
Note: this is a merge changeset, the changes displayed below correspond to the merge itself.
Use the (diff) links above to see all the changes relative to each parent.
git-author:
Paul Kienzle <pkienzle@…> (10/25/18 14:20:45)
git-committer:
Paul Kienzle <pkienzle@…> (10/25/18 14:50:35)
Message:

Merge branch 'ticket-1015-gpu-mem-error' into cuda-test

Files:
1 added
13 edited

Legend:

Unmodified
Added
Removed
  • doc/guide/magnetism/magnetism.rst

    rbefe905 rdf87acf  
    8989 
    9090===========   ================================================================ 
    91  M0:sld       $D_M M_0$ 
    92  mtheta:sld   $\theta_M$ 
    93  mphi:sld     $\phi_M$ 
    94  up:angle     $\theta_\mathrm{up}$ 
    95  up:frac_i    $u_i$ = (spin up)/(spin up + spin down) *before* the sample 
    96  up:frac_f    $u_f$ = (spin up)/(spin up + spin down) *after* the sample 
     91 sld_M0       $D_M M_0$ 
     92 sld_mtheta   $\theta_M$ 
     93 sld_mphi     $\phi_M$ 
     94 up_frac_i    $u_i$ = (spin up)/(spin up + spin down) *before* the sample 
     95 up_frac_f    $u_f$ = (spin up)/(spin up + spin down) *after* the sample 
     96 up_angle     $\theta_\mathrm{up}$ 
    9797===========   ================================================================ 
    9898 
    9999.. note:: 
    100     The values of the 'up:frac_i' and 'up:frac_f' must be in the range 0 to 1. 
     100    The values of the 'up_frac_i' and 'up_frac_f' must be in the range 0 to 1. 
    101101 
    102102*Document History* 
  • sasmodels/kernelcl.py

    r8b31efa r508475a  
    7676 
    7777from . import generate 
     78from .generate import F32, F64 
    7879from .kernel import KernelModel, Kernel 
    7980 
     
    133134 
    134135def use_opencl(): 
    135     env = os.environ.get("SAS_OPENCL", "").lower() 
    136     return HAVE_OPENCL and env != "none" and not env.startswith("cuda") 
     136    sas_opencl = os.environ.get("SAS_OPENCL", "OpenCL").lower() 
     137    return HAVE_OPENCL and sas_opencl != "none" and not sas_opencl.startswith("cuda") 
    137138 
    138139ENV = None 
     
    165166    Return true if device supports the requested precision. 
    166167    """ 
    167     if dtype == generate.F32: 
     168    if dtype == F32: 
    168169        return True 
    169170    elif dtype == generate.F64: 
     
    219220    """ 
    220221    GPU context, with possibly many devices, and one queue per device. 
     222 
     223    Because the environment can be reset during a live program (e.g., if the 
     224    user changes the active GPU device in the GUI), everything associated 
     225    with the device context must be cached in the environment and recreated 
     226    if the environment changes.  The *cache* attribute is a simple dictionary 
     227    which holds keys and references to objects, such as compiled kernels and 
     228    allocated buffers.  The running program should check in the cache for 
     229    long lived objects and create them if they are not there.  The program 
     230    should not hold onto cached objects, but instead only keep them active 
     231    for the duration of a function call.  When the environment is destroyed 
     232    then the *release* method for each active cache item is called before 
     233    the environment is freed.  This means that each cl buffer should be 
     234    in its own cache entry. 
    221235    """ 
    222236    def __init__(self): 
    223237        # type: () -> None 
    224238        # find gpu context 
    225         #self.context = cl.create_some_context() 
    226  
    227         self.context = None 
    228         if 'SAS_OPENCL' in os.environ: 
    229             # Set the PyOpenCL environment variable PYOPENCL_CTX  
    230             # from SAS_OPENCL=driver:device.  Ignore the generic 
    231             # SAS_OPENCL=opencl, which is used to select the default  
    232             # OpenCL device.  Don't need to check for "none" or 
    233             # "cuda" since use_opencl() would return False if they 
    234             # were defined, and we wouldn't get here. 
    235             dev_str = os.environ["SAS_OPENCL"] 
    236             if dev_str and dev_str.lower() != "opencl": 
    237                 os.environ["PYOPENCL_CTX"] = dev_str 
    238  
    239         if 'PYOPENCL_CTX' in os.environ: 
    240             self._create_some_context() 
    241  
    242         if not self.context: 
    243             self.context = _get_default_context() 
     239        context_list = _create_some_context() 
     240 
     241        # Find a context for F32 and for F64 (maybe the same one). 
     242        # F16 isn't good enough. 
     243        self.context = {} 
     244        for dtype in (F32, F64): 
     245            for context in context_list: 
     246                if has_type(context.devices[0], dtype): 
     247                    self.context[dtype] = context 
     248                    break 
     249            else: 
     250                self.context[dtype] = None 
     251 
     252        # Build a queue for each context 
     253        self.queue = {} 
     254        context = self.context[F32] 
     255        self.queue[F32] = cl.CommandQueue(context, context.devices[0]) 
     256        if self.context[F64] == self.context[F32]: 
     257            self.queue[F64] = self.queue[F32] 
     258        else: 
     259            context = self.context[F64] 
     260            self.queue[F64] = cl.CommandQueue(context, context.devices[0]) 
    244261 
    245262        # Byte boundary for data alignment 
    246         #self.data_boundary = max(d.min_data_type_align_size 
    247         #                         for d in self.context.devices) 
    248         self.queues = [cl.CommandQueue(context, context.devices[0]) 
    249                        for context in self.context] 
     263        #self.data_boundary = max(context.devices[0].min_data_type_align_size 
     264        #                         for context in self.context.values()) 
     265 
     266        # Cache for compiled programs, and for items in context 
    250267        self.compiled = {} 
     268        self.cache = {} 
    251269 
    252270    def has_type(self, dtype): 
     
    255273        Return True if all devices support a given type. 
    256274        """ 
    257         return any(has_type(d, dtype) 
    258                    for context in self.context 
    259                    for d in context.devices) 
    260  
    261     def get_queue(self, dtype): 
    262         # type: (np.dtype) -> cl.CommandQueue 
    263         """ 
    264         Return a command queue for the kernels of type dtype. 
    265         """ 
    266         for context, queue in zip(self.context, self.queues): 
    267             if all(has_type(d, dtype) for d in context.devices): 
    268                 return queue 
    269  
    270     def get_context(self, dtype): 
    271         # type: (np.dtype) -> cl.Context 
    272         """ 
    273         Return a OpenCL context for the kernels of type dtype. 
    274         """ 
    275         for context in self.context: 
    276             if all(has_type(d, dtype) for d in context.devices): 
    277                 return context 
    278  
    279     def _create_some_context(self): 
    280         # type: () -> cl.Context 
    281         """ 
    282         Protected call to cl.create_some_context without interactivity.  Use 
    283         this if SAS_OPENCL is set in the environment.  Sets the *context* 
    284         attribute. 
    285         """ 
    286         try: 
    287             self.context = [cl.create_some_context(interactive=False)] 
    288         except Exception as exc: 
    289             warnings.warn(str(exc)) 
    290             warnings.warn("pyopencl.create_some_context() failed") 
    291             warnings.warn("the environment variable 'SAS_OPENCL' might not be set correctly") 
     275        return self.context.get(dtype, None) is not None 
    292276 
    293277    def compile_program(self, name, source, dtype, fast, timestamp): 
     
    306290            del self.compiled[key] 
    307291        if key not in self.compiled: 
    308             context = self.get_context(dtype) 
     292            context = self.context[dtype] 
    309293            logging.info("building %s for OpenCL %s", key, 
    310294                         context.devices[0].name.strip()) 
    311             program = compile_model(self.get_context(dtype), 
     295            program = compile_model(self.context[dtype], 
    312296                                    str(source), dtype, fast) 
    313297            self.compiled[key] = (program, timestamp) 
    314298        return program 
     299 
     300    def free_buffer(self, key): 
     301        if key in self.cache: 
     302            self.cache[key].release() 
     303            del self.cache[key] 
     304 
     305    def __del__(self): 
     306        for v in self.cache.values(): 
     307            release = getattr(v, 'release', lambda: None) 
     308            release() 
     309        self.cache = {} 
     310 
     311_CURRENT_ID = 0 
     312def unique_id(): 
     313    global _CURRENT_ID 
     314    _CURRENT_ID += 1 
     315    return _CURRENT_ID 
     316 
     317def _create_some_context(): 
     318    # type: () -> cl.Context 
     319    """ 
     320    Protected call to cl.create_some_context without interactivity. 
     321 
     322    Uses SAS_OPENCL or PYOPENCL_CTX if they are set in the environment, 
     323    otherwise scans for the most appropriate device using 
     324    :func:`_get_default_context`.  Ignore *SAS_OPENCL=OpenCL*, which 
     325    indicates that an OpenCL device should be used without specifying 
     326    which one (and not a CUDA device, or no GPU). 
     327    """ 
     328    # Assume we do not get here if SAS_OPENCL is None or CUDA 
     329    sas_opencl = os.environ.get('SAS_OPENCL', 'opencl') 
     330    if sas_opencl.lower() != 'opencl': 
     331        # Setting PYOPENCL_CTX as a SAS_OPENCL to create cl context 
     332        os.environ["PYOPENCL_CTX"] = sas_opencl 
     333 
     334    if 'PYOPENCL_CTX' in os.environ: 
     335        try: 
     336            return [cl.create_some_context(interactive=False)] 
     337        except Exception as exc: 
     338            warnings.warn(str(exc)) 
     339            warnings.warn("pyopencl.create_some_context() failed") 
     340            warnings.warn("the environment variable 'SAS_OPENCL' or 'PYOPENCL_CTX' might not be set correctly") 
     341 
     342    return _get_default_context() 
    315343 
    316344def _get_default_context(): 
     
    392420        self.dtype = dtype 
    393421        self.fast = fast 
    394         self.program = None # delay program creation 
    395         self._kernels = None 
     422        self.timestamp = generate.ocl_timestamp(self.info) 
     423        self._cache_key = unique_id() 
    396424 
    397425    def __getstate__(self): 
     
    402430        # type: (Tuple[ModelInfo, str, np.dtype, bool]) -> None 
    403431        self.info, self.source, self.dtype, self.fast = state 
    404         self.program = None 
    405432 
    406433    def make_kernel(self, q_vectors): 
    407434        # type: (List[np.ndarray]) -> "GpuKernel" 
    408         if self.program is None: 
    409             compile_program = environment().compile_program 
    410             timestamp = generate.ocl_timestamp(self.info) 
    411             self.program = compile_program( 
     435        return GpuKernel(self, q_vectors) 
     436 
     437    @property 
     438    def Iq(self): 
     439        return self._fetch_kernel('Iq') 
     440 
     441    def fetch_kernel(self, name): 
     442        # type: (str) -> cl.Kernel 
     443        """ 
     444        Fetch the kernel from the environment by name, compiling it if it 
     445        does not already exist. 
     446        """ 
     447        gpu = environment() 
     448        key = self._cache_key 
     449        if key not in gpu.cache: 
     450            program = gpu.compile_program( 
    412451                self.info.name, 
    413452                self.source['opencl'], 
    414453                self.dtype, 
    415454                self.fast, 
    416                 timestamp) 
     455                self.timestamp) 
    417456            variants = ['Iq', 'Iqxy', 'Imagnetic'] 
    418457            names = [generate.kernel_name(self.info, k) for k in variants] 
    419             kernels = [getattr(self.program, k) for k in names] 
    420             self._kernels = dict((k, v) for k, v in zip(variants, kernels)) 
    421         is_2d = len(q_vectors) == 2 
    422         if is_2d: 
    423             kernel = [self._kernels['Iqxy'], self._kernels['Imagnetic']] 
     458            kernels = [getattr(program, k) for k in names] 
     459            data = dict((k, v) for k, v in zip(variants, kernels)) 
     460            # keep a handle to program so GC doesn't collect 
     461            data['program'] = program 
     462            gpu.cache[key] = data 
    424463        else: 
    425             kernel = [self._kernels['Iq']]*2 
    426         return GpuKernel(kernel, self.dtype, self.info, q_vectors) 
    427  
    428     def release(self): 
    429         # type: () -> None 
    430         """ 
    431         Free the resources associated with the model. 
    432         """ 
    433         if self.program is not None: 
    434             self.program = None 
    435  
    436     def __del__(self): 
    437         # type: () -> None 
    438         self.release() 
     464            data = gpu.cache[key] 
     465        return data[name] 
    439466 
    440467# TODO: check that we don't need a destructor for buffers which go out of scope 
     
    461488        # type: (List[np.ndarray], np.dtype) -> None 
    462489        # TODO: do we ever need double precision q? 
    463         env = environment() 
    464490        self.nq = q_vectors[0].size 
    465491        self.dtype = np.dtype(dtype) 
     
    481507            self.q[:self.nq] = q_vectors[0] 
    482508        self.global_size = [self.q.shape[0]] 
    483         context = env.get_context(self.dtype) 
    484         #print("creating inputs of size", self.global_size) 
    485         self.q_b = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, 
    486                              hostbuf=self.q) 
     509        self._cache_key = unique_id() 
     510 
     511    @property 
     512    def q_b(self): 
     513        """Lazy creation of q buffer so it can survive context reset""" 
     514        env = environment() 
     515        key = self._cache_key 
     516        if key not in env.cache: 
     517            context = env.context[self.dtype] 
     518            #print("creating inputs of size", self.global_size) 
     519            buffer = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, 
     520                               hostbuf=self.q) 
     521            env.cache[key] = buffer 
     522        return env.cache[key] 
    487523 
    488524    def release(self): 
    489525        # type: () -> None 
    490526        """ 
    491         Free the memory. 
    492         """ 
    493         if self.q_b is not None: 
    494             self.q_b.release() 
    495             self.q_b = None 
     527        Free the buffer associated with the q value 
     528        """ 
     529        environment().free_buffer(id(self)) 
    496530 
    497531    def __del__(self): 
     
    503537    Callable SAS kernel. 
    504538 
    505     *kernel* is the GpuKernel object to call 
    506  
    507     *model_info* is the module information 
    508  
    509     *q_vectors* is the q vectors at which the kernel should be evaluated 
     539    *model* is the GpuModel object to call 
     540 
     541    The following attributes are defined: 
     542 
     543    *info* is the module information 
    510544 
    511545    *dtype* is the kernel precision 
     546 
     547    *dim* is '1d' or '2d' 
     548 
     549    *result* is a vector to contain the results of the call 
    512550 
    513551    The resulting call method takes the *pars*, a list of values for 
     
    519557    Call :meth:`release` when done with the kernel instance. 
    520558    """ 
    521     def __init__(self, kernel, dtype, model_info, q_vectors): 
     559    def __init__(self, model, q_vectors): 
    522560        # type: (cl.Kernel, np.dtype, ModelInfo, List[np.ndarray]) -> None 
    523         q_input = GpuInput(q_vectors, dtype) 
    524         self.kernel = kernel 
    525         self.info = model_info 
    526         self.dtype = dtype 
    527         self.dim = '2d' if q_input.is_2d else '1d' 
    528         # plus three for the normalization values 
    529         self.result = np.empty(q_input.nq+1, dtype) 
    530  
    531         # Inputs and outputs for each kernel call 
    532         # Note: res may be shorter than res_b if global_size != nq 
     561        dtype = model.dtype 
     562        self.q_input = GpuInput(q_vectors, dtype) 
     563        self._model = model 
     564        self._as_dtype = (np.float32 if dtype == generate.F32 
     565                          else np.float64 if dtype == generate.F64 
     566                          else np.float16 if dtype == generate.F16 
     567                          else np.float32)  # will never get here, so use np.float32 
     568        self._cache_key = unique_id() 
     569 
     570        # attributes accessed from the outside 
     571        self.dim = '2d' if self.q_input.is_2d else '1d' 
     572        self.info = model.info 
     573        self.dtype = model.dtype 
     574 
     575        # holding place for the returned value 
     576        # plus one for the normalization values 
     577        self.result = np.empty(self.q_input.nq+1, dtype) 
     578 
     579    @property 
     580    def _result_b(self): 
     581        """Lazy creation of result buffer so it can survive context reset""" 
    533582        env = environment() 
    534         self.queue = env.get_queue(dtype) 
    535  
    536         self.result_b = cl.Buffer(self.queue.context, mf.READ_WRITE, 
    537                                   q_input.global_size[0] * dtype.itemsize) 
    538         self.q_input = q_input # allocated by GpuInput above 
    539  
    540         self._need_release = [self.result_b, self.q_input] 
    541         self.real = (np.float32 if dtype == generate.F32 
    542                      else np.float64 if dtype == generate.F64 
    543                      else np.float16 if dtype == generate.F16 
    544                      else np.float32)  # will never get here, so use np.float32 
     583        key = self._cache_key 
     584        if key not in env.cache: 
     585            context = env.context[self.dtype] 
     586            #print("creating inputs of size", self.global_size) 
     587            buffer = cl.Buffer(context, mf.READ_WRITE, 
     588                               self.q_input.global_size[0] * self.dtype.itemsize) 
     589            env.cache[key] = buffer 
     590        return env.cache[key] 
    545591 
    546592    def __call__(self, call_details, values, cutoff, magnetic): 
    547593        # type: (CallDetails, np.ndarray, np.ndarray, float, bool) -> np.ndarray 
    548         context = self.queue.context 
    549         # Arrange data transfer to card 
     594        env = environment() 
     595        queue = env.queue[self._model.dtype] 
     596        context = queue.context 
     597 
     598        # Arrange data transfer to/from card 
     599        q_b = self.q_input.q_b 
     600        result_b = self._result_b 
    550601        details_b = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, 
    551602                              hostbuf=call_details.buffer) 
     
    553604                             hostbuf=values) 
    554605 
    555         kernel = self.kernel[1 if magnetic else 0] 
    556         args = [ 
     606        name = 'Iq' if self.dim == '1d' else 'Imagnetic' if magnetic else 'Iqxy' 
     607        kernel = self._model.fetch_kernel(name) 
     608        kernel_args = [ 
    557609            np.uint32(self.q_input.nq), None, None, 
    558             details_b, values_b, self.q_input.q_b, self.result_b, 
    559             self.real(cutoff), 
     610            details_b, values_b, q_b, result_b, 
     611            self._as_dtype(cutoff), 
    560612        ] 
    561613        #print("Calling OpenCL") 
     
    568620            stop = min(start + step, call_details.num_eval) 
    569621            #print("queuing",start,stop) 
    570             args[1:3] = [np.int32(start), np.int32(stop)] 
    571             wait_for = [kernel(self.queue, self.q_input.global_size, None, 
    572                                *args, wait_for=wait_for)] 
     622            kernel_args[1:3] = [np.int32(start), np.int32(stop)] 
     623            wait_for = [kernel(queue, self.q_input.global_size, None, 
     624                               *kernel_args, wait_for=wait_for)] 
    573625            if stop < call_details.num_eval: 
    574626                # Allow other processes to run 
     
    578630                    time.sleep(0.001) 
    579631                    last_nap = current_time 
    580         cl.enqueue_copy(self.queue, self.result, self.result_b) 
     632        cl.enqueue_copy(queue, self.result, result_b, wait_for=wait_for) 
    581633        #print("result", self.result) 
    582634 
     
    598650        Release resources associated with the kernel. 
    599651        """ 
    600         for v in self._need_release: 
    601             v.release() 
    602         self._need_release = [] 
     652        environment().free_buffer(id(self)) 
     653        self.q_input.release() 
    603654 
    604655    def __del__(self): 
  • sasmodels/models/spinodal.py

    r475ff58 r93fe8a1  
    1212where $x=q/q_0$, $q_0$ is the peak position, $I_{max}$ is the intensity  
    1313at $q_0$ (parameterised as the $scale$ parameter), and $B$ is a flat  
    14 background. The spinodal wavelength is given by $2\pi/q_0$.  
     14background. The spinodal wavelength, $\Lambda$, is given by $2\pi/q_0$.  
     15 
     16The definition of $I_{max}$ in the literature varies. Hashimoto *et al* (1991)  
     17define it as  
     18 
     19.. math:: 
     20    I_{max} = \Lambda^3\Delta\rho^2 
     21     
     22whereas Meier & Strobl (1987) give  
     23 
     24.. math:: 
     25    I_{max} = V_z\Delta\rho^2 
     26     
     27where $V_z$ is the volume per monomer unit. 
    1528 
    1629The exponent $\gamma$ is equal to $d+1$ for off-critical concentration  
     
    2841 
    2942H. Furukawa. Dynamics-scaling theory for phase-separating unmixing mixtures: 
    30 Growth rates of droplets and scaling properties of autocorrelation functions. 
    31 Physica A 123,497 (1984). 
     43Growth rates of droplets and scaling properties of autocorrelation functions.  
     44Physica A 123, 497 (1984). 
     45 
     46H. Meier & G. Strobl. Small-Angle X-ray Scattering Study of Spinodal  
     47Decomposition in Polystyrene/Poly(styrene-co-bromostyrene) Blends.  
     48Macromolecules 20, 649-654 (1987). 
     49 
     50T. Hashimoto, M. Takenaka & H. Jinnai. Scattering Studies of Self-Assembling  
     51Processes of Polymer Blends in Spinodal Decomposition.  
     52J. Appl. Cryst. 24, 457-466 (1991). 
    3253 
    3354Revision History 
     
    3556 
    3657* **Author:**  Dirk Honecker **Date:** Oct 7, 2016 
    37 * **Revised:** Steve King    **Date:** Sep 7, 2018 
     58* **Revised:** Steve King    **Date:** Oct 25, 2018 
    3859""" 
    3960 
  • setup.py

    r1f991d6 r783e76f  
    2929                return version[1:-1] 
    3030    raise RuntimeError("Could not read version from %s/__init__.py"%package) 
     31 
     32install_requires = ['numpy', 'scipy'] 
     33 
     34if sys.platform=='win32' or sys.platform=='cygwin': 
     35    install_requires.append('tinycc') 
    3136 
    3237setup( 
     
    6166        'sasmodels': ['*.c', '*.cl'], 
    6267    }, 
    63     install_requires=[ 
    64     ], 
     68    install_requires=install_requires, 
    6569    extras_require={ 
     70        'full': ['docutils', 'bumps', 'matplotlib'], 
     71        'server': ['bumps'], 
    6672        'OpenCL': ["pyopencl"], 
    67         'Bumps': ["bumps"], 
    68         'TinyCC': ["tinycc"], 
    6973    }, 
    7074    build_requires=['setuptools'], 
  • doc/guide/gpu_setup.rst

    r63602b1 r8b31efa  
    9494Device Selection 
    9595================ 
     96**OpenCL drivers** 
     97 
    9698If you have multiple GPU devices you can tell the program which device to use. 
    9799By default, the program looks for one GPU and one CPU device from available 
     
    104106was used to run the model. 
    105107 
    106 **If you don't want to use OpenCL, you can set** *SAS_OPENCL=None* 
    107 **in your environment settings, and it will only use normal programs.** 
    108  
    109 If you want to use one of the other devices, you can run the following 
     108If you want to use a specific driver and devices, you can run the following 
    110109from the python console:: 
    111110 
     
    115114This will provide a menu of different OpenCL drivers available. 
    116115When one is selected, it will say "set PYOPENCL_CTX=..." 
    117 Use that value as the value of *SAS_OPENCL*. 
     116Use that value as the value of *SAS_OPENCL=driver:device*. 
     117 
     118To use the default OpenCL device (rather than CUDA or None), 
     119set *SAS_OPENCL=opencl*. 
     120 
     121In batch queues, you may need to set *XDG_CACHE_HOME=~/.cache*  
     122(Linux only) to a different directory, depending on how the filesystem  
     123is configured.  You should also set *SAS_DLL_PATH* for CPU-only modules. 
     124 
     125    -DSAS_MODELPATH=path sets directory containing custom models 
     126    -DSAS_OPENCL=vendor:device|cuda:device|none sets the target GPU device 
     127    -DXDG_CACHE_HOME=~/.cache sets the pyopencl cache root (linux only) 
     128    -DSAS_COMPILER=tinycc|msvc|mingw|unix sets the DLL compiler 
     129    -DSAS_OPENMP=1 turns on OpenMP for the DLLs 
     130    -DSAS_DLL_PATH=path sets the path to the compiled modules 
     131 
     132 
     133**CUDA drivers** 
     134 
     135If OpenCL drivers are not available on your system, but NVidia CUDA 
     136drivers are available, then set *SAS_OPENCL=cuda* or 
     137*SAS_OPENCL=cuda:n* for a particular device number *n*.  If no device 
     138number is specified, then the CUDA drivers looks for look for 
     139*CUDA_DEVICE=n* or a file ~/.cuda-device containing n for the device number. 
     140 
     141In batch queues, the SLURM command *sbatch --gres=gpu:1 ...* will set 
     142*CUDA_VISIBLE_DEVICES=n*, which ought to set the correct device 
     143number for *SAS_OPENCL=cuda*.  If not, then set 
     144*CUDA_DEVICE=$CUDA_VISIBLE_DEVICES* within the batch script.  You may 
     145need to set the CUDA cache directory to a folder accessible across the 
     146cluster with *PYCUDA_CACHE_DIR* (or *PYCUDA_DISABLE_CACHE* to disable 
     147caching), and you may need to set environment specific compiler flags 
     148with *PYCUDA_DEFAULT_NVCC_FLAGS*.  You should also set *SAS_DLL_PATH*  
     149for CPU-only modules. 
     150 
     151**No GPU support** 
     152 
     153If you don't want to use OpenCL or CUDA, you can set *SAS_OPENCL=None* 
     154in your environment settings, and it will only use normal programs. 
     155 
     156In batch queues, you may need to set *SAS_DLL_PATH* to a directory 
     157accessible on the compute node. 
     158 
    118159 
    119160Device Testing 
     
    154195*Document History* 
    155196 
    156 | 2017-09-27 Paul Kienzle 
     197| 2018-10-15 Paul Kienzle 
  • sasmodels/compare.py

    r610ef23 r4de14584  
    4141from . import kerneldll 
    4242from . import kernelcl 
     43from . import kernelcuda 
    4344from .data import plot_theory, empty_data1D, empty_data2D, load_data 
    4445from .direct_model import DirectModel, get_mesh 
     
    115116    === environment variables === 
    116117    -DSAS_MODELPATH=path sets directory containing custom models 
    117     -DSAS_OPENCL=vendor:device|none sets the target OpenCL device 
     118    -DSAS_OPENCL=vendor:device|cuda:device|none sets the target GPU device 
    118119    -DXDG_CACHE_HOME=~/.cache sets the pyopencl cache root (linux only) 
    119120    -DSAS_COMPILER=tinycc|msvc|mingw|unix sets the DLL compiler 
     
    725726        set_integration_size(model_info, ngauss) 
    726727 
    727     if dtype != "default" and not dtype.endswith('!') and not kernelcl.use_opencl(): 
     728    if (dtype != "default" and not dtype.endswith('!')  
     729            and not (kernelcl.use_opencl() or kernelcuda.use_cuda())): 
    728730        raise RuntimeError("OpenCL not available " + kernelcl.OPENCL_ERROR) 
    729731 
  • sasmodels/core.py

    r2dcd6e7 rb0de252  
    2121from . import mixture 
    2222from . import kernelpy 
     23from . import kernelcuda 
    2324from . import kernelcl 
    2425from . import kerneldll 
     
    210211        #print("building dll", numpy_dtype) 
    211212        return kerneldll.load_dll(source['dll'], model_info, numpy_dtype) 
     213    elif platform == "cuda": 
     214        return kernelcuda.GpuModel(source, model_info, numpy_dtype, fast=fast) 
    212215    else: 
    213216        #print("building ocl", numpy_dtype) 
     
    245248    # type: (ModelInfo, str, str) -> (np.dtype, bool, str) 
    246249    """ 
    247     Interpret dtype string, returning np.dtype and fast flag. 
     250    Interpret dtype string, returning np.dtype, fast flag and platform. 
    248251 
    249252    Possible types include 'half', 'single', 'double' and 'quad'.  If the 
     
    253256    default for the model and platform. 
    254257 
    255     Platform preference can be specfied ("ocl" vs "dll"), with the default 
    256     being OpenCL if it is availabe.  If the dtype name ends with '!' then 
    257     platform is forced to be DLL rather than OpenCL. 
     258    Platform preference can be specfied ("ocl", "cuda", "dll"), with the 
     259    default being OpenCL or CUDA if available, otherwise DLL.  If the dtype 
     260    name ends with '!' then platform is forced to be DLL rather than GPU. 
     261    The default platform is set by the environment variable SAS_OPENCL, 
     262    SAS_OPENCL=driver:device for OpenCL, SAS_OPENCL=cuda:device for CUDA 
     263    or SAS_OPENCL=none for DLL. 
    258264 
    259265    This routine ignores the preferences within the model definition.  This 
     
    268274    if platform is None: 
    269275        platform = "ocl" 
    270     if not kernelcl.use_opencl() or not model_info.opencl: 
    271         platform = "dll" 
    272276 
    273277    # Check if type indicates dll regardless of which platform is given 
     
    275279        platform = "dll" 
    276280        dtype = dtype[:-1] 
     281 
     282    # Make sure model allows opencl/gpu 
     283    if not model_info.opencl: 
     284        platform = "dll" 
     285 
     286    # Make sure opencl is available, or fallback to cuda then to dll 
     287    if platform == "ocl" and not kernelcl.use_opencl(): 
     288        platform = "cuda" if kernelcuda.use_cuda() else "dll" 
    277289 
    278290    # Convert special type names "half", "fast", and "quad" 
     
    285297        dtype = "float16" 
    286298 
    287     # Convert dtype string to numpy dtype. 
     299    # Convert dtype string to numpy dtype.  Use single precision for GPU 
     300    # if model allows it, otherwise use double precision. 
    288301    if dtype is None or dtype == "default": 
    289         numpy_dtype = (generate.F32 if platform == "ocl" and model_info.single 
     302        numpy_dtype = (generate.F32 if model_info.single and platform in ("ocl", "cuda") 
    290303                       else generate.F64) 
    291304    else: 
    292305        numpy_dtype = np.dtype(dtype) 
    293306 
    294     # Make sure that the type is supported by opencl, otherwise use dll 
     307    # Make sure that the type is supported by GPU, otherwise use dll 
    295308    if platform == "ocl": 
    296309        env = kernelcl.environment() 
    297         if not env.has_type(numpy_dtype): 
    298             platform = "dll" 
    299             if dtype is None: 
    300                 numpy_dtype = generate.F64 
     310    elif platform == "cuda": 
     311        env = kernelcuda.environment() 
     312    else: 
     313        env = None 
     314    if env is not None and not env.has_type(numpy_dtype): 
     315        platform = "dll" 
     316        if dtype is None: 
     317            numpy_dtype = generate.F64 
    301318 
    302319    return numpy_dtype, fast, platform 
  • sasmodels/kernel_header.c

    r108e70e r74e9b5f  
    11#ifdef __OPENCL_VERSION__ 
    22# define USE_OPENCL 
     3#elif defined(__CUDACC__) 
     4# define USE_CUDA 
    35#elif defined(_OPENMP) 
    46# define USE_OPENMP 
    57#endif 
     8 
     9// Use SAS_DOUBLE to force the use of double even for float kernels 
     10#define SAS_DOUBLE dou ## ble 
    611 
    712// If opencl is not available, then we are compiling a C function 
    813// Note: if using a C++ compiler, then define kernel as extern "C" 
    914#ifdef USE_OPENCL 
     15 
     16   #define USE_GPU 
     17   #define pglobal global 
     18   #define pconstant constant 
     19 
    1020   typedef int int32_t; 
    11 #  if defined(USE_SINCOS) 
    12 #    define SINCOS(angle,svar,cvar) svar=sincos(angle,&cvar) 
    13 #  else 
    14 #    define SINCOS(angle,svar,cvar) do {const double _t_=angle; svar=sin(_t_);cvar=cos(_t_);} while (0) 
    15 #  endif 
     21 
     22   #if defined(USE_SINCOS) 
     23   #  define SINCOS(angle,svar,cvar) svar=sincos(angle,&cvar) 
     24   #else 
     25   #  define SINCOS(angle,svar,cvar) do {const double _t_=angle; svar=sin(_t_);cvar=cos(_t_);} while (0) 
     26   #endif 
    1627   // Intel CPU on Mac gives strange values for erf(); on the verified 
    1728   // platforms (intel, nvidia, amd), the cephes erf() is significantly 
     
    2435   #  define erfcf erfc 
    2536   #endif 
    26 #else // !USE_OPENCL 
    27 // Use SAS_DOUBLE to force the use of double even for float kernels 
    28 #  define SAS_DOUBLE dou ## ble 
    29 #  ifdef __cplusplus 
     37 
     38#elif defined(USE_CUDA) 
     39 
     40   #define USE_GPU 
     41   #define local __shared__ 
     42   #define pglobal 
     43   #define constant __constant__ 
     44   #define pconstant const 
     45   #define kernel extern "C" __global__ 
     46 
     47   // OpenCL powr(a,b) = C99 pow(a,b), b >= 0 
     48   // OpenCL pown(a,b) = C99 pow(a,b), b integer 
     49   #define powr(a,b) pow(a,b) 
     50   #define pown(a,b) pow(a,b) 
     51   //typedef int int32_t; 
     52   #if defined(USE_SINCOS) 
     53   #  define SINCOS(angle,svar,cvar) sincos(angle,&svar,&cvar) 
     54   #else 
     55   #  define SINCOS(angle,svar,cvar) do {const double _t_=angle; svar=sin(_t_);cvar=cos(_t_);} while (0) 
     56   #endif 
     57 
     58#else // !USE_OPENCL && !USE_CUDA 
     59 
     60   #define local 
     61   #define pglobal 
     62   #define constant const 
     63   #define pconstant const 
     64 
     65   #ifdef __cplusplus 
    3066      #include <cstdio> 
    3167      #include <cmath> 
     
    5187     #endif 
    5288     inline void SINCOS(double angle, double &svar, double &cvar) { svar=sin(angle); cvar=cos(angle); } 
    53 else // !__cplusplus 
     89   #else // !__cplusplus 
    5490     #include <inttypes.h>  // C99 guarantees that int32_t types is here 
    5591     #include <stdio.h> 
     
    76112     #define kernel 
    77113     #define SINCOS(angle,svar,cvar) do {const double _t_=angle; svar=sin(_t_);cvar=cos(_t_);} while (0) 
    78 #  endif  // !__cplusplus 
    79 #  define global 
    80 #  define local 
    81 #  define constant const 
    82 // OpenCL powr(a,b) = C99 pow(a,b), b >= 0 
    83 // OpenCL pown(a,b) = C99 pow(a,b), b integer 
    84 #  define powr(a,b) pow(a,b) 
    85 #  define pown(a,b) pow(a,b) 
     114   #endif  // !__cplusplus 
     115   // OpenCL powr(a,b) = C99 pow(a,b), b >= 0 
     116   // OpenCL pown(a,b) = C99 pow(a,b), b integer 
     117   #define powr(a,b) pow(a,b) 
     118   #define pown(a,b) pow(a,b) 
     119 
    86120#endif // !USE_OPENCL 
    87121 
  • sasmodels/kernel_iq.c

    r70530778 r74e9b5f  
    278278    const int32_t pd_start,     // where we are in the dispersity loop 
    279279    const int32_t pd_stop,      // where we are stopping in the dispersity loop 
    280     global const ProblemDetails *details, 
    281     global const double *values, 
    282     global const double *q, // nq q values, with padding to boundary 
    283     global double *result,  // nq+1 return values, again with padding 
     280    pglobal const ProblemDetails *details, 
     281    pglobal const double *values, 
     282    pglobal const double *q, // nq q values, with padding to boundary 
     283    pglobal double *result,  // nq+1 return values, again with padding 
    284284    const double cutoff     // cutoff in the dispersity weight product 
    285285    ) 
    286286{ 
    287 #ifdef USE_OPENCL 
     287#if defined(USE_GPU) 
    288288  // who we are and what element we are working with 
     289  #if defined(USE_OPENCL) 
    289290  const int q_index = get_global_id(0); 
     291  #else // USE_CUDA 
     292  const int q_index = threadIdx.x + blockIdx.x * blockDim.x; 
     293  #endif 
    290294  if (q_index >= nq) return; 
    291295#else 
     
    340344  // seeing one q value (stored in the variable "this_result") while the dll 
    341345  // version must loop over all q. 
    342   #ifdef USE_OPENCL 
     346  #if defined(USE_GPU) 
    343347    double pd_norm = (pd_start == 0 ? 0.0 : result[nq]); 
    344348    double this_result = (pd_start == 0 ? 0.0 : result[q_index]); 
    345   #else // !USE_OPENCL 
     349  #else // !USE_GPU 
    346350    double pd_norm = (pd_start == 0 ? 0.0 : result[nq]); 
    347351    if (pd_start == 0) { 
     
    352356    } 
    353357    //if (q_index==0) printf("start %d %g %g\n", pd_start, pd_norm, result[0]); 
    354 #endif // !USE_OPENCL 
     358#endif // !USE_GPU 
    355359 
    356360 
     
    375379  const int n4 = pd_length[4]; 
    376380  const int p4 = pd_par[4]; 
    377   global const double *v4 = pd_value + pd_offset[4]; 
    378   global const double *w4 = pd_weight + pd_offset[4]; 
     381  pglobal const double *v4 = pd_value + pd_offset[4]; 
     382  pglobal const double *w4 = pd_weight + pd_offset[4]; 
    379383  int i4 = (pd_start/pd_stride[4])%n4;  // position in level 4 at pd_start 
    380384 
     
    562566  const int n##_LOOP = details->pd_length[_LOOP]; \ 
    563567  const int p##_LOOP = details->pd_par[_LOOP]; \ 
    564   global const double *v##_LOOP = pd_value + details->pd_offset[_LOOP]; \ 
    565   global const double *w##_LOOP = pd_weight + details->pd_offset[_LOOP]; \ 
     568  pglobal const double *v##_LOOP = pd_value + details->pd_offset[_LOOP]; \ 
     569  pglobal const double *w##_LOOP = pd_weight + details->pd_offset[_LOOP]; \ 
    566570  int i##_LOOP = (pd_start/details->pd_stride[_LOOP])%n##_LOOP; 
    567571 
     
    587591// Pointers to the start of the dispersity and weight vectors, if needed. 
    588592#if MAX_PD>0 
    589   global const double *pd_value = values + NUM_VALUES; 
    590   global const double *pd_weight = pd_value + details->num_weights; 
     593  pglobal const double *pd_value = values + NUM_VALUES; 
     594  pglobal const double *pd_weight = pd_value + details->num_weights; 
    591595#endif 
    592596 
     
    648652      BUILD_ROTATION(); 
    649653 
    650 #ifndef USE_OPENCL 
     654#if !defined(USE_GPU) 
    651655      // DLL needs to explicitly loop over the q values. 
    652656      #ifdef USE_OPENMP 
     
    654658      #endif 
    655659      for (q_index=0; q_index<nq; q_index++) 
    656 #endif // !USE_OPENCL 
     660#endif // !USE_GPU 
    657661      { 
    658662 
     
    697701//printf("q_index:%d %g %g %g %g\n", q_index, scattering, weight0); 
    698702 
    699         #ifdef USE_OPENCL 
     703        #if defined(USE_GPU) 
    700704          this_result += weight * scattering; 
    701         #else // !USE_OPENCL 
     705        #else // !USE_GPU 
    702706          result[q_index] += weight * scattering; 
    703         #endif // !USE_OPENCL 
     707        #endif // !USE_GPU 
    704708      } 
    705709    } 
     
    725729 
    726730// Remember the current result and the updated norm. 
    727 #ifdef USE_OPENCL 
     731#if defined(USE_GPU) 
    728732  result[q_index] = this_result; 
    729733  if (q_index == 0) result[nq] = pd_norm; 
    730734//if (q_index == 0) printf("res: %g/%g\n", result[0], pd_norm); 
    731 #else // !USE_OPENCL 
     735#else // !USE_GPU 
    732736  result[nq] = pd_norm; 
    733737//printf("res: %g/%g\n", result[0], pd_norm); 
    734 #endif // !USE_OPENCL 
     738#endif // !USE_GPU 
    735739 
    736740// ** clear the macros in preparation for the next kernel ** 
  • sasmodels/model_test.py

    r012cd34 r74e9b5f  
    55Usage:: 
    66 
    7     python -m sasmodels.model_test [opencl|dll|opencl_and_dll] model1 model2 ... 
     7    python -m sasmodels.model_test [opencl|cuda|dll] model1 model2 ... 
    88 
    99    if model1 is 'all', then all except the remaining models will be tested 
     
    6363from .modelinfo import expand_pars 
    6464from .kernelcl import use_opencl 
     65from .kernelcuda import use_cuda 
    6566 
    6667# pylint: disable=unused-import 
     
    8081    Construct the pyunit test suite. 
    8182 
    82     *loaders* is the list of kernel drivers to use, which is one of 
    83     *["dll", "opencl"]*, *["dll"]* or *["opencl"]*.  For python models, 
    84     the python driver is always used. 
     83    *loaders* is the list of kernel drivers to use (dll, opencl or cuda). 
     84    For python model the python driver is always used. 
    8585 
    8686    *models* is the list of models to test, or *["all"]* to test all models. 
     
    135135 
    136136            # test using dll if desired 
    137             if 'dll' in loaders or not use_opencl(): 
     137            if 'dll' in loaders: 
    138138                test_name = "%s-dll"%model_name 
    139139                test_method_name = "test_%s_dll" % model_info.id 
     
    156156                                     test_method_name, 
    157157                                     platform="ocl", dtype=None, 
     158                                     stash=stash) 
     159                #print("defining", test_name) 
     160                suite.addTest(test) 
     161 
     162            # test using cuda if desired and available 
     163            if 'cuda' in loaders and use_cuda(): 
     164                test_name = "%s-cuda"%model_name 
     165                test_method_name = "test_%s_cuda" % model_info.id 
     166                # Using dtype=None so that the models that are only 
     167                # correct for double precision are not tested using 
     168                # single precision.  The choice is determined by the 
     169                # presence of *single=False* in the model file. 
     170                test = ModelTestCase(test_name, model_info, 
     171                                     test_method_name, 
     172                                     platform="cuda", dtype=None, 
    158173                                     stash=stash) 
    159174                #print("defining", test_name) 
     
    220235 
    221236                # Check for missing tests.  Only do so for the "dll" tests 
    222                 # to reduce noise from both opencl and dll, and because 
     237                # to reduce noise from both opencl and cuda, and because 
    223238                # python kernels use platform="dll". 
    224239                if self.platform == "dll": 
     
    368383 
    369384    # Build a test suite containing just the model 
    370     loaders = ['opencl'] if use_opencl() else ['dll'] 
     385    loader = 'opencl' if use_opencl() else 'cuda' if use_cuda() else 'dll' 
    371386    models = [model] 
    372387    try: 
    373         suite = make_suite(loaders, models) 
     388        suite = make_suite([loader], models) 
    374389    except Exception: 
    375390        import traceback 
     
    434449        loaders = ['opencl'] 
    435450        models = models[1:] 
     451    elif models and models[0] == 'cuda': 
     452        if not use_cuda(): 
     453            print("cuda is not available") 
     454            return 1 
     455        loaders = ['cuda'] 
     456        models = models[1:] 
    436457    elif models and models[0] == 'dll': 
    437458        # TODO: test if compiler is available? 
    438459        loaders = ['dll'] 
    439460        models = models[1:] 
    440     elif models and models[0] == 'opencl_and_dll': 
    441         loaders = ['opencl', 'dll'] if use_opencl() else ['dll'] 
    442         models = models[1:] 
    443461    else: 
    444         loaders = ['opencl', 'dll'] if use_opencl() else ['dll'] 
     462        loaders = ['dll'] 
     463        if use_opencl(): 
     464            loaders.append('opencl') 
     465        if use_cuda(): 
     466            loaders.append('cuda') 
    445467    if not models: 
    446468        print("""\ 
    447469usage: 
    448   python -m sasmodels.model_test [-v] [opencl|dll] model1 model2 ... 
     470  python -m sasmodels.model_test [-v] [opencl|cuda|dll] model1 model2 ... 
    449471 
    450472If -v is included on the command line, then use verbose output. 
    451473 
    452 If neither opencl nor dll is specified, then models will be tested with 
    453 both OpenCL and dll; the compute target is ignored for pure python models. 
     474If no platform is specified, then models will be tested with dll, and 
     475if available, OpenCL and CUDA; the compute target is ignored for pure python models. 
    454476 
    455477If model1 is 'all', then all except the remaining models will be tested. 
     
    471493    Run "nosetests sasmodels" on the command line to invoke it. 
    472494    """ 
    473     loaders = ['opencl', 'dll'] if use_opencl() else ['dll'] 
     495    loaders = ['dll'] 
     496    if use_opencl(): 
     497        loaders.append('opencl') 
     498    if use_cuda(): 
     499        loaders.append('cuda') 
    474500    tests = make_suite(loaders, ['all']) 
    475501    def build_test(test): 
  • sasmodels/models/lib/gauss76.c

    r99b84ec r74e9b5f  
    1111 
    1212// Gaussians 
    13 constant double Gauss76Wt[76]={ 
     13constant double Gauss76Wt[76] = { 
    1414        .00126779163408536,             //0 
    1515        .00294910295364247, 
     
    9090}; 
    9191 
    92 constant double Gauss76Z[76]={ 
     92constant double Gauss76Z[76] = { 
    9393        -.999505948362153,              //0 
    9494        -.997397786355355, 
  • sasmodels/models/lib/polevl.c

    r447e9aa r74e9b5f  
    5151*/ 
    5252 
    53 double polevl( double x, constant double *coef, int N ); 
    54 double polevl( double x, constant double *coef, int N ) 
     53static 
     54double polevl( double x, pconstant double *coef, int N ) 
    5555{ 
    5656 
     
    7272 */ 
    7373 
    74 double p1evl( double x, constant double *coef, int N ); 
    75 double p1evl( double x, constant double *coef, int N ) 
     74static 
     75double p1evl( double x, pconstant double *coef, int N ) 
    7676{ 
    7777    int i=0; 
  • sasmodels/models/lib/sas_J1.c

    r5181ccc r74e9b5f  
    4242#if FLOAT_SIZE>4 
    4343//Cephes double pression function 
    44 double cephes_j1(double x); 
    4544 
    4645constant double RPJ1[8] = { 
     
    106105    0.0 }; 
    107106 
     107static 
    108108double cephes_j1(double x) 
    109109{ 
     
    155155#else 
    156156//Single precission version of cephes 
    157 float cephes_j1f(float x); 
    158  
    159157constant float JPJ1[8] = { 
    160158    -4.878788132172128E-009, 
     
    190188    }; 
    191189 
     190static 
    192191float cephes_j1f(float xx) 
    193192{ 
     
    240239 
    241240//Finally J1c function that equals 2*J1(x)/x 
    242 double sas_2J1x_x(double x); 
     241static 
    243242double sas_2J1x_x(double x) 
    244243{ 
Note: See TracChangeset for help on using the changeset viewer.