Changes in / [36a2418:e5bbe64] in sasmodels


Ignore:
Location:
sasmodels
Files:
7 edited

Legend:

Unmodified
Added
Removed
  • sasmodels/kernel.py

    r3199b17 rcd28947  
    2323# pylint: enable=unused-import 
    2424 
    25  
    2625class KernelModel(object): 
    2726    info = None  # type: ModelInfo 
     
    3433        # type: () -> None 
    3534        pass 
    36  
    3735 
    3836class Kernel(object): 
  • sasmodels/kernelcl.py

    r3199b17 rf872fd1  
    6161 
    6262 
    63 # Attempt to setup OpenCL. This may fail if the pyopencl package is not 
     63# Attempt to setup opencl. This may fail if the pyopencl package is not 
    6464# installed or if it is installed but there are no devices available. 
    6565try: 
     
    6767    from pyopencl import mem_flags as mf 
    6868    from pyopencl.characterize import get_fast_inaccurate_build_options 
    69     # Ask OpenCL for the default context so that we know that one exists. 
     69    # Ask OpenCL for the default context so that we know that one exists 
    7070    cl.create_some_context(interactive=False) 
    7171    HAVE_OPENCL = True 
     
    8888# pylint: enable=unused-import 
    8989 
    90  
    91 # CRUFT: pyopencl < 2017.1 (as of June 2016 needs quotes around include path). 
     90# CRUFT: pyopencl < 2017.1  (as of June 2016 needs quotes around include path) 
    9291def quote_path(v): 
    9392    """ 
     
    10099    return '"'+v+'"' if v and ' ' in v and not v[0] in "\"'-" else v 
    101100 
    102  
    103101def fix_pyopencl_include(): 
    104102    """ 
     
    107105    import pyopencl as cl 
    108106    if hasattr(cl, '_DEFAULT_INCLUDE_OPTIONS'): 
    109         cl._DEFAULT_INCLUDE_OPTIONS = [ 
    110             quote_path(v) for v in cl._DEFAULT_INCLUDE_OPTIONS 
    111             ] 
    112  
     107        cl._DEFAULT_INCLUDE_OPTIONS = [quote_path(v) for v in cl._DEFAULT_INCLUDE_OPTIONS] 
    113108 
    114109if HAVE_OPENCL: 
     
    123118MAX_LOOPS = 2048 
    124119 
     120 
    125121# Pragmas for enable OpenCL features.  Be sure to protect them so that they 
    126122# still compile even if OpenCL is not present. 
     
    137133""" 
    138134 
    139  
    140135def use_opencl(): 
    141136    sas_opencl = os.environ.get("SAS_OPENCL", "OpenCL").lower() 
    142137    return HAVE_OPENCL and sas_opencl != "none" and not sas_opencl.startswith("cuda") 
    143138 
    144  
    145139ENV = None 
    146140def reset_environment(): 
     
    150144    global ENV 
    151145    ENV = GpuEnvironment() if use_opencl() else None 
    152  
    153146 
    154147def environment(): 
     
    168161    return ENV 
    169162 
    170  
    171163def has_type(device, dtype): 
    172164    # type: (cl.Device, np.dtype) -> bool 
     
    179171        return "cl_khr_fp64" in device.extensions 
    180172    else: 
    181         # Not supporting F16 type since it isn't accurate enough. 
     173        # Not supporting F16 type since it isn't accurate enough 
    182174        return False 
    183  
    184175 
    185176def get_warp(kernel, queue): 
     
    191182        cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE, 
    192183        queue.device) 
    193  
    194184 
    195185def compile_model(context, source, dtype, fast=False): 
     
    213203        source_list.insert(0, _F64_PRAGMA) 
    214204 
    215     # Note: USE_SINCOS makes the Intel CPU slower under OpenCL. 
     205    # Note: USE_SINCOS makes the intel cpu slower under opencl 
    216206    if context.devices[0].type == cl.device_type.GPU: 
    217207        source_list.insert(0, "#define USE_SINCOS\n") 
     
    220210    source = "\n".join(source_list) 
    221211    program = cl.Program(context, source).build(options=options) 
    222  
    223212    #print("done with "+program) 
    224213    return program 
    225214 
    226215 
    227 # For now, this returns one device in the context. 
    228 # TODO: Create a context that contains all devices on all platforms. 
     216# for now, this returns one device in the context 
     217# TODO: create a context that contains all devices on all platforms 
    229218class GpuEnvironment(object): 
    230219    """ 
    231     GPU context for OpenCL, with possibly many devices and one queue per device. 
     220    GPU context, with possibly many devices, and one queue per device. 
     221 
     222    Because the environment can be reset during a live program (e.g., if the 
     223    user changes the active GPU device in the GUI), everything associated 
     224    with the device context must be cached in the environment and recreated 
     225    if the environment changes.  The *cache* attribute is a simple dictionary 
     226    which holds keys and references to objects, such as compiled kernels and 
     227    allocated buffers.  The running program should check in the cache for 
     228    long lived objects and create them if they are not there.  The program 
     229    should not hold onto cached objects, but instead only keep them active 
     230    for the duration of a function call.  When the environment is destroyed 
     231    then the *release* method for each active cache item is called before 
     232    the environment is freed.  This means that each cl buffer should be 
     233    in its own cache entry. 
    232234    """ 
    233235    def __init__(self): 
    234236        # type: () -> None 
    235         # Find gpu context. 
     237        # find gpu context 
    236238        context_list = _create_some_context() 
    237239 
     
    247249                self.context[dtype] = None 
    248250 
    249         # Build a queue for each context. 
     251        # Build a queue for each context 
    250252        self.queue = {} 
    251253        context = self.context[F32] 
     
    257259            self.queue[F64] = cl.CommandQueue(context, context.devices[0]) 
    258260 
    259         ## Byte boundary for data alignment. 
     261        # Byte boundary for data alignment 
    260262        #self.data_boundary = max(context.devices[0].min_data_type_align_size 
    261263        #                         for context in self.context.values()) 
    262264 
    263         # Cache for compiled programs, and for items in context. 
     265        # Cache for compiled programs, and for items in context 
    264266        self.compiled = {} 
     267        self.cache = {} 
    265268 
    266269    def has_type(self, dtype): 
     
    277280        """ 
    278281        # Note: PyOpenCL caches based on md5 hash of source, options and device 
    279         # but I'll do so as well just to save some data munging time. 
     282        # so we don't really need to cache things for ourselves.  I'll do so 
     283        # anyway just to save some data munging time. 
    280284        tag = generate.tag_source(source) 
    281285        key = "%s-%s-%s%s"%(name, dtype, tag, ("-fast" if fast else "")) 
    282         # Check timestamp on program. 
     286        # Check timestamp on program 
    283287        program, program_timestamp = self.compiled.get(key, (None, np.inf)) 
    284288        if program_timestamp < timestamp: 
     
    293297        return program 
    294298 
     299    def free_buffer(self, key): 
     300        if key in self.cache: 
     301            self.cache[key].release() 
     302            del self.cache[key] 
     303 
     304    def __del__(self): 
     305        for v in self.cache.values(): 
     306            release = getattr(v, 'release', lambda: None) 
     307            release() 
     308        self.cache = {} 
     309 
     310_CURRENT_ID = 0 
     311def unique_id(): 
     312    global _CURRENT_ID 
     313    _CURRENT_ID += 1 
     314    return _CURRENT_ID 
    295315 
    296316def _create_some_context(): 
     
    305325    which one (and not a CUDA device, or no GPU). 
    306326    """ 
    307     # Assume we do not get here if SAS_OPENCL is None or CUDA. 
     327    # Assume we do not get here if SAS_OPENCL is None or CUDA 
    308328    sas_opencl = os.environ.get('SAS_OPENCL', 'opencl') 
    309329    if sas_opencl.lower() != 'opencl': 
    310         # Setting PYOPENCL_CTX as a SAS_OPENCL to create cl context. 
     330        # Setting PYOPENCL_CTX as a SAS_OPENCL to create cl context 
    311331        os.environ["PYOPENCL_CTX"] = sas_opencl 
    312332 
     
    316336        except Exception as exc: 
    317337            warnings.warn(str(exc)) 
    318             warnings.warn("pyopencl.create_some_context() failed.  The " 
    319                 "environment variable 'SAS_OPENCL' or 'PYOPENCL_CTX' might " 
    320                 "not be set correctly") 
     338            warnings.warn("pyopencl.create_some_context() failed") 
     339            warnings.warn("the environment variable 'SAS_OPENCL' or 'PYOPENCL_CTX' might not be set correctly") 
    321340 
    322341    return _get_default_context() 
    323  
    324342 
    325343def _get_default_context(): 
     
    334352    # is running may increase throughput. 
    335353    # 
    336     # MacBook Pro, base install: 
     354    # Macbook pro, base install: 
    337355    #     {'Apple': [Intel CPU, NVIDIA GPU]} 
    338     # MacBook Pro, base install: 
     356    # Macbook pro, base install: 
    339357    #     {'Apple': [Intel CPU, Intel GPU]} 
    340     # 2 x NVIDIA 295 with Intel and NVIDIA opencl drivers install: 
     358    # 2 x nvidia 295 with Intel and NVIDIA opencl drivers installed 
    341359    #     {'Intel': [CPU], 'NVIDIA': [GPU, GPU, GPU, GPU]} 
    342360    gpu, cpu = None, None 
     
    361379            else: 
    362380                # System has cl.device_type.ACCELERATOR or cl.device_type.CUSTOM 
    363                 # Intel Phi for example registers as an accelerator. 
     381                # Intel Phi for example registers as an accelerator 
    364382                # Since the user installed a custom device on their system 
    365383                # and went through the pain of sorting out OpenCL drivers for 
     
    368386                gpu = device 
    369387 
    370     # Order the devices by gpu then by cpu; when searching for an available 
     388    # order the devices by gpu then by cpu; when searching for an available 
    371389    # device by data type they will be checked in this order, which means 
    372390    # that if the gpu supports double then the cpu will never be used (though 
     
    395413    that the compiler is allowed to take shortcuts. 
    396414    """ 
    397     info = None  # type: ModelInfo 
    398     source = ""  # type: str 
    399     dtype = None  # type: np.dtype 
    400     fast = False  # type: bool 
    401     _program = None  # type: cl.Program 
    402     _kernels = None  # type: Dict[str, cl.Kernel] 
    403  
    404415    def __init__(self, source, model_info, dtype=generate.F32, fast=False): 
    405416        # type: (Dict[str,str], ModelInfo, np.dtype, bool) -> None 
     
    408419        self.dtype = dtype 
    409420        self.fast = fast 
     421        self.timestamp = generate.ocl_timestamp(self.info) 
     422        self._cache_key = unique_id() 
    410423 
    411424    def __getstate__(self): 
     
    416429        # type: (Tuple[ModelInfo, str, np.dtype, bool]) -> None 
    417430        self.info, self.source, self.dtype, self.fast = state 
    418         self._program = self._kernels = None 
    419431 
    420432    def make_kernel(self, q_vectors): 
     
    422434        return GpuKernel(self, q_vectors) 
    423435 
    424     def get_function(self, name): 
     436    @property 
     437    def Iq(self): 
     438        return self._fetch_kernel('Iq') 
     439 
     440    def fetch_kernel(self, name): 
    425441        # type: (str) -> cl.Kernel 
    426442        """ 
     
    428444        does not already exist. 
    429445        """ 
    430         if self._program is None: 
    431             self._prepare_program() 
    432         return self._kernels[name] 
    433  
    434     def _prepare_program(self): 
    435         # type: (str) -> None 
    436         env = environment() 
    437         timestamp = generate.ocl_timestamp(self.info) 
    438         program = env.compile_program( 
    439             self.info.name, 
    440             self.source['opencl'], 
    441             self.dtype, 
    442             self.fast, 
    443             timestamp) 
    444         variants = ['Iq', 'Iqxy', 'Imagnetic'] 
    445         names = [generate.kernel_name(self.info, k) for k in variants] 
    446         functions = [getattr(program, k) for k in names] 
    447         self._kernels = {k: v for k, v in zip(variants, functions)} 
    448         # Keep a handle to program so GC doesn't collect. 
    449         self._program = program 
    450  
    451  
    452 # TODO: Check that we don't need a destructor for buffers which go out of scope. 
     446        gpu = environment() 
     447        key = self._cache_key 
     448        if key not in gpu.cache: 
     449            program = gpu.compile_program( 
     450                self.info.name, 
     451                self.source['opencl'], 
     452                self.dtype, 
     453                self.fast, 
     454                self.timestamp) 
     455            variants = ['Iq', 'Iqxy', 'Imagnetic'] 
     456            names = [generate.kernel_name(self.info, k) for k in variants] 
     457            kernels = [getattr(program, k) for k in names] 
     458            data = dict((k, v) for k, v in zip(variants, kernels)) 
     459            # keep a handle to program so GC doesn't collect 
     460            data['program'] = program 
     461            gpu.cache[key] = data 
     462        else: 
     463            data = gpu.cache[key] 
     464        return data[name] 
     465 
     466# TODO: check that we don't need a destructor for buffers which go out of scope 
    453467class GpuInput(object): 
    454468    """ 
     
    472486    def __init__(self, q_vectors, dtype=generate.F32): 
    473487        # type: (List[np.ndarray], np.dtype) -> None 
    474         # TODO: Do we ever need double precision q? 
     488        # TODO: do we ever need double precision q? 
    475489        self.nq = q_vectors[0].size 
    476490        self.dtype = np.dtype(dtype) 
    477491        self.is_2d = (len(q_vectors) == 2) 
    478         # TODO: Stretch input based on get_warp(). 
    479         # Not doing it now since warp depends on kernel, which is not known 
     492        # TODO: stretch input based on get_warp() 
     493        # not doing it now since warp depends on kernel, which is not known 
    480494        # at this point, so instead using 32, which is good on the set of 
    481495        # architectures tested so far. 
     
    490504            self.q[:self.nq] = q_vectors[0] 
    491505        self.global_size = [self.q.shape[0]] 
    492         #print("creating inputs of size", self.global_size) 
    493  
    494         # Transfer input value to GPU. 
     506        self._cache_key = unique_id() 
     507 
     508    @property 
     509    def q_b(self): 
     510        """Lazy creation of q buffer so it can survive context reset""" 
    495511        env = environment() 
    496         context = env.context[self.dtype] 
    497         self.q_b = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, 
    498                              hostbuf=self.q) 
     512        key = self._cache_key 
     513        if key not in env.cache: 
     514            context = env.context[self.dtype] 
     515            #print("creating inputs of size", self.global_size) 
     516            buffer = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, 
     517                               hostbuf=self.q) 
     518            env.cache[key] = buffer 
     519        return env.cache[key] 
    499520 
    500521    def release(self): 
    501522        # type: () -> None 
    502523        """ 
    503         Free the buffer associated with the q value. 
    504         """ 
    505         if self.q_b is not None: 
    506             self.q_b.release() 
    507             self.q_b = None 
     524        Free the buffer associated with the q value 
     525        """ 
     526        environment().free_buffer(id(self)) 
    508527 
    509528    def __del__(self): 
     
    511530        self.release() 
    512531 
    513  
    514532class GpuKernel(Kernel): 
    515533    """ 
     
    518536    *model* is the GpuModel object to call 
    519537 
    520     The kernel is derived from :class:`Kernel`, providing the 
    521     :meth:`call_kernel` method to evaluate the kernel for a given set of 
    522     parameters.  Because of the need to move the q values to the GPU before 
    523     evaluation, the kernel is instantiated for a particular set of q vectors, 
    524     and can be called many times without transfering q each time. 
     538    The following attributes are defined: 
     539 
     540    *info* is the module information 
     541 
     542    *dtype* is the kernel precision 
     543 
     544    *dim* is '1d' or '2d' 
     545 
     546    *result* is a vector to contain the results of the call 
     547 
     548    The resulting call method takes the *pars*, a list of values for 
     549    the fixed parameters to the kernel, and *pd_pars*, a list of (value,weight) 
     550    vectors for the polydisperse parameters.  *cutoff* determines the 
     551    integration limits: any points with combined weight less than *cutoff* 
     552    will not be calculated. 
    525553 
    526554    Call :meth:`release` when done with the kernel instance. 
    527555    """ 
    528     #: SAS model information structure. 
    529     info = None  # type: ModelInfo 
    530     #: Kernel precision. 
    531     dtype = None  # type: np.dtype 
    532     #: Kernel dimensions (1d or 2d). 
    533     dim = ""  # type: str 
    534     #: Calculation results, updated after each call to :meth:`_call_kernel`. 
    535     result = None  # type: np.ndarray 
    536  
    537556    def __init__(self, model, q_vectors): 
    538         # type: (GpuModel, List[np.ndarray]) -> None 
     557        # type: (cl.Kernel, np.dtype, ModelInfo, List[np.ndarray]) -> None 
    539558        dtype = model.dtype 
    540559        self.q_input = GpuInput(q_vectors, dtype) 
    541560        self._model = model 
    542  
    543         # Attributes accessed from the outside. 
     561        # F16 isn't sufficient, so don't support it 
     562        self._as_dtype = np.float64 if dtype == generate.F64 else np.float32 
     563        self._cache_key = unique_id() 
     564 
     565        # attributes accessed from the outside 
    544566        self.dim = '2d' if self.q_input.is_2d else '1d' 
    545567        self.info = model.info 
    546         self.dtype = dtype 
    547  
    548         # Converter to translate input to target type. 
    549         self._as_dtype = np.float64 if dtype == generate.F64 else np.float32 
    550  
    551         # Holding place for the returned value. 
     568        self.dtype = model.dtype 
     569 
     570        # holding place for the returned value 
    552571        nout = 2 if self.info.have_Fq and self.dim == '1d' else 1 
    553         extra_q = 4  # Total weight, form volume, shell volume and R_eff. 
    554         self.result = np.empty(self.q_input.nq*nout + extra_q, dtype) 
    555  
    556         # Allocate result value on GPU. 
     572        extra_q = 4  # total weight, form volume, shell volume and R_eff 
     573        self.result = np.empty(self.q_input.nq*nout+extra_q, dtype) 
     574 
     575    @property 
     576    def _result_b(self): 
     577        """Lazy creation of result buffer so it can survive context reset""" 
    557578        env = environment() 
    558         context = env.context[self.dtype] 
    559         width = ((self.result.size+31)//32)*32 * self.dtype.itemsize 
    560         self._result_b = cl.Buffer(context, mf.READ_WRITE, width) 
    561  
    562     def _call_kernel(self, call_details, values, cutoff, magnetic, 
    563                      effective_radius_type): 
    564         # type: (CallDetails, np.ndarray, float, bool, int) -> np.ndarray 
     579        key = self._cache_key 
     580        if key not in env.cache: 
     581            context = env.context[self.dtype] 
     582            width = ((self.result.size+31)//32)*32 * self.dtype.itemsize 
     583            buffer = cl.Buffer(context, mf.READ_WRITE, width) 
     584            env.cache[key] = buffer 
     585        return env.cache[key] 
     586 
     587    def _call_kernel(self, call_details, values, cutoff, magnetic, effective_radius_type): 
     588        # type: (CallDetails, np.ndarray, np.ndarray, float, bool) -> np.ndarray 
    565589        env = environment() 
    566590        queue = env.queue[self._model.dtype] 
    567591        context = queue.context 
    568592 
    569         # Arrange data transfer to card. 
     593        # Arrange data transfer to/from card 
     594        q_b = self.q_input.q_b 
     595        result_b = self._result_b 
    570596        details_b = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, 
    571597                              hostbuf=call_details.buffer) 
     
    573599                             hostbuf=values) 
    574600 
    575         # Setup kernel function and arguments. 
    576601        name = 'Iq' if self.dim == '1d' else 'Imagnetic' if magnetic else 'Iqxy' 
    577         kernel = self._model.get_function(name) 
     602        kernel = self._model.fetch_kernel(name) 
    578603        kernel_args = [ 
    579             np.uint32(self.q_input.nq),  # Number of inputs. 
    580             None,  # Placeholder for pd_start. 
    581             None,  # Placeholder for pd_stop. 
    582             details_b,  # Problem definition. 
    583             values_b,  # Parameter values. 
    584             self.q_input.q_b,  # Q values. 
    585             self._result_b,   # Result storage. 
    586             self._as_dtype(cutoff),  # Probability cutoff. 
    587             np.uint32(effective_radius_type),  # R_eff mode. 
     604            np.uint32(self.q_input.nq), None, None, 
     605            details_b, values_b, q_b, result_b, 
     606            self._as_dtype(cutoff), 
     607            np.uint32(effective_radius_type), 
    588608        ] 
    589  
    590         # Call kernel and retrieve results. 
    591609        #print("Calling OpenCL") 
    592610        #call_details.show(values) 
     611        #Call kernel and retrieve results 
    593612        wait_for = None 
    594613        last_nap = time.clock() 
     
    601620                               *kernel_args, wait_for=wait_for)] 
    602621            if stop < call_details.num_eval: 
    603                 # Allow other processes to run. 
     622                # Allow other processes to run 
    604623                wait_for[0].wait() 
    605624                current_time = time.clock() 
     
    607626                    time.sleep(0.001) 
    608627                    last_nap = current_time 
    609         cl.enqueue_copy(queue, self.result, self._result_b, wait_for=wait_for) 
     628        cl.enqueue_copy(queue, self.result, result_b, wait_for=wait_for) 
    610629        #print("result", self.result) 
    611630 
    612         # Free buffers. 
    613         details_b.release() 
    614         values_b.release() 
     631        # Free buffers 
     632        for v in (details_b, values_b): 
     633            if v is not None: 
     634                v.release() 
    615635 
    616636    def release(self): 
     
    619639        Release resources associated with the kernel. 
    620640        """ 
     641        environment().free_buffer(id(self)) 
    621642        self.q_input.release() 
    622         if self._result_b is not None: 
    623             self._result_b.release() 
    624             self._result_b = None 
    625643 
    626644    def __del__(self): 
  • sasmodels/kernelcuda.py

    rfa26e78 rf872fd1  
    6363import time 
    6464import re 
    65 import atexit 
    6665 
    6766import numpy as np  # type: ignore 
    6867 
    6968 
    70 # Attempt to setup CUDA. This may fail if the pycuda package is not 
     69# Attempt to setup cuda. This may fail if the pycuda package is not 
    7170# installed or if it is installed but there are no devices available. 
    7271try: 
     
    108107MAX_LOOPS = 2048 
    109108 
    110  
    111109def use_cuda(): 
    112     sas_opencl = os.environ.get("SAS_OPENCL", "CUDA").lower() 
    113     return HAVE_CUDA and sas_opencl.startswith("cuda") 
    114  
     110    env = os.environ.get("SAS_OPENCL", "").lower() 
     111    return HAVE_CUDA and (env == "" or env.startswith("cuda")) 
    115112 
    116113ENV = None 
     
    124121        ENV.release() 
    125122    ENV = GpuEnvironment() if use_cuda() else None 
    126  
    127123 
    128124def environment(): 
     
    142138    return ENV 
    143139 
    144  
    145 # PyTest is not freeing ENV, so make sure it gets freed. 
    146 atexit.register(lambda: ENV.release() if ENV is not None else None) 
    147  
    148  
    149140def has_type(dtype): 
    150141    # type: (np.dtype) -> bool 
     
    152143    Return true if device supports the requested precision. 
    153144    """ 
    154     # Assume the NVIDIA card supports 32-bit and 64-bit floats. 
    155     # TODO: Check if pycuda support F16. 
     145    # Assume the nvidia card supports 32-bit and 64-bit floats. 
     146    # TODO: check if pycuda support F16 
    156147    return dtype in (generate.F32, generate.F64) 
    157148 
    158149 
    159150FUNCTION_PATTERN = re.compile(r"""^ 
    160   (?P<space>\s*)                       # Initial space. 
    161   (?P<qualifiers>^(?:\s*\b\w+\b\s*)+)  # One or more qualifiers before function. 
    162   (?P<function>\s*\b\w+\b\s*[(])       # Function name plus open parens. 
     151  (?P<space>\s*)                   # initial space 
     152  (?P<qualifiers>^(?:\s*\b\w+\b\s*)+) # one or more qualifiers before function 
     153  (?P<function>\s*\b\w+\b\s*[(])      # function name plus open parens 
    163154  """, re.VERBOSE|re.MULTILINE) 
    164155 
     
    167158  """, re.VERBOSE|re.MULTILINE) 
    168159 
    169  
    170160def _add_device_tag(match): 
    171161    # type: (None) -> str 
    172     # Note: Should be re.Match, but that isn't a simple type. 
     162    # Note: should be re.Match, but that isn't a simple type 
    173163    """ 
    174164    replace qualifiers with __device__ qualifiers if needed 
     
    183173        return "".join((space, "__device__ ", qualifiers, function)) 
    184174 
    185  
    186175def mark_device_functions(source): 
    187176    # type: (str) -> str 
     
    190179    """ 
    191180    return FUNCTION_PATTERN.sub(_add_device_tag, source) 
    192  
    193181 
    194182def show_device_functions(source): 
     
    200188        print(match.group('qualifiers').replace('\n',r'\n'), match.group('function'), '(') 
    201189    return source 
    202  
    203190 
    204191def compile_model(source, dtype, fast=False): 
     
    225212    #options = ['--verbose', '-E'] 
    226213    options = ['--use_fast_math'] if fast else None 
    227     program = SourceModule(source, no_extern_c=True, options=options) #, include_dirs=[...]) 
     214    program = SourceModule(source, no_extern_c=True, options=options) # include_dirs=[...] 
    228215 
    229216    #print("done with "+program) 
     
    231218 
    232219 
    233 # For now, this returns one device in the context. 
    234 # TODO: Create a context that contains all devices on all platforms. 
     220# for now, this returns one device in the context 
     221# TODO: create a context that contains all devices on all platforms 
    235222class GpuEnvironment(object): 
    236223    """ 
    237     GPU context for CUDA. 
     224    GPU context, with possibly many devices, and one queue per device. 
    238225    """ 
    239226    context = None # type: cuda.Context 
    240227    def __init__(self, devnum=None): 
    241228        # type: (int) -> None 
     229        # Byte boundary for data alignment 
     230        #self.data_boundary = max(d.min_data_type_align_size 
     231        #                         for d in self.context.devices) 
     232        self.compiled = {} 
    242233        env = os.environ.get("SAS_OPENCL", "").lower() 
    243234        if devnum is None and env.startswith("cuda:"): 
    244235            devnum = int(env[5:]) 
    245  
    246236        # Set the global context to the particular device number if one is 
    247237        # given, otherwise use the default context.  Perhaps this will be set 
     
    252242            self.context = make_default_context() 
    253243 
    254         ## Byte boundary for data alignment. 
    255         #self.data_boundary = max(d.min_data_type_align_size 
    256         #                         for d in self.context.devices) 
    257  
    258         # Cache for compiled programs, and for items in context. 
    259         self.compiled = {} 
    260  
    261244    def release(self): 
    262245        if self.context is not None: 
     
    279262        Compile the program for the device in the given context. 
    280263        """ 
    281         # Note: PyCuda (probably) caches but I'll do so as well just to 
    282         # save some data munging time. 
     264        # Note: PyOpenCL caches based on md5 hash of source, options and device 
     265        # so we don't really need to cache things for ourselves.  I'll do so 
     266        # anyway just to save some data munging time. 
    283267        tag = generate.tag_source(source) 
    284268        key = "%s-%s-%s%s"%(name, dtype, tag, ("-fast" if fast else "")) 
    285         # Check timestamp on program. 
     269        # Check timestamp on program 
    286270        program, program_timestamp = self.compiled.get(key, (None, np.inf)) 
    287271        if program_timestamp < timestamp: 
     
    293277        return program 
    294278 
    295  
    296279class GpuModel(KernelModel): 
    297280    """ 
     
    309292    that the compiler is allowed to take shortcuts. 
    310293    """ 
    311     info = None  # type: ModelInfo 
    312     source = ""  # type: str 
    313     dtype = None  # type: np.dtype 
    314     fast = False  # type: bool 
    315     _program = None # type: SourceModule 
    316     _kernels = None  # type: Dict[str, cuda.Function] 
     294    info = None # type: ModelInfo 
     295    source = "" # type: str 
     296    dtype = None # type: np.dtype 
     297    fast = False # type: bool 
     298    program = None # type: SourceModule 
     299    _kernels = None # type: List[cuda.Function] 
    317300 
    318301    def __init__(self, source, model_info, dtype=generate.F32, fast=False): 
     
    322305        self.dtype = dtype 
    323306        self.fast = fast 
     307        self.program = None # delay program creation 
     308        self._kernels = None 
    324309 
    325310    def __getstate__(self): 
     
    330315        # type: (Tuple[ModelInfo, str, np.dtype, bool]) -> None 
    331316        self.info, self.source, self.dtype, self.fast = state 
    332         self._program = self._kernels = None 
     317        self.program = None 
    333318 
    334319    def make_kernel(self, q_vectors): 
    335320        # type: (List[np.ndarray]) -> "GpuKernel" 
    336         return GpuKernel(self, q_vectors) 
    337  
    338     def get_function(self, name): 
    339         # type: (str) -> cuda.Function 
    340         """ 
    341         Fetch the kernel from the environment by name, compiling it if it 
    342         does not already exist. 
    343         """ 
    344         if self._program is None: 
    345             self._prepare_program() 
    346         return self._kernels[name] 
    347  
    348     def _prepare_program(self): 
    349         # type: (str) -> None 
    350         env = environment() 
    351         timestamp = generate.ocl_timestamp(self.info) 
    352         program = env.compile_program( 
    353             self.info.name, 
    354             self.source['opencl'], 
    355             self.dtype, 
    356             self.fast, 
    357             timestamp) 
    358         variants = ['Iq', 'Iqxy', 'Imagnetic'] 
    359         names = [generate.kernel_name(self.info, k) for k in variants] 
    360         functions = [program.get_function(k) for k in names] 
    361         self._kernels = {k: v for k, v in zip(variants, functions)} 
    362         # Keep a handle to program so GC doesn't collect. 
    363         self._program = program 
    364  
    365  
    366 # TODO: Check that we don't need a destructor for buffers which go out of scope. 
     321        if self.program is None: 
     322            compile_program = environment().compile_program 
     323            timestamp = generate.ocl_timestamp(self.info) 
     324            self.program = compile_program( 
     325                self.info.name, 
     326                self.source['opencl'], 
     327                self.dtype, 
     328                self.fast, 
     329                timestamp) 
     330            variants = ['Iq', 'Iqxy', 'Imagnetic'] 
     331            names = [generate.kernel_name(self.info, k) for k in variants] 
     332            kernels = [self.program.get_function(k) for k in names] 
     333            self._kernels = dict((k, v) for k, v in zip(variants, kernels)) 
     334        is_2d = len(q_vectors) == 2 
     335        if is_2d: 
     336            kernel = [self._kernels['Iqxy'], self._kernels['Imagnetic']] 
     337        else: 
     338            kernel = [self._kernels['Iq']]*2 
     339        return GpuKernel(kernel, self.dtype, self.info, q_vectors) 
     340 
     341    def release(self): 
     342        # type: () -> None 
     343        """ 
     344        Free the resources associated with the model. 
     345        """ 
     346        if self.program is not None: 
     347            self.program = None 
     348 
     349    def __del__(self): 
     350        # type: () -> None 
     351        self.release() 
     352 
     353# TODO: check that we don't need a destructor for buffers which go out of scope 
    367354class GpuInput(object): 
    368355    """ 
     
    386373    def __init__(self, q_vectors, dtype=generate.F32): 
    387374        # type: (List[np.ndarray], np.dtype) -> None 
    388         # TODO: Do we ever need double precision q? 
     375        # TODO: do we ever need double precision q? 
    389376        self.nq = q_vectors[0].size 
    390377        self.dtype = np.dtype(dtype) 
    391378        self.is_2d = (len(q_vectors) == 2) 
    392         # TODO: Stretch input based on get_warp(). 
    393         # Not doing it now since warp depends on kernel, which is not known 
     379        # TODO: stretch input based on get_warp() 
     380        # not doing it now since warp depends on kernel, which is not known 
    394381        # at this point, so instead using 32, which is good on the set of 
    395382        # architectures tested so far. 
    396383        if self.is_2d: 
    397             width = ((self.nq+15)//16)*16 
     384            # Note: 16 rather than 15 because result is 1 longer than input. 
     385            width = ((self.nq+16)//16)*16 
    398386            self.q = np.empty((width, 2), dtype=dtype) 
    399387            self.q[:self.nq, 0] = q_vectors[0] 
    400388            self.q[:self.nq, 1] = q_vectors[1] 
    401389        else: 
    402             width = ((self.nq+31)//32)*32 
     390            # Note: 32 rather than 31 because result is 1 longer than input. 
     391            width = ((self.nq+32)//32)*32 
    403392            self.q = np.empty(width, dtype=dtype) 
    404393            self.q[:self.nq] = q_vectors[0] 
    405394        self.global_size = [self.q.shape[0]] 
    406395        #print("creating inputs of size", self.global_size) 
    407  
    408         # Transfer input value to GPU. 
    409396        self.q_b = cuda.to_device(self.q) 
    410397 
     
    412399        # type: () -> None 
    413400        """ 
    414         Free the buffer associated with the q value. 
     401        Free the memory. 
    415402        """ 
    416403        if self.q_b is not None: 
     
    422409        self.release() 
    423410 
    424  
    425411class GpuKernel(Kernel): 
    426412    """ 
    427413    Callable SAS kernel. 
    428414 
    429     *model* is the GpuModel object to call 
    430  
    431     The kernel is derived from :class:`Kernel`, providing the 
    432     :meth:`call_kernel` method to evaluate the kernel for a given set of 
    433     parameters.  Because of the need to move the q values to the GPU before 
    434     evaluation, the kernel is instantiated for a particular set of q vectors, 
    435     and can be called many times without transfering q each time. 
     415    *kernel* is the GpuKernel object to call 
     416 
     417    *model_info* is the module information 
     418 
     419    *q_vectors* is the q vectors at which the kernel should be evaluated 
     420 
     421    *dtype* is the kernel precision 
     422 
     423    The resulting call method takes the *pars*, a list of values for 
     424    the fixed parameters to the kernel, and *pd_pars*, a list of (value,weight) 
     425    vectors for the polydisperse parameters.  *cutoff* determines the 
     426    integration limits: any points with combined weight less than *cutoff* 
     427    will not be calculated. 
    436428 
    437429    Call :meth:`release` when done with the kernel instance. 
    438430    """ 
    439     #: SAS model information structure. 
    440     info = None  # type: ModelInfo 
    441     #: Kernel precision. 
    442     dtype = None  # type: np.dtype 
    443     #: Kernel dimensions (1d or 2d). 
    444     dim = ""  # type: str 
    445     #: Calculation results, updated after each call to :meth:`_call_kernel`. 
    446     result = None  # type: np.ndarray 
    447  
    448     def __init__(self, model, q_vectors): 
    449         # type: (GpuModel, List[np.ndarray]) -> None 
    450         dtype = model.dtype 
     431    def __init__(self, kernel, dtype, model_info, q_vectors): 
     432        # type: (cl.Kernel, np.dtype, ModelInfo, List[np.ndarray]) -> None 
    451433        self.q_input = GpuInput(q_vectors, dtype) 
    452         self._model = model 
    453  
    454         # Attributes accessed from the outside. 
     434        self.kernel = kernel 
     435        # F16 isn't sufficient, so don't support it 
     436        self._as_dtype = np.float64 if dtype == generate.F64 else np.float32 
     437 
     438        # attributes accessed from the outside 
    455439        self.dim = '2d' if self.q_input.is_2d else '1d' 
    456         self.info = model.info 
     440        self.info = model_info 
    457441        self.dtype = dtype 
    458442 
    459         # Converter to translate input to target type. 
    460         self._as_dtype = np.float64 if dtype == generate.F64 else np.float32 
    461  
    462         # Holding place for the returned value. 
     443        # holding place for the returned value 
    463444        nout = 2 if self.info.have_Fq and self.dim == '1d' else 1 
    464         extra_q = 4  # Total weight, form volume, shell volume and R_eff. 
    465         self.result = np.empty(self.q_input.nq*nout + extra_q, dtype) 
    466  
    467         # Allocate result value on GPU. 
     445        extra_q = 4  # total weight, form volume, shell volume and R_eff 
     446        self.result = np.empty(self.q_input.nq*nout+extra_q, dtype) 
     447 
     448        # Inputs and outputs for each kernel call 
     449        # Note: res may be shorter than res_b if global_size != nq 
    468450        width = ((self.result.size+31)//32)*32 * self.dtype.itemsize 
    469         self._result_b = cuda.mem_alloc(width) 
    470  
    471     def _call_kernel(self, call_details, values, cutoff, magnetic, 
    472                      effective_radius_type): 
    473         # type: (CallDetails, np.ndarray, float, bool, int) -> np.ndarray 
    474  
    475         # Arrange data transfer to card. 
     451        self.result_b = cuda.mem_alloc(width) 
     452        self._need_release = [self.result_b] 
     453 
     454    def _call_kernel(self, call_details, values, cutoff, magnetic, effective_radius_type): 
     455        # type: (CallDetails, np.ndarray, np.ndarray, float, bool) -> np.ndarray 
     456        # Arrange data transfer to card 
    476457        details_b = cuda.to_device(call_details.buffer) 
    477458        values_b = cuda.to_device(values) 
    478459 
    479         # Setup kernel function and arguments. 
    480         name = 'Iq' if self.dim == '1d' else 'Imagnetic' if magnetic else 'Iqxy' 
    481         kernel = self._model.get_function(name) 
    482         kernel_args = [ 
    483             np.uint32(self.q_input.nq),  # Number of inputs. 
    484             None,  # Placeholder for pd_start. 
    485             None,  # Placeholder for pd_stop. 
    486             details_b,  # Problem definition. 
    487             values_b,  # Parameter values. 
    488             self.q_input.q_b,  # Q values. 
    489             self._result_b,   # Result storage. 
    490             self._as_dtype(cutoff),  # Probability cutoff. 
    491             np.uint32(effective_radius_type),  # R_eff mode. 
     460        kernel = self.kernel[1 if magnetic else 0] 
     461        args = [ 
     462            np.uint32(self.q_input.nq), None, None, 
     463            details_b, values_b, self.q_input.q_b, self.result_b, 
     464            self._as_dtype(cutoff), 
     465            np.uint32(effective_radius_type), 
    492466        ] 
    493467        grid = partition(self.q_input.nq) 
    494  
    495         # Call kernel and retrieve results. 
    496         #print("Calling CUDA") 
     468        #print("Calling OpenCL") 
    497469        #call_details.show(values) 
     470        # Call kernel and retrieve results 
    498471        last_nap = time.clock() 
    499472        step = 100000000//self.q_input.nq + 1 
     
    502475            stop = min(start + step, call_details.num_eval) 
    503476            #print("queuing",start,stop) 
    504             kernel_args[1:3] = [np.int32(start), np.int32(stop)] 
    505             kernel(*kernel_args, **grid) 
     477            args[1:3] = [np.int32(start), np.int32(stop)] 
     478            kernel(*args, **grid) 
    506479            if stop < call_details.num_eval: 
    507480                sync() 
    508                 # Allow other processes to run. 
     481                # Allow other processes to run 
    509482                current_time = time.clock() 
    510483                if current_time - last_nap > 0.5: 
     
    512485                    last_nap = current_time 
    513486        sync() 
    514         cuda.memcpy_dtoh(self.result, self._result_b) 
     487        cuda.memcpy_dtoh(self.result, self.result_b) 
    515488        #print("result", self.result) 
    516489 
     
    523496        Release resources associated with the kernel. 
    524497        """ 
    525         self.q_input.release() 
    526         if self._result_b is not None: 
    527             self._result_b.free() 
    528             self._result_b = None 
     498        for p in self._need_release: 
     499            p.free() 
     500        self._need_release = [] 
    529501 
    530502    def __del__(self): 
     
    540512    Note: Maybe context.synchronize() is sufficient. 
    541513    """ 
    542     # Create an event with which to synchronize. 
     514    #return # The following works in C++; don't know what pycuda is doing 
     515    # Create an event with which to synchronize 
    543516    done = cuda.Event() 
    544517 
     
    546519    done.record() 
    547520 
    548     # Make sure we don't hog resource while waiting to sync. 
     521    #line added to not hog resources 
    549522    while not done.query(): 
    550523        time.sleep(0.01) 
     
    552525    # Block until the GPU executes the kernel. 
    553526    done.synchronize() 
    554  
    555527    # Clean up the event; I don't think they can be reused. 
    556528    del done 
  • sasmodels/kerneldll.py

    r3199b17 re44432d  
    100100# pylint: enable=unused-import 
    101101 
    102 # Compiler output is a byte stream that needs to be decode in python 3. 
     102# Compiler output is a byte stream that needs to be decode in python 3 
    103103decode = (lambda s: s) if sys.version_info[0] < 3 else (lambda s: s.decode('utf8')) 
    104104 
     
    115115        COMPILER = "tinycc" 
    116116    elif "VCINSTALLDIR" in os.environ: 
    117         # If vcvarsall.bat has been called, then VCINSTALLDIR is in the 
    118         # environment and we can use the MSVC compiler.  Otherwise, if 
    119         # tinycc is available then use it.  Otherwise, hope that mingw 
    120         # is available. 
     117        # If vcvarsall.bat has been called, then VCINSTALLDIR is in the environment 
     118        # and we can use the MSVC compiler.  Otherwise, if tinycc is available 
     119        # the use it.  Otherwise, hope that mingw is available. 
    121120        COMPILER = "msvc" 
    122121    else: 
     
    125124    COMPILER = "unix" 
    126125 
    127 ARCH = "" if ct.sizeof(ct.c_void_p) > 4 else "x86"  # 4 byte pointers on x86. 
     126ARCH = "" if ct.sizeof(ct.c_void_p) > 4 else "x86"  # 4 byte pointers on x86 
    128127if COMPILER == "unix": 
    129     # Generic unix compile. 
    130     # On Mac users will need the X code command line tools installed. 
     128    # Generic unix compile 
     129    # On mac users will need the X code command line tools installed 
    131130    #COMPILE = "gcc-mp-4.7 -shared -fPIC -std=c99 -fopenmp -O2 -Wall %s -o %s -lm -lgomp" 
    132131    CC = "cc -shared -fPIC -std=c99 -O2 -Wall".split() 
    133     # Add OpenMP support if not running on a Mac. 
     132    # add openmp support if not running on a mac 
    134133    if sys.platform != "darwin": 
    135         # OpenMP seems to be broken on gcc 5.4.0 (ubuntu 16.04.9). 
     134        # OpenMP seems to be broken on gcc 5.4.0 (ubuntu 16.04.9) 
    136135        # Shut it off for all unix until we can investigate. 
    137136        #CC.append("-fopenmp") 
     
    145144    # vcomp90.dll on the path.  One may be found here: 
    146145    #       C:/Windows/winsxs/x86_microsoft.vc90.openmp*/vcomp90.dll 
    147     # Copy this to the python directory and uncomment the OpenMP COMPILE. 
    148     # TODO: Remove intermediate OBJ file created in the directory. 
    149     # TODO: Maybe don't use randomized name for the c file. 
    150     # TODO: Maybe ask distutils to find MSVC. 
     146    # Copy this to the python directory and uncomment the OpenMP COMPILE 
     147    # TODO: remove intermediate OBJ file created in the directory 
     148    # TODO: maybe don't use randomized name for the c file 
     149    # TODO: maybe ask distutils to find MSVC 
    151150    CC = "cl /nologo /Ox /MD /W3 /GS- /DNDEBUG".split() 
    152151    if "SAS_OPENMP" in os.environ: 
     
    173172ALLOW_SINGLE_PRECISION_DLLS = True 
    174173 
    175  
    176174def compile(source, output): 
    177175    # type: (str, str) -> None 
     
    185183    logging.info(command_str) 
    186184    try: 
    187         # Need shell=True on windows to keep console box from popping up. 
     185        # need shell=True on windows to keep console box from popping up 
    188186        shell = (os.name == 'nt') 
    189187        subprocess.check_output(command, shell=shell, stderr=subprocess.STDOUT) 
     
    194192        raise RuntimeError("compile failed.  File is in %r"%source) 
    195193 
    196  
    197194def dll_name(model_info, dtype): 
    198195    # type: (ModelInfo, np.dtype) ->  str 
     
    205202    basename += ARCH + ".so" 
    206203 
    207     # Hack to find precompiled dlls. 
     204    # Hack to find precompiled dlls 
    208205    path = joinpath(generate.DATA_PATH, '..', 'compiled_models', basename) 
    209206    if os.path.exists(path): 
     
    245242        raise ValueError("16 bit floats not supported") 
    246243    if dtype == F32 and not ALLOW_SINGLE_PRECISION_DLLS: 
    247         dtype = F64  # Force 64-bit dll. 
    248     # Note: dtype may be F128 for long double precision. 
     244        dtype = F64  # Force 64-bit dll 
     245    # Note: dtype may be F128 for long double precision 
    249246 
    250247    dll = dll_path(model_info, dtype) 
     
    257254        need_recompile = dll_time < newest_source 
    258255    if need_recompile: 
    259         # Make sure the DLL path exists. 
     256        # Make sure the DLL path exists 
    260257        if not os.path.exists(SAS_DLL_PATH): 
    261258            os.makedirs(SAS_DLL_PATH) 
     
    266263            file_handle.write(source) 
    267264        compile(source=filename, output=dll) 
    268         # Comment the following to keep the generated C file. 
    269         # Note: If there is a syntax error then compile raises an error 
     265        # comment the following to keep the generated c file 
     266        # Note: if there is a syntax error then compile raises an error 
    270267        # and the source file will not be deleted. 
    271268        os.unlink(filename) 
     
    306303        self.dllpath = dllpath 
    307304        self._dll = None  # type: ct.CDLL 
    308         self._kernels = None  # type: List[Callable, Callable] 
     305        self._kernels = None # type: List[Callable, Callable] 
    309306        self.dtype = np.dtype(dtype) 
    310307 
     
    341338        # type: (List[np.ndarray]) -> DllKernel 
    342339        q_input = PyInput(q_vectors, self.dtype) 
    343         # Note: DLL is lazy loaded. 
     340        # Note: pickle not supported for DllKernel 
    344341        if self._dll is None: 
    345342            self._load_dll() 
     
    361358        self._dll = None 
    362359 
    363  
    364360class DllKernel(Kernel): 
    365361    """ 
     
    383379    def __init__(self, kernel, model_info, q_input): 
    384380        # type: (Callable[[], np.ndarray], ModelInfo, PyInput) -> None 
    385         dtype = q_input.dtype 
     381        #,model_info,q_input) 
     382        self.kernel = kernel 
     383        self.info = model_info 
    386384        self.q_input = q_input 
    387         self.kernel = kernel 
    388  
    389         # Attributes accessed from the outside. 
     385        self.dtype = q_input.dtype 
    390386        self.dim = '2d' if q_input.is_2d else '1d' 
    391         self.info = model_info 
    392         self.dtype = dtype 
    393  
    394         # Converter to translate input to target type. 
    395         self._as_dtype = (np.float32 if dtype == generate.F32 
    396                           else np.float64 if dtype == generate.F64 
    397                           else np.float128) 
    398  
    399         # Holding place for the returned value. 
     387        # leave room for f1/f2 results in case we need to compute beta for 1d models 
    400388        nout = 2 if self.info.have_Fq else 1 
    401         extra_q = 4  # Total weight, form volume, shell volume and R_eff. 
    402         self.result = np.empty(self.q_input.nq*nout + extra_q, dtype) 
    403  
    404     def _call_kernel(self, call_details, values, cutoff, magnetic, 
    405                      effective_radius_type): 
    406         # type: (CallDetails, np.ndarray, float, bool, int) -> np.ndarray 
    407  
    408         # Setup kernel function and arguments. 
     389        # +4 for total weight, shell volume, effective radius, form volume 
     390        self.result = np.empty(q_input.nq*nout + 4, self.dtype) 
     391        self.real = (np.float32 if self.q_input.dtype == generate.F32 
     392                     else np.float64 if self.q_input.dtype == generate.F64 
     393                     else np.float128) 
     394 
     395    def _call_kernel(self, call_details, values, cutoff, magnetic, effective_radius_type): 
     396        # type: (CallDetails, np.ndarray, np.ndarray, float, bool, int) -> np.ndarray 
    409397        kernel = self.kernel[1 if magnetic else 0] 
    410         kernel_args = [ 
    411             self.q_input.nq,  # Number of inputs. 
    412             None,  # Placeholder for pd_start. 
    413             None,  # Placeholder for pd_stop. 
    414             call_details.buffer.ctypes.data,  # Problem definition. 
    415             values.ctypes.data,  # Parameter values. 
    416             self.q_input.q.ctypes.data,  # Q values. 
    417             self.result.ctypes.data,   # Result storage. 
    418             self._as_dtype(cutoff),  # Probability cutoff. 
    419             effective_radius_type,  # R_eff mode. 
     398        args = [ 
     399            self.q_input.nq, # nq 
     400            None, # pd_start 
     401            None, # pd_stop pd_stride[MAX_PD] 
     402            call_details.buffer.ctypes.data, # problem 
     403            values.ctypes.data,  # pars 
     404            self.q_input.q.ctypes.data, # q 
     405            self.result.ctypes.data,   # results 
     406            self.real(cutoff), # cutoff 
     407            effective_radius_type, # cutoff 
    420408        ] 
    421  
    422         # Call kernel and retrieve results. 
    423409        #print("Calling DLL") 
    424410        #call_details.show(values) 
    425411        step = 100 
    426         # TODO: Do we need the explicit sleep like the OpenCL and CUDA loops? 
    427412        for start in range(0, call_details.num_eval, step): 
    428413            stop = min(start + step, call_details.num_eval) 
    429             kernel_args[1:3] = [start, stop] 
    430             kernel(*kernel_args) # type: ignore 
     414            args[1:3] = [start, stop] 
     415            kernel(*args) # type: ignore 
    431416 
    432417    def release(self): 
    433418        # type: () -> None 
    434419        """ 
    435         Release resources associated with the kernel. 
     420        Release any resources associated with the kernel. 
    436421        """ 
    437         # TODO: OpenCL/CUDA allocate q_input in __init__ and free it in release. 
    438         # Should we be doing the same for DLL? 
    439         #self.q_input.release() 
    440         pass 
    441  
    442     def __del__(self): 
    443         # type: () -> None 
    444         self.release() 
     422        self.q_input.release() 
  • sasmodels/kernelpy.py

    r3199b17 re44432d  
    3333logger = logging.getLogger(__name__) 
    3434 
    35  
    3635class PyModel(KernelModel): 
    3736    """ 
     
    3938    """ 
    4039    def __init__(self, model_info): 
    41         # Make sure Iq is available and vectorized. 
     40        # Make sure Iq is available and vectorized 
    4241        _create_default_functions(model_info) 
    4342        self.info = model_info 
     
    5453        """ 
    5554        pass 
    56  
    5755 
    5856class PyInput(object): 
     
    9391        self.q = None 
    9492 
    95  
    9693class PyKernel(Kernel): 
    9794    """ 
     
    134131        parameter_vector = np.empty(len(partable.call_parameters)-2, 'd') 
    135132 
    136         # Create views into the array to hold the arguments. 
     133        # Create views into the array to hold the arguments 
    137134        offset = 0 
    138135        kernel_args, volume_args = [], [] 
     
    177174                        else (lambda mode: 1.0)) 
    178175 
     176 
     177 
    179178    def _call_kernel(self, call_details, values, cutoff, magnetic, effective_radius_type): 
    180179        # type: (CallDetails, np.ndarray, np.ndarray, float, bool) -> np.ndarray 
     
    196195        self.q_input.release() 
    197196        self.q_input = None 
    198  
    199197 
    200198def _loops(parameters,    # type: np.ndarray 
     
    256254        total = np.zeros(nq, 'd') 
    257255        for loop_index in range(call_details.num_eval): 
    258             # Update polydispersity parameter values. 
     256            # update polydispersity parameter values 
    259257            if p0_index == p0_length: 
    260258                pd_index = (loop_index//pd_stride)%pd_length 
     
    267265            p0_index += 1 
    268266            if weight > cutoff: 
    269                 # Call the scattering function. 
     267                # Call the scattering function 
    270268                # Assume that NaNs are only generated if the parameters are bad; 
    271269                # exclude all q for that NaN.  Even better would be to have an 
     
    275273                    continue 
    276274 
    277                 # Update value and norm. 
     275                # update value and norm 
    278276                total += weight * Iq 
    279277                weight_norm += weight 
     
    295293    any functions that are not already marked as vectorized. 
    296294    """ 
    297     # Note: Must call create_vector_Iq before create_vector_Iqxy. 
     295    # Note: must call create_vector_Iq before create_vector_Iqxy 
    298296    _create_vector_Iq(model_info) 
    299297    _create_vector_Iqxy(model_info) 
  • sasmodels/model_test.py

    r00afc15 r5024a56  
    167167        # test using cuda if desired and available 
    168168        if 'cuda' in loaders and use_cuda(): 
    169             test_name = "%s-cuda" % model_info.id 
     169            test_name = "%s-cuda"%model_name 
    170170            test_method_name = "test_%s_cuda" % model_info.id 
    171171            # Using dtype=None so that the models that are only 
  • sasmodels/models/rpa.c

    r19dc29e7 r71b751d  
    2525  double S0ba,Pbb,S0bb,Pbc,S0bc,Pbd,S0bd; 
    2626  double S0ca,S0cb,Pcc,S0cc,Pcd,S0cd; 
    27   //double S0da,S0db,S0dc; 
     27  double S0da,S0db,S0dc; 
    2828  double Pdd,S0dd; 
    2929  double Kaa,Kbb,Kcc; 
    3030  double Kba,Kca,Kcb; 
    31   //double Kda,Kdb,Kdc,Kdd; 
     31  double Kda,Kdb,Kdc,Kdd; 
    3232  double Zaa,Zab,Zac,Zba,Zbb,Zbc,Zca,Zcb,Zcc; 
    3333  double DenT,T11,T12,T13,T21,T22,T23,T31,T32,T33; 
     
    3636  double N11,N12,N13,N21,N22,N23,N31,N32,N33; 
    3737  double M11,M12,M13,M21,M22,M23,M31,M32,M33; 
    38   double S11,S12,S22,S23,S13,S33; 
    39   //double S21,S31,S32,S44;  
    40   //double S14,S24,S34,S41,S42,S43; 
     38  double S11,S12,S13,S14,S21,S22,S23,S24; 
     39  double S31,S32,S33,S34,S41,S42,S43,S44; 
    4140  double Lad,Lbd,Lcd,Nav,Intg; 
    4241 
     
    116115  S0cd=(Phicd*vcd*Ncd)*Pcd; 
    117116 
    118   //S0da=S0ad; 
    119   //S0db=S0bd; 
    120   //S0dc=S0cd; 
     117  S0da=S0ad; 
     118  S0db=S0bd; 
     119  S0dc=S0cd; 
    121120  Pdd=2.0*(exp(-Xd)-1.0+Xd)/(Xd*Xd); // free D chain 
    122121  S0dd=N[3]*Phi[3]*v[3]*Pdd; 
     
    199198  S0ca=S0ac; 
    200199  S0cb=S0bc; 
    201   //S0da=S0ad; 
    202   //S0db=S0bd; 
    203   //S0dc=S0cd; 
     200  S0da=S0ad; 
     201  S0db=S0bd; 
     202  S0dc=S0cd; 
    204203 
    205204  // self chi parameter is 0 ... of course 
     
    207206  Kbb=0.0; 
    208207  Kcc=0.0; 
    209   //Kdd=0.0; 
     208  Kdd=0.0; 
    210209 
    211210  Kba=Kab; 
    212211  Kca=Kac; 
    213212  Kcb=Kbc; 
    214   //Kda=Kad; 
    215   //Kdb=Kbd; 
    216   //Kdc=Kcd; 
     213  Kda=Kad; 
     214  Kdb=Kbd; 
     215  Kdc=Kcd; 
    217216 
    218217  Zaa=Kaa-Kad-Kad; 
     
    295294  S12= Q12*S0aa + Q22*S0ab + Q32*S0ac; 
    296295  S13= Q13*S0aa + Q23*S0ab + Q33*S0ac; 
     296  S14=-S11-S12-S13; 
     297  S21= Q11*S0ba + Q21*S0bb + Q31*S0bc; 
    297298  S22= Q12*S0ba + Q22*S0bb + Q32*S0bc; 
    298299  S23= Q13*S0ba + Q23*S0bb + Q33*S0bc; 
     300  S24=-S21-S22-S23; 
     301  S31= Q11*S0ca + Q21*S0cb + Q31*S0cc; 
     302  S32= Q12*S0ca + Q22*S0cb + Q32*S0cc; 
    299303  S33= Q13*S0ca + Q23*S0cb + Q33*S0cc; 
    300   //S21= Q11*S0ba + Q21*S0bb + Q31*S0bc; 
    301   //S31= Q11*S0ca + Q21*S0cb + Q31*S0cc; 
    302   //S32= Q12*S0ca + Q22*S0cb + Q32*S0cc; 
    303   //S44=S11+S22+S33+2.0*S12+2.0*S13+2.0*S23; 
    304   //S14=-S11-S12-S13; 
    305   //S24=-S21-S22-S23; 
    306   //S34=-S31-S32-S33; 
    307   //S41=S14; 
    308   //S42=S24; 
    309   //S43=S34; 
     304  S34=-S31-S32-S33; 
     305  S41=S14; 
     306  S42=S24; 
     307  S43=S34; 
     308  S44=S11+S22+S33+2.0*S12+2.0*S13+2.0*S23; 
    310309 
    311310  //calculate contrast where L[i] is the scattering length of i and D is the matrix 
Note: See TracChangeset for help on using the changeset viewer.