Changes in / [0d26e91:e589e9a] in sasmodels


Ignore:
Location:
sasmodels
Files:
8 edited

Legend:

Unmodified
Added
Removed
  • sasmodels/kernel.py

    r3199b17 rcd28947  
    2323# pylint: enable=unused-import 
    2424 
    25  
    2625class KernelModel(object): 
    2726    info = None  # type: ModelInfo 
     
    3433        # type: () -> None 
    3534        pass 
    36  
    3735 
    3836class Kernel(object): 
  • sasmodels/kernelcl.py

    r3199b17 rf872fd1  
    6969import numpy as np  # type: ignore 
    7070 
    71 # Attempt to setup OpenCL. This may fail if the pyopencl package is not 
     71# Attempt to setup opencl. This may fail if the pyopencl package is not 
    7272# installed or if it is installed but there are no devices available. 
    7373try: 
     
    7575    from pyopencl import mem_flags as mf 
    7676    from pyopencl.characterize import get_fast_inaccurate_build_options 
    77     # Ask OpenCL for the default context so that we know that one exists. 
     77    # Ask OpenCL for the default context so that we know that one exists 
    7878    cl.create_some_context(interactive=False) 
    7979    HAVE_OPENCL = True 
     
    9696# pylint: enable=unused-import 
    9797 
    98  
    99 # CRUFT: pyopencl < 2017.1 (as of June 2016 needs quotes around include path). 
     98# CRUFT: pyopencl < 2017.1  (as of June 2016 needs quotes around include path) 
    10099def quote_path(v): 
    101100    """ 
     
    108107    return '"'+v+'"' if v and ' ' in v and not v[0] in "\"'-" else v 
    109108 
    110  
    111109def fix_pyopencl_include(): 
    112110    """ 
     
    115113    import pyopencl as cl 
    116114    if hasattr(cl, '_DEFAULT_INCLUDE_OPTIONS'): 
    117         cl._DEFAULT_INCLUDE_OPTIONS = [ 
    118             quote_path(v) for v in cl._DEFAULT_INCLUDE_OPTIONS 
    119             ] 
    120  
     115        cl._DEFAULT_INCLUDE_OPTIONS = [quote_path(v) for v in cl._DEFAULT_INCLUDE_OPTIONS] 
    121116 
    122117if HAVE_OPENCL: 
     
    131126MAX_LOOPS = 2048 
    132127 
     128 
    133129# Pragmas for enable OpenCL features.  Be sure to protect them so that they 
    134130# still compile even if OpenCL is not present. 
     
    145141""" 
    146142 
    147  
    148143def use_opencl(): 
    149144    sas_opencl = os.environ.get("SAS_OPENCL", "OpenCL").lower() 
    150145    return HAVE_OPENCL and sas_opencl != "none" and not sas_opencl.startswith("cuda") 
    151146 
    152  
    153147ENV = None 
    154148def reset_environment(): 
     
    158152    global ENV 
    159153    ENV = GpuEnvironment() if use_opencl() else None 
    160  
    161154 
    162155def environment(): 
     
    176169    return ENV 
    177170 
    178  
    179171def has_type(device, dtype): 
    180172    # type: (cl.Device, np.dtype) -> bool 
     
    187179        return "cl_khr_fp64" in device.extensions 
    188180    else: 
    189         # Not supporting F16 type since it isn't accurate enough. 
     181        # Not supporting F16 type since it isn't accurate enough 
    190182        return False 
    191  
    192183 
    193184def get_warp(kernel, queue): 
     
    199190        cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE, 
    200191        queue.device) 
    201  
    202192 
    203193def compile_model(context, source, dtype, fast=False): 
     
    221211        source_list.insert(0, _F64_PRAGMA) 
    222212 
    223     # Note: USE_SINCOS makes the Intel CPU slower under OpenCL. 
     213    # Note: USE_SINCOS makes the intel cpu slower under opencl 
    224214    if context.devices[0].type == cl.device_type.GPU: 
    225215        source_list.insert(0, "#define USE_SINCOS\n") 
     
    228218    source = "\n".join(source_list) 
    229219    program = cl.Program(context, source).build(options=options) 
    230  
    231220    #print("done with "+program) 
    232221    return program 
    233222 
    234223 
    235 # For now, this returns one device in the context. 
    236 # TODO: Create a context that contains all devices on all platforms. 
     224# for now, this returns one device in the context 
     225# TODO: create a context that contains all devices on all platforms 
    237226class GpuEnvironment(object): 
    238227    """ 
    239     GPU context for OpenCL, with possibly many devices and one queue per device. 
     228    GPU context, with possibly many devices, and one queue per device. 
     229 
     230    Because the environment can be reset during a live program (e.g., if the 
     231    user changes the active GPU device in the GUI), everything associated 
     232    with the device context must be cached in the environment and recreated 
     233    if the environment changes.  The *cache* attribute is a simple dictionary 
     234    which holds keys and references to objects, such as compiled kernels and 
     235    allocated buffers.  The running program should check in the cache for 
     236    long lived objects and create them if they are not there.  The program 
     237    should not hold onto cached objects, but instead only keep them active 
     238    for the duration of a function call.  When the environment is destroyed 
     239    then the *release* method for each active cache item is called before 
     240    the environment is freed.  This means that each cl buffer should be 
     241    in its own cache entry. 
    240242    """ 
    241243    def __init__(self): 
    242244        # type: () -> None 
    243         # Find gpu context. 
     245        # find gpu context 
    244246        context_list = _create_some_context() 
    245247 
     
    255257                self.context[dtype] = None 
    256258 
    257         # Build a queue for each context. 
     259        # Build a queue for each context 
    258260        self.queue = {} 
    259261        context = self.context[F32] 
     
    265267            self.queue[F64] = cl.CommandQueue(context, context.devices[0]) 
    266268 
    267         ## Byte boundary for data alignment. 
     269        # Byte boundary for data alignment 
    268270        #self.data_boundary = max(context.devices[0].min_data_type_align_size 
    269271        #                         for context in self.context.values()) 
    270272 
    271         # Cache for compiled programs, and for items in context. 
     273        # Cache for compiled programs, and for items in context 
    272274        self.compiled = {} 
     275        self.cache = {} 
    273276 
    274277    def has_type(self, dtype): 
     
    285288        """ 
    286289        # Note: PyOpenCL caches based on md5 hash of source, options and device 
    287         # but I'll do so as well just to save some data munging time. 
     290        # so we don't really need to cache things for ourselves.  I'll do so 
     291        # anyway just to save some data munging time. 
    288292        tag = generate.tag_source(source) 
    289293        key = "%s-%s-%s%s"%(name, dtype, tag, ("-fast" if fast else "")) 
    290         # Check timestamp on program. 
     294        # Check timestamp on program 
    291295        program, program_timestamp = self.compiled.get(key, (None, np.inf)) 
    292296        if program_timestamp < timestamp: 
     
    301305        return program 
    302306 
     307    def free_buffer(self, key): 
     308        if key in self.cache: 
     309            self.cache[key].release() 
     310            del self.cache[key] 
     311 
     312    def __del__(self): 
     313        for v in self.cache.values(): 
     314            release = getattr(v, 'release', lambda: None) 
     315            release() 
     316        self.cache = {} 
     317 
     318_CURRENT_ID = 0 
     319def unique_id(): 
     320    global _CURRENT_ID 
     321    _CURRENT_ID += 1 
     322    return _CURRENT_ID 
    303323 
    304324def _create_some_context(): 
     
    313333    which one (and not a CUDA device, or no GPU). 
    314334    """ 
    315     # Assume we do not get here if SAS_OPENCL is None or CUDA. 
     335    # Assume we do not get here if SAS_OPENCL is None or CUDA 
    316336    sas_opencl = os.environ.get('SAS_OPENCL', 'opencl') 
    317337    if sas_opencl.lower() != 'opencl': 
    318         # Setting PYOPENCL_CTX as a SAS_OPENCL to create cl context. 
     338        # Setting PYOPENCL_CTX as a SAS_OPENCL to create cl context 
    319339        os.environ["PYOPENCL_CTX"] = sas_opencl 
    320340 
     
    324344        except Exception as exc: 
    325345            warnings.warn(str(exc)) 
    326             warnings.warn("pyopencl.create_some_context() failed.  The " 
    327                 "environment variable 'SAS_OPENCL' or 'PYOPENCL_CTX' might " 
    328                 "not be set correctly") 
     346            warnings.warn("pyopencl.create_some_context() failed") 
     347            warnings.warn("the environment variable 'SAS_OPENCL' or 'PYOPENCL_CTX' might not be set correctly") 
    329348 
    330349    return _get_default_context() 
    331  
    332350 
    333351def _get_default_context(): 
     
    342360    # is running may increase throughput. 
    343361    # 
    344     # MacBook Pro, base install: 
     362    # Macbook pro, base install: 
    345363    #     {'Apple': [Intel CPU, NVIDIA GPU]} 
    346     # MacBook Pro, base install: 
     364    # Macbook pro, base install: 
    347365    #     {'Apple': [Intel CPU, Intel GPU]} 
    348     # 2 x NVIDIA 295 with Intel and NVIDIA opencl drivers install: 
     366    # 2 x nvidia 295 with Intel and NVIDIA opencl drivers installed 
    349367    #     {'Intel': [CPU], 'NVIDIA': [GPU, GPU, GPU, GPU]} 
    350368    gpu, cpu = None, None 
     
    369387            else: 
    370388                # System has cl.device_type.ACCELERATOR or cl.device_type.CUSTOM 
    371                 # Intel Phi for example registers as an accelerator. 
     389                # Intel Phi for example registers as an accelerator 
    372390                # Since the user installed a custom device on their system 
    373391                # and went through the pain of sorting out OpenCL drivers for 
     
    376394                gpu = device 
    377395 
    378     # Order the devices by gpu then by cpu; when searching for an available 
     396    # order the devices by gpu then by cpu; when searching for an available 
    379397    # device by data type they will be checked in this order, which means 
    380398    # that if the gpu supports double then the cpu will never be used (though 
     
    403421    that the compiler is allowed to take shortcuts. 
    404422    """ 
    405     info = None  # type: ModelInfo 
    406     source = ""  # type: str 
    407     dtype = None  # type: np.dtype 
    408     fast = False  # type: bool 
    409     _program = None  # type: cl.Program 
    410     _kernels = None  # type: Dict[str, cl.Kernel] 
    411  
    412423    def __init__(self, source, model_info, dtype=generate.F32, fast=False): 
    413424        # type: (Dict[str,str], ModelInfo, np.dtype, bool) -> None 
     
    416427        self.dtype = dtype 
    417428        self.fast = fast 
     429        self.timestamp = generate.ocl_timestamp(self.info) 
     430        self._cache_key = unique_id() 
    418431 
    419432    def __getstate__(self): 
     
    424437        # type: (Tuple[ModelInfo, str, np.dtype, bool]) -> None 
    425438        self.info, self.source, self.dtype, self.fast = state 
    426         self._program = self._kernels = None 
    427439 
    428440    def make_kernel(self, q_vectors): 
     
    430442        return GpuKernel(self, q_vectors) 
    431443 
    432     def get_function(self, name): 
     444    @property 
     445    def Iq(self): 
     446        return self._fetch_kernel('Iq') 
     447 
     448    def fetch_kernel(self, name): 
    433449        # type: (str) -> cl.Kernel 
    434450        """ 
     
    436452        does not already exist. 
    437453        """ 
    438         if self._program is None: 
    439             self._prepare_program() 
    440         return self._kernels[name] 
    441  
    442     def _prepare_program(self): 
    443         # type: (str) -> None 
    444         env = environment() 
    445         timestamp = generate.ocl_timestamp(self.info) 
    446         program = env.compile_program( 
    447             self.info.name, 
    448             self.source['opencl'], 
    449             self.dtype, 
    450             self.fast, 
    451             timestamp) 
    452         variants = ['Iq', 'Iqxy', 'Imagnetic'] 
    453         names = [generate.kernel_name(self.info, k) for k in variants] 
    454         functions = [getattr(program, k) for k in names] 
    455         self._kernels = {k: v for k, v in zip(variants, functions)} 
    456         # Keep a handle to program so GC doesn't collect. 
    457         self._program = program 
    458  
    459  
    460 # TODO: Check that we don't need a destructor for buffers which go out of scope. 
     454        gpu = environment() 
     455        key = self._cache_key 
     456        if key not in gpu.cache: 
     457            program = gpu.compile_program( 
     458                self.info.name, 
     459                self.source['opencl'], 
     460                self.dtype, 
     461                self.fast, 
     462                self.timestamp) 
     463            variants = ['Iq', 'Iqxy', 'Imagnetic'] 
     464            names = [generate.kernel_name(self.info, k) for k in variants] 
     465            kernels = [getattr(program, k) for k in names] 
     466            data = dict((k, v) for k, v in zip(variants, kernels)) 
     467            # keep a handle to program so GC doesn't collect 
     468            data['program'] = program 
     469            gpu.cache[key] = data 
     470        else: 
     471            data = gpu.cache[key] 
     472        return data[name] 
     473 
     474# TODO: check that we don't need a destructor for buffers which go out of scope 
    461475class GpuInput(object): 
    462476    """ 
     
    480494    def __init__(self, q_vectors, dtype=generate.F32): 
    481495        # type: (List[np.ndarray], np.dtype) -> None 
    482         # TODO: Do we ever need double precision q? 
     496        # TODO: do we ever need double precision q? 
    483497        self.nq = q_vectors[0].size 
    484498        self.dtype = np.dtype(dtype) 
    485499        self.is_2d = (len(q_vectors) == 2) 
    486         # TODO: Stretch input based on get_warp(). 
    487         # Not doing it now since warp depends on kernel, which is not known 
     500        # TODO: stretch input based on get_warp() 
     501        # not doing it now since warp depends on kernel, which is not known 
    488502        # at this point, so instead using 32, which is good on the set of 
    489503        # architectures tested so far. 
     
    498512            self.q[:self.nq] = q_vectors[0] 
    499513        self.global_size = [self.q.shape[0]] 
    500         #print("creating inputs of size", self.global_size) 
    501  
    502         # Transfer input value to GPU. 
     514        self._cache_key = unique_id() 
     515 
     516    @property 
     517    def q_b(self): 
     518        """Lazy creation of q buffer so it can survive context reset""" 
    503519        env = environment() 
    504         context = env.context[self.dtype] 
    505         self.q_b = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, 
    506                              hostbuf=self.q) 
     520        key = self._cache_key 
     521        if key not in env.cache: 
     522            context = env.context[self.dtype] 
     523            #print("creating inputs of size", self.global_size) 
     524            buffer = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, 
     525                               hostbuf=self.q) 
     526            env.cache[key] = buffer 
     527        return env.cache[key] 
    507528 
    508529    def release(self): 
    509530        # type: () -> None 
    510531        """ 
    511         Free the buffer associated with the q value. 
    512         """ 
    513         if self.q_b is not None: 
    514             self.q_b.release() 
    515             self.q_b = None 
     532        Free the buffer associated with the q value 
     533        """ 
     534        environment().free_buffer(id(self)) 
    516535 
    517536    def __del__(self): 
     
    519538        self.release() 
    520539 
    521  
    522540class GpuKernel(Kernel): 
    523541    """ 
     
    526544    *model* is the GpuModel object to call 
    527545 
    528     The kernel is derived from :class:`Kernel`, providing the 
    529     :meth:`call_kernel` method to evaluate the kernel for a given set of 
    530     parameters.  Because of the need to move the q values to the GPU before 
    531     evaluation, the kernel is instantiated for a particular set of q vectors, 
    532     and can be called many times without transfering q each time. 
     546    The following attributes are defined: 
     547 
     548    *info* is the module information 
     549 
     550    *dtype* is the kernel precision 
     551 
     552    *dim* is '1d' or '2d' 
     553 
     554    *result* is a vector to contain the results of the call 
     555 
     556    The resulting call method takes the *pars*, a list of values for 
     557    the fixed parameters to the kernel, and *pd_pars*, a list of (value,weight) 
     558    vectors for the polydisperse parameters.  *cutoff* determines the 
     559    integration limits: any points with combined weight less than *cutoff* 
     560    will not be calculated. 
    533561 
    534562    Call :meth:`release` when done with the kernel instance. 
    535563    """ 
    536     #: SAS model information structure. 
    537     info = None  # type: ModelInfo 
    538     #: Kernel precision. 
    539     dtype = None  # type: np.dtype 
    540     #: Kernel dimensions (1d or 2d). 
    541     dim = ""  # type: str 
    542     #: Calculation results, updated after each call to :meth:`_call_kernel`. 
    543     result = None  # type: np.ndarray 
    544  
    545564    def __init__(self, model, q_vectors): 
    546         # type: (GpuModel, List[np.ndarray]) -> None 
     565        # type: (cl.Kernel, np.dtype, ModelInfo, List[np.ndarray]) -> None 
    547566        dtype = model.dtype 
    548567        self.q_input = GpuInput(q_vectors, dtype) 
    549568        self._model = model 
    550  
    551         # Attributes accessed from the outside. 
     569        # F16 isn't sufficient, so don't support it 
     570        self._as_dtype = np.float64 if dtype == generate.F64 else np.float32 
     571        self._cache_key = unique_id() 
     572 
     573        # attributes accessed from the outside 
    552574        self.dim = '2d' if self.q_input.is_2d else '1d' 
    553575        self.info = model.info 
    554         self.dtype = dtype 
    555  
    556         # Converter to translate input to target type. 
    557         self._as_dtype = np.float64 if dtype == generate.F64 else np.float32 
    558  
    559         # Holding place for the returned value. 
     576        self.dtype = model.dtype 
     577 
     578        # holding place for the returned value 
    560579        nout = 2 if self.info.have_Fq and self.dim == '1d' else 1 
    561         extra_q = 4  # Total weight, form volume, shell volume and R_eff. 
    562         self.result = np.empty(self.q_input.nq*nout + extra_q, dtype) 
    563  
    564         # Allocate result value on GPU. 
     580        extra_q = 4  # total weight, form volume, shell volume and R_eff 
     581        self.result = np.empty(self.q_input.nq*nout+extra_q, dtype) 
     582 
     583    @property 
     584    def _result_b(self): 
     585        """Lazy creation of result buffer so it can survive context reset""" 
    565586        env = environment() 
    566         context = env.context[self.dtype] 
    567         width = ((self.result.size+31)//32)*32 * self.dtype.itemsize 
    568         self._result_b = cl.Buffer(context, mf.READ_WRITE, width) 
    569  
    570     def _call_kernel(self, call_details, values, cutoff, magnetic, 
    571                      effective_radius_type): 
    572         # type: (CallDetails, np.ndarray, float, bool, int) -> np.ndarray 
     587        key = self._cache_key 
     588        if key not in env.cache: 
     589            context = env.context[self.dtype] 
     590            width = ((self.result.size+31)//32)*32 * self.dtype.itemsize 
     591            buffer = cl.Buffer(context, mf.READ_WRITE, width) 
     592            env.cache[key] = buffer 
     593        return env.cache[key] 
     594 
     595    def _call_kernel(self, call_details, values, cutoff, magnetic, effective_radius_type): 
     596        # type: (CallDetails, np.ndarray, np.ndarray, float, bool) -> np.ndarray 
    573597        env = environment() 
    574598        queue = env.queue[self._model.dtype] 
    575599        context = queue.context 
    576600 
    577         # Arrange data transfer to card. 
     601        # Arrange data transfer to/from card 
     602        q_b = self.q_input.q_b 
     603        result_b = self._result_b 
    578604        details_b = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, 
    579605                              hostbuf=call_details.buffer) 
     
    581607                             hostbuf=values) 
    582608 
    583         # Setup kernel function and arguments. 
    584609        name = 'Iq' if self.dim == '1d' else 'Imagnetic' if magnetic else 'Iqxy' 
    585         kernel = self._model.get_function(name) 
     610        kernel = self._model.fetch_kernel(name) 
    586611        kernel_args = [ 
    587             np.uint32(self.q_input.nq),  # Number of inputs. 
    588             None,  # Placeholder for pd_start. 
    589             None,  # Placeholder for pd_stop. 
    590             details_b,  # Problem definition. 
    591             values_b,  # Parameter values. 
    592             self.q_input.q_b,  # Q values. 
    593             self._result_b,   # Result storage. 
    594             self._as_dtype(cutoff),  # Probability cutoff. 
    595             np.uint32(effective_radius_type),  # R_eff mode. 
     612            np.uint32(self.q_input.nq), None, None, 
     613            details_b, values_b, q_b, result_b, 
     614            self._as_dtype(cutoff), 
     615            np.uint32(effective_radius_type), 
    596616        ] 
    597  
    598         # Call kernel and retrieve results. 
    599617        #print("Calling OpenCL") 
    600618        #call_details.show(values) 
     619        #Call kernel and retrieve results 
    601620        wait_for = None 
    602621        last_nap = clock() 
     
    609628                               *kernel_args, wait_for=wait_for)] 
    610629            if stop < call_details.num_eval: 
    611                 # Allow other processes to run. 
     630                # Allow other processes to run 
    612631                wait_for[0].wait() 
    613632                current_time = clock() 
     
    615634                    time.sleep(0.001) 
    616635                    last_nap = current_time 
    617         cl.enqueue_copy(queue, self.result, self._result_b, wait_for=wait_for) 
     636        cl.enqueue_copy(queue, self.result, result_b, wait_for=wait_for) 
    618637        #print("result", self.result) 
    619638 
    620         # Free buffers. 
    621         details_b.release() 
    622         values_b.release() 
     639        # Free buffers 
     640        for v in (details_b, values_b): 
     641            if v is not None: 
     642                v.release() 
    623643 
    624644    def release(self): 
     
    627647        Release resources associated with the kernel. 
    628648        """ 
     649        environment().free_buffer(id(self)) 
    629650        self.q_input.release() 
    630         if self._result_b is not None: 
    631             self._result_b.release() 
    632             self._result_b = None 
    633651 
    634652    def __del__(self): 
  • sasmodels/kernelcuda.py

    rfa26e78 rf872fd1  
    6363import time 
    6464import re 
    65 import atexit 
    6665 
    6766import numpy as np  # type: ignore 
    6867 
    6968 
    70 # Attempt to setup CUDA. This may fail if the pycuda package is not 
     69# Attempt to setup cuda. This may fail if the pycuda package is not 
    7170# installed or if it is installed but there are no devices available. 
    7271try: 
     
    108107MAX_LOOPS = 2048 
    109108 
    110  
    111109def use_cuda(): 
    112     sas_opencl = os.environ.get("SAS_OPENCL", "CUDA").lower() 
    113     return HAVE_CUDA and sas_opencl.startswith("cuda") 
    114  
     110    env = os.environ.get("SAS_OPENCL", "").lower() 
     111    return HAVE_CUDA and (env == "" or env.startswith("cuda")) 
    115112 
    116113ENV = None 
     
    124121        ENV.release() 
    125122    ENV = GpuEnvironment() if use_cuda() else None 
    126  
    127123 
    128124def environment(): 
     
    142138    return ENV 
    143139 
    144  
    145 # PyTest is not freeing ENV, so make sure it gets freed. 
    146 atexit.register(lambda: ENV.release() if ENV is not None else None) 
    147  
    148  
    149140def has_type(dtype): 
    150141    # type: (np.dtype) -> bool 
     
    152143    Return true if device supports the requested precision. 
    153144    """ 
    154     # Assume the NVIDIA card supports 32-bit and 64-bit floats. 
    155     # TODO: Check if pycuda support F16. 
     145    # Assume the nvidia card supports 32-bit and 64-bit floats. 
     146    # TODO: check if pycuda support F16 
    156147    return dtype in (generate.F32, generate.F64) 
    157148 
    158149 
    159150FUNCTION_PATTERN = re.compile(r"""^ 
    160   (?P<space>\s*)                       # Initial space. 
    161   (?P<qualifiers>^(?:\s*\b\w+\b\s*)+)  # One or more qualifiers before function. 
    162   (?P<function>\s*\b\w+\b\s*[(])       # Function name plus open parens. 
     151  (?P<space>\s*)                   # initial space 
     152  (?P<qualifiers>^(?:\s*\b\w+\b\s*)+) # one or more qualifiers before function 
     153  (?P<function>\s*\b\w+\b\s*[(])      # function name plus open parens 
    163154  """, re.VERBOSE|re.MULTILINE) 
    164155 
     
    167158  """, re.VERBOSE|re.MULTILINE) 
    168159 
    169  
    170160def _add_device_tag(match): 
    171161    # type: (None) -> str 
    172     # Note: Should be re.Match, but that isn't a simple type. 
     162    # Note: should be re.Match, but that isn't a simple type 
    173163    """ 
    174164    replace qualifiers with __device__ qualifiers if needed 
     
    183173        return "".join((space, "__device__ ", qualifiers, function)) 
    184174 
    185  
    186175def mark_device_functions(source): 
    187176    # type: (str) -> str 
     
    190179    """ 
    191180    return FUNCTION_PATTERN.sub(_add_device_tag, source) 
    192  
    193181 
    194182def show_device_functions(source): 
     
    200188        print(match.group('qualifiers').replace('\n',r'\n'), match.group('function'), '(') 
    201189    return source 
    202  
    203190 
    204191def compile_model(source, dtype, fast=False): 
     
    225212    #options = ['--verbose', '-E'] 
    226213    options = ['--use_fast_math'] if fast else None 
    227     program = SourceModule(source, no_extern_c=True, options=options) #, include_dirs=[...]) 
     214    program = SourceModule(source, no_extern_c=True, options=options) # include_dirs=[...] 
    228215 
    229216    #print("done with "+program) 
     
    231218 
    232219 
    233 # For now, this returns one device in the context. 
    234 # TODO: Create a context that contains all devices on all platforms. 
     220# for now, this returns one device in the context 
     221# TODO: create a context that contains all devices on all platforms 
    235222class GpuEnvironment(object): 
    236223    """ 
    237     GPU context for CUDA. 
     224    GPU context, with possibly many devices, and one queue per device. 
    238225    """ 
    239226    context = None # type: cuda.Context 
    240227    def __init__(self, devnum=None): 
    241228        # type: (int) -> None 
     229        # Byte boundary for data alignment 
     230        #self.data_boundary = max(d.min_data_type_align_size 
     231        #                         for d in self.context.devices) 
     232        self.compiled = {} 
    242233        env = os.environ.get("SAS_OPENCL", "").lower() 
    243234        if devnum is None and env.startswith("cuda:"): 
    244235            devnum = int(env[5:]) 
    245  
    246236        # Set the global context to the particular device number if one is 
    247237        # given, otherwise use the default context.  Perhaps this will be set 
     
    252242            self.context = make_default_context() 
    253243 
    254         ## Byte boundary for data alignment. 
    255         #self.data_boundary = max(d.min_data_type_align_size 
    256         #                         for d in self.context.devices) 
    257  
    258         # Cache for compiled programs, and for items in context. 
    259         self.compiled = {} 
    260  
    261244    def release(self): 
    262245        if self.context is not None: 
     
    279262        Compile the program for the device in the given context. 
    280263        """ 
    281         # Note: PyCuda (probably) caches but I'll do so as well just to 
    282         # save some data munging time. 
     264        # Note: PyOpenCL caches based on md5 hash of source, options and device 
     265        # so we don't really need to cache things for ourselves.  I'll do so 
     266        # anyway just to save some data munging time. 
    283267        tag = generate.tag_source(source) 
    284268        key = "%s-%s-%s%s"%(name, dtype, tag, ("-fast" if fast else "")) 
    285         # Check timestamp on program. 
     269        # Check timestamp on program 
    286270        program, program_timestamp = self.compiled.get(key, (None, np.inf)) 
    287271        if program_timestamp < timestamp: 
     
    293277        return program 
    294278 
    295  
    296279class GpuModel(KernelModel): 
    297280    """ 
     
    309292    that the compiler is allowed to take shortcuts. 
    310293    """ 
    311     info = None  # type: ModelInfo 
    312     source = ""  # type: str 
    313     dtype = None  # type: np.dtype 
    314     fast = False  # type: bool 
    315     _program = None # type: SourceModule 
    316     _kernels = None  # type: Dict[str, cuda.Function] 
     294    info = None # type: ModelInfo 
     295    source = "" # type: str 
     296    dtype = None # type: np.dtype 
     297    fast = False # type: bool 
     298    program = None # type: SourceModule 
     299    _kernels = None # type: List[cuda.Function] 
    317300 
    318301    def __init__(self, source, model_info, dtype=generate.F32, fast=False): 
     
    322305        self.dtype = dtype 
    323306        self.fast = fast 
     307        self.program = None # delay program creation 
     308        self._kernels = None 
    324309 
    325310    def __getstate__(self): 
     
    330315        # type: (Tuple[ModelInfo, str, np.dtype, bool]) -> None 
    331316        self.info, self.source, self.dtype, self.fast = state 
    332         self._program = self._kernels = None 
     317        self.program = None 
    333318 
    334319    def make_kernel(self, q_vectors): 
    335320        # type: (List[np.ndarray]) -> "GpuKernel" 
    336         return GpuKernel(self, q_vectors) 
    337  
    338     def get_function(self, name): 
    339         # type: (str) -> cuda.Function 
    340         """ 
    341         Fetch the kernel from the environment by name, compiling it if it 
    342         does not already exist. 
    343         """ 
    344         if self._program is None: 
    345             self._prepare_program() 
    346         return self._kernels[name] 
    347  
    348     def _prepare_program(self): 
    349         # type: (str) -> None 
    350         env = environment() 
    351         timestamp = generate.ocl_timestamp(self.info) 
    352         program = env.compile_program( 
    353             self.info.name, 
    354             self.source['opencl'], 
    355             self.dtype, 
    356             self.fast, 
    357             timestamp) 
    358         variants = ['Iq', 'Iqxy', 'Imagnetic'] 
    359         names = [generate.kernel_name(self.info, k) for k in variants] 
    360         functions = [program.get_function(k) for k in names] 
    361         self._kernels = {k: v for k, v in zip(variants, functions)} 
    362         # Keep a handle to program so GC doesn't collect. 
    363         self._program = program 
    364  
    365  
    366 # TODO: Check that we don't need a destructor for buffers which go out of scope. 
     321        if self.program is None: 
     322            compile_program = environment().compile_program 
     323            timestamp = generate.ocl_timestamp(self.info) 
     324            self.program = compile_program( 
     325                self.info.name, 
     326                self.source['opencl'], 
     327                self.dtype, 
     328                self.fast, 
     329                timestamp) 
     330            variants = ['Iq', 'Iqxy', 'Imagnetic'] 
     331            names = [generate.kernel_name(self.info, k) for k in variants] 
     332            kernels = [self.program.get_function(k) for k in names] 
     333            self._kernels = dict((k, v) for k, v in zip(variants, kernels)) 
     334        is_2d = len(q_vectors) == 2 
     335        if is_2d: 
     336            kernel = [self._kernels['Iqxy'], self._kernels['Imagnetic']] 
     337        else: 
     338            kernel = [self._kernels['Iq']]*2 
     339        return GpuKernel(kernel, self.dtype, self.info, q_vectors) 
     340 
     341    def release(self): 
     342        # type: () -> None 
     343        """ 
     344        Free the resources associated with the model. 
     345        """ 
     346        if self.program is not None: 
     347            self.program = None 
     348 
     349    def __del__(self): 
     350        # type: () -> None 
     351        self.release() 
     352 
     353# TODO: check that we don't need a destructor for buffers which go out of scope 
    367354class GpuInput(object): 
    368355    """ 
     
    386373    def __init__(self, q_vectors, dtype=generate.F32): 
    387374        # type: (List[np.ndarray], np.dtype) -> None 
    388         # TODO: Do we ever need double precision q? 
     375        # TODO: do we ever need double precision q? 
    389376        self.nq = q_vectors[0].size 
    390377        self.dtype = np.dtype(dtype) 
    391378        self.is_2d = (len(q_vectors) == 2) 
    392         # TODO: Stretch input based on get_warp(). 
    393         # Not doing it now since warp depends on kernel, which is not known 
     379        # TODO: stretch input based on get_warp() 
     380        # not doing it now since warp depends on kernel, which is not known 
    394381        # at this point, so instead using 32, which is good on the set of 
    395382        # architectures tested so far. 
    396383        if self.is_2d: 
    397             width = ((self.nq+15)//16)*16 
     384            # Note: 16 rather than 15 because result is 1 longer than input. 
     385            width = ((self.nq+16)//16)*16 
    398386            self.q = np.empty((width, 2), dtype=dtype) 
    399387            self.q[:self.nq, 0] = q_vectors[0] 
    400388            self.q[:self.nq, 1] = q_vectors[1] 
    401389        else: 
    402             width = ((self.nq+31)//32)*32 
     390            # Note: 32 rather than 31 because result is 1 longer than input. 
     391            width = ((self.nq+32)//32)*32 
    403392            self.q = np.empty(width, dtype=dtype) 
    404393            self.q[:self.nq] = q_vectors[0] 
    405394        self.global_size = [self.q.shape[0]] 
    406395        #print("creating inputs of size", self.global_size) 
    407  
    408         # Transfer input value to GPU. 
    409396        self.q_b = cuda.to_device(self.q) 
    410397 
     
    412399        # type: () -> None 
    413400        """ 
    414         Free the buffer associated with the q value. 
     401        Free the memory. 
    415402        """ 
    416403        if self.q_b is not None: 
     
    422409        self.release() 
    423410 
    424  
    425411class GpuKernel(Kernel): 
    426412    """ 
    427413    Callable SAS kernel. 
    428414 
    429     *model* is the GpuModel object to call 
    430  
    431     The kernel is derived from :class:`Kernel`, providing the 
    432     :meth:`call_kernel` method to evaluate the kernel for a given set of 
    433     parameters.  Because of the need to move the q values to the GPU before 
    434     evaluation, the kernel is instantiated for a particular set of q vectors, 
    435     and can be called many times without transfering q each time. 
     415    *kernel* is the GpuKernel object to call 
     416 
     417    *model_info* is the module information 
     418 
     419    *q_vectors* is the q vectors at which the kernel should be evaluated 
     420 
     421    *dtype* is the kernel precision 
     422 
     423    The resulting call method takes the *pars*, a list of values for 
     424    the fixed parameters to the kernel, and *pd_pars*, a list of (value,weight) 
     425    vectors for the polydisperse parameters.  *cutoff* determines the 
     426    integration limits: any points with combined weight less than *cutoff* 
     427    will not be calculated. 
    436428 
    437429    Call :meth:`release` when done with the kernel instance. 
    438430    """ 
    439     #: SAS model information structure. 
    440     info = None  # type: ModelInfo 
    441     #: Kernel precision. 
    442     dtype = None  # type: np.dtype 
    443     #: Kernel dimensions (1d or 2d). 
    444     dim = ""  # type: str 
    445     #: Calculation results, updated after each call to :meth:`_call_kernel`. 
    446     result = None  # type: np.ndarray 
    447  
    448     def __init__(self, model, q_vectors): 
    449         # type: (GpuModel, List[np.ndarray]) -> None 
    450         dtype = model.dtype 
     431    def __init__(self, kernel, dtype, model_info, q_vectors): 
     432        # type: (cl.Kernel, np.dtype, ModelInfo, List[np.ndarray]) -> None 
    451433        self.q_input = GpuInput(q_vectors, dtype) 
    452         self._model = model 
    453  
    454         # Attributes accessed from the outside. 
     434        self.kernel = kernel 
     435        # F16 isn't sufficient, so don't support it 
     436        self._as_dtype = np.float64 if dtype == generate.F64 else np.float32 
     437 
     438        # attributes accessed from the outside 
    455439        self.dim = '2d' if self.q_input.is_2d else '1d' 
    456         self.info = model.info 
     440        self.info = model_info 
    457441        self.dtype = dtype 
    458442 
    459         # Converter to translate input to target type. 
    460         self._as_dtype = np.float64 if dtype == generate.F64 else np.float32 
    461  
    462         # Holding place for the returned value. 
     443        # holding place for the returned value 
    463444        nout = 2 if self.info.have_Fq and self.dim == '1d' else 1 
    464         extra_q = 4  # Total weight, form volume, shell volume and R_eff. 
    465         self.result = np.empty(self.q_input.nq*nout + extra_q, dtype) 
    466  
    467         # Allocate result value on GPU. 
     445        extra_q = 4  # total weight, form volume, shell volume and R_eff 
     446        self.result = np.empty(self.q_input.nq*nout+extra_q, dtype) 
     447 
     448        # Inputs and outputs for each kernel call 
     449        # Note: res may be shorter than res_b if global_size != nq 
    468450        width = ((self.result.size+31)//32)*32 * self.dtype.itemsize 
    469         self._result_b = cuda.mem_alloc(width) 
    470  
    471     def _call_kernel(self, call_details, values, cutoff, magnetic, 
    472                      effective_radius_type): 
    473         # type: (CallDetails, np.ndarray, float, bool, int) -> np.ndarray 
    474  
    475         # Arrange data transfer to card. 
     451        self.result_b = cuda.mem_alloc(width) 
     452        self._need_release = [self.result_b] 
     453 
     454    def _call_kernel(self, call_details, values, cutoff, magnetic, effective_radius_type): 
     455        # type: (CallDetails, np.ndarray, np.ndarray, float, bool) -> np.ndarray 
     456        # Arrange data transfer to card 
    476457        details_b = cuda.to_device(call_details.buffer) 
    477458        values_b = cuda.to_device(values) 
    478459 
    479         # Setup kernel function and arguments. 
    480         name = 'Iq' if self.dim == '1d' else 'Imagnetic' if magnetic else 'Iqxy' 
    481         kernel = self._model.get_function(name) 
    482         kernel_args = [ 
    483             np.uint32(self.q_input.nq),  # Number of inputs. 
    484             None,  # Placeholder for pd_start. 
    485             None,  # Placeholder for pd_stop. 
    486             details_b,  # Problem definition. 
    487             values_b,  # Parameter values. 
    488             self.q_input.q_b,  # Q values. 
    489             self._result_b,   # Result storage. 
    490             self._as_dtype(cutoff),  # Probability cutoff. 
    491             np.uint32(effective_radius_type),  # R_eff mode. 
     460        kernel = self.kernel[1 if magnetic else 0] 
     461        args = [ 
     462            np.uint32(self.q_input.nq), None, None, 
     463            details_b, values_b, self.q_input.q_b, self.result_b, 
     464            self._as_dtype(cutoff), 
     465            np.uint32(effective_radius_type), 
    492466        ] 
    493467        grid = partition(self.q_input.nq) 
    494  
    495         # Call kernel and retrieve results. 
    496         #print("Calling CUDA") 
     468        #print("Calling OpenCL") 
    497469        #call_details.show(values) 
     470        # Call kernel and retrieve results 
    498471        last_nap = time.clock() 
    499472        step = 100000000//self.q_input.nq + 1 
     
    502475            stop = min(start + step, call_details.num_eval) 
    503476            #print("queuing",start,stop) 
    504             kernel_args[1:3] = [np.int32(start), np.int32(stop)] 
    505             kernel(*kernel_args, **grid) 
     477            args[1:3] = [np.int32(start), np.int32(stop)] 
     478            kernel(*args, **grid) 
    506479            if stop < call_details.num_eval: 
    507480                sync() 
    508                 # Allow other processes to run. 
     481                # Allow other processes to run 
    509482                current_time = time.clock() 
    510483                if current_time - last_nap > 0.5: 
     
    512485                    last_nap = current_time 
    513486        sync() 
    514         cuda.memcpy_dtoh(self.result, self._result_b) 
     487        cuda.memcpy_dtoh(self.result, self.result_b) 
    515488        #print("result", self.result) 
    516489 
     
    523496        Release resources associated with the kernel. 
    524497        """ 
    525         self.q_input.release() 
    526         if self._result_b is not None: 
    527             self._result_b.free() 
    528             self._result_b = None 
     498        for p in self._need_release: 
     499            p.free() 
     500        self._need_release = [] 
    529501 
    530502    def __del__(self): 
     
    540512    Note: Maybe context.synchronize() is sufficient. 
    541513    """ 
    542     # Create an event with which to synchronize. 
     514    #return # The following works in C++; don't know what pycuda is doing 
     515    # Create an event with which to synchronize 
    543516    done = cuda.Event() 
    544517 
     
    546519    done.record() 
    547520 
    548     # Make sure we don't hog resource while waiting to sync. 
     521    #line added to not hog resources 
    549522    while not done.query(): 
    550523        time.sleep(0.01) 
     
    552525    # Block until the GPU executes the kernel. 
    553526    done.synchronize() 
    554  
    555527    # Clean up the event; I don't think they can be reused. 
    556528    del done 
  • sasmodels/kerneldll.py

    r3199b17 re44432d  
    100100# pylint: enable=unused-import 
    101101 
    102 # Compiler output is a byte stream that needs to be decode in python 3. 
     102# Compiler output is a byte stream that needs to be decode in python 3 
    103103decode = (lambda s: s) if sys.version_info[0] < 3 else (lambda s: s.decode('utf8')) 
    104104 
     
    115115        COMPILER = "tinycc" 
    116116    elif "VCINSTALLDIR" in os.environ: 
    117         # If vcvarsall.bat has been called, then VCINSTALLDIR is in the 
    118         # environment and we can use the MSVC compiler.  Otherwise, if 
    119         # tinycc is available then use it.  Otherwise, hope that mingw 
    120         # is available. 
     117        # If vcvarsall.bat has been called, then VCINSTALLDIR is in the environment 
     118        # and we can use the MSVC compiler.  Otherwise, if tinycc is available 
     119        # the use it.  Otherwise, hope that mingw is available. 
    121120        COMPILER = "msvc" 
    122121    else: 
     
    125124    COMPILER = "unix" 
    126125 
    127 ARCH = "" if ct.sizeof(ct.c_void_p) > 4 else "x86"  # 4 byte pointers on x86. 
     126ARCH = "" if ct.sizeof(ct.c_void_p) > 4 else "x86"  # 4 byte pointers on x86 
    128127if COMPILER == "unix": 
    129     # Generic unix compile. 
    130     # On Mac users will need the X code command line tools installed. 
     128    # Generic unix compile 
     129    # On mac users will need the X code command line tools installed 
    131130    #COMPILE = "gcc-mp-4.7 -shared -fPIC -std=c99 -fopenmp -O2 -Wall %s -o %s -lm -lgomp" 
    132131    CC = "cc -shared -fPIC -std=c99 -O2 -Wall".split() 
    133     # Add OpenMP support if not running on a Mac. 
     132    # add openmp support if not running on a mac 
    134133    if sys.platform != "darwin": 
    135         # OpenMP seems to be broken on gcc 5.4.0 (ubuntu 16.04.9). 
     134        # OpenMP seems to be broken on gcc 5.4.0 (ubuntu 16.04.9) 
    136135        # Shut it off for all unix until we can investigate. 
    137136        #CC.append("-fopenmp") 
     
    145144    # vcomp90.dll on the path.  One may be found here: 
    146145    #       C:/Windows/winsxs/x86_microsoft.vc90.openmp*/vcomp90.dll 
    147     # Copy this to the python directory and uncomment the OpenMP COMPILE. 
    148     # TODO: Remove intermediate OBJ file created in the directory. 
    149     # TODO: Maybe don't use randomized name for the c file. 
    150     # TODO: Maybe ask distutils to find MSVC. 
     146    # Copy this to the python directory and uncomment the OpenMP COMPILE 
     147    # TODO: remove intermediate OBJ file created in the directory 
     148    # TODO: maybe don't use randomized name for the c file 
     149    # TODO: maybe ask distutils to find MSVC 
    151150    CC = "cl /nologo /Ox /MD /W3 /GS- /DNDEBUG".split() 
    152151    if "SAS_OPENMP" in os.environ: 
     
    173172ALLOW_SINGLE_PRECISION_DLLS = True 
    174173 
    175  
    176174def compile(source, output): 
    177175    # type: (str, str) -> None 
     
    185183    logging.info(command_str) 
    186184    try: 
    187         # Need shell=True on windows to keep console box from popping up. 
     185        # need shell=True on windows to keep console box from popping up 
    188186        shell = (os.name == 'nt') 
    189187        subprocess.check_output(command, shell=shell, stderr=subprocess.STDOUT) 
     
    194192        raise RuntimeError("compile failed.  File is in %r"%source) 
    195193 
    196  
    197194def dll_name(model_info, dtype): 
    198195    # type: (ModelInfo, np.dtype) ->  str 
     
    205202    basename += ARCH + ".so" 
    206203 
    207     # Hack to find precompiled dlls. 
     204    # Hack to find precompiled dlls 
    208205    path = joinpath(generate.DATA_PATH, '..', 'compiled_models', basename) 
    209206    if os.path.exists(path): 
     
    245242        raise ValueError("16 bit floats not supported") 
    246243    if dtype == F32 and not ALLOW_SINGLE_PRECISION_DLLS: 
    247         dtype = F64  # Force 64-bit dll. 
    248     # Note: dtype may be F128 for long double precision. 
     244        dtype = F64  # Force 64-bit dll 
     245    # Note: dtype may be F128 for long double precision 
    249246 
    250247    dll = dll_path(model_info, dtype) 
     
    257254        need_recompile = dll_time < newest_source 
    258255    if need_recompile: 
    259         # Make sure the DLL path exists. 
     256        # Make sure the DLL path exists 
    260257        if not os.path.exists(SAS_DLL_PATH): 
    261258            os.makedirs(SAS_DLL_PATH) 
     
    266263            file_handle.write(source) 
    267264        compile(source=filename, output=dll) 
    268         # Comment the following to keep the generated C file. 
    269         # Note: If there is a syntax error then compile raises an error 
     265        # comment the following to keep the generated c file 
     266        # Note: if there is a syntax error then compile raises an error 
    270267        # and the source file will not be deleted. 
    271268        os.unlink(filename) 
     
    306303        self.dllpath = dllpath 
    307304        self._dll = None  # type: ct.CDLL 
    308         self._kernels = None  # type: List[Callable, Callable] 
     305        self._kernels = None # type: List[Callable, Callable] 
    309306        self.dtype = np.dtype(dtype) 
    310307 
     
    341338        # type: (List[np.ndarray]) -> DllKernel 
    342339        q_input = PyInput(q_vectors, self.dtype) 
    343         # Note: DLL is lazy loaded. 
     340        # Note: pickle not supported for DllKernel 
    344341        if self._dll is None: 
    345342            self._load_dll() 
     
    361358        self._dll = None 
    362359 
    363  
    364360class DllKernel(Kernel): 
    365361    """ 
     
    383379    def __init__(self, kernel, model_info, q_input): 
    384380        # type: (Callable[[], np.ndarray], ModelInfo, PyInput) -> None 
    385         dtype = q_input.dtype 
     381        #,model_info,q_input) 
     382        self.kernel = kernel 
     383        self.info = model_info 
    386384        self.q_input = q_input 
    387         self.kernel = kernel 
    388  
    389         # Attributes accessed from the outside. 
     385        self.dtype = q_input.dtype 
    390386        self.dim = '2d' if q_input.is_2d else '1d' 
    391         self.info = model_info 
    392         self.dtype = dtype 
    393  
    394         # Converter to translate input to target type. 
    395         self._as_dtype = (np.float32 if dtype == generate.F32 
    396                           else np.float64 if dtype == generate.F64 
    397                           else np.float128) 
    398  
    399         # Holding place for the returned value. 
     387        # leave room for f1/f2 results in case we need to compute beta for 1d models 
    400388        nout = 2 if self.info.have_Fq else 1 
    401         extra_q = 4  # Total weight, form volume, shell volume and R_eff. 
    402         self.result = np.empty(self.q_input.nq*nout + extra_q, dtype) 
    403  
    404     def _call_kernel(self, call_details, values, cutoff, magnetic, 
    405                      effective_radius_type): 
    406         # type: (CallDetails, np.ndarray, float, bool, int) -> np.ndarray 
    407  
    408         # Setup kernel function and arguments. 
     389        # +4 for total weight, shell volume, effective radius, form volume 
     390        self.result = np.empty(q_input.nq*nout + 4, self.dtype) 
     391        self.real = (np.float32 if self.q_input.dtype == generate.F32 
     392                     else np.float64 if self.q_input.dtype == generate.F64 
     393                     else np.float128) 
     394 
     395    def _call_kernel(self, call_details, values, cutoff, magnetic, effective_radius_type): 
     396        # type: (CallDetails, np.ndarray, np.ndarray, float, bool, int) -> np.ndarray 
    409397        kernel = self.kernel[1 if magnetic else 0] 
    410         kernel_args = [ 
    411             self.q_input.nq,  # Number of inputs. 
    412             None,  # Placeholder for pd_start. 
    413             None,  # Placeholder for pd_stop. 
    414             call_details.buffer.ctypes.data,  # Problem definition. 
    415             values.ctypes.data,  # Parameter values. 
    416             self.q_input.q.ctypes.data,  # Q values. 
    417             self.result.ctypes.data,   # Result storage. 
    418             self._as_dtype(cutoff),  # Probability cutoff. 
    419             effective_radius_type,  # R_eff mode. 
     398        args = [ 
     399            self.q_input.nq, # nq 
     400            None, # pd_start 
     401            None, # pd_stop pd_stride[MAX_PD] 
     402            call_details.buffer.ctypes.data, # problem 
     403            values.ctypes.data,  # pars 
     404            self.q_input.q.ctypes.data, # q 
     405            self.result.ctypes.data,   # results 
     406            self.real(cutoff), # cutoff 
     407            effective_radius_type, # cutoff 
    420408        ] 
    421  
    422         # Call kernel and retrieve results. 
    423409        #print("Calling DLL") 
    424410        #call_details.show(values) 
    425411        step = 100 
    426         # TODO: Do we need the explicit sleep like the OpenCL and CUDA loops? 
    427412        for start in range(0, call_details.num_eval, step): 
    428413            stop = min(start + step, call_details.num_eval) 
    429             kernel_args[1:3] = [start, stop] 
    430             kernel(*kernel_args) # type: ignore 
     414            args[1:3] = [start, stop] 
     415            kernel(*args) # type: ignore 
    431416 
    432417    def release(self): 
    433418        # type: () -> None 
    434419        """ 
    435         Release resources associated with the kernel. 
     420        Release any resources associated with the kernel. 
    436421        """ 
    437         # TODO: OpenCL/CUDA allocate q_input in __init__ and free it in release. 
    438         # Should we be doing the same for DLL? 
    439         #self.q_input.release() 
    440         pass 
    441  
    442     def __del__(self): 
    443         # type: () -> None 
    444         self.release() 
     422        self.q_input.release() 
  • sasmodels/kernelpy.py

    r3199b17 re44432d  
    3333logger = logging.getLogger(__name__) 
    3434 
    35  
    3635class PyModel(KernelModel): 
    3736    """ 
     
    3938    """ 
    4039    def __init__(self, model_info): 
    41         # Make sure Iq is available and vectorized. 
     40        # Make sure Iq is available and vectorized 
    4241        _create_default_functions(model_info) 
    4342        self.info = model_info 
     
    5453        """ 
    5554        pass 
    56  
    5755 
    5856class PyInput(object): 
     
    9391        self.q = None 
    9492 
    95  
    9693class PyKernel(Kernel): 
    9794    """ 
     
    134131        parameter_vector = np.empty(len(partable.call_parameters)-2, 'd') 
    135132 
    136         # Create views into the array to hold the arguments. 
     133        # Create views into the array to hold the arguments 
    137134        offset = 0 
    138135        kernel_args, volume_args = [], [] 
     
    177174                        else (lambda mode: 1.0)) 
    178175 
     176 
     177 
    179178    def _call_kernel(self, call_details, values, cutoff, magnetic, effective_radius_type): 
    180179        # type: (CallDetails, np.ndarray, np.ndarray, float, bool) -> np.ndarray 
     
    196195        self.q_input.release() 
    197196        self.q_input = None 
    198  
    199197 
    200198def _loops(parameters,    # type: np.ndarray 
     
    256254        total = np.zeros(nq, 'd') 
    257255        for loop_index in range(call_details.num_eval): 
    258             # Update polydispersity parameter values. 
     256            # update polydispersity parameter values 
    259257            if p0_index == p0_length: 
    260258                pd_index = (loop_index//pd_stride)%pd_length 
     
    267265            p0_index += 1 
    268266            if weight > cutoff: 
    269                 # Call the scattering function. 
     267                # Call the scattering function 
    270268                # Assume that NaNs are only generated if the parameters are bad; 
    271269                # exclude all q for that NaN.  Even better would be to have an 
     
    275273                    continue 
    276274 
    277                 # Update value and norm. 
     275                # update value and norm 
    278276                total += weight * Iq 
    279277                weight_norm += weight 
     
    295293    any functions that are not already marked as vectorized. 
    296294    """ 
    297     # Note: Must call create_vector_Iq before create_vector_Iqxy. 
     295    # Note: must call create_vector_Iq before create_vector_Iqxy 
    298296    _create_vector_Iq(model_info) 
    299297    _create_vector_Iqxy(model_info) 
  • sasmodels/model_test.py

    r00afc15 r5024a56  
    167167        # test using cuda if desired and available 
    168168        if 'cuda' in loaders and use_cuda(): 
    169             test_name = "%s-cuda" % model_info.id 
     169            test_name = "%s-cuda"%model_name 
    170170            test_method_name = "test_%s_cuda" % model_info.id 
    171171            # Using dtype=None so that the models that are only 
  • sasmodels/models/rpa.c

    r19dc29e7 r71b751d  
    2525  double S0ba,Pbb,S0bb,Pbc,S0bc,Pbd,S0bd; 
    2626  double S0ca,S0cb,Pcc,S0cc,Pcd,S0cd; 
    27   //double S0da,S0db,S0dc; 
     27  double S0da,S0db,S0dc; 
    2828  double Pdd,S0dd; 
    2929  double Kaa,Kbb,Kcc; 
    3030  double Kba,Kca,Kcb; 
    31   //double Kda,Kdb,Kdc,Kdd; 
     31  double Kda,Kdb,Kdc,Kdd; 
    3232  double Zaa,Zab,Zac,Zba,Zbb,Zbc,Zca,Zcb,Zcc; 
    3333  double DenT,T11,T12,T13,T21,T22,T23,T31,T32,T33; 
     
    3636  double N11,N12,N13,N21,N22,N23,N31,N32,N33; 
    3737  double M11,M12,M13,M21,M22,M23,M31,M32,M33; 
    38   double S11,S12,S22,S23,S13,S33; 
    39   //double S21,S31,S32,S44;  
    40   //double S14,S24,S34,S41,S42,S43; 
     38  double S11,S12,S13,S14,S21,S22,S23,S24; 
     39  double S31,S32,S33,S34,S41,S42,S43,S44; 
    4140  double Lad,Lbd,Lcd,Nav,Intg; 
    4241 
     
    116115  S0cd=(Phicd*vcd*Ncd)*Pcd; 
    117116 
    118   //S0da=S0ad; 
    119   //S0db=S0bd; 
    120   //S0dc=S0cd; 
     117  S0da=S0ad; 
     118  S0db=S0bd; 
     119  S0dc=S0cd; 
    121120  Pdd=2.0*(exp(-Xd)-1.0+Xd)/(Xd*Xd); // free D chain 
    122121  S0dd=N[3]*Phi[3]*v[3]*Pdd; 
     
    199198  S0ca=S0ac; 
    200199  S0cb=S0bc; 
    201   //S0da=S0ad; 
    202   //S0db=S0bd; 
    203   //S0dc=S0cd; 
     200  S0da=S0ad; 
     201  S0db=S0bd; 
     202  S0dc=S0cd; 
    204203 
    205204  // self chi parameter is 0 ... of course 
     
    207206  Kbb=0.0; 
    208207  Kcc=0.0; 
    209   //Kdd=0.0; 
     208  Kdd=0.0; 
    210209 
    211210  Kba=Kab; 
    212211  Kca=Kac; 
    213212  Kcb=Kbc; 
    214   //Kda=Kad; 
    215   //Kdb=Kbd; 
    216   //Kdc=Kcd; 
     213  Kda=Kad; 
     214  Kdb=Kbd; 
     215  Kdc=Kcd; 
    217216 
    218217  Zaa=Kaa-Kad-Kad; 
     
    295294  S12= Q12*S0aa + Q22*S0ab + Q32*S0ac; 
    296295  S13= Q13*S0aa + Q23*S0ab + Q33*S0ac; 
     296  S14=-S11-S12-S13; 
     297  S21= Q11*S0ba + Q21*S0bb + Q31*S0bc; 
    297298  S22= Q12*S0ba + Q22*S0bb + Q32*S0bc; 
    298299  S23= Q13*S0ba + Q23*S0bb + Q33*S0bc; 
     300  S24=-S21-S22-S23; 
     301  S31= Q11*S0ca + Q21*S0cb + Q31*S0cc; 
     302  S32= Q12*S0ca + Q22*S0cb + Q32*S0cc; 
    299303  S33= Q13*S0ca + Q23*S0cb + Q33*S0cc; 
    300   //S21= Q11*S0ba + Q21*S0bb + Q31*S0bc; 
    301   //S31= Q11*S0ca + Q21*S0cb + Q31*S0cc; 
    302   //S32= Q12*S0ca + Q22*S0cb + Q32*S0cc; 
    303   //S44=S11+S22+S33+2.0*S12+2.0*S13+2.0*S23; 
    304   //S14=-S11-S12-S13; 
    305   //S24=-S21-S22-S23; 
    306   //S34=-S31-S32-S33; 
    307   //S41=S14; 
    308   //S42=S24; 
    309   //S43=S34; 
     304  S34=-S31-S32-S33; 
     305  S41=S14; 
     306  S42=S24; 
     307  S43=S34; 
     308  S44=S11+S22+S33+2.0*S12+2.0*S13+2.0*S23; 
    310309 
    311310  //calculate contrast where L[i] is the scattering length of i and D is the matrix 
  • sasmodels/resolution.py

    rda3638f re2592f0  
    498498        if q_min < 0: 
    499499            q_min = q[0]*MINIMUM_ABSOLUTE_Q 
    500         n_low = int(np.ceil(log_delta_q * (log(q[0])-log(q_min)))) 
    501         q_low = np.logspace(log10(q_min), log10(q[0]), n_low+1)[:-1] 
     500        n_low = log_delta_q * (log(q[0])-log(q_min)) 
     501        q_low = np.logspace(log10(q_min), log10(q[0]), int(np.ceil(n_low))+1)[:-1] 
    502502    else: 
    503503        q_low = [] 
    504504    if q_max > q[-1]: 
    505         n_high = int(np.ceil(log_delta_q * (log(q_max)-log(q[-1])))) 
    506         q_high = np.logspace(log10(q[-1]), log10(q_max), n_high+1)[1:] 
     505        n_high = log_delta_q * (log(q_max)-log(q[-1])) 
     506        q_high = np.logspace(log10(q[-1]), log10(q_max), int(np.ceil(n_high))+1)[1:] 
    507507    else: 
    508508        q_high = [] 
Note: See TracChangeset for help on using the changeset viewer.