Changeset 3199b17 in sasmodels


Ignore:
Timestamp:
Mar 6, 2019 2:24:03 PM (6 years ago)
Author:
Paul Kienzle <pkienzle@…>
Branches:
master, core_shell_microgels, magnetic_model, ticket-1257-vesicle-product, ticket_1156, ticket_1265_superball, ticket_822_more_unit_tests
Children:
4453136
Parents:
00afc15
Message:

PEP 8 changes and improved consistency between OpenCL/CUDA/DLL code

Location:
sasmodels
Files:
5 edited

Legend:

Unmodified
Added
Removed
  • sasmodels/kernel.py

    re44432d r3199b17  
    2323# pylint: enable=unused-import 
    2424 
     25 
    2526class KernelModel(object): 
    2627    info = None  # type: ModelInfo 
     
    3334        # type: () -> None 
    3435        pass 
     36 
    3537 
    3638class Kernel(object): 
  • sasmodels/kernelcl.py

    r00afc15 r3199b17  
    6161 
    6262 
    63 # Attempt to setup opencl. This may fail if the pyopencl package is not 
     63# Attempt to setup OpenCL. This may fail if the pyopencl package is not 
    6464# installed or if it is installed but there are no devices available. 
    6565try: 
     
    6767    from pyopencl import mem_flags as mf 
    6868    from pyopencl.characterize import get_fast_inaccurate_build_options 
    69     # Ask OpenCL for the default context so that we know that one exists 
     69    # Ask OpenCL for the default context so that we know that one exists. 
    7070    cl.create_some_context(interactive=False) 
    7171    HAVE_OPENCL = True 
     
    8888# pylint: enable=unused-import 
    8989 
    90 # CRUFT: pyopencl < 2017.1  (as of June 2016 needs quotes around include path) 
     90 
     91# CRUFT: pyopencl < 2017.1 (as of June 2016 needs quotes around include path). 
    9192def quote_path(v): 
    9293    """ 
     
    99100    return '"'+v+'"' if v and ' ' in v and not v[0] in "\"'-" else v 
    100101 
     102 
    101103def fix_pyopencl_include(): 
    102104    """ 
     
    105107    import pyopencl as cl 
    106108    if hasattr(cl, '_DEFAULT_INCLUDE_OPTIONS'): 
    107         cl._DEFAULT_INCLUDE_OPTIONS = [quote_path(v) for v in cl._DEFAULT_INCLUDE_OPTIONS] 
     109        cl._DEFAULT_INCLUDE_OPTIONS = [ 
     110            quote_path(v) for v in cl._DEFAULT_INCLUDE_OPTIONS 
     111            ] 
     112 
    108113 
    109114if HAVE_OPENCL: 
     
    118123MAX_LOOPS = 2048 
    119124 
    120  
    121125# Pragmas for enable OpenCL features.  Be sure to protect them so that they 
    122126# still compile even if OpenCL is not present. 
     
    133137""" 
    134138 
     139 
    135140def use_opencl(): 
    136141    sas_opencl = os.environ.get("SAS_OPENCL", "OpenCL").lower() 
    137142    return HAVE_OPENCL and sas_opencl != "none" and not sas_opencl.startswith("cuda") 
    138143 
     144 
    139145ENV = None 
    140146def reset_environment(): 
     
    144150    global ENV 
    145151    ENV = GpuEnvironment() if use_opencl() else None 
     152 
    146153 
    147154def environment(): 
     
    161168    return ENV 
    162169 
     170 
    163171def has_type(device, dtype): 
    164172    # type: (cl.Device, np.dtype) -> bool 
     
    171179        return "cl_khr_fp64" in device.extensions 
    172180    else: 
    173         # Not supporting F16 type since it isn't accurate enough 
     181        # Not supporting F16 type since it isn't accurate enough. 
    174182        return False 
     183 
    175184 
    176185def get_warp(kernel, queue): 
     
    182191        cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE, 
    183192        queue.device) 
     193 
    184194 
    185195def compile_model(context, source, dtype, fast=False): 
     
    203213        source_list.insert(0, _F64_PRAGMA) 
    204214 
    205     # Note: USE_SINCOS makes the intel cpu slower under opencl 
     215    # Note: USE_SINCOS makes the Intel CPU slower under OpenCL. 
    206216    if context.devices[0].type == cl.device_type.GPU: 
    207217        source_list.insert(0, "#define USE_SINCOS\n") 
     
    210220    source = "\n".join(source_list) 
    211221    program = cl.Program(context, source).build(options=options) 
     222 
    212223    #print("done with "+program) 
    213224    return program 
    214225 
    215226 
    216 # for now, this returns one device in the context 
    217 # TODO: create a context that contains all devices on all platforms 
     227# For now, this returns one device in the context. 
     228# TODO: Create a context that contains all devices on all platforms. 
    218229class GpuEnvironment(object): 
    219230    """ 
    220     GPU context, with possibly many devices, and one queue per device. 
    221  
    222     Because the environment can be reset during a live program (e.g., if the 
    223     user changes the active GPU device in the GUI), everything associated 
    224     with the device context must be cached in the environment and recreated 
    225     if the environment changes.  The *cache* attribute is a simple dictionary 
    226     which holds keys and references to objects, such as compiled kernels and 
    227     allocated buffers.  The running program should check in the cache for 
    228     long lived objects and create them if they are not there.  The program 
    229     should not hold onto cached objects, but instead only keep them active 
    230     for the duration of a function call.  When the environment is destroyed 
    231     then the *release* method for each active cache item is called before 
    232     the environment is freed.  This means that each cl buffer should be 
    233     in its own cache entry. 
     231    GPU context for OpenCL, with possibly many devices and one queue per device. 
    234232    """ 
    235233    def __init__(self): 
    236234        # type: () -> None 
    237         # find gpu context 
     235        # Find gpu context. 
    238236        context_list = _create_some_context() 
    239237 
     
    249247                self.context[dtype] = None 
    250248 
    251         # Build a queue for each context 
     249        # Build a queue for each context. 
    252250        self.queue = {} 
    253251        context = self.context[F32] 
     
    259257            self.queue[F64] = cl.CommandQueue(context, context.devices[0]) 
    260258 
    261         # Byte boundary for data alignment 
     259        ## Byte boundary for data alignment. 
    262260        #self.data_boundary = max(context.devices[0].min_data_type_align_size 
    263261        #                         for context in self.context.values()) 
    264262 
    265         # Cache for compiled programs, and for items in context 
     263        # Cache for compiled programs, and for items in context. 
    266264        self.compiled = {} 
    267265 
     
    279277        """ 
    280278        # Note: PyOpenCL caches based on md5 hash of source, options and device 
    281         # so we don't really need to cache things for ourselves.  I'll do so 
    282         # anyway just to save some data munging time. 
     279        # but I'll do so as well just to save some data munging time. 
    283280        tag = generate.tag_source(source) 
    284281        key = "%s-%s-%s%s"%(name, dtype, tag, ("-fast" if fast else "")) 
    285         # Check timestamp on program 
     282        # Check timestamp on program. 
    286283        program, program_timestamp = self.compiled.get(key, (None, np.inf)) 
    287284        if program_timestamp < timestamp: 
     
    296293        return program 
    297294 
     295 
    298296def _create_some_context(): 
    299297    # type: () -> cl.Context 
     
    307305    which one (and not a CUDA device, or no GPU). 
    308306    """ 
    309     # Assume we do not get here if SAS_OPENCL is None or CUDA 
     307    # Assume we do not get here if SAS_OPENCL is None or CUDA. 
    310308    sas_opencl = os.environ.get('SAS_OPENCL', 'opencl') 
    311309    if sas_opencl.lower() != 'opencl': 
    312         # Setting PYOPENCL_CTX as a SAS_OPENCL to create cl context 
     310        # Setting PYOPENCL_CTX as a SAS_OPENCL to create cl context. 
    313311        os.environ["PYOPENCL_CTX"] = sas_opencl 
    314312 
     
    318316        except Exception as exc: 
    319317            warnings.warn(str(exc)) 
    320             warnings.warn("pyopencl.create_some_context() failed") 
    321             warnings.warn("the environment variable 'SAS_OPENCL' or 'PYOPENCL_CTX' might not be set correctly") 
     318            warnings.warn("pyopencl.create_some_context() failed.  The " 
     319                "environment variable 'SAS_OPENCL' or 'PYOPENCL_CTX' might " 
     320                "not be set correctly") 
    322321 
    323322    return _get_default_context() 
     323 
    324324 
    325325def _get_default_context(): 
     
    334334    # is running may increase throughput. 
    335335    # 
    336     # Macbook pro, base install: 
     336    # MacBook Pro, base install: 
    337337    #     {'Apple': [Intel CPU, NVIDIA GPU]} 
    338     # Macbook pro, base install: 
     338    # MacBook Pro, base install: 
    339339    #     {'Apple': [Intel CPU, Intel GPU]} 
    340     # 2 x nvidia 295 with Intel and NVIDIA opencl drivers installed 
     340    # 2 x NVIDIA 295 with Intel and NVIDIA opencl drivers install: 
    341341    #     {'Intel': [CPU], 'NVIDIA': [GPU, GPU, GPU, GPU]} 
    342342    gpu, cpu = None, None 
     
    361361            else: 
    362362                # System has cl.device_type.ACCELERATOR or cl.device_type.CUSTOM 
    363                 # Intel Phi for example registers as an accelerator 
     363                # Intel Phi for example registers as an accelerator. 
    364364                # Since the user installed a custom device on their system 
    365365                # and went through the pain of sorting out OpenCL drivers for 
     
    368368                gpu = device 
    369369 
    370     # order the devices by gpu then by cpu; when searching for an available 
     370    # Order the devices by gpu then by cpu; when searching for an available 
    371371    # device by data type they will be checked in this order, which means 
    372372    # that if the gpu supports double then the cpu will never be used (though 
     
    395395    that the compiler is allowed to take shortcuts. 
    396396    """ 
    397     info = None # type: ModelInfo 
    398     source = "" # type: str 
    399     dtype = None # type: np.dtype 
    400     fast = False # type: bool 
    401     _program = None # type: cl.Program 
    402     _kernels = None # type: Dict[str, cl.Kernel] 
     397    info = None  # type: ModelInfo 
     398    source = ""  # type: str 
     399    dtype = None  # type: np.dtype 
     400    fast = False  # type: bool 
     401    _program = None  # type: cl.Program 
     402    _kernels = None  # type: Dict[str, cl.Kernel] 
    403403 
    404404    def __init__(self, source, model_info, dtype=generate.F32, fast=False): 
     
    446446        functions = [getattr(program, k) for k in names] 
    447447        self._kernels = {k: v for k, v in zip(variants, functions)} 
    448         # keep a handle to program so GC doesn't collect 
     448        # Keep a handle to program so GC doesn't collect. 
    449449        self._program = program 
    450450 
    451 # TODO: check that we don't need a destructor for buffers which go out of scope 
     451 
     452# TODO: Check that we don't need a destructor for buffers which go out of scope. 
    452453class GpuInput(object): 
    453454    """ 
     
    471472    def __init__(self, q_vectors, dtype=generate.F32): 
    472473        # type: (List[np.ndarray], np.dtype) -> None 
    473         # TODO: do we ever need double precision q? 
     474        # TODO: Do we ever need double precision q? 
    474475        self.nq = q_vectors[0].size 
    475476        self.dtype = np.dtype(dtype) 
    476477        self.is_2d = (len(q_vectors) == 2) 
    477         # TODO: stretch input based on get_warp() 
    478         # not doing it now since warp depends on kernel, which is not known 
     478        # TODO: Stretch input based on get_warp(). 
     479        # Not doing it now since warp depends on kernel, which is not known 
    479480        # at this point, so instead using 32, which is good on the set of 
    480481        # architectures tested so far. 
     
    491492        #print("creating inputs of size", self.global_size) 
    492493 
    493         # transfer input value to gpu 
     494        # Transfer input value to GPU. 
    494495        env = environment() 
    495496        context = env.context[self.dtype] 
     
    500501        # type: () -> None 
    501502        """ 
    502         Free the buffer associated with the q value 
     503        Free the buffer associated with the q value. 
    503504        """ 
    504505        if self.q_b is not None: 
     
    509510        # type: () -> None 
    510511        self.release() 
     512 
    511513 
    512514class GpuKernel(Kernel): 
     
    524526    Call :meth:`release` when done with the kernel instance. 
    525527    """ 
    526     #: SAS model information structure 
    527     info = None # type: ModelInfo 
    528     #: kernel precision 
    529     dtype = None # type: np.dtype 
    530     #: kernel dimensions (1d or 2d) 
    531     dim = "" # type: str 
    532     #: calculation results, updated after each call to :meth:`_call_kernel` 
    533     result = None # type: np.ndarray 
     528    #: SAS model information structure. 
     529    info = None  # type: ModelInfo 
     530    #: Kernel precision. 
     531    dtype = None  # type: np.dtype 
     532    #: Kernel dimensions (1d or 2d). 
     533    dim = ""  # type: str 
     534    #: Calculation results, updated after each call to :meth:`_call_kernel`. 
     535    result = None  # type: np.ndarray 
    534536 
    535537    def __init__(self, model, q_vectors): 
     
    538540        self.q_input = GpuInput(q_vectors, dtype) 
    539541        self._model = model 
    540         # F16 isn't sufficient, so don't support it 
    541         self._as_dtype = np.float64 if dtype == generate.F64 else np.float32 
    542  
    543         # attributes accessed from the outside 
     542 
     543        # Attributes accessed from the outside. 
    544544        self.dim = '2d' if self.q_input.is_2d else '1d' 
    545545        self.info = model.info 
    546         self.dtype = model.dtype 
    547  
    548         # holding place for the returned value 
     546        self.dtype = dtype 
     547 
     548        # Converter to translate input to target type. 
     549        self._as_dtype = np.float64 if dtype == generate.F64 else np.float32 
     550 
     551        # Holding place for the returned value. 
    549552        nout = 2 if self.info.have_Fq and self.dim == '1d' else 1 
    550         extra_q = 4  # total weight, form volume, shell volume and R_eff 
    551         self.result = np.empty(self.q_input.nq*nout+extra_q, dtype) 
    552  
    553         # allocate result value on gpu 
     553        extra_q = 4  # Total weight, form volume, shell volume and R_eff. 
     554        self.result = np.empty(self.q_input.nq*nout + extra_q, dtype) 
     555 
     556        # Allocate result value on GPU. 
    554557        env = environment() 
    555558        context = env.context[self.dtype] 
     
    557560        self._result_b = cl.Buffer(context, mf.READ_WRITE, width) 
    558561 
    559     def _call_kernel(self, call_details, values, cutoff, magnetic, effective_radius_type): 
    560         # type: (CallDetails, np.ndarray, np.ndarray, float, bool) -> np.ndarray 
     562    def _call_kernel(self, call_details, values, cutoff, magnetic, 
     563                     effective_radius_type): 
     564        # type: (CallDetails, np.ndarray, float, bool, int) -> np.ndarray 
    561565        env = environment() 
    562566        queue = env.queue[self._model.dtype] 
    563567        context = queue.context 
    564568 
    565         # Arrange data transfer to/from card 
     569        # Arrange data transfer to card. 
    566570        details_b = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, 
    567571                              hostbuf=call_details.buffer) 
     
    569573                             hostbuf=values) 
    570574 
     575        # Setup kernel function and arguments. 
    571576        name = 'Iq' if self.dim == '1d' else 'Imagnetic' if magnetic else 'Iqxy' 
    572577        kernel = self._model.get_function(name) 
    573578        kernel_args = [ 
    574             np.uint32(self.q_input.nq), None, None, 
    575             details_b, values_b, self.q_input.q_b, self._result_b, 
    576             self._as_dtype(cutoff), 
    577             np.uint32(effective_radius_type), 
     579            np.uint32(self.q_input.nq),  # Number of inputs. 
     580            None,  # Placeholder for pd_start. 
     581            None,  # Placeholder for pd_stop. 
     582            details_b,  # Problem definition. 
     583            values_b,  # Parameter values. 
     584            self.q_input.q_b,  # Q values. 
     585            self._result_b,   # Result storage. 
     586            self._as_dtype(cutoff),  # Probability cutoff. 
     587            np.uint32(effective_radius_type),  # R_eff mode. 
    578588        ] 
     589 
     590        # Call kernel and retrieve results. 
    579591        #print("Calling OpenCL") 
    580592        #call_details.show(values) 
    581         #Call kernel and retrieve results 
    582593        wait_for = None 
    583594        last_nap = time.clock() 
     
    590601                               *kernel_args, wait_for=wait_for)] 
    591602            if stop < call_details.num_eval: 
    592                 # Allow other processes to run 
     603                # Allow other processes to run. 
    593604                wait_for[0].wait() 
    594605                current_time = time.clock() 
     
    599610        #print("result", self.result) 
    600611 
    601         # Free buffers 
     612        # Free buffers. 
    602613        details_b.release() 
    603614        values_b.release() 
  • sasmodels/kernelcuda.py

    r00afc15 r3199b17  
    6868 
    6969 
    70 # Attempt to setup cuda. This may fail if the pycuda package is not 
     70# Attempt to setup CUDA. This may fail if the pycuda package is not 
    7171# installed or if it is installed but there are no devices available. 
    7272try: 
     
    108108MAX_LOOPS = 2048 
    109109 
     110 
    110111def use_cuda(): 
    111     env = os.environ.get("SAS_OPENCL", "").lower() 
    112     return HAVE_CUDA and (env == "" or env.startswith("cuda")) 
     112    sas_opencl = os.environ.get("SAS_OPENCL", "CUDA").lower() 
     113    return HAVE_CUDA and sas_opencl.startswith("cuda") 
     114 
    113115 
    114116ENV = None 
     
    122124        ENV.release() 
    123125    ENV = GpuEnvironment() if use_cuda() else None 
     126 
    124127 
    125128def environment(): 
     
    139142    return ENV 
    140143 
    141 def free_context(): 
    142     global ENV 
    143     if ENV is not None: 
    144         ENV.release() 
    145         ENV = None 
    146  
    147 atexit.register(free_context) 
    148144 
    149145def has_type(dtype): 
     
    152148    Return true if device supports the requested precision. 
    153149    """ 
    154     # Assume the nvidia card supports 32-bit and 64-bit floats. 
    155     # TODO: check if pycuda support F16 
     150    # Assume the NVIDIA card supports 32-bit and 64-bit floats. 
     151    # TODO: Check if pycuda support F16. 
    156152    return dtype in (generate.F32, generate.F64) 
    157153 
    158154 
    159155FUNCTION_PATTERN = re.compile(r"""^ 
    160   (?P<space>\s*)                   # initial space 
    161   (?P<qualifiers>^(?:\s*\b\w+\b\s*)+) # one or more qualifiers before function 
    162   (?P<function>\s*\b\w+\b\s*[(])      # function name plus open parens 
     156  (?P<space>\s*)                       # Initial space. 
     157  (?P<qualifiers>^(?:\s*\b\w+\b\s*)+)  # One or more qualifiers before function. 
     158  (?P<function>\s*\b\w+\b\s*[(])       # Function name plus open parens. 
    163159  """, re.VERBOSE|re.MULTILINE) 
    164160 
     
    167163  """, re.VERBOSE|re.MULTILINE) 
    168164 
     165 
    169166def _add_device_tag(match): 
    170167    # type: (None) -> str 
    171     # Note: should be re.Match, but that isn't a simple type 
     168    # Note: Should be re.Match, but that isn't a simple type. 
    172169    """ 
    173170    replace qualifiers with __device__ qualifiers if needed 
     
    182179        return "".join((space, "__device__ ", qualifiers, function)) 
    183180 
     181 
    184182def mark_device_functions(source): 
    185183    # type: (str) -> str 
     
    188186    """ 
    189187    return FUNCTION_PATTERN.sub(_add_device_tag, source) 
     188 
    190189 
    191190def show_device_functions(source): 
     
    197196        print(match.group('qualifiers').replace('\n',r'\n'), match.group('function'), '(') 
    198197    return source 
     198 
    199199 
    200200def compile_model(source, dtype, fast=False): 
     
    221221    #options = ['--verbose', '-E'] 
    222222    options = ['--use_fast_math'] if fast else None 
    223     program = SourceModule(source, no_extern_c=True, options=options) # include_dirs=[...] 
     223    program = SourceModule(source, no_extern_c=True, options=options) #, include_dirs=[...]) 
    224224 
    225225    #print("done with "+program) 
     
    227227 
    228228 
    229 # for now, this returns one device in the context 
    230 # TODO: create a context that contains all devices on all platforms 
     229# For now, this returns one device in the context. 
     230# TODO: Create a context that contains all devices on all platforms. 
    231231class GpuEnvironment(object): 
    232232    """ 
    233     GPU context, with possibly many devices, and one queue per device. 
     233    GPU context for CUDA. 
    234234    """ 
    235235    context = None # type: cuda.Context 
    236236    def __init__(self, devnum=None): 
    237237        # type: (int) -> None 
    238         # Byte boundary for data alignment 
    239         #self.data_boundary = max(d.min_data_type_align_size 
    240         #                         for d in self.context.devices) 
    241         self.compiled = {} 
    242238        env = os.environ.get("SAS_OPENCL", "").lower() 
    243239        if devnum is None and env.startswith("cuda:"): 
    244240            devnum = int(env[5:]) 
     241 
    245242        # Set the global context to the particular device number if one is 
    246243        # given, otherwise use the default context.  Perhaps this will be set 
     
    251248            self.context = make_default_context() 
    252249 
     250        ## Byte boundary for data alignment. 
     251        #self.data_boundary = max(d.min_data_type_align_size 
     252        #                         for d in self.context.devices) 
     253 
     254        # Cache for compiled programs, and for items in context. 
     255        self.compiled = {} 
     256 
    253257    def release(self): 
    254258        if self.context is not None: 
     
    271275        Compile the program for the device in the given context. 
    272276        """ 
    273         # Note: PyOpenCL caches based on md5 hash of source, options and device 
    274         # so we don't really need to cache things for ourselves.  I'll do so 
    275         # anyway just to save some data munging time. 
     277        # Note: PyCuda (probably) caches but I'll do so as well just to 
     278        # save some data munging time. 
    276279        tag = generate.tag_source(source) 
    277280        key = "%s-%s-%s%s"%(name, dtype, tag, ("-fast" if fast else "")) 
    278         # Check timestamp on program 
     281        # Check timestamp on program. 
    279282        program, program_timestamp = self.compiled.get(key, (None, np.inf)) 
    280283        if program_timestamp < timestamp: 
     
    286289        return program 
    287290 
     291 
    288292class GpuModel(KernelModel): 
    289293    """ 
     
    301305    that the compiler is allowed to take shortcuts. 
    302306    """ 
    303     info = None # type: ModelInfo 
    304     source = "" # type: str 
    305     dtype = None # type: np.dtype 
    306     fast = False # type: bool 
    307     _program = None # type: SourceModule 
    308     _kernels = None # type: Dict[str, cuda.Function] 
     307    info = None  # type: ModelInfo 
     308    source = ""  # type: str 
     309    dtype = None  # type: np.dtype 
     310    fast = False  # type: bool 
     311    _program = None  # type: SourceModule 
     312    _kernels = None  # type: Dict[str, cuda.Function] 
    309313 
    310314    def __init__(self, source, model_info, dtype=generate.F32, fast=False): 
     
    352356        functions = [program.get_function(k) for k in names] 
    353357        self._kernels = {k: v for k, v in zip(variants, functions)} 
    354         # keep a handle to program so GC doesn't collect 
     358        # Keep a handle to program so GC doesn't collect. 
    355359        self._program = program 
    356360 
    357 # TODO: check that we don't need a destructor for buffers which go out of scope 
     361 
     362# TODO: Check that we don't need a destructor for buffers which go out of scope. 
    358363class GpuInput(object): 
    359364    """ 
     
    377382    def __init__(self, q_vectors, dtype=generate.F32): 
    378383        # type: (List[np.ndarray], np.dtype) -> None 
    379         # TODO: do we ever need double precision q? 
     384        # TODO: Do we ever need double precision q? 
    380385        self.nq = q_vectors[0].size 
    381386        self.dtype = np.dtype(dtype) 
    382387        self.is_2d = (len(q_vectors) == 2) 
    383         # TODO: stretch input based on get_warp() 
    384         # not doing it now since warp depends on kernel, which is not known 
     388        # TODO: Stretch input based on get_warp(). 
     389        # Not doing it now since warp depends on kernel, which is not known 
    385390        # at this point, so instead using 32, which is good on the set of 
    386391        # architectures tested so far. 
    387392        if self.is_2d: 
    388             # Note: 16 rather than 15 because result is 1 longer than input. 
    389             width = ((self.nq+16)//16)*16 
     393            width = ((self.nq+15)//16)*16 
    390394            self.q = np.empty((width, 2), dtype=dtype) 
    391395            self.q[:self.nq, 0] = q_vectors[0] 
    392396            self.q[:self.nq, 1] = q_vectors[1] 
    393397        else: 
    394             # Note: 32 rather than 31 because result is 1 longer than input. 
    395             width = ((self.nq+32)//32)*32 
     398            width = ((self.nq+31)//32)*32 
    396399            self.q = np.empty(width, dtype=dtype) 
    397400            self.q[:self.nq] = q_vectors[0] 
     
    399402        #print("creating inputs of size", self.global_size) 
    400403 
    401         # transfer input value to gpu 
     404        # Transfer input value to GPU. 
    402405        self.q_b = cuda.to_device(self.q) 
    403406 
     
    405408        # type: () -> None 
    406409        """ 
    407         Free the memory. 
     410        Free the buffer associated with the q value. 
    408411        """ 
    409412        if self.q_b is not None: 
     
    414417        # type: () -> None 
    415418        self.release() 
     419 
    416420 
    417421class GpuKernel(Kernel): 
     
    429433    Call :meth:`release` when done with the kernel instance. 
    430434    """ 
    431     #: SAS model information structure 
    432     info = None # type: ModelInfo 
    433     #: kernel precision 
    434     dtype = None # type: np.dtype 
    435     #: kernel dimensions (1d or 2d) 
    436     dim = "" # type: str 
    437     #: calculation results, updated after each call to :meth:`_call_kernel` 
    438     result = None # type: np.ndarray 
     435    #: SAS model information structure. 
     436    info = None  # type: ModelInfo 
     437    #: Kernel precision. 
     438    dtype = None  # type: np.dtype 
     439    #: Kernel dimensions (1d or 2d). 
     440    dim = ""  # type: str 
     441    #: Calculation results, updated after each call to :meth:`_call_kernel`. 
     442    result = None  # type: np.ndarray 
    439443 
    440444    def __init__(self, model, q_vectors): 
     
    443447        self.q_input = GpuInput(q_vectors, dtype) 
    444448        self._model = model 
    445         # F16 isn't sufficient, so don't support it 
    446         self._as_dtype = np.float64 if dtype == generate.F64 else np.float32 
    447  
    448         # attributes accessed from the outside 
     449 
     450        # Attributes accessed from the outside. 
    449451        self.dim = '2d' if self.q_input.is_2d else '1d' 
    450452        self.info = model.info 
    451         self.dtype = model.dtype 
    452  
    453         # holding place for the returned value 
     453        self.dtype = dtype 
     454 
     455        # Converter to translate input to target type. 
     456        self._as_dtype = np.float64 if dtype == generate.F64 else np.float32 
     457 
     458        # Holding place for the returned value. 
    454459        nout = 2 if self.info.have_Fq and self.dim == '1d' else 1 
    455         extra_q = 4  # total weight, form volume, shell volume and R_eff 
    456         self.result = np.empty(self.q_input.nq*nout+extra_q, dtype) 
    457  
    458         # allocate result value on gpu 
     460        extra_q = 4  # Total weight, form volume, shell volume and R_eff. 
     461        self.result = np.empty(self.q_input.nq*nout + extra_q, dtype) 
     462 
     463        # Allocate result value on GPU. 
    459464        width = ((self.result.size+31)//32)*32 * self.dtype.itemsize 
    460465        self._result_b = cuda.mem_alloc(width) 
    461466 
    462     def _call_kernel(self, call_details, values, cutoff, magnetic, effective_radius_type): 
    463         # type: (CallDetails, np.ndarray, np.ndarray, float, bool) -> np.ndarray 
    464         # Arrange data transfer to card 
     467    def _call_kernel(self, call_details, values, cutoff, magnetic, 
     468                     effective_radius_type): 
     469        # type: (CallDetails, np.ndarray, float, bool, int) -> np.ndarray 
     470 
     471        # Arrange data transfer to card. 
    465472        details_b = cuda.to_device(call_details.buffer) 
    466473        values_b = cuda.to_device(values) 
    467474 
     475        # Setup kernel function and arguments. 
    468476        name = 'Iq' if self.dim == '1d' else 'Imagnetic' if magnetic else 'Iqxy' 
    469477        kernel = self._model.get_function(name) 
    470478        kernel_args = [ 
    471             np.uint32(self.q_input.nq), None, None, 
    472             details_b, values_b, self.q_input.q_b, self._result_b, 
    473             self._as_dtype(cutoff), 
    474             np.uint32(effective_radius_type), 
     479            np.uint32(self.q_input.nq),  # Number of inputs. 
     480            None,  # Placeholder for pd_start. 
     481            None,  # Placeholder for pd_stop. 
     482            details_b,  # Problem definition. 
     483            values_b,  # Parameter values. 
     484            self.q_input.q_b,  # Q values. 
     485            self._result_b,   # Result storage. 
     486            self._as_dtype(cutoff),  # Probability cutoff. 
     487            np.uint32(effective_radius_type),  # R_eff mode. 
    475488        ] 
    476489        grid = partition(self.q_input.nq) 
    477         #print("Calling OpenCL") 
     490 
     491        # Call kernel and retrieve results. 
     492        #print("Calling CUDA") 
    478493        #call_details.show(values) 
    479         # Call kernel and retrieve results 
    480494        last_nap = time.clock() 
    481495        step = 100000000//self.q_input.nq + 1 
     
    488502            if stop < call_details.num_eval: 
    489503                sync() 
    490                 # Allow other processes to run 
     504                # Allow other processes to run. 
    491505                current_time = time.clock() 
    492506                if current_time - last_nap > 0.5: 
     
    522536    Note: Maybe context.synchronize() is sufficient. 
    523537    """ 
    524     #return # The following works in C++; don't know what pycuda is doing 
    525     # Create an event with which to synchronize 
     538    # Create an event with which to synchronize. 
    526539    done = cuda.Event() 
    527540 
     
    529542    done.record() 
    530543 
    531     #line added to not hog resources 
     544    # Make sure we don't hog resource while waiting to sync. 
    532545    while not done.query(): 
    533546        time.sleep(0.01) 
     
    535548    # Block until the GPU executes the kernel. 
    536549    done.synchronize() 
     550 
    537551    # Clean up the event; I don't think they can be reused. 
    538552    del done 
  • sasmodels/kerneldll.py

    re44432d r3199b17  
    100100# pylint: enable=unused-import 
    101101 
    102 # Compiler output is a byte stream that needs to be decode in python 3 
     102# Compiler output is a byte stream that needs to be decode in python 3. 
    103103decode = (lambda s: s) if sys.version_info[0] < 3 else (lambda s: s.decode('utf8')) 
    104104 
     
    115115        COMPILER = "tinycc" 
    116116    elif "VCINSTALLDIR" in os.environ: 
    117         # If vcvarsall.bat has been called, then VCINSTALLDIR is in the environment 
    118         # and we can use the MSVC compiler.  Otherwise, if tinycc is available 
    119         # the use it.  Otherwise, hope that mingw is available. 
     117        # If vcvarsall.bat has been called, then VCINSTALLDIR is in the 
     118        # environment and we can use the MSVC compiler.  Otherwise, if 
     119        # tinycc is available then use it.  Otherwise, hope that mingw 
     120        # is available. 
    120121        COMPILER = "msvc" 
    121122    else: 
     
    124125    COMPILER = "unix" 
    125126 
    126 ARCH = "" if ct.sizeof(ct.c_void_p) > 4 else "x86"  # 4 byte pointers on x86 
     127ARCH = "" if ct.sizeof(ct.c_void_p) > 4 else "x86"  # 4 byte pointers on x86. 
    127128if COMPILER == "unix": 
    128     # Generic unix compile 
    129     # On mac users will need the X code command line tools installed 
     129    # Generic unix compile. 
     130    # On Mac users will need the X code command line tools installed. 
    130131    #COMPILE = "gcc-mp-4.7 -shared -fPIC -std=c99 -fopenmp -O2 -Wall %s -o %s -lm -lgomp" 
    131132    CC = "cc -shared -fPIC -std=c99 -O2 -Wall".split() 
    132     # add openmp support if not running on a mac 
     133    # Add OpenMP support if not running on a Mac. 
    133134    if sys.platform != "darwin": 
    134         # OpenMP seems to be broken on gcc 5.4.0 (ubuntu 16.04.9) 
     135        # OpenMP seems to be broken on gcc 5.4.0 (ubuntu 16.04.9). 
    135136        # Shut it off for all unix until we can investigate. 
    136137        #CC.append("-fopenmp") 
     
    144145    # vcomp90.dll on the path.  One may be found here: 
    145146    #       C:/Windows/winsxs/x86_microsoft.vc90.openmp*/vcomp90.dll 
    146     # Copy this to the python directory and uncomment the OpenMP COMPILE 
    147     # TODO: remove intermediate OBJ file created in the directory 
    148     # TODO: maybe don't use randomized name for the c file 
    149     # TODO: maybe ask distutils to find MSVC 
     147    # Copy this to the python directory and uncomment the OpenMP COMPILE. 
     148    # TODO: Remove intermediate OBJ file created in the directory. 
     149    # TODO: Maybe don't use randomized name for the c file. 
     150    # TODO: Maybe ask distutils to find MSVC. 
    150151    CC = "cl /nologo /Ox /MD /W3 /GS- /DNDEBUG".split() 
    151152    if "SAS_OPENMP" in os.environ: 
     
    172173ALLOW_SINGLE_PRECISION_DLLS = True 
    173174 
     175 
    174176def compile(source, output): 
    175177    # type: (str, str) -> None 
     
    183185    logging.info(command_str) 
    184186    try: 
    185         # need shell=True on windows to keep console box from popping up 
     187        # Need shell=True on windows to keep console box from popping up. 
    186188        shell = (os.name == 'nt') 
    187189        subprocess.check_output(command, shell=shell, stderr=subprocess.STDOUT) 
     
    192194        raise RuntimeError("compile failed.  File is in %r"%source) 
    193195 
     196 
    194197def dll_name(model_info, dtype): 
    195198    # type: (ModelInfo, np.dtype) ->  str 
     
    202205    basename += ARCH + ".so" 
    203206 
    204     # Hack to find precompiled dlls 
     207    # Hack to find precompiled dlls. 
    205208    path = joinpath(generate.DATA_PATH, '..', 'compiled_models', basename) 
    206209    if os.path.exists(path): 
     
    242245        raise ValueError("16 bit floats not supported") 
    243246    if dtype == F32 and not ALLOW_SINGLE_PRECISION_DLLS: 
    244         dtype = F64  # Force 64-bit dll 
    245     # Note: dtype may be F128 for long double precision 
     247        dtype = F64  # Force 64-bit dll. 
     248    # Note: dtype may be F128 for long double precision. 
    246249 
    247250    dll = dll_path(model_info, dtype) 
     
    254257        need_recompile = dll_time < newest_source 
    255258    if need_recompile: 
    256         # Make sure the DLL path exists 
     259        # Make sure the DLL path exists. 
    257260        if not os.path.exists(SAS_DLL_PATH): 
    258261            os.makedirs(SAS_DLL_PATH) 
     
    263266            file_handle.write(source) 
    264267        compile(source=filename, output=dll) 
    265         # comment the following to keep the generated c file 
    266         # Note: if there is a syntax error then compile raises an error 
     268        # Comment the following to keep the generated C file. 
     269        # Note: If there is a syntax error then compile raises an error 
    267270        # and the source file will not be deleted. 
    268271        os.unlink(filename) 
     
    303306        self.dllpath = dllpath 
    304307        self._dll = None  # type: ct.CDLL 
    305         self._kernels = None # type: List[Callable, Callable] 
     308        self._kernels = None  # type: List[Callable, Callable] 
    306309        self.dtype = np.dtype(dtype) 
    307310 
     
    338341        # type: (List[np.ndarray]) -> DllKernel 
    339342        q_input = PyInput(q_vectors, self.dtype) 
    340         # Note: pickle not supported for DllKernel 
     343        # Note: DLL is lazy loaded. 
    341344        if self._dll is None: 
    342345            self._load_dll() 
     
    358361        self._dll = None 
    359362 
     363 
    360364class DllKernel(Kernel): 
    361365    """ 
     
    379383    def __init__(self, kernel, model_info, q_input): 
    380384        # type: (Callable[[], np.ndarray], ModelInfo, PyInput) -> None 
    381         #,model_info,q_input) 
     385        dtype = q_input.dtype 
     386        self.q_input = q_input 
    382387        self.kernel = kernel 
     388 
     389        # Attributes accessed from the outside. 
     390        self.dim = '2d' if q_input.is_2d else '1d' 
    383391        self.info = model_info 
    384         self.q_input = q_input 
    385         self.dtype = q_input.dtype 
    386         self.dim = '2d' if q_input.is_2d else '1d' 
    387         # leave room for f1/f2 results in case we need to compute beta for 1d models 
     392        self.dtype = dtype 
     393 
     394        # Converter to translate input to target type. 
     395        self._as_dtype = (np.float32 if dtype == generate.F32 
     396                          else np.float64 if dtype == generate.F64 
     397                          else np.float128) 
     398 
     399        # Holding place for the returned value. 
    388400        nout = 2 if self.info.have_Fq else 1 
    389         # +4 for total weight, shell volume, effective radius, form volume 
    390         self.result = np.empty(q_input.nq*nout + 4, self.dtype) 
    391         self.real = (np.float32 if self.q_input.dtype == generate.F32 
    392                      else np.float64 if self.q_input.dtype == generate.F64 
    393                      else np.float128) 
    394  
    395     def _call_kernel(self, call_details, values, cutoff, magnetic, effective_radius_type): 
    396         # type: (CallDetails, np.ndarray, np.ndarray, float, bool, int) -> np.ndarray 
     401        extra_q = 4  # Total weight, form volume, shell volume and R_eff. 
     402        self.result = np.empty(self.q_input.nq*nout + extra_q, dtype) 
     403 
     404    def _call_kernel(self, call_details, values, cutoff, magnetic, 
     405                     effective_radius_type): 
     406        # type: (CallDetails, np.ndarray, float, bool, int) -> np.ndarray 
     407 
     408        # Setup kernel function and arguments. 
    397409        kernel = self.kernel[1 if magnetic else 0] 
    398         args = [ 
    399             self.q_input.nq, # nq 
    400             None, # pd_start 
    401             None, # pd_stop pd_stride[MAX_PD] 
    402             call_details.buffer.ctypes.data, # problem 
    403             values.ctypes.data,  # pars 
    404             self.q_input.q.ctypes.data, # q 
    405             self.result.ctypes.data,   # results 
    406             self.real(cutoff), # cutoff 
    407             effective_radius_type, # cutoff 
     410        kernel_args = [ 
     411            self.q_input.nq,  # Number of inputs. 
     412            None,  # Placeholder for pd_start. 
     413            None,  # Placeholder for pd_stop. 
     414            call_details.buffer.ctypes.data,  # Problem definition. 
     415            values.ctypes.data,  # Parameter values. 
     416            self.q_input.q.ctypes.data,  # Q values. 
     417            self.result.ctypes.data,   # Result storage. 
     418            self._as_dtype(cutoff),  # Probability cutoff. 
     419            effective_radius_type,  # R_eff mode. 
    408420        ] 
     421 
     422        # Call kernel and retrieve results. 
    409423        #print("Calling DLL") 
    410424        #call_details.show(values) 
    411425        step = 100 
     426        # TODO: Do we need the explicit sleep like the OpenCL and CUDA loops? 
    412427        for start in range(0, call_details.num_eval, step): 
    413428            stop = min(start + step, call_details.num_eval) 
    414             args[1:3] = [start, stop] 
    415             kernel(*args) # type: ignore 
     429            kernel_args[1:3] = [start, stop] 
     430            kernel(*kernel_args) # type: ignore 
    416431 
    417432    def release(self): 
    418433        # type: () -> None 
    419434        """ 
    420         Release any resources associated with the kernel. 
     435        Release resources associated with the kernel. 
    421436        """ 
    422         self.q_input.release() 
     437        # TODO: OpenCL/CUDA allocate q_input in __init__ and free it in release. 
     438        # Should we be doing the same for DLL? 
     439        #self.q_input.release() 
     440        pass 
     441 
     442    def __del__(self): 
     443        # type: () -> None 
     444        self.release() 
  • sasmodels/kernelpy.py

    raa8c6e0 r3199b17  
    3333logger = logging.getLogger(__name__) 
    3434 
     35 
    3536class PyModel(KernelModel): 
    3637    """ 
     
    3839    """ 
    3940    def __init__(self, model_info): 
    40         # Make sure Iq is available and vectorized 
     41        # Make sure Iq is available and vectorized. 
    4142        _create_default_functions(model_info) 
    4243        self.info = model_info 
     
    5354        """ 
    5455        pass 
     56 
    5557 
    5658class PyInput(object): 
     
    9193        self.q = None 
    9294 
     95 
    9396class PyKernel(Kernel): 
    9497    """ 
     
    131134        parameter_vector = np.empty(len(partable.call_parameters)-2, 'd') 
    132135 
    133         # Create views into the array to hold the arguments 
     136        # Create views into the array to hold the arguments. 
    134137        offset = 0 
    135138        kernel_args, volume_args = [], [] 
     
    174177                        else (lambda mode: 1.0)) 
    175178 
    176  
    177  
    178179    def _call_kernel(self, call_details, values, cutoff, magnetic, effective_radius_type): 
    179180        # type: (CallDetails, np.ndarray, np.ndarray, float, bool) -> np.ndarray 
     
    195196        self.q_input.release() 
    196197        self.q_input = None 
     198 
    197199 
    198200def _loops(parameters,    # type: np.ndarray 
     
    254256        total = np.zeros(nq, 'd') 
    255257        for loop_index in range(call_details.num_eval): 
    256             # update polydispersity parameter values 
     258            # Update polydispersity parameter values. 
    257259            if p0_index == p0_length: 
    258260                pd_index = (loop_index//pd_stride)%pd_length 
     
    265267            p0_index += 1 
    266268            if weight > cutoff: 
    267                 # Call the scattering function 
     269                # Call the scattering function. 
    268270                # Assume that NaNs are only generated if the parameters are bad; 
    269271                # exclude all q for that NaN.  Even better would be to have an 
     
    273275                    continue 
    274276 
    275                 # update value and norm 
     277                # Update value and norm. 
    276278                total += weight * Iq 
    277279                weight_norm += weight 
     
    293295    any functions that are not already marked as vectorized. 
    294296    """ 
    295     # Note: must call create_vector_Iq before create_vector_Iqxy 
     297    # Note: Must call create_vector_Iq before create_vector_Iqxy. 
    296298    _create_vector_Iq(model_info) 
    297299    _create_vector_Iqxy(model_info) 
Note: See TracChangeset for help on using the changeset viewer.