Changes in sasmodels/kernelcl.py [f872fd1:3199b17] in sasmodels


Ignore:
File:
1 edited

Legend:

Unmodified
Added
Removed
  • sasmodels/kernelcl.py

    rf872fd1 r3199b17  
    6161 
    6262 
    63 # Attempt to setup opencl. This may fail if the pyopencl package is not 
     63# Attempt to setup OpenCL. This may fail if the pyopencl package is not 
    6464# installed or if it is installed but there are no devices available. 
    6565try: 
     
    6767    from pyopencl import mem_flags as mf 
    6868    from pyopencl.characterize import get_fast_inaccurate_build_options 
    69     # Ask OpenCL for the default context so that we know that one exists 
     69    # Ask OpenCL for the default context so that we know that one exists. 
    7070    cl.create_some_context(interactive=False) 
    7171    HAVE_OPENCL = True 
     
    8888# pylint: enable=unused-import 
    8989 
    90 # CRUFT: pyopencl < 2017.1  (as of June 2016 needs quotes around include path) 
     90 
     91# CRUFT: pyopencl < 2017.1 (as of June 2016 needs quotes around include path). 
    9192def quote_path(v): 
    9293    """ 
     
    99100    return '"'+v+'"' if v and ' ' in v and not v[0] in "\"'-" else v 
    100101 
     102 
    101103def fix_pyopencl_include(): 
    102104    """ 
     
    105107    import pyopencl as cl 
    106108    if hasattr(cl, '_DEFAULT_INCLUDE_OPTIONS'): 
    107         cl._DEFAULT_INCLUDE_OPTIONS = [quote_path(v) for v in cl._DEFAULT_INCLUDE_OPTIONS] 
     109        cl._DEFAULT_INCLUDE_OPTIONS = [ 
     110            quote_path(v) for v in cl._DEFAULT_INCLUDE_OPTIONS 
     111            ] 
     112 
    108113 
    109114if HAVE_OPENCL: 
     
    118123MAX_LOOPS = 2048 
    119124 
    120  
    121125# Pragmas to enable OpenCL features.  Be sure to protect them so that they 
    122126# still compile even if OpenCL is not present. 
     
    133137""" 
    134138 
     139 
    135140def use_opencl(): 
    136141    sas_opencl = os.environ.get("SAS_OPENCL", "OpenCL").lower() 
    137142    return HAVE_OPENCL and sas_opencl != "none" and not sas_opencl.startswith("cuda") 
    138143 
     144 
    139145ENV = None 
    140146def reset_environment(): 
     
    144150    global ENV 
    145151    ENV = GpuEnvironment() if use_opencl() else None 
     152 
    146153 
    147154def environment(): 
     
    161168    return ENV 
    162169 
     170 
    163171def has_type(device, dtype): 
    164172    # type: (cl.Device, np.dtype) -> bool 
     
    171179        return "cl_khr_fp64" in device.extensions 
    172180    else: 
    173         # Not supporting F16 type since it isn't accurate enough 
     181        # Not supporting F16 type since it isn't accurate enough. 
    174182        return False 
     183 
    175184 
    176185def get_warp(kernel, queue): 
     
    182191        cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE, 
    183192        queue.device) 
     193 
    184194 
    185195def compile_model(context, source, dtype, fast=False): 
     
    203213        source_list.insert(0, _F64_PRAGMA) 
    204214 
    205     # Note: USE_SINCOS makes the intel cpu slower under opencl 
     215    # Note: USE_SINCOS makes the Intel CPU slower under OpenCL. 
    206216    if context.devices[0].type == cl.device_type.GPU: 
    207217        source_list.insert(0, "#define USE_SINCOS\n") 
     
    210220    source = "\n".join(source_list) 
    211221    program = cl.Program(context, source).build(options=options) 
     222 
    212223    #print("done with "+program) 
    213224    return program 
    214225 
    215226 
    216 # for now, this returns one device in the context 
    217 # TODO: create a context that contains all devices on all platforms 
     227# For now, this returns one device in the context. 
     228# TODO: Create a context that contains all devices on all platforms. 
    218229class GpuEnvironment(object): 
    219230    """ 
    220     GPU context, with possibly many devices, and one queue per device. 
    221  
    222     Because the environment can be reset during a live program (e.g., if the 
    223     user changes the active GPU device in the GUI), everything associated 
    224     with the device context must be cached in the environment and recreated 
    225     if the environment changes.  The *cache* attribute is a simple dictionary 
    226     which holds keys and references to objects, such as compiled kernels and 
    227     allocated buffers.  The running program should check in the cache for 
    228     long lived objects and create them if they are not there.  The program 
    229     should not hold onto cached objects, but instead only keep them active 
    230     for the duration of a function call.  When the environment is destroyed 
    231     then the *release* method for each active cache item is called before 
    232     the environment is freed.  This means that each cl buffer should be 
    233     in its own cache entry. 
     231    GPU context for OpenCL, with possibly many devices and one queue per device. 
    234232    """ 
    235233    def __init__(self): 
    236234        # type: () -> None 
    237         # find gpu context 
     235        # Find gpu context. 
    238236        context_list = _create_some_context() 
    239237 
     
    249247                self.context[dtype] = None 
    250248 
    251         # Build a queue for each context 
     249        # Build a queue for each context. 
    252250        self.queue = {} 
    253251        context = self.context[F32] 
     
    259257            self.queue[F64] = cl.CommandQueue(context, context.devices[0]) 
    260258 
    261         # Byte boundary for data alignment 
     259        ## Byte boundary for data alignment. 
    262260        #self.data_boundary = max(context.devices[0].min_data_type_align_size 
    263261        #                         for context in self.context.values()) 
    264262 
    265         # Cache for compiled programs, and for items in context 
     263        # Cache for compiled programs, and for items in context. 
    266264        self.compiled = {} 
    267         self.cache = {} 
    268265 
    269266    def has_type(self, dtype): 
     
    280277        """ 
    281278        # Note: PyOpenCL caches based on md5 hash of source, options and device 
    282         # so we don't really need to cache things for ourselves.  I'll do so 
    283         # anyway just to save some data munging time. 
     279        # but I'll do so as well just to save some data munging time. 
    284280        tag = generate.tag_source(source) 
    285281        key = "%s-%s-%s%s"%(name, dtype, tag, ("-fast" if fast else "")) 
    286         # Check timestamp on program 
     282        # Check timestamp on program. 
    287283        program, program_timestamp = self.compiled.get(key, (None, np.inf)) 
    288284        if program_timestamp < timestamp: 
     
    297293        return program 
    298294 
    299     def free_buffer(self, key): 
    300         if key in self.cache: 
    301             self.cache[key].release() 
    302             del self.cache[key] 
    303  
    304     def __del__(self): 
    305         for v in self.cache.values(): 
    306             release = getattr(v, 'release', lambda: None) 
    307             release() 
    308         self.cache = {} 
    309  
    310 _CURRENT_ID = 0 
    311 def unique_id(): 
    312     global _CURRENT_ID 
    313     _CURRENT_ID += 1 
    314     return _CURRENT_ID 
    315295 
    316296def _create_some_context(): 
     
    325305    which one (and not a CUDA device, or no GPU). 
    326306    """ 
    327     # Assume we do not get here if SAS_OPENCL is None or CUDA 
     307    # Assume we do not get here if SAS_OPENCL is None or CUDA. 
    328308    sas_opencl = os.environ.get('SAS_OPENCL', 'opencl') 
    329309    if sas_opencl.lower() != 'opencl': 
    330         # Setting PYOPENCL_CTX as a SAS_OPENCL to create cl context 
     310        # Setting PYOPENCL_CTX as a SAS_OPENCL to create cl context. 
    331311        os.environ["PYOPENCL_CTX"] = sas_opencl 
    332312 
     
    336316        except Exception as exc: 
    337317            warnings.warn(str(exc)) 
    338             warnings.warn("pyopencl.create_some_context() failed") 
    339             warnings.warn("the environment variable 'SAS_OPENCL' or 'PYOPENCL_CTX' might not be set correctly") 
     318            warnings.warn("pyopencl.create_some_context() failed.  The " 
     319                "environment variable 'SAS_OPENCL' or 'PYOPENCL_CTX' might " 
     320                "not be set correctly") 
    340321 
    341322    return _get_default_context() 
     323 
    342324 
    343325def _get_default_context(): 
     
    352334    # is running may increase throughput. 
    353335    # 
    354     # Macbook pro, base install: 
     336    # MacBook Pro, base install: 
    355337    #     {'Apple': [Intel CPU, NVIDIA GPU]} 
    356     # Macbook pro, base install: 
     338    # MacBook Pro, base install: 
    357339    #     {'Apple': [Intel CPU, Intel GPU]} 
    358     # 2 x nvidia 295 with Intel and NVIDIA opencl drivers installed 
     340    # 2 x NVIDIA 295 with Intel and NVIDIA opencl drivers installed: 
    359341    #     {'Intel': [CPU], 'NVIDIA': [GPU, GPU, GPU, GPU]} 
    360342    gpu, cpu = None, None 
     
    379361            else: 
    380362                # System has cl.device_type.ACCELERATOR or cl.device_type.CUSTOM 
    381                 # Intel Phi for example registers as an accelerator 
     363                # Intel Phi for example registers as an accelerator. 
    382364                # Since the user installed a custom device on their system 
    383365                # and went through the pain of sorting out OpenCL drivers for 
     
    386368                gpu = device 
    387369 
    388     # order the devices by gpu then by cpu; when searching for an available 
     370    # Order the devices by gpu then by cpu; when searching for an available 
    389371    # device by data type they will be checked in this order, which means 
    390372    # that if the gpu supports double then the cpu will never be used (though 
     
    413395    that the compiler is allowed to take shortcuts. 
    414396    """ 
     397    info = None  # type: ModelInfo 
     398    source = ""  # type: str 
     399    dtype = None  # type: np.dtype 
     400    fast = False  # type: bool 
     401    _program = None  # type: cl.Program 
     402    _kernels = None  # type: Dict[str, cl.Kernel] 
     403 
    415404    def __init__(self, source, model_info, dtype=generate.F32, fast=False): 
    416405        # type: (Dict[str,str], ModelInfo, np.dtype, bool) -> None 
     
    419408        self.dtype = dtype 
    420409        self.fast = fast 
    421         self.timestamp = generate.ocl_timestamp(self.info) 
    422         self._cache_key = unique_id() 
    423410 
    424411    def __getstate__(self): 
     
    429416        # type: (Tuple[ModelInfo, str, np.dtype, bool]) -> None 
    430417        self.info, self.source, self.dtype, self.fast = state 
     418        self._program = self._kernels = None 
    431419 
    432420    def make_kernel(self, q_vectors): 
     
    434422        return GpuKernel(self, q_vectors) 
    435423 
    436     @property 
    437     def Iq(self): 
    438         return self._fetch_kernel('Iq') 
    439  
    440     def fetch_kernel(self, name): 
     424    def get_function(self, name): 
    441425        # type: (str) -> cl.Kernel 
    442426        """ 
     
    444428        does not already exist. 
    445429        """ 
    446         gpu = environment() 
    447         key = self._cache_key 
    448         if key not in gpu.cache: 
    449             program = gpu.compile_program( 
    450                 self.info.name, 
    451                 self.source['opencl'], 
    452                 self.dtype, 
    453                 self.fast, 
    454                 self.timestamp) 
    455             variants = ['Iq', 'Iqxy', 'Imagnetic'] 
    456             names = [generate.kernel_name(self.info, k) for k in variants] 
    457             kernels = [getattr(program, k) for k in names] 
    458             data = dict((k, v) for k, v in zip(variants, kernels)) 
    459             # keep a handle to program so GC doesn't collect 
    460             data['program'] = program 
    461             gpu.cache[key] = data 
    462         else: 
    463             data = gpu.cache[key] 
    464         return data[name] 
    465  
    466 # TODO: check that we don't need a destructor for buffers which go out of scope 
     430        if self._program is None: 
     431            self._prepare_program() 
     432        return self._kernels[name] 
     433 
     434    def _prepare_program(self): 
     435        # type: (str) -> None 
     436        env = environment() 
     437        timestamp = generate.ocl_timestamp(self.info) 
     438        program = env.compile_program( 
     439            self.info.name, 
     440            self.source['opencl'], 
     441            self.dtype, 
     442            self.fast, 
     443            timestamp) 
     444        variants = ['Iq', 'Iqxy', 'Imagnetic'] 
     445        names = [generate.kernel_name(self.info, k) for k in variants] 
     446        functions = [getattr(program, k) for k in names] 
     447        self._kernels = {k: v for k, v in zip(variants, functions)} 
     448        # Keep a handle to program so GC doesn't collect. 
     449        self._program = program 
     450 
     451 
     452# TODO: Check that we don't need a destructor for buffers which go out of scope. 
    467453class GpuInput(object): 
    468454    """ 
     
    486472    def __init__(self, q_vectors, dtype=generate.F32): 
    487473        # type: (List[np.ndarray], np.dtype) -> None 
    488         # TODO: do we ever need double precision q? 
     474        # TODO: Do we ever need double precision q? 
    489475        self.nq = q_vectors[0].size 
    490476        self.dtype = np.dtype(dtype) 
    491477        self.is_2d = (len(q_vectors) == 2) 
    492         # TODO: stretch input based on get_warp() 
    493         # not doing it now since warp depends on kernel, which is not known 
     478        # TODO: Stretch input based on get_warp(). 
     479        # Not doing it now since warp depends on kernel, which is not known 
    494480        # at this point, so instead using 32, which is good on the set of 
    495481        # architectures tested so far. 
     
    504490            self.q[:self.nq] = q_vectors[0] 
    505491        self.global_size = [self.q.shape[0]] 
    506         self._cache_key = unique_id() 
    507  
    508     @property 
    509     def q_b(self): 
    510         """Lazy creation of q buffer so it can survive context reset""" 
     492        #print("creating inputs of size", self.global_size) 
     493 
     494        # Transfer input value to GPU. 
    511495        env = environment() 
    512         key = self._cache_key 
    513         if key not in env.cache: 
    514             context = env.context[self.dtype] 
    515             #print("creating inputs of size", self.global_size) 
    516             buffer = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, 
    517                                hostbuf=self.q) 
    518             env.cache[key] = buffer 
    519         return env.cache[key] 
     496        context = env.context[self.dtype] 
     497        self.q_b = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, 
     498                             hostbuf=self.q) 
    520499 
    521500    def release(self): 
    522501        # type: () -> None 
    523502        """ 
    524         Free the buffer associated with the q value 
    525         """ 
    526         environment().free_buffer(id(self)) 
     503        Free the buffer associated with the q value. 
     504        """ 
     505        if self.q_b is not None: 
     506            self.q_b.release() 
     507            self.q_b = None 
    527508 
    528509    def __del__(self): 
     
    530511        self.release() 
    531512 
     513 
    532514class GpuKernel(Kernel): 
    533515    """ 
     
    536518    *model* is the GpuModel object to call 
    537519 
    538     The following attributes are defined: 
    539  
    540     *info* is the module information 
    541  
    542     *dtype* is the kernel precision 
    543  
    544     *dim* is '1d' or '2d' 
    545  
    546     *result* is a vector to contain the results of the call 
    547  
    548     The resulting call method takes the *pars*, a list of values for 
    549     the fixed parameters to the kernel, and *pd_pars*, a list of (value,weight) 
    550     vectors for the polydisperse parameters.  *cutoff* determines the 
    551     integration limits: any points with combined weight less than *cutoff* 
    552     will not be calculated. 
     520    The kernel is derived from :class:`Kernel`, providing the 
     521    :meth:`call_kernel` method to evaluate the kernel for a given set of 
     522    parameters.  Because of the need to move the q values to the GPU before 
     523    evaluation, the kernel is instantiated for a particular set of q vectors, 
     524    and can be called many times without transferring q each time. 
    553525 
    554526    Call :meth:`release` when done with the kernel instance. 
    555527    """ 
     528    #: SAS model information structure. 
     529    info = None  # type: ModelInfo 
     530    #: Kernel precision. 
     531    dtype = None  # type: np.dtype 
     532    #: Kernel dimensions (1d or 2d). 
     533    dim = ""  # type: str 
     534    #: Calculation results, updated after each call to :meth:`_call_kernel`. 
     535    result = None  # type: np.ndarray 
     536 
    556537    def __init__(self, model, q_vectors): 
    557         # type: (cl.Kernel, np.dtype, ModelInfo, List[np.ndarray]) -> None 
     538        # type: (GpuModel, List[np.ndarray]) -> None 
    558539        dtype = model.dtype 
    559540        self.q_input = GpuInput(q_vectors, dtype) 
    560541        self._model = model 
    561         # F16 isn't sufficient, so don't support it 
    562         self._as_dtype = np.float64 if dtype == generate.F64 else np.float32 
    563         self._cache_key = unique_id() 
    564  
    565         # attributes accessed from the outside 
     542 
     543        # Attributes accessed from the outside. 
    566544        self.dim = '2d' if self.q_input.is_2d else '1d' 
    567545        self.info = model.info 
    568         self.dtype = model.dtype 
    569  
    570         # holding place for the returned value 
     546        self.dtype = dtype 
     547 
     548        # Converter to translate input to target type. 
     549        self._as_dtype = np.float64 if dtype == generate.F64 else np.float32 
     550 
     551        # Holding place for the returned value. 
    571552        nout = 2 if self.info.have_Fq and self.dim == '1d' else 1 
    572         extra_q = 4  # total weight, form volume, shell volume and R_eff 
    573         self.result = np.empty(self.q_input.nq*nout+extra_q, dtype) 
    574  
    575     @property 
    576     def _result_b(self): 
    577         """Lazy creation of result buffer so it can survive context reset""" 
     553        extra_q = 4  # Total weight, form volume, shell volume and R_eff. 
     554        self.result = np.empty(self.q_input.nq*nout + extra_q, dtype) 
     555 
     556        # Allocate result value on GPU. 
    578557        env = environment() 
    579         key = self._cache_key 
    580         if key not in env.cache: 
    581             context = env.context[self.dtype] 
    582             width = ((self.result.size+31)//32)*32 * self.dtype.itemsize 
    583             buffer = cl.Buffer(context, mf.READ_WRITE, width) 
    584             env.cache[key] = buffer 
    585         return env.cache[key] 
    586  
    587     def _call_kernel(self, call_details, values, cutoff, magnetic, effective_radius_type): 
    588         # type: (CallDetails, np.ndarray, np.ndarray, float, bool) -> np.ndarray 
     558        context = env.context[self.dtype] 
     559        width = ((self.result.size+31)//32)*32 * self.dtype.itemsize 
     560        self._result_b = cl.Buffer(context, mf.READ_WRITE, width) 
     561 
     562    def _call_kernel(self, call_details, values, cutoff, magnetic, 
     563                     effective_radius_type): 
     564        # type: (CallDetails, np.ndarray, float, bool, int) -> np.ndarray 
    589565        env = environment() 
    590566        queue = env.queue[self._model.dtype] 
    591567        context = queue.context 
    592568 
    593         # Arrange data transfer to/from card 
    594         q_b = self.q_input.q_b 
    595         result_b = self._result_b 
     569        # Arrange data transfer to card. 
    596570        details_b = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, 
    597571                              hostbuf=call_details.buffer) 
     
    599573                             hostbuf=values) 
    600574 
     575        # Setup kernel function and arguments. 
    601576        name = 'Iq' if self.dim == '1d' else 'Imagnetic' if magnetic else 'Iqxy' 
    602         kernel = self._model.fetch_kernel(name) 
     577        kernel = self._model.get_function(name) 
    603578        kernel_args = [ 
    604             np.uint32(self.q_input.nq), None, None, 
    605             details_b, values_b, q_b, result_b, 
    606             self._as_dtype(cutoff), 
    607             np.uint32(effective_radius_type), 
     579            np.uint32(self.q_input.nq),  # Number of inputs. 
     580            None,  # Placeholder for pd_start. 
     581            None,  # Placeholder for pd_stop. 
     582            details_b,  # Problem definition. 
     583            values_b,  # Parameter values. 
     584            self.q_input.q_b,  # Q values. 
     585            self._result_b,   # Result storage. 
     586            self._as_dtype(cutoff),  # Probability cutoff. 
     587            np.uint32(effective_radius_type),  # R_eff mode. 
    608588        ] 
     589 
     590        # Call kernel and retrieve results. 
    609591        #print("Calling OpenCL") 
    610592        #call_details.show(values) 
    611         #Call kernel and retrieve results 
    612593        wait_for = None 
    613594        last_nap = time.clock() 
     
    620601                               *kernel_args, wait_for=wait_for)] 
    621602            if stop < call_details.num_eval: 
    622                 # Allow other processes to run 
     603                # Allow other processes to run. 
    623604                wait_for[0].wait() 
    624605                current_time = time.clock() 
     
    626607                    time.sleep(0.001) 
    627608                    last_nap = current_time 
    628         cl.enqueue_copy(queue, self.result, result_b, wait_for=wait_for) 
     609        cl.enqueue_copy(queue, self.result, self._result_b, wait_for=wait_for) 
    629610        #print("result", self.result) 
    630611 
    631         # Free buffers 
    632         for v in (details_b, values_b): 
    633             if v is not None: 
    634                 v.release() 
     612        # Free buffers. 
     613        details_b.release() 
     614        values_b.release() 
    635615 
    636616    def release(self): 
     
    639619        Release resources associated with the kernel. 
    640620        """ 
    641         environment().free_buffer(id(self)) 
    642621        self.q_input.release() 
     622        if self._result_b is not None: 
     623            self._result_b.release() 
     624            self._result_b = None 
    643625 
    644626    def __del__(self): 
Note: See TracChangeset for help on using the changeset viewer.