Changeset 8064d5e in sasmodels for sasmodels/kernelcl.py


Ignore:
Timestamp:
Nov 9, 2018 2:23:18 PM (5 years ago)
Author:
Paul Kienzle <pkienzle@…>
Branches:
master, core_shell_microgels, magnetic_model, ticket-1257-vesicle-product, ticket_1156, ticket_1265_superball, ticket_822_more_unit_tests
Children:
c11d09f
Parents:
b093c1c (diff), cf3d0ce (diff)
Note: this is a merge changeset, the changes displayed below correspond to the merge itself.
Use the (diff) links above to see all the changes relative to each parent.
Message:

Merge branch 'beta_approx' into py3

File:
1 edited

Legend:

Unmodified
Added
Removed
  • sasmodels/kernelcl.py

    rf31815d r8064d5e  
    11""" 
    22GPU driver for C kernels 
     3 
     4TODO: docs are out of date 
    35 
    46There should be a single GPU environment running on the system.  This 
     
    6769import numpy as np  # type: ignore 
    6870 
    69 # Attempt to setup opencl. This may fail if the opencl package is not 
     71# Attempt to setup opencl. This may fail if the pyopencl package is not 
    7072# installed or if it is installed but there are no devices available. 
    7173try: 
     
    8284 
    8385from . import generate 
     86from .generate import F32, F64 
    8487from .kernel import KernelModel, Kernel 
    8588 
     
    139142 
    140143def use_opencl(): 
    141     return HAVE_OPENCL and os.environ.get("SAS_OPENCL", "").lower() != "none" 
     144    sas_opencl = os.environ.get("SAS_OPENCL", "OpenCL").lower() 
     145    return HAVE_OPENCL and sas_opencl != "none" and not sas_opencl.startswith("cuda") 
    142146 
    143147ENV = None 
     
    170174    Return true if device supports the requested precision. 
    171175    """ 
    172     if dtype == generate.F32: 
     176    if dtype == F32: 
    173177        return True 
    174     elif dtype == generate.F64: 
     178    elif dtype == F64: 
    175179        return "cl_khr_fp64" in device.extensions 
    176     elif dtype == generate.F16: 
    177         return "cl_khr_fp16" in device.extensions 
    178180    else: 
     181        # Not supporting F16 type since it isn't accurate enough 
    179182        return False 
    180183 
     
    187190        cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE, 
    188191        queue.device) 
    189  
    190 def _stretch_input(vector, dtype, extra=1e-3, boundary=32): 
    191     # type: (np.ndarray, np.dtype, float, int) -> np.ndarray 
    192     """ 
    193     Stretch an input vector to the correct boundary. 
    194  
    195     Performance on the kernels can drop by a factor of two or more if the 
    196     number of values to compute does not fall on a nice power of two 
    197     boundary.   The trailing additional vector elements are given a 
    198     value of *extra*, and so f(*extra*) will be computed for each of 
    199     them.  The returned array will thus be a subset of the computed array. 
    200  
    201     *boundary* should be a power of 2 which is at least 32 for good 
    202     performance on current platforms (as of Jan 2015).  It should 
    203     probably be the max of get_warp(kernel,queue) and 
    204     device.min_data_type_align_size//4. 
    205     """ 
    206     remainder = vector.size % boundary 
    207     if remainder != 0: 
    208         size = vector.size + (boundary - remainder) 
    209         vector = np.hstack((vector, [extra] * (size - vector.size))) 
    210     return np.ascontiguousarray(vector, dtype=dtype) 
    211  
    212192 
    213193def compile_model(context, source, dtype, fast=False): 
     
    247227    """ 
    248228    GPU context, with possibly many devices, and one queue per device. 
     229 
     230    Because the environment can be reset during a live program (e.g., if the 
     231    user changes the active GPU device in the GUI), everything associated 
     232    with the device context must be cached in the environment and recreated 
     233    if the environment changes.  The *cache* attribute is a simple dictionary 
     234    which holds keys and references to objects, such as compiled kernels and 
     235    allocated buffers.  The running program should check in the cache for 
     236    long lived objects and create them if they are not there.  The program 
     237    should not hold onto cached objects, but instead only keep them active 
     238    for the duration of a function call.  When the environment is destroyed 
     239    then the *release* method for each active cache item is called before 
     240    the environment is freed.  This means that each cl buffer should be 
     241    in its own cache entry. 
    249242    """ 
    250243    def __init__(self): 
    251244        # type: () -> None 
    252245        # find gpu context 
    253         #self.context = cl.create_some_context() 
    254  
    255         self.context = None 
    256         if 'SAS_OPENCL' in os.environ: 
    257             #Setting PYOPENCL_CTX as a SAS_OPENCL to create cl context 
    258             os.environ["PYOPENCL_CTX"] = os.environ["SAS_OPENCL"] 
    259         if 'PYOPENCL_CTX' in os.environ: 
    260             self._create_some_context() 
    261  
    262         if not self.context: 
    263             self.context = _get_default_context() 
     246        context_list = _create_some_context() 
     247 
     248        # Find a context for F32 and for F64 (maybe the same one). 
     249        # F16 isn't good enough. 
     250        self.context = {} 
     251        for dtype in (F32, F64): 
     252            for context in context_list: 
     253                if has_type(context.devices[0], dtype): 
     254                    self.context[dtype] = context 
     255                    break 
     256            else: 
     257                self.context[dtype] = None 
     258 
     259        # Build a queue for each context 
     260        self.queue = {} 
     261        context = self.context[F32] 
     262        self.queue[F32] = cl.CommandQueue(context, context.devices[0]) 
     263        if self.context[F64] == self.context[F32]: 
     264            self.queue[F64] = self.queue[F32] 
     265        else: 
     266            context = self.context[F64] 
     267            self.queue[F64] = cl.CommandQueue(context, context.devices[0]) 
    264268 
    265269        # Byte boundary for data alignment 
    266         #self.data_boundary = max(d.min_data_type_align_size 
    267         #                         for d in self.context.devices) 
    268         self.queues = [cl.CommandQueue(context, context.devices[0]) 
    269                        for context in self.context] 
     270        #self.data_boundary = max(context.devices[0].min_data_type_align_size 
     271        #                         for context in self.context.values()) 
     272 
     273        # Cache for compiled programs, and for items in context 
    270274        self.compiled = {} 
     275        self.cache = {} 
    271276 
    272277    def has_type(self, dtype): 
     
    275280        Return True if all devices support a given type. 
    276281        """ 
    277         return any(has_type(d, dtype) 
    278                    for context in self.context 
    279                    for d in context.devices) 
    280  
    281     def get_queue(self, dtype): 
    282         # type: (np.dtype) -> cl.CommandQueue 
    283         """ 
    284         Return a command queue for the kernels of type dtype. 
    285         """ 
    286         for context, queue in zip(self.context, self.queues): 
    287             if all(has_type(d, dtype) for d in context.devices): 
    288                 return queue 
    289  
    290     def get_context(self, dtype): 
    291         # type: (np.dtype) -> cl.Context 
    292         """ 
    293         Return a OpenCL context for the kernels of type dtype. 
    294         """ 
    295         for context in self.context: 
    296             if all(has_type(d, dtype) for d in context.devices): 
    297                 return context 
    298  
    299     def _create_some_context(self): 
    300         # type: () -> cl.Context 
    301         """ 
    302         Protected call to cl.create_some_context without interactivity.  Use 
    303         this if SAS_OPENCL is set in the environment.  Sets the *context* 
    304         attribute. 
    305         """ 
    306         try: 
    307             self.context = [cl.create_some_context(interactive=False)] 
    308         except Exception as exc: 
    309             warnings.warn(str(exc)) 
    310             warnings.warn("pyopencl.create_some_context() failed") 
    311             warnings.warn("the environment variable 'SAS_OPENCL' might not be set correctly") 
     282        return self.context.get(dtype, None) is not None 
    312283 
    313284    def compile_program(self, name, source, dtype, fast, timestamp): 
     
    326297            del self.compiled[key] 
    327298        if key not in self.compiled: 
    328             context = self.get_context(dtype) 
     299            context = self.context[dtype] 
    329300            logging.info("building %s for OpenCL %s", key, 
    330301                         context.devices[0].name.strip()) 
    331             program = compile_model(self.get_context(dtype), 
     302            program = compile_model(self.context[dtype], 
    332303                                    str(source), dtype, fast) 
    333304            self.compiled[key] = (program, timestamp) 
    334305        return program 
     306 
     307    def free_buffer(self, key): 
     308        if key in self.cache: 
     309            self.cache[key].release() 
     310            del self.cache[key] 
     311 
     312    def __del__(self): 
     313        for v in self.cache.values(): 
     314            release = getattr(v, 'release', lambda: None) 
     315            release() 
     316        self.cache = {} 
     317 
     318_CURRENT_ID = 0 
     319def unique_id(): 
     320    global _CURRENT_ID 
     321    _CURRENT_ID += 1 
     322    return _CURRENT_ID 
     323 
     324def _create_some_context(): 
     325    # type: () -> cl.Context 
     326    """ 
     327    Protected call to cl.create_some_context without interactivity. 
     328 
     329    Uses SAS_OPENCL or PYOPENCL_CTX if they are set in the environment, 
     330    otherwise scans for the most appropriate device using 
     331    :func:`_get_default_context`.  Ignore *SAS_OPENCL=OpenCL*, which 
     332    indicates that an OpenCL device should be used without specifying 
     333    which one (and not a CUDA device, or no GPU). 
     334    """ 
     335    # Assume we do not get here if SAS_OPENCL is None or CUDA 
     336    sas_opencl = os.environ.get('SAS_OPENCL', 'opencl') 
     337    if sas_opencl.lower() != 'opencl': 
     338        # Setting PYOPENCL_CTX as a SAS_OPENCL to create cl context 
     339        os.environ["PYOPENCL_CTX"] = sas_opencl 
     340 
     341    if 'PYOPENCL_CTX' in os.environ: 
     342        try: 
     343            return [cl.create_some_context(interactive=False)] 
     344        except Exception as exc: 
     345            warnings.warn(str(exc)) 
     346            warnings.warn("pyopencl.create_some_context() failed") 
     347            warnings.warn("the environment variable 'SAS_OPENCL' or 'PYOPENCL_CTX' might not be set correctly") 
     348 
     349    return _get_default_context() 
    335350 
    336351def _get_default_context(): 
     
    412427        self.dtype = dtype 
    413428        self.fast = fast 
    414         self.program = None # delay program creation 
    415         self._kernels = None 
     429        self.timestamp = generate.ocl_timestamp(self.info) 
     430        self._cache_key = unique_id() 
    416431 
    417432    def __getstate__(self): 
     
    422437        # type: (Tuple[ModelInfo, str, np.dtype, bool]) -> None 
    423438        self.info, self.source, self.dtype, self.fast = state 
    424         self.program = None 
    425439 
    426440    def make_kernel(self, q_vectors): 
    427441        # type: (List[np.ndarray]) -> "GpuKernel" 
    428         if self.program is None: 
    429             compile_program = environment().compile_program 
    430             timestamp = generate.ocl_timestamp(self.info) 
    431             self.program = compile_program( 
     442        return GpuKernel(self, q_vectors) 
     443 
     444    @property 
     445    def Iq(self): 
     446        return self._fetch_kernel('Iq') 
     447 
     448    def fetch_kernel(self, name): 
     449        # type: (str) -> cl.Kernel 
     450        """ 
     451        Fetch the kernel from the environment by name, compiling it if it 
     452        does not already exist. 
     453        """ 
     454        gpu = environment() 
     455        key = self._cache_key 
     456        if key not in gpu.cache: 
     457            program = gpu.compile_program( 
    432458                self.info.name, 
    433459                self.source['opencl'], 
    434460                self.dtype, 
    435461                self.fast, 
    436                 timestamp) 
     462                self.timestamp) 
    437463            variants = ['Iq', 'Iqxy', 'Imagnetic'] 
    438464            names = [generate.kernel_name(self.info, k) for k in variants] 
    439             kernels = [getattr(self.program, k) for k in names] 
    440             self._kernels = dict((k, v) for k, v in zip(variants, kernels)) 
    441         is_2d = len(q_vectors) == 2 
    442         if is_2d: 
    443             kernel = [self._kernels['Iqxy'], self._kernels['Imagnetic']] 
     465            kernels = [getattr(program, k) for k in names] 
     466            data = dict((k, v) for k, v in zip(variants, kernels)) 
     467            # keep a handle to program so GC doesn't collect 
     468            data['program'] = program 
     469            gpu.cache[key] = data 
    444470        else: 
    445             kernel = [self._kernels['Iq']]*2 
    446         return GpuKernel(kernel, self.dtype, self.info, q_vectors) 
    447  
    448     def release(self): 
    449         # type: () -> None 
    450         """ 
    451         Free the resources associated with the model. 
    452         """ 
    453         if self.program is not None: 
    454             self.program = None 
    455  
    456     def __del__(self): 
    457         # type: () -> None 
    458         self.release() 
     471            data = gpu.cache[key] 
     472        return data[name] 
    459473 
    460474# TODO: check that we don't need a destructor for buffers which go out of scope 
     
    481495        # type: (List[np.ndarray], np.dtype) -> None 
    482496        # TODO: do we ever need double precision q? 
    483         env = environment() 
    484497        self.nq = q_vectors[0].size 
    485498        self.dtype = np.dtype(dtype) 
     
    490503        # architectures tested so far. 
    491504        if self.is_2d: 
    492             # Note: 16 rather than 15 because result is 1 longer than input. 
    493             width = ((self.nq+16)//16)*16 
     505            width = ((self.nq+15)//16)*16 
    494506            self.q = np.empty((width, 2), dtype=dtype) 
    495507            self.q[:self.nq, 0] = q_vectors[0] 
    496508            self.q[:self.nq, 1] = q_vectors[1] 
    497509        else: 
    498             # Note: 32 rather than 31 because result is 1 longer than input. 
    499             width = ((self.nq+32)//32)*32 
     510            width = ((self.nq+31)//32)*32 
    500511            self.q = np.empty(width, dtype=dtype) 
    501512            self.q[:self.nq] = q_vectors[0] 
    502513        self.global_size = [self.q.shape[0]] 
    503         context = env.get_context(self.dtype) 
    504         #print("creating inputs of size", self.global_size) 
    505         self.q_b = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, 
    506                              hostbuf=self.q) 
     514        self._cache_key = unique_id() 
     515 
     516    @property 
     517    def q_b(self): 
     518        """Lazy creation of q buffer so it can survive context reset""" 
     519        env = environment() 
     520        key = self._cache_key 
     521        if key not in env.cache: 
     522            context = env.context[self.dtype] 
     523            #print("creating inputs of size", self.global_size) 
     524            buffer = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, 
     525                               hostbuf=self.q) 
     526            env.cache[key] = buffer 
     527        return env.cache[key] 
    507528 
    508529    def release(self): 
    509530        # type: () -> None 
    510531        """ 
    511         Free the memory. 
    512         """ 
    513         if self.q_b is not None: 
    514             self.q_b.release() 
    515             self.q_b = None 
     532        Free the buffer associated with the q value 
     533        """ 
     534        environment().free_buffer(id(self)) 
    516535 
    517536    def __del__(self): 
     
    523542    Callable SAS kernel. 
    524543 
    525     *kernel* is the GpuKernel object to call 
    526  
    527     *model_info* is the module information 
    528  
    529     *q_vectors* is the q vectors at which the kernel should be evaluated 
     544    *model* is the GpuModel object to call 
     545 
     546    The following attributes are defined: 
     547 
     548    *info* is the module information 
    530549 
    531550    *dtype* is the kernel precision 
     551 
     552    *dim* is '1d' or '2d' 
     553 
     554    *result* is a vector to contain the results of the call 
    532555 
    533556    The resulting call method takes the *pars*, a list of values for 
     
    539562    Call :meth:`release` when done with the kernel instance. 
    540563    """ 
    541     def __init__(self, kernel, dtype, model_info, q_vectors): 
     564    def __init__(self, model, q_vectors): 
    542565        # type: (cl.Kernel, np.dtype, ModelInfo, List[np.ndarray]) -> None 
    543         q_input = GpuInput(q_vectors, dtype) 
    544         self.kernel = kernel 
    545         self.info = model_info 
    546         self.dtype = dtype 
    547         self.dim = '2d' if q_input.is_2d else '1d' 
    548         # plus three for the normalization values 
    549         self.result = np.empty(q_input.nq+1, dtype) 
    550  
    551         # Inputs and outputs for each kernel call 
    552         # Note: res may be shorter than res_b if global_size != nq 
     566        dtype = model.dtype 
     567        self.q_input = GpuInput(q_vectors, dtype) 
     568        self._model = model 
     569        # F16 isn't sufficient, so don't support it 
     570        self._as_dtype = np.float64 if dtype == generate.F64 else np.float32 
     571        self._cache_key = unique_id() 
     572 
     573        # attributes accessed from the outside 
     574        self.dim = '2d' if self.q_input.is_2d else '1d' 
     575        self.info = model.info 
     576        self.dtype = model.dtype 
     577 
     578        # holding place for the returned value 
     579        nout = 2 if self.info.have_Fq and self.dim == '1d' else 1 
     580        extra_q = 4  # total weight, form volume, shell volume and R_eff 
     581        self.result = np.empty(self.q_input.nq*nout+extra_q, dtype) 
     582 
     583    @property 
     584    def _result_b(self): 
     585        """Lazy creation of result buffer so it can survive context reset""" 
    553586        env = environment() 
    554         self.queue = env.get_queue(dtype) 
    555  
    556         self.result_b = cl.Buffer(self.queue.context, mf.READ_WRITE, 
    557                                   q_input.global_size[0] * dtype.itemsize) 
    558         self.q_input = q_input # allocated by GpuInput above 
    559  
    560         self._need_release = [self.result_b, self.q_input] 
    561         self.real = (np.float32 if dtype == generate.F32 
    562                      else np.float64 if dtype == generate.F64 
    563                      else np.float16 if dtype == generate.F16 
    564                      else np.float32)  # will never get here, so use np.float32 
    565  
    566     def __call__(self, call_details, values, cutoff, magnetic): 
     587        key = self._cache_key 
     588        if key not in env.cache: 
     589            context = env.context[self.dtype] 
     590            width = ((self.result.size+31)//32)*32 * self.dtype.itemsize 
     591            buffer = cl.Buffer(context, mf.READ_WRITE, width) 
     592            env.cache[key] = buffer 
     593        return env.cache[key] 
     594 
     595    def _call_kernel(self, call_details, values, cutoff, magnetic, effective_radius_type): 
    567596        # type: (CallDetails, np.ndarray, np.ndarray, float, bool) -> np.ndarray 
    568         context = self.queue.context 
    569         # Arrange data transfer to card 
     597        env = environment() 
     598        queue = env.queue[self._model.dtype] 
     599        context = queue.context 
     600 
     601        # Arrange data transfer to/from card 
     602        q_b = self.q_input.q_b 
     603        result_b = self._result_b 
    570604        details_b = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, 
    571605                              hostbuf=call_details.buffer) 
     
    573607                             hostbuf=values) 
    574608 
    575         kernel = self.kernel[1 if magnetic else 0] 
    576         args = [ 
     609        name = 'Iq' if self.dim == '1d' else 'Imagnetic' if magnetic else 'Iqxy' 
     610        kernel = self._model.fetch_kernel(name) 
     611        kernel_args = [ 
    577612            np.uint32(self.q_input.nq), None, None, 
    578             details_b, values_b, self.q_input.q_b, self.result_b, 
    579             self.real(cutoff), 
     613            details_b, values_b, q_b, result_b, 
     614            self._as_dtype(cutoff), 
     615            np.uint32(effective_radius_type), 
    580616        ] 
    581617        #print("Calling OpenCL") 
    582618        #call_details.show(values) 
    583         # Call kernel and retrieve results 
     619        #Call kernel and retrieve results 
    584620        wait_for = None 
    585621        last_nap = clock() 
     
    588624            stop = min(start + step, call_details.num_eval) 
    589625            #print("queuing",start,stop) 
    590             args[1:3] = [np.int32(start), np.int32(stop)] 
    591             wait_for = [kernel(self.queue, self.q_input.global_size, None, 
    592                                *args, wait_for=wait_for)] 
     626            kernel_args[1:3] = [np.int32(start), np.int32(stop)] 
     627            wait_for = [kernel(queue, self.q_input.global_size, None, 
     628                               *kernel_args, wait_for=wait_for)] 
    593629            if stop < call_details.num_eval: 
    594630                # Allow other processes to run 
     
    596632                current_time = clock() 
    597633                if current_time - last_nap > 0.5: 
    598                     time.sleep(0.05) 
     634                    time.sleep(0.001) 
    599635                    last_nap = current_time 
    600         cl.enqueue_copy(self.queue, self.result, self.result_b) 
     636        cl.enqueue_copy(queue, self.result, result_b, wait_for=wait_for) 
    601637        #print("result", self.result) 
    602638 
     
    606642                v.release() 
    607643 
    608         pd_norm = self.result[self.q_input.nq] 
    609         scale = values[0]/(pd_norm if pd_norm != 0.0 else 1.0) 
    610         background = values[1] 
    611         #print("scale",scale,values[0],self.result[self.q_input.nq],background) 
    612         return scale*self.result[:self.q_input.nq] + background 
    613         # return self.result[:self.q_input.nq] 
    614  
    615644    def release(self): 
    616645        # type: () -> None 
     
    618647        Release resources associated with the kernel. 
    619648        """ 
    620         for v in self._need_release: 
    621             v.release() 
    622         self._need_release = [] 
     649        environment().free_buffer(id(self)) 
     650        self.q_input.release() 
    623651 
    624652    def __del__(self): 
Note: See TracChangeset for help on using the changeset viewer.