← Previous Change
Next Change →

Changeset 8064d5e in sasmodels for sasmodels/kernelcl.py

Timestamp:

Nov 9, 2018 2:23:18 PM (5 years ago)

Author:

Paul Kienzle <pkienzle@…>

Branches:

master, core_shell_microgels, magnetic_model, ticket-1257-vesicle-product, ticket_1156, ticket_1265_superball, ticket_822_more_unit_tests

Children:

c11d09f

Parents:

b093c1c (diff), cf3d0ce (diff)
Note: this is a merge changeset, the changes displayed below correspond to the merge itself.
Use the (diff) links above to see all the changes relative to each parent.

Message:

Merge branch 'beta_approx' into py3

File:

: 1 edited

sasmodels/kernelcl.py (modified) (20 diffs)

Legend:

: Unmodified
: Added
: Removed

sasmodels/kernelcl.py

-                      rf31815d
+                      r8064d5e
 """
 GPU driver for C kernels
+TODO: docs are out of date
 There should be a single GPU environment running on the system.  This
 …
 import numpy as np  # type: ignore
 # Attempt to setup opencl. This may fail if the opencl package is not
+# Attempt to setup opencl. This may fail if the pyopencl package is not
 # installed or if it is installed but there are no devices available.
 try:
 …
 from . import generate
+from .generate import F32, F64
 from .kernel import KernelModel, Kernel
 …
 def use_opencl():
+    return HAVE_OPENCL and os.environ.get("SAS_OPENCL", "").lower() != "none"
+    sas_opencl = os.environ.get("SAS_OPENCL", "OpenCL").lower()
+    return HAVE_OPENCL and sas_opencl != "none" and not sas_opencl.startswith("cuda")
 ENV = None
 …
     Return true if device supports the requested precision.
     """
     if dtype == generate.F32:
+    if dtype == F32:
         return True
     elif dtype == generate.F64:
+    elif dtype == F64:
         return "cl_khr_fp64" in device.extensions
-    elif dtype == generate.F16:
-        return "cl_khr_fp16" in device.extensions
     else:
+        # Not supporting F16 type since it isn't accurate enough
         return False
 …
         cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
         queue.device)
-def _stretch_input(vector, dtype, extra=1e-3, boundary=32):
-    # type: (np.ndarray, np.dtype, float, int) -> np.ndarray
-    """
-    Stretch an input vector to the correct boundary.
-    Performance on the kernels can drop by a factor of two or more if the
-    number of values to compute does not fall on a nice power of two
-    boundary.   The trailing additional vector elements are given a
-    value of *extra*, and so f(*extra*) will be computed for each of
-    them.  The returned array will thus be a subset of the computed array.
-    *boundary* should be a power of 2 which is at least 32 for good
-    performance on current platforms (as of Jan 2015).  It should
-    probably be the max of get_warp(kernel,queue) and
-    device.min_data_type_align_size//4.
-    """
-    remainder = vector.size % boundary
-    if remainder != 0:
-        size = vector.size + (boundary - remainder)
-        vector = np.hstack((vector, [extra] * (size - vector.size)))
-    return np.ascontiguousarray(vector, dtype=dtype)
 def compile_model(context, source, dtype, fast=False):
 …
     """
     GPU context, with possibly many devices, and one queue per device.
+    Because the environment can be reset during a live program (e.g., if the
+    user changes the active GPU device in the GUI), everything associated
+    with the device context must be cached in the environment and recreated
+    if the environment changes.  The *cache* attribute is a simple dictionary
+    which holds keys and references to objects, such as compiled kernels and
+    allocated buffers.  The running program should check in the cache for
+    long lived objects and create them if they are not there.  The program
+    should not hold onto cached objects, but instead only keep them active
+    for the duration of a function call.  When the environment is destroyed
+    then the *release* method for each active cache item is called before
+    the environment is freed.  This means that each cl buffer should be
+    in its own cache entry.
     """
     def __init__(self):
         # type: () -> None
         # find gpu context
+        #self.context = cl.create_some_context()
+        self.context = None
+        if 'SAS_OPENCL' in os.environ:
+            #Setting PYOPENCL_CTX as a SAS_OPENCL to create cl context
+            os.environ["PYOPENCL_CTX"] = os.environ["SAS_OPENCL"]
+        if 'PYOPENCL_CTX' in os.environ:
+            self._create_some_context()
+        if not self.context:
+            self.context = _get_default_context()
+        context_list = _create_some_context()
+        # Find a context for F32 and for F64 (maybe the same one).
+        # F16 isn't good enough.
+        self.context = {}
+        for dtype in (F32, F64):
+            for context in context_list:
+                if has_type(context.devices[0], dtype):
+                    self.context[dtype] = context
+                    break
+            else:
+                self.context[dtype] = None
+        # Build a queue for each context
+        self.queue = {}
+        context = self.context[F32]
+        self.queue[F32] = cl.CommandQueue(context, context.devices[0])
+        if self.context[F64] == self.context[F32]:
+            self.queue[F64] = self.queue[F32]
+        else:
+            context = self.context[F64]
+            self.queue[F64] = cl.CommandQueue(context, context.devices[0])
         # Byte boundary for data alignment
         #self.data_boundary = max(d.min_data_type_align_size
         #                         for d in self.context.devices)
+        self.queues = [cl.CommandQueue(context, context.devices[0])
                        for context in self.context]
+        #self.data_boundary = max(context.devices[0].min_data_type_align_size
+        #                         for context in self.context.values())
+        # Cache for compiled programs, and for items in context
         self.compiled = {}
+        self.cache = {}
     def has_type(self, dtype):
 …
         Return True if all devices support a given type.
         """
+        return any(has_type(d, dtype)
+                   for context in self.context
+                   for d in context.devices)
+    def get_queue(self, dtype):
+        # type: (np.dtype) -> cl.CommandQueue
+        """
+        Return a command queue for the kernels of type dtype.
+        """
+        for context, queue in zip(self.context, self.queues):
+            if all(has_type(d, dtype) for d in context.devices):
+                return queue
+    def get_context(self, dtype):
+        # type: (np.dtype) -> cl.Context
+        """
+        Return a OpenCL context for the kernels of type dtype.
+        """
+        for context in self.context:
+            if all(has_type(d, dtype) for d in context.devices):
+                return context
+    def _create_some_context(self):
+        # type: () -> cl.Context
+        """
+        Protected call to cl.create_some_context without interactivity.  Use
+        this if SAS_OPENCL is set in the environment.  Sets the *context*
+        attribute.
+        """
+        try:
+            self.context = [cl.create_some_context(interactive=False)]
+        except Exception as exc:
+            warnings.warn(str(exc))
+            warnings.warn("pyopencl.create_some_context() failed")
+            warnings.warn("the environment variable 'SAS_OPENCL' might not be set correctly")
+        return self.context.get(dtype, None) is not None
     def compile_program(self, name, source, dtype, fast, timestamp):
 …
             del self.compiled[key]
         if key not in self.compiled:
             context = self.get_context(dtype)
+            context = self.context[dtype]
             logging.info("building %s for OpenCL %s", key,
                          context.devices[0].name.strip())
             program = compile_model(self.get_context(dtype),
+            program = compile_model(self.context[dtype],
                                     str(source), dtype, fast)
             self.compiled[key] = (program, timestamp)
         return program
+    def free_buffer(self, key):
+        if key in self.cache:
+            self.cache[key].release()
+            del self.cache[key]
+    def __del__(self):
+        for v in self.cache.values():
+            release = getattr(v, 'release', lambda: None)
+            release()
+        self.cache = {}
+_CURRENT_ID = 0
+def unique_id():
+    global _CURRENT_ID
+    _CURRENT_ID += 1
+    return _CURRENT_ID
+def _create_some_context():
+    # type: () -> cl.Context
+    """
+    Protected call to cl.create_some_context without interactivity.
+    Uses SAS_OPENCL or PYOPENCL_CTX if they are set in the environment,
+    otherwise scans for the most appropriate device using
+    :func:`_get_default_context`.  Ignore *SAS_OPENCL=OpenCL*, which
+    indicates that an OpenCL device should be used without specifying
+    which one (and not a CUDA device, or no GPU).
+    """
+    # Assume we do not get here if SAS_OPENCL is None or CUDA
+    sas_opencl = os.environ.get('SAS_OPENCL', 'opencl')
+    if sas_opencl.lower() != 'opencl':
+        # Setting PYOPENCL_CTX as a SAS_OPENCL to create cl context
+        os.environ["PYOPENCL_CTX"] = sas_opencl
+    if 'PYOPENCL_CTX' in os.environ:
+        try:
+            return [cl.create_some_context(interactive=False)]
+        except Exception as exc:
+            warnings.warn(str(exc))
+            warnings.warn("pyopencl.create_some_context() failed")
+            warnings.warn("the environment variable 'SAS_OPENCL' or 'PYOPENCL_CTX' might not be set correctly")
+    return _get_default_context()
 def _get_default_context():
 …
         self.dtype = dtype
         self.fast = fast
         self.program = None # delay program creation
         self._kernels = None
+        self.timestamp = generate.ocl_timestamp(self.info)
+        self._cache_key = unique_id()
     def __getstate__(self):
 …
         # type: (Tuple[ModelInfo, str, np.dtype, bool]) -> None
         self.info, self.source, self.dtype, self.fast = state
-        self.program = None
     def make_kernel(self, q_vectors):
         # type: (List[np.ndarray]) -> "GpuKernel"
+        if self.program is None:
+            compile_program = environment().compile_program
+            timestamp = generate.ocl_timestamp(self.info)
+            self.program = compile_program(
+        return GpuKernel(self, q_vectors)
+    @property
+    def Iq(self):
+        return self._fetch_kernel('Iq')
+    def fetch_kernel(self, name):
+        # type: (str) -> cl.Kernel
+        """
+        Fetch the kernel from the environment by name, compiling it if it
+        does not already exist.
+        """
+        gpu = environment()
+        key = self._cache_key
+        if key not in gpu.cache:
+            program = gpu.compile_program(
                 self.info.name,
                 self.source['opencl'],
                 self.dtype,
                 self.fast,
                 timestamp)
+                self.timestamp)
             variants = ['Iq', 'Iqxy', 'Imagnetic']
             names = [generate.kernel_name(self.info, k) for k in variants]
             kernels = [getattr(self.program, k) for k in names]
             self._kernels = dict((k, v) for k, v in zip(variants, kernels))
         is_2d = len(q_vectors) == 2
         if is_2d:
             kernel = [self._kernels['Iqxy'], self._kernels['Imagnetic']]
+            kernels = [getattr(program, k) for k in names]
+            data = dict((k, v) for k, v in zip(variants, kernels))
+            # keep a handle to program so GC doesn't collect
+            data['program'] = program
+            gpu.cache[key] = data
         else:
+            kernel = [self._kernels['Iq']]*2
+        return GpuKernel(kernel, self.dtype, self.info, q_vectors)
+    def release(self):
+        # type: () -> None
+        """
+        Free the resources associated with the model.
+        """
+        if self.program is not None:
+            self.program = None
+    def __del__(self):
+        # type: () -> None
+        self.release()
+            data = gpu.cache[key]
+        return data[name]
 # TODO: check that we don't need a destructor for buffers which go out of scope
 …
         # type: (List[np.ndarray], np.dtype) -> None
         # TODO: do we ever need double precision q?
-        env = environment()
         self.nq = q_vectors[0].size
         self.dtype = np.dtype(dtype)
 …
         # architectures tested so far.
         if self.is_2d:
+            # Note: 16 rather than 15 because result is 1 longer than input.
+            width = ((self.nq+16)//16)*16
+            width = ((self.nq+15)//16)*16
             self.q = np.empty((width, 2), dtype=dtype)
             self.q[:self.nq, 0] = q_vectors[0]
             self.q[:self.nq, 1] = q_vectors[1]
         else:
+            # Note: 32 rather than 31 because result is 1 longer than input.
+            width = ((self.nq+32)//32)*32
+            width = ((self.nq+31)//32)*32
             self.q = np.empty(width, dtype=dtype)
             self.q[:self.nq] = q_vectors[0]
         self.global_size = [self.q.shape[0]]
+        context = env.get_context(self.dtype)
+        #print("creating inputs of size", self.global_size)
+        self.q_b = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR,
+                             hostbuf=self.q)
+        self._cache_key = unique_id()
+    @property
+    def q_b(self):
+        """Lazy creation of q buffer so it can survive context reset"""
+        env = environment()
+        key = self._cache_key
+        if key not in env.cache:
+            context = env.context[self.dtype]
+            #print("creating inputs of size", self.global_size)
+            buffer = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR,
+                               hostbuf=self.q)
+            env.cache[key] = buffer
+        return env.cache[key]
     def release(self):
         # type: () -> None
         """
+        Free the memory.
+        """
+        if self.q_b is not None:
+            self.q_b.release()
+            self.q_b = None
+        Free the buffer associated with the q value
+        """
+        environment().free_buffer(id(self))
     def __del__(self):
 …
     Callable SAS kernel.
     *kernel* is the GpuKernel object to call
     *model_info* is the module information
     *q_vectors* is the q vectors at which the kernel should be evaluated
+    *model* is the GpuModel object to call
+    The following attributes are defined:
+    *info* is the module information
     *dtype* is the kernel precision
+    *dim* is '1d' or '2d'
+    *result* is a vector to contain the results of the call
     The resulting call method takes the *pars*, a list of values for
 …
     Call :meth:`release` when done with the kernel instance.
     """
     def __init__(self, kernel, dtype, model_info, q_vectors):
+    def __init__(self, model, q_vectors):
         # type: (cl.Kernel, np.dtype, ModelInfo, List[np.ndarray]) -> None
+        q_input = GpuInput(q_vectors, dtype)
+        self.kernel = kernel
+        self.info = model_info
+        self.dtype = dtype
+        self.dim = '2d' if q_input.is_2d else '1d'
+        # plus three for the normalization values
+        self.result = np.empty(q_input.nq+1, dtype)
+        # Inputs and outputs for each kernel call
+        # Note: res may be shorter than res_b if global_size != nq
+        dtype = model.dtype
+        self.q_input = GpuInput(q_vectors, dtype)
+        self._model = model
+        # F16 isn't sufficient, so don't support it
+        self._as_dtype = np.float64 if dtype == generate.F64 else np.float32
+        self._cache_key = unique_id()
+        # attributes accessed from the outside
+        self.dim = '2d' if self.q_input.is_2d else '1d'
+        self.info = model.info
+        self.dtype = model.dtype
+        # holding place for the returned value
+        nout = 2 if self.info.have_Fq and self.dim == '1d' else 1
+        extra_q = 4  # total weight, form volume, shell volume and R_eff
+        self.result = np.empty(self.q_input.nq*nout+extra_q, dtype)
+    @property
+    def _result_b(self):
+        """Lazy creation of result buffer so it can survive context reset"""
         env = environment()
+        self.queue = env.get_queue(dtype)
+        self.result_b = cl.Buffer(self.queue.context, mf.READ_WRITE,
+                                  q_input.global_size[0] * dtype.itemsize)
+        self.q_input = q_input # allocated by GpuInput above
+        self._need_release = [self.result_b, self.q_input]
+        self.real = (np.float32 if dtype == generate.F32
+                     else np.float64 if dtype == generate.F64
+                     else np.float16 if dtype == generate.F16
+                     else np.float32)  # will never get here, so use np.float32
+    def __call__(self, call_details, values, cutoff, magnetic):
+        key = self._cache_key
+        if key not in env.cache:
+            context = env.context[self.dtype]
+            width = ((self.result.size+31)//32)*32 * self.dtype.itemsize
+            buffer = cl.Buffer(context, mf.READ_WRITE, width)
+            env.cache[key] = buffer
+        return env.cache[key]
+    def _call_kernel(self, call_details, values, cutoff, magnetic, effective_radius_type):
         # type: (CallDetails, np.ndarray, np.ndarray, float, bool) -> np.ndarray
+        context = self.queue.context
+        # Arrange data transfer to card
+        env = environment()
+        queue = env.queue[self._model.dtype]
+        context = queue.context
+        # Arrange data transfer to/from card
+        q_b = self.q_input.q_b
+        result_b = self._result_b
         details_b = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR,
                               hostbuf=call_details.buffer)
 …
                              hostbuf=values)
+        kernel = self.kernel[1 if magnetic else 0]
+        args = [
+        name = 'Iq' if self.dim == '1d' else 'Imagnetic' if magnetic else 'Iqxy'
+        kernel = self._model.fetch_kernel(name)
+        kernel_args = [
             np.uint32(self.q_input.nq), None, None,
+            details_b, values_b, self.q_input.q_b, self.result_b,
+            self.real(cutoff),
+            details_b, values_b, q_b, result_b,
+            self._as_dtype(cutoff),
+            np.uint32(effective_radius_type),
+        ]
         #print("Calling OpenCL")
         #call_details.show(values)
         # Call kernel and retrieve results
+        #Call kernel and retrieve results
         wait_for = None
         last_nap = clock()
 …
             stop = min(start + step, call_details.num_eval)
             #print("queuing",start,stop)
             args[1:3] = [np.int32(start), np.int32(stop)]
             wait_for = [kernel(self.queue, self.q_input.global_size, None,
                                *args, wait_for=wait_for)]
+            kernel_args[1:3] = [np.int32(start), np.int32(stop)]
+            wait_for = [kernel(queue, self.q_input.global_size, None,
+                               *kernel_args, wait_for=wait_for)]
             if stop < call_details.num_eval:
                 # Allow other processes to run
 …
                 current_time = clock()
                 if current_time - last_nap > 0.5:
                     time.sleep(0.05)
+                    time.sleep(0.001)
                     last_nap = current_time
         cl.enqueue_copy(self.queue, self.result, self.result_b)
+        cl.enqueue_copy(queue, self.result, result_b, wait_for=wait_for)
         #print("result", self.result)
 …
                 v.release()
-        pd_norm = self.result[self.q_input.nq]
-        scale = values[0]/(pd_norm if pd_norm != 0.0 else 1.0)
-        background = values[1]
-        #print("scale",scale,values[0],self.result[self.q_input.nq],background)
-        return scale*self.result[:self.q_input.nq] + background
-        # return self.result[:self.q_input.nq]
     def release(self):
         # type: () -> None
 …
         Release resources associated with the kernel.
         """
+        for v in self._need_release:
+            v.release()
+        self._need_release = []
+        environment().free_buffer(id(self))
+        self.q_input.release()
     def __del__(self):

Note: See TracChangeset for help on using the changeset viewer.

SasView

Changeset 8064d5e in sasmodels for sasmodels/kernelcl.py

Legend:

sasmodels/kernelcl.py

Download in other formats: