Changeset 5d316e9 in sasmodels for sasmodels/kernelcl.py


Timestamp:
Dec 8, 2015 6:08:51 AM
Author:
Paul Kienzle <pkienzle@…>
Branches:
master, core_shell_microgels, costrafo411, magnetic_model, release_v0.94, release_v0.95, ticket-1257-vesicle-product, ticket_1156, ticket_1265_superball, ticket_822_more_unit_tests
Children:
cf404cb
Parents:
eaca9eb
Message:

support fast and loose single precision and half precision
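
In practice the new options combine like this. A minimal sketch, not taken from the changeset itself: source and info stand for the generated OpenCL source and model metadata produced elsewhere, while GpuModel, environment(), has_type, and generate.F16 all appear in the diff below:

    from sasmodels import generate
    from sasmodels.kernelcl import environment, GpuModel

    source, info = ..., ...  # hypothetical: produced by the generate module

    env = environment()
    # Fall back from half to single precision if cl_khr_fp16 is missing.
    dtype = generate.F16 if env.has_type(generate.F16) else generate.F32
    # fast=True opts into the driver's fast/inaccurate math flags
    # (the docstring below quotes a ~40% speed increase).
    model = GpuModel(source, info, dtype=dtype, fast=True)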

File:
1 edited

  • sasmodels/kernelcl.py

--- sasmodels/kernelcl.py (r9404dd3)
+++ sasmodels/kernelcl.py (r5d316e9)
@@ -62,12 +62,7 @@
 
 from pyopencl import mem_flags as mf
+from pyopencl.characterize import get_fast_inaccurate_build_options
 
 from . import generate
-
-F64_DEFS = """\
-#ifdef cl_khr_fp64
-#  pragma OPENCL EXTENSION cl_khr_fp64: enable
-#endif
-"""
 
 # The max loops number is limited by the amount of local memory available
     
@@ -92,9 +87,16 @@
     return ENV
 
-def has_double(device):
-    """
-    Return true if device supports double precision.
-    """
-    return "cl_khr_fp64" in device.extensions
+def has_type(device, dtype):
+    """
+    Return true if device supports the requested precision.
+    """
+    if dtype == generate.F32:
+        return True
+    elif dtype == generate.F64:
+        return "cl_khr_fp64" in device.extensions
+    elif dtype == generate.F16:
+        return "cl_khr_fp16" in device.extensions
+    else:
+        return False
 
 def get_warp(kernel, queue):
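
The new has_type() dispatches on the OpenCL extension strings that each device advertises. The same check can be reproduced with bare pyopencl; a minimal standalone sketch (device choice is arbitrary, and half precision in particular is often absent):

    import pyopencl as cl

    device = cl.create_some_context().devices[0]
    # Optional precisions are advertised as extensions on each device;
    # these are the same strings has_type() tests above.
    print("fp64:", "cl_khr_fp64" in device.extensions)
    print("fp16:", "cl_khr_fp16" in device.extensions)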
     
@@ -128,5 +130,5 @@
 
 
-def compile_model(context, source, dtype):
+def compile_model(context, source, dtype, fast=False):
     """
     Build a model to run on the gpu.
     
@@ -137,14 +139,14 @@
     """
     dtype = np.dtype(dtype)
-    if dtype == generate.F64 and not all(has_double(d) for d in context.devices):
-        raise RuntimeError("Double precision not supported for devices")
-
-    header = F64_DEFS if dtype == generate.F64 else ""
-    if dtype == generate.F32:
-        source = generate.use_single(source)
+    if not all(has_type(d, dtype) for d in context.devices):
+        raise RuntimeError("%s not supported for devices"%dtype)
+
+    source = generate.convert_type(source, dtype)
     # Note: USE_SINCOS makes the intel cpu slower under opencl
     if context.devices[0].type == cl.device_type.GPU:
-        header += "#define USE_SINCOS\n"
-    program = cl.Program(context, header + source).build()
+        source = "#define USE_SINCOS\n" + source
+    options = (get_fast_inaccurate_build_options(context.devices[0])
+               if fast else [])
+    program = cl.Program(context, source).build(options=options)
     return program
 
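
get_fast_inaccurate_build_options() is a stock pyopencl helper that returns the device-appropriate fast-math compiler flags (typically -cl-mad-enable and -cl-fast-relaxed-math). A standalone sketch of what the fast=True build path does; the one-line kernel here is a trivial placeholder, not a sasmodels kernel:

    import pyopencl as cl
    from pyopencl.characterize import get_fast_inaccurate_build_options

    ctx = cl.create_some_context()
    # Relaxes IEEE semantics (fused multiply-add, reassociation, ...)
    # in exchange for speed; results may differ in the last bits.
    options = get_fast_inaccurate_build_options(ctx.devices[0])
    source = "__kernel void noop(__global float *out) { out[0] = 1.0f; }"
    program = cl.Program(ctx, source).build(options=options)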
     
@@ -178,6 +180,9 @@
         self.queues = [cl.CommandQueue(self.context, d)
                        for d in self.context.devices]
-        self.has_double = all(has_double(d) for d in self.context.devices)
         self.compiled = {}
+
+    def has_type(self, dtype):
+        dtype = np.dtype(dtype)
+        return all(has_type(d, dtype) for d in self.context.devices)
 
     def _create_some_context(self):
     
@@ -189,8 +194,9 @@
             warnings.warn("the environment variable 'PYOPENCL_CTX' might not be set correctly")
 
-    def compile_program(self, name, source, dtype):
+    def compile_program(self, name, source, dtype, fast=False):
         if name not in self.compiled:
             #print("compiling",name)
-            self.compiled[name] = compile_model(self.context, source, dtype)
+            self.compiled[name] = compile_model(self.context, source, dtype,
+                                                fast)
         return self.compiled[name]
 
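
Note that the cache is still keyed by model name alone, so a later request for the same model with a different dtype or fast flag would get the originally compiled program back. A purely illustrative variant (a drop-in sketch that relies on the module's existing np and compile_model, not code from this changeset) keying on everything that affects the binary:

    def compile_program(self, name, source, dtype, fast=False):
        # Key on all inputs that change the generated binary.
        key = (name, np.dtype(dtype), fast)
        if key not in self.compiled:
            self.compiled[key] = compile_model(self.context, source,
                                               dtype, fast)
        return self.compiled[key]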
     
@@ -226,9 +232,12 @@
     for single and 'd', 'float64' or 'double' for double.  Double precision
     is an optional extension which may not be available on all devices.
-    """
-    def __init__(self, source, info, dtype=generate.F32):
+
+    *fast* is True if fast inaccurate math is acceptable (40% speed increase)
+    """
+    def __init__(self, source, info, dtype=generate.F32, fast=False):
         self.info = info
         self.source = source
         self.dtype = np.dtype(dtype)
+        self.fast = fast
         self.program = None # delay program creation
 
     
@@ -243,8 +252,10 @@
     def __call__(self, q_input):
         if self.dtype != q_input.dtype:
-            raise TypeError("data is %s kernel is %s" % (q_input.dtype, self.dtype))
+            raise TypeError("data is %s kernel is %s"
+                            % (q_input.dtype, self.dtype))
         if self.program is None:
             compiler = environment().compile_program
-            self.program = compiler(self.info['name'], self.source, self.dtype)
+            self.program = compiler(self.info['name'], self.source, self.dtype,
+                                    self.fast)
         kernel_name = generate.kernel_name(self.info, q_input.is_2D)
         kernel = getattr(self.program, kernel_name)
     
@@ -347,5 +358,8 @@
 
     def __call__(self, fixed_pars, pd_pars, cutoff=1e-5):
-        real = np.float32 if self.q_input.dtype == generate.F32 else np.float64
+        real = (np.float32 if self.q_input.dtype == generate.F32
+                else np.float64 if self.q_input.dtype == generate.F64
+                else np.float16 if self.q_input.dtype == generate.F16
+                else np.float32)  # will never get here, so use np.float32
 
         device_num = 0
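
The chained conditional above could equally be a table lookup; a minimal self-contained sketch of the same mapping (plain numpy names, with the same float32 fallback):

    import numpy as np

    # Map kernel dtypes to the matching numpy scalar type, defaulting
    # to np.float32 exactly as the chained conditional above does.
    REAL_TYPE = {
        np.dtype('float16'): np.float16,
        np.dtype('float32'): np.float32,
        np.dtype('float64'): np.float64,
    }

    def real_type(dtype):
        return REAL_TYPE.get(np.dtype(dtype), np.float32)

    assert real_type('float16') is np.float16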