Changeset 5d316e9 in sasmodels for sasmodels/kernelcl.py


Timestamp:
Dec 8, 2015 6:08:51 AM
Author:
Paul Kienzle <pkienzle@…>
Branches:
master, core_shell_microgels, costrafo411, magnetic_model, release_v0.94, release_v0.95, ticket-1257-vesicle-product, ticket_1156, ticket_1265_superball, ticket_822_more_unit_tests
Children:
cf404cb
Parents:
eaca9eb
Message:

support fast and loose single precision and half precision
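
In practice the new options combine like this. A minimal sketch, not taken from the changeset itself: source and info stand for the generated OpenCL source and model metadata produced elsewhere, while GpuModel, environment(), has_type, and generate.F16 all appear in the diff below:

    from sasmodels import generate
    from sasmodels.kernelcl import environment, GpuModel

    source, info = ..., ...  # hypothetical: produced by the generate module

    env = environment()
    # Fall back from half to single precision if cl_khr_fp16 is missing.
    dtype = generate.F16 if env.has_type(generate.F16) else generate.F32
    # fast=True opts into the driver's fast/inaccurate math flags
    # (the docstring below quotes a ~40% speed increase).
    model = GpuModel(source, info, dtype=dtype, fast=True)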

File:
1 edited

  • sasmodels/kernelcl.py

--- sasmodels/kernelcl.py (r9404dd3)
+++ sasmodels/kernelcl.py (r5d316e9)
@@ -62,12 +62,7 @@
 
 from pyopencl import mem_flags as mf
+from pyopencl.characterize import get_fast_inaccurate_build_options
 
 from . import generate
-
-F64_DEFS = """\
-#ifdef cl_khr_fp64
-#  pragma OPENCL EXTENSION cl_khr_fp64: enable
-#endif
-"""
 
 # The max loops number is limited by the amount of local memory available
     
@@ -92,9 +87,16 @@
     return ENV
 
-def has_double(device):
-    """
-    Return true if device supports double precision.
-    """
-    return "cl_khr_fp64" in device.extensions
+def has_type(device, dtype):
+    """
+    Return true if device supports the requested precision.
+    """
+    if dtype == generate.F32:
+        return True
+    elif dtype == generate.F64:
+        return "cl_khr_fp64" in device.extensions
+    elif dtype == generate.F16:
+        return "cl_khr_fp16" in device.extensions
+    else:
+        return False
 
 def get_warp(kernel, queue):
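
The new has_type() dispatches on the OpenCL extension strings that each device advertises. The same check can be reproduced with bare pyopencl; a minimal standalone sketch (device choice is arbitrary, and half precision in particular is often absent):

    import pyopencl as cl

    device = cl.create_some_context().devices[0]
    # Optional precisions are advertised as extensions on each device;
    # these are the same strings has_type() tests above.
    print("fp64:", "cl_khr_fp64" in device.extensions)
    print("fp16:", "cl_khr_fp16" in device.extensions)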
     
@@ -128,5 +130,5 @@
 
 
-def compile_model(context, source, dtype):
+def compile_model(context, source, dtype, fast=False):
     """
     Build a model to run on the gpu.
     
@@ -137,14 +139,14 @@
     """
     dtype = np.dtype(dtype)
-    if dtype == generate.F64 and not all(has_double(d) for d in context.devices):
-        raise RuntimeError("Double precision not supported for devices")
-
-    header = F64_DEFS if dtype == generate.F64 else ""
-    if dtype == generate.F32:
-        source = generate.use_single(source)
+    if not all(has_type(d, dtype) for d in context.devices):
+        raise RuntimeError("%s not supported for devices"%dtype)
+
+    source = generate.convert_type(source, dtype)
     # Note: USE_SINCOS makes the intel cpu slower under opencl
     if context.devices[0].type == cl.device_type.GPU:
-        header += "#define USE_SINCOS\n"
-    program = cl.Program(context, header + source).build()
+        source = "#define USE_SINCOS\n" + source
+    options = (get_fast_inaccurate_build_options(context.devices[0])
+               if fast else [])
+    program = cl.Program(context, source).build(options=options)
     return program
 
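
get_fast_inaccurate_build_options() is a stock pyopencl helper that returns the device-appropriate fast-math compiler flags (typically -cl-mad-enable and -cl-fast-relaxed-math). A standalone sketch of what the fast=True build path does; the one-line kernel here is a trivial placeholder, not a sasmodels kernel:

    import pyopencl as cl
    from pyopencl.characterize import get_fast_inaccurate_build_options

    ctx = cl.create_some_context()
    # Relaxes IEEE semantics (fused multiply-add, reassociation, ...)
    # in exchange for speed; results may differ in the last bits.
    options = get_fast_inaccurate_build_options(ctx.devices[0])
    source = "__kernel void noop(__global float *out) { out[0] = 1.0f; }"
    program = cl.Program(ctx, source).build(options=options)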
     
@@ -178,6 +180,9 @@
         self.queues = [cl.CommandQueue(self.context, d)
                        for d in self.context.devices]
-        self.has_double = all(has_double(d) for d in self.context.devices)
         self.compiled = {}
+
+    def has_type(self, dtype):
+        dtype = np.dtype(dtype)
+        return all(has_type(d, dtype) for d in self.context.devices)
 
     def _create_some_context(self):
     
@@ -189,8 +194,9 @@
             warnings.warn("the environment variable 'PYOPENCL_CTX' might not be set correctly")
 
-    def compile_program(self, name, source, dtype):
+    def compile_program(self, name, source, dtype, fast=False):
         if name not in self.compiled:
             #print("compiling",name)
-            self.compiled[name] = compile_model(self.context, source, dtype)
+            self.compiled[name] = compile_model(self.context, source, dtype,
+                                                fast)
         return self.compiled[name]
 
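
Note that the cache is still keyed by model name alone, so a later request for the same model with a different dtype or fast flag would get the originally compiled program back. A purely illustrative variant (a drop-in sketch that relies on the module's existing np and compile_model, not code from this changeset) keying on everything that affects the binary:

    def compile_program(self, name, source, dtype, fast=False):
        # Key on all inputs that change the generated binary.
        key = (name, np.dtype(dtype), fast)
        if key not in self.compiled:
            self.compiled[key] = compile_model(self.context, source,
                                               dtype, fast)
        return self.compiled[key]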
     
@@ -226,9 +232,12 @@
     for single and 'd', 'float64' or 'double' for double.  Double precision
     is an optional extension which may not be available on all devices.
-    """
-    def __init__(self, source, info, dtype=generate.F32):
+
+    *fast* is True if fast inaccurate math is acceptable (40% speed increase)
+    """
+    def __init__(self, source, info, dtype=generate.F32, fast=False):
         self.info = info
         self.source = source
         self.dtype = np.dtype(dtype)
+        self.fast = fast
         self.program = None # delay program creation
 
     
@@ -243,8 +252,10 @@
     def __call__(self, q_input):
         if self.dtype != q_input.dtype:
-            raise TypeError("data is %s kernel is %s" % (q_input.dtype, self.dtype))
+            raise TypeError("data is %s kernel is %s"
+                            % (q_input.dtype, self.dtype))
         if self.program is None:
             compiler = environment().compile_program
-            self.program = compiler(self.info['name'], self.source, self.dtype)
+            self.program = compiler(self.info['name'], self.source, self.dtype,
+                                    self.fast)
         kernel_name = generate.kernel_name(self.info, q_input.is_2D)
         kernel = getattr(self.program, kernel_name)
     
@@ -347,5 +358,8 @@
 
     def __call__(self, fixed_pars, pd_pars, cutoff=1e-5):
-        real = np.float32 if self.q_input.dtype == generate.F32 else np.float64
+        real = (np.float32 if self.q_input.dtype == generate.F32
+                else np.float64 if self.q_input.dtype == generate.F64
+                else np.float16 if self.q_input.dtype == generate.F16
+                else np.float32)  # will never get here, so use np.float32
 
         device_num = 0
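
The chained conditional above could equally be a table lookup; a minimal self-contained sketch of the same mapping (plain numpy names, with the same float32 fallback):

    import numpy as np

    # Map kernel dtypes to the matching numpy scalar type, defaulting
    # to np.float32 exactly as the chained conditional above does.
    REAL_TYPE = {
        np.dtype('float16'): np.float16,
        np.dtype('float32'): np.float32,
        np.dtype('float64'): np.float64,
    }

    def real_type(dtype):
        return REAL_TYPE.get(np.dtype(dtype), np.float32)

    assert real_type('float16') is np.float16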