Changeset ce27e21 in sasmodels for sasmodels/gpu.py


Ignore:
Timestamp:
Aug 24, 2014 7:18:14 PM (10 years ago)
Author:
Paul Kienzle <pkienzle@…>
Branches:
master, core_shell_microgels, costrafo411, magnetic_model, release_v0.94, release_v0.95, ticket-1257-vesicle-product, ticket_1156, ticket_1265_superball, ticket_822_more_unit_tests
Children:
1780d59
Parents:
14de349
Message:

first pass for sasview wrapper around opencl models

File:
1 edited

Legend:

Unmodified
Added
Removed
  • sasmodels/gpu.py

    r14de349 rce27e21  
    4545#define real double 
    4646""" 
     47 
     48# The max loops number is limited by the amount of local memory available 
     49# on the device.  You don't want to make this value too big because it will 
     50# waste resources, nor too small because it may interfere with users trying 
     51# to do their polydispersity calculations.  A value of 1024 should be much 
     52# larger than necessary given that cost grows as npts^k where k is the number 
     53# of polydisperse parameters. 
     54MAX_LOOPS = 1024 
    4755 
    4856ENV = None 
     
    96104    dtype = np.dtype(dtype) 
    97105    if dtype==F64 and not all(has_double(d) for d in context.devices): 
    98         warnings.warn(RuntimeWarning("Double precision not support; using single precision instead")) 
    99         dtype = F32 
     106        raise RuntimeError("Double precision not supported for devices") 
    100107 
    101108    header = F64_DEFS if dtype == F64 else F32_DEFS 
     
    104111        header += "#define USE_SINCOS\n" 
    105112    program  = cl.Program(context, header+source).build() 
    106     return program, dtype 
     113    return program 
    107114 
    108115 
     
    125132        self.boundary = max(d.min_data_type_align_size 
    126133                            for d in self.context.devices) 
     134        self.has_double = all(has_double(d) for d in self.context.devices) 
     135        self.compiled = {} 
     136 
     137    def compile_program(self, name, source, dtype): 
     138        if name not in self.compiled: 
     139            #print "compiling",name 
     140            self.compiled[name] = compile_model(self.context, source, dtype) 
     141        return self.compiled[name] 
     142 
     143    def release_program(self, name): 
     144        if name in self.compiled: 
     145            self.compiled[name].release() 
     146            del self.compiled[name] 
    127147 
    128148class GpuModel(object): 
     
    130150    GPU wrapper for a single model. 
    131151 
    132     *source* and *meta* are the model source and interface as returned 
     152    *source* and *info* are the model source and interface as returned 
    133153    from :func:`gen.make`. 
    134154 
     
    138158    is an optional extension which may not be available on all devices. 
    139159    """ 
    140     def __init__(self, source, meta, dtype=F32): 
    141         context = environment().context 
    142         self.meta = meta 
    143         self.program, self.dtype = compile_model(context, source, dtype) 
    144  
    145         #TODO: need better interface 
    146         self.PARS = dict((p[0],p[2]) for p in meta['parameters']) 
    147         self.PD_PARS = [p[0] for p in meta['parameters'] if p[4] != ""] 
    148  
    149     def __call__(self, input, cutoff=1e-5): 
     160    def __init__(self, source, info, dtype=F32): 
     161        self.info = info 
     162        self.source = source 
     163        self.dtype = dtype 
     164        self.program = None # delay program creation 
     165 
     166    def __getstate__(self): 
     167        state = self.__dict__.copy() 
     168        state['program'] = None 
     169        return state 
     170 
     171    def __setstate__(self, state): 
     172        self.__dict__ = state.copy() 
     173 
     174    def __call__(self, input): 
    150175        if self.dtype != input.dtype: 
    151176            raise TypeError("data and kernel have different types") 
    152         kernel_name = gen.kernel_name(self.meta, input.is_2D) 
     177        if self.program is None: 
     178            self.program = environment().compile_program(self.info['name'],self.source, self.dtype) 
     179        kernel_name = gen.kernel_name(self.info, input.is_2D) 
    153180        kernel = getattr(self.program, kernel_name) 
    154         return GpuKernel(kernel, self.meta, input, cutoff) 
     181        return GpuKernel(kernel, self.info, input) 
     182 
     183    def release(self): 
     184        if self.program is not None: 
     185            environment().release_program(self.info['name']) 
     186            self.program = None 
    155187 
    156188    def make_input(self, q_vectors): 
     
    210242 
    211243class GpuKernel(object): 
    212     def __init__(self, kernel, meta, input, cutoff): 
    213         env = environment() 
    214  
    215         self.cutoff = cutoff 
     244    def __init__(self, kernel, info, input): 
    216245        self.input = input 
    217246        self.kernel = kernel 
    218         self.meta = meta 
     247        self.info = info 
     248        self.res = np.empty(input.nq, input.dtype) 
     249        dim = '2d' if input.is_2D else '1d' 
     250        self.fixed_pars = info['partype']['fixed-'+dim] 
     251        self.pd_pars = info['partype']['pd-'+dim] 
    219252 
    220253        # Inputs and outputs for each kernel call 
     254        # Note: res may be shorter than res_b if global_size != nq 
     255        env = environment() 
    221256        self.loops_b = [cl.Buffer(env.context, mf.READ_WRITE, 
    222                                   1024*input.dtype.itemsize) 
     257                                  MAX_LOOPS*input.dtype.itemsize) 
    223258                        for _ in env.queues] 
    224259        self.res_b = [cl.Buffer(env.context, mf.READ_WRITE, 
     
    226261                      for _ in env.queues] 
    227262 
    228         # Note: may be shorter than res_b if global_size != nq 
    229         self.res = np.empty(input.nq, input.dtype) 
    230  
    231         # Determine the set of fixed and polydisperse parameters 
    232         self.fixed_pars = [p[0] for p in meta['parameters'] if p[4] == ""] 
    233         self.pd_pars = [p for p in meta['parameters'] 
    234                if p[4]=="volume" or (p[4]=="orientation" and input.is_2D)] 
    235  
    236     def eval(self, pars): 
     263 
     264    def __call__(self, pars, pd_pars, cutoff=1e-5): 
     265        real = np.float32 if self.input.dtype == F32 else np.float64 
     266        fixed = [real(p) for p in pars] 
     267        cutoff = real(cutoff) 
     268        loops = np.hstack(pd_pars) 
     269        loops = np.ascontiguousarray(loops.T, self.input.dtype).flatten() 
     270        loops_N = [np.uint32(len(p[0])) for p in pd_pars] 
     271 
     272        #print "opencl eval",pars 
     273        if len(loops) > MAX_LOOPS: 
     274            raise ValueError("too many polydispersity points") 
    237275        device_num = 0 
    238276        res_bi = self.res_b[device_num] 
    239277        queuei = environment().queues[device_num] 
    240         fixed, loops, loop_n = \ 
    241             gen.kernel_pars(pars, self.meta, self.input.is_2D, dtype=self.input.dtype) 
     278        loops_bi = self.loops_b[device_num] 
    242279        loops_l = cl.LocalMemory(len(loops.data)) 
    243         real = np.float32 if self.input.dtype == F32 else np.float64 
    244         cutoff = real(self.cutoff) 
    245  
    246         loops_bi = self.loops_b[device_num] 
    247280        cl.enqueue_copy(queuei, loops_bi, loops) 
    248281        #ctx = environment().context 
    249282        #loops_bi = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=loops) 
    250         pars = self.input.q_buffers + [res_bi,loops_bi,loops_l,cutoff] + fixed + loop_n 
    251         self.kernel(queuei, self.input.global_size, None, *pars) 
     283        args = self.input.q_buffers + [res_bi,loops_bi,loops_l,cutoff] + fixed + loops_N 
     284        self.kernel(queuei, self.input.global_size, None, *args) 
    252285        cl.enqueue_copy(queuei, self.res, res_bi) 
    253286 
Note: See TracChangeset for help on using the changeset viewer.