Changeset f5b9a6b in sasmodels


Ignore:
Timestamp:
Feb 13, 2015 5:14:19 PM (10 years ago)
Author:
pkienzle
Branches:
master, core_shell_microgels, costrafo411, magnetic_model, release_v0.94, release_v0.95, ticket-1257-vesicle-product, ticket_1156, ticket_1265_superball, ticket_822_more_unit_tests
Children:
cb6ecf4
Parents:
87c722e
Message:

fiddle calculation of q vector length as sent to the gpu

File:
1 edited

Legend:

Unmodified
Added
Removed
  • sasmodels/gpu.py

    r994d77f rf5b9a6b  
    74 74    return "cl_khr_fp64" in device.extensions 
    75 75 
    76  
    77 def _stretch_input(vector, dtype, extra=1e-3, boundary=128): 
     76def get_warp(kernel, queue): 
     77    """ 
     78    Return the size of an execution batch for *kernel* running on *queue*. 
     79    """ 
     80    return kernel.get_work_group_info(cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE, 
     81                                      queue.device) 
     82 
     83def _stretch_input(vector, dtype, extra=1e-3, boundary=32): 
    78 84    """ 
    79 85    Stretch an input vector to the correct boundary. 
     
    81 87    Performance on the kernels can drop by a factor of two or more if the 
    82 88    number of values to compute does not fall on a nice power of two 
    83     boundary.  A good choice for the boundary value is the 
    84     min_data_type_align_size property of the OpenCL device.  The usual 
    85     value of 128 gives a working size as a multiple of 32.  The trailing 
    86     additional vector elements are given a value of *extra*, and so 
    87     f(*extra*) will be computed for each of them.  The returned array 
    88     will thus be a subset of the computed array. 
    89     """ 
    90     boundary // dtype.itemsize 
     89    boundary.   The trailing additional vector elements are given a 
     90    value of *extra*, and so f(*extra*) will be computed for each of 
     91    them.  The returned array will thus be a subset of the computed array. 
     92 
     93    *boundary* should be a power of 2 which is at least 32 for good 
     94    performance on current platforms (as of Jan 2015).  It should 
     95    probably be the max of get_warp(kernel,queue) and 
     96    device.min_data_type_align_size//4. 
     97    """ 
    91 98    remainder = vector.size%boundary 
    92     size = vector.size + (boundary - remainder if remainder != 0 else 0) 
    93     if size != vector.size: 
     99    if remainder != 0: 
     100        size = vector.size + (boundary - remainder) 
    94 101        vector = np.hstack((vector, [extra]*(size-vector.size))) 
    95 102    return np.ascontiguousarray(vector, dtype=dtype) 
     
    134 141        self.queues = [cl.CommandQueue(self.context, d) 
    135 142                       for d in self.context.devices] 
    136         self.boundary = max(d.min_data_type_align_size 
    137                             for d in self.context.devices) 
     143        # Byte boundary for data alignment 
     144        #self.data_boundary = max(d.min_data_type_align_size 
     145        #                         for d in self.context.devices) 
    138 146        self.has_double = all(has_double(d) for d in self.context.devices) 
    139 147        self.compiled = {} 
     
    230 238        self.dtype = np.dtype(dtype) 
    231 239        self.is_2D = (len(q_vectors) == 2) 
    232         self.q_vectors = [ 
    233             _stretch_input(q, self.dtype, boundary=env.boundary) 
    234             for q in q_vectors 
    235         ] 
     240        # TODO: stretch input based on get_warp() 
     241        # not doing it now since warp depends on kernel, which is not known 
     242        # at this point, so instead using 32, which is good on the set of 
     243        # architectures tested so far. 
     244        self.q_vectors = [_stretch_input(q, self.dtype, 32) for q in q_vectors] 
    236 245        self.q_buffers = [ 
    237 246            cl.Buffer(env.context,  mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=q) 
Note: See TracChangeset for help on using the changeset viewer.