Changeset f5b9a6b in sasmodels for sasmodels/gpu.py
 Timestamp:
 Feb 13, 2015 3:14:19 PM (10 years ago)
 Branches:
 master, core_shell_microgels, costrafo411, magnetic_model, release_v0.94, release_v0.95, ticket1257vesicleproduct, ticket_1156, ticket_1265_superball, ticket_822_more_unit_tests
 Children:
 cb6ecf4
 Parents:
 87c722e
 File:

 1 edited
Legend:
 Unmodified
 Added
 Removed

sasmodels/gpu.py
r994d77f rf5b9a6b 74 74 return "cl_khr_fp64" in device.extensions 75 75 76 77 def _stretch_input(vector, dtype, extra=1e3, boundary=128): 76 def get_warp(kernel, queue): 77 """ 78 Return the size of an execution batch for *kernel* running on *queue*. 79 """ 80 return kernel.get_work_group_info(cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE, 81 queue.device) 82 83 def _stretch_input(vector, dtype, extra=1e3, boundary=32): 78 84 """ 79 85 Stretch an input vector to the correct boundary. … … 81 87 Performance on the kernels can drop by a factor of two or more if the 82 88 number of values to compute does not fall on a nice power of two 83 boundary. A good choice for the boundary value is the 84 min_data_type_align_size property of the OpenCL device. The usual 85 value of 128 gives a working size as a multiple of 32. The trailing 86 additional vector elements are given a value of *extra*, and so 87 f(*extra*) will be computed for each of them. The returned array 88 will thus be a subset of the computed array. 89 """ 90 boundary // dtype.itemsize 89 boundary. The trailing additional vector elements are given a 90 value of *extra*, and so f(*extra*) will be computed for each of 91 them. The returned array will thus be a subset of the computed array. 92 93 *boundary* should be a power of 2 which is at least 32 for good 94 performance on current platforms (as of Jan 2015). It should 95 probably be the max of get_warp(kernel,queue) and 96 device.min_data_type_align_size//4. 
97 """ 91 98 remainder = vector.size%boundary 92 size = vector.size + (boundary - remainder if remainder != 0 else 0) 93 if size != vector.size: 99 if remainder != 0: 100 size = vector.size + (boundary - remainder) 94 101 vector = np.hstack((vector, [extra]*(size-vector.size))) 95 102 return np.ascontiguousarray(vector, dtype=dtype) … … 134 141 self.queues = [cl.CommandQueue(self.context, d) 135 142 for d in self.context.devices] 136 self.boundary = max(d.min_data_type_align_size 137 for d in self.context.devices) 143 # Byte boundary for data alignment 144 #self.data_boundary = max(d.min_data_type_align_size 145 # for d in self.context.devices) 138 146 self.has_double = all(has_double(d) for d in self.context.devices) 139 147 self.compiled = {} … … 230 238 self.dtype = np.dtype(dtype) 231 239 self.is_2D = (len(q_vectors) == 2) 232 self.q_vectors = [ 233 _stretch_input(q, self.dtype, boundary=env.boundary) 234 for q in q_vectors 235 ] 240 # TODO: stretch input based on get_warp() 241 # not doing it now since warp depends on kernel, which is not known 242 # at this point, so instead using 32, which is good on the set of 243 # architectures tested so far. 244 self.q_vectors = [_stretch_input(q, self.dtype, 32) for q in q_vectors] 236 245 self.q_buffers = [ 237 246 cl.Buffer(env.context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=q)
Note: See TracChangeset
for help on using the changeset viewer.