Changeset ce27e21 in sasmodels for sasmodels/gpu.py
- Timestamp:
- Aug 24, 2014 7:18:14 PM (10 years ago)
- Branches:
- master, core_shell_microgels, costrafo411, magnetic_model, release_v0.94, release_v0.95, ticket-1257-vesicle-product, ticket_1156, ticket_1265_superball, ticket_822_more_unit_tests
- Children:
- 1780d59
- Parents:
- 14de349
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
sasmodels/gpu.py
r14de349 rce27e21 45 45 #define real double 46 46 """ 47 48 # The max loops number is limited by the amount of local memory available 49 # on the device. You don't want to make this value too big because it will 50 # waste resources, nor too small because it may interfere with users trying 51 # to do their polydispersity calculations. A value of 1024 should be much 52 # larger than necessary given that cost grows as npts^k where k is the number 53 # of polydisperse parameters. 54 MAX_LOOPS = 1024 47 55 48 56 ENV = None … … 96 104 dtype = np.dtype(dtype) 97 105 if dtype==F64 and not all(has_double(d) for d in context.devices): 98 warnings.warn(RuntimeWarning("Double precision not support; using single precision instead")) 99 dtype = F32 106 raise RuntimeError("Double precision not supported for devices") 100 107 101 108 header = F64_DEFS if dtype == F64 else F32_DEFS … … 104 111 header += "#define USE_SINCOS\n" 105 112 program = cl.Program(context, header+source).build() 106 return program , dtype113 return program 107 114 108 115 … … 125 132 self.boundary = max(d.min_data_type_align_size 126 133 for d in self.context.devices) 134 self.has_double = all(has_double(d) for d in self.context.devices) 135 self.compiled = {} 136 137 def compile_program(self, name, source, dtype): 138 if name not in self.compiled: 139 #print "compiling",name 140 self.compiled[name] = compile_model(self.context, source, dtype) 141 return self.compiled[name] 142 143 def release_program(self, name): 144 if name in self.compiled: 145 self.compiled[name].release() 146 del self.compiled[name] 127 147 128 148 class GpuModel(object): … … 130 150 GPU wrapper for a single model. 131 151 132 *source* and * meta* are the model source and interface as returned152 *source* and *info* are the model source and interface as returned 133 153 from :func:`gen.make`. 134 154 … … 138 158 is an optional extension which may not be available on all devices. 139 159 """ 140 def __init__(self, source, meta, dtype=F32): 141 context = environment().context 142 self.meta = meta 143 self.program, self.dtype = compile_model(context, source, dtype) 144 145 #TODO: need better interface 146 self.PARS = dict((p[0],p[2]) for p in meta['parameters']) 147 self.PD_PARS = [p[0] for p in meta['parameters'] if p[4] != ""] 148 149 def __call__(self, input, cutoff=1e-5): 160 def __init__(self, source, info, dtype=F32): 161 self.info = info 162 self.source = source 163 self.dtype = dtype 164 self.program = None # delay program creation 165 166 def __getstate__(self): 167 state = self.__dict__.copy() 168 state['program'] = None 169 return state 170 171 def __setstate__(self, state): 172 self.__dict__ = state.copy() 173 174 def __call__(self, input): 150 175 if self.dtype != input.dtype: 151 176 raise TypeError("data and kernel have different types") 152 kernel_name = gen.kernel_name(self.meta, input.is_2D) 177 if self.program is None: 178 self.program = environment().compile_program(self.info['name'],self.source, self.dtype) 179 kernel_name = gen.kernel_name(self.info, input.is_2D) 153 180 kernel = getattr(self.program, kernel_name) 154 return GpuKernel(kernel, self.meta, input, cutoff) 181 return GpuKernel(kernel, self.info, input) 182 183 def release(self): 184 if self.program is not None: 185 environment().release_program(self.info['name']) 186 self.program = None 155 187 156 188 def make_input(self, q_vectors): … … 210 242 211 243 class GpuKernel(object): 212 def __init__(self, kernel, meta, input, cutoff): 213 env = environment() 214 215 self.cutoff = cutoff 244 def __init__(self, kernel, info, input): 216 245 self.input = input 217 246 self.kernel = kernel 218 self.meta = meta 247 self.info = info 248 self.res = np.empty(input.nq, input.dtype) 249 dim = '2d' if input.is_2D else '1d' 250 self.fixed_pars = info['partype']['fixed-'+dim] 251 self.pd_pars = info['partype']['pd-'+dim] 219 252 220 253 # Inputs and outputs for each kernel call 254 # Note: res may be shorter than res_b if global_size != nq 255 env = environment() 221 256 self.loops_b = [cl.Buffer(env.context, mf.READ_WRITE, 222 1024*input.dtype.itemsize)257 MAX_LOOPS*input.dtype.itemsize) 223 258 for _ in env.queues] 224 259 self.res_b = [cl.Buffer(env.context, mf.READ_WRITE, … … 226 261 for _ in env.queues] 227 262 228 # Note: may be shorter than res_b if global_size != nq 229 self.res = np.empty(input.nq, input.dtype) 230 231 # Determine the set of fixed and polydisperse parameters 232 self.fixed_pars = [p[0] for p in meta['parameters'] if p[4] == ""] 233 self.pd_pars = [p for p in meta['parameters'] 234 if p[4]=="volume" or (p[4]=="orientation" and input.is_2D)] 235 236 def eval(self, pars): 263 264 def __call__(self, pars, pd_pars, cutoff=1e-5): 265 real = np.float32 if self.input.dtype == F32 else np.float64 266 fixed = [real(p) for p in pars] 267 cutoff = real(cutoff) 268 loops = np.hstack(pd_pars) 269 loops = np.ascontiguousarray(loops.T, self.input.dtype).flatten() 270 loops_N = [np.uint32(len(p[0])) for p in pd_pars] 271 272 #print "opencl eval",pars 273 if len(loops) > MAX_LOOPS: 274 raise ValueError("too many polydispersity points") 237 275 device_num = 0 238 276 res_bi = self.res_b[device_num] 239 277 queuei = environment().queues[device_num] 240 fixed, loops, loop_n = \ 241 gen.kernel_pars(pars, self.meta, self.input.is_2D, dtype=self.input.dtype) 278 loops_bi = self.loops_b[device_num] 242 279 loops_l = cl.LocalMemory(len(loops.data)) 243 real = np.float32 if self.input.dtype == F32 else np.float64244 cutoff = real(self.cutoff)245 246 loops_bi = self.loops_b[device_num]247 280 cl.enqueue_copy(queuei, loops_bi, loops) 248 281 #ctx = environment().context 249 282 #loops_bi = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=loops) 250 pars = self.input.q_buffers + [res_bi,loops_bi,loops_l,cutoff] + fixed + loop_n251 self.kernel(queuei, self.input.global_size, None, * pars)283 args = self.input.q_buffers + [res_bi,loops_bi,loops_l,cutoff] + fixed + loops_N 284 self.kernel(queuei, self.input.global_size, None, *args) 252 285 cl.enqueue_copy(queuei, self.res, res_bi) 253 286
Note: See TracChangeset
for help on using the changeset viewer.