Changes in / [0880966:839283a] in sasmodels
- Location: sasmodels
- Files: 3 edited
sasmodels/generate.py
r0880966 r0880966

@@ -669,5 +669,5 @@
     c = p + 3*npars

-    details = np.zeros(c + 3, 'int32')
+    details = np.zeros(c + 2, 'int32')
     details[0*max_pd:1*max_pd] = range(max_pd)  # pd_par: arbitrary order; use first
     details[1*max_pd:2*max_pd] = [1]*max_pd     # pd_length: only one element
@@ -707,5 +707,5 @@
     theta_par = -1

-    details = np.empty(constants_offset + 3, 'int32')
+    details = np.empty(constants_offset + 2, 'int32')
     details[0*max_pd:1*max_pd] = idx             # pd_par
     details[1*max_pd:2*max_pd] = pd_length[idx]
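The new length c + 2 appears to line up with the ProblemDetails struct in kernel_iq.c below: five max_pd-long blocks (pd_par, pd_length, pd_offset, pd_stride, pd_isvol), three npars-long blocks (par_offset, par_coord, fast_coord_pars), and two trailing scalars (fast_coord_count, theta_par). A rough sketch of that layout for the monodisperse case, assuming p = 5*max_pd; the helper name and the scalar assignments at the end are inferred from the struct and are not part of this changeset:

    import numpy as np

    def mono_details_sketch(max_pd, npars):
        # Layout mirrors ProblemDetails in kernel_iq.c:
        # five blocks of max_pd ints, three blocks of npars ints, two scalars.
        p = 5*max_pd                   # end of the pd_* blocks (assumption)
        c = p + 3*npars                # end of the per-parameter blocks
        details = np.zeros(c + 2, 'int32')
        details[0*max_pd:1*max_pd] = range(max_pd)  # pd_par: arbitrary order; use first
        details[1*max_pd:2*max_pd] = [1]*max_pd     # pd_length: only one element
        details[2*max_pd:3*max_pd] = 0              # pd_offset: all weights at the start
        details[3*max_pd:4*max_pd] = 1              # pd_stride: one point per level
        # pd_isvol, par_offset, par_coord, fast_coord_pars left as zeros here
        details[c+0] = 0               # fast_coord_count: nothing coordinated (assumption)
        details[c+1] = -1              # theta_par: no spherical correction (assumption)
        return details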
sasmodels/kernel_iq.c
r9f4409a r0a7e5eb4

@@ -20,11 +20,11 @@
     int32_t pd_par[MAX_PD];     // id of the nth polydispersity variable
     int32_t pd_length[MAX_PD];  // length of the nth polydispersity weight vector
-    int32_t pd_offset[MAX_PD];  // offset of pd weights in the par & weight vector
+    int32_t pd_offset[MAX_PD];  // offset of pd weights in the value & weight vector
     int32_t pd_stride[MAX_PD];  // stride to move to the next index at this level
     int32_t pd_isvol[MAX_PD];   // True if parameter is a volume weighting parameter
-    int32_t par_offset[NPARS];  // offset of par values in the par & weight vector
+    int32_t par_offset[NPARS];  // offset of par values in the value & weight vector
     int32_t par_coord[NPARS];   // polydispersity coordination bitvector
     int32_t fast_coord_pars[NPARS];  // ids of the fast coordination parameters
     int32_t fast_coord_count;   // number of parameters coordinated with pd 1
-    int32_t theta_var;          // id of spherical correction variable
+    int32_t theta_par;          // id of spherical correction variable
 } ProblemDetails;
@@ -43,5 +43,5 @@
     global const ProblemDetails *problem,
     global const double *weights,
-    global const double *pars,
+    global const double *values,
     global const double *q,      // nq q values, with padding to boundary
     global double *result,       // nq+3 return values, again with padding
@@ -51,6 +51,6 @@
   // Storage for the current parameter values.  These will be updated as we
   // walk the polydispersity cube.
-  local ParameterBlock local_pars;  // current parameter values
-  double *pvec = (double *)(&local_pars);  // Alias named parameters with a vector
+  local ParameterBlock local_values;  // current parameter values
+  double *pvec = (double *)(&local_values);  // Alias named parameters with a vector

   local int offset[NPARS];  // NPARS excludes scale/background
@@ -62,13 +62,13 @@
   for (int k=0; k < NPARS; k++) {
-    pvec[k] = pars[k+2];  // skip scale and background
+    pvec[k] = values[k+2];  // skip scale and background
   }

-  const double volume = CALL_VOLUME(local_pars);
+  const double volume = CALL_VOLUME(local_values);
   #ifdef USE_OPENMP
   #pragma omp parallel for
   #endif
   for (int i=0; i < nq; i++) {
-    const double scattering = CALL_IQ(q, i, local_pars);
-    result[i] = pars[0]*scattering/volume + pars[1];
+    const double scattering = CALL_IQ(q, i, local_values);
+    result[i] = values[0]*scattering/volume + values[1];
   }
   return;
@@ -147,11 +147,11 @@
     }
     offset[k] = this_offset;
-    pvec[k] = pars[this_offset];
+    pvec[k] = values[this_offset];
   }
   weight = partial_weight * weights[problem->pd_offset[0]+pd_index[0]];
-  if (problem->theta_var >= 0) {
-    spherical_correction = fabs(cos(M_PI_180*pvec[problem->theta_var]));
-  }
-  if (problem->theta_var == problem->pd_par[0]) {
+  if (problem->theta_par >= 0) {
+    spherical_correction = fabs(cos(M_PI_180*pvec[problem->theta_par]));
+  }
+  if (problem->theta_par == problem->pd_par[0]) {
     weight *= spherical_correction;
   }
@@ -166,12 +166,12 @@
   for (int k=0; k < problem->fast_coord_count; k++) {
     pvec[problem->fast_coord_pars[k]]
-        = pars[offset[problem->fast_coord_pars[k]]++];
+        = values[offset[problem->fast_coord_pars[k]]++];
   }
-  if (problem->theta_var == problem->pd_par[0]) {
-    weight *= fabs(cos(M_PI_180*pvec[problem->theta_var]));
+  if (problem->theta_par == problem->pd_par[0]) {
+    weight *= fabs(cos(M_PI_180*pvec[problem->theta_par]));
   }
   }

   #ifdef INVALID
-  if (INVALID(local_pars)) continue;
+  if (INVALID(local_values)) continue;
   #endif
@@ -181,4 +181,4 @@
   if (weight > cutoff) {
     norm += weight;
-    vol += vol_weight * CALL_VOLUME(local_pars);
+    vol += vol_weight * CALL_VOLUME(local_values);
     norm_vol += vol_weight;
@@ -188,5 +188,5 @@
   #endif
   for (int i=0; i < nq; i++) {
-    const double scattering = CALL_IQ(q, i, local_pars);
+    const double scattering = CALL_IQ(q, i, local_values);
     result[i] += weight*scattering;
   }
@@ -207,5 +207,5 @@
       result[i] *= norm_vol/vol;
     }
-    result[i] = pars[0]*result[i]/norm + pars[1];
+    result[i] = values[0]*result[i]/norm + values[1];
   }
 }
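For the monodisperse path the kernel reads the renamed values vector directly: values[0] is the scale, values[1] the background, and the next NPARS entries populate pvec. A minimal sketch of how a caller might pack that flat vector; the helper name and the example numbers are hypothetical, only the ordering comes from the kernel code above:

    import numpy as np

    def pack_mono_values(scale, background, par_values, dtype=np.float64):
        # Matches the kernel's monodisperse path: pvec[k] = values[k+2],
        # result[i] = values[0]*scattering/volume + values[1].
        return np.asarray([scale, background] + list(par_values), dtype=dtype)

    # e.g. a model with three shape parameters (hypothetical numbers)
    values = pack_mono_values(1.0, 0.001, [50.0, 200.0, 3.0e-6])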
sasmodels/kernelcl.py
ra6f9577 rc072f83

@@ -336,13 +336,13 @@
         self.program = None

-    def __call__(self, q_vectors):
+    def make_calculator(self, q_vectors, details):
         if self.program is None:
             compiler = environment().compile_program
-            self.program = compiler(self.info['name'], self.source, self.dtype,
-                                    self.fast)
+            self.program = compiler(self.info['name'], self.source,
+                                    self.dtype, self.fast)
         is_2d = len(q_vectors) == 2
         kernel_name = generate.kernel_name(self.info, is_2d)
         kernel = getattr(self.program, kernel_name)
-        return GpuKernel(kernel, self.info, q_vectors, self.dtype)
+        return GpuKernel(kernel, self.info, q_vectors, details, self.dtype)

     def release(self):
@@ -387,10 +387,21 @@
        # at this point, so instead using 32, which is good on the set of
        # architectures tested so far.
-        self.q_vectors = [_stretch_input(q, self.dtype, 32) for q in q_vectors]
+        if self.is_2d:
+            # Note: 17 rather than 15 because results is 2 elements
+            # longer than input.
+            width = ((self.nq+17)//16)*16
+            self.q = np.empty((width, 2), dtype=dtype)
+            self.q[:self.nq, 0] = q_vectors[0]
+            self.q[:self.nq, 1] = q_vectors[1]
+        else:
+            # Note: 33 rather than 31 because results is 2 elements
+            # longer than input.
+            width = ((self.nq+33)//32)*32
+            self.q = np.empty(width, dtype=dtype)
+            self.q[:self.nq] = q_vectors[0]
+        self.global_size = [self.q.shape[0]]
         context = env.get_context(self.dtype)
-        self.global_size = [self.q_vectors[0].size]
         #print("creating inputs of size", self.global_size)
-        self.q_buffers = [
-            cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=q)
-            for q in self.q_vectors
-        ]
+        # COPY_HOST_PTR initiates transfer as necessary?
+        self.q_b = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR,
+                             hostbuf=self.q)
@@ -400,7 +411,7 @@
         Free the memory.
         """
-        for b in self.q_buffers:
-            b.release()
-        self.q_buffers = []
+        if self.q is not None:
+            self.q.release()
+            self.q = None

     def __del__(self):
@@ -427,12 +438,19 @@
     Call :meth:`release` when done with the kernel instance.
     """
-    def __init__(self, kernel, model_info, q_vectors, dtype):
+    def __init__(self, kernel, model_info, q_vectors, details, dtype):
+        if details.dtype != np.int32:
+            raise TypeError("numeric type does not match the kernel type")
+
+        max_pd = self.info['max_pd']
+        npars = len(model_info['parameters'])-2
         q_input = GpuInput(q_vectors, dtype)
+        self.dtype = dtype
         self.kernel = kernel
         self.info = model_info
-        self.res = np.empty(q_input.nq, q_input.dtype)
-        dim = '2d' if q_input.is_2d else '1d'
-        self.fixed_pars = model_info['partype']['fixed-' + dim]
-        self.pd_pars = model_info['partype']['pd-' + dim]
+        self.details = details
+        self.pd_stop_index = 4*max_pd-1
+        # plus three for the normalization values
+        self.result = np.empty(q_input.nq+3, q_input.dtype)
+        #self.dim = '2d' if q_input.is_2d else '1d'

         # Inputs and outputs for each kernel call
@@ -440,13 +458,24 @@
         env = environment()
         self.queue = env.get_queue(dtype)
-        self.loops_b = cl.Buffer(self.queue.context, mf.READ_WRITE,
-                                 2 * MAX_LOOPS * q_input.dtype.itemsize)
-        self.res_b = cl.Buffer(self.queue.context, mf.READ_WRITE,
+
+        # details is int32 data, padded to a 32 integer boundary
+        size = 4*((self.info['mono'].size+7)//8)*8  # padded to 32 byte boundary
+        self.details_b = cl.Buffer(self.queue.context,
+                                   mf.READ_ONLY | mf.COPY_HOST_PTR,
+                                   hostbuf=details)
+        size = np.sum(details[max_pd:2*max_pd])
+        self.weights_b = cl.Buffer(self.queue.context, mf.READ_ONLY, size)
+        size = np.sum(details[max_pd:2*max_pd])+npars
+        self.values_b = cl.Buffer(self.queue.context, mf.READ_ONLY, size)
+        self.result_b = cl.Buffer(self.queue.context, mf.READ_WRITE,
                                   q_input.global_size[0] * q_input.dtype.itemsize)
-        self.q_input = q_input
-
-        self._need_release = [self.loops_b, self.res_b, self.q_input]
-
-    def __call__(self, details, weights, values, cutoff):
+        self.q_input = q_input  # allocated by GpuInput above
+
+        self._need_release = [
+            self.details_b, self.weights_b, self.values_b, self.result_b,
+            self.q_input,
+        ]
+
+    def __call__(self, weights, values, cutoff):
         real = (np.float32 if self.q_input.dtype == generate.F32
                 else np.float64 if self.q_input.dtype == generate.F64
@@ -454,34 +483,25 @@
                 else np.float32)  # will never get here, so use np.float32

-        #print "pars", fixed_pars, pd_pars
-        res_bi = self.res_b
-        nq = np.uint32(self.q_input.nq)
-        if pd_pars:
-            cutoff = real(cutoff)
-            loops_N = [np.uint32(len(p[0])) for p in pd_pars]
-            loops = np.hstack(pd_pars) \
-                if pd_pars else np.empty(0, dtype=self.q_input.dtype)
-            loops = np.ascontiguousarray(loops.T, self.q_input.dtype).flatten()
-            #print("loops", Nloops, loops)
-
-            #import sys; print("opencl eval", pars)
-            #print("opencl eval", pars)
-            if len(loops) > 2 * MAX_LOOPS:
-                raise ValueError("too many polydispersity points")
-
-            loops_bi = self.loops_b
-            cl.enqueue_copy(self.queue, loops_bi, loops)
-            loops_l = cl.LocalMemory(len(loops.data))
-            #ctx = environment().context
-            #loops_bi = cl.Buffer(ctx, mf.READ_ONLY|mf.COPY_HOST_PTR, hostbuf=loops)
-            dispersed = [loops_bi, loops_l, cutoff] + loops_N
-        else:
-            dispersed = []
-        fixed = [real(p) for p in fixed_pars]
-        args = self.q_input.q_buffers + [res_bi, nq] + dispersed + fixed
+        if weights.dtype != real or values.dtype != real:
+            raise TypeError("numeric type does not match the kernel type")
+
+        cl.enqueue_copy(self.queue, self.weights_b, weights)
+        cl.enqueue_copy(self.queue, self.values_b, values)
+
+        args = [
+            np.uint32(self.q_input.nq),
+            np.uint32(0),
+            np.uint32(self.details[self.pd_stop_index]),
+            self.details_b,
+            self.weights_b,
+            self.values_b,
+            self.q_input.q_b,
+            self.result_b,
+            real(cutoff),
+        ]
         self.kernel(self.queue, self.q_input.global_size, None, *args)
-        cl.enqueue_copy(self.queue, self.res, res_bi)
-
-        return self.res
+        cl.enqueue_copy(self.queue, self.result, self.result_b)
+
+        return self.result[:self.nq]

     def release(self):
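The GpuInput change replaces _stretch_input with explicit padding: the 1-d q vector is rounded up to a multiple of 32 (16 coordinate pairs in 2-d), leaving room for the extra result elements. A standalone sketch of just that arithmetic, outside of OpenCL; the function name is hypothetical and only the rounding expression comes from the changeset:

    import numpy as np

    def pad_q_1d(q_vector, dtype=np.float64):
        # Mirror the 1-d branch: round up to a multiple of 32, leaving room
        # for the extra result elements (hence nq+33 rather than nq+31).
        nq = len(q_vector)
        width = ((nq + 33)//32)*32
        q = np.empty(width, dtype=dtype)
        q[:nq] = q_vector
        q[nq:] = 0.0          # padding values are never read back
        return q

    padded = pad_q_1d(np.linspace(0.001, 0.5, 200))
    assert padded.shape[0] % 32 == 0 and padded.shape[0] >= 200 + 2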