Changes in / [1448db7a:25b82a1] in sasmodels
- Location:
- sasmodels
- Files:
-
- 4 edited
Legend:
- Unmodified
- Added
- Removed
-
sasmodels/generate.py
rf2f67a6 rae2b6b5 473 473 dll_code = load_template('kernel_iq.c') 474 474 ocl_code = load_template('kernel_iq.cl') 475 #ocl_code = load_template('kernel_iq_local.cl') 475 476 user_code = [open(f).read() for f in model_sources(model_info)] 476 477 -
sasmodels/kernel_iq.c
rf2f67a6 rae2b6b5 52 52 // Storage for the current parameter values. These will be updated as we 53 53 // walk the polydispersity cube. 54 localParameterBlock local_values; // current parameter values54 ParameterBlock local_values; // current parameter values 55 55 double *pvec = (double *)(&local_values); // Alias named parameters with a vector 56 56 double norm; … … 74 74 norm = CALL_VOLUME(local_values); 75 75 76 const double scale = values[0];77 const double background = values[1];78 // result[nq] = norm; // Total volume normalization76 double scale, background; 77 scale = values[0]; 78 background = values[1]; 79 79 80 80 #ifdef USE_OPENMP 81 81 #pragma omp parallel for 82 82 #endif 83 for (int i=0; i < nq; i++) {84 double scattering = CALL_IQ(q, i, local_values);85 result[ i] = (norm>0. ? scale*scattering/norm + background : background);83 for (int q_index=0; q_index < nq; q_index++) { 84 double scattering = CALL_IQ(q, q_index, local_values); 85 result[q_index] = (norm>0. ? scale*scattering/norm + background : background); 86 86 } 87 87 return; … … 89 89 90 90 #if MAX_PD > 0 91 // If it is the first round initialize the result to zero, otherwise 92 // assume that the previous result has been passed back. 93 // Note: doing this even in the monodisperse case in order to handle the 94 // rare case where the model parameters are invalid and zero is returned. 95 // So slightly increased cost for slightly smaller code size. 91 92 // need product of weights at every Iq calc, so keep product of 93 // weights from the outer loops so that weight = partial_weight * fast_weight 94 double partial_weight; // product of weight w4*w3*w2 but not w1 95 double spherical_correction; // cosine correction for latitude variation 96 double weight; // product of partial_weight*w1*spherical_correction 97 98 // Location in the polydispersity hypercube, one index per dimension. 99 int pd_index[MAX_PD]; 100 101 // Location of the coordinated parameters in their own sub-cubes. 102 int offset[NPARS]; 103 104 // Number of coordinated indices 105 const int num_coord = details->num_coord; 106 107 // Number of elements in the longest polydispersity loop 108 const int fast_length = details->pd_length[0]; 109 110 // Trigger the reset behaviour that happens at the end the fast loop 111 // by setting the initial index >= weight vector length. 112 pd_index[0] = fast_length; 113 114 // Default the spherical correction to 1.0 in case it is not otherwise set 115 spherical_correction = 1.0; 116 117 // Since we are no longer looping over the entire polydispersity hypercube 118 // for each q, we need to track the result and normalization values between 119 // calls. This means initializing them to 0 at the start and accumulating 120 // them between calls. 121 norm = pd_start == 0 ? 0.0 : result[nq]; 96 122 if (pd_start == 0) { 97 123 #ifdef USE_OPENMP 98 124 #pragma omp parallel for 99 125 #endif 100 for (int i=0; i < nq+1; i++) { 101 result[i] = 0.0; 102 } 103 norm = 0.0; 104 } else { 105 norm = result[nq]; 106 } 107 108 // need product of weights at every Iq calc, so keep product of 109 // weights from the outer loops so that weight = partial_weight * fast_weight 110 double partial_weight = NAN; // product of weight w4*w3*w2 but not w1 111 double spherical_correction = 1.0; // cosine correction for latitude variation 112 113 // Location in the polydispersity hypercube, one index per dimension. 114 local int pd_index[MAX_PD]; 115 116 // Location of the coordinated parameters in their own sub-cubes. 117 local int offset[NPARS]; 118 119 // Trigger the reset behaviour that happens at the end the fast loop 120 // by setting the initial index >= weight vector length. 121 const int fast_length = details->pd_length[0]; 122 pd_index[0] = fast_length; 123 124 // Number of coordinated indices 125 const int num_coord = details->num_coord; 126 for (int q_index=0; q_index < nq; q_index++) { 127 result[q_index] = 0.0; 128 } 129 } 126 130 127 131 // Loop over the weights then loop over q, accumulating values … … 172 176 } 173 177 174 // Increment fast index 175 const double wi = weights[details->pd_offset[0] + pd_index[0]++]; 176 double weight = partial_weight*wi; 178 // Update fast parameters 177 179 //printf("fast %d: ", loop_index); 178 180 for (int k=0; k < num_coord; k++) { … … 189 191 //printf("\n"); 190 192 193 // Increment fast index 194 const double wi = weights[details->pd_offset[0] + pd_index[0]]; 195 weight = partial_weight*wi; 196 pd_index[0]++; 197 191 198 #ifdef INVALID 192 199 if (INVALID(local_values)) continue; … … 204 211 #pragma omp parallel for 205 212 #endif 206 for (int i=0; i < nq; i++) { 207 const double scattering = CALL_IQ(q, i, local_values); 208 result[i] += weight*scattering; 209 } 210 } 211 } 212 213 // End of the PD loop we can normalize 213 for (int q_index=0; q_index < nq; q_index++) { 214 const double scattering = CALL_IQ(q, q_index, local_values); 215 result[q_index] += weight*scattering; 216 } 217 } 218 } 219 214 220 if (pd_stop >= details->total_pd) { 215 const double scale = values[0]; 216 const double background = values[1]; 221 // End of the PD loop we can normalize 222 double scale, background; 223 scale = values[0]; 224 background = values[1]; 217 225 #ifdef USE_OPENMP 218 226 #pragma omp parallel for 219 227 #endif 220 for (int i=0; i < nq; i++) {221 result[ i] = (norm>0. ? scale*result[i]/norm + background : background);228 for (int q_index=0; q_index < nq; q_index++) { 229 result[q_index] = (norm>0. ? scale*result[q_index]/norm + background : background); 222 230 } 223 231 } -
sasmodels/kernel_iq.cl
rf2f67a6 rae2b6b5 50 50 ) 51 51 { 52 double norm;53 54 // who we are and what element we are working with55 const int q_index = get_global_id(0);56 57 // number of active loops58 const int num_active = details->num_active;59 60 52 // Storage for the current parameter values. These will be updated as we 61 53 // walk the polydispersity cube. 62 54 ParameterBlock local_values; // current parameter values 63 55 double *pvec = (double *)(&local_values); // Alias named parameters with a vector 56 double norm; 57 58 // who we are and what element we are working with 59 const int q_index = get_global_id(0); 60 61 // number of active loops 62 const int num_active = details->num_active; 63 64 64 // Fill in the initial variables 65 for (int k =0; k < NPARS; k++) {65 for (int k=0; k < NPARS; k++) { 66 66 pvec[k] = values[details->par_offset[k]]; 67 67 } … … 72 72 if (INVALID(local_values)) { return; } 73 73 #endif 74 norm = CALL_VOLUME(local_values); 74 75 75 76 double scale, background; 76 norm = CALL_VOLUME(local_values);77 77 scale = values[0]; 78 78 background = values[1]; 79 80 // if (i==0) result[nq] = norm; // Total volume normalization81 79 82 80 if (q_index < nq) { … … 89 87 #if MAX_PD > 0 90 88 91 // If it is the first round initialize the result to zero, otherwise92 // assume that the previous result has been passed back.93 // Note: doing this even in the monodisperse case in order to handle the94 // rare case where the model parameters are invalid and zero is returned.95 // So slightly increased cost for slightly smaller code size.96 89 double this_result; 97 90 … … 102 95 // weights from the outer loops so that weight = partial_weight * fast_weight 103 96 double partial_weight; // product of weight w4*w3*w2 but not w1 104 double spherical_correction; // cosine correction for latitude variation 97 double spherical_correction; // cosine correction for latitude variation 98 double weight; // product of partial_weight*w1*spherical_correction 105 99 106 100 // Location in the polydispersity hypercube, one index per dimension. … … 110 104 int offset[NPARS]; 111 105 106 // Number of coordinated indices 107 const int num_coord = details->num_coord; 108 112 109 // Number of elements in the longest polydispersity loop 113 110 const int fast_length = details->pd_length[0]; 114 111 115 // Number of coordinated indices116 const int num_coord = details->num_coord;117 118 // We could in theory spread this work across different threads, but119 // lets keep it simple;120 norm = pd_start == 0 ? 0.0 : result[nq];121 spherical_correction = 1.0; // the usual case.122 // partial_weight = NAN;123 112 // Trigger the reset behaviour that happens at the end the fast loop 124 113 // by setting the initial index >= weight vector length. 125 114 pd_index[0] = fast_length; 115 116 // Default the spherical correction to 1.0 in case it is not otherwise set 117 spherical_correction = 1.0; 126 118 127 119 // Since we are no longer looping over the entire polydispersity hypercube … … 129 121 // calls. This means initializing them to 0 at the start and accumulating 130 122 // them between calls. 123 norm = pd_start == 0 ? 0.0 : result[nq]; 131 124 if (q_index < nq) { 132 125 this_result = pd_start == 0 ? 0.0 : result[q_index]; … … 141 134 // Compute position in polydispersity hypercube 142 135 for (int k=0; k < num_active; k++) { 143 144 136 pd_index[k] = (loop_index/details->pd_stride[k])%details->pd_length[k]; 137 //printf("pd_index[%d] = %d\n",k,pd_index[k]); 145 138 } 146 139 … … 163 156 //printf("slow %d: ", loop_index); 164 157 for (int k=0; k < num_coord; k++) { 165 if (k < num_coord) { 166 int par = details->par_coord[k]; 167 int coord = details->pd_coord[k]; 168 int this_offset = details->par_offset[par]; 169 int block_size = 1; 170 for (int bit=0; coord != 0; bit++) { 171 if (coord&1) { 172 this_offset += block_size * pd_index[bit]; 173 block_size *= details->pd_length[bit]; 174 } 175 coord >>= 1; 158 int par = details->par_coord[k]; 159 int coord = details->pd_coord[k]; 160 int this_offset = details->par_offset[par]; 161 int block_size = 1; 162 for (int bit=0; coord != 0; bit++) { 163 if (coord&1) { 164 this_offset += block_size * pd_index[bit]; 165 block_size *= details->pd_length[bit]; 176 166 } 177 offset[par] = this_offset; 178 pvec[par] = values[this_offset]; 179 //printf("par[%d]=v[%d]=%g \n", k, offset[k], pvec[k]); 180 // if theta is not coordinated with fast index, precompute spherical correction 181 if (par == details->theta_par && !(details->par_coord[k]&1)) { 182 spherical_correction = fmax(fabs(cos(M_PI_180*pvec[details->theta_par])), 1.e-6); 183 } 167 coord >>= 1; 184 168 } 169 offset[par] = this_offset; 170 pvec[par] = values[this_offset]; 171 //printf("par[%d]=v[%d]=%g \n", k, offset[k], pvec[k]); 172 // if theta is not coordinated with fast index, precompute spherical correction 173 if (par == details->theta_par && !(details->par_coord[k]&1)) { 174 spherical_correction = fmax(fabs(cos(M_PI_180*pvec[details->theta_par])), 1.e-6); 175 } 185 176 } 186 177 //printf("\n"); 187 178 } 188 179 189 double weight; 180 // Update fast parameters 181 //printf("fast %d: ", loop_index); 182 for (int k=0; k < num_coord; k++) { 183 if (details->pd_coord[k]&1) { 184 const int par = details->par_coord[k]; 185 pvec[par] = values[offset[par]++]; 186 //printf("p[%d]=v[%d]=%g ", par, offset[par]-1, pvec[par]); 187 // if theta is coordinated with fast index, compute spherical correction each time 188 if (par == details->theta_par) { 189 spherical_correction = fmax(fabs(cos(M_PI_180*pvec[details->theta_par])), 1.e-6); 190 } 191 } 192 } 193 //printf("\n"); 194 195 // Increment fast index 190 196 const double wi = weights[details->pd_offset[0] + pd_index[0]]; 191 197 weight = partial_weight*wi; 192 198 pd_index[0]++; 193 194 // Increment fast index195 //printf("fast %d: ", loop_index);196 for (int k=0; k < num_coord; k++) {197 if (k < num_coord) {198 if (details->pd_coord[k]&1) {199 const int par = details->par_coord[k];200 pvec[par] = values[offset[par]++];201 //printf("p[%d]=v[%d]=%g ", par, offset[par]-1, pvec[par]);202 // if theta is coordinated with fast index, compute spherical correction each time203 if (par == details->theta_par) {204 spherical_correction = fmax(fabs(cos(M_PI_180*pvec[details->theta_par])), 1.e-6);205 }206 }207 }208 }209 //printf("\n");210 199 211 200 #ifdef INVALID … … 229 218 if (pd_stop >= details->total_pd) { 230 219 // End of the PD loop we can normalize 231 const double scale = values[0]; 232 const double background = values[1]; 220 double scale, background; 221 scale = values[0]; 222 background = values[1]; 233 223 result[q_index] = (norm>0. ? scale*this_result/norm + background : background); 234 224 } else { … … 236 226 result[q_index] = this_result; 237 227 } 238 // Accumulate norm. 228 229 // Remember the updated norm. 239 230 if (q_index == 0) result[nq] = norm; 240 231 } -
sasmodels/kernelcl.py
rf2f67a6 rae2b6b5 511 511 hostbuf=values) 512 512 513 start, stop = 0, call_details.total_pd 514 args = [ 515 np.uint32(self.q_input.nq), np.int32(start), np.int32(stop), 516 details_b, weights_b, values_b, self.q_input.q_b, self.result_b, 517 self.real(cutoff), 518 ] 519 self.kernel(self.queue, self.q_input.global_size, None, *args) 513 # Call kernel and retrieve results 514 step = 100 515 for start in range(0, call_details.total_pd, step): 516 stop = min(start+step, call_details.total_pd) 517 args = [ 518 np.uint32(self.q_input.nq), np.int32(start), np.int32(stop), 519 details_b, weights_b, values_b, self.q_input.q_b, self.result_b, 520 self.real(cutoff), 521 ] 522 self.kernel(self.queue, self.q_input.global_size, None, *args) 520 523 cl.enqueue_copy(self.queue, self.result, self.result_b) 524 525 # Free buffers 521 526 for v in (details_b, weights_b, values_b): 522 527 if v is not None: v.release()
Note: See TracChangeset
for help on using the changeset viewer.