Changeset a738209 in sasmodels for sasmodels/kernel_iq.cl
- Timestamp:
- Jul 15, 2016 7:33:33 AM (8 years ago)
- Branches:
- master, core_shell_microgels, costrafo411, magnetic_model, release_v0.94, release_v0.95, ticket-1257-vesicle-product, ticket_1156, ticket_1265_superball, ticket_822_more_unit_tests
- Children:
- def2c1b
- Parents:
- 98ba1fc
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
sasmodels/kernel_iq.cl
rae2b6b5 ra738209 22 22 int32_t pd_stride[MAX_PD]; // stride to move to the next index at this level 23 23 #endif // MAX_PD > 0 24 int32_t par_offset[NPARS]; // offset of par value blocks in the value & weight vector 25 int32_t par_coord[NPARS]; // ids of the coordination parameters 26 int32_t pd_coord[NPARS]; // polydispersity coordination bitvector 24 int32_t pd_prod; // total number of voxels in hypercube 25 int32_t pd_sum; // total length of the weights vector 27 26 int32_t num_active; // number of non-trivial pd loops 28 int32_t total_pd; // total number of voxels in hypercube29 int32_t num_coord; // number of coordinated parameters30 27 int32_t theta_par; // id of spherical correction variable 31 28 } ProblemDetails; … … 43 40 const int32_t pd_stop, // where we are stopping in the polydispersity loop 44 41 global const ProblemDetails *details, 45 global const double *weights,46 42 global const double *values, 47 43 global const double *q, // nq q values, with padding to boundary 48 global double *result, // nq+ 3return values, again with padding44 global double *result, // nq+1 return values, again with padding 49 45 const double cutoff // cutoff in the polydispersity weight product 50 46 ) 51 47 { 52 48 // Storage for the current parameter values. These will be updated as we 53 // walk the polydispersity cube. 54 ParameterBlock local_values; // current parameter values 55 double *pvec = (double *)(&local_values); // Alias named parameters with a vector 56 double norm; 49 // walk the polydispersity cube. local_values will be aliased to pvec. 50 local ParameterBlock local_values; 57 51 58 52 // who we are and what element we are working with 59 53 const int q_index = get_global_id(0); 60 61 // number of active loops 62 const int num_active = details->num_active; 54 const int thread = get_local_id(0); 63 55 64 56 // Fill in the initial variables 65 for (int k=0; k < NPARS; k++) { 66 pvec[k] = values[details->par_offset[k]]; 67 } 57 event_t e = async_work_group_copy((local double *)&local_values, values+2, NPARS, 0); 58 wait_group_events(1, &e); 68 59 69 60 // Monodisperse computation 70 if (num_active == 0) { 61 if (details->num_active == 0) { 62 double norm, scale, background; 63 // TODO: only needs to be done by one process... 71 64 #ifdef INVALID 72 65 if (INVALID(local_values)) { return; } 73 66 #endif 67 74 68 norm = CALL_VOLUME(local_values); 75 76 double scale, background;77 69 scale = values[0]; 78 70 background = values[1]; … … 92 84 // norm will be shared across all threads. 93 85 86 // "values" is global and can't be assigned to a local, so even though only 87 // the alias is only needed for thread 0 it is allocated in all threads. 88 global const double *pd_value = values+2+NPARS; 89 global const double *pd_weight = pd_value+details->pd_sum; 90 94 91 // need product of weights at every Iq calc, so keep product of 95 92 // weights from the outer loops so that weight = partial_weight * fast_weight 96 double partial_weight; // product of weight w4*w3*w2 but not w1 97 double spherical_correction; // cosine correction for latitude variation 98 double weight; // product of partial_weight*w1*spherical_correction 99 100 // Location in the polydispersity hypercube, one index per dimension. 101 int pd_index[MAX_PD]; 102 103 // Location of the coordinated parameters in their own sub-cubes. 104 int offset[NPARS]; 105 106 // Number of coordinated indices 107 const int num_coord = details->num_coord; 93 local double pd_norm; 94 local double partial_weight; // product of weight w4*w3*w2 but not w1 95 local double spherical_correction; // cosine correction for latitude variation 96 local double weight; // product of partial_weight*w1*spherical_correction 97 local double *pvec; 98 local int p0_par; 99 local int p0_length; 100 local int p0_offset; 101 local int p0_is_theta; 102 local int p0_index; 108 103 109 104 // Number of elements in the longest polydispersity loop 110 const int fast_length = details->pd_length[0]; 111 112 // Trigger the reset behaviour that happens at the end the fast loop 113 // by setting the initial index >= weight vector length. 114 pd_index[0] = fast_length; 115 116 // Default the spherical correction to 1.0 in case it is not otherwise set 117 spherical_correction = 1.0; 118 119 // Since we are no longer looping over the entire polydispersity hypercube 120 // for each q, we need to track the result and normalization values between 121 // calls. This means initializing them to 0 at the start and accumulating 122 // them between calls. 123 norm = pd_start == 0 ? 0.0 : result[nq]; 105 barrier(CLK_LOCAL_MEM_FENCE); 106 if (thread == 0) { 107 pvec = (local double *)(&local_values); 108 109 // Number of elements in the longest polydispersity loop 110 p0_par = details->pd_par[0]; 111 p0_length = details->pd_length[0]; 112 p0_offset = details->pd_offset[0]; 113 p0_is_theta = (p0_par == details->theta_par); 114 115 // Trigger the reset behaviour that happens at the end the fast loop 116 // by setting the initial index >= weight vector length. 117 p0_index = p0_length; 118 119 // Default the spherical correction to 1.0 in case it is not otherwise set 120 spherical_correction = 1.0; 121 122 // Since we are no longer looping over the entire polydispersity hypercube 123 // for each q, we need to track the result and normalization values between 124 // calls. This means initializing them to 0 at the start and accumulating 125 // them between calls. 126 pd_norm = pd_start == 0 ? 0.0 : result[nq]; 127 } 128 barrier(CLK_LOCAL_MEM_FENCE); 129 124 130 if (q_index < nq) { 125 131 this_result = pd_start == 0 ? 0.0 : result[q_index]; … … 128 134 // Loop over the weights then loop over q, accumulating values 129 135 for (int loop_index=pd_start; loop_index < pd_stop; loop_index++) { 130 // check if fast loop needs to be reset 131 if (pd_index[0] == fast_length) { 132 //printf("should be here with %d active\n", num_active); 133 134 // Compute position in polydispersity hypercube 135 for (int k=0; k < num_active; k++) { 136 pd_index[k] = (loop_index/details->pd_stride[k])%details->pd_length[k]; 137 //printf("pd_index[%d] = %d\n",k,pd_index[k]); 136 barrier(CLK_LOCAL_MEM_FENCE); 137 if (thread == 0) { 138 // check if fast loop needs to be reset 139 if (p0_index == p0_length) { 140 //printf("should be here with %d active\n", num_active); 141 142 // Compute position in polydispersity hypercube and partial weight 143 partial_weight = 1.0; 144 for (int k=1; k < details->num_active; k++) { 145 int pk = details->pd_par[k]; 146 int index = details->pd_offset[k] + (loop_index/details->pd_stride[k])%details->pd_length[k]; 147 pvec[pk] = pd_value[index]; 148 partial_weight *= pd_weight[index]; 149 //printf("index[%d] = %d\n",k,index); 150 if (pk == details->theta_par) { 151 spherical_correction = fmax(fabs(cos(M_PI_180*pvec[pk])), 1.e-6); 152 } 153 } 154 p0_index = loop_index%p0_length; 155 //printf("\n"); 138 156 } 139 157 140 // need to compute the product of the weights. If the vector were really 141 // long, we could split the work into groups, with each thread taking 142 // every nth weight, but there really is no call for it here. We could 143 // also do some clever pair-wise multiplication similar to parallel 144 // prefix, but again simpler is probably faster since n is likely small. 145 // Compute partial weights 146 partial_weight = 1.0; 147 //printf("partial weight %d: ", loop_index); 148 for (int k=1; k < num_active; k++) { 149 double wi = weights[details->pd_offset[k] + pd_index[k]]; 150 //printf("pd[%d]=par[%d]=%g ", k, details->pd_par[k], wi); 151 partial_weight *= wi; 158 // Update parameter p0 159 weight = partial_weight*pd_weight[p0_offset + p0_index]; 160 pvec[p0_par] = pd_value[p0_offset + p0_index]; 161 if (p0_is_theta) { 162 spherical_correction = fmax(fabs(cos(M_PI_180*pvec[p0_par])), 1.e-6); 152 163 } 153 //printf("\n"); 154 155 // Update parameter offsets in weight vector 156 //printf("slow %d: ", loop_index); 157 for (int k=0; k < num_coord; k++) { 158 int par = details->par_coord[k]; 159 int coord = details->pd_coord[k]; 160 int this_offset = details->par_offset[par]; 161 int block_size = 1; 162 for (int bit=0; coord != 0; bit++) { 163 if (coord&1) { 164 this_offset += block_size * pd_index[bit]; 165 block_size *= details->pd_length[bit]; 166 } 167 coord >>= 1; 168 } 169 offset[par] = this_offset; 170 pvec[par] = values[this_offset]; 171 //printf("par[%d]=v[%d]=%g \n", k, offset[k], pvec[k]); 172 // if theta is not coordinated with fast index, precompute spherical correction 173 if (par == details->theta_par && !(details->par_coord[k]&1)) { 174 spherical_correction = fmax(fabs(cos(M_PI_180*pvec[details->theta_par])), 1.e-6); 175 } 176 } 177 //printf("\n"); 178 } 179 180 // Update fast parameters 181 //printf("fast %d: ", loop_index); 182 for (int k=0; k < num_coord; k++) { 183 if (details->pd_coord[k]&1) { 184 const int par = details->par_coord[k]; 185 pvec[par] = values[offset[par]++]; 186 //printf("p[%d]=v[%d]=%g ", par, offset[par]-1, pvec[par]); 187 // if theta is coordinated with fast index, compute spherical correction each time 188 if (par == details->theta_par) { 189 spherical_correction = fmax(fabs(cos(M_PI_180*pvec[details->theta_par])), 1.e-6); 190 } 191 } 192 } 164 p0_index++; 165 } 166 barrier(CLK_LOCAL_MEM_FENCE); 193 167 //printf("\n"); 194 168 195 169 // Increment fast index 196 const double wi = weights[details->pd_offset[0] + pd_index[0]];197 weight = partial_weight*wi;198 pd_index[0]++;199 170 200 171 #ifdef INVALID … … 208 179 // where it becomes zero. If the entirety of the correction 209 180 weight *= spherical_correction; 210 norm += weight * CALL_VOLUME(local_values);181 pd_norm += weight * CALL_VOLUME(local_values); 211 182 212 183 const double scattering = CALL_IQ(q, q_index, local_values); … … 216 187 217 188 if (q_index < nq) { 218 if (pd_stop >= details-> total_pd) {189 if (pd_stop >= details->pd_prod) { 219 190 // End of the PD loop we can normalize 220 191 double scale, background; 221 192 scale = values[0]; 222 193 background = values[1]; 223 result[q_index] = ( norm>0. ? scale*this_result/norm + background : background);194 result[q_index] = (pd_norm>0. ? scale*this_result/pd_norm + background : background); 224 195 } else { 225 196 // Partial result, so remember it but don't normalize it. … … 228 199 229 200 // Remember the updated norm. 230 if (q_index == 0) result[nq] = norm;201 if (q_index == 0) result[nq] = pd_norm; 231 202 } 232 203
Note: See TracChangeset
for help on using the changeset viewer.