← Previous Changeset
Next Changeset →

Changeset ae2b6b5 in sasmodels

Timestamp:

Apr 18, 2016 12:23:35 AM (9 years ago)

Author:

Paul Kienzle <pkienzle@…>

Branches:

master, core_shell_microgels, costrafo411, magnetic_model, release_v0.94, release_v0.95, ticket-1257-vesicle-product, ticket_1156, ticket_1265_superball, ticket_822_more_unit_tests

Children:

Parents:

Message:

increase code correspondence between iq.c and iq.cl

Location:

Files:

: 4 edited

generate.py (modified) (1 diff)
kernel_iq.c (modified) (6 diffs)
kernel_iq.cl (modified) (10 diffs)
kernelcl.py (modified) (1 diff)

Legend:

: Unmodified
: Added
: Removed

sasmodels/generate.py

rf2f67a6	rae2b6b5
473	473	dll_code = load_template('kernel_iq.c')
474	474	ocl_code = load_template('kernel_iq.cl')
	475	#ocl_code = load_template('kernel_iq_local.cl')
475	476	user_code = [open(f).read() for f in model_sources(model_info)]
476	477

sasmodels/kernel_iq.c

-                      rf2f67a6
+                      rae2b6b5
   // Storage for the current parameter values.  These will be updated as we
   // walk the polydispersity cube.
   local ParameterBlock local_values;  // current parameter values
+  ParameterBlock local_values;  // current parameter values
   double *pvec = (double *)(&local_values);  // Alias named parameters with a vector
   double norm;
 …
     norm = CALL_VOLUME(local_values);
     const double scale = values[0];
     const double background = values[1];
     // result[nq] = norm; // Total volume normalization
+    double scale, background;
+    scale = values[0];
+    background = values[1];
     #ifdef USE_OPENMP
     #pragma omp parallel for
     #endif
     for (int i=0; i < nq; i++) {
       double scattering = CALL_IQ(q, i, local_values);
       result[i] = (norm>0. ? scale*scattering/norm + background : background);
+    for (int q_index=0; q_index < nq; q_index++) {
+      double scattering = CALL_IQ(q, q_index, local_values);
+      result[q_index] = (norm>0. ? scale*scattering/norm + background : background);
+    }
     return;
 …
 #if MAX_PD > 0
+  // If it is the first round initialize the result to zero, otherwise
+  // assume that the previous result has been passed back.
+  // Note: doing this even in the monodisperse case in order to handle the
+  // rare case where the model parameters are invalid and zero is returned.
+  // So slightly increased cost for slightly smaller code size.
+  // need product of weights at every Iq calc, so keep product of
+  // weights from the outer loops so that weight = partial_weight * fast_weight
+  double partial_weight; // product of weight w4*w3*w2 but not w1
+  double spherical_correction; // cosine correction for latitude variation
+  double weight; // product of partial_weight*w1*spherical_correction
+  // Location in the polydispersity hypercube, one index per dimension.
+  int pd_index[MAX_PD];
+  // Location of the coordinated parameters in their own sub-cubes.
+  int offset[NPARS];
+  // Number of coordinated indices
+  const int num_coord = details->num_coord;
+  // Number of elements in the longest polydispersity loop
+  const int fast_length = details->pd_length[0];
+  // Trigger the reset behaviour that happens at the end of the fast loop
+  // by setting the initial index >= weight vector length.
+  pd_index[0] = fast_length;
+  // Default the spherical correction to 1.0 in case it is not otherwise set
+  spherical_correction = 1.0;
+  // Since we are no longer looping over the entire polydispersity hypercube
+  // for each q, we need to track the result and normalization values between
+  // calls.  This means initializing them to 0 at the start and accumulating
+  // them between calls.
+  norm = pd_start == 0 ? 0.0 : result[nq];
   if (pd_start == 0) {
     #ifdef USE_OPENMP
     #pragma omp parallel for
     #endif
+    for (int i=0; i < nq+1; i++) {
+      result[i] = 0.0;
+    }
+    norm = 0.0;
+  } else {
+    norm = result[nq];
+  }
+  // need product of weights at every Iq calc, so keep product of
+  // weights from the outer loops so that weight = partial_weight * fast_weight
+  double partial_weight = NAN; // product of weight w4*w3*w2 but not w1
+  double spherical_correction = 1.0;  // cosine correction for latitude variation
+  // Location in the polydispersity hypercube, one index per dimension.
+  local int pd_index[MAX_PD];
+  // Location of the coordinated parameters in their own sub-cubes.
+  local int offset[NPARS];
+  // Trigger the reset behaviour that happens at the end of the fast loop
+  // by setting the initial index >= weight vector length.
+  const int fast_length = details->pd_length[0];
+  pd_index[0] = fast_length;
+  // Number of coordinated indices
+  const int num_coord = details->num_coord;
+    for (int q_index=0; q_index < nq; q_index++) {
+      result[q_index] = 0.0;
+    }
+  }
   // Loop over the weights then loop over q, accumulating values
 …
+    }
+    // Increment fast index
+    const double wi = weights[details->pd_offset[0] + pd_index[0]++];
+    double weight = partial_weight*wi;
+    // Update fast parameters
     //printf("fast %d: ", loop_index);
     for (int k=0; k < num_coord; k++) {
 …
     //printf("\n");
+    // Increment fast index
+    const double wi = weights[details->pd_offset[0] + pd_index[0]];
+    weight = partial_weight*wi;
+    pd_index[0]++;
     #ifdef INVALID
     if (INVALID(local_values)) continue;
 …
       #pragma omp parallel for
       #endif
+      for (int i=0; i < nq; i++) {
+        const double scattering = CALL_IQ(q, i, local_values);
+        result[i] += weight*scattering;
+      }
+    }
+  }
+  // End of the PD loop we can normalize
+      for (int q_index=0; q_index < nq; q_index++) {
+        const double scattering = CALL_IQ(q, q_index, local_values);
+        result[q_index] += weight*scattering;
+      }
+    }
+  }
   if (pd_stop >= details->total_pd) {
+    const double scale = values[0];
+    const double background = values[1];
+    // End of the PD loop we can normalize
+    double scale, background;
+    scale = values[0];
+    background = values[1];
     #ifdef USE_OPENMP
     #pragma omp parallel for
     #endif
     for (int i=0; i < nq; i++) {
       result[i] = (norm>0. ? scale*result[i]/norm + background : background);
+    for (int q_index=0; q_index < nq; q_index++) {
+      result[q_index] = (norm>0. ? scale*result[q_index]/norm + background : background);
+    }
+  }

sasmodels/kernel_iq.cl

-                      rf2f67a6
+                      rae2b6b5
+    )
+{
-  double norm;
-  // who we are and what element we are working with
-  const int q_index = get_global_id(0);
-  // number of active loops
-  const int num_active = details->num_active;
   // Storage for the current parameter values.  These will be updated as we
   // walk the polydispersity cube.
   ParameterBlock local_values;  // current parameter values
   double *pvec = (double *)(&local_values);  // Alias named parameters with a vector
+  double norm;
+  // who we are and what element we are working with
+  const int q_index = get_global_id(0);
+  // number of active loops
+  const int num_active = details->num_active;
   // Fill in the initial variables
   for (int k = 0; k < NPARS; k++) {
+  for (int k=0; k < NPARS; k++) {
     pvec[k] = values[details->par_offset[k]];
+  }
 …
     if (INVALID(local_values)) { return; }
     #endif
+    norm = CALL_VOLUME(local_values);
     double scale, background;
-    norm = CALL_VOLUME(local_values);
     scale = values[0];
     background = values[1];
-    // if (i==0) result[nq] = norm; // Total volume normalization
     if (q_index < nq) {
 …
 #if MAX_PD > 0
-  // If it is the first round initialize the result to zero, otherwise
-  // assume that the previous result has been passed back.
-  // Note: doing this even in the monodisperse case in order to handle the
-  // rare case where the model parameters are invalid and zero is returned.
-  // So slightly increased cost for slightly smaller code size.
   double this_result;
 …
   // weights from the outer loops so that weight = partial_weight * fast_weight
   double partial_weight; // product of weight w4*w3*w2 but not w1
+  double spherical_correction;  // cosine correction for latitude variation
+  double spherical_correction; // cosine correction for latitude variation
+  double weight; // product of partial_weight*w1*spherical_correction
   // Location in the polydispersity hypercube, one index per dimension.
 …
   int offset[NPARS];
+  // Number of coordinated indices
+  const int num_coord = details->num_coord;
   // Number of elements in the longest polydispersity loop
   const int fast_length = details->pd_length[0];
-  // Number of coordinated indices
-  const int num_coord = details->num_coord;
-  // We could in theory spread this work across different threads, but
-  // let's keep it simple;
-  norm = pd_start == 0 ? 0.0 : result[nq];
-  spherical_correction = 1.0;  // the usual case.
-  // partial_weight = NAN;
   // Trigger the reset behaviour that happens at the end of the fast loop
   // by setting the initial index >= weight vector length.
   pd_index[0] = fast_length;
+  // Default the spherical correction to 1.0 in case it is not otherwise set
+  spherical_correction = 1.0;
   // Since we are no longer looping over the entire polydispersity hypercube
 …
   // calls.  This means initializing them to 0 at the start and accumulating
   // them between calls.
+  norm = pd_start == 0 ? 0.0 : result[nq];
   if (q_index < nq) {
     this_result = pd_start == 0 ? 0.0 : result[q_index];
 …
       // Compute position in polydispersity hypercube
       for (int k=0; k < num_active; k++) {
           pd_index[k] = (loop_index/details->pd_stride[k])%details->pd_length[k];
           //printf("pd_index[%d] = %d\n",k,pd_index[k]);
+        pd_index[k] = (loop_index/details->pd_stride[k])%details->pd_length[k];
+        //printf("pd_index[%d] = %d\n",k,pd_index[k]);
+      }
 …
       //printf("slow %d: ", loop_index);
       for (int k=0; k < num_coord; k++) {
+        if (k < num_coord) {
+          int par = details->par_coord[k];
+          int coord = details->pd_coord[k];
+          int this_offset = details->par_offset[par];
+          int block_size = 1;
+          for (int bit=0; coord != 0; bit++) {
+            if (coord&1) {
+                this_offset += block_size * pd_index[bit];
+                block_size *= details->pd_length[bit];
+            }
+            coord >>= 1;
+        int par = details->par_coord[k];
+        int coord = details->pd_coord[k];
+        int this_offset = details->par_offset[par];
+        int block_size = 1;
+        for (int bit=0; coord != 0; bit++) {
+          if (coord&1) {
+              this_offset += block_size * pd_index[bit];
+              block_size *= details->pd_length[bit];
+          }
+          offset[par] = this_offset;
+          pvec[par] = values[this_offset];
+          //printf("par[%d]=v[%d]=%g \n", k, offset[k], pvec[k]);
+          // if theta is not coordinated with fast index, precompute spherical correction
+          if (par == details->theta_par && !(details->par_coord[k]&1)) {
+            spherical_correction = fmax(fabs(cos(M_PI_180*pvec[details->theta_par])), 1.e-6);
+          }
+          coord >>= 1;
+        }
+        offset[par] = this_offset;
+        pvec[par] = values[this_offset];
+        //printf("par[%d]=v[%d]=%g \n", k, offset[k], pvec[k]);
+        // if theta is not coordinated with fast index, precompute spherical correction
+        if (par == details->theta_par && !(details->par_coord[k]&1)) {
+          spherical_correction = fmax(fabs(cos(M_PI_180*pvec[details->theta_par])), 1.e-6);
+        }
+      }
       //printf("\n");
+    }
+    double weight;
+    // Update fast parameters
+    //printf("fast %d: ", loop_index);
+    for (int k=0; k < num_coord; k++) {
+      if (details->pd_coord[k]&1) {
+        const int par = details->par_coord[k];
+        pvec[par] = values[offset[par]++];
+        //printf("p[%d]=v[%d]=%g ", par, offset[par]-1, pvec[par]);
+        // if theta is coordinated with fast index, compute spherical correction each time
+        if (par == details->theta_par) {
+          spherical_correction = fmax(fabs(cos(M_PI_180*pvec[details->theta_par])), 1.e-6);
+        }
+      }
+    }
+    //printf("\n");
+    // Increment fast index
     const double wi = weights[details->pd_offset[0] + pd_index[0]];
     weight = partial_weight*wi;
     pd_index[0]++;
-    // Increment fast index
-    //printf("fast %d: ", loop_index);
-    for (int k=0; k < num_coord; k++) {
-      if (k < num_coord) {
-        if (details->pd_coord[k]&1) {
-          const int par = details->par_coord[k];
-          pvec[par] = values[offset[par]++];
-          //printf("p[%d]=v[%d]=%g ", par, offset[par]-1, pvec[par]);
-          // if theta is coordinated with fast index, compute spherical correction each time
-          if (par == details->theta_par) {
-            spherical_correction = fmax(fabs(cos(M_PI_180*pvec[details->theta_par])), 1.e-6);
+          }
+        }
+      }
+    }
-    //printf("\n");
     #ifdef INVALID
 …
     if (pd_stop >= details->total_pd) {
       // End of the PD loop we can normalize
+      const double scale = values[0];
+      const double background = values[1];
+      double scale, background;
+      scale = values[0];
+      background = values[1];
       result[q_index] = (norm>0. ? scale*this_result/norm + background : background);
     } else {
 …
       result[q_index] = this_result;
+    }
+    // Accumulate norm.
+    // Remember the updated norm.
     if (q_index == 0) result[nq] = norm;
+  }

sasmodels/kernelcl.py

-                      rf2f67a6
+                      rae2b6b5
                              hostbuf=values)
+        start, stop = 0, call_details.total_pd
+        args = [
+            np.uint32(self.q_input.nq), np.int32(start), np.int32(stop),
+            details_b, weights_b, values_b, self.q_input.q_b, self.result_b,
+            self.real(cutoff),
+        ]
+        self.kernel(self.queue, self.q_input.global_size, None, *args)
+        # Call kernel and retrieve results
+        step = 100
+        for start in range(0, call_details.total_pd, step):
+            stop = min(start+step, call_details.total_pd)
+            args = [
+                np.uint32(self.q_input.nq), np.int32(start), np.int32(stop),
+                details_b, weights_b, values_b, self.q_input.q_b, self.result_b,
+                self.real(cutoff),
+            ]
+            self.kernel(self.queue, self.q_input.global_size, None, *args)
         cl.enqueue_copy(self.queue, self.result, self.result_b)
+        # Free buffers
         for v in (details_b, weights_b, values_b):
             if v is not None: v.release()

Note: See TracChangeset for help on using the changeset viewer.

Download in other formats: