← Previous Changeset
Next Changeset →

Changeset a738209 in sasmodels

Timestamp:

Jul 15, 2016 9:33:33 AM (9 years ago)

Author:

Paul Kienzle <pkienzle@…>

Branches:

master, core_shell_microgels, costrafo411, magnetic_model, release_v0.94, release_v0.95, ticket-1257-vesicle-product, ticket_1156, ticket_1265_superball, ticket_822_more_unit_tests

Children:

Parents:

Message:

simplify kernels by removing coordination parameter logic

Location:

Files:

: 11 edited

compare.py (modified) (1 diff)
details.py (modified) (5 diffs)
direct_model.py (modified) (2 diffs)
generate.py (modified) (2 diffs)
kernel.py (modified) (2 diffs)
kernel_iq.c (modified) (10 diffs)
kernel_iq.cl (modified) (7 diffs)
kernelcl.py (modified) (4 diffs)
kerneldll.py (modified) (5 diffs)
kernelpy.py (modified) (6 diffs)
sasview_model.py (modified) (1 diff)

Legend:

: Unmodified
: Added
: Removed

sasmodels/compare.py

-                      rf2f67a6
+                      ra738209
     Return a model calculator using the OpenCL calculation engine.
     """
+    try:
+        model = core.build_model(model_info, dtype=dtype, platform="ocl")
+    except Exception as exc:
+        print(exc)
+        print("... trying again with single precision")
+        model = core.build_model(model_info, dtype='single', platform="ocl")
+    if not core.HAVE_OPENCL:
+        raise RuntimeError("OpenCL not available")
+    model = core.build_model(model_info, dtype=dtype, platform="ocl")
     calculator = DirectModel(data, model, cutoff=cutoff)
     calculator.engine = "OCL%s"%DTYPE_MAP[dtype]

sasmodels/details.py

-                      r0ff62d4
+                      ra738209
+from __future__ import print_function
 import numpy as np  # type: ignore
 …
         parameters = model_info.parameters
         max_pd = parameters.max_pd
+        npars = parameters.npars
+        par_offset = 4*max_pd
+        self.buffer = np.zeros(par_offset + 3 * npars + 4, 'i4')
+        # Structure of the call details buffer:
+        #   pd_par[max_pd]     pd params in order of length
+        #   pd_length[max_pd]  length of each pd param
+        #   pd_offset[max_pd]  offset of pd values in parameter array
+        #   pd_stride[max_pd]  index of pd value in loop = n//stride[k]
+        #   pd_prod            total length of pd loop
+        #   pd_sum             total length of the weight vector
+        #   num_active         number of pd params
+        #   theta_par          parameter number for theta parameter
+        self.buffer = np.zeros(4*max_pd + 4, 'i4')
         # generate views on different parts of the array
 …
         self._pd_offset  = self.buffer[2 * max_pd:3 * max_pd]
         self._pd_stride  = self.buffer[3 * max_pd:4 * max_pd]
-        self._par_offset = self.buffer[par_offset + 0 * npars:par_offset + 1 * npars]
-        self._par_coord  = self.buffer[par_offset + 1 * npars:par_offset + 2 * npars]
-        self._pd_coord   = self.buffer[par_offset + 2 * npars:par_offset + 3 * npars]
         # theta_par is fixed
         self.buffer[-1] = parameters.theta_offset
+        self.theta_par = parameters.theta_offset
     @property
 …
     @property
+    def pd_coord(self): return self._pd_coord
+    def pd_prod(self): return self.buffer[-4]
+    @pd_prod.setter
+    def pd_prod(self, v): self.buffer[-4] = v
     @property
+    def par_coord(self): return self._par_coord
+    def pd_sum(self): return self.buffer[-3]
+    @pd_sum.setter
+    def pd_sum(self, v): self.buffer[-3] = v
     @property
+    def par_offset(self): return self._par_offset
+    @property
+    def num_active(self): return self.buffer[-4]
+    def num_active(self): return self.buffer[-2]
     @num_active.setter
+    def num_active(self, v): self.buffer[-4] = v
+    @property
+    def total_pd(self): return self.buffer[-3]
+    @total_pd.setter
+    def total_pd(self, v): self.buffer[-3] = v
+    @property
+    def num_coord(self): return self.buffer[-2]
+    @num_coord.setter
+    def num_coord(self, v): self.buffer[-2] = v
+    def num_active(self, v): self.buffer[-2] = v
     @property
     def theta_par(self): return self.buffer[-1]
+    @theta_par.setter
+    def theta_par(self, v): self.buffer[-1] = v
     def show(self):
-        print("total_pd", self.total_pd)
         print("num_active", self.num_active)
+        print("pd_prod", self.pd_prod)
+        print("pd_sum", self.pd_sum)
+        print("theta par", self.theta_par)
         print("pd_par", self.pd_par)
         print("pd_length", self.pd_length)
         print("pd_offset", self.pd_offset)
         print("pd_stride", self.pd_stride)
-        print("par_offsets", self.par_offset)
-        print("num_coord", self.num_coord)
-        print("par_coord", self.par_coord)
-        print("pd_coord", self.pd_coord)
-        print("theta par", self.buffer[-1])
 def mono_details(model_info):
     call_details = CallDetails(model_info)
+    # The zero defaults for monodisperse systems are mostly fine
+    call_details.par_offset[:] = np.arange(2, len(call_details.par_offset)+2)
+    call_details.pd_prod = 1
     return call_details
 def poly_details(model_info, weights):
     #print("weights",weights)
     weights = weights[2:] # Skip scale and background
+    #weights = weights[2:] # Skip scale and background
     # Decreasing list of polydispersity lengths
-    # Note: the reversing view, x[::-1], does not require a copy
     pd_length = np.array([len(w) for w in weights])
     num_active = np.sum(pd_length>1)
 …
     pd_offset = np.cumsum(np.hstack((0, pd_length)))
+    # Note: the reversing view, x[::-1], does not require a copy
     idx = np.argsort(pd_length)[::-1][:num_active]
     par_length = np.array([max(len(w),1) for w in weights])
+    par_length = np.array([len(w) for w in weights])
     pd_stride = np.cumprod(np.hstack((1, par_length[idx])))
-    par_offsets = np.cumsum(np.hstack((2, par_length)))
     call_details = CallDetails(model_info)
     call_details.pd_par[:num_active] = idx
+    call_details.pd_par[:num_active] = idx - 2  # skip background & scale
     call_details.pd_length[:num_active] = pd_length[idx]
     call_details.pd_offset[:num_active] = pd_offset[idx]
     call_details.pd_stride[:num_active] = pd_stride[:-1]
     call_details.par_offset[:] = par_offsets[:-1]
     call_details.total_pd = pd_stride[-1]
+    call_details.pd_prod = pd_stride[-1]
+    call_details.pd_sum = np.sum(par_length)
     call_details.num_active = num_active
-    # Without constraints coordinated parameters are just the pd parameters
-    call_details.par_coord[:num_active] = idx
-    call_details.pd_coord[:num_active] = 2**np.arange(num_active)
-    call_details.num_coord = num_active
     #call_details.show()
     return call_details
-def constrained_poly_details(model_info, weights, constraints):
-    # Need to find the independently varying pars and sort them
-    # Need to build a coordination list for the dependent variables
-    # Need to generate a constraints function which takes values
-    # and weights, returning par blocks
-    raise NotImplementedError("Can't handle constraints yet")

sasmodels/direct_model.py

-                      r56b2687
+                      ra738209
     vw_pairs = [(get_weights(p, pars) if active(p.name)
                  else ([pars.get(p.name, p.default)], []))
+                 else ([pars.get(p.name, p.default)], [1.0]))
                 for p in parameters.call_parameters]
     call_details, weights, values = kernel.build_details(calculator, vw_pairs)
     return calculator(call_details, weights, values, cutoff)
+    call_details, values = kernel.build_details(calculator, vw_pairs)
+    return calculator(call_details, values, cutoff)
 def get_weights(parameter, values):
 …
     nsigma = values.get(parameter.name+'_pd_nsigma', 3.0)
     if npts == 0 or width == 0:
         return [value], []
+        return [value], [1.0]
     value, weight = weights.get_weights(
         disperser, npts, width, nsigma, value, limits, relative)

sasmodels/generate.py

-                      r56b2687
+                      ra738209
+_IQXY_PATTERN = re.compile("^((inline|static) )? *(double )? *Iqxy *([(]|$)",
+# type in IQXY pattern could be single, float, double, long double, ...
+_IQXY_PATTERN = re.compile("^((inline|static) )? *([a-z ]+ )? *Iqxy *([(]|$)",
                            flags=re.MULTILINE)
 def _have_Iqxy(sources):
 …
     line instead.
     """
     for code, path in sources:
+    for path, code in sources:
         if _IQXY_PATTERN.search(code):
             return True

sasmodels/kernel.py

-                      r0ff62d4
+                      ra738209
 the kernel should be released, which also releases the inputs.
 """
+from __future__ import division, print_function
 import numpy as np
 …
     """
     values, weights = zip(*pairs)
+    if max([len(w) for w in weights]) > 1:
+    scalars = [v[0] for v in values]
+    if all(len(w)==1 for w in weights):
+        call_details = mono_details(kernel.info)
+        data = np.array(scalars, dtype=kernel.dtype)
+    else:
         call_details = poly_details(kernel.info, weights)
+    else:
+        call_details = mono_details(kernel.info)
+    weights, values = [np.hstack(v) for v in (weights, values)]
+    weights = weights.astype(dtype=kernel.dtype)
+    values = values.astype(dtype=kernel.dtype)
+    return call_details, weights, values
+        data = np.hstack(scalars+list(values)+list(weights)).astype(kernel.dtype)
+    return call_details, data

sasmodels/kernel_iq.c

-                      rae2b6b5
+                      ra738209
     int32_t pd_stride[MAX_PD];  // stride to move to the next index at this level
 #endif // MAX_PD > 0
+    int32_t par_offset[NPARS];  // offset of par value blocks in the value & weight vector
+    int32_t par_coord[NPARS];   // ids of the coordination parameters
+    int32_t pd_coord[NPARS];    // polydispersity coordination bitvector
+    int32_t pd_prod;            // total number of voxels in hypercube
+    int32_t pd_sum;             // total length of the weights vector
     int32_t num_active;         // number of non-trivial pd loops
-    int32_t total_pd;           // total number of voxels in hypercube
-    int32_t num_coord;          // number of coordinated parameters
     int32_t theta_par;          // id of spherical correction variable
 } ProblemDetails;
 …
     const int32_t pd_stop,      // where we are stopping in the polydispersity loop
     global const ProblemDetails *details,
-    global const double *weights,
     global const double *values,
     global const double *q, // nq q values, with padding to boundary
 …
   ParameterBlock local_values;  // current parameter values
   double *pvec = (double *)(&local_values);  // Alias named parameters with a vector
-  double norm;
-  // number of active loops
-  const int num_active = details->num_active;
   // Fill in the initial variables
 …
   #endif
   for (int k=0; k < NPARS; k++) {
     pvec[k] = values[details->par_offset[k]];
+    pvec[k] = values[k+2];
+  }
   // Monodisperse computation
+  if (num_active == 0) {
+  if (details->num_active == 0) {
+    double norm, scale, background;
     #ifdef INVALID
     if (INVALID(local_values)) { return; }
     #endif
     norm = CALL_VOLUME(local_values);
-    double scale, background;
     scale = values[0];
     background = values[1];
 …
 #if MAX_PD > 0
+  const double *pd_value = values+2+NPARS;
+  const double *pd_weight = pd_value+details->pd_sum;
   // need product of weights at every Iq calc, so keep product of
   // weights from the outer loops so that weight = partial_weight * fast_weight
+  double pd_norm;
   double partial_weight; // product of weight w4*w3*w2 but not w1
   double spherical_correction; // cosine correction for latitude variation
   double weight; // product of partial_weight*w1*spherical_correction
-  // Location in the polydispersity hypercube, one index per dimension.
-  int pd_index[MAX_PD];
-  // Location of the coordinated parameters in their own sub-cubes.
-  int offset[NPARS];
-  // Number of coordinated indices
-  const int num_coord = details->num_coord;
   // Number of elements in the longest polydispersity loop
+  const int fast_length = details->pd_length[0];
+  const int p0_par = details->pd_par[0];
+  const int p0_length = details->pd_length[0];
+  const int p0_offset = details->pd_offset[0];
+  const int p0_is_theta = (p0_par == details->theta_par);
+  int p0_index;
   // Trigger the reset behaviour that happens at the end of the fast loop
   // by setting the initial index >= weight vector length.
   pd_index[0] = fast_length;
+  p0_index = p0_length;
   // Default the spherical correction to 1.0 in case it is not otherwise set
 …
   // calls.  This means initializing them to 0 at the start and accumulating
   // them between calls.
+  norm = pd_start == 0 ? 0.0 : result[nq];
+  pd_norm = (pd_start == 0 ? 0.0 : result[nq]);
   if (pd_start == 0) {
     #ifdef USE_OPENMP
 …
   for (int loop_index=pd_start; loop_index < pd_stop; loop_index++) {
     // check if fast loop needs to be reset
+    if (pd_index[0] == fast_length) {
+      //printf("should be here with %d active\n", num_active);
+    if (p0_index == p0_length) {
+      // Compute position in polydispersity hypercube
+      for (int k=0; k < num_active; k++) {
+        pd_index[k] = (loop_index/details->pd_stride[k])%details->pd_length[k];
+        //printf("pd_index[%d] = %d\n",k,pd_index[k]);
+      }
+      // Compute partial weights
+      // Compute position in polydispersity hypercube and partial weight
       partial_weight = 1.0;
+      //printf("partial weight %d: ", loop_index);
+      for (int k=1; k < num_active; k++) {
+        double wi = weights[details->pd_offset[k] + pd_index[k]];
+        //printf("pd[%d]=par[%d]=%g ", k, details->pd_par[k], wi);
+        partial_weight *= wi;
+      }
+      //printf("\n");
+      // Update parameter offsets in weight vector
+      //printf("slow %d: ", loop_index);
+      for (int k=0; k < num_coord; k++) {
+        int par = details->par_coord[k];
+        int coord = details->pd_coord[k];
+        int this_offset = details->par_offset[par];
+        int block_size = 1;
+        for (int bit=0; coord != 0; bit++) {
+          if (coord&1) {
+              this_offset += block_size * pd_index[bit];
+              block_size *= details->pd_length[bit];
+          }
+          coord >>= 1;
+        }
+        offset[par] = this_offset;
+        pvec[par] = values[this_offset];
+        //printf("par[%d]=v[%d]=%g \n", k, offset[k], pvec[k]);
+        // if theta is not coordinated with fast index, precompute spherical correction
+        if (par == details->theta_par && !(details->par_coord[k]&1)) {
+          spherical_correction = fmax(fabs(cos(M_PI_180*pvec[details->theta_par])), 1.e-6);
+      for (int k=1; k < details->num_active; k++) {
+        int pk = details->pd_par[k];
+        int index = details->pd_offset[k] + (loop_index/details->pd_stride[k])%details->pd_length[k];
+        pvec[pk] = pd_value[index];
+        partial_weight *= pd_weight[index];
+        if (pk == details->theta_par) {
+          spherical_correction = fmax(fabs(cos(M_PI_180*pvec[pk])), 1.e-6);
+        }
+      }
       //printf("\n");
+      p0_index = loop_index%p0_length;
+    }
+    // Update fast parameters
+    //printf("fast %d: ", loop_index);
+    for (int k=0; k < num_coord; k++) {
+      if (details->pd_coord[k]&1) {
+        const int par = details->par_coord[k];
+        pvec[par] = values[offset[par]++];
+        //printf("p[%d]=v[%d]=%g ", par, offset[par]-1, pvec[par]);
+        // if theta is coordinated with fast index, compute spherical correction each time
+        if (par == details->theta_par) {
+          spherical_correction = fmax(fabs(cos(M_PI_180*pvec[details->theta_par])), 1.e-6);
+        }
+      }
+    // Update parameter p0
+    weight = partial_weight*pd_weight[p0_offset + p0_index];
+    pvec[p0_par] = pd_value[p0_offset + p0_index];
+    if (p0_is_theta) {
+      spherical_correction = fmax(fabs(cos(M_PI_180*pvec[p0_par])), 1.e-6);
+    }
+    //printf("\n");
+    // Increment fast index
+    const double wi = weights[details->pd_offset[0] + pd_index[0]];
+    weight = partial_weight*wi;
+    pd_index[0]++;
+    p0_index++;
     #ifdef INVALID
 …
       // where it becomes zero.  If the entirety of the correction
       weight *= spherical_correction;
       norm += weight * CALL_VOLUME(local_values);
+      pd_norm += weight * CALL_VOLUME(local_values);
       #ifdef USE_OPENMP
 …
+  }
   if (pd_stop >= details->total_pd) {
+  if (pd_stop >= details->pd_prod) {
     // End of the PD loop we can normalize
     double scale, background;
 …
     #endif
     for (int q_index=0; q_index < nq; q_index++) {
       result[q_index] = (norm>0. ? scale*result[q_index]/norm + background : background);
+      result[q_index] = (pd_norm>0. ? scale*result[q_index]/pd_norm + background : background);
+    }
+  }
   // Remember the updated norm.
   result[nq] = norm;
+  result[nq] = pd_norm;
 #endif // MAX_PD > 0
+}

sasmodels/kernel_iq.cl

-                      rae2b6b5
+                      ra738209
     int32_t pd_stride[MAX_PD];  // stride to move to the next index at this level
 #endif // MAX_PD > 0
+    int32_t par_offset[NPARS];  // offset of par value blocks in the value & weight vector
+    int32_t par_coord[NPARS];   // ids of the coordination parameters
+    int32_t pd_coord[NPARS];    // polydispersity coordination bitvector
+    int32_t pd_prod;            // total number of voxels in hypercube
+    int32_t pd_sum;             // total length of the weights vector
     int32_t num_active;         // number of non-trivial pd loops
-    int32_t total_pd;           // total number of voxels in hypercube
-    int32_t num_coord;          // number of coordinated parameters
     int32_t theta_par;          // id of spherical correction variable
 } ProblemDetails;
 …
     const int32_t pd_stop,      // where we are stopping in the polydispersity loop
     global const ProblemDetails *details,
-    global const double *weights,
     global const double *values,
     global const double *q, // nq q values, with padding to boundary
     global double *result,  // nq+3 return values, again with padding
+    global double *result,  // nq+1 return values, again with padding
     const double cutoff     // cutoff in the polydispersity weight product
+    )
+{
   // Storage for the current parameter values.  These will be updated as we
+  // walk the polydispersity cube.
+  ParameterBlock local_values;  // current parameter values
+  double *pvec = (double *)(&local_values);  // Alias named parameters with a vector
+  double norm;
+  // walk the polydispersity cube.  local_values will be aliased to pvec.
+  local ParameterBlock local_values;
   // who we are and what element we are working with
   const int q_index = get_global_id(0);
+  // number of active loops
+  const int num_active = details->num_active;
+  const int thread = get_local_id(0);
   // Fill in the initial variables
+  for (int k=0; k < NPARS; k++) {
+    pvec[k] = values[details->par_offset[k]];
+  }
+  event_t e = async_work_group_copy((local double *)&local_values, values+2, NPARS, 0);
+  wait_group_events(1, &e);
   // Monodisperse computation
+  if (num_active == 0) {
+  if (details->num_active == 0) {
+    double norm, scale, background;
+    // TODO: only needs to be done by one process...
     #ifdef INVALID
     if (INVALID(local_values)) { return; }
     #endif
     norm = CALL_VOLUME(local_values);
-    double scale, background;
     scale = values[0];
     background = values[1];
 …
   // norm will be shared across all threads.
+  // "values" is global and can't be assigned to a local, so even though
+  // the alias is only needed for thread 0, it is allocated in all threads.
+  global const double *pd_value = values+2+NPARS;
+  global const double *pd_weight = pd_value+details->pd_sum;
   // need product of weights at every Iq calc, so keep product of
   // weights from the outer loops so that weight = partial_weight * fast_weight
+  double partial_weight; // product of weight w4*w3*w2 but not w1
+  double spherical_correction; // cosine correction for latitude variation
+  double weight; // product of partial_weight*w1*spherical_correction
+  // Location in the polydispersity hypercube, one index per dimension.
+  int pd_index[MAX_PD];
+  // Location of the coordinated parameters in their own sub-cubes.
+  int offset[NPARS];
+  // Number of coordinated indices
+  const int num_coord = details->num_coord;
+  local double pd_norm;
+  local double partial_weight; // product of weight w4*w3*w2 but not w1
+  local double spherical_correction; // cosine correction for latitude variation
+  local double weight; // product of partial_weight*w1*spherical_correction
+  local double *pvec;
+  local int p0_par;
+  local int p0_length;
+  local int p0_offset;
+  local int p0_is_theta;
+  local int p0_index;
   // Number of elements in the longest polydispersity loop
+  const int fast_length = details->pd_length[0];
+  // Trigger the reset behaviour that happens at the end of the fast loop
+  // by setting the initial index >= weight vector length.
+  pd_index[0] = fast_length;
+  // Default the spherical correction to 1.0 in case it is not otherwise set
+  spherical_correction = 1.0;
+  // Since we are no longer looping over the entire polydispersity hypercube
+  // for each q, we need to track the result and normalization values between
+  // calls.  This means initializing them to 0 at the start and accumulating
+  // them between calls.
+  norm = pd_start == 0 ? 0.0 : result[nq];
+  barrier(CLK_LOCAL_MEM_FENCE);
+  if (thread == 0) {
+    pvec = (local double *)(&local_values);
+    // Number of elements in the longest polydispersity loop
+    p0_par = details->pd_par[0];
+    p0_length = details->pd_length[0];
+    p0_offset = details->pd_offset[0];
+    p0_is_theta = (p0_par == details->theta_par);
+    // Trigger the reset behaviour that happens at the end of the fast loop
+    // by setting the initial index >= weight vector length.
+    p0_index = p0_length;
+    // Default the spherical correction to 1.0 in case it is not otherwise set
+    spherical_correction = 1.0;
+    // Since we are no longer looping over the entire polydispersity hypercube
+    // for each q, we need to track the result and normalization values between
+    // calls.  This means initializing them to 0 at the start and accumulating
+    // them between calls.
+    pd_norm = pd_start == 0 ? 0.0 : result[nq];
+  }
+  barrier(CLK_LOCAL_MEM_FENCE);
   if (q_index < nq) {
     this_result = pd_start == 0 ? 0.0 : result[q_index];
 …
   // Loop over the weights then loop over q, accumulating values
   for (int loop_index=pd_start; loop_index < pd_stop; loop_index++) {
+    // check if fast loop needs to be reset
+    if (pd_index[0] == fast_length) {
+      //printf("should be here with %d active\n", num_active);
+      // Compute position in polydispersity hypercube
+      for (int k=0; k < num_active; k++) {
+        pd_index[k] = (loop_index/details->pd_stride[k])%details->pd_length[k];
+        //printf("pd_index[%d] = %d\n",k,pd_index[k]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (thread == 0) {
+      // check if fast loop needs to be reset
+      if (p0_index == p0_length) {
+        //printf("should be here with %d active\n", num_active);
+        // Compute position in polydispersity hypercube and partial weight
+        partial_weight = 1.0;
+        for (int k=1; k < details->num_active; k++) {
+          int pk = details->pd_par[k];
+          int index = details->pd_offset[k] + (loop_index/details->pd_stride[k])%details->pd_length[k];
+          pvec[pk] = pd_value[index];
+          partial_weight *= pd_weight[index];
+          //printf("index[%d] = %d\n",k,index);
+          if (pk == details->theta_par) {
+            spherical_correction = fmax(fabs(cos(M_PI_180*pvec[pk])), 1.e-6);
+          }
+        }
+        p0_index = loop_index%p0_length;
+        //printf("\n");
+      }
+      // need to compute the product of the weights.  If the vector were really
+      // long, we could split the work into groups, with each thread taking
+      // every nth weight, but there really is no call for it here.  We could
+      // also do some clever pair-wise multiplication similar to parallel
+      // prefix, but again simpler is probably faster since n is likely small.
+      // Compute partial weights
+      partial_weight = 1.0;
+      //printf("partial weight %d: ", loop_index);
+      for (int k=1; k < num_active; k++) {
+        double wi = weights[details->pd_offset[k] + pd_index[k]];
+        //printf("pd[%d]=par[%d]=%g ", k, details->pd_par[k], wi);
+        partial_weight *= wi;
+      // Update parameter p0
+      weight = partial_weight*pd_weight[p0_offset + p0_index];
+      pvec[p0_par] = pd_value[p0_offset + p0_index];
+      if (p0_is_theta) {
+        spherical_correction = fmax(fabs(cos(M_PI_180*pvec[p0_par])), 1.e-6);
+      }
+      //printf("\n");
+      // Update parameter offsets in weight vector
+      //printf("slow %d: ", loop_index);
+      for (int k=0; k < num_coord; k++) {
+        int par = details->par_coord[k];
+        int coord = details->pd_coord[k];
+        int this_offset = details->par_offset[par];
+        int block_size = 1;
+        for (int bit=0; coord != 0; bit++) {
+          if (coord&1) {
+              this_offset += block_size * pd_index[bit];
+              block_size *= details->pd_length[bit];
+          }
+          coord >>= 1;
+        }
+        offset[par] = this_offset;
+        pvec[par] = values[this_offset];
+        //printf("par[%d]=v[%d]=%g \n", k, offset[k], pvec[k]);
+        // if theta is not coordinated with fast index, precompute spherical correction
+        if (par == details->theta_par && !(details->par_coord[k]&1)) {
+          spherical_correction = fmax(fabs(cos(M_PI_180*pvec[details->theta_par])), 1.e-6);
+        }
+      }
+      //printf("\n");
+    }
+    // Update fast parameters
+    //printf("fast %d: ", loop_index);
+    for (int k=0; k < num_coord; k++) {
+      if (details->pd_coord[k]&1) {
+        const int par = details->par_coord[k];
+        pvec[par] = values[offset[par]++];
+        //printf("p[%d]=v[%d]=%g ", par, offset[par]-1, pvec[par]);
+        // if theta is coordinated with fast index, compute spherical correction each time
+        if (par == details->theta_par) {
+          spherical_correction = fmax(fabs(cos(M_PI_180*pvec[details->theta_par])), 1.e-6);
+        }
+      }
+    }
+      p0_index++;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
     //printf("\n");
     // Increment fast index
-    const double wi = weights[details->pd_offset[0] + pd_index[0]];
-    weight = partial_weight*wi;
-    pd_index[0]++;
     #ifdef INVALID
 …
       // where it becomes zero.  If the entirety of the correction
       weight *= spherical_correction;
       norm += weight * CALL_VOLUME(local_values);
+      pd_norm += weight * CALL_VOLUME(local_values);
       const double scattering = CALL_IQ(q, q_index, local_values);
 …
   if (q_index < nq) {
     if (pd_stop >= details->total_pd) {
+    if (pd_stop >= details->pd_prod) {
       // End of the PD loop we can normalize
       double scale, background;
       scale = values[0];
       background = values[1];
       result[q_index] = (norm>0. ? scale*this_result/norm + background : background);
+      result[q_index] = (pd_norm>0. ? scale*this_result/pd_norm + background : background);
     } else {
       // Partial result, so remember it but don't normalize it.
 …
     // Remember the updated norm.
     if (q_index == 0) result[nq] = norm;
+    if (q_index == 0) result[nq] = pd_norm;
+  }

sasmodels/kernelcl.py

-                      r56b2687
+                      ra738209
                      else np.float32)  # will never get here, so use np.float32
     def __call__(self, call_details, weights, values, cutoff):
+    def __call__(self, call_details, values, cutoff):
         # type: (CallDetails, np.ndarray, np.ndarray, float) -> np.ndarray
         context = self.queue.context
 …
         details_b = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR,
                               hostbuf=call_details.buffer)
-        weights_b = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR,
-                              hostbuf=weights) if len(weights) else None
         values_b = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR,
                              hostbuf=values)
 …
         # Call kernel and retrieve results
         step = 100
         for start in range(0, call_details.total_pd, step):
             stop = min(start+step, call_details.total_pd)
+        for start in range(0, call_details.pd_prod, step):
+            stop = min(start+step, call_details.pd_prod)
             args = [
                 np.uint32(self.q_input.nq), np.int32(start), np.int32(stop),
                 details_b, weights_b, values_b, self.q_input.q_b, self.result_b,
+                details_b, values_b, self.q_input.q_b, self.result_b,
                 self.real(cutoff),
+            ]
 …
         # Free buffers
         for v in (details_b, weights_b, values_b):
+        for v in (details_b, values_b):
             if v is not None: v.release()

sasmodels/kerneldll.py

-                      r56b2687
+                      ra738209
     exist yet if it hasn't been compiled.
     """
     return os.path.join(DLL_PATH, dll_name(model_info, dtype)+".so")
+    return os.path.join(DLL_PATH, dll_name(model_info, dtype))
 …
         need_recompile = dll_time < newest_source
     if need_recompile:
         basename = dll_name(model_info, dtype) + "_"
         fid, filename = tempfile.mkstemp(suffix=".c", prefix=basename)
+        basename = os.path.splitext(os.path.basename(dll))[0] + "_"
+        fd, filename = tempfile.mkstemp(suffix=".c", prefix=basename)
         source = generate.convert_type(source, dtype)
-        fd, filename = tempfile.mkstemp(suffix=".c", prefix=tempfile_prefix)
         with os.fdopen(fd, "w") as file:
             file.write(source)
 …
               else c_longdouble)
         # int, int, int, int*, double*, double*, double*, double*, double*, double
         argtypes = [c_int32]*3 + [c_void_p]*5 + [fp]
+        # int, int, int, int*, double*, double*, double*, double*, double
+        argtypes = [c_int32]*3 + [c_void_p]*4 + [fp]
         self._Iq = self._dll[generate.kernel_name(self.info, is_2d=False)]
         self._Iqxy = self._dll[generate.kernel_name(self.info, is_2d=True)]
 …
                      else np.float128)
     def __call__(self, call_details, weights, values, cutoff):
+    def __call__(self, call_details, values, cutoff):
         # type: (CallDetails, np.ndarray, np.ndarray, float) -> np.ndarray
         #print("in kerneldll")
-        #print("weights", weights)
         #print("values", values)
         start, stop = 0, call_details.total_pd
+        start, stop = 0, call_details.pd_prod
         args = [
             self.q_input.nq, # nq
 …
             stop, # pd_stop pd_stride[MAX_PD]
             call_details.buffer.ctypes.data, # problem
-            weights.ctypes.data,  # weights
             values.ctypes.data,  #pars
             self.q_input.q.ctypes.data, #q

sasmodels/kernelpy.py

-                      r7ae2b7f
+                      ra738209
 :class:`kernelcl.GpuModel` and :class:`kerneldll.DllModel`.
 """
+from __future__ import division, print_function
 import numpy as np  # type: ignore
 from numpy import pi, cos  #type: ignore
 …
                         else (lambda: 1.0))
     def __call__(self, call_details, weights, values, cutoff):
+    def __call__(self, call_details, values, cutoff):
         assert isinstance(call_details, details.CallDetails)
         res = _loops(self._parameter_vector, self._form, self._volume,
                      self.q_input.nq, call_details, weights, values, cutoff)
+                     self.q_input.nq, call_details, values, cutoff)
         return res
 …
         self.q_input = None
 def _loops(parameters, form, form_volume, nq, call_details,
            weights, values, cutoff):
+def _loops(parameters, form, form_volume, nq, details,
+           values, cutoff):
     # type: (np.ndarray, Callable[[], np.ndarray], Callable[[], float], int, details.CallDetails, np.ndarray, np.ndarray, float) -> None
     ################################################################
 …
     #                                                              #
     ################################################################
+    parameters[:] = values[call_details.par_offset]
+    NPARS = len(parameters)
+    parameters[:] = values[2:NPARS+2]
     scale, background = values[0], values[1]
     if call_details.num_active == 0:
+    if details.num_active == 0:
         norm = float(form_volume())
         if norm > 0.0:
 …
             return np.ones(nq, 'd')*background
+    pd_value = values[2+NPARS:2+NPARS+details.pd_sum]
+    pd_weight = values[2+NPARS+details.pd_sum:]
+    pd_norm = 0.0
+    spherical_correction = 1.0
     partial_weight = np.NaN
+    spherical_correction = 1.0
+    pd_stride = call_details.pd_stride[:call_details.num_active]
+    pd_length = call_details.pd_length[:call_details.num_active]
+    pd_offset = call_details.pd_offset[:call_details.num_active]
+    pd_index = np.empty_like(pd_offset)
+    offset = np.empty_like(call_details.par_offset)
+    theta = call_details.theta_par
+    fast_length = pd_length[0]
+    pd_index[0] = fast_length
+    weight =np.NaN
+    p0_par = details.pd_par[0]
+    p0_is_theta = (p0_par == details.theta_par)
+    p0_length = details.pd_length[0]
+    p0_index = p0_length
+    p0_offset = details.pd_offset[0]
+    pd_par = details.pd_par[:details.num_active]
+    pd_offset = details.pd_offset[:details.num_active]
+    pd_stride = details.pd_stride[:details.num_active]
+    pd_length = details.pd_length[:details.num_active]
     total = np.zeros(nq, 'd')
+    norm = 0.0
+    for loop_index in range(call_details.total_pd):
+    for loop_index in range(details.pd_prod):
         # update polydispersity parameter values
+        if pd_index[0] == fast_length:
+            pd_index[:] = (loop_index/pd_stride)%pd_length
+            partial_weight = np.prod(weights[pd_offset+pd_index][1:])
+            for k in range(call_details.num_coord):
+                par = call_details.par_coord[k]
+                coord = call_details.pd_coord[k]
+                this_offset = call_details.par_offset[par]
+                block_size = 1
+                for bit in range(len(pd_offset)):
+                    if coord&1:
+                        this_offset += block_size * pd_index[bit]
+                        block_size *= pd_length[bit]
+                    coord >>= 1
+                    if coord == 0: break
+                offset[par] = this_offset
+                parameters[par] = values[this_offset]
+                if par == theta and not (call_details.par_coord[k]&1):
+                    spherical_correction = max(abs(cos(pi/180 * parameters[theta])), 1e-6)
+        for k in range(call_details.num_coord):
+            if call_details.pd_coord[k]&1:
+                #par = call_details.par_coord[k]
+                parameters[par] = values[offset[par]]
+                #print "par",par,offset[par],parameters[par+2]
+                offset[par] += 1
+                if par == theta:
+                    spherical_correction = max(abs(cos(pi/180 * parameters[theta])), 1e-6)
+        weight = partial_weight * weights[pd_offset[0] + pd_index[0]]
+        pd_index[0] += 1
+        if p0_index == p0_length:
+            pd_index = (loop_index//pd_stride)%pd_length
+            parameters[pd_par] = pd_value[pd_offset+pd_index]
+            partial_weight = np.prod(pd_weight[pd_offset+pd_index][1:])
+            if details.theta_par >= 0:
+                spherical_correction = max(abs(cos(pi/180 * parameters[details.theta_par])), 1e-6)
+            p0_index = loop_index%p0_length
+        weight = partial_weight * pd_weight[p0_offset + p0_index]
+        parameters[p0_par] = pd_value[p0_offset + p0_index]
+        if p0_is_theta:
+            spherical_correction = max(abs(cos(pi/180 * parameters[p0_par])), 1e-6)
+        p0_index += 1
         if weight > cutoff:
             # Call the scattering function
 …
             weight *= spherical_correction
             total += weight * I
             norm += weight * form_volume()
     if norm > 0.0:
         return (scale/norm)*total + background
+            pd_norm += weight * form_volume()
+    if pd_norm > 0.0:
+        return (scale/pd_norm)*total + background
     else:
         return np.ones(nq, 'd')*background

sasmodels/sasview_model.py

-                      r56b2687
+                      ra738209
         else:
             q_vectors = [np.asarray(qx)]
         kernel = self._model.make_kernel(q_vectors)
+        calculator = self._model.make_kernel(q_vectors)
         pairs = [self._get_weights(p)
                  for p in self._model_info.parameters.call_parameters]
         call_details, weight, value = kernel.build_details(kernel, pairs)
         result = kernel(call_details, weight, value, cutoff=self.cutoff)
         kernel.release()
+        call_details, value = kernel.build_details(calculator, pairs)
+        result = calculator(call_details, value, cutoff=self.cutoff)
+        calculator.release()
         return result

Note: See TracChangeset for help on using the changeset viewer.

Download in other formats: