Changeset f2f67a6 in sasmodels
- Timestamp: Apr 15, 2016 7:26:24 PM (9 years ago)
- Branches: master, core_shell_microgels, costrafo411, magnetic_model, release_v0.94, release_v0.95, ticket-1257-vesicle-product, ticket_1156, ticket_1265_superball, ticket_822_more_unit_tests
- Children: ae2b6b5, 38a9b07, eb97b11
- Parents: 0ff62d4
- Files: 1 added, 6 edited
.gitignore
r4a82d4d → rf2f67a6

 /*.csv
 *.pyc
-*.cl
 *.so
 *.obj
sasmodels/compare.py
r8d62008 → rf2f67a6

     value = calculator(**pars)
     average_time = toc()*1000./Nevals
+    #print("I(q)",value)
     return value, average_time
…
     if Nbase > 0 and Ncomp > 0:
         resid = (base_value - comp_value)
-        relerr = resid/comp_value
+        relerr = resid/np.where(comp_value!=0., abs(comp_value), 1.0)
         _print_stats("|%s-%s|"
                      % (base.engine, comp.engine) + (" "*(3+len(comp.engine))),
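
The relerr change above exists to avoid division by zero (and the resulting inf/nan entries) when the comparison engine returns exact zeros; where comp_value is zero the entry falls back to the absolute residual. A minimal standalone sketch of the same guard, with illustrative array values that are not taken from compare.py:

    import numpy as np

    def safe_relative_error(base_value, comp_value):
        # Divide by |comp_value| where it is nonzero, and by 1.0 otherwise,
        # so zero entries reduce to the absolute residual.
        resid = base_value - comp_value
        return resid / np.where(comp_value != 0., abs(comp_value), 1.0)

    # The middle entry would otherwise divide by zero.
    base = np.array([1.0, 2.0, 3.0])
    comp = np.array([1.1, 0.0, 2.7])
    print(safe_relative_error(base, comp))   # [-0.0909...  2.  0.1111...]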
sasmodels/core.py
rdd7fc12 → rf2f67a6

             or not HAVE_OPENCL
             or not kernelcl.environment().has_type(numpy_dtype)):
+        #print("building dll", numpy_dtype)
         return kerneldll.load_dll(source, model_info, numpy_dtype)
     else:
+        #print("building ocl", numpy_dtype)
         return kernelcl.GpuModel(source, model_info, numpy_dtype, fast=fast)
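
The commented-out debug prints above mark the two branches of the backend dispatch: fall back to a compiled DLL kernel when OpenCL is missing or the device cannot handle the requested precision, otherwise build an OpenCL kernel. A hedged sketch of the dispatch pattern only; the names select_backend, have_opencl and device_supports are illustrative, not the sasmodels API:

    def select_backend(dtype, have_opencl, device_supports):
        # Prefer OpenCL, but fall back to the DLL path when OpenCL is absent
        # or the device lacks support for the requested dtype (e.g. float64).
        if not have_opencl or not device_supports(dtype):
            return "dll"      # stand-in for kerneldll.load_dll(...)
        return "opencl"       # stand-in for kernelcl.GpuModel(...)

    # A device that only supports single precision forces the DLL path for float64.
    print(select_backend("float64", True, lambda dt: dt == "float32"))   # dll
    print(select_backend("float32", True, lambda dt: dt == "float32"))   # opencl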
sasmodels/generate.py
ra5b8477 → rf2f67a6

 try:
-    from typing import Tuple, Sequence, Iterator
+    from typing import Tuple, Sequence, Iterator, Dict
     from .modelinfo import ModelInfo
 except ImportError:
…
     Return a timestamp for the model corresponding to the most recently
     changed file or dependency.
+
+    Note that this does not look at the time stamps for the OpenCL header
+    information since that need not trigger a recompile of the DLL.
     """
     source_files = (model_sources(model_info)
…
     newest = max(getmtime(f) for f in source_files)
     return newest
+
+def model_templates():
+    # type: () -> List[str]
+    # TODO: fails DRY; templates appear two places.
+    # should instead have model_info contain a list of paths
+    # Note: kernel_iq.cl is not on this list because changing it need not
+    # trigger a recompile of the dll.
+    return [joinpath(TEMPLATE_ROOT, filename)
+            for filename in ('kernel_header.c', 'kernel_iq.c')]

 def convert_type(source, dtype):
…
     return _template_cache[filename][1]

-def model_templates():
-    # type: () -> List[str]
-    # TODO: fails DRY; templates are listed in two places.
-    # should instead have model_info contain a list of paths
-    return [joinpath(TEMPLATE_ROOT, filename)
-            for filename in ('kernel_header.c', 'kernel_iq.c')]
-

 _FN_TEMPLATE = """\
…
 %(body)s
 }
-

 """
…
     # Load templates and user code
     kernel_header = load_template('kernel_header.c')
-    kernel_code = load_template('kernel_iq.c')
+    dll_code = load_template('kernel_iq.c')
+    ocl_code = load_template('kernel_iq.cl')
     user_code = [open(f).read() for f in model_sources(model_info)]
…
     if _have_Iqxy(user_code):
         # Call 2D model
-        refs = ["q[2*i]", "q[2*i+1]"] + _call_pars("_v.", partable.iqxy_parameters)
+        refs = ["q[2*_i]", "q[2*_i+1]"] + _call_pars("_v.", partable.iqxy_parameters)
         call_iqxy = "#define CALL_IQ(_q,_i,_v) Iqxy(%s)" % (",".join(refs))
     else:
…
     # TODO: allow mixed python/opencl kernels?

-    # define the Iq kernel
-    source.append("#define KERNEL_NAME %s_Iq"%model_info.name)
-    source.append(call_iq)
-    source.append(kernel_code)
-    source.append("#undef CALL_IQ")
-    source.append("#undef KERNEL_NAME")
-
-    # define the Iqxy kernel from the same source with different #defines
-    source.append("#define KERNEL_NAME %s_Iqxy"%model_info.name)
-    source.append(call_iqxy)
-    source.append(kernel_code)
-    source.append("#undef CALL_IQ")
-    source.append("#undef KERNEL_NAME")
-
+    source.append("#if defined(USE_OPENCL)")
+    source.extend(_add_kernels(ocl_code, call_iq, call_iqxy, model_info.name))
+    source.append("#else /* !USE_OPENCL */")
+    source.extend(_add_kernels(dll_code, call_iq, call_iqxy, model_info.name))
+    source.append("#endif /* !USE_OPENCL */")
     return '\n'.join(source)
+
+def _add_kernels(kernel_code, call_iq, call_iqxy, name):
+    # type: (str, str, str, str) -> List[str]
+    source = [
+        # define the Iq kernel
+        "#define KERNEL_NAME %s_Iq"%name,
+        call_iq,
+        kernel_code,
+        "#undef CALL_IQ",
+        "#undef KERNEL_NAME",
+
+        # define the Iqxy kernel from the same source with different #defines
+        "#define KERNEL_NAME %s_Iqxy"%name,
+        call_iqxy,
+        kernel_code,
+        "#undef CALL_IQ",
+        "#undef KERNEL_NAME",
+    ]
+    return source

 def load_kernel_module(model_name):
     # type: (str) -> module
+    """
+    Return the kernel module named in *model_name*.
+
+    If the name ends in *.py* then load it as a custom model using
+    :func:`sasmodels.custom.load_custom_kernel_module`, otherwise
+    load it from :mod:`sasmodels.models`.
+    """
     if model_name.endswith('.py'):
         kernel_module = load_custom_kernel_module(model_name)
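
The new _add_kernels helper wraps one kernel template twice, once per entry point, by redefining KERNEL_NAME and CALL_IQ around the same body, and the generated file now carries both the OpenCL and DLL variants behind a USE_OPENCL guard. A self-contained sketch of the scaffolding it produces; the model name, parameter references and template strings below are placeholders, not the real kernel_iq sources:

    def add_kernels(kernel_code, call_iq, call_iqxy, name):
        # Emit the same template twice under different preprocessor names:
        # <name>_Iq for the 1D kernel and <name>_Iqxy for the 2D kernel.
        return [
            "#define KERNEL_NAME %s_Iq" % name, call_iq, kernel_code,
            "#undef CALL_IQ", "#undef KERNEL_NAME",
            "#define KERNEL_NAME %s_Iqxy" % name, call_iqxy, kernel_code,
            "#undef CALL_IQ", "#undef KERNEL_NAME",
        ]

    # Placeholder inputs, just to show the shape of the generated source.
    ocl_code = "/* body of the OpenCL kernel template */"
    dll_code = "/* body of the C/DLL kernel template */"
    call_iq = "#define CALL_IQ(_q,_i,_v) Iq(_q[_i], _v.radius)"
    call_iqxy = "#define CALL_IQ(_q,_i,_v) Iqxy(_q[2*_i], _q[2*_i+1], _v.radius)"

    source = ["#if defined(USE_OPENCL)"]
    source.extend(add_kernels(ocl_code, call_iq, call_iqxy, "sphere"))
    source.append("#else /* !USE_OPENCL */")
    source.extend(add_kernels(dll_code, call_iq, call_iqxy, "sphere"))
    source.append("#endif /* !USE_OPENCL */")
    print("\n".join(source))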
sasmodels/kernel_iq.c
r6e7ff6d → rf2f67a6

   local ParameterBlock local_values;  // current parameter values
   double *pvec = (double *)(&local_values);  // Alias named parameters with a vector
+  double norm;
+
+  // number of active loops
+  const int num_active = details->num_active;

   // Fill in the initial variables
…
   }

+  // Monodisperse computation
+  if (num_active == 0) {
+    #ifdef INVALID
+    if (INVALID(local_values)) { return; }
+    #endif
+    norm = CALL_VOLUME(local_values);
+
+    const double scale = values[0];
+    const double background = values[1];
+    // result[nq] = norm; // Total volume normalization
+
+    #ifdef USE_OPENMP
+    #pragma omp parallel for
+    #endif
+    for (int i=0; i < nq; i++) {
+      double scattering = CALL_IQ(q, i, local_values);
+      result[i] = (norm>0. ? scale*scattering/norm + background : background);
+    }
+    return;
+  }
+
+#if MAX_PD > 0
   // If it is the first round initialize the result to zero, otherwise
   // assume that the previous result has been passed back.
…
       result[i] = 0.0;
     }
-  }
-
-  // Monodisperse computation
-  if (details->num_active == 0) {
-    #ifdef INVALID
-    if (INVALID(local_values)) { return; }
-    #endif
-
-    const double norm = CALL_VOLUME(local_values);
-    const double scale = values[0];
-    const double background = values[1];
-    #ifdef USE_OPENMP
-    #pragma omp parallel for
-    #endif
-    result[nq] = norm; // Total volume normalization
-    for (int i=0; i < nq; i++) {
-      double scattering = CALL_IQ(q, i, local_values);
-      result[i] = (norm>0. ? scale*scattering/norm + background : background);
-    }
-    return;
-  }
-
-#if MAX_PD > 0
-  //printf("Entering polydispersity from %d to %d\n", pd_start, pd_stop);
-  // Since we are no longer looping over the entire polydispersity hypercube
-  // for each q, we need to track the normalization values between calls.
-  double norm = 0.0;
+    norm = 0.0;
+  } else {
+    norm = result[nq];
+  }

   // need product of weights at every Iq calc, so keep product of
…
   pd_index[0] = fast_length;

+  // Number of coordinated indices
+  const int num_coord = details->num_coord;
+
   // Loop over the weights then loop over q, accumulating values
   for (int loop_index=pd_start; loop_index < pd_stop; loop_index++) {
-    // check if indices need to be updated
+    // check if fast loop needs to be reset
     if (pd_index[0] == fast_length) {
-      //printf("should be here with %d active\n", details->num_active);
+      //printf("should be here with %d active\n", num_active);

       // Compute position in polydispersity hypercube
-      for (int k=0; k < details->num_active; k++) {
+      for (int k=0; k < num_active; k++) {
         pd_index[k] = (loop_index/details->pd_stride[k])%details->pd_length[k];
         //printf("pd_index[%d] = %d\n",k,pd_index[k]);
…
       partial_weight = 1.0;
       //printf("partial weight %d: ", loop_index);
-      for (int k=1; k < details->num_active; k++) {
+      for (int k=1; k < num_active; k++) {
         double wi = weights[details->pd_offset[k] + pd_index[k]];
         //printf("pd[%d]=par[%d]=%g ", k, details->pd_par[k], wi);
…
       // Update parameter offsets in weight vector
       //printf("slow %d: ", loop_index);
-      for (int k=0; k < details->num_coord; k++) {
+      for (int k=0; k < num_coord; k++) {
         int par = details->par_coord[k];
         int coord = details->pd_coord[k];
…
         // if theta is not coordinated with fast index, precompute spherical correction
         if (par == details->theta_par && !(details->par_coord[k]&1)) {
-          spherical_correction = fmax(fabs(cos(M_PI_180*pvec[details->theta_par])), 1e-6);
+          spherical_correction = fmax(fabs(cos(M_PI_180*pvec[details->theta_par])), 1.e-6);
         }
       }
…
     double weight = partial_weight*wi;
     //printf("fast %d: ", loop_index);
-    for (int k=0; k < details->num_coord; k++) {
+    for (int k=0; k < num_coord; k++) {
       if (details->pd_coord[k]&1) {
         const int par = details->par_coord[k];
…
         // if theta is coordinated with fast index, compute spherical correction each time
         if (par == details->theta_par) {
-          spherical_correction = fmax(fabs(cos(M_PI_180*pvec[details->theta_par])), 1e-6);
+          spherical_correction = fmax(fabs(cos(M_PI_180*pvec[details->theta_par])), 1.e-6);
         }
       }
…
     }

-    // Accumulate norm.
-    result[nq] += norm;
-
     // End of the PD loop we can normalize
     if (pd_stop >= details->total_pd) {
…
       }
     }
+
+  // Remember the updated norm.
+  result[nq] = norm;
 #endif // MAX_PD > 0
 }
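
The polydispersity loop above decodes a single flat loop_index into one index per dispersity parameter via the precomputed strides, which is what lets the kernel be invoked over any [pd_start, pd_stop) slice and resume with the running norm carried in result[nq]. A small Python sketch of that index decoding, with made-up loop lengths rather than the real CallDetails layout:

    # Made-up polydispersity loop lengths for three parameters.
    pd_length = [5, 3, 4]

    # Strides in the kernel_iq.c sense: the fastest loop has stride 1,
    # and pd_stride[k] is the product of the lengths of the faster loops.
    pd_stride = [1, 5, 15]

    def decode(loop_index):
        # Mirrors pd_index[k] = (loop_index/pd_stride[k]) % pd_length[k]
        return [(loop_index // pd_stride[k]) % pd_length[k]
                for k in range(len(pd_length))]

    # Walking loop_index from 0 to total-1 visits every point of the
    # hypercube exactly once, so a later call can pick up where an
    # earlier partial loop stopped.
    total = pd_length[0] * pd_length[1] * pd_length[2]
    assert len({tuple(decode(i)) for i in range(total)}) == total
    print(decode(0), decode(7), decode(total - 1))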
sasmodels/kernelcl.py
r8d62008 → rf2f67a6

 try:
-    raise NotImplementedError("OpenCL not yet implemented for new kernel template")
+    #raise NotImplementedError("OpenCL not yet implemented for new kernel template")
     import pyopencl as cl  # type: ignore
     # Ask OpenCL for the default context so that we know that one exists
…
         key = "%s-%s-%s"%(name, dtype, fast)
         if key not in self.compiled:
-            print("compiling",name)
+            #print("OpenCL compile",name)
             dtype = np.dtype(dtype)
             program = compile_model(self.get_context(dtype),
…
         kernel_name = generate.kernel_name(self.info, is_2d)
         kernel = getattr(self.program, kernel_name)
-        return GpuKernel(kernel, self.info, q_vectors)
+        return GpuKernel(kernel, self.dtype, self.info, q_vectors)

     def release(self):
…
         Free the memory.
         """
-        if self.q is not None:
-            self.q.release()
-            self.q = None
+        if self.q_b is not None:
+            self.q_b.release()
+            self.q_b = None

     def __del__(self):
…
     Call :meth:`release` when done with the kernel instance.
     """
-    def __init__(self, kernel, model_info, q_vectors):
-        # type: (cl.Kernel, ModelInfo, List[np.ndarray]) -> None
+    def __init__(self, kernel, dtype, model_info, q_vectors):
+        # type: (cl.Kernel, np.dtype, ModelInfo, List[np.ndarray]) -> None
         max_pd = model_info.parameters.max_pd
         npars = len(model_info.parameters.kernel_parameters)-2
-        q_input = GpuInput(q_vectors, kernel.dtype)
+        q_input = GpuInput(q_vectors, dtype)
         self.kernel = kernel
         self.info = model_info
-        self.dtype = kernel.dtype
+        self.dtype = dtype
         self.dim = '2d' if q_input.is_2d else '1d'
         # plus three for the normalization values
-        self.result = np.empty(q_input.nq+3, q_input.dtype)
+        self.result = np.empty(q_input.nq+3, dtype)

         # Inputs and outputs for each kernel call
         # Note: res may be shorter than res_b if global_size != nq
         env = environment()
-        self.queue = env.get_queue(kernel.dtype)
+        self.queue = env.get_queue(dtype)

         # details is int32 data, padded to an 8 integer boundary
         size = ((max_pd*5 + npars*3 + 2 + 7)//8)*8
         self.result_b = cl.Buffer(self.queue.context, mf.READ_WRITE,
-                                  q_input.global_size[0] * kernel.dtype.itemsize)
+                                  q_input.global_size[0] * dtype.itemsize)
         self.q_input = q_input # allocated by GpuInput above

         self._need_release = [ self.result_b, self.q_input ]
-        self.real = (np.float32 if self.q_input.dtype == generate.F32
-                     else np.float64 if self.q_input.dtype == generate.F64
-                     else np.float16 if self.q_input.dtype == generate.F16
+        self.real = (np.float32 if dtype == generate.F32
+                     else np.float64 if dtype == generate.F64
+                     else np.float16 if dtype == generate.F16
                      else np.float32)  # will never get here, so use np.float32

     def __call__(self, call_details, weights, values, cutoff):
         # type: (CallDetails, np.ndarray, np.ndarray, float) -> np.ndarray
-
         context = self.queue.context
         # Arrange data transfer to card
…
                               hostbuf=call_details.buffer)
         weights_b = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR,
-                              hostbuf=weights)
+                              hostbuf=weights) if len(weights) else None
         values_b = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR,
                               hostbuf=values)
…
         cl.enqueue_copy(self.queue, self.result, self.result_b)
         for v in (details_b, weights_b, values_b):
-            v.release()
+            if v is not None: v.release()

         return self.result[:self.q_input.nq]
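
The weights_b guard matters for the monodisperse case, where the weights vector is empty: OpenCL refuses to create a zero-sized buffer, so no device buffer is allocated and the release loop skips the None entry. A minimal sketch of that pattern, assuming a machine with a working pyopencl installation; the variable names are illustrative rather than those used by GpuKernel:

    import numpy as np
    import pyopencl as cl

    mf = cl.mem_flags
    ctx = cl.create_some_context()

    weights = np.empty(0, dtype=np.float64)   # monodisperse: no dispersity weights

    # Only create the device buffer when there is host data to copy;
    # a zero-sized buffer would raise an error, so keep None instead.
    weights_b = (cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=weights)
                 if len(weights) else None)

    # Release only the buffers that were actually allocated.
    for buf in (weights_b,):
        if buf is not None:
            buf.release()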