← Previous Changeset
Next Changeset →

Changeset 3199b17 in sasmodels

Timestamp:

Mar 6, 2019 12:24:03 PM (6 years ago)

Author:

Paul Kienzle <pkienzle@…>

Branches:

master, core_shell_microgels, magnetic_model, ticket-1257-vesicle-product, ticket_1156, ticket_1265_superball, ticket_822_more_unit_tests

Children:

Parents:

Message:

PEP 8 changes and improved consistency between OpenCL/CUDA/DLL code

Location:

Files:

: 5 edited

kernel.py (modified) (2 diffs)
kernelcl.py (modified) (34 diffs)
kernelcuda.py (modified) (26 diffs)
kerneldll.py (modified) (15 diffs)
kernelpy.py (modified) (11 diffs)

Legend:

: Unmodified
: Added
: Removed

sasmodels/kernel.py

-                      re44432d
+                      r3199b17
 # pylint: enable=unused-import
 class KernelModel(object):
     info = None  # type: ModelInfo
 …
         # type: () -> None
         pass
 class Kernel(object):

sasmodels/kernelcl.py

-                      r00afc15
+                      r3199b17
 # Attempt to setup opencl. This may fail if the pyopencl package is not
+# Attempt to setup OpenCL. This may fail if the pyopencl package is not
 # installed or if it is installed but there are no devices available.
 try:
 …
     from pyopencl import mem_flags as mf
     from pyopencl.characterize import get_fast_inaccurate_build_options
     # Ask OpenCL for the default context so that we know that one exists
+    # Ask OpenCL for the default context so that we know that one exists.
     cl.create_some_context(interactive=False)
     HAVE_OPENCL = True
 …
 # pylint: enable=unused-import
+# CRUFT: pyopencl < 2017.1  (as of June 2016 needs quotes around include path)
+# CRUFT: pyopencl < 2017.1 (as of June 2016 needs quotes around include path).
 def quote_path(v):
     """
 …
     return '"'+v+'"' if v and ' ' in v and not v[0] in "\"'-" else v
 def fix_pyopencl_include():
     """
 …
     import pyopencl as cl
     if hasattr(cl, '_DEFAULT_INCLUDE_OPTIONS'):
+        cl._DEFAULT_INCLUDE_OPTIONS = [quote_path(v) for v in cl._DEFAULT_INCLUDE_OPTIONS]
+        cl._DEFAULT_INCLUDE_OPTIONS = [
+            quote_path(v) for v in cl._DEFAULT_INCLUDE_OPTIONS
+            ]
 if HAVE_OPENCL:
 …
 MAX_LOOPS = 2048
 # Pragmas to enable OpenCL features.  Be sure to protect them so that they
 # still compile even if OpenCL is not present.
 …
 """
 def use_opencl():
     sas_opencl = os.environ.get("SAS_OPENCL", "OpenCL").lower()
     return HAVE_OPENCL and sas_opencl != "none" and not sas_opencl.startswith("cuda")
 ENV = None
 def reset_environment():
 …
     global ENV
     ENV = GpuEnvironment() if use_opencl() else None
 def environment():
 …
     return ENV
 def has_type(device, dtype):
     # type: (cl.Device, np.dtype) -> bool
 …
         return "cl_khr_fp64" in device.extensions
     else:
         # Not supporting F16 type since it isn't accurate enough
+        # Not supporting F16 type since it isn't accurate enough.
         return False
 def get_warp(kernel, queue):
 …
         cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
         queue.device)
 def compile_model(context, source, dtype, fast=False):
 …
         source_list.insert(0, _F64_PRAGMA)
     # Note: USE_SINCOS makes the intel cpu slower under opencl
+    # Note: USE_SINCOS makes the Intel CPU slower under OpenCL.
     if context.devices[0].type == cl.device_type.GPU:
         source_list.insert(0, "#define USE_SINCOS\n")
 …
     source = "\n".join(source_list)
     program = cl.Program(context, source).build(options=options)
     #print("done with "+program)
     return program
 # for now, this returns one device in the context
 # TODO: create a context that contains all devices on all platforms
+# For now, this returns one device in the context.
+# TODO: Create a context that contains all devices on all platforms.
 class GpuEnvironment(object):
     """
+    GPU context, with possibly many devices, and one queue per device.
+    Because the environment can be reset during a live program (e.g., if the
+    user changes the active GPU device in the GUI), everything associated
+    with the device context must be cached in the environment and recreated
+    if the environment changes.  The *cache* attribute is a simple dictionary
+    which holds keys and references to objects, such as compiled kernels and
+    allocated buffers.  The running program should check in the cache for
+    long lived objects and create them if they are not there.  The program
+    should not hold onto cached objects, but instead only keep them active
+    for the duration of a function call.  When the environment is destroyed
+    then the *release* method for each active cache item is called before
+    the environment is freed.  This means that each cl buffer should be
+    in its own cache entry.
+    GPU context for OpenCL, with possibly many devices and one queue per device.
     """
     def __init__(self):
         # type: () -> None
         # find gpu context
+        # Find gpu context.
         context_list = _create_some_context()
 …
                 self.context[dtype] = None
         # Build a queue for each context
+        # Build a queue for each context.
         self.queue = {}
         context = self.context[F32]
 …
             self.queue[F64] = cl.CommandQueue(context, context.devices[0])
         # Byte boundary for data alignment
+        ## Byte boundary for data alignment.
         #self.data_boundary = max(context.devices[0].min_data_type_align_size
         #                         for context in self.context.values())
         # Cache for compiled programs, and for items in context
+        # Cache for compiled programs, and for items in context.
         self.compiled = {}
 …
         """
         # Note: PyOpenCL caches based on md5 hash of source, options and device
+        # so we don't really need to cache things for ourselves.  I'll do so
+        # anyway just to save some data munging time.
+        # but I'll do so as well just to save some data munging time.
         tag = generate.tag_source(source)
         key = "%s-%s-%s%s"%(name, dtype, tag, ("-fast" if fast else ""))
         # Check timestamp on program
+        # Check timestamp on program.
         program, program_timestamp = self.compiled.get(key, (None, np.inf))
         if program_timestamp < timestamp:
 …
         return program
 def _create_some_context():
     # type: () -> cl.Context
 …
     which one (and not a CUDA device, or no GPU).
     """
     # Assume we do not get here if SAS_OPENCL is None or CUDA
+    # Assume we do not get here if SAS_OPENCL is None or CUDA.
     sas_opencl = os.environ.get('SAS_OPENCL', 'opencl')
     if sas_opencl.lower() != 'opencl':
         # Setting PYOPENCL_CTX as a SAS_OPENCL to create cl context
+        # Setting PYOPENCL_CTX as a SAS_OPENCL to create cl context.
         os.environ["PYOPENCL_CTX"] = sas_opencl
 …
         except Exception as exc:
             warnings.warn(str(exc))
+            warnings.warn("pyopencl.create_some_context() failed")
+            warnings.warn("the environment variable 'SAS_OPENCL' or 'PYOPENCL_CTX' might not be set correctly")
+            warnings.warn("pyopencl.create_some_context() failed.  The "
+                "environment variable 'SAS_OPENCL' or 'PYOPENCL_CTX' might "
+                "not be set correctly")
     return _get_default_context()
 def _get_default_context():
 …
     # is running may increase throughput.
+    #
     # Macbook pro, base install:
+    # MacBook Pro, base install:
     #     {'Apple': [Intel CPU, NVIDIA GPU]}
     # Macbook pro, base install:
+    # MacBook Pro, base install:
     #     {'Apple': [Intel CPU, Intel GPU]}
     # 2 x nvidia 295 with Intel and NVIDIA opencl drivers installed
+    # 2 x NVIDIA 295 with Intel and NVIDIA opencl drivers installed:
     #     {'Intel': [CPU], 'NVIDIA': [GPU, GPU, GPU, GPU]}
     gpu, cpu = None, None
 …
             else:
                 # System has cl.device_type.ACCELERATOR or cl.device_type.CUSTOM
                 # Intel Phi for example registers as an accelerator
+                # Intel Phi for example registers as an accelerator.
                 # Since the user installed a custom device on their system
                 # and went through the pain of sorting out OpenCL drivers for
 …
                 gpu = device
     # order the devices by gpu then by cpu; when searching for an available
+    # Order the devices by gpu then by cpu; when searching for an available
     # device by data type they will be checked in this order, which means
     # that if the gpu supports double then the cpu will never be used (though
 …
     that the compiler is allowed to take shortcuts.
     """
     info = None # type: ModelInfo
     source = "" # type: str
     dtype = None # type: np.dtype
     fast = False # type: bool
     _program = None # type: cl.Program
     _kernels = None # type: Dict[str, cl.Kernel]
+    info = None  # type: ModelInfo
+    source = ""  # type: str
+    dtype = None  # type: np.dtype
+    fast = False  # type: bool
+    _program = None  # type: cl.Program
+    _kernels = None  # type: Dict[str, cl.Kernel]
     def __init__(self, source, model_info, dtype=generate.F32, fast=False):
 …
         functions = [getattr(program, k) for k in names]
         self._kernels = {k: v for k, v in zip(variants, functions)}
         # keep a handle to program so GC doesn't collect
+        # Keep a handle to program so GC doesn't collect.
         self._program = program
+# TODO: check that we don't need a destructor for buffers which go out of scope
+# TODO: Check that we don't need a destructor for buffers which go out of scope.
 class GpuInput(object):
     """
 …
     def __init__(self, q_vectors, dtype=generate.F32):
         # type: (List[np.ndarray], np.dtype) -> None
         # TODO: do we ever need double precision q?
+        # TODO: Do we ever need double precision q?
         self.nq = q_vectors[0].size
         self.dtype = np.dtype(dtype)
         self.is_2d = (len(q_vectors) == 2)
         # TODO: stretch input based on get_warp()
         # not doing it now since warp depends on kernel, which is not known
+        # TODO: Stretch input based on get_warp().
+        # Not doing it now since warp depends on kernel, which is not known
         # at this point, so instead using 32, which is good on the set of
         # architectures tested so far.
 …
         #print("creating inputs of size", self.global_size)
         # transfer input value to gpu
+        # Transfer input value to GPU.
         env = environment()
         context = env.context[self.dtype]
 …
         # type: () -> None
         """
         Free the buffer associated with the q value
+        Free the buffer associated with the q value.
         """
         if self.q_b is not None:
 …
         # type: () -> None
         self.release()
 class GpuKernel(Kernel):
 …
     Call :meth:`release` when done with the kernel instance.
     """
     #: SAS model information structure
     info = None # type: ModelInfo
     #: kernel precision
     dtype = None # type: np.dtype
     #: kernel dimensions (1d or 2d)
     dim = "" # type: str
     #: calculation results, updated after each call to :meth:`_call_kernel`
     result = None # type: np.ndarray
+    #: SAS model information structure.
+    info = None  # type: ModelInfo
+    #: Kernel precision.
+    dtype = None  # type: np.dtype
+    #: Kernel dimensions (1d or 2d).
+    dim = ""  # type: str
+    #: Calculation results, updated after each call to :meth:`_call_kernel`.
+    result = None  # type: np.ndarray
     def __init__(self, model, q_vectors):
 …
         self.q_input = GpuInput(q_vectors, dtype)
         self._model = model
+        # F16 isn't sufficient, so don't support it
+        self._as_dtype = np.float64 if dtype == generate.F64 else np.float32
+        # attributes accessed from the outside
+        # Attributes accessed from the outside.
         self.dim = '2d' if self.q_input.is_2d else '1d'
         self.info = model.info
+        self.dtype = model.dtype
+        # holding place for the returned value
+        self.dtype = dtype
+        # Converter to translate input to target type.
+        self._as_dtype = np.float64 if dtype == generate.F64 else np.float32
+        # Holding place for the returned value.
         nout = 2 if self.info.have_Fq and self.dim == '1d' else 1
         extra_q = 4  # total weight, form volume, shell volume and R_eff
         self.result = np.empty(self.q_input.nq*nout+extra_q, dtype)
         # allocate result value on gpu
+        extra_q = 4  # Total weight, form volume, shell volume and R_eff.
+        self.result = np.empty(self.q_input.nq*nout + extra_q, dtype)
+        # Allocate result value on GPU.
         env = environment()
         context = env.context[self.dtype]
 …
         self._result_b = cl.Buffer(context, mf.READ_WRITE, width)
+    def _call_kernel(self, call_details, values, cutoff, magnetic, effective_radius_type):
+        # type: (CallDetails, np.ndarray, np.ndarray, float, bool) -> np.ndarray
+    def _call_kernel(self, call_details, values, cutoff, magnetic,
+                     effective_radius_type):
+        # type: (CallDetails, np.ndarray, float, bool, int) -> np.ndarray
         env = environment()
         queue = env.queue[self._model.dtype]
         context = queue.context
         # Arrange data transfer to/from card
+        # Arrange data transfer to card.
         details_b = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR,
                               hostbuf=call_details.buffer)
 …
                              hostbuf=values)
+        # Setup kernel function and arguments.
         name = 'Iq' if self.dim == '1d' else 'Imagnetic' if magnetic else 'Iqxy'
         kernel = self._model.get_function(name)
         kernel_args = [
+            np.uint32(self.q_input.nq), None, None,
+            details_b, values_b, self.q_input.q_b, self._result_b,
+            self._as_dtype(cutoff),
+            np.uint32(effective_radius_type),
+            np.uint32(self.q_input.nq),  # Number of inputs.
+            None,  # Placeholder for pd_start.
+            None,  # Placeholder for pd_stop.
+            details_b,  # Problem definition.
+            values_b,  # Parameter values.
+            self.q_input.q_b,  # Q values.
+            self._result_b,   # Result storage.
+            self._as_dtype(cutoff),  # Probability cutoff.
+            np.uint32(effective_radius_type),  # R_eff mode.
+        ]
+        # Call kernel and retrieve results.
         #print("Calling OpenCL")
         #call_details.show(values)
-        #Call kernel and retrieve results
         wait_for = None
         last_nap = time.clock()
 …
                                *kernel_args, wait_for=wait_for)]
             if stop < call_details.num_eval:
                 # Allow other processes to run
+                # Allow other processes to run.
                 wait_for[0].wait()
                 current_time = time.clock()
 …
         #print("result", self.result)
         # Free buffers
+        # Free buffers.
         details_b.release()
         values_b.release()

sasmodels/kernelcuda.py

-                      r00afc15
+                      r3199b17
 # Attempt to setup cuda. This may fail if the pycuda package is not
+# Attempt to setup CUDA. This may fail if the pycuda package is not
 # installed or if it is installed but there are no devices available.
 try:
 …
 MAX_LOOPS = 2048
 def use_cuda():
+    env = os.environ.get("SAS_OPENCL", "").lower()
+    return HAVE_CUDA and (env == "" or env.startswith("cuda"))
+    sas_opencl = os.environ.get("SAS_OPENCL", "CUDA").lower()
+    return HAVE_CUDA and sas_opencl.startswith("cuda")
 ENV = None
 …
         ENV.release()
     ENV = GpuEnvironment() if use_cuda() else None
 def environment():
 …
     return ENV
-def free_context():
-    global ENV
-    if ENV is not None:
-        ENV.release()
-        ENV = None
-atexit.register(free_context)
 def has_type(dtype):
 …
     Return true if device supports the requested precision.
     """
     # Assume the nvidia card supports 32-bit and 64-bit floats.
     # TODO: check if pycuda support F16
+    # Assume the NVIDIA card supports 32-bit and 64-bit floats.
+    # TODO: Check if pycuda supports F16.
     return dtype in (generate.F32, generate.F64)
 FUNCTION_PATTERN = re.compile(r"""^
   (?P<space>\s*)                   # initial space
   (?P<qualifiers>^(?:\s*\b\w+\b\s*)+) # one or more qualifiers before function
   (?P<function>\s*\b\w+\b\s*[(])      # function name plus open parens
+  (?P<space>\s*)                       # Initial space.
+  (?P<qualifiers>^(?:\s*\b\w+\b\s*)+)  # One or more qualifiers before function.
+  (?P<function>\s*\b\w+\b\s*[(])       # Function name plus open parens.
   """, re.VERBOSE|re.MULTILINE)
 …
   """, re.VERBOSE|re.MULTILINE)
 def _add_device_tag(match):
     # type: (None) -> str
     # Note: should be re.Match, but that isn't a simple type
+    # Note: Should be re.Match, but that isn't a simple type.
     """
     replace qualifiers with __device__ qualifiers if needed
 …
         return "".join((space, "__device__ ", qualifiers, function))
 def mark_device_functions(source):
     # type: (str) -> str
 …
     """
     return FUNCTION_PATTERN.sub(_add_device_tag, source)
 def show_device_functions(source):
 …
         print(match.group('qualifiers').replace('\n',r'\n'), match.group('function'), '(')
     return source
 def compile_model(source, dtype, fast=False):
 …
     #options = ['--verbose', '-E']
     options = ['--use_fast_math'] if fast else None
     program = SourceModule(source, no_extern_c=True, options=options) # include_dirs=[...]
+    program = SourceModule(source, no_extern_c=True, options=options) #, include_dirs=[...])
     #print("done with "+program)
 …
 # for now, this returns one device in the context
 # TODO: create a context that contains all devices on all platforms
+# For now, this returns one device in the context.
+# TODO: Create a context that contains all devices on all platforms.
 class GpuEnvironment(object):
     """
     GPU context, with possibly many devices, and one queue per device.
+    GPU context for CUDA.
     """
     context = None # type: cuda.Context
     def __init__(self, devnum=None):
         # type: (int) -> None
-        # Byte boundary for data alignment
-        #self.data_boundary = max(d.min_data_type_align_size
-        #                         for d in self.context.devices)
-        self.compiled = {}
         env = os.environ.get("SAS_OPENCL", "").lower()
         if devnum is None and env.startswith("cuda:"):
             devnum = int(env[5:])
         # Set the global context to the particular device number if one is
         # given, otherwise use the default context.  Perhaps this will be set
 …
             self.context = make_default_context()
+        ## Byte boundary for data alignment.
+        #self.data_boundary = max(d.min_data_type_align_size
+        #                         for d in self.context.devices)
+        # Cache for compiled programs, and for items in context.
+        self.compiled = {}
     def release(self):
         if self.context is not None:
 …
         Compile the program for the device in the given context.
         """
+        # Note: PyOpenCL caches based on md5 hash of source, options and device
+        # so we don't really need to cache things for ourselves.  I'll do so
+        # anyway just to save some data munging time.
+        # Note: PyCuda (probably) caches but I'll do so as well just to
+        # save some data munging time.
         tag = generate.tag_source(source)
         key = "%s-%s-%s%s"%(name, dtype, tag, ("-fast" if fast else ""))
         # Check timestamp on program
+        # Check timestamp on program.
         program, program_timestamp = self.compiled.get(key, (None, np.inf))
         if program_timestamp < timestamp:
 …
         return program
 class GpuModel(KernelModel):
     """
 …
     that the compiler is allowed to take shortcuts.
     """
     info = None # type: ModelInfo
     source = "" # type: str
     dtype = None # type: np.dtype
     fast = False # type: bool
     _program = None # type: SourceModule
     _kernels = None # type: Dict[str, cuda.Function]
+    info = None  # type: ModelInfo
+    source = ""  # type: str
+    dtype = None  # type: np.dtype
+    fast = False  # type: bool
+    _program = None  # type: SourceModule
+    _kernels = None  # type: Dict[str, cuda.Function]
     def __init__(self, source, model_info, dtype=generate.F32, fast=False):
 …
         functions = [program.get_function(k) for k in names]
         self._kernels = {k: v for k, v in zip(variants, functions)}
         # keep a handle to program so GC doesn't collect
+        # Keep a handle to program so GC doesn't collect.
         self._program = program
+# TODO: check that we don't need a destructor for buffers which go out of scope
+# TODO: Check that we don't need a destructor for buffers which go out of scope.
 class GpuInput(object):
     """
 …
     def __init__(self, q_vectors, dtype=generate.F32):
         # type: (List[np.ndarray], np.dtype) -> None
         # TODO: do we ever need double precision q?
+        # TODO: Do we ever need double precision q?
         self.nq = q_vectors[0].size
         self.dtype = np.dtype(dtype)
         self.is_2d = (len(q_vectors) == 2)
         # TODO: stretch input based on get_warp()
         # not doing it now since warp depends on kernel, which is not known
+        # TODO: Stretch input based on get_warp().
+        # Not doing it now since warp depends on kernel, which is not known
         # at this point, so instead using 32, which is good on the set of
         # architectures tested so far.
         if self.is_2d:
+            # Note: 16 rather than 15 because result is 1 longer than input.
+            width = ((self.nq+16)//16)*16
+            width = ((self.nq+15)//16)*16
             self.q = np.empty((width, 2), dtype=dtype)
             self.q[:self.nq, 0] = q_vectors[0]
             self.q[:self.nq, 1] = q_vectors[1]
         else:
+            # Note: 32 rather than 31 because result is 1 longer than input.
+            width = ((self.nq+32)//32)*32
+            width = ((self.nq+31)//32)*32
             self.q = np.empty(width, dtype=dtype)
             self.q[:self.nq] = q_vectors[0]
 …
         #print("creating inputs of size", self.global_size)
         # transfer input value to gpu
+        # Transfer input value to GPU.
         self.q_b = cuda.to_device(self.q)
 …
         # type: () -> None
         """
         Free the memory.
+        Free the buffer associated with the q value.
         """
         if self.q_b is not None:
 …
         # type: () -> None
         self.release()
 class GpuKernel(Kernel):
 …
     Call :meth:`release` when done with the kernel instance.
     """
     #: SAS model information structure
     info = None # type: ModelInfo
     #: kernel precision
     dtype = None # type: np.dtype
     #: kernel dimensions (1d or 2d)
     dim = "" # type: str
     #: calculation results, updated after each call to :meth:`_call_kernel`
     result = None # type: np.ndarray
+    #: SAS model information structure.
+    info = None  # type: ModelInfo
+    #: Kernel precision.
+    dtype = None  # type: np.dtype
+    #: Kernel dimensions (1d or 2d).
+    dim = ""  # type: str
+    #: Calculation results, updated after each call to :meth:`_call_kernel`.
+    result = None  # type: np.ndarray
     def __init__(self, model, q_vectors):
 …
         self.q_input = GpuInput(q_vectors, dtype)
         self._model = model
+        # F16 isn't sufficient, so don't support it
+        self._as_dtype = np.float64 if dtype == generate.F64 else np.float32
+        # attributes accessed from the outside
+        # Attributes accessed from the outside.
         self.dim = '2d' if self.q_input.is_2d else '1d'
         self.info = model.info
+        self.dtype = model.dtype
+        # holding place for the returned value
+        self.dtype = dtype
+        # Converter to translate input to target type.
+        self._as_dtype = np.float64 if dtype == generate.F64 else np.float32
+        # Holding place for the returned value.
         nout = 2 if self.info.have_Fq and self.dim == '1d' else 1
         extra_q = 4  # total weight, form volume, shell volume and R_eff
         self.result = np.empty(self.q_input.nq*nout+extra_q, dtype)
         # allocate result value on gpu
+        extra_q = 4  # Total weight, form volume, shell volume and R_eff.
+        self.result = np.empty(self.q_input.nq*nout + extra_q, dtype)
+        # Allocate result value on GPU.
         width = ((self.result.size+31)//32)*32 * self.dtype.itemsize
         self._result_b = cuda.mem_alloc(width)
+    def _call_kernel(self, call_details, values, cutoff, magnetic, effective_radius_type):
+        # type: (CallDetails, np.ndarray, np.ndarray, float, bool) -> np.ndarray
+        # Arrange data transfer to card
+    def _call_kernel(self, call_details, values, cutoff, magnetic,
+                     effective_radius_type):
+        # type: (CallDetails, np.ndarray, float, bool, int) -> np.ndarray
+        # Arrange data transfer to card.
         details_b = cuda.to_device(call_details.buffer)
         values_b = cuda.to_device(values)
+        # Setup kernel function and arguments.
         name = 'Iq' if self.dim == '1d' else 'Imagnetic' if magnetic else 'Iqxy'
         kernel = self._model.get_function(name)
         kernel_args = [
+            np.uint32(self.q_input.nq), None, None,
+            details_b, values_b, self.q_input.q_b, self._result_b,
+            self._as_dtype(cutoff),
+            np.uint32(effective_radius_type),
+            np.uint32(self.q_input.nq),  # Number of inputs.
+            None,  # Placeholder for pd_start.
+            None,  # Placeholder for pd_stop.
+            details_b,  # Problem definition.
+            values_b,  # Parameter values.
+            self.q_input.q_b,  # Q values.
+            self._result_b,   # Result storage.
+            self._as_dtype(cutoff),  # Probability cutoff.
+            np.uint32(effective_radius_type),  # R_eff mode.
+        ]
         grid = partition(self.q_input.nq)
+        #print("Calling OpenCL")
+        # Call kernel and retrieve results.
+        #print("Calling CUDA")
         #call_details.show(values)
-        # Call kernel and retrieve results
         last_nap = time.clock()
         step = 100000000//self.q_input.nq + 1
 …
             if stop < call_details.num_eval:
                 sync()
                 # Allow other processes to run
+                # Allow other processes to run.
                 current_time = time.clock()
                 if current_time - last_nap > 0.5:
 …
     Note: Maybe context.synchronize() is sufficient.
     """
+    #return # The following works in C++; don't know what pycuda is doing
+    # Create an event with which to synchronize
+    # Create an event with which to synchronize.
     done = cuda.Event()
 …
     done.record()
     #line added to not hog resources
+    # Make sure we don't hog resources while waiting to sync.
     while not done.query():
         time.sleep(0.01)
 …
     # Block until the GPU executes the kernel.
     done.synchronize()
     # Clean up the event; I don't think they can be reused.
     del done

sasmodels/kerneldll.py

-                      re44432d
+                      r3199b17
 # pylint: enable=unused-import
 # Compiler output is a byte stream that needs to be decode in python 3
+# Compiler output is a byte stream that needs to be decoded in python 3.
 decode = (lambda s: s) if sys.version_info[0] < 3 else (lambda s: s.decode('utf8'))
 …
         COMPILER = "tinycc"
     elif "VCINSTALLDIR" in os.environ:
+        # If vcvarsall.bat has been called, then VCINSTALLDIR is in the environment
+        # and we can use the MSVC compiler.  Otherwise, if tinycc is available
+        # the use it.  Otherwise, hope that mingw is available.
+        # If vcvarsall.bat has been called, then VCINSTALLDIR is in the
+        # environment and we can use the MSVC compiler.  Otherwise, if
+        # tinycc is available then use it.  Otherwise, hope that mingw
+        # is available.
         COMPILER = "msvc"
     else:
 …
     COMPILER = "unix"
 ARCH = "" if ct.sizeof(ct.c_void_p) > 4 else "x86"  # 4 byte pointers on x86
+ARCH = "" if ct.sizeof(ct.c_void_p) > 4 else "x86"  # 4 byte pointers on x86.
 if COMPILER == "unix":
     # Generic unix compile
     # On mac users will need the X code command line tools installed
+    # Generic unix compile.
+    # On Mac, users will need the Xcode command line tools installed.
     #COMPILE = "gcc-mp-4.7 -shared -fPIC -std=c99 -fopenmp -O2 -Wall %s -o %s -lm -lgomp"
     CC = "cc -shared -fPIC -std=c99 -O2 -Wall".split()
     # add openmp support if not running on a mac
+    # Add OpenMP support if not running on a Mac.
     if sys.platform != "darwin":
         # OpenMP seems to be broken on gcc 5.4.0 (ubuntu 16.04.9)
+        # OpenMP seems to be broken on gcc 5.4.0 (ubuntu 16.04.9).
         # Shut it off for all unix until we can investigate.
         #CC.append("-fopenmp")
 …
     # vcomp90.dll on the path.  One may be found here:
     #       C:/Windows/winsxs/x86_microsoft.vc90.openmp*/vcomp90.dll
     # Copy this to the python directory and uncomment the OpenMP COMPILE
     # TODO: remove intermediate OBJ file created in the directory
     # TODO: maybe don't use randomized name for the c file
     # TODO: maybe ask distutils to find MSVC
+    # Copy this to the python directory and uncomment the OpenMP COMPILE.
+    # TODO: Remove intermediate OBJ file created in the directory.
+    # TODO: Maybe don't use randomized name for the c file.
+    # TODO: Maybe ask distutils to find MSVC.
     CC = "cl /nologo /Ox /MD /W3 /GS- /DNDEBUG".split()
     if "SAS_OPENMP" in os.environ:
 …
 ALLOW_SINGLE_PRECISION_DLLS = True
 def compile(source, output):
     # type: (str, str) -> None
 …
     logging.info(command_str)
     try:
         # need shell=True on windows to keep console box from popping up
+        # Need shell=True on windows to keep console box from popping up.
         shell = (os.name == 'nt')
         subprocess.check_output(command, shell=shell, stderr=subprocess.STDOUT)
 …
         raise RuntimeError("compile failed.  File is in %r"%source)
 def dll_name(model_info, dtype):
     # type: (ModelInfo, np.dtype) ->  str
 …
     basename += ARCH + ".so"
     # Hack to find precompiled dlls
+    # Hack to find precompiled dlls.
     path = joinpath(generate.DATA_PATH, '..', 'compiled_models', basename)
     if os.path.exists(path):
 …
         raise ValueError("16 bit floats not supported")
     if dtype == F32 and not ALLOW_SINGLE_PRECISION_DLLS:
         dtype = F64  # Force 64-bit dll
     # Note: dtype may be F128 for long double precision
+        dtype = F64  # Force 64-bit dll.
+    # Note: dtype may be F128 for long double precision.
     dll = dll_path(model_info, dtype)
 …
         need_recompile = dll_time < newest_source
     if need_recompile:
         # Make sure the DLL path exists
+        # Make sure the DLL path exists.
         if not os.path.exists(SAS_DLL_PATH):
             os.makedirs(SAS_DLL_PATH)
 …
             file_handle.write(source)
         compile(source=filename, output=dll)
         # comment the following to keep the generated c file
         # Note: if there is a syntax error then compile raises an error
+        # Comment the following to keep the generated C file.
+        # Note: If there is a syntax error then compile raises an error
         # and the source file will not be deleted.
         os.unlink(filename)
 …
         self.dllpath = dllpath
         self._dll = None  # type: ct.CDLL
         self._kernels = None # type: List[Callable, Callable]
+        self._kernels = None  # type: List[Callable, Callable]
         self.dtype = np.dtype(dtype)
 …
         # type: (List[np.ndarray]) -> DllKernel
         q_input = PyInput(q_vectors, self.dtype)
         # Note: pickle not supported for DllKernel
+        # Note: DLL is lazy loaded.
         if self._dll is None:
             self._load_dll()
 …
         self._dll = None
 class DllKernel(Kernel):
     """
 …
     def __init__(self, kernel, model_info, q_input):
         # type: (Callable[[], np.ndarray], ModelInfo, PyInput) -> None
+        #,model_info,q_input)
+        dtype = q_input.dtype
+        self.q_input = q_input
         self.kernel = kernel
+        # Attributes accessed from the outside.
+        self.dim = '2d' if q_input.is_2d else '1d'
         self.info = model_info
+        self.q_input = q_input
+        self.dtype = q_input.dtype
+        self.dim = '2d' if q_input.is_2d else '1d'
+        # leave room for f1/f2 results in case we need to compute beta for 1d models
+        self.dtype = dtype
+        # Converter to translate input to target type.
+        self._as_dtype = (np.float32 if dtype == generate.F32
+                          else np.float64 if dtype == generate.F64
+                          else np.float128)
+        # Holding place for the returned value.
         nout = 2 if self.info.have_Fq else 1
         # +4 for total weight, shell volume, effective radius, form volume
         self.result = np.empty(q_input.nq*nout + 4, self.dtype)
+        self.real = (np.float32 if self.q_input.dtype == generate.F32
                      else np.float64 if self.q_input.dtype == generate.F64
                      else np.float128)
+    def _call_kernel(self, call_details, values, cutoff, magnetic, effective_radius_type):
         # type: (CallDetails, np.ndarray, np.ndarray, float, bool, int) -> np.ndarray
+        extra_q = 4  # Total weight, form volume, shell volume and R_eff.
+        self.result = np.empty(self.q_input.nq*nout + extra_q, dtype)
+    def _call_kernel(self, call_details, values, cutoff, magnetic,
+                     effective_radius_type):
+        # type: (CallDetails, np.ndarray, float, bool, int) -> np.ndarray
+        # Setup kernel function and arguments.
         kernel = self.kernel[1 if magnetic else 0]
         args = [
             self.q_input.nq, # nq
             None, # pd_start
             None, # pd_stop pd_stride[MAX_PD]
             call_details.buffer.ctypes.data, # problem
             values.ctypes.data,  # pars
             self.q_input.q.ctypes.data, # q
             self.result.ctypes.data,   # results
             self.real(cutoff), # cutoff
             effective_radius_type, # cutoff
+        kernel_args = [
+            self.q_input.nq,  # Number of inputs.
+            None,  # Placeholder for pd_start.
+            None,  # Placeholder for pd_stop.
+            call_details.buffer.ctypes.data,  # Problem definition.
+            values.ctypes.data,  # Parameter values.
+            self.q_input.q.ctypes.data,  # Q values.
+            self.result.ctypes.data,   # Result storage.
+            self._as_dtype(cutoff),  # Probability cutoff.
+            effective_radius_type,  # R_eff mode.
+        ]
+        # Call kernel and retrieve results.
         #print("Calling DLL")
         #call_details.show(values)
         step = 100
+        # TODO: Do we need the explicit sleep like the OpenCL and CUDA loops?
         for start in range(0, call_details.num_eval, step):
             stop = min(start + step, call_details.num_eval)
             args[1:3] = [start, stop]
             kernel(*args) # type: ignore
+            kernel_args[1:3] = [start, stop]
+            kernel(*kernel_args) # type: ignore
     def release(self):
         # type: () -> None
         """
         Release any resources associated with the kernel.
+        Release resources associated with the kernel.
         """
+        self.q_input.release()
+        # TODO: OpenCL/CUDA allocate q_input in __init__ and free it in release.
+        # Should we be doing the same for DLL?
+        #self.q_input.release()
+        pass
+    def __del__(self):
+        # type: () -> None
+        self.release()

sasmodels/kernelpy.py

-                      raa8c6e0
+                      r3199b17
 logger = logging.getLogger(__name__)
 class PyModel(KernelModel):
     """
 …
     """
     def __init__(self, model_info):
         # Make sure Iq is available and vectorized
+        # Make sure Iq is available and vectorized.
         _create_default_functions(model_info)
         self.info = model_info
 …
         """
         pass
 class PyInput(object):
 …
         self.q = None
 class PyKernel(Kernel):
     """
 …
         parameter_vector = np.empty(len(partable.call_parameters)-2, 'd')
         # Create views into the array to hold the arguments
+        # Create views into the array to hold the arguments.
         offset = 0
         kernel_args, volume_args = [], []
 …
                         else (lambda mode: 1.0))
     def _call_kernel(self, call_details, values, cutoff, magnetic, effective_radius_type):
         # type: (CallDetails, np.ndarray, float, bool, int) -> np.ndarray
 …
         self.q_input.release()
         self.q_input = None
 def _loops(parameters,    # type: np.ndarray
 …
         total = np.zeros(nq, 'd')
         for loop_index in range(call_details.num_eval):
             # update polydispersity parameter values
+            # Update polydispersity parameter values.
             if p0_index == p0_length:
                 pd_index = (loop_index//pd_stride)%pd_length
 …
             p0_index += 1
             if weight > cutoff:
                 # Call the scattering function
+                # Call the scattering function.
                 # Assume that NaNs are only generated if the parameters are bad;
                 # exclude all q for that NaN.  Even better would be to have an
 …
                     continue
                 # update value and norm
+                # Update value and norm.
                 total += weight * Iq
                 weight_norm += weight
 …
     any functions that are not already marked as vectorized.
     """
     # Note: must call create_vector_Iq before create_vector_Iqxy
+    # Note: Must call create_vector_Iq before create_vector_Iqxy.
     _create_vector_Iq(model_info)
     _create_vector_Iqxy(model_info)

Note: See TracChangeset for help on using the changeset viewer.

Download in other formats: