Changes in sasmodels/kernelcuda.py [f872fd1:fa26e78] in sasmodels


  • sasmodels/kernelcuda.py

    --- f872fd1
    +++ fa26e78
     import time
     import re
    +import atexit

     import numpy as np  # type: ignore


    -# Attempt to setup cuda. This may fail if the pycuda package is not
    +# Attempt to setup CUDA. This may fail if the pycuda package is not
     # installed or if it is installed but there are no devices available.
     try:
    …
     MAX_LOOPS = 2048

    +
     def use_cuda():
    -    env = os.environ.get("SAS_OPENCL", "").lower()
    -    return HAVE_CUDA and (env == "" or env.startswith("cuda"))
    +    sas_opencl = os.environ.get("SAS_OPENCL", "CUDA").lower()
    +    return HAVE_CUDA and sas_opencl.startswith("cuda")
    +

     ENV = None
    …
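
The rewritten use_cuda() changes the default: previously an unset SAS_OPENCL fell
through to CUDA only because the empty string was special-cased, whereas now the
lookup defaults to "CUDA", so CUDA is preferred whenever pycuda initialized
successfully. A minimal sketch of the decision logic, with HAVE_CUDA stubbed in
for illustration:

    import os

    HAVE_CUDA = True  # Stub; kernelcuda.py sets this from the pycuda import attempt.

    def use_cuda():
        # An unset SAS_OPENCL defaults to "CUDA", so CUDA wins when available.
        sas_opencl = os.environ.get("SAS_OPENCL", "CUDA").lower()
        return HAVE_CUDA and sas_opencl.startswith("cuda")

    for value in (None, "cuda", "cuda:1", "opencl", "none"):
        if value is None:
            os.environ.pop("SAS_OPENCL", None)
        else:
            os.environ["SAS_OPENCL"] = value
        print(value, "->", use_cuda())
    # Unset, "cuda" and "cuda:1" select CUDA; "opencl" and "none" do not.
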
             ENV.release()
         ENV = GpuEnvironment() if use_cuda() else None
    +

     def environment():
    …
         return ENV

    +
    +# PyTest is not freeing ENV, so make sure it gets freed.
    +atexit.register(lambda: ENV.release() if ENV is not None else None)
    +
    +
     def has_type(dtype):
         # type: (np.dtype) -> bool
    …
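
The exit hook is registered through a lambda rather than passing ENV.release
directly: ENV is rebound by reset_environment(), so the lambda defers the lookup
to interpreter exit and skips the call if no environment was ever created. A
self-contained illustration of the same pattern (Env and reset_environment are
stand-ins for the real classes):

    import atexit

    ENV = None

    class Env:
        def release(self):
            print("released")

    def reset_environment():
        global ENV
        ENV = Env()

    # Registering a lambda means the *current* value of ENV is looked up at
    # exit time, after any rebinding, and a never-created ENV is skipped.
    atexit.register(lambda: ENV.release() if ENV is not None else None)

    reset_environment()  # On normal interpreter exit this prints "released".
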
         Return true if device supports the requested precision.
         """
    -    # Assume the nvidia card supports 32-bit and 64-bit floats.
    -    # TODO: check if pycuda support F16
    +    # Assume the NVIDIA card supports 32-bit and 64-bit floats.
    +    # TODO: Check if pycuda support F16.
         return dtype in (generate.F32, generate.F64)


     FUNCTION_PATTERN = re.compile(r"""^
    -  (?P<space>\s*)                   # initial space
    -  (?P<qualifiers>^(?:\s*\b\w+\b\s*)+) # one or more qualifiers before function
    -  (?P<function>\s*\b\w+\b\s*[(])      # function name plus open parens
    +  (?P<space>\s*)                       # Initial space.
    +  (?P<qualifiers>^(?:\s*\b\w+\b\s*)+)  # One or more qualifiers before function.
    +  (?P<function>\s*\b\w+\b\s*[(])       # Function name plus open parens.
       """, re.VERBOSE|re.MULTILINE)

    …
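
FUNCTION_PATTERN drives mark_device_functions(), which prefixes function
definitions in the generated model source with __device__ so they compile as
GPU-callable helpers under nvcc. A runnable sketch of the tagging step, using
the pattern as it appears above; the stand-in _add_device_tag below always adds
the tag, while the real one first checks MARKED_PATTERN so already-qualified
declarations are left alone:

    import re

    FUNCTION_PATTERN = re.compile(r"""^
      (?P<space>\s*)                       # Initial space.
      (?P<qualifiers>^(?:\s*\b\w+\b\s*)+)  # One or more qualifiers before function.
      (?P<function>\s*\b\w+\b\s*[(])       # Function name plus open parens.
      """, re.VERBOSE|re.MULTILINE)

    def _add_device_tag(match):
        # Simplified: unconditionally insert __device__ before the qualifiers.
        return "".join((match.group('space'), "__device__ ",
                        match.group('qualifiers'), match.group('function')))

    source = "static double square(double x) {\n    return x*x;\n}\n"
    print(FUNCTION_PATTERN.sub(_add_device_tag, source))
    # -> __device__ static double square(double x) { ...
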
       """, re.VERBOSE|re.MULTILINE)

    +
     def _add_device_tag(match):
         # type: (None) -> str
    -    # Note: should be re.Match, but that isn't a simple type
    +    # Note: Should be re.Match, but that isn't a simple type.
         """
         replace qualifiers with __device__ qualifiers if needed
    …
             return "".join((space, "__device__ ", qualifiers, function))

    +
     def mark_device_functions(source):
         # type: (str) -> str
    …
         """
         return FUNCTION_PATTERN.sub(_add_device_tag, source)
    +

     def show_device_functions(source):
    …
             print(match.group('qualifiers').replace('\n',r'\n'), match.group('function'), '(')
         return source
    +

     def compile_model(source, dtype, fast=False):
    …
         #options = ['--verbose', '-E']
         options = ['--use_fast_math'] if fast else None
    -    program = SourceModule(source, no_extern_c=True, options=options) # include_dirs=[...]
    +    program = SourceModule(source, no_extern_c=True, options=options) #, include_dirs=[...])

         #print("done with "+program)
    …
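
compile_model() hands the marked-up source to pycuda's SourceModule with
no_extern_c=True, since the generated source declares its own extern "C"
linkage, and passes --use_fast_math through to nvcc when fast is requested. A
minimal standalone sketch of the same call, assuming a working pycuda install
and a CUDA device, with a trivial kernel standing in for the generated model
source:

    import numpy as np
    import pycuda.autoinit  # Creates a default context on import.
    import pycuda.driver as cuda
    from pycuda.compiler import SourceModule

    source = """
    extern "C" __global__ void scale(int n, double *x)
    {
        int i = blockDim.x*blockIdx.x + threadIdx.x;
        if (i < n) x[i] *= 2.0;
    }
    """
    # no_extern_c=True stops pycuda wrapping the source in extern "C" again.
    program = SourceModule(source, no_extern_c=True, options=['--use_fast_math'])
    kernel = program.get_function("scale")

    x = np.arange(8, dtype=np.float64)
    x_b = cuda.to_device(x)
    kernel(np.int32(8), x_b, block=(32, 1, 1), grid=(1, 1))
    print(cuda.from_device(x_b, x.shape, x.dtype))  # [0. 2. 4. ... 14.]
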


    -# for now, this returns one device in the context
    -# TODO: create a context that contains all devices on all platforms
    +# For now, this returns one device in the context.
    +# TODO: Create a context that contains all devices on all platforms.
     class GpuEnvironment(object):
         """
    -    GPU context, with possibly many devices, and one queue per device.
    +    GPU context for CUDA.
         """
         context = None # type: cuda.Context
         def __init__(self, devnum=None):
             # type: (int) -> None
    -        # Byte boundary for data alignment
    -        #self.data_boundary = max(d.min_data_type_align_size
    -        #                         for d in self.context.devices)
    -        self.compiled = {}
             env = os.environ.get("SAS_OPENCL", "").lower()
             if devnum is None and env.startswith("cuda:"):
                 devnum = int(env[5:])
    +
             # Set the global context to the particular device number if one is
             # given, otherwise use the default context.  Perhaps this will be set
    …
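
GpuEnvironment still honors a device index embedded in the environment
variable: SAS_OPENCL=cuda:1 selects device 1, while a bare "cuda" (or an unset
variable, given the new use_cuda() default) falls through to pycuda's default
context. A tiny sketch of just the parsing, no GPU required:

    import os

    def parse_devnum(devnum=None):
        # "cuda:<n>" selects an explicit device; anything else leaves devnum alone.
        env = os.environ.get("SAS_OPENCL", "").lower()
        if devnum is None and env.startswith("cuda:"):
            devnum = int(env[5:])  # The text after the "cuda:" prefix.
        return devnum

    os.environ["SAS_OPENCL"] = "cuda:1"
    print(parse_devnum())  # -> 1
    os.environ["SAS_OPENCL"] = "cuda"
    print(parse_devnum())  # -> None, i.e. use the default context
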
                 self.context = make_default_context()

    +        ## Byte boundary for data alignment.
    +        #self.data_boundary = max(d.min_data_type_align_size
    +        #                         for d in self.context.devices)
    +
    +        # Cache for compiled programs, and for items in context.
    +        self.compiled = {}
    +
         def release(self):
             if self.context is not None:
    …
             Compile the program for the device in the given context.
             """
    -        # Note: PyOpenCL caches based on md5 hash of source, options and device
    -        # so we don't really need to cache things for ourselves.  I'll do so
    -        # anyway just to save some data munging time.
    +        # Note: PyCuda (probably) caches but I'll do so as well just to
    +        # save some data munging time.
             tag = generate.tag_source(source)
             key = "%s-%s-%s%s"%(name, dtype, tag, ("-fast" if fast else ""))
    -        # Check timestamp on program
    +        # Check timestamp on program.
             program, program_timestamp = self.compiled.get(key, (None, np.inf))
             if program_timestamp < timestamp:
    …
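
The cache key combines model name, dtype, a tag of the source, and the fast
flag; the stored timestamp invalidates an entry when the model source is newer
than the cached build. A minimal sketch of the same recompile-if-stale pattern,
with a string standing in for the compiled program (the elided lines presumably
compile whenever no valid cache entry survives):

    import numpy as np

    class ProgramCache:
        def __init__(self):
            self.compiled = {}

        def compile_program(self, key, timestamp):
            # The default timestamp of +inf never triggers invalidation, but
            # the default program of None forces a compile on first use.
            program, program_timestamp = self.compiled.get(key, (None, np.inf))
            if program_timestamp < timestamp:
                program = None  # Source is newer than the build; discard it.
            if program is None:
                program = "compiled:" + key  # Stand-in for real compilation.
                self.compiled[key] = program, timestamp
            return program

    cache = ProgramCache()
    print(cache.compile_program("cylinder-f32", timestamp=100))  # Compiles.
    print(cache.compile_program("cylinder-f32", timestamp=100))  # Cache hit.
    print(cache.compile_program("cylinder-f32", timestamp=200))  # Recompiles.
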
         return program

    +
     class GpuModel(KernelModel):
         """
    …
         that the compiler is allowed to take shortcuts.
         """
    -    info = None # type: ModelInfo
    -    source = "" # type: str
    -    dtype = None # type: np.dtype
    -    fast = False # type: bool
    -    program = None # type: SourceModule
    -    _kernels = None # type: List[cuda.Function]
    +    info = None  # type: ModelInfo
    +    source = ""  # type: str
    +    dtype = None  # type: np.dtype
    +    fast = False  # type: bool
    +    _program = None # type: SourceModule
    +    _kernels = None  # type: Dict[str, cuda.Function]

         def __init__(self, source, model_info, dtype=generate.F32, fast=False):
    …
             self.dtype = dtype
             self.fast = fast
    -        self.program = None # delay program creation
    -        self._kernels = None

         def __getstate__(self):
    …
             # type: (Tuple[ModelInfo, str, np.dtype, bool]) -> None
             self.info, self.source, self.dtype, self.fast = state
    -        self.program = None
    +        self._program = self._kernels = None

         def make_kernel(self, q_vectors):
             # type: (List[np.ndarray]) -> "GpuKernel"
    -        if self.program is None:
    -            compile_program = environment().compile_program
    -            timestamp = generate.ocl_timestamp(self.info)
    -            self.program = compile_program(
    -                self.info.name,
    -                self.source['opencl'],
    -                self.dtype,
    -                self.fast,
    -                timestamp)
    -            variants = ['Iq', 'Iqxy', 'Imagnetic']
    -            names = [generate.kernel_name(self.info, k) for k in variants]
    -            kernels = [self.program.get_function(k) for k in names]
    -            self._kernels = dict((k, v) for k, v in zip(variants, kernels))
    -        is_2d = len(q_vectors) == 2
    -        if is_2d:
    -            kernel = [self._kernels['Iqxy'], self._kernels['Imagnetic']]
    -        else:
    -            kernel = [self._kernels['Iq']]*2
    -        return GpuKernel(kernel, self.dtype, self.info, q_vectors)
    -
    -    def release(self):
    -        # type: () -> None
    -        """
    -        Free the resources associated with the model.
    -        """
    -        if self.program is not None:
    -            self.program = None
    -
    -    def __del__(self):
    -        # type: () -> None
    -        self.release()
    -
    -# TODO: check that we don't need a destructor for buffers which go out of scope
    +        return GpuKernel(self, q_vectors)
    +
    +    def get_function(self, name):
    +        # type: (str) -> cuda.Function
    +        """
    +        Fetch the kernel from the environment by name, compiling it if it
    +        does not already exist.
    +        """
    +        if self._program is None:
    +            self._prepare_program()
    +        return self._kernels[name]
    +
    +    def _prepare_program(self):
    +        # type: (str) -> None
    +        env = environment()
    +        timestamp = generate.ocl_timestamp(self.info)
    +        program = env.compile_program(
    +            self.info.name,
    +            self.source['opencl'],
    +            self.dtype,
    +            self.fast,
    +            timestamp)
    +        variants = ['Iq', 'Iqxy', 'Imagnetic']
    +        names = [generate.kernel_name(self.info, k) for k in variants]
    +        functions = [program.get_function(k) for k in names]
    +        self._kernels = {k: v for k, v in zip(variants, functions)}
    +        # Keep a handle to program so GC doesn't collect.
    +        self._program = program
    +
    +
    +# TODO: Check that we don't need a destructor for buffers which go out of scope.
     class GpuInput(object):
         """
    …
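
The refactor inverts the old flow: instead of make_kernel() compiling the
program and handing raw cuda.Function handles to GpuKernel, the kernel now
keeps a reference to its GpuModel and requests 'Iq', 'Iqxy' or 'Imagnetic' by
name at call time, with get_function() compiling the whole program lazily on
first use. A stripped-down sketch of the lazy lookup (the compile step and the
variant table are stand-ins):

    class Model:
        _program = None  # Compiled module, created on first use.
        _kernels = None  # Map from variant name to callable.

        def _prepare_program(self):
            # Stand-in for the real compile + get_function calls.
            self._program = object()
            self._kernels = {k: (lambda k=k: "run " + k)
                             for k in ('Iq', 'Iqxy', 'Imagnetic')}

        def get_function(self, name):
            if self._program is None:
                self._prepare_program()  # Compile once, on demand.
            return self._kernels[name]

    model = Model()
    print(model.get_function('Iqxy')())  # Triggers the one-time compile.
    print(model.get_function('Iq')())    # Served from the cached table.
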
         def __init__(self, q_vectors, dtype=generate.F32):
             # type: (List[np.ndarray], np.dtype) -> None
    -        # TODO: do we ever need double precision q?
    +        # TODO: Do we ever need double precision q?
             self.nq = q_vectors[0].size
             self.dtype = np.dtype(dtype)
             self.is_2d = (len(q_vectors) == 2)
    -        # TODO: stretch input based on get_warp()
    -        # not doing it now since warp depends on kernel, which is not known
    +        # TODO: Stretch input based on get_warp().
    +        # Not doing it now since warp depends on kernel, which is not known
             # at this point, so instead using 32, which is good on the set of
             # architectures tested so far.
             if self.is_2d:
    -            # Note: 16 rather than 15 because result is 1 longer than input.
    -            width = ((self.nq+16)//16)*16
    +            width = ((self.nq+15)//16)*16
                 self.q = np.empty((width, 2), dtype=dtype)
                 self.q[:self.nq, 0] = q_vectors[0]
                 self.q[:self.nq, 1] = q_vectors[1]
             else:
    -            # Note: 32 rather than 31 because result is 1 longer than input.
    -            width = ((self.nq+32)//32)*32
    +            width = ((self.nq+31)//32)*32
                 self.q = np.empty(width, dtype=dtype)
                 self.q[:self.nq] = q_vectors[0]
             self.global_size = [self.q.shape[0]]
             #print("creating inputs of size", self.global_size)
    +
    +        # Transfer input value to GPU.
             self.q_b = cuda.to_device(self.q)

    …
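
The width computation switches to the standard round-up-to-multiple form
((nq+15)//16)*16 and ((nq+31)//32)*32. The old +16/+32 form was deliberate at
the time, per the removed note, because the result vector was one element
longer than the input; results now live in a separately padded buffer in
GpuKernel, so q only needs rounding up to a whole block. A quick check of the
two formulas:

    def old_width(nq, block):
        return ((nq + block)//block)*block      # Always leaves at least one spare slot.

    def new_width(nq, block):
        return ((nq + block - 1)//block)*block  # Round up to a multiple of block.

    for nq in (31, 32, 33):
        print(nq, old_width(nq, 32), new_width(nq, 32))
    # 31 -> old 32, new 32
    # 32 -> old 64, new 32  (the old form padded a full extra warp)
    # 33 -> old 64, new 64
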
             # type: () -> None
             """
    -        Free the memory.
    +        Free the buffer associated with the q value.
             """
             if self.q_b is not None:
    …
             self.release()

    +
     class GpuKernel(Kernel):
         """
         Callable SAS kernel.

    -    *kernel* is the GpuKernel object to call
    -
    -    *model_info* is the module information
    -
    -    *q_vectors* is the q vectors at which the kernel should be evaluated
    -
    -    *dtype* is the kernel precision
    -
    -    The resulting call method takes the *pars*, a list of values for
    -    the fixed parameters to the kernel, and *pd_pars*, a list of (value,weight)
    -    vectors for the polydisperse parameters.  *cutoff* determines the
    -    integration limits: any points with combined weight less than *cutoff*
    -    will not be calculated.
    +    *model* is the GpuModel object to call
    +
    +    The kernel is derived from :class:`Kernel`, providing the
    +    :meth:`call_kernel` method to evaluate the kernel for a given set of
    +    parameters.  Because of the need to move the q values to the GPU before
    +    evaluation, the kernel is instantiated for a particular set of q vectors,
    +    and can be called many times without transfering q each time.

         Call :meth:`release` when done with the kernel instance.
         """
    -    def __init__(self, kernel, dtype, model_info, q_vectors):
    -        # type: (cl.Kernel, np.dtype, ModelInfo, List[np.ndarray]) -> None
    +    #: SAS model information structure.
    +    info = None  # type: ModelInfo
    +    #: Kernel precision.
    +    dtype = None  # type: np.dtype
    +    #: Kernel dimensions (1d or 2d).
    +    dim = ""  # type: str
    +    #: Calculation results, updated after each call to :meth:`_call_kernel`.
    +    result = None  # type: np.ndarray
    +
    +    def __init__(self, model, q_vectors):
    +        # type: (GpuModel, List[np.ndarray]) -> None
    +        dtype = model.dtype
             self.q_input = GpuInput(q_vectors, dtype)
    -        self.kernel = kernel
    -        # F16 isn't sufficient, so don't support it
    +        self._model = model
    +
    +        # Attributes accessed from the outside.
    +        self.dim = '2d' if self.q_input.is_2d else '1d'
    +        self.info = model.info
    +        self.dtype = dtype
    +
    +        # Converter to translate input to target type.
             self._as_dtype = np.float64 if dtype == generate.F64 else np.float32

    -        # attributes accessed from the outside
    -        self.dim = '2d' if self.q_input.is_2d else '1d'
    -        self.info = model_info
    -        self.dtype = dtype
    -
    -        # holding place for the returned value
    +        # Holding place for the returned value.
             nout = 2 if self.info.have_Fq and self.dim == '1d' else 1
    -        extra_q = 4  # total weight, form volume, shell volume and R_eff
    -        self.result = np.empty(self.q_input.nq*nout+extra_q, dtype)
    -
    -        # Inputs and outputs for each kernel call
    -        # Note: res may be shorter than res_b if global_size != nq
    +        extra_q = 4  # Total weight, form volume, shell volume and R_eff.
    +        self.result = np.empty(self.q_input.nq*nout + extra_q, dtype)
    +
    +        # Allocate result value on GPU.
             width = ((self.result.size+31)//32)*32 * self.dtype.itemsize
    -        self.result_b = cuda.mem_alloc(width)
    -        self._need_release = [self.result_b]
    -
    -    def _call_kernel(self, call_details, values, cutoff, magnetic, effective_radius_type):
    -        # type: (CallDetails, np.ndarray, np.ndarray, float, bool) -> np.ndarray
    -        # Arrange data transfer to card
    +        self._result_b = cuda.mem_alloc(width)
    +
    +    def _call_kernel(self, call_details, values, cutoff, magnetic,
    +                     effective_radius_type):
    +        # type: (CallDetails, np.ndarray, float, bool, int) -> np.ndarray
    +
    +        # Arrange data transfer to card.
             details_b = cuda.to_device(call_details.buffer)
             values_b = cuda.to_device(values)

    -        kernel = self.kernel[1 if magnetic else 0]
    -        args = [
    -            np.uint32(self.q_input.nq), None, None,
    -            details_b, values_b, self.q_input.q_b, self.result_b,
    -            self._as_dtype(cutoff),
    -            np.uint32(effective_radius_type),
    +        # Setup kernel function and arguments.
    +        name = 'Iq' if self.dim == '1d' else 'Imagnetic' if magnetic else 'Iqxy'
    +        kernel = self._model.get_function(name)
    +        kernel_args = [
    +            np.uint32(self.q_input.nq),  # Number of inputs.
    +            None,  # Placeholder for pd_start.
    +            None,  # Placeholder for pd_stop.
    +            details_b,  # Problem definition.
    +            values_b,  # Parameter values.
    +            self.q_input.q_b,  # Q values.
    +            self._result_b,   # Result storage.
    +            self._as_dtype(cutoff),  # Probability cutoff.
    +            np.uint32(effective_radius_type),  # R_eff mode.
             ]
             grid = partition(self.q_input.nq)
    -        #print("Calling OpenCL")
    +
    +        # Call kernel and retrieve results.
    +        #print("Calling CUDA")
             #call_details.show(values)
    -        # Call kernel and retrieve results
             last_nap = time.clock()
             step = 100000000//self.q_input.nq + 1
    …
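
The evaluation loop splits the polydispersity sum into chunks of roughly 1e8
point evaluations per launch (step = 100000000//nq + 1), synchronizing between
chunks so a single launch never runs long enough to freeze the display or trip
a watchdog. A sketch of just the chunking arithmetic:

    def chunks(nq, num_eval):
        # Cap each launch near 1e8 evaluations; each of num_eval polydispersity
        # points is evaluated at all nq q values.
        step = 100000000//nq + 1
        for start in range(0, num_eval, step):
            yield start, min(start + step, num_eval)

    print(list(chunks(nq=1000, num_eval=250000)))
    # [(0, 100001), (100001, 200002), (200002, 250000)]
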
                 stop = min(start + step, call_details.num_eval)
                 #print("queuing",start,stop)
    -            args[1:3] = [np.int32(start), np.int32(stop)]
    -            kernel(*args, **grid)
    +            kernel_args[1:3] = [np.int32(start), np.int32(stop)]
    +            kernel(*kernel_args, **grid)
                 if stop < call_details.num_eval:
                     sync()
    -                # Allow other processes to run
    +                # Allow other processes to run.
                     current_time = time.clock()
                     if current_time - last_nap > 0.5:
    …
                         last_nap = current_time
             sync()
    -        cuda.memcpy_dtoh(self.result, self.result_b)
    +        cuda.memcpy_dtoh(self.result, self._result_b)
             #print("result", self.result)

    …
             Release resources associated with the kernel.
             """
    -        for p in self._need_release:
    -            p.free()
    -        self._need_release = []
    +        self.q_input.release()
    +        if self._result_b is not None:
    +            self._result_b.free()
    +            self._result_b = None

         def __del__(self):
    …
         Note: Maybe context.synchronize() is sufficient.
         """
    -    #return # The following works in C++; don't know what pycuda is doing
    -    # Create an event with which to synchronize
    +    # Create an event with which to synchronize.
         done = cuda.Event()

    …
         done.record()

    -    #line added to not hog resources
    +    # Make sure we don't hog resource while waiting to sync.
         while not done.query():
             time.sleep(0.01)
    …
         # Block until the GPU executes the kernel.
         done.synchronize()
    +
         # Clean up the event; I don't think they can be reused.
         del done
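
sync() records a CUDA event behind the queued work and polls it with short
sleeps, so the CPU yields to other processes while the GPU finishes, before a
final blocking synchronize. A standalone sketch of the same event-based wait,
assuming a working pycuda installation:

    import time
    import pycuda.autoinit  # Default context.
    import pycuda.driver as cuda

    def sync():
        done = cuda.Event()
        done.record()            # Enqueue the event after pending GPU work.
        while not done.query():  # Poll without busy-waiting...
            time.sleep(0.01)     # ...yielding the CPU between checks.
        done.synchronize()       # Guarantee completion before returning.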