Changes in sasmodels/kernelcuda.py [fa26e78:f872fd1] in sasmodels


  • sasmodels/kernelcuda.py

--- rfa26e78
+++ rf872fd1
@@ -63,10 +63,9 @@
 import time
 import re
-import atexit
 
 import numpy as np  # type: ignore
 
 
-# Attempt to setup CUDA. This may fail if the pycuda package is not
+# Attempt to setup cuda. This may fail if the pycuda package is not
 # installed or if it is installed but there are no devices available.
 try:
     
@@ -108,9 +107,7 @@
 MAX_LOOPS = 2048
 
-
 def use_cuda():
-    sas_opencl = os.environ.get("SAS_OPENCL", "CUDA").lower()
-    return HAVE_CUDA and sas_opencl.startswith("cuda")
-
+    env = os.environ.get("SAS_OPENCL", "").lower()
+    return HAVE_CUDA and (env == "" or env.startswith("cuda"))
 
 ENV = None
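
Both versions of use_cuda() default to CUDA when SAS_OPENCL is unset; the new form just spells the default out as an empty-string check. A minimal standalone sketch of the selection logic (HAVE_CUDA and the env values mirror the diff; the assertions are illustrative only):

    import os

    HAVE_CUDA = True  # assume the pycuda import above succeeded

    def use_cuda():
        env = os.environ.get("SAS_OPENCL", "").lower()
        # CUDA is chosen when SAS_OPENCL is unset or starts with "cuda",
        # e.g. "cuda" or "cuda:1" to pin a device (see GpuEnvironment below).
        return HAVE_CUDA and (env == "" or env.startswith("cuda"))

    os.environ["SAS_OPENCL"] = "cuda:1"
    assert use_cuda()
    os.environ["SAS_OPENCL"] = "opencl"  # defer to the OpenCL path instead
    assert not use_cuda()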
     
@@ -124,5 +121,4 @@
         ENV.release()
     ENV = GpuEnvironment() if use_cuda() else None
-
 
 def environment():
     
@@ -142,9 +138,4 @@
     return ENV
 
-
-# PyTest is not freeing ENV, so make sure it gets freed.
-atexit.register(lambda: ENV.release() if ENV is not None else None)
-
-
 def has_type(dtype):
     # type: (np.dtype) -> bool
     
@@ -152,13 +143,13 @@
     Return true if device supports the requested precision.
     """
-    # Assume the NVIDIA card supports 32-bit and 64-bit floats.
-    # TODO: Check if pycuda support F16.
+    # Assume the nvidia card supports 32-bit and 64-bit floats.
+    # TODO: check if pycuda support F16
     return dtype in (generate.F32, generate.F64)
 
 
 FUNCTION_PATTERN = re.compile(r"""^
-  (?P<space>\s*)                       # Initial space.
-  (?P<qualifiers>^(?:\s*\b\w+\b\s*)+)  # One or more qualifiers before function.
-  (?P<function>\s*\b\w+\b\s*[(])       # Function name plus open parens.
+  (?P<space>\s*)                   # initial space
+  (?P<qualifiers>^(?:\s*\b\w+\b\s*)+) # one or more qualifiers before function
+  (?P<function>\s*\b\w+\b\s*[(])      # function name plus open parens
   """, re.VERBOSE|re.MULTILINE)
 
     
@@ -167,8 +158,7 @@
   """, re.VERBOSE|re.MULTILINE)
 
-
 def _add_device_tag(match):
     # type: (None) -> str
-    # Note: Should be re.Match, but that isn't a simple type.
+    # Note: should be re.Match, but that isn't a simple type
     """
     replace qualifiers with __device__ qualifiers if needed
     
@@ -183,5 +173,4 @@
         return "".join((space, "__device__ ", qualifiers, function))
 
-
 def mark_device_functions(source):
     # type: (str) -> str
     
@@ -190,5 +179,4 @@
     """
     return FUNCTION_PATTERN.sub(_add_device_tag, source)
-
 
 def show_device_functions(source):
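
To see what mark_device_functions is for: FUNCTION_PATTERN matches each function definition in the generated model source, and _add_device_tag prefixes it with CUDA's __device__ qualifier where needed, so nvcc compiles it as GPU-callable code. A small illustrative run (hypothetical input; the expected result is shown as a comment):

    src = "static double square(double x) {\n    return x*x;\n}\n"
    print(mark_device_functions(src))
    # static double square(double x) {   would become
    # __device__ static double square(double x) {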
     
@@ -200,5 +188,4 @@
         print(match.group('qualifiers').replace('\n',r'\n'), match.group('function'), '(')
     return source
-
 
 def compile_model(source, dtype, fast=False):
     
@@ -225,5 +212,5 @@
     #options = ['--verbose', '-E']
     options = ['--use_fast_math'] if fast else None
-    program = SourceModule(source, no_extern_c=True, options=options) #, include_dirs=[...])
+    program = SourceModule(source, no_extern_c=True, options=options) # include_dirs=[...]
 
     #print("done with "+program)
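
The SourceModule call above is the standard pycuda compile path. A self-contained sketch of the same pattern outside sasmodels (toy kernel and names are hypothetical; the real code also passes no_extern_c=True, presumably because the generated source manages its own extern "C" linkage):

    import numpy as np
    import pycuda.autoinit  # noqa: F401  (creates a default context)
    import pycuda.driver as cuda
    from pycuda.compiler import SourceModule

    mod = SourceModule("""
    __global__ void scale(float *x, float a)
    {
        const int i = threadIdx.x + blockIdx.x*blockDim.x;
        x[i] *= a;
    }
    """, options=['--use_fast_math'])
    scale = mod.get_function("scale")

    x = np.arange(32, dtype=np.float32)
    scale(cuda.InOut(x), np.float32(2.0), block=(32, 1, 1), grid=(1, 1))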
     
@@ -231,17 +218,20 @@
 
 
-# For now, this returns one device in the context.
-# TODO: Create a context that contains all devices on all platforms.
+# for now, this returns one device in the context
+# TODO: create a context that contains all devices on all platforms
 class GpuEnvironment(object):
     """
-    GPU context for CUDA.
+    GPU context, with possibly many devices, and one queue per device.
     """
     context = None # type: cuda.Context
     def __init__(self, devnum=None):
         # type: (int) -> None
+        # Byte boundary for data alignment
+        #self.data_boundary = max(d.min_data_type_align_size
+        #                         for d in self.context.devices)
+        self.compiled = {}
         env = os.environ.get("SAS_OPENCL", "").lower()
         if devnum is None and env.startswith("cuda:"):
             devnum = int(env[5:])
-
         # Set the global context to the particular device number if one is
         # given, otherwise use the default context.  Perhaps this will be set
     
@@ -252,11 +242,4 @@
             self.context = make_default_context()
 
-        ## Byte boundary for data alignment.
-        #self.data_boundary = max(d.min_data_type_align_size
-        #                         for d in self.context.devices)
-
-        # Cache for compiled programs, and for items in context.
-        self.compiled = {}
-
     def release(self):
         if self.context is not None:
     
@@ -279,9 +262,10 @@
         Compile the program for the device in the given context.
         """
-        # Note: PyCuda (probably) caches but I'll do so as well just to
-        # save some data munging time.
+        # Note: PyOpenCL caches based on md5 hash of source, options and device
+        # so we don't really need to cache things for ourselves.  I'll do so
+        # anyway just to save some data munging time.
         tag = generate.tag_source(source)
         key = "%s-%s-%s%s"%(name, dtype, tag, ("-fast" if fast else ""))
-        # Check timestamp on program.
+        # Check timestamp on program
         program, program_timestamp = self.compiled.get(key, (None, np.inf))
         if program_timestamp < timestamp:
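
The cache above keys on model name, dtype, source tag, and the fast-math flag, and invalidates whenever the model source is newer than the cached program. The same pattern in isolation (compile_fn is a hypothetical stand-in; np.inf marks "not cached yet", which fails the staleness test but leaves program as None so it still compiles):

    import numpy as np

    compiled = {}  # key -> (program, timestamp)

    def cached_program(key, timestamp, compile_fn):
        program, program_timestamp = compiled.get(key, (None, np.inf))
        if program_timestamp < timestamp:
            program = None  # source changed after compilation; discard
        if program is None:
            program = compile_fn()
            compiled[key] = (program, timestamp)
        return program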
     
@@ -293,5 +277,4 @@
         return program
 
-
 class GpuModel(KernelModel):
     """
     
@@ -309,10 +292,10 @@
     that the compiler is allowed to take shortcuts.
     """
-    info = None  # type: ModelInfo
-    source = ""  # type: str
-    dtype = None  # type: np.dtype
-    fast = False  # type: bool
-    _program = None # type: SourceModule
-    _kernels = None  # type: Dict[str, cuda.Function]
+    info = None # type: ModelInfo
+    source = "" # type: str
+    dtype = None # type: np.dtype
+    fast = False # type: bool
+    program = None # type: SourceModule
+    _kernels = None # type: List[cuda.Function]
 
     def __init__(self, source, model_info, dtype=generate.F32, fast=False):
     
@@ -322,4 +305,6 @@
         self.dtype = dtype
         self.fast = fast
+        self.program = None # delay program creation
+        self._kernels = None
 
     def __getstate__(self):
     
@@ -330,39 +315,41 @@
         # type: (Tuple[ModelInfo, str, np.dtype, bool]) -> None
         self.info, self.source, self.dtype, self.fast = state
-        self._program = self._kernels = None
+        self.program = None
 
     def make_kernel(self, q_vectors):
         # type: (List[np.ndarray]) -> "GpuKernel"
-        return GpuKernel(self, q_vectors)
-
-    def get_function(self, name):
-        # type: (str) -> cuda.Function
-        """
-        Fetch the kernel from the environment by name, compiling it if it
-        does not already exist.
-        """
-        if self._program is None:
-            self._prepare_program()
-        return self._kernels[name]
-
-    def _prepare_program(self):
-        # type: (str) -> None
-        env = environment()
-        timestamp = generate.ocl_timestamp(self.info)
-        program = env.compile_program(
-            self.info.name,
-            self.source['opencl'],
-            self.dtype,
-            self.fast,
-            timestamp)
-        variants = ['Iq', 'Iqxy', 'Imagnetic']
-        names = [generate.kernel_name(self.info, k) for k in variants]
-        functions = [program.get_function(k) for k in names]
-        self._kernels = {k: v for k, v in zip(variants, functions)}
-        # Keep a handle to program so GC doesn't collect.
-        self._program = program
-
-
-# TODO: Check that we don't need a destructor for buffers which go out of scope.
+        if self.program is None:
+            compile_program = environment().compile_program
+            timestamp = generate.ocl_timestamp(self.info)
+            self.program = compile_program(
+                self.info.name,
+                self.source['opencl'],
+                self.dtype,
+                self.fast,
+                timestamp)
+            variants = ['Iq', 'Iqxy', 'Imagnetic']
+            names = [generate.kernel_name(self.info, k) for k in variants]
+            kernels = [self.program.get_function(k) for k in names]
+            self._kernels = dict((k, v) for k, v in zip(variants, kernels))
+        is_2d = len(q_vectors) == 2
+        if is_2d:
+            kernel = [self._kernels['Iqxy'], self._kernels['Imagnetic']]
+        else:
+            kernel = [self._kernels['Iq']]*2
+        return GpuKernel(kernel, self.dtype, self.info, q_vectors)
+
+    def release(self):
+        # type: () -> None
+        """
+        Free the resources associated with the model.
+        """
+        if self.program is not None:
+            self.program = None
+
+    def __del__(self):
+        # type: () -> None
+        self.release()
+
+# TODO: check that we don't need a destructor for buffers which go out of scope
 class GpuInput(object):
     """
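
A note on the two-element kernel list built in make_kernel above: _call_kernel later selects self.kernel[1 if magnetic else 0], so 2d data gets Iqxy in slot 0 and Imagnetic in slot 1, while 1d data stores the same Iq function in both slots and the magnetic flag becomes a no-op. The convention, condensed (placeholder strings instead of compiled functions):

    kernels = {'Iq': 'Iq_fn', 'Iqxy': 'Iqxy_fn', 'Imagnetic': 'Imagnetic_fn'}

    def pick(is_2d):
        return ([kernels['Iqxy'], kernels['Imagnetic']] if is_2d
                else [kernels['Iq']]*2)

    assert pick(False)[1] == 'Iq_fn'        # 1d: magnetic call falls back to Iq
    assert pick(True)[1] == 'Imagnetic_fn'  # 2d: magnetic call gets Imagnetic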
     
@@ -386,25 +373,25 @@
     def __init__(self, q_vectors, dtype=generate.F32):
         # type: (List[np.ndarray], np.dtype) -> None
-        # TODO: Do we ever need double precision q?
+        # TODO: do we ever need double precision q?
         self.nq = q_vectors[0].size
         self.dtype = np.dtype(dtype)
         self.is_2d = (len(q_vectors) == 2)
-        # TODO: Stretch input based on get_warp().
-        # Not doing it now since warp depends on kernel, which is not known
+        # TODO: stretch input based on get_warp()
+        # not doing it now since warp depends on kernel, which is not known
         # at this point, so instead using 32, which is good on the set of
         # architectures tested so far.
         if self.is_2d:
-            width = ((self.nq+15)//16)*16
+            # Note: 16 rather than 15 because result is 1 longer than input.
+            width = ((self.nq+16)//16)*16
             self.q = np.empty((width, 2), dtype=dtype)
             self.q[:self.nq, 0] = q_vectors[0]
             self.q[:self.nq, 1] = q_vectors[1]
         else:
-            width = ((self.nq+31)//32)*32
+            # Note: 32 rather than 31 because result is 1 longer than input.
+            width = ((self.nq+32)//32)*32
             self.q = np.empty(width, dtype=dtype)
             self.q[:self.nq] = q_vectors[0]
         self.global_size = [self.q.shape[0]]
         #print("creating inputs of size", self.global_size)
-
-        # Transfer input value to GPU.
         self.q_b = cuda.to_device(self.q)
 
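
The padding change is easiest to see numerically: the old formula rounds nq up to the nearest multiple of the block width, which leaves no spare slot when nq is already a multiple; the new formula always pads past nq, leaving room for the extra result element mentioned in the note. A worked check of the 1d branch:

    for nq in (1, 31, 32, 33, 64):
        old = ((nq+31)//32)*32   # old: round up to a multiple of 32
        new = ((nq+32)//32)*32   # new: round up, guaranteeing width > nq
        print(nq, old, new)
    # nq=32: old gives 32 (no spare slot), new gives 64 (width >= nq+1)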
     
@@ -412,5 +399,5 @@
         # type: () -> None
         """
-        Free the buffer associated with the q value.
+        Free the memory.
         """
         if self.q_b is not None:
     
@@ -422,78 +409,64 @@
         self.release()
 
-
 class GpuKernel(Kernel):
     """
     Callable SAS kernel.
 
-    *model* is the GpuModel object to call
-
-    The kernel is derived from :class:`Kernel`, providing the
-    :meth:`call_kernel` method to evaluate the kernel for a given set of
-    parameters.  Because of the need to move the q values to the GPU before
-    evaluation, the kernel is instantiated for a particular set of q vectors,
-    and can be called many times without transfering q each time.
+    *kernel* is the GpuKernel object to call
+
+    *model_info* is the module information
+
+    *q_vectors* is the q vectors at which the kernel should be evaluated
+
+    *dtype* is the kernel precision
+
+    The resulting call method takes the *pars*, a list of values for
+    the fixed parameters to the kernel, and *pd_pars*, a list of (value,weight)
+    vectors for the polydisperse parameters.  *cutoff* determines the
+    integration limits: any points with combined weight less than *cutoff*
+    will not be calculated.
 
     Call :meth:`release` when done with the kernel instance.
     """
-    #: SAS model information structure.
-    info = None  # type: ModelInfo
-    #: Kernel precision.
-    dtype = None  # type: np.dtype
-    #: Kernel dimensions (1d or 2d).
-    dim = ""  # type: str
-    #: Calculation results, updated after each call to :meth:`_call_kernel`.
-    result = None  # type: np.ndarray
-
-    def __init__(self, model, q_vectors):
-        # type: (GpuModel, List[np.ndarray]) -> None
-        dtype = model.dtype
+    def __init__(self, kernel, dtype, model_info, q_vectors):
+        # type: (cl.Kernel, np.dtype, ModelInfo, List[np.ndarray]) -> None
         self.q_input = GpuInput(q_vectors, dtype)
-        self._model = model
-
-        # Attributes accessed from the outside.
+        self.kernel = kernel
+        # F16 isn't sufficient, so don't support it
+        self._as_dtype = np.float64 if dtype == generate.F64 else np.float32
+
+        # attributes accessed from the outside
        self.dim = '2d' if self.q_input.is_2d else '1d'
-        self.info = model.info
+        self.info = model_info
         self.dtype = dtype
 
-        # Converter to translate input to target type.
-        self._as_dtype = np.float64 if dtype == generate.F64 else np.float32
-
-        # Holding place for the returned value.
+        # holding place for the returned value
         nout = 2 if self.info.have_Fq and self.dim == '1d' else 1
-        extra_q = 4  # Total weight, form volume, shell volume and R_eff.
-        self.result = np.empty(self.q_input.nq*nout + extra_q, dtype)
-
-        # Allocate result value on GPU.
+        extra_q = 4  # total weight, form volume, shell volume and R_eff
+        self.result = np.empty(self.q_input.nq*nout+extra_q, dtype)
+
+        # Inputs and outputs for each kernel call
+        # Note: res may be shorter than res_b if global_size != nq
         width = ((self.result.size+31)//32)*32 * self.dtype.itemsize
-        self._result_b = cuda.mem_alloc(width)
-
-    def _call_kernel(self, call_details, values, cutoff, magnetic,
-                     effective_radius_type):
-        # type: (CallDetails, np.ndarray, float, bool, int) -> np.ndarray
-
-        # Arrange data transfer to card.
+        self.result_b = cuda.mem_alloc(width)
+        self._need_release = [self.result_b]
+
+    def _call_kernel(self, call_details, values, cutoff, magnetic, effective_radius_type):
+        # type: (CallDetails, np.ndarray, np.ndarray, float, bool) -> np.ndarray
+        # Arrange data transfer to card
         details_b = cuda.to_device(call_details.buffer)
         values_b = cuda.to_device(values)
 
-        # Setup kernel function and arguments.
-        name = 'Iq' if self.dim == '1d' else 'Imagnetic' if magnetic else 'Iqxy'
-        kernel = self._model.get_function(name)
-        kernel_args = [
-            np.uint32(self.q_input.nq),  # Number of inputs.
-            None,  # Placeholder for pd_start.
-            None,  # Placeholder for pd_stop.
-            details_b,  # Problem definition.
-            values_b,  # Parameter values.
-            self.q_input.q_b,  # Q values.
-            self._result_b,   # Result storage.
-            self._as_dtype(cutoff),  # Probability cutoff.
-            np.uint32(effective_radius_type),  # R_eff mode.
+        kernel = self.kernel[1 if magnetic else 0]
+        args = [
+            np.uint32(self.q_input.nq), None, None,
+            details_b, values_b, self.q_input.q_b, self.result_b,
+            self._as_dtype(cutoff),
+            np.uint32(effective_radius_type),
         ]
         grid = partition(self.q_input.nq)
-
-        # Call kernel and retrieve results.
-        #print("Calling CUDA")
+        #print("Calling OpenCL")
         #call_details.show(values)
+        # Call kernel and retrieve results
         last_nap = time.clock()
         step = 100000000//self.q_input.nq + 1
     
@@ -502,9 +475,9 @@
             stop = min(start + step, call_details.num_eval)
             #print("queuing",start,stop)
-            kernel_args[1:3] = [np.int32(start), np.int32(stop)]
-            kernel(*kernel_args, **grid)
+            args[1:3] = [np.int32(start), np.int32(stop)]
+            kernel(*args, **grid)
             if stop < call_details.num_eval:
                 sync()
-                # Allow other processes to run.
+                # Allow other processes to run
                 current_time = time.clock()
                 if current_time - last_nap > 0.5:
     
@@ -512,5 +485,5 @@
                     last_nap = current_time
         sync()
-        cuda.memcpy_dtoh(self.result, self._result_b)
+        cuda.memcpy_dtoh(self.result, self.result_b)
         #print("result", self.result)
 
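
The loop above splits a long polydispersity integral into bounded [start, stop) slices so no single kernel launch can monopolize the GPU (or trip the driver watchdog). Its structure, reduced to essentials (standalone sketch; launch() and the nap length are stand-ins for details elided from the diff):

    import time

    def run_chunked(launch, num_eval, nq):
        step = 100000000//nq + 1   # roughly 1e8 point evaluations per slice
        last_nap = time.perf_counter()
        for start in range(0, num_eval, step):
            stop = min(start + step, num_eval)
            launch(start, stop)    # queue one slice of the integral
            if stop < num_eval:
                # Periodically yield so other processes can use the GPU.
                now = time.perf_counter()
                if now - last_nap > 0.5:
                    time.sleep(0.05)  # nap length assumed; not shown in diff
                    last_nap = now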
     
@@ -523,8 +496,7 @@
         Release resources associated with the kernel.
         """
-        self.q_input.release()
-        if self._result_b is not None:
-            self._result_b.free()
-            self._result_b = None
+        for p in self._need_release:
+            p.free()
+        self._need_release = []
 
     def __del__(self):
     
@@ -540,5 +512,6 @@
     Note: Maybe context.synchronize() is sufficient.
     """
-    # Create an event with which to synchronize.
+    #return # The following works in C++; don't know what pycuda is doing
+    # Create an event with which to synchronize
     done = cuda.Event()
 
     
@@ -546,5 +519,5 @@
     done.record()
 
-    # Make sure we don't hog resource while waiting to sync.
+    #line added to not hog resources
     while not done.query():
         time.sleep(0.01)
     
@@ -552,5 +525,4 @@
     # Block until the GPU executes the kernel.
     done.synchronize()
-
     # Clean up the event; I don't think they can be reused.
     del done
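
For reference, the complete event-based wait assembled from the pieces above (pycuda's Event API; the 0.01 s poll interval comes from the diff):

    import time
    import pycuda.driver as cuda

    def sync():
        done = cuda.Event()
        done.record()             # mark the end of the queued work
        while not done.query():   # cheap poll, sleeping between checks
            time.sleep(0.01)
        done.synchronize()        # final blocking wait
        del done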