Changeset b0de252 in sasmodels for sasmodels/kernelcuda.py


Timestamp:
Oct 12, 2018 7:31:24 PM
Author:
pkienzle
Branches:
master, core_shell_microgels, magnetic_model, ticket-1257-vesicle-product, ticket_1156, ticket_1265_superball, ticket_822_more_unit_tests
Children:
74e9b5f
Parents:
47fb816
Message:

improve control over cuda context

File:
1 edited

Legend:

Unmodified (shown with a leading space)
Added (shown with a leading "+")
Removed (shown with a leading "-")
  • sasmodels/kernelcuda.py

--- sasmodels/kernelcuda.py (r0db7dbd)
+++ sasmodels/kernelcuda.py (rb0de252)
@@ -1,4 +1,11 @@
 """
-GPU driver for C kernels
+GPU driver for C kernels (with CUDA)
+
+To select cuda, use SAS_OPENCL=cuda, or SAS_OPENCL=cuda:n for a particular
+device number.  If no device number is specified, then look for CUDA_DEVICE=n
+or a file ~/.cuda-device containing n for the device number.  Otherwise, try
+all available device numbers.
+
+TODO: docs are out of date
 
 There should be a single GPU environment running on the system.  This
     
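As a reading aid for the device-selection docstring added above, the lookup order it describes could be implemented roughly as follows. This is a sketch only, not part of the changeset, and the helper name _get_cuda_devnum is illustrative:

    import os

    def _get_cuda_devnum():
        # Hypothetical helper mirroring the documented lookup order.
        sas_opencl = os.environ.get("SAS_OPENCL", "").lower()
        if sas_opencl.startswith("cuda:"):
            return int(sas_opencl[5:])             # SAS_OPENCL=cuda:n
        if "CUDA_DEVICE" in os.environ:
            return int(os.environ["CUDA_DEVICE"])  # CUDA_DEVICE=n
        path = os.path.expanduser("~/.cuda-device")
        if os.path.exists(path):
            with open(path) as fid:
                return int(fid.read().strip())     # ~/.cuda-device holds n
        return None  # no preference: try all available device numbers
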
@@ -59,10 +66,18 @@
 
 
-# Attempt to setup opencl. This may fail if the opencl package is not
+# Attempt to setup cuda. This may fail if the pycuda package is not
 # installed or if it is installed but there are no devices available.
 try:
-    import pycuda.autoinit
     import pycuda.driver as cuda  # type: ignore
     from pycuda.compiler import SourceModule
+    from pycuda.tools import make_default_context, clear_context_caches
+    # Ask CUDA for the default context (so that we know that one exists)
+    # then immediately throw it away in case the user doesn't want it.
+    # Note: cribbed from pycuda.autoinit
+    cuda.init()
+    context = make_default_context()
+    context.pop()
+    clear_context_caches()
+    del context
     HAVE_CUDA = True
     CUDA_ERROR = ""
     
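The startup block added above asks pycuda for a default context only to prove that a device is usable, then releases it so that later code keeps control over context creation. A minimal standalone sketch of the same probe, assuming pycuda is installed (the function name _cuda_available is illustrative):

    def _cuda_available():
        # Probe CUDA the way pycuda.autoinit does, without keeping a
        # context current for the rest of the process.
        try:
            import pycuda.driver as cuda
            from pycuda.tools import make_default_context, clear_context_caches
            cuda.init()
            context = make_default_context()  # raises if no usable device
            context.pop()                     # deactivate the context...
            clear_context_caches()            # ...and drop cached references
            return True
        except Exception:
            return False
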
@@ -91,21 +106,7 @@
 MAX_LOOPS = 2048
 
-
-# Pragmas for enable OpenCL features.  Be sure to protect them so that they
-# still compile even if OpenCL is not present.
-_F16_PRAGMA = """\
-#if defined(__OPENCL_VERSION__) // && !defined(cl_khr_fp16)
-#  pragma OPENCL EXTENSION cl_khr_fp16: enable
-#endif
-"""
-
-_F64_PRAGMA = """\
-#if defined(__OPENCL_VERSION__) // && !defined(cl_khr_fp64)
-#  pragma OPENCL EXTENSION cl_khr_fp64: enable
-#endif
-"""
-
 def use_cuda():
-    return HAVE_CUDA
+    env = os.environ.get("SAS_OPENCL", "").lower()
+    return HAVE_CUDA and (env == "" or env.startswith("cuda"))
 
 ENV = None
     
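use_cuda() now also consults SAS_OPENCL, so CUDA is chosen only when the variable is unset or starts with "cuda". A small usage sketch (assumes pycuda started up successfully):

    import os
    os.environ["SAS_OPENCL"] = "cuda:1"   # ask for CUDA device number 1

    from sasmodels.kernelcuda import use_cuda
    print(use_cuda())   # True, since the value begins with "cuda";
                        # a value such as "opencl" or "none" gives False
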
@@ -115,4 +116,7 @@
     """
     global ENV
+    # Free any previous allocated context.
+    if ENV is not None and ENV.context is not None:
+        ENV.release()
     ENV = GpuEnvironment() if use_cuda() else None
 
     
@@ -126,5 +130,5 @@
     if ENV is None:
         if not HAVE_CUDA:
-            raise RuntimeError("OpenCL startup failed with ***"
+            raise RuntimeError("CUDA startup failed with ***"
                             + CUDA_ERROR + "***; using C compiler instead")
         reset_environment()
     
@@ -133,29 +137,15 @@
     return ENV
 
-def _stretch_input(vector, dtype, extra=1e-3, boundary=32):
-    # type: (np.ndarray, np.dtype, float, int) -> np.ndarray
-    """
-    Stretch an input vector to the correct boundary.
-
-    Performance on the kernels can drop by a factor of two or more if the
-    number of values to compute does not fall on a nice power of two
-    boundary.   The trailing additional vector elements are given a
-    value of *extra*, and so f(*extra*) will be computed for each of
-    them.  The returned array will thus be a subset of the computed array.
-
-    *boundary* should be a power of 2 which is at least 32 for good
-    performance on current platforms (as of Jan 2015).  It should
-    probably be the max of get_warp(kernel,queue) and
-    device.min_data_type_align_size//4.
-    """
-    remainder = vector.size % boundary
-    if remainder != 0:
-        size = vector.size + (boundary - remainder)
-        vector = np.hstack((vector, [extra] * (size - vector.size)))
-    return np.ascontiguousarray(vector, dtype=dtype)
-
+def has_type(dtype):
+    # type: (np.dtype) -> bool
+    """
+    Return true if device supports the requested precision.
+    """
+    # Assume the nvidia card supports 32-bit and 64-bit floats.
+    # TODO: check if pycuda support F16
+    return dtype in (generate.F32, generate.F64)
 
 def compile_model(source, dtype, fast=False):
-    # type: (str, np.dtype, bool) -> cl.Program
+    # type: (str, np.dtype, bool) -> SourceModule
     """
     Build a model to run on the gpu.
     
@@ -165,14 +155,15 @@
     devices in the context do not support the cl_khr_fp64 extension.
     """
+    dtype = np.dtype(dtype)
+    if not has_type(dtype):
+        raise RuntimeError("%s not supported for devices"%dtype)
+
     source_list = [generate.convert_type(source, dtype)]
-
-    if dtype == generate.F16:
-        source_list.insert(0, _F16_PRAGMA)
-    elif dtype == generate.F64:
-        source_list.insert(0, _F64_PRAGMA)
 
     source_list.insert(0, "#define USE_SINCOS\n")
     source = "\n".join(source_list)
-    program = SourceModule(source) # no_extern_c=True, include_dirs=[...]
+    options = '-use_fast_math' if fast else None
+    program = SourceModule(source, no_extern_c=True, options=options) # include_dirs=[...]
+    #print("done with "+program)
     return program
 
     
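compile_model() now passes the generated source to pycuda's SourceModule with no_extern_c=True and, when fast is requested, a fast-math flag. A self-contained illustration of those SourceModule options with a toy kernel (the kernel itself is illustrative; pycuda expects options as a list of nvcc flags):

    import numpy as np
    import pycuda.autoinit                    # creates a default context
    import pycuda.driver as cuda
    from pycuda.compiler import SourceModule

    source = """
    extern "C" {                              // required with no_extern_c=True
    __global__ void scale(double *x, int n)
    {
        int i = blockDim.x*blockIdx.x + threadIdx.x;
        if (i < n) x[i] *= 2.0;
    }
    }
    """
    program = SourceModule(source, no_extern_c=True, options=['-use_fast_math'])
    scale = program.get_function("scale")

    x = np.arange(8, dtype=np.float64)
    scale(cuda.InOut(x), np.int32(x.size), block=(32, 1, 1), grid=(1, 1))
    print(x)   # [ 0.  2.  4.  6.  8. 10. 12. 14.]
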
@@ -184,12 +175,29 @@
     GPU context, with possibly many devices, and one queue per device.
     """
-    def __init__(self, devnum=0):
-        # type: () -> None
+    context = None # type: cuda.Context
+    def __init__(self, devnum=None):
+        # type: (int) -> None
         # Byte boundary for data alignment
         #self.data_boundary = max(d.min_data_type_align_size
         #                         for d in self.context.devices)
         self.compiled = {}
-        #self.device = cuda.Device(devnum)
-        #self.context = self.device.make_context()
+        env = os.environ.get("SAS_OPENCL", "").lower()
+        if devnum is None and env.startswith("cuda:"):
+            devnum = int(env[5:])
+        # Set the global context to the particular device number if one is
+        # given, otherwise use the default context.  Perhaps this will be set
+        # by an environment variable within autoinit.
+        if devnum is not None:
+            self.context = cuda.Device(devnum).make_context()
+        else:
+            self.context = make_default_context()
+
+    def release(self):
+        if self.context is not None:
+            self.context.pop()
+            self.context = None
+
+    def __del__(self):
+        self.release()
 
     def has_type(self, dtype):
     
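GpuEnvironment now manages its own CUDA context: the device can be pinned either with the devnum argument or with SAS_OPENCL=cuda:n, and release() pops the context rather than leaving cleanup to garbage collection. A hedged usage sketch (the try/finally wrapper is a suggestion, not part of the changeset):

    from sasmodels.kernelcuda import GpuEnvironment

    env = GpuEnvironment()       # or GpuEnvironment(devnum=0) to pin a device
    try:
        pass   # compile models and run kernels while env.context is current
    finally:
        env.release()            # pop the CUDA context deterministically
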
@@ -198,5 +206,5 @@
         Return True if all devices support a given type.
         """
-        return True
+        return has_type(dtype)
 
     def compile_program(self, name, source, dtype, fast, timestamp):
     
@@ -235,4 +243,11 @@
     that the compiler is allowed to take shortcuts.
     """
+    info = None # type: ModelInfo
+    source = "" # type: str
+    dtype = None # type: np.dtype
+    fast = False # type: bool
+    program = None # type: SourceModule
+    _kernels = None # type: List[cuda.Function]
+
     def __init__(self, source, model_info, dtype=generate.F32, fast=False):
         # type: (Dict[str,str], ModelInfo, np.dtype, bool) -> None
     
@@ -418,8 +433,9 @@
                     last_nap = current_time
         sync()
+        cuda.memcpy_dtoh(self.result, self.result_b)
+        #print("result", self.result)
+
         details_b.free()
         values_b.free()
-        cuda.memcpy_dtoh(self.result, self.result_b)
-        #print("result", self.result)
 
         pd_norm = self.result[self.q_input.nq]
     
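The hunk above moves the device-to-host copy ahead of the buffer frees, which is the required ordering: once a device allocation is freed its contents are gone. A minimal pycuda sketch of the pattern (the buffer size is illustrative):

    import numpy as np
    import pycuda.autoinit
    import pycuda.driver as cuda

    result = np.zeros(16, dtype=np.float64)
    result_b = cuda.mem_alloc(result.nbytes)   # device-side result buffer
    # ... kernel writes its output into result_b ...
    cuda.memcpy_dtoh(result, result_b)         # copy device -> host first
    result_b.free()                            # only then release the memory
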
@@ -459,5 +475,6 @@
 
     #line added to not hog resources
-    while not done.query(): time.sleep(0.01)
+    while not done.query():
+        time.sleep(0.01)
 
     # Block until the GPU executes the kernel.
     
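The reformatted wait loop polls a CUDA event with query() and sleeps between polls, so the host thread neither spins nor blocks inside synchronize(). A standalone sketch of the same idea (the kernel launch itself is omitted):

    import time
    import pycuda.autoinit
    import pycuda.driver as cuda

    done = cuda.Event()
    # ... launch kernel(s) asynchronously on the default stream ...
    done.record()                # marker that completes after the queued work
    while not done.query():      # True once the recorded work has finished
        time.sleep(0.01)         # yield the CPU instead of busy-waiting
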
@@ -473,19 +490,19 @@
     efficiency.
     '''
-    max_gx,max_gy = 65535,65535
+    max_gx, max_gy = 65535, 65535
     blocksize = 32
-    #max_gx,max_gy = 5,65536
+    #max_gx, max_gy = 5, 65536
     #blocksize = 3
-    block = (blocksize,1,1)
+    block = (blocksize, 1, 1)
     num_blocks = int((n+blocksize-1)/blocksize)
     if num_blocks < max_gx:
-        grid = (num_blocks,1)
+        grid = (num_blocks, 1)
     else:
         gx = max_gx
         gy = (num_blocks + max_gx - 1) / max_gx
-        if gy >= max_gy: raise ValueError("vector is too large")
-        grid = (gx,gy)
-    #print "block",block,"grid",grid
-    #print "waste",block[0]*block[1]*block[2]*grid[0]*grid[1] - n
-    return dict(block=block,grid=grid)
-
+        if gy >= max_gy:
+            raise ValueError("vector is too large")
+        grid = (gx, gy)
+    #print("block", block, "grid", grid)
+    #print("waste", block[0]*block[1]*block[2]*grid[0]*grid[1] - n)
+    return dict(block=block, grid=grid)
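
A quick worked example for the block/grid partition in the final hunk: with blocksize 32 and a 65535 grid limit, n = 1000 needs 32 blocks and fits in a 1-D grid, while n = 3000000 needs 93750 blocks and must be folded into a 2-D grid of (65535, 2). The sketch below re-derives this with integer division so the grid entries stay integral; the function name partition is illustrative:

    def partition(n, blocksize=32, max_gx=65535):
        # Re-derivation of the block/grid computation shown above.
        block = (blocksize, 1, 1)
        num_blocks = (n + blocksize - 1) // blocksize    # ceil(n/blocksize)
        if num_blocks < max_gx:
            grid = (num_blocks, 1)
        else:
            gx = max_gx
            gy = (num_blocks + max_gx - 1) // max_gx     # fold into 2-D grid
            grid = (gx, gy)
        return dict(block=block, grid=grid)

    print(partition(1000))      # {'block': (32, 1, 1), 'grid': (32, 1)}
    print(partition(3000000))   # 93750 blocks > 65535, so grid = (65535, 2)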