Changeset b0de252 in sasmodels


Timestamp:
Oct 12, 2018 7:31:24 PM
Author:
pkienzle
Branches:
beta_approx, cuda-test, py3, ticket-1015-gpu-mem-error, ticket-1157, ticket-608-user-defined-weights, ticket_1156
Children:
74e9b5f
Parents:
47fb816
Message:

improve control over cuda context

Location:
sasmodels
Files:
6 edited

  • sasmodels/compare.py

    diff from r610ef23 to rb0de252

    @@ -115,5 +115,5 @@
         === environment variables ===
         -DSAS_MODELPATH=path sets directory containing custom models
    -    -DSAS_OPENCL=vendor:device|none sets the target OpenCL device
    +    -DSAS_OPENCL=vendor:device|cuda:device|none sets the target GPU device
         -DXDG_CACHE_HOME=~/.cache sets the pyopencl cache root (linux only)
         -DSAS_COMPILER=tinycc|msvc|mingw|unix sets the DLL compiler
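    The new flag mirrors the SAS_OPENCL environment variable. As a minimal
    sketch (the "cylinder" model is just an arbitrary example), the variable
    must be set before a model is loaded so the kernel modules can pick the
    platform:

        import os

        # Route kernels to the second CUDA device.  Other accepted values:
        # "vendor:device" for an OpenCL device, plain "cuda" for the default
        # CUDA device, or "none" to fall back to the compiled C DLL path.
        os.environ["SAS_OPENCL"] = "cuda:1"

        from sasmodels.core import load_model
        model = load_model("cylinder")  # compiled for the device chosen above
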
  • sasmodels/core.py

    diff from r47fb816 to rb0de252

    @@ -13,7 +13,4 @@
     from glob import glob
     import re
    -
    -# Set "SAS_OPENCL=cuda" in the environment to use the CUDA rather than OpenCL
    -USE_CUDA = os.environ.get("SAS_OPENCL", "") == "cuda"
     
     import numpy as np # type: ignore

    @@ -24,8 +21,6 @@
     from . import mixture
     from . import kernelpy
    -if USE_CUDA:
    -    from . import kernelcuda
    -else:
    -    from . import kernelcl
    +from . import kernelcuda
    +from . import kernelcl
     from . import kerneldll
     from . import custom

    @@ -216,6 +211,5 @@
             #print("building dll", numpy_dtype)
             return kerneldll.load_dll(source['dll'], model_info, numpy_dtype)
    -    elif USE_CUDA:
    -        #print("building cuda", numpy_dtype)
    +    elif platform == "cuda":
             return kernelcuda.GpuModel(source, model_info, numpy_dtype, fast=fast)
         else:

    @@ -254,5 +248,5 @@
         # type: (ModelInfo, str, str) -> (np.dtype, bool, str)
         """
    -    Interpret dtype string, returning np.dtype and fast flag.
    +    Interpret dtype string, returning np.dtype, fast flag and platform.
     
         Possible types include 'half', 'single', 'double' and 'quad'.  If the

    @@ -262,7 +256,10 @@
         default for the model and platform.
     
    -    Platform preference can be specfied ("ocl" vs "dll"), with the default
    -    being OpenCL if it is availabe.  If the dtype name ends with '!' then
    -    platform is forced to be DLL rather than OpenCL.
    +    Platform preference can be specfied ("ocl", "cuda", "dll"), with the
    +    default being OpenCL or CUDA if available, otherwise DLL.  If the dtype
    +    name ends with '!' then platform is forced to be DLL rather than GPU.
    +    The default platform is set by the environment variable SAS_OPENCL,
    +    SAS_OPENCL=driver:device for OpenCL, SAS_OPENCL=cuda:device for CUDA
    +    or SAS_OPENCL=none for DLL.
     
         This routine ignores the preferences within the model definition.  This

    @@ -277,12 +274,4 @@
         if platform is None:
             platform = "ocl"
    -    if not model_info.opencl:
    -        platform = "dll"
    -    elif USE_CUDA:
    -        if not kernelcuda.use_cuda():
    -            platform = "dll"
    -    else:
    -        if not kernelcl.use_opencl():
    -            platform = "dll"
     
         # Check if type indicates dll regardless of which platform is given

    @@ -290,4 +279,12 @@
             platform = "dll"
             dtype = dtype[:-1]
    +
    +    # Make sure model allows opencl/gpu
    +    if not model_info.opencl:
    +        platform = "dll"
    +
    +    # Make sure opencl is available, or fallback to cuda then to dll
    +    if platform == "ocl" and not kernelcl.use_opencl():
    +        platform = "cuda" if kernelcuda.use_cuda() else "dll"
     
         # Convert special type names "half", "fast", and "quad"

    @@ -300,21 +297,23 @@
             dtype = "float16"
     
    -    # Convert dtype string to numpy dtype.
    +    # Convert dtype string to numpy dtype.  Use single precision for GPU
    +    # if model allows it, otherwise use double precision.
         if dtype is None or dtype == "default":
    -        numpy_dtype = (generate.F32 if platform == "ocl" and model_info.single
    +        numpy_dtype = (generate.F32 if model_info.single and platform in ("ocl", "cuda")
                            else generate.F64)
         else:
             numpy_dtype = np.dtype(dtype)
     
    -    # Make sure that the type is supported by opencl, otherwise use dll
    +    # Make sure that the type is supported by GPU, otherwise use dll
         if platform == "ocl":
    -        if USE_CUDA:
    -            env = kernelcuda.environment()
    -        else:
    -            env = kernelcl.environment()
    -        if not env.has_type(numpy_dtype):
    -            platform = "dll"
    -            if dtype is None:
    -                numpy_dtype = generate.F64
    +        env = kernelcl.environment()
    +    elif platform == "cuda":
    +        env = kernelcuda.environment()
    +    else:
    +        env = None
    +    if env is not None and not env.has_type(numpy_dtype):
    +        platform = "dll"
    +        if dtype is None:
    +            numpy_dtype = generate.F64
     
         return numpy_dtype, fast, platform
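    Taken together, the parse_dtype changes reduce platform selection to one
    fallback chain. A standalone sketch of that chain (pick_platform is a
    hypothetical name; in the changeset the logic is inline in parse_dtype):

        from sasmodels import kernelcl, kernelcuda

        def pick_platform(platform, model_info, dtype_name):
            # A trailing "!" on the dtype string forces the C DLL path.
            if dtype_name is not None and dtype_name.endswith("!"):
                return "dll"
            # Models that opt out of GPU execution always take the DLL path.
            if not model_info.opencl:
                return "dll"
            # Prefer OpenCL, then CUDA, then the C compiler.
            if platform == "ocl" and not kernelcl.use_opencl():
                return "cuda" if kernelcuda.use_cuda() else "dll"
            return platform
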
  • sasmodels/kernel_header.c

    diff from r0db7dbd to rb0de252

    @@ -5,5 +5,10 @@
     #elif defined(_OPENMP)
     # define USE_OPENMP
    -#endif
    +#elif defined(__CUDACC__)
    +# define USE_CUDA
    +#endif
    +
    +// Use SAS_DOUBLE to force the use of double even for float kernels
    +#define SAS_DOUBLE dou ## ble
     
     // If opencl is not available, then we are compiling a C function

    @@ -127,7 +132,4 @@
     #endif // !USE_OPENCL
     
    -// Use SAS_DOUBLE to force the use of double even for float kernels
    -#define SAS_DOUBLE dou ## ble
    -
     #if defined(NEED_EXPM1)
        // TODO: precision is a half digit lower than numpy on mac in [1e-7, 0.5]
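    Moving SAS_DOUBLE above the OpenCL-only block makes the macro visible to
    the new __CUDACC__ branch as well. The token pasting is what lets it
    survive single-precision conversion: generate.convert_type rewrites
    "double" tokens to "float" in the kernel source, and "dou ## ble"
    contains no such token. A toy sketch of that substitution (to_single is
    a stand-in, not the real converter):

        import re

        def to_single(source):
            # Rough stand-in for the double-to-float rewrite performed when
            # building single-precision kernels.
            return re.sub(r"\bdouble\b", "float", source)

        kernel = "#define SAS_DOUBLE dou ## ble\ndouble q; SAS_DOUBLE norm;"
        print(to_single(kernel))
        # "double q" becomes "float q", but SAS_DOUBLE still expands to
        # double when the C preprocessor pastes "dou" and "ble" together.
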
  • sasmodels/kernel_iq.c

    diff from r47fb816 to rb0de252

    @@ -80,4 +80,5 @@
     //     du * (m_sigma_y + 1j*m_sigma_z);
     // weights for spin crosssections: dd du real, ud real, uu, du imag, ud imag
    +__device__
     static void set_spin_weights(double in_spin, double out_spin, double weight[6])
     {
  • sasmodels/kernelcl.py

    diff from rd86f0fc to rb0de252

    @@ -1,4 +1,6 @@
     """
     GPU driver for C kernels
    +
    +TODO: docs are out of date
     
     There should be a single GPU environment running on the system.  This

    @@ -59,5 +61,5 @@
     
     
    -# Attempt to setup opencl. This may fail if the opencl package is not
    +# Attempt to setup opencl. This may fail if the pyopencl package is not
     # installed or if it is installed but there are no devices available.
     try:

    @@ -131,5 +133,6 @@
     
     def use_opencl():
    -    return HAVE_OPENCL and os.environ.get("SAS_OPENCL", "").lower() != "none"
    +    env = os.environ.get("SAS_OPENCL", "").lower()
    +    return HAVE_OPENCL and env != "none" and not env.startswith("cuda")
     
     ENV = None

    @@ -179,27 +182,4 @@
             cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
             queue.device)
    -
    -def _stretch_input(vector, dtype, extra=1e-3, boundary=32):
    -    # type: (np.ndarray, np.dtype, float, int) -> np.ndarray
    -    """
    -    Stretch an input vector to the correct boundary.
    -
    -    Performance on the kernels can drop by a factor of two or more if the
    -    number of values to compute does not fall on a nice power of two
    -    boundary.   The trailing additional vector elements are given a
    -    value of *extra*, and so f(*extra*) will be computed for each of
    -    them.  The returned array will thus be a subset of the computed array.
    -
    -    *boundary* should be a power of 2 which is at least 32 for good
    -    performance on current platforms (as of Jan 2015).  It should
    -    probably be the max of get_warp(kernel,queue) and
    -    device.min_data_type_align_size//4.
    -    """
    -    remainder = vector.size % boundary
    -    if remainder != 0:
    -        size = vector.size + (boundary - remainder)
    -        vector = np.hstack((vector, [extra] * (size - vector.size)))
    -    return np.ascontiguousarray(vector, dtype=dtype)
     
     def compile_model(context, source, dtype, fast=False):
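    The reworked use_opencl() and the use_cuda() gate in kernelcuda.py
    (below) split the SAS_OPENCL values between them. A small demo of the
    gating, assuming both pyopencl and pycuda imported cleanly:

        def gates(env):
            # Mirror the two gates with HAVE_OPENCL = HAVE_CUDA = True.
            env = env.lower()
            use_opencl = env != "none" and not env.startswith("cuda")
            use_cuda = env == "" or env.startswith("cuda")
            return use_opencl, use_cuda

        for env in ("", "cuda", "cuda:1", "intel:0", "none"):
            print("SAS_OPENCL=%r -> opencl=%s, cuda=%s" % ((env,) + gates(env)))

    When SAS_OPENCL is unset both gates are open; parse_dtype still prefers
    OpenCL and only falls through to CUDA when use_opencl() returns False.
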
  • sasmodels/kernelcuda.py

    diff from r0db7dbd to rb0de252

    @@ -1,4 +1,11 @@
     """
    -GPU driver for C kernels
    +GPU driver for C kernels (with CUDA)
    +
    +To select cuda, use SAS_OPENCL=cuda, or SAS_OPENCL=cuda:n for a particular
    +device number.  If no device number is specified, then look for CUDA_DEVICE=n
    +or a file ~/.cuda-device containing n for the device number.  Otherwise, try
    +all available device numbers.
    +
    +TODO: docs are out of date
     
     There should be a single GPU environment running on the system.  This

    @@ -59,10 +66,18 @@
     
     
    -# Attempt to setup opencl. This may fail if the opencl package is not
    +# Attempt to setup cuda. This may fail if the pycuda package is not
     # installed or if it is installed but there are no devices available.
     try:
    -    import pycuda.autoinit
         import pycuda.driver as cuda  # type: ignore
         from pycuda.compiler import SourceModule
    +    from pycuda.tools import make_default_context, clear_context_caches
    +    # Ask CUDA for the default context (so that we know that one exists)
    +    # then immediately throw it away in case the user doesn't want it.
    +    # Note: cribbed from pycuda.autoinit
    +    cuda.init()
    +    context = make_default_context()
    +    context.pop()
    +    clear_context_caches()
    +    del context
         HAVE_CUDA = True
         CUDA_ERROR = ""

    @@ -91,21 +106,7 @@
     MAX_LOOPS = 2048
     
    -
    -# Pragmas for enable OpenCL features.  Be sure to protect them so that they
    -# still compile even if OpenCL is not present.
    -_F16_PRAGMA = """\
    -#if defined(__OPENCL_VERSION__) // && !defined(cl_khr_fp16)
    -#  pragma OPENCL EXTENSION cl_khr_fp16: enable
    -#endif
    -"""
    -
    -_F64_PRAGMA = """\
    -#if defined(__OPENCL_VERSION__) // && !defined(cl_khr_fp64)
    -#  pragma OPENCL EXTENSION cl_khr_fp64: enable
    -#endif
    -"""
    -
     def use_cuda():
    -    return HAVE_CUDA
    +    env = os.environ.get("SAS_OPENCL", "").lower()
    +    return HAVE_CUDA and (env == "" or env.startswith("cuda"))
     
     ENV = None

    @@ -115,4 +116,7 @@
         """
         global ENV
    +    # Free any previous allocated context.
    +    if ENV is not None and ENV.context is not None:
    +        ENV.release()
         ENV = GpuEnvironment() if use_cuda() else None
     

    @@ -126,5 +130,5 @@
         if ENV is None:
             if not HAVE_CUDA:
    -            raise RuntimeError("OpenCL startup failed with ***"
    +            raise RuntimeError("CUDA startup failed with ***"
                                 + CUDA_ERROR + "***; using C compiler instead")
             reset_environment()

    @@ -133,29 +137,15 @@
         return ENV
     
    -def _stretch_input(vector, dtype, extra=1e-3, boundary=32):
    -    # type: (np.ndarray, np.dtype, float, int) -> np.ndarray
    -    """
    -    Stretch an input vector to the correct boundary.
    -
    -    Performance on the kernels can drop by a factor of two or more if the
    -    number of values to compute does not fall on a nice power of two
    -    boundary.   The trailing additional vector elements are given a
    -    value of *extra*, and so f(*extra*) will be computed for each of
    -    them.  The returned array will thus be a subset of the computed array.
    -
    -    *boundary* should be a power of 2 which is at least 32 for good
    -    performance on current platforms (as of Jan 2015).  It should
    -    probably be the max of get_warp(kernel,queue) and
    -    device.min_data_type_align_size//4.
    -    """
    -    remainder = vector.size % boundary
    -    if remainder != 0:
    -        size = vector.size + (boundary - remainder)
    -        vector = np.hstack((vector, [extra] * (size - vector.size)))
    -    return np.ascontiguousarray(vector, dtype=dtype)
    -
    +def has_type(dtype):
    +    # type: (np.dtype) -> bool
    +    """
    +    Return true if device supports the requested precision.
    +    """
    +    # Assume the nvidia card supports 32-bit and 64-bit floats.
    +    # TODO: check if pycuda support F16
    +    return dtype in (generate.F32, generate.F64)
     
     def compile_model(source, dtype, fast=False):
    -    # type: (str, np.dtype, bool) -> cl.Program
    +    # type: (str, np.dtype, bool) -> SourceModule
         """
         Build a model to run on the gpu.

    @@ -165,14 +155,15 @@
         devices in the context do not support the cl_khr_fp64 extension.
         """
    +    dtype = np.dtype(dtype)
    +    if not has_type(dtype):
    +        raise RuntimeError("%s not supported for devices"%dtype)
    +
         source_list = [generate.convert_type(source, dtype)]
    -
    -    if dtype == generate.F16:
    -        source_list.insert(0, _F16_PRAGMA)
    -    elif dtype == generate.F64:
    -        source_list.insert(0, _F64_PRAGMA)
     
         source_list.insert(0, "#define USE_SINCOS\n")
         source = "\n".join(source_list)
    -    program = SourceModule(source) # no_extern_c=True, include_dirs=[...]
    +    options = '-use_fast_math' if fast else None
    +    program = SourceModule(source, no_extern_c=True, options=options) # include_dirs=[...]
    +    #print("done with "+program)
         return program
     

    @@ -184,12 +175,29 @@
         GPU context, with possibly many devices, and one queue per device.
         """
    -    def __init__(self, devnum=0):
    -        # type: () -> None
    +    context = None # type: cuda.Context
    +    def __init__(self, devnum=None):
    +        # type: (int) -> None
             # Byte boundary for data alignment
             #self.data_boundary = max(d.min_data_type_align_size
             #                         for d in self.context.devices)
             self.compiled = {}
    -        #self.device = cuda.Device(devnum)
    -        #self.context = self.device.make_context()
    +        env = os.environ.get("SAS_OPENCL", "").lower()
    +        if devnum is None and env.startswith("cuda:"):
    +            devnum = int(env[5:])
    +        # Set the global context to the particular device number if one is
    +        # given, otherwise use the default context.  Perhaps this will be set
    +        # by an environment variable within autoinit.
    +        if devnum is not None:
    +            self.context = cuda.Device(devnum).make_context()
    +        else:
    +            self.context = make_default_context()
    +
    +    def release(self):
    +        if self.context is not None:
    +            self.context.pop()
    +            self.context = None
    +
    +    def __del__(self):
    +        self.release()
     
         def has_type(self, dtype):

    @@ -198,5 +206,5 @@
             Return True if all devices support a given type.
             """
    -        return True
    +        return has_type(dtype)
     
         def compile_program(self, name, source, dtype, fast, timestamp):

    @@ -235,4 +243,11 @@
         that the compiler is allowed to take shortcuts.
         """
    +    info = None # type: ModelInfo
    +    source = "" # type: str
    +    dtype = None # type: np.dtype
    +    fast = False # type: bool
    +    program = None # type: SourceModule
    +    _kernels = None # type: List[cuda.Function]
    +
         def __init__(self, source, model_info, dtype=generate.F32, fast=False):
             # type: (Dict[str,str], ModelInfo, np.dtype, bool) -> None

    @@ -418,8 +433,9 @@
                         last_nap = current_time
             sync()
    +        cuda.memcpy_dtoh(self.result, self.result_b)
    +        #print("result", self.result)
    +
             details_b.free()
             values_b.free()
    -        cuda.memcpy_dtoh(self.result, self.result_b)
    -        #print("result", self.result)
     
             pd_norm = self.result[self.q_input.nq]

    @@ -459,5 +475,6 @@
     
         #line added to not hog resources
    -    while not done.query(): time.sleep(0.01)
    +    while not done.query():
    +        time.sleep(0.01)
     
         # Block until the GPU executes the kernel.

    @@ -473,19 +490,19 @@
         efficiency.
         '''
    -    max_gx,max_gy = 65535,65535
    +    max_gx, max_gy = 65535, 65535
         blocksize = 32
    -    #max_gx,max_gy = 5,65536
    +    #max_gx, max_gy = 5, 65536
         #blocksize = 3
    -    block = (blocksize,1,1)
    +    block = (blocksize, 1, 1)
         num_blocks = int((n+blocksize-1)/blocksize)
         if num_blocks < max_gx:
    -        grid = (num_blocks,1)
    +        grid = (num_blocks, 1)
         else:
             gx = max_gx
             gy = (num_blocks + max_gx - 1) / max_gx
    -        if gy >= max_gy: raise ValueError("vector is too large")
    -        grid = (gx,gy)
    -    #print "block",block,"grid",grid
    -    #print "waste",block[0]*block[1]*block[2]*grid[0]*grid[1] - n
    -    return dict(block=block,grid=grid)
    -
    +        if gy >= max_gy:
    +            raise ValueError("vector is too large")
    +        grid = (gx, gy)
    +    #print("block", block, "grid", grid)
    +    #print("waste", block[0]*block[1]*block[2]*grid[0]*grid[1] - n)
    +    return dict(block=block, grid=grid)
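    The reworked partition() maps n points onto a CUDA launch configuration,
    32 threads per block, folding the block count into a second grid
    dimension once it passes the 65535-blocks-per-dimension limit. The same
    arithmetic as a standalone sketch (using // for integer division so the
    grid entries stay ints):

        def partition(n, blocksize=32, max_gx=65535, max_gy=65535):
            # One thread per point, rounded up to whole blocks.
            block = (blocksize, 1, 1)
            num_blocks = (n + blocksize - 1) // blocksize
            if num_blocks < max_gx:
                grid = (num_blocks, 1)
            else:
                # Fold the 1-D block count into a 2-D grid.
                gx = max_gx
                gy = (num_blocks + max_gx - 1) // max_gx
                if gy >= max_gy:
                    raise ValueError("vector is too large")
                grid = (gx, gy)
            return dict(block=block, grid=grid)

        print(partition(1000))     # {'block': (32, 1, 1), 'grid': (32, 1)}
        print(partition(3000000))  # 93750 blocks -> grid (65535, 2)
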