kernelcl.py @ 9fac5f5

Last change on this file since 9fac5f5 was 9fac5f5, checked in by Paul Kienzle <pkienzle@…>, 5 years ago
kernelcl.reset_environment() returns a new environment
Property mode set to `100644`
File size: 23.9 KB

Rev	Line
[14de349]	1	"""
[eafc9fa]	2	GPU driver for C kernels
[14de349]	3
[b0de252]	4	TODO: docs are out of date
	5
[14de349]	6	There should be a single GPU environment running on the system. This
	7	environment is constructed on the first call to :func:`env`, and the
	8	same environment is returned on each call.
	9
	10	After retrieving the environment, the next step is to create the kernel.
	11	This is done with a call to :meth:`GpuEnvironment.make_kernel`, which
	12	returns the type of data used by the kernel.
	13
	14	Next a :class:`GpuData` object should be created with the correct kind
	15	of data. This data object can be used by multiple kernels, for example,
	16	if the target model is a weighted sum of multiple kernels. The data
	17	should include any extra evaluation points required to compute the proper
	18	data smearing. This need not match the square grid for 2D data if there
	19	is an index saying which q points are active.
	20
	21	Together the GpuData, the program, and a device form a :class:`GpuKernel`.
	22	This kernel is used during fitting, receiving new sets of parameters and
	23	evaluating them. The output value is stored in an output buffer on the
	24	devices, where it can be combined with other structure factors and form
	25	factors and have instrumental resolution effects applied.
[92da231]	26
	27	In order to use OpenCL for your models, you will need OpenCL drivers for
	28	your machine. These should be available from your graphics card vendor.
	29	Intel provides OpenCL drivers for CPUs as well as their integrated HD
	30	graphics chipsets. AMD also provides drivers for Intel CPUs, but as of
	31	this writing the performance is lacking compared to the Intel drivers.
	32	NVidia combines drivers for CUDA and OpenCL in one package. The result
	33	is a bit messy if you have multiple drivers installed. You can see which
	34	drivers are available by starting python and running:
	35
	36	import pyopencl as cl
	37	cl.create_some_context(interactive=True)
	38
	39	Once you have done that, it will show the available drivers which you
	40	can select. It will then tell you that you can use these drivers
[880a2ed]	41	automatically by setting the SAS_OPENCL environment variable, which is
	42	PYOPENCL_CTX equivalent but not conflicting with other pyopnecl programs.
[92da231]	43
	44	Some graphics cards have multiple devices on the same card. You cannot
	45	yet use both of them concurrently to evaluate models, but you can run
	46	the program twice using a different device for each session.
	47
	48	OpenCL kernels are compiled when needed by the device driver. Some
	49	drivers produce compiler output even when there is no error. You
	50	can see the output by setting PYOPENCL_COMPILER_OUTPUT=1. It should be
	51	harmless, albeit annoying.
[14de349]	52	"""
[ba32cdd]	53	from __future__ import print_function
[a5b8477]	54
[b297ba9]	55	import sys
[250fa25]	56	import os
	57	import warnings
[821a9c6]	58	import logging
[6e5b2a7]	59	import time
[250fa25]	60
[e2592f0]	61	try:
	62	from time import perf_counter as clock
	63	except ImportError: # CRUFT: python < 3.3
	64	if sys.platform.count("darwin") > 0:
	65	from time import time as clock
	66	else:
	67	from time import clock
[b3f6bc3]	68
[e2592f0]	69	import numpy as np # type: ignore
[3221de0]	70
[3199b17]	71	# Attempt to setup OpenCL. This may fail if the pyopencl package is not
[3221de0]	72	# installed or if it is installed but there are no devices available.
[250fa25]	73	try:
[3221de0]	74	import pyopencl as cl # type: ignore
	75	from pyopencl import mem_flags as mf
	76	from pyopencl.characterize import get_fast_inaccurate_build_options
[3199b17]	77	# Ask OpenCL for the default context so that we know that one exists.
[3221de0]	78	cl.create_some_context(interactive=False)
	79	HAVE_OPENCL = True
	80	OPENCL_ERROR = ""
[9404dd3]	81	except Exception as exc:
[6dba2f0]	82	HAVE_OPENCL = False
[3221de0]	83	OPENCL_ERROR = str(exc)
[14de349]	84
[cb6ecf4]	85	from . import generate
[95f62aa]	86	from .generate import F32, F64
[f619de7]	87	from .kernel import KernelModel, Kernel
[14de349]	88
[2d81cfe]	89	# pylint: disable=unused-import
[a5b8477]	90	try:
	91	from typing import Tuple, Callable, Any
	92	from .modelinfo import ModelInfo
	93	from .details import CallDetails
	94	except ImportError:
	95	pass
[2d81cfe]	96	# pylint: enable=unused-import
[a5b8477]	97
[3199b17]	98
	99	# CRUFT: pyopencl < 2017.1 (as of June 2016 needs quotes around include path).
[20317b3]	100	def quote_path(v):
[b297ba9]	101	# type: (str) -> str
[20317b3]	102	"""
	103	Quote the path if it is not already quoted.
	104
	105	If v starts with '-', then assume that it is a -I option or similar
	106	and do not quote it. This is fragile: -Ipath with space needs to
	107	be quoted.
	108	"""
	109	return '"'+v+'"' if v and ' ' in v and not v[0] in "\"'-" else v
	110
[3199b17]	111
[20317b3]	112	def fix_pyopencl_include():
[b297ba9]	113	# type: (None) -> None
[40a87fa]	114	"""
	115	Monkey patch pyopencl to allow spaces in include file path.
	116	"""
[4e28511]	117	# pylint: disable=protected-access
[b297ba9]	118	import pyopencl
	119	if hasattr(pyopencl, '_DEFAULT_INCLUDE_OPTIONS'):
	120	pyopencl._DEFAULT_INCLUDE_OPTIONS = [
	121	quote_path(v) for v in pyopencl._DEFAULT_INCLUDE_OPTIONS
[3199b17]	122	]
	123
[20317b3]	124
[6dba2f0]	125	if HAVE_OPENCL:
	126	fix_pyopencl_include()
[20317b3]	127
[ce27e21]	128	# The max loops number is limited by the amount of local memory available
	129	# on the device. You don't want to make this value too big because it will
	130	# waste resources, nor too small because it may interfere with users trying
	131	# to do their polydispersity calculations. A value of 1024 should be much
	132	# larger than necessary given that cost grows as npts^k where k is the number
	133	# of polydisperse parameters.
[5d4777d]	134	MAX_LOOPS = 2048
	135
[5464d68]	136	# Pragmas for enable OpenCL features. Be sure to protect them so that they
	137	# still compile even if OpenCL is not present.
	138	_F16_PRAGMA = """\
	139	#if defined(__OPENCL_VERSION__) // && !defined(cl_khr_fp16)
	140	# pragma OPENCL EXTENSION cl_khr_fp16: enable
	141	#endif
	142	"""
	143
	144	_F64_PRAGMA = """\
	145	#if defined(__OPENCL_VERSION__) // && !defined(cl_khr_fp64)
	146	# pragma OPENCL EXTENSION cl_khr_fp64: enable
	147	#endif
	148	"""
	149
[3199b17]	150
[3221de0]	151	def use_opencl():
[b297ba9]	152	# type: () -> bool
	153	"""Return True if OpenCL is the default computational engine"""
[07646b6]	154	sas_opencl = os.environ.get("SAS_OPENCL", "OpenCL").lower()
	155	return HAVE_OPENCL and sas_opencl != "none" and not sas_opencl.startswith("cuda")
[5464d68]	156
[3199b17]	157
[14de349]	158	ENV = None
[3221de0]	159	def reset_environment():
[9fac5f5]	160	# type: () -> "GpuEnvironment"
[3221de0]	161	"""
[9fac5f5]	162	Return a new OpenCL context, such as after a change to SAS_OPENCL.
[3221de0]	163	"""
	164	global ENV
	165	ENV = GpuEnvironment() if use_opencl() else None
[9fac5f5]	166	return ENV
[3199b17]	167
[14de349]	168	def environment():
[dd7fc12]	169	# type: () -> "GpuEnvironment"
[14de349]	170	"""
	171	Returns a singleton :class:`GpuEnvironment`.
	172
	173	This provides an OpenCL context and one queue per device.
	174	"""
[b4272a2]	175	if ENV is None:
	176	if not HAVE_OPENCL:
	177	raise RuntimeError("OpenCL startup failed with ***"
[d86f0fc]	178	+ OPENCL_ERROR + "***; using C compiler instead")
[3221de0]	179	reset_environment()
[b4272a2]	180	if ENV is None:
	181	raise RuntimeError("SAS_OPENCL=None in environment")
[14de349]	182	return ENV
	183
[3199b17]	184
[5d316e9]	185	def has_type(device, dtype):
[dd7fc12]	186	# type: (cl.Device, np.dtype) -> bool
[14de349]	187	"""
[5d316e9]	188	Return true if device supports the requested precision.
[14de349]	189	"""
[95f62aa]	190	if dtype == F32:
[5d316e9]	191	return True
[f872fd1]	192	elif dtype == F64:
[5d316e9]	193	return "cl_khr_fp64" in device.extensions
	194	else:
[3199b17]	195	# Not supporting F16 type since it isn't accurate enough.
[5d316e9]	196	return False
[14de349]	197
[3199b17]	198
[f5b9a6b]	199	def get_warp(kernel, queue):
[dd7fc12]	200	# type: (cl.Kernel, cl.CommandQueue) -> int
[f5b9a6b]	201	"""
	202	Return the size of an execution batch for kernel running on queue.
	203	"""
[750ffa5]	204	return kernel.get_work_group_info(
[63b32bb]	205	cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
	206	queue.device)
[14de349]	207
[3199b17]	208
[5d316e9]	209	def compile_model(context, source, dtype, fast=False):
[dd7fc12]	210	# type: (cl.Context, str, np.dtype, bool) -> cl.Program
[14de349]	211	"""
	212	Build a model to run on the gpu.
	213
[6cbdcd4]	214	Returns the compiled program and its type.
	215
	216	Raises an error if the desired precision is not available.
[14de349]	217	"""
	218	dtype = np.dtype(dtype)
[5d316e9]	219	if not all(has_type(d, dtype) for d in context.devices):
	220	raise RuntimeError("%s not supported for devices"%dtype)
[14de349]	221
[5464d68]	222	source_list = [generate.convert_type(source, dtype)]
	223
	224	if dtype == generate.F16:
	225	source_list.insert(0, _F16_PRAGMA)
	226	elif dtype == generate.F64:
	227	source_list.insert(0, _F64_PRAGMA)
	228
[3199b17]	229	# Note: USE_SINCOS makes the Intel CPU slower under OpenCL.
[14de349]	230	if context.devices[0].type == cl.device_type.GPU:
[5464d68]	231	source_list.insert(0, "#define USE_SINCOS\n")
[5d316e9]	232	options = (get_fast_inaccurate_build_options(context.devices[0])
	233	if fast else [])
[ba32cdd]	234	source = "\n".join(source_list)
[5d316e9]	235	program = cl.Program(context, source).build(options=options)
[3199b17]	236
[821a9c6]	237	#print("done with "+program)
[ce27e21]	238	return program
[14de349]	239
	240
[3199b17]	241	# For now, this returns one device in the context.
	242	# TODO: Create a context that contains all devices on all platforms.
[14de349]	243	class GpuEnvironment(object):
	244	"""
[3199b17]	245	GPU context for OpenCL, with possibly many devices and one queue per device.
[14de349]	246	"""
	247	def __init__(self):
[dd7fc12]	248	# type: () -> None
[3199b17]	249	# Find gpu context.
[95f62aa]	250	context_list = _create_some_context()
	251
	252	# Find a context for F32 and for F64 (maybe the same one).
	253	# F16 isn't good enough.
	254	self.context = {}
	255	for dtype in (F32, F64):
	256	for context in context_list:
	257	if has_type(context.devices[0], dtype):
	258	self.context[dtype] = context
	259	break
	260	else:
	261	self.context[dtype] = None
	262
[3199b17]	263	# Build a queue for each context.
[95f62aa]	264	self.queue = {}
	265	context = self.context[F32]
	266	self.queue[F32] = cl.CommandQueue(context, context.devices[0])
	267	if self.context[F64] == self.context[F32]:
	268	self.queue[F64] = self.queue[F32]
	269	else:
	270	context = self.context[F64]
	271	self.queue[F64] = cl.CommandQueue(context, context.devices[0])
[250fa25]	272
[3199b17]	273	## Byte boundary for data alignment.
[95f62aa]	274	#self.data_boundary = max(context.devices[0].min_data_type_align_size
	275	# for context in self.context.values())
	276
[3199b17]	277	# Cache for compiled programs, and for items in context.
[ce27e21]	278	self.compiled = {}
	279
[5d316e9]	280	def has_type(self, dtype):
[dd7fc12]	281	# type: (np.dtype) -> bool
[eafc9fa]	282	"""
	283	Return True if all devices support a given type.
	284	"""
[95f62aa]	285	return self.context.get(dtype, None) is not None
[250fa25]	286
[300a2f7]	287	def compile_program(self, name, source, dtype, fast, timestamp):
	288	# type: (str, str, np.dtype, bool, float) -> cl.Program
[eafc9fa]	289	"""
	290	Compile the program for the device in the given context.
	291	"""
[300a2f7]	292	# Note: PyOpenCL caches based on md5 hash of source, options and device
[3199b17]	293	# but I'll do so as well just to save some data munging time.
[7fcdc9f]	294	tag = generate.tag_source(source)
	295	key = "%s-%s-%s%s"%(name, dtype, tag, ("-fast" if fast else ""))
[3199b17]	296	# Check timestamp on program.
[300a2f7]	297	program, program_timestamp = self.compiled.get(key, (None, np.inf))
	298	if program_timestamp < timestamp:
	299	del self.compiled[key]
[cde11f0f]	300	if key not in self.compiled:
[95f62aa]	301	context = self.context[dtype]
[20317b3]	302	logging.info("building %s for OpenCL %s", key,
	303	context.devices[0].name.strip())
[95f62aa]	304	program = compile_model(self.context[dtype],
[fec69dd]	305	str(source), dtype, fast)
[300a2f7]	306	self.compiled[key] = (program, timestamp)
	307	return program
[14de349]	308
[3199b17]	309
[95f62aa]	310	def _create_some_context():
	311	# type: () -> cl.Context
	312	"""
	313	Protected call to cl.create_some_context without interactivity.
	314
	315	Uses SAS_OPENCL or PYOPENCL_CTX if they are set in the environment,
	316	otherwise scans for the most appropriate device using
[d5ce7fa]	317	:func:`_get_default_context`. Ignore SAS_OPENCL=OpenCL, which
	318	indicates that an OpenCL device should be used without specifying
	319	which one (and not a CUDA device, or no GPU).
[95f62aa]	320	"""
[3199b17]	321	# Assume we do not get here if SAS_OPENCL is None or CUDA.
[07646b6]	322	sas_opencl = os.environ.get('SAS_OPENCL', 'opencl')
	323	if sas_opencl.lower() != 'opencl':
[3199b17]	324	# Setting PYOPENCL_CTX as a SAS_OPENCL to create cl context.
[07646b6]	325	os.environ["PYOPENCL_CTX"] = sas_opencl
[95f62aa]	326
	327	if 'PYOPENCL_CTX' in os.environ:
	328	try:
	329	return [cl.create_some_context(interactive=False)]
	330	except Exception as exc:
[069743a]	331	# TODO: Should warnings instead be put into logging.warn?
[95f62aa]	332	warnings.warn(str(exc))
[b297ba9]	333	warnings.warn(
	334	"pyopencl.create_some_context() failed. The environment "
	335	"variable 'SAS_OPENCL' or 'PYOPENCL_CTX' might not be set "
	336	"correctly")
[95f62aa]	337
	338	return _get_default_context()
	339
[3199b17]	340
[3c56da87]	341	def _get_default_context():
[20317b3]	342	# type: () -> List[cl.Context]
[eafc9fa]	343	"""
[d18582e]	344	Get an OpenCL context, preferring GPU over CPU, and preferring Intel
	345	drivers over AMD drivers.
[eafc9fa]	346	"""
[d18582e]	347	# Note: on mobile devices there is automatic clock scaling if either the
	348	# CPU or the GPU is underutilized; probably doesn't affect us, but we if
	349	# it did, it would mean that putting a busy loop on the CPU while the GPU
	350	# is running may increase throughput.
	351	#
[3199b17]	352	# MacBook Pro, base install:
[d18582e]	353	# {'Apple': [Intel CPU, NVIDIA GPU]}
[3199b17]	354	# MacBook Pro, base install:
[d18582e]	355	# {'Apple': [Intel CPU, Intel GPU]}
[3199b17]	356	# 2 x NVIDIA 295 with Intel and NVIDIA opencl drivers install:
[d18582e]	357	# {'Intel': [CPU], 'NVIDIA': [GPU, GPU, GPU, GPU]}
	358	gpu, cpu = None, None
[3c56da87]	359	for platform in cl.get_platforms():
[e6a5556]	360	# AMD provides a much weaker CPU driver than Intel/Apple, so avoid it.
[20317b3]	361	# If someone has bothered to install the AMD/NVIDIA drivers, prefer
	362	# them over the integrated graphics driver that may have been supplied
	363	# with the CPU chipset.
	364	preferred_cpu = (platform.vendor.startswith('Intel')
	365	or platform.vendor.startswith('Apple'))
	366	preferred_gpu = (platform.vendor.startswith('Advanced')
	367	or platform.vendor.startswith('NVIDIA'))
[3c56da87]	368	for device in platform.get_devices():
	369	if device.type == cl.device_type.GPU:
[20317b3]	370	# If the existing type is not GPU then it will be CUSTOM
	371	# or ACCELERATOR so don't override it.
[e6a5556]	372	if gpu is None or (preferred_gpu and gpu.type == cl.device_type.GPU):
	373	gpu = device
	374	elif device.type == cl.device_type.CPU:
	375	if cpu is None or preferred_cpu:
	376	cpu = device
[d18582e]	377	else:
[e6a5556]	378	# System has cl.device_type.ACCELERATOR or cl.device_type.CUSTOM
[3199b17]	379	# Intel Phi for example registers as an accelerator.
[20317b3]	380	# Since the user installed a custom device on their system
	381	# and went through the pain of sorting out OpenCL drivers for
	382	# it, lets assume they really do want to use it as their
	383	# primary compute device.
[e6a5556]	384	gpu = device
[199d40d]	385
[3199b17]	386	# Order the devices by gpu then by cpu; when searching for an available
[20317b3]	387	# device by data type they will be checked in this order, which means
	388	# that if the gpu supports double then the cpu will never be used (though
	389	# we may make it possible to explicitly request the cpu at some point).
[e6a5556]	390	devices = []
	391	if gpu is not None:
	392	devices.append(gpu)
	393	if cpu is not None:
	394	devices.append(cpu)
	395	return [cl.Context([d]) for d in devices]
[3c56da87]	396
[250fa25]	397
[f619de7]	398	class GpuModel(KernelModel):
[14de349]	399	"""
	400	GPU wrapper for a single model.
	401
[17bbadd]	402	source and model_info are the model source and interface as returned
	403	from :func:`generate.make_source` and :func:`generate.make_model_info`.
[14de349]	404
	405	dtype is the desired model precision. Any numpy dtype for single
	406	or double precision floats will do, such as 'f', 'float32' or 'single'
	407	for single and 'd', 'float64' or 'double' for double. Double precision
	408	is an optional extension which may not be available on all devices.
[cde11f0f]	409	Half precision ('float16','half') may be available on some devices.
	410	Fast precision ('fast') is a loose version of single precision, indicating
	411	that the compiler is allowed to take shortcuts.
[14de349]	412	"""
[3199b17]	413	info = None # type: ModelInfo
	414	source = "" # type: str
	415	dtype = None # type: np.dtype
	416	fast = False # type: bool
	417	_program = None # type: cl.Program
	418	_kernels = None # type: Dict[str, cl.Kernel]
[7126c04]	419
[dd7fc12]	420	def __init__(self, source, model_info, dtype=generate.F32, fast=False):
[a4280bd]	421	# type: (Dict[str,str], ModelInfo, np.dtype, bool) -> None
[17bbadd]	422	self.info = model_info
[ce27e21]	423	self.source = source
[dd7fc12]	424	self.dtype = dtype
	425	self.fast = fast
[14de349]	426
[ce27e21]	427	def __getstate__(self):
[dd7fc12]	428	# type: () -> Tuple[ModelInfo, str, np.dtype, bool]
[eafc9fa]	429	return self.info, self.source, self.dtype, self.fast
[14de349]	430
[ce27e21]	431	def __setstate__(self, state):
[dd7fc12]	432	# type: (Tuple[ModelInfo, str, np.dtype, bool]) -> None
[eafc9fa]	433	self.info, self.source, self.dtype, self.fast = state
[7126c04]	434	self._program = self._kernels = None
[ce27e21]	435
[9eb3632]	436	def make_kernel(self, q_vectors):
[dd7fc12]	437	# type: (List[np.ndarray]) -> "GpuKernel"
[95f62aa]	438	return GpuKernel(self, q_vectors)
	439
[7126c04]	440	def get_function(self, name):
[95f62aa]	441	# type: (str) -> cl.Kernel
	442	"""
	443	Fetch the kernel from the environment by name, compiling it if it
	444	does not already exist.
	445	"""
[7126c04]	446	if self._program is None:
	447	self._prepare_program()
	448	return self._kernels[name]
	449
	450	def _prepare_program(self):
	451	# type: (str) -> None
	452	env = environment()
	453	timestamp = generate.ocl_timestamp(self.info)
	454	program = env.compile_program(
	455	self.info.name,
	456	self.source['opencl'],
	457	self.dtype,
	458	self.fast,
	459	timestamp)
	460	variants = ['Iq', 'Iqxy', 'Imagnetic']
	461	names = [generate.kernel_name(self.info, k) for k in variants]
[00afc15]	462	functions = [getattr(program, k) for k in names]
	463	self._kernels = {k: v for k, v in zip(variants, functions)}
[3199b17]	464	# Keep a handle to program so GC doesn't collect.
[7126c04]	465	self._program = program
[14de349]	466
[3199b17]	467
	468	# TODO: Check that we don't need a destructor for buffers which go out of scope.
[14de349]	469	class GpuInput(object):
	470	"""
	471	Make q data available to the gpu.
	472
	473	q_vectors is a list of q vectors, which will be [q] for 1-D data,
	474	and [qx, qy] for 2-D data. Internally, the vectors will be reallocated
	475	to get the best performance on OpenCL, which may involve shifting and
	476	stretching the array to better match the memory architecture. Additional
	477	points will be evaluated with q=1e-3.
	478
	479	dtype is the data type for the q vectors. The data type should be
	480	set to match that of the kernel, which is an attribute of
	481	:class:`GpuProgram`. Note that not all kernels support double
	482	precision, so even if the program was created for double precision,
	483	the GpuProgram.dtype may be single precision.
	484
	485	Call :meth:`release` when complete. Even if not called directly, the
	486	buffer will be released when the data object is freed.
	487	"""
[cb6ecf4]	488	def __init__(self, q_vectors, dtype=generate.F32):
[dd7fc12]	489	# type: (List[np.ndarray], np.dtype) -> None
[3199b17]	490	# TODO: Do we ever need double precision q?
[14de349]	491	self.nq = q_vectors[0].size
	492	self.dtype = np.dtype(dtype)
[eafc9fa]	493	self.is_2d = (len(q_vectors) == 2)
[3199b17]	494	# TODO: Stretch input based on get_warp().
	495	# Not doing it now since warp depends on kernel, which is not known
[f5b9a6b]	496	# at this point, so instead using 32, which is good on the set of
	497	# architectures tested so far.
[c072f83]	498	if self.is_2d:
[07646b6]	499	width = ((self.nq+15)//16)*16
[c072f83]	500	self.q = np.empty((width, 2), dtype=dtype)
	501	self.q[:self.nq, 0] = q_vectors[0]
	502	self.q[:self.nq, 1] = q_vectors[1]
	503	else:
[07646b6]	504	width = ((self.nq+31)//32)*32
[c072f83]	505	self.q = np.empty(width, dtype=dtype)
	506	self.q[:self.nq] = q_vectors[0]
	507	self.global_size = [self.q.shape[0]]
[7126c04]	508	#print("creating inputs of size", self.global_size)
[95f62aa]	509
[3199b17]	510	# Transfer input value to GPU.
[95f62aa]	511	env = environment()
[7126c04]	512	context = env.context[self.dtype]
	513	self.q_b = cl.Buffer(context, mf.READ_ONLY \| mf.COPY_HOST_PTR,
	514	hostbuf=self.q)
[14de349]	515
	516	def release(self):
[dd7fc12]	517	# type: () -> None
[eafc9fa]	518	"""
[3199b17]	519	Free the buffer associated with the q value.
[eafc9fa]	520	"""
[7126c04]	521	if self.q_b is not None:
	522	self.q_b.release()
	523	self.q_b = None
[14de349]	524
[eafc9fa]	525	def __del__(self):
[dd7fc12]	526	# type: () -> None
[eafc9fa]	527	self.release()
	528
[3199b17]	529
[f619de7]	530	class GpuKernel(Kernel):
[ff7119b]	531	"""
	532	Callable SAS kernel.
	533
[95f62aa]	534	model is the GpuModel object to call
[ff7119b]	535
[7126c04]	536	The kernel is derived from :class:`Kernel`, providing the
	537	:meth:`call_kernel` method to evaluate the kernel for a given set of
	538	parameters. Because of the need to move the q values to the GPU before
	539	evaluation, the kernel is instantiated for a particular set of q vectors,
	540	and can be called many times without transfering q each time.
[ff7119b]	541
	542	Call :meth:`release` when done with the kernel instance.
	543	"""
[3199b17]	544	#: SAS model information structure.
	545	info = None # type: ModelInfo
	546	#: Kernel precision.
	547	dtype = None # type: np.dtype
	548	#: Kernel dimensions (1d or 2d).
	549	dim = "" # type: str
	550	#: Calculation results, updated after each call to :meth:`_call_kernel`.
	551	result = None # type: np.ndarray
[7126c04]	552
[95f62aa]	553	def __init__(self, model, q_vectors):
[7126c04]	554	# type: (GpuModel, List[np.ndarray]) -> None
[95f62aa]	555	dtype = model.dtype
	556	self.q_input = GpuInput(q_vectors, dtype)
	557	self._model = model
	558
[3199b17]	559	# Attributes accessed from the outside.
[95f62aa]	560	self.dim = '2d' if self.q_input.is_2d else '1d'
	561	self.info = model.info
[3199b17]	562	self.dtype = dtype
[95f62aa]	563
[3199b17]	564	# Converter to translate input to target type.
	565	self._as_dtype = np.float64 if dtype == generate.F64 else np.float32
	566
	567	# Holding place for the returned value.
[5399809]	568	nout = 2 if self.info.have_Fq and self.dim == '1d' else 1
[3199b17]	569	extra_q = 4 # Total weight, form volume, shell volume and R_eff.
	570	self.result = np.empty(self.q_input.nq*nout + extra_q, dtype)
[14de349]	571
[3199b17]	572	# Allocate result value on GPU.
[ce27e21]	573	env = environment()
[7126c04]	574	context = env.context[self.dtype]
	575	width = ((self.result.size+31)//32)32 self.dtype.itemsize
	576	self._result_b = cl.Buffer(context, mf.READ_WRITE, width)
[d18582e]	577
[3199b17]	578	def _call_kernel(self, call_details, values, cutoff, magnetic,
[a34b811]	579	radius_effective_mode):
[3199b17]	580	# type: (CallDetails, np.ndarray, float, bool, int) -> np.ndarray
[95f62aa]	581	env = environment()
	582	queue = env.queue[self._model.dtype]
	583	context = queue.context
	584
[3199b17]	585	# Arrange data transfer to card.
[48fbd50]	586	details_b = cl.Buffer(context, mf.READ_ONLY \| mf.COPY_HOST_PTR,
[8d62008]	587	hostbuf=call_details.buffer)
[48fbd50]	588	values_b = cl.Buffer(context, mf.READ_ONLY \| mf.COPY_HOST_PTR,
	589	hostbuf=values)
	590
[3199b17]	591	# Setup kernel function and arguments.
[95f62aa]	592	name = 'Iq' if self.dim == '1d' else 'Imagnetic' if magnetic else 'Iqxy'
[7126c04]	593	kernel = self._model.get_function(name)
[95f62aa]	594	kernel_args = [
[3199b17]	595	np.uint32(self.q_input.nq), # Number of inputs.
	596	None, # Placeholder for pd_start.
	597	None, # Placeholder for pd_stop.
	598	details_b, # Problem definition.
	599	values_b, # Parameter values.
	600	self.q_input.q_b, # Q values.
	601	self._result_b, # Result storage.
	602	self._as_dtype(cutoff), # Probability cutoff.
[a34b811]	603	np.uint32(radius_effective_mode), # R_eff mode.
[9eb3632]	604	]
[3199b17]	605
	606	# Call kernel and retrieve results.
[9eb3632]	607	#print("Calling OpenCL")
[c036ddb]	608	#call_details.show(values)
[6e5b2a7]	609	wait_for = None
[e2592f0]	610	last_nap = clock()
[6e5b2a7]	611	step = 1000000//self.q_input.nq + 1
[bde38b5]	612	for start in range(0, call_details.num_eval, step):
	613	stop = min(start + step, call_details.num_eval)
[9eb3632]	614	#print("queuing",start,stop)
[95f62aa]	615	kernel_args[1:3] = [np.int32(start), np.int32(stop)]
	616	wait_for = [kernel(queue, self.q_input.global_size, None,
	617	*kernel_args, wait_for=wait_for)]
[6e5b2a7]	618	if stop < call_details.num_eval:
[3199b17]	619	# Allow other processes to run.
[6e5b2a7]	620	wait_for[0].wait()
[e2592f0]	621	current_time = clock()
[6e5b2a7]	622	if current_time - last_nap > 0.5:
[8b31efa]	623	time.sleep(0.001)
[6e5b2a7]	624	last_nap = current_time
[7126c04]	625	cl.enqueue_copy(queue, self.result, self._result_b, wait_for=wait_for)
[bde38b5]	626	#print("result", self.result)
[ae2b6b5]	627
[3199b17]	628	# Free buffers.
[7126c04]	629	details_b.release()
	630	values_b.release()
[14de349]	631
	632	def release(self):
[dd7fc12]	633	# type: () -> None
[eafc9fa]	634	"""
	635	Release resources associated with the kernel.
	636	"""
[95f62aa]	637	self.q_input.release()
[7126c04]	638	if self._result_b is not None:
	639	self._result_b.release()
	640	self._result_b = None
[14de349]	641
	642	def __del__(self):
[dd7fc12]	643	# type: () -> None
[14de349]	644	self.release()

Note: See TracBrowser for help on using the repository browser.

SasView

source: sasmodels/sasmodels/kernelcl.py @ 9fac5f5

Download in other formats: