kernelcl.py @ 0be86aa

core_shell_microgelsmagnetic_modelticket-1257-vesicle-productticket_1156ticket_1265_superballticket_822_more_unit_tests

Last change on this file since 0be86aa was 0be86aa, checked in by Paul Kienzle <pkienzle@…>, 5 years ago
remove unused code
Property mode set to `100644`
File size: 23.9 KB

Rev	Line
[14de349]	1	"""
[eafc9fa]	2	GPU driver for C kernels
[14de349]	3
[b0de252]	4	TODO: docs are out of date
	5
[14de349]	6	There should be a single GPU environment running on the system. This
	7	environment is constructed on the first call to :func:`env`, and the
	8	same environment is returned on each call.
	9
	10	After retrieving the environment, the next step is to create the kernel.
	11	This is done with a call to :meth:`GpuEnvironment.make_kernel`, which
	12	returns the type of data used by the kernel.
	13
	14	Next a :class:`GpuData` object should be created with the correct kind
	15	of data. This data object can be used by multiple kernels, for example,
	16	if the target model is a weighted sum of multiple kernels. The data
	17	should include any extra evaluation points required to compute the proper
	18	data smearing. This need not match the square grid for 2D data if there
	19	is an index saying which q points are active.
	20
	21	Together the GpuData, the program, and a device form a :class:`GpuKernel`.
	22	This kernel is used during fitting, receiving new sets of parameters and
	23	evaluating them. The output value is stored in an output buffer on the
	24	devices, where it can be combined with other structure factors and form
	25	factors and have instrumental resolution effects applied.
[92da231]	26
	27	In order to use OpenCL for your models, you will need OpenCL drivers for
	28	your machine. These should be available from your graphics card vendor.
	29	Intel provides OpenCL drivers for CPUs as well as their integrated HD
	30	graphics chipsets. AMD also provides drivers for Intel CPUs, but as of
	31	this writing the performance is lacking compared to the Intel drivers.
	32	NVidia combines drivers for CUDA and OpenCL in one package. The result
	33	is a bit messy if you have multiple drivers installed. You can see which
	34	drivers are available by starting python and running:
	35
	36	import pyopencl as cl
	37	cl.create_some_context(interactive=True)
	38
	39	Once you have done that, it will show the available drivers which you
	40	can select. It will then tell you that you can use these drivers
[880a2ed]	41	automatically by setting the SAS_OPENCL environment variable, which is
	42	equivalent to PYOPENCL_CTX but does not conflict with other pyopencl programs.
[92da231]	43
	44	Some graphics cards have multiple devices on the same card. You cannot
	45	yet use both of them concurrently to evaluate models, but you can run
	46	the program twice using a different device for each session.
	47
	48	OpenCL kernels are compiled when needed by the device driver. Some
	49	drivers produce compiler output even when there is no error. You
	50	can see the output by setting PYOPENCL_COMPILER_OUTPUT=1. It should be
	51	harmless, albeit annoying.
[14de349]	52	"""
[ba32cdd]	53	from __future__ import print_function
[a5b8477]	54
[250fa25]	55	import os
	56	import warnings
[821a9c6]	57	import logging
[6e5b2a7]	58	import time
[250fa25]	59
[7ae2b7f]	60	import numpy as np # type: ignore
[b3f6bc3]	61
[3221de0]	62
[b0de252]	63	# Attempt to setup opencl. This may fail if the pyopencl package is not
[3221de0]	64	# installed or if it is installed but there are no devices available.
[250fa25]	65	try:
[3221de0]	66	import pyopencl as cl # type: ignore
	67	from pyopencl import mem_flags as mf
	68	from pyopencl.characterize import get_fast_inaccurate_build_options
	69	# Ask OpenCL for the default context so that we know that one exists
	70	cl.create_some_context(interactive=False)
	71	HAVE_OPENCL = True
	72	OPENCL_ERROR = ""
[9404dd3]	73	except Exception as exc:
[6dba2f0]	74	HAVE_OPENCL = False
[3221de0]	75	OPENCL_ERROR = str(exc)
[14de349]	76
[cb6ecf4]	77	from . import generate
[95f62aa]	78	from .generate import F32, F64
[f619de7]	79	from .kernel import KernelModel, Kernel
[14de349]	80
[2d81cfe]	81	# pylint: disable=unused-import
[a5b8477]	82	try:
	83	from typing import Tuple, Callable, Any
	84	from .modelinfo import ModelInfo
	85	from .details import CallDetails
	86	except ImportError:
	87	pass
[2d81cfe]	88	# pylint: enable=unused-import
[a5b8477]	89
# CRUFT: pyopencl < 2017.1 (as of June 2016 needs quotes around include path)
def quote_path(v):
    """
    Return *v* wrapped in double quotes when it needs quoting.

    A value is quoted only if it contains a space and does not already
    begin with a quote character.  Values beginning with '-' are assumed
    to be compiler options (such as -I) and are returned untouched; this
    is fragile, since "-Ipath with space" would itself need quoting.
    Empty or false values are returned as is.
    """
    # Nothing to do for empty values or values without embedded spaces.
    if not v or ' ' not in v:
        return v
    # Already quoted, or looks like a command line option.
    if v[0] in "\"'-":
        return v
    return '"' + v + '"'
	100
	101	def fix_pyopencl_include():
[40a87fa]	102	"""
	103	Monkey patch pyopencl to allow spaces in include file path.
	104	"""
[20317b3]	105	import pyopencl as cl
	106	if hasattr(cl, '_DEFAULT_INCLUDE_OPTIONS'):
	107	cl._DEFAULT_INCLUDE_OPTIONS = [quote_path(v) for v in cl._DEFAULT_INCLUDE_OPTIONS]
	108
[6dba2f0]	109	if HAVE_OPENCL:
	110	fix_pyopencl_include()
[20317b3]	111
[ce27e21]	112	# The max loops number is limited by the amount of local memory available
	113	# on the device. You don't want to make this value too big because it will
	114	# waste resources, nor too small because it may interfere with users trying
	115	# to do their polydispersity calculations. A value of 1024 should be much
	116	# larger than necessary given that cost grows as npts^k where k is the number
	117	# of polydisperse parameters.
[5d4777d]	118	MAX_LOOPS = 2048
	119
[ce27e21]	120
[5464d68]	121	# Pragmas to enable OpenCL features. Be sure to protect them so that they
	122	# still compile even if OpenCL is not present.
	123	_F16_PRAGMA = """\
	124	#if defined(__OPENCL_VERSION__) // && !defined(cl_khr_fp16)
	125	# pragma OPENCL EXTENSION cl_khr_fp16: enable
	126	#endif
	127	"""
	128
	129	_F64_PRAGMA = """\
	130	#if defined(__OPENCL_VERSION__) // && !defined(cl_khr_fp64)
	131	# pragma OPENCL EXTENSION cl_khr_fp64: enable
	132	#endif
	133	"""
	134
def use_opencl():
    """
    Return True if OpenCL is available and not disabled by SAS_OPENCL.

    OpenCL is considered disabled when the SAS_OPENCL environment variable
    is "none" or starts with "cuda" (case insensitive).
    """
    if not HAVE_OPENCL:
        return False
    requested = os.environ.get("SAS_OPENCL", "OpenCL").lower()
    if requested == "none":
        return False
    return not requested.startswith("cuda")
[5464d68]	138
# Singleton GPU environment; created lazily by environment().
ENV = None
def reset_environment():
    """
    Rebuild the global OpenCL environment.

    Call this to create a new OpenCL context, such as after a change to
    SAS_OPENCL.  The global is set to None when :func:`use_opencl` says
    OpenCL should not be used.
    """
    global ENV
    if use_opencl():
        ENV = GpuEnvironment()
    else:
        ENV = None
	146
def environment():
    # type: () -> "GpuEnvironment"
    """
    Return the singleton :class:`GpuEnvironment`, creating it on first use.

    Raises *RuntimeError* if pyopencl failed to start when the module was
    imported, or if the environment is still missing after
    :func:`reset_environment` has run (OpenCL disabled via SAS_OPENCL).
    """
    # Fast path: the environment already exists.
    if ENV is not None:
        return ENV

    if not HAVE_OPENCL:
        message = "".join(("OpenCL startup failed with ***",
                           OPENCL_ERROR,
                           "***; using C compiler instead"))
        raise RuntimeError(message)

    reset_environment()
    if ENV is None:
        raise RuntimeError("SAS_OPENCL=None in environment")
    return ENV
	162
def has_type(device, dtype):
    # type: (cl.Device, np.dtype) -> bool
    """
    Return True if *device* supports the requested precision *dtype*.

    Single precision is always reported as supported; double precision
    requires the ``cl_khr_fp64`` extension on the device.
    """
    if dtype == F32:
        return True
    if dtype == F64:
        return "cl_khr_fp64" in device.extensions
    # Not supporting F16 type since it isn't accurate enough
    return False
[14de349]	175
def get_warp(kernel, queue):
    # type: (cl.Kernel, cl.CommandQueue) -> int
    """
    Return the preferred work group size multiple for *kernel* on the
    device attached to *queue* (the size of an execution batch).
    """
    info_key = cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE
    return kernel.get_work_group_info(info_key, queue.device)
[14de349]	184
def compile_model(context, source, dtype, fast=False):
    # type: (cl.Context, str, np.dtype, bool) -> cl.Program
    """
    Build a model to run on the gpu.

    *source* is converted to the requested *dtype*, prefixed with the
    pragmas and defines needed by that precision and device, and built in
    *context*.  If *fast* is set, the fast-but-inaccurate build options
    for the first device in the context are used.

    Returns the compiled program.

    Raises *RuntimeError* if the desired precision is not available on
    every device in the context.
    """
    dtype = np.dtype(dtype)
    for device in context.devices:
        if not has_type(device, dtype):
            raise RuntimeError("%s not supported for devices"%dtype)

    converted = generate.convert_type(source, dtype)

    # Assemble the preamble in the order it appears in the final source:
    # the optional USE_SINCOS define, then the precision pragma.
    preamble = []
    primary = context.devices[0]
    # Note: USE_SINCOS makes the intel cpu slower under opencl
    if primary.type == cl.device_type.GPU:
        preamble.append("#define USE_SINCOS\n")
    if dtype == generate.F16:
        preamble.append(_F16_PRAGMA)
    elif dtype == generate.F64:
        preamble.append(_F64_PRAGMA)

    if fast:
        options = get_fast_inaccurate_build_options(context.devices[0])
    else:
        options = []
    full_source = "\n".join(preamble + [converted])
    return cl.Program(context, full_source).build(options=options)
[14de349]	214
	215
	216	# for now, this returns one device in the context
	217	# TODO: create a context that contains all devices on all platforms
	218	class GpuEnvironment(object):
	219	"""
	220	GPU context, with possibly many devices, and one queue per device.
[95f62aa]	221
	222	Because the environment can be reset during a live program (e.g., if the
	223	user changes the active GPU device in the GUI), everything associated
	224	with the device context must be cached in the environment and recreated
	225	if the environment changes. The cache attribute is a simple dictionary
	226	which holds keys and references to objects, such as compiled kernels and
	227	allocated buffers. The running program should check in the cache for
	228	long lived objects and create them if they are not there. The program
	229	should not hold onto cached objects, but instead only keep them active
	230	for the duration of a function call. When the environment is destroyed
	231	then the release method for each active cache item is called before
	232	the environment is freed. This means that each cl buffer should be
	233	in its own cache entry.
[14de349]	234	"""
	235	def __init__(self):
[dd7fc12]	236	# type: () -> None
[250fa25]	237	# find gpu context
[95f62aa]	238	context_list = _create_some_context()
	239
	240	# Find a context for F32 and for F64 (maybe the same one).
	241	# F16 isn't good enough.
	242	self.context = {}
	243	for dtype in (F32, F64):
	244	for context in context_list:
	245	if has_type(context.devices[0], dtype):
	246	self.context[dtype] = context
	247	break
	248	else:
	249	self.context[dtype] = None
	250
	251	# Build a queue for each context
	252	self.queue = {}
	253	context = self.context[F32]
	254	self.queue[F32] = cl.CommandQueue(context, context.devices[0])
	255	if self.context[F64] == self.context[F32]:
	256	self.queue[F64] = self.queue[F32]
	257	else:
	258	context = self.context[F64]
	259	self.queue[F64] = cl.CommandQueue(context, context.devices[0])
[250fa25]	260
[f5b9a6b]	261	# Byte boundary for data alignment
[95f62aa]	262	#self.data_boundary = max(context.devices[0].min_data_type_align_size
	263	# for context in self.context.values())
	264
	265	# Cache for compiled programs, and for items in context
[ce27e21]	266	self.compiled = {}
	267
[5d316e9]	268	def has_type(self, dtype):
[dd7fc12]	269	# type: (np.dtype) -> bool
[eafc9fa]	270	"""
	271	Return True if all devices support a given type.
	272	"""
[95f62aa]	273	return self.context.get(dtype, None) is not None
[250fa25]	274
[300a2f7]	275	def compile_program(self, name, source, dtype, fast, timestamp):
	276	# type: (str, str, np.dtype, bool, float) -> cl.Program
[eafc9fa]	277	"""
	278	Compile the program for the device in the given context.
	279	"""
[300a2f7]	280	# Note: PyOpenCL caches based on md5 hash of source, options and device
	281	# so we don't really need to cache things for ourselves. I'll do so
	282	# anyway just to save some data munging time.
[7fcdc9f]	283	tag = generate.tag_source(source)
	284	key = "%s-%s-%s%s"%(name, dtype, tag, ("-fast" if fast else ""))
[300a2f7]	285	# Check timestamp on program
	286	program, program_timestamp = self.compiled.get(key, (None, np.inf))
	287	if program_timestamp < timestamp:
	288	del self.compiled[key]
[cde11f0f]	289	if key not in self.compiled:
[95f62aa]	290	context = self.context[dtype]
[20317b3]	291	logging.info("building %s for OpenCL %s", key,
	292	context.devices[0].name.strip())
[95f62aa]	293	program = compile_model(self.context[dtype],
[fec69dd]	294	str(source), dtype, fast)
[300a2f7]	295	self.compiled[key] = (program, timestamp)
	296	return program
[14de349]	297
def _create_some_context():
    # type: () -> List[cl.Context]
    """
    Protected call to cl.create_some_context without interactivity.

    Uses SAS_OPENCL or PYOPENCL_CTX if they are set in the environment,
    otherwise scans for the most appropriate device using
    :func:`_get_default_context`.  Ignore SAS_OPENCL=OpenCL, which
    indicates that an OpenCL device should be used without specifying
    which one (and not a CUDA device, or no GPU).

    Returns a list of contexts.  Note that a SAS_OPENCL value other than
    "opencl" is copied into the PYOPENCL_CTX environment variable.
    """
    # Assume we do not get here if SAS_OPENCL is None or CUDA
    selection = os.environ.get('SAS_OPENCL', 'opencl')
    if selection.lower() != 'opencl':
        # Setting PYOPENCL_CTX as a SAS_OPENCL to create cl context
        os.environ["PYOPENCL_CTX"] = selection

    if 'PYOPENCL_CTX' not in os.environ:
        return _get_default_context()

    try:
        return [cl.create_some_context(interactive=False)]
    except Exception as exc:
        # Report the failure, then fall back to scanning for a device.
        for message in (
                str(exc),
                "pyopencl.create_some_context() failed",
                "the environment variable 'SAS_OPENCL' or 'PYOPENCL_CTX' might not be set correctly",
        ):
            warnings.warn(message)

    return _get_default_context()
	324
def _get_default_context():
    # type: () -> List[cl.Context]
    """
    Get an OpenCL context, preferring GPU over CPU, and preferring Intel
    drivers over AMD drivers.

    Returns a list with one single-device context per selected device,
    the GPU (or accelerator/custom device) first and then the CPU.
    """
    # Note: on mobile devices there is automatic clock scaling if either the
    # CPU or the GPU is underutilized; probably doesn't affect us, but if
    # it did, it would mean that putting a busy loop on the CPU while the GPU
    # is running may increase throughput.
    #
    # Macbook pro, base install:
    #     {'Apple': [Intel CPU, NVIDIA GPU]}
    # Macbook pro, base install:
    #     {'Apple': [Intel CPU, Intel GPU]}
    # 2 x nvidia 295 with Intel and NVIDIA opencl drivers installed
    #     {'Intel': [CPU], 'NVIDIA': [GPU, GPU, GPU, GPU]}
    chosen_gpu = None
    chosen_cpu = None
    for platform in cl.get_platforms():
        vendor = platform.vendor
        # AMD provides a much weaker CPU driver than Intel/Apple, so avoid it.
        # If someone has bothered to install the AMD/NVIDIA drivers, prefer
        # them over the integrated graphics driver that may have been supplied
        # with the CPU chipset.
        favoured_cpu = vendor.startswith(('Intel', 'Apple'))
        favoured_gpu = vendor.startswith(('Advanced', 'NVIDIA'))
        for device in platform.get_devices():
            kind = device.type
            if kind == cl.device_type.GPU:
                # If the existing type is not GPU then it will be CUSTOM
                # or ACCELERATOR so don't override it.
                replace = (chosen_gpu is None
                           or (favoured_gpu
                               and chosen_gpu.type == cl.device_type.GPU))
                if replace:
                    chosen_gpu = device
            elif kind == cl.device_type.CPU:
                if chosen_cpu is None or favoured_cpu:
                    chosen_cpu = device
            else:
                # System has cl.device_type.ACCELERATOR or cl.device_type.CUSTOM
                # Intel Phi for example registers as an accelerator
                # Since the user installed a custom device on their system
                # and went through the pain of sorting out OpenCL drivers for
                # it, lets assume they really do want to use it as their
                # primary compute device.
                chosen_gpu = device

    # order the devices by gpu then by cpu; when searching for an available
    # device by data type they will be checked in this order, which means
    # that if the gpu supports double then the cpu will never be used (though
    # we may make it possible to explicitly request the cpu at some point).
    return [cl.Context([device])
            for device in (chosen_gpu, chosen_cpu)
            if device is not None]
[3c56da87]	380
[250fa25]	381
class GpuModel(KernelModel):
    """
    GPU wrapper for a single model.

    *source* and *model_info* are the model source and interface as returned
    from :func:`generate.make_source` and :func:`generate.make_model_info`.

    *dtype* is the desired model precision.  Any numpy dtype for single
    or double precision floats will do, such as 'f', 'float32' or 'single'
    for single and 'd', 'float64' or 'double' for double.  Double precision
    is an optional extension which may not be available on all devices.
    Half precision ('float16','half') may be available on some devices.
    Fast precision ('fast') is a loose version of single precision, indicating
    that the compiler is allowed to take shortcuts.

    The OpenCL program is compiled lazily, on the first request for one of
    its kernels, and is not included in the pickled state.
    """
    info = None  # type: ModelInfo
    source = ""  # type: str
    dtype = None  # type: np.dtype
    fast = False  # type: bool
    _program = None  # type: cl.Program
    _kernels = None  # type: Dict[str, cl.Kernel]

    def __init__(self, source, model_info, dtype=generate.F32, fast=False):
        # type: (Dict[str,str], ModelInfo, np.dtype, bool) -> None
        self.info, self.source = model_info, source
        self.dtype, self.fast = dtype, fast

    def __getstate__(self):
        # type: () -> Tuple[ModelInfo, str, np.dtype, bool]
        return (self.info, self.source, self.dtype, self.fast)

    def __setstate__(self, state):
        # type: (Tuple[ModelInfo, str, np.dtype, bool]) -> None
        self.info, self.source, self.dtype, self.fast = state
        # Compiled objects are never pickled; rebuild them on demand.
        self._program = None
        self._kernels = None

    def make_kernel(self, q_vectors):
        # type: (List[np.ndarray]) -> "GpuKernel"
        """
        Return a :class:`GpuKernel` bound to this model and *q_vectors*.
        """
        return GpuKernel(self, q_vectors)

    def get_function(self, name):
        # type: (str) -> cl.Kernel
        """
        Fetch the kernel from the environment by name, compiling it if it
        does not already exist.
        """
        if self._program is None:
            self._prepare_program()
        return self._kernels[name]

    def _prepare_program(self):
        # type: () -> None
        """
        Compile the OpenCL program and collect its kernel entry points.
        """
        env = environment()
        timestamp = generate.ocl_timestamp(self.info)
        opencl_source = self.source['opencl']
        program = env.compile_program(
            self.info.name, opencl_source, self.dtype, self.fast, timestamp)

        kernels = {}
        for variant in ('Iq', 'Iqxy', 'Imagnetic'):
            entry_point = generate.kernel_name(self.info, variant)
            kernels[variant] = getattr(program, entry_point)
        self._kernels = kernels
        # keep a handle to program so GC doesn't collect
        self._program = program
[14de349]	450
	451	# TODO: check that we don't need a destructor for buffers which go out of scope
	452	class GpuInput(object):
	453	"""
	454	Make q data available to the gpu.
	455
	456	q_vectors is a list of q vectors, which will be [q] for 1-D data,
	457	and [qx, qy] for 2-D data. Internally, the vectors will be reallocated
	458	to get the best performance on OpenCL, which may involve shifting and
	459	stretching the array to better match the memory architecture. Additional
	460	points will be evaluated with q=1e-3.
	461
	462	dtype is the data type for the q vectors. The data type should be
	463	set to match that of the kernel, which is an attribute of
	464	:class:`GpuProgram`. Note that not all kernels support double
	465	precision, so even if the program was created for double precision,
	466	the GpuProgram.dtype may be single precision.
	467
	468	Call :meth:`release` when complete. Even if not called directly, the
	469	buffer will be released when the data object is freed.
	470	"""
[cb6ecf4]	471	def __init__(self, q_vectors, dtype=generate.F32):
[dd7fc12]	472	# type: (List[np.ndarray], np.dtype) -> None
[17bbadd]	473	# TODO: do we ever need double precision q?
[14de349]	474	self.nq = q_vectors[0].size
	475	self.dtype = np.dtype(dtype)
[eafc9fa]	476	self.is_2d = (len(q_vectors) == 2)
[f5b9a6b]	477	# TODO: stretch input based on get_warp()
	478	# not doing it now since warp depends on kernel, which is not known
	479	# at this point, so instead using 32, which is good on the set of
	480	# architectures tested so far.
[c072f83]	481	if self.is_2d:
[07646b6]	482	width = ((self.nq+15)//16)*16
[c072f83]	483	self.q = np.empty((width, 2), dtype=dtype)
	484	self.q[:self.nq, 0] = q_vectors[0]
	485	self.q[:self.nq, 1] = q_vectors[1]
	486	else:
[07646b6]	487	width = ((self.nq+31)//32)*32
[c072f83]	488	self.q = np.empty(width, dtype=dtype)
	489	self.q[:self.nq] = q_vectors[0]
	490	self.global_size = [self.q.shape[0]]
[7126c04]	491	#print("creating inputs of size", self.global_size)
[95f62aa]	492
[7126c04]	493	# transfer input value to gpu
[95f62aa]	494	env = environment()
[7126c04]	495	context = env.context[self.dtype]
	496	self.q_b = cl.Buffer(context, mf.READ_ONLY \| mf.COPY_HOST_PTR,
	497	hostbuf=self.q)
[14de349]	498
	499	def release(self):
[dd7fc12]	500	# type: () -> None
[eafc9fa]	501	"""
[95f62aa]	502	Free the buffer associated with the q value
[eafc9fa]	503	"""
[7126c04]	504	if self.q_b is not None:
	505	self.q_b.release()
	506	self.q_b = None
[14de349]	507
[eafc9fa]	508	def __del__(self):
[dd7fc12]	509	# type: () -> None
[eafc9fa]	510	self.release()
	511
class GpuKernel(Kernel):
    """
    Callable SAS kernel.

    *model* is the GpuModel object to call

    The kernel is derived from :class:`Kernel`, providing the
    :meth:`call_kernel` method to evaluate the kernel for a given set of
    parameters.  Because of the need to move the q values to the GPU before
    evaluation, the kernel is instantiated for a particular set of q vectors,
    and can be called many times without transferring q each time.

    Call :meth:`release` when done with the kernel instance.
    """
    #: SAS model information structure
    info = None  # type: ModelInfo
    #: kernel precision
    dtype = None  # type: np.dtype
    #: kernel dimensions (1d or 2d)
    dim = ""  # type: str
    #: calculation results, updated after each call to :meth:`_call_kernel`
    result = None  # type: np.ndarray

    # Class-level defaults so that release() and __del__ are safe even when
    # __init__ fails part way through.
    q_input = None
    _result_b = None

    def __init__(self, model, q_vectors):
        # type: (GpuModel, List[np.ndarray]) -> None
        dtype = model.dtype
        self.q_input = GpuInput(q_vectors, dtype)
        self._model = model
        # F16 isn't sufficient, so don't support it
        self._as_dtype = np.float64 if dtype == generate.F64 else np.float32

        # attributes accessed from the outside
        self.dim = '2d' if self.q_input.is_2d else '1d'
        self.info = model.info
        self.dtype = model.dtype

        # holding place for the returned value
        nout = 2 if self.info.have_Fq and self.dim == '1d' else 1
        extra_q = 4  # total weight, form volume, shell volume and R_eff
        self.result = np.empty(self.q_input.nq*nout+extra_q, dtype)

        # allocate result value on gpu; the size in bytes is the number of
        # result entries rounded up to a multiple of 32, times the item size
        env = environment()
        context = env.context[self.dtype]
        width = ((self.result.size+31)//32)*32 * self.dtype.itemsize
        self._result_b = cl.Buffer(context, mf.READ_WRITE, width)

    def _call_kernel(self, call_details, values, cutoff, magnetic, effective_radius_type):
        # type: (CallDetails, np.ndarray, float, bool, int) -> None
        """
        Run the kernel over all evaluation points in *call_details* and copy
        the device result into :attr:`result`.

        The work is queued in slices of roughly one million q evaluations,
        with a short sleep at most every half second between slices so that
        other processes get a chance to run.
        """
        env = environment()
        queue = env.queue[self._model.dtype]
        context = queue.context

        # time.clock() was removed in Python 3.8; prefer perf_counter and
        # only fall back to clock on interpreters that lack it.
        clock = getattr(time, 'perf_counter', None) or time.clock

        details_b = values_b = None
        try:
            # Arrange data transfer to/from card
            details_b = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR,
                                  hostbuf=call_details.buffer)
            values_b = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR,
                                 hostbuf=values)

            name = 'Iq' if self.dim == '1d' else 'Imagnetic' if magnetic else 'Iqxy'
            kernel = self._model.get_function(name)
            kernel_args = [
                # start and stop (slots 1 and 2) are filled in per slice
                np.uint32(self.q_input.nq), None, None,
                details_b, values_b, self.q_input.q_b, self._result_b,
                self._as_dtype(cutoff),
                np.uint32(effective_radius_type),
            ]
            #print("Calling OpenCL")
            #call_details.show(values)
            # Call kernel and retrieve results
            wait_for = None
            last_nap = clock()
            step = 1000000//self.q_input.nq + 1
            for start in range(0, call_details.num_eval, step):
                stop = min(start + step, call_details.num_eval)
                #print("queuing",start,stop)
                kernel_args[1:3] = [np.int32(start), np.int32(stop)]
                wait_for = [kernel(queue, self.q_input.global_size, None,
                                   *kernel_args, wait_for=wait_for)]
                if stop < call_details.num_eval:
                    # Allow other processes to run
                    wait_for[0].wait()
                    current_time = clock()
                    if current_time - last_nap > 0.5:
                        time.sleep(0.001)
                        last_nap = current_time
            cl.enqueue_copy(queue, self.result, self._result_b, wait_for=wait_for)
            #print("result", self.result)
        finally:
            # Free the per-call buffers even if the kernel launch failed.
            for buf in (details_b, values_b):
                if buf is not None:
                    buf.release()

    def release(self):
        # type: () -> None
        """
        Release resources associated with the kernel.

        Safe to call more than once.
        """
        if self.q_input is not None:
            self.q_input.release()
        if self._result_b is not None:
            self._result_b.release()
            self._result_b = None

    def __del__(self):
        # type: () -> None
        self.release()

Note: See TracBrowser for help on using the repository browser.

SasView

source: sasmodels/sasmodels/kernelcl.py @ 0be86aa

Download in other formats: