kernelcl.py @ d5ce7fa

core_shell_microgelsmagnetic_modelticket-1257-vesicle-productticket_1156ticket_1265_superballticket_822_more_unit_tests

Last change on this file since d5ce7fa was d5ce7fa, checked in by Paul Kienzle <pkienzle@…>, 6 years ago
Merge branch 'ticket-1015-gpu-mem-error' into cuda-test
Property mode set to `100644`
File size: 25.0 KB

Rev	Line
[14de349]	1	"""
[eafc9fa]	2	GPU driver for C kernels
[14de349]	3
[b0de252]	4	TODO: docs are out of date
	5
[14de349]	6	There should be a single GPU environment running on the system. This
	7	environment is constructed on the first call to :func:`env`, and the
	8	same environment is returned on each call.
	9
	10	After retrieving the environment, the next step is to create the kernel.
	11	This is done with a call to :meth:`GpuEnvironment.make_kernel`, which
	12	returns the type of data used by the kernel.
	13
	14	Next a :class:`GpuData` object should be created with the correct kind
	15	of data. This data object can be used by multiple kernels, for example,
	16	if the target model is a weighted sum of multiple kernels. The data
	17	should include any extra evaluation points required to compute the proper
	18	data smearing. This need not match the square grid for 2D data if there
	19	is an index saying which q points are active.
	20
	21	Together the GpuData, the program, and a device form a :class:`GpuKernel`.
	22	This kernel is used during fitting, receiving new sets of parameters and
	23	evaluating them. The output value is stored in an output buffer on the
	24	devices, where it can be combined with other structure factors and form
	25	factors and have instrumental resolution effects applied.
[92da231]	26
	27	In order to use OpenCL for your models, you will need OpenCL drivers for
	28	your machine. These should be available from your graphics card vendor.
	29	Intel provides OpenCL drivers for CPUs as well as their integrated HD
	30	graphics chipsets. AMD also provides drivers for Intel CPUs, but as of
	31	this writing the performance is lacking compared to the Intel drivers.
	32	NVidia combines drivers for CUDA and OpenCL in one package. The result
	33	is a bit messy if you have multiple drivers installed. You can see which
	34	drivers are available by starting python and running:
	35
	36	import pyopencl as cl
	37	cl.create_some_context(interactive=True)
	38
	39	Once you have done that, it will show the available drivers which you
	40	can select. It will then tell you that you can use these drivers
[880a2ed]	41	automatically by setting the SAS_OPENCL environment variable, which is
	42	equivalent to PYOPENCL_CTX but does not conflict with other pyopencl programs.
[92da231]	43
	44	Some graphics cards have multiple devices on the same card. You cannot
	45	yet use both of them concurrently to evaluate models, but you can run
	46	the program twice using a different device for each session.
	47
	48	OpenCL kernels are compiled when needed by the device driver. Some
	49	drivers produce compiler output even when there is no error. You
	50	can see the output by setting PYOPENCL_COMPILER_OUTPUT=1. It should be
	51	harmless, albeit annoying.
[14de349]	52	"""
[ba32cdd]	53	from __future__ import print_function
[a5b8477]	54
[250fa25]	55	import os
	56	import warnings
[821a9c6]	57	import logging
[6e5b2a7]	58	import time
[250fa25]	59
[7ae2b7f]	60	import numpy as np # type: ignore
[b3f6bc3]	61
[3221de0]	62
[b0de252]	63	# Attempt to setup opencl. This may fail if the pyopencl package is not
[3221de0]	64	# installed or if it is installed but there are no devices available.
[250fa25]	65	try:
[3221de0]	66	import pyopencl as cl # type: ignore
	67	from pyopencl import mem_flags as mf
	68	from pyopencl.characterize import get_fast_inaccurate_build_options
	69	# Ask OpenCL for the default context so that we know that one exists
	70	cl.create_some_context(interactive=False)
	71	HAVE_OPENCL = True
	72	OPENCL_ERROR = ""
[9404dd3]	73	except Exception as exc:
[6dba2f0]	74	HAVE_OPENCL = False
[3221de0]	75	OPENCL_ERROR = str(exc)
[14de349]	76
[cb6ecf4]	77	from . import generate
[95f62aa]	78	from .generate import F32, F64
[f619de7]	79	from .kernel import KernelModel, Kernel
[14de349]	80
[2d81cfe]	81	# pylint: disable=unused-import
[a5b8477]	82	try:
	83	from typing import Tuple, Callable, Any
	84	from .modelinfo import ModelInfo
	85	from .details import CallDetails
	86	except ImportError:
	87	pass
[2d81cfe]	88	# pylint: enable=unused-import
[a5b8477]	89
# CRUFT: pyopencl < 2017.1 (as of June 2016 needs quotes around include path)
def quote_path(v):
    """
    Wrap *v* in double quotes when it contains a space and is not quoted yet.

    Empty values, values without a space, and values whose first character
    is a quote or ``-`` are returned unchanged.  A leading ``-`` is taken
    to mean a compiler option such as ``-I``.  This is fragile: an option
    of the form ``-Ipath with space`` has to be quoted by the caller.
    """
    # Nothing to do for empty/None values.
    if not v:
        return v
    # Only paths containing a space need protecting.
    if ' ' not in v:
        return v
    # Already quoted, or an option flag: leave alone.
    if v[0] in "\"'-":
        return v
    return '"' + v + '"'
	100
	101	def fix_pyopencl_include():
[40a87fa]	102	"""
	103	Monkey patch pyopencl to allow spaces in include file path.
	104	"""
[20317b3]	105	import pyopencl as cl
	106	if hasattr(cl, '_DEFAULT_INCLUDE_OPTIONS'):
	107	cl._DEFAULT_INCLUDE_OPTIONS = [quote_path(v) for v in cl._DEFAULT_INCLUDE_OPTIONS]
	108
[6dba2f0]	109	if HAVE_OPENCL:
	110	fix_pyopencl_include()
[20317b3]	111
[ce27e21]	112	# The max loops number is limited by the amount of local memory available
	113	# on the device. You don't want to make this value too big because it will
	114	# waste resources, nor too small because it may interfere with users trying
	115	# to do their polydispersity calculations. A value of 1024 should be much
	116	# larger than necessary given that cost grows as npts^k where k is the number
	117	# of polydisperse parameters.
[5d4777d]	118	MAX_LOOPS = 2048
	119
[ce27e21]	120
[5464d68]	121	# Pragmas to enable OpenCL features. Be sure to protect them so that they
	122	# still compile even if OpenCL is not present.
	123	_F16_PRAGMA = """\
	124	#if defined(__OPENCL_VERSION__) // && !defined(cl_khr_fp16)
	125	# pragma OPENCL EXTENSION cl_khr_fp16: enable
	126	#endif
	127	"""
	128
	129	_F64_PRAGMA = """\
	130	#if defined(__OPENCL_VERSION__) // && !defined(cl_khr_fp64)
	131	# pragma OPENCL EXTENSION cl_khr_fp64: enable
	132	#endif
	133	"""
	134
def use_opencl():
    """
    Return True if OpenCL loaded successfully and has not been disabled.

    OpenCL is treated as disabled when the SAS_OPENCL environment variable
    is "none" or starts with "cuda" (case-insensitive).
    """
    if not HAVE_OPENCL:
        return False
    setting = os.environ.get("SAS_OPENCL", "").lower()
    return not (setting == "none" or setting.startswith("cuda"))
[5464d68]	138
# Module-wide GPU environment; built lazily by environment().
ENV = None


def reset_environment():
    """
    Replace the module-level environment with a fresh one.

    Call this to create a new OpenCL context, such as after a change to
    SAS_OPENCL.  ``ENV`` becomes a new :class:`GpuEnvironment` when
    :func:`use_opencl` is true, and None otherwise.
    """
    global ENV
    if use_opencl():
        ENV = GpuEnvironment()
    else:
        ENV = None
	146
def environment():
    # type: () -> "GpuEnvironment"
    """
    Return the singleton :class:`GpuEnvironment`, creating it on first use.

    Raises RuntimeError if pyopencl failed to load at import time, or if
    :func:`reset_environment` leaves the environment unset.
    """
    # Fast path: the environment already exists.
    if ENV is not None:
        return ENV

    if not HAVE_OPENCL:
        message = ("OpenCL startup failed with ***"
                   + OPENCL_ERROR + "***; using C compiler instead")
        raise RuntimeError(message)

    reset_environment()
    if ENV is None:
        raise RuntimeError("SAS_OPENCL=None in environment")
    return ENV
	162
def has_type(device, dtype):
    # type: (cl.Device, np.dtype) -> bool
    """
    Return True if *device* supports the requested precision *dtype*.

    Single precision is always accepted.  Double and half precision are
    accepted only when the device lists the matching OpenCL extension.
    Any other dtype is rejected.
    """
    if dtype == F32:
        return True
    # Optional precisions, each gated on a device extension.
    optional = (
        (generate.F64, "cl_khr_fp64"),
        (generate.F16, "cl_khr_fp16"),
    )
    for precision, extension in optional:
        if dtype == precision:
            return extension in device.extensions
    return False
[14de349]	176
def get_warp(kernel, queue):
    # type: (cl.Kernel, cl.CommandQueue) -> int
    """
    Return the size of an execution batch for *kernel* running on *queue*.

    This is the kernel's PREFERRED_WORK_GROUP_SIZE_MULTIPLE as reported
    for the device attached to the queue.
    """
    info_key = cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE
    target_device = queue.device
    return kernel.get_work_group_info(info_key, target_device)
[14de349]	185
def compile_model(context, source, dtype, fast=False):
    # type: (cl.Context, str, np.dtype, bool) -> cl.Program
    """
    Build a model to run on the gpu.

    *source* is converted to precision *dtype* before compiling.  When
    *fast* is true the program is built with the fast-but-inaccurate
    options pyopencl suggests for the first device in *context*.

    Returns the compiled program.

    Raises RuntimeError if the desired precision is not available on
    every device in the context.
    """
    dtype = np.dtype(dtype)
    if any(not has_type(device, dtype) for device in context.devices):
        raise RuntimeError("%s not supported for devices"%dtype)

    first_device = context.devices[0]

    # Assemble the pieces in their final order: defines, pragma, model code.
    pieces = []
    # Note: USE_SINCOS makes the intel cpu slower under opencl
    if first_device.type == cl.device_type.GPU:
        pieces.append("#define USE_SINCOS\n")
    if dtype == generate.F16:
        pieces.append(_F16_PRAGMA)
    elif dtype == generate.F64:
        pieces.append(_F64_PRAGMA)
    pieces.append(generate.convert_type(source, dtype))

    if fast:
        options = get_fast_inaccurate_build_options(first_device)
    else:
        options = []

    full_source = "\n".join(pieces)
    return cl.Program(context, full_source).build(options=options)
[14de349]	215
	216
	217	# for now, this returns one device in the context
	218	# TODO: create a context that contains all devices on all platforms
	219	class GpuEnvironment(object):
	220	"""
	221	GPU context, with possibly many devices, and one queue per device.
[95f62aa]	222
	223	Because the environment can be reset during a live program (e.g., if the
	224	user changes the active GPU device in the GUI), everything associated
	225	with the device context must be cached in the environment and recreated
	226	if the environment changes. The cache attribute is a simple dictionary
	227	which holds keys and references to objects, such as compiled kernels and
	228	allocated buffers. The running program should check in the cache for
	229	long lived objects and create them if they are not there. The program
	230	should not hold onto cached objects, but instead only keep them active
	231	for the duration of a function call. When the environment is destroyed
	232	then the release method for each active cache item is called before
	233	the environment is freed. This means that each cl buffer should be
	234	in its own cache entry.
[14de349]	235	"""
	236	def __init__(self):
[dd7fc12]	237	# type: () -> None
[250fa25]	238	# find gpu context
[95f62aa]	239	context_list = _create_some_context()
	240
	241	# Find a context for F32 and for F64 (maybe the same one).
	242	# F16 isn't good enough.
	243	self.context = {}
	244	for dtype in (F32, F64):
	245	for context in context_list:
	246	if has_type(context.devices[0], dtype):
	247	self.context[dtype] = context
	248	break
	249	else:
	250	self.context[dtype] = None
	251
	252	# Build a queue for each context
	253	self.queue = {}
	254	context = self.context[F32]
	255	self.queue[F32] = cl.CommandQueue(context, context.devices[0])
	256	if self.context[F64] == self.context[F32]:
	257	self.queue[F64] = self.queue[F32]
	258	else:
	259	context = self.context[F64]
	260	self.queue[F64] = cl.CommandQueue(context, context.devices[0])
[250fa25]	261
[f5b9a6b]	262	# Byte boundary for data alignment
[95f62aa]	263	#self.data_boundary = max(context.devices[0].min_data_type_align_size
	264	# for context in self.context.values())
	265
	266	# Cache for compiled programs, and for items in context
[ce27e21]	267	self.compiled = {}
[95f62aa]	268	self.cache = {}
[ce27e21]	269
[5d316e9]	270	def has_type(self, dtype):
[dd7fc12]	271	# type: (np.dtype) -> bool
[eafc9fa]	272	"""
	273	Return True if all devices support a given type.
	274	"""
[95f62aa]	275	return self.context.get(dtype, None) is not None
[250fa25]	276
[300a2f7]	277	def compile_program(self, name, source, dtype, fast, timestamp):
	278	# type: (str, str, np.dtype, bool, float) -> cl.Program
[eafc9fa]	279	"""
	280	Compile the program for the device in the given context.
	281	"""
[300a2f7]	282	# Note: PyOpenCL caches based on md5 hash of source, options and device
	283	# so we don't really need to cache things for ourselves. I'll do so
	284	# anyway just to save some data munging time.
[7fcdc9f]	285	tag = generate.tag_source(source)
	286	key = "%s-%s-%s%s"%(name, dtype, tag, ("-fast" if fast else ""))
[300a2f7]	287	# Check timestamp on program
	288	program, program_timestamp = self.compiled.get(key, (None, np.inf))
	289	if program_timestamp < timestamp:
	290	del self.compiled[key]
[cde11f0f]	291	if key not in self.compiled:
[95f62aa]	292	context = self.context[dtype]
[20317b3]	293	logging.info("building %s for OpenCL %s", key,
	294	context.devices[0].name.strip())
[95f62aa]	295	program = compile_model(self.context[dtype],
[fec69dd]	296	str(source), dtype, fast)
[300a2f7]	297	self.compiled[key] = (program, timestamp)
	298	return program
[14de349]	299
[95f62aa]	300	def free_buffer(self, key):
	301	if key in self.cache:
	302	self.cache[key].release()
	303	del self.cache[key]
	304
	305	def __del__(self):
	306	for v in self.cache.values():
	307	release = getattr(v, 'release', lambda: None)
	308	release()
	309	self.cache = {}
	310
	311	_CURRENT_ID = 0
	312	def unique_id():
	313	global _CURRENT_ID
	314	_CURRENT_ID += 1
	315	return _CURRENT_ID
	316
	317	def _create_some_context():
	318	# type: () -> cl.Context
	319	"""
	320	Protected call to cl.create_some_context without interactivity.
	321
	322	Uses SAS_OPENCL or PYOPENCL_CTX if they are set in the environment,
	323	otherwise scans for the most appropriate device using
[d5ce7fa]	324	:func:`_get_default_context`. Ignore SAS_OPENCL=OpenCL, which
	325	indicates that an OpenCL device should be used without specifying
	326	which one (and not a CUDA device, or no GPU).
[95f62aa]	327	"""
[d5ce7fa]	328	dev_str = os.environ["SAS_OPENCL"]
	329	if dev_str and dev_str.lower() != "opencl":
[95f62aa]	330	#Setting PYOPENCL_CTX as a SAS_OPENCL to create cl context
[d5ce7fa]	331	os.environ["PYOPENCL_CTX"] = dev_str
[95f62aa]	332
	333	if 'PYOPENCL_CTX' in os.environ:
	334	try:
	335	return [cl.create_some_context(interactive=False)]
	336	except Exception as exc:
	337	warnings.warn(str(exc))
	338	warnings.warn("pyopencl.create_some_context() failed")
	339	warnings.warn("the environment variable 'SAS_OPENCL' or 'PYOPENCL_CTX' might not be set correctly")
	340
	341	return _get_default_context()
	342
def _get_default_context():
    # type: () -> List[cl.Context]
    """
    Get OpenCL contexts for the default devices, GPU first and then CPU.

    Intel and Apple platforms are preferred for the CPU device.  AMD and
    NVIDIA platforms are preferred for the GPU device.  One context is
    returned per selected device.
    """
    # Note: on mobile devices there is automatic clock scaling if either the
    # CPU or the GPU is underutilized; it probably doesn't affect us, but if
    # it did, it would mean that putting a busy loop on the CPU while the GPU
    # is running may increase throughput.
    #
    # Example platform listings:
    #     Macbook pro, base install:
    #         {'Apple': [Intel CPU, NVIDIA GPU]}
    #     Macbook pro, base install:
    #         {'Apple': [Intel CPU, Intel GPU]}
    #     2 x nvidia 295 with Intel and NVIDIA opencl drivers installed
    #         {'Intel': [CPU], 'NVIDIA': [GPU, GPU, GPU, GPU]}
    best_gpu = None
    best_cpu = None
    for platform in cl.get_platforms():
        vendor = platform.vendor
        # AMD provides a much weaker CPU driver than Intel/Apple, so avoid it.
        cpu_vendor_ok = vendor.startswith(('Intel', 'Apple'))
        # If someone has bothered to install the AMD/NVIDIA drivers, prefer
        # them over the integrated graphics driver that may have been
        # supplied with the CPU chipset.
        gpu_vendor_ok = vendor.startswith(('Advanced', 'NVIDIA'))

        for device in platform.get_devices():
            kind = device.type
            if kind == cl.device_type.GPU:
                # A current choice that is not a GPU must be CUSTOM or
                # ACCELERATOR, so never displace it with a plain GPU.
                if best_gpu is None:
                    best_gpu = device
                elif gpu_vendor_ok and best_gpu.type == cl.device_type.GPU:
                    best_gpu = device
            elif kind == cl.device_type.CPU:
                if best_cpu is None or cpu_vendor_ok:
                    best_cpu = device
            else:
                # System has cl.device_type.ACCELERATOR or
                # cl.device_type.CUSTOM (Intel Phi, for example, registers
                # as an accelerator).  Since the user went through the pain
                # of sorting out OpenCL drivers for it, assume they really
                # do want to use it as their primary compute device.
                best_gpu = device

    # Order the devices by gpu then by cpu; when searching for an available
    # device by data type they will be checked in this order, which means
    # that if the gpu supports double then the cpu will never be used (though
    # we may make it possible to explicitly request the cpu at some point).
    return [cl.Context([device])
            for device in (best_gpu, best_cpu)
            if device is not None]
[3c56da87]	398
[250fa25]	399
class GpuModel(KernelModel):
    """
    GPU wrapper for a single model.

    *source* and *model_info* are the model source and interface as returned
    from :func:`generate.make_source` and :func:`generate.make_model_info`.

    *dtype* is the desired model precision.  Double precision is an
    optional extension which may not be available on all devices.  Half
    precision may be available on some devices.

    *fast* requests a loose version of the chosen precision, indicating
    that the compiler is allowed to take shortcuts.

    Pickling stores only *info*, *source*, *dtype* and *fast*.  The source
    timestamp and the environment cache key are rebuilt on unpickling.
    """
    def __init__(self, source, model_info, dtype=generate.F32, fast=False):
        # type: (Dict[str,str], ModelInfo, np.dtype, bool) -> None
        self.info = model_info
        self.source = source
        self.dtype = dtype
        self.fast = fast
        self.timestamp = generate.ocl_timestamp(self.info)
        # Key under which the compiled kernels live in the environment cache.
        self._cache_key = unique_id()

    def __getstate__(self):
        # type: () -> Tuple[ModelInfo, str, np.dtype, bool]
        return self.info, self.source, self.dtype, self.fast

    def __setstate__(self, state):
        # type: (Tuple[ModelInfo, str, np.dtype, bool]) -> None
        self.info, self.source, self.dtype, self.fast = state
        # These are not part of the pickled state, but fetch_kernel needs
        # them, so rebuild them exactly as __init__ does.
        self.timestamp = generate.ocl_timestamp(self.info)
        self._cache_key = unique_id()

    def make_kernel(self, q_vectors):
        # type: (List[np.ndarray]) -> "GpuKernel"
        """
        Return a :class:`GpuKernel` for evaluating the model at *q_vectors*.
        """
        return GpuKernel(self, q_vectors)

    @property
    def Iq(self):
        # type: () -> cl.Kernel
        """The compiled 'Iq' kernel for the current environment."""
        return self.fetch_kernel('Iq')

    def fetch_kernel(self, name):
        # type: (str) -> cl.Kernel
        """
        Fetch the kernel from the environment by name, compiling it if it
        does not already exist.

        *name* is one of 'Iq', 'Iqxy' or 'Imagnetic'.
        """
        gpu = environment()
        key = self._cache_key
        if key not in gpu.cache:
            program = gpu.compile_program(
                self.info.name,
                self.source['opencl'],
                self.dtype,
                self.fast,
                self.timestamp)
            variants = ['Iq', 'Iqxy', 'Imagnetic']
            names = [generate.kernel_name(self.info, k) for k in variants]
            kernels = [getattr(program, k) for k in names]
            data = dict(zip(variants, kernels))
            # keep a handle to program so GC doesn't collect
            data['program'] = program
            gpu.cache[key] = data
        else:
            data = gpu.cache[key]
        return data[name]
[14de349]	465
	466	# TODO: check that we don't need a destructor for buffers which go out of scope
	467	class GpuInput(object):
	468	"""
	469	Make q data available to the gpu.
	470
	471	q_vectors is a list of q vectors, which will be [q] for 1-D data,
	472	and [qx, qy] for 2-D data. Internally, the vectors will be reallocated
	473	to get the best performance on OpenCL, which may involve shifting and
	474	stretching the array to better match the memory architecture. Additional
	475	points will be evaluated with q=1e-3.
	476
	477	dtype is the data type for the q vectors. The data type should be
	478	set to match that of the kernel, which is an attribute of
	479	:class:`GpuProgram`. Note that not all kernels support double
	480	precision, so even if the program was created for double precision,
	481	the GpuProgram.dtype may be single precision.
	482
	483	Call :meth:`release` when complete. Even if not called directly, the
	484	buffer will be released when the data object is freed.
	485	"""
[cb6ecf4]	486	def __init__(self, q_vectors, dtype=generate.F32):
[dd7fc12]	487	# type: (List[np.ndarray], np.dtype) -> None
[17bbadd]	488	# TODO: do we ever need double precision q?
[14de349]	489	self.nq = q_vectors[0].size
	490	self.dtype = np.dtype(dtype)
[eafc9fa]	491	self.is_2d = (len(q_vectors) == 2)
[f5b9a6b]	492	# TODO: stretch input based on get_warp()
	493	# not doing it now since warp depends on kernel, which is not known
	494	# at this point, so instead using 32, which is good on the set of
	495	# architectures tested so far.
[c072f83]	496	if self.is_2d:
[b8ddf2e]	497	# Note: 16 rather than 15 because result is 1 longer than input.
	498	width = ((self.nq+16)//16)*16
[c072f83]	499	self.q = np.empty((width, 2), dtype=dtype)
	500	self.q[:self.nq, 0] = q_vectors[0]
	501	self.q[:self.nq, 1] = q_vectors[1]
	502	else:
[b8ddf2e]	503	# Note: 32 rather than 31 because result is 1 longer than input.
	504	width = ((self.nq+32)//32)*32
[c072f83]	505	self.q = np.empty(width, dtype=dtype)
	506	self.q[:self.nq] = q_vectors[0]
	507	self.global_size = [self.q.shape[0]]
[95f62aa]	508	self._cache_key = unique_id()
	509
	510	@property
	511	def q_b(self):
	512	"""Lazy creation of q buffer so it can survive context reset"""
	513	env = environment()
	514	key = self._cache_key
	515	if key not in env.cache:
	516	context = env.context[self.dtype]
	517	#print("creating inputs of size", self.global_size)
	518	buffer = cl.Buffer(context, mf.READ_ONLY \| mf.COPY_HOST_PTR,
	519	hostbuf=self.q)
	520	env.cache[key] = buffer
	521	return env.cache[key]
[14de349]	522
	523	def release(self):
[dd7fc12]	524	# type: () -> None
[eafc9fa]	525	"""
[95f62aa]	526	Free the buffer associated with the q value
[eafc9fa]	527	"""
[95f62aa]	528	environment().free_buffer(id(self))
[14de349]	529
[eafc9fa]	530	def __del__(self):
[dd7fc12]	531	# type: () -> None
[eafc9fa]	532	self.release()
	533
class GpuKernel(Kernel):
    """
    Callable SAS kernel.

    *model* is the GpuModel object to call

    The following attributes are defined:

    *info* is the module information

    *dtype* is the kernel precision

    *dim* is '1d' or '2d'

    *result* is a vector to contain the results of the call

    The resulting call method takes the *call_details* and *values*
    describing the parameter set, the *cutoff* passed through to the
    kernel, and the *magnetic* flag, which selects the magnetic kernel
    for 2-D data.

    Call :meth:`release` when done with the kernel instance.
    """
    def __init__(self, model, q_vectors):
        # type: (GpuModel, List[np.ndarray]) -> None
        dtype = model.dtype
        self.q_input = GpuInput(q_vectors, dtype)
        self._model = model
        self._as_dtype = (np.float32 if dtype == generate.F32
                          else np.float64 if dtype == generate.F64
                          else np.float16 if dtype == generate.F16
                          else np.float32)  # will never get here, so use np.float32
        # Key under which the result buffer lives in the environment cache.
        self._cache_key = unique_id()

        # attributes accessed from the outside
        self.dim = '2d' if self.q_input.is_2d else '1d'
        self.info = model.info
        self.dtype = model.dtype

        # holding place for the returned value
        # plus one for the normalization values
        self.result = np.empty(self.q_input.nq+1, dtype)

    @property
    def _result_b(self):
        # type: () -> cl.Buffer
        """Lazy creation of result buffer so it can survive context reset"""
        env = environment()
        key = self._cache_key
        if key not in env.cache:
            context = env.context[self.dtype]
            buffer = cl.Buffer(context, mf.READ_WRITE,
                               self.q_input.global_size[0] * self.dtype.itemsize)
            env.cache[key] = buffer
        return env.cache[key]

    def __call__(self, call_details, values, cutoff, magnetic):
        # type: (CallDetails, np.ndarray, float, bool) -> np.ndarray
        """
        Evaluate the kernel and return ``scale*I(q) + background``.

        *scale* is ``values[0]`` divided by the normalization stored in the
        extra result slot (or by 1 if that is zero).  *background* is
        ``values[1]``.
        """
        env = environment()
        queue = env.queue[self._model.dtype]
        context = queue.context

        # time.clock was removed in Python 3.8; prefer perf_counter and
        # only fall back to clock on interpreters that lack it.
        clock = getattr(time, 'perf_counter', None) or time.clock

        # Arrange data transfer to/from card
        q_b = self.q_input.q_b
        result_b = self._result_b

        # Per-call buffers; released in the finally clause so they are not
        # leaked if the kernel launch or the copy raises.
        transient = []
        try:
            details_b = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR,
                                  hostbuf=call_details.buffer)
            transient.append(details_b)
            values_b = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR,
                                 hostbuf=values)
            transient.append(values_b)

            name = ('Iq' if self.dim == '1d'
                    else 'Imagnetic' if magnetic
                    else 'Iqxy')
            kernel = self._model.fetch_kernel(name)
            kernel_args = [
                np.uint32(self.q_input.nq), None, None,
                details_b, values_b, q_b, result_b,
                self._as_dtype(cutoff),
            ]

            # Call kernel and retrieve results.  The evaluation is split
            # into slices of roughly a million q-point evaluations, each
            # slice waiting on the previous one.
            wait_for = None
            last_nap = clock()
            step = 1000000//self.q_input.nq + 1
            for start in range(0, call_details.num_eval, step):
                stop = min(start + step, call_details.num_eval)
                kernel_args[1:3] = [np.int32(start), np.int32(stop)]
                wait_for = [kernel(queue, self.q_input.global_size, None,
                                   *kernel_args, wait_for=wait_for)]
                if stop < call_details.num_eval:
                    # Allow other processes to run
                    wait_for[0].wait()
                    current_time = clock()
                    if current_time - last_nap > 0.5:
                        time.sleep(0.001)
                        last_nap = current_time
            cl.enqueue_copy(queue, self.result, result_b, wait_for=wait_for)
        finally:
            # Free buffers
            for buf in transient:
                buf.release()

        pd_norm = self.result[self.q_input.nq]
        scale = values[0]/(pd_norm if pd_norm != 0.0 else 1.0)
        background = values[1]
        return scale*self.result[:self.q_input.nq] + background

    def release(self):
        # type: () -> None
        """
        Release resources associated with the kernel.
        """
        # The result buffer is stored under _cache_key (see _result_b);
        # freeing by id(self) would never match and would leak it.
        environment().free_buffer(self._cache_key)
        self.q_input.release()

    def __del__(self):
        # type: () -> None
        self.release()

Note: See TracBrowser for help on using the repository browser.

SasView

source: sasmodels/sasmodels/kernelcl.py @ d5ce7fa

Download in other formats: