OpenCL 是一个小而美的东西,比 OpenGL 要简单很多,妙不可言。( ̄▽ ̄)" OpenGL 需要图形学的知识储备,OpenCL 不太需要。
并行计算行情。
眼睛能看到的是 OpenGL(Open Graphics Library),耳朵能听到的是 OpenAL(Open Audio Library),后来发现显卡可以用于通用计算,于是乎有了 OpenCL(Open Computing Language)。 OpenCL 和 OpenGL 都能用于操作 GPU,但是前者主要用于通用计算,而后者主要用于图像渲染。
clEnqueueMapBuffer / clEnqueueMapImage 用于访问内存对象的 OpenCL 机制,而不是使用 clEnqueueRead / Write。 我们可以将设备上的内存对象映射到主机上的内存区域。一旦我们映射了对象,我们就可以随意读 / 写或修改。 Read / Write 缓冲器和 clEnqueueMapBuffer 之间的一个更差的是 map_flags 参数。 如果 map_flags 设置为 CL_MAP_READ,映射的内存将是只读的,如果它被设置为 CL_MAP_WRITE 映射的内存将只写, 如果你想同时读取 + 写入然后再进行标志 CL_MAP_READ | CL_MAP_WRITE。 与读 / 写 fns 相比,内存映射需要三步过程:
CV_EXPORTS_W void resize( InputArray src, OutputArray dst,
Size dsize, double fx = 0, double fy = 0,
int interpolation = INTER_LINEAR );
__kernel void resizeLN
目前 hal 层的 cv_hal_resize 默认并没有实现
CV_TRY_AVX2
和 CV_TRY_SSE4_1
CV_CPU_SSE4_1
CV_CPU_SSE4_2
CV_CPU_FP16
CV_CPU_AVX
OpenCV 关于 GPU 的具体优化实现基本都是用的 OpenCL。 高性能计算方面:多线程,cuda/opencl, simd。
// 获取当前使用的 ParallelForBackend,omp 或者 tbb
CV_EXPORTS void parallel_for_(const Range& range, const ParallelLoopBody& body, double nstripes=-1.);
OpenCL 图片对象,支持 RGBA,就是不支持 R8G8B8,因为这个不是 4 字节对齐的,对性能不利,所以都没有支持!额……
cl_image_format getImageFormat(int depth, int cn, bool norm) {
cl_image_format format;
static const int channelTypes[] = {
CL_UNSIGNED_INT8, CL_SIGNED_INT8, CL_UNSIGNED_INT16, //
CL_SIGNED_INT16, CL_SIGNED_INT32, CL_FLOAT, //
-1, CL_HALF_FLOAT};
static const int channelTypesNorm[] = {
CL_UNORM_INT8, CL_SNORM_INT8, CL_UNORM_INT16, //
CL_SNORM_INT16, -1, -1, //
-1, -1};
// CL_RGB has no mappings to OpenCV types because CL_RGB can only be used with
// CL_UNORM_SHORT_565, CL_UNORM_SHORT_555, or CL_UNORM_INT_101010.
static const int channelOrders[] = {-1, CL_R, CL_RG, /*CL_RGB*/ -1, CL_RGBA};
int channelType = norm ? channelTypesNorm[depth] : channelTypes[depth];
int channelOrder = channelOrders[cn];
format.image_channel_data_type = (cl_channel_type)channelType;
format.image_channel_order = (cl_channel_order)channelOrder;
return format;
}
bool isFormatSupported(cl_image_format format) {
// Figure out how many formats are supported by this context.
cl_uint numFormats = 0;
cl_int err = clGetSupportedImageFormats(m_context, CL_MEM_READ_WRITE, CL_MEM_OBJECT_IMAGE2D, //
numFormats, NULL, &numFormats);
if (err != CL_SUCCESS) {
return false;
}
if (numFormats > 0) {
cl_image_format* formats = new cl_image_format[numFormats];
err = clGetSupportedImageFormats(m_context, CL_MEM_READ_WRITE, CL_MEM_OBJECT_IMAGE2D, //
numFormats, formats, NULL);
if (err != CL_SUCCESS) {
delete[] formats;
return false;
}
for (cl_uint i = 0; i < numFormats; ++i) {
if (!memcmp(&formats[i], &format, sizeof(format))) {
delete[] formats;
return true;
}
}
delete[] formats;
}
return false;
}
https://0cch.com/2022/03/08/oneapi-summary/
Vtune ,这个工具就厉害了,我想做过性能优化的朋友肯定是用过的。这个工具在做性能优化方面并不局限于异构程序,其实很早之前我就接触过它了。 它可以对程序性能的缺陷做非常系统的分析,包括 IO,线程、内存、指令集的使用等等, 分析的粒度可以从指令到代码行再到函数块,支持的架构从 CPU、GPU 到 FPGA。
// opencl_kernel_set.buildopts.txt
-D dstT=uchar3 -D rowsPerWI=4 -D dstST=uchar4 -D dstT1=uchar -D cn=3
CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat() && _src.cols() > 10 && _src.rows() > 10,
ocl_resize(_src, _dst, dsize, inv_scale_x, inv_scale_y, interpolation))
// opencl_kernel_resizeLN.buildopts.txt
__kernel void resizeLN(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
float ifx, float ify);
// INTER_LINEAR resizeLN 15000,0,2812,5000/12000,0,2249,4000/1.25,1.25
// INTER_LINEAR resizeLN 300,0,100,100/240,0,80,80/1.25,1.25
// -D INTER_LINEAR -D depth=0 -D T=uchar3 -D T1=uchar -D WT=int3 -D convertToWT=convert_int3 -D convertToDT=convert_uchar3_sat -D cn=3 -D INTER_RESIZE_COEF_BITS=11
// -D INTER_LINEAR -D depth=0 -D T=uchar3 -D T1=uchar -D WT=int3 -D convertToWT=convert_int3 -D convertToDT=convert_uchar3_sat -D cn=3 -D INTER_RESIZE_COEF_BITS=11
static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize,
double fx, double fy, int interpolation)
{
int wdepth = std::max(depth, CV_32S), wtype = CV_MAKETYPE(wdepth, cn);
k.create("resizeLN", ocl::imgproc::resize_oclsrc,
format("-D INTER_LINEAR -D depth=%d -D T=%s -D T1=%s "
"-D WT=%s -D convertToWT=%s -D convertToDT=%s -D cn=%d "
"-D INTER_RESIZE_COEF_BITS=%d",
depth, ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
ocl::convertTypeStr(depth, wdepth, cn, buf[0]),
ocl::convertTypeStr(wdepth, depth, cn, buf[1]),
cn, INTER_RESIZE_COEF_BITS));
if (k.empty())
return false;
k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst),
(float)inv_fx, (float)inv_fy);
}
缓存:
C:\Users\ADMIN\AppData\Local\Temp\opencv\4.5\opencl_cache
32-bit–Intel_R__Corporation–Intel_R__HD_Graphics_630–27_20_100_9168
imgproc–resize_292f623900ce0dbfd5b5eb23a3c7c5bd.bin
/* Detect which version to target */
#if !defined(CL_TARGET_OPENCL_VERSION)
#pragma message("cl_version.h: CL_TARGET_OPENCL_VERSION is not defined. Defaulting to 300 (OpenCL 3.0)")
#define CL_TARGET_OPENCL_VERSION 300
#endif
/* cl_device_type - bitfield */
#define CL_DEVICE_TYPE_DEFAULT (1 << 0)
#define CL_DEVICE_TYPE_CPU (1 << 1)
#define CL_DEVICE_TYPE_GPU (1 << 2)
#define CL_DEVICE_TYPE_ACCELERATOR (1 << 3)
#ifdef CL_VERSION_1_2
#define CL_DEVICE_TYPE_CUSTOM (1 << 4)
#endif
#define CL_DEVICE_TYPE_ALL 0xFFFFFFFF
void test() {
const cl_uint num_entries = 100;
const size_t param_value_size = 100;
cl_platform_id platforms[num_entries];
cl_uint num_platforms = 0;
clGetPlatformIDs(num_entries, platforms, &num_platforms);
for (int i = 0; i < num_platforms; i++) {
cl_platform_id platform = platforms[i];
cl_platform_info param_name = CL_PLATFORM_NAME;
char param_valuep[param_value_size];
size_t param_value_size_ret = 0;
clGetPlatformInfo(platform,
param_name,
param_value_size,
param_valuep,
¶m_value_size_ret);
cl_device_type device_type = CL_DEVICE_TYPE_ALL;
cl_device_id devices[num_entries];
cl_uint num_devices = 0;
clGetDeviceIDs(platform,
device_type,
num_entries,
devices,
&num_devices);
for (int j = 0; j < num_devices; j++) {
cl_device_id device = devices[j];
cl_device_info param_name = CL_DEVICE_NAME;
char param_valued[param_value_size];
size_t param_value_size_ret = 0;
clGetDeviceInfo(device,
param_name,
param_value_size,
param_valued,
¶m_value_size_ret);
param_value_size_ret = 0; //
}
}
}
Intel(R) OpenCL HD Graphics Intel(R) HD Graphics 630
OpenCL 将设备中的内部存储器抽象成四层结构的存储模型:
下表描述了宿主机和设备对内存的的分配和访问规则。
在运行 OpenCL 应用时,宿主机需要将待处理的数据送到 OpenCL 设备,OpenCL 设备运算完成后需要把结构返回给宿主机,这就需要在宿主机与 OpenCL 设备之间进行数据交互,这种交互有两种方式:拷贝数据法和内存映射法。 OpenCL 规定了一个松散的内存模型,它不保证所有的工作节点访问的内存状态是一致的,它只规定在一个工作节点内部访问内存必须是一致的;在工作组内,可以通过同步点来保证组内节点的内存访问一致性。在不同工作组的访问内存一致性上,OpenCL 不提供任何保证。
OpenCL 和 nVidia CUDA 很像。
CUDA | OpenCL |
---|---|
Thread | Work-item |
Thread block | Work-group |
global memory | global memory |
constant memory | constant memory |
shared memory | local memory |
local memory | private memory |
Grid size | Global range |
Block size | Local range |
__global__ | kernel |
gridDim.x | get_num_groups(0) |
blockDim.x | get_local_size(0) |
blockIdx.x | get_group_id(0) |
threadIds.x | get_local_id(0) |
__syncthreads | barrier() |
warp | no equivalent |
#include <cl/cl.hpp>
#include <vector>
using namespace cl;
using namespace std;
int main(int, char**)
{
Platform platform = Platform::getDefault();
vector<Device> devices;
platform.getDevices(CL_DEVICE_TYPE_ALL, &devices);
Context context(devices[0]);
CommandQueue queue(context, devices[0]);
Program program(context, "OpenCL C code goes here...");
program.build();
auto kernel = make_kernel<Buffer, Buffer>(program, "example_kernel");
const static int Size = 1000000;
vector<int> inputData(Size, 0), outputData(Size, 0);
Buffer inputBuffer(context, CL_MEM_READ_ONLY, Size * sizeof(int));
Buffer outputBuffer(context, CL_MEM_WRITE_ONLY, Size * sizeof(int));
// 发送数据
queue.enqueueWriteBuffer(inputBuffer, false, 0, Size * sizeof(int), inputData.data());
// 运行 kernel
kernel(EnqueueArgs(queue, NDRange(Size)), inputBuffer, outputBuffer);
// 拖回数据
queue.enqueueReadBuffer(outputBuffer, false, 0, Size * sizeof(int), outputData.data());
queue.finish();
return 0;
}
kernel void example_kernel(global int * input, global int * output)
{
int worker_id = get_global_id(0);
output[worker_id] = input[worker_id] + 10 + worker_id;
}
#define MACRO(a, b) a + b
bool function(int a)
{
float4 vector_type(0, 1, 2, 3); // 有的支持 SIMD
vector_type *= 2;
float v = vector_type.x; // 和 OpenGL 有点类似
float2 v2; float8 v8; float16 v16;
uchar uc; uint ui;
local bool local_buffer[256]; // local 内存不能初始化。
int lid = get_local_id(0);
if (lid < 256)
local_buffer[lid] = (uc8.S1 == uc);
barrier(CLK_LOCAL_MEM_FENCE);
if (lid < 256 && lid > 1)
return local_buffer[lid - 1];
return false;
}
实现图片模糊,对比性能。
E:\kpdf\fastimage\fastapp-turbo\gpuip\build\test\Release\test_performance.exe E:\kpdf\fastimage\fastapp-turbo\gpuip\examples\images\bridge.exr E:\kpdf\fastimage\fastapp-turbo\gpuip\examples\kernels\
---------------------------------------------------------------
| LERP |
---------------------------------------------------------------
CPU: 33.6 ms.
CPU MT: 216.8 ms.
OpenCL: 4.5 ms, Process 0.8 ms (16.8%), Copy 3.7 ms (83.2%)
GLSL: 23.8 ms, Process 0.7 ms (2.8%), Copy 23.1 ms (97.2%)
---------------------------------------------------------------
| BOX BLUR |
---------------------------------------------------------------
CPU: 3491.4 ms.
CPU MT: 983.1 ms.
OpenCL: 5.7 ms, Process 4.9 ms (85.5%), Copy 0.8 ms (14.5%)
GLSL: 5.1 ms, Process -1.0 ms (-19.5%), Copy 6.1 ms (119.5%)
---------------------------------------------------------------
| GAUSSIAN BLUR |
---------------------------------------------------------------
CPU: 4601.4 ms.
CPU MT: 178.7 ms.
OpenCL: 9.2 ms, Process 8.5 ms (91.5%), Copy 0.8 ms (8.5%)
GLSL: 11.4 ms, Process -1.0 ms (-8.7%), Copy 12.4 ms (108.7%)
---------------------------------------------------------------
| SEPARABLE GAUSSIAN BLUR |
---------------------------------------------------------------
CPU: 838.0 ms.
CPU MT: 624.9 ms.
OpenCL: 1.8 ms, Process 1.1 ms (59.4%), Copy 0.7 ms (40.6%)
GLSL: 1.7 ms, Process -1.0 ms (-57.5%), Copy 2.7 ms (157.5%)
简单的事情 CPU MT 比 CPU 更慢; GLSL 传输比 OpenCL 更慢。(也有可能是程序没写好 [惭愧]-_-)
普通指令集 vs 增强指令集
CPU | 20.0 ms. | 9.4 ms. |
CPU MT | 946.8 ms. | 660.6 ms. |
OpenCL | 6.5 ms, Process 0.8 ms (12.0%), Copy 5.7 ms (88.0%) | 2.2 ms, Process 0.7 ms (31.1%), Copy 1.5 ms (68.9%) |
CPU | 1207.2 ms. | 1341.0 ms. |
CPU MT | 619.2 ms. | 872.4 ms. |
OpenCL | 5.9 ms, Process 5.0 ms (85.3%), Copy 0.9 ms (14.7%) | 5.6 ms, Process 4.8 ms (87.2%), Copy 0.7 ms (12.8%) |
CPU | 1983.4 ms. | 3481.0 ms. |
CPU MT | 96.4 ms. | 233.0 ms. |
OpenCL | 9.4 ms, Process 8.6 ms (91.0%), Copy 0.8 ms (9.0%) | 9.4 ms, Process 8.6 ms (90.9%), Copy 0.9 ms (9.1%) |
CPU | 385.2 ms. | 813.4 ms. |
CPU MT | 667.9 ms. | 773.2 ms. |
OpenCL | 1.9 ms, Process 1.1 ms (59.2%), Copy 0.8 ms (40.8%) | 1.9 ms, Process 1.1 ms (56.7%), Copy 0.8 ms (43.3%) |
AMD 的丰富的示例 Samples from the AMD APP SDK (with OpenCRun support) AMD APP Samples
__constant sampler_t imageSampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
/* Copy input 2D image to output 2D image */
__kernel void image2dCopy(__read_only image2d_t input, __write_only image2d_t output)
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
uint4 temp = read_imageui(input, imageSampler, coord);
write_imageui(output, coord, temp);
}
/* Copy input 3D image to 2D image */
__kernel void image3dCopy(__read_only image3d_t input, __write_only image2d_t output)
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
/* Read first slice into lower half */
uint4 temp0 = read_imageui(input, imageSampler, (int4)(coord, 0, 0));
/* Read second slice into upper half */
uint4 temp1 = read_imageui(input, imageSampler, (int4)((int2)(get_global_id(0), get_global_id(1) - get_global_size(1)/2), 1, 0));
write_imageui(output, coord, temp0 + temp1);
}
Simple kernel to modify vertex positions in sine wave pattern. param data data in global memory
__kernel
void sineWave(
__global float4 * pos,
unsigned int width,
unsigned int height,
float time)
{
unsigned int x = get_global_id(0);
unsigned int y = get_global_id(1);
// calculate uv coordinates
float u = x / (float) width;
float v = y / (float) height;
u = u*2.0f - 1.0f;
v = v*2.0f - 1.0f;
// calculate simple sine wave pattern
float freq = 4.0f;
float w = sin(u*freq + time) * cos(v*freq + time) * 0.5f;
// write output vertex
pos[y*width+x] = (float4)(u, w, v, 1.0f);
}
#define GROUP_SIZE 64
__constant int mask[] =
{
1, -1, 2, -2
};
__kernel void MemoryModel(__global int *outputbuffer, __global int *inputbuffer)
{
__local int localBuffer[GROUP_SIZE];
__private int result=0;
__private size_t group_id=get_group_id(0);
__private size_t item_id=get_local_id(0);
__private size_t gid = get_global_id(0);
// Each workitem within a work group initialize one element of the local buffer
localBuffer[item_id]=inputbuffer[gid];
// Synchronize the local memory
barrier(CLK_LOCAL_MEM_FENCE);
// add 4 elements from the local buffer
// and store the result into a private variable
for (int i = 0; i < 4; i++) {
result += localBuffer[(item_id+i)%GROUP_SIZE];
}
// multiply the partial result with a value from the constant memory
result *= mask[group_id%4];
// store the result into a buffer
outputbuffer[gid]= result;
}
FluidSimulation2D 搞 豹趣魔屏 的时候接触过,再次见到;作者说是 基于 OpenMP 版本改的。
developer.amd.com/wordpress/media/2012/10/OpenCLTutorial-Chinese.pdf
读写传输命令、内存映射命令 RGB -> RGBA,就需要 clEnqueueCopyBufferToImage/clEnqueueCopyImageToBuffer:
操作对象数据
// 定义采样器
// CLK_NORMALIZED_COORDS_TRUE 指定使用归一化坐标
// CLK_ADDRESS_CLAMP 指定超出图像范围的颜色为黑色
// CLK_FILTER_LINEAR 指定使用双线性插值
__constant sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_LINEAR;
__kernel void image_scaling(__read_only image2d_t sourceImage,
__write_only image2d_t destinationImage,
const float widthNormalizationFactor,
const float heightNormalizationFactor)
{
// 从 glob_id 中获取目标像素坐标
int2 coordinate = (int2)(get_global_id(0), get_global_id(1));
// 计算归一化浮点坐标
float2 normalizedCoordinate = convert_float2(coordinate) * (float2)(widthNormalizationFactor, heightNormalizationFactor);
// 根据归一化坐标从原图中读取像素数据
float4 colour = read_imagef(sourceImage, sampler, normalizedCoordinate);
// 将像素数据写入目标图像
write_imagef(destinationImage, coordinate, colour);
}
CL_MEM_USE_HOST_PTR Vs CL_MEM_COPY_HOST_PTR Vs CL_MEM_ALLOC_HOST_PTR
使用 map 代替 copy 假设 OpenCL 应用程序对数据流有完全的控制,即目标和源内存对象的创建都由 OpenCL 应用程序管理。这是最简单的情况,可以通过以下步骤避免内存复制: CL_MEM_ALLOC_HOST_PTR 是避免在这种情况下复制数据的唯一方法。 对于其他标志,如 CL_MEM_USE_HOST_PTR 或 CL_MEM_COPY_HOST_PTR,驱动程序必须做额外的内存复制以便 GPU 访问。
OpenCL memory object 之 传输优化 如果 device 及操作系统支持 zero copy,则下面 buffer 类型可以使用:
注意:创建 buffer 的大小是平台 dependience 的,比如在某个平台上一个 buffer 不能超过 64M,总的 buffer 不能超过 128M 等。
CL_MEM_USE_PERSISTENT_MEM_AMD cl_khr_egl_image
这个文档讲的最清楚。 Arm Mali Bifrost OpenCL Developer Guide Version 1.0 About memory allocation
host_ptr
argument into memory allocated by the driver.Arm recommends the following:
Use CL_MEM_ALLOC_HOST_PTR to avoid copying memory
Do not create buffers with CL_MEM_USE_HOST_PTR if possible
Do not allocate memory buffers created with malloc() for OpenCL applications
You must allocate the memory in OpenCL with CL_MEM_ALLOC_HOST_PTR because it ensures that the memory pages are always mapped into physical memory.
自己测试结果。
集成显卡的情况:
集成显卡,上传 alloc,临时 alloc,下载 usehost:
create 0.0024 ms. upload 0.0088 ms. scale= 0.0577 ms. download 0.0001 ms.
全部 alloc 的情况:
create 0.0047 ms. upload 0.0085 ms. scale= 0.0576 ms. download 0.0078 ms.
// Debug 1.2
create 0 ms. upload 13 ms. scale 0 ms. download 73 ms.
create 2 ms. upload 12 ms. scale 0 ms. download 27 ms.
create 2 ms. upload 9 ms. scale 0 ms. download 22 ms.
create 1.3333 ms. upload 11.3333 ms. scale 0.0000 ms. download 40.6667 ms.
// Debug 3.0
create 1 ms. upload 16 ms. scale 13 ms. download 7 ms.
create 2 ms. upload 9 ms. scale 11 ms. download 4 ms.
create 1 ms. upload 10 ms. scale 10 ms. download 4 ms.
create 1.3333 ms. upload 11.6667 ms. scale 11.3333 ms. download 5.0000 ms.
// Debug 2.0
create 0 ms. upload 13 ms. scale 12 ms. download 5 ms.
create 2 ms. upload 10 ms. scale 10 ms. download 4 ms.
create 1 ms. upload 10 ms. scale 10 ms. download 4 ms.
create 1.0000 ms. upload 11.0000 ms. scale 10.6667 ms. download 4.3333 ms.
This sample intends to demonstrate how to use separate compilation and consumption of OpenCL program by saving and loading program binaries.
该示例旨在通过 保存 和 加载 程序二进制文件来演示如何使用 OpenCL 程序的单独 编译 和 使用 。
This sample intends to demonstrate how to use different techniques of data exchange between workitems in workgroup, query various extensions applicable and use compile options to touch up kernel sources at runtime to produce the best kernel implementation for the task.
该示例旨在演示如何在工作组中的工作项之间使用 不同的数据交换技术 , 查询各种适用的扩展并使用编译选项在 运行时修改内核源代码 ,从而为任务生成最佳内核实现。
In this very simple sample, OpenCL APIs are used to copy the contents of one buffer to another buffer on the OpenCL device. To do this, OpenCL APIs are used to create both buffers, to create the OpenCL command queue, and to initialize the source buffer and verify the contents of the destination buffer on the host. By default, this sample will run in the first enumerated OpenCL device on the first enumerated OpenCL platform. To run on a different OpenCL device or platform, please use the provided command line options.
在这个非常简单的示例中,OpenCL API 用于将一个缓冲区的 内容复制 到 OpenCL 设备上的另一个缓冲区。 为此,OpenCL API 用于创建两个缓冲区、创建 OpenCL 命令队列、初始化源缓冲区并验证主机上目标缓冲区的内容。 默认情况下,此示例将在第一个枚举 OpenCL 平台上的第一个枚举 OpenCL 设备中运行。 要在不同的 OpenCL 设备或平台上运行,请使用提供的命令行选项。
This example uses an OpenCL kernel to do work. An OpenCL kernel is a short program defining what one OpenCL work item should do. In this case, each OpenCL work item will copy one value from a source buffer to a destination buffer. Since this sample launches one work item for every element in the source buffer, behaviorally this sample will do exactly the same thing as the copy buffer sample. In this sample, the source code for the OpenCL kernel is embedded into the host code as a raw string. At runtime, an OpenCL program is created from the raw string, and the OpenCL device compiler is invoked to compile the OpenCL program for the OpenCL device. This isn't the only way to create OpenCL programs, but it is fairly common, especially while learning and developing an OpenCL application. By default, this sample will run in the first enumerated OpenCL device on the first enumerated OpenCL platform. To run on a different OpenCL device or platform, please use the provided command line options.
此示例使用 OpenCL 内核来完成工作。 OpenCL 内核是一个简短的程序,它定义了一个 OpenCL 工作项应该做什么。 在这种情况下,每个 OpenCL 工作项会将一个值从源缓冲区复制到目标缓冲区。 由于此示例为源缓冲区中的每个元素启动一个工作项,因此从行为上讲,此示例将执行与复制缓冲区示例完全相同的操作。 在此示例中,OpenCL 内核的源代码作为原始字符串嵌入到主机代码中。 在运行时,从原始字符串创建 OpenCL 程序,并调用 OpenCL 设备编译器为 OpenCL 设备编译 OpenCL 程序。 这不是创建 OpenCL 程序的唯一方法,但它相当普遍,尤其是在学习和开发 OpenCL 应用程序时。 默认情况下,此示例将在第一个枚举 OpenCL 平台上的第一个枚举 OpenCL 设备中运行。 要在不同的 OpenCL 设备或平台上运行,请使用提供的命令行选项。
This is a very simple sample that demonstrates how to enumerate the OpenCL platforms that are installed on a machine, and the OpenCL devices that these platforms expose. This is one of the few samples that uses the OpenCL C APIs, as described in the OpenCL specification. Most of the other samples use the OpenCL C++ API bindings, since they make it a lot easier to write and understand OpenCL code! This is a good first sample to run to verify that OpenCL is correctly installed on your machine, and that your build environment is correctly setup.
这是一个非常简单的示例,演示了如何枚举安装在机器上的 OpenCL 平台,以及这些平台公开的 OpenCL 设备。 这是使用 OpenCL C API 的少数示例之一,如 OpenCL 规范中所述。 大多数其他示例都使用 OpenCL C++ API 绑定,因为它们使编写和理解 OpenCL 代码变得更加容易! 这是一个很好的第一个示例,可以运行以验证 OpenCL 是否正确安装在您的机器上,以及您的构建环境是否正确设置。
This sample intends to demonstrate how to query various extensions applicable in the context of a reduction algorithm, touch up kernel sources at runtime to select the best kernel implementation for the task.
此示例旨在演示如何查询适用于缩减算法的各种扩展,在运行时修改内核源以选择任务的最佳内核实现。
This sample intends to be a minimal end-to-end OpenCL application doing actual device-side computation. The structure of the sample rhymes well with the How Does OpenCL Work? chapter of the OpenCL-Guide, particularly the Executing an OpenCL Program part. This sample is implemented using both C and C++ languages to demonstrate the difference in verbosity when using the naked C bindings compared to the C++ wrapper.
此示例旨在成为执行实际设备端计算的最小端到端 OpenCL 应用程序。示例的结构与 OpenCL 如何工作? OpenCL 指南的章节,特别是执行 OpenCL 程序部分。 此示例是使用 C 和 C++ 语言实现的,以展示使用裸 C 绑定与 C++ 包装器相比在详细程度方面的差异。
This sample intends to demonstrate how to share images (textures) between OpenCL and OpenGL. How Does OpenCL-OpenGL Interop? chapter of the OpenCL-Guide lays out the fundamentals of OpenCL-OpenGL interoperability.
此示例旨在演示如何在 OpenCL 和 OpenGL 之间共享图像(纹理)。 OpenCL-OpenGL 如何互操作? OpenCL 指南的第 1 章阐述了 OpenCL-OpenGL 互操作性的基础知识。
The sample calculate the histogram of a random sequence with global atomic add and when it is possible, it's using local atomic add.
This sample intends to be a minimal end-to-end OpenCL application doing actual device-side computation. The structure of the sample rhymes well with the How Does OpenCL Work? chapter of the OpenCL-Guide, particularly the Executing an OpenCL Program part. This sample is implemented using C++ languages.
该示例使用全局原子添加计算随机序列的直方图,并且在可能的情况下,它使用局部原子添加。
此示例旨在成为执行实际设备端计算的最小端到端 OpenCL 应用程序。示例的结构与 OpenCL 如何工作? OpenCL 指南的章节,特别是执行 OpenCL 程序部分。 此示例使用 C++ 语言实现。
This sample intends to demonstrate how to share (vertex) buffers between OpenCL and OpenGL. How Does OpenCL-OpenGL Interop? chapter of the OpenCL-Guide lays out the fundamentals of OpenCL-OpenGL interoperability.
此示例旨在演示如何在 OpenCL 和 OpenGL 之间共享(顶点)缓冲区。 OpenCL-OpenGL 如何互操作? OpenCL 指南的第 1 章阐述了 OpenCL-OpenGL 互操作性的基础知识。
A simple argument parser library
https://github.com/likle/cargs.git
http://khronosgroup.github.io/OpenCL-CLHPP/
对于许多大型应用程序,C++ 是首选语言,因此为 OpenCL 定义 C++ 绑定似乎是合理的。
该接口包含在单个 C++ 头文件 opencl.hpp 中,所有定义都包含在命名空间 cl 中。不需要包含 cl.h 并使用 C++ 或原始 C 绑定;只需包含 opencl.hpp 就足够了。
绑定本身是轻量级的,并且与底层 C API 密切对应。使用 C++ 绑定不会引入额外的执行开销。
新标头中有许多兼容性、可移植性和内存管理修复以及其他 OpenCL 2.0 功能。因此,标头不能直接向后兼容,因此我们将其作为 opencl.hpp 而不是新版本的 cl.hpp 发布。
OpenCL SDK 中有两个库,所有示例都在不同程度上使用它们。一个这样的库是 OpenCL 实用程序库,它是一个导出库,旨在简化 OpenCL 的使用,而 OpenCL SDK 库构建在它之上,但在安装 SDK 时不会导出。 OpenCL SDK 库以在 SDK 示例上下文之外可能没有意义的方式扩展了实用程序库。
One may think of this library as analogous to GLU and GLUT in the domain of OpenGL. A set of utilities which condense common tasks into singular functions or add missing functionality of the API which otherwise couldn't be added as a non-API-breaking change. For a complete list utilities provided by this library, refer to the OpenCL Utility Library docs.
The OpenCL Utility Library provides both C and C++ bindings with near feature parity. The utilities are broken into to libraries, OpenCLUtils and OpenCLUtilsCpp. To include them in your project, include <CL/Utils/Utils.h>/<CL/Utils/Utils.hpp> and link to their libraries respectively.
OpenCL 实用程序库提供具有接近功能奇偶性的 C 和 C++ 绑定。这些实用程序分为库、OpenCLUtils 和 OpenCLUtilsCpp。 要将它们包含在您的项目中,请包含 <CL/Utils/Utils.h>/<CL/Utils/Utils.hpp> 并分别链接到它们的库。
The SDK library extends the Utility library by deduplicating common tasks like command-line argument parsing, selecting devices, logging, and other contentious tasks which your application likely does differently, hence the value in shipping it for external use, moreover promise forward and/or backward compatibility is low. For a complete list functionality provided by this library, refer to the OpenCL SDK Library docs.
The OpenCL SDK Library hosts both C and C++ utilities which are generally useful for writing OpenCL applications but are either dependency-heavy or contentious. Because these utilities aren't the subject of universal interest, these utilities are not exported, meaning SDK installations won't install their headers nor their libraries. Doing so the OpenCL Utility Library can be kept dependency-free. The utilities are broken into to libraries, OpenCLSDK and OpenCLSDKCpp. Samples include <CL/Utils/Utils.h>/<CL/Utils/Utils.hpp> and link to their libraries respectively.
OpenCL SDK 库包含 C 和 C++ 实用程序,这些实用程序通常对编写 OpenCL 应用程序很有用,但依赖关系严重或有争议。 因为这些实用程序不是普遍感兴趣的主题,所以不会导出这些实用程序,这意味着 SDK 安装不会安装它们的头文件或库。这样做可以使 OpenCL 实用程序库保持无依赖关系。 这些实用程序分为库、OpenCLSDK 和 OpenCLSDKCpp。示例包括 <CL/Utils/Utils.h>/<CL/Utils/Utils.hpp> 并分别链接到它们的库。
我电脑存在多个 GPU,优先 英伟达、AMD、Intel,其它就无所谓了。 https://pci-ids.ucw.cz/ 完整表格:https://pci-ids.ucw.cz/v2.2/pci.ids
#define OPENCL_VENDOR_NVIDIA 4318 // 0x10de NVIDIA Corporation
#define OPENCL_VENDOR_AMD1 4098 // 0x1002 Advanced Micro Devices, Inc. [AMD/ATI]
#define OPENCL_VENDOR_AMD2 4130 // 0x1022 Advanced Micro Devices, Inc. [AMD]
#define OPENCL_VENDOR_INTEL 32902 // 0x8086 Intel Corporation
#define OPENCL_VENDOR_VMWARE 0x15ad // 0x15ad VMware
游戏本独立显卡:
NVIDIA Corporation
preload 32 ms.
create 0 ms. upload 153 ms. scale 1 ms. download 89 ms.
create 0 ms. upload 143 ms. scale 1 ms. download 102 ms.
create 0 ms. upload 152 ms. scale 0 ms. download 87 ms.
create 0 ms. upload 157 ms. scale 1 ms. download 72 ms.
create 0 ms. upload 219 ms. scale 4 ms. download 195 ms.
create 0 ms. upload 209 ms. scale 3 ms. download 90 ms.
create 0 ms. upload 196 ms. scale 5 ms. download 104 ms.
create 0 ms. upload 225 ms. scale 14 ms. download 123 ms.
create 0 ms. upload 153 ms. scale 2 ms. download 114 ms.
create 0 ms. upload 230 ms. scale 4 ms. download 141 ms.
create 0.0000 ms. upload 183.7000 ms. scale 3.5000 ms. download 111.7000 ms.
NVIDIA Corporation
preload 12 ms.
create 0 ms. upload 181 ms. scale 1 ms. download 108 ms.
create 0 ms. upload 136 ms. scale 0 ms. download 70 ms.
create 0 ms. upload 127 ms. scale 1 ms. download 70 ms.
create 0 ms. upload 164 ms. scale 3 ms. download 91 ms.
create 0 ms. upload 191 ms. scale 3 ms. download 88 ms.
create 0 ms. upload 190 ms. scale 4 ms. download 101 ms.
create 0 ms. upload 187 ms. scale 3 ms. download 103 ms.
create 0 ms. upload 192 ms. scale 3 ms. download 94 ms.
create 0 ms. upload 210 ms. scale 3 ms. download 93 ms.
create 0 ms. upload 185 ms. scale 3 ms. download 94 ms.
create 0.0000 ms. upload 176.3000 ms. scale 2.4000 ms. download 91.2000 ms.
游戏本集成显卡:
Intel(R) Corporation
preload 1301 ms.
create 0 ms. upload 107 ms. scale 6 ms. download 4 ms.
create 2 ms. upload 18 ms. scale 5 ms. download 4 ms.
create 2 ms. upload 18 ms. scale 6 ms. download 3 ms.
create 2 ms. upload 16 ms. scale 5 ms. download 4 ms.
create 2 ms. upload 16 ms. scale 5 ms. download 4 ms.
create 1 ms. upload 17 ms. scale 5 ms. download 4 ms.
create 1 ms. upload 16 ms. scale 6 ms. download 3 ms.
create 2 ms. upload 16 ms. scale 5 ms. download 4 ms.
create 2 ms. upload 20 ms. scale 5 ms. download 7 ms.
create 2 ms. upload 17 ms. scale 5 ms. download 4 ms.
create 1.6000 ms. upload 26.1000 ms. scale 5.3000 ms. download 4.1000 ms.
Intel(R) Corporation
preload 2 ms.
create 0 ms. upload 182 ms. scale 17 ms. download 6 ms.
create 2 ms. upload 17 ms. scale 5 ms. download 4 ms.
create 2 ms. upload 17 ms. scale 5 ms. download 3 ms.
create 2 ms. upload 16 ms. scale 5 ms. download 4 ms.
create 2 ms. upload 19 ms. scale 6 ms. download 3 ms.
create 2 ms. upload 18 ms. scale 6 ms. download 3 ms.
create 3 ms. upload 21 ms. scale 5 ms. download 5 ms.
create 2 ms. upload 18 ms. scale 5 ms. download 4 ms.
create 2 ms. upload 20 ms. scale 5 ms. download 4 ms.
create 1 ms. upload 18 ms. scale 5 ms. download 4 ms.
create 1.8000 ms. upload 34.6000 ms. scale 6.4000 ms. download 4.0000 ms.
它的 API 是基于多个显卡设计的,可以多显卡调度。
https://developer.nvidia.com/opencl
OpenCL Simple Multi-GPU
This application demonstrates how to make use of multiple GPUs in OpenCL.
What is the best way to programmatically choose the best GPU in OpenCL?
感觉最靠谱的还是,写个程序都跑一下,看哪个 gpu 更快。
EXCEPTION_RECORD: (.exr -1)
ExceptionAddress: 62bd3fbc (igdrcl32_62ac0000!GTPin_Init+0x000e982c)
ExceptionCode: c0000094 (Integer divide-by-zero)
ExceptionFlags: 00000000
NumberParameters: 0
STACK_TEXT:
0efaee9c 62bd3fbc igdrcl32_62ac0000!GTPin_Init+0xe982c
0efaeea4 62c450c3 igdrcl32_62ac0000!clGetGLContextInfoKHR+0x5cd73
0efaeebc 62bf61a8 igdrcl32_62ac0000!clGetGLContextInfoKHR+0xde58
0efaeed8 62af7337 igdrcl32_62ac0000!GTPin_Init+0xcba7
0efaef14 62bf7c08 igdrcl32_62ac0000!clGetGLContextInfoKHR+0xf8b8
0efaef44 62bf7fe7 igdrcl32_62ac0000!clGetGLContextInfoKHR+0xfc97
0efaef94 62b0055b igdrcl32_62ac0000!GTPin_Init+0x15dcb
0efaeff8 62bf708e igdrcl32_62ac0000!clGetGLContextInfoKHR+0xed3e
0efaf044 62bf6f9b igdrcl32_62ac0000!clGetGLContextInfoKHR+0xec4b
0efaf080 62bf6cc6 igdrcl32_62ac0000!clGetGLContextInfoKHR+0xe976
0efaf0b8 62c03937 igdrcl32_62ac0000!clGetGLContextInfoKHR+0x1b5e7
0efaf0e8 62b0c425 igdrcl32_62ac0000!GTPin_Init+0x21c95
0efaf110 62b0c303 igdrcl32_62ac0000!GTPin_Init+0x21b73
0efaf124 62b0c1e8 igdrcl32_62ac0000!GTPin_Init+0x21a58
0efaf180 62acd09b igdrcl32_62ac0000!clGetPlatformIDs+0x20b
0efaf2a4 62adfc8f igdrcl32_62ac0000!clEnqueueTask+0x29f
0efaf2c0 693912ec opencl+0x12ec
0efaf2f4 693965a7 opencl!clWaitForEvents+0x5b7
0efaf734 7734a234 ntdll+0x5a234
0efaf758 76f6bd07 KERNELBASE+0x11bd07
0efaf770 69396634 opencl!clWaitForEvents+0x644
EXCEPTION_RECORD: (.exr -1)
ExceptionAddress: 6abb3fbc (igdrcl32_6aaa0000!GTPin_Init+0x000e982c)
ExceptionCode: c0000094 (Integer divide-by-zero)
ExceptionFlags: 00000000
NumberParameters: 0
STACK_TEXT:
0f0beb1c 6abb3fbc igdrcl32_6aaa0000!GTPin_Init+0xe982c
0f0beb24 6ac250c3 igdrcl32_6aaa0000!clGetGLContextInfoKHR+0x5cd73
0f0beb3c 6abd61a8 igdrcl32_6aaa0000!clGetGLContextInfoKHR+0xde58
0f0beb58 6aad7337 igdrcl32_6aaa0000!GTPin_Init+0xcba7
0f0beb94 6abd7c08 igdrcl32_6aaa0000!clGetGLContextInfoKHR+0xf8b8
0f0bebc4 6abd7fe7 igdrcl32_6aaa0000!clGetGLContextInfoKHR+0xfc97
0f0bec14 6aae055b igdrcl32_6aaa0000!GTPin_Init+0x15dcb
0f0bec78 6abd708e igdrcl32_6aaa0000!clGetGLContextInfoKHR+0xed3e
0f0becc4 6abd6f9b igdrcl32_6aaa0000!clGetGLContextInfoKHR+0xec4b
0f0bed00 6abd6cc6 igdrcl32_6aaa0000!clGetGLContextInfoKHR+0xe976
0f0bed38 6abe3937 igdrcl32_6aaa0000!clGetGLContextInfoKHR+0x1b5e7
0f0bed68 6aaec425 igdrcl32_6aaa0000!GTPin_Init+0x21c95
0f0bed90 6aaec303 igdrcl32_6aaa0000!GTPin_Init+0x21b73
0f0beda4 6aaec1e8 igdrcl32_6aaa0000!GTPin_Init+0x21a58
0f0bee00 6aaad09b igdrcl32_6aaa0000!clGetPlatformIDs+0x20b
0f0bef24 6aabfc8f igdrcl32_6aaa0000!clEnqueueTask+0x29f
0f0bef40 6e8d12ec opencl+0x12ec
0f0bef74 6e8d65a7 opencl!clWaitForEvents+0x5b7
0f0bf3b4 77187ec4 ntdll+0x57ec4
0f0bf3d8 76c44137 KERNELBASE+0x114137
0f0bf3f0 6e8d6634 opencl!clWaitForEvents+0x644
EXCEPTION_RECORD: (.exr -1)
ExceptionAddress: 0f64811c (igdrcl32_f5c0000+0x0008811c)
ExceptionCode: c0000094 (Integer divide-by-zero)
ExceptionFlags: 00000000
NumberParameters: 0
STACK_TEXT:
09feee6c 0f64811c igdrcl32_f5c0000+0x8811c
09feee74 0f7972b9 igdrcl32_f5c0000+0x1d72b9
09feee8c 0f731288 igdrcl32_f5c0000+0x171288
09feeea8 0f601d64 igdrcl32_f5c0000+0x41d64
09feeee4 0f733e97 igdrcl32_f5c0000+0x173e97
09feef14 0f734180 igdrcl32_f5c0000+0x174180
09feef64 0f60a121 igdrcl32_f5c0000+0x4a121
09feefc8 0f623464 igdrcl32_f5c0000+0x63464
09fef014 0f62338b igdrcl32_f5c0000+0x6338b
09fef050 0f6230c9 igdrcl32_f5c0000+0x630c9
09fef088 0f76e5ec igdrcl32_f5c0000+0x1ae5ec
09fef0c0 0f644938 igdrcl32_f5c0000+0x84938
09fef11c 0f5d5784 igdrcl32_f5c0000+0x15784
09fef1f4 0f5e7b3f igdrcl32_f5c0000+0x27b3f
09fef210 719c132c opencl+0x132c
09fef244 719c65a7 opencl!clWaitForEvents+0x5b7
09fef684 77199dd6 ntdll+0x59dd6
09fef6a8 76eae917 KERNELBASE+0x10e917
09fef6c0 719c6634 opencl!clWaitForEvents+0x644
EXCEPTION_RECORD: (.exr -1)
ExceptionAddress: 71f05260 (opencl!clGetDeviceIDs+0x00000040)
ExceptionCode: c0000005 (Access violation)
ExceptionFlags: 00000000
NumberParameters: 2
Parameter[0]: 00000000
Parameter[1]: 5d45c448
Attempt to read from address 5d45c448
STACK_TEXT:
09abf588 71f05260 opencl!clGetDeviceIDs+0x40
EXCEPTION_RECORD: (.exr -1)
ExceptionAddress: 6da28112 (opencl!clWaitForEvents+0x00002122)
ExceptionCode: c0000005 (Access violation)
ExceptionFlags: 00000000
NumberParameters: 2
Parameter[0]: 00000000
Parameter[1]: 00000000
Attempt to read from address 00000000
STACK_TEXT:
0fbfeea8 6da28112 opencl!clWaitForEvents+0x2122
0fbff180 6da26210 opencl!clWaitForEvents+0x220
0fbff5c4 776fbdf4 ntdll+0x3bdf4
0fbff5e0 76d2d69e kernel32+0x2d69e
0fbff5f8 6da26634 opencl!clWaitForEvents+0x644