Halide
gpu_context.h
Go to the documentation of this file.
1 #if defined(TEST_OPENCL)
2 // Implement OpenCL custom context.
3 
4 #define CL_TARGET_OPENCL_VERSION 120
5 #define CL_USE_DEPRECATED_OPENCL_1_2_APIS
6 #ifdef __APPLE__
7 #include <OpenCL/cl.h>
8 #else
9 #include <CL/cl.h>
10 #endif
11 
12 // Create the global context. This is just a helper function not called by Halide.
13 inline bool create_opencl_context(cl_context &cl_ctx, cl_command_queue &cl_q) {
14  cl_int err = 0;
15 
16  const cl_uint maxPlatforms = 4;
17  cl_platform_id platforms[maxPlatforms];
18  cl_uint platformCount = 0;
19 
20  err = clGetPlatformIDs(maxPlatforms, platforms, &platformCount);
21  if (err != CL_SUCCESS) {
22  printf("clGetPlatformIDs failed (%d)\n", err);
23  return false;
24  }
25 
26  cl_platform_id platform = nullptr;
27 
28  if (platformCount > 0) {
29  platform = platforms[0];
30  }
31  if (platform == nullptr) {
32  printf("Failed to get platform\n");
33  return false;
34  }
35 
37 
38  // Make sure we have a device
39  const cl_uint maxDevices = 4;
40  cl_device_id devices[maxDevices];
41  cl_uint deviceCount = 0;
42  err = clGetDeviceIDs(platform, device_type, maxDevices, devices, &deviceCount);
43  if (err != CL_SUCCESS) {
44  printf("clGetDeviceIDs failed (%d)\n", err);
45  return false;
46  }
47  if (deviceCount == 0) {
48  printf("Failed to get device\n");
49  return false;
50  }
51 
52  cl_device_id dev = devices[deviceCount - 1];
53 
54  // Create context and command queue.
56  0};
57  cl_ctx = clCreateContext(properties, 1, &dev, nullptr, nullptr, &err);
58  if (err != CL_SUCCESS) {
59  printf("clCreateContext failed (%d)\n", err);
60  return false;
61  }
62 
63  cl_q = clCreateCommandQueue(cl_ctx, dev, 0, &err);
64  if (err != CL_SUCCESS) {
65  printf("clCreateCommandQueue failed (%d)\n", err);
66  return false;
67  }
68  return true;
69 }
70 
71 inline void destroy_opencl_context(cl_context cl_ctx, cl_command_queue cl_q) {
72  clReleaseCommandQueue(cl_q);
73  clReleaseContext(cl_ctx);
74 }
75 
76 #elif defined(TEST_CUDA)
77 // Implement CUDA custom context.
78 #include <cuda.h>
79 
80 inline bool create_cuda_context(CUcontext &cuda_ctx) {
81  // Initialize CUDA
82  CUresult err = cuInit(0);
83  if (err != CUDA_SUCCESS) {
84  printf("cuInit failed (%d)\n", err);
85  return false;
86  }
87 
88  // Make sure we have a device
89  int deviceCount = 0;
90  err = cuDeviceGetCount(&deviceCount);
91  if (err != CUDA_SUCCESS) {
92  printf("cuGetDeviceCount failed (%d)\n", err);
93  return false;
94  }
95  if (deviceCount <= 0) {
96  printf("No CUDA devices available\n");
97  return false;
98  }
99 
100  CUdevice dev;
101  // Get device
102  CUresult status;
103  // Try to get a device >0 first, since 0 should be our display device
104  // For now, don't try devices > 2 to maintain compatibility with previous behavior.
105  if (deviceCount > 2) deviceCount = 2;
106  for (int id = deviceCount - 1; id >= 0; id--) {
107  status = cuDeviceGet(&dev, id);
108  if (status == CUDA_SUCCESS) break;
109  }
110 
111  if (status != CUDA_SUCCESS) {
112  printf("Failed to get CUDA device\n");
113  return status;
114  }
115 
116  // Create context
117  err = cuCtxCreate(&cuda_ctx, 0, dev);
118  if (err != CUDA_SUCCESS) {
119  printf("cuCtxCreate failed (%d)\n", err);
120  return false;
121  }
122 
123  return true;
124 }
125 
126 inline void destroy_cuda_context(CUcontext cuda_ctx) {
127  cuCtxDestroy(cuda_ctx);
128 }
129 
130 #elif defined(TEST_METAL) && defined(__OBJC__)
131 #include <Metal/MTLCommandQueue.h>
132 #include <Metal/MTLDevice.h>
133 
// Find a Metal device and create a command queue on it.
//
// The system default device is tried first; if there is none, the first entry
// of the list returned by MTLCopyAllDevices() is used instead.
//
// device: receives the selected device. The caller owns one reference, which
//         destroy_metal_context() releases.
// queue:  receives the new command queue, also owned by the caller.
//
// Returns true on success. On failure a diagnostic is printed and false is
// returned; if only the queue creation failed, `device` still holds the
// selected device.
//
// This file uses manual retain/release (see destroy_metal_context()).
inline bool create_metal_context(id<MTLDevice> &device, id<MTLCommandQueue> &queue) {
    device = MTLCreateSystemDefaultDevice();
    if (device == nullptr) {
        NSArray<id<MTLDevice>> *devices = MTLCopyAllDevices();
        // -firstObject yields nil for an empty (or nil) array, whereas
        // indexing element 0 of an empty array raises an exception.
        device = [devices firstObject];
        // Take our own reference before dropping the array, so the device
        // handed back to the caller is owned the same way as the one returned
        // by MTLCreateSystemDefaultDevice(). Messaging nil is a no-op.
        [device retain];
        [devices release];
    }
    if (device == nullptr) {
        printf("Failed to find Metal device.\n");
        return false;
    }
    queue = [device newCommandQueue];
    if (queue == nullptr) {
        printf("Failed to create Metal command queue.\n");
        return false;
    }
    return true;
}
153 
// Drop the references to the command queue and device obtained from
// create_metal_context(). The queue is released before the device that
// created it. Sending -release to nil is a no-op.
inline void destroy_metal_context(id<MTLDevice> device, id<MTLCommandQueue> queue) {
    [queue release];
    [device release];
}
158 
159 #elif defined(TEST_WEBGPU)
160 
161 #include "mini_webgpu.h"
162 
// TODO: Remove all of this when wgpuInstanceProcessEvents() is supported.
// See https://github.com/halide/Halide/issues/7248
#ifdef WITH_DAWN_NATIVE
// From <unistd.h>, used to spin-lock while waiting for device initialization.
extern "C" int usleep(uint32_t);
#else
// Defined by Emscripten, and used to yield execution to asynchronous Javascript
// work in combination with Emscripten's "Asyncify" mechanism.
extern "C" void emscripten_sleep(unsigned int ms);
#endif
175 
176 inline bool create_webgpu_context(WGPUInstance *instance_out, WGPUAdapter *adapter_out, WGPUDevice *device_out, WGPUBuffer *staging_buffer_out) {
177  struct Results {
178  WGPUInstance instance = nullptr;
179  WGPUAdapter adapter = nullptr;
180  WGPUDevice device = nullptr;
181  WGPUBuffer staging_buffer = nullptr;
182  bool success = true;
183  } results;
184 
185  // TODO: Unify this when Emscripten implements wgpuCreateInstance().
186  // See https://github.com/halide/Halide/issues/7248
187 #ifdef WITH_DAWN_NATIVE
188  WGPUInstanceDescriptor desc{};
189  desc.nextInChain = nullptr;
190  results.instance = wgpuCreateInstance(&desc);
191 #else
192  results.instance = nullptr;
193 #endif
194 
195  auto request_adapter_callback = [](WGPURequestAdapterStatus status, WGPUAdapter adapter, char const *message, void *userdata) {
196  auto *results = (Results *)userdata;
197 
198  if (status != WGPURequestAdapterStatus_Success) {
199  results->success = false;
200  return;
201  }
202  results->adapter = adapter;
203 
204  // Use the defaults for most limits.
205  WGPURequiredLimits requestedLimits{};
206  requestedLimits.nextInChain = nullptr;
207  memset(&requestedLimits.limits, 0xFF, sizeof(WGPULimits));
208 
209  // TODO: Enable for Emscripten when wgpuAdapterGetLimits is supported.
210  // See https://github.com/halide/Halide/issues/7248
211 #ifdef WITH_DAWN_NATIVE
212  WGPUSupportedLimits supportedLimits{};
213  supportedLimits.nextInChain = nullptr;
214  if (!wgpuAdapterGetLimits(adapter, &supportedLimits)) {
215  results->success = false;
216  return;
217  } else {
218  // Raise the limits on buffer size and workgroup storage size.
219  requestedLimits.limits.maxBufferSize = supportedLimits.limits.maxBufferSize;
220  requestedLimits.limits.maxStorageBufferBindingSize = supportedLimits.limits.maxStorageBufferBindingSize;
221  requestedLimits.limits.maxComputeWorkgroupStorageSize = supportedLimits.limits.maxComputeWorkgroupStorageSize;
222  }
223 #endif
224 
225  WGPUDeviceDescriptor desc{};
226  desc.nextInChain = nullptr;
227  desc.label = nullptr;
228  desc.requiredFeaturesCount = 0;
229  desc.requiredFeatures = nullptr;
230  desc.requiredLimits = &requestedLimits;
231 
232  auto request_device_callback = [](WGPURequestDeviceStatus status,
233  WGPUDevice device,
234  char const *message,
235  void *userdata) {
236  auto *results = (Results *)userdata;
237  if (status != WGPURequestDeviceStatus_Success) {
238  results->success = false;
239  return;
240  }
241  results->device = device;
242 
243  auto device_lost_callback = [](WGPUDeviceLostReason reason,
244  char const *message,
245  void *userdata) {
246  fprintf(stderr, "WGPU Device Lost: %d %s", (int)reason, message);
247  abort();
248  };
249  wgpuDeviceSetDeviceLostCallback(device, device_lost_callback, userdata);
250 
251  // Create a staging buffer for transfers.
252  constexpr int kStagingBufferSize = 4 * 1024 * 1024;
253  WGPUBufferDescriptor desc{};
254  desc.nextInChain = nullptr;
255  desc.label = nullptr;
257  desc.size = kStagingBufferSize;
258  desc.mappedAtCreation = false;
259  results->staging_buffer = wgpuDeviceCreateBuffer(device, &desc);
260  if (results->staging_buffer == nullptr) {
261  results->success = false;
262  return;
263  }
264  };
265 
266  wgpuAdapterRequestDevice(adapter, &desc, request_device_callback, userdata);
267  };
268 
269  wgpuInstanceRequestAdapter(results.instance, nullptr, request_adapter_callback, &results);
270 
271  // Wait for device initialization to complete.
272  while (!results.device && results.success) {
273  // TODO: Use wgpuInstanceProcessEvents() when it is supported.
274  // See https://github.com/halide/Halide/issues/7248
275 #ifndef WITH_DAWN_NATIVE
276  emscripten_sleep(10);
277 #else
278  usleep(1000);
279 #endif
280  }
281 
282  *instance_out = results.instance;
283  *adapter_out = results.adapter;
284  *device_out = results.device;
285  *staging_buffer_out = results.staging_buffer;
286  return results.success;
287 }
288 
289 inline void destroy_webgpu_context(WGPUInstance instance, WGPUAdapter adapter, WGPUDevice device, WGPUBuffer staging_buffer) {
290  wgpuDeviceSetDeviceLostCallback(device, nullptr, nullptr);
291  wgpuBufferRelease(staging_buffer);
292  wgpuDeviceRelease(device);
293  wgpuAdapterRelease(adapter);
294 
295  // TODO: Unify this when Emscripten supports wgpuInstanceRelease().
296  // See https://github.com/halide/Halide/issues/7248
297 #ifdef WITH_DAWN_NATIVE
298  wgpuInstanceRelease(instance);
299 #endif
300 }
301 
302 #endif
WGPUDeviceDescriptor::nextInChain
const WGPUChainedStruct * nextInChain
Definition: mini_webgpu.h:1320
WGPURequestDeviceStatus
WGPURequestDeviceStatus
Definition: mini_webgpu.h:373
Halide::Runtime::Internal::Cuda::CUresult
CUresult
Definition: mini_cuda.h:43
WGPUBufferDescriptor
Definition: mini_webgpu.h:729
wgpuAdapterRelease
WGPU_EXPORT void wgpuAdapterRelease(WGPUAdapter adapter)
Halide::Runtime::Internal::Cuda::CUDA_SUCCESS
@ CUDA_SUCCESS
Definition: mini_cuda.h:44
WGPUSupportedLimits::nextInChain
WGPUChainedStructOut * nextInChain
Definition: mini_webgpu.h:1274
Halide::Runtime::Internal::Vulkan::device_type
WEAK char device_type[256]
Definition: vulkan_extensions.h:23
CL_DEVICE_TYPE_ALL
#define CL_DEVICE_TYPE_ALL
Definition: mini_cl.h:218
CL_CONTEXT_PLATFORM
#define CL_CONTEXT_PLATFORM
Definition: mini_cl.h:330
WGPUSupportedLimits
Definition: mini_webgpu.h:1273
WGPUInstanceDescriptor
Definition: mini_webgpu.h:875
wgpuInstanceRequestAdapter
WGPU_EXPORT void wgpuInstanceRequestAdapter(WGPUInstance instance, WGPURequestAdapterOptions const *options, WGPURequestAdapterCallback callback, void *userdata)
abort
void abort()
Halide::Runtime::Internal::Cuda::CUcontext
struct CUctx_st * CUcontext
CUDA context.
Definition: mini_cuda.h:22
WGPUInstanceDescriptor::nextInChain
const WGPUChainedStruct * nextInChain
Definition: mini_webgpu.h:876
memset
void * memset(void *s, int val, size_t n)
cl_int
int32_t cl_int
Definition: mini_cl.h:44
CL_SUCCESS
#define CL_SUCCESS
Definition: mini_cl.h:133
Halide::Runtime::Internal::Cuda::CUdevice
int CUdevice
CUDA device.
Definition: mini_cuda.h:21
mini_webgpu.h
WGPUDeviceDescriptor
Definition: mini_webgpu.h:1319
wgpuInstanceRelease
WGPU_EXPORT void wgpuInstanceRelease(WGPUInstance instance)
WGPUDevice
struct WGPUDeviceImpl * WGPUDevice
Definition: mini_webgpu.h:72
cl_uint
uint32_t cl_uint
Definition: mini_cl.h:45
wgpuAdapterRequestDevice
WGPU_EXPORT void wgpuAdapterRequestDevice(WGPUAdapter adapter, WGPUDeviceDescriptor const *descriptor, WGPURequestDeviceCallback callback, void *userdata)
cl_command_queue
struct _cl_command_queue * cl_command_queue
Definition: mini_cl.h:58
cl_device_type
cl_bitfield cl_device_type
Definition: mini_cl.h:67
WGPURequiredLimits
Definition: mini_webgpu.h:1268
WGPUBuffer
struct WGPUBufferImpl * WGPUBuffer
Definition: mini_webgpu.h:67
wgpuCreateInstance
WGPU_EXPORT WGPUInstance wgpuCreateInstance(WGPUInstanceDescriptor const *descriptor)
WGPUBufferDescriptor::nextInChain
const WGPUChainedStruct * nextInChain
Definition: mini_webgpu.h:730
cl_platform_id
struct _cl_platform_id * cl_platform_id
Definition: mini_cl.h:55
wgpuDeviceRelease
WGPU_EXPORT void wgpuDeviceRelease(WGPUDevice device)
cl_device_id
struct _cl_device_id * cl_device_id
Definition: mini_cl.h:56
WGPURequestDeviceStatus_Success
@ WGPURequestDeviceStatus_Success
Definition: mini_webgpu.h:374
wgpuDeviceSetDeviceLostCallback
WGPU_EXPORT void wgpuDeviceSetDeviceLostCallback(WGPUDevice device, WGPUDeviceLostCallback callback, void *userdata)
WGPUInstance
struct WGPUInstanceImpl * WGPUInstance
Definition: mini_webgpu.h:74
wgpuAdapterGetLimits
WGPU_EXPORT bool wgpuAdapterGetLimits(WGPUAdapter adapter, WGPUSupportedLimits *limits)
wgpuDeviceCreateBuffer
WGPU_EXPORT WGPUBuffer wgpuDeviceCreateBuffer(WGPUDevice device, WGPUBufferDescriptor const *descriptor)
WGPUBufferUsage_MapRead
@ WGPUBufferUsage_MapRead
Definition: mini_webgpu.h:630
WGPULimits
Definition: mini_webgpu.h:879
cl_context_properties
intptr_t cl_context_properties
Definition: mini_cl.h:78
cl_context
struct _cl_context * cl_context
Definition: mini_cl.h:57
uint32_t
unsigned __INT32_TYPE__ uint32_t
Definition: runtime_internal.h:25
WGPUDeviceLostReason
WGPUDeviceLostReason
Definition: mini_webgpu.h:225
wgpuBufferRelease
WGPU_EXPORT void wgpuBufferRelease(WGPUBuffer buffer)
WGPUBufferUsage_CopyDst
@ WGPUBufferUsage_CopyDst
Definition: mini_webgpu.h:633
WGPURequestAdapterStatus_Success
@ WGPURequestAdapterStatus_Success
Definition: mini_webgpu.h:366
WGPURequestAdapterStatus
WGPURequestAdapterStatus
Definition: mini_webgpu.h:365
WGPUAdapter
struct WGPUAdapterImpl * WGPUAdapter
Definition: mini_webgpu.h:64
WGPURequiredLimits::nextInChain
const WGPUChainedStruct * nextInChain
Definition: mini_webgpu.h:1269