Halide 19.0.0
Halide compiler and libraries
Loading...
Searching...
No Matches
HalideBuffer.h
Go to the documentation of this file.
1/** \file
2 * Defines a Buffer type that wraps from halide_buffer_t and adds
3 * functionality, and methods for more conveniently iterating over the
4 * samples in a halide_buffer_t outside of Halide code. */
5
6#ifndef HALIDE_RUNTIME_BUFFER_H
7#define HALIDE_RUNTIME_BUFFER_H
8
9#include <algorithm>
10#include <atomic>
11#include <cassert>
12#include <cstdint>
13#include <cstdlib>
14#include <cstring>
15#include <limits>
16#include <memory>
17#include <type_traits>
18#include <vector>
19
20#ifdef __APPLE__
21#include <AvailabilityVersions.h>
22#include <TargetConditionals.h>
23#endif
24
25#if defined(__has_feature)
26#if __has_feature(memory_sanitizer)
27#include <sanitizer/msan_interface.h>
28#endif
29#endif
30
31#include "HalideRuntime.h"
32
33#ifdef _MSC_VER
34#include <malloc.h>
35#define HALIDE_ALLOCA _alloca
36#else
37#define HALIDE_ALLOCA __builtin_alloca
38#endif
39
40// gcc 5.1 has a false positive warning on this code
41#if __GNUC__ == 5 && __GNUC_MINOR__ == 1
42#pragma GCC diagnostic ignored "-Warray-bounds"
43#endif
44
45#ifndef HALIDE_RUNTIME_BUFFER_CHECK_INDICES
46#define HALIDE_RUNTIME_BUFFER_CHECK_INDICES 0
47#endif
48
49#ifndef HALIDE_RUNTIME_BUFFER_ALLOCATION_ALIGNMENT
50// Conservatively align buffer allocations to 128 bytes by default.
51// This is enough alignment for all the platforms currently in use.
52// Redefine this in your compiler settings if you desire more/less alignment.
53#define HALIDE_RUNTIME_BUFFER_ALLOCATION_ALIGNMENT 128
54#endif
55
57 "HALIDE_RUNTIME_BUFFER_ALLOCATION_ALIGNMENT must be a power of 2.");
58
59// Unfortunately, not all C++17 runtimes support aligned_alloc
60// (it may depends on OS/SDK version); this is provided as an opt-out
61// if you are compiling on a platform that doesn't provide a (good)
62// implementation. (Note that we actually use the C11 `::aligned_alloc()`
63// rather than the C++17 `std::aligned_alloc()` because at least one platform
64// we found supports the former but not the latter.)
65#ifndef HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC
66
67// clang-format off
68#ifdef _MSC_VER
69
70 // MSVC doesn't implement aligned_alloc(), even in C++17 mode, and
71 // has stated they probably never will, so, always default it off here.
72 #define HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC 0
73
74#elif defined(__ANDROID_API__) && __ANDROID_API__ < 28
75
76 // Android doesn't provide aligned_alloc until API 28
77 #define HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC 0
78
79#elif defined(__APPLE__)
80
81 #if TARGET_OS_OSX && (__MAC_OS_X_VERSION_MIN_REQUIRED < __MAC_10_15)
82
83 // macOS doesn't provide aligned_alloc until 10.15
84 #define HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC 0
85
86 #elif TARGET_OS_IPHONE && (__IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_14_0)
87
88 // iOS doesn't provide aligned_alloc until 14.0
89 #define HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC 0
90
91 #else
92
93 // Assume it's ok on all other Apple targets
94 #define HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC 1
95
96 #endif
97
98#else
99
100 #if defined(__GLIBCXX__) && !defined(_GLIBCXX_HAVE_ALIGNED_ALLOC)
101
102 // ARM GNU-A baremetal compiler doesn't provide aligned_alloc as of 12.2
103 #define HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC 0
104
105 #else
106
107 // Not Windows, Android, or Apple: just assume it's ok
108 #define HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC 1
109
110 #endif
111
112#endif
113// clang-format on
114
115#endif // HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC
116
117namespace Halide {
118namespace Runtime {
119
120// Forward-declare our Buffer class
121template<typename T, int Dims, int InClassDimStorage>
122class Buffer;
123
124// A helper to check if a parameter pack is entirely implicitly
125// int-convertible to use with std::enable_if
126template<typename... Args>
127struct AllInts : std::false_type {};
128
129template<>
130struct AllInts<> : std::true_type {};
131
132template<typename T, typename... Args>
133struct AllInts<T, Args...> {
134 static const bool value = std::is_convertible<T, int>::value && AllInts<Args...>::value;
135};
136
137// Floats and doubles are technically implicitly int-convertible, but
138// doing so produces a warning we treat as an error, so just disallow
139// it here.
140template<typename... Args>
141struct AllInts<float, Args...> : std::false_type {};
142
143template<typename... Args>
144struct AllInts<double, Args...> : std::false_type {};
145
146namespace Internal {
147// A helper to detect if there are any zeros in a container
148template<typename Container>
149bool any_zero(const Container &c) {
150 for (int i : c) {
151 if (i == 0) {
152 return true;
153 }
154 }
155 return false;
156}
157
159 static inline void *(*default_allocate_fn)(size_t) = nullptr;
160 static inline void (*default_deallocate_fn)(void *) = nullptr;
161};
162} // namespace Internal
163
164/** A struct acting as a header for allocations owned by the Buffer
165 * class itself. */
167 void (*deallocate_fn)(void *);
168 std::atomic<int> ref_count;
169
170 // Note that ref_count always starts at 1
171 explicit AllocationHeader(void (*deallocate_fn)(void *))
173 }
174};
175
176/** This indicates how to deallocate the device for a Halide::Runtime::Buffer. */
177enum struct BufferDeviceOwnership : int {
178 Allocated, ///> halide_device_free will be called when device ref count goes to zero
179 WrappedNative, ///> halide_device_detach_native will be called when device ref count goes to zero
180 Unmanaged, ///> No free routine will be called when device ref count goes to zero
181 AllocatedDeviceAndHost, ///> Call device_and_host_free when DevRefCount goes to zero.
182 Cropped, ///> Call halide_device_release_crop when DevRefCount goes to zero.
183};
184
185/** A similar struct for managing device allocations. */
187 // This is only ever constructed when there's something to manage,
188 // so start at one.
189 std::atomic<int> count{1};
191};
192
193constexpr int AnyDims = -1;
194
195/** A templated Buffer class that wraps halide_buffer_t and adds
196 * functionality. When using Halide from C++, this is the preferred
197 * way to create input and output buffers. The overhead of using this
198 * class relative to a naked halide_buffer_t is minimal - it uses another
199 * ~16 bytes on the stack, and does no dynamic allocations when using
200 * it to represent existing memory of a known maximum dimensionality.
201 *
202 * The template parameter T is the element type. For buffers where the
203 * element type is unknown, or may vary, use void or const void.
204 *
205 * The template parameter Dims is the number of dimensions. For buffers where
206 * the dimensionality type is unknown at, or may vary, use AnyDims.
207 *
208 * InClassDimStorage is the maximum number of dimensions that can be represented
209 * using space inside the class itself. Set it to the maximum dimensionality
210 * you expect this buffer to be. If the actual dimensionality exceeds
211 * this, heap storage is allocated to track the shape of the buffer.
212 * InClassDimStorage defaults to 4, which should cover nearly all usage.
213 *
214 * The class optionally allocates and owns memory for the image using
215 * a shared pointer allocated with the provided allocator. If they are
216 * null, malloc and free are used. Any device-side allocation is
217 * considered as owned if and only if the host-side allocation is
218 * owned. */
219template<typename T = void,
220 int Dims = AnyDims,
221 int InClassDimStorage = (Dims == AnyDims ? 4 : std::max(Dims, 1))>
222class Buffer {
223 /** The underlying halide_buffer_t */
224 halide_buffer_t buf = {};
225
226 /** Some in-class storage for shape of the dimensions. */
227 halide_dimension_t shape[InClassDimStorage];
228
229 /** The allocation owned by this Buffer. NULL if the Buffer does not
230 * own the memory. */
231 AllocationHeader *alloc = nullptr;
232
233 /** A reference count for the device allocation owned by this
234 * buffer. */
235 mutable DeviceRefCount *dev_ref_count = nullptr;
236
237 /** True if T is of type void or const void */
238 static const bool T_is_void = std::is_same<typename std::remove_const<T>::type, void>::value;
239
240 /** A type function that adds a const qualifier if T is a const type. */
241 template<typename T2>
242 using add_const_if_T_is_const = typename std::conditional<std::is_const<T>::value, const T2, T2>::type;
243
244 /** T unless T is (const) void, in which case (const)
245 * uint8_t. Useful for providing return types for operator() */
246 using not_void_T = typename std::conditional<T_is_void,
247 add_const_if_T_is_const<uint8_t>,
248 T>::type;
249
250 /** T with constness removed. Useful for return type of copy(). */
251 using not_const_T = typename std::remove_const<T>::type;
252
253 /** The type the elements are stored as. Equal to not_void_T
254 * unless T is a pointer, in which case uint64_t. Halide stores
255 * all pointer types as uint64s internally, even on 32-bit
256 * systems. */
257 using storage_T = typename std::conditional<std::is_pointer<T>::value, uint64_t, not_void_T>::type;
258
259public:
260 /** True if the Halide type is not void (or const void). */
261 static constexpr bool has_static_halide_type = !T_is_void;
262
263 /** Get the Halide type of T. Callers should not use the result if
264 * has_static_halide_type is false. */
266 return halide_type_of<typename std::remove_cv<not_void_T>::type>();
267 }
268
269 /** Does this Buffer own the host memory it refers to? */
270 bool owns_host_memory() const {
271 return alloc != nullptr;
272 }
273
274 static constexpr bool has_static_dimensions = (Dims != AnyDims);
275
276 /** Callers should not use the result if
277 * has_static_dimensions is false. */
278 static constexpr int static_dimensions() {
279 return Dims;
280 }
281
282 static_assert(!has_static_dimensions || static_dimensions() >= 0);
283
284private:
285 /** Increment the reference count of any owned allocation */
286 void incref() const {
287 if (owns_host_memory()) {
288 alloc->ref_count++;
289 }
290 if (buf.device) {
291 if (!dev_ref_count) {
292 // I seem to have a non-zero dev field but no
293 // reference count for it. I must have been given a
294 // device allocation by a Halide pipeline, and have
295 // never been copied from since. Take sole ownership
296 // of it.
297 dev_ref_count = new DeviceRefCount;
298 }
299 dev_ref_count->count++;
300 }
301 }
302
303 // Note that this is called "cropped" but can also encompass a slice/embed
304 // operation as well.
305 struct DevRefCountCropped : DeviceRefCount {
306 // We will only store Buffers that have a dynamic number of dimensions.
307 // Buffers that cropped or sliced from need to be first converted to
308 // one with variable size. This is required because we cannot possibly
309 // know what the actual dimensionality is of the buffer this is a
310 // crop or slice from. Since cropping a sliced buffer is also possible,
311 // no optimizations can be made for cropped buffers either.
312 Buffer<T, AnyDims> cropped_from;
313 explicit DevRefCountCropped(const Buffer<T, AnyDims> &cropped_from)
314 : cropped_from(cropped_from) {
316 }
317 };
318
319 /** Setup the device ref count for a buffer to indicate it is a crop (or slice, embed, etc) of cropped_from */
320 void crop_from(const Buffer<T, AnyDims> &cropped_from) {
321 assert(dev_ref_count == nullptr);
322 dev_ref_count = new DevRefCountCropped(cropped_from);
323 }
324
325 /** Decrement the reference count of any owned allocation and free host
326 * and device memory if it hits zero. Sets alloc to nullptr. */
327 void decref(bool device_only = false) {
328 if (owns_host_memory() && !device_only) {
329 int new_count = --(alloc->ref_count);
330 if (new_count == 0) {
331 void (*fn)(void *) = alloc->deallocate_fn;
332 alloc->~AllocationHeader();
333 fn(alloc);
334 }
335 buf.host = nullptr;
336 alloc = nullptr;
337 set_host_dirty(false);
338 }
339 int new_count = 0;
340 if (dev_ref_count) {
341 new_count = --(dev_ref_count->count);
342 }
343 if (new_count == 0) {
344 if (buf.device) {
345 assert(!(alloc && device_dirty()) &&
346 "Implicitly freeing a dirty device allocation while a host allocation still lives. "
347 "Call device_free explicitly if you want to drop dirty device-side data. "
348 "Call copy_to_host explicitly if you want the data copied to the host allocation "
349 "before the device allocation is freed.");
350 int result = halide_error_code_success;
351 if (dev_ref_count && dev_ref_count->ownership == BufferDeviceOwnership::WrappedNative) {
352 result = buf.device_interface->detach_native(nullptr, &buf);
353 } else if (dev_ref_count && dev_ref_count->ownership == BufferDeviceOwnership::AllocatedDeviceAndHost) {
354 result = buf.device_interface->device_and_host_free(nullptr, &buf);
355 } else if (dev_ref_count && dev_ref_count->ownership == BufferDeviceOwnership::Cropped) {
356 result = buf.device_interface->device_release_crop(nullptr, &buf);
357 } else if (dev_ref_count == nullptr || dev_ref_count->ownership == BufferDeviceOwnership::Allocated) {
358 result = buf.device_interface->device_free(nullptr, &buf);
359 }
360 // No reasonable way to return the error, but we can at least assert-fail in debug builds.
361 assert((result == halide_error_code_success) && "device_interface call returned a nonzero result in Buffer::decref()");
362 (void)result;
363 }
364 if (dev_ref_count) {
365 if (dev_ref_count->ownership == BufferDeviceOwnership::Cropped) {
366 delete (DevRefCountCropped *)dev_ref_count;
367 } else {
368 delete dev_ref_count;
369 }
370 }
371 }
372 dev_ref_count = nullptr;
373 buf.device = 0;
374 buf.device_interface = nullptr;
375 }
376
377 void free_shape_storage() {
378 if (buf.dim != shape) {
379 delete[] buf.dim;
380 buf.dim = nullptr;
381 }
382 }
383
384 template<int DimsSpecified>
385 void make_static_shape_storage() {
386 static_assert(Dims == AnyDims || Dims == DimsSpecified,
387 "Number of arguments to Buffer() does not match static dimensionality");
388 buf.dimensions = DimsSpecified;
389 if constexpr (Dims == AnyDims) {
390 if constexpr (DimsSpecified <= InClassDimStorage) {
391 buf.dim = shape;
392 } else {
393 static_assert(DimsSpecified >= 1);
394 buf.dim = new halide_dimension_t[DimsSpecified];
395 }
396 } else {
397 static_assert(InClassDimStorage >= Dims);
398 buf.dim = shape;
399 }
400 }
401
402 void make_shape_storage(const int dimensions) {
403 if (Dims != AnyDims && Dims != dimensions) {
404 assert(false && "Number of arguments to Buffer() does not match static dimensionality");
405 }
406 // This should usually be inlined, so if dimensions is statically known,
407 // we can skip the call to new
409 buf.dim = (dimensions <= InClassDimStorage) ? shape : new halide_dimension_t[dimensions];
410 }
411
412 void copy_shape_from(const halide_buffer_t &other) {
413 // All callers of this ensure that buf.dimensions == other.dimensions.
414 make_shape_storage(other.dimensions);
415 std::copy(other.dim, other.dim + other.dimensions, buf.dim);
416 }
417
418 template<typename T2, int D2, int S2>
419 void move_shape_from(Buffer<T2, D2, S2> &&other) {
420 if (other.shape == other.buf.dim) {
421 copy_shape_from(other.buf);
422 } else {
423 buf.dim = other.buf.dim;
424 other.buf.dim = nullptr;
425 }
426 other.buf = halide_buffer_t();
427 }
428
429 /** Initialize the shape from a halide_buffer_t. */
430 void initialize_from_buffer(const halide_buffer_t &b,
431 BufferDeviceOwnership ownership) {
432 memcpy(&buf, &b, sizeof(halide_buffer_t));
433 copy_shape_from(b);
434 if (b.device) {
435 dev_ref_count = new DeviceRefCount;
436 dev_ref_count->ownership = ownership;
437 }
438 }
439
440 /** Initialize the shape from an array of ints */
441 void initialize_shape(const int *sizes) {
442 for (int i = 0; i < buf.dimensions; i++) {
443 buf.dim[i].min = 0;
444 buf.dim[i].extent = sizes[i];
445 if (i == 0) {
446 buf.dim[i].stride = 1;
447 } else {
448 buf.dim[i].stride = buf.dim[i - 1].stride * buf.dim[i - 1].extent;
449 }
450 }
451 }
452
453 /** Initialize the shape from a vector of extents */
454 void initialize_shape(const std::vector<int> &sizes) {
455 assert(buf.dimensions == (int)sizes.size());
456 initialize_shape(sizes.data());
457 }
458
459 /** Initialize the shape from the static shape of an array */
460 template<typename Array, size_t N>
461 void initialize_shape_from_array_shape(int next, Array (&vals)[N]) {
462 buf.dim[next].min = 0;
463 buf.dim[next].extent = (int)N;
464 if (next == 0) {
465 buf.dim[next].stride = 1;
466 } else {
467 initialize_shape_from_array_shape(next - 1, vals[0]);
468 buf.dim[next].stride = buf.dim[next - 1].stride * buf.dim[next - 1].extent;
469 }
470 }
471
472 /** Base case for the template recursion above. */
473 template<typename T2>
474 void initialize_shape_from_array_shape(int, const T2 &) {
475 }
476
477 /** Get the dimensionality of a multi-dimensional C array */
478 template<typename Array, size_t N>
479 static int dimensionality_of_array(Array (&vals)[N]) {
480 return dimensionality_of_array(vals[0]) + 1;
481 }
482
483 template<typename T2>
484 static int dimensionality_of_array(const T2 &) {
485 return 0;
486 }
487
488 /** Get the underlying halide_type_t of an array's element type. */
489 template<typename Array, size_t N>
490 static halide_type_t scalar_type_of_array(Array (&vals)[N]) {
491 return scalar_type_of_array(vals[0]);
492 }
493
494 template<typename T2>
495 static halide_type_t scalar_type_of_array(const T2 &) {
496 return halide_type_of<typename std::remove_cv<T2>::type>();
497 }
498
499 /** Crop a single dimension without handling device allocation. */
500 void crop_host(int d, int min, int extent) {
501 assert(dim(d).min() <= min);
502 assert(dim(d).max() >= min + extent - 1);
503 ptrdiff_t shift = min - dim(d).min();
504 if (buf.host != nullptr) {
505 buf.host += (shift * dim(d).stride()) * type().bytes();
506 }
507 buf.dim[d].min = min;
508 buf.dim[d].extent = extent;
509 }
510
511 /** Crop as many dimensions as are in rect, without handling device allocation. */
512 void crop_host(const std::vector<std::pair<int, int>> &rect) {
513 assert(rect.size() <= static_cast<decltype(rect.size())>(std::numeric_limits<int>::max()));
514 int limit = (int)rect.size();
515 assert(limit <= dimensions());
516 for (int i = 0; i < limit; i++) {
517 crop_host(i, rect[i].first, rect[i].second);
518 }
519 }
520
521 void complete_device_crop(Buffer<T, Dims, InClassDimStorage> &result_host_cropped) const {
522 assert(buf.device_interface != nullptr);
523 if (buf.device_interface->device_crop(nullptr, &this->buf, &result_host_cropped.buf) == halide_error_code_success) {
524 // TODO: Figure out what to do if dev_ref_count is nullptr. Should incref logic run here?
525 // is it possible to get to this point without incref having run at least once since
526 // the device field was set? (I.e. in the internal logic of crop. incref might have been
527 // called.)
528 if (dev_ref_count != nullptr && dev_ref_count->ownership == BufferDeviceOwnership::Cropped) {
529 result_host_cropped.crop_from(((DevRefCountCropped *)dev_ref_count)->cropped_from);
530 } else {
531 result_host_cropped.crop_from(*this);
532 }
533 }
534 }
535
536 /** slice a single dimension without handling device allocation. */
537 void slice_host(int d, int pos) {
538 static_assert(Dims == AnyDims);
539 assert(dimensions() > 0);
540 assert(d >= 0 && d < dimensions());
541 assert(pos >= dim(d).min() && pos <= dim(d).max());
542 buf.dimensions--;
543 ptrdiff_t shift = pos - buf.dim[d].min;
544 if (buf.host != nullptr) {
545 buf.host += (shift * buf.dim[d].stride) * type().bytes();
546 }
547 for (int i = d; i < buf.dimensions; i++) {
548 buf.dim[i] = buf.dim[i + 1];
549 }
550 buf.dim[buf.dimensions] = {0, 0, 0};
551 }
552
553 void complete_device_slice(Buffer<T, AnyDims, InClassDimStorage> &result_host_sliced, int d, int pos) const {
554 assert(buf.device_interface != nullptr);
555 if (buf.device_interface->device_slice(nullptr, &this->buf, d, pos, &result_host_sliced.buf) == halide_error_code_success) {
556 // TODO: Figure out what to do if dev_ref_count is nullptr. Should incref logic run here?
557 // is it possible to get to this point without incref having run at least once since
558 // the device field was set? (I.e. in the internal logic of slice. incref might have been
559 // called.)
560 if (dev_ref_count != nullptr && dev_ref_count->ownership == BufferDeviceOwnership::Cropped) {
561 // crop_from() is correct here, despite the fact that we are slicing.
562 result_host_sliced.crop_from(((DevRefCountCropped *)dev_ref_count)->cropped_from);
563 } else {
564 // crop_from() is correct here, despite the fact that we are slicing.
565 result_host_sliced.crop_from(*this);
566 }
567 }
568 }
569
570public:
571 typedef T ElemType;
572
573 /** Read-only access to the shape */
574 class Dimension {
575 const halide_dimension_t &d;
576
577 public:
578 /** The lowest coordinate in this dimension */
580 return d.min;
581 }
582
583 /** The number of elements in memory you have to step over to
584 * increment this coordinate by one. */
586 return d.stride;
587 }
588
589 /** The extent of the image along this dimension */
591 return d.extent;
592 }
593
594 /** The highest coordinate in this dimension */
596 return min() + extent() - 1;
597 }
598
599 /** An iterator class, so that you can iterate over
600 * coordinates in a dimensions using a range-based for loop. */
601 struct iterator {
602 int val;
603 int operator*() const {
604 return val;
605 }
606 bool operator!=(const iterator &other) const {
607 return val != other.val;
608 }
610 val++;
611 return *this;
612 }
613 };
614
615 /** An iterator that points to the min coordinate */
617 return {min()};
618 }
619
620 /** An iterator that points to one past the max coordinate */
622 return {min() + extent()};
623 }
624
626 : d(dim) {
627 }
628 };
629
630 /** Access the shape of the buffer */
632 assert(i >= 0 && i < this->dimensions());
633 return Dimension(buf.dim[i]);
634 }
635
636 /** Access to the mins, strides, extents. Will be deprecated. Do not use. */
637 // @{
638 int min(int i) const {
639 return dim(i).min();
640 }
641 int extent(int i) const {
642 return dim(i).extent();
643 }
644 int stride(int i) const {
645 return dim(i).stride();
646 }
647 // @}
648
649 /** The total number of elements this buffer represents. Equal to
650 * the product of the extents */
651 size_t number_of_elements() const {
652 return buf.number_of_elements();
653 }
654
655 /** Get the dimensionality of the buffer. */
656 int dimensions() const {
657 if constexpr (has_static_dimensions) {
658 return Dims;
659 } else {
660 return buf.dimensions;
661 }
662 }
663
664 /** Get the type of the elements. */
666 return buf.type;
667 }
668
669 /** A pointer to the element with the lowest address. If all
670 * strides are positive, equal to the host pointer. */
671 T *begin() const {
672 assert(buf.host != nullptr); // Cannot call begin() on an unallocated Buffer.
673 return (T *)buf.begin();
674 }
675
676 /** A pointer to one beyond the element with the highest address. */
677 T *end() const {
678 assert(buf.host != nullptr); // Cannot call end() on an unallocated Buffer.
679 return (T *)buf.end();
680 }
681
682 /** The total number of bytes spanned by the data in memory. */
683 size_t size_in_bytes() const {
684 return buf.size_in_bytes();
685 }
686
687 /** Reset the Buffer to be equivalent to a default-constructed Buffer
688 * of the same static type (if any); Buffer<void> will have its runtime
689 * type reset to uint8. */
690 void reset() {
691 *this = Buffer();
692 }
693
695 : shape() {
696 buf.type = static_halide_type();
697 // If Dims are statically known, must create storage that many.
698 // otherwise, make a zero-dimensional buffer.
699 constexpr int buf_dimensions = (Dims == AnyDims) ? 0 : Dims;
700 make_static_shape_storage<buf_dimensions>();
701 }
702
703 /** Make a Buffer from a halide_buffer_t */
704 explicit Buffer(const halide_buffer_t &buf,
706 assert(T_is_void || buf.type == static_halide_type());
707 initialize_from_buffer(buf, ownership);
708 }
709
710 /** Give Buffers access to the members of Buffers of different dimensionalities and types. */
711 template<typename T2, int D2, int S2>
712 friend class Buffer;
713
714private:
715 template<typename T2, int D2, int S2>
716 static void static_assert_can_convert_from() {
717 static_assert((!std::is_const<T2>::value || std::is_const<T>::value),
718 "Can't convert from a Buffer<const T> to a Buffer<T>");
719 static_assert(std::is_same<typename std::remove_const<T>::type,
720 typename std::remove_const<T2>::type>::value ||
722 "type mismatch constructing Buffer");
723 static_assert(Dims == AnyDims || D2 == AnyDims || Dims == D2,
724 "Can't convert from a Buffer with static dimensionality to a Buffer with different static dimensionality");
725 }
726
727public:
728 static void set_default_allocate_fn(void *(*allocate_fn)(size_t)) {
730 }
731 static void set_default_deallocate_fn(void (*deallocate_fn)(void *)) {
733 }
734
735 /** Determine if a Buffer<T, Dims, InClassDimStorage> can be constructed from some other Buffer type.
736 * If this can be determined at compile time, fail with a static assert; otherwise
737 * return a boolean based on runtime typing. */
738 template<typename T2, int D2, int S2>
739 static bool can_convert_from(const Buffer<T2, D2, S2> &other) {
740 static_assert_can_convert_from<T2, D2, S2>();
741 if (Buffer<T2, D2, S2>::T_is_void && !T_is_void) {
742 if (other.type() != static_halide_type()) {
743 return false;
744 }
745 }
746 if (Dims != AnyDims) {
747 if (other.dimensions() != Dims) {
748 return false;
749 }
750 }
751 return true;
752 }
753
754 /** Fail an assertion at runtime or compile-time if an Buffer<T, Dims, InClassDimStorage>
755 * cannot be constructed from some other Buffer type. */
756 template<typename T2, int D2, int S2>
757 static void assert_can_convert_from(const Buffer<T2, D2, S2> &other) {
758 // Explicitly call static_assert_can_convert_from() here so
759 // that we always get compile-time checking, even if compiling with
760 // assertions disabled.
761 static_assert_can_convert_from<T2, D2, S2>();
762 assert(can_convert_from(other));
763 }
764
765 /** Copy constructor. Does not copy underlying data. */
767 : buf(other.buf),
768 alloc(other.alloc) {
769 other.incref();
770 dev_ref_count = other.dev_ref_count;
771 copy_shape_from(other.buf);
772 }
773
774 /** Construct a Buffer from a Buffer of different dimensionality
775 * and type. Asserts that the type and dimensionality matches (at runtime,
776 * if one of the types is void). Note that this constructor is
777 * implicit. This, for example, lets you pass things like
778 * Buffer<T> or Buffer<const void> to functions expected
779 * Buffer<const T>. */
780 template<typename T2, int D2, int S2>
782 : buf(other.buf),
783 alloc(other.alloc) {
785 other.incref();
786 dev_ref_count = other.dev_ref_count;
787 copy_shape_from(other.buf);
788 }
789
790 /** Move constructor */
792 : buf(other.buf),
793 alloc(other.alloc),
794 dev_ref_count(other.dev_ref_count) {
795 other.dev_ref_count = nullptr;
796 other.alloc = nullptr;
797 move_shape_from(std::forward<Buffer<T, Dims, InClassDimStorage>>(other));
798 }
799
800 /** Move-construct a Buffer from a Buffer of different
801 * dimensionality and type. Asserts that the types match (at
802 * runtime if one of the types is void). */
803 template<typename T2, int D2, int S2>
805 : buf(other.buf),
806 alloc(other.alloc),
807 dev_ref_count(other.dev_ref_count) {
809 other.dev_ref_count = nullptr;
810 other.alloc = nullptr;
811 move_shape_from(std::forward<Buffer<T2, D2, S2>>(other));
812 }
813
814 /** Assign from another Buffer of possibly-different
815 * dimensionality and type. Asserts that the types match (at
816 * runtime if one of the types is void). */
817 template<typename T2, int D2, int S2>
819 if ((const void *)this == (const void *)&other) {
820 return *this;
821 }
823 other.incref();
824 decref();
825 dev_ref_count = other.dev_ref_count;
826 alloc = other.alloc;
827 free_shape_storage();
828 buf = other.buf;
829 copy_shape_from(other.buf);
830 return *this;
831 }
832
833 /** Standard assignment operator */
835 // The cast to void* here is just to satisfy clang-tidy
836 if ((const void *)this == (const void *)&other) {
837 return *this;
838 }
839 other.incref();
840 decref();
841 dev_ref_count = other.dev_ref_count;
842 alloc = other.alloc;
843 free_shape_storage();
844 buf = other.buf;
845 copy_shape_from(other.buf);
846 return *this;
847 }
848
849 /** Move from another Buffer of possibly-different
850 * dimensionality and type. Asserts that the types match (at
851 * runtime if one of the types is void). */
852 template<typename T2, int D2, int S2>
855 decref();
856 alloc = other.alloc;
857 other.alloc = nullptr;
858 dev_ref_count = other.dev_ref_count;
859 other.dev_ref_count = nullptr;
860 free_shape_storage();
861 buf = other.buf;
862 move_shape_from(std::forward<Buffer<T2, D2, S2>>(other));
863 return *this;
864 }
865
866 /** Standard move-assignment operator */
868 decref();
869 alloc = other.alloc;
870 other.alloc = nullptr;
871 dev_ref_count = other.dev_ref_count;
872 other.dev_ref_count = nullptr;
873 free_shape_storage();
874 buf = other.buf;
875 move_shape_from(std::forward<Buffer<T, Dims, InClassDimStorage>>(other));
876 return *this;
877 }
878
879 /** Check the product of the extents fits in memory. */
881 size_t size = type().bytes();
882 for (int i = 0; i < dimensions(); i++) {
883 size *= dim(i).extent();
884 }
885 // We allow 2^31 or 2^63 bytes, so drop the top bit.
886 size = (size << 1) >> 1;
887 for (int i = 0; i < dimensions(); i++) {
888 size /= dim(i).extent();
889 }
890 assert(size == (size_t)type().bytes() && "Error: Overflow computing total size of buffer.");
891 }
892
893 /** Allocate memory for this Buffer. Drops the reference to any
894 * owned memory. */
895 void allocate(void *(*allocate_fn)(size_t) = nullptr,
896 void (*deallocate_fn)(void *) = nullptr) {
897 // Drop any existing allocation
898 deallocate();
899
900 // Conservatively align images to (usually) 128 bytes. This is enough
901 // alignment for all the platforms we might use. Also ensure that the allocation
902 // is such that the logical size is an integral multiple of 128 bytes (or a bit more).
903 constexpr size_t alignment = HALIDE_RUNTIME_BUFFER_ALLOCATION_ALIGNMENT;
904
905 const auto align_up = [=](size_t value) -> size_t {
906 return (value + alignment - 1) & ~(alignment - 1);
907 };
908
909 size_t size = size_in_bytes();
910
911#if HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC
912 // Only use aligned_alloc() if no custom allocators are specified.
914 // As a practical matter, sizeof(AllocationHeader) is going to be no more than 16 bytes
915 // on any supported platform, so we will just overallocate by 'alignment'
916 // so that the user storage also starts at an aligned point. This is a bit
917 // wasteful, but probably not a big deal.
918 static_assert(sizeof(AllocationHeader) <= alignment);
919 void *alloc_storage = ::aligned_alloc(alignment, align_up(size) + alignment);
920 assert((uintptr_t)alloc_storage == align_up((uintptr_t)alloc_storage));
921 alloc = new (alloc_storage) AllocationHeader(free);
922 buf.host = (uint8_t *)((uintptr_t)alloc_storage + alignment);
923 return;
924 }
925 // else fall thru
926#endif
927 if (!allocate_fn) {
929 if (!allocate_fn) {
930 allocate_fn = malloc;
931 }
932 }
933 if (!deallocate_fn) {
935 if (!deallocate_fn) {
936 deallocate_fn = free;
937 }
938 }
939
940 static_assert(sizeof(AllocationHeader) <= alignment);
941
942 // malloc() and friends must return a pointer aligned to at least alignof(std::max_align_t);
943 // make sure this is OK for AllocationHeader, since it always goes at the start
944 static_assert(alignof(AllocationHeader) <= alignof(std::max_align_t));
945
946 const size_t requested_size = align_up(size + alignment +
947 std::max(0, (int)sizeof(AllocationHeader) -
948 (int)sizeof(std::max_align_t)));
949 void *alloc_storage = allocate_fn(requested_size);
950 alloc = new (alloc_storage) AllocationHeader(deallocate_fn);
951 uint8_t *unaligned_ptr = ((uint8_t *)alloc) + sizeof(AllocationHeader);
952 buf.host = (uint8_t *)align_up((uintptr_t)unaligned_ptr);
953 }
954
955 /** Drop reference to any owned host or device memory, possibly
956 * freeing it, if this buffer held the last reference to
957 * it. Retains the shape of the buffer. Does nothing if this
958 * buffer did not allocate its own memory. */
959 void deallocate() {
960 decref();
961 }
962
963 /** Drop reference to any owned device memory, possibly freeing it
964 * if this buffer held the last reference to it. Asserts that
965 * device_dirty is false. */
967 decref(true);
968 }
969
970 /** Allocate a new image of the given size with a runtime
971 * type. Only used when you do know what size you want but you
972 * don't know statically what type the elements are. Pass zeroes
973 * to make a buffer suitable for bounds query calls. */
974 template<typename... Args,
975 typename = typename std::enable_if<AllInts<Args...>::value>::type>
976 Buffer(halide_type_t t, int first, Args... rest) {
977 if (!T_is_void) {
978 assert(static_halide_type() == t);
979 }
980 int extents[] = {first, (int)rest...};
981 buf.type = t;
982 constexpr int buf_dimensions = 1 + (int)(sizeof...(rest));
983 make_static_shape_storage<buf_dimensions>();
984 initialize_shape(extents);
985 if (!Internal::any_zero(extents)) {
987 allocate();
988 }
989 }
990
991 /** Allocate a new image of the given size. Pass zeroes to make a
992 * buffer suitable for bounds query calls. */
993 // @{
994
995 // The overload with one argument is 'explicit', so that
996 // (say) int is not implicitly convertible to Buffer<int>
997 explicit Buffer(int first) {
998 static_assert(!T_is_void,
999 "To construct an Buffer<void>, pass a halide_type_t as the first argument to the constructor");
1000 int extents[] = {first};
1001 buf.type = static_halide_type();
1002 constexpr int buf_dimensions = 1;
1003 make_static_shape_storage<buf_dimensions>();
1004 initialize_shape(extents);
1005 if (first != 0) {
1007 allocate();
1008 }
1009 }
1010
1011 template<typename... Args,
1012 typename = typename std::enable_if<AllInts<Args...>::value>::type>
1013 Buffer(int first, int second, Args... rest) {
1014 static_assert(!T_is_void,
1015 "To construct an Buffer<void>, pass a halide_type_t as the first argument to the constructor");
1016 int extents[] = {first, second, (int)rest...};
1017 buf.type = static_halide_type();
1018 constexpr int buf_dimensions = 2 + (int)(sizeof...(rest));
1019 make_static_shape_storage<buf_dimensions>();
1020 initialize_shape(extents);
1021 if (!Internal::any_zero(extents)) {
1023 allocate();
1024 }
1025 }
1026 // @}
1027
1028 /** Allocate a new image of unknown type using a vector of ints as the size. */
1029 Buffer(halide_type_t t, const std::vector<int> &sizes) {
1030 if (!T_is_void) {
1031 assert(static_halide_type() == t);
1032 }
1033 buf.type = t;
1034 // make_shape_storage() will do a runtime check that dimensionality matches.
1035 make_shape_storage((int)sizes.size());
1036 initialize_shape(sizes);
1037 if (!Internal::any_zero(sizes)) {
1039 allocate();
1040 }
1041 }
1042
1043 /** Allocate a new image of known type using a vector of ints as the size. */
1044 explicit Buffer(const std::vector<int> &sizes)
1045 : Buffer(static_halide_type(), sizes) {
1046 }
1047
1048private:
1049 // Create a copy of the sizes vector, ordered as specified by order.
1050 static std::vector<int> make_ordered_sizes(const std::vector<int> &sizes, const std::vector<int> &order) {
1051 assert(order.size() == sizes.size());
1052 std::vector<int> ordered_sizes(sizes.size());
1053 for (size_t i = 0; i < sizes.size(); ++i) {
1054 ordered_sizes[i] = sizes.at(order[i]);
1055 }
1056 return ordered_sizes;
1057 }
1058
1059public:
1060 /** Allocate a new image of unknown type using a vector of ints as the size and
1061 * a vector of indices indicating the storage order for each dimension. The
1062 * length of the sizes vector and the storage-order vector must match. For instance,
1063 * to allocate an interleaved RGB buffer, you would pass {2, 0, 1} for storage_order. */
1064 Buffer(halide_type_t t, const std::vector<int> &sizes, const std::vector<int> &storage_order)
1065 : Buffer(t, make_ordered_sizes(sizes, storage_order)) {
1066 transpose(storage_order);
1067 }
1068
1069 Buffer(const std::vector<int> &sizes, const std::vector<int> &storage_order)
1070 : Buffer(static_halide_type(), sizes, storage_order) {
1071 }
1072
1073 /** Make an Buffer that refers to a statically sized array. Does not
1074 * take ownership of the data, and does not set the host_dirty flag. */
1075 template<typename Array, size_t N>
1076 explicit Buffer(Array (&vals)[N]) {
1077 const int buf_dimensions = dimensionality_of_array(vals);
1078 buf.type = scalar_type_of_array(vals);
1079 buf.host = (uint8_t *)vals;
1080 make_shape_storage(buf_dimensions);
1081 initialize_shape_from_array_shape(buf.dimensions - 1, vals);
1082 }
1083
1084 /** Initialize an Buffer of runtime type from a pointer and some
1085 * sizes. Assumes dense row-major packing and a min coordinate of
1086 * zero. Does not take ownership of the data and does not set the
1087 * host_dirty flag. */
1088 template<typename... Args,
1089 typename = typename std::enable_if<AllInts<Args...>::value>::type>
1090 explicit Buffer(halide_type_t t, add_const_if_T_is_const<void> *data, int first, Args &&...rest) {
1091 if (!T_is_void) {
1092 assert(static_halide_type() == t);
1093 }
1094 int extents[] = {first, (int)rest...};
1095 buf.type = t;
1096 buf.host = (uint8_t *)const_cast<void *>(data);
1097 constexpr int buf_dimensions = 1 + (int)(sizeof...(rest));
1098 make_static_shape_storage<buf_dimensions>();
1099 initialize_shape(extents);
1100 }
1101
1102 /** Initialize an Buffer from a pointer and some sizes. Assumes
1103 * dense row-major packing and a min coordinate of zero. Does not
1104 * take ownership of the data and does not set the host_dirty flag. */
1105 template<typename... Args,
1106 typename = typename std::enable_if<AllInts<Args...>::value>::type>
1107 explicit Buffer(T *data, int first, Args &&...rest) {
1108 int extents[] = {first, (int)rest...};
1109 buf.type = static_halide_type();
1110 buf.host = (uint8_t *)const_cast<typename std::remove_const<T>::type *>(data);
1111 constexpr int buf_dimensions = 1 + (int)(sizeof...(rest));
1112 make_static_shape_storage<buf_dimensions>();
1113 initialize_shape(extents);
1114 }
1115
1116 /** Initialize an Buffer from a pointer and a vector of
1117 * sizes. Assumes dense row-major packing and a min coordinate of
1118 * zero. Does not take ownership of the data and does not set the
1119 * host_dirty flag. */
1120 explicit Buffer(T *data, const std::vector<int> &sizes) {
1121 buf.type = static_halide_type();
1122 buf.host = (uint8_t *)const_cast<typename std::remove_const<T>::type *>(data);
1123 make_shape_storage((int)sizes.size());
1124 initialize_shape(sizes);
1125 }
1126
1127 /** Initialize an Buffer of runtime type from a pointer and a
1128 * vector of sizes. Assumes dense row-major packing and a min
1129 * coordinate of zero. Does not take ownership of the data and
1130 * does not set the host_dirty flag. */
1131 explicit Buffer(halide_type_t t, add_const_if_T_is_const<void> *data, const std::vector<int> &sizes) {
1132 if (!T_is_void) {
1133 assert(static_halide_type() == t);
1134 }
1135 buf.type = t;
1136 buf.host = (uint8_t *)const_cast<void *>(data);
1137 make_shape_storage((int)sizes.size());
1138 initialize_shape(sizes);
1139 }
1140
1141 /** Initialize an Buffer from a pointer to the min coordinate and
1142 * an array describing the shape. Does not take ownership of the
1143 * data, and does not set the host_dirty flag. */
1144 explicit Buffer(halide_type_t t, add_const_if_T_is_const<void> *data, int d, const halide_dimension_t *shape) {
1145 if (!T_is_void) {
1146 assert(static_halide_type() == t);
1147 }
1148 buf.type = t;
1149 buf.host = (uint8_t *)const_cast<void *>(data);
1150 make_shape_storage(d);
1151 for (int i = 0; i < d; i++) {
1152 buf.dim[i] = shape[i];
1153 }
1154 }
1155
1156 /** Initialize a Buffer from a pointer to the min coordinate and
1157 * a vector describing the shape. Does not take ownership of the
1158 * data, and does not set the host_dirty flag. */
1159 explicit Buffer(halide_type_t t, add_const_if_T_is_const<void> *data,
1160 const std::vector<halide_dimension_t> &shape)
1161 : Buffer(t, data, (int)shape.size(), shape.data()) {
1162 }
1163
1164 /** Initialize an Buffer from a pointer to the min coordinate and
1165 * an array describing the shape. Does not take ownership of the
1166 * data and does not set the host_dirty flag. */
1167 explicit Buffer(T *data, int d, const halide_dimension_t *shape) {
1168 buf.type = static_halide_type();
1169 buf.host = (uint8_t *)const_cast<typename std::remove_const<T>::type *>(data);
1170 make_shape_storage(d);
1171 for (int i = 0; i < d; i++) {
1172 buf.dim[i] = shape[i];
1173 }
1174 }
1175
1176 /** Initialize a Buffer from a pointer to the min coordinate and
1177 * a vector describing the shape. Does not take ownership of the
1178 * data, and does not set the host_dirty flag. */
1179 explicit Buffer(T *data, const std::vector<halide_dimension_t> &shape)
1180 : Buffer(data, (int)shape.size(), shape.data()) {
1181 }
1182
1183 /** Destructor. Will release any underlying owned allocation if
1184 * this is the last reference to it. Will assert fail if there are
1185 * weak references to this Buffer outstanding. */
1187 decref();
1188 free_shape_storage();
1189 }
1190
1191 /** Get a pointer to the raw halide_buffer_t this wraps. */
1192 // @{
1194 return &buf;
1195 }
1196
1198 return &buf;
1199 }
1200 // @}
1201
1202 /** Provide a cast operator to halide_buffer_t *, so that
1203 * instances can be passed directly to Halide filters. */
1204 operator halide_buffer_t *() {
1205 return &buf;
1206 }
1207
1208 /** Return a typed reference to this Buffer. Useful for converting
1209 * a reference to a Buffer<void> to a reference to, for example, a
1210 * Buffer<const uint8_t>, or converting a Buffer<T>& to Buffer<const T>&.
1211 * You can also optionally sspecify a new value for Dims; this is useful
1212 * mainly for removing the dimensionality constraint on a Buffer with
1213 * explicit dimensionality. Does a runtime assert if the source buffer type
1214 * is void or the new dimensionality is incompatible. */
1215 template<typename T2, int D2 = Dims>
1220
1221 /** Return a const typed reference to this Buffer. Useful for converting
1222 * a reference to a Buffer<void> to a reference to, for example, a
1223 * Buffer<const uint8_t>, or converting a Buffer<T>& to Buffer<const T>&.
1224 * You can also optionally sspecify a new value for Dims; this is useful
1225 * mainly for removing the dimensionality constraint on a Buffer with
1226 * explicit dimensionality. Does a runtime assert if the source buffer type
1227 * is void or the new dimensionality is incompatible. */
1228 template<typename T2, int D2 = Dims>
1233
1234 /** Return an rval reference to this Buffer. Useful for converting
1235 * a reference to a Buffer<void> to a reference to, for example, a
1236 * Buffer<const uint8_t>, or converting a Buffer<T>& to Buffer<const T>&.
1237 * You can also optionally sspecify a new value for Dims; this is useful
1238 * mainly for removing the dimensionality constraint on a Buffer with
1239 * explicit dimensionality. Does a runtime assert if the source buffer type
1240 * is void or the new dimensionality is incompatible. */
1241 template<typename T2, int D2 = Dims>
1246
1247 /** as_const() is syntactic sugar for .as<const T>(), to avoid the need
1248 * to recapitulate the type argument. */
1249 // @{
1251 Buffer<typename std::add_const<T>::type, Dims, InClassDimStorage> &as_const() & {
1252 // Note that we can skip the assert_can_convert_from(), since T -> const T
1253 // conversion is always legal.
1254 return *((Buffer<typename std::add_const<T>::type, Dims, InClassDimStorage> *)this);
1255 }
1256
1258 const Buffer<typename std::add_const<T>::type, Dims, InClassDimStorage> &as_const() const & {
1259 return *((const Buffer<typename std::add_const<T>::type, Dims, InClassDimStorage> *)this);
1260 }
1261
1263 Buffer<typename std::add_const<T>::type, Dims, InClassDimStorage> as_const() && {
1264 return *((Buffer<typename std::add_const<T>::type, Dims, InClassDimStorage> *)this);
1265 }
1266 // @}
1267
1268 /** Add some syntactic sugar to allow autoconversion from Buffer<T> to Buffer<const T>& when
1269 * passing arguments */
1270 template<typename T2 = T, typename = typename std::enable_if<!std::is_const<T2>::value>::type>
1271 operator Buffer<typename std::add_const<T2>::type, Dims, InClassDimStorage> &() & {
1272 return as_const();
1273 }
1274
1275 /** Add some syntactic sugar to allow autoconversion from Buffer<T> to Buffer<void>& when
1276 * passing arguments */
1277 template<typename TVoid,
1278 typename T2 = T,
1279 typename = typename std::enable_if<std::is_same<TVoid, void>::value &&
1280 !std::is_void<T2>::value &&
1281 !std::is_const<T2>::value>::type>
1283 return as<TVoid, Dims>();
1284 }
1285
1286 /** Add some syntactic sugar to allow autoconversion from Buffer<const T> to Buffer<const void>& when
1287 * passing arguments */
1288 template<typename TVoid,
1289 typename T2 = T,
1290 typename = typename std::enable_if<std::is_same<TVoid, void>::value &&
1291 !std::is_void<T2>::value &&
1292 std::is_const<T2>::value>::type>
1296
1297 /** Conventional names for the first three dimensions. */
1298 // @{
1299 int width() const {
1300 return (dimensions() > 0) ? dim(0).extent() : 1;
1301 }
1302 int height() const {
1303 return (dimensions() > 1) ? dim(1).extent() : 1;
1304 }
1305 int channels() const {
1306 return (dimensions() > 2) ? dim(2).extent() : 1;
1307 }
1308 // @}
1309
1310 /** Conventional names for the min and max value of each dimension */
1311 // @{
1312 int left() const {
1313 return dim(0).min();
1314 }
1315
1316 int right() const {
1317 return dim(0).max();
1318 }
1319
1320 int top() const {
1321 return dim(1).min();
1322 }
1323
1324 int bottom() const {
1325 return dim(1).max();
1326 }
1327 // @}
1328
1329 /** Make a new image which is a deep copy of this image. Use crop
1330 * or slice followed by copy to make a copy of only a portion of
1331 * the image. The new image has the same nesting order of dimensions
1332 * (e.g. channels innermost), but resets the strides to the default
1333 * (each stride is the product of the extents of the inner dimensions).
1334 * Note that this means any strides of zero get broadcast into a non-zero stride.
1335 *
1336 * Note that the returned Buffer is always of a non-const type T (ie:
1337 *
1338 * Buffer<const T>.copy() -> Buffer<T> rather than Buffer<const T>
1339 *
1340 * which is always safe, since we are making a deep copy. (The caller
1341 * can easily cast it back to Buffer<const T> if desired, which is
1342 * always safe and free.)
1343 */
1344 Buffer<not_const_T, Dims, InClassDimStorage> copy(void *(*allocate_fn)(size_t) = nullptr,
1345 void (*deallocate_fn)(void *) = nullptr) const {
1347 dst.copy_from(*this);
1348 return dst;
1349 }
1350
1351 /** Like copy(), but the copy is created in interleaved memory layout
1352 * (vs. keeping the same memory layout as the original). Requires that 'this'
1353 * has exactly 3 dimensions.
1354 */
1356 void (*deallocate_fn)(void *) = nullptr) const {
1357 static_assert(Dims == AnyDims || Dims == 3);
1358 assert(dimensions() == 3);
1360 dst.set_min(min(0), min(1), min(2));
1361 dst.allocate(allocate_fn, deallocate_fn);
1362 dst.copy_from(*this);
1363 return dst;
1364 }
1365
1366 /** Like copy(), but the copy is created in planar memory layout
1367 * (vs. keeping the same memory layout as the original).
1368 */
1369 Buffer<not_const_T, Dims, InClassDimStorage> copy_to_planar(void *(*allocate_fn)(size_t) = nullptr,
1370 void (*deallocate_fn)(void *) = nullptr) const {
1371 std::vector<int> mins, extents;
1372 const int dims = dimensions();
1373 mins.reserve(dims);
1374 extents.reserve(dims);
1375 for (int d = 0; d < dims; ++d) {
1376 mins.push_back(dim(d).min());
1377 extents.push_back(dim(d).extent());
1378 }
1380 dst.set_min(mins);
1381 dst.allocate(allocate_fn, deallocate_fn);
1382 dst.copy_from(*this);
1383 return dst;
1384 }
1385
1386 /** Make a copy of the Buffer which shares the underlying host and/or device
1387 * allocations as the existing Buffer. This is purely syntactic sugar for
1388 * cases where you have a const reference to a Buffer but need a temporary
1389 * non-const copy (e.g. to make a call into AOT-generated Halide code), and want a terse
1390 * inline way to create a temporary. \code
1391 * void call_my_func(const Buffer<const uint8_t>& input) {
1392 * my_func(input.alias(), output);
1393 * }\endcode
1394 */
1396 return *this;
1397 }
1398
1399 /** Fill a Buffer with the values at the same coordinates in
1400 * another Buffer. Restricts itself to coordinates contained
1401 * within the intersection of the two buffers. If the two Buffers
1402 * are not in the same coordinate system, you will need to
1403 * translate the argument Buffer first. E.g. if you're blitting a
1404 * sprite onto a framebuffer, you'll want to translate the sprite
1405 * to the correct location first like so: \code
1406 * framebuffer.copy_from(sprite.translated({x, y})); \endcode
1407 */
1408 template<typename T2, int D2, int S2>
1410 static_assert(!std::is_const<T>::value, "Cannot call copy_from() on a Buffer<const T>");
1411 assert(!device_dirty() && "Cannot call Halide::Runtime::Buffer::copy_from on a device dirty destination.");
1412 assert(!src.device_dirty() && "Cannot call Halide::Runtime::Buffer::copy_from on a device dirty source.");
1413
1415
1416 static_assert(Dims == AnyDims || D2 == AnyDims || Dims == D2);
1417 assert(src.dimensions() == dst.dimensions());
1418
1419 // Trim the copy to the region in common
1420 const int d = dimensions();
1421 for (int i = 0; i < d; i++) {
1422 int min_coord = std::max(dst.dim(i).min(), src.dim(i).min());
1423 int max_coord = std::min(dst.dim(i).max(), src.dim(i).max());
1424 if (max_coord < min_coord) {
1425 // The buffers do not overlap.
1426 return;
1427 }
1428 dst.crop(i, min_coord, max_coord - min_coord + 1);
1429 src.crop(i, min_coord, max_coord - min_coord + 1);
1430 }
1431
1432 // If T is void, we need to do runtime dispatch to an
1433 // appropriately-typed lambda. We're copying, so we only care
1434 // about the element size. (If not, this should optimize away
1435 // into a static dispatch to the right-sized copy.)
1436 if (T_is_void ? (type().bytes() == 1) : (sizeof(not_void_T) == 1)) {
1437 using MemType = uint8_t;
1438 auto &typed_dst = reinterpret_cast<Buffer<MemType, Dims, InClassDimStorage> &>(dst);
1439 auto &typed_src = reinterpret_cast<Buffer<const MemType, D2, S2> &>(src);
1440 typed_dst.for_each_value([&](MemType &dst, MemType src) { dst = src; }, typed_src);
1441 } else if (T_is_void ? (type().bytes() == 2) : (sizeof(not_void_T) == 2)) {
1442 using MemType = uint16_t;
1443 auto &typed_dst = reinterpret_cast<Buffer<MemType, Dims, InClassDimStorage> &>(dst);
1444 auto &typed_src = reinterpret_cast<Buffer<const MemType, D2, S2> &>(src);
1445 typed_dst.for_each_value([&](MemType &dst, MemType src) { dst = src; }, typed_src);
1446 } else if (T_is_void ? (type().bytes() == 4) : (sizeof(not_void_T) == 4)) {
1447 using MemType = uint32_t;
1448 auto &typed_dst = reinterpret_cast<Buffer<MemType, Dims, InClassDimStorage> &>(dst);
1449 auto &typed_src = reinterpret_cast<Buffer<const MemType, D2, S2> &>(src);
1450 typed_dst.for_each_value([&](MemType &dst, MemType src) { dst = src; }, typed_src);
1451 } else if (T_is_void ? (type().bytes() == 8) : (sizeof(not_void_T) == 8)) {
1452 using MemType = uint64_t;
1453 auto &typed_dst = reinterpret_cast<Buffer<MemType, Dims, InClassDimStorage> &>(dst);
1454 auto &typed_src = reinterpret_cast<Buffer<const MemType, D2, S2> &>(src);
1455 typed_dst.for_each_value([&](MemType &dst, MemType src) { dst = src; }, typed_src);
1456 } else {
1457 assert(false && "type().bytes() must be 1, 2, 4, or 8");
1458 }
1460 }
1461
1462 /** Make an image that refers to a sub-range of this image along
1463 * the given dimension. Asserts that the crop region is within
1464 * the existing bounds: you cannot "crop outwards", even if you know there
1465 * is valid Buffer storage (e.g. because you already cropped inwards). */
1467 // Make a fresh copy of the underlying buffer (but not a fresh
1468 // copy of the allocation, if there is one).
1470
1471 // This guarantees the prexisting device ref is dropped if the
1472 // device_crop call fails and maintains the buffer in a consistent
1473 // state.
1474 im.device_deallocate();
1475
1476 im.crop_host(d, min, extent);
1477 if (buf.device_interface != nullptr) {
1478 complete_device_crop(im);
1479 }
1480 return im;
1481 }
1482
1483 /** Crop an image in-place along the given dimension. This does
1484 * not move any data around in memory - it just changes the min
1485 * and extent of the given dimension. */
1486 void crop(int d, int min, int extent) {
1487 // An optimization for non-device buffers. For the device case,
1488 // a temp buffer is required, so reuse the not-in-place version.
1489 // TODO(zalman|abadams): Are nop crops common enough to special
1490 // case the device part of the if to do nothing?
1491 if (buf.device_interface != nullptr) {
1492 *this = cropped(d, min, extent);
1493 } else {
1494 crop_host(d, min, extent);
1495 }
1496 }
1497
1498 /** Make an image that refers to a sub-rectangle of this image along
1499 * the first N dimensions. Asserts that the crop region is within
1500 * the existing bounds. The cropped image may drop any device handle
1501 * if the device_interface cannot accomplish the crop in-place. */
1502 Buffer<T, Dims, InClassDimStorage> cropped(const std::vector<std::pair<int, int>> &rect) const {
1503 // Make a fresh copy of the underlying buffer (but not a fresh
1504 // copy of the allocation, if there is one).
1506
1507 // This guarantees the prexisting device ref is dropped if the
1508 // device_crop call fails and maintains the buffer in a consistent
1509 // state.
1510 im.device_deallocate();
1511
1512 im.crop_host(rect);
1513 if (buf.device_interface != nullptr) {
1514 complete_device_crop(im);
1515 }
1516 return im;
1517 }
1518
1519 /** Crop an image in-place along the first N dimensions. This does
1520 * not move any data around in memory, nor does it free memory. It
1521 * just rewrites the min/extent of each dimension to refer to a
1522 * subregion of the same allocation. */
1523 void crop(const std::vector<std::pair<int, int>> &rect) {
1524 // An optimization for non-device buffers. For the device case,
1525 // a temp buffer is required, so reuse the not-in-place version.
1526 // TODO(zalman|abadams): Are nop crops common enough to special
1527 // case the device part of the if to do nothing?
1528 if (buf.device_interface != nullptr) {
1529 *this = cropped(rect);
1530 } else {
1531 crop_host(rect);
1532 }
1533 }
1534
1535 /** Make an image which refers to the same data with using
1536 * translated coordinates in the given dimension. Positive values
1537 * move the image data to the right or down relative to the
1538 * coordinate system. Drops any device handle. */
1541 im.translate(d, dx);
1542 return im;
1543 }
1544
1545 /** Translate an image in-place along one dimension by changing
1546 * how it is indexed. Does not move any data around in memory. */
1547 void translate(int d, int delta) {
1548 assert(d >= 0 && d < this->dimensions());
1550 buf.dim[d].min += delta;
1551 }
1552
1553 /** Make an image which refers to the same data translated along
1554 * the first N dimensions. */
1555 Buffer<T, Dims, InClassDimStorage> translated(const std::vector<int> &delta) const {
1557 im.translate(delta);
1558 return im;
1559 }
1560
1561 /** Translate an image along the first N dimensions by changing
1562 * how it is indexed. Does not move any data around in memory. */
1563 void translate(const std::vector<int> &delta) {
1565 assert(delta.size() <= static_cast<decltype(delta.size())>(std::numeric_limits<int>::max()));
1566 int limit = (int)delta.size();
1567 assert(limit <= dimensions());
1568 for (int i = 0; i < limit; i++) {
1569 translate(i, delta[i]);
1570 }
1571 }
1572
1573 /** Set the min coordinate of an image in the first N dimensions. */
1574 // @{
1575 void set_min(const std::vector<int> &mins) {
1576 assert(mins.size() <= static_cast<decltype(mins.size())>(dimensions()));
1578 for (size_t i = 0; i < mins.size(); i++) {
1579 buf.dim[i].min = mins[i];
1580 }
1581 }
1582
1583 template<typename... Args>
1584 void set_min(Args... args) {
1585 set_min(std::vector<int>{args...});
1586 }
1587 // @}
1588
1589 /** Test if a given coordinate is within the bounds of an image. */
1590 // @{
1591 bool contains(const std::vector<int> &coords) const {
1592 assert(coords.size() <= static_cast<decltype(coords.size())>(dimensions()));
1593 for (size_t i = 0; i < coords.size(); i++) {
1594 if (coords[i] < dim((int)i).min() || coords[i] > dim((int)i).max()) {
1595 return false;
1596 }
1597 }
1598 return true;
1599 }
1600
1601 template<typename... Args>
1602 bool contains(Args... args) const {
1603 return contains(std::vector<int>{args...});
1604 }
1605 // @}
1606
1607 /** Make a buffer which refers to the same data in the same layout
1608 * using a swapped indexing order for the dimensions given. So
1609 * A = B.transposed(0, 1) means that A(i, j) == B(j, i), and more
1610 * strongly that A.address_of(i, j) == B.address_of(j, i). */
1613 im.transpose(d1, d2);
1614 return im;
1615 }
1616
1617 /** Transpose a buffer in-place by changing how it is indexed. For
1618 * example, transpose(0, 1) on a two-dimensional buffer means that
1619 * the value referred to by coordinates (i, j) is now reached at
1620 * the coordinates (j, i), and vice versa. This is done by
1621 * reordering the per-dimension metadata rather than by moving
1622 * data around in memory, so other views of the same memory will
1623 * not see the data as having been transposed. */
1624 void transpose(int d1, int d2) {
1625 assert(d1 >= 0 && d1 < this->dimensions());
1626 assert(d2 >= 0 && d2 < this->dimensions());
1627 std::swap(buf.dim[d1], buf.dim[d2]);
1628 }
1629
1630 /** A generalized transpose: instead of swapping two dimensions,
1631 * pass a vector that lists each dimension index exactly once, in
1632 * the desired order. This does not move any data around in memory
1633 * - it just permutes how it is indexed. */
1634 void transpose(const std::vector<int> &order) {
1635 assert((int)order.size() == dimensions());
1636 if (dimensions() < 2) {
1637 // My, that was easy
1638 return;
1639 }
1640
1641 std::vector<int> order_sorted = order;
1642 for (size_t i = 1; i < order_sorted.size(); i++) {
1643 for (size_t j = i; j > 0 && order_sorted[j - 1] > order_sorted[j]; j--) {
1644 std::swap(order_sorted[j], order_sorted[j - 1]);
1645 transpose(j, j - 1);
1646 }
1647 }
1648 }
1649
1650 /** Make a buffer which refers to the same data in the same
1651 * layout using a different ordering of the dimensions. */
1652 Buffer<T, Dims, InClassDimStorage> transposed(const std::vector<int> &order) const {
1654 im.transpose(order);
1655 return im;
1656 }
1657
1658 /** Make a lower-dimensional buffer that refers to one slice of
1659 * this buffer. */
1660 Buffer<T, (Dims == AnyDims ? AnyDims : Dims - 1)>
1661 sliced(int d, int pos) const {
1662 static_assert(Dims == AnyDims || Dims > 0, "Cannot slice a 0-dimensional buffer");
1663 assert(dimensions() > 0);
1664
1666
1667 // This guarantees the prexisting device ref is dropped if the
1668 // device_slice call fails and maintains the buffer in a consistent
1669 // state.
1670 im.device_deallocate();
1671
1672 im.slice_host(d, pos);
1673 if (buf.device_interface != nullptr) {
1674 complete_device_slice(im, d, pos);
1675 }
1676 return im;
1677 }
1678
1679 /** Make a lower-dimensional buffer that refers to one slice of this
1680 * buffer at the dimension's minimum. */
1681 Buffer<T, (Dims == AnyDims ? AnyDims : Dims - 1)>
1682 sliced(int d) const {
1683 static_assert(Dims == AnyDims || Dims > 0, "Cannot slice a 0-dimensional buffer");
1684 assert(dimensions() > 0);
1685
1686 return sliced(d, dim(d).min());
1687 }
1688
1689 /** Rewrite the buffer to refer to a single lower-dimensional
1690 * slice of itself along the given dimension at the given
1691 * coordinate. Does not move any data around or free the original
1692 * memory, so other views of the same data are unaffected. Can
1693 * only be called on a Buffer with dynamic dimensionality. */
1694 void slice(int d, int pos) {
1695 static_assert(Dims == AnyDims, "Cannot call slice() on a Buffer with static dimensionality.");
1696 assert(dimensions() > 0);
1697
1698 // An optimization for non-device buffers. For the device case,
1699 // a temp buffer is required, so reuse the not-in-place version.
1700 // TODO(zalman|abadams): Are nop slices common enough to special
1701 // case the device part of the if to do nothing?
1702 if (buf.device_interface != nullptr) {
1703 *this = sliced(d, pos);
1704 } else {
1705 slice_host(d, pos);
1706 }
1707 }
1708
1709 /** Slice a buffer in-place at the dimension's minimum. */
1710 void slice(int d) {
1711 slice(d, dim(d).min());
1712 }
1713
1714 /** Make a new buffer that views this buffer as a single slice in a
1715 * higher-dimensional space. The new dimension has extent one and
1716 * the given min. This operation is the opposite of slice. As an
1717 * example, the following condition is true:
1718 *
1719 \code
1720 im2 = im.embedded(1, 17);
1721 &im(x, y, c) == &im2(x, 17, y, c);
1722 \endcode
1723 */
1724 Buffer<T, (Dims == AnyDims ? AnyDims : Dims + 1)>
1725 embedded(int d, int pos = 0) const {
1727 im.embed(d, pos);
1728 return im;
1729 }
1730
1731 /** Embed a buffer in-place, increasing the
1732 * dimensionality. */
1733 void embed(int d, int pos = 0) {
1734 static_assert(Dims == AnyDims, "Cannot call embed() on a Buffer with static dimensionality.");
1735 assert(d >= 0 && d <= dimensions());
1736 add_dimension();
1737 translate(dimensions() - 1, pos);
1738 for (int i = dimensions() - 1; i > d; i--) {
1739 transpose(i, i - 1);
1740 }
1741 }
1742
1743 /** Add a new dimension with a min of zero and an extent of
1744 * one. The stride is the extent of the outermost dimension times
1745 * its stride. The new dimension is the last dimension. This is a
1746 * special case of embed. */
1748 static_assert(Dims == AnyDims, "Cannot call add_dimension() on a Buffer with static dimensionality.");
1749 const int dims = buf.dimensions;
1750 buf.dimensions++;
1751 if (buf.dim != shape) {
1752 // We're already on the heap. Reallocate.
1753 halide_dimension_t *new_shape = new halide_dimension_t[buf.dimensions];
1754 for (int i = 0; i < dims; i++) {
1755 new_shape[i] = buf.dim[i];
1756 }
1757 delete[] buf.dim;
1758 buf.dim = new_shape;
1759 } else if (dims == InClassDimStorage) {
1760 // Transition from the in-class storage to the heap
1761 make_shape_storage(buf.dimensions);
1762 for (int i = 0; i < dims; i++) {
1763 buf.dim[i] = shape[i];
1764 }
1765 } else {
1766 // We still fit in the class
1767 }
1768 buf.dim[dims] = {0, 1, 0};
1769 if (dims == 0) {
1770 buf.dim[dims].stride = 1;
1771 } else {
1772 buf.dim[dims].stride = buf.dim[dims - 1].extent * buf.dim[dims - 1].stride;
1773 }
1774 }
1775
1776 /** Add a new dimension with a min of zero, an extent of one, and
1777 * the specified stride. The new dimension is the last
1778 * dimension. This is a special case of embed. */
1780 add_dimension();
1781 buf.dim[buf.dimensions - 1].stride = s;
1782 }
1783
1784 /** Methods for managing any GPU allocation. */
1785 // @{
1786 // Set the host dirty flag. Called by every operator()
1787 // access. Must be inlined so it can be hoisted out of loops.
1789 void set_host_dirty(bool v = true) {
1790 assert((!v || !device_dirty()) && "Cannot set host dirty when device is already dirty. Call copy_to_host() before accessing the buffer from host.");
1791 buf.set_host_dirty(v);
1792 }
1793
1794 // Check if the device allocation is dirty. Called by
1795 // set_host_dirty, which is called by every accessor. Must be
1796 // inlined so it can be hoisted out of loops.
1798 bool device_dirty() const {
1799 return buf.device_dirty();
1800 }
1801
1802 bool host_dirty() const {
1803 return buf.host_dirty();
1804 }
1805
1806 void set_device_dirty(bool v = true) {
1807 assert((!v || !host_dirty()) && "Cannot set device dirty when host is already dirty.");
1808 buf.set_device_dirty(v);
1809 }
1810
1811 int copy_to_host(void *ctx = nullptr) {
1812 if (device_dirty()) {
1813 return buf.device_interface->copy_to_host(ctx, &buf);
1814 }
1816 }
1817
1818 int copy_to_device(const struct halide_device_interface_t *device_interface, void *ctx = nullptr) {
1819 if (host_dirty()) {
1820 return device_interface->copy_to_device(ctx, &buf, device_interface);
1821 }
1823 }
1824
1825 int device_malloc(const struct halide_device_interface_t *device_interface, void *ctx = nullptr) {
1826 return device_interface->device_malloc(ctx, &buf, device_interface);
1827 }
1828
1829 int device_free(void *ctx = nullptr) {
1830 if (dev_ref_count) {
1831 assert(dev_ref_count->ownership == BufferDeviceOwnership::Allocated &&
1832 "Can't call device_free on an unmanaged or wrapped native device handle. "
1833 "Free the source allocation or call device_detach_native instead.");
1834 // Multiple people may be holding onto this dev field
1835 assert(dev_ref_count->count == 1 &&
1836 "Multiple Halide::Runtime::Buffer objects share this device "
1837 "allocation. Freeing it would create dangling references. "
1838 "Don't call device_free on Halide buffers that you have copied or "
1839 "passed by value.");
1840 }
1841 int ret = halide_error_code_success;
1842 if (buf.device_interface) {
1843 ret = buf.device_interface->device_free(ctx, &buf);
1844 }
1845 if (dev_ref_count) {
1846 delete dev_ref_count;
1847 dev_ref_count = nullptr;
1848 }
1849 return ret;
1850 }
1851
1852 int device_wrap_native(const struct halide_device_interface_t *device_interface,
1853 uint64_t handle, void *ctx = nullptr) {
1854 assert(device_interface);
1855 dev_ref_count = new DeviceRefCount;
1857 return device_interface->wrap_native(ctx, &buf, handle, device_interface);
1858 }
1859
1860 int device_detach_native(void *ctx = nullptr) {
1861 assert(dev_ref_count &&
1863 "Only call device_detach_native on buffers wrapping a native "
1864 "device handle via device_wrap_native. This buffer was allocated "
1865 "using device_malloc, or is unmanaged. "
1866 "Call device_free or free the original allocation instead.");
1867 // Multiple people may be holding onto this dev field
1868 assert(dev_ref_count->count == 1 &&
1869 "Multiple Halide::Runtime::Buffer objects share this device "
1870 "allocation. Freeing it could create dangling references. "
1871 "Don't call device_detach_native on Halide buffers that you "
1872 "have copied or passed by value.");
1873 int ret = halide_error_code_success;
1874 if (buf.device_interface) {
1875 ret = buf.device_interface->detach_native(ctx, &buf);
1876 }
1877 delete dev_ref_count;
1878 dev_ref_count = nullptr;
1879 return ret;
1880 }
1881
1882 int device_and_host_malloc(const struct halide_device_interface_t *device_interface, void *ctx = nullptr) {
1883 return device_interface->device_and_host_malloc(ctx, &buf, device_interface);
1884 }
1885
1886 int device_and_host_free(const struct halide_device_interface_t *device_interface, void *ctx = nullptr) {
1887 if (dev_ref_count) {
1889 "Can't call device_and_host_free on a device handle not allocated with device_and_host_malloc. "
1890 "Free the source allocation or call device_detach_native instead.");
1891 // Multiple people may be holding onto this dev field
1892 assert(dev_ref_count->count == 1 &&
1893 "Multiple Halide::Runtime::Buffer objects share this device "
1894 "allocation. Freeing it would create dangling references. "
1895 "Don't call device_and_host_free on Halide buffers that you have copied or "
1896 "passed by value.");
1897 }
1898 int ret = halide_error_code_success;
1899 if (buf.device_interface) {
1900 ret = buf.device_interface->device_and_host_free(ctx, &buf);
1901 }
1902 if (dev_ref_count) {
1903 delete dev_ref_count;
1904 dev_ref_count = nullptr;
1905 }
1906 return ret;
1907 }
1908
1909 int device_sync(void *ctx = nullptr) {
1910 return buf.device_sync(ctx);
1911 }
1912
1914 return buf.device != 0;
1915 }
1916
1917 /** Return the method by which the device field is managed. */
1919 if (dev_ref_count == nullptr) {
1921 }
1922 return dev_ref_count->ownership;
1923 }
1924 // @}
1925
1926 /** If you use the (x, y, c) indexing convention, then Halide
1927 * Buffers are stored planar by default. This function constructs
1928 * an interleaved RGB or RGBA image that can still be indexed
1929 * using (x, y, c). Passing it to a generator requires that the
1930 * generator has been compiled with support for interleaved (also
1931 * known as packed or chunky) memory layouts. */
1933 static_assert(Dims == AnyDims || Dims == 3, "make_interleaved() must be called on a Buffer that can represent 3 dimensions.");
1935 // Note that this is equivalent to calling transpose({2, 0, 1}),
1936 // but slightly more efficient.
1937 im.transpose(0, 1);
1938 im.transpose(1, 2);
1939 return im;
1940 }
1941
1942 /** If you use the (x, y, c) indexing convention, then Halide
1943 * Buffers are stored planar by default. This function constructs
1944 * an interleaved RGB or RGBA image that can still be indexed
1945 * using (x, y, c). Passing it to a generator requires that the
1946 * generator has been compiled with support for interleaved (also
1947 * known as packed or chunky) memory layouts. */
1951
1952 /** Wrap an existing interleaved image. */
1953 static Buffer<add_const_if_T_is_const<void>, Dims, InClassDimStorage>
1955 static_assert(Dims == AnyDims || Dims == 3, "make_interleaved() must be called on a Buffer that can represent 3 dimensions.");
1956 Buffer<add_const_if_T_is_const<void>, Dims, InClassDimStorage> im(t, data, channels, width, height);
1957 im.transpose(0, 1);
1958 im.transpose(1, 2);
1959 return im;
1960 }
1961
1962 /** Wrap an existing interleaved image. */
1966
1967 /** Make a zero-dimensional Buffer */
1969 static_assert(Dims == AnyDims || Dims == 0, "make_scalar() must be called on a Buffer that can represent 0 dimensions.");
1970 Buffer<add_const_if_T_is_const<void>, AnyDims, InClassDimStorage> buf(t, 1);
1971 buf.slice(0, 0);
1972 return buf;
1973 }
1974
1975 /** Make a zero-dimensional Buffer */
1977 static_assert(Dims == AnyDims || Dims == 0, "make_scalar() must be called on a Buffer that can represent 0 dimensions.");
1979 buf.slice(0, 0);
1980 return buf;
1981 }
1982
1983 /** Make a zero-dimensional Buffer that points to non-owned, existing data */
1985 static_assert(Dims == AnyDims || Dims == 0, "make_scalar() must be called on a Buffer that can represent 0 dimensions.");
1987 buf.slice(0, 0);
1988 return buf;
1989 }
1990
1991 /** Make a buffer with the same shape and memory nesting order as
1992 * another buffer. It may have a different type. */
1993 template<typename T2, int D2, int S2>
1995 void *(*allocate_fn)(size_t) = nullptr,
1996 void (*deallocate_fn)(void *) = nullptr) {
1997 static_assert(Dims == D2 || Dims == AnyDims);
1998 const halide_type_t dst_type = T_is_void ? src.type() : halide_type_of<typename std::remove_cv<not_void_T>::type>();
1999 return Buffer<>::make_with_shape_of_helper(dst_type, src.dimensions(), src.buf.dim,
2000 allocate_fn, deallocate_fn);
2001 }
2002
2003private:
2004 static Buffer<> make_with_shape_of_helper(halide_type_t dst_type,
2005 int dimensions,
2006 halide_dimension_t *shape,
2007 void *(*allocate_fn)(size_t),
2008 void (*deallocate_fn)(void *)) {
2009 // Reorder the dimensions of src to have strides in increasing order
2010 std::vector<int> swaps;
2011 for (int i = dimensions - 1; i > 0; i--) {
2012 for (int j = i; j > 0; j--) {
2013 if (shape[j - 1].stride > shape[j].stride) {
2014 std::swap(shape[j - 1], shape[j]);
2015 swaps.push_back(j);
2016 }
2017 }
2018 }
2019
2020 // Rewrite the strides to be dense (this messes up src, which
2021 // is why we took it by value).
2022 for (int i = 0; i < dimensions; i++) {
2023 if (i == 0) {
2024 shape[i].stride = 1;
2025 } else {
2026 shape[i].stride = shape[i - 1].extent * shape[i - 1].stride;
2027 }
2028 }
2029
2030 // Undo the dimension reordering
2031 while (!swaps.empty()) {
2032 int j = swaps.back();
2033 std::swap(shape[j - 1], shape[j]);
2034 swaps.pop_back();
2035 }
2036
2037 // Use an explicit runtime type, and make dst a Buffer<void>, to allow
2038 // using this method with Buffer<void> for either src or dst.
2039 Buffer<> dst(dst_type, nullptr, dimensions, shape);
2040 dst.allocate(allocate_fn, deallocate_fn);
2041
2042 return dst;
2043 }
2044
2045 template<typename... Args>
2047 ptrdiff_t
2048 offset_of(int d, int first, Args... rest) const {
2049#if HALIDE_RUNTIME_BUFFER_CHECK_INDICES
2050 assert(first >= this->buf.dim[d].min);
2051 assert(first < this->buf.dim[d].min + this->buf.dim[d].extent);
2052#endif
2053 return offset_of(d + 1, rest...) + (ptrdiff_t)this->buf.dim[d].stride * (first - this->buf.dim[d].min);
2054 }
2055
2057 ptrdiff_t offset_of(int d) const {
2058 return 0;
2059 }
2060
2061 template<typename... Args>
2063 storage_T *
2064 address_of(Args... args) const {
2065 if (T_is_void) {
2066 return (storage_T *)(this->buf.host) + offset_of(0, args...) * type().bytes();
2067 } else {
2068 return (storage_T *)(this->buf.host) + offset_of(0, args...);
2069 }
2070 }
2071
2073 ptrdiff_t offset_of(const int *pos) const {
2074 ptrdiff_t offset = 0;
2075 for (int i = this->dimensions() - 1; i >= 0; i--) {
2076#if HALIDE_RUNTIME_BUFFER_CHECK_INDICES
2077 assert(pos[i] >= this->buf.dim[i].min);
2078 assert(pos[i] < this->buf.dim[i].min + this->buf.dim[i].extent);
2079#endif
2080 offset += (ptrdiff_t)this->buf.dim[i].stride * (pos[i] - this->buf.dim[i].min);
2081 }
2082 return offset;
2083 }
2084
2086 storage_T *address_of(const int *pos) const {
2087 if (T_is_void) {
2088 return (storage_T *)this->buf.host + offset_of(pos) * type().bytes();
2089 } else {
2090 return (storage_T *)this->buf.host + offset_of(pos);
2091 }
2092 }
2093
2094public:
2095 /** Get a pointer to the address of the min coordinate. */
2096 T *data() const {
2097 return (T *)(this->buf.host);
2098 }
2099
2100 /** Access elements. Use im(...) to get a reference to an element,
2101 * and use &im(...) to get the address of an element. If you pass
2102 * fewer arguments than the buffer has dimensions, the rest are
2103 * treated as their min coordinate. The non-const versions set the
2104 * host_dirty flag to true.
2105 */
2106 //@{
2107 template<typename... Args,
2108 typename = typename std::enable_if<AllInts<Args...>::value>::type>
2109 HALIDE_ALWAYS_INLINE const not_void_T &operator()(int first, Args... rest) const {
2110 static_assert(!T_is_void,
2111 "Cannot use operator() on Buffer<void> types");
2112 constexpr int expected_dims = 1 + (int)(sizeof...(rest));
2113 static_assert(Dims == AnyDims || Dims == expected_dims, "Buffer with static dimensions was accessed with the wrong number of coordinates in operator()");
2114 assert(!device_dirty());
2115 return *((const not_void_T *)(address_of(first, rest...)));
2116 }
2117
2119 const not_void_T &
2120 operator()() const {
2121 static_assert(!T_is_void,
2122 "Cannot use operator() on Buffer<void> types");
2123 constexpr int expected_dims = 0;
2124 static_assert(Dims == AnyDims || Dims == expected_dims, "Buffer with static dimensions was accessed with the wrong number of coordinates in operator()");
2125 assert(!device_dirty());
2126 return *((const not_void_T *)(data()));
2127 }
2128
2130 const not_void_T &
2131 operator()(const int *pos) const {
2132 static_assert(!T_is_void,
2133 "Cannot use operator() on Buffer<void> types");
2134 assert(!device_dirty());
2135 return *((const not_void_T *)(address_of(pos)));
2136 }
2137
2138 template<typename... Args,
2139 typename = typename std::enable_if<AllInts<Args...>::value>::type>
2141 not_void_T &
2142 operator()(int first, Args... rest) {
2143 static_assert(!T_is_void,
2144 "Cannot use operator() on Buffer<void> types");
2145 constexpr int expected_dims = 1 + (int)(sizeof...(rest));
2146 static_assert(Dims == AnyDims || Dims == expected_dims, "Buffer with static dimensions was accessed with the wrong number of coordinates in operator()");
2148 return *((not_void_T *)(address_of(first, rest...)));
2149 }
2150
2152 not_void_T &
2154 static_assert(!T_is_void,
2155 "Cannot use operator() on Buffer<void> types");
2156 constexpr int expected_dims = 0;
2157 static_assert(Dims == AnyDims || Dims == expected_dims, "Buffer with static dimensions was accessed with the wrong number of coordinates in operator()");
2159 return *((not_void_T *)(data()));
2160 }
2161
2163 not_void_T &
2164 operator()(const int *pos) {
2165 static_assert(!T_is_void,
2166 "Cannot use operator() on Buffer<void> types");
2168 return *((not_void_T *)(address_of(pos)));
2169 }
2170 // @}
2171
2172 /** Tests that all values in this buffer are equal to val. */
2173 bool all_equal(not_void_T val) const {
2174 bool all_equal = true;
2175 for_each_element([&](const int *pos) { all_equal &= (*this)(pos) == val; });
2176 return all_equal;
2177 }
2178
2181 for_each_value([=](T &v) { v = val; });
2182 return *this;
2183 }
2184
2185private:
2186 /** Helper functions for for_each_value. */
2187 // @{
2188 template<int N>
2189 struct for_each_value_task_dim {
2190 std::ptrdiff_t extent;
2191 std::ptrdiff_t stride[N];
2192 };
2193
2194 // Given an array of strides, and a bunch of pointers to pointers
2195 // (all of different types), advance the pointers using the
2196 // strides.
2197 template<typename Ptr, typename... Ptrs>
2198 HALIDE_ALWAYS_INLINE static void advance_ptrs(const std::ptrdiff_t *stride, Ptr &ptr, Ptrs &...ptrs) {
2199 ptr += *stride;
2200 advance_ptrs(stride + 1, ptrs...);
2201 }
2202
2204 static void advance_ptrs(const std::ptrdiff_t *) {
2205 }
2206
2207 template<typename Fn, typename Ptr, typename... Ptrs>
2208 HALIDE_NEVER_INLINE static void for_each_value_helper(Fn &&f, int d, bool innermost_strides_are_one,
2209 const for_each_value_task_dim<sizeof...(Ptrs) + 1> *t, Ptr ptr, Ptrs... ptrs) {
2210 if (d == 0) {
2211 if (innermost_strides_are_one) {
2212 Ptr end = ptr + t[0].extent;
2213 while (ptr != end) {
2214 f(*ptr++, (*ptrs++)...);
2215 }
2216 } else {
2217 for (std::ptrdiff_t i = t[0].extent; i != 0; i--) {
2218 f(*ptr, (*ptrs)...);
2219 advance_ptrs(t[0].stride, ptr, ptrs...);
2220 }
2221 }
2222 } else {
2223 for (std::ptrdiff_t i = t[d].extent; i != 0; i--) {
2224 for_each_value_helper(f, d - 1, innermost_strides_are_one, t, ptr, ptrs...);
2225 advance_ptrs(t[d].stride, ptr, ptrs...);
2226 }
2227 }
2228 }
2229
2230 // Return pair is <new_dimensions, innermost_strides_are_one>
2231 template<int N>
2232 HALIDE_NEVER_INLINE static std::pair<int, bool> for_each_value_prep(for_each_value_task_dim<N> *t,
2233 const halide_buffer_t **buffers) {
2234 const int dimensions = buffers[0]->dimensions;
2235 assert(dimensions > 0);
2236
2237 // Check the buffers all have clean host allocations
2238 for (int i = 0; i < N; i++) {
2239 if (buffers[i]->device) {
2240 assert(buffers[i]->host &&
2241 "Buffer passed to for_each_value has device allocation but no host allocation. Call allocate() and copy_to_host() first");
2242 assert(!buffers[i]->device_dirty() &&
2243 "Buffer passed to for_each_value is dirty on device. Call copy_to_host() first");
2244 } else {
2245 assert(buffers[i]->host &&
2246 "Buffer passed to for_each_value has no host or device allocation");
2247 }
2248 }
2249
2250 // Extract the strides in all the dimensions
2251 for (int i = 0; i < dimensions; i++) {
2252 for (int j = 0; j < N; j++) {
2253 assert(buffers[j]->dimensions == dimensions);
2254 assert(buffers[j]->dim[i].extent == buffers[0]->dim[i].extent &&
2255 buffers[j]->dim[i].min == buffers[0]->dim[i].min);
2256 const int s = buffers[j]->dim[i].stride;
2257 t[i].stride[j] = s;
2258 }
2259 t[i].extent = buffers[0]->dim[i].extent;
2260
2261 // Order the dimensions by stride, so that the traversal is cache-coherent.
2262 // Use the last dimension for this, because this is the source in copies.
2263 // It appears to be better to optimize read order than write order.
2264 for (int j = i; j > 0 && t[j].stride[N - 1] < t[j - 1].stride[N - 1]; j--) {
2265 std::swap(t[j], t[j - 1]);
2266 }
2267 }
2268
2269 // flatten dimensions where possible to make a larger inner
2270 // loop for autovectorization.
2271 int d = dimensions;
2272 for (int i = 1; i < d; i++) {
2273 bool flat = true;
2274 for (int j = 0; j < N; j++) {
2275 flat = flat && t[i - 1].stride[j] * t[i - 1].extent == t[i].stride[j];
2276 }
2277 if (flat) {
2278 t[i - 1].extent *= t[i].extent;
2279 for (int j = i; j < d - 1; j++) {
2280 t[j] = t[j + 1];
2281 }
2282 i--;
2283 d--;
2284 }
2285 }
2286
2287 // Note that we assert() that dimensions > 0 above
2288 // (our one-and-only caller will only call us that way)
2289 // so the unchecked access to t[0] should be safe.
2290 bool innermost_strides_are_one = true;
2291 for (int i = 0; i < N; i++) {
2292 innermost_strides_are_one &= (t[0].stride[i] == 1);
2293 }
2294
2295 return {d, innermost_strides_are_one};
2296 }
2297
2298 template<typename Fn, typename... Args, int N = sizeof...(Args) + 1>
2299 void for_each_value_impl(Fn &&f, Args &&...other_buffers) const {
2300 if (dimensions() > 0) {
2301 const size_t alloc_size = dimensions() * sizeof(for_each_value_task_dim<N>);
2302 Buffer<>::for_each_value_task_dim<N> *t =
2303 (Buffer<>::for_each_value_task_dim<N> *)HALIDE_ALLOCA(alloc_size);
2304 // Move the preparatory code into a non-templated helper to
2305 // save code size.
2306 const halide_buffer_t *buffers[] = {&buf, (&other_buffers.buf)...};
2307 auto [new_dims, innermost_strides_are_one] = Buffer<>::for_each_value_prep(t, buffers);
2308 if (new_dims > 0) {
2309 Buffer<>::for_each_value_helper(f, new_dims - 1,
2310 innermost_strides_are_one,
2311 t,
2312 data(), (other_buffers.data())...);
2313 return;
2314 }
2315 // else fall thru
2316 }
2317
2318 // zero-dimensional case
2319 f(*data(), (*other_buffers.data())...);
2320 }
2321 // @}
2322
2323public:
2324 /** Call a function on every value in the buffer, and the
2325 * corresponding values in some number of other buffers of the
2326 * same size. The function should take a reference, const
2327 * reference, or value of the correct type for each buffer. This
2328 * effectively lifts a function of scalars to an element-wise
2329 * function of buffers. This produces code that the compiler can
2330 * autovectorize. This is slightly cheaper than for_each_element,
2331 * because it does not need to track the coordinates.
2332 *
2333 * Note that constness of Buffers is preserved: a const Buffer<T> (for either
2334 * 'this' or the other-buffers arguments) will allow mutation of the
2335 * buffer contents, while a Buffer<const T> will not. Attempting to specify
2336 * a mutable reference for the lambda argument of a Buffer<const T>
2337 * will result in a compilation error. */
2338 // @{
2339 template<typename Fn, typename... Args, int N = sizeof...(Args) + 1>
2340 HALIDE_ALWAYS_INLINE const Buffer<T, Dims, InClassDimStorage> &for_each_value(Fn &&f, Args &&...other_buffers) const {
2341 for_each_value_impl(f, std::forward<Args>(other_buffers)...);
2342 return *this;
2343 }
2344
2345 template<typename Fn, typename... Args, int N = sizeof...(Args) + 1>
2348 for_each_value(Fn &&f, Args &&...other_buffers) {
2349 for_each_value_impl(f, std::forward<Args>(other_buffers)...);
2350 return *this;
2351 }
2352 // @}
2353
2354private:
2355 // Helper functions for for_each_element
2356 struct for_each_element_task_dim {
2357 int min, max;
2358 };
2359
2360 /** If f is callable with this many args, call it. The first
2361 * argument is just to make the overloads distinct. Actual
2362 * overload selection is done using the enable_if. */
2363 template<typename Fn,
2364 typename... Args,
2365 typename = decltype(std::declval<Fn>()(std::declval<Args>()...))>
2366 HALIDE_ALWAYS_INLINE static void for_each_element_variadic(int, int, const for_each_element_task_dim *, Fn &&f, Args... args) {
2367 f(args...);
2368 }
2369
2370 /** If the above overload is impossible, we add an outer loop over
2371 * an additional argument and try again. */
2372 template<typename Fn,
2373 typename... Args>
2374 HALIDE_ALWAYS_INLINE static void for_each_element_variadic(double, int d, const for_each_element_task_dim *t, Fn &&f, Args... args) {
2375 for (int i = t[d].min; i <= t[d].max; i++) {
2376 for_each_element_variadic(0, d - 1, t, std::forward<Fn>(f), i, args...);
2377 }
2378 }
2379
2380 /** Determine the minimum number of arguments a callable can take
2381 * using the same trick. */
2382 template<typename Fn,
2383 typename... Args,
2384 typename = decltype(std::declval<Fn>()(std::declval<Args>()...))>
2385 HALIDE_ALWAYS_INLINE static int num_args(int, Fn &&, Args...) {
2386 return (int)(sizeof...(Args));
2387 }
2388
2389 /** The recursive version is only enabled up to a recursion limit
2390 * of 256. This catches callables that aren't callable with any
2391 * number of ints. */
2392 template<typename Fn,
2393 typename... Args>
2394 HALIDE_ALWAYS_INLINE static int num_args(double, Fn &&f, Args... args) {
2395 static_assert(sizeof...(args) <= 256,
2396 "Callable passed to for_each_element must accept either a const int *,"
2397 " or up to 256 ints. No such operator found. Expect infinite template recursion.");
2398 return num_args(0, std::forward<Fn>(f), 0, args...);
2399 }
2400
2401 /** A version where the callable takes a position array instead,
2402 * with compile-time recursion on the dimensionality. This
2403 * overload is preferred to the one below using the same int vs
2404 * double trick as above, but is impossible once d hits -1 using
2405 * std::enable_if. */
2406 template<int d,
2407 typename Fn,
2408 typename = typename std::enable_if<(d >= 0)>::type>
2409 HALIDE_ALWAYS_INLINE static void for_each_element_array_helper(int, const for_each_element_task_dim *t, Fn &&f, int *pos) {
2410 for (pos[d] = t[d].min; pos[d] <= t[d].max; pos[d]++) {
2411 for_each_element_array_helper<d - 1>(0, t, std::forward<Fn>(f), pos);
2412 }
2413 }
2414
2415 /** Base case for recursion above. */
2416 template<int d,
2417 typename Fn,
2418 typename = typename std::enable_if<(d < 0)>::type>
2419 HALIDE_ALWAYS_INLINE static void for_each_element_array_helper(double, const for_each_element_task_dim *t, Fn &&f, int *pos) {
2420 f(pos);
2421 }
2422
2423 /** A run-time-recursive version (instead of
2424 * compile-time-recursive) that requires the callable to take a
2425 * pointer to a position array instead. Dispatches to the
2426 * compile-time-recursive version once the dimensionality gets
2427 * small. */
2428 template<typename Fn>
2429 static void for_each_element_array(int d, const for_each_element_task_dim *t, Fn &&f, int *pos) {
2430 if (d == -1) {
2431 f(pos);
2432 } else if (d == 0) {
2433 // Once the dimensionality gets small enough, dispatch to
2434 // a compile-time-recursive version for better codegen of
2435 // the inner loops.
2436 for_each_element_array_helper<0, Fn>(0, t, std::forward<Fn>(f), pos);
2437 } else if (d == 1) {
2438 for_each_element_array_helper<1, Fn>(0, t, std::forward<Fn>(f), pos);
2439 } else if (d == 2) {
2440 for_each_element_array_helper<2, Fn>(0, t, std::forward<Fn>(f), pos);
2441 } else if (d == 3) {
2442 for_each_element_array_helper<3, Fn>(0, t, std::forward<Fn>(f), pos);
2443 } else {
2444 for (pos[d] = t[d].min; pos[d] <= t[d].max; pos[d]++) {
2445 for_each_element_array(d - 1, t, std::forward<Fn>(f), pos);
2446 }
2447 }
2448 }
2449
2450 /** We now have two overloads for for_each_element. This one
2451 * triggers if the callable takes a const int *.
2452 */
2453 template<typename Fn,
2454 typename = decltype(std::declval<Fn>()((const int *)nullptr))>
2455 static void for_each_element(int, int dims, const for_each_element_task_dim *t, Fn &&f, int check = 0) {
2456 const int size = dims * sizeof(int);
2457 int *pos = (int *)HALIDE_ALLOCA(size);
2458 // At least one version of GCC will (incorrectly) report that pos "may be used uninitialized".
2459 // Add this memset to silence it.
2460 memset(pos, 0, size);
2461 for_each_element_array(dims - 1, t, std::forward<Fn>(f), pos);
2462 }
2463
2464 /** This one triggers otherwise. It treats the callable as
2465 * something that takes some number of ints. */
2466 template<typename Fn>
2467 HALIDE_ALWAYS_INLINE static void for_each_element(double, int dims, const for_each_element_task_dim *t, Fn &&f) {
2468 int args = num_args(0, std::forward<Fn>(f));
2469 assert(dims >= args);
2470 for_each_element_variadic(0, args - 1, t, std::forward<Fn>(f));
2471 }
2472
2473 template<typename Fn>
2474 void for_each_element_impl(Fn &&f) const {
2475 for_each_element_task_dim *t =
2476 (for_each_element_task_dim *)HALIDE_ALLOCA(dimensions() * sizeof(for_each_element_task_dim));
2477 for (int i = 0; i < dimensions(); i++) {
2478 t[i].min = dim(i).min();
2479 t[i].max = dim(i).max();
2480 }
2481 for_each_element(0, dimensions(), t, std::forward<Fn>(f));
2482 }
2483
2484public:
2485 /** Call a function at each site in a buffer. This is likely to be
2486 * much slower than using Halide code to populate a buffer, but is
2487 * convenient for tests. If the function has more arguments than the
2488 * buffer has dimensions, the remaining arguments will be zero. If it
2489 * has fewer arguments than the buffer has dimensions then the last
2490 * few dimensions of the buffer are not iterated over. For example,
2491 * the following code exploits this to set a floating point RGB image
2492 * to red:
2493
2494 \code
2495 Buffer<float, 3> im(100, 100, 3);
2496 im.for_each_element([&](int x, int y) {
2497 im(x, y, 0) = 1.0f;
2498 im(x, y, 1) = 0.0f;
2499 im(x, y, 2) = 0.0f:
2500 });
2501 \endcode
2502
2503 * The compiled code is equivalent to writing the a nested for loop,
2504 * and compilers are capable of optimizing it in the same way.
2505 *
2506 * If the callable can be called with an int * as the sole argument,
2507 * that version is called instead. Each location in the buffer is
2508 * passed to it in a coordinate array. This version is higher-overhead
2509 * than the variadic version, but is useful for writing generic code
2510 * that accepts buffers of arbitrary dimensionality. For example, the
2511 * following sets the value at all sites in an arbitrary-dimensional
2512 * buffer to their first coordinate:
2513
2514 \code
2515 im.for_each_element([&](const int *pos) {im(pos) = pos[0];});
2516 \endcode
2517
2518 * It is also possible to use for_each_element to iterate over entire
2519 * rows or columns by cropping the buffer to a single column or row
2520 * respectively and iterating over elements of the result. For example,
2521 * to set the diagonal of the image to 1 by iterating over the columns:
2522
2523 \code
2524 Buffer<float, 3> im(100, 100, 3);
2525 im.sliced(1, 0).for_each_element([&](int x, int c) {
2526 im(x, x, c) = 1.0f;
2527 });
2528 \endcode
2529
2530 * Or, assuming the memory layout is known to be dense per row, one can
2531 * memset each row of an image like so:
2532
2533 \code
2534 Buffer<float, 3> im(100, 100, 3);
2535 im.sliced(0, 0).for_each_element([&](int y, int c) {
2536 memset(&im(0, y, c), 0, sizeof(float) * im.width());
2537 });
2538 \endcode
2539
2540 */
2541 // @{
2542 template<typename Fn>
2544 for_each_element_impl(f);
2545 return *this;
2546 }
2547
2548 template<typename Fn>
2552 for_each_element_impl(f);
2553 return *this;
2554 }
2555 // @}
2556
2557private:
2558 template<typename Fn>
2559 struct FillHelper {
2560 Fn f;
2562
2563 template<typename... Args,
2564 typename = decltype(std::declval<Fn>()(std::declval<Args>()...))>
2565 void operator()(Args... args) {
2566 (*buf)(args...) = f(args...);
2567 }
2568
2569 FillHelper(Fn &&f, Buffer<T, Dims, InClassDimStorage> *buf)
2570 : f(std::forward<Fn>(f)), buf(buf) {
2571 }
2572 };
2573
2574public:
2575 /** Fill a buffer by evaluating a callable at every site. The
2576 * callable should look much like a callable passed to
2577 * for_each_element, but it should return the value that should be
2578 * stored to the coordinate corresponding to the arguments. */
2579 template<typename Fn,
2580 typename = typename std::enable_if<!std::is_arithmetic<typename std::decay<Fn>::type>::value>::type>
2582 // We'll go via for_each_element. We need a variadic wrapper lambda.
2583 FillHelper<Fn> wrapper(std::forward<Fn>(f), this);
2584 return for_each_element(wrapper);
2585 }
2586
2587 /** Check if an input buffer passed extern stage is a querying
2588 * bounds. Compared to doing the host pointer check directly,
2589 * this both adds clarity to code and will facilitate moving to
2590 * another representation for bounds query arguments. */
2591 bool is_bounds_query() const {
2592 return buf.is_bounds_query();
2593 }
2594
2595 /** Convenient check to verify that all of the interesting bytes in the Buffer
2596 * are initialized under MSAN. Note that by default, we use for_each_value() here so that
2597 * we skip any unused padding that isn't part of the Buffer; this isn't efficient,
2598 * but in MSAN mode, it doesn't matter. (Pass true for the flag to force check
2599 * the entire Buffer storage.) */
2600 void msan_check_mem_is_initialized(bool entire = false) const {
2601#if defined(__has_feature)
2602#if __has_feature(memory_sanitizer)
2603 if (entire) {
2604 __msan_check_mem_is_initialized(data(), size_in_bytes());
2605 } else {
2606 for_each_value([](T &v) { __msan_check_mem_is_initialized(&v, sizeof(T)); ; });
2607 }
2608#endif
2609#endif
2610 }
2611};
2612
2613} // namespace Runtime
2614} // namespace Halide
2615
2616#undef HALIDE_ALLOCA
2617
2618#endif // HALIDE_RUNTIME_IMAGE_H
#define HALIDE_RUNTIME_BUFFER_ALLOCATION_ALIGNMENT
#define HALIDE_ALLOCA
This file declares the routines used by Halide internally in its runtime.
#define HALIDE_NEVER_INLINE
@ halide_error_code_success
There was no error.
#define HALIDE_ALWAYS_INLINE
struct halide_buffer_t halide_buffer_t
The raw representation of an image passed around by generated Halide code.
Read-only access to the shape.
HALIDE_ALWAYS_INLINE int min() const
The lowest coordinate in this dimension.
Dimension(const halide_dimension_t &dim)
HALIDE_ALWAYS_INLINE int max() const
The highest coordinate in this dimension.
HALIDE_ALWAYS_INLINE iterator end() const
An iterator that points to one past the max coordinate.
HALIDE_ALWAYS_INLINE int stride() const
The number of elements in memory you have to step over to increment this coordinate by one.
HALIDE_ALWAYS_INLINE iterator begin() const
An iterator that points to the min coordinate.
HALIDE_ALWAYS_INLINE int extent() const
The extent of the image along this dimension.
A templated Buffer class that wraps halide_buffer_t and adds functionality.
Buffer< T, Dims, InClassDimStorage > & operator=(const Buffer< T2, D2, S2 > &other)
Assign from another Buffer of possibly-different dimensionality and type.
Buffer< not_const_T, Dims, InClassDimStorage > copy_to_planar(void *(*allocate_fn)(size_t)=nullptr, void(*deallocate_fn)(void *)=nullptr) const
Like copy(), but the copy is created in planar memory layout (vs.
Buffer< T, Dims, InClassDimStorage > transposed(const std::vector< int > &order) const
Make a buffer which refers to the same data in the same layout using a different ordering of the dime...
void translate(int d, int delta)
Translate an image in-place along one dimension by changing how it is indexed.
Buffer(const halide_buffer_t &buf, BufferDeviceOwnership ownership=BufferDeviceOwnership::Unmanaged)
Make a Buffer from a halide_buffer_t.
void allocate(void *(*allocate_fn)(size_t)=nullptr, void(*deallocate_fn)(void *)=nullptr)
Allocate memory for this Buffer.
Buffer< not_const_T, Dims, InClassDimStorage > copy(void *(*allocate_fn)(size_t)=nullptr, void(*deallocate_fn)(void *)=nullptr) const
Make a new image which is a deep copy of this image.
Buffer< T,(Dims==AnyDims ? AnyDims :Dims+1)> embedded(int d, int pos=0) const
Make a new buffer that views this buffer as a single slice in a higher-dimensional space.
friend class Buffer
Give Buffers access to the members of Buffers of different dimensionalities and types.
void add_dimension()
Add a new dimension with a min of zero and an extent of one.
void slice(int d)
Slice a buffer in-place at the dimension's minimum.
static void set_default_allocate_fn(void *(*allocate_fn)(size_t))
bool owns_host_memory() const
Does this Buffer own the host memory it refers to?
int width() const
Conventional names for the first three dimensions.
void transpose(const std::vector< int > &order)
A generalized transpose: instead of swapping two dimensions, pass a vector that lists each dimension ...
void set_min(const std::vector< int > &mins)
Set the min coordinate of an image in the first N dimensions.
HALIDE_ALWAYS_INLINE Buffer< T, Dims, InClassDimStorage > & for_each_element(Fn &&f)
Buffer(halide_type_t t, add_const_if_T_is_const< void > *data, const std::vector< int > &sizes)
Initialize an Buffer of runtime type from a pointer and a vector of sizes.
HALIDE_ALWAYS_INLINE Buffer< T2, D2, InClassDimStorage > as() &&
Return an rval reference to this Buffer.
int copy_to_host(void *ctx=nullptr)
Buffer(halide_type_t t, const std::vector< int > &sizes)
Allocate a new image of unknown type using a vector of ints as the size.
int device_malloc(const struct halide_device_interface_t *device_interface, void *ctx=nullptr)
int device_free(void *ctx=nullptr)
int extent(int i) const
bool contains(Args... args) const
void crop(const std::vector< std::pair< int, int > > &rect)
Crop an image in-place along the first N dimensions.
HALIDE_ALWAYS_INLINE const Buffer< typename std::add_const< T >::type, Dims, InClassDimStorage > & as_const() const &
void set_device_dirty(bool v=true)
HALIDE_ALWAYS_INLINE const not_void_T & operator()(const int *pos) const
Buffer(T *data, int d, const halide_dimension_t *shape)
Initialize an Buffer from a pointer to the min coordinate and an array describing the shape.
Buffer(Buffer< T2, D2, S2 > &&other)
Move-construct a Buffer from a Buffer of different dimensionality and type.
void slice(int d, int pos)
Rewrite the buffer to refer to a single lower-dimensional slice of itself along the given dimension a...
HALIDE_ALWAYS_INLINE const not_void_T & operator()(int first, Args... rest) const
Access elements.
HALIDE_ALWAYS_INLINE void set_host_dirty(bool v=true)
Methods for managing any GPU allocation.
void msan_check_mem_is_initialized(bool entire=false) const
Convenient check to verify that all of the interesting bytes in the Buffer are initialized under MSAN...
HALIDE_ALWAYS_INLINE Buffer< typename std::add_const< T >::type, Dims, InClassDimStorage > as_const() &&
Buffer< T, Dims, InClassDimStorage > & operator=(Buffer< T, Dims, InClassDimStorage > &&other) noexcept
Standard move-assignment operator.
int device_detach_native(void *ctx=nullptr)
int device_wrap_native(const struct halide_device_interface_t *device_interface, uint64_t handle, void *ctx=nullptr)
static constexpr bool has_static_halide_type
True if the Halide type is not void (or const void).
Buffer< T, Dims, InClassDimStorage > translated(const std::vector< int > &delta) const
Make an image which refers to the same data translated along the first N dimensions.
HALIDE_ALWAYS_INLINE Dimension dim(int i) const
Access the shape of the buffer.
Buffer(int first, int second, Args... rest)
HALIDE_ALWAYS_INLINE Buffer< typename std::add_const< T >::type, Dims, InClassDimStorage > & as_const() &
as_const() is syntactic sugar for .as<const T>(), to avoid the need to recapitulate the type argument...
Buffer< T, Dims, InClassDimStorage > transposed(int d1, int d2) const
Make a buffer which refers to the same data in the same layout using a swapped indexing order for the...
HALIDE_ALWAYS_INLINE Buffer< T, Dims, InClassDimStorage > & for_each_value(Fn &&f, Args &&...other_buffers)
HALIDE_ALWAYS_INLINE not_void_T & operator()()
BufferDeviceOwnership device_ownership() const
Return the method by which the device field is managed.
void check_overflow()
Check the product of the extents fits in memory.
static bool can_convert_from(const Buffer< T2, D2, S2 > &other)
Determine if a Buffer<T, Dims, InClassDimStorage> can be constructed from some other Buffer type.
Buffer< not_const_T, Dims, InClassDimStorage > copy_to_interleaved(void *(*allocate_fn)(size_t)=nullptr, void(*deallocate_fn)(void *)=nullptr) const
Like copy(), but the copy is created in interleaved memory layout (vs.
int device_and_host_malloc(const struct halide_device_interface_t *device_interface, void *ctx=nullptr)
int device_sync(void *ctx=nullptr)
static Buffer< void, Dims, InClassDimStorage > make_interleaved(halide_type_t t, int width, int height, int channels)
If you use the (x, y, c) indexing convention, then Halide Buffers are stored planar by default.
Buffer(const std::vector< int > &sizes)
Allocate a new image of known type using a vector of ints as the size.
void embed(int d, int pos=0)
Embed a buffer in-place, increasing the dimensionality.
static constexpr halide_type_t static_halide_type()
Get the Halide type of T.
Buffer(T *data, int first, Args &&...rest)
Initialize an Buffer from a pointer and some sizes.
int copy_to_device(const struct halide_device_interface_t *device_interface, void *ctx=nullptr)
Buffer(Array(&vals)[N])
Make an Buffer that refers to a statically sized array.
const halide_buffer_t * raw_buffer() const
HALIDE_ALWAYS_INLINE not_void_T & operator()(int first, Args... rest)
static Buffer< T, Dims, InClassDimStorage > make_interleaved(int width, int height, int channels)
If you use the (x, y, c) indexing convention, then Halide Buffers are stored planar by default.
halide_type_t type() const
Get the type of the elements.
int device_and_host_free(const struct halide_device_interface_t *device_interface, void *ctx=nullptr)
Buffer(int first)
Allocate a new image of the given size.
halide_buffer_t * raw_buffer()
Get a pointer to the raw halide_buffer_t this wraps.
T * end() const
A pointer to one beyond the element with the highest address.
HALIDE_ALWAYS_INLINE bool device_dirty() const
Buffer< T, Dims, InClassDimStorage > cropped(const std::vector< std::pair< int, int > > &rect) const
Make an image that refers to a sub-rectangle of this image along the first N dimensions.
static constexpr int static_dimensions()
Callers should not use the result if has_static_dimensions is false.
void transpose(int d1, int d2)
Transpose a buffer in-place by changing how it is indexed.
void deallocate()
Drop reference to any owned host or device memory, possibly freeing it, if this buffer held the last ...
size_t size_in_bytes() const
The total number of bytes spanned by the data in memory.
bool has_device_allocation() const
void reset()
Reset the Buffer to be equivalent to a default-constructed Buffer of the same static type (if any); B...
Buffer(halide_type_t t, int first, Args... rest)
Allocate a new image of the given size with a runtime type.
int dimensions() const
Get the dimensionality of the buffer.
Buffer(halide_type_t t, add_const_if_T_is_const< void > *data, int d, const halide_dimension_t *shape)
Initialize an Buffer from a pointer to the min coordinate and an array describing the shape.
int min(int i) const
Access to the mins, strides, extents.
HALIDE_ALWAYS_INLINE const Buffer< T, Dims, InClassDimStorage > & for_each_element(Fn &&f) const
Call a function at each site in a buffer.
void device_deallocate()
Drop reference to any owned device memory, possibly freeing it if this buffer held the last reference...
HALIDE_ALWAYS_INLINE const not_void_T & operator()() const
static Buffer< T, Dims, InClassDimStorage > make_scalar()
Make a zero-dimensional Buffer.
void add_dimension_with_stride(int s)
Add a new dimension with a min of zero, an extent of one, and the specified stride.
Buffer(Buffer< T, Dims, InClassDimStorage > &&other) noexcept
Move constructor.
Buffer< T, Dims, InClassDimStorage > cropped(int d, int min, int extent) const
Make an image that refers to a sub-range of this image along the given dimension.
void crop(int d, int min, int extent)
Crop an image in-place along the given dimension.
Buffer< T, Dims, InClassDimStorage > & fill(Fn &&f)
Fill a buffer by evaluating a callable at every site.
static Buffer< T, Dims, InClassDimStorage > make_scalar(T *data)
Make a zero-dimensional Buffer that points to non-owned, existing data.
Buffer< T, Dims, InClassDimStorage > alias() const
Make a copy of the Buffer which shares the underlying host and/or device allocations as the existing ...
void set_min(Args... args)
size_t number_of_elements() const
The total number of elements this buffer represents.
static void assert_can_convert_from(const Buffer< T2, D2, S2 > &other)
Fail an assertion at runtime or compile-time if an Buffer<T, Dims, InClassDimStorage> cannot be const...
void translate(const std::vector< int > &delta)
Translate an image along the first N dimensions by changing how it is indexed.
Buffer(const Buffer< T, Dims, InClassDimStorage > &other)
Copy constructor.
HALIDE_ALWAYS_INLINE not_void_T & operator()(const int *pos)
T * data() const
Get a pointer to the address of the min coordinate.
Buffer< T, Dims, InClassDimStorage > & fill(not_void_T val)
Buffer(const std::vector< int > &sizes, const std::vector< int > &storage_order)
Buffer< T, Dims, InClassDimStorage > & operator=(Buffer< T2, D2, S2 > &&other)
Move from another Buffer of possibly-different dimensionality and type.
Buffer(halide_type_t t, const std::vector< int > &sizes, const std::vector< int > &storage_order)
Allocate a new image of unknown type using a vector of ints as the size and a vector of indices indic...
Buffer(halide_type_t t, add_const_if_T_is_const< void > *data, const std::vector< halide_dimension_t > &shape)
Initialize a Buffer from a pointer to the min coordinate and a vector describing the shape.
static constexpr bool has_static_dimensions
Buffer< T,(Dims==AnyDims ? AnyDims :Dims - 1)> sliced(int d, int pos) const
Make a lower-dimensional buffer that refers to one slice of this buffer.
static Buffer< add_const_if_T_is_const< void >, Dims, InClassDimStorage > make_interleaved(halide_type_t t, T *data, int width, int height, int channels)
Wrap an existing interleaved image.
HALIDE_ALWAYS_INLINE const Buffer< T, Dims, InClassDimStorage > & for_each_value(Fn &&f, Args &&...other_buffers) const
Call a function on every value in the buffer, and the corresponding values in some number of other bu...
bool is_bounds_query() const
Check if an input buffer passed extern stage is a querying bounds.
Buffer< T,(Dims==AnyDims ? AnyDims :Dims - 1)> sliced(int d) const
Make a lower-dimensional buffer that refers to one slice of this buffer at the dimension's minimum.
int left() const
Conventional names for the min and max value of each dimension.
void copy_from(Buffer< T2, D2, S2 > src)
Fill a Buffer with the values at the same coordinates in another Buffer.
Buffer< T, Dims, InClassDimStorage > translated(int d, int dx) const
Make an image which refers to the same data with using translated coordinates in the given dimension.
int stride(int i) const
static Buffer< T, Dims, InClassDimStorage > make_interleaved(T *data, int width, int height, int channels)
Wrap an existing interleaved image.
static void set_default_deallocate_fn(void(*deallocate_fn)(void *))
static Buffer< T, Dims, InClassDimStorage > make_with_shape_of(Buffer< T2, D2, S2 > src, void *(*allocate_fn)(size_t)=nullptr, void(*deallocate_fn)(void *)=nullptr)
Make a buffer with the same shape and memory nesting order as another buffer.
Buffer(const Buffer< T2, D2, S2 > &other)
Construct a Buffer from a Buffer of different dimensionality and type.
bool contains(const std::vector< int > &coords) const
Test if a given coordinate is within the bounds of an image.
Buffer(T *data, const std::vector< halide_dimension_t > &shape)
Initialize a Buffer from a pointer to the min coordinate and a vector describing the shape.
Buffer(T *data, const std::vector< int > &sizes)
Initialize an Buffer from a pointer and a vector of sizes.
Buffer< T, Dims, InClassDimStorage > & operator=(const Buffer< T, Dims, InClassDimStorage > &other)
Standard assignment operator.
T * begin() const
A pointer to the element with the lowest address.
bool all_equal(not_void_T val) const
Tests that all values in this buffer are equal to val.
Buffer(halide_type_t t, add_const_if_T_is_const< void > *data, int first, Args &&...rest)
Initialize an Buffer of runtime type from a pointer and some sizes.
HALIDE_ALWAYS_INLINE Buffer< T2, D2, InClassDimStorage > & as() &
Return a typed reference to this Buffer.
HALIDE_ALWAYS_INLINE const Buffer< T2, D2, InClassDimStorage > & as() const &
Return a const typed reference to this Buffer.
static Buffer< add_const_if_T_is_const< void >, Dims, InClassDimStorage > make_scalar(halide_type_t t)
Make a zero-dimensional Buffer.
bool any_zero(const Container &c)
constexpr int AnyDims
BufferDeviceOwnership
This indicates how to deallocate the device for a Halide::Runtime::Buffer.
This file defines the class FunctionDAG, which is our representation of a Halide pipeline,...
@ Internal
Not visible externally, similar to 'static' linkage in C.
Expr max(const FuncRef &a, const FuncRef &b)
Definition Func.h:600
unsigned __INT64_TYPE__ uint64_t
__UINTPTR_TYPE__ uintptr_t
void * malloc(size_t)
ALWAYS_INLINE T align_up(T p, size_t alignment)
unsigned __INT8_TYPE__ uint8_t
__PTRDIFF_TYPE__ ptrdiff_t
unsigned __INT16_TYPE__ uint16_t
void * memcpy(void *s1, const void *s2, size_t n)
__SIZE_TYPE__ size_t
void * memset(void *s, int val, size_t n)
unsigned __INT32_TYPE__ uint32_t
void free(void *)
A struct acting as a header for allocations owned by the Buffer class itself.
AllocationHeader(void(*deallocate_fn)(void *))
An iterator class, so that you can iterate over coordinates in a dimensions using a range-based for l...
bool operator!=(const iterator &other) const
A similar struct for managing device allocations.
BufferDeviceOwnership ownership
The raw representation of an image passed around by generated Halide code.
int32_t dimensions
The dimensionality of the buffer.
halide_dimension_t * dim
The shape of the buffer.
uint64_t device
A device-handle for e.g.
uint8_t * host
A pointer to the start of the data in main memory.
struct halide_type_t type
The type of each buffer element.
const struct halide_device_interface_t * device_interface
The interface used to interpret the above handle.
Each GPU API provides a halide_device_interface_t struct pointing to the code that manages device all...
int(* device_slice)(void *user_context, const struct halide_buffer_t *src, int slice_dim, int slice_pos, struct halide_buffer_t *dst)
int(* device_and_host_malloc)(void *user_context, struct halide_buffer_t *buf, const struct halide_device_interface_t *device_interface)
int(* wrap_native)(void *user_context, struct halide_buffer_t *buf, uint64_t handle, const struct halide_device_interface_t *device_interface)
int(* device_release_crop)(void *user_context, struct halide_buffer_t *buf)
int(* device_crop)(void *user_context, const struct halide_buffer_t *src, struct halide_buffer_t *dst)
int(* copy_to_host)(void *user_context, struct halide_buffer_t *buf)
int(* copy_to_device)(void *user_context, struct halide_buffer_t *buf, const struct halide_device_interface_t *device_interface)
int(* device_free)(void *user_context, struct halide_buffer_t *buf)
int(* detach_native)(void *user_context, struct halide_buffer_t *buf)
int(* device_and_host_free)(void *user_context, struct halide_buffer_t *buf)
int(* device_malloc)(void *user_context, struct halide_buffer_t *buf, const struct halide_device_interface_t *device_interface)
A runtime tag for a type in the halide type system.