Halide 21.0.0
Halide compiler and libraries
HalideBuffer.h
1/** \file
2 * Defines a Buffer type that wraps halide_buffer_t and adds
3 * functionality, and methods for more conveniently iterating over the
4 * samples in a halide_buffer_t outside of Halide code. */
5
6#ifndef HALIDE_RUNTIME_BUFFER_H
7#define HALIDE_RUNTIME_BUFFER_H
8
9#include <algorithm>
10#include <atomic>
11#include <cassert>
12#include <cstdint>
13#include <cstdlib>
14#include <cstring>
15#include <limits>
16#include <memory>
17#include <type_traits>
18#include <vector>
19
20#ifdef __APPLE__
21#include <AvailabilityVersions.h>
22#include <TargetConditionals.h>
23#endif
24
25#if defined(__has_feature)
26#if __has_feature(memory_sanitizer)
27#include <sanitizer/msan_interface.h>
28#endif
29#endif
30
31#include "HalideRuntime.h"
32
33#ifdef _MSC_VER
34#include <malloc.h>
35#define HALIDE_ALLOCA _alloca
36#else
37#define HALIDE_ALLOCA __builtin_alloca
38#endif
39
40// gcc 5.1 has a false positive warning on this code
41#if __GNUC__ == 5 && __GNUC_MINOR__ == 1
42#pragma GCC diagnostic ignored "-Warray-bounds"
43#endif
44
45#ifndef HALIDE_RUNTIME_BUFFER_CHECK_INDICES
46#define HALIDE_RUNTIME_BUFFER_CHECK_INDICES 0
47#endif
48
49#ifndef HALIDE_RUNTIME_BUFFER_ALLOCATION_ALIGNMENT
50// Conservatively align buffer allocations to 128 bytes by default.
51// This is enough alignment for all the platforms currently in use.
52// Redefine this in your compiler settings if you desire more/less alignment.
53#define HALIDE_RUNTIME_BUFFER_ALLOCATION_ALIGNMENT 128
54#endif
55
57 "HALIDE_RUNTIME_BUFFER_ALLOCATION_ALIGNMENT must be a power of 2.");
58
59// Unfortunately, not all C++17 runtimes support aligned_alloc
60// (it may depend on OS/SDK version); this is provided as an opt-out
61// if you are compiling on a platform that doesn't provide a (good)
62// implementation. (Note that we actually use the C11 `::aligned_alloc()`
63// rather than the C++17 `std::aligned_alloc()` because at least one platform
64// we found supports the former but not the latter.)
65#ifndef HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC
66
67// clang-format off
68#ifdef _WIN32
69
70 // Windows (regardless of which compiler) doesn't implement aligned_alloc(),
71 // even in C++17 mode, and Microsoft has stated it probably never will: the
72 // problem is that free() would need to be able to free pointers returned by
73 // both malloc() and aligned_alloc(). So, always default it off here.
74 #define HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC 0
75
76#elif defined(__ANDROID_API__) && __ANDROID_API__ < 28
77
78 // Android doesn't provide aligned_alloc until API 28
79 #define HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC 0
80
81#elif defined(__APPLE__)
82
83 #if TARGET_OS_OSX && (__MAC_OS_X_VERSION_MIN_REQUIRED < __MAC_10_15)
84
85 // macOS doesn't provide aligned_alloc until 10.15
86 #define HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC 0
87
88 #elif TARGET_OS_IPHONE && (__IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_14_0)
89
90 // iOS doesn't provide aligned_alloc until 14.0
91 #define HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC 0
92
93 #else
94
95 // Assume it's ok on all other Apple targets
96 #define HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC 1
97
98 #endif
99
100#else
101
102 #if defined(__GLIBCXX__) && !defined(_GLIBCXX_HAVE_ALIGNED_ALLOC)
103
104 // ARM GNU-A baremetal compiler doesn't provide aligned_alloc as of 12.2
105 #define HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC 0
106
107 #else
108
109 // Not Windows, Android, or Apple: just assume it's ok
110 #define HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC 1
111
112 #endif
113
114#endif
115// clang-format on
116
117#endif // HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC
118
119namespace Halide {
120namespace Runtime {
121
122// Forward-declare our Buffer class
123template<typename T, int Dims, int InClassDimStorage>
124class Buffer;
125
126// A helper to check if a parameter pack is entirely implicitly
127// int-convertible to use with std::enable_if
128template<typename... Args>
129struct AllInts : std::false_type {};
130
131template<>
132struct AllInts<> : std::true_type {};
133
134template<typename T, typename... Args>
135struct AllInts<T, Args...> {
136 static const bool value = std::is_convertible<T, int>::value && AllInts<Args...>::value;
137};
138
139// Floats and doubles are technically implicitly int-convertible, but
140// doing so produces a warning we treat as an error, so just disallow
141// it here.
142template<typename... Args>
143struct AllInts<float, Args...> : std::false_type {};
144
145template<typename... Args>
146struct AllInts<double, Args...> : std::false_type {};
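// Editor's note: a minimal illustrative sketch (not part of the header) of how the
// AllInts trait above is typically used as an overload guard. The static_asserts
// below are assumptions about usage, not code from this file.
//
//   static_assert(AllInts<int, long, char>::value);   // all implicitly int-convertible
//   static_assert(!AllInts<int, float>::value);       // float is deliberately rejected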
147
148namespace Internal {
149// A helper to detect if there are any zeros in a container
150template<typename Container>
151bool any_zero(const Container &c) {
152 for (int i : c) {
153 if (i == 0) {
154 return true;
155 }
156 }
157 return false;
158}
159
160struct DefaultAllocatorFns {
161 static inline void *(*default_allocate_fn)(size_t) = nullptr;
162 static inline void (*default_deallocate_fn)(void *) = nullptr;
163};
164} // namespace Internal
165
166/** A struct acting as a header for allocations owned by the Buffer
167 * class itself. */
168struct AllocationHeader {
169 void (*deallocate_fn)(void *);
170 std::atomic<int> ref_count;
171
172 // Note that ref_count always starts at 1
173 explicit AllocationHeader(void (*deallocate_fn)(void *))
174 : deallocate_fn(deallocate_fn), ref_count(1) {
175 }
176};
177
178/** This indicates how to deallocate the device for a Halide::Runtime::Buffer. */
179enum struct BufferDeviceOwnership : int {
180 Allocated, ///< halide_device_free will be called when device ref count goes to zero
181 WrappedNative, ///< halide_device_detach_native will be called when device ref count goes to zero
182 Unmanaged, ///< No free routine will be called when device ref count goes to zero
183 AllocatedDeviceAndHost, ///< Call device_and_host_free when DevRefCount goes to zero.
184 Cropped, ///< Call halide_device_release_crop when DevRefCount goes to zero.
185};
186
187/** A similar struct for managing device allocations. */
188struct DeviceRefCount {
189 // This is only ever constructed when there's something to manage,
190 // so start at one.
191 std::atomic<int> count{1};
192 BufferDeviceOwnership ownership{BufferDeviceOwnership::Allocated};
193};
194
195constexpr int AnyDims = -1;
196
197/** A templated Buffer class that wraps halide_buffer_t and adds
198 * functionality. When using Halide from C++, this is the preferred
199 * way to create input and output buffers. The overhead of using this
200 * class relative to a naked halide_buffer_t is minimal - it uses another
201 * ~16 bytes on the stack, and does no dynamic allocations when using
202 * it to represent existing memory of a known maximum dimensionality.
203 *
204 * The template parameter T is the element type. For buffers where the
205 * element type is unknown, or may vary, use void or const void.
206 *
207 * The template parameter Dims is the number of dimensions. For buffers where
208 * the dimensionality is unknown at compile time, or may vary, use AnyDims.
209 *
210 * InClassDimStorage is the maximum number of dimensions that can be represented
211 * using space inside the class itself. Set it to the maximum dimensionality
212 * you expect this buffer to be. If the actual dimensionality exceeds
213 * this, heap storage is allocated to track the shape of the buffer.
214 * InClassDimStorage defaults to 4, which should cover nearly all usage.
215 *
216 * The class optionally allocates and owns memory for the image using
217 * a shared pointer allocated with the provided allocator. If they are
218 * null, malloc and free are used. Any device-side allocation is
219 * considered as owned if and only if the host-side allocation is
220 * owned. */
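// Editor's note: a minimal usage sketch (not part of the header). It assumes
// #include "HalideBuffer.h" and using namespace Halide::Runtime; element access via
// operator() is defined further down in this header.
//
//   Buffer<float> im(640, 480);                            // 2D float buffer, heap-allocated
//   Buffer<void> dyn(halide_type_of<float>(), 640, 480);   // same shape, type known only at runtime
//   Buffer<const float> view = im;                         // implicit conversion to a const view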
221template<typename T = void,
222 int Dims = AnyDims,
223 int InClassDimStorage = (Dims == AnyDims ? 4 : std::max(Dims, 1))>
224class Buffer {
225 /** The underlying halide_buffer_t */
226 halide_buffer_t buf = {};
227
228 /** Some in-class storage for shape of the dimensions. */
229 halide_dimension_t shape[InClassDimStorage];
230
231 /** The allocation owned by this Buffer. NULL if the Buffer does not
232 * own the memory. */
233 AllocationHeader *alloc = nullptr;
234
235 /** A reference count for the device allocation owned by this
236 * buffer. */
237 mutable DeviceRefCount *dev_ref_count = nullptr;
238
239 /** True if T is of type void or const void */
240 static const bool T_is_void = std::is_same<typename std::remove_const<T>::type, void>::value;
241
242 /** A type function that adds a const qualifier if T is a const type. */
243 template<typename T2>
244 using add_const_if_T_is_const = typename std::conditional<std::is_const<T>::value, const T2, T2>::type;
245
246 /** T unless T is (const) void, in which case (const)
247 * uint8_t. Useful for providing return types for operator() */
248 using not_void_T = typename std::conditional<T_is_void,
249 add_const_if_T_is_const<uint8_t>,
250 T>::type;
251
252 /** T with constness removed. Useful for return type of copy(). */
253 using not_const_T = typename std::remove_const<T>::type;
254
255 /** The type the elements are stored as. Equal to not_void_T
256 * unless T is a pointer, in which case uint64_t. Halide stores
257 * all pointer types as uint64s internally, even on 32-bit
258 * systems. */
259 using storage_T = typename std::conditional<std::is_pointer<T>::value, uint64_t, not_void_T>::type;
260
261public:
262 /** True if the Halide type is not void (or const void). */
263 static constexpr bool has_static_halide_type = !T_is_void;
264
265 /** Get the Halide type of T. Callers should not use the result if
266 * has_static_halide_type is false. */
267 static constexpr halide_type_t static_halide_type() {
268 return halide_type_of<typename std::remove_cv<not_void_T>::type>();
269 }
270
271 /** Does this Buffer own the host memory it refers to? */
272 bool owns_host_memory() const {
273 return alloc != nullptr;
274 }
275
276 static constexpr bool has_static_dimensions = (Dims != AnyDims);
277
278 /** Callers should not use the result if
279 * has_static_dimensions is false. */
280 static constexpr int static_dimensions() {
281 return Dims;
282 }
283
284 static_assert(!has_static_dimensions || static_dimensions() >= 0);
285
286private:
287 /** Increment the reference count of any owned allocation */
288 void incref() const {
289 if (owns_host_memory()) {
290 alloc->ref_count++;
291 }
292 if (buf.device) {
293 if (!dev_ref_count) {
294 // I seem to have a non-zero dev field but no
295 // reference count for it. I must have been given a
296 // device allocation by a Halide pipeline, and have
297 // never been copied from since. Take sole ownership
298 // of it.
299 dev_ref_count = new DeviceRefCount;
300 }
301 dev_ref_count->count++;
302 }
303 }
304
305 // Note that this is called "cropped" but it can also encompass a slice/embed
306 // operation.
307 struct DevRefCountCropped : DeviceRefCount {
308 // We will only store Buffers that have a dynamic number of dimensions.
309 // Buffers that are cropped or sliced from must first be converted to
310 // ones with a variable number of dimensions. This is required because we cannot possibly
311 // know what the actual dimensionality is of the buffer this is a
312 // crop or slice from. Since cropping a sliced buffer is also possible,
313 // no optimizations can be made for cropped buffers either.
314 Buffer<T, AnyDims> cropped_from;
315 explicit DevRefCountCropped(const Buffer<T, AnyDims> &cropped_from)
316 : cropped_from(cropped_from) {
317 ownership = BufferDeviceOwnership::Cropped;
318 }
319 };
320
321 /** Setup the device ref count for a buffer to indicate it is a crop (or slice, embed, etc) of cropped_from */
322 void crop_from(const Buffer<T, AnyDims> &cropped_from) {
323 assert(dev_ref_count == nullptr);
324 dev_ref_count = new DevRefCountCropped(cropped_from);
325 }
326
327 /** Decrement the reference count of any owned allocation and free host
328 * and device memory if it hits zero. Sets alloc to nullptr. */
329 void decref(bool device_only = false) {
330 if (owns_host_memory() && !device_only) {
331 int new_count = --(alloc->ref_count);
332 if (new_count == 0) {
333 void (*fn)(void *) = alloc->deallocate_fn;
334 alloc->~AllocationHeader();
335 fn(alloc);
336 }
337 buf.host = nullptr;
338 alloc = nullptr;
339 set_host_dirty(false);
340 }
341 int new_count = 0;
342 if (dev_ref_count) {
343 new_count = --(dev_ref_count->count);
344 }
345 if (new_count == 0) {
346 if (buf.device) {
347 assert(!(alloc && device_dirty()) &&
348 "Implicitly freeing a dirty device allocation while a host allocation still lives. "
349 "Call device_free explicitly if you want to drop dirty device-side data. "
350 "Call copy_to_host explicitly if you want the data copied to the host allocation "
351 "before the device allocation is freed.");
352 int result = halide_error_code_success;
353 if (dev_ref_count && dev_ref_count->ownership == BufferDeviceOwnership::WrappedNative) {
354 result = buf.device_interface->detach_native(nullptr, &buf);
355 } else if (dev_ref_count && dev_ref_count->ownership == BufferDeviceOwnership::AllocatedDeviceAndHost) {
356 result = buf.device_interface->device_and_host_free(nullptr, &buf);
357 } else if (dev_ref_count && dev_ref_count->ownership == BufferDeviceOwnership::Cropped) {
358 result = buf.device_interface->device_release_crop(nullptr, &buf);
359 } else if (dev_ref_count == nullptr || dev_ref_count->ownership == BufferDeviceOwnership::Allocated) {
360 result = buf.device_interface->device_free(nullptr, &buf);
361 }
362 // No reasonable way to return the error, but we can at least assert-fail in debug builds.
363 assert((result == halide_error_code_success) && "device_interface call returned a nonzero result in Buffer::decref()");
364 (void)result;
365 }
366 if (dev_ref_count) {
367 if (dev_ref_count->ownership == BufferDeviceOwnership::Cropped) {
368 delete (DevRefCountCropped *)dev_ref_count;
369 } else {
370 delete dev_ref_count;
371 }
372 }
373 }
374 dev_ref_count = nullptr;
375 buf.device = 0;
376 buf.device_interface = nullptr;
377 }
378
379 void free_shape_storage() {
380 if (buf.dim != shape) {
381 delete[] buf.dim;
382 buf.dim = nullptr;
383 }
384 }
385
386 template<int DimsSpecified>
387 void make_static_shape_storage() {
388 static_assert(Dims == AnyDims || Dims == DimsSpecified,
389 "Number of arguments to Buffer() does not match static dimensionality");
390 buf.dimensions = DimsSpecified;
391 if constexpr (Dims == AnyDims) {
392 if constexpr (DimsSpecified <= InClassDimStorage) {
393 buf.dim = shape;
394 } else {
395 static_assert(DimsSpecified >= 1);
396 buf.dim = new halide_dimension_t[DimsSpecified];
397 }
398 } else {
399 static_assert(InClassDimStorage >= Dims);
400 buf.dim = shape;
401 }
402 }
403
404 void make_shape_storage(const int dimensions) {
405 if (Dims != AnyDims && Dims != dimensions) {
406 assert(false && "Number of arguments to Buffer() does not match static dimensionality");
407 }
408 // This should usually be inlined, so if dimensions is statically known,
409 // we can skip the call to new
410 buf.dimensions = dimensions;
411 buf.dim = (dimensions <= InClassDimStorage) ? shape : new halide_dimension_t[dimensions];
412 }
413
414 void copy_shape_from(const halide_buffer_t &other) {
415 // All callers of this ensure that buf.dimensions == other.dimensions.
416 make_shape_storage(other.dimensions);
417 std::copy(other.dim, other.dim + other.dimensions, buf.dim);
418 }
419
420 template<typename T2, int D2, int S2>
421 void move_shape_from(Buffer<T2, D2, S2> &&other) {
422 if (other.shape == other.buf.dim) {
423 copy_shape_from(other.buf);
424 } else {
425 buf.dim = other.buf.dim;
426 other.buf.dim = nullptr;
427 }
428 other.buf = halide_buffer_t();
429 }
430
431 /** Initialize the shape from a halide_buffer_t. */
432 void initialize_from_buffer(const halide_buffer_t &b,
433 BufferDeviceOwnership ownership) {
434 memcpy(&buf, &b, sizeof(halide_buffer_t));
435 copy_shape_from(b);
436 if (b.device) {
437 dev_ref_count = new DeviceRefCount;
438 dev_ref_count->ownership = ownership;
439 }
440 }
441
442 /** Initialize the shape from an array of ints */
443 void initialize_shape(const int *sizes) {
444 for (int i = 0; i < buf.dimensions; i++) {
445 buf.dim[i].min = 0;
446 buf.dim[i].extent = sizes[i];
447 if (i == 0) {
448 buf.dim[i].stride = 1;
449 } else {
450 buf.dim[i].stride = buf.dim[i - 1].stride * buf.dim[i - 1].extent;
451 }
452 }
453 }
454
455 /** Initialize the shape from a vector of extents */
456 void initialize_shape(const std::vector<int> &sizes) {
457 assert(buf.dimensions == (int)sizes.size());
458 initialize_shape(sizes.data());
459 }
460
461 /** Initialize the shape from the static shape of an array */
462 template<typename Array, size_t N>
463 void initialize_shape_from_array_shape(int next, Array (&vals)[N]) {
464 buf.dim[next].min = 0;
465 buf.dim[next].extent = (int)N;
466 if (next == 0) {
467 buf.dim[next].stride = 1;
468 } else {
469 initialize_shape_from_array_shape(next - 1, vals[0]);
470 buf.dim[next].stride = buf.dim[next - 1].stride * buf.dim[next - 1].extent;
471 }
472 }
473
474 /** Base case for the template recursion above. */
475 template<typename T2>
476 void initialize_shape_from_array_shape(int, const T2 &) {
477 }
478
479 /** Get the dimensionality of a multi-dimensional C array */
480 template<typename Array, size_t N>
481 static int dimensionality_of_array(Array (&vals)[N]) {
482 return dimensionality_of_array(vals[0]) + 1;
483 }
484
485 template<typename T2>
486 static int dimensionality_of_array(const T2 &) {
487 return 0;
488 }
489
490 /** Get the underlying halide_type_t of an array's element type. */
491 template<typename Array, size_t N>
492 static halide_type_t scalar_type_of_array(Array (&vals)[N]) {
493 return scalar_type_of_array(vals[0]);
494 }
495
496 template<typename T2>
497 static halide_type_t scalar_type_of_array(const T2 &) {
498 return halide_type_of<typename std::remove_cv<T2>::type>();
499 }
500
501 /** Crop a single dimension without handling device allocation. */
502 void crop_host(int d, int min, int extent) {
503 assert(dim(d).min() <= min);
504 assert(dim(d).max() >= min + extent - 1);
505 ptrdiff_t shift = min - dim(d).min();
506 if (buf.host != nullptr) {
507 buf.host += (shift * dim(d).stride()) * type().bytes();
508 }
509 buf.dim[d].min = min;
510 buf.dim[d].extent = extent;
511 }
512
513 /** Crop as many dimensions as are in rect, without handling device allocation. */
514 void crop_host(const std::vector<std::pair<int, int>> &rect) {
515 assert(rect.size() <= static_cast<decltype(rect.size())>(std::numeric_limits<int>::max()));
516 int limit = (int)rect.size();
517 assert(limit <= dimensions());
518 for (int i = 0; i < limit; i++) {
519 crop_host(i, rect[i].first, rect[i].second);
520 }
521 }
522
523 void complete_device_crop(Buffer<T, Dims, InClassDimStorage> &result_host_cropped) const {
524 assert(buf.device_interface != nullptr);
525 if (buf.device_interface->device_crop(nullptr, &this->buf, &result_host_cropped.buf) == halide_error_code_success) {
526 // TODO: Figure out what to do if dev_ref_count is nullptr. Should incref logic run here?
527 // is it possible to get to this point without incref having run at least once since
528 // the device field was set? (I.e. in the internal logic of crop. incref might have been
529 // called.)
530 if (dev_ref_count != nullptr && dev_ref_count->ownership == BufferDeviceOwnership::Cropped) {
531 result_host_cropped.crop_from(((DevRefCountCropped *)dev_ref_count)->cropped_from);
532 } else {
533 result_host_cropped.crop_from(*this);
534 }
535 }
536 }
537
538 /** Slice a single dimension without handling device allocation. */
539 void slice_host(int d, int pos) {
540 static_assert(Dims == AnyDims);
541 assert(dimensions() > 0);
542 assert(d >= 0 && d < dimensions());
543 assert(pos >= dim(d).min() && pos <= dim(d).max());
544 buf.dimensions--;
545 ptrdiff_t shift = pos - buf.dim[d].min;
546 if (buf.host != nullptr) {
547 buf.host += (shift * buf.dim[d].stride) * type().bytes();
548 }
549 for (int i = d; i < buf.dimensions; i++) {
550 buf.dim[i] = buf.dim[i + 1];
551 }
552 buf.dim[buf.dimensions] = {0, 0, 0};
553 }
554
555 void complete_device_slice(Buffer<T, AnyDims, InClassDimStorage> &result_host_sliced, int d, int pos) const {
556 assert(buf.device_interface != nullptr);
557 if (buf.device_interface->device_slice(nullptr, &this->buf, d, pos, &result_host_sliced.buf) == halide_error_code_success) {
558 // TODO: Figure out what to do if dev_ref_count is nullptr. Should incref logic run here?
559 // is it possible to get to this point without incref having run at least once since
560 // the device field was set? (I.e. in the internal logic of slice. incref might have been
561 // called.)
562 if (dev_ref_count != nullptr && dev_ref_count->ownership == BufferDeviceOwnership::Cropped) {
563 // crop_from() is correct here, despite the fact that we are slicing.
564 result_host_sliced.crop_from(((DevRefCountCropped *)dev_ref_count)->cropped_from);
565 } else {
566 // crop_from() is correct here, despite the fact that we are slicing.
567 result_host_sliced.crop_from(*this);
568 }
569 }
570 }
571
572public:
573 typedef T ElemType;
574
575 /** Read-only access to the shape */
576 class Dimension {
577 const halide_dimension_t &d;
578
579 public:
580 /** The lowest coordinate in this dimension */
581 HALIDE_ALWAYS_INLINE int min() const {
582 return d.min;
583 }
584
585 /** The number of elements in memory you have to step over to
586 * increment this coordinate by one. */
587 HALIDE_ALWAYS_INLINE int stride() const {
588 return d.stride;
589 }
590
591 /** The extent of the image along this dimension */
592 HALIDE_ALWAYS_INLINE int extent() const {
593 return d.extent;
594 }
595
596 /** The highest coordinate in this dimension */
597 HALIDE_ALWAYS_INLINE int max() const {
598 return min() + extent() - 1;
599 }
600
601 /** An iterator class, so that you can iterate over
602 * coordinates in a dimensions using a range-based for loop. */
603 struct iterator {
604 int val;
605 int operator*() const {
606 return val;
607 }
608 bool operator!=(const iterator &other) const {
609 return val != other.val;
610 }
611 iterator &operator++() {
612 val++;
613 return *this;
614 }
615 };
616
617 /** An iterator that points to the min coordinate */
618 HALIDE_ALWAYS_INLINE iterator begin() const {
619 return {min()};
620 }
621
622 /** An iterator that points to one past the max coordinate */
623 HALIDE_ALWAYS_INLINE iterator end() const {
624 return {min() + extent()};
625 }
626
627 explicit Dimension(const halide_dimension_t &dim)
628 : d(dim) {
629 }
630 };
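// Editor's note: an illustrative sketch (not part of the header) of the
// range-based-for support above; `im` is an assumed two-dimensional Buffer.
//
//   for (int y : im.dim(1)) {
//       for (int x : im.dim(0)) {
//           // x runs from im.dim(0).min() to im.dim(0).max() inclusive
//       }
//   }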
631
632 /** Access the shape of the buffer */
633 HALIDE_ALWAYS_INLINE Dimension dim(int i) const {
634 assert(i >= 0 && i < this->dimensions());
635 return Dimension(buf.dim[i]);
636 }
637
638 /** Access to the mins, strides, extents. Will be deprecated. Do not use. */
639 // @{
640 int min(int i) const {
641 return dim(i).min();
642 }
643 int extent(int i) const {
644 return dim(i).extent();
645 }
646 int stride(int i) const {
647 return dim(i).stride();
648 }
649 // @}
650
651 /** The total number of elements this buffer represents. Equal to
652 * the product of the extents */
653 size_t number_of_elements() const {
654 return buf.number_of_elements();
655 }
656
657 /** Get the dimensionality of the buffer. */
658 int dimensions() const {
659 if constexpr (has_static_dimensions) {
660 return Dims;
661 } else {
662 return buf.dimensions;
663 }
664 }
665
666 /** Get the type of the elements. */
667 halide_type_t type() const {
668 return buf.type;
669 }
670
671 /** A pointer to the element with the lowest address. If all
672 * strides are positive, equal to the host pointer. */
673 T *begin() const {
674 assert(buf.host != nullptr); // Cannot call begin() on an unallocated Buffer.
675 return (T *)buf.begin();
676 }
677
678 /** A pointer to one beyond the element with the highest address. */
679 T *end() const {
680 assert(buf.host != nullptr); // Cannot call end() on an unallocated Buffer.
681 return (T *)buf.end();
682 }
683
684 /** The total number of bytes spanned by the data in memory. */
685 size_t size_in_bytes() const {
686 return buf.size_in_bytes();
687 }
688
689 /** Reset the Buffer to be equivalent to a default-constructed Buffer
690 * of the same static type (if any); Buffer<void> will have its runtime
691 * type reset to uint8. */
692 void reset() {
693 *this = Buffer();
694 }
695
696 Buffer()
697 : shape() {
698 buf.type = static_halide_type();
699 // If Dims is statically known, create storage for that many dimensions;
700 // otherwise, make a zero-dimensional buffer.
701 constexpr int buf_dimensions = (Dims == AnyDims) ? 0 : Dims;
702 make_static_shape_storage<buf_dimensions>();
703 }
704
705 /** Make a Buffer from a halide_buffer_t */
706 explicit Buffer(const halide_buffer_t &buf,
707 BufferDeviceOwnership ownership = BufferDeviceOwnership::Unmanaged) {
708 assert(T_is_void || buf.type == static_halide_type());
709 initialize_from_buffer(buf, ownership);
710 }
711
712 /** Give Buffers access to the members of Buffers of different dimensionalities and types. */
713 template<typename T2, int D2, int S2>
714 friend class Buffer;
715
716private:
717 template<typename T2, int D2, int S2>
718 static void static_assert_can_convert_from() {
719 static_assert((!std::is_const<T2>::value || std::is_const<T>::value),
720 "Can't convert from a Buffer<const T> to a Buffer<T>");
721 static_assert(std::is_same<typename std::remove_const<T>::type,
722 typename std::remove_const<T2>::type>::value ||
723 T_is_void || Buffer<T2, D2, S2>::T_is_void,
724 "type mismatch constructing Buffer");
725 static_assert(Dims == AnyDims || D2 == AnyDims || Dims == D2,
726 "Can't convert from a Buffer with static dimensionality to a Buffer with different static dimensionality");
727 }
728
729public:
730 static void set_default_allocate_fn(void *(*allocate_fn)(size_t)) {
731 Internal::DefaultAllocatorFns::default_allocate_fn = allocate_fn;
732 }
733 static void set_default_deallocate_fn(void (*deallocate_fn)(void *)) {
734 Internal::DefaultAllocatorFns::default_deallocate_fn = deallocate_fn;
735 }
736
737 /** Determine if a Buffer<T, Dims, InClassDimStorage> can be constructed from some other Buffer type.
738 * If this can be determined at compile time, fail with a static assert; otherwise
739 * return a boolean based on runtime typing. */
740 template<typename T2, int D2, int S2>
741 static bool can_convert_from(const Buffer<T2, D2, S2> &other) {
742 static_assert_can_convert_from<T2, D2, S2>();
743 if (Buffer<T2, D2, S2>::T_is_void && !T_is_void) {
744 if (other.type() != static_halide_type()) {
745 return false;
746 }
747 }
748 if (Dims != AnyDims) {
749 if (other.dimensions() != Dims) {
750 return false;
751 }
752 }
753 return true;
754 }
755
756 /** Fail an assertion at runtime or compile-time if a Buffer<T, Dims, InClassDimStorage>
757 * cannot be constructed from some other Buffer type. */
758 template<typename T2, int D2, int S2>
759 static void assert_can_convert_from(const Buffer<T2, D2, S2> &other) {
760 // Explicitly call static_assert_can_convert_from() here so
761 // that we always get compile-time checking, even if compiling with
762 // assertions disabled.
763 static_assert_can_convert_from<T2, D2, S2>();
764 assert(can_convert_from(other));
765 }
766
767 /** Copy constructor. Does not copy underlying data. */
768 Buffer(const Buffer<T, Dims, InClassDimStorage> &other)
769 : buf(other.buf),
770 alloc(other.alloc) {
771 other.incref();
772 dev_ref_count = other.dev_ref_count;
773 copy_shape_from(other.buf);
774 }
775
776 /** Construct a Buffer from a Buffer of different dimensionality
777 * and type. Asserts that the type and dimensionality matches (at runtime,
778 * if one of the types is void). Note that this constructor is
779 * implicit. This, for example, lets you pass things like
780 * Buffer<T> or Buffer<const void> to functions expecting
781 * Buffer<const T>. */
782 template<typename T2, int D2, int S2>
783 Buffer(const Buffer<T2, D2, S2> &other)
784 : buf(other.buf),
785 alloc(other.alloc) {
786 assert_can_convert_from(other);
787 other.incref();
788 dev_ref_count = other.dev_ref_count;
789 copy_shape_from(other.buf);
790 }
791
792 /** Move constructor */
793 Buffer(Buffer<T, Dims, InClassDimStorage> &&other) noexcept
794 : buf(other.buf),
795 alloc(other.alloc),
796 dev_ref_count(other.dev_ref_count) {
797 other.dev_ref_count = nullptr;
798 other.alloc = nullptr;
799 move_shape_from(std::forward<Buffer<T, Dims, InClassDimStorage>>(other));
800 }
801
802 /** Move-construct a Buffer from a Buffer of different
803 * dimensionality and type. Asserts that the types match (at
804 * runtime if one of the types is void). */
805 template<typename T2, int D2, int S2>
806 Buffer(Buffer<T2, D2, S2> &&other)
807 : buf(other.buf),
808 alloc(other.alloc),
809 dev_ref_count(other.dev_ref_count) {
810 assert_can_convert_from(other);
811 other.dev_ref_count = nullptr;
812 other.alloc = nullptr;
813 move_shape_from(std::forward<Buffer<T2, D2, S2>>(other));
814 }
815
816 /** Assign from another Buffer of possibly-different
817 * dimensionality and type. Asserts that the types match (at
818 * runtime if one of the types is void). */
819 template<typename T2, int D2, int S2>
820 Buffer<T, Dims, InClassDimStorage> &operator=(const Buffer<T2, D2, S2> &other) {
821 if ((const void *)this == (const void *)&other) {
822 return *this;
823 }
824 assert_can_convert_from(other);
825 other.incref();
826 decref();
827 dev_ref_count = other.dev_ref_count;
828 alloc = other.alloc;
829 free_shape_storage();
830 buf = other.buf;
831 copy_shape_from(other.buf);
832 return *this;
833 }
834
835 /** Standard assignment operator */
836 Buffer<T, Dims, InClassDimStorage> &operator=(const Buffer<T, Dims, InClassDimStorage> &other) {
837 // The cast to void* here is just to satisfy clang-tidy
838 if ((const void *)this == (const void *)&other) {
839 return *this;
840 }
841 other.incref();
842 decref();
843 dev_ref_count = other.dev_ref_count;
844 alloc = other.alloc;
845 free_shape_storage();
846 buf = other.buf;
847 copy_shape_from(other.buf);
848 return *this;
849 }
850
851 /** Move from another Buffer of possibly-different
852 * dimensionality and type. Asserts that the types match (at
853 * runtime if one of the types is void). */
854 template<typename T2, int D2, int S2>
855 Buffer<T, Dims, InClassDimStorage> &operator=(Buffer<T2, D2, S2> &&other) {
856 assert_can_convert_from(other);
857 decref();
858 alloc = other.alloc;
859 other.alloc = nullptr;
860 dev_ref_count = other.dev_ref_count;
861 other.dev_ref_count = nullptr;
862 free_shape_storage();
863 buf = other.buf;
864 move_shape_from(std::forward<Buffer<T2, D2, S2>>(other));
865 return *this;
866 }
867
868 /** Standard move-assignment operator */
869 Buffer<T, Dims, InClassDimStorage> &operator=(Buffer<T, Dims, InClassDimStorage> &&other) noexcept {
870 decref();
871 alloc = other.alloc;
872 other.alloc = nullptr;
873 dev_ref_count = other.dev_ref_count;
874 other.dev_ref_count = nullptr;
875 free_shape_storage();
876 buf = other.buf;
877 move_shape_from(std::forward<Buffer<T, Dims, InClassDimStorage>>(other));
878 return *this;
879 }
880
881 /** Check that the product of the extents fits in memory. */
882 void check_overflow() {
883 size_t size = type().bytes();
884 for (int i = 0; i < dimensions(); i++) {
885 size *= dim(i).extent();
886 }
887 // We allow 2^31 or 2^63 bytes, so drop the top bit.
888 size = (size << 1) >> 1;
889 for (int i = 0; i < dimensions(); i++) {
890 size /= dim(i).extent();
891 }
892 assert(size == (size_t)type().bytes() && "Error: Overflow computing total size of buffer.");
893 }
894
895 /** Allocate memory for this Buffer. Drops the reference to any
896 * owned memory. */
897 void allocate(void *(*allocate_fn)(size_t) = nullptr,
898 void (*deallocate_fn)(void *) = nullptr) {
899 // Drop any existing allocation
900 deallocate();
901
902 // Conservatively align images to (usually) 128 bytes. This is enough
903 // alignment for all the platforms we might use. Also ensure that the allocation
904 // is such that the logical size is an integral multiple of 128 bytes (or a bit more).
905 constexpr size_t alignment = HALIDE_RUNTIME_BUFFER_ALLOCATION_ALIGNMENT;
906
907 const auto align_up = [=](size_t value) -> size_t {
908 return (value + alignment - 1) & ~(alignment - 1);
909 };
910
911 size_t size = size_in_bytes();
912
913#if HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC
914 // Only use aligned_alloc() if no custom allocators are specified.
915 if (!allocate_fn && !deallocate_fn) {
916 // As a practical matter, sizeof(AllocationHeader) is going to be no more than 16 bytes
917 // on any supported platform, so we will just overallocate by 'alignment'
918 // so that the user storage also starts at an aligned point. This is a bit
919 // wasteful, but probably not a big deal.
920 static_assert(sizeof(AllocationHeader) <= alignment);
921 void *alloc_storage = ::aligned_alloc(alignment, align_up(size) + alignment);
922 assert((uintptr_t)alloc_storage == align_up((uintptr_t)alloc_storage));
923 alloc = new (alloc_storage) AllocationHeader(free);
924 buf.host = (uint8_t *)((uintptr_t)alloc_storage + alignment);
925 return;
926 }
927 // else fall thru
928#endif
929 if (!allocate_fn) {
930 allocate_fn = Internal::DefaultAllocatorFns::default_allocate_fn;
931 if (!allocate_fn) {
932 allocate_fn = malloc;
933 }
934 }
935 if (!deallocate_fn) {
936 deallocate_fn = Internal::DefaultAllocatorFns::default_deallocate_fn;
937 if (!deallocate_fn) {
938 deallocate_fn = free;
939 }
940 }
941
942 static_assert(sizeof(AllocationHeader) <= alignment);
943
944 // malloc() and friends must return a pointer aligned to at least alignof(std::max_align_t);
945 // make sure this is OK for AllocationHeader, since it always goes at the start
946 static_assert(alignof(AllocationHeader) <= alignof(std::max_align_t));
947
948 const size_t requested_size = align_up(size + alignment +
949 std::max(0, (int)sizeof(AllocationHeader) -
950 (int)sizeof(std::max_align_t)));
951 void *alloc_storage = allocate_fn(requested_size);
952 alloc = new (alloc_storage) AllocationHeader(deallocate_fn);
953 uint8_t *unaligned_ptr = ((uint8_t *)alloc) + sizeof(AllocationHeader);
954 buf.host = (uint8_t *)align_up((uintptr_t)unaligned_ptr);
955 }
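// Editor's note: a hedged sketch (not part of the header) of deferred allocation
// with custom allocation functions. `my_malloc` and `my_free` are assumed to exist
// with the signatures void *(size_t) and void (void *).
//
//   Buffer<float> im(nullptr, 640, 480);   // describe the shape, but allocate nothing yet
//   im.allocate(my_malloc, my_free);       // now allocate aligned storage via the custom functions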
956
957 /** Drop reference to any owned host or device memory, possibly
958 * freeing it, if this buffer held the last reference to
959 * it. Retains the shape of the buffer. Does nothing if this
960 * buffer did not allocate its own memory. */
961 void deallocate() {
962 decref();
963 }
964
965 /** Drop reference to any owned device memory, possibly freeing it
966 * if this buffer held the last reference to it. Asserts that
967 * device_dirty is false. */
968 void device_deallocate() {
969 decref(true);
970 }
971
972 /** Allocate a new image of the given size with a runtime
973 * type. Only used when you do know what size you want but you
974 * don't know statically what type the elements are. Pass zeros
975 * to make a buffer suitable for bounds query calls. */
976 template<typename... Args,
977 typename = typename std::enable_if<AllInts<Args...>::value>::type>
978 Buffer(halide_type_t t, int first, Args... rest) {
979 if (!T_is_void) {
980 assert(static_halide_type() == t);
981 }
982 int extents[] = {first, (int)rest...};
983 buf.type = t;
984 constexpr int buf_dimensions = 1 + (int)(sizeof...(rest));
985 make_static_shape_storage<buf_dimensions>();
986 initialize_shape(extents);
987 if (!Internal::any_zero(extents)) {
988 check_overflow();
989 allocate();
990 }
991 }
992
993 /** Allocate a new image of the given size. Pass zeros to make a
994 * buffer suitable for bounds query calls. */
995 // @{
996
997 // The overload with one argument is 'explicit', so that
998 // (say) int is not implicitly convertible to Buffer<int>
999 explicit Buffer(int first) {
1000 static_assert(!T_is_void,
1001 "To construct an Buffer<void>, pass a halide_type_t as the first argument to the constructor");
1002 int extents[] = {first};
1003 buf.type = static_halide_type();
1004 constexpr int buf_dimensions = 1;
1005 make_static_shape_storage<buf_dimensions>();
1006 initialize_shape(extents);
1007 if (first != 0) {
1008 check_overflow();
1009 allocate();
1010 }
1011 }
1012
1013 template<typename... Args,
1014 typename = typename std::enable_if<AllInts<Args...>::value>::type>
1015 Buffer(int first, int second, Args... rest) {
1016 static_assert(!T_is_void,
1017 "To construct an Buffer<void>, pass a halide_type_t as the first argument to the constructor");
1018 int extents[] = {first, second, (int)rest...};
1019 buf.type = static_halide_type();
1020 constexpr int buf_dimensions = 2 + (int)(sizeof...(rest));
1021 make_static_shape_storage<buf_dimensions>();
1022 initialize_shape(extents);
1023 if (!Internal::any_zero(extents)) {
1024 check_overflow();
1025 allocate();
1026 }
1027 }
1028 // @}
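// Editor's note: illustrative sketches (not part of the header). A buffer whose
// extents are all zero allocates nothing and is suitable for bounds-query calls.
//
//   Buffer<uint8_t> rgb(1920, 1080, 3);   // fully allocated
//   Buffer<uint8_t> query(0, 0, 0);       // shape-only buffer for a bounds query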
1029
1030 /** Allocate a new image of unknown type using a vector of ints as the size. */
1031 Buffer(halide_type_t t, const std::vector<int> &sizes) {
1032 if (!T_is_void) {
1033 assert(static_halide_type() == t);
1034 }
1035 buf.type = t;
1036 // make_shape_storage() will do a runtime check that dimensionality matches.
1037 make_shape_storage((int)sizes.size());
1038 initialize_shape(sizes);
1039 if (!Internal::any_zero(sizes)) {
1040 check_overflow();
1041 allocate();
1042 }
1043 }
1044
1045 /** Allocate a new image of known type using a vector of ints as the size. */
1046 explicit Buffer(const std::vector<int> &sizes)
1047 : Buffer(static_halide_type(), sizes) {
1048 }
1049
1050private:
1051 // Create a copy of the sizes vector, ordered as specified by order.
1052 static std::vector<int> make_ordered_sizes(const std::vector<int> &sizes, const std::vector<int> &order) {
1053 assert(order.size() == sizes.size());
1054 std::vector<int> ordered_sizes(sizes.size());
1055 for (size_t i = 0; i < sizes.size(); ++i) {
1056 ordered_sizes[i] = sizes.at(order[i]);
1057 }
1058 return ordered_sizes;
1059 }
1060
1061public:
1062 /** Allocate a new image of unknown type using a vector of ints as the size and
1063 * a vector of indices indicating the storage order for each dimension. The
1064 * length of the sizes vector and the storage-order vector must match. For instance,
1065 * to allocate an interleaved RGB buffer, you would pass {2, 0, 1} for storage_order. */
1066 Buffer(halide_type_t t, const std::vector<int> &sizes, const std::vector<int> &storage_order)
1067 : Buffer(t, make_ordered_sizes(sizes, storage_order)) {
1068 transpose(storage_order);
1069 }
1070
1071 Buffer(const std::vector<int> &sizes, const std::vector<int> &storage_order)
1072 : Buffer(static_halide_type(), sizes, storage_order) {
1073 }
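// Editor's note: an illustrative sketch (not part of the header) of the
// storage_order constructors above. An interleaved RGB image stores the channel
// dimension innermost, so its stride is 1.
//
//   Buffer<uint8_t> rgb({640, 480, 3}, {2, 0, 1});
//   // rgb.dim(2).stride() == 1, rgb.dim(0).stride() == 3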
1074
1075 /** Make a Buffer that refers to a statically sized array. Does not
1076 * take ownership of the data, and does not set the host_dirty flag. */
1077 template<typename Array, size_t N>
1078 explicit Buffer(Array (&vals)[N]) {
1079 const int buf_dimensions = dimensionality_of_array(vals);
1080 buf.type = scalar_type_of_array(vals);
1081 buf.host = (uint8_t *)vals;
1082 make_shape_storage(buf_dimensions);
1083 initialize_shape_from_array_shape(buf.dimensions - 1, vals);
1084 }
1085
1086 /** Initialize a Buffer of runtime type from a pointer and some
1087 * sizes. Assumes dense row-major packing and a min coordinate of
1088 * zero. Does not take ownership of the data and does not set the
1089 * host_dirty flag. */
1090 template<typename... Args,
1091 typename = typename std::enable_if<AllInts<Args...>::value>::type>
1092 explicit Buffer(halide_type_t t, add_const_if_T_is_const<void> *data, int first, Args &&...rest) {
1093 if (!T_is_void) {
1094 assert(static_halide_type() == t);
1095 }
1096 int extents[] = {first, (int)rest...};
1097 buf.type = t;
1098 buf.host = (uint8_t *)const_cast<void *>(data);
1099 constexpr int buf_dimensions = 1 + (int)(sizeof...(rest));
1100 make_static_shape_storage<buf_dimensions>();
1101 initialize_shape(extents);
1102 }
1103
1104 /** Initialize a Buffer from a pointer and some sizes. Assumes
1105 * dense row-major packing and a min coordinate of zero. Does not
1106 * take ownership of the data and does not set the host_dirty flag. */
1107 template<typename... Args,
1108 typename = typename std::enable_if<AllInts<Args...>::value>::type>
1109 explicit Buffer(T *data, int first, Args &&...rest) {
1110 int extents[] = {first, (int)rest...};
1111 buf.type = static_halide_type();
1112 buf.host = (uint8_t *)const_cast<typename std::remove_const<T>::type *>(data);
1113 constexpr int buf_dimensions = 1 + (int)(sizeof...(rest));
1114 make_static_shape_storage<buf_dimensions>();
1115 initialize_shape(extents);
1116 }
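// Editor's note: an illustrative sketch (not part of the header) of wrapping memory
// the caller already owns; `data` is an assumed pointer to at least 640*480 floats.
// The Buffer does not take ownership and never frees it.
//
//   float *data = ...;                   // assumed external allocation
//   Buffer<float> im(data, 640, 480);    // dense, row-major, min coordinate 0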
1117
1118 /** Initialize a Buffer from a pointer and a vector of
1119 * sizes. Assumes dense row-major packing and a min coordinate of
1120 * zero. Does not take ownership of the data and does not set the
1121 * host_dirty flag. */
1122 explicit Buffer(T *data, const std::vector<int> &sizes) {
1123 buf.type = static_halide_type();
1124 buf.host = (uint8_t *)const_cast<typename std::remove_const<T>::type *>(data);
1125 make_shape_storage((int)sizes.size());
1126 initialize_shape(sizes);
1127 }
1128
1129 /** Initialize a Buffer of runtime type from a pointer and a
1130 * vector of sizes. Assumes dense row-major packing and a min
1131 * coordinate of zero. Does not take ownership of the data and
1132 * does not set the host_dirty flag. */
1133 explicit Buffer(halide_type_t t, add_const_if_T_is_const<void> *data, const std::vector<int> &sizes) {
1134 if (!T_is_void) {
1135 assert(static_halide_type() == t);
1136 }
1137 buf.type = t;
1138 buf.host = (uint8_t *)const_cast<void *>(data);
1139 make_shape_storage((int)sizes.size());
1140 initialize_shape(sizes);
1141 }
1142
1143 /** Initialize an Buffer from a pointer to the min coordinate and
1144 * an array describing the shape. Does not take ownership of the
1145 * data, and does not set the host_dirty flag. */
1146 explicit Buffer(halide_type_t t, add_const_if_T_is_const<void> *data, int d, const halide_dimension_t *shape) {
1147 if (!T_is_void) {
1148 assert(static_halide_type() == t);
1149 }
1150 buf.type = t;
1151 buf.host = (uint8_t *)const_cast<void *>(data);
1152 make_shape_storage(d);
1153 for (int i = 0; i < d; i++) {
1154 buf.dim[i] = shape[i];
1155 }
1156 }
1157
1158 /** Initialize a Buffer from a pointer to the min coordinate and
1159 * a vector describing the shape. Does not take ownership of the
1160 * data, and does not set the host_dirty flag. */
1161 explicit Buffer(halide_type_t t, add_const_if_T_is_const<void> *data,
1162 const std::vector<halide_dimension_t> &shape)
1163 : Buffer(t, data, (int)shape.size(), shape.data()) {
1164 }
1165
1166 /** Initialize a Buffer from a pointer to the min coordinate and
1167 * an array describing the shape. Does not take ownership of the
1168 * data and does not set the host_dirty flag. */
1169 explicit Buffer(T *data, int d, const halide_dimension_t *shape) {
1170 buf.type = static_halide_type();
1171 buf.host = (uint8_t *)const_cast<typename std::remove_const<T>::type *>(data);
1172 make_shape_storage(d);
1173 for (int i = 0; i < d; i++) {
1174 buf.dim[i] = shape[i];
1175 }
1176 }
1177
1178 /** Initialize a Buffer from a pointer to the min coordinate and
1179 * a vector describing the shape. Does not take ownership of the
1180 * data, and does not set the host_dirty flag. */
1181 explicit Buffer(T *data, const std::vector<halide_dimension_t> &shape)
1182 : Buffer(data, (int)shape.size(), shape.data()) {
1183 }
1184
1185 /** Destructor. Will release any underlying owned allocation if
1186 * this is the last reference to it. Will assert fail if there are
1187 * weak references to this Buffer outstanding. */
1188 ~Buffer() {
1189 decref();
1190 free_shape_storage();
1191 }
1192
1193 /** Get a pointer to the raw halide_buffer_t this wraps. */
1194 // @{
1195 halide_buffer_t *raw_buffer() {
1196 return &buf;
1197 }
1198
1199 const halide_buffer_t *raw_buffer() const {
1200 return &buf;
1201 }
1202 // @}
1203
1204 /** Provide a cast operator to halide_buffer_t *, so that
1205 * instances can be passed directly to Halide filters. */
1206 operator halide_buffer_t *() {
1207 return &buf;
1208 }
1209
1210 /** Return a typed reference to this Buffer. Useful for converting
1211 * a reference to a Buffer<void> to a reference to, for example, a
1212 * Buffer<const uint8_t>, or converting a Buffer<T>& to Buffer<const T>&.
1213 * You can also optionally specify a new value for Dims; this is useful
1214 * mainly for removing the dimensionality constraint on a Buffer with
1215 * explicit dimensionality. Does a runtime assert if the source buffer type
1216 * is void or the new dimensionality is incompatible. */
1217 template<typename T2, int D2 = Dims>
1218 HALIDE_ALWAYS_INLINE Buffer<T2, D2, InClassDimStorage> &as() & {
1219 Buffer<T2, D2, InClassDimStorage>::assert_can_convert_from(*this);
1220 return *((Buffer<T2, D2, InClassDimStorage> *)this);
1221 }
1222
1223 /** Return a const typed reference to this Buffer. Useful for converting
1224 * a reference to a Buffer<void> to a reference to, for example, a
1225 * Buffer<const uint8_t>, or converting a Buffer<T>& to Buffer<const T>&.
1226 * You can also optionally specify a new value for Dims; this is useful
1227 * mainly for removing the dimensionality constraint on a Buffer with
1228 * explicit dimensionality. Does a runtime assert if the source buffer type
1229 * is void or the new dimensionality is incompatible. */
1230 template<typename T2, int D2 = Dims>
1231 HALIDE_ALWAYS_INLINE const Buffer<T2, D2, InClassDimStorage> &as() const & {
1232 Buffer<T2, D2, InClassDimStorage>::assert_can_convert_from(*this);
1233 return *((const Buffer<T2, D2, InClassDimStorage> *)this);
1234 }
1235
1236 /** Return an rval reference to this Buffer. Useful for converting
1237 * a reference to a Buffer<void> to a reference to, for example, a
1238 * Buffer<const uint8_t>, or converting a Buffer<T>& to Buffer<const T>&.
1239 * You can also optionally specify a new value for Dims; this is useful
1240 * mainly for removing the dimensionality constraint on a Buffer with
1241 * explicit dimensionality. Does a runtime assert if the source buffer type
1242 * is void or the new dimensionality is incompatible. */
1243 template<typename T2, int D2 = Dims>
1244 HALIDE_ALWAYS_INLINE Buffer<T2, D2, InClassDimStorage> as() && {
1245 Buffer<T2, D2, InClassDimStorage>::assert_can_convert_from(*this);
1246 return *((Buffer<T2, D2, InClassDimStorage> *)this);
1247 }
1248
1249 /** as_const() is syntactic sugar for .as<const T>(), to avoid the need
1250 * to recapitulate the type argument. */
1251 // @{
1252 HALIDE_ALWAYS_INLINE
1253 Buffer<typename std::add_const<T>::type, Dims, InClassDimStorage> &as_const() & {
1254 // Note that we can skip the assert_can_convert_from(), since T -> const T
1255 // conversion is always legal.
1256 return *((Buffer<typename std::add_const<T>::type, Dims, InClassDimStorage> *)this);
1257 }
1258
1259 HALIDE_ALWAYS_INLINE
1260 const Buffer<typename std::add_const<T>::type, Dims, InClassDimStorage> &as_const() const & {
1261 return *((const Buffer<typename std::add_const<T>::type, Dims, InClassDimStorage> *)this);
1262 }
1263
1264 HALIDE_ALWAYS_INLINE
1265 Buffer<typename std::add_const<T>::type, Dims, InClassDimStorage> as_const() && {
1266 return *((Buffer<typename std::add_const<T>::type, Dims, InClassDimStorage> *)this);
1267 }
1268 // @}
1269
1270 /** Add some syntactic sugar to allow autoconversion from Buffer<T> to Buffer<const T>& when
1271 * passing arguments */
1272 template<typename T2 = T, typename = typename std::enable_if<!std::is_const<T2>::value>::type>
1273 operator Buffer<typename std::add_const<T2>::type, Dims, InClassDimStorage> &() & {
1274 return as_const();
1275 }
1276
1277 /** Add some syntactic sugar to allow autoconversion from Buffer<T> to Buffer<void>& when
1278 * passing arguments */
1279 template<typename TVoid,
1280 typename T2 = T,
1281 typename = typename std::enable_if<std::is_same<TVoid, void>::value &&
1282 !std::is_void<T2>::value &&
1283 !std::is_const<T2>::value>::type>
1284 operator Buffer<TVoid, Dims, InClassDimStorage> &() & {
1285 return as<TVoid, Dims>();
1286 }
1287
1288 /** Add some syntactic sugar to allow autoconversion from Buffer<const T> to Buffer<const void>& when
1289 * passing arguments */
1290 template<typename TVoid,
1291 typename T2 = T,
1292 typename = typename std::enable_if<std::is_same<TVoid, void>::value &&
1293 !std::is_void<T2>::value &&
1294 std::is_const<T2>::value>::type>
1295 operator const Buffer<TVoid, Dims, InClassDimStorage> &() const & {
1296 return as<TVoid, Dims>();
1297 }
1298
1299 /** Conventional names for the first three dimensions. */
1300 // @{
1301 int width() const {
1302 return (dimensions() > 0) ? dim(0).extent() : 1;
1303 }
1304 int height() const {
1305 return (dimensions() > 1) ? dim(1).extent() : 1;
1306 }
1307 int channels() const {
1308 return (dimensions() > 2) ? dim(2).extent() : 1;
1309 }
1310 // @}
1311
1312 /** Conventional names for the min and max value of each dimension */
1313 // @{
1314 int left() const {
1315 return dim(0).min();
1316 }
1317
1318 int right() const {
1319 return dim(0).max();
1320 }
1321
1322 int top() const {
1323 return dim(1).min();
1324 }
1325
1326 int bottom() const {
1327 return dim(1).max();
1328 }
1329 // @}
1330
1331 /** Make a new image which is a deep copy of this image. Use crop
1332 * or slice followed by copy to make a copy of only a portion of
1333 * the image. The new image has the same nesting order of dimensions
1334 * (e.g. channels innermost), but resets the strides to the default
1335 * (each stride is the product of the extents of the inner dimensions).
1336 * Note that this means any strides of zero get broadcast into a non-zero stride.
1337 *
1338 * Note that the returned Buffer is always of a non-const type T (ie:
1339 *
1340 * Buffer<const T>.copy() -> Buffer<T> rather than Buffer<const T>
1341 *
1342 * which is always safe, since we are making a deep copy. (The caller
1343 * can easily cast it back to Buffer<const T> if desired, which is
1344 * always safe and free.)
1345 */
1346 Buffer<not_const_T, Dims, InClassDimStorage> copy(void *(*allocate_fn)(size_t) = nullptr,
1347 void (*deallocate_fn)(void *) = nullptr) const {
1348 Buffer<not_const_T, Dims, InClassDimStorage> dst = Buffer<not_const_T, Dims, InClassDimStorage>::make_with_shape_of(*this, allocate_fn, deallocate_fn);
1349 dst.copy_from(*this);
1350 return dst;
1351 }
1352
1353 /** Like copy(), but the copy is created in interleaved memory layout
1354 * (vs. keeping the same memory layout as the original). Requires that 'this'
1355 * has exactly 3 dimensions.
1356 */
1357 Buffer<not_const_T, Dims, InClassDimStorage> copy_to_interleaved(void *(*allocate_fn)(size_t) = nullptr,
1358 void (*deallocate_fn)(void *) = nullptr) const {
1359 static_assert(Dims == AnyDims || Dims == 3);
1360 assert(dimensions() == 3);
1361 Buffer<not_const_T, Dims, InClassDimStorage> dst = Buffer<not_const_T, Dims, InClassDimStorage>::make_interleaved(nullptr, width(), height(), channels());
1362 dst.set_min(min(0), min(1), min(2));
1363 dst.allocate(allocate_fn, deallocate_fn);
1364 dst.copy_from(*this);
1365 return dst;
1366 }
1367
1368 /** Like copy(), but the copy is created in planar memory layout
1369 * (vs. keeping the same memory layout as the original).
1370 */
1371 Buffer<not_const_T, Dims, InClassDimStorage> copy_to_planar(void *(*allocate_fn)(size_t) = nullptr,
1372 void (*deallocate_fn)(void *) = nullptr) const {
1373 std::vector<int> mins, extents;
1374 const int dims = dimensions();
1375 mins.reserve(dims);
1376 extents.reserve(dims);
1377 for (int d = 0; d < dims; ++d) {
1378 mins.push_back(dim(d).min());
1379 extents.push_back(dim(d).extent());
1380 }
1381 Buffer<not_const_T, Dims, InClassDimStorage> dst(type(), nullptr, extents);
1382 dst.set_min(mins);
1383 dst.allocate(allocate_fn, deallocate_fn);
1384 dst.copy_from(*this);
1385 return dst;
1386 }
1387
1388 /** Make a copy of the Buffer which shares the underlying host and/or device
1389 * allocations with the existing Buffer. This is purely syntactic sugar for
1390 * cases where you have a const reference to a Buffer but need a temporary
1391 * non-const copy (e.g. to make a call into AOT-generated Halide code), and want a terse
1392 * inline way to create a temporary. \code
1393 * void call_my_func(const Buffer<const uint8_t>& input) {
1394 * my_func(input.alias(), output);
1395 * }\endcode
1396 */
1397 Buffer<T, Dims, InClassDimStorage> alias() const {
1398 return *this;
1399 }
1400
1401 /** Fill a Buffer with the values at the same coordinates in
1402 * another Buffer. Restricts itself to coordinates contained
1403 * within the intersection of the two buffers. If the two Buffers
1404 * are not in the same coordinate system, you will need to
1405 * translate the argument Buffer first. E.g. if you're blitting a
1406 * sprite onto a framebuffer, you'll want to translate the sprite
1407 * to the correct location first like so: \code
1408 * framebuffer.copy_from(sprite.translated({x, y})); \endcode
1409 */
1410 template<typename T2, int D2, int S2>
1411 void copy_from(Buffer<T2, D2, S2> src) {
1412 static_assert(!std::is_const<T>::value, "Cannot call copy_from() on a Buffer<const T>");
1413 assert(!device_dirty() && "Cannot call Halide::Runtime::Buffer::copy_from on a device dirty destination.");
1414 assert(!src.device_dirty() && "Cannot call Halide::Runtime::Buffer::copy_from on a device dirty source.");
1415
1416 Buffer<T, Dims, InClassDimStorage> dst(*this);
1417
1418 static_assert(Dims == AnyDims || D2 == AnyDims || Dims == D2);
1419 assert(src.dimensions() == dst.dimensions());
1420
1421 // Trim the copy to the region in common
1422 const int d = dimensions();
1423 for (int i = 0; i < d; i++) {
1424 int min_coord = std::max(dst.dim(i).min(), src.dim(i).min());
1425 int max_coord = std::min(dst.dim(i).max(), src.dim(i).max());
1426 if (max_coord < min_coord) {
1427 // The buffers do not overlap.
1428 return;
1429 }
1430 dst.crop(i, min_coord, max_coord - min_coord + 1);
1431 src.crop(i, min_coord, max_coord - min_coord + 1);
1432 }
1433
1434 // If T is void, we need to do runtime dispatch to an
1435 // appropriately-typed lambda. We're copying, so we only care
1436 // about the element size. (If not, this should optimize away
1437 // into a static dispatch to the right-sized copy.)
1438 if (T_is_void ? (type().bytes() == 1) : (sizeof(not_void_T) == 1)) {
1439 using MemType = uint8_t;
1440 auto &typed_dst = reinterpret_cast<Buffer<MemType, Dims, InClassDimStorage> &>(dst);
1441 auto &typed_src = reinterpret_cast<Buffer<const MemType, D2, S2> &>(src);
1442 typed_dst.for_each_value([&](MemType &dst, MemType src) { dst = src; }, typed_src);
1443 } else if (T_is_void ? (type().bytes() == 2) : (sizeof(not_void_T) == 2)) {
1444 using MemType = uint16_t;
1445 auto &typed_dst = reinterpret_cast<Buffer<MemType, Dims, InClassDimStorage> &>(dst);
1446 auto &typed_src = reinterpret_cast<Buffer<const MemType, D2, S2> &>(src);
1447 typed_dst.for_each_value([&](MemType &dst, MemType src) { dst = src; }, typed_src);
1448 } else if (T_is_void ? (type().bytes() == 4) : (sizeof(not_void_T) == 4)) {
1449 using MemType = uint32_t;
1450 auto &typed_dst = reinterpret_cast<Buffer<MemType, Dims, InClassDimStorage> &>(dst);
1451 auto &typed_src = reinterpret_cast<Buffer<const MemType, D2, S2> &>(src);
1452 typed_dst.for_each_value([&](MemType &dst, MemType src) { dst = src; }, typed_src);
1453 } else if (T_is_void ? (type().bytes() == 8) : (sizeof(not_void_T) == 8)) {
1454 using MemType = uint64_t;
1455 auto &typed_dst = reinterpret_cast<Buffer<MemType, Dims, InClassDimStorage> &>(dst);
1456 auto &typed_src = reinterpret_cast<Buffer<const MemType, D2, S2> &>(src);
1457 typed_dst.for_each_value([&](MemType &dst, MemType src) { dst = src; }, typed_src);
1458 } else {
1459 assert(false && "type().bytes() must be 1, 2, 4, or 8");
1460 }
1461 set_host_dirty();
1462 }
1463
1464 /** Make an image that refers to a sub-range of this image along
1465 * the given dimension. Asserts that the crop region is within
1466 * the existing bounds: you cannot "crop outwards", even if you know there
1467 * is valid Buffer storage (e.g. because you already cropped inwards). */
1468 Buffer<T, Dims, InClassDimStorage> cropped(int d, int min, int extent) const {
1469 // Make a fresh copy of the underlying buffer (but not a fresh
1470 // copy of the allocation, if there is one).
1471 Buffer<T, Dims, InClassDimStorage> im = *this;
1472
1473 // This guarantees the preexisting device ref is dropped if the
1474 // device_crop call fails and maintains the buffer in a consistent
1475 // state.
1476 im.device_deallocate();
1477
1478 im.crop_host(d, min, extent);
1479 if (buf.device_interface != nullptr) {
1480 complete_device_crop(im);
1481 }
1482 return im;
1483 }
1484
1485 /** Crop an image in-place along the given dimension. This does
1486 * not move any data around in memory - it just changes the min
1487 * and extent of the given dimension. */
1488 void crop(int d, int min, int extent) {
1489 // An optimization for non-device buffers. For the device case,
1490 // a temp buffer is required, so reuse the not-in-place version.
1491 // TODO(zalman|abadams): Are nop crops common enough to special
1492 // case the device part of the if to do nothing?
1493 if (buf.device_interface != nullptr) {
1494 *this = cropped(d, min, extent);
1495 } else {
1496 crop_host(d, min, extent);
1497 }
1498 }
1499
1500 /** Make an image that refers to a sub-rectangle of this image along
1501 * the first N dimensions. Asserts that the crop region is within
1502 * the existing bounds. The cropped image may drop any device handle
1503 * if the device_interface cannot accomplish the crop in-place. */
1504 Buffer<T, Dims, InClassDimStorage> cropped(const std::vector<std::pair<int, int>> &rect) const {
1505 // Make a fresh copy of the underlying buffer (but not a fresh
1506 // copy of the allocation, if there is one).
1507 Buffer<T, Dims, InClassDimStorage> im = *this;
1508
1509 // This guarantees the preexisting device ref is dropped if the
1510 // device_crop call fails and maintains the buffer in a consistent
1511 // state.
1512 im.device_deallocate();
1513
1514 im.crop_host(rect);
1515 if (buf.device_interface != nullptr) {
1516 complete_device_crop(im);
1517 }
1518 return im;
1519 }
1520
1521 /** Crop an image in-place along the first N dimensions. This does
1522 * not move any data around in memory, nor does it free memory. It
1523 * just rewrites the min/extent of each dimension to refer to a
1524 * subregion of the same allocation. */
1525 void crop(const std::vector<std::pair<int, int>> &rect) {
1526 // An optimization for non-device buffers. For the device case,
1527 // a temp buffer is required, so reuse the not-in-place version.
1528 // TODO(zalman|abadams): Are nop crops common enough to special
1529 // case the device part of the if to do nothing?
1530 if (buf.device_interface != nullptr) {
1531 *this = cropped(rect);
1532 } else {
1533 crop_host(rect);
1534 }
1535 }
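// Editor's note: an illustrative sketch (not part of the header) of the crop calls
// above; `im` is an assumed 2D Buffer<uint8_t> that contains the requested region.
//
//   Buffer<uint8_t> tile = im.cropped(0, 32, 64).cropped(1, 32, 64);  // new 64x64 view
//   im.crop({{32, 64}, {32, 64}});  // same region, but rewrites im in place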
1536
1537 /** Make an image which refers to the same data using
1538 * translated coordinates in the given dimension. Positive values
1539 * move the image data to the right or down relative to the
1540 * coordinate system. Drops any device handle. */
1541 Buffer<T, Dims, InClassDimStorage> translated(int d, int dx) const {
1542 Buffer<T, Dims, InClassDimStorage> im = *this;
1543 im.translate(d, dx);
1544 return im;
1545 }
1546
1547 /** Translate an image in-place along one dimension by changing
1548 * how it is indexed. Does not move any data around in memory. */
1549 void translate(int d, int delta) {
1550 assert(d >= 0 && d < this->dimensions());
1551 device_deallocate();
1552 buf.dim[d].min += delta;
1553 }
1554
1555 /** Make an image which refers to the same data translated along
1556 * the first N dimensions. */
1557 Buffer<T, Dims, InClassDimStorage> translated(const std::vector<int> &delta) const {
1558 Buffer<T, Dims, InClassDimStorage> im = *this;
1559 im.translate(delta);
1560 return im;
1561 }
1562
1563 /** Translate an image along the first N dimensions by changing
1564 * how it is indexed. Does not move any data around in memory. */
1565 void translate(const std::vector<int> &delta) {
1566 device_deallocate();
1567 assert(delta.size() <= static_cast<decltype(delta.size())>(std::numeric_limits<int>::max()));
1568 int limit = (int)delta.size();
1569 assert(limit <= dimensions());
1570 for (int i = 0; i < limit; i++) {
1571 translate(i, delta[i]);
1572 }
1573 }
1574
1575 /** Set the min coordinate of an image in the first N dimensions. */
1576 // @{
1577 void set_min(const std::vector<int> &mins) {
1578 assert(mins.size() <= static_cast<decltype(mins.size())>(dimensions()));
1579 device_deallocate();
1580 for (size_t i = 0; i < mins.size(); i++) {
1581 buf.dim[i].min = mins[i];
1582 }
1583 }
1584
1585 template<typename... Args>
1586 void set_min(Args... args) {
1587 set_min(std::vector<int>{args...});
1588 }
1589 // @}
1590
1591 /** Test if a given coordinate is within the bounds of an image. */
1592 // @{
1593 bool contains(const std::vector<int> &coords) const {
1594 assert(coords.size() <= static_cast<decltype(coords.size())>(dimensions()));
1595 for (size_t i = 0; i < coords.size(); i++) {
1596 if (coords[i] < dim((int)i).min() || coords[i] > dim((int)i).max()) {
1597 return false;
1598 }
1599 }
1600 return true;
1601 }
1602
1603 template<typename... Args>
1604 bool contains(Args... args) const {
1605 return contains(std::vector<int>{args...});
1606 }
1607 // @}
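    // A minimal usage sketch for contains() (illustrative):
    //
    //     Buffer<int, 2> im(4, 4);
    //     im.set_min(10, 20);
    //     assert(im.contains(10, 20));    // the min corner is in bounds
    //     assert(im.contains(13, 23));    // the max corner is in bounds
    //     assert(!im.contains(14, 20));   // one past the max in x is not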
1608
1609 /** Make a buffer which refers to the same data in the same layout
1610 * using a swapped indexing order for the dimensions given. So
1611 * A = B.transposed(0, 1) means that A(i, j) == B(j, i), and more
1612 * strongly that A.address_of(i, j) == B.address_of(j, i). */
1613 Buffer<T, Dims, InClassDimStorage> transposed(int d1, int d2) const {
1614 Buffer<T, Dims, InClassDimStorage> im = *this;
1615 im.transpose(d1, d2);
1616 return im;
1617 }
1618
1619 /** Transpose a buffer in-place by changing how it is indexed. For
1620 * example, transpose(0, 1) on a two-dimensional buffer means that
1621 * the value referred to by coordinates (i, j) is now reached at
1622 * the coordinates (j, i), and vice versa. This is done by
1623 * reordering the per-dimension metadata rather than by moving
1624 * data around in memory, so other views of the same memory will
1625 * not see the data as having been transposed. */
1626 void transpose(int d1, int d2) {
1627 assert(d1 >= 0 && d1 < this->dimensions());
1628 assert(d2 >= 0 && d2 < this->dimensions());
1629 std::swap(buf.dim[d1], buf.dim[d2]);
1630 }
1631
1632 /** A generalized transpose: instead of swapping two dimensions,
1633 * pass a vector that lists each dimension index exactly once, in
1634 * the desired order. This does not move any data around in memory
1635 * - it just permutes how it is indexed. */
1636 void transpose(const std::vector<int> &order) {
1637 assert((int)order.size() == dimensions());
1638 if (dimensions() < 2) {
1639 // My, that was easy
1640 return;
1641 }
1642
1643 std::vector<int> order_sorted = order;
1644 for (size_t i = 1; i < order_sorted.size(); i++) {
1645 for (size_t j = i; j > 0 && order_sorted[j - 1] > order_sorted[j]; j--) {
1646 std::swap(order_sorted[j], order_sorted[j - 1]);
1647 transpose(j, j - 1);
1648 }
1649 }
1650 }
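    // A sketch of the generalized transpose (illustrative). As the note in
    // make_interleaved() below points out, transpose({2, 0, 1}) turns a
    // buffer constructed as (c, x, y) into one indexed as (x, y, c):
    //
    //     Buffer<float, 3> im(3, 100, 80);   // constructed as (c, x, y)
    //     im.transpose({2, 0, 1});           // now indexed as (x, y, c)
    //     assert(im.dim(0).extent() == 100 && im.dim(2).extent() == 3);
    //     assert(im.dim(2).stride() == 1);   // channels remain innermost in memory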
1651
1652 /** Make a buffer which refers to the same data in the same
1653 * layout using a different ordering of the dimensions. */
1654 Buffer<T, Dims, InClassDimStorage> transposed(const std::vector<int> &order) const {
1655 Buffer<T, Dims, InClassDimStorage> im = *this;
1656 im.transpose(order);
1657 return im;
1658 }
1659
1660 /** Make a lower-dimensional buffer that refers to one slice of
1661 * this buffer. */
1662 Buffer<T, (Dims == AnyDims ? AnyDims : Dims - 1)>
1663 sliced(int d, int pos) const {
1664 static_assert(Dims == AnyDims || Dims > 0, "Cannot slice a 0-dimensional buffer");
1665 assert(dimensions() > 0);
1666
1667 Buffer<T, AnyDims> im = *this;
1668
1669 // This guarantees the preexisting device ref is dropped if the
1670 // device_slice call fails and maintains the buffer in a consistent
1671 // state.
1672 im.device_deallocate();
1673
1674 im.slice_host(d, pos);
1675 if (buf.device_interface != nullptr) {
1676 complete_device_slice(im, d, pos);
1677 }
1678 return im;
1679 }
1680
1681 /** Make a lower-dimensional buffer that refers to one slice of this
1682 * buffer at the dimension's minimum. */
1683 Buffer<T, (Dims == AnyDims ? AnyDims : Dims - 1)>
1684 sliced(int d) const {
1685 static_assert(Dims == AnyDims || Dims > 0, "Cannot slice a 0-dimensional buffer");
1686 assert(dimensions() > 0);
1687
1688 return sliced(d, dim(d).min());
1689 }
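    // A minimal usage sketch for sliced() (illustrative): view the c == 1
    // channel of a planar (x, y, c) image as a 2-D buffer sharing storage:
    //
    //     Buffer<float, 3> im(100, 80, 3);
    //     Buffer<float, 2> green = im.sliced(2, 1);
    //     assert(&green(0, 0) == &im(0, 0, 1));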
1690
1691 /** Rewrite the buffer to refer to a single lower-dimensional
1692 * slice of itself along the given dimension at the given
1693 * coordinate. Does not move any data around or free the original
1694 * memory, so other views of the same data are unaffected. Can
1695 * only be called on a Buffer with dynamic dimensionality. */
1696 void slice(int d, int pos) {
1697 static_assert(Dims == AnyDims, "Cannot call slice() on a Buffer with static dimensionality.");
1698 assert(dimensions() > 0);
1699
1700 // An optimization for non-device buffers. For the device case,
1701 // a temp buffer is required, so reuse the not-in-place version.
1702 // TODO(zalman|abadams): Are nop slices common enough to special
1703 // case the device part of the if to do nothing?
1704 if (buf.device_interface != nullptr) {
1705 *this = sliced(d, pos);
1706 } else {
1707 slice_host(d, pos);
1708 }
1709 }
1710
1711 /** Slice a buffer in-place at the dimension's minimum. */
1712 void slice(int d) {
1713 slice(d, dim(d).min());
1714 }
1715
1716 /** Make a new buffer that views this buffer as a single slice in a
1717 * higher-dimensional space. The new dimension has extent one and
1718 * the given min. This operation is the opposite of slice. As an
1719 * example, the following condition is true:
1720 *
1721 \code
1722 im2 = im.embedded(1, 17);
1723 &im(x, y, c) == &im2(x, 17, y, c);
1724 \endcode
1725 */
1726 Buffer<T, (Dims == AnyDims ? AnyDims : Dims + 1)>
1727 embedded(int d, int pos = 0) const {
1728 Buffer<T, AnyDims> im = *this;
1729 im.embed(d, pos);
1730 return im;
1731 }
1732
1733 /** Embed a buffer in-place, increasing the
1734 * dimensionality. */
1735 void embed(int d, int pos = 0) {
1736 static_assert(Dims == AnyDims, "Cannot call embed() on a Buffer with static dimensionality.");
1737 assert(d >= 0 && d <= dimensions());
1738 add_dimension();
1739 translate(dimensions() - 1, pos);
1740 for (int i = dimensions() - 1; i > d; i--) {
1741 transpose(i, i - 1);
1742 }
1743 }
1744
1745 /** Add a new dimension with a min of zero and an extent of
1746 * one. The stride is the extent of the outermost dimension times
1747 * its stride. The new dimension is the last dimension. This is a
1748 * special case of embed. */
1749 void add_dimension() {
1750 static_assert(Dims == AnyDims, "Cannot call add_dimension() on a Buffer with static dimensionality.");
1751 const int dims = buf.dimensions;
1752 buf.dimensions++;
1753 if (buf.dim != shape) {
1754 // We're already on the heap. Reallocate.
1755 halide_dimension_t *new_shape = new halide_dimension_t[buf.dimensions];
1756 for (int i = 0; i < dims; i++) {
1757 new_shape[i] = buf.dim[i];
1758 }
1759 delete[] buf.dim;
1760 buf.dim = new_shape;
1761 } else if (dims == InClassDimStorage) {
1762 // Transition from the in-class storage to the heap
1763 make_shape_storage(buf.dimensions);
1764 for (int i = 0; i < dims; i++) {
1765 buf.dim[i] = shape[i];
1766 }
1767 } else {
1768 // We still fit in the class
1769 }
1770 buf.dim[dims] = {0, 1, 0};
1771 if (dims == 0) {
1772 buf.dim[dims].stride = 1;
1773 } else {
1774 buf.dim[dims].stride = buf.dim[dims - 1].extent * buf.dim[dims - 1].stride;
1775 }
1776 }
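    // A minimal usage sketch for add_dimension() (illustrative): a 2-D
    // buffer gains a trailing extent-1 dimension so it can be passed where
    // three dimensions are expected:
    //
    //     Buffer<float> im(100, 80);   // dynamic dimensionality
    //     im.add_dimension();          // now 100 x 80 x 1
    //     assert(im.dimensions() == 3);
    //     assert(im.dim(2).extent() == 1 && im.dim(2).stride() == 100 * 80);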
1777
1778 /** Add a new dimension with a min of zero, an extent of one, and
1779 * the specified stride. The new dimension is the last
1780 * dimension. This is a special case of embed. */
1781 void add_dimension_with_stride(int s) {
1782 add_dimension();
1783 buf.dim[buf.dimensions - 1].stride = s;
1784 }
1785
1786 /** Methods for managing any GPU allocation. */
1787 // @{
1788 // Set the host dirty flag. Called by every operator()
1789 // access. Must be inlined so it can be hoisted out of loops.
1790 HALIDE_ALWAYS_INLINE
1791 void set_host_dirty(bool v = true) {
1792 assert((!v || !device_dirty()) && "Cannot set host dirty when device is already dirty. Call copy_to_host() before accessing the buffer from host.");
1793 buf.set_host_dirty(v);
1794 }
1795
1796 // Check if the device allocation is dirty. Called by
1797 // set_host_dirty, which is called by every accessor. Must be
1798 // inlined so it can be hoisted out of loops.
1799 HALIDE_ALWAYS_INLINE
1800 bool device_dirty() const {
1801 return buf.device_dirty();
1802 }
1803
1804 bool host_dirty() const {
1805 return buf.host_dirty();
1806 }
1807
1808 void set_device_dirty(bool v = true) {
1809 assert((!v || !host_dirty()) && "Cannot set device dirty when host is already dirty.");
1810 buf.set_device_dirty(v);
1811 }
1812
1813 int copy_to_host(void *ctx = nullptr) {
1814 if (device_dirty()) {
1815 return buf.device_interface->copy_to_host(ctx, &buf);
1816 }
1817 return halide_error_code_success;
1818 }
1819
1820 int copy_to_device(const struct halide_device_interface_t *device_interface, void *ctx = nullptr) {
1821 if (host_dirty()) {
1822 return device_interface->copy_to_device(ctx, &buf, device_interface);
1823 }
1824 return halide_error_code_success;
1825 }
1826
1827 int device_malloc(const struct halide_device_interface_t *device_interface, void *ctx = nullptr) {
1828 return device_interface->device_malloc(ctx, &buf, device_interface);
1829 }
1830
1831 int device_free(void *ctx = nullptr) {
1832 if (dev_ref_count) {
1833 assert(dev_ref_count->ownership == BufferDeviceOwnership::Allocated &&
1834 "Can't call device_free on an unmanaged or wrapped native device handle. "
1835 "Free the source allocation or call device_detach_native instead.");
1836 // Multiple people may be holding onto this dev field
1837 assert(dev_ref_count->count == 1 &&
1838 "Multiple Halide::Runtime::Buffer objects share this device "
1839 "allocation. Freeing it would create dangling references. "
1840 "Don't call device_free on Halide buffers that you have copied or "
1841 "passed by value.");
1842 }
1843 int ret = halide_error_code_success;
1844 if (buf.device_interface) {
1845 ret = buf.device_interface->device_free(ctx, &buf);
1846 }
1847 if (dev_ref_count) {
1848 delete dev_ref_count;
1849 dev_ref_count = nullptr;
1850 }
1851 return ret;
1852 }
1853
1854 int device_wrap_native(const struct halide_device_interface_t *device_interface,
1855 uint64_t handle, void *ctx = nullptr) {
1856 assert(device_interface);
1857 dev_ref_count = new DeviceRefCount;
1858 dev_ref_count->ownership = BufferDeviceOwnership::WrappedNative;
1859 return device_interface->wrap_native(ctx, &buf, handle, device_interface);
1860 }
1861
1862 int device_detach_native(void *ctx = nullptr) {
1863 assert(dev_ref_count &&
1864 dev_ref_count->ownership == BufferDeviceOwnership::WrappedNative &&
1865 "Only call device_detach_native on buffers wrapping a native "
1866 "device handle via device_wrap_native. This buffer was allocated "
1867 "using device_malloc, or is unmanaged. "
1868 "Call device_free or free the original allocation instead.");
1869 // Multiple people may be holding onto this dev field
1870 assert(dev_ref_count->count == 1 &&
1871 "Multiple Halide::Runtime::Buffer objects share this device "
1872 "allocation. Freeing it could create dangling references. "
1873 "Don't call device_detach_native on Halide buffers that you "
1874 "have copied or passed by value.");
1875 int ret = halide_error_code_success;
1876 if (buf.device_interface) {
1877 ret = buf.device_interface->detach_native(ctx, &buf);
1878 }
1879 delete dev_ref_count;
1880 dev_ref_count = nullptr;
1881 return ret;
1882 }
1883
1884 int device_and_host_malloc(const struct halide_device_interface_t *device_interface, void *ctx = nullptr) {
1885 return device_interface->device_and_host_malloc(ctx, &buf, device_interface);
1886 }
1887
1888 int device_and_host_free(const struct halide_device_interface_t *device_interface, void *ctx = nullptr) {
1889 if (dev_ref_count) {
1890 assert(dev_ref_count->ownership == BufferDeviceOwnership::AllocatedDeviceAndHost &&
1891 "Can't call device_and_host_free on a device handle not allocated with device_and_host_malloc. "
1892 "Free the source allocation or call device_detach_native instead.");
1893 // Multiple people may be holding onto this dev field
1894 assert(dev_ref_count->count == 1 &&
1895 "Multiple Halide::Runtime::Buffer objects share this device "
1896 "allocation. Freeing it would create dangling references. "
1897 "Don't call device_and_host_free on Halide buffers that you have copied or "
1898 "passed by value.");
1899 }
1900 int ret = halide_error_code_success;
1901 if (buf.device_interface) {
1902 ret = buf.device_interface->device_and_host_free(ctx, &buf);
1903 }
1904 if (dev_ref_count) {
1905 delete dev_ref_count;
1906 dev_ref_count = nullptr;
1907 }
1908 return ret;
1909 }
1910
1911 int device_sync(void *ctx = nullptr) {
1912 return buf.device_sync(ctx);
1913 }
1914
1915 bool has_device_allocation() const {
1916 return buf.device != 0;
1917 }
1918
1919 /** Return the method by which the device field is managed. */
1920 BufferDeviceOwnership device_ownership() const {
1921 if (dev_ref_count == nullptr) {
1922 return BufferDeviceOwnership::Allocated;
1923 }
1924 return dev_ref_count->ownership;
1925 }
1926 // @}
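    // A rough sketch of the intended dirty-bit protocol (illustrative; the
    // device-side filter is assumed, not part of this header):
    //
    //     Buffer<float, 2> im(100, 100);
    //     im(0, 0) = 1.0f;        // non-const host access sets host_dirty
    //     // ... run a filter that writes im on the device; the runtime
    //     // copies the dirty host data over and sets device_dirty ...
    //     im.copy_to_host();      // required before reading on the host again
    //     float v = im(0, 0);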
1927
1928 /** If you use the (x, y, c) indexing convention, then Halide
1929 * Buffers are stored planar by default. This function constructs
1930 * an interleaved RGB or RGBA image that can still be indexed
1931 * using (x, y, c). Passing it to a generator requires that the
1932 * generator has been compiled with support for interleaved (also
1933 * known as packed or chunky) memory layouts. */
1934 static Buffer<void, Dims, InClassDimStorage> make_interleaved(halide_type_t t, int width, int height, int channels) {
1935 static_assert(Dims == AnyDims || Dims == 3, "make_interleaved() must be called on a Buffer that can represent 3 dimensions.");
1936 Buffer<void, Dims, InClassDimStorage> im(t, channels, width, height);
1937 // Note that this is equivalent to calling transpose({2, 0, 1}),
1938 // but slightly more efficient.
1939 im.transpose(0, 1);
1940 im.transpose(1, 2);
1941 return im;
1942 }
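    // A minimal sketch of the resulting memory layout (illustrative, using
    // the statically typed overload declared below):
    //
    //     auto im = Buffer<uint8_t, 3>::make_interleaved(640, 480, 3);
    //     assert(im.dim(0).stride() == 3);        // x steps over a whole pixel
    //     assert(im.dim(1).stride() == 640 * 3);  // y steps over a whole row
    //     assert(im.dim(2).stride() == 1);        // c steps between adjacent bytes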
1943
1944 /** If you use the (x, y, c) indexing convention, then Halide
1945 * Buffers are stored planar by default. This function constructs
1946 * an interleaved RGB or RGBA image that can still be indexed
1947 * using (x, y, c). Passing it to a generator requires that the
1948 * generator has been compiled with support for interleaved (also
1949 * known as packed or chunky) memory layouts. */
1950 static Buffer<T, Dims, InClassDimStorage> make_interleaved(int width, int height, int channels) {
1951 return make_interleaved(static_halide_type(), width, height, channels).template as<T, Dims>();
1952 }
1953
1954 /** Wrap an existing interleaved image. */
1955 static Buffer<add_const_if_T_is_const<void>, Dims, InClassDimStorage>
1956 make_interleaved(halide_type_t t, T *data, int width, int height, int channels) {
1957 static_assert(Dims == AnyDims || Dims == 3, "make_interleaved() must be called on a Buffer that can represent 3 dimensions.");
1958 Buffer<add_const_if_T_is_const<void>, Dims, InClassDimStorage> im(t, data, channels, width, height);
1959 im.transpose(0, 1);
1960 im.transpose(1, 2);
1961 return im;
1962 }
1963
1964 /** Wrap an existing interleaved image. */
1965 static Buffer<T, Dims, InClassDimStorage> make_interleaved(T *data, int width, int height, int channels) {
1966 return make_interleaved(static_halide_type(), data, width, height, channels).template as<T, Dims>();
1967 }
1968
1969 /** Make a zero-dimensional Buffer */
1970 static Buffer<add_const_if_T_is_const<void>, Dims, InClassDimStorage> make_scalar(halide_type_t t) {
1971 static_assert(Dims == AnyDims || Dims == 0, "make_scalar() must be called on a Buffer that can represent 0 dimensions.");
1972 Buffer<add_const_if_T_is_const<void>, AnyDims, InClassDimStorage> buf(t, 1);
1973 buf.slice(0, 0);
1974 return buf;
1975 }
1976
1977 /** Make a zero-dimensional Buffer */
1978 static Buffer<T, Dims, InClassDimStorage> make_scalar() {
1979 static_assert(Dims == AnyDims || Dims == 0, "make_scalar() must be called on a Buffer that can represent 0 dimensions.");
1980 Buffer<T, AnyDims, InClassDimStorage> buf(1);
1981 buf.slice(0, 0);
1982 return buf;
1983 }
1984
1985 /** Make a zero-dimensional Buffer that points to non-owned, existing data */
1986 static Buffer<T, Dims, InClassDimStorage> make_scalar(T *data) {
1987 static_assert(Dims == AnyDims || Dims == 0, "make_scalar() must be called on a Buffer that can represent 0 dimensions.");
1988 Buffer<T, AnyDims, InClassDimStorage> buf(data, 1);
1989 buf.slice(0, 0);
1990 return buf;
1991 }
1992
1993 /** Make a buffer with the same shape and memory nesting order as
1994 * another buffer. It may have a different type. */
1995 template<typename T2, int D2, int S2>
1996 // NOLINTNEXTLINE(performance-unnecessary-value-param)
1997 static Buffer<T, Dims, InClassDimStorage> make_with_shape_of(Buffer<T2, D2, S2> src,
1998 void *(*allocate_fn)(size_t) = nullptr,
1999 void (*deallocate_fn)(void *) = nullptr) {
2000 // Note that src is taken by value because its dims are mutated
2001 // in-place by the helper. Do not change to taking it by reference.
2002 static_assert(Dims == D2 || Dims == AnyDims);
2003 const halide_type_t dst_type = T_is_void ? src.type() : halide_type_of<typename std::remove_cv<not_void_T>::type>();
2004 return Buffer<>::make_with_shape_of_helper(dst_type, src.dimensions(), src.buf.dim,
2005 allocate_fn, deallocate_fn);
2006 }
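    // A minimal usage sketch for make_with_shape_of() (illustrative):
    // allocate a float scratch buffer with the same shape and memory
    // nesting order as an existing interleaved uint8_t image:
    //
    //     auto in = Buffer<uint8_t, 3>::make_interleaved(100, 80, 3);
    //     auto scratch = Buffer<float, 3>::make_with_shape_of(in);
    //     assert(scratch.dim(2).stride() == 1);   // still channel-innermost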
2007
2008private:
2009 static Buffer<> make_with_shape_of_helper(halide_type_t dst_type,
2010 int dimensions,
2011 halide_dimension_t *shape,
2012 void *(*allocate_fn)(size_t),
2013 void (*deallocate_fn)(void *)) {
2014 // Reorder the dimensions of src to have strides in increasing order
2015 std::vector<int> swaps;
2016 for (int i = dimensions - 1; i > 0; i--) {
2017 for (int j = i; j > 0; j--) {
2018 if (shape[j - 1].stride > shape[j].stride) {
2019 std::swap(shape[j - 1], shape[j]);
2020 swaps.push_back(j);
2021 }
2022 }
2023 }
2024
2025 // Rewrite the strides to be dense (this messes up src, which
2026 // is why we took it by value).
2027 for (int i = 0; i < dimensions; i++) {
2028 if (i == 0) {
2029 shape[i].stride = 1;
2030 } else {
2031 shape[i].stride = shape[i - 1].extent * shape[i - 1].stride;
2032 }
2033 }
2034
2035 // Undo the dimension reordering
2036 while (!swaps.empty()) {
2037 int j = swaps.back();
2038 std::swap(shape[j - 1], shape[j]);
2039 swaps.pop_back();
2040 }
2041
2042 // Use an explicit runtime type, and make dst a Buffer<void>, to allow
2043 // using this method with Buffer<void> for either src or dst.
2044 Buffer<> dst(dst_type, nullptr, dimensions, shape);
2045 dst.allocate(allocate_fn, deallocate_fn);
2046
2047 return dst;
2048 }
2049
2050 template<typename... Args>
2051 HALIDE_ALWAYS_INLINE
2052 ptrdiff_t
2053 offset_of(int d, int first, Args... rest) const {
2054#if HALIDE_RUNTIME_BUFFER_CHECK_INDICES
2055 assert(first >= this->buf.dim[d].min);
2056 assert(first < this->buf.dim[d].min + this->buf.dim[d].extent);
2057#endif
2058 return offset_of(d + 1, rest...) + (ptrdiff_t)this->buf.dim[d].stride * (first - this->buf.dim[d].min);
2059 }
2060
2061 HALIDE_ALWAYS_INLINE
2062 ptrdiff_t offset_of(int d) const {
2063 return 0;
2064 }
2065
2066 template<typename... Args>
2067 HALIDE_ALWAYS_INLINE storage_T *address_of(Args... args) const {
2068 if (T_is_void) {
2069 return (storage_T *)(this->buf.host) + offset_of(0, args...) * type().bytes();
2070 } else {
2071 return (storage_T *)(this->buf.host) + offset_of(0, args...);
2072 }
2073 }
2074
2075 HALIDE_ALWAYS_INLINE
2076 ptrdiff_t offset_of(const int *pos) const {
2077 ptrdiff_t offset = 0;
2078 for (int i = this->dimensions() - 1; i >= 0; i--) {
2079#if HALIDE_RUNTIME_BUFFER_CHECK_INDICES
2080 assert(pos[i] >= this->buf.dim[i].min);
2081 assert(pos[i] < this->buf.dim[i].min + this->buf.dim[i].extent);
2082#endif
2083 offset += (ptrdiff_t)this->buf.dim[i].stride * (pos[i] - this->buf.dim[i].min);
2084 }
2085 return offset;
2086 }
2087
2088 HALIDE_ALWAYS_INLINE
2089 storage_T *address_of(const int *pos) const {
2090 if (T_is_void) {
2091 return (storage_T *)this->buf.host + offset_of(pos) * type().bytes();
2092 } else {
2093 return (storage_T *)this->buf.host + offset_of(pos);
2094 }
2095 }
2096
2097public:
2098 /** Get a pointer to the address of the min coordinate. */
2099 T *data() const {
2100 return (T *)(this->buf.host);
2101 }
2102
2103 /** Access elements. Use im(...) to get a reference to an element,
2104 * and use &im(...) to get the address of an element. If you pass
2105 * fewer arguments than the buffer has dimensions, the rest are
2106 * treated as their min coordinate. The non-const versions set the
2107 * host_dirty flag to true.
2108 */
2109 //@{
2110 template<typename... Args,
2111 typename = typename std::enable_if<AllInts<Args...>::value>::type>
2112 HALIDE_ALWAYS_INLINE const not_void_T &operator()(int first, Args... rest) const {
2113 static_assert(!T_is_void,
2114 "Cannot use operator() on Buffer<void> types");
2115 constexpr int expected_dims = 1 + (int)(sizeof...(rest));
2116 static_assert(Dims == AnyDims || Dims == expected_dims, "Buffer with static dimensions was accessed with the wrong number of coordinates in operator()");
2117 assert(!device_dirty());
2118 return *((const not_void_T *)(address_of(first, rest...)));
2119 }
2120
2121 HALIDE_ALWAYS_INLINE
2122 const not_void_T &operator()() const {
2123 static_assert(!T_is_void,
2124 "Cannot use operator() on Buffer<void> types");
2125 constexpr int expected_dims = 0;
2126 static_assert(Dims == AnyDims || Dims == expected_dims, "Buffer with static dimensions was accessed with the wrong number of coordinates in operator()");
2127 assert(!device_dirty());
2128 return *((const not_void_T *)(data()));
2129 }
2130
2131 HALIDE_ALWAYS_INLINE
2132 const not_void_T &
2133 operator()(const int *pos) const {
2134 static_assert(!T_is_void,
2135 "Cannot use operator() on Buffer<void> types");
2136 assert(!device_dirty());
2137 return *((const not_void_T *)(address_of(pos)));
2138 }
2139
2140 template<typename... Args,
2141 typename = typename std::enable_if<AllInts<Args...>::value>::type>
2142 HALIDE_ALWAYS_INLINE not_void_T &operator()(int first, Args... rest) {
2143 static_assert(!T_is_void,
2144 "Cannot use operator() on Buffer<void> types");
2145 constexpr int expected_dims = 1 + (int)(sizeof...(rest));
2146 static_assert(Dims == AnyDims || Dims == expected_dims, "Buffer with static dimensions was accessed with the wrong number of coordinates in operator()");
2147 set_host_dirty();
2148 return *((not_void_T *)(address_of(first, rest...)));
2149 }
2150
2151 HALIDE_ALWAYS_INLINE
2152 not_void_T &
2153 operator()() {
2154 static_assert(!T_is_void,
2155 "Cannot use operator() on Buffer<void> types");
2156 constexpr int expected_dims = 0;
2157 static_assert(Dims == AnyDims || Dims == expected_dims, "Buffer with static dimensions was accessed with the wrong number of coordinates in operator()");
2158 set_host_dirty();
2159 return *((not_void_T *)(data()));
2160 }
2161
2162 HALIDE_ALWAYS_INLINE
2163 not_void_T &
2164 operator()(const int *pos) {
2165 static_assert(!T_is_void,
2166 "Cannot use operator() on Buffer<void> types");
2167 set_host_dirty();
2168 return *((not_void_T *)(address_of(pos)));
2169 }
2170 // @}
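    // A minimal usage sketch for element access (illustrative):
    //
    //     Buffer<int> im(4, 3);    // dynamic dimensionality
    //     im.fill(0);
    //     im(2, 1) = 42;           // non-const access sets host_dirty
    //     int v = im(2, 1);        // v == 42
    //     int *p = &im(2, 1);      // address of an element
    //     int first = im(2);       // same as im(2, 0): omitted coordinates use the min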
2171
2172 /** Tests that all values in this buffer are equal to val. */
2173 bool all_equal(not_void_T val) const {
2174 bool all_equal = true;
2175 for_each_element([&](const int *pos) { all_equal &= (*this)(pos) == val; });
2176 return all_equal;
2177 }
2178
2179 Buffer<T, Dims, InClassDimStorage> &fill(not_void_T val) {
2180 set_host_dirty();
2181 for_each_value([=](T &v) { v = val; });
2182 return *this;
2183 }
2184
2185private:
2186 /** Helper functions for for_each_value. */
2187 // @{
2188 template<int N>
2189 struct for_each_value_task_dim {
2190 std::ptrdiff_t extent;
2191 std::ptrdiff_t stride[N];
2192 };
2193
2194 // Given an array of strides, and a bunch of pointers to pointers
2195 // (all of different types), advance the pointers using the
2196 // strides.
2197 template<typename Ptr, typename... Ptrs>
2198 HALIDE_ALWAYS_INLINE static void advance_ptrs(const std::ptrdiff_t *stride, Ptr &ptr, Ptrs &...ptrs) {
2199 ptr += *stride;
2200 advance_ptrs(stride + 1, ptrs...);
2201 }
2202
2203 HALIDE_ALWAYS_INLINE
2204 static void advance_ptrs(const std::ptrdiff_t *) {
2205 }
2206
2207 template<typename Fn, typename Ptr, typename... Ptrs>
2208 HALIDE_NEVER_INLINE static void for_each_value_helper(Fn &&f, int d, bool innermost_strides_are_one,
2209 const for_each_value_task_dim<sizeof...(Ptrs) + 1> *t, Ptr ptr, Ptrs... ptrs) {
2210 if (d == 0) {
2211 if (innermost_strides_are_one) {
2212 Ptr end = ptr + t[0].extent;
2213 while (ptr != end) {
2214 f(*ptr++, (*ptrs++)...);
2215 }
2216 } else {
2217 for (std::ptrdiff_t i = t[0].extent; i != 0; i--) {
2218 f(*ptr, (*ptrs)...);
2219 advance_ptrs(t[0].stride, ptr, ptrs...);
2220 }
2221 }
2222 } else {
2223 for (std::ptrdiff_t i = t[d].extent; i != 0; i--) {
2224 for_each_value_helper(f, d - 1, innermost_strides_are_one, t, ptr, ptrs...);
2225 advance_ptrs(t[d].stride, ptr, ptrs...);
2226 }
2227 }
2228 }
2229
2230 // Return pair is <new_dimensions, innermost_strides_are_one>
2231 template<int N>
2232 HALIDE_NEVER_INLINE static std::pair<int, bool> for_each_value_prep(for_each_value_task_dim<N> *t,
2233 const halide_buffer_t **buffers) {
2234 const int dimensions = buffers[0]->dimensions;
2235 assert(dimensions > 0);
2236
2237 // Check the buffers all have clean host allocations
2238 for (int i = 0; i < N; i++) {
2239 if (buffers[i]->device) {
2240 assert(buffers[i]->host &&
2241 "Buffer passed to for_each_value has device allocation but no host allocation. Call allocate() and copy_to_host() first");
2242 assert(!buffers[i]->device_dirty() &&
2243 "Buffer passed to for_each_value is dirty on device. Call copy_to_host() first");
2244 } else {
2245 assert(buffers[i]->host &&
2246 "Buffer passed to for_each_value has no host or device allocation");
2247 }
2248 }
2249
2250 // Extract the strides in all the dimensions
2251 for (int i = 0; i < dimensions; i++) {
2252 for (int j = 0; j < N; j++) {
2253 assert(buffers[j]->dimensions == dimensions);
2254 assert(buffers[j]->dim[i].extent == buffers[0]->dim[i].extent &&
2255 buffers[j]->dim[i].min == buffers[0]->dim[i].min);
2256 const int s = buffers[j]->dim[i].stride;
2257 t[i].stride[j] = s;
2258 }
2259 t[i].extent = buffers[0]->dim[i].extent;
2260
2261 // Order the dimensions by stride, so that the traversal is cache-coherent.
2262 // Use the last dimension for this, because this is the source in copies.
2263 // It appears to be better to optimize read order than write order.
2264 for (int j = i; j > 0 && t[j].stride[N - 1] < t[j - 1].stride[N - 1]; j--) {
2265 std::swap(t[j], t[j - 1]);
2266 }
2267 }
2268
2269 // flatten dimensions where possible to make a larger inner
2270 // loop for autovectorization.
2271 int d = dimensions;
2272 for (int i = 1; i < d; i++) {
2273 bool flat = true;
2274 for (int j = 0; j < N; j++) {
2275 flat = flat && t[i - 1].stride[j] * t[i - 1].extent == t[i].stride[j];
2276 }
2277 if (flat) {
2278 t[i - 1].extent *= t[i].extent;
2279 for (int j = i; j < d - 1; j++) {
2280 t[j] = t[j + 1];
2281 }
2282 i--;
2283 d--;
2284 }
2285 }
2286
2287 // Note that we assert() that dimensions > 0 above
2288 // (our one-and-only caller will only call us that way)
2289 // so the unchecked access to t[0] should be safe.
2290 bool innermost_strides_are_one = true;
2291 for (int i = 0; i < N; i++) {
2292 innermost_strides_are_one &= (t[0].stride[i] == 1);
2293 }
2294
2295 return {d, innermost_strides_are_one};
2296 }
2297
2298 template<typename Fn, typename... Args, int N = sizeof...(Args) + 1>
2299 void for_each_value_impl(Fn &&f, Args &&...other_buffers) const {
2300 if (dimensions() > 0) {
2301 const size_t alloc_size = dimensions() * sizeof(for_each_value_task_dim<N>);
2302 for_each_value_task_dim<N> *t =
2303 (for_each_value_task_dim<N> *)HALIDE_ALLOCA(alloc_size);
2304 // Move the preparatory code into a non-templated helper to
2305 // save code size.
2306 const halide_buffer_t *buffers[] = {&buf, (&other_buffers.buf)...};
2307 auto [new_dims, innermost_strides_are_one] = Buffer<>::for_each_value_prep(t, buffers);
2308 if (new_dims > 0) {
2309 Buffer<>::for_each_value_helper(f, new_dims - 1,
2310 innermost_strides_are_one,
2311 t,
2312 data(), (other_buffers.data())...);
2313 return;
2314 }
2315 // else fall thru
2316 }
2317
2318 // zero-dimensional case
2319 f(*data(), (*other_buffers.data())...);
2320 }
2321 // @}
2322
2323public:
2324 /** Call a function on every value in the buffer, and the
2325 * corresponding values in some number of other buffers of the
2326 * same size. The function should take a reference, const
2327 * reference, or value of the correct type for each buffer. This
2328 * effectively lifts a function of scalars to an element-wise
2329 * function of buffers. This produces code that the compiler can
2330 * autovectorize. This is slightly cheaper than for_each_element,
2331 * because it does not need to track the coordinates.
2332 *
2333 * Note that constness of Buffers is preserved: a const Buffer<T> (for either
2334 * 'this' or the other-buffers arguments) will allow mutation of the
2335 * buffer contents, while a Buffer<const T> will not. Attempting to specify
2336 * a mutable reference for the lambda argument of a Buffer<const T>
2337 * will result in a compilation error. */
2338 // @{
2339 template<typename Fn, typename... Args, int N = sizeof...(Args) + 1>
2340 HALIDE_ALWAYS_INLINE const Buffer<T, Dims, InClassDimStorage> &for_each_value(Fn &&f, Args &&...other_buffers) const {
2341 for_each_value_impl(f, std::forward<Args>(other_buffers)...);
2342 return *this;
2343 }
2344
2345 template<typename Fn, typename... Args, int N = sizeof...(Args) + 1>
2346 HALIDE_ALWAYS_INLINE
2347 Buffer<T, Dims, InClassDimStorage> &
2348 for_each_value(Fn &&f, Args &&...other_buffers) {
2349 for_each_value_impl(f, std::forward<Args>(other_buffers)...);
2350 return *this;
2351 }
2352 // @}
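    // A minimal usage sketch for for_each_value() (illustrative): an
    // element-wise update over two equally-shaped buffers:
    //
    //     Buffer<float, 2> a(100, 100), b(100, 100);
    //     a.fill(1.0f);
    //     b.fill(2.0f);
    //     a.for_each_value([](float &x, float y) { x += 0.5f * y; }, b);
    //     assert(a.all_equal(2.0f));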
2353
2354private:
2355 // Helper functions for for_each_element
2356 struct for_each_element_task_dim {
2357 int min, max;
2358 };
2359
2360 /** If f is callable with this many args, call it. The first
2361 * argument is just to make the overloads distinct. Actual
2362 * overload selection is done using the enable_if. */
2363 template<typename Fn,
2364 typename... Args,
2365 typename = decltype(std::declval<Fn>()(std::declval<Args>()...))>
2366 HALIDE_ALWAYS_INLINE static void for_each_element_variadic(int, int, const for_each_element_task_dim *, Fn &&f, Args... args) {
2367 f(args...);
2368 }
2369
2370 /** If the above overload is impossible, we add an outer loop over
2371 * an additional argument and try again. */
2372 template<typename Fn,
2373 typename... Args>
2374 HALIDE_ALWAYS_INLINE static void for_each_element_variadic(double, int d, const for_each_element_task_dim *t, Fn &&f, Args... args) {
2375 for (int i = t[d].min; i <= t[d].max; i++) {
2376 for_each_element_variadic(0, d - 1, t, std::forward<Fn>(f), i, args...);
2377 }
2378 }
2379
2380 /** Determine the minimum number of arguments a callable can take
2381 * using the same trick. */
2382 template<typename Fn,
2383 typename... Args,
2384 typename = decltype(std::declval<Fn>()(std::declval<Args>()...))>
2385 HALIDE_ALWAYS_INLINE static int num_args(int, Fn &&, Args...) {
2386 return (int)(sizeof...(Args));
2387 }
2388
2389 /** The recursive version is only enabled up to a recursion limit
2390 * of 256. This catches callables that aren't callable with any
2391 * number of ints. */
2392 template<typename Fn,
2393 typename... Args>
2394 HALIDE_ALWAYS_INLINE static int num_args(double, Fn &&f, Args... args) {
2395 static_assert(sizeof...(args) <= 256,
2396 "Callable passed to for_each_element must accept either a const int *,"
2397 " or up to 256 ints. No such operator found. Expect infinite template recursion.");
2398 return num_args(0, std::forward<Fn>(f), 0, args...);
2399 }
2400
2401 /** A version where the callable takes a position array instead,
2402 * with compile-time recursion on the dimensionality. This
2403 * overload is preferred to the one below using the same int vs
2404 * double trick as above, but is impossible once d hits -1 using
2405 * std::enable_if. */
2406 template<int d,
2407 typename Fn,
2408 typename = typename std::enable_if<(d >= 0)>::type>
2409 HALIDE_ALWAYS_INLINE static void for_each_element_array_helper(int, const for_each_element_task_dim *t, Fn &&f, int *pos) {
2410 for (pos[d] = t[d].min; pos[d] <= t[d].max; pos[d]++) {
2411 for_each_element_array_helper<d - 1>(0, t, std::forward<Fn>(f), pos);
2412 }
2413 }
2414
2415 /** Base case for recursion above. */
2416 template<int d,
2417 typename Fn,
2418 typename = typename std::enable_if<(d < 0)>::type>
2419 HALIDE_ALWAYS_INLINE static void for_each_element_array_helper(double, const for_each_element_task_dim *t, Fn &&f, int *pos) {
2420 f(pos);
2421 }
2422
2423 /** A run-time-recursive version (instead of
2424 * compile-time-recursive) that requires the callable to take a
2425 * pointer to a position array instead. Dispatches to the
2426 * compile-time-recursive version once the dimensionality gets
2427 * small. */
2428 template<typename Fn>
2429 static void for_each_element_array(int d, const for_each_element_task_dim *t, Fn &&f, int *pos) {
2430 if (d == -1) {
2431 f(pos);
2432 } else if (d == 0) {
2433 // Once the dimensionality gets small enough, dispatch to
2434 // a compile-time-recursive version for better codegen of
2435 // the inner loops.
2436 for_each_element_array_helper<0, Fn>(0, t, std::forward<Fn>(f), pos);
2437 } else if (d == 1) {
2438 for_each_element_array_helper<1, Fn>(0, t, std::forward<Fn>(f), pos);
2439 } else if (d == 2) {
2440 for_each_element_array_helper<2, Fn>(0, t, std::forward<Fn>(f), pos);
2441 } else if (d == 3) {
2442 for_each_element_array_helper<3, Fn>(0, t, std::forward<Fn>(f), pos);
2443 } else {
2444 for (pos[d] = t[d].min; pos[d] <= t[d].max; pos[d]++) {
2445 for_each_element_array(d - 1, t, std::forward<Fn>(f), pos);
2446 }
2447 }
2448 }
2449
2450 /** We now have two overloads for for_each_element. This one
2451 * triggers if the callable takes a const int *.
2452 */
2453 template<typename Fn,
2454 typename = decltype(std::declval<Fn>()((const int *)nullptr))>
2455 static void for_each_element(int, int dims, const for_each_element_task_dim *t, Fn &&f, int check = 0) {
2456 const int size = dims * sizeof(int);
2457 int *pos = (int *)HALIDE_ALLOCA(size);
2458 // At least one version of GCC will (incorrectly) report that pos "may be used uninitialized".
2459 // Add this memset to silence it.
2460 memset(pos, 0, size);
2461 for_each_element_array(dims - 1, t, std::forward<Fn>(f), pos);
2462 }
2463
2464 /** This one triggers otherwise. It treats the callable as
2465 * something that takes some number of ints. */
2466 template<typename Fn>
2467 HALIDE_ALWAYS_INLINE static void for_each_element(double, int dims, const for_each_element_task_dim *t, Fn &&f) {
2468 int args = num_args(0, std::forward<Fn>(f));
2469 assert(dims >= args);
2470 for_each_element_variadic(0, args - 1, t, std::forward<Fn>(f));
2471 }
2472
2473 template<typename Fn>
2474 void for_each_element_impl(Fn &&f) const {
2475 for_each_element_task_dim *t =
2476 (for_each_element_task_dim *)HALIDE_ALLOCA(dimensions() * sizeof(for_each_element_task_dim));
2477 for (int i = 0; i < dimensions(); i++) {
2478 t[i].min = dim(i).min();
2479 t[i].max = dim(i).max();
2480 }
2481 for_each_element(0, dimensions(), t, std::forward<Fn>(f));
2482 }
2483
2484public:
2485 /** Call a function at each site in a buffer. This is likely to be
2486 * much slower than using Halide code to populate a buffer, but is
2487 * convenient for tests. If the function has more arguments than the
2488 * buffer has dimensions, the remaining arguments will be zero. If it
2489 * has fewer arguments than the buffer has dimensions then the last
2490 * few dimensions of the buffer are not iterated over. For example,
2491 * the following code exploits this to set a floating point RGB image
2492 * to red:
2493
2494 \code
2495 Buffer<float, 3> im(100, 100, 3);
2496 im.for_each_element([&](int x, int y) {
2497 im(x, y, 0) = 1.0f;
2498 im(x, y, 1) = 0.0f;
2499 im(x, y, 2) = 0.0f;
2500 });
2501 \endcode
2502
2503 * The compiled code is equivalent to writing a nested for loop,
2504 * and compilers are capable of optimizing it in the same way.
2505 *
2506 * If the callable can be called with an int * as the sole argument,
2507 * that version is called instead. Each location in the buffer is
2508 * passed to it in a coordinate array. This version is higher-overhead
2509 * than the variadic version, but is useful for writing generic code
2510 * that accepts buffers of arbitrary dimensionality. For example, the
2511 * following sets the value at all sites in an arbitrary-dimensional
2512 * buffer to their first coordinate:
2513
2514 \code
2515 im.for_each_element([&](const int *pos) {im(pos) = pos[0];});
2516 \endcode
2517
2518 * It is also possible to use for_each_element to iterate over entire
2519 * rows or columns by cropping the buffer to a single column or row
2520 * respectively and iterating over elements of the result. For example,
2521 * to set the diagonal of the image to 1 by iterating over the columns:
2522
2523 \code
2524 Buffer<float, 3> im(100, 100, 3);
2525 im.sliced(1, 0).for_each_element([&](int x, int c) {
2526 im(x, x, c) = 1.0f;
2527 });
2528 \endcode
2529
2530 * Or, assuming the memory layout is known to be dense per row, one can
2531 * memset each row of an image like so:
2532
2533 \code
2534 Buffer<float, 3> im(100, 100, 3);
2535 im.sliced(0, 0).for_each_element([&](int y, int c) {
2536 memset(&im(0, y, c), 0, sizeof(float) * im.width());
2537 });
2538 \endcode
2539
2540 */
2541 // @{
2542 template<typename Fn>
2543 HALIDE_ALWAYS_INLINE const Buffer<T, Dims, InClassDimStorage> &for_each_element(Fn &&f) const {
2544 for_each_element_impl(f);
2545 return *this;
2546 }
2547
2548 template<typename Fn>
2549 HALIDE_ALWAYS_INLINE
2550 Buffer<T, Dims, InClassDimStorage> &
2551 for_each_element(Fn &&f) {
2552 for_each_element_impl(f);
2553 return *this;
2554 }
2555 // @}
2556
2557private:
2558 template<typename Fn>
2559 struct FillHelper {
2560 Fn f;
2561 Buffer<T, Dims, InClassDimStorage> *buf;
2562
2563 template<typename... Args,
2564 typename = decltype(std::declval<Fn>()(std::declval<Args>()...))>
2565 void operator()(Args... args) {
2566 (*buf)(args...) = f(args...);
2567 }
2568
2569 FillHelper(Fn &&f, Buffer<T, Dims, InClassDimStorage> *buf)
2570 : f(std::forward<Fn>(f)), buf(buf) {
2571 }
2572 };
2573
2574public:
2575 /** Fill a buffer by evaluating a callable at every site. The
2576 * callable should look much like a callable passed to
2577 * for_each_element, but it should return the value that should be
2578 * stored to the coordinate corresponding to the arguments. */
2579 template<typename Fn,
2580 typename = typename std::enable_if<!std::is_arithmetic<typename std::decay<Fn>::type>::value>::type>
2581 Buffer<T, Dims, InClassDimStorage> &fill(Fn &&f) {
2582 // We'll go via for_each_element. We need a variadic wrapper lambda.
2583 FillHelper<Fn> wrapper(std::forward<Fn>(f), this);
2584 return for_each_element(wrapper);
2585 }
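    // A minimal usage sketch for fill() with a callable (illustrative):
    // write a horizontal ramp:
    //
    //     Buffer<float, 2> im(256, 100);
    //     im.fill([](int x, int y) { return x / 255.0f; });
    //     assert(im(255, 0) == 1.0f);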
2586
2587 /** Check if an input buffer passed to an extern stage is a
2588 * bounds query. Compared to doing the host pointer check directly,
2589 * this both adds clarity to the code and will facilitate moving to
2590 * another representation for bounds query arguments. */
2591 bool is_bounds_query() const {
2592 return buf.is_bounds_query();
2593 }
2594
2595 /** Convenient check to verify that all of the interesting bytes in the Buffer
2596 * are initialized under MSAN. Note that by default, we use for_each_value() here so that
2597 * we skip any unused padding that isn't part of the Buffer; this isn't efficient,
2598 * but in MSAN mode, it doesn't matter. (Pass true for the flag to force check
2599 * the entire Buffer storage.) */
2600 void msan_check_mem_is_initialized(bool entire = false) const {
2601#if defined(__has_feature)
2602#if __has_feature(memory_sanitizer)
2603 if (entire) {
2604 __msan_check_mem_is_initialized(data(), size_in_bytes());
2605 } else {
2606 for_each_value([](T &v) { __msan_check_mem_is_initialized(&v, sizeof(T)); });
2607 }
2608#endif
2609#endif
2610 }
2611};
2612
2613} // namespace Runtime
2614} // namespace Halide
2615
2616#undef HALIDE_ALLOCA
2617
2618 #endif // HALIDE_RUNTIME_BUFFER_H