Go to the documentation of this file.
1 /** \file
2  * Defines a Buffer type that wraps halide_buffer_t and adds
3  * functionality, and methods for more conveniently iterating over the
4  * samples in a halide_buffer_t outside of Halide code. */
9 #include <algorithm>
10 #include <atomic>
11 #include <cassert>
12 #include <cstdint>
13 #include <cstdlib>
14 #include <cstring>
15 #include <limits>
16 #include <memory>
17 #include <vector>
19 #ifdef __APPLE__
20 #include <AvailabilityVersions.h>
21 #include <TargetConditionals.h>
22 #endif
24 #if defined(__has_feature)
25 #if __has_feature(memory_sanitizer)
26 #include <sanitizer/msan_interface.h>
27 #endif
28 #endif
30 #include "HalideRuntime.h"
32 #ifdef _MSC_VER
33 #include <malloc.h>
34 #define HALIDE_ALLOCA _alloca
35 #else
36 #define HALIDE_ALLOCA __builtin_alloca
37 #endif
39 // gcc 5.1 has a false positive warning on this code
40 #if __GNUC__ == 5 && __GNUC_MINOR__ == 1
41 #pragma GCC diagnostic ignored "-Warray-bounds"
42 #endif
46 #endif
49 // Conservatively align buffer allocations to 128 bytes by default.
50 // This is enough alignment for all the platforms currently in use.
51 // Redefine this in your compiler settings if you desire more/less alignment.
53 #endif
58 // Unfortunately, not all C++17 runtimes support aligned_alloc
59 // (it may depend on the OS/SDK version); this is provided as an opt-out
60 // if you are compiling on a platform that doesn't provide a (good)
61 // implementation. (Note that we actually use the C11 `::aligned_alloc()`
62 // rather than the C++17 `std::aligned_alloc()` because at least one platform
63 // we found supports the former but not the latter.)
66 // clang-format off
67 #ifdef _MSC_VER
69  // MSVC doesn't implement aligned_alloc(), even in C++17 mode, and
70  // has stated they probably never will, so, always default it off here.
73 #elif defined(__ANDROID_API__) && __ANDROID_API__ < 28
75  // Android doesn't provide aligned_alloc until API 28
78 #elif defined(__APPLE__)
82  // macOS doesn't provide aligned_alloc until 10.15
87  // iOS doesn't provide aligned_alloc until 14.0
90  #else
92  // Assume it's ok on all other Apple targets
95  #endif
97 #else
99  #if defined(__GLIBCXX__) && !defined(_GLIBCXX_HAVE_ALIGNED_ALLOC)
101  // ARM GNU-A baremetal compiler doesn't provide aligned_alloc as of 12.2
104  #else
106  // Not Windows, Android, or Apple: just assume it's ok
109  #endif
111 #endif
112 // clang-format on
116 namespace Halide {
117 namespace Runtime {
119 // Forward-declare our Buffer class
120 template<typename T, int Dims, int InClassDimStorage>
121 class Buffer;
123 // A helper to check if a parameter pack is entirely implicitly
124 // int-convertible to use with std::enable_if
// Primary template; every usable instantiation is covered by a
// specialization below, so this only acts as a "false" fallback.
template<typename... Args>
struct AllInts : std::false_type {};

// An empty parameter pack trivially satisfies the constraint.
template<>
struct AllInts<> : std::true_type {};

// Recursive case: the head must be implicitly convertible to int and the
// remaining types must (recursively) satisfy the same constraint.
template<typename T, typename... Args>
struct AllInts<T, Args...> {
    static constexpr bool value =
        std::conjunction<std::is_convertible<T, int>, AllInts<Args...>>::value;
};

// float and double do convert to int implicitly, but that conversion
// triggers a warning that is treated as an error, so reject any pack that
// starts with one of them.
template<typename... Args>
struct AllInts<float, Args...> : std::false_type {};

template<typename... Args>
struct AllInts<double, Args...> : std::false_type {};
145 // A helper to detect if there are any zeros in a container
namespace Internal {
// Returns true if at least one element of `c`, read as an int, is zero.
// An empty container yields false.
template<typename Container>
bool any_zero(const Container &c) {
    return std::any_of(std::begin(c), std::end(c),
                       [](int element) { return element == 0; });
}
}  // namespace Internal
158 /** A struct acting as a header for allocations owned by the Buffer
159  * class itself. */
161  void (*deallocate_fn)(void *);
162  std::atomic<int> ref_count;
164  // Note that ref_count always starts at 1
167  }
168 };
170 /** This indicates how to deallocate the device for a Halide::Runtime::Buffer. */
enum class BufferDeviceOwnership : int {
    Allocated,               ///< halide_device_free will be called when device ref count goes to zero
    WrappedNative,           ///< halide_device_detach_native will be called when device ref count goes to zero
    Unmanaged,               ///< No free routine will be called when device ref count goes to zero
    AllocatedDeviceAndHost,  ///< Call device_and_host_free when DevRefCount goes to zero.
    Cropped,                 ///< Call halide_device_release_crop when DevRefCount goes to zero.
};
179 /** A similar struct for managing device allocations. */
181  // This is only ever constructed when there's something to manage,
182  // so start at one.
183  std::atomic<int> count{1};
185 };
187 constexpr int AnyDims = -1;
189 /** A templated Buffer class that wraps halide_buffer_t and adds
190  * functionality. When using Halide from C++, this is the preferred
191  * way to create input and output buffers. The overhead of using this
192  * class relative to a naked halide_buffer_t is minimal - it uses another
193  * ~16 bytes on the stack, and does no dynamic allocations when using
194  * it to represent existing memory of a known maximum dimensionality.
195  *
196  * The template parameter T is the element type. For buffers where the
197  * element type is unknown, or may vary, use void or const void.
198  *
199  * The template parameter Dims is the number of dimensions. For buffers where
200  * the dimensionality is unknown at compile time, or may vary, use AnyDims.
201  *
202  * InClassDimStorage is the maximum number of dimensions that can be represented
203  * using space inside the class itself. Set it to the maximum dimensionality
204  * you expect this buffer to be. If the actual dimensionality exceeds
205  * this, heap storage is allocated to track the shape of the buffer.
206  * InClassDimStorage defaults to 4, which should cover nearly all usage.
207  *
208  * The class optionally allocates and owns memory for the image using
209  * a shared pointer allocated with the provided allocator. If they are
210  * null, malloc and free are used. Any device-side allocation is
211  * considered as owned if and only if the host-side allocation is
212  * owned. */
213 template<typename T = void,
214  int Dims = AnyDims,
215  int InClassDimStorage = (Dims == AnyDims ? 4 : std::max(Dims, 1))>
216 class Buffer {
217  /** The underlying halide_buffer_t */
218  halide_buffer_t buf = {};
220  /** Some in-class storage for shape of the dimensions. */
221  halide_dimension_t shape[InClassDimStorage];
223  /** The allocation owned by this Buffer. NULL if the Buffer does not
224  * own the memory. */
225  AllocationHeader *alloc = nullptr;
227  /** A reference count for the device allocation owned by this
228  * buffer. */
229  mutable DeviceRefCount *dev_ref_count = nullptr;
231  /** True if T is of type void or const void */
232  static const bool T_is_void = std::is_same<typename std::remove_const<T>::type, void>::value;
234  /** A type function that adds a const qualifier if T is a const type. */
235  template<typename T2>
236  using add_const_if_T_is_const = typename std::conditional<std::is_const<T>::value, const T2, T2>::type;
238  /** T unless T is (const) void, in which case (const)
239  * uint8_t. Useful for providing return types for operator() */
240  using not_void_T = typename std::conditional<T_is_void,
241  add_const_if_T_is_const<uint8_t>,
242  T>::type;
244  /** T with constness removed. Useful for return type of copy(). */
245  using not_const_T = typename std::remove_const<T>::type;
247  /** The type the elements are stored as. Equal to not_void_T
248  * unless T is a pointer, in which case uint64_t. Halide stores
249  * all pointer types as uint64s internally, even on 32-bit
250  * systems. */
251  using storage_T = typename std::conditional<std::is_pointer<T>::value, uint64_t, not_void_T>::type;
253 public:
254  /** True if the Halide type is not void (or const void). */
255  static constexpr bool has_static_halide_type = !T_is_void;
257  /** Get the Halide type of T. Callers should not use the result if
258  * has_static_halide_type is false. */
259  static constexpr halide_type_t static_halide_type() {
260  return halide_type_of<typename std::remove_cv<not_void_T>::type>();
261  }
263  /** Does this Buffer own the host memory it refers to? */
264  bool owns_host_memory() const {
265  return alloc != nullptr;
266  }
268  static constexpr bool has_static_dimensions = (Dims != AnyDims);
270  /** Callers should not use the result if
271  * has_static_dimensions is false. */
272  static constexpr int static_dimensions() {
273  return Dims;
274  }
276  static_assert(!has_static_dimensions || static_dimensions() >= 0);
278 private:
279  /** Increment the reference count of any owned allocation */
280  void incref() const {
281  if (owns_host_memory()) {
282  alloc->ref_count++;
283  }
284  if (buf.device) {
285  if (!dev_ref_count) {
286  // I seem to have a non-zero dev field but no
287  // reference count for it. I must have been given a
288  // device allocation by a Halide pipeline, and have
289  // never been copied from since. Take sole ownership
290  // of it.
291  dev_ref_count = new DeviceRefCount;
292  }
293  dev_ref_count->count++;
294  }
295  }
297  // Note that this is called "cropped" but can also encompass a slice/embed
298  // operation as well.
299  struct DevRefCountCropped : DeviceRefCount {
300  Buffer<T, Dims, InClassDimStorage> cropped_from;
301  DevRefCountCropped(const Buffer<T, Dims, InClassDimStorage> &cropped_from)
302  : cropped_from(cropped_from) {
303  ownership = BufferDeviceOwnership::Cropped;
304  }
305  };
307  /** Setup the device ref count for a buffer to indicate it is a crop (or slice, embed, etc) of cropped_from */
308  void crop_from(const Buffer<T, Dims, InClassDimStorage> &cropped_from) {
309  assert(dev_ref_count == nullptr);
310  dev_ref_count = new DevRefCountCropped(cropped_from);
311  }
313  /** Decrement the reference count of any owned allocation and free host
314  * and device memory if it hits zero. Sets alloc to nullptr. */
315  void decref(bool device_only = false) {
316  if (owns_host_memory() && !device_only) {
317  int new_count = --(alloc->ref_count);
318  if (new_count == 0) {
319  void (*fn)(void *) = alloc->deallocate_fn;
320  alloc->~AllocationHeader();
321  fn(alloc);
322  }
323  buf.host = nullptr;
324  alloc = nullptr;
325  set_host_dirty(false);
326  }
327  int new_count = 0;
328  if (dev_ref_count) {
329  new_count = --(dev_ref_count->count);
330  }
331  if (new_count == 0) {
332  if (buf.device) {
333  assert(!(alloc && device_dirty()) &&
334  "Implicitly freeing a dirty device allocation while a host allocation still lives. "
335  "Call device_free explicitly if you want to drop dirty device-side data. "
336  "Call copy_to_host explicitly if you want the data copied to the host allocation "
337  "before the device allocation is freed.");
338  int result = halide_error_code_success;
339  if (dev_ref_count && dev_ref_count->ownership == BufferDeviceOwnership::WrappedNative) {
340  result = buf.device_interface->detach_native(nullptr, &buf);
341  } else if (dev_ref_count && dev_ref_count->ownership == BufferDeviceOwnership::AllocatedDeviceAndHost) {
342  result = buf.device_interface->device_and_host_free(nullptr, &buf);
343  } else if (dev_ref_count && dev_ref_count->ownership == BufferDeviceOwnership::Cropped) {
344  result = buf.device_interface->device_release_crop(nullptr, &buf);
345  } else if (dev_ref_count == nullptr || dev_ref_count->ownership == BufferDeviceOwnership::Allocated) {
346  result = buf.device_interface->device_free(nullptr, &buf);
347  }
348  // No reasonable way to return the error, but we can at least assert-fail in debug builds.
349  assert((result == halide_error_code_success) && "device_interface call returned a nonzero result in Buffer::decref()");
350  (void)result;
351  }
352  if (dev_ref_count) {
353  if (dev_ref_count->ownership == BufferDeviceOwnership::Cropped) {
354  delete (DevRefCountCropped *)dev_ref_count;
355  } else {
356  delete dev_ref_count;
357  }
358  }
359  }
360  dev_ref_count = nullptr;
361  buf.device = 0;
362  buf.device_interface = nullptr;
363  }
365  void free_shape_storage() {
366  if (buf.dim != shape) {
367  delete[] buf.dim;
368  buf.dim = nullptr;
369  }
370  }
372  template<int DimsSpecified>
373  void make_static_shape_storage() {
374  static_assert(Dims == AnyDims || Dims == DimsSpecified,
375  "Number of arguments to Buffer() does not match static dimensionality");
376  buf.dimensions = DimsSpecified;
377  if constexpr (Dims == AnyDims) {
378  if constexpr (DimsSpecified <= InClassDimStorage) {
379  buf.dim = shape;
380  } else {
381  static_assert(DimsSpecified >= 1);
382  buf.dim = new halide_dimension_t[DimsSpecified];
383  }
384  } else {
385  static_assert(InClassDimStorage >= Dims);
386  buf.dim = shape;
387  }
388  }
390  void make_shape_storage(const int dimensions) {
391  if (Dims != AnyDims && Dims != dimensions) {
392  assert(false && "Number of arguments to Buffer() does not match static dimensionality");
393  }
394  // This should usually be inlined, so if dimensions is statically known,
395  // we can skip the call to new
396  buf.dimensions = dimensions;
397  buf.dim = (dimensions <= InClassDimStorage) ? shape : new halide_dimension_t[dimensions];
398  }
400  void copy_shape_from(const halide_buffer_t &other) {
401  // All callers of this ensure that buf.dimensions == other.dimensions.
402  make_shape_storage(other.dimensions);
403  std::copy(other.dim, other.dim + other.dimensions, buf.dim);
404  }
406  template<typename T2, int D2, int S2>
407  void move_shape_from(Buffer<T2, D2, S2> &&other) {
408  if (other.shape == other.buf.dim) {
409  copy_shape_from(other.buf);
410  } else {
411  buf.dim = other.buf.dim;
412  other.buf.dim = nullptr;
413  }
414  }
416  /** Initialize the shape from a halide_buffer_t. */
417  void initialize_from_buffer(const halide_buffer_t &b,
418  BufferDeviceOwnership ownership) {
419  memcpy(&buf, &b, sizeof(halide_buffer_t));
420  copy_shape_from(b);
421  if (b.device) {
422  dev_ref_count = new DeviceRefCount;
423  dev_ref_count->ownership = ownership;
424  }
425  }
427  /** Initialize the shape from an array of ints */
428  void initialize_shape(const int *sizes) {
429  for (int i = 0; i < buf.dimensions; i++) {
430  buf.dim[i].min = 0;
431  buf.dim[i].extent = sizes[i];
432  if (i == 0) {
433  buf.dim[i].stride = 1;
434  } else {
435  buf.dim[i].stride = buf.dim[i - 1].stride * buf.dim[i - 1].extent;
436  }
437  }
438  }
440  /** Initialize the shape from a vector of extents */
441  void initialize_shape(const std::vector<int> &sizes) {
442  assert(buf.dimensions == (int)sizes.size());
443  initialize_shape(sizes.data());
444  }
446  /** Initialize the shape from the static shape of an array */
447  template<typename Array, size_t N>
448  void initialize_shape_from_array_shape(int next, Array (&vals)[N]) {
449  buf.dim[next].min = 0;
450  buf.dim[next].extent = (int)N;
451  if (next == 0) {
452  buf.dim[next].stride = 1;
453  } else {
454  initialize_shape_from_array_shape(next - 1, vals[0]);
455  buf.dim[next].stride = buf.dim[next - 1].stride * buf.dim[next - 1].extent;
456  }
457  }
459  /** Base case for the template recursion above. */
460  template<typename T2>
461  void initialize_shape_from_array_shape(int, const T2 &) {
462  }
464  /** Get the dimensionality of a multi-dimensional C array */
// Base case: anything that does not bind to the array overload contributes zero.
template<typename T2>
static int dimensionality_of_array(const T2 &) {
    return 0;
}

// Recursive case: one dimension for this array level plus those of its element type.
template<typename Array, size_t N>
static int dimensionality_of_array(Array (&vals)[N]) {
    const int inner_dims = dimensionality_of_array(vals[0]);
    return inner_dims + 1;
}
475  /** Get the underlying halide_type_t of an array's element type. */
476  template<typename Array, size_t N>
477  static halide_type_t scalar_type_of_array(Array (&vals)[N]) {
478  return scalar_type_of_array(vals[0]);
479  }
481  template<typename T2>
482  static halide_type_t scalar_type_of_array(const T2 &) {
483  return halide_type_of<typename std::remove_cv<T2>::type>();
484  }
486  /** Crop a single dimension without handling device allocation. */
487  void crop_host(int d, int min, int extent) {
488  assert(dim(d).min() <= min);
489  assert(dim(d).max() >= min + extent - 1);
490  ptrdiff_t shift = min - dim(d).min();
491  if (buf.host != nullptr) {
492  buf.host += (shift * dim(d).stride()) * type().bytes();
493  }
494  buf.dim[d].min = min;
495  buf.dim[d].extent = extent;
496  }
498  /** Crop as many dimensions as are in rect, without handling device allocation. */
499  void crop_host(const std::vector<std::pair<int, int>> &rect) {
500  assert(rect.size() <= static_cast<decltype(rect.size())>(std::numeric_limits<int>::max()));
501  int limit = (int)rect.size();
502  assert(limit <= dimensions());
503  for (int i = 0; i < limit; i++) {
504  crop_host(i, rect[i].first, rect[i].second);
505  }
506  }
508  void complete_device_crop(Buffer<T, Dims, InClassDimStorage> &result_host_cropped) const {
509  assert(buf.device_interface != nullptr);
510  if (buf.device_interface->device_crop(nullptr, &this->buf, &result_host_cropped.buf) == halide_error_code_success) {
511  const Buffer<T, Dims, InClassDimStorage> *cropped_from = this;
512  // TODO: Figure out what to do if dev_ref_count is nullptr. Should incref logic run here?
513  // is it possible to get to this point without incref having run at least once since
514  // the device field was set? (I.e. in the internal logic of crop. incref might have been
515  // called.)
516  if (dev_ref_count != nullptr && dev_ref_count->ownership == BufferDeviceOwnership::Cropped) {
517  cropped_from = &((DevRefCountCropped *)dev_ref_count)->cropped_from;
518  }
519  result_host_cropped.crop_from(*cropped_from);
520  }
521  }
523  /** slice a single dimension without handling device allocation. */
524  void slice_host(int d, int pos) {
525  static_assert(Dims == AnyDims);
526  assert(dimensions() > 0);
527  assert(d >= 0 && d < dimensions());
528  assert(pos >= dim(d).min() && pos <= dim(d).max());
529  buf.dimensions--;
530  ptrdiff_t shift = pos - buf.dim[d].min;
531  if (buf.host != nullptr) {
532  buf.host += (shift * buf.dim[d].stride) * type().bytes();
533  }
534  for (int i = d; i < buf.dimensions; i++) {
535  buf.dim[i] = buf.dim[i + 1];
536  }
537  buf.dim[buf.dimensions] = {0, 0, 0};
538  }
540  void complete_device_slice(Buffer<T, AnyDims, InClassDimStorage> &result_host_sliced, int d, int pos) const {
541  assert(buf.device_interface != nullptr);
542  if (buf.device_interface->device_slice(nullptr, &this->buf, d, pos, &result_host_sliced.buf) == halide_error_code_success) {
543  const Buffer<T, Dims, InClassDimStorage> *sliced_from = this;
544  // TODO: Figure out what to do if dev_ref_count is nullptr. Should incref logic run here?
545  // is it possible to get to this point without incref having run at least once since
546  // the device field was set? (I.e. in the internal logic of slice. incref might have been
547  // called.)
548  if (dev_ref_count != nullptr && dev_ref_count->ownership == BufferDeviceOwnership::Cropped) {
549  sliced_from = &((DevRefCountCropped *)dev_ref_count)->cropped_from;
550  }
551  // crop_from() is correct here, despite the fact that we are slicing.
552  result_host_sliced.crop_from(*sliced_from);
553  }
554  }
556 public:
557  typedef T ElemType;
559  /** Read-only access to the shape */
560  class Dimension {
561  const halide_dimension_t &d;
563  public:
564  /** The lowest coordinate in this dimension */
565  HALIDE_ALWAYS_INLINE int min() const {
566  return d.min;
567  }
569  /** The number of elements in memory you have to step over to
570  * increment this coordinate by one. */
572  return d.stride;
573  }
575  /** The extent of the image along this dimension */
577  return d.extent;
578  }
580  /** The highest coordinate in this dimension */
581  HALIDE_ALWAYS_INLINE int max() const {
582  return min() + extent() - 1;
583  }
585  /** An iterator class, so that you can iterate over
586  * coordinates in a dimensions using a range-based for loop. */
587  struct iterator {
588  int val;
589  int operator*() const {
590  return val;
591  }
592  bool operator!=(const iterator &other) const {
593  return val != other.val;
594  }
596  val++;
597  return *this;
598  }
599  };
601  /** An iterator that points to the min coordinate */
603  return {min()};
604  }
606  /** An iterator that points to one past the max coordinate */
608  return {min() + extent()};
609  }
612  : d(dim) {
613  }
614  };
616  /** Access the shape of the buffer */
617  HALIDE_ALWAYS_INLINE Dimension dim(int i) const {
618  assert(i >= 0 && i < this->dimensions());
619  return Dimension(buf.dim[i]);
620  }
622  /** Access to the mins, strides, extents. Will be deprecated. Do not use. */
623  // @{
624  int min(int i) const {
625  return dim(i).min();
626  }
627  int extent(int i) const {
628  return dim(i).extent();
629  }
630  int stride(int i) const {
631  return dim(i).stride();
632  }
633  // @}
635  /** The total number of elements this buffer represents. Equal to
636  * the product of the extents */
637  size_t number_of_elements() const {
638  return buf.number_of_elements();
639  }
641  /** Get the dimensionality of the buffer. */
642  int dimensions() const {
643  if constexpr (has_static_dimensions) {
644  return Dims;
645  } else {
646  return buf.dimensions;
647  }
648  }
650  /** Get the type of the elements. */
651  halide_type_t type() const {
652  return buf.type;
653  }
655  /** A pointer to the element with the lowest address. If all
656  * strides are positive, equal to the host pointer. */
657  T *begin() const {
658  assert(buf.host != nullptr); // Cannot call begin() on an unallocated Buffer.
659  return (T *)buf.begin();
660  }
662  /** A pointer to one beyond the element with the highest address. */
663  T *end() const {
664  assert(buf.host != nullptr); // Cannot call end() on an unallocated Buffer.
665  return (T *)buf.end();
666  }
668  /** The total number of bytes spanned by the data in memory. */
669  size_t size_in_bytes() const {
670  return buf.size_in_bytes();
671  }
673  /** Reset the Buffer to be equivalent to a default-constructed Buffer
674  * of the same static type (if any); Buffer<void> will have its runtime
675  * type reset to uint8. */
676  void reset() {
677  *this = Buffer();
678  }
681  : shape() {
682  buf.type = static_halide_type();
683  // If Dims are statically known, must create storage that many.
684  // otherwise, make a zero-dimensional buffer.
685  constexpr int buf_dimensions = (Dims == AnyDims) ? 0 : Dims;
686  make_static_shape_storage<buf_dimensions>();
687  }
689  /** Make a Buffer from a halide_buffer_t */
690  explicit Buffer(const halide_buffer_t &buf,
692  assert(T_is_void || buf.type == static_halide_type());
693  initialize_from_buffer(buf, ownership);
694  }
696  /** Give Buffers access to the members of Buffers of different dimensionalities and types. */
697  template<typename T2, int D2, int S2>
698  friend class Buffer;
700 private:
701  template<typename T2, int D2, int S2>
702  static void static_assert_can_convert_from() {
703  static_assert((!std::is_const<T2>::value || std::is_const<T>::value),
704  "Can't convert from a Buffer<const T> to a Buffer<T>");
705  static_assert(std::is_same<typename std::remove_const<T>::type,
706  typename std::remove_const<T2>::type>::value ||
707  T_is_void || Buffer<T2, D2, S2>::T_is_void,
708  "type mismatch constructing Buffer");
709  static_assert(Dims == AnyDims || D2 == AnyDims || Dims == D2,
710  "Can't convert from a Buffer with static dimensionality to a Buffer with different static dimensionality");
711  }
713 public:
714  /** Determine if a Buffer<T, Dims, InClassDimStorage> can be constructed from some other Buffer type.
715  * If this can be determined at compile time, fail with a static assert; otherwise
716  * return a boolean based on runtime typing. */
717  template<typename T2, int D2, int S2>
718  static bool can_convert_from(const Buffer<T2, D2, S2> &other) {
719  static_assert_can_convert_from<T2, D2, S2>();
720  if (Buffer<T2, D2, S2>::T_is_void && !T_is_void) {
721  if (other.type() != static_halide_type()) {
722  return false;
723  }
724  }
725  if (Dims != AnyDims) {
726  if (other.dimensions() != Dims) {
727  return false;
728  }
729  }
730  return true;
731  }
733  /** Fail an assertion at runtime or compile-time if a Buffer<T, Dims, InClassDimStorage>
734  * cannot be constructed from some other Buffer type. */
735  template<typename T2, int D2, int S2>
736  static void assert_can_convert_from(const Buffer<T2, D2, S2> &other) {
737  // Explicitly call static_assert_can_convert_from() here so
738  // that we always get compile-time checking, even if compiling with
739  // assertions disabled.
740  static_assert_can_convert_from<T2, D2, S2>();
741  assert(can_convert_from(other));
742  }
744  /** Copy constructor. Does not copy underlying data. */
746  : buf(other.buf),
747  alloc(other.alloc) {
748  other.incref();
749  dev_ref_count = other.dev_ref_count;
750  copy_shape_from(other.buf);
751  }
753  /** Construct a Buffer from a Buffer of different dimensionality
754  * and type. Asserts that the type and dimensionality matches (at runtime,
755  * if one of the types is void). Note that this constructor is
756  * implicit. This, for example, lets you pass things like
757  * Buffer<T> or Buffer<const void> to functions expecting
758  * Buffer<const T>. */
759  template<typename T2, int D2, int S2>
761  : buf(other.buf),
762  alloc(other.alloc) {
763  assert_can_convert_from(other);
764  other.incref();
765  dev_ref_count = other.dev_ref_count;
766  copy_shape_from(other.buf);
767  }
769  /** Move constructor */
771  : buf(other.buf),
772  alloc(other.alloc),
773  dev_ref_count(other.dev_ref_count) {
774  other.dev_ref_count = nullptr;
775  other.alloc = nullptr;
776  move_shape_from(std::forward<Buffer<T, Dims, InClassDimStorage>>(other));
777  other.buf = halide_buffer_t();
778  }
780  /** Move-construct a Buffer from a Buffer of different
781  * dimensionality and type. Asserts that the types match (at
782  * runtime if one of the types is void). */
783  template<typename T2, int D2, int S2>
785  : buf(other.buf),
786  alloc(other.alloc),
787  dev_ref_count(other.dev_ref_count) {
788  assert_can_convert_from(other);
789  other.dev_ref_count = nullptr;
790  other.alloc = nullptr;
791  move_shape_from(std::forward<Buffer<T2, D2, S2>>(other));
792  other.buf = halide_buffer_t();
793  }
795  /** Assign from another Buffer of possibly-different
796  * dimensionality and type. Asserts that the types match (at
797  * runtime if one of the types is void). */
798  template<typename T2, int D2, int S2>
800  if ((const void *)this == (const void *)&other) {
801  return *this;
802  }
803  assert_can_convert_from(other);
804  other.incref();
805  decref();
806  dev_ref_count = other.dev_ref_count;
807  alloc = other.alloc;
808  free_shape_storage();
809  buf = other.buf;
810  copy_shape_from(other.buf);
811  return *this;
812  }
814  /** Standard assignment operator */
816  // The cast to void* here is just to satisfy clang-tidy
817  if ((const void *)this == (const void *)&other) {
818  return *this;
819  }
820  other.incref();
821  decref();
822  dev_ref_count = other.dev_ref_count;
823  alloc = other.alloc;
824  free_shape_storage();
825  buf = other.buf;
826  copy_shape_from(other.buf);
827  return *this;
828  }
830  /** Move from another Buffer of possibly-different
831  * dimensionality and type. Asserts that the types match (at
832  * runtime if one of the types is void). */
833  template<typename T2, int D2, int S2>
835  assert_can_convert_from(other);
836  decref();
837  alloc = other.alloc;
838  other.alloc = nullptr;
839  dev_ref_count = other.dev_ref_count;
840  other.dev_ref_count = nullptr;
841  free_shape_storage();
842  buf = other.buf;
843  move_shape_from(std::forward<Buffer<T2, D2, S2>>(other));
844  other.buf = halide_buffer_t();
845  return *this;
846  }
848  /** Standard move-assignment operator */
850  decref();
851  alloc = other.alloc;
852  other.alloc = nullptr;
853  dev_ref_count = other.dev_ref_count;
854  other.dev_ref_count = nullptr;
855  free_shape_storage();
856  buf = other.buf;
857  move_shape_from(std::forward<Buffer<T, Dims, InClassDimStorage>>(other));
858  other.buf = halide_buffer_t();
859  return *this;
860  }
862  /** Check the product of the extents fits in memory. */
863  void check_overflow() {
864  size_t size = type().bytes();
865  for (int i = 0; i < dimensions(); i++) {
866  size *= dim(i).extent();
867  }
868  // We allow 2^31 or 2^63 bytes, so drop the top bit.
869  size = (size << 1) >> 1;
870  for (int i = 0; i < dimensions(); i++) {
871  size /= dim(i).extent();
872  }
873  assert(size == (size_t)type().bytes() && "Error: Overflow computing total size of buffer.");
874  }
876  /** Allocate memory for this Buffer. Drops the reference to any
877  * owned memory. */
878  void allocate(void *(*allocate_fn)(size_t) = nullptr,
879  void (*deallocate_fn)(void *) = nullptr) {
880  // Drop any existing allocation
881  deallocate();
883  // Conservatively align images to (usually) 128 bytes. This is enough
884  // alignment for all the platforms we might use. Also ensure that the allocation
885  // is such that the logical size is an integral multiple of 128 bytes (or a bit more).
886  constexpr size_t alignment = HALIDE_RUNTIME_BUFFER_ALLOCATION_ALIGNMENT;
888  const auto align_up = [=](size_t value) -> size_t {
889  return (value + alignment - 1) & ~(alignment - 1);
890  };
892  size_t size = size_in_bytes();
895  // Only use aligned_alloc() if no custom allocators are specified.
896  if (!allocate_fn && !deallocate_fn) {
897  // As a practical matter, sizeof(AllocationHeader) is going to be no more than 16 bytes
898  // on any supported platform, so we will just overallocate by 'alignment'
899  // so that the user storage also starts at an aligned point. This is a bit
900  // wasteful, but probably not a big deal.
901  static_assert(sizeof(AllocationHeader) <= alignment);
902  void *alloc_storage = ::aligned_alloc(alignment, align_up(size) + alignment);
903  assert((uintptr_t)alloc_storage == align_up((uintptr_t)alloc_storage));
904  alloc = new (alloc_storage) AllocationHeader(free);
905  buf.host = (uint8_t *)((uintptr_t)alloc_storage + alignment);
906  return;
907  }
908  // else fall thru
909 #endif
910  if (!allocate_fn) {
911  allocate_fn = malloc;
912  }
913  if (!deallocate_fn) {
914  deallocate_fn = free;
915  }
917  static_assert(sizeof(AllocationHeader) <= alignment);
919  // malloc() and friends must return a pointer aligned to at least alignof(std::max_align_t);
920  // make sure this is OK for AllocationHeader, since it always goes at the start
921  static_assert(alignof(AllocationHeader) <= alignof(std::max_align_t));
923  const size_t requested_size = align_up(size + alignment +
924  std::max(0, (int)sizeof(AllocationHeader) -
925  (int)sizeof(std::max_align_t)));
926  void *alloc_storage = allocate_fn(requested_size);
927  alloc = new (alloc_storage) AllocationHeader(deallocate_fn);
928  uint8_t *unaligned_ptr = ((uint8_t *)alloc) + sizeof(AllocationHeader);
929  buf.host = (uint8_t *)align_up((uintptr_t)unaligned_ptr);
930  }
932  /** Drop reference to any owned host or device memory, possibly
933  * freeing it, if this buffer held the last reference to
934  * it. Retains the shape of the buffer. Does nothing if this
935  * buffer did not allocate its own memory. */
936  void deallocate() {
937  decref();
938  }
940  /** Drop reference to any owned device memory, possibly freeing it
941  * if this buffer held the last reference to it. Asserts that
942  * device_dirty is false. */
944  decref(true);
945  }
947  /** Allocate a new image of the given size with a runtime
948  * type. Only used when you do know what size you want but you
949  * don't know statically what type the elements are. Pass zeroes
950  * to make a buffer suitable for bounds query calls. */
951  template<typename... Args,
952  typename = typename std::enable_if<AllInts<Args...>::value>::type>
953  Buffer(halide_type_t t, int first, Args... rest) {
954  if (!T_is_void) {
955  assert(static_halide_type() == t);
956  }
957  int extents[] = {first, (int)rest...};
958  buf.type = t;
959  constexpr int buf_dimensions = 1 + (int)(sizeof...(rest));
960  make_static_shape_storage<buf_dimensions>();
961  initialize_shape(extents);
962  if (!Internal::any_zero(extents)) {
963  check_overflow();
964  allocate();
965  }
966  }
968  /** Allocate a new image of the given size. Pass zeroes to make a
969  * buffer suitable for bounds query calls. */
970  // @{
972  // The overload with one argument is 'explicit', so that
973  // (say) int is not implicitly convertible to Buffer<int>
974  explicit Buffer(int first) {
975  static_assert(!T_is_void,
976  "To construct an Buffer<void>, pass a halide_type_t as the first argument to the constructor");
977  int extents[] = {first};
978  buf.type = static_halide_type();
979  constexpr int buf_dimensions = 1;
980  make_static_shape_storage<buf_dimensions>();
981  initialize_shape(extents);
982  if (first != 0) {
983  check_overflow();
984  allocate();
985  }
986  }
988  template<typename... Args,
989  typename = typename std::enable_if<AllInts<Args...>::value>::type>
990  Buffer(int first, int second, Args... rest) {
991  static_assert(!T_is_void,
992  "To construct an Buffer<void>, pass a halide_type_t as the first argument to the constructor");
993  int extents[] = {first, second, (int)rest...};
994  buf.type = static_halide_type();
995  constexpr int buf_dimensions = 2 + (int)(sizeof...(rest));
996  make_static_shape_storage<buf_dimensions>();
997  initialize_shape(extents);
998  if (!Internal::any_zero(extents)) {
999  check_overflow();
1000  allocate();
1001  }
1002  }
1003  // @}
1005  /** Allocate a new image of unknown type using a vector of ints as the size. */
1006  Buffer(halide_type_t t, const std::vector<int> &sizes) {
1007  if (!T_is_void) {
1008  assert(static_halide_type() == t);
1009  }
1010  buf.type = t;
1011  // make_shape_storage() will do a runtime check that dimensionality matches.
1012  make_shape_storage((int)sizes.size());
1013  initialize_shape(sizes);
1014  if (!Internal::any_zero(sizes)) {
1015  check_overflow();
1016  allocate();
1017  }
1018  }
1020  /** Allocate a new image of known type using a vector of ints as the size. */
1021  explicit Buffer(const std::vector<int> &sizes)
1022  : Buffer(static_halide_type(), sizes) {
1023  }
1025 private:
1026  // Create a copy of the sizes vector, ordered as specified by order.
static std::vector<int> make_ordered_sizes(const std::vector<int> &sizes, const std::vector<int> &order) {
    assert(order.size() == sizes.size());
    const size_t count = sizes.size();
    std::vector<int> result;
    result.reserve(count);
    // Entry `pos` of the result is the size selected by order[pos];
    // at() bounds-checks the selected index.
    for (size_t pos = 0; pos < count; ++pos) {
        result.push_back(sizes.at(order[pos]));
    }
    return result;
}
1036 public:
1037  /** Allocate a new image of unknown type using a vector of ints as the size and
1038  * a vector of indices indicating the storage order for each dimension. The
1039  * length of the sizes vector and the storage-order vector must match. For instance,
1040  * to allocate an interleaved RGB buffer, you would pass {2, 0, 1} for storage_order. */
1041  Buffer(halide_type_t t, const std::vector<int> &sizes, const std::vector<int> &storage_order)
1042  : Buffer(t, make_ordered_sizes(sizes, storage_order)) {
1043  transpose(storage_order);
1044  }
1046  Buffer(const std::vector<int> &sizes, const std::vector<int> &storage_order)
1047  : Buffer(static_halide_type(), sizes, storage_order) {
1048  }
1050  /** Make a Buffer that refers to a statically sized array. Does not
1051  * take ownership of the data, and does not set the host_dirty flag. */
1052  template<typename Array, size_t N>
1053  explicit Buffer(Array (&vals)[N]) {
1054  const int buf_dimensions = dimensionality_of_array(vals);
1055  buf.type = scalar_type_of_array(vals);
1056  buf.host = (uint8_t *)vals;
1057  make_shape_storage(buf_dimensions);
1058  initialize_shape_from_array_shape(buf.dimensions - 1, vals);
1059  }
1061  /** Initialize a Buffer of runtime type from a pointer and some
1062  * sizes. Assumes dense row-major packing and a min coordinate of
1063  * zero. Does not take ownership of the data and does not set the
1064  * host_dirty flag. */
1065  template<typename... Args,
1066  typename = typename std::enable_if<AllInts<Args...>::value>::type>
1067  explicit Buffer(halide_type_t t, add_const_if_T_is_const<void> *data, int first, Args &&...rest) {
1068  if (!T_is_void) {
1069  assert(static_halide_type() == t);
1070  }
1071  int extents[] = {first, (int)rest...};
1072  buf.type = t;
1073  buf.host = (uint8_t *)const_cast<void *>(data);
1074  constexpr int buf_dimensions = 1 + (int)(sizeof...(rest));
1075  make_static_shape_storage<buf_dimensions>();
1076  initialize_shape(extents);
1077  }
1079  /** Initialize a Buffer from a pointer and some sizes. Assumes
1080  * dense row-major packing and a min coordinate of zero. Does not
1081  * take ownership of the data and does not set the host_dirty flag. */
1082  template<typename... Args,
1083  typename = typename std::enable_if<AllInts<Args...>::value>::type>
1084  explicit Buffer(T *data, int first, Args &&...rest) {
1085  int extents[] = {first, (int)rest...};
1086  buf.type = static_halide_type();
1087  buf.host = (uint8_t *)const_cast<typename std::remove_const<T>::type *>(data);
1088  constexpr int buf_dimensions = 1 + (int)(sizeof...(rest));
1089  make_static_shape_storage<buf_dimensions>();
1090  initialize_shape(extents);
1091  }
1093  /** Initialize a Buffer from a pointer and a vector of
1094  * sizes. Assumes dense row-major packing and a min coordinate of
1095  * zero. Does not take ownership of the data and does not set the
1096  * host_dirty flag. */
1097  explicit Buffer(T *data, const std::vector<int> &sizes) {
1098  buf.type = static_halide_type();
1099  buf.host = (uint8_t *)const_cast<typename std::remove_const<T>::type *>(data);
1100  make_shape_storage((int)sizes.size());
1101  initialize_shape(sizes);
1102  }
1104  /** Initialize a Buffer of runtime type from a pointer and a
1105  * vector of sizes. Assumes dense row-major packing and a min
1106  * coordinate of zero. Does not take ownership of the data and
1107  * does not set the host_dirty flag. */
1108  explicit Buffer(halide_type_t t, add_const_if_T_is_const<void> *data, const std::vector<int> &sizes) {
1109  if (!T_is_void) {
1110  assert(static_halide_type() == t);
1111  }
1112  buf.type = t;
1113  buf.host = (uint8_t *)const_cast<void *>(data);
1114  make_shape_storage((int)sizes.size());
1115  initialize_shape(sizes);
1116  }
1118  /** Initialize a Buffer from a pointer to the min coordinate and
1119  * an array describing the shape. Does not take ownership of the
1120  * data, and does not set the host_dirty flag. */
1121  explicit Buffer(halide_type_t t, add_const_if_T_is_const<void> *data, int d, const halide_dimension_t *shape) {
1122  if (!T_is_void) {
1123  assert(static_halide_type() == t);
1124  }
1125  buf.type = t;
1126  buf.host = (uint8_t *)const_cast<void *>(data);
1127  make_shape_storage(d);
1128  for (int i = 0; i < d; i++) {
1129  buf.dim[i] = shape[i];
1130  }
1131  }
1133  /** Initialize a Buffer from a pointer to the min coordinate and
1134  * a vector describing the shape. Does not take ownership of the
1135  * data, and does not set the host_dirty flag. */
1136  explicit inline Buffer(halide_type_t t, add_const_if_T_is_const<void> *data,
1137  const std::vector<halide_dimension_t> &shape)
1138  : Buffer(t, data, (int)shape.size(), shape.data()) {
1139  }
1141  /** Initialize a Buffer from a pointer to the min coordinate and
1142  * an array describing the shape. Does not take ownership of the
1143  * data and does not set the host_dirty flag. */
1144  explicit Buffer(T *data, int d, const halide_dimension_t *shape) {
1145  buf.type = static_halide_type();
1146  buf.host = (uint8_t *)const_cast<typename std::remove_const<T>::type *>(data);
1147  make_shape_storage(d);
1148  for (int i = 0; i < d; i++) {
1149  buf.dim[i] = shape[i];
1150  }
1151  }
1153  /** Initialize a Buffer from a pointer to the min coordinate and
1154  * a vector describing the shape. Does not take ownership of the
1155  * data, and does not set the host_dirty flag. */
1156  explicit inline Buffer(T *data, const std::vector<halide_dimension_t> &shape)
1157  : Buffer(data, (int)shape.size(), shape.data()) {
1158  }
1160  /** Destructor. Will release any underlying owned allocation if
1161  * this is the last reference to it. Will assert fail if there are
1162  * weak references to this Buffer outstanding. */
1164  decref();
1165  free_shape_storage();
1166  }
1168  /** Get a pointer to the raw halide_buffer_t this wraps. */
1169  // @{
1171  return &buf;
1172  }
1174  const halide_buffer_t *raw_buffer() const {
1175  return &buf;
1176  }
1177  // @}
1179  /** Provide a cast operator to halide_buffer_t *, so that
1180  * instances can be passed directly to Halide filters. */
1181  operator halide_buffer_t *() {
1182  return &buf;
1183  }
1185  /** Return a typed reference to this Buffer. Useful for converting
1186  * a reference to a Buffer<void> to a reference to, for example, a
1187  * Buffer<const uint8_t>, or converting a Buffer<T>& to Buffer<const T>&.
1188  * You can also optionally specify a new value for Dims; this is useful
1189  * mainly for removing the dimensionality constraint on a Buffer with
1190  * explicit dimensionality. Does a runtime assert if the source buffer type
1191  * is void or the new dimensionality is incompatible. */
1192  template<typename T2, int D2 = Dims>
1195  return *((Buffer<T2, D2, InClassDimStorage> *)this);
1196  }
1198  /** Return a const typed reference to this Buffer. Useful for converting
1199  * a reference to a Buffer<void> to a reference to, for example, a
1200  * Buffer<const uint8_t>, or converting a Buffer<T>& to Buffer<const T>&.
1201  * You can also optionally specify a new value for Dims; this is useful
1202  * mainly for removing the dimensionality constraint on a Buffer with
1203  * explicit dimensionality. Does a runtime assert if the source buffer type
1204  * is void or the new dimensionality is incompatible. */
1205  template<typename T2, int D2 = Dims>
1208  return *((const Buffer<T2, D2, InClassDimStorage> *)this);
1209  }
1211  /** Return an rval reference to this Buffer. Useful for converting
1212  * a reference to a Buffer<void> to a reference to, for example, a
1213  * Buffer<const uint8_t>, or converting a Buffer<T>& to Buffer<const T>&.
1214  * You can also optionally specify a new value for Dims; this is useful
1215  * mainly for removing the dimensionality constraint on a Buffer with
1216  * explicit dimensionality. Does a runtime assert if the source buffer type
1217  * is void or the new dimensionality is incompatible. */
1218  template<typename T2, int D2 = Dims>
1221  return *((Buffer<T2, D2, InClassDimStorage> *)this);
1222  }
1224  /** as_const() is syntactic sugar for .as<const T>(), to avoid the need
1225  * to recapitulate the type argument. */
1226  // @{
1228  Buffer<typename std::add_const<T>::type, Dims, InClassDimStorage> &as_const() & {
1229  // Note that we can skip the assert_can_convert_from(), since T -> const T
1230  // conversion is always legal.
1231  return *((Buffer<typename std::add_const<T>::type, Dims, InClassDimStorage> *)this);
1232  }
1235  const Buffer<typename std::add_const<T>::type, Dims, InClassDimStorage> &as_const() const & {
1236  return *((const Buffer<typename std::add_const<T>::type, Dims, InClassDimStorage> *)this);
1237  }
1240  Buffer<typename std::add_const<T>::type, Dims, InClassDimStorage> as_const() && {
1241  return *((Buffer<typename std::add_const<T>::type, Dims, InClassDimStorage> *)this);
1242  }
1243  // @}
1245  /** Add some syntactic sugar to allow autoconversion from Buffer<T> to Buffer<const T>& when
1246  * passing arguments */
1247  template<typename T2 = T, typename = typename std::enable_if<!std::is_const<T2>::value>::type>
1248  operator Buffer<typename std::add_const<T2>::type, Dims, InClassDimStorage> &() & {
1249  return as_const();
1250  }
1252  /** Add some syntactic sugar to allow autoconversion from Buffer<T> to Buffer<void>& when
1253  * passing arguments */
1254  template<typename TVoid,
1255  typename T2 = T,
1256  typename = typename std::enable_if<std::is_same<TVoid, void>::value &&
1257  !std::is_void<T2>::value &&
1258  !std::is_const<T2>::value>::type>
1260  return as<TVoid, Dims>();
1261  }
1263  /** Add some syntactic sugar to allow autoconversion from Buffer<const T> to Buffer<const void>& when
1264  * passing arguments */
1265  template<typename TVoid,
1266  typename T2 = T,
1267  typename = typename std::enable_if<std::is_same<TVoid, void>::value &&
1268  !std::is_void<T2>::value &&
1269  std::is_const<T2>::value>::type>
1271  return as<const TVoid, Dims>();
1272  }
1274  /** Conventional names for the first three dimensions. */
1275  // @{
1276  int width() const {
1277  return (dimensions() > 0) ? dim(0).extent() : 1;
1278  }
1279  int height() const {
1280  return (dimensions() > 1) ? dim(1).extent() : 1;
1281  }
1282  int channels() const {
1283  return (dimensions() > 2) ? dim(2).extent() : 1;
1284  }
1285  // @}
1287  /** Conventional names for the min and max value of each dimension */
1288  // @{
1289  int left() const {
1290  return dim(0).min();
1291  }
1293  int right() const {
1294  return dim(0).max();
1295  }
1297  int top() const {
1298  return dim(1).min();
1299  }
1301  int bottom() const {
1302  return dim(1).max();
1303  }
1304  // @}
1306  /** Make a new image which is a deep copy of this image. Use crop
1307  * or slice followed by copy to make a copy of only a portion of
1308  * the image. The new image uses the same memory layout as the
1309  * original, with holes compacted away. Note that the returned
1310  * Buffer is always of a non-const type T (ie:
1311  *
1312  * Buffer<const T>.copy() -> Buffer<T> rather than Buffer<const T>
1313  *
1314  * which is always safe, since we are making a deep copy. (The caller
1315  * can easily cast it back to Buffer<const T> if desired, which is
1316  * always safe and free.)
1317  */
1318  Buffer<not_const_T, Dims, InClassDimStorage> copy(void *(*allocate_fn)(size_t) = nullptr,
1319  void (*deallocate_fn)(void *) = nullptr) const {
1321  dst.copy_from(*this);
1322  return dst;
1323  }
1325  /** Like copy(), but the copy is created in interleaved memory layout
1326  * (vs. keeping the same memory layout as the original). Requires that 'this'
1327  * has exactly 3 dimensions.
1328  */
1330  void (*deallocate_fn)(void *) = nullptr) const {
1331  static_assert(Dims == AnyDims || Dims == 3);
1332  assert(dimensions() == 3);
1334  dst.set_min(min(0), min(1), min(2));
1335  dst.allocate(allocate_fn, deallocate_fn);
1336  dst.copy_from(*this);
1337  return dst;
1338  }
1340  /** Like copy(), but the copy is created in planar memory layout
1341  * (vs. keeping the same memory layout as the original).
1342  */
1343  Buffer<not_const_T, Dims, InClassDimStorage> copy_to_planar(void *(*allocate_fn)(size_t) = nullptr,
1344  void (*deallocate_fn)(void *) = nullptr) const {
1345  std::vector<int> mins, extents;
1346  const int dims = dimensions();
1347  mins.reserve(dims);
1348  extents.reserve(dims);
1349  for (int d = 0; d < dims; ++d) {
1350  mins.push_back(dim(d).min());
1351  extents.push_back(dim(d).extent());
1352  }
1354  dst.set_min(mins);
1355  dst.allocate(allocate_fn, deallocate_fn);
1356  dst.copy_from(*this);
1357  return dst;
1358  }
1360  /** Make a copy of the Buffer which shares the underlying host and/or device
1361  * allocations as the existing Buffer. This is purely syntactic sugar for
1362  * cases where you have a const reference to a Buffer but need a temporary
1363  * non-const copy (e.g. to make a call into AOT-generated Halide code), and want a terse
1364  * inline way to create a temporary. \code
1365  * void call_my_func(const Buffer<const uint8_t>& input) {
1366  * my_func(input.alias(), output);
1367  * }\endcode
1368  */
1370  return *this;
1371  }
1373  /** Fill a Buffer with the values at the same coordinates in
1374  * another Buffer. Restricts itself to coordinates contained
1375  * within the intersection of the two buffers. If the two Buffers
1376  * are not in the same coordinate system, you will need to
1377  * translate the argument Buffer first. E.g. if you're blitting a
1378  * sprite onto a framebuffer, you'll want to translate the sprite
1379  * to the correct location first like so: \code
1380  * framebuffer.copy_from(sprite.translated({x, y})); \endcode
1381  */
1382  template<typename T2, int D2, int S2>
1384  static_assert(!std::is_const<T>::value, "Cannot call copy_from() on a Buffer<const T>");
1385  assert(!device_dirty() && "Cannot call Halide::Runtime::Buffer::copy_from on a device dirty destination.");
1386  assert(!src.device_dirty() && "Cannot call Halide::Runtime::Buffer::copy_from on a device dirty source.");
1390  static_assert(Dims == AnyDims || D2 == AnyDims || Dims == D2);
1391  assert(src.dimensions() == dst.dimensions());
1393  // Trim the copy to the region in common
1394  const int d = dimensions();
1395  for (int i = 0; i < d; i++) {
1396  int min_coord = std::max(dst.dim(i).min(), src.dim(i).min());
1397  int max_coord = std::min(dst.dim(i).max(), src.dim(i).max());
1398  if (max_coord < min_coord) {
1399  // The buffers do not overlap.
1400  return;
1401  }
1402  dst.crop(i, min_coord, max_coord - min_coord + 1);
1403  src.crop(i, min_coord, max_coord - min_coord + 1);
1404  }
1406  // If T is void, we need to do runtime dispatch to an
1407  // appropriately-typed lambda. We're copying, so we only care
1408  // about the element size. (If not, this should optimize away
1409  // into a static dispatch to the right-sized copy.)
1410  if (T_is_void ? (type().bytes() == 1) : (sizeof(not_void_T) == 1)) {
1411  using MemType = uint8_t;
1412  auto &typed_dst = (Buffer<MemType, Dims, InClassDimStorage> &)dst;
1413  auto &typed_src = (Buffer<const MemType, D2, S2> &)src;
1414  typed_dst.for_each_value([&](MemType &dst, MemType src) { dst = src; }, typed_src);
1415  } else if (T_is_void ? (type().bytes() == 2) : (sizeof(not_void_T) == 2)) {
1416  using MemType = uint16_t;
1417  auto &typed_dst = (Buffer<MemType, Dims, InClassDimStorage> &)dst;
1418  auto &typed_src = (Buffer<const MemType, D2, S2> &)src;
1419  typed_dst.for_each_value([&](MemType &dst, MemType src) { dst = src; }, typed_src);
1420  } else if (T_is_void ? (type().bytes() == 4) : (sizeof(not_void_T) == 4)) {
1421  using MemType = uint32_t;
1422  auto &typed_dst = (Buffer<MemType, Dims, InClassDimStorage> &)dst;
1423  auto &typed_src = (Buffer<const MemType, D2, S2> &)src;
1424  typed_dst.for_each_value([&](MemType &dst, MemType src) { dst = src; }, typed_src);
1425  } else if (T_is_void ? (type().bytes() == 8) : (sizeof(not_void_T) == 8)) {
1426  using MemType = uint64_t;
1427  auto &typed_dst = (Buffer<MemType, Dims, InClassDimStorage> &)dst;
1428  auto &typed_src = (Buffer<const MemType, D2, S2> &)src;
1429  typed_dst.for_each_value([&](MemType &dst, MemType src) { dst = src; }, typed_src);
1430  } else {
1431  assert(false && "type().bytes() must be 1, 2, 4, or 8");
1432  }
1433  set_host_dirty();
1434  }
1436  /** Make an image that refers to a sub-range of this image along
1437  * the given dimension. Asserts that the crop region is within
1438  * the existing bounds: you cannot "crop outwards", even if you know there
1439  * is valid Buffer storage (e.g. because you already cropped inwards). */
1440  Buffer<T, Dims, InClassDimStorage> cropped(int d, int min, int extent) const {
1441  // Make a fresh copy of the underlying buffer (but not a fresh
1442  // copy of the allocation, if there is one).
1445  // This guarantees the preexisting device ref is dropped if the
1446  // device_crop call fails and maintains the buffer in a consistent
1447  // state.
1448  im.device_deallocate();
1450  im.crop_host(d, min, extent);
1451  if (buf.device_interface != nullptr) {
1452  complete_device_crop(im);
1453  }
1454  return im;
1455  }
1457  /** Crop an image in-place along the given dimension. This does
1458  * not move any data around in memory - it just changes the min
1459  * and extent of the given dimension. */
1460  void crop(int d, int min, int extent) {
1461  // An optimization for non-device buffers. For the device case,
1462  // a temp buffer is required, so reuse the not-in-place version.
1463  // TODO(zalman|abadams): Are nop crops common enough to special
1464  // case the device part of the if to do nothing?
1465  if (buf.device_interface != nullptr) {
1466  *this = cropped(d, min, extent);
1467  } else {
1468  crop_host(d, min, extent);
1469  }
1470  }
1472  /** Make an image that refers to a sub-rectangle of this image along
1473  * the first N dimensions. Asserts that the crop region is within
1474  * the existing bounds. The cropped image may drop any device handle
1475  * if the device_interface cannot accomplish the crop in-place. */
1476  Buffer<T, Dims, InClassDimStorage> cropped(const std::vector<std::pair<int, int>> &rect) const {
1477  // Make a fresh copy of the underlying buffer (but not a fresh
1478  // copy of the allocation, if there is one).
1481  // This guarantees the preexisting device ref is dropped if the
1482  // device_crop call fails and maintains the buffer in a consistent
1483  // state.
1484  im.device_deallocate();
1486  im.crop_host(rect);
1487  if (buf.device_interface != nullptr) {
1488  complete_device_crop(im);
1489  }
1490  return im;
1491  }
1493  /** Crop an image in-place along the first N dimensions. This does
1494  * not move any data around in memory, nor does it free memory. It
1495  * just rewrites the min/extent of each dimension to refer to a
1496  * subregion of the same allocation. */
1497  void crop(const std::vector<std::pair<int, int>> &rect) {
1498  // An optimization for non-device buffers. For the device case,
1499  // a temp buffer is required, so reuse the not-in-place version.
1500  // TODO(zalman|abadams): Are nop crops common enough to special
1501  // case the device part of the if to do nothing?
1502  if (buf.device_interface != nullptr) {
1503  *this = cropped(rect);
1504  } else {
1505  crop_host(rect);
1506  }
1507  }
1509  /** Make an image which refers to the same data using
1510  * translated coordinates in the given dimension. Positive values
1511  * move the image data to the right or down relative to the
1512  * coordinate system. Drops any device handle. */
1515  im.translate(d, dx);
1516  return im;
1517  }
1519  /** Translate an image in-place along one dimension by changing
1520  * how it is indexed. Does not move any data around in memory. */
1521  void translate(int d, int delta) {
1522  assert(d >= 0 && d < this->dimensions());
1523  device_deallocate();
1524  buf.dim[d].min += delta;
1525  }
1527  /** Make an image which refers to the same data translated along
1528  * the first N dimensions. */
1529  Buffer<T, Dims, InClassDimStorage> translated(const std::vector<int> &delta) const {
1531  im.translate(delta);
1532  return im;
1533  }
1535  /** Translate an image along the first N dimensions by changing
1536  * how it is indexed. Does not move any data around in memory. */
1537  void translate(const std::vector<int> &delta) {
1538  device_deallocate();
1539  assert(delta.size() <= static_cast<decltype(delta.size())>(std::numeric_limits<int>::max()));
1540  int limit = (int)delta.size();
1541  assert(limit <= dimensions());
1542  for (int i = 0; i < limit; i++) {
1543  translate(i, delta[i]);
1544  }
1545  }
1547  /** Set the min coordinate of an image in the first N dimensions. */
1548  // @{
1549  void set_min(const std::vector<int> &mins) {
1550  assert(mins.size() <= static_cast<decltype(mins.size())>(dimensions()));
1551  device_deallocate();
1552  for (size_t i = 0; i < mins.size(); i++) {
1553  buf.dim[i].min = mins[i];
1554  }
1555  }
template<typename... Args>
void set_min(Args... args) {
    // Pack the arguments into a vector and defer to the vector overload.
    const std::vector<int> packed{args...};
    set_min(packed);
}
1561  // @}
1563  /** Test if a given coordinate is within the bounds of an image. */
1564  // @{
1565  bool contains(const std::vector<int> &coords) const {
1566  assert(coords.size() <= static_cast<decltype(coords.size())>(dimensions()));
1567  for (size_t i = 0; i < coords.size(); i++) {
1568  if (coords[i] < dim((int)i).min() || coords[i] > dim((int)i).max()) {
1569  return false;
1570  }
1571  }
1572  return true;
1573  }
1575  template<typename... Args>
1576  bool contains(Args... args) const {
1577  return contains(std::vector<int>{args...});
1578  }
1579  // @}
1581  /** Make a buffer which refers to the same data in the same layout
1582  * using a swapped indexing order for the dimensions given. So
1583  * A = B.transposed(0, 1) means that A(i, j) == B(j, i), and more
1584  * strongly that A.address_of(i, j) == B.address_of(j, i). */
1587  im.transpose(d1, d2);
1588  return im;
1589  }
1591  /** Transpose a buffer in-place by changing how it is indexed. For
1592  * example, transpose(0, 1) on a two-dimensional buffer means that
1593  * the value referred to by coordinates (i, j) is now reached at
1594  * the coordinates (j, i), and vice versa. This is done by
1595  * reordering the per-dimension metadata rather than by moving
1596  * data around in memory, so other views of the same memory will
1597  * not see the data as having been transposed. */
1598  void transpose(int d1, int d2) {
1599  assert(d1 >= 0 && d1 < this->dimensions());
1600  assert(d2 >= 0 && d2 < this->dimensions());
1601  std::swap(buf.dim[d1], buf.dim[d2]);
1602  }
1604  /** A generalized transpose: instead of swapping two dimensions,
1605  * pass a vector that lists each dimension index exactly once, in
1606  * the desired order. This does not move any data around in memory
1607  * - it just permutes how it is indexed. */
1608  void transpose(const std::vector<int> &order) {
1609  assert((int)order.size() == dimensions());
1610  if (dimensions() < 2) {
1611  // My, that was easy
1612  return;
1613  }
1615  std::vector<int> order_sorted = order;
1616  for (size_t i = 1; i < order_sorted.size(); i++) {
1617  for (size_t j = i; j > 0 && order_sorted[j - 1] > order_sorted[j]; j--) {
1618  std::swap(order_sorted[j], order_sorted[j - 1]);
1619  transpose(j, j - 1);
1620  }
1621  }
1622  }
1624  /** Make a buffer which refers to the same data in the same
1625  * layout using a different ordering of the dimensions. */
1626  Buffer<T, Dims, InClassDimStorage> transposed(const std::vector<int> &order) const {
1628  im.transpose(order);
1629  return im;
1630  }
1632  /** Make a lower-dimensional buffer that refers to one slice of
1633  * this buffer. */
1634  Buffer<T, (Dims == AnyDims ? AnyDims : Dims - 1)>
1635  sliced(int d, int pos) const {
1636  static_assert(Dims == AnyDims || Dims > 0, "Cannot slice a 0-dimensional buffer");
1637  assert(dimensions() > 0);
1641  // This guarantees the preexisting device ref is dropped if the
1642  // device_slice call fails and maintains the buffer in a consistent
1643  // state.
1644  im.device_deallocate();
1646  im.slice_host(d, pos);
1647  if (buf.device_interface != nullptr) {
1648  complete_device_slice(im, d, pos);
1649  }
1650  return im;
1651  }
1653  /** Make a lower-dimensional buffer that refers to one slice of this
1654  * buffer at the dimension's minimum. */
1655  Buffer<T, (Dims == AnyDims ? AnyDims : Dims - 1)>
1656  sliced(int d) const {
1657  static_assert(Dims == AnyDims || Dims > 0, "Cannot slice a 0-dimensional buffer");
1658  assert(dimensions() > 0);
1660  return sliced(d, dim(d).min());
1661  }
1663  /** Rewrite the buffer to refer to a single lower-dimensional
1664  * slice of itself along the given dimension at the given
1665  * coordinate. Does not move any data around or free the original
1666  * memory, so other views of the same data are unaffected. Can
1667  * only be called on a Buffer with dynamic dimensionality. */
1668  void slice(int d, int pos) {
1669  static_assert(Dims == AnyDims, "Cannot call slice() on a Buffer with static dimensionality.");
1670  assert(dimensions() > 0);
1672  // An optimization for non-device buffers. For the device case,
1673  // a temp buffer is required, so reuse the not-in-place version.
1674  // TODO(zalman|abadams): Are nop slices common enough to special
1675  // case the device part of the if to do nothing?
1676  if (buf.device_interface != nullptr) {
1677  *this = sliced(d, pos);
1678  } else {
1679  slice_host(d, pos);
1680  }
1681  }
1683  /** Slice a buffer in-place at the dimension's minimum. */
1684  inline void slice(int d) {
1685  slice(d, dim(d).min());
1686  }
1688  /** Make a new buffer that views this buffer as a single slice in a
1689  * higher-dimensional space. The new dimension has extent one and
1690  * the given min. This operation is the opposite of slice. As an
1691  * example, the following condition is true:
1692  *
1693  \code
1694  im2 = im.embedded(1, 17);
1695  &im(x, y, c) == &im2(x, 17, y, c);
1696  \endcode
1697  */
1698  Buffer<T, (Dims == AnyDims ? AnyDims : Dims + 1)>
1699  embedded(int d, int pos = 0) const {
1701  im.embed(d, pos);
1702  return im;
1703  }
1705  /** Embed a buffer in-place, increasing the
1706  * dimensionality. */
1707  void embed(int d, int pos = 0) {
1708  static_assert(Dims == AnyDims, "Cannot call embed() on a Buffer with static dimensionality.");
1709  assert(d >= 0 && d <= dimensions());
1710  add_dimension();
1711  translate(dimensions() - 1, pos);
1712  for (int i = dimensions() - 1; i > d; i--) {
1713  transpose(i, i - 1);
1714  }
1715  }
1717  /** Add a new dimension with a min of zero and an extent of
1718  * one. The stride is the extent of the outermost dimension times
1719  * its stride. The new dimension is the last dimension. This is a
1720  * special case of embed. */
1721  void add_dimension() {
1722  static_assert(Dims == AnyDims, "Cannot call add_dimension() on a Buffer with static dimensionality.");
1723  const int dims = buf.dimensions;
1724  buf.dimensions++;
1725  if (buf.dim != shape) {
1726  // We're already on the heap. Reallocate.
1727  halide_dimension_t *new_shape = new halide_dimension_t[buf.dimensions];
1728  for (int i = 0; i < dims; i++) {
1729  new_shape[i] = buf.dim[i];
1730  }
1731  delete[] buf.dim;
1732  buf.dim = new_shape;
1733  } else if (dims == InClassDimStorage) {
1734  // Transition from the in-class storage to the heap
1735  make_shape_storage(buf.dimensions);
1736  for (int i = 0; i < dims; i++) {
1737  buf.dim[i] = shape[i];
1738  }
1739  } else {
1740  // We still fit in the class
1741  }
1742  buf.dim[dims] = {0, 1, 0};
1743  if (dims == 0) {
1744  buf.dim[dims].stride = 1;
1745  } else {
1746  buf.dim[dims].stride = buf.dim[dims - 1].extent * buf.dim[dims - 1].stride;
1747  }
1748  }
1750  /** Add a new dimension with a min of zero, an extent of one, and
1751  * the specified stride. The new dimension is the last
1752  * dimension. This is a special case of embed. */
1754  add_dimension();
1755  buf.dim[buf.dimensions - 1].stride = s;
1756  }
1758  /** Methods for managing any GPU allocation. */
1759  // @{
1760  // Set the host dirty flag. Called by every operator()
1761  // access. Must be inlined so it can be hoisted out of loops.
1763  void set_host_dirty(bool v = true) {
1764  assert((!v || !device_dirty()) && "Cannot set host dirty when device is already dirty. Call copy_to_host() before accessing the buffer from host.");
1765  buf.set_host_dirty(v);
1766  }
1768  // Check if the device allocation is dirty. Called by
1769  // set_host_dirty, which is called by every accessor. Must be
1770  // inlined so it can be hoisted out of loops.
1772  bool device_dirty() const {
1773  return buf.device_dirty();
1774  }
1776  bool host_dirty() const {
1777  return buf.host_dirty();
1778  }
1780  void set_device_dirty(bool v = true) {
1781  assert((!v || !host_dirty()) && "Cannot set device dirty when host is already dirty.");
1782  buf.set_device_dirty(v);
1783  }
1785  int copy_to_host(void *ctx = nullptr) {
1786  if (device_dirty()) {
1787  return buf.device_interface->copy_to_host(ctx, &buf);
1788  }
1790  }
1792  int copy_to_device(const struct halide_device_interface_t *device_interface, void *ctx = nullptr) {
1793  if (host_dirty()) {
1794  return device_interface->copy_to_device(ctx, &buf, device_interface);
1795  }
1797  }
1799  int device_malloc(const struct halide_device_interface_t *device_interface, void *ctx = nullptr) {
1800  return device_interface->device_malloc(ctx, &buf, device_interface);
1801  }
1803  int device_free(void *ctx = nullptr) {
1804  if (dev_ref_count) {
1805  assert(dev_ref_count->ownership == BufferDeviceOwnership::Allocated &&
1806  "Can't call device_free on an unmanaged or wrapped native device handle. "
1807  "Free the source allocation or call device_detach_native instead.");
1808  // Multiple people may be holding onto this dev field
1809  assert(dev_ref_count->count == 1 &&
1810  "Multiple Halide::Runtime::Buffer objects share this device "
1811  "allocation. Freeing it would create dangling references. "
1812  "Don't call device_free on Halide buffers that you have copied or "
1813  "passed by value.");
1814  }
1815  int ret = halide_error_code_success;
1816  if (buf.device_interface) {
1817  ret = buf.device_interface->device_free(ctx, &buf);
1818  }
1819  if (dev_ref_count) {
1820  delete dev_ref_count;
1821  dev_ref_count = nullptr;
1822  }
1823  return ret;
1824  }
1826  int device_wrap_native(const struct halide_device_interface_t *device_interface,
1827  uint64_t handle, void *ctx = nullptr) {
1828  assert(device_interface);
1829  dev_ref_count = new DeviceRefCount;
1831  return device_interface->wrap_native(ctx, &buf, handle, device_interface);
1832  }
1834  int device_detach_native(void *ctx = nullptr) {
1835  assert(dev_ref_count &&
1836  dev_ref_count->ownership == BufferDeviceOwnership::WrappedNative &&
1837  "Only call device_detach_native on buffers wrapping a native "
1838  "device handle via device_wrap_native. This buffer was allocated "
1839  "using device_malloc, or is unmanaged. "
1840  "Call device_free or free the original allocation instead.");
1841  // Multiple people may be holding onto this dev field
1842  assert(dev_ref_count->count == 1 &&
1843  "Multiple Halide::Runtime::Buffer objects share this device "
1844  "allocation. Freeing it could create dangling references. "
1845  "Don't call device_detach_native on Halide buffers that you "
1846  "have copied or passed by value.");
1847  int ret = halide_error_code_success;
1848  if (buf.device_interface) {
1849  ret = buf.device_interface->detach_native(ctx, &buf);
1850  }
1851  delete dev_ref_count;
1852  dev_ref_count = nullptr;
1853  return ret;
1854  }
1856  int device_and_host_malloc(const struct halide_device_interface_t *device_interface, void *ctx = nullptr) {
1857  return device_interface->device_and_host_malloc(ctx, &buf, device_interface);
1858  }
1860  int device_and_host_free(const struct halide_device_interface_t *device_interface, void *ctx = nullptr) {
1861  if (dev_ref_count) {
1862  assert(dev_ref_count->ownership == BufferDeviceOwnership::AllocatedDeviceAndHost &&
1863  "Can't call device_and_host_free on a device handle not allocated with device_and_host_malloc. "
1864  "Free the source allocation or call device_detach_native instead.");
1865  // Multiple people may be holding onto this dev field
1866  assert(dev_ref_count->count == 1 &&
1867  "Multiple Halide::Runtime::Buffer objects share this device "
1868  "allocation. Freeing it would create dangling references. "
1869  "Don't call device_and_host_free on Halide buffers that you have copied or "
1870  "passed by value.");
1871  }
1872  int ret = halide_error_code_success;
1873  if (buf.device_interface) {
1874  ret = buf.device_interface->device_and_host_free(ctx, &buf);
1875  }
1876  if (dev_ref_count) {
1877  delete dev_ref_count;
1878  dev_ref_count = nullptr;
1879  }
1880  return ret;
1881  }
1883  int device_sync(void *ctx = nullptr) {
1884  return buf.device_sync(ctx);
1885  }
1887  bool has_device_allocation() const {
1888  return buf.device != 0;
1889  }
1891  /** Return the method by which the device field is managed. */
1893  if (dev_ref_count == nullptr) {
1895  }
1896  return dev_ref_count->ownership;
1897  }
1898  // @}
1900  /** If you use the (x, y, c) indexing convention, then Halide
1901  * Buffers are stored planar by default. This function constructs
1902  * an interleaved RGB or RGBA image that can still be indexed
1903  * using (x, y, c). Passing it to a generator requires that the
1904  * generator has been compiled with support for interleaved (also
1905  * known as packed or chunky) memory layouts. */
1906  static Buffer<void, Dims, InClassDimStorage> make_interleaved(halide_type_t t, int width, int height, int channels) {
1907  static_assert(Dims == AnyDims || Dims == 3, "make_interleaved() must be called on a Buffer that can represent 3 dimensions.");
1908  Buffer<void, Dims, InClassDimStorage> im(t, channels, width, height);
1909  // Note that this is equivalent to calling transpose({2, 0, 1}),
1910  // but slightly more efficient.
1911  im.transpose(0, 1);
1912  im.transpose(1, 2);
1913  return im;
1914  }
1916  /** If you use the (x, y, c) indexing convention, then Halide
1917  * Buffers are stored planar by default. This function constructs
1918  * an interleaved RGB or RGBA image that can still be indexed
1919  * using (x, y, c). Passing it to a generator requires that the
1920  * generator has been compiled with support for interleaved (also
1921  * known as packed or chunky) memory layouts. */
1922  static Buffer<T, Dims, InClassDimStorage> make_interleaved(int width, int height, int channels) {
1923  return make_interleaved(static_halide_type(), width, height, channels);
1924  }
1926  /** Wrap an existing interleaved image. */
1927  static Buffer<add_const_if_T_is_const<void>, Dims, InClassDimStorage>
1928  make_interleaved(halide_type_t t, T *data, int width, int height, int channels) {
1929  static_assert(Dims == AnyDims || Dims == 3, "make_interleaved() must be called on a Buffer that can represent 3 dimensions.");
1930  Buffer<add_const_if_T_is_const<void>, Dims, InClassDimStorage> im(t, data, channels, width, height);
1931  im.transpose(0, 1);
1932  im.transpose(1, 2);
1933  return im;
1934  }
1936  /** Wrap an existing interleaved image. */
1937  static Buffer<T, Dims, InClassDimStorage> make_interleaved(T *data, int width, int height, int channels) {
1938  return make_interleaved(static_halide_type(), data, width, height, channels);
1939  }
1941  /** Make a zero-dimensional Buffer */
1943  static_assert(Dims == AnyDims || Dims == 0, "make_scalar() must be called on a Buffer that can represent 0 dimensions.");
1944  Buffer<add_const_if_T_is_const<void>, AnyDims, InClassDimStorage> buf(t, 1);
1945  buf.slice(0, 0);
1946  return buf;
1947  }
1949  /** Make a zero-dimensional Buffer */
1951  static_assert(Dims == AnyDims || Dims == 0, "make_scalar() must be called on a Buffer that can represent 0 dimensions.");
1953  buf.slice(0, 0);
1954  return buf;
1955  }
1957  /** Make a zero-dimensional Buffer that points to non-owned, existing data */
1959  static_assert(Dims == AnyDims || Dims == 0, "make_scalar() must be called on a Buffer that can represent 0 dimensions.");
1961  buf.slice(0, 0);
1962  return buf;
1963  }
1965  /** Make a buffer with the same shape and memory nesting order as
1966  * another buffer. It may have a different type. */
1967  template<typename T2, int D2, int S2>
1969  void *(*allocate_fn)(size_t) = nullptr,
1970  void (*deallocate_fn)(void *) = nullptr) {
1971  static_assert(Dims == D2 || Dims == AnyDims);
1972  const halide_type_t dst_type = T_is_void ? src.type() : halide_type_of<typename std::remove_cv<not_void_T>::type>();
1973  return Buffer<>::make_with_shape_of_helper(dst_type, src.dimensions(), src.buf.dim,
1974  allocate_fn, deallocate_fn);
1975  }
1977 private:
1978  static Buffer<> make_with_shape_of_helper(halide_type_t dst_type,
1979  int dimensions,
1980  halide_dimension_t *shape,
1981  void *(*allocate_fn)(size_t),
1982  void (*deallocate_fn)(void *)) {
1983  // Reorder the dimensions of src to have strides in increasing order
1984  std::vector<int> swaps;
1985  for (int i = dimensions - 1; i > 0; i--) {
1986  for (int j = i; j > 0; j--) {
1987  if (shape[j - 1].stride > shape[j].stride) {
1988  std::swap(shape[j - 1], shape[j]);
1989  swaps.push_back(j);
1990  }
1991  }
1992  }
1994  // Rewrite the strides to be dense (this messes up src, which
1995  // is why we took it by value).
1996  for (int i = 0; i < dimensions; i++) {
1997  if (i == 0) {
1998  shape[i].stride = 1;
1999  } else {
2000  shape[i].stride = shape[i - 1].extent * shape[i - 1].stride;
2001  }
2002  }
2004  // Undo the dimension reordering
2005  while (!swaps.empty()) {
2006  int j = swaps.back();
2007  std::swap(shape[j - 1], shape[j]);
2008  swaps.pop_back();
2009  }
2011  // Use an explicit runtime type, and make dst a Buffer<void>, to allow
2012  // using this method with Buffer<void> for either src or dst.
2013  Buffer<> dst(dst_type, nullptr, dimensions, shape);
2014  dst.allocate(allocate_fn, deallocate_fn);
2016  return dst;
2017  }
2019  template<typename... Args>
2021  ptrdiff_t
2022  offset_of(int d, int first, Args... rest) const {
2024  assert(first >= this->buf.dim[d].min);
2025  assert(first < this->buf.dim[d].min + this->buf.dim[d].extent);
2026 #endif
2027  return offset_of(d + 1, rest...) + (ptrdiff_t)this->buf.dim[d].stride * (first - this->buf.dim[d].min);
2028  }
2031  ptrdiff_t offset_of(int d) const {
2032  return 0;
2033  }
2035  template<typename... Args>
2037  storage_T *
2038  address_of(Args... args) const {
2039  if (T_is_void) {
2040  return (storage_T *)(this->buf.host) + offset_of(0, args...) * type().bytes();
2041  } else {
2042  return (storage_T *)(this->buf.host) + offset_of(0, args...);
2043  }
2044  }
2047  ptrdiff_t offset_of(const int *pos) const {
2048  ptrdiff_t offset = 0;
2049  for (int i = this->dimensions() - 1; i >= 0; i--) {
2051  assert(pos[i] >= this->buf.dim[i].min);
2052  assert(pos[i] < this->buf.dim[i].min + this->buf.dim[i].extent);
2053 #endif
2054  offset += (ptrdiff_t)this->buf.dim[i].stride * (pos[i] - this->buf.dim[i].min);
2055  }
2056  return offset;
2057  }
2060  storage_T *address_of(const int *pos) const {
2061  if (T_is_void) {
2062  return (storage_T *)this->buf.host + offset_of(pos) * type().bytes();
2063  } else {
2064  return (storage_T *)this->buf.host + offset_of(pos);
2065  }
2066  }
2068 public:
2069  /** Get a pointer to the address of the min coordinate. */
2070  T *data() const {
2071  return (T *)(this->buf.host);
2072  }
2074  /** Access elements. Use im(...) to get a reference to an element,
2075  * and use &im(...) to get the address of an element. If you pass
2076  * fewer arguments than the buffer has dimensions, the rest are
2077  * treated as their min coordinate. The non-const versions set the
2078  * host_dirty flag to true.
2079  */
2080  //@{
2081  template<typename... Args,
2082  typename = typename std::enable_if<AllInts<Args...>::value>::type>
2083  HALIDE_ALWAYS_INLINE const not_void_T &operator()(int first, Args... rest) const {
2084  static_assert(!T_is_void,
2085  "Cannot use operator() on Buffer<void> types");
2086  constexpr int expected_dims = 1 + (int)(sizeof...(rest));
2087  static_assert(Dims == AnyDims || Dims == expected_dims, "Buffer with static dimensions was accessed with the wrong number of coordinates in operator()");
2088  assert(!device_dirty());
2089  return *((const not_void_T *)(address_of(first, rest...)));
2090  }
2093  const not_void_T &
2094  operator()() const {
2095  static_assert(!T_is_void,
2096  "Cannot use operator() on Buffer<void> types");
2097  constexpr int expected_dims = 0;
2098  static_assert(Dims == AnyDims || Dims == expected_dims, "Buffer with static dimensions was accessed with the wrong number of coordinates in operator()");
2099  assert(!device_dirty());
2100  return *((const not_void_T *)(data()));
2101  }
2104  const not_void_T &
2105  operator()(const int *pos) const {
2106  static_assert(!T_is_void,
2107  "Cannot use operator() on Buffer<void> types");
2108  assert(!device_dirty());
2109  return *((const not_void_T *)(address_of(pos)));
2110  }
2112  template<typename... Args,
2113  typename = typename std::enable_if<AllInts<Args...>::value>::type>
2115  not_void_T &
2116  operator()(int first, Args... rest) {
2117  static_assert(!T_is_void,
2118  "Cannot use operator() on Buffer<void> types");
2119  constexpr int expected_dims = 1 + (int)(sizeof...(rest));
2120  static_assert(Dims == AnyDims || Dims == expected_dims, "Buffer with static dimensions was accessed with the wrong number of coordinates in operator()");
2121  set_host_dirty();
2122  return *((not_void_T *)(address_of(first, rest...)));
2123  }
2126  not_void_T &
2128  static_assert(!T_is_void,
2129  "Cannot use operator() on Buffer<void> types");
2130  constexpr int expected_dims = 0;
2131  static_assert(Dims == AnyDims || Dims == expected_dims, "Buffer with static dimensions was accessed with the wrong number of coordinates in operator()");
2132  set_host_dirty();
2133  return *((not_void_T *)(data()));
2134  }
2137  not_void_T &
2138  operator()(const int *pos) {
2139  static_assert(!T_is_void,
2140  "Cannot use operator() on Buffer<void> types");
2141  set_host_dirty();
2142  return *((not_void_T *)(address_of(pos)));
2143  }
2144  // @}
2146  /** Tests that all values in this buffer are equal to val. */
2147  bool all_equal(not_void_T val) const {
2148  bool all_equal = true;
2149  for_each_element([&](const int *pos) { all_equal &= (*this)(pos) == val; });
2150  return all_equal;
2151  }
2154  set_host_dirty();
2155  for_each_value([=](T &v) { v = val; });
2156  return *this;
2157  }
2159 private:
2160  /** Helper functions for for_each_value. */
2161  // @{
// One loop level of a for_each_value traversal over N buffers.
template<int N>
struct for_each_value_task_dim {
    // Number of iterations of this loop level.
    std::ptrdiff_t extent;
    // Per-buffer pointer increment applied after each iteration.
    std::ptrdiff_t stride[N];
};
2168  // Given an array of strides, and a bunch of pointers to pointers
2169  // (all of different types), advance the pointers using the
2170  // strides.
2171  template<typename Ptr, typename... Ptrs>
2172  HALIDE_ALWAYS_INLINE static void advance_ptrs(const std::ptrdiff_t *stride, Ptr &ptr, Ptrs &...ptrs) {
2173  ptr += *stride;
2174  advance_ptrs(stride + 1, ptrs...);
2175  }
// End of the advance_ptrs recursion: no pointers are left to advance.
static void advance_ptrs(const std::ptrdiff_t *) {
}
2181  template<typename Fn, typename Ptr, typename... Ptrs>
2182  HALIDE_NEVER_INLINE static void for_each_value_helper(Fn &&f, int d, bool innermost_strides_are_one,
2183  const for_each_value_task_dim<sizeof...(Ptrs) + 1> *t, Ptr ptr, Ptrs... ptrs) {
2184  if (d == 0) {
2185  if (innermost_strides_are_one) {
2186  Ptr end = ptr + t[0].extent;
2187  while (ptr != end) {
2188  f(*ptr++, (*ptrs++)...);
2189  }
2190  } else {
2191  for (std::ptrdiff_t i = t[0].extent; i != 0; i--) {
2192  f(*ptr, (*ptrs)...);
2193  advance_ptrs(t[0].stride, ptr, ptrs...);
2194  }
2195  }
2196  } else {
2197  for (std::ptrdiff_t i = t[d].extent; i != 0; i--) {
2198  for_each_value_helper(f, d - 1, innermost_strides_are_one, t, ptr, ptrs...);
2199  advance_ptrs(t[d].stride, ptr, ptrs...);
2200  }
2201  }
2202  }
2204  // Return pair is <new_dimensions, innermost_strides_are_one>
2205  template<int N>
2206  HALIDE_NEVER_INLINE static std::pair<int, bool> for_each_value_prep(for_each_value_task_dim<N> *t,
2207  const halide_buffer_t **buffers) {
2208  const int dimensions = buffers[0]->dimensions;
2209  assert(dimensions > 0);
2211  // Check the buffers all have clean host allocations
2212  for (int i = 0; i < N; i++) {
2213  if (buffers[i]->device) {
2214  assert(buffers[i]->host &&
2215  "Buffer passed to for_each_value has device allocation but no host allocation. Call allocate() and copy_to_host() first");
2216  assert(!buffers[i]->device_dirty() &&
2217  "Buffer passed to for_each_value is dirty on device. Call copy_to_host() first");
2218  } else {
2219  assert(buffers[i]->host &&
2220  "Buffer passed to for_each_value has no host or device allocation");
2221  }
2222  }
2224  // Extract the strides in all the dimensions
2225  for (int i = 0; i < dimensions; i++) {
2226  for (int j = 0; j < N; j++) {
2227  assert(buffers[j]->dimensions == dimensions);
2228  assert(buffers[j]->dim[i].extent == buffers[0]->dim[i].extent &&
2229  buffers[j]->dim[i].min == buffers[0]->dim[i].min);
2230  const int s = buffers[j]->dim[i].stride;
2231  t[i].stride[j] = s;
2232  }
2233  t[i].extent = buffers[0]->dim[i].extent;
2235  // Order the dimensions by stride, so that the traversal is cache-coherent.
2236  // Use the last dimension for this, because this is the source in copies.
2237  // It appears to be better to optimize read order than write order.
2238  for (int j = i; j > 0 && t[j].stride[N - 1] < t[j - 1].stride[N - 1]; j--) {
2239  std::swap(t[j], t[j - 1]);
2240  }
2241  }
2243  // flatten dimensions where possible to make a larger inner
2244  // loop for autovectorization.
2245  int d = dimensions;
2246  for (int i = 1; i < d; i++) {
2247  bool flat = true;
2248  for (int j = 0; j < N; j++) {
2249  flat = flat && t[i - 1].stride[j] * t[i - 1].extent == t[i].stride[j];
2250  }
2251  if (flat) {
2252  t[i - 1].extent *= t[i].extent;
2253  for (int j = i; j < d - 1; j++) {
2254  t[j] = t[j + 1];
2255  }
2256  i--;
2257  d--;
2258  }
2259  }
2261  // Note that we assert() that dimensions > 0 above
2262  // (our one-and-only caller will only call us that way)
2263  // so the unchecked access to t[0] should be safe.
2264  bool innermost_strides_are_one = true;
2265  for (int i = 0; i < N; i++) {
2266  innermost_strides_are_one &= (t[0].stride[i] == 1);
2267  }
2269  return {d, innermost_strides_are_one};
2270  }
2272  template<typename Fn, typename... Args, int N = sizeof...(Args) + 1>
2273  void for_each_value_impl(Fn &&f, Args &&...other_buffers) const {
2274  if (dimensions() > 0) {
2275  const size_t alloc_size = dimensions() * sizeof(for_each_value_task_dim<N>);
2276  Buffer<>::for_each_value_task_dim<N> *t =
2277  (Buffer<>::for_each_value_task_dim<N> *)HALIDE_ALLOCA(alloc_size);
2278  // Move the preparatory code into a non-templated helper to
2279  // save code size.
2280  const halide_buffer_t *buffers[] = {&buf, (&other_buffers.buf)...};
2281  auto [new_dims, innermost_strides_are_one] = Buffer<>::for_each_value_prep(t, buffers);
2282  if (new_dims > 0) {
2283  Buffer<>::for_each_value_helper(f, new_dims - 1,
2284  innermost_strides_are_one,
2285  t,
2286  data(), (other_buffers.data())...);
2287  return;
2288  }
2289  // else fall thru
2290  }
2292  // zero-dimensional case
2293  f(*data(), (*other_buffers.data())...);
2294  }
2295  // @}
2297 public:
2298  /** Call a function on every value in the buffer, and the
2299  * corresponding values in some number of other buffers of the
2300  * same size. The function should take a reference, const
2301  * reference, or value of the correct type for each buffer. This
2302  * effectively lifts a function of scalars to an element-wise
2303  * function of buffers. This produces code that the compiler can
2304  * autovectorize. This is slightly cheaper than for_each_element,
2305  * because it does not need to track the coordinates.
2306  *
2307  * Note that constness of Buffers is preserved: a const Buffer<T> (for either
2308  * 'this' or the other-buffers arguments) will allow mutation of the
2309  * buffer contents, while a Buffer<const T> will not. Attempting to specify
2310  * a mutable reference for the lambda argument of a Buffer<const T>
2311  * will result in a compilation error. */
2312  // @{
2313  template<typename Fn, typename... Args, int N = sizeof...(Args) + 1>
2314  HALIDE_ALWAYS_INLINE const Buffer<T, Dims, InClassDimStorage> &for_each_value(Fn &&f, Args &&...other_buffers) const {
2315  for_each_value_impl(f, std::forward<Args>(other_buffers)...);
2316  return *this;
2317  }
2319  template<typename Fn, typename... Args, int N = sizeof...(Args) + 1>
2322  for_each_value(Fn &&f, Args &&...other_buffers) {
2323  for_each_value_impl(f, std::forward<Args>(other_buffers)...);
2324  return *this;
2325  }
2326  // @}
2328 private:
// Helper type for for_each_element: the inclusive coordinate range
// iterated over in one dimension.
struct for_each_element_task_dim {
    int min;
    int max;
};
2334  /** If f is callable with this many args, call it. The first
2335  * argument is just to make the overloads distinct. Actual
2336  * overload selection is done using the enable_if. */
2337  template<typename Fn,
2338  typename... Args,
2339  typename = decltype(std::declval<Fn>()(std::declval<Args>()...))>
2340  HALIDE_ALWAYS_INLINE static void for_each_element_variadic(int, int, const for_each_element_task_dim *, Fn &&f, Args... args) {
2341  f(args...);
2342  }
2344  /** If the above overload is impossible, we add an outer loop over
2345  * an additional argument and try again. */
2346  template<typename Fn,
2347  typename... Args>
2348  HALIDE_ALWAYS_INLINE static void for_each_element_variadic(double, int d, const for_each_element_task_dim *t, Fn &&f, Args... args) {
2349  for (int i = t[d].min; i <= t[d].max; i++) {
2350  for_each_element_variadic(0, d - 1, t, std::forward<Fn>(f), i, args...);
2351  }
2352  }
2354  /** Determine the minimum number of arguments a callable can take
2355  * using the same trick. */
2356  template<typename Fn,
2357  typename... Args,
2358  typename = decltype(std::declval<Fn>()(std::declval<Args>()...))>
2359  HALIDE_ALWAYS_INLINE static int num_args(int, Fn &&, Args...) {
2360  return (int)(sizeof...(Args));
2361  }
2363  /** The recursive version is only enabled up to a recursion limit
2364  * of 256. This catches callables that aren't callable with any
2365  * number of ints. */
2366  template<typename Fn,
2367  typename... Args>
2368  HALIDE_ALWAYS_INLINE static int num_args(double, Fn &&f, Args... args) {
2369  static_assert(sizeof...(args) <= 256,
2370  "Callable passed to for_each_element must accept either a const int *,"
2371  " or up to 256 ints. No such operator found. Expect infinite template recursion.");
2372  return num_args(0, std::forward<Fn>(f), 0, args...);
2373  }
2375  /** A version where the callable takes a position array instead,
2376  * with compile-time recursion on the dimensionality. This
2377  * overload is preferred to the one below using the same int vs
2378  * double trick as above, but is impossible once d hits -1 using
2379  * std::enable_if. */
2380  template<int d,
2381  typename Fn,
2382  typename = typename std::enable_if<(d >= 0)>::type>
2383  HALIDE_ALWAYS_INLINE static void for_each_element_array_helper(int, const for_each_element_task_dim *t, Fn &&f, int *pos) {
2384  for (pos[d] = t[d].min; pos[d] <= t[d].max; pos[d]++) {
2385  for_each_element_array_helper<d - 1>(0, t, std::forward<Fn>(f), pos);
2386  }
2387  }
2389  /** Base case for recursion above. */
2390  template<int d,
2391  typename Fn,
2392  typename = typename std::enable_if<(d < 0)>::type>
2393  HALIDE_ALWAYS_INLINE static void for_each_element_array_helper(double, const for_each_element_task_dim *t, Fn &&f, int *pos) {
2394  f(pos);
2395  }
2397  /** A run-time-recursive version (instead of
2398  * compile-time-recursive) that requires the callable to take a
2399  * pointer to a position array instead. Dispatches to the
2400  * compile-time-recursive version once the dimensionality gets
2401  * small. */
2402  template<typename Fn>
2403  static void for_each_element_array(int d, const for_each_element_task_dim *t, Fn &&f, int *pos) {
2404  if (d == -1) {
2405  f(pos);
2406  } else if (d == 0) {
2407  // Once the dimensionality gets small enough, dispatch to
2408  // a compile-time-recursive version for better codegen of
2409  // the inner loops.
2410  for_each_element_array_helper<0, Fn>(0, t, std::forward<Fn>(f), pos);
2411  } else if (d == 1) {
2412  for_each_element_array_helper<1, Fn>(0, t, std::forward<Fn>(f), pos);
2413  } else if (d == 2) {
2414  for_each_element_array_helper<2, Fn>(0, t, std::forward<Fn>(f), pos);
2415  } else if (d == 3) {
2416  for_each_element_array_helper<3, Fn>(0, t, std::forward<Fn>(f), pos);
2417  } else {
2418  for (pos[d] = t[d].min; pos[d] <= t[d].max; pos[d]++) {
2419  for_each_element_array(d - 1, t, std::forward<Fn>(f), pos);
2420  }
2421  }
2422  }
2424  /** We now have two overloads for for_each_element. This one
2425  * triggers if the callable takes a const int *.
2426  */
2427  template<typename Fn,
2428  typename = decltype(std::declval<Fn>()((const int *)nullptr))>
2429  static void for_each_element(int, int dims, const for_each_element_task_dim *t, Fn &&f, int check = 0) {
2430  const int size = dims * sizeof(int);
2431  int *pos = (int *)HALIDE_ALLOCA(size);
2432  // At least one version of GCC will (incorrectly) report that pos "may be used uninitialized".
2433  // Add this memset to silence it.
2434  memset(pos, 0, size);
2435  for_each_element_array(dims - 1, t, std::forward<Fn>(f), pos);
2436  }
2438  /** This one triggers otherwise. It treats the callable as
2439  * something that takes some number of ints. */
2440  template<typename Fn>
2441  HALIDE_ALWAYS_INLINE static void for_each_element(double, int dims, const for_each_element_task_dim *t, Fn &&f) {
2442  int args = num_args(0, std::forward<Fn>(f));
2443  assert(dims >= args);
2444  for_each_element_variadic(0, args - 1, t, std::forward<Fn>(f));
2445  }
2447  template<typename Fn>
2448  void for_each_element_impl(Fn &&f) const {
2449  for_each_element_task_dim *t =
2450  (for_each_element_task_dim *)HALIDE_ALLOCA(dimensions() * sizeof(for_each_element_task_dim));
2451  for (int i = 0; i < dimensions(); i++) {
2452  t[i].min = dim(i).min();
2453  t[i].max = dim(i).max();
2454  }
2455  for_each_element(0, dimensions(), t, std::forward<Fn>(f));
2456  }
2458 public:
2459  /** Call a function at each site in a buffer. This is likely to be
2460  * much slower than using Halide code to populate a buffer, but is
2461  * convenient for tests. If the function has more arguments than the
2462  * buffer has dimensions, the remaining arguments will be zero. If it
2463  * has fewer arguments than the buffer has dimensions then the last
2464  * few dimensions of the buffer are not iterated over. For example,
2465  * the following code exploits this to set a floating point RGB image
2466  * to red:
2468  \code
2469  Buffer<float, 3> im(100, 100, 3);
2470  im.for_each_element([&](int x, int y) {
2471  im(x, y, 0) = 1.0f;
2472  im(x, y, 1) = 0.0f;
2473  im(x, y, 2) = 0.0f;
2474  });
2475  \endcode
2477  * The compiled code is equivalent to writing a nested for loop,
2478  * and compilers are capable of optimizing it in the same way.
2479  *
2480  * If the callable can be called with an int * as the sole argument,
2481  * that version is called instead. Each location in the buffer is
2482  * passed to it in a coordinate array. This version is higher-overhead
2483  * than the variadic version, but is useful for writing generic code
2484  * that accepts buffers of arbitrary dimensionality. For example, the
2485  * following sets the value at all sites in an arbitrary-dimensional
2486  * buffer to their first coordinate:
2488  \code
2489  im.for_each_element([&](const int *pos) {im(pos) = pos[0];});
2490  \endcode
2492  * It is also possible to use for_each_element to iterate over entire
2493  * rows or columns by cropping the buffer to a single column or row
2494  * respectively and iterating over elements of the result. For example,
2495  * to set the diagonal of the image to 1 by iterating over the columns:
2497  \code
2498  Buffer<float, 3> im(100, 100, 3);
2499  im.sliced(1, 0).for_each_element([&](int x, int c) {
2500  im(x, x, c) = 1.0f;
2501  });
2502  \endcode
2504  * Or, assuming the memory layout is known to be dense per row, one can
2505  * memset each row of an image like so:
2507  \code
2508  Buffer<float, 3> im(100, 100, 3);
2509  im.sliced(0, 0).for_each_element([&](int y, int c) {
2510  memset(&im(0, y, c), 0, sizeof(float) * im.width());
2511  });
2512  \endcode
2514  */
2515  // @{
2516  template<typename Fn>
2518  for_each_element_impl(f);
2519  return *this;
2520  }
2522  template<typename Fn>
2526  for_each_element_impl(f);
2527  return *this;
2528  }
2529  // @}
2531 private:
2532  template<typename Fn>
2533  struct FillHelper {
2534  Fn f;
2537  template<typename... Args,
2538  typename = decltype(std::declval<Fn>()(std::declval<Args>()...))>
2539  void operator()(Args... args) {
2540  (*buf)(args...) = f(args...);
2541  }
2543  FillHelper(Fn &&f, Buffer<T, Dims, InClassDimStorage> *buf)
2544  : f(std::forward<Fn>(f)), buf(buf) {
2545  }
2546  };
2548 public:
2549  /** Fill a buffer by evaluating a callable at every site. The
2550  * callable should look much like a callable passed to
2551  * for_each_element, but it should return the value that should be
2552  * stored to the coordinate corresponding to the arguments. */
2553  template<typename Fn,
2554  typename = typename std::enable_if<!std::is_arithmetic<typename std::decay<Fn>::type>::value>::type>
2556  // We'll go via for_each_element. We need a variadic wrapper lambda.
2557  FillHelper<Fn> wrapper(std::forward<Fn>(f), this);
2558  return for_each_element(wrapper);
2559  }
2561  /** Check if an input buffer passed extern stage is a querying
2562  * bounds. Compared to doing the host pointer check directly,
2563  * this both adds clarity to code and will facilitate moving to
2564  * another representation for bounds query arguments. */
2565  bool is_bounds_query() const {
2566  return buf.is_bounds_query();
2567  }
2569  /** Convenient check to verify that all of the interesting bytes in the Buffer
2570  * are initialized under MSAN. Note that by default, we use for_each_value() here so that
2571  * we skip any unused padding that isn't part of the Buffer; this isn't efficient,
2572  * but in MSAN mode, it doesn't matter. (Pass true for the flag to force check
2573  * the entire Buffer storage.) */
2574  void msan_check_mem_is_initialized(bool entire = false) const {
2575 #if defined(__has_feature)
2576 #if __has_feature(memory_sanitizer)
2577  if (entire) {
2578  __msan_check_mem_is_initialized(data(), size_in_bytes());
2579  } else {
2580  for_each_value([](T &v) { __msan_check_mem_is_initialized(&v, sizeof(T)); ; });
2581  }
2582 #endif
2583 #endif
2584  }
2585 };
2587 } // namespace Runtime
2588 } // namespace Halide
2590 #undef HALIDE_ALLOCA
