Halide
device_buffer_utils.h
Go to the documentation of this file.
1 #ifndef HALIDE_RUNTIME_DEVICE_BUFFER_UTILS_H
2 #define HALIDE_RUNTIME_DEVICE_BUFFER_UTILS_H
3 
4 #include "HalideRuntime.h"
5 #include "device_interface.h"
6 #include "printer.h"
7 
8 namespace Halide {
9 namespace Runtime {
10 namespace Internal {
11 
12 // A host <-> dev copy should be done with the fewest possible number
13 // of contiguous copies to minimize driver overhead. If our
14 // halide_buffer_t has strides larger than its extents (e.g. because
15 // it represents a sub-region of a larger halide_buffer_t) we can't
16 // safely copy it back and forth using a single contiguous copy,
17 // because we'd clobber in-between values that another thread might be
18 // using. In the best case we can do a single contiguous copy, but in
19 // the worst case we need to individually copy over every pixel.
20 //
21 // This problem is made extra difficult by the fact that the ordering
22 // of the dimensions in a halide_buffer_t doesn't relate to memory layout at
23 // all, so the strides could be in any order.
24 //
25 // We solve it by representing a copy job we need to perform as a
26 // device_copy struct. It describes a multi-dimensional array of
27 // copies to perform. Initially it describes copying over a single
28 // pixel at a time. We then try to discover contiguous groups of
29 // copies that can be coalesced into a single larger copy.
30 
// Upper bound on the number of dimensions a copy job can iterate over.
#define MAX_COPY_DIMS 16

// The struct that describes a host <-> dev copy to perform. A copy job is a
// multidimensional array of equally sized contiguous copy tasks; the arrays
// below are indexed by copy dimension, with unused dimensions holding an
// extent of 1 and strides of 0.
struct device_copy {
    // Opaque handles for the source and destination memory.
    uint64_t src, dst;
    // The offset (in bytes) into the source memory at which to start.
    uint64_t src_begin;
    // The multidimensional array of contiguous copy tasks that need to be done.
    uint64_t extent[MAX_COPY_DIMS];
    // The strides (in bytes) that separate adjacent copy tasks in each dimension.
    uint64_t src_stride_bytes[MAX_COPY_DIMS];
    uint64_t dst_stride_bytes[MAX_COPY_DIMS];
    // How many contiguous bytes to copy per task.
    uint64_t chunk_size;
};
46 
47 WEAK void copy_memory_helper(const device_copy &copy, int d, int64_t src_off, int64_t dst_off) {
48  // Skip size-1 dimensions
49  while (d >= 0 && copy.extent[d] == 1) {
50  d--;
51  }
52 
53  if (d == -1) {
54  const void *from = (void *)(copy.src + src_off);
55  void *to = (void *)(copy.dst + dst_off);
56  memcpy(to, from, copy.chunk_size);
57  } else {
58  for (uint64_t i = 0; i < copy.extent[d]; i++) {
59  copy_memory_helper(copy, d - 1, src_off, dst_off);
60  src_off += copy.src_stride_bytes[d];
61  dst_off += copy.dst_stride_bytes[d];
62  }
63  }
64 }
65 
66 WEAK void copy_memory(const device_copy &copy, void *user_context) {
67  // If this is a zero copy buffer, these pointers will be the same.
68  if (copy.src != copy.dst) {
69  copy_memory_helper(copy, MAX_COPY_DIMS - 1, copy.src_begin, 0);
70  } else {
71  debug(user_context) << "copy_memory: no copy needed as pointers are the same.\n";
72  }
73 }
74 
75 // Fills the entire dst buffer, which must be contained within src
77  const halide_buffer_t *dst, bool dst_host) {
78  // Make a copy job representing copying the first pixel only.
79  device_copy c;
80  c.src = src_host ? (uint64_t)src->host : src->device;
81  c.dst = dst_host ? (uint64_t)dst->host : dst->device;
82  c.chunk_size = src->type.bytes();
83  for (int i = 0; i < MAX_COPY_DIMS; i++) {
84  c.extent[i] = 1;
85  c.src_stride_bytes[i] = 0;
86  c.dst_stride_bytes[i] = 0;
87  }
88 
89  // Offset the src base pointer to the right point in its buffer.
90  c.src_begin = 0;
91  for (int i = 0; i < src->dimensions; i++) {
92  c.src_begin += (uint64_t)src->dim[i].stride * (dst->dim[i].min - src->dim[i].min);
93  }
94  c.src_begin *= c.chunk_size;
95 
96  if (src->dimensions != dst->dimensions ||
97  src->type.bytes() != dst->type.bytes() ||
98  dst->dimensions > MAX_COPY_DIMS) {
99  // These conditions should also be checked for outside this fn.
100  device_copy zero = {0};
101  return zero;
102  }
103 
104  if (c.chunk_size == 0) {
105  // This buffer apparently represents no memory. Return a zero'd copy
106  // task.
107  device_copy zero = {0};
108  return zero;
109  }
110 
111  // Now expand it to copy all the pixels (one at a time) by taking
112  // the extents and strides from the halide_buffer_ts. Dimensions
113  // are added to the copy by inserting it such that the stride is
114  // in ascending order in the dst.
115  for (int i = 0; i < dst->dimensions; i++) {
116  // TODO: deal with negative strides.
117  uint64_t dst_stride_bytes = (uint64_t)dst->dim[i].stride * dst->type.bytes();
118  uint64_t src_stride_bytes = (uint64_t)src->dim[i].stride * src->type.bytes();
119  // Insert the dimension sorted into the buffer copy.
120  int insert;
121  for (insert = 0; insert < i; insert++) {
122  // If the stride is 0, we put it at the end because it can't be
123  // folded.
124  if (dst_stride_bytes < c.dst_stride_bytes[insert] && dst_stride_bytes != 0) {
125  break;
126  }
127  }
128  for (int j = i; j > insert; j--) {
129  c.extent[j] = c.extent[j - 1];
130  c.dst_stride_bytes[j] = c.dst_stride_bytes[j - 1];
131  c.src_stride_bytes[j] = c.src_stride_bytes[j - 1];
132  }
133  c.extent[insert] = dst->dim[i].extent;
134  // debug(nullptr) << "c.extent[" << insert << "] = " << (int)(c.extent[insert]) << "\n";
135  c.dst_stride_bytes[insert] = dst_stride_bytes;
136  c.src_stride_bytes[insert] = src_stride_bytes;
137  };
138 
139  // Attempt to fold contiguous dimensions into the chunk
140  // size. Since the dimensions are sorted by stride, and the
141  // strides must be greater than or equal to the chunk size, this
142  // means we can just delete the innermost dimension as long as its
143  // stride in both src and dst is equal to the chunk size.
144  while (c.chunk_size &&
145  c.chunk_size == c.src_stride_bytes[0] &&
146  c.chunk_size == c.dst_stride_bytes[0]) {
147  // Fold the innermost dimension's extent into the chunk_size.
148  c.chunk_size *= c.extent[0];
149 
150  // Erase the innermost dimension from the list of dimensions to
151  // iterate over.
152  for (int j = 1; j < MAX_COPY_DIMS; j++) {
153  c.extent[j - 1] = c.extent[j];
154  c.src_stride_bytes[j - 1] = c.src_stride_bytes[j];
155  c.dst_stride_bytes[j - 1] = c.dst_stride_bytes[j];
156  }
157  c.extent[MAX_COPY_DIMS - 1] = 1;
158  c.src_stride_bytes[MAX_COPY_DIMS - 1] = 0;
159  c.dst_stride_bytes[MAX_COPY_DIMS - 1] = 0;
160  }
161  return c;
162 }
163 
165  return make_buffer_copy(buf, true, buf, false);
166 }
167 
169  return make_buffer_copy(buf, false, buf, true);
170 }
171 
172 // Caller is expected to verify that src->dimensions == dst->dimensions
174  int64_t offset = 0;
175  for (int i = 0; i < src->dimensions; i++) {
176  offset += (dst->dim[i].min - src->dim[i].min) * (int64_t)src->dim[i].stride;
177  }
178  offset *= src->type.bytes();
179  return offset;
180 }
181 
182 // Caller is expected to verify that src->dimensions == dst->dimensions + 1,
183 // and that slice_dim and slice_pos are valid within src
184 ALWAYS_INLINE int64_t calc_device_slice_byte_offset(const struct halide_buffer_t *src, int slice_dim, int slice_pos) {
185  int64_t offset = (slice_pos - src->dim[slice_dim].min) * (int64_t)src->dim[slice_dim].stride;
186  offset *= src->type.bytes();
187  return offset;
188 }
189 
190 } // namespace Internal
191 } // namespace Runtime
192 } // namespace Halide
193 
194 #endif // HALIDE_RUNTIME_DEVICE_BUFFER_UTILS_H
Halide::Runtime::Internal::make_buffer_copy
WEAK device_copy make_buffer_copy(const halide_buffer_t *src, bool src_host, const halide_buffer_t *dst, bool dst_host)
Definition: device_buffer_utils.h:76
Halide::Runtime::Internal::device_copy
Definition: device_buffer_utils.h:33
Halide::Runtime::Internal::copy_memory
WEAK void copy_memory(const device_copy &copy, void *user_context)
Definition: device_buffer_utils.h:66
device_interface.h
halide_buffer_t::dim
halide_dimension_t * dim
The shape of the buffer.
Definition: HalideRuntime.h:1513
Halide::Runtime::Internal::device_copy::dst_stride_bytes
uint64_t dst_stride_bytes[MAX_COPY_DIMS]
Definition: device_buffer_utils.h:42
Halide::Runtime::Internal::device_copy::src_stride_bytes
uint64_t src_stride_bytes[MAX_COPY_DIMS]
Definition: device_buffer_utils.h:41
halide_buffer_t::type
struct halide_type_t type
The type of each buffer element.
Definition: HalideRuntime.h:1506
uint64_t
unsigned __INT64_TYPE__ uint64_t
Definition: runtime_internal.h:23
Halide::Runtime::Internal::device_copy::extent
uint64_t extent[MAX_COPY_DIMS]
Definition: device_buffer_utils.h:39
Halide
This file defines the class FunctionDAG, which is our representation of a Halide pipeline,...
Definition: AbstractGenerator.h:19
halide_buffer_t::dimensions
int32_t dimensions
The dimensionality of the buffer.
Definition: HalideRuntime.h:1509
Halide::LinkageType::Internal
@ Internal
Not visible externally, similar to 'static' linkage in C.
Halide::Runtime::Internal::device_copy::chunk_size
uint64_t chunk_size
Definition: device_buffer_utils.h:44
MAX_COPY_DIMS
#define MAX_COPY_DIMS
Definition: device_buffer_utils.h:32
printer.h
Halide::Runtime::Internal::copy_memory_helper
WEAK void copy_memory_helper(const device_copy &copy, int d, int64_t src_off, int64_t dst_off)
Definition: device_buffer_utils.h:47
int64_t
signed __INT64_TYPE__ int64_t
Definition: runtime_internal.h:22
Halide::Runtime::Internal::calc_device_slice_byte_offset
ALWAYS_INLINE int64_t calc_device_slice_byte_offset(const struct halide_buffer_t *src, int slice_dim, int slice_pos)
Definition: device_buffer_utils.h:184
halide_buffer_t::host
uint8_t * host
A pointer to the start of the data in main memory.
Definition: HalideRuntime.h:1500
Halide::Runtime::Internal::make_device_to_host_copy
WEAK device_copy make_device_to_host_copy(const halide_buffer_t *buf)
Definition: device_buffer_utils.h:168
Halide::Runtime::Internal::device_copy::dst
uint64_t dst
Definition: device_buffer_utils.h:35
Halide::Runtime::Internal::make_host_to_device_copy
WEAK device_copy make_host_to_device_copy(const halide_buffer_t *buf)
Definition: device_buffer_utils.h:164
ALWAYS_INLINE
#define ALWAYS_INLINE
Definition: runtime_internal.h:55
HalideRuntime.h
memcpy
void * memcpy(void *s1, const void *s2, size_t n)
halide_buffer_t
The raw representation of an image passed around by generated Halide code.
Definition: HalideRuntime.h:1490
halide_dimension_t::stride
int32_t stride
Definition: HalideRuntime.h:1471
Halide::Runtime::Internal::device_copy::src
uint64_t src
Definition: device_buffer_utils.h:35
Halide::Runtime::Internal::calc_device_crop_byte_offset
ALWAYS_INLINE int64_t calc_device_crop_byte_offset(const struct halide_buffer_t *src, struct halide_buffer_t *dst)
Definition: device_buffer_utils.h:173
WEAK
#define WEAK
Definition: runtime_internal.h:52
Halide::Runtime::Internal::device_copy::src_begin
uint64_t src_begin
Definition: device_buffer_utils.h:37
halide_dimension_t::min
int32_t min
Definition: HalideRuntime.h:1471
halide_buffer_t::device
uint64_t device
A device-handle for e.g.
Definition: HalideRuntime.h:1492