Halide 21.0.0
Halide compiler and libraries
device_buffer_utils.h
#ifndef HALIDE_RUNTIME_DEVICE_BUFFER_UTILS_H
#define HALIDE_RUNTIME_DEVICE_BUFFER_UTILS_H

#include "HalideRuntime.h"
#include "device_interface.h"
#include "printer.h"

namespace Halide {
namespace Runtime {
namespace Internal {

// A host <-> dev copy should be done with the smallest possible number
// of contiguous copies to minimize driver overhead. If our
// halide_buffer_t has strides larger than its extents (e.g. because
// it represents a sub-region of a larger halide_buffer_t) we can't
// safely copy it back and forth using a single contiguous copy,
// because we'd clobber in-between values that another thread might be
// using. In the best case we can do a single contiguous copy, but in
// the worst case we need to individually copy over every pixel.
//
// This problem is made extra difficult by the fact that the ordering
// of the dimensions in a halide_buffer_t doesn't relate to memory layout at
// all, so the strides could be in any order.
//
// We solve it by representing a copy job we need to perform as a
// device_copy struct. It describes a multi-dimensional array of
// copies to perform. Initially it describes copying over a single
// pixel at a time. We then try to discover contiguous groups of
// copies that can be coalesced into a single larger copy.

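// As a concrete illustration (numbers are hypothetical): copying a 16x16
// crop out of a 256x256 row-major uint8 image starts out as 16*16
// single-byte copy tasks. Within a row each byte's stride equals the chunk
// size (1), so a row folds into a single 16-byte chunk; the rows themselves
// cannot be folded, because adjacent rows of the crop are 256 bytes apart
// in the parent buffer. The job therefore ends up as 16 copies of 16
// contiguous bytes each.
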
// The struct that describes a host <-> dev copy to perform.
#define MAX_COPY_DIMS 16
struct device_copy {
    // opaque handles (host or device) for source and destination memory.
    uint64_t src, dst;
    // The offset in the source and destination memory to start the copy at.
    uint64_t src_begin, dst_begin;
    // The multidimensional array of contiguous copy tasks that need to be done.
    uint64_t extent[MAX_COPY_DIMS];
    // The strides (in bytes) that separate adjacent copy tasks in each dimension.
    uint64_t src_stride_bytes[MAX_COPY_DIMS];
    uint64_t dst_stride_bytes[MAX_COPY_DIMS];
    // How many contiguous bytes to copy per task
    uint64_t chunk_size;
};

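// Recursively walks the copy job described by 'copy', from dimension d down
// to the innermost dimension, issuing one memcpy of chunk_size bytes per
// innermost iteration.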
WEAK void copy_memory_helper(const device_copy &copy, int d, int64_t src_off, int64_t dst_off) {
    if ((d < -1) || (d >= MAX_COPY_DIMS)) {
        return;  // TODO(marcos): we should probably flag an error somehow here
    }

    // Skip size-1 dimensions
    while (d >= 0 && copy.extent[d] == 1) {
        d--;
    }

    if (d == -1) {
        const void *from = (void *)(copy.src + src_off);
        void *to = (void *)(copy.dst + dst_off);
        memcpy(to, from, copy.chunk_size);
    } else {
        for (uint64_t i = 0; i < copy.extent[d]; i++) {
            copy_memory_helper(copy, d - 1, src_off, dst_off);
            src_off += copy.src_stride_bytes[d];
            dst_off += copy.dst_stride_bytes[d];
        }
    }
}

WEAK void copy_memory(const device_copy &copy, void *user_context) {
    // If this is a zero copy buffer, these pointers will be the same.
    if (copy.src != copy.dst) {
        copy_memory_helper(copy, MAX_COPY_DIMS - 1, copy.src_begin, copy.dst_begin);
    } else {
        debug(user_context) << "copy_memory: no copy needed as pointers are the same.\n";
    }
}

// All crops are supported. It copies the maximum number of pixels from src to dst.
// That maximum number of pixels is determined by the overlapping region of the two
// buffers. This means that you can use it in the following scenarios:
// 1) Fill the entire dst buffer, when the dst buffer bounds are contained within src.
// 2) Copy the entire src buffer, when the src buffer bounds are contained within dst, to dst.
// 3) Copy only the overlapping region between two buffers, from src to dst.
WEAK device_copy make_buffer_copy(const halide_buffer_t *src, bool src_host,
                                  const halide_buffer_t *dst, bool dst_host) {
    // Make a copy job representing copying the first pixel only.
    device_copy c;
    c.src = src_host ? (uint64_t)src->host : src->device;
    c.dst = dst_host ? (uint64_t)dst->host : dst->device;
    c.chunk_size = src->type.bytes();
    for (int i = 0; i < MAX_COPY_DIMS; i++) {
        c.extent[i] = 1;
        c.src_stride_bytes[i] = 0;
        c.dst_stride_bytes[i] = 0;
    }

    // Offset the src and dst base pointers to the right point in their buffers.
    c.src_begin = 0;
    c.dst_begin = 0;
    for (int i = 0; i < src->dimensions; i++) {
        int64_t dim_diff = int64_t(dst->dim[i].min - src->dim[i].min);
        if (dim_diff > 0) {
            c.src_begin += (int64_t)src->dim[i].stride * dim_diff;
        } else {
            c.dst_begin += (int64_t)dst->dim[i].stride * (-dim_diff);
        }
    }
    c.src_begin *= c.chunk_size;
    c.dst_begin *= c.chunk_size;

    if (src->dimensions != dst->dimensions ||
        src->type.bytes() != dst->type.bytes() ||
        dst->dimensions > MAX_COPY_DIMS) {
        // These conditions should also be checked for outside this fn.
        device_copy zero = {0};
        return zero;
    }

    if (c.chunk_size == 0) {
        // This buffer apparently represents no memory. Return a zero'd copy
        // task.
        device_copy zero = {0};
        return zero;
    }

    // Now expand it to copy all the pixels (one at a time) by taking
    // the extents and strides from the halide_buffer_ts. Each dimension
    // is inserted into the copy so that the dst strides stay in
    // ascending order.
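    // For example (hypothetical layout): if dst has per-dimension byte
    // strides of {4096, 1, 64} for dims 0..2, the copy dimensions end up
    // ordered by dst stride as {1, 64, 4096}, so the most tightly packed
    // dimension becomes the innermost (and most foldable) one.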
    for (int i = 0; i < dst->dimensions; i++) {
        // TODO: deal with negative strides.
        uint64_t dst_stride_bytes = (uint64_t)dst->dim[i].stride * dst->type.bytes();
        uint64_t src_stride_bytes = (uint64_t)src->dim[i].stride * src->type.bytes();
        // Insert the dimension sorted into the buffer copy.
        int insert;
        for (insert = 0; insert < i; insert++) {
            // If the stride is 0, we put it at the end because it can't be
            // folded.
            if (dst_stride_bytes < c.dst_stride_bytes[insert] && dst_stride_bytes != 0) {
                break;
            }
        }
        for (int j = i; j > insert; j--) {
            c.extent[j] = c.extent[j - 1];
            c.dst_stride_bytes[j] = c.dst_stride_bytes[j - 1];
            c.src_stride_bytes[j] = c.src_stride_bytes[j - 1];
        }
        c.extent[insert] = min(src->dim[i].extent, dst->dim[i].extent);
        // debug(nullptr) << "c.extent[" << insert << "] = " << (int)(c.extent[insert]) << "\n";
        c.dst_stride_bytes[insert] = dst_stride_bytes;
        c.src_stride_bytes[insert] = src_stride_bytes;
    }

    // Attempt to fold contiguous dimensions into the chunk
    // size. Since the dimensions are sorted by stride, and the
    // strides must be greater than or equal to the chunk size, this
    // means we can just delete the innermost dimension as long as its
    // stride in both src and dst is equal to the chunk size.
    while (c.chunk_size &&
           c.chunk_size == c.src_stride_bytes[0] &&
           c.chunk_size == c.dst_stride_bytes[0]) {
        // Fold the innermost dimension's extent into the chunk_size.
        c.chunk_size *= c.extent[0];

        // Erase the innermost dimension from the list of dimensions to
        // iterate over.
        for (int j = 1; j < MAX_COPY_DIMS; j++) {
            c.extent[j - 1] = c.extent[j];
            c.src_stride_bytes[j - 1] = c.src_stride_bytes[j];
            c.dst_stride_bytes[j - 1] = c.dst_stride_bytes[j];
        }
        c.extent[MAX_COPY_DIMS - 1] = 1;
        c.src_stride_bytes[MAX_COPY_DIMS - 1] = 0;
        c.dst_stride_bytes[MAX_COPY_DIMS - 1] = 0;
    }
    return c;
}
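
// A hypothetical caller (not part of this header) could pair make_buffer_copy
// with copy_memory to move the overlapping region between two host-resident
// buffers, e.g.:
//
//     device_copy job = make_buffer_copy(crop, /*src_host*/ true,
//                                        full, /*dst_host*/ true);
//     copy_memory(job, user_context);
//
// Device backends typically build the same device_copy jobs but walk them
// with their own driver copy calls rather than copy_memory's memcpy.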

WEAK device_copy make_host_to_device_copy(const halide_buffer_t *buf) {
    return make_buffer_copy(buf, true, buf, false);
}

WEAK device_copy make_device_to_host_copy(const halide_buffer_t *buf) {
    return make_buffer_copy(buf, false, buf, true);
}

// Caller is expected to verify that src->dimensions == dst->dimensions
ALWAYS_INLINE int64_t calc_device_crop_byte_offset(const struct halide_buffer_t *src, struct halide_buffer_t *dst) {
    int64_t offset = 0;
    for (int i = 0; i < src->dimensions; i++) {
        offset += (int64_t)(dst->dim[i].min - src->dim[i].min) * (int64_t)src->dim[i].stride;
    }
    offset *= src->type.bytes();
    return offset;
}
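
// For instance (hypothetical shapes): if src has mins of 0, element strides
// of {1, 256}, and 2-byte (uint16) elements, then a crop whose dst mins are
// {8, 4} begins (8 * 1 + 4 * 256) * 2 = 2064 bytes into src's allocation.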

// Caller is expected to verify that src->dimensions == dst->dimensions + 1,
// and that slice_dim and slice_pos are valid within src
ALWAYS_INLINE int64_t calc_device_slice_byte_offset(const struct halide_buffer_t *src, int slice_dim, int slice_pos) {
    int64_t offset = (int64_t)(slice_pos - src->dim[slice_dim].min) * (int64_t)src->dim[slice_dim].stride;
    offset *= src->type.bytes();
    return offset;
}

}  // namespace Internal
}  // namespace Runtime
}  // namespace Halide

#endif  // HALIDE_RUNTIME_DEVICE_BUFFER_UTILS_H