Halide 19.0.0
Halide compiler and libraries
Featurization.h
#ifndef FEATURIZATION_H
#define FEATURIZATION_H

#include <cstdint>
#include <cstring>
#include <iostream>

namespace Halide {
namespace Internal {

// The algorithm-specific features. For legacy reasons these are
// called PipelineFeatures in the code.
struct PipelineFeatures {
    static constexpr size_t num_features() {
        return sizeof(PipelineFeatures) / sizeof(int);
    }

    static constexpr uint32_t version() {
        return 3;
    }

    // Access them by index.
    int &operator[](int idx) {
        return ((int *)(this))[idx];
    }

    int operator[](int idx) const {
        return ((const int *)(this))[idx];
    }

    enum class OpType {
        Const,
        Cast,
        Variable,
        Param,
        Add,
        Sub,
        Mod,
        Mul,
        Div,
        Min,
        Max,
        EQ,
        NE,
        LT,
        LE,
        And,
        Or,
        Not,
        Select,
        ImageCall,   // Loads from an input buffer
        FuncCall,    // Calls to another pipeline stage
        SelfCall,    // Recursive calls from a Func to itself
        ExternCall,  // Math intrinsics, typically
        Let,
        NumOpTypes,
    };

    enum class ScalarType {
        Bool,
        UInt8,   // or Int8
        UInt16,  // or Int16
        UInt32,  // or Int32
        UInt64,  // or Int64
        Float,
        Double,
        NumScalarTypes,
    };

    // Not fed into the network, but helps avoid printing huge
    // numbers of zeros while debugging things.
    int types_in_use[(int)ScalarType::NumScalarTypes] = {};

    int op_histogram[(int)OpType::NumOpTypes][(int)ScalarType::NumScalarTypes] = {};

    enum class AccessType {
        LoadFunc,
        LoadSelf,
        LoadImage,
        Store,
        NumAccessTypes,
    };

    // Finer granularity call/store node properties. These are a
    // function of the matrix of derivatives of each arg to a
    // call w.r.t. the loop variables of the Stage. Each row of
    // the matrix corresponds to one of the call arguments. In
    // each case we illustrate such a call, assuming that the
    // variables of this Func are x, y, z, and that the
    // dimension vectorized over is the first (x).

    // Square identity matrix. f(x - 2, y + 8, z + param)
    int pointwise_accesses[(int)AccessType::NumAccessTypes][(int)ScalarType::NumScalarTypes] = {};
    // Square permutation matrix. f(y + 1, z - 3, x)
    int transpose_accesses[(int)AccessType::NumAccessTypes][(int)ScalarType::NumScalarTypes] = {};
    // Each row sums to 1. Each column sums to 1 or 0. f(y, x)
    int broadcast_accesses[(int)AccessType::NumAccessTypes][(int)ScalarType::NumScalarTypes] = {};
    // Each row sums to 1 or 0. Each column sums to 1. f(z, y, x, 4)
    int slice_accesses[(int)AccessType::NumAccessTypes][(int)ScalarType::NumScalarTypes] = {};

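    // To make the classification above concrete, a worked example (illustrative
    // only): for the call f(y + 1, z - 3, x), with loop variables (x, y, z),
    // the matrix of derivatives of the call arguments w.r.t. (x, y, z) is
    //     [0 1 0]    <- d(y + 1)/d(x, y, z)
    //     [0 0 1]    <- d(z - 3)/d(x, y, z)
    //     [1 0 0]    <- d(x)/d(x, y, z)
    // a square permutation matrix, so the access is tallied in
    // transpose_accesses under the appropriate AccessType and ScalarType.
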
    void dump(std::ostream &os) const {
        for (int i = 0; i < (int)ScalarType::NumScalarTypes; i++) {
            const char *type_names[] = {"Bool", "UInt8", "UInt16", "UInt32", "UInt64", "Float", "Double"};
            // Skip printing for types not used
            if (!types_in_use[i]) {
                continue;
            }

            os << "    Featurization for type " << type_names[i] << "\n"
               << "     Op histogram:\n"
               << "      Constant:   " << op_histogram[(int)OpType::Const][i] << "\n"
               << "      Cast:       " << op_histogram[(int)OpType::Cast][i] << "\n"
               << "      Variable:   " << op_histogram[(int)OpType::Variable][i] << "\n"
               << "      Param:      " << op_histogram[(int)OpType::Param][i] << "\n"
               << "      Add:        " << op_histogram[(int)OpType::Add][i] << "\n"
               << "      Sub:        " << op_histogram[(int)OpType::Sub][i] << "\n"
               << "      Mod:        " << op_histogram[(int)OpType::Mod][i] << "\n"
               << "      Mul:        " << op_histogram[(int)OpType::Mul][i] << "\n"
               << "      Div:        " << op_histogram[(int)OpType::Div][i] << "\n"
               << "      Min:        " << op_histogram[(int)OpType::Min][i] << "\n"
               << "      Max:        " << op_histogram[(int)OpType::Max][i] << "\n"
               << "      EQ:         " << op_histogram[(int)OpType::EQ][i] << "\n"
               << "      NE:         " << op_histogram[(int)OpType::NE][i] << "\n"
               << "      LT:         " << op_histogram[(int)OpType::LT][i] << "\n"
               << "      LE:         " << op_histogram[(int)OpType::LE][i] << "\n"
               << "      And:        " << op_histogram[(int)OpType::And][i] << "\n"
               << "      Or:         " << op_histogram[(int)OpType::Or][i] << "\n"
               << "      Not:        " << op_histogram[(int)OpType::Not][i] << "\n"
               << "      Select:     " << op_histogram[(int)OpType::Select][i] << "\n"
               << "      ImageCall:  " << op_histogram[(int)OpType::ImageCall][i] << "\n"
               << "      FuncCall:   " << op_histogram[(int)OpType::FuncCall][i] << "\n"
               << "      SelfCall:   " << op_histogram[(int)OpType::SelfCall][i] << "\n"
               << "      ExternCall: " << op_histogram[(int)OpType::ExternCall][i] << "\n"
               << "      Let:        " << op_histogram[(int)OpType::Let][i] << "\n"
               << "     Memory access patterns. Columns are calls to other Funcs, self-calls, input image access, and stores\n"
               << "      Pointwise: "
               << pointwise_accesses[0][i] << " "
               << pointwise_accesses[1][i] << " "
               << pointwise_accesses[2][i] << " "
               << pointwise_accesses[3][i] << "\n"
               << "      Transpose: "
               << transpose_accesses[0][i] << " "
               << transpose_accesses[1][i] << " "
               << transpose_accesses[2][i] << " "
               << transpose_accesses[3][i] << "\n"
               << "      Broadcast: "
               << broadcast_accesses[0][i] << " "
               << broadcast_accesses[1][i] << " "
               << broadcast_accesses[2][i] << " "
               << broadcast_accesses[3][i] << "\n"
               << "      Slice:     "
               << slice_accesses[0][i] << " "
               << slice_accesses[1][i] << " "
               << slice_accesses[2][i] << " "
               << slice_accesses[3][i] << "\n";
        }
    }
};
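
// Because PipelineFeatures exposes num_features() and a flat operator[], the
// whole featurization can be treated as a contiguous array of ints. A minimal
// usage sketch of that indexing, e.g. to copy the features into a plain buffer
// before handing them to a cost model; the helper name copy_pipeline_features
// is hypothetical and not part of this header:
inline void copy_pipeline_features(const PipelineFeatures &f, int *dst) {
    for (size_t i = 0; i < PipelineFeatures::num_features(); i++) {
        dst[i] = f[(int)i];  // flat, index-based access to every feature
    }
}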

// The schedule-dependent portion of the featurization of a stage
struct ScheduleFeatures {
    static constexpr size_t num_features() {
        return sizeof(ScheduleFeatures) / sizeof(double);
    }

    static constexpr uint32_t version() {
        return 3;
    }

    double &operator[](int idx) {
        return ((double *)(this))[idx];
    }

    double operator[](int idx) const {
        return ((const double *)(this))[idx];
    }

    // The number of times storage for this stage is allocated. The
    // product of outer loops at the store_at site.
    double num_realizations = 0;

    // The number of times a tile of the stage is computed. The
    // product of outer loops at the compute_at site. Always at least
    // as large as num_realizations.
    double num_productions = 0;

    // Number of times the innermost loop happens per allocation.
    double points_computed_per_realization = 0;

    // Number of times the innermost stmt happens per tile computed.
    double points_computed_per_production = 0;

    // The total trip count of the innermost loop over the entire program.
    //   == num_realizations * points_computed_per_realization
    //   ~= num_productions * points_computed_per_production
    // Only approximately equal because of the simplifications made
    // regarding the modeling of sliding window.
    double points_computed_total = 0;

    // The minimum number of points that are actually required to be
    // computed to produce a correct output. Not actually a function
    // of the schedule, but a useful reference point to see if a
    // schedule has gone off the rails.
    double points_computed_minimum = 0;

    // Trip count of the innermost loop nest.
    double innermost_loop_extent = 0;

    // Trip count of just the pure loops in the innermost loop
    // (i.e. excludes loops representing reductions).
    double innermost_pure_loop_extent = 0;

    // If this is to be unrolled, the product of the unrolling factors.
    double unrolled_loop_extent = 0;

    // The number of parallel jobs launched in the production of this
    // stage. Always 1 unless the Func is compute_root, because we
    // place all parallelism at the outermost level.
    double inner_parallelism = 0;

    // The number of times this Func could be realized in parallel. 1
    // when the Func is compute_root. Product of the containing
    // parallel loops for other stages.
    double outer_parallelism = 0;

    // Size of the region computed at the store_at site, measured in
    // bytes. Does not take storage-folding optimizations into account.
    double bytes_at_realization = 0;

    // Size of the region computed per tile (at the compute_at site),
    // measured in bytes. This includes the effect of storage-folding,
    // so it's a better number to look at to estimate memory usage.
    double bytes_at_production = 0;

    // If the stage were hypothetically scheduled at root, how much
    // memory it would consume. Doesn't vary w.r.t. the schedule, but
    // a useful reference.
    double bytes_at_root = 0;

    // Same as the above, but only measuring the extent along the
    // innermost dimension, so that we can reason about spatial
    // locality, cache lines, prefetchers, etc.
    double innermost_bytes_at_realization = 0;
    double innermost_bytes_at_production = 0;
    double innermost_bytes_at_root = 0;

    // For inlined Funcs, how many calls are made to this Func in total.
    double inlined_calls = 0;

    // Number of unique bytes and unique contiguous segments of
    // memory loaded from all inputs over a single trip of the loop
    // containing the allocation site.
    double unique_bytes_read_per_realization = 0;
    double unique_lines_read_per_realization = 0;

    // The sum of the sizes of the allocations accessed at this
    // site. Gives a hint as to the likely locality of it.
    double allocation_bytes_read_per_realization = 0;

    // The sum of the sizes of the temporary allocations while
    // computing one tile of this Func. Probably a good thing if it
    // fits in cache.
    double working_set = 0;

    // The vectorization factor (number of SIMD lanes) to be used to
    // compute this stage. Wasted work if it's smaller than the stage's
    // native vector size.
    double vector_size = 0;

    // The native vector size for the narrowest type used. Does not
    // vary with the schedule, but a useful reference point.
    double native_vector_size = 0;

    // Number of SIMD vectors computed.
    double num_vectors = 0;

    // Number of scalars computed (e.g. from tails of loops).
    double num_scalars = 0;

    // The number of loads done per vector or scalar computed. Vector
    // gathers count as a batch of scalar loads. These get amortized
    // across unrolled blocks if some loads can be reused across the
    // unrolled dimension.
    double scalar_loads_per_vector = 0;
    double vector_loads_per_vector = 0;
    double scalar_loads_per_scalar = 0;

    // The memory footprint written per parallel task. The union of
    // the regions if the stage is computed at finer granularity than
    // one parallel task of some consumer.
    double bytes_at_task = 0;
    double innermost_bytes_at_task = 0;

    // The memory footprint accessed while computing a single vector.
    double unique_bytes_read_per_vector = 0;
    double unique_lines_read_per_vector = 0;

    // The memory footprint accessed per parallel task. Only counts
    // loads from things computed outside of that parallel task (to
    // measure the amount of traffic coming from another core).
    double unique_bytes_read_per_task = 0;
    double unique_lines_read_per_task = 0;

    // The sum of the sizes of all live allocations at various sites.
    double working_set_at_task = 0;
    double working_set_at_production = 0;
    double working_set_at_realization = 0;
    double working_set_at_root = 0;

    void dump(std::ostream &os) const {
        os << "    num_realizations: " << num_realizations << "\n"
           << "    num_productions: " << num_productions << "\n"
           << "    points_computed_per_realization: " << points_computed_per_realization << "\n"
           << "    points_computed_per_production: " << points_computed_per_production << "\n"
           << "    points_computed_total: " << points_computed_total << "\n"
           << "    points_computed_minimum: " << points_computed_minimum << "\n"
           << "    innermost_loop_extent: " << innermost_loop_extent << "\n"
           << "    innermost_pure_loop_extent: " << innermost_pure_loop_extent << "\n"
           << "    unrolled_loop_extent: " << unrolled_loop_extent << "\n"
           << "    inner_parallelism: " << inner_parallelism << "\n"
           << "    outer_parallelism: " << outer_parallelism << "\n"
           << "    bytes_at_realization: " << bytes_at_realization << "\n"
           << "    bytes_at_production: " << bytes_at_production << "\n"
           << "    bytes_at_root: " << bytes_at_root << "\n"
           << "    innermost_bytes_at_realization: " << innermost_bytes_at_realization << "\n"
           << "    innermost_bytes_at_production: " << innermost_bytes_at_production << "\n"
           << "    innermost_bytes_at_root: " << innermost_bytes_at_root << "\n"
           << "    inlined_calls: " << inlined_calls << "\n"
           << "    unique_bytes_read_per_realization: " << unique_bytes_read_per_realization << "\n"
           << "    unique_lines_read_per_realization: " << unique_lines_read_per_realization << "\n"
           << "    allocation_bytes_read_per_realization: " << allocation_bytes_read_per_realization << "\n"
           << "    working_set: " << working_set << "\n"
           << "    vector_size: " << vector_size << "\n"
           << "    native_vector_size: " << native_vector_size << "\n"
           << "    num_vectors: " << num_vectors << "\n"
           << "    num_scalars: " << num_scalars << "\n"
           << "    scalar_loads_per_vector: " << scalar_loads_per_vector << "\n"
           << "    vector_loads_per_vector: " << vector_loads_per_vector << "\n"
           << "    scalar_loads_per_scalar: " << scalar_loads_per_scalar << "\n"
           << "    bytes_at_task: " << bytes_at_task << "\n"
           << "    innermost_bytes_at_task: " << innermost_bytes_at_task << "\n"
           << "    unique_bytes_read_per_vector: " << unique_bytes_read_per_vector << "\n"
           << "    unique_lines_read_per_vector: " << unique_lines_read_per_vector << "\n"
           << "    unique_bytes_read_per_task: " << unique_bytes_read_per_task << "\n"
           << "    unique_lines_read_per_task: " << unique_lines_read_per_task << "\n"
           << "    working_set_at_task: " << working_set_at_task << "\n"
           << "    working_set_at_production: " << working_set_at_production << "\n"
           << "    working_set_at_realization: " << working_set_at_realization << "\n"
           << "    working_set_at_root: " << working_set_at_root << "\n";
    }

    bool equal(const ScheduleFeatures &other) const {
        const size_t n_features = ScheduleFeatures::num_features();
        for (size_t i = 0; i < n_features; i++) {
            if ((*this)[i] != other[i]) {
                return false;
            }
        }
        return true;
    }
};
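
// ScheduleFeatures is likewise laid out as a flat array of doubles, which is
// what makes num_features(), operator[] and equal() above work. A minimal
// sketch of how that indexing can locate the first feature on which two
// schedules disagree; first_differing_feature is a hypothetical helper, not
// part of this header:
inline int first_differing_feature(const ScheduleFeatures &a, const ScheduleFeatures &b) {
    for (size_t i = 0; i < ScheduleFeatures::num_features(); i++) {
        if (a[(int)i] != b[(int)i]) {
            return (int)i;  // index of the first mismatch
        }
    }
    return -1;  // all features equal, i.e. a.equal(b) would return true
}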

/*
Some feature values cannot be cached, and need to be recomputed.
These intermediates allow for faster recomputation of such features.
*/

}  // namespace Internal
}  // namespace Halide

#endif