Halide 19.0.0
Halide compiler and libraries
Loading...
Searching...
No Matches
Featurization.h
Go to the documentation of this file.
1#ifndef FEATURIZATION_H
2#define FEATURIZATION_H
3
4#include <cstdint>
5#include <cstring>
6#include <iostream>
7
8#include "ASLog.h"
9
10namespace Halide {
11namespace Internal {
12
13// The algorithm-specific features. For legacy reasons these are
14// called PipelineFeatures in the code.
16 static constexpr size_t num_features() {
17 return sizeof(PipelineFeatures) / sizeof(int);
18 }
19
20 static constexpr uint32_t version() {
21 return 3;
22 }
23
24 // Access them by index.
25 int &operator[](int idx) {
26 return ((int *)(this))[idx];
27 }
28
29 int operator[](int idx) const {
30 return ((const int *)(this))[idx];
31 }
32
33 enum class OpType {
34 Const,
35 Cast,
37 Param,
38 Add,
39 Sub,
40 Mod,
41 Mul,
42 Div,
43 Min,
44 Max,
45 EQ,
46 NE,
47 LT,
48 LE,
49 And,
50 Or,
51 Not,
52 Select,
53 ImageCall, // Loads to an input buffer
54 FuncCall, // Calls to another pipeline stage
55 SelfCall, // Recursive calls from a Func to itself
56 ExternCall, // Math intrinsics, typically
57 Let,
59 };
60
61 enum class ScalarType {
62 Bool,
63 UInt8, // or Int8
64 UInt16, // or Int16
65 UInt32, // or Int32
66 UInt64, // or Int64
67 Float,
68 Double,
70 };
71
72 // Not fed into the network, but helps avoid printing huge numbers of zeros while debugging things
74
76
77 enum class AccessType {
81 Store,
83 };
84
85 // Finer granularity call/store node properties. These are a
86 // function of the matrix of derivatives of each arg to a
87 // call w.r.t the loop variables of the Stage. Each row of
88 // the matrix corresponds to one of the call arguments. In
89 // each case we illustrate such a call, assuming that the
90 // variables of this Func are x, y, z, and that the
91 // dimension vectorized over is the first (x).
92
93 // Square identity matrix. f(x - 2, y + 8, z + param)
95 // Square permutation matrix. f(y + 1, z - 3, x)
97 // Each row sums to 1. Each column sums to 1 or 0. f(y, x)
99 // Each row sums to 1 or 0. Each column sums to 1. f(z, y, x, 4)
101
102 template<typename OS>
103 void dump(OS &os) const {
104 for (int i = 0; i < (int)ScalarType::NumScalarTypes; i++) {
105 const char *type_names[] = {"Bool", "UInt8", "UInt16", "UInt32", "UInt64", "Float", "Double"};
106 // Skip printing for types not used
107 if (!types_in_use[i]) {
108 continue;
109 }
110
111 os << " Featurization for type " << type_names[i] << "\n"
112 << " Op histogram:\n"
113 << " Constant: " << op_histogram[(int)OpType::Const][i] << "\n"
114 << " Cast: " << op_histogram[(int)OpType::Cast][i] << "\n"
115 << " Variable: " << op_histogram[(int)OpType::Variable][i] << "\n"
116 << " Param: " << op_histogram[(int)OpType::Param][i] << "\n"
117 << " Add: " << op_histogram[(int)OpType::Add][i] << "\n"
118 << " Sub: " << op_histogram[(int)OpType::Sub][i] << "\n"
119 << " Mod: " << op_histogram[(int)OpType::Mod][i] << "\n"
120 << " Mul: " << op_histogram[(int)OpType::Mul][i] << "\n"
121 << " Div: " << op_histogram[(int)OpType::Div][i] << "\n"
122 << " Min: " << op_histogram[(int)OpType::Min][i] << "\n"
123 << " Max: " << op_histogram[(int)OpType::Max][i] << "\n"
124 << " EQ: " << op_histogram[(int)OpType::EQ][i] << "\n"
125 << " NE: " << op_histogram[(int)OpType::NE][i] << "\n"
126 << " LT: " << op_histogram[(int)OpType::LT][i] << "\n"
127 << " LE: " << op_histogram[(int)OpType::LE][i] << "\n"
128 << " And: " << op_histogram[(int)OpType::And][i] << "\n"
129 << " Or: " << op_histogram[(int)OpType::Or][i] << "\n"
130 << " Not: " << op_histogram[(int)OpType::Not][i] << "\n"
131 << " Select: " << op_histogram[(int)OpType::Select][i] << "\n"
132 << " ImageCall: " << op_histogram[(int)OpType::ImageCall][i] << "\n"
133 << " FuncCall: " << op_histogram[(int)OpType::FuncCall][i] << "\n"
134 << " SelfCall: " << op_histogram[(int)OpType::SelfCall][i] << "\n"
135 << " ExternCall: " << op_histogram[(int)OpType::ExternCall][i] << "\n"
136 << " Let: " << op_histogram[(int)OpType::Let][i] << "\n"
137 << " Memory access patterns. Columns are calls to other Funcs, self-calls, input image access, and stores\n"
138 << " Pointwise: "
139 << pointwise_accesses[0][i] << " "
140 << pointwise_accesses[1][i] << " "
141 << pointwise_accesses[2][i] << " "
142 << pointwise_accesses[3][i] << "\n"
143 << " Transpose: "
144 << transpose_accesses[0][i] << " "
145 << transpose_accesses[1][i] << " "
146 << transpose_accesses[2][i] << " "
147 << transpose_accesses[3][i] << "\n"
148 << " Broadcast: "
149 << broadcast_accesses[0][i] << " "
150 << broadcast_accesses[1][i] << " "
151 << broadcast_accesses[2][i] << " "
152 << broadcast_accesses[3][i] << "\n"
153 << " Slice: "
154 << slice_accesses[0][i] << " "
155 << slice_accesses[1][i] << " "
156 << slice_accesses[2][i] << " "
157 << slice_accesses[3][i] << "\n";
158 }
159 }
160 void dump() const {
161 auto os = aslog(1);
162 dump(os);
163 }
164};
165
166// The schedule-dependent portion of the featurization of a stage
168 static constexpr size_t num_features() {
169 return sizeof(ScheduleFeatures) / sizeof(double);
170 }
171
172 static constexpr uint32_t version() {
173 return 3;
174 }
175
176 double &operator[](int idx) {
177 return ((double *)(this))[idx];
178 }
179
180 double operator[](int idx) const {
181 return ((const double *)(this))[idx];
182 }
183
184 // The number of times storage for this stage is allocated. The
185 // product of outer loops at store_at site
186 double num_realizations = 0;
187
188 // The number of times a tile of the stage is computed. The
189 // pProduct of outer loops at compute_at site. Always at least as
190 // large as num_realizations.
191 double num_productions = 0;
192
193 // Number of times the innermost loop happens per allocation.
195
196 // Number of times the innermost stmt happens per tile computed.
198
200
201 // The total trip count of the innermost loop over the entire program.
202 // == num_realizations * points_computed_per_realization
203 // ~= num_productions * points_computed_per_production
204 // Only approximately equal because of the simplifications made
205 // regarding the modeling of sliding window
206 double points_computed_total = 0;
207
208 // The minimum number of points that are actually required to be
209 // computed to produce a correct output. Not actually a function
210 // of the schedule, but a useful reference point to see if a
211 // schedule has gone off the rails.
212 double points_computed_minimum = 0;
213
214 // Trip count of innermost loop nest.
215 double innermost_loop_extent = 0;
216
217 // Trip count of just the pure loops in the innermost loop
218 // (i.e. excludes loops representing reductions).
220
221 // If this is to be unrolled, what is the product of the unrolling
222 // factors.
223 double unrolled_loop_extent = 0;
224
225 // The number of parallel jobs launched in the production of this
226 // stage. Always 1 unless the Func is compute_root, because we
227 // place all parallelism at the outermost level.
228 double inner_parallelism = 0;
229
230 // The number of times this Func could be realized in parallel. 1
231 // when the Func is compute_root. Product of the containing
232 // parallel loops for other stages.
233 double outer_parallelism = 0;
234
235 // Size of the region computed at the store_at site, measured in
236 // bytes. Does not take storage-folding optimizations into account.
237 double bytes_at_realization = 0;
238
239 // Size of the region computed per tile (at the compute_at site),
240 // measured in bytes. This includes the effect of storage-folding,
241 // so it's a better number to look at to estimate memory usage.
242 double bytes_at_production = 0;
243
244 // If the stage were hypothetically scheduled at root, how much
245 // memory would it consumed. Doesn't vary w.r.t. the schedule, but
246 // a useful reference.
247 double bytes_at_root = 0;
248
249 // Same as the above, but only measuring the extent along the
250 // innermost dimension, so that we can reason about spatial
251 // locality, cache lines, prefetchers, etc.
254 double innermost_bytes_at_root = 0;
255
256 // For inlined Funcs, how many calls are made to this Func total.
257 double inlined_calls = 0;
258
259 // Number of unique bytes and unique continguous segments of
260 // memory loaded from all inputs over a single trip of the loop
261 // containing the allocation site.
268
275
276 // The sum of the sizes of the allocations accessed at this
277 // site. Gives a hint as to the likely locality of it.
281
282 // The sum of the sizes of the temporary allocations while
283 // computing one tile of this Func. Probably a good thing if it
284 // fits in cache.
285 double working_set = 0;
286
287 // Number of scalars computed (e.g. from tails of loops)
288 double num_scalars = 0;
289
290 // The memory footprint written over one per parallel task. The
291 // union of the regions if the stage is computed at finer
292 // granularity that one parallel task of some consumer.
299
300 // The memory footprint accessed while computing a single point
303
304 // The memory footprint accessed per parallel task. Only counts
305 // loads from things computed outside of that parallel task (to
306 // measure the amount of traffic coming from another core).
309
310 // The sum of the sizes of all live allocations at various sites.
311 double working_set_at_task = 0;
312 double working_set_at_production = 0;
314 double working_set_at_root = 0;
315
316 double num_blocks = 1;
318 double block_occupancy = 1.0 / 1024.0;
319
320 double warp_lane_utilization = 1.0 / 32.0;
325
330
333
336
338
343
345 double expr_branching = 0;
346
347 template<typename OS>
348 void dump(OS &os) const {
349 os << " num_realizations: " << num_realizations << "\n"
350 << " num_productions: " << num_productions << "\n"
351 << " points_computed_per_realization: " << points_computed_per_realization << "\n"
352 << " points_computed_per_production: " << points_computed_per_production << "\n"
353 << " points_computed_per_thread: " << points_computed_per_thread << "\n"
354 << " points_computed_total: " << points_computed_total << "\n"
355 << " points_computed_minimum: " << points_computed_minimum << "\n"
356 << " innermost_loop_extent: " << innermost_loop_extent << "\n"
357 << " innermost_pure_loop_extent: " << innermost_pure_loop_extent << "\n"
358 << " unrolled_loop_extent: " << unrolled_loop_extent << "\n"
359 << " inner_parallelism: " << inner_parallelism << "\n"
360 << " outer_parallelism: " << outer_parallelism << "\n"
361 << " bytes_at_realization: " << bytes_at_realization << "\n"
362 << " bytes_at_production: " << bytes_at_production << "\n"
363 << " bytes_at_root: " << bytes_at_root << "\n"
364 << " innermost_bytes_at_realization: " << innermost_bytes_at_realization << "\n"
365 << " innermost_bytes_at_production: " << innermost_bytes_at_production << "\n"
366 << " innermost_bytes_at_root: " << innermost_bytes_at_root << "\n"
367 << " inlined_calls: " << inlined_calls << "\n"
368 << " unique_global_bytes_read_per_realization: " << unique_global_bytes_read_per_realization << "\n"
369 << " unique_shared_bytes_read_per_realization: " << unique_shared_bytes_read_per_realization << "\n"
370 << " unique_register_bytes_read_per_realization: " << unique_register_bytes_read_per_realization << "\n"
371 << " unique_global_lines_read_per_realization: " << unique_global_lines_read_per_realization << "\n"
372 << " unique_shared_lines_read_per_realization: " << unique_shared_lines_read_per_realization << "\n"
373 << " unique_register_lines_read_per_realization: " << unique_register_lines_read_per_realization << "\n"
374 << " unique_global_bytes_read_per_thread: " << unique_global_bytes_read_per_thread << "\n"
375 << " unique_shared_bytes_read_per_thread: " << unique_shared_bytes_read_per_thread << "\n"
376 << " unique_register_bytes_read_per_thread: " << unique_register_bytes_read_per_thread << "\n"
377 << " unique_global_lines_read_per_thread: " << unique_global_lines_read_per_thread << "\n"
378 << " unique_shared_lines_read_per_thread: " << unique_shared_lines_read_per_thread << "\n"
379 << " unique_register_lines_read_per_thread: " << unique_register_lines_read_per_thread << "\n"
380 << " global_allocation_bytes_read_per_realization: " << global_allocation_bytes_read_per_realization << "\n"
381 << " shared_allocation_bytes_read_per_realization: " << shared_allocation_bytes_read_per_realization << "\n"
382 << " register_allocation_bytes_read_per_realization: " << register_allocation_bytes_read_per_realization << "\n"
383 << " working_set: " << working_set << "\n"
384 << " num_scalars: " << num_scalars << "\n"
385 << " global_bytes_at_task: " << global_bytes_at_task << "\n"
386 << " shared_bytes_at_task: " << shared_bytes_at_task << "\n"
387 << " register_bytes_at_task: " << register_bytes_at_task << "\n"
388 << " global_innermost_bytes_at_task: " << global_innermost_bytes_at_task << "\n"
389 << " shared_innermost_bytes_at_task: " << shared_innermost_bytes_at_task << "\n"
390 << " register_innermost_bytes_at_task: " << register_innermost_bytes_at_task << "\n"
391 << " unique_bytes_read_per_point: " << unique_bytes_read_per_point << "\n"
392 << " unique_lines_read_per_point: " << unique_lines_read_per_point << "\n"
393 << " unique_bytes_read_per_task: " << unique_bytes_read_per_task << "\n"
394 << " unique_lines_read_per_task: " << unique_lines_read_per_task << "\n"
395 << " working_set_at_task: " << working_set_at_task << "\n"
396 << " working_set_at_production: " << working_set_at_production << "\n"
397 << " working_set_at_realization: " << working_set_at_realization << "\n"
398 << " working_set_at_root: " << working_set_at_root << "\n"
399 << " num_blocks: " << num_blocks << "\n"
400 << " num_warps_per_block: " << num_warps_per_block << "\n"
401 << " block_occupancy: " << block_occupancy << "\n"
402 << " warp_lane_utilization: " << warp_lane_utilization << "\n"
403 << " num_active_warps_per_block: " << num_active_warps_per_block << "\n"
404 << " warp_lane_utilization_at_block_y: " << warp_lane_utilization_at_block_y << "\n"
405 << " warp_lane_utilization_at_block_z: " << warp_lane_utilization_at_block_z << "\n"
406 << " idle_lane_wastage: " << idle_lane_wastage << "\n"
407 << " num_shared_mem_loads_per_block: " << num_shared_mem_loads_per_block << "\n"
408 << " num_global_mem_loads_per_block: " << num_global_mem_loads_per_block << "\n"
409 << " num_shared_mem_stores_per_block: " << num_shared_mem_stores_per_block << "\n"
410 << " num_global_mem_stores_per_block: " << num_global_mem_stores_per_block << "\n"
411 << " shared_mem_store_efficiency: " << shared_mem_store_efficiency << "\n"
412 << " shared_mem_load_efficiency: " << shared_mem_load_efficiency << "\n"
413 << " global_mem_store_efficiency: " << global_mem_store_efficiency << "\n"
414 << " global_mem_load_efficiency: " << global_mem_load_efficiency << "\n"
415 << " working_set_at_thread: " << working_set_at_thread << "\n"
416 << " shared_mem_occupancy: " << shared_mem_occupancy << "\n"
417 << " shared_mem_block_limit_factor: " << shared_mem_block_limit_factor << "\n"
418 << " max_warp_occupancy: " << max_warp_occupancy << "\n"
419 << " max_block_occupancy: " << max_block_occupancy << "\n"
420 << " num_threads_per_block: " << num_threads_per_block << "\n"
421 << " expr_branching: " << expr_branching << "\n";
422 }
423
424 void dump() const {
425 auto os = aslog(1);
426 dump(os);
427 }
428
429 bool equal(const ScheduleFeatures &other) const {
430 const size_t n_features = ScheduleFeatures::num_features();
431 for (size_t i = 0; i < n_features; i++) {
432 if ((*this)[i] != other[i]) {
433 return false;
434 }
435 }
436 return true;
437 }
438};
439
440} // namespace Internal
441} // namespace Halide
442
443#endif
This file defines the class FunctionDAG, which is our representation of a Halide pipeline,...
@ Internal
Not visible externally, similar to 'static' linkage in C.
unsigned __INT32_TYPE__ uint32_t
static constexpr uint32_t version()
int pointwise_accesses[(int) AccessType::NumAccessTypes][(int) ScalarType::NumScalarTypes]
int broadcast_accesses[(int) AccessType::NumAccessTypes][(int) ScalarType::NumScalarTypes]
static constexpr size_t num_features()
int op_histogram[(int) OpType::NumOpTypes][(int) ScalarType::NumScalarTypes]
int transpose_accesses[(int) AccessType::NumAccessTypes][(int) ScalarType::NumScalarTypes]
int types_in_use[(int) ScalarType::NumScalarTypes]
int slice_accesses[(int) AccessType::NumAccessTypes][(int) ScalarType::NumScalarTypes]
bool equal(const ScheduleFeatures &other) const
static constexpr uint32_t version()
static constexpr size_t num_features()