Halide
Featurization.h
Go to the documentation of this file.
1 #ifndef FEATURIZATION_H
2 #define FEATURIZATION_H
3 
4 #include <cstring>
5 #include <iostream>
6 #include <stdint.h>
7 
8 #include "ASLog.h"
9 
10 namespace Halide {
11 namespace Internal {
12 
13 // The algorithm-specific features. For legacy reasons these are
14 // called PipelineFeatures in the code.
16  static constexpr size_t num_features() {
17  return sizeof(PipelineFeatures) / sizeof(int);
18  }
19 
20  static constexpr uint32_t version() {
21  return 3;
22  }
23 
24  // Access them by index.
25  int &operator[](int idx) {
26  return ((int *)(this))[idx];
27  }
28 
29  int operator[](int idx) const {
30  return ((const int *)(this))[idx];
31  }
32 
33  enum class OpType {
34  Const,
35  Cast,
36  Variable,
37  Param,
38  Add,
39  Sub,
40  Mod,
41  Mul,
42  Div,
43  Min,
44  Max,
45  EQ,
46  NE,
47  LT,
48  LE,
49  And,
50  Or,
51  Not,
52  Select,
53  ImageCall, // Loads to an input buffer
54  FuncCall, // Calls to another pipeline stage
55  SelfCall, // Recursive calls from a Func to itself
56  ExternCall, // Math intrinsics, typically
57  Let,
59  };
60 
61  enum class ScalarType {
62  Bool,
63  UInt8, // or Int8
64  UInt16, // or Int16
65  UInt32, // or Int32
66  UInt64, // or Int64
67  Float,
68  Double,
70  };
71 
72  // Not fed into the network, but helps avoid printing huge numbers of zeros while debugging things
74 
76 
77  enum class AccessType {
78  LoadFunc,
79  LoadSelf,
80  LoadImage,
81  Store,
83  };
84 
85  // Finer granularity call/store node properties. These are a
86  // function of the matrix of derivatives of each arg to a
87  // call w.r.t the loop variables of the Stage. Each row of
88  // the matrix corresponds to one of the call arguments. In
89  // each case we illustrate such a call, assuming that the
90  // variables of this Func are x, y, z, and that the
91  // dimension vectorized over is the first (x).
92 
93  // Square identity matrix. f(x - 2, y + 8, z + param)
95  // Square permutation matrix. f(y + 1, z - 3, x)
97  // Each row sums to 1. Each column sums to 1 or 0. f(y, x)
99  // Each row sums to 1 or 0. Each column sums to 1. f(z, y, x, 4)
101 
102  template<typename OS>
103  void dump(OS &os) const {
104  for (int i = 0; i < (int)ScalarType::NumScalarTypes; i++) {
105  const char *type_names[] = {"Bool", "UInt8", "UInt16", "UInt32", "UInt64", "Float", "Double"};
106  // Skip printing for types not used
107  if (!types_in_use[i]) continue;
108 
109  os << " Featurization for type " << type_names[i] << "\n"
110  << " Op histogram:\n"
111  << " Constant: " << op_histogram[(int)OpType::Const][i] << "\n"
112  << " Cast: " << op_histogram[(int)OpType::Cast][i] << "\n"
113  << " Variable: " << op_histogram[(int)OpType::Variable][i] << "\n"
114  << " Param: " << op_histogram[(int)OpType::Param][i] << "\n"
115  << " Add: " << op_histogram[(int)OpType::Add][i] << "\n"
116  << " Sub: " << op_histogram[(int)OpType::Sub][i] << "\n"
117  << " Mod: " << op_histogram[(int)OpType::Mod][i] << "\n"
118  << " Mul: " << op_histogram[(int)OpType::Mul][i] << "\n"
119  << " Div: " << op_histogram[(int)OpType::Div][i] << "\n"
120  << " Min: " << op_histogram[(int)OpType::Min][i] << "\n"
121  << " Max: " << op_histogram[(int)OpType::Max][i] << "\n"
122  << " EQ: " << op_histogram[(int)OpType::EQ][i] << "\n"
123  << " NE: " << op_histogram[(int)OpType::NE][i] << "\n"
124  << " LT: " << op_histogram[(int)OpType::LT][i] << "\n"
125  << " LE: " << op_histogram[(int)OpType::LE][i] << "\n"
126  << " And: " << op_histogram[(int)OpType::And][i] << "\n"
127  << " Or: " << op_histogram[(int)OpType::Or][i] << "\n"
128  << " Not: " << op_histogram[(int)OpType::Not][i] << "\n"
129  << " Select: " << op_histogram[(int)OpType::Select][i] << "\n"
130  << " ImageCall: " << op_histogram[(int)OpType::ImageCall][i] << "\n"
131  << " FuncCall: " << op_histogram[(int)OpType::FuncCall][i] << "\n"
132  << " SelfCall: " << op_histogram[(int)OpType::SelfCall][i] << "\n"
133  << " ExternCall: " << op_histogram[(int)OpType::ExternCall][i] << "\n"
134  << " Let: " << op_histogram[(int)OpType::Let][i] << "\n"
135  << " Memory access patterns. Columns are calls to other Funcs, self-calls, input image access, and stores\n"
136  << " Pointwise: "
137  << pointwise_accesses[0][i] << " "
138  << pointwise_accesses[1][i] << " "
139  << pointwise_accesses[2][i] << " "
140  << pointwise_accesses[3][i] << "\n"
141  << " Transpose: "
142  << transpose_accesses[0][i] << " "
143  << transpose_accesses[1][i] << " "
144  << transpose_accesses[2][i] << " "
145  << transpose_accesses[3][i] << "\n"
146  << " Broadcast: "
147  << broadcast_accesses[0][i] << " "
148  << broadcast_accesses[1][i] << " "
149  << broadcast_accesses[2][i] << " "
150  << broadcast_accesses[3][i] << "\n"
151  << " Slice: "
152  << slice_accesses[0][i] << " "
153  << slice_accesses[1][i] << " "
154  << slice_accesses[2][i] << " "
155  << slice_accesses[3][i] << "\n";
156  }
157  }
158  void dump() const {
159  auto os = aslog(0);
160  dump(os);
161  }
162 };
163 
164 // The schedule-dependent portion of the featurization of a stage
166  static constexpr size_t num_features() {
167  return sizeof(ScheduleFeatures) / sizeof(double);
168  }
169 
170  static constexpr uint32_t version() {
171  return 3;
172  }
173 
174  double &operator[](int idx) {
175  return ((double *)(this))[idx];
176  }
177 
178  double operator[](int idx) const {
179  return ((const double *)(this))[idx];
180  }
181 
182  // The number of times storage for this stage is allocated. The
183  // product of outer loops at store_at site
184  double num_realizations = 0;
185 
186  // The number of times a tile of the stage is computed. The
187  // pProduct of outer loops at compute_at site. Always at least as
188  // large as num_realizations.
189  double num_productions = 0;
190 
191  // Number of times the innermost loop happens per allocation.
193 
194  // Number of times the innermost stmt happens per tile computed.
196 
197  // The total trip count of the innermost loop over the entire program.
198  // == num_realizations * points_computed_per_realization
199  // ~= num_productions * points_computed_per_production
200  // Only approximately equal because of the simplifications made
201  // regarding the modeling of sliding window
203 
204  // The minimum number of points that are actually required to be
205  // computed to produce a correct output. Not actually a function
206  // of the schedule, but a useful reference point to see if a
207  // schedule has gone off the rails.
209 
210  // Trip count of innermost loop nest.
212 
213  // Trip count of just the pure loops in the innermost loop
214  // (i.e. excludes loops representing reductions).
216 
217  // If this is to be unrolled, what is the product of the unrolling
218  // factors.
220 
221  // The number of parallel jobs launched in the production of this
222  // stage. Always 1 unless the Func is compute_root, because we
223  // place all parallelism at the outermost level.
224  double inner_parallelism = 0;
225 
226  // The number of times this Func could be realized in parallel. 1
227  // when the Func is compute_root. Product of the containing
228  // parallel loops for other stages.
229  double outer_parallelism = 0;
230 
231  // Size of the region computed at the store_at site, measured in
232  // bytes. Does not take storage-folding optimizations into account.
234 
235  // Size of the region computed per tile (at the compute_at site),
236  // measured in bytes. This includes the effect of storage-folding,
237  // so it's a better number to look at to estimate memory usage.
239 
240  // If the stage were hypothetically scheduled at root, how much
241  // memory would it consumed. Doesn't vary w.r.t. the schedule, but
242  // a useful reference.
243  double bytes_at_root = 0;
244 
245  // Same as the above, but only measuring the extent along the
246  // innermost dimension, so that we can reason about spatial
247  // locality, cache lines, prefetchers, etc.
251 
252  // For inlined Funcs, how many calls are made to this Func total.
253  double inlined_calls = 0;
254 
255  // Number of unique bytes and unique continguous segments of
256  // memory loaded from all inputs over a single trip of the loop
257  // containing the allocation site.
260 
261  // The sum of the sizes of the allocations accessed at this
262  // site. Gives a hint as to the likely locality of it.
264 
265  // The sum of the sizes of the temporary allocations while
266  // computing one tile of this Func. Probably a good thing if it
267  // fits in cache.
268  double working_set = 0;
269 
270  // The vectorization factor (#simd lanes) to be used to compute
271  // this stage. Wasted work if it's smaller than the stage's native
272  // vector size.
273  double vector_size = 0;
274 
275  // The native vector size for the narrowest type used. Does not
276  // vary with the schedule, but a useful reference point.
277  double native_vector_size = 0;
278 
279  // Number of SIMD vectors computed
280  double num_vectors = 0;
281 
282  // Number of scalars computed (e.g. from tails of loops)
283  double num_scalars = 0;
284 
285  // The number of loads done per vector or scalar computed. Vector
286  // gathers count as a batch of scalar loads. These get amortized
287  // across unrolled blocks if some loads can be reused across the
288  // unrolled dimension.
292 
293  // The memory footprint written over one per parallel task. The
294  // union of the regions if the stage is computed at finer
295  // granularity that one parallel task of some consumer.
296  double bytes_at_task = 0;
298 
299  // The memory footprint accessed while computing a single vector.
302 
303  // The memory footprint accessed per parallel task. Only counts
304  // loads from things computed outside of that parallel task (to
305  // measure the amount of traffic coming from another core).
308 
309  // The sum of the sizes of all live allocations at various sites.
314 
315  template<typename OS>
316  void dump(OS &os) const {
317  os << " num_realizations: " << num_realizations << "\n"
318  << " num_productions: " << num_productions << "\n"
319  << " points_computed_per_realization: " << points_computed_per_realization << "\n"
320  << " points_computed_per_production: " << points_computed_per_production << "\n"
321  << " points_computed_total: " << points_computed_total << "\n"
322  << " points_computed_minimum: " << points_computed_minimum << "\n"
323  << " innermost_loop_extent: " << innermost_loop_extent << "\n"
324  << " innermost_pure_loop_extent: " << innermost_pure_loop_extent << "\n"
325  << " unrolled_loop_extent: " << unrolled_loop_extent << "\n"
326  << " inner_parallelism: " << inner_parallelism << "\n"
327  << " outer_parallelism: " << outer_parallelism << "\n"
328  << " bytes_at_realization: " << bytes_at_realization << "\n"
329  << " bytes_at_production: " << bytes_at_production << "\n"
330  << " bytes_at_root: " << bytes_at_root << "\n"
331  << " innermost_bytes_at_realization: " << innermost_bytes_at_realization << "\n"
332  << " innermost_bytes_at_production: " << innermost_bytes_at_production << "\n"
333  << " innermost_bytes_at_root: " << innermost_bytes_at_root << "\n"
334  << " inlined_calls: " << inlined_calls << "\n"
335  << " unique_bytes_read_per_realization: " << unique_bytes_read_per_realization << "\n"
336  << " unique_lines_read_per_realization: " << unique_lines_read_per_realization << "\n"
337  << " allocation_bytes_read_per_realization: " << allocation_bytes_read_per_realization << "\n"
338  << " working_set: " << working_set << "\n"
339  << " vector_size: " << vector_size << "\n"
340  << " native_vector_size: " << native_vector_size << "\n"
341  << " num_vectors: " << num_vectors << "\n"
342  << " num_scalars: " << num_scalars << "\n"
343  << " scalar_loads_per_vector: " << scalar_loads_per_vector << "\n"
344  << " vector_loads_per_vector: " << vector_loads_per_vector << "\n"
345  << " scalar_loads_per_scalar: " << scalar_loads_per_scalar << "\n"
346  << " bytes_at_task: " << bytes_at_task << "\n"
347  << " innermost_bytes_at_task: " << innermost_bytes_at_task << "\n"
348  << " unique_bytes_read_per_vector: " << unique_bytes_read_per_vector << "\n"
349  << " unique_lines_read_per_vector: " << unique_lines_read_per_vector << "\n"
350  << " unique_bytes_read_per_task: " << unique_bytes_read_per_task << "\n"
351  << " unique_lines_read_per_task: " << unique_lines_read_per_task << "\n"
352  << " working_set_at_task: " << working_set_at_task << "\n"
353  << " working_set_at_production: " << working_set_at_production << "\n"
354  << " working_set_at_realization: " << working_set_at_realization << "\n"
355  << " working_set_at_root: " << working_set_at_root << "\n";
356  }
357  void dump() const {
358  auto os = aslog(0);
359  dump(os);
360  }
361 };
362 
363 } // namespace Internal
364 } // namespace Halide
365 
366 #endif
Halide::Internal::PipelineFeatures::AccessType::LoadFunc
@ LoadFunc
Halide::Internal::ScheduleFeatures::unrolled_loop_extent
double unrolled_loop_extent
Definition: Featurization.h:219
Halide::Internal::PipelineFeatures::op_histogram
int op_histogram[(int) OpType::NumOpTypes][(int) ScalarType::NumScalarTypes]
Definition: Featurization.h:75
Halide::Internal::ScheduleFeatures::allocation_bytes_read_per_realization
double allocation_bytes_read_per_realization
Definition: Featurization.h:263
Halide::Internal::ScheduleFeatures::vector_size
double vector_size
Definition: Featurization.h:273
Halide::Internal::PipelineFeatures::OpType::Param
@ Param
Halide::Internal::ScheduleFeatures::working_set_at_realization
double working_set_at_realization
Definition: Featurization.h:312
Halide::Internal::ScheduleFeatures::working_set
double working_set
Definition: Featurization.h:268
Halide::Internal::PipelineFeatures::broadcast_accesses
int broadcast_accesses[(int) AccessType::NumAccessTypes][(int) ScalarType::NumScalarTypes]
Definition: Featurization.h:98
Halide::Internal::ScheduleFeatures::working_set_at_root
double working_set_at_root
Definition: Featurization.h:313
Halide::Internal::ScheduleFeatures::innermost_bytes_at_realization
double innermost_bytes_at_realization
Definition: Featurization.h:248
Halide::Internal::ScheduleFeatures::unique_bytes_read_per_task
double unique_bytes_read_per_task
Definition: Featurization.h:306
Halide::Internal::ScheduleFeatures::points_computed_per_realization
double points_computed_per_realization
Definition: Featurization.h:192
Halide::Internal::PipelineFeatures::OpType::Mod
@ Mod
Halide::Internal::ScheduleFeatures::num_realizations
double num_realizations
Definition: Featurization.h:184
Halide::Internal::ScheduleFeatures::operator[]
double & operator[](int idx)
Definition: Featurization.h:174
Halide::Internal::ScheduleFeatures::version
static constexpr uint32_t version()
Definition: Featurization.h:170
Halide::Internal::ScheduleFeatures::unique_bytes_read_per_realization
double unique_bytes_read_per_realization
Definition: Featurization.h:258
Halide::Internal::PipelineFeatures::OpType::Let
@ Let
Halide::Internal::PipelineFeatures::OpType::LE
@ LE
Halide::Internal::ScheduleFeatures::working_set_at_task
double working_set_at_task
Definition: Featurization.h:310
Halide::Internal::PipelineFeatures::ScalarType::UInt16
@ UInt16
Halide::Internal::PipelineFeatures::OpType::NE
@ NE
Halide::Internal::ScheduleFeatures::native_vector_size
double native_vector_size
Definition: Featurization.h:277
Halide::Internal::PipelineFeatures::OpType::Min
@ Min
Halide::Internal::PipelineFeatures::ScalarType::UInt64
@ UInt64
Halide::Internal::ScheduleFeatures::num_vectors
double num_vectors
Definition: Featurization.h:280
Halide::Internal::ScheduleFeatures::innermost_pure_loop_extent
double innermost_pure_loop_extent
Definition: Featurization.h:215
Halide::Internal::ScheduleFeatures::outer_parallelism
double outer_parallelism
Definition: Featurization.h:229
Halide::Internal::PipelineFeatures::OpType::ExternCall
@ ExternCall
Halide::Internal::aslog
Definition: ASLog.h:15
Halide::Internal::PipelineFeatures
Definition: Featurization.h:15
Halide::Internal::PipelineFeatures::OpType::Variable
@ Variable
Halide::Internal::PipelineFeatures::OpType::Add
@ Add
Halide::Internal::ScheduleFeatures::num_scalars
double num_scalars
Definition: Featurization.h:283
Halide::Internal::PipelineFeatures::AccessType::Store
@ Store
Halide::Internal::PipelineFeatures::OpType::Const
@ Const
Halide::Internal::ScheduleFeatures::num_features
static constexpr size_t num_features()
Definition: Featurization.h:166
Halide::Internal::ScheduleFeatures::innermost_loop_extent
double innermost_loop_extent
Definition: Featurization.h:211
Halide
This file defines the class FunctionDAG, which is our representation of a Halide pipeline,...
Definition: AddAtomicMutex.h:21
Halide::Internal::PipelineFeatures::version
static constexpr uint32_t version()
Definition: Featurization.h:20
Halide::Internal::ScheduleFeatures
Definition: Featurization.h:165
Halide::Internal::PipelineFeatures::OpType::Or
@ Or
Halide::LinkageType::Internal
@ Internal
Not visible externally, similar to 'static' linkage in C.
Halide::Internal::PipelineFeatures::OpType::NumOpTypes
@ NumOpTypes
Halide::Internal::PipelineFeatures::OpType::Cast
@ Cast
Halide::Internal::PipelineFeatures::num_features
static constexpr size_t num_features()
Definition: Featurization.h:16
Halide::Internal::ScheduleFeatures::vector_loads_per_vector
double vector_loads_per_vector
Definition: Featurization.h:290
Halide::Internal::PipelineFeatures::OpType::Max
@ Max
Halide::Internal::ScheduleFeatures::inlined_calls
double inlined_calls
Definition: Featurization.h:253
Halide::Internal::PipelineFeatures::operator[]
int & operator[](int idx)
Definition: Featurization.h:25
Halide::Internal::ScheduleFeatures::unique_bytes_read_per_vector
double unique_bytes_read_per_vector
Definition: Featurization.h:300
Halide::Internal::PipelineFeatures::types_in_use
int types_in_use[(int) ScalarType::NumScalarTypes]
Definition: Featurization.h:73
Halide::Internal::ScheduleFeatures::inner_parallelism
double inner_parallelism
Definition: Featurization.h:224
Halide::Internal::ScheduleFeatures::num_productions
double num_productions
Definition: Featurization.h:189
Halide::Internal::PipelineFeatures::ScalarType::Bool
@ Bool
Halide::Internal::ScheduleFeatures::bytes_at_root
double bytes_at_root
Definition: Featurization.h:243
Halide::Internal::PipelineFeatures::OpType::ImageCall
@ ImageCall
Halide::Internal::PipelineFeatures::ScalarType::NumScalarTypes
@ NumScalarTypes
Halide::Internal::PipelineFeatures::OpType::LT
@ LT
Halide::Internal::ScheduleFeatures::dump
void dump() const
Definition: Featurization.h:357
Halide::Internal::PipelineFeatures::AccessType
AccessType
Definition: Featurization.h:77
Halide::Internal::PipelineFeatures::OpType::And
@ And
Halide::Internal::PipelineFeatures::ScalarType
ScalarType
Definition: Featurization.h:61
Halide::Internal::PipelineFeatures::transpose_accesses
int transpose_accesses[(int) AccessType::NumAccessTypes][(int) ScalarType::NumScalarTypes]
Definition: Featurization.h:96
Halide::Internal::PipelineFeatures::OpType::FuncCall
@ FuncCall
Halide::Internal::PipelineFeatures::OpType::Div
@ Div
Halide::Internal::PipelineFeatures::OpType::Select
@ Select
Halide::Internal::PipelineFeatures::OpType::Sub
@ Sub
Halide::Internal::PipelineFeatures::dump
void dump(OS &os) const
Definition: Featurization.h:103
Halide::Internal::ScheduleFeatures::points_computed_per_production
double points_computed_per_production
Definition: Featurization.h:195
ASLog.h
Halide::Internal::PipelineFeatures::dump
void dump() const
Definition: Featurization.h:158
Halide::Internal::ScheduleFeatures::unique_lines_read_per_vector
double unique_lines_read_per_vector
Definition: Featurization.h:301
Halide::Internal::PipelineFeatures::OpType::Mul
@ Mul
Halide::Internal::PipelineFeatures::slice_accesses
int slice_accesses[(int) AccessType::NumAccessTypes][(int) ScalarType::NumScalarTypes]
Definition: Featurization.h:100
Halide::Internal::ScheduleFeatures::bytes_at_task
double bytes_at_task
Definition: Featurization.h:296
Halide::Internal::ScheduleFeatures::innermost_bytes_at_task
double innermost_bytes_at_task
Definition: Featurization.h:297
Halide::Internal::ScheduleFeatures::scalar_loads_per_vector
double scalar_loads_per_vector
Definition: Featurization.h:289
Halide::Internal::PipelineFeatures::ScalarType::Float
@ Float
Halide::Internal::PipelineFeatures::AccessType::NumAccessTypes
@ NumAccessTypes
Halide::Internal::PipelineFeatures::OpType::SelfCall
@ SelfCall
Halide::Internal::PipelineFeatures::pointwise_accesses
int pointwise_accesses[(int) AccessType::NumAccessTypes][(int) ScalarType::NumScalarTypes]
Definition: Featurization.h:94
Halide::Internal::ScheduleFeatures::unique_lines_read_per_task
double unique_lines_read_per_task
Definition: Featurization.h:307
Halide::Internal::ScheduleFeatures::bytes_at_production
double bytes_at_production
Definition: Featurization.h:238
Halide::Internal::PipelineFeatures::OpType::Not
@ Not
Halide::Internal::ScheduleFeatures::points_computed_minimum
double points_computed_minimum
Definition: Featurization.h:208
Halide::Internal::PipelineFeatures::OpType
OpType
Definition: Featurization.h:33
Halide::Internal::PipelineFeatures::OpType::EQ
@ EQ
Halide::Internal::ScheduleFeatures::scalar_loads_per_scalar
double scalar_loads_per_scalar
Definition: Featurization.h:291
Halide::Internal::PipelineFeatures::ScalarType::UInt8
@ UInt8
Halide::Internal::PipelineFeatures::ScalarType::UInt32
@ UInt32
Halide::Internal::ScheduleFeatures::operator[]
double operator[](int idx) const
Definition: Featurization.h:178
Halide::Internal::ScheduleFeatures::points_computed_total
double points_computed_total
Definition: Featurization.h:202
Halide::Internal::PipelineFeatures::operator[]
int operator[](int idx) const
Definition: Featurization.h:29
Halide::Internal::ScheduleFeatures::unique_lines_read_per_realization
double unique_lines_read_per_realization
Definition: Featurization.h:259
Halide::Internal::PipelineFeatures::AccessType::LoadImage
@ LoadImage
uint32_t
unsigned __INT32_TYPE__ uint32_t
Definition: runtime_internal.h:21
Halide::Internal::ScheduleFeatures::dump
void dump(OS &os) const
Definition: Featurization.h:316
Halide::Internal::ScheduleFeatures::innermost_bytes_at_root
double innermost_bytes_at_root
Definition: Featurization.h:250
Halide::Internal::PipelineFeatures::AccessType::LoadSelf
@ LoadSelf
Halide::Internal::PipelineFeatures::ScalarType::Double
@ Double
Halide::Internal::ScheduleFeatures::bytes_at_realization
double bytes_at_realization
Definition: Featurization.h:233
Halide::Internal::ScheduleFeatures::working_set_at_production
double working_set_at_production
Definition: Featurization.h:311
Halide::Internal::ScheduleFeatures::innermost_bytes_at_production
double innermost_bytes_at_production
Definition: Featurization.h:249