Halide
Featurization.h
Go to the documentation of this file.
1 #ifndef FEATURIZATION_H
2 #define FEATURIZATION_H
3 
4 #include <cstdint>
5 #include <cstring>
6 #include <iostream>
7 
8 #include "ASLog.h"
9 
10 namespace Halide {
11 namespace Internal {
12 
13 // The algorithm-specific features. For legacy reasons these are
14 // called PipelineFeatures in the code.
15 struct PipelineFeatures {
16  static constexpr size_t num_features() {
17  return sizeof(PipelineFeatures) / sizeof(int);
18  }
19 
20  static constexpr uint32_t version() {
21  return 3;
22  }
23 
24  // Access them by index.
25  int &operator[](int idx) {
26  return ((int *)(this))[idx];
27  }
28 
29  int operator[](int idx) const {
30  return ((const int *)(this))[idx];
31  }
32 
33  enum class OpType {
34  Const,
35  Cast,
36  Variable,
37  Param,
38  Add,
39  Sub,
40  Mod,
41  Mul,
42  Div,
43  Min,
44  Max,
45  EQ,
46  NE,
47  LT,
48  LE,
49  And,
50  Or,
51  Not,
52  Select,
53  ImageCall, // Loads to an input buffer
54  FuncCall, // Calls to another pipeline stage
55  SelfCall, // Recursive calls from a Func to itself
56  ExternCall, // Math intrinsics, typically
57  Let,
59  };
60 
61  enum class ScalarType {
62  Bool,
63  UInt8, // or Int8
64  UInt16, // or Int16
65  UInt32, // or Int32
66  UInt64, // or Int64
67  Float,
68  Double,
70  };
71 
72  // Not fed into the network, but helps avoid printing huge numbers of zeros while debugging things
74 
76 
77  enum class AccessType {
78  LoadFunc,
79  LoadSelf,
80  LoadImage,
81  Store,
83  };
84 
85  // Finer granularity call/store node properties. These are a
86  // function of the matrix of derivatives of each arg to a
87  // call w.r.t the loop variables of the Stage. Each row of
88  // the matrix corresponds to one of the call arguments. In
89  // each case we illustrate such a call, assuming that the
90  // variables of this Func are x, y, z, and that the
91  // dimension vectorized over is the first (x).
92 
93  // Square identity matrix. f(x - 2, y + 8, z + param)
95  // Square permutation matrix. f(y + 1, z - 3, x)
97  // Each row sums to 1. Each column sums to 1 or 0. f(y, x)
99  // Each row sums to 1 or 0. Each column sums to 1. f(z, y, x, 4)
101 
102  template<typename OS>
103  void dump(OS &os) const {
104  for (int i = 0; i < (int)ScalarType::NumScalarTypes; i++) {
105  const char *type_names[] = {"Bool", "UInt8", "UInt16", "UInt32", "UInt64", "Float", "Double"};
106  // Skip printing for types not used
107  if (!types_in_use[i]) {
108  continue;
109  }
110 
111  os << " Featurization for type " << type_names[i] << "\n"
112  << " Op histogram:\n"
113  << " Constant: " << op_histogram[(int)OpType::Const][i] << "\n"
114  << " Cast: " << op_histogram[(int)OpType::Cast][i] << "\n"
115  << " Variable: " << op_histogram[(int)OpType::Variable][i] << "\n"
116  << " Param: " << op_histogram[(int)OpType::Param][i] << "\n"
117  << " Add: " << op_histogram[(int)OpType::Add][i] << "\n"
118  << " Sub: " << op_histogram[(int)OpType::Sub][i] << "\n"
119  << " Mod: " << op_histogram[(int)OpType::Mod][i] << "\n"
120  << " Mul: " << op_histogram[(int)OpType::Mul][i] << "\n"
121  << " Div: " << op_histogram[(int)OpType::Div][i] << "\n"
122  << " Min: " << op_histogram[(int)OpType::Min][i] << "\n"
123  << " Max: " << op_histogram[(int)OpType::Max][i] << "\n"
124  << " EQ: " << op_histogram[(int)OpType::EQ][i] << "\n"
125  << " NE: " << op_histogram[(int)OpType::NE][i] << "\n"
126  << " LT: " << op_histogram[(int)OpType::LT][i] << "\n"
127  << " LE: " << op_histogram[(int)OpType::LE][i] << "\n"
128  << " And: " << op_histogram[(int)OpType::And][i] << "\n"
129  << " Or: " << op_histogram[(int)OpType::Or][i] << "\n"
130  << " Not: " << op_histogram[(int)OpType::Not][i] << "\n"
131  << " Select: " << op_histogram[(int)OpType::Select][i] << "\n"
132  << " ImageCall: " << op_histogram[(int)OpType::ImageCall][i] << "\n"
133  << " FuncCall: " << op_histogram[(int)OpType::FuncCall][i] << "\n"
134  << " SelfCall: " << op_histogram[(int)OpType::SelfCall][i] << "\n"
135  << " ExternCall: " << op_histogram[(int)OpType::ExternCall][i] << "\n"
136  << " Let: " << op_histogram[(int)OpType::Let][i] << "\n"
137  << " Memory access patterns. Columns are calls to other Funcs, self-calls, input image access, and stores\n"
138  << " Pointwise: "
139  << pointwise_accesses[0][i] << " "
140  << pointwise_accesses[1][i] << " "
141  << pointwise_accesses[2][i] << " "
142  << pointwise_accesses[3][i] << "\n"
143  << " Transpose: "
144  << transpose_accesses[0][i] << " "
145  << transpose_accesses[1][i] << " "
146  << transpose_accesses[2][i] << " "
147  << transpose_accesses[3][i] << "\n"
148  << " Broadcast: "
149  << broadcast_accesses[0][i] << " "
150  << broadcast_accesses[1][i] << " "
151  << broadcast_accesses[2][i] << " "
152  << broadcast_accesses[3][i] << "\n"
153  << " Slice: "
154  << slice_accesses[0][i] << " "
155  << slice_accesses[1][i] << " "
156  << slice_accesses[2][i] << " "
157  << slice_accesses[3][i] << "\n";
158  }
159  }
160  void dump() const {
161  auto os = aslog(1);
162  dump(os);
163  }
164 };
165 
166 // The schedule-dependent portion of the featurization of a stage
167 struct ScheduleFeatures {
168  static constexpr size_t num_features() {
169  return sizeof(ScheduleFeatures) / sizeof(double);
170  }
171 
172  static constexpr uint32_t version() {
173  return 3;
174  }
175 
176  double &operator[](int idx) {
177  return ((double *)(this))[idx];
178  }
179 
180  double operator[](int idx) const {
181  return ((const double *)(this))[idx];
182  }
183 
184  // The number of times storage for this stage is allocated. The
185  // product of outer loops at store_at site
186  double num_realizations = 0;
187 
188  // The number of times a tile of the stage is computed. The
189  // pProduct of outer loops at compute_at site. Always at least as
190  // large as num_realizations.
191  double num_productions = 0;
192 
193  // Number of times the innermost loop happens per allocation.
195 
196  // Number of times the innermost stmt happens per tile computed.
198 
200 
201  // The total trip count of the innermost loop over the entire program.
202  // == num_realizations * points_computed_per_realization
203  // ~= num_productions * points_computed_per_production
204  // Only approximately equal because of the simplifications made
205  // regarding the modeling of sliding window
206  double points_computed_total = 0;
207 
208  // The minimum number of points that are actually required to be
209  // computed to produce a correct output. Not actually a function
210  // of the schedule, but a useful reference point to see if a
211  // schedule has gone off the rails.
212  double points_computed_minimum = 0;
213 
214  // Trip count of innermost loop nest.
215  double innermost_loop_extent = 0;
216 
217  // Trip count of just the pure loops in the innermost loop
218  // (i.e. excludes loops representing reductions).
219  double innermost_pure_loop_extent = 0;
220 
221  // If this is to be unrolled, what is the product of the unrolling
222  // factors.
223  double unrolled_loop_extent = 0;
224 
225  // The number of parallel jobs launched in the production of this
226  // stage. Always 1 unless the Func is compute_root, because we
227  // place all parallelism at the outermost level.
228  double inner_parallelism = 0;
229 
230  // The number of times this Func could be realized in parallel. 1
231  // when the Func is compute_root. Product of the containing
232  // parallel loops for other stages.
233  double outer_parallelism = 0;
234 
235  // Size of the region computed at the store_at site, measured in
236  // bytes. Does not take storage-folding optimizations into account.
237  double bytes_at_realization = 0;
238 
239  // Size of the region computed per tile (at the compute_at site),
240  // measured in bytes. This includes the effect of storage-folding,
241  // so it's a better number to look at to estimate memory usage.
242  double bytes_at_production = 0;
243 
244  // If the stage were hypothetically scheduled at root, how much
245  // memory would it consumed. Doesn't vary w.r.t. the schedule, but
246  // a useful reference.
247  double bytes_at_root = 0;
248 
249  // Same as the above, but only measuring the extent along the
250  // innermost dimension, so that we can reason about spatial
251  // locality, cache lines, prefetchers, etc.
254  double innermost_bytes_at_root = 0;
255 
256  // For inlined Funcs, how many calls are made to this Func total.
257  double inlined_calls = 0;
258 
259  // Number of unique bytes and unique continguous segments of
260  // memory loaded from all inputs over a single trip of the loop
261  // containing the allocation site.
268 
275 
276  // The sum of the sizes of the allocations accessed at this
277  // site. Gives a hint as to the likely locality of it.
281 
282  // The sum of the sizes of the temporary allocations while
283  // computing one tile of this Func. Probably a good thing if it
284  // fits in cache.
285  double working_set = 0;
286 
287  // Number of scalars computed (e.g. from tails of loops)
288  double num_scalars = 0;
289 
290  // The memory footprint written over one per parallel task. The
291  // union of the regions if the stage is computed at finer
292  // granularity that one parallel task of some consumer.
299 
300  // The memory footprint accessed while computing a single point
303 
304  // The memory footprint accessed per parallel task. Only counts
305  // loads from things computed outside of that parallel task (to
306  // measure the amount of traffic coming from another core).
307  double unique_bytes_read_per_task = 0;
308  double unique_lines_read_per_task = 0;
309 
310  // The sum of the sizes of all live allocations at various sites.
311  double working_set_at_task = 0;
312  double working_set_at_production = 0;
313  double working_set_at_realization = 0;
314  double working_set_at_root = 0;
315 
316  double num_blocks = 1;
318  double block_occupancy = 1.0 / 1024.0;
319 
320  double warp_lane_utilization = 1.0 / 32.0;
324  double idle_lane_wastage = 0;
325 
330 
333 
336 
338 
341  double max_warp_occupancy = 0;
343 
345  double expr_branching = 0;
346 
347  template<typename OS>
348  void dump(OS &os) const {
349  os << " num_realizations: " << num_realizations << "\n"
350  << " num_productions: " << num_productions << "\n"
351  << " points_computed_per_realization: " << points_computed_per_realization << "\n"
352  << " points_computed_per_production: " << points_computed_per_production << "\n"
353  << " points_computed_per_thread: " << points_computed_per_thread << "\n"
354  << " points_computed_total: " << points_computed_total << "\n"
355  << " points_computed_minimum: " << points_computed_minimum << "\n"
356  << " innermost_loop_extent: " << innermost_loop_extent << "\n"
357  << " innermost_pure_loop_extent: " << innermost_pure_loop_extent << "\n"
358  << " unrolled_loop_extent: " << unrolled_loop_extent << "\n"
359  << " inner_parallelism: " << inner_parallelism << "\n"
360  << " outer_parallelism: " << outer_parallelism << "\n"
361  << " bytes_at_realization: " << bytes_at_realization << "\n"
362  << " bytes_at_production: " << bytes_at_production << "\n"
363  << " bytes_at_root: " << bytes_at_root << "\n"
364  << " innermost_bytes_at_realization: " << innermost_bytes_at_realization << "\n"
365  << " innermost_bytes_at_production: " << innermost_bytes_at_production << "\n"
366  << " innermost_bytes_at_root: " << innermost_bytes_at_root << "\n"
367  << " inlined_calls: " << inlined_calls << "\n"
368  << " unique_global_bytes_read_per_realization: " << unique_global_bytes_read_per_realization << "\n"
369  << " unique_shared_bytes_read_per_realization: " << unique_shared_bytes_read_per_realization << "\n"
370  << " unique_register_bytes_read_per_realization: " << unique_register_bytes_read_per_realization << "\n"
371  << " unique_global_lines_read_per_realization: " << unique_global_lines_read_per_realization << "\n"
372  << " unique_shared_lines_read_per_realization: " << unique_shared_lines_read_per_realization << "\n"
373  << " unique_register_lines_read_per_realization: " << unique_register_lines_read_per_realization << "\n"
374  << " unique_global_bytes_read_per_thread: " << unique_global_bytes_read_per_thread << "\n"
375  << " unique_shared_bytes_read_per_thread: " << unique_shared_bytes_read_per_thread << "\n"
376  << " unique_register_bytes_read_per_thread: " << unique_register_bytes_read_per_thread << "\n"
377  << " unique_global_lines_read_per_thread: " << unique_global_lines_read_per_thread << "\n"
378  << " unique_shared_lines_read_per_thread: " << unique_shared_lines_read_per_thread << "\n"
379  << " unique_register_lines_read_per_thread: " << unique_register_lines_read_per_thread << "\n"
380  << " global_allocation_bytes_read_per_realization: " << global_allocation_bytes_read_per_realization << "\n"
381  << " shared_allocation_bytes_read_per_realization: " << shared_allocation_bytes_read_per_realization << "\n"
382  << " register_allocation_bytes_read_per_realization: " << register_allocation_bytes_read_per_realization << "\n"
383  << " working_set: " << working_set << "\n"
384  << " num_scalars: " << num_scalars << "\n"
385  << " global_bytes_at_task: " << global_bytes_at_task << "\n"
386  << " shared_bytes_at_task: " << shared_bytes_at_task << "\n"
387  << " register_bytes_at_task: " << register_bytes_at_task << "\n"
388  << " global_innermost_bytes_at_task: " << global_innermost_bytes_at_task << "\n"
389  << " shared_innermost_bytes_at_task: " << shared_innermost_bytes_at_task << "\n"
390  << " register_innermost_bytes_at_task: " << register_innermost_bytes_at_task << "\n"
391  << " unique_bytes_read_per_point: " << unique_bytes_read_per_point << "\n"
392  << " unique_lines_read_per_point: " << unique_lines_read_per_point << "\n"
393  << " unique_bytes_read_per_task: " << unique_bytes_read_per_task << "\n"
394  << " unique_lines_read_per_task: " << unique_lines_read_per_task << "\n"
395  << " working_set_at_task: " << working_set_at_task << "\n"
396  << " working_set_at_production: " << working_set_at_production << "\n"
397  << " working_set_at_realization: " << working_set_at_realization << "\n"
398  << " working_set_at_root: " << working_set_at_root << "\n"
399  << " num_blocks: " << num_blocks << "\n"
400  << " num_warps_per_block: " << num_warps_per_block << "\n"
401  << " block_occupancy: " << block_occupancy << "\n"
402  << " warp_lane_utilization: " << warp_lane_utilization << "\n"
403  << " num_active_warps_per_block: " << num_active_warps_per_block << "\n"
404  << " warp_lane_utilization_at_block_y: " << warp_lane_utilization_at_block_y << "\n"
405  << " warp_lane_utilization_at_block_z: " << warp_lane_utilization_at_block_z << "\n"
406  << " idle_lane_wastage: " << idle_lane_wastage << "\n"
407  << " num_shared_mem_loads_per_block: " << num_shared_mem_loads_per_block << "\n"
408  << " num_global_mem_loads_per_block: " << num_global_mem_loads_per_block << "\n"
409  << " num_shared_mem_stores_per_block: " << num_shared_mem_stores_per_block << "\n"
410  << " num_global_mem_stores_per_block: " << num_global_mem_stores_per_block << "\n"
411  << " shared_mem_store_efficiency: " << shared_mem_store_efficiency << "\n"
412  << " shared_mem_load_efficiency: " << shared_mem_load_efficiency << "\n"
413  << " global_mem_store_efficiency: " << global_mem_store_efficiency << "\n"
414  << " global_mem_load_efficiency: " << global_mem_load_efficiency << "\n"
415  << " working_set_at_thread: " << working_set_at_thread << "\n"
416  << " shared_mem_occupancy: " << shared_mem_occupancy << "\n"
417  << " shared_mem_block_limit_factor: " << shared_mem_block_limit_factor << "\n"
418  << " max_warp_occupancy: " << max_warp_occupancy << "\n"
419  << " max_block_occupancy: " << max_block_occupancy << "\n"
420  << " num_threads_per_block: " << num_threads_per_block << "\n"
421  << " expr_branching: " << expr_branching << "\n";
422  }
423 
424  void dump() const {
425  auto os = aslog(1);
426  dump(os);
427  }
428 
429  bool equal(const ScheduleFeatures &other) const {
430  const size_t n_features = ScheduleFeatures::num_features();
431  for (size_t i = 0; i < n_features; i++) {
432  if ((*this)[i] != other[i]) {
433  return false;
434  }
435  }
436  return true;
437  }
438 };
439 
440 } // namespace Internal
441 } // namespace Halide
442 
443 #endif
Halide::Internal::ScheduleFeatures::num_shared_mem_loads_per_block
double num_shared_mem_loads_per_block
Definition: Featurization.h:326
Halide::Internal::PipelineFeatures::AccessType::LoadFunc
@ LoadFunc
Halide::Internal::ScheduleFeatures::register_allocation_bytes_read_per_realization
double register_allocation_bytes_read_per_realization
Definition: Featurization.h:280
Halide::Internal::ScheduleFeatures::register_innermost_bytes_at_task
double register_innermost_bytes_at_task
Definition: Featurization.h:298
Halide::Internal::ScheduleFeatures::unrolled_loop_extent
double unrolled_loop_extent
Definition: Featurization.h:214
Halide::Internal::ScheduleFeatures::num_shared_mem_stores_per_block
double num_shared_mem_stores_per_block
Definition: Featurization.h:328
Halide::Internal::PipelineFeatures::OpType::Param
@ Param
Halide::Internal::ScheduleFeatures::shared_innermost_bytes_at_task
double shared_innermost_bytes_at_task
Definition: Featurization.h:297
Halide::Internal::ScheduleFeatures::working_set_at_realization
double working_set_at_realization
Definition: Featurization.h:307
Halide::Internal::ScheduleFeatures::num_threads_per_block
double num_threads_per_block
Definition: Featurization.h:344
Halide::Internal::ScheduleFeatures::working_set
double working_set
Definition: Featurization.h:263
Halide::Internal::ScheduleFeatures::working_set_at_root
double working_set_at_root
Definition: Featurization.h:308
Halide::Internal::ScheduleFeatures::unique_register_lines_read_per_thread
double unique_register_lines_read_per_thread
Definition: Featurization.h:274
Halide::Internal::ScheduleFeatures::shared_mem_block_limit_factor
double shared_mem_block_limit_factor
Definition: Featurization.h:340
Halide::Internal::ScheduleFeatures::innermost_bytes_at_realization
double innermost_bytes_at_realization
Definition: Featurization.h:243
Halide::Internal::ScheduleFeatures::unique_bytes_read_per_task
double unique_bytes_read_per_task
Definition: Featurization.h:301
Halide::Internal::ScheduleFeatures::points_computed_per_realization
double points_computed_per_realization
Definition: Featurization.h:187
Halide::Internal::ScheduleFeatures::warp_lane_utilization
double warp_lane_utilization
Definition: Featurization.h:320
Halide::Internal::ScheduleFeatures::num_active_warps_per_block
double num_active_warps_per_block
Definition: Featurization.h:321
Halide::Internal::PipelineFeatures::OpType::Mod
@ Mod
Halide::Internal::ScheduleFeatures::register_bytes_at_task
double register_bytes_at_task
Definition: Featurization.h:295
Halide::Internal::ScheduleFeatures::num_realizations
double num_realizations
Definition: Featurization.h:179
Halide::Internal::ScheduleFeatures::operator[]
double & operator[](int idx)
Definition: Featurization.h:176
Halide::Internal::ScheduleFeatures::version
static constexpr uint32_t version()
Definition: Featurization.h:172
Halide::Internal::PipelineFeatures::pointwise_accesses
int pointwise_accesses[(int) AccessType::NumAccessTypes][(int) ScalarType::NumScalarTypes]
Definition: Featurization.h:92
Halide::Internal::ScheduleFeatures::global_mem_load_efficiency
double global_mem_load_efficiency
Definition: Featurization.h:335
Halide::Internal::ScheduleFeatures::unique_shared_bytes_read_per_thread
double unique_shared_bytes_read_per_thread
Definition: Featurization.h:270
Halide::Internal::PipelineFeatures::OpType::Let
@ Let
Halide::Internal::ScheduleFeatures::shared_mem_occupancy
double shared_mem_occupancy
Definition: Featurization.h:339
Halide::Internal::PipelineFeatures::OpType::LE
@ LE
Halide::Internal::ScheduleFeatures::working_set_at_task
double working_set_at_task
Definition: Featurization.h:305
Halide::Internal::ScheduleFeatures::unique_lines_read_per_point
double unique_lines_read_per_point
Definition: Featurization.h:302
Halide::Internal::PipelineFeatures::ScalarType::UInt16
@ UInt16
Halide::Internal::PipelineFeatures::OpType::NE
@ NE
Halide::Internal::PipelineFeatures::OpType::Min
@ Min
Halide::Internal::ScheduleFeatures::global_innermost_bytes_at_task
double global_innermost_bytes_at_task
Definition: Featurization.h:296
Halide::Internal::PipelineFeatures::ScalarType::UInt64
@ UInt64
Halide::Internal::ScheduleFeatures::innermost_pure_loop_extent
double innermost_pure_loop_extent
Definition: Featurization.h:210
Halide::Internal::ScheduleFeatures::num_global_mem_stores_per_block
double num_global_mem_stores_per_block
Definition: Featurization.h:329
Halide::Internal::ScheduleFeatures::outer_parallelism
double outer_parallelism
Definition: Featurization.h:224
Halide::Internal::PipelineFeatures::OpType::ExternCall
@ ExternCall
Halide::Internal::aslog
Definition: ASLog.h:16
Halide::Internal::PipelineFeatures
Definition: Featurization.h:13
Halide::Internal::ScheduleFeatures::unique_shared_bytes_read_per_realization
double unique_shared_bytes_read_per_realization
Definition: Featurization.h:263
Halide::Internal::ScheduleFeatures::num_warps_per_block
double num_warps_per_block
Definition: Featurization.h:317
Halide::Internal::PipelineFeatures::OpType::Variable
@ Variable
Halide::Internal::PipelineFeatures::OpType::Add
@ Add
Halide::Internal::ScheduleFeatures::num_scalars
double num_scalars
Definition: Featurization.h:278
Halide::Internal::PipelineFeatures::AccessType::Store
@ Store
Halide::Internal::PipelineFeatures::OpType::Const
@ Const
Halide::Internal::ScheduleFeatures::num_features
static constexpr size_t num_features()
Definition: Featurization.h:168
Halide::Internal::ScheduleFeatures::unique_global_bytes_read_per_realization
double unique_global_bytes_read_per_realization
Definition: Featurization.h:262
Halide::Internal::ScheduleFeatures::warp_lane_utilization_at_block_z
double warp_lane_utilization_at_block_z
Definition: Featurization.h:323
Halide::Internal::ScheduleFeatures::innermost_loop_extent
double innermost_loop_extent
Definition: Featurization.h:206
Halide::Internal::ScheduleFeatures::unique_shared_lines_read_per_thread
double unique_shared_lines_read_per_thread
Definition: Featurization.h:273
Halide
This file defines the class FunctionDAG, which is our representation of a Halide pipeline,...
Definition: AbstractGenerator.h:19
Halide::Internal::ScheduleFeatures::unique_global_lines_read_per_realization
double unique_global_lines_read_per_realization
Definition: Featurization.h:265
Halide::Internal::PipelineFeatures::version
static constexpr uint32_t version()
Definition: Featurization.h:20
Halide::Internal::ScheduleFeatures::global_bytes_at_task
double global_bytes_at_task
Definition: Featurization.h:293
Halide::Internal::ScheduleFeatures
Definition: Featurization.h:160
Halide::Internal::PipelineFeatures::slice_accesses
int slice_accesses[(int) AccessType::NumAccessTypes][(int) ScalarType::NumScalarTypes]
Definition: Featurization.h:98
Halide::Internal::PipelineFeatures::OpType::Or
@ Or
Halide::LinkageType::Internal
@ Internal
Not visible externally, similar to 'static' linkage in C.
Halide::Internal::PipelineFeatures::OpType::NumOpTypes
@ NumOpTypes
Halide::Internal::PipelineFeatures::OpType::Cast
@ Cast
Halide::Internal::ScheduleFeatures::shared_mem_store_efficiency
double shared_mem_store_efficiency
Definition: Featurization.h:331
Halide::Internal::PipelineFeatures::num_features
static constexpr size_t num_features()
Definition: Featurization.h:16
Halide::Internal::PipelineFeatures::OpType::Max
@ Max
Halide::Internal::ScheduleFeatures::inlined_calls
double inlined_calls
Definition: Featurization.h:248
Halide::Internal::ScheduleFeatures::unique_global_lines_read_per_thread
double unique_global_lines_read_per_thread
Definition: Featurization.h:272
Halide::Internal::PipelineFeatures::operator[]
int & operator[](int idx)
Definition: Featurization.h:25
Halide::Internal::ScheduleFeatures::inner_parallelism
double inner_parallelism
Definition: Featurization.h:219
Halide::Internal::ScheduleFeatures::num_productions
double num_productions
Definition: Featurization.h:184
Halide::Internal::PipelineFeatures::ScalarType::Bool
@ Bool
Halide::Internal::ScheduleFeatures::bytes_at_root
double bytes_at_root
Definition: Featurization.h:238
Halide::Internal::PipelineFeatures::OpType::ImageCall
@ ImageCall
Halide::Internal::PipelineFeatures::ScalarType::NumScalarTypes
@ NumScalarTypes
Halide::Internal::PipelineFeatures::OpType::LT
@ LT
Halide::Internal::ScheduleFeatures::working_set_at_thread
double working_set_at_thread
Definition: Featurization.h:337
Halide::Internal::ScheduleFeatures::dump
void dump() const
Definition: Featurization.h:424
Halide::Internal::PipelineFeatures::AccessType
AccessType
Definition: Featurization.h:75
Halide::Internal::PipelineFeatures::OpType::And
@ And
Halide::Internal::ScheduleFeatures::num_blocks
double num_blocks
Definition: Featurization.h:316
Halide::Internal::PipelineFeatures::ScalarType
ScalarType
Definition: Featurization.h:59
Halide::Internal::ScheduleFeatures::expr_branching
double expr_branching
Definition: Featurization.h:345
Halide::Internal::ScheduleFeatures::unique_shared_lines_read_per_realization
double unique_shared_lines_read_per_realization
Definition: Featurization.h:266
Halide::Internal::PipelineFeatures::OpType::FuncCall
@ FuncCall
Halide::Internal::PipelineFeatures::OpType::Div
@ Div
Halide::Internal::ScheduleFeatures::idle_lane_wastage
double idle_lane_wastage
Definition: Featurization.h:324
Halide::Internal::PipelineFeatures::OpType::Select
@ Select
Halide::Internal::ScheduleFeatures::num_global_mem_loads_per_block
double num_global_mem_loads_per_block
Definition: Featurization.h:327
Halide::Internal::ScheduleFeatures::warp_lane_utilization_at_block_y
double warp_lane_utilization_at_block_y
Definition: Featurization.h:322
Halide::Internal::PipelineFeatures::OpType::Sub
@ Sub
Halide::Internal::ScheduleFeatures::unique_register_lines_read_per_realization
double unique_register_lines_read_per_realization
Definition: Featurization.h:267
Halide::Internal::PipelineFeatures::dump
void dump(OS &os) const
Definition: Featurization.h:103
Halide::Internal::ScheduleFeatures::points_computed_per_production
double points_computed_per_production
Definition: Featurization.h:190
ASLog.h
Halide::Internal::PipelineFeatures::dump
void dump() const
Definition: Featurization.h:160
Halide::Internal::PipelineFeatures::OpType::Mul
@ Mul
Halide::Internal::PipelineFeatures::transpose_accesses
int transpose_accesses[(int) AccessType::NumAccessTypes][(int) ScalarType::NumScalarTypes]
Definition: Featurization.h:94
Halide::Internal::ScheduleFeatures::shared_mem_load_efficiency
double shared_mem_load_efficiency
Definition: Featurization.h:332
Halide::Internal::ScheduleFeatures::shared_allocation_bytes_read_per_realization
double shared_allocation_bytes_read_per_realization
Definition: Featurization.h:279
Halide::Internal::ScheduleFeatures::shared_bytes_at_task
double shared_bytes_at_task
Definition: Featurization.h:294
Halide::Internal::PipelineFeatures::ScalarType::Float
@ Float
Halide::Internal::PipelineFeatures::AccessType::NumAccessTypes
@ NumAccessTypes
Halide::Internal::PipelineFeatures::OpType::SelfCall
@ SelfCall
Halide::Internal::ScheduleFeatures::points_computed_per_thread
double points_computed_per_thread
Definition: Featurization.h:199
Halide::Internal::ScheduleFeatures::unique_register_bytes_read_per_thread
double unique_register_bytes_read_per_thread
Definition: Featurization.h:271
Halide::Internal::ScheduleFeatures::max_warp_occupancy
double max_warp_occupancy
Definition: Featurization.h:341
Halide::Internal::ScheduleFeatures::max_block_occupancy
double max_block_occupancy
Definition: Featurization.h:342
Halide::Internal::PipelineFeatures::types_in_use
int types_in_use[(int) ScalarType::NumScalarTypes]
Definition: Featurization.h:71
Halide::Internal::ScheduleFeatures::unique_lines_read_per_task
double unique_lines_read_per_task
Definition: Featurization.h:302
Halide::Internal::PipelineFeatures::broadcast_accesses
int broadcast_accesses[(int) AccessType::NumAccessTypes][(int) ScalarType::NumScalarTypes]
Definition: Featurization.h:96
Halide::Internal::PipelineFeatures::op_histogram
int op_histogram[(int) OpType::NumOpTypes][(int) ScalarType::NumScalarTypes]
Definition: Featurization.h:73
Halide::Internal::ScheduleFeatures::bytes_at_production
double bytes_at_production
Definition: Featurization.h:233
Halide::Internal::PipelineFeatures::OpType::Not
@ Not
Halide::Internal::ScheduleFeatures::points_computed_minimum
double points_computed_minimum
Definition: Featurization.h:203
Halide::Internal::PipelineFeatures::OpType
OpType
Definition: Featurization.h:31
Halide::Internal::PipelineFeatures::OpType::EQ
@ EQ
Halide::Internal::ScheduleFeatures::global_allocation_bytes_read_per_realization
double global_allocation_bytes_read_per_realization
Definition: Featurization.h:278
Halide::Internal::PipelineFeatures::ScalarType::UInt8
@ UInt8
Halide::Internal::PipelineFeatures::ScalarType::UInt32
@ UInt32
Halide::Internal::ScheduleFeatures::operator[]
double operator[](int idx) const
Definition: Featurization.h:180
Halide::Internal::ScheduleFeatures::points_computed_total
double points_computed_total
Definition: Featurization.h:197
Halide::Internal::PipelineFeatures::operator[]
int operator[](int idx) const
Definition: Featurization.h:29
Halide::Internal::PipelineFeatures::AccessType::LoadImage
@ LoadImage
uint32_t
unsigned __INT32_TYPE__ uint32_t
Definition: runtime_internal.h:25
Halide::Internal::ScheduleFeatures::dump
void dump(OS &os) const
Definition: Featurization.h:348
Halide::Internal::ScheduleFeatures::unique_global_bytes_read_per_thread
double unique_global_bytes_read_per_thread
Definition: Featurization.h:269
Halide::Internal::ScheduleFeatures::block_occupancy
double block_occupancy
Definition: Featurization.h:318
Halide::Internal::ScheduleFeatures::innermost_bytes_at_root
double innermost_bytes_at_root
Definition: Featurization.h:245
Halide::Internal::ScheduleFeatures::equal
bool equal(const ScheduleFeatures &other) const
Definition: Featurization.h:429
Halide::Internal::PipelineFeatures::AccessType::LoadSelf
@ LoadSelf
Halide::Internal::ScheduleFeatures::unique_register_bytes_read_per_realization
double unique_register_bytes_read_per_realization
Definition: Featurization.h:264
Halide::Internal::ScheduleFeatures::unique_bytes_read_per_point
double unique_bytes_read_per_point
Definition: Featurization.h:301
Halide::Internal::PipelineFeatures::ScalarType::Double
@ Double
Halide::Internal::ScheduleFeatures::bytes_at_realization
double bytes_at_realization
Definition: Featurization.h:228
Halide::Internal::ScheduleFeatures::global_mem_store_efficiency
double global_mem_store_efficiency
Definition: Featurization.h:334
Halide::Internal::ScheduleFeatures::working_set_at_production
double working_set_at_production
Definition: Featurization.h:306
Halide::Internal::ScheduleFeatures::innermost_bytes_at_production
double innermost_bytes_at_production
Definition: Featurization.h:244