Halide
LoopNest.h
Generated documentation for this file (Doxygen extract; some declaration lines are omitted by the renderer).
1 /** This file defines the LoopNest, which is our
2  * representation of a Halide schedule, and contains methods to
3  * generate candidates for scheduling as well as extract a
4  * featurization that can be used to cost each candidate. */
5 
6 #ifndef LOOP_NEST_H
7 #define LOOP_NEST_H
8 
9 #include "ASLog.h"
10 #include "CostModel.h"
11 #include "FunctionDAG.h"
12 #include "GPULoopInfo.h"
13 #include "GPUMemInfo.h"
14 #include "PerfectHashMap.h"
15 #include "SearchSpaceOptions.h"
16 #include "Statistics.h"
17 #include "ThreadInfo.h"
18 #include "Tiling.h"
19 #include <set>
20 #include <vector>
21 
22 namespace Halide {
23 namespace Internal {
24 namespace Autoscheduler {
25 
26 template<typename T>
28 
29 template<typename T>
31 
32 enum class GPU_parallelism { Block,
33  Thread,
34  Serial,
35  Simd,
37  None };
38 
39 std::string stringify(GPU_parallelism label);
40 
41 // inlined => func is inlined so has no memory store location
42 enum class GPUMemoryType { Global,
43  Shared,
44  Local,
45  Registers,
46  Inlined };
47 
48 bool may_subtile(const Anderson2021Params &params);
49 
50 int64_t get_shared_memory_limit(const Anderson2021Params &params);
51 
52 int64_t get_active_block_hardware_limit(const Anderson2021Params &params);
53 
54 int64_t get_active_warp_hardware_limit(const Anderson2021Params &params);
55 
57  return 128;
58 }
59 
60 int get_unroll_limit(const Target &target);
61 
62 bool in_range_zero_one(double x);
63 
64 bool are_valid_thread_extents(const vector<int64_t> &counts);
65 
68 
69 bool all(const vector<int> &v);
70 bool accessed_at_constant_indices(const std::vector<int> &unrolled, const FunctionDAG::Edge *e);
71 
72 // We're going to do a tree search over possible schedules to find an
73 // optimal one. A tree search requires a state, and a function that
74 // gives you children of the state (with costs). The following struct
75 // represents the state, which is a partial schedule.
76 //
77 // A partial schedule is a tree. Each node is some portion of the for
78 // loop nest of some Func. If there are no children, it's the
79 // innermost set of loops. If there are children, it's a loop over
80 // tiles of that Func.
81 struct LoopNest {
82  mutable RefCount ref_count;
83 
84  // The extents of this loop. Put another way, the number of tiles,
85  // not the size of each tile.
86  vector<int64_t> size;
87 
88  // The nodes inside the loop body
89  vector<IntrusivePtr<const LoopNest>> children;
90 
91  // Funcs inlined into this inner loop, and the number of times
92  // each is called. Only valid if children is empty.
94 
95  // Funcs stored inside this loop
96  std::set<const FunctionDAG::Node *> store_at;
97 
98  // The total bounds required of any given Func over all iterations
99  // of this loop. In the paper, this is represented using the
100  // little boxes to the left of the loop nest tree figures.
101  mutable NodeMap<Bound> bounds;
102 
103  // The Func this loop nest belongs to
104  const FunctionDAG::Node *node = nullptr;
105 
106  // The stage of the Func
107  const FunctionDAG::Node::Stage *stage = nullptr;
108 
109  // Is this the innermost loop of this func (the SIMD loop)?
110  bool innermost = false;
111 
112  // Are we permitted to tile this loop?
113  bool tileable = false;
114 
115  // Is this the parallel outer loop?
116  bool parallel = false;
117 
118  // What dimension is this Func vectorized over, in terms of the pure args of the Func?
119  int vector_dim = -1;
120 
121  // Which loop corresponds to the innermost storage dimension and will be vectorized. -1 means none of them.
122  int vectorized_loop_index = -1;
123 
124  // Apply gpu threads to this loop nest
126 
129  double num_vectors;
130  double num_scalars;
131  double vector_size;
137  };
138 
139  mutable std::map<uint64_t, StageMap<StageMap<FeatureIntermediates>>> feature_intermediates;
140  mutable std::map<uint64_t, StageMap<ScheduleFeatures>> features;
141 
// True iff the target has a GPU feature and this loop nest is labelled GPU-serial.
142  bool is_gpu_serial(const Target &target) const {
143  return target.has_gpu_feature() && gpu_label == GPU_parallelism::Serial;
144  }
145 
// True iff the target has a GPU feature and this loop nest is labelled as GPU threads.
146  bool is_gpu_thread(const Target &target) const {
147  return target.has_gpu_feature() && gpu_label == GPU_parallelism::Thread;
148  }
149 
// True iff the target has a GPU feature and this loop nest is labelled as GPU blocks.
150  bool is_gpu_block(const Target &target) const {
151  return target.has_gpu_feature() && gpu_label == GPU_parallelism::Block;
152  }
153 
// A loop nest with no loop dimensions at all (empty `size`) computes a single point.
154  bool is_scalar() const {
155  return size.empty();
156  }
157 
158  // given a newly inserted node f into this LoopNest, get union of thread counts in each dimension
159  // across all siblings of f.
160  vector<int64_t> get_union_thread_counts(const FunctionDAG::Node *f) const;
161 
162  // given a newly inserted node f into this LoopNest, gets the size of
163  // all of f's stages and their pure_dim indices
164  void get_stage_sizes(const FunctionDAG::Node *f,
165  vector<vector<int64_t>> &stage_sizes,
166  vector<vector<int>> &pure_dims,
167  vector<int> &vectorized_indices) const;
168 
169  // given the loop nest of a stage to parallelize at root, figure out if using odd tile sizes
170  // for the vectorized dimension will allow the resulting thread tiles to be multiples of 32
171  // if so, we will include these in the serial loop sizes
172  void generate_vec_dim_serial_tilings(vector<int> &serial_sizes) const;
173 
174  // get the loop nests of a newly inserted node, f, that is marked GPU threads. Tiles
175  // the newly inserted loop nests of f into a threads loop outside a serial loop.
176  // V is the vectorized dimension of f. Adds loopnests created from each tiling option in result.
178  const Anderson2021Params &params,
179  const Target &target,
180  int v,
181  vector<IntrusivePtr<const LoopNest>> &result,
182  const vector<int64_t> &max_size);
183 
184  void copy_from(const LoopNest &n);
185  void copy_from_including_features(const LoopNest &n);
186 
// Mix `next` into the running hash `h`, using the boost::hash_combine
// recipe (golden-ratio constant plus shifted self-feedback).
187  static void hash_combine(uint64_t &h, uint64_t next) {
188  // From boost
189  h ^= (next + 0x9e3779b9 + (h << 6) + (h >> 2));
190  }
191 
192  // Hash the loop structure and sizes up to a fixed depth. This is
193  // used as the hash function for the coarse-to-fine beam search in
194  // the paper.
195  void structural_hash(uint64_t &h, int depth) const;
196 
197  // How many funcs are scheduled inside this loop level. Used in
198  // the structural hash.
// Funcs placed directly at this level (either inlined here or stored here),
// plus the same count summed recursively over all child loop nests.
199  size_t funcs_realized_or_inlined() const {
200  size_t count = inlined.size() + store_at.size();
201  for (const auto &c : children) {
202  count += c->funcs_realized_or_inlined();
203  }
204  return count;
205  }
206 
207  // All of a stage's interesting locations in the loop nest. Used to help compute the featurization of a stage.
208  struct Sites {
209  const LoopNest *compute = nullptr; // Its containing compute_at site
210  const LoopNest *store = nullptr; // Its containing store_at site
211  const LoopNest *produce = nullptr; // Its own outermost node
212  const LoopNest *innermost = nullptr; // Its innermost node - usually a SIMD loop
213  const LoopNest *task = nullptr; // The parallel for loop it belongs to
214  const LoopNest *thread = nullptr; // Its containing gpu_thread loop
215  GPUMemoryType gpu_store_memory_type; // global, local, shared?
216  int64_t allocation_size = 0; // Allocation size in bytes
217  bool is_constant_allocation = false; // Does the allocation have constant size?
218  int64_t num_realizations = 0; // Number of times this stage is realized. Only valid for unscheduled producers
219  bool inlined = false; // Is the Func inlined?
220  std::vector<const LoopNest *> inlined_innermosts; // Innermost loop nests into which this Func is inlined
222 
223  bool is_stored_in_global_mem() const {
225  }
226  bool is_stored_in_shared_mem() const {
228  }
229  bool is_stored_in_local_mem() const {
231  }
232  bool is_stored_in_registers() const {
234  }
235  };
236 
237  GPUMemoryType get_gpu_memory_type(bool in_block, bool in_thread, bool is_inlined = false) const;
238 
239  std::vector<int> unrolled_loops(const Target &target, const LoopNest *parent, const LoopNest *grandparent) const;
240 
242  StageMap<Sites> &sites,
243  NodeMap<bool> &can_be_promoted_to_registers,
244  const LoopNest *grandparent,
245  const LoopNest *parent) const;
246 
247  bool promote_allocs_to_registers(const Target &target, StageMap<Sites> &sites) const;
248 
249  // Compute all the sites of interest for each pipeline stage
250  void get_sites(const Target &target,
251  StageMap<Sites> &sites,
252  StageMap<int64_t> &shared_mem_alloc_sizes,
253  const LoopNest *task = nullptr,
254  const LoopNest *parent = nullptr,
255  const LoopNest *current_thread_loop = nullptr) const;
256 
257  // A helper for the working_set_at_task feature. Most features are
258  // computed in the recursive pass 'compute_features' below, but
259  // this one must be done in a second separate recursive pass.
262  for (const auto &c : children) {
263  c->set_working_set_at_task_feature(working_set, features);
264  features->get(c->stage).working_set_at_task = working_set;
265  }
266  }
267 
268  bool exceeds_serial_extents_limit(const Target &target, const LoopNest *parent, bool in_threads_loop) const;
269 
271 
272  bool has_dynamic_allocation_inside_thread(bool in_thread_loop) const;
273 
275 
277 
279 
280  // Get the stride over "node's" storage for a unit increment in the vectorized loop's
281  // index
282  double storage_stride(const LoadJacobian &jac, int innermost_storage_dim, const FunctionDAG::Node *storage_node, const Bound &store_bounds, const LoopNest &root) const;
283 
284  Strides compute_strides(const LoadJacobian &jac, int innermost_storage_dim, const FunctionDAG::Node *storage_node, const Bound &store_bounds, const ThreadInfo &thread_info, bool verbose = false) const;
285 
286  bool all_strides_exist(const LoadJacobian &jac, const FunctionDAG::Node *storage_node, const LoopNest &root) const;
287 
288  int get_actual_vector_dim(const Bound &store_bounds) const;
289 
290  void compute_gpu_store_features(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const GPULoopInfo &gpu_loop_info, const std::vector<int64_t> &inner_serial_loop_extents, const Sites &consumer_site, ScheduleFeatures &feat, const LoopNest *parent, const LoopNest &root, GlobalMemInfo &global_mem_loads, SharedMemInfo &shared_mem_loads, LocalMemInfo &local_mem_loads, bool verbose = false) const;
291 
292  bool can_vectorize_access_for_innermost_dim(const LoadJacobian &jac, const FunctionDAG::Node *accessed, int innermost_dim, int loop_index) const;
293 
294  bool can_vectorize_store_access(const LoadJacobian &jac, const FunctionDAG::Node *accessed, bool accessed_has_been_scheduled, int innermost_dim, int loop_index, const GPUMemoryType &mem_type) const;
295 
296  int vectorized_load_access_size(const LoadJacobian &jac, const FunctionDAG::Node *accessed, bool accessed_has_been_scheduled, int innermost_dim, const GPUMemoryType &mem_type, bool verbose = false) const;
297 
298  int vectorized_access_size(size_t loop_index, bool verbose = false) const;
299 
300  template<typename T>
301  void compute_num_mem_accesses_per_block(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo &thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType<T> &mem_info, bool verbose = false) const;
302 
303  std::pair<double, double> compute_local_mem_store_features(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const LoopNest &root, double serial_loop_extents) const;
304 
305  template<typename T>
306  MemInfoType<T> compute_mem_store_info(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const ThreadInfo &thread_info, double serial_loop_extents, bool verbose) const;
307 
308  template<typename T>
309  void compute_mem_load_features(const LoadJacobian &jac, int producer_innermost_dim, const FunctionDAG::Node *node, const Bound &producer_store_bounds, bool producer_has_been_scheduled, const ThreadInfo &thread_info, MemInfoType<T> &mem_info, double serial_loop_extents, bool verbose = false) const;
310 
311  double compute_local_mem_stride(double stride, double bytes) const;
312 
313  // Assumes block, serial, thread or block, thread nesting
314  const LoopNest *get_enclosing_block(const LoopNest *parent, const LoopNest *grandparent) const;
315 
316  std::pair<int64_t, int64_t> get_block_and_serial_extents(const LoopNest *block) const;
317 
319 
320  bool has_thread_loop_descendant() const;
321 
322  void compute_warp_features(ScheduleFeatures &features, const GPULoopInfo &gpu_loop_info) const;
323 
324  // Assume that when a block is active, all its warps are active
325  void compute_warp_and_block_occupancy(const Anderson2021Params &params, ScheduleFeatures &feat, const GPULoopInfo &gpu_loop_info) const;
326 
327  void compute_shared_mem_occupancy(const Anderson2021Params &params, const Target &target, int64_t total_shared_mem_alloc_size, ScheduleFeatures &feat) const;
328 
329  std::pair<const LoopNest *, const LoopNest *> find_innermost_and_parent() const;
330 
331  int64_t points_accessed_per_thread(const Anderson2021Params &params, const Target &target, const GPULoopInfo &gpu_loop_info, const std::vector<const FunctionDAG::Edge *> &edge_chain, const LoadJacobian &jac, const LoopNest *parent, const LoopNest *grandparent, int64_t n, const ScheduleFeatures &feat, const LoadJacobian &serial_jac, bool producer_has_been_scheduled, int producer_innermost_dim, const GPUMemoryType &mem_type, bool verbose = false) const;
332 
333  int64_t compute_licm_amortization(const LoopNest *innermost, const LoopNest *parent, const ScheduleFeatures &feat, const LoadJacobian &jac, int producer_dims) const;
334 
336 
337  vector<pair<int, int>> collect_producers(const StageMap<Sites> &sites) const;
338 
340 
341  void collect_stages(std::set<const FunctionDAG::Node::Stage *> &stages) const;
342 
344 
345  void compute_working_set_from_features(int64_t *working_set,
346  const StageMap<ScheduleFeatures> *features) const;
347 
349 
350  std::pair<int64_t, bool> compute_alloc_size_of_node_here(const FunctionDAG::Node *f) const;
351 
352  // Do a recursive walk over the loop nest computing features to feed the cost model.
353  void compute_features(const FunctionDAG &dag,
354  const Anderson2021Params &params,
355  const Target &target,
356  const StageMap<Sites> &sites,
357  int64_t instances,
358  int64_t parallelism,
359  const LoopNest *parent,
360  const LoopNest *grandparent,
361  const LoopNest &root,
362  GPULoopInfo gpu_loop_info,
363  bool use_memoized_features,
364  const StageMap<int64_t> &total_shared_mem_alloc_sizes,
365  int64_t *working_set,
366  int64_t *working_set_local_constant,
367  int64_t *working_set_local_dynamic,
369  Statistics &stats,
370  bool verbose = false) const;
371 
// The root LoopNest represents the whole pipeline rather than any single Func.
372  bool is_root() const {
373  // The root is the sole node without a Func associated with
374  // it.
375  return node == nullptr;
376  }
377 
378 // Set the region required of a Func at this site.
// Records `b` as the bound for `f` in the mutable `bounds` map and returns
// a reference to the stored Bound. `bounds` is mutable, so this is legal
// on a const LoopNest.
379  const Bound &set_bounds(const FunctionDAG::Node *f, BoundContents *b) const {
380  return bounds.emplace(f, b);
381  }
382 
383  // Get the region required of a Func at this site, from which we
384  // know what region would be computed if it were scheduled here,
385  // and what its loop nest would be.
386  const Bound &get_bounds(const FunctionDAG::Node *f) const;
387 
388  // Get the region required of a Func at this site (but only to satisfy the
389  // consumers along the given edge chain), from which we know what region
390  // would be computed if it were scheduled here and what its loop nest
391  // would be.
392  Bound get_bounds_along_edge_chain(const FunctionDAG::Node *f, const vector<const FunctionDAG::Edge *> &edge_chain) const;
393 
394  void dump() const;
395 
396  std::string to_string() const;
397 
398  // Recursively print a loop nest representation to stderr
399  template<typename T>
400  void dump(T &stream, string prefix, const LoopNest *parent) const;
401 
402  // Does this loop nest access the given Func
403  bool calls(const FunctionDAG::Node *f) const;
404 
405  // What is the maximum number of inlined calls to a Func that
406  // occur within this loop. Used to prune states that would
407  // generate too much code.
408  int64_t max_inlined_calls() const;
409 
410  // Does this loop nest access an input buffer? Used to select
411  // trail strategies when splitting loops. We don't want to read
412  // out of bounds on inputs, even if we don't intend to use the
413  // values read. It could create annoying assertion failures for
414  // the user. It's OK to read out of range of the values computed
415  // on internal Funcs though. Allocation bounds inference just pads
416  // out the bounds so that it won't fault.
417  bool accesses_input_buffer() const;
418 
419  // Does this loop nest contain a computation of the given Func.
420  bool computes(const FunctionDAG::Node *f) const;
421 
422  // Above here most methods query the loop nest. Below we have
423  // methods that mutate the loop nest.
424 
425  // Inline a Func into all consumers within this loop.
426  void inline_func(const FunctionDAG::Node *f);
427 
428  // Compute a Func at this site.
429  bool compute_here(const FunctionDAG::Node *f,
430  bool tileable,
431  int v,
432  bool in_threads_loop,
433  const Anderson2021Params &params,
434  const Target &target);
435 
436  // Parallelize this loop according to the given tiling.
437  IntrusivePtr<const LoopNest> parallelize_in_tiles(const vector<int64_t> &tiling,
438  const LoopNest *parent,
439  const Anderson2021Params &params,
440  const Target &target,
441  bool inner_tiling,
442  bool adjust_tiling,
443  bool move_all_rvars_inward = true,
444  const vector<int> &rvars_to_move_inward = {}) const;
445 
446  int64_t get_total_local_mem_alloc_size(bool constant_allocs_only = false, bool in_threads_loop = false) const;
448 
449  // All store ats further in than the block level must be fixed
450  // sized allocations. This method checks if f will require a dynamic
451  // allocation
452  bool requires_dynamic_allocation(const FunctionDAG::Node *f, const Target &target, bool in_threads_loop) const;
453 
454  // Return all possible ways to compute f in tiles somewhere within
455  // this loop nest.
456  // in_threads_loop tracks whether or not function is going to be placed inside a
457  // loop marked gpu_threads, in which case f's loops cannot be gpu_threads
458  vector<IntrusivePtr<const LoopNest>> compute_in_tiles(const FunctionDAG::Node *f,
459  const LoopNest *parent,
460  const Anderson2021Params &params,
461  const Target &target,
462  const SearchSpaceOptions &search_space_options,
463  int v,
464  bool in_realization,
465  bool in_threads_loop,
466  bool is_pre_pass,
467  vector<int64_t> union_counts = vector<int64_t>()) const;
468 
469  // Below here we have methods that apply a schedule to a Halide pipeline.
470 
471  // A model of the state of the loop nest of a Func while applying
472  // Halide's scheduling directives.
473 
474  // Note that StageScheduleState is movable-but-not-copyable thanks to its ostringstream member.
475  struct StageScheduleState {
476  // How much parallelism do we need to exploit with this Func?
477  double num_cores = 0;
478 
479  // Which storage dimension is vectorized? We need to reorder it innermost
480  int vector_dim = -1;
481  int vectorized_loop_index = -1;
482 
483  // The various Vars and RVars used for scheduling a Func.
484  struct FuncVar {
485  // The top-level var or rvar this was split off from
486  VarOrRVar orig;
487 
488  // This var.
489  VarOrRVar var;
490 
491  // Source code to access this Var/RVar. Used for printing
492  // valid Halide source for this schedule.
493  string accessor;
494 
495  // Our estimate of the extent of this var. This is exact
496  // when constant_extent flag is true.
497  int64_t extent = 0;
498 
499  // Which index in the symbolic loop nest does this var
500  // belong to.
501  size_t index = 0;
502 
503  // Some flags.
504  bool innermost_pure_dim = false,
505  outermost = false,
506  parallel = false,
507  exists = false,
508  pure = false,
509  constant_extent = false;
510 
511  bool vectorized = false;
512  bool gpu_threads = false;
513 
515  : orig(Var()), var(Var()) {
516  }
517  };
520  bool parallel = false;
521  bool vectorized = false;
524 
525  // In order from innermost to outermost. Each group of d is one tiling level.
526  vector<FuncVar> vars;
527 
528  // In order from innermost to outermost. Each group of d is one tiling level.
529  vector<FuncVar> ordered_vars;
530  vector<int64_t> gpu_thread_extents;
531 
533 
534  // From outermost in
535  vector<StageScheduleState *> ancestors;
536 
537  std::ostringstream schedule_source;
538  };
539 
542  bool other_stage_has_same_producer(const FunctionDAG::Node *producer) const;
544  int num_serial_loops() const;
545  bool producer_computed_here_or_further_in(const FunctionDAG::Node *producer) const;
546 
547  void update_producers_to_be_staged(StageScheduleState &state, const NodeMap<bool> &all_inlined) const;
548  bool region_computed_shrinks(const FunctionDAG::Node *f, const LoopNest *parent) const;
549 
550  // Apply the schedule represented by this loop nest to a Halide pipeline.
551  void apply(LoopLevel here,
552  StageMap<std::unique_ptr<StageScheduleState>> &state_map,
553  double num_cores,
554  int depth,
555  const LoopNest *parent,
556  const LoopNest *compute_site,
557  const Target &target,
558  std::vector<StageScheduleState *> &ancestors,
559  const NodeMap<bool> &all_inlined) const;
560 
561  double max_idle_lane_wastage(const Target &target, GPULoopInfo gpu_loop_info) const;
562 
563  bool has_valid_thread_extents() const;
564 
565  void collect_nodes_that_should_be_inlined(const NodeMap<bool> &nodes_to_freeze, NodeMap<bool> &inlined_nodes) const;
566 
567  void collect_all_inlined(NodeMap<bool> &all_inlined) const;
568 
569  int64_t product_of_self_and_descendants(int loop_index) const;
570  int64_t product_of_descendants(int loop_index) const;
571 
572  void get_stages_computed_in_each_compute_root_loop(StageMap<StageMap<bool>> &descendants, const LoopNest *compute_root_loop_nest = nullptr) const;
573 };
574 
575 struct Filter {
577  bool logging = false;
578 
581  if (logging) {
582  std::cerr << "\nState filtered: \n";
583  loop_nest->dump();
584  std::cerr << "Reason: ";
585  }
586  }
587 
588  template<typename T>
// Stream anything into the Filter; it is forwarded to std::cerr only when
// logging is enabled. Returns *this so insertions can be chained.
589  Filter &operator<<(T &&x) {
590  if (logging) {
591  std::cerr << std::forward<T>(x);
592  }
593  return *this;
594  }
595 
596  static bool enable_filter_printing();
597 };
598 
599 } // namespace Autoscheduler
600 } // namespace Internal
601 } // namespace Halide
602 
603 #endif // LOOP_NEST_H
Halide::Internal::Autoscheduler::LoopNest::collect_producers
std::vector< std::pair< int, int > > collect_producers(const StageMap< Sites > &sites) const
Halide::Internal::Autoscheduler::LoopNest::all_paths_to_leaves_have_thread_loop
bool all_paths_to_leaves_have_thread_loop() const
Halide::Internal::Autoscheduler::LoopNest::storage_stride
double storage_stride(const LoadJacobian &jac, int innermost_storage_dim, const FunctionDAG::Node *storage_node, const Bound &store_bounds, const LoopNest &root) const
Halide::Internal::Autoscheduler::GPUMemoryType::Local
@ Local
Halide::Internal::Autoscheduler::LoopNest::compute_warp_and_block_occupancy
void compute_warp_and_block_occupancy(const Anderson2021Params &params, ScheduleFeatures &feat, const GPULoopInfo &gpu_loop_info) const
Halide::Internal::Autoscheduler::LoopNest::children
vector< IntrusivePtr< const LoopNest > > children
Definition: LoopNest.h:89
Halide::Internal::Autoscheduler::LoopNest::StageScheduleState
Definition: LoopNest.h:209
Halide::Internal::Autoscheduler::LoopNest::has_constant_region_required
bool has_constant_region_required(const FunctionDAG::Node *node) const
Halide::Internal::Autoscheduler::LoopNest::compute_local_mem_stride
double compute_local_mem_stride(double stride, double bytes) const
Halide::Internal::Autoscheduler::LoopNest::FeatureIntermediates::inlined_calls
double inlined_calls
Definition: LoopNest.h:128
Halide::Internal::Autoscheduler::LoopNest::get_pure_stage_vectorized_loop_index
int get_pure_stage_vectorized_loop_index(const FunctionDAG::Node *node) const
Halide::Internal::Autoscheduler::LoopNest::producer_computed_here_or_further_in
bool producer_computed_here_or_further_in(const FunctionDAG::Node *producer) const
Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::FuncVar::accessor
string accessor
Definition: LoopNest.h:227
Halide::Internal::Autoscheduler::LoopNest::Sites::is_stored_in_shared_mem
bool is_stored_in_shared_mem() const
Definition: LoopNest.h:226
Halide::Internal::Autoscheduler::LoopNest::FeatureIntermediates::num_scalars
double num_scalars
Definition: LoopNest.h:130
Halide::Internal::Autoscheduler::LoopNest::store_at
std::set< const FunctionDAG::Node * > store_at
Definition: LoopNest.h:49
Halide::Internal::Autoscheduler::BoundContents
Definition: FunctionDAG.h:275
Halide::Internal::Autoscheduler::LoopNest::get_gpu_memory_type
GPUMemoryType get_gpu_memory_type(bool in_block, bool in_thread, bool is_inlined=false) const
Halide::Internal::Autoscheduler::LoopNest::ref_count
RefCount ref_count
Definition: LoopNest.h:35
Halide::Internal::Autoscheduler::LoopNest::Sites::is_constant_allocation
bool is_constant_allocation
Definition: LoopNest.h:217
Halide::Var
A Halide variable, to be used when defining functions.
Definition: Var.h:19
Halide::Internal::Autoscheduler::LoopNest::FeatureIntermediates::num_vectors
double num_vectors
Definition: LoopNest.h:129
Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::ancestors
vector< StageScheduleState * > ancestors
Definition: LoopNest.h:535
Halide::Internal::Autoscheduler::LoopNest::Sites::gpu_store_memory_type
GPUMemoryType gpu_store_memory_type
Definition: LoopNest.h:215
Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::gpu_thread_extents
vector< int64_t > gpu_thread_extents
Definition: LoopNest.h:530
Halide::Internal::Autoscheduler::LoopNest::collect_stages
void collect_stages(std::set< const FunctionDAG::Node::Stage * > &stages) const
Halide::Internal::Autoscheduler::LoopNest::compute_gpu_store_features
void compute_gpu_store_features(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const GPULoopInfo &gpu_loop_info, const std::vector< int64_t > &inner_serial_loop_extents, const Sites &consumer_site, ScheduleFeatures &feat, const LoopNest *parent, const LoopNest &root, GlobalMemInfo &global_mem_loads, SharedMemInfo &shared_mem_loads, LocalMemInfo &local_mem_loads, bool verbose=false) const
Halide::Internal::Autoscheduler::LoopNest::compute_mem_store_info
MemInfoType< T > compute_mem_store_info(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const ThreadInfo &thread_info, double serial_loop_extents, bool verbose) const
Halide::Internal::Autoscheduler::LoopNest::vectorized_loop_index
int vectorized_loop_index
Definition: LoopNest.h:75
Halide::Internal::Autoscheduler::LoopNest::promote_allocs_to_registers
bool promote_allocs_to_registers(const Target &target, StageMap< Sites > &sites) const
Halide::Internal::Autoscheduler::get_idle_lane_wastage_limit_env_var
double get_idle_lane_wastage_limit_env_var()
Halide::Internal::Autoscheduler::LoopNest::get_vectorized_loop_index_from_pure_stage
int get_vectorized_loop_index_from_pure_stage(const LoopNest &root) const
Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::FuncVar::FuncVar
FuncVar()
Definition: LoopNest.h:514
Halide::Internal::Autoscheduler::GPUMemoryType::Shared
@ Shared
Halide::Internal::Autoscheduler::LoopNest::compute_shared_mem_occupancy
void compute_shared_mem_occupancy(const Anderson2021Params &params, const Target &target, int64_t total_shared_mem_alloc_size, ScheduleFeatures &feat) const
Halide::Internal::Autoscheduler::LoopNest::inlined
NodeMap< int64_t > inlined
Definition: LoopNest.h:46
Halide::Internal::Autoscheduler::LoopNest::node_has_dynamic_region_computed
bool node_has_dynamic_region_computed(const FunctionDAG::Node *f) const
Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::FuncVar::gpu_threads
bool gpu_threads
Definition: LoopNest.h:512
Halide::Internal::Autoscheduler::Filter::enable_filter_printing
static bool enable_filter_printing()
Halide::Internal::Autoscheduler::LoopNest::can_vectorize_store_access
bool can_vectorize_store_access(const LoadJacobian &jac, const FunctionDAG::Node *accessed, bool accessed_has_been_scheduled, int innermost_dim, int loop_index, const GPUMemoryType &mem_type) const
Halide::Internal::Autoscheduler::LoopNest::has_dynamic_allocation_inside_thread
bool has_dynamic_allocation_inside_thread(bool in_thread_loop) const
Halide::Internal::Autoscheduler::LoopNest::gpu_label
GPU_parallelism gpu_label
Definition: LoopNest.h:125
Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::FuncVar::var
VarOrRVar var
Definition: LoopNest.h:223
Halide::Internal::Autoscheduler::GPU_parallelism::Simd
@ Simd
Halide::Internal::Autoscheduler::LoopNest::set_bounds
const Bound & set_bounds(const FunctionDAG::Node *f, BoundContents *b) const
Definition: LoopNest.h:379
Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::FuncVar::extent
int64_t extent
Definition: LoopNest.h:231
Halide::Internal::Autoscheduler::LoopNest::requires_dynamic_allocation
bool requires_dynamic_allocation(const FunctionDAG::Node *f, const Target &target, bool in_threads_loop) const
Halide::Internal::Autoscheduler::FunctionDAG
Definition: FunctionDAG.h:368
Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::ordered_vars
vector< FuncVar > ordered_vars
Definition: LoopNest.h:529
Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::vars
vector< FuncVar > vars
Definition: LoopNest.h:526
Halide::Internal::Autoscheduler::LoopNest::FeatureIntermediates::vector_size
double vector_size
Definition: LoopNest.h:131
Halide::Internal::Autoscheduler::GPUMemoryType
GPUMemoryType
Definition: LoopNest.h:42
Halide::Internal::Autoscheduler::LoopNest::get_bounds
const Bound & get_bounds(const FunctionDAG::Node *f) const
Halide::Internal::Autoscheduler::LoopNest::memoize_features
void memoize_features(StageMap< ScheduleFeatures > &memoized_features, const StageMap< ScheduleFeatures > *features_to_insert) const
Halide::Internal::Autoscheduler::LoopNest::Sites::compute
const LoopNest * compute
Definition: LoopNest.h:101
Halide::Internal::Autoscheduler::LoopNest::has_thread_loop_descendant
bool has_thread_loop_descendant() const
Halide::Internal::Autoscheduler::GPUMemoryType::Inlined
@ Inlined
Halide::Internal::Autoscheduler::GPU_parallelism
GPU_parallelism
Definition: LoopNest.h:32
Halide::Internal::Autoscheduler::LoopNest::is_scalar
bool is_scalar() const
Definition: LoopNest.h:154
Halide::Internal::Autoscheduler::LoopNest::collect_all_inlined
void collect_all_inlined(NodeMap< bool > &all_inlined) const
Halide::Internal::Autoscheduler::Statistics
Definition: Statistics.h:63
Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::FuncVar::index
size_t index
Definition: LoopNest.h:235
Halide::Internal::Autoscheduler::LoopNest::set_working_set_at_task_feature
void set_working_set_at_task_feature(int64_t working_set, StageMap< ScheduleFeatures > *features) const
Definition: LoopNest.h:260
Halide::Internal::Autoscheduler::LoopNest::compute_here
void compute_here(const FunctionDAG::Node *f, bool tileable, int v, const Adams2019Params &params)
Halide::Internal::Autoscheduler::LoopNest::dump
void dump(std::ostream &os, string prefix, const LoopNest *parent) const
Halide::Internal::Autoscheduler::LoopNest::parallel
bool parallel
Definition: LoopNest.h:69
Halide::Internal::Autoscheduler::LoopNest::get_block_and_serial_extents
std::pair< int64_t, int64_t > get_block_and_serial_extents(const LoopNest *block) const
Halide::Target::has_gpu_feature
bool has_gpu_feature() const
Is a fully feature GPU compute runtime enabled? I.e.
Halide::Internal::Autoscheduler::LoopNest::accesses_input_buffer
bool accesses_input_buffer() const
Halide::Internal::Autoscheduler::LoopNest::compute_mem_load_features
void compute_mem_load_features(const LoadJacobian &jac, int producer_innermost_dim, const FunctionDAG::Node *node, const Bound &producer_store_bounds, bool producer_has_been_scheduled, const ThreadInfo &thread_info, MemInfoType< T > &mem_info, double serial_loop_extents, bool verbose=false) const
Halide::Internal::Autoscheduler::Filter::operator<<
Filter & operator<<(T &&x)
Definition: LoopNest.h:589
Halide::Internal::Autoscheduler::LoopNest::points_accessed_per_thread
int64_t points_accessed_per_thread(const Anderson2021Params &params, const Target &target, const GPULoopInfo &gpu_loop_info, const std::vector< const FunctionDAG::Edge * > &edge_chain, const LoadJacobian &jac, const LoopNest *parent, const LoopNest *grandparent, int64_t n, const ScheduleFeatures &feat, const LoadJacobian &serial_jac, bool producer_has_been_scheduled, int producer_innermost_dim, const GPUMemoryType &mem_type, bool verbose=false) const
Halide::Internal::Autoscheduler::LoopNest::compute_working_set_from_features
void compute_working_set_from_features(int64_t *working_set, const StageMap< ScheduleFeatures > *features) const
Halide::Internal::Autoscheduler::LoopNest::copy_from_including_features
void copy_from_including_features(const LoopNest &n)
Halide::Internal::Autoscheduler::Filter::Filter
Filter(const LoopNest *loop_nest)
Definition: LoopNest.h:579
GPULoopInfo.h
Halide::Internal::Autoscheduler::NodeMap
PerfectHashMap< FunctionDAG::Node, T > NodeMap
Definition: LoopNest.h:21
Halide::Internal::IntrusivePtr
Intrusive shared pointers have a reference count (a RefCount object) stored in the class itself.
Definition: IntrusivePtr.h:68
Halide::Internal::Autoscheduler::LoopNest::compute_warp_features
void compute_warp_features(ScheduleFeatures &features, const GPULoopInfo &gpu_loop_info) const
Halide::Internal::Autoscheduler::LoopNest::calls
bool calls(const FunctionDAG::Node *f) const
Halide::Internal::Autoscheduler::LoopNest::max_inlined_calls
int64_t max_inlined_calls() const
Halide::Internal::Autoscheduler::LoopNest::Sites::produce
const LoopNest * produce
Definition: LoopNest.h:103
Halide::Internal::Autoscheduler::LoopNest::Sites::is_stored_in_local_mem
bool is_stored_in_local_mem() const
Definition: LoopNest.h:229
uint64_t
unsigned __INT64_TYPE__ uint64_t
Definition: runtime_internal.h:23
Halide::Internal::Autoscheduler::Filter
Definition: LoopNest.h:575
ThreadInfo.h
Halide::Internal::Autoscheduler::LoopNest::get_bounds_along_edge_chain
Bound get_bounds_along_edge_chain(const FunctionDAG::Node *f, const vector< const FunctionDAG::Edge * > &edge_chain) const
Halide::Internal::Autoscheduler::LoopNest::find_innermost_and_parent
std::pair< const LoopNest *, const LoopNest * > find_innermost_and_parent() const
Halide::Internal::Autoscheduler::LoopNest::innermost
bool innermost
Definition: LoopNest.h:63
Halide
This file defines the class FunctionDAG, which is our representation of a Halide pipeline,...
Definition: AbstractGenerator.h:19
PerfectHashMap.h
Halide::Internal::Autoscheduler::are_valid_thread_extents
bool are_valid_thread_extents(const vector< int64_t > &counts)
Halide::Internal::Autoscheduler::LoopNest::compute_strides
Strides compute_strides(const LoadJacobian &jac, int innermost_storage_dim, const FunctionDAG::Node *storage_node, const Bound &store_bounds, const ThreadInfo &thread_info, bool verbose=false) const
Halide::Internal::Autoscheduler::LoopNest::FeatureIntermediates::num_threads_per_block
double num_threads_per_block
Definition: LoopNest.h:135
Halide::Internal::ScheduleFeatures
Definition: Featurization.h:160
Halide::Internal::Autoscheduler::LoopNest::Sites::store
const LoopNest * store
Definition: LoopNest.h:102
Halide::Internal::Autoscheduler::accessed_at_constant_indices
bool accessed_at_constant_indices(const std::vector< int > &unrolled, const FunctionDAG::Edge *e)
Halide::Internal::Autoscheduler::LoopNest::FeatureIntermediates::num_warps_per_block
double num_warps_per_block
Definition: LoopNest.h:134
Halide::LinkageType::Internal
@ Internal
Not visible externally, similar to 'static' linkage in C.
Halide::Internal::Autoscheduler::LoopNest::copy_from
void copy_from(const LoopNest &n)
Halide::Internal::Autoscheduler::GPU_parallelism::Thread
@ Thread
Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::FuncVar::vectorized
bool vectorized
Definition: LoopNest.h:511
Halide::Internal::Autoscheduler::LoopNest::memoize_points_computed_minimum
void memoize_points_computed_minimum(StageMap< ScheduleFeatures > &memoized_features, const StageMap< ScheduleFeatures > *features) const
Halide::Internal::Autoscheduler::LoopNest::computes
bool computes(const FunctionDAG::Node *f) const
Halide::Internal::Autoscheduler::may_subtile
bool may_subtile(const Anderson2021Params &params)
Halide::Internal::Autoscheduler::LoopNest::product_of_descendants
int64_t product_of_descendants(int loop_index) const
Halide::Internal::Autoscheduler::LoopNest::node
const FunctionDAG::Node * node
Definition: LoopNest.h:57
Halide::Internal::Autoscheduler::LoopNest
Definition: LoopNest.h:34
FunctionDAG.h
Halide::Internal::Autoscheduler::LoopNest::get_sites
void get_sites(StageMap< Sites > &sites, const LoopNest *task=nullptr, const LoopNest *parent=nullptr) const
Halide::Internal::Autoscheduler::LoopNest::Sites::allocation_size
int64_t allocation_size
Definition: LoopNest.h:216
Halide::Internal::Autoscheduler::LoopNest::dump
void dump() const
Tiling.h
Halide::Internal::Autoscheduler::LoopNest::get_stage_sizes
void get_stage_sizes(const FunctionDAG::Node *f, vector< vector< int64_t >> &stage_sizes, vector< vector< int >> &pure_dims, vector< int > &vectorized_indices) const
Halide::Internal::Autoscheduler::LoopNest::parallelize_in_tiles
IntrusivePtr< const LoopNest > parallelize_in_tiles(const Adams2019Params &params, const vector< int64_t > &tiling, const LoopNest *parent) const
Halide::Internal::Autoscheduler::ThreadInfo
Definition: ThreadInfo.h:40
Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::node
const FunctionDAG::Node * node
Definition: LoopNest.h:518
Halide::Internal::Autoscheduler::LoopNest::apply
void apply(LoopLevel here, StageMap< std::unique_ptr< StageScheduleState >> &state_map, double num_cores, int depth, const LoopNest *parent, const LoopNest *compute_site) const
Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::FuncVar::parallel
bool parallel
Definition: LoopNest.h:240
Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::stage
const FunctionDAG::Node::Stage * stage
Definition: LoopNest.h:519
Halide::Internal::Autoscheduler::LoopNest::get_total_constant_local_mem_alloc_size
int64_t get_total_constant_local_mem_alloc_size() const
Halide::Internal::Autoscheduler::LoopNest::exceeds_serial_extents_limit
bool exceeds_serial_extents_limit(const Target &target, const LoopNest *parent, bool in_threads_loop) const
Halide::Internal::Autoscheduler::LoopNest::all_strides_exist
bool all_strides_exist(const LoadJacobian &jac, const FunctionDAG::Node *storage_node, const LoopNest &root) const
Halide::Internal::Autoscheduler::LoopNest::stage
const FunctionDAG::Node::Stage * stage
Definition: LoopNest.h:60
GPUMemInfo.h
Halide::Internal::Autoscheduler::stringify
std::string stringify(GPU_parallelism label)
Halide::Internal::Autoscheduler::LoopNest::vectorized_access_size
int vectorized_access_size(size_t loop_index, bool verbose=false) const
int64_t
signed __INT64_TYPE__ int64_t
Definition: runtime_internal.h:22
Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::vectorized
bool vectorized
Definition: LoopNest.h:521
Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::vectorized_loop_index
int vectorized_loop_index
Definition: LoopNest.h:215
Halide::Internal::Autoscheduler::LoopNest::size
std::vector< int64_t > size
Definition: LoopNest.h:39
Halide::Internal::Autoscheduler::LoopNest::feature_intermediates
std::map< uint64_t, StageMap< StageMap< FeatureIntermediates > > > feature_intermediates
Definition: LoopNest.h:139
Halide::Internal::Autoscheduler::get_active_block_hardware_limit
int64_t get_active_block_hardware_limit(const Anderson2021Params &params)
Halide::Internal::Autoscheduler::LoopNest::Sites::thread
const LoopNest * thread
Definition: LoopNest.h:214
Halide::Internal::Autoscheduler::FunctionDAG::Node::Stage
Definition: FunctionDAG.h:449
Halide::Internal::Autoscheduler::Filter::logging
bool logging
Definition: LoopNest.h:577
Halide::Internal::Autoscheduler::LoopNest::num_serial_loops
int num_serial_loops() const
Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::FuncVar::pure
bool pure
Definition: LoopNest.h:242
Halide::Internal::Autoscheduler::get_idle_lane_wastage_limit
double get_idle_lane_wastage_limit()
Halide::Internal::Autoscheduler::GPU_parallelism::None
@ None
Halide::Internal::Autoscheduler::LoopNest::Sites::inlined
bool inlined
Definition: LoopNest.h:106
Halide::Internal::Autoscheduler::MemInfo
Definition: GPUMemInfo.h:56
Halide::Internal::Autoscheduler::LoopNest::get_allocs_that_can_be_promoted_to_registers
void get_allocs_that_can_be_promoted_to_registers(const Target &target, StageMap< Sites > &sites, NodeMap< bool > &can_be_promoted_to_registers, const LoopNest *grandparent, const LoopNest *parent) const
Halide::Internal::Autoscheduler::LoopNest::hash_combine
static void hash_combine(uint64_t &h, uint64_t next)
Definition: LoopNest.h:187
SearchSpaceOptions.h
Halide::Internal::Autoscheduler::all
bool all(const vector< int > &v)
Halide::Internal::Autoscheduler::LoopNest::recompute_inlined_features
void recompute_inlined_features(const StageMap< Sites > &sites, StageMap< ScheduleFeatures > *features) const
Halide::Internal::Autoscheduler::LoopNest::compute_licm_amortization
int64_t compute_licm_amortization(const LoopNest *innermost, const LoopNest *parent, const ScheduleFeatures &feat, const LoadJacobian &jac, int producer_dims) const
ASLog.h
Halide::Internal::Autoscheduler::in_range_zero_one
bool in_range_zero_one(double x)
Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::producers_to_be_staged
NodeMap< std::vector< std::pair< const LoopNest *, std::vector< const FunctionDAG::Edge * > > > > producers_to_be_staged
Definition: LoopNest.h:532
Halide::Internal::Autoscheduler::LoopNest::get_union_thread_counts
vector< int64_t > get_union_thread_counts(const FunctionDAG::Node *f) const
Halide::Internal::Autoscheduler::GPUMemoryType::Global
@ Global
Halide::Internal::Autoscheduler::LoadJacobian
Definition: FunctionDAG.h:127
Halide::Internal::Autoscheduler::LoopNest::vectorized_load_access_size
int vectorized_load_access_size(const LoadJacobian &jac, const FunctionDAG::Node *accessed, bool accessed_has_been_scheduled, int innermost_dim, const GPUMemoryType &mem_type, bool verbose=false) const
Halide::Internal::Autoscheduler::GPU_parallelism::Block
@ Block
Halide::Internal::Autoscheduler::LoopNest::compute_features
void compute_features(const FunctionDAG &dag, const Adams2019Params &params, const StageMap< Sites > &sites, int64_t instances, int64_t parallelism, const LoopNest *parent, const LoopNest *grandparent, const LoopNest &root, int64_t *working_set, StageMap< ScheduleFeatures > *features, bool use_cached_features) const
CostModel.h
Halide::Internal::Autoscheduler::LoopNest::Sites::hash_of_producers_stored_at_root
uint64_t hash_of_producers_stored_at_root
Definition: LoopNest.h:109
Halide::Internal::Autoscheduler::LoopNest::FeatureIntermediates::points_computed_per_thread
double points_computed_per_thread
Definition: LoopNest.h:136
Halide::Internal::Autoscheduler::LoopNest::compute_in_tiles
std::vector< IntrusivePtr< const LoopNest > > compute_in_tiles(const FunctionDAG::Node *f, const LoopNest *parent, const Adams2019Params &params, int v, bool in_realization) const
Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::num_cores
double num_cores
Definition: LoopNest.h:211
Halide::Internal::Autoscheduler::LoopNest::is_gpu_thread
bool is_gpu_thread(const Target &target) const
Definition: LoopNest.h:146
Halide::Internal::Autoscheduler::LoopNest::bounds
NodeMap< Bound > bounds
Definition: LoopNest.h:54
Halide::LoopLevel
A reference to a site in a Halide statement at the top of the body of a particular for loop.
Definition: Schedule.h:176
Halide::Internal::Autoscheduler::GPULoopInfo
Definition: GPULoopInfo.h:21
Halide::Internal::Autoscheduler::LoopNest::other_stage_has_same_producer
bool other_stage_has_same_producer(const FunctionDAG::Node *producer) const
Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::vector_dim
int vector_dim
Definition: LoopNest.h:214
Halide::Internal::Autoscheduler::LoopNest::Sites::task
const LoopNest * task
Definition: LoopNest.h:105
Statistics.h
Halide::Internal::Autoscheduler::get_shared_memory_limit
int64_t get_shared_memory_limit(const Anderson2021Params &params)
Halide::Internal::Autoscheduler::LoopNest::Sites::is_stored_in_global_mem
bool is_stored_in_global_mem() const
Definition: LoopNest.h:223
Halide::Internal::Autoscheduler::LoopNest::collect_nodes_that_should_be_inlined
void collect_nodes_that_should_be_inlined(const NodeMap< bool > &nodes_to_freeze, NodeMap< bool > &inlined_nodes) const
Halide::Internal::Autoscheduler::Filter::loop_nest
const LoopNest * loop_nest
Definition: LoopNest.h:576
Halide::Internal::Autoscheduler::get_register_mem_alloc_limit
constexpr int64_t get_register_mem_alloc_limit()
Definition: LoopNest.h:56
Halide::Internal::Autoscheduler::GPU_parallelism::Parallelized
@ Parallelized
Halide::Internal::Autoscheduler::LoopNest::can_vectorize_access_for_innermost_dim
bool can_vectorize_access_for_innermost_dim(const LoadJacobian &jac, const FunctionDAG::Node *accessed, int innermost_dim, int loop_index) const
Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::FuncVar::innermost_pure_dim
bool innermost_pure_dim
Definition: LoopNest.h:238
Halide::Internal::Autoscheduler::LoopNest::features
std::map< uint64_t, StageMap< ScheduleFeatures > > features
Definition: LoopNest.h:140
Halide::Internal::Autoscheduler::LoopNest::has_constant_region_computed
bool has_constant_region_computed(const FunctionDAG::Node *node) const
PerfectHashMap
Definition: PerfectHashMap.h:38
Halide::Internal::Autoscheduler::Anderson2021Params
Definition: CostModel.h:18
Halide::Internal::Autoscheduler::get_unroll_limit
int get_unroll_limit(const Target &target)
Halide::Internal::Autoscheduler::StageMap
PerfectHashMap< FunctionDAG::Node::Stage, T > StageMap
Definition: LoopNest.h:24
Halide::Internal::Autoscheduler::LoopNest::size
vector< int64_t > size
Definition: LoopNest.h:86
Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::FuncVar::constant_extent
bool constant_extent
Definition: LoopNest.h:243
Halide::Internal::Autoscheduler::LoopNest::Sites::is_stored_in_registers
bool is_stored_in_registers() const
Definition: LoopNest.h:232
Halide::Internal::Autoscheduler::LoopNest::Sites::innermost
const LoopNest * innermost
Definition: LoopNest.h:104
Halide::Internal::Autoscheduler::LoopNest::to_string
std::string to_string() const
Halide::Internal::RefCount
A class representing a reference count to be used with IntrusivePtr.
Definition: IntrusivePtr.h:19
Halide::Internal::Autoscheduler::LoopNest::structural_hash
void structural_hash(uint64_t &h, int depth) const
Halide::Internal::Autoscheduler::LoopNest::Sites::inlined_innermosts
std::vector< const LoopNest * > inlined_innermosts
Definition: LoopNest.h:220
Halide::Internal::Autoscheduler::LoopNest::funcs_realized_or_inlined
size_t funcs_realized_or_inlined() const
Definition: LoopNest.h:199
Halide::Internal::Autoscheduler::GPU_parallelism::Serial
@ Serial
Halide::Internal::Autoscheduler::LoopNest::tileable
bool tileable
Definition: LoopNest.h:66
Halide::Internal::Autoscheduler::LoopNest::get_actual_vector_dim
int get_actual_vector_dim(const Bound &store_bounds) const
Halide::VarOrRVar
A class that can represent Vars or RVars.
Definition: Func.h:30
Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::all_innermost_unrolled
bool all_innermost_unrolled
Definition: LoopNest.h:522
Halide::Internal::Autoscheduler::LoopNest::region_computed_shrinks
bool region_computed_shrinks(const FunctionDAG::Node *f, const LoopNest *parent) const
Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::schedule_source
std::ostringstream schedule_source
Definition: LoopNest.h:252
Halide::Internal::Autoscheduler::LoopNest::Sites::num_realizations
int64_t num_realizations
Definition: LoopNest.h:218
Halide::Internal::Autoscheduler::LoopNest::FeatureIntermediates
Definition: LoopNest.h:127
Halide::Internal::Autoscheduler::LoopNest::has_valid_thread_extents
bool has_valid_thread_extents() const
Halide::Internal::Autoscheduler::LoopNest::is_gpu_serial
bool is_gpu_serial(const Target &target) const
Definition: LoopNest.h:142
Halide::Internal::Autoscheduler::LoopNest::compute_num_mem_accesses_per_block
void compute_num_mem_accesses_per_block(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo &thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType< T > &mem_info, bool verbose=false) const
Halide::Internal::Autoscheduler::LoopNest::FeatureIntermediates::innermost_pure_loop_extent
double innermost_pure_loop_extent
Definition: LoopNest.h:132
Halide::Internal::Autoscheduler::LoopNest::get_enclosing_block
const LoopNest * get_enclosing_block(const LoopNest *parent, const LoopNest *grandparent) const
Halide::Internal::Autoscheduler::FunctionDAG::Node
Definition: FunctionDAG.h:379
Halide::Internal::Autoscheduler::LoopNest::inline_func
void inline_func(const FunctionDAG::Node *f)
Halide::Internal::Autoscheduler::LoopNest::get_total_local_mem_alloc_size
int64_t get_total_local_mem_alloc_size(bool constant_allocs_only=false, bool in_threads_loop=false) const
Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::FuncVar::exists
bool exists
Definition: LoopNest.h:241
Halide::Internal::Autoscheduler::LoopNest::children
std::vector< IntrusivePtr< const LoopNest > > children
Definition: LoopNest.h:42
Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::vectorized_var
FuncVar vectorized_var
Definition: LoopNest.h:523
Halide::Internal::Autoscheduler::LoopNest::product_of_self_and_descendants
int64_t product_of_self_and_descendants(int loop_index) const
Halide::Internal::Autoscheduler::LoopNest::is_gpu_block
bool is_gpu_block(const Target &target) const
Definition: LoopNest.h:150
Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::FuncVar::outermost
bool outermost
Definition: LoopNest.h:239
Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::FuncVar::orig
VarOrRVar orig
Definition: LoopNest.h:220
Halide::Internal::Autoscheduler::LoopNest::max_idle_lane_wastage
double max_idle_lane_wastage(const Target &target, GPULoopInfo gpu_loop_info) const
Halide::Internal::Autoscheduler::LoopNest::get_stages_computed_in_each_compute_root_loop
void get_stages_computed_in_each_compute_root_loop(StageMap< StageMap< bool >> &descendants, const LoopNest *compute_root_loop_nest=nullptr) const
Halide::Internal::Autoscheduler::LoopNest::compute_hash_of_producers_stored_at_root
uint64_t compute_hash_of_producers_stored_at_root(const StageMap< Sites > &sites) const
Halide::Target
A struct representing a target machine and os to generate code for.
Definition: Target.h:19
Halide::Internal::Autoscheduler::LoopNest::update_producers_to_be_staged
void update_producers_to_be_staged(StageScheduleState &state, const NodeMap< bool > &all_inlined) const
Halide::Internal::Autoscheduler::LoopNest::compute_local_mem_store_features
std::pair< double, double > compute_local_mem_store_features(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const LoopNest &root, double serial_loop_extents) const
Halide::Internal::Autoscheduler::LoopNest::is_root
bool is_root() const
Definition: LoopNest.h:372
Halide::Internal::Autoscheduler::LoopNest::vector_dim
int vector_dim
Definition: LoopNest.h:72
Halide::Internal::Autoscheduler::LoopNest::generate_vec_dim_serial_tilings
void generate_vec_dim_serial_tilings(vector< int > &serial_sizes) const
Halide::Internal::Autoscheduler::LoopNest::find_pure_stage_loop_nest
const LoopNest * find_pure_stage_loop_nest(const FunctionDAG::Node *node) const
Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::parallel
bool parallel
Definition: LoopNest.h:520
Halide::Internal::Autoscheduler::get_active_warp_hardware_limit
int64_t get_active_warp_hardware_limit(const Anderson2021Params &params)
Halide::Internal::Autoscheduler::LoopNest::FeatureIntermediates::outer_parallelism
double outer_parallelism
Definition: LoopNest.h:133
Halide::Internal::Autoscheduler::LoopNest::compute_alloc_size_of_node_here
std::pair< int64_t, bool > compute_alloc_size_of_node_here(const FunctionDAG::Node *f) const
Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::FuncVar
Definition: LoopNest.h:218
Halide::Internal::Autoscheduler::Strides
Definition: GPUMemInfo.h:115
Halide::Internal::Autoscheduler::GPUMemoryType::Registers
@ Registers
Halide::Internal::Autoscheduler::LoopNest::add_gpu_thread_tilings
bool add_gpu_thread_tilings(const FunctionDAG::Node *f, const Anderson2021Params &params, const Target &target, int v, vector< IntrusivePtr< const LoopNest >> &result, const vector< int64_t > &max_size)
Halide::Internal::Autoscheduler::LoopNest::unrolled_loops
std::vector< int > unrolled_loops(const Target &target, const LoopNest *parent, const LoopNest *grandparent) const