Halide
LoopNest.h
Go to the documentation of this file.
1 /** This file defines the LoopNest, which is our
2  * representation of a Halide schedule, and contains methods to
3  * generate candidates for scheduling as well as extract a
4  * featurization that can be used to cost each candidate. */
5 
6 #ifndef LOOP_NEST_H
7 #define LOOP_NEST_H
8 
9 #include "FunctionDAG.h"
10 #include "PerfectHashMap.h"
11 #include <map>
12 #include <set>
13 #include <utility>
14 #include <vector>
15 
16 namespace Halide {
17 namespace Internal {
18 namespace Autoscheduler {
19 
20 template<typename T>
22 
23 template<typename T>
25 
26 // Given a multi-dimensional box of dimensionality d, generate a list
27 // of candidate tile sizes for it, logarithmically spacing the sizes
28 // using the given factor. If 'allow_splits' is false, every dimension
29 // must either be one, or the full extent of the box. This function is
30 // used to generate candidate tilings when tiling for
31 // producer-consumer fusion, or tiling for parallelism.
32 std::vector<std::vector<int64_t>> generate_tilings(const vector<int64_t> &s, int d, int factor, bool allow_splits);
33 
34 struct LoopNest {
36 
37  // The extents of this loop. Put another way, the number of tiles,
38  // not the size of each tile.
39  std::vector<int64_t> size;
40 
41  // The nodes inside the loop body
42  std::vector<IntrusivePtr<const LoopNest>> children;
43 
44  // Funcs inlined into this inner loop, and the number of times
45  // each is called. Only valid if children is empty.
47 
48  // Funcs stored inside this loop
49  std::set<const FunctionDAG::Node *> store_at;
50 
51  // The total bounds required of any given Func over all iterations
52  // of this loop. In the paper, this is represented using the
53  // little boxes to the left of the loop nest tree figures.
55 
56  // The Func this loop nest belongs to
57  const FunctionDAG::Node *node = nullptr;
58 
59  // The stage of the Func
60  const FunctionDAG::Node::Stage *stage = nullptr;
61 
62  // Is this the innermost loop of this func (the SIMD loop)?
63  bool innermost = false;
64 
65  // Are we permitted to tile this loop?
66  bool tileable = false;
67 
68  // Is this the parallel outer loop?
69  bool parallel = false;
70 
71  // What dimension is this Func vectorized over, in terms of the pure args of the Func?
72  int vector_dim = -1;
73 
74  // Which loop corresponds to the innermost storage dimension and will be vectorized. -1 means none of them.
76 
77  void copy_from(const LoopNest &n);
78 
79  static void hash_combine(uint64_t &h, uint64_t next) {
80  // From boost
81  h ^= (next + 0x9e3779b9 + (h << 6) + (h >> 2));
82  }
83 
84  // Hash the loop structure and sizes up to a fixed depth. This is
85  // used as the hash function for the coarse-to-fine beam search in
86  // the paper.
87  void structural_hash(uint64_t &h, int depth) const;
88 
89  // How many funcs are scheduled inside this loop level. Used in
90  // the structural hash.
91  size_t funcs_realized_or_inlined() const {
92  size_t count = inlined.size() + store_at.size();
93  for (const auto &c : children) {
94  count += c->funcs_realized_or_inlined();
95  }
96  return count;
97  }
98 
99  // All of a stage's interesting locations in the loop nest. Used to help compute the featurization of a stage.
100  struct Sites {
101  const LoopNest *compute = nullptr; // Its containing compute_at site
102  const LoopNest *store = nullptr; // Its containing store_at site
103  const LoopNest *produce = nullptr; // Its own outermost node
104  const LoopNest *innermost = nullptr; // Its innermost node - usually a SIMD loop
105  const LoopNest *task = nullptr; // The parallel for loop it belongs to
106  bool inlined = false; // Is the Func inlined?
107 
108  // Used for caching features/feature intermediates.
110  };
111 
112  // Compute all the sites of interest for each pipeline stage
113  void get_sites(StageMap<Sites> &sites,
114  const LoopNest *task = nullptr,
115  const LoopNest *parent = nullptr) const;
116 
117  // A helper for the working_set_at_task feature. Most features are
118  // computed in the recursive pass 'compute_features' below, but
119  // this one must be done in a second separate recursive pass.
122  for (const auto &c : children) {
123  c->set_working_set_at_task_feature(working_set, features);
124  features->get(c->stage).working_set_at_task = working_set;
125  }
126  }
127 
128  // Do a recursive walk over the loop nest computing features to feed the cost model.
129  void compute_features(const FunctionDAG &dag,
130  const Adams2019Params &params,
131  const StageMap<Sites> &sites,
132  int64_t instances,
133  int64_t parallelism,
134  const LoopNest *parent,
135  const LoopNest *grandparent,
136  const LoopNest &root,
137  int64_t *working_set,
139  bool use_cached_features) const;
140 
141  bool is_root() const {
142  // The root is the sole node without a Func associated with
143  // it.
144  return node == nullptr;
145  }
146 
147  // Set the region required of a Func at this site.
148  const Bound &set_bounds(const FunctionDAG::Node *f, BoundContents *b) const {
149  return bounds.emplace(f, b);
150  }
151 
152  // Get the region required of a Func at this site, from which we
153  // know what region would be computed if it were scheduled here,
154  // and what its loop nest would be.
155  const Bound &get_bounds(const FunctionDAG::Node *f) const;
156 
157  // Recursively print a loop nest representation to stderr
158  void dump(std::ostream &os, string prefix, const LoopNest *parent) const;
159 
160  // Does this loop nest access the given Func
161  bool calls(const FunctionDAG::Node *f) const;
162 
163  // What is the maximum number of inlined calls to a Func that
164  // occur within this loop. Used to prune states that would
165  // generate too much code.
166  int64_t max_inlined_calls() const;
167 
168  // Does this loop nest access an input buffer? Used to select
169  // trail strategies when splitting loops. We don't want to read
170  // out of bounds on inputs, even if we don't intend to use the
171  // values read. It could create annoying assertion failures for
172  // the user. It's OK to read out of range of the values computed
173  // on internal Funcs though. Allocation bounds inference just pads
174  // out the bounds so that it won't fault.
175  bool accesses_input_buffer() const;
176 
177  // Does this loop nest contain a computation of the given Func.
178  bool computes(const FunctionDAG::Node *f) const;
179 
180  // Above here most methods query the loop nest. Below we have
181  // methods that mutate the loop nest.
182 
183  // Inline a Func into all consumers within this loop.
184  void inline_func(const FunctionDAG::Node *f);
185 
186  // Compute a Func at this site.
187  void compute_here(const FunctionDAG::Node *f, bool tileable, int v, const Adams2019Params &params);
188 
189  // Parallelize this loop according to the given tiling.
191  const vector<int64_t> &tiling,
192  const LoopNest *parent) const;
193 
194  // Return all possible ways to compute f in tiles somewhere within
195  // this loop nest.
196  std::vector<IntrusivePtr<const LoopNest>> compute_in_tiles(const FunctionDAG::Node *f,
197  const LoopNest *parent,
198  const Adams2019Params &params,
199  int v,
200  bool in_realization) const;
201 
202  // Below here we have methods that apply a schedule to a Halide pipeline.
203 
204  // A model of the state of the loop nest of a Func while applying
205  // Halide's scheduling directives.
206 
207  // Note that StageScheduleState is movable-but-not-copyable thanks
208  // to its ostringstream member.
210  // How much parallelism do we need to exploit with this Func?
211  double num_cores = 0;
212 
213  // Which storage dimension is vectorized? We need to reorder it innermost
214  int vector_dim = -1;
216 
217  // The various Vars and RVars used for scheduling a Func.
218  struct FuncVar {
219  // The top-level var or rvar this was split off from
221 
222  // This var.
224 
225  // Source code to access this Var/RVar. Used for printing
226  // valid Halide source for this schedule.
227  string accessor;
228 
229  // Our estimate of the extent of this var. This is exact
230  // when constant_extent flag is true.
232 
233  // Which index in the symbolic loop nest does this var
234  // belong to.
235  size_t index = 0;
236 
237  // Some flags.
238  bool innermost_pure_dim = false,
239  outermost = false,
240  parallel = false,
241  exists = false,
242  pure = false,
245  : orig(Var()), var(Var()) {
246  }
247  };
248 
249  // In order from innermost to outermost. Each group of d is one tiling level.
250  std::vector<FuncVar> vars;
251 
252  std::ostringstream schedule_source;
253  };
254 
255  // Apply the schedule represented by this loop nest to a Halide pipeline.
256  void apply(LoopLevel here,
257  StageMap<std::unique_ptr<StageScheduleState>> &state_map,
258  double num_cores,
259  int depth,
260  const LoopNest *parent,
261  const LoopNest *compute_site) const;
262 
263  // The below are two feature caches.
264  // hash of producers -> StageMap
265  mutable std::map<uint64_t, StageMap<StageMap<FeatureIntermediates>>> feature_intermediates_cache;
266  // hash of producers -> StageMap
267  mutable std::map<uint64_t, StageMap<ScheduleFeatures>> features_cache;
268 
269  // Same as copy_from (above) but also copies the two caches.
270  void copy_from_including_features(const LoopNest &n);
271 
272  // Loops through inlined funcs and caches the pcm found in features, into memoized_features.
274  const StageMap<ScheduleFeatures> *features) const;
275 
276  // Merges features_to_insert into memoized_features if it does not already exist there.
277  void memoize_features(StageMap<ScheduleFeatures> &memoized_features,
278  const StageMap<ScheduleFeatures> *features_to_insert) const;
279 
280  // Recalculates working_set from cached features
281  void compute_working_set_from_features(int64_t *working_set,
282  const StageMap<ScheduleFeatures> *features) const;
283 
284  // Features need to be recomputed for inlined Funcs
287 
288  // Create a (hopefully) unique hash of the producers.
290 
291  // Gather all stages that are producers for any Func in this LoopNest.
292  std::vector<std::pair<int, int>> collect_producers(const StageMap<Sites> &sites) const;
293 
294  // Collect all stages referenced in this LoopNest.
295  void collect_stages(std::set<const FunctionDAG::Node::Stage *> &stages) const;
296 };
297 
298 // Find the deepest common ancestor of `a` and `b`.
299 // `parents` is a map from loop nest to (parent, depth) tuples.
300 // Assumes that `a` and `b` are found in `parents`, otherwise errors.
301 const LoopNest *deepest_common_ancestor(const std::map<const LoopNest *, std::pair<const LoopNest *, int>> &parents,
302  const LoopNest *a, const LoopNest *b);
303 
304 // Compute the parent and depth of every loop nest node.
305 // Stores in `parents` the children of `here` (keys) to tuples of (here, depth).
306 // Recurses on all children of `here`.
307 void compute_loop_nest_parents(std::map<const LoopNest *, std::pair<const LoopNest *, int>> &parents,
308  const LoopNest *here, int depth);
309 
310 } // namespace Autoscheduler
311 } // namespace Internal
312 } // namespace Halide
313 
314 #endif // LOOP_NEST_H
Halide::Internal::Autoscheduler::LoopNest::collect_producers
std::vector< std::pair< int, int > > collect_producers(const StageMap< Sites > &sites) const
Halide::Internal::Autoscheduler::LoopNest::features_cache
std::map< uint64_t, StageMap< ScheduleFeatures > > features_cache
Definition: LoopNest.h:267
Halide::Internal::Autoscheduler::LoopNest::feature_intermediates_cache
std::map< uint64_t, StageMap< StageMap< FeatureIntermediates > > > feature_intermediates_cache
Definition: LoopNest.h:265
Halide::Internal::Autoscheduler::LoopNest::StageScheduleState
Definition: LoopNest.h:209
Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::FuncVar::accessor
string accessor
Definition: LoopNest.h:227
Halide::Internal::Autoscheduler::LoopNest::store_at
std::set< const FunctionDAG::Node * > store_at
Definition: LoopNest.h:49
Halide::Internal::Autoscheduler::BoundContents
Definition: FunctionDAG.h:275
Halide::Internal::Autoscheduler::LoopNest::ref_count
RefCount ref_count
Definition: LoopNest.h:35
Halide::Var
A Halide variable, to be used when defining functions.
Definition: Var.h:19
Halide::Internal::Autoscheduler::LoopNest::collect_stages
void collect_stages(std::set< const FunctionDAG::Node::Stage * > &stages) const
Halide::Internal::Autoscheduler::LoopNest::vectorized_loop_index
int vectorized_loop_index
Definition: LoopNest.h:75
Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::FuncVar::FuncVar
FuncVar()
Definition: LoopNest.h:244
Halide::Internal::Autoscheduler::LoopNest::inlined
NodeMap< int64_t > inlined
Definition: LoopNest.h:46
Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::FuncVar::var
VarOrRVar var
Definition: LoopNest.h:223
Halide::Internal::Autoscheduler::LoopNest::set_bounds
const Bound & set_bounds(const FunctionDAG::Node *f, BoundContents *b) const
Definition: LoopNest.h:148
Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::FuncVar::extent
int64_t extent
Definition: LoopNest.h:231
Halide::Internal::Autoscheduler::FunctionDAG
Definition: FunctionDAG.h:368
Halide::Internal::Autoscheduler::LoopNest::get_bounds
const Bound & get_bounds(const FunctionDAG::Node *f) const
Halide::Internal::Autoscheduler::LoopNest::memoize_features
void memoize_features(StageMap< ScheduleFeatures > &memoized_features, const StageMap< ScheduleFeatures > *features_to_insert) const
Halide::Internal::Autoscheduler::LoopNest::Sites::compute
const LoopNest * compute
Definition: LoopNest.h:101
Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::FuncVar::index
size_t index
Definition: LoopNest.h:235
Halide::Internal::Autoscheduler::LoopNest::set_working_set_at_task_feature
void set_working_set_at_task_feature(int64_t working_set, StageMap< ScheduleFeatures > *features) const
Definition: LoopNest.h:120
Halide::Internal::Autoscheduler::LoopNest::compute_here
void compute_here(const FunctionDAG::Node *f, bool tileable, int v, const Adams2019Params &params)
Halide::Internal::Autoscheduler::LoopNest::parallel
bool parallel
Definition: LoopNest.h:69
Halide::Internal::Autoscheduler::LoopNest::accesses_input_buffer
bool accesses_input_buffer() const
Halide::Internal::Autoscheduler::LoopNest::compute_working_set_from_features
void compute_working_set_from_features(int64_t *working_set, const StageMap< ScheduleFeatures > *features) const
Halide::Internal::Autoscheduler::LoopNest::copy_from_including_features
void copy_from_including_features(const LoopNest &n)
Halide::Internal::IntrusivePtr
Intrusive shared pointers have a reference count (a RefCount object) stored in the class itself.
Definition: IntrusivePtr.h:68
Halide::Internal::Autoscheduler::LoopNest::calls
bool calls(const FunctionDAG::Node *f) const
Halide::Internal::Autoscheduler::LoopNest::max_inlined_calls
int64_t max_inlined_calls() const
Halide::Internal::Autoscheduler::LoopNest::Sites::produce
const LoopNest * produce
Definition: LoopNest.h:103
uint64_t
unsigned __INT64_TYPE__ uint64_t
Definition: runtime_internal.h:23
Halide::Internal::Autoscheduler::LoopNest::innermost
bool innermost
Definition: LoopNest.h:63
Halide
This file defines the class FunctionDAG, which is our representation of a Halide pipeline,...
Definition: AbstractGenerator.h:19
PerfectHashMap.h
Halide::Internal::Autoscheduler::LoopNest::Sites::store
const LoopNest * store
Definition: LoopNest.h:102
Halide::LinkageType::Internal
@ Internal
Not visible externally, similar to 'static' linkage in C.
Halide::Internal::Autoscheduler::LoopNest::copy_from
void copy_from(const LoopNest &n)
Halide::Internal::Autoscheduler::LoopNest::memoize_points_computed_minimum
void memoize_points_computed_minimum(StageMap< ScheduleFeatures > &memoized_features, const StageMap< ScheduleFeatures > *features) const
Halide::Internal::Autoscheduler::LoopNest::computes
bool computes(const FunctionDAG::Node *f) const
Halide::Internal::Autoscheduler::LoopNest::node
const FunctionDAG::Node * node
Definition: LoopNest.h:57
Halide::Internal::Autoscheduler::LoopNest
Definition: LoopNest.h:34
Halide::Internal::Autoscheduler::LoopNest::get_sites
void get_sites(StageMap< Sites > &sites, const LoopNest *task=nullptr, const LoopNest *parent=nullptr) const
Halide::Internal::Autoscheduler::LoopNest::dump
void dump() const
Halide::Internal::Autoscheduler::compute_loop_nest_parents
void compute_loop_nest_parents(std::map< const LoopNest *, std::pair< const LoopNest *, int >> &parents, const LoopNest *here, int depth)
Halide::Internal::Autoscheduler::Adams2019Params
Definition: CostModel.h:19
Halide::Internal::Autoscheduler::LoopNest::parallelize_in_tiles
IntrusivePtr< const LoopNest > parallelize_in_tiles(const Adams2019Params &params, const vector< int64_t > &tiling, const LoopNest *parent) const
Halide::Internal::Autoscheduler::LoopNest::apply
void apply(LoopLevel here, StageMap< std::unique_ptr< StageScheduleState >> &state_map, double num_cores, int depth, const LoopNest *parent, const LoopNest *compute_site) const
Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::FuncVar::parallel
bool parallel
Definition: LoopNest.h:240
Halide::Internal::Autoscheduler::LoopNest::stage
const FunctionDAG::Node::Stage * stage
Definition: LoopNest.h:60
int64_t
signed __INT64_TYPE__ int64_t
Definition: runtime_internal.h:22
Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::vectorized_loop_index
int vectorized_loop_index
Definition: LoopNest.h:215
Halide::Internal::Autoscheduler::LoopNest::size
std::vector< int64_t > size
Definition: LoopNest.h:39
Halide::Internal::Autoscheduler::FunctionDAG::Node::Stage
Definition: FunctionDAG.h:449
Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::FuncVar::pure
bool pure
Definition: LoopNest.h:242
Halide::Internal::Autoscheduler::LoopNest::Sites::inlined
bool inlined
Definition: LoopNest.h:106
Halide::Internal::Autoscheduler::LoopNest::hash_combine
static void hash_combine(uint64_t &h, uint64_t next)
Definition: LoopNest.h:79
Halide::Internal::Autoscheduler::generate_tilings
std::vector< std::vector< int64_t > > generate_tilings(const vector< int64_t > &s, int d, int factor, bool allow_splits)
Halide::Internal::Autoscheduler::LoopNest::Sites
Definition: LoopNest.h:100
Halide::Internal::Autoscheduler::LoopNest::recompute_inlined_features
void recompute_inlined_features(const StageMap< Sites > &sites, StageMap< ScheduleFeatures > *features) const
Halide::Internal::Autoscheduler::LoopNest::compute_features
void compute_features(const FunctionDAG &dag, const Adams2019Params &params, const StageMap< Sites > &sites, int64_t instances, int64_t parallelism, const LoopNest *parent, const LoopNest *grandparent, const LoopNest &root, int64_t *working_set, StageMap< ScheduleFeatures > *features, bool use_cached_features) const
Halide::Internal::Autoscheduler::LoopNest::Sites::hash_of_producers_stored_at_root
uint64_t hash_of_producers_stored_at_root
Definition: LoopNest.h:109
Halide::Internal::Autoscheduler::LoopNest::compute_in_tiles
std::vector< IntrusivePtr< const LoopNest > > compute_in_tiles(const FunctionDAG::Node *f, const LoopNest *parent, const Adams2019Params &params, int v, bool in_realization) const
Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::num_cores
double num_cores
Definition: LoopNest.h:211
Halide::Internal::Autoscheduler::deepest_common_ancestor
const LoopNest * deepest_common_ancestor(const std::map< const LoopNest *, std::pair< const LoopNest *, int >> &parents, const LoopNest *a, const LoopNest *b)
Halide::Internal::Autoscheduler::LoopNest::bounds
NodeMap< Bound > bounds
Definition: LoopNest.h:54
Halide::LoopLevel
A reference to a site in a Halide statement at the top of the body of a particular for loop.
Definition: Schedule.h:176
Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::vector_dim
int vector_dim
Definition: LoopNest.h:214
Halide::Internal::Autoscheduler::LoopNest::Sites::task
const LoopNest * task
Definition: LoopNest.h:105
FunctionDAG.h
Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::FuncVar::innermost_pure_dim
bool innermost_pure_dim
Definition: LoopNest.h:238
Halide::Internal::Autoscheduler::LoopNest::features
std::map< uint64_t, StageMap< ScheduleFeatures > > features
Definition: LoopNest.h:140
PerfectHashMap
Definition: PerfectHashMap.h:38
Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::FuncVar::constant_extent
bool constant_extent
Definition: LoopNest.h:243
Halide::Internal::Autoscheduler::LoopNest::Sites::innermost
const LoopNest * innermost
Definition: LoopNest.h:104
Halide::Internal::RefCount
A class representing a reference count to be used with IntrusivePtr.
Definition: IntrusivePtr.h:19
Halide::Internal::Autoscheduler::LoopNest::structural_hash
void structural_hash(uint64_t &h, int depth) const
Halide::Internal::Autoscheduler::LoopNest::funcs_realized_or_inlined
size_t funcs_realized_or_inlined() const
Definition: LoopNest.h:91
Halide::Internal::Autoscheduler::LoopNest::tileable
bool tileable
Definition: LoopNest.h:66
Halide::VarOrRVar
A class that can represent Vars or RVars.
Definition: Func.h:30
Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::schedule_source
std::ostringstream schedule_source
Definition: LoopNest.h:252
Halide::Internal::Autoscheduler::FunctionDAG::Node
Definition: FunctionDAG.h:379
Halide::Internal::Autoscheduler::LoopNest::inline_func
void inline_func(const FunctionDAG::Node *f)
Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::FuncVar::exists
bool exists
Definition: LoopNest.h:241
Halide::Internal::Autoscheduler::LoopNest::children
std::vector< IntrusivePtr< const LoopNest > > children
Definition: LoopNest.h:42
Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::FuncVar::outermost
bool outermost
Definition: LoopNest.h:239
Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::FuncVar::orig
VarOrRVar orig
Definition: LoopNest.h:220
Halide::Internal::Autoscheduler::LoopNest::compute_hash_of_producers_stored_at_root
uint64_t compute_hash_of_producers_stored_at_root(const StageMap< Sites > &sites) const
Halide::Internal::Autoscheduler::LoopNest::is_root
bool is_root() const
Definition: LoopNest.h:141
Halide::Internal::Autoscheduler::LoopNest::vector_dim
int vector_dim
Definition: LoopNest.h:72
Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::vars
std::vector< FuncVar > vars
Definition: LoopNest.h:250
Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::FuncVar
Definition: LoopNest.h:218