Halide 21.0.0
Halide compiler and libraries
LoopNest.h
Go to the documentation of this file.
1/** This file defines the LoopNest, which is our
2 * representation of a Halide schedule, and contains methods to
3 * generate candidates for scheduling as well as extract a
4 * featurization that can be used to cost each candidate. */
5
6#ifndef LOOP_NEST_H
7#define LOOP_NEST_H
8
9#include "ASLog.h"
10#include "CostModel.h"
11#include "FunctionDAG.h"
12#include "GPULoopInfo.h"
13#include "GPUMemInfo.h"
14#include "PerfectHashMap.h"
15#include "SearchSpaceOptions.h"
16#include "Statistics.h"
17#include "ThreadInfo.h"
18#include "Tiling.h"
19#include <set>
20#include <vector>
21
22namespace Halide {
23namespace Internal {
24namespace Autoscheduler {
25
26template<typename T>
27using NodeMap = PerfectHashMap<FunctionDAG::Node, T>;
28
29template<typename T>
30using StageMap = PerfectHashMap<FunctionDAG::Node::Stage, T>;
31
32enum class GPU_parallelism {
33 Block,
34 Thread,
35 Serial,
36 Simd,
37 Parallelized,
38 None
39};
40
41std::string stringify(GPU_parallelism label);
42
43// inlined => func is inlined so has no memory store location
44enum class GPUMemoryType {
45 Global,
46 Shared,
47 Local,
48 Registers,
49 Inlined
50};
51
52bool may_subtile(const Anderson2021Params &params);
53
54int64_t get_shared_memory_limit(const Anderson2021Params &params);
55
56int64_t get_shared_memory_sm_limit(const Anderson2021Params &params);
57
58int64_t get_active_block_hardware_limit(const Anderson2021Params &params);
59
60int64_t get_active_warp_hardware_limit(const Anderson2021Params &params);
61
62constexpr int64_t get_register_mem_alloc_limit() {
63 return 128;
64}
65
66int get_unroll_limit(const Target &target);
67
68bool in_range_zero_one(double x);
69
70bool are_valid_thread_extents(const vector<int64_t> &counts);
71
74
75bool all(const vector<int> &v);
76bool accessed_at_constant_indices(const std::vector<int> &unrolled, const FunctionDAG::Edge *e);
77
78// We're going to do a tree search over possible schedules to find an
79// optimal one. A tree search requires a state, and a function that
80// gives you children of the state (with costs). The following struct
81// represents the state, which is a partial schedule.
82//
83// A partial schedule is a tree. Each node is some portion of the for
84// loop nest of some Func. If there are no children, it's the
85// innermost set of loops. If there are children, it's a loop over
86// tiles of that Func.
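To make this concrete, here is an editorial illustration (not part of the original header) of the tree for a small two-stage pipeline; the Funcs and tile sizes are assumed:
// Editorial illustration only: assumed pipeline g(x, y) -> f(x, y), assumed tile sizes.
//
//   root                                <- is_root(): node == nullptr
//   `-- f: loop over 8x8 tiles          <- size = {8, 8}: the number of tiles
//       |-- g stored at this level      <- g appears in store_at; bounds records the
//       |                                  region of g required per tile of f
//       `-- f: innermost 16x16 tile     <- innermost = true, vectorized over x
//
// A Func given no loops of its own at this level would instead appear in
// `inlined`, keyed by the number of times it is called.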
87struct LoopNest {
88 mutable RefCount ref_count;
89
90 // The extents of this loop. Put another way, the number of tiles,
91 // not the size of each tile.
92 vector<int64_t> size;
93
94 // The nodes inside the loop body
95 vector<IntrusivePtr<const LoopNest>> children;
96
97 // Funcs inlined into this inner loop, and the number of times
98 // each is called. Only valid if children is empty.
99 NodeMap<int64_t> inlined;
100
101 // Funcs stored inside this loop
102 std::set<const FunctionDAG::Node *> store_at;
103
104 // The total bounds required of any given Func over all iterations
105 // of this loop. In the paper, this is represented using the
106 // little boxes to the left of the loop nest tree figures.
107 mutable NodeMap<Bound> bounds;
108
109 // The Func this loop nest belongs to
110 const FunctionDAG::Node *node = nullptr;
111
112 // The stage of the Func
113 const FunctionDAG::Node::Stage *stage = nullptr;
114
115 // Is this the innermost loop of this func (the SIMD loop)?
116 bool innermost = false;
117
118 // Are we permitted to tile this loop?
119 bool tileable = false;
120
121 // Is this the parallel outer loop?
122 bool parallel = false;
123
124 // What dimension is this Func vectorized over, in terms of the pure args of the Func?
125 int vector_dim = -1;
126
127 // Which loop corresponds to the innermost storage dimension and will be vectorized. -1 means none of them.
128 int vectorized_loop_index = -1;
129
130 // Apply gpu threads to this loop nest
131 GPU_parallelism gpu_label = GPU_parallelism::None;
132
133 struct FeatureIntermediates {
134 double inlined_calls;
135 double num_vectors;
136 double num_scalars;
137 double vector_size;
138 double innermost_pure_loop_extent;
139 double outer_parallelism;
140 double num_warps_per_block;
141 double num_threads_per_block;
142 double points_computed_per_thread;
143 };
144
145 mutable std::map<uint64_t, StageMap<StageMap<FeatureIntermediates>>> feature_intermediates;
146 mutable std::map<uint64_t, StageMap<ScheduleFeatures>> features;
147
148 bool is_gpu_serial(const Target &target) const {
149 return target.has_gpu_feature() && gpu_label == GPU_parallelism::Serial;
150 }
151
152 bool is_gpu_thread(const Target &target) const {
153 return target.has_gpu_feature() && gpu_label == GPU_parallelism::Thread;
154 }
155
156 bool is_gpu_block(const Target &target) const {
157 return target.has_gpu_feature() && gpu_label == GPU_parallelism::Block;
158 }
159
160 bool is_scalar() const {
161 return size.empty();
162 }
163
164 // given a newly inserted node f into this LoopNest, get union of thread counts in each dimension
165 // across all siblings of f.
166 vector<int64_t> get_union_thread_counts(const FunctionDAG::Node *f) const;
167
168 // given a newly inserted node f into this LoopNest, gets the size of
169 // all of f's stages and their pure_dim indices
170 void get_stage_sizes(const FunctionDAG::Node *f,
171 vector<vector<int64_t>> &stage_sizes,
172 vector<vector<int>> &pure_dims,
173 vector<int> &vectorized_indices) const;
174
175 // given the loop nest of a stage to parallelize at root, figure out if using odd tile sizes
176 // for the vectorized dimension will allow the resulting thread tiles to be multiples of 32.
177 // If so, we include these sizes in the serial loop sizes.
178 void generate_vec_dim_serial_tilings(vector<int> &serial_sizes) const;
179
180 // get the loop nests of a newly inserted node, f, that is marked GPU threads. Tiles
181 // the newly inserted loop nests of f into a threads loop outside a serial loop.
182 // v is the vectorized dimension of f. Adds loop nests created from each tiling option to result.
183 bool add_gpu_thread_tilings(const FunctionDAG::Node *f,
184 const Anderson2021Params &params,
185 const Target &target,
186 int v,
187 vector<IntrusivePtr<const LoopNest>> &result,
188 const vector<int64_t> &max_size);
189
190 void copy_from(const LoopNest &n);
191 void copy_from_including_features(const LoopNest &n);
192
193 static void hash_combine(uint64_t &h, uint64_t next) {
194 // From boost
195 h ^= (next + 0x9e3779b9 + (h << 6) + (h >> 2));
196 }
197
198 // Hash the loop structure and sizes up to a fixed depth. This is
199 // used as the hash function for the coarse-to-fine beam search in
200 // the paper.
201 void structural_hash(uint64_t &h, int depth) const;
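Editorial sketch (not part of the header): 0x9e3779b9 is the Boost golden-ratio mixing constant, and a hash over a vector of loop extents can be folded together as below. The real structural_hash also hashes the loop structure itself and recurses into children up to the requested depth, as its comment says.
// Editorial sketch only; sketch_extent_hash is a hypothetical helper, not part of this header.
inline uint64_t sketch_extent_hash(const std::vector<int64_t> &extents) {
    uint64_t h = 0;
    for (int64_t e : extents) {
        // Same mixing step as hash_combine above.
        h ^= (static_cast<uint64_t>(e) + 0x9e3779b9 + (h << 6) + (h >> 2));
    }
    return h;
}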
202
203 // How many funcs are scheduled inside this loop level. Used in
204 // the structural hash.
205 size_t funcs_realized_or_inlined() const {
206 size_t count = inlined.size() + store_at.size();
207 for (const auto &c : children) {
208 count += c->funcs_realized_or_inlined();
209 }
210 return count;
211 }
212
213 // All of a stage's interesting locations in the loop nest. Used to help compute the featurization of a stage.
214 struct Sites {
215 const LoopNest *compute = nullptr; // Its containing compute_at site
216 const LoopNest *store = nullptr; // Its containing store_at site
217 const LoopNest *produce = nullptr; // Its own outermost node
218 const LoopNest *innermost = nullptr; // Its innermost node - usually a SIMD loop
219 const LoopNest *task = nullptr; // The parallel for loop it belongs to
220 const LoopNest *thread = nullptr; // Its containing gpu_thread loop
221 GPUMemoryType gpu_store_memory_type; // global, local, shared?
222 int64_t allocation_size = 0; // Allocation size in bytes
223 bool is_constant_allocation = false; // Does the allocation have constant size?
224 int64_t num_realizations = 0; // Number of times this stage is realized. Only valid for unscheduled producers
225 bool inlined = false; // Is the Func inlined?
226 std::vector<const LoopNest *> inlined_innermosts; // The innermost loop nests at which the Func is inlined
227 uint64_t hash_of_producers_stored_at_root = 0;
228
229 bool is_stored_in_global_mem() const {
230 return gpu_store_memory_type == GPUMemoryType::Global;
231 }
232 bool is_stored_in_shared_mem() const {
233 return gpu_store_memory_type == GPUMemoryType::Shared;
234 }
235 bool is_stored_in_local_mem() const {
236 return gpu_store_memory_type == GPUMemoryType::Local;
237 }
238 bool is_stored_in_registers() const {
239 return gpu_store_memory_type == GPUMemoryType::Registers;
240 }
241 };
242
243 GPUMemoryType get_gpu_memory_type(bool in_block,
244 bool in_thread,
245 bool is_inlined = false) const;
246
247 std::vector<int> unrolled_loops(const Target &target,
248 const LoopNest *parent,
249 const LoopNest *grandparent) const;
250
251 void get_allocs_that_can_be_promoted_to_registers(const Target &target,
252 StageMap<Sites> &sites,
253 NodeMap<bool> &can_be_promoted_to_registers,
254 const LoopNest *grandparent,
255 const LoopNest *parent) const;
256
257 bool promote_allocs_to_registers(const Target &target,
258 StageMap<Sites> &sites) const;
259
260 // Compute all the sites of interest for each pipeline stage
261 void get_sites(const Target &target,
262 StageMap<Sites> &sites,
263 StageMap<int64_t> &shared_mem_alloc_sizes,
264 const LoopNest *task = nullptr,
265 const LoopNest *parent = nullptr,
266 const LoopNest *current_thread_loop = nullptr) const;
267
268 // A helper for the working_set_at_task feature. Most features are
269 // computed in the recursive pass 'compute_features' below, but
270 // this one must be done in a second separate recursive pass.
271 void set_working_set_at_task_feature(int64_t working_set,
272 StageMap<ScheduleFeatures> *features) const {
273 for (const auto &c : children) {
274 c->set_working_set_at_task_feature(working_set, features);
275 features->get(c->stage).working_set_at_task = working_set;
276 }
277 }
278
279 bool exceeds_serial_extents_limit(const Target &target,
280 const LoopNest *parent,
281 bool in_threads_loop) const;
282
283 bool producer_computed_here_or_further_in(const FunctionDAG::Node *producer) const;
284
285 bool has_dynamic_allocation_inside_thread(bool in_thread_loop) const;
286
287 const LoopNest *find_pure_stage_loop_nest(const FunctionDAG::Node *node) const;
288
289 int get_pure_stage_vectorized_loop_index(const FunctionDAG::Node *node) const;
290
291 int get_vectorized_loop_index_from_pure_stage(const LoopNest &root) const;
292
293 // Get the stride over "node's" storage for a unit increment in the vectorized loop's
294 // index
295 double storage_stride(const LoadJacobian &jac,
296 int innermost_storage_dim,
297 const FunctionDAG::Node *storage_node,
298 const Bound &store_bounds,
299 const LoopNest &root) const;
300
301 Strides compute_strides(const LoadJacobian &jac,
302 int innermost_storage_dim,
303 const FunctionDAG::Node *storage_node,
304 const Bound &store_bounds,
305 const ThreadInfo *thread_info,
306 bool verbose = false) const;
307
308 bool all_strides_exist(const LoadJacobian &jac,
309 const FunctionDAG::Node *storage_node,
310 const LoopNest &root) const;
311
312 int get_actual_vector_dim(const Bound &store_bounds) const;
313
314 void compute_gpu_store_features(const LoadJacobian &jac,
315 int consumer_innermost_dim,
316 const FunctionDAG::Node *node,
317 const Bound &consumer_store_bounds,
318 const GPULoopInfo &gpu_loop_info,
319 const std::vector<int64_t> &inner_serial_loop_extents,
320 const Sites &consumer_site,
321 ScheduleFeatures &feat,
322 const LoopNest *parent,
323 const LoopNest &root,
324 GlobalMemInfo &global_mem_loads,
325 SharedMemInfo &shared_mem_loads,
326 LocalMemInfo &local_mem_loads,
327 bool verbose = false) const;
328
329 bool can_vectorize_access_for_innermost_dim(const LoadJacobian &jac,
330 const FunctionDAG::Node *accessed,
331 int innermost_dim,
332 int loop_index) const;
333
334 bool can_vectorize_store_access(const LoadJacobian &jac,
335 const FunctionDAG::Node *accessed,
336 bool accessed_has_been_scheduled,
337 int innermost_dim,
338 int loop_index,
339 const GPUMemoryType &mem_type) const;
340
341 int vectorized_load_access_size(const LoadJacobian &jac,
342 const FunctionDAG::Node *accessed,
343 bool accessed_has_been_scheduled,
344 int innermost_dim,
345 const GPUMemoryType &mem_type,
346 bool verbose = false) const;
347
348 int vectorized_access_size(size_t loop_index,
349 bool verbose = false) const;
350
351 template<typename T>
352 void compute_num_mem_accesses_per_block(const LoadJacobian &jac,
353 const FunctionDAG::Node *node,
354 const Bound &store_bounds,
355 const ThreadInfo *thread_info,
356 int innermost_dim,
357 double num_requests_per_warp,
358 MemInfoType<T> &mem_info,
359 bool verbose = false) const;
360
361 std::pair<double, double> compute_local_mem_store_features(const LoadJacobian &jac,
362 int consumer_innermost_dim,
363 const FunctionDAG::Node *node,
364 const Bound &consumer_store_bounds,
365 const LoopNest &root,
366 double serial_loop_extents) const;
367
368 template<typename T>
369 MemInfoType<T> compute_mem_store_info(const LoadJacobian &jac,
370 int consumer_innermost_dim,
371 const FunctionDAG::Node *node,
372 const Bound &consumer_store_bounds,
373 const ThreadInfo *thread_info,
374 double serial_loop_extents,
375 bool verbose) const;
376
377 template<typename T>
378 void compute_mem_load_features(const LoadJacobian &jac,
379 int producer_innermost_dim,
380 const FunctionDAG::Node *node,
381 const Bound &producer_store_bounds,
382 bool producer_has_been_scheduled,
383 const ThreadInfo *thread_info,
384 MemInfoType<T> &mem_info,
385 double serial_loop_extents,
386 bool verbose = false) const;
387
388 double compute_local_mem_stride(double stride,
389 double bytes) const;
390
391 // Assumes block, serial, thread or block, thread nesting
392 const LoopNest *get_enclosing_block(const LoopNest *parent,
393 const LoopNest *grandparent) const;
394
395 std::pair<int64_t, int64_t> get_block_and_serial_extents(const LoopNest *block) const;
396
397 bool all_paths_to_leaves_have_thread_loop() const;
398
399 bool has_thread_loop_descendant() const;
400
401 void compute_warp_features(ScheduleFeatures &features,
402 const GPULoopInfo &gpu_loop_info) const;
403
404 // Assume that when a block is active, all its warps are active
405 void compute_warp_and_block_occupancy(const Anderson2021Params &params,
406 ScheduleFeatures &feat,
407 const GPULoopInfo &gpu_loop_info) const;
408
409 void compute_shared_mem_occupancy(const Anderson2021Params &params,
410 const Target &target,
411 int64_t total_shared_mem_alloc_size,
412 ScheduleFeatures &feat) const;
413
414 std::pair<const LoopNest *, const LoopNest *> find_innermost_and_parent() const;
415
416 int64_t points_accessed_per_thread(const Anderson2021Params &params,
417 const Target &target,
418 const GPULoopInfo &gpu_loop_info,
419 const std::vector<const FunctionDAG::Edge *> &edge_chain,
420 const LoadJacobian &jac,
421 const LoopNest *parent,
422 const LoopNest *grandparent,
423 int64_t n,
424 const ScheduleFeatures &feat,
425 const LoadJacobian &serial_jac,
426 bool producer_has_been_scheduled,
427 int producer_innermost_dim,
428 const GPUMemoryType &mem_type,
429 bool verbose) const;
430
431 int64_t compute_licm_amortization(const LoopNest *innermost,
432 const LoopNest *parent,
433 const ScheduleFeatures &feat,
434 const LoadJacobian &jac,
435 int producer_dims) const;
436
437 void memoize_points_computed_minimum(StageMap<ScheduleFeatures> &memoized_features,
438 const StageMap<ScheduleFeatures> *features) const;
439
440 vector<pair<int, int>> collect_producers(const StageMap<Sites> &sites) const;
441
442 uint64_t compute_hash_of_producers_stored_at_root(const StageMap<Sites> &sites) const;
443
444 void collect_stages(std::set<const FunctionDAG::Node::Stage *> &stages) const;
445
446 void memoize_features(StageMap<ScheduleFeatures> &memoized_features,
447 const StageMap<ScheduleFeatures> *features) const;
448
449 void compute_working_set_from_features(int64_t *working_set,
450 const StageMap<ScheduleFeatures> *features) const;
451
452 void recompute_inlined_features(const StageMap<Sites> &sites,
453 StageMap<ScheduleFeatures> *features) const;
454
455 std::pair<int64_t, bool> compute_alloc_size_of_node_here(const FunctionDAG::Node *f) const;
456
457 // Do a recursive walk over the loop nest computing features to feed the cost model.
458 void compute_features(const FunctionDAG &dag,
459 const Anderson2021Params &params,
460 const Target &target,
461 const StageMap<Sites> &sites,
462 int64_t instances,
463 int64_t parallelism,
464 const LoopNest *parent,
465 const LoopNest *grandparent,
466 const LoopNest &root,
467 GPULoopInfo gpu_loop_info,
468 bool use_memoized_features,
469 const StageMap<int64_t> &total_shared_mem_alloc_sizes,
470 int64_t *working_set,
471 int64_t *working_set_local_constant,
472 int64_t *working_set_local_dynamic,
473 StageMap<ScheduleFeatures> *features,
474 Statistics &stats,
475 bool verbose = false) const;
476
477 bool is_root() const {
478 // The root is the sole node without a Func associated with
479 // it.
480 return node == nullptr;
481 }
482
483 // Set the region required of a Func at this site.
484 const Bound &set_bounds(const FunctionDAG::Node *f, BoundContents *b) const {
485 return bounds.emplace(f, b);
486 }
487
488 // Get the region required of a Func at this site, from which we
489 // know what region would be computed if it were scheduled here,
490 // and what its loop nest would be.
491 const Bound &get_bounds(const FunctionDAG::Node *f) const;
492
493 // Get the region required of a Func at this site (but only to satisfy the
494 // consumers along the given edge chain), from which we know what region
495 // would be computed if it were scheduled here and what its loop nest
496 // would be.
497 Bound get_bounds_along_edge_chain(const FunctionDAG::Node *f,
498 const vector<const FunctionDAG::Edge *> &edge_chain) const;
499
500 void dump() const;
501
502 std::string to_string() const;
503
504 // Recursively print a loop nest representation to stderr
505 template<typename T>
506 void dump(T &stream, string prefix, const LoopNest *parent) const;
507
508 // Does this loop nest access the given Func
509 bool calls(const FunctionDAG::Node *f) const;
510
511 // What is the maximum number of inlined calls to a Func that
512 // occur within this loop. Used to prune states that would
513 // generate too much code.
514 int64_t max_inlined_calls() const;
515
516 // Does this loop nest access an input buffer? Used to select
517 // tail strategies when splitting loops. We don't want to read
518 // out of bounds on inputs, even if we don't intend to use the
519 // values read. It could create annoying assertion failures for
520 // the user. It's OK to read out of range of the values computed
521 // on internal Funcs though. Allocation bounds inference just pads
522 // out the bounds so that it won't fault.
523 bool accesses_input_buffer() const;
524
525 // Does this loop nest contain a computation of the given Func.
526 bool computes(const FunctionDAG::Node *f) const;
527
528 // Above here most methods query the loop nest. Below we have
529 // methods that mutate the loop nest.
530
531 // Inline a Func into all consumers within this loop.
532 void inline_func(const FunctionDAG::Node *f);
533
534 // Compute a Func at this site.
535 bool compute_here(const FunctionDAG::Node *f,
536 bool tileable,
537 int v,
538 bool in_threads_loop,
539 const Anderson2021Params &params,
540 const Target &target);
541
542 // Parallelize this loop according to the given tiling.
543 IntrusivePtr<const LoopNest> parallelize_in_tiles(const vector<int64_t> &tiling,
544 const LoopNest *parent,
545 const Anderson2021Params &params,
546 const Target &target,
547 bool inner_tiling,
548 bool adjust_tiling,
549 bool move_all_rvars_inward = true,
550 const vector<int> &rvars_to_move_inward = {}) const;
551
552 int64_t get_total_local_mem_alloc_size(bool constant_allocs_only = false,
553 bool in_threads_loop = false) const;
554 int64_t get_total_constant_local_mem_alloc_size() const;
555
556 // All store_at sites further in than the block level must be fixed-size
557 // allocations. This method checks if f will require a dynamic
558 // allocation
559 bool requires_dynamic_allocation(const FunctionDAG::Node *f,
560 const Target &target,
561 bool in_threads_loop) const;
562
563 // Return all possible ways to compute f in tiles somewhere within
564 // this loop nest.
565 // in_threads_loop tracks whether or not f is going to be placed inside a
566 // loop marked gpu_threads, in which case f's loops cannot be gpu_threads
567 vector<IntrusivePtr<const LoopNest>> compute_in_tiles(const FunctionDAG::Node *f,
568 const LoopNest *parent,
569 const Anderson2021Params &params,
570 const Target &target,
571 const SearchSpaceOptions &search_space_options,
572 int v,
573 bool in_realization,
574 bool in_threads_loop,
575 bool is_pre_pass,
576 vector<int64_t> union_counts = vector<int64_t>()) const;
577
578 // Below here we have methods that apply a schedule to a Halide pipeline.
579
580 // A model of the state of the loop nest of a Func while applying
581 // Halide's scheduling directives.
582
583 // Note that StageScheduleState is movable-but-not-copyable thanks to its ostringstream member.
584 struct StageScheduleState {
585 // How much parallelism do we need to exploit with this Func?
586 double num_cores = 0;
587
588 // Which storage dimension is vectorized? We need to reorder it innermost
589 int vector_dim = -1;
590 int vectorized_loop_index = -1;
591
592 // The various Vars and RVars used for scheduling a Func.
593 struct FuncVar {
594 // The top-level var or rvar this was split off from
595 VarOrRVar orig;
596
597 // This var.
598 VarOrRVar var;
599
600 // Source code to access this Var/RVar. Used for printing
601 // valid Halide source for this schedule.
602 string accessor;
603
604 // Our estimate of the extent of this var. This is exact
605 // when constant_extent flag is true.
606 int64_t extent = 0;
607
608 // Which index in the symbolic loop nest does this var
609 // belong to.
610 size_t index = 0;
611
612 // Some flags.
613 bool innermost_pure_dim = false;
614 bool outermost = false;
615 bool parallel = false;
616 bool exists = false;
617 bool pure = false;
618 bool constant_extent = false;
619
620 bool vectorized = false;
621 bool gpu_threads = false;
622
623 FuncVar()
624 : orig(Var()),
625 var(Var()) {
626 }
627 };
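Editorial note (not part of the header): a directive such as f.split(x, xo, xi, 8) would surface here as two FuncVar entries, roughly as follows (illustrative values only):
//   xi: orig = x, var = xi, extent = 8, constant_extent = true,
//       innermost_pure_dim = true (if x was the innermost pure dim)
//   xo: orig = x, var = xo, extent = ceil(extent(x) / 8), and may end up
//       marked parallel, gpu_threads, or outermost depending on the schedule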
630 bool parallel = false;
631 bool vectorized = false;
634
635 // In order from innermost to outermost. Each group of d is one tiling level.
636 vector<FuncVar> vars;
637
638 // In order from innermost to outermost. Each group of d is one tiling level.
639 vector<FuncVar> ordered_vars;
640 vector<int64_t> gpu_thread_extents;
641
643 NodeMap<std::vector<std::pair<const LoopNest *, std::vector<const FunctionDAG::Edge *>>>> producers_to_be_staged;
644
645 // From outermost in
646 vector<StageScheduleState *> ancestors;
647
648 std::ostringstream schedule_source;
649 };
650
651 bool has_constant_region_required(const FunctionDAG::Node *node) const;
652 bool has_constant_region_computed(const FunctionDAG::Node *node) const;
653 bool node_has_dynamic_region_computed(const FunctionDAG::Node *f) const;
654 int num_serial_loops(const FunctionDAG::Node::Stage *stage) const;
655 int num_serial_loops() const;
656 bool other_stage_has_same_producer(const FunctionDAG::Node *producer) const;
657
658 void update_producers_to_be_staged(StageScheduleState &state,
659 const NodeMap<bool> &all_inlined) const;
660 bool region_computed_shrinks(const FunctionDAG::Node *f,
661 const LoopNest *parent) const;
662
663 // Apply the schedule represented by this loop nest to a Halide pipeline.
664 void apply(LoopLevel here,
665 StageMap<std::unique_ptr<StageScheduleState>> &state_map,
666 double num_cores,
667 int depth,
668 const LoopNest *parent,
669 const LoopNest *compute_site,
670 const Target &target,
671 std::vector<StageScheduleState *> &ancestors,
672 const NodeMap<bool> &all_inlined) const;
673
674 double max_idle_lane_wastage(const Target &target,
675 GPULoopInfo gpu_loop_info) const;
676
678
679 void collect_nodes_that_should_be_inlined(const NodeMap<bool> &nodes_to_freeze,
680 NodeMap<bool> &inlined_nodes) const;
681
682 void collect_all_inlined(NodeMap<bool> &all_inlined) const;
683
684 int64_t product_of_self_and_descendants(int loop_index) const;
685 int64_t product_of_descendants(int loop_index) const;
686
687 void get_stages_computed_in_each_compute_root_loop(StageMap<StageMap<bool>> &descendants,
688 const LoopNest *compute_root_loop_nest = nullptr) const;
689};
690
691struct Filter {
692 const LoopNest *loop_nest;
693 bool logging = false;
694
695 explicit Filter(const LoopNest *loop_nest)
696 : loop_nest{loop_nest},
697 logging{enable_filter_printing()} {
698 if (logging) {
699 std::cerr << "\nState filtered: \n";
700 loop_nest->dump();
701 std::cerr << "Reason: ";
702 }
703 }
704
705 template<typename T>
706 Filter &operator<<(T &&x) {
707 if (logging) {
708 std::cerr << std::forward<T>(x);
709 }
710 return *this;
711 }
712
713 static bool enable_filter_printing();
714};
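A hedged usage sketch (editorial; the helper and the limit check are assumed, not taken from this header): the Filter constructor dumps the rejected state when logging is enabled, and streamed values spell out the reason.
// Editorial sketch only; sketch_fits_in_shared_memory is hypothetical.
inline bool sketch_fits_in_shared_memory(const LoopNest *nest, int64_t alloc_bytes, int64_t limit_bytes) {
    if (alloc_bytes > limit_bytes) {
        Filter(nest) << "shared memory allocation of " << alloc_bytes
                     << " bytes exceeds the limit of " << limit_bytes << " bytes\n";
        return false;
    }
    return true;
}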
715
716} // namespace Autoscheduler
717} // namespace Internal
718} // namespace Halide
719
720#endif // LOOP_NEST_H