Halide 21.0.0
Halide compiler and libraries
Loading...
Searching...
No Matches
FunctionDAG.h
Go to the documentation of this file.
1/** This file defines the class FunctionDAG, which is our
2 * representation of a Halide pipeline, and contains methods to using
3 * Halide's bounds tools to query properties of it. */
4
5#ifndef FUNCTION_DAG_H
6#define FUNCTION_DAG_H
7
8#include <algorithm>
9#include <cstdint>
10#include <map>
11#include <string>
12#include <utility>
13#include <vector>
14
15#include "Featurization.h"
16#include "HalidePlugin.h"
17#include "PerfectHashMap.h"
18
19namespace Halide {
20namespace Internal {
21namespace Autoscheduler {
22
23using std::map;
24using std::pair;
25using std::string;
26using std::unique_ptr;
27using std::vector;
28
29// First we have various utility classes.
30
31// An optional rational type used when analyzing memory dependencies.
32struct OptionalRational {
34
35 bool exists() const {
36 return denominator != 0;
37 }
38
39 OptionalRational() = default;
44
45 void operator+=(const OptionalRational &other) {
46 if ((denominator & other.denominator) == 0) {
48 return;
49 }
50 if (denominator == other.denominator) {
51 numerator += other.numerator;
52 return;
53 }
54
55 int64_t l = lcm(denominator, other.denominator);
57 denominator = l;
58 numerator += other.numerator * (l / other.denominator);
60 numerator /= g;
61 denominator /= g;
62 }
63
65 if ((*this) == 0) {
66 return *this;
67 }
68 int64_t num = numerator * factor;
69 return OptionalRational{num, denominator};
70 }
71
73 if ((*this) == 0) {
74 return *this;
75 }
76 if (other == 0) {
77 return other;
78 }
79 int64_t num = numerator * other.numerator;
80 int64_t den = denominator * other.denominator;
81 return OptionalRational{num, den};
82 }
83
84 // Because this type is optional (exists may be false), we don't
85 // have a total ordering. These methods all return false when the
86 // operators are not comparable, so a < b is not the same as !(a
87 // >= b).
88 bool operator<(int x) const {
89 if (denominator == 0) {
90 return false;
91 } else if (denominator > 0) {
92 return numerator < x * denominator;
93 } else {
94 return numerator > x * denominator;
95 }
96 }
97
98 bool operator<=(int x) const {
99 if (denominator == 0) {
100 return false;
101 } else if (denominator > 0) {
102 return numerator <= x * denominator;
103 } else {
104 return numerator >= x * denominator;
105 }
106 }
107
108 bool operator>(int x) const {
109 if (!exists()) {
110 return false;
111 }
112 return !((*this) <= x);
113 }
114
115 bool operator>=(int x) const {
116 if (!exists()) {
117 return false;
118 }
119 return !((*this) < x);
120 }
121
122 bool operator==(int x) const {
123 return exists() && (numerator == (x * denominator));
124 }
125
126 bool operator==(const OptionalRational &other) const {
127 return (exists() == other.exists()) && (numerator * other.denominator == denominator * other.numerator);
128 }
129};
130
131// A LoadJacobian records the derivative of the coordinate accessed in
132// some producer w.r.t the loops of the consumer.
133class LoadJacobian {
134 std::vector<OptionalRational> coeffs;
135 int64_t c;
136 size_t rows, cols;
137
138public:
140 : c(count),
142 cols(consumer_loop_dims) {
143 coeffs.resize(rows * cols);
144 }
145
146 bool all_coeffs_exist() const {
147 for (const auto &coeff : coeffs) {
148 if (!coeff.exists()) {
149 return false;
150 }
151 }
152 return true;
153 }
154
155 bool empty() const {
156 return rows == 0;
157 }
158
159 size_t producer_storage_dims() const {
160 return rows;
161 }
162
163 size_t consumer_loop_dims() const {
164 return cols;
165 }
166
167 bool is_constant() const {
168 for (const auto &c : coeffs) {
169 if (!c.exists() || !(c == 0)) {
170 return false;
171 }
172 }
173
174 return true;
175 }
176
177 OptionalRational operator()(int producer_storage_dim, int consumer_loop_dim) const {
178 if (producer_storage_dims() == 0 || consumer_loop_dims() == 0) {
179 // The producer or consumer is scalar, so all strides are zero.
180 return {0, 1};
181 }
182 return coeffs[producer_storage_dim * cols + consumer_loop_dim];
183 }
184
185 OptionalRational &operator()(int producer_storage_dim, int consumer_loop_dim) {
186 return coeffs[producer_storage_dim * cols + consumer_loop_dim];
187 }
188
189 // To avoid redundantly re-recording copies of the same
190 // load Jacobian, we keep a count of how many times a
191 // load with this Jacobian occurs.
192 int64_t count() const {
193 return c;
194 }
195
196 // Try to merge another LoadJacobian into this one, increasing the
197 // count if the coefficients match.
198 bool merge(const LoadJacobian &other) {
199 if (other.rows != rows || other.cols != cols) {
200 return false;
201 }
202 for (size_t i = 0; i < rows * cols; i++) {
203 if (!(other.coeffs[i] == coeffs[i])) {
204 return false;
205 }
206 }
207 c += other.count();
208 return true;
209 }
210
211 // Scale the matrix coefficients by the given factors
212 LoadJacobian operator*(const std::vector<int64_t> &factors) const {
213 LoadJacobian result(rows, cols, c);
214 for (size_t i = 0; i < producer_storage_dims(); i++) {
215 for (size_t j = 0; j < consumer_loop_dims(); j++) {
216 result(i, j) = (*this)(i, j) * factors[j];
217 }
218 }
219 return result;
220 }
221
222 // Multiply Jacobians, used to look at memory dependencies through
223 // inlined functions.
224 LoadJacobian operator*(const LoadJacobian &other) const {
225 LoadJacobian result(producer_storage_dims(), other.consumer_loop_dims(), count() * other.count());
226 for (size_t i = 0; i < producer_storage_dims(); i++) {
227 for (size_t j = 0; j < other.consumer_loop_dims(); j++) {
228 result(i, j) = OptionalRational{0, 1};
229 for (size_t k = 0; k < consumer_loop_dims(); k++) {
230 result(i, j) += (*this)(i, k) * other(k, j);
231 }
232 }
233 }
234 return result;
235 }
236
237 void dump(const char *prefix) const;
238};
239
240// Classes to represent a concrete set of bounds for a Func. A Span is
241// single-dimensional, and a Bound is a multi-dimensional box. For
242// each dimension we track the estimated size, and also whether or not
243// the size is known to be constant at compile-time. For each Func we
244// track three different types of bounds:
245
246// 1) The region required by consumers of the Func, which determines
247// 2) The region actually computed, which in turn determines
248// 3) The min and max of all loops in the loop next.
249
250// 3 in turn determines the region required of the inputs to a Func,
251// which determines their region computed, and hence their loop nest,
252// and so on back up the Function DAG from outputs back to inputs.
253
254class Span {
255 int64_t min_, max_;
256 bool constant_extent_;
257
258public:
259 int64_t min() const {
260 return min_;
261 }
262 int64_t max() const {
263 return max_;
264 }
265 int64_t extent() const {
266 return max_ - min_ + 1;
267 }
268 bool constant_extent() const {
269 return constant_extent_;
270 }
271
272 void union_with(const Span &other) {
273 min_ = std::min(min_, other.min());
274 max_ = std::max(max_, other.max());
275 constant_extent_ = constant_extent_ && other.constant_extent();
276 }
277
279 max_ = min_ + e - 1;
280 }
281
283 min_ += x;
284 max_ += x;
285 }
286
287 Span(int64_t a, int64_t b, bool c)
288 : min_(a),
289 max_(b),
290 constant_extent_(c) {
291 }
292 Span() = default;
293 Span(const Span &other) = default;
294 static Span empty_span() {
295 return Span(INT64_MAX, INT64_MIN, true);
296 }
297};
298
299// Bounds objects are created and destroyed very frequently while
300// exploring scheduling options, so we have a custom allocator and
301// memory pool. Much like IR nodes, we treat them as immutable once
302// created and wrapped in a Bound object so that they can be shared
303// safely across scheduling alternatives.
304struct BoundContents {
305 mutable RefCount ref_count;
306
307 class Layout;
308 const Layout *layout = nullptr;
309
310 Span *data() const {
311 // This struct is a header
312 return (Span *)(const_cast<BoundContents *>(this) + 1);
313 }
314
316 return data()[i];
317 }
318
320 return data()[i + layout->computed_offset];
321 }
322
323 Span &loops(int i, int j) {
324 return data()[j + layout->loop_offset[i]];
325 }
326
327 const Span &region_required(int i) const {
328 return data()[i];
329 }
330
331 const Span &region_computed(int i) const {
332 return data()[i + layout->computed_offset];
333 }
334
335 const Span &loops(int i, int j) const {
336 return data()[j + layout->loop_offset[i]];
337 }
338
340 auto *b = layout->make();
341 size_t bytes = sizeof(data()[0]) * layout->total_size;
342 memcpy(b->data(), data(), bytes);
343 return b;
344 }
345
346 void validate() const;
347
348 // We're frequently going to need to make these concrete bounds
349 // arrays. It makes things more efficient if we figure out the
350 // memory layout of those data structures once ahead of time, and
351 // make each individual instance just use that. Note that this is
352 // not thread-safe.
353 class Layout {
354 // A memory pool of free BoundContent objects with this layout
355 mutable std::vector<BoundContents *> pool;
356
357 // All the blocks of memory allocated
358 mutable std::vector<void *> blocks;
359
360 mutable size_t num_live = 0;
361
362 void allocate_some_more() const;
363
364 public:
365 // number of Span to allocate
366 int total_size;
367
368 // region_computed comes next at the following index
369 int computed_offset;
370
371 // the loop for each stage starts at the following index
372 std::vector<int> loop_offset;
373
374 Layout() = default;
376
377 Layout(const Layout &) = delete;
378 void operator=(const Layout &) = delete;
379 Layout(Layout &&) = delete;
380 void operator=(Layout &&) = delete;
381
382 // Make a BoundContents object with this layout
384
385 // Release a BoundContents object with this layout back to the pool
386 void release(const BoundContents *b) const;
387 };
388};
389
391
392// A representation of the function DAG. The nodes and edges are both
393// in reverse realization order, so if you want to walk backwards up
394// the DAG, just iterate the nodes or edges in-order.
395struct FunctionDAG {
396
397 // An edge is a producer-consumer relationship
398 struct Edge;
399
400 struct SymbolicInterval {
403 };
404
405 // A Node represents a single Func
406 struct Node {
407 // A pointer back to the owning DAG
409
410 // The Halide Func this represents
412
413 // The number of bytes per point stored.
414 double bytes_per_point;
415
416 // The min/max variables used to denote a symbolic region of
417 // this Func. Used in the cost above, and in the Edges below.
418 vector<SymbolicInterval> region_required;
419
420 // A concrete region required from a bounds estimate. Only
421 // defined for outputs.
422 vector<Span> estimated_region_required;
423
424 // The region computed of a Func, in terms of the region
425 // required. For simple Funcs this is identical to the
426 // region_required. However, in some Funcs computing one
427 // output requires computing other outputs too. You can't
428 // really ask for a single output pixel from something blurred
429 // with an IIR without computing the others, for example.
430 struct RegionComputedInfo {
431 // The min and max in their full symbolic glory. We use
432 // these in the general case.
433 Interval in;
434
435 // Analysis used to accelerate common cases
437 int64_t c_min = 0, c_max = 0;
438 };
439 vector<RegionComputedInfo> region_computed;
441
442 // Expand a region required into a region computed, using the
443 // symbolic intervals above.
444 void required_to_computed(const Span *required, Span *computed) const;
445
446 // Metadata about one symbolic loop in a Func's default loop nest.
447 struct Loop {
448 string var;
449 bool pure, rvar;
450 Expr min, max;
451
452 // Which pure dimension does this loop correspond to? Invalid if it's an rvar
453 int pure_dim;
454
455 // Precomputed metadata to accelerate common cases:
456
457 // If true, the loop bounds are just the region computed in the given dimension
458 bool equals_region_computed = false;
459 int region_computed_dim = 0;
460
461 // If true, the loop bounds are a constant with the given min and max
462 bool bounds_are_constant = false;
463 int64_t c_min = 0, c_max = 0;
464
465 // A persistent fragment of source for getting this Var
466 // from its owner Func. Used for printing source code
467 // equivalent to a computed schedule.
468 string accessor;
469 };
470
471 // Get the loop nest shape as a function of the region computed
472 void loop_nest_for_region(int stage_idx, const Span *computed, Span *loop) const;
473
474 // One stage of a Func
475 struct Stage {
476 // The owning Node
477 Node *node;
478
479 // Which stage of the Func is this. 0 = pure.
480 int index;
481
482 // The loop nest that computes this stage, from innermost out.
483 vector<Loop> loop;
484 bool loop_nest_all_common_cases = false;
485
486 // The vectorization width that will be used for
487 // compute. Corresponds to the natural width for the
488 // narrowest type used.
489 int vector_size;
490
491 // The featurization of the compute done
493
494 // The actual Halide front-end stage object
496
497 // The name for scheduling (e.g. "foo.update(3)")
498 string name;
499
501
502 // Ids for perfect hashing on stages.
503 int id, max_id;
504
505 std::unique_ptr<LoadJacobian> store_jacobian;
506
507 vector<Edge *> incoming_edges;
508
509 vector<bool> dependencies;
510 bool downstream_of(const Node &n) const {
511 return dependencies[n.id];
512 };
513
515 : stage(std::move(s)) {
516 }
517
518 int get_loop_index_from_var(const std::string &var) const {
519 int i = 0;
520 for (const auto &l : loop) {
521 if (l.var == var) {
522 return i;
523 }
524
525 ++i;
526 }
527
528 return -1;
529 }
530 };
531 vector<Stage> stages;
532
533 vector<Edge *> outgoing_edges;
534
535 // Max vector size across the stages
536 int vector_size;
537
538 // A unique ID for this node, allocated consecutively starting
539 // at zero for each pipeline.
540 int id, max_id;
541
542 // Just func->dimensions(), but we ask for it so many times
543 // that's it's worth avoiding the function call into
544 // libHalide.
545 int dimensions;
546
547 // Is a single pointwise call to another Func
548 bool is_wrapper;
549
550 // We represent the input buffers as node, though we do not attempt to schedule them.
551 bool is_input;
552
553 // Is one of the pipeline outputs
554 bool is_output;
555
556 // Only uses pointwise calls
557 bool is_pointwise;
558
559 // Only uses pointwise calls + clamping on all indices
561
562 std::unique_ptr<BoundContents::Layout> bounds_memory_layout;
563
565 return bounds_memory_layout->make();
566 }
567 };
568
569 // A representation of a producer-consumer relationship
570 struct Edge {
571 struct BoundInfo {
572 // The symbolic expression for the bound in this dimension
573 Expr expr;
574
575 // Fields below are the results of additional analysis
576 // used to evaluate this bound more quickly.
579 bool affine, uses_max;
580
582 };
583
584 // Memory footprint on producer required by consumer.
585 vector<pair<BoundInfo, BoundInfo>> bounds;
586
589
590 // The number of calls the consumer makes to the producer, per
591 // point in the loop nest of the consumer.
592 int calls;
593
595
596 vector<LoadJacobian> load_jacobians;
597
599
601
602 // Given a loop nest of the consumer stage, expand a region
603 // required of the producer to be large enough to include all
604 // points required.
605 void expand_footprint(const Span *consumer_loop, Span *producer_required) const;
606 };
607
608 vector<Node> nodes;
609 vector<Edge> edges;
610
612
613 // We're going to be querying this DAG a lot while searching for
614 // an optimal schedule, so we'll also create a variety of
615 // auxiliary data structures.
616 map<int, const Node *> stage_id_to_node_map;
617
618 // Create the function DAG, and do all the dependency and cost
619 // analysis. This is done once up-front before the tree search.
620 FunctionDAG(const vector<Function> &outputs, const Target &target);
621
622 void dump() const;
623 std::ostream &dump(std::ostream &os) const;
624
625 // This class uses a lot of internal pointers, so we'll hide the copy constructor.
626 FunctionDAG(const FunctionDAG &other) = delete;
627 void operator=(const FunctionDAG &other) = delete;
628
629private:
630 // Compute the featurization for the entire DAG
631 void featurize();
632
633 template<typename OS>
634 void dump_internal(OS &os) const;
635};
636
637template<typename T>
639
640class ExprBranching : public VariadicVisitor<ExprBranching, int, int> {
642
643private:
644 const NodeMap<int64_t> &inlined;
645
646public:
647 int visit(const Reinterpret *op);
648 int visit(const IntImm *op);
649 int visit(const UIntImm *op);
650 int visit(const FloatImm *op);
651 int visit(const StringImm *op);
652 int visit(const Broadcast *op);
653 int visit(const Cast *op);
654 int visit(const Variable *op);
655 int visit(const Add *op);
656 int visit(const Sub *op);
657 int visit(const Mul *op);
658 int visit(const Div *op);
659 int visit(const Mod *op);
660 int visit(const Min *op);
661 int visit(const Max *op);
662 int visit(const EQ *op);
663 int visit(const NE *op);
664 int visit(const LT *op);
665 int visit(const LE *op);
666 int visit(const GT *op);
667 int visit(const GE *op);
668 int visit(const And *op);
669 int visit(const Or *op);
670 int visit(const Not *op);
671 int visit(const Select *op);
672 int visit(const Ramp *op);
673 int visit(const Load *op);
674 int visit(const Call *op);
675 int visit(const Shuffle *op);
676 int visit(const Let *op);
677 int visit(const VectorReduce *op);
678 int visit_binary(const Expr &a, const Expr &b);
679 int visit_nary(const std::vector<Expr> &exprs);
680
681 explicit ExprBranching(const NodeMap<int64_t> &inlined)
682 : inlined{inlined} {
683 }
684
685 int compute(const Function &f);
686};
687
688void sanitize_names(std::string &str);
689
690} // namespace Autoscheduler
691} // namespace Internal
692} // namespace Halide
693
694#endif // FUNCTION_DAG_H
void release(const BoundContents *b) const
int visit_binary(const Expr &a, const Expr &b)
ExprBranching(const NodeMap< int64_t > &inlined)
int visit_nary(const std::vector< Expr > &exprs)
OptionalRational operator()(int producer_storage_dim, int consumer_loop_dim) const
LoadJacobian operator*(const LoadJacobian &other) const
LoadJacobian(size_t producer_storage_dims, size_t consumer_loop_dims, int64_t count)
OptionalRational & operator()(int producer_storage_dim, int consumer_loop_dim)
bool merge(const LoadJacobian &other)
LoadJacobian(vector< vector< OptionalRational > > &&matrix, int64_t c=1)
void dump(const char *prefix) const
LoadJacobian operator*(const std::vector< int64_t > &factors) const
Span(int64_t a, int64_t b, bool c)
Span(const Span &other)=default
void union_with(const Span &other)
A reference-counted handle to Halide's internal representation of a function.
Definition Function.h:39
A class representing a reference count to be used with IntrusivePtr.
A visitor/mutator capable of passing arbitrary arguments to the visit methods using CRTP and returnin...
Definition IRVisitor.h:163
A single definition of a Func.
Definition Func.h:70
A Halide variable, to be used when defining functions.
Definition Var.h:19
IntrusivePtr< const BoundContents > Bound
PerfectHashMap< FunctionDAG::Node, T > NodeMap
Definition LoopNest.h:21
void sanitize_names(std::string &str)
int64_t gcd(int64_t, int64_t)
The greatest common divisor of two integers.
ConstantInterval min(const ConstantInterval &a, const ConstantInterval &b)
RefCount & ref_count(const T *t) noexcept
Because in this header we don't yet know how client classes store their RefCount (and we don't want t...
ConstantInterval max(const ConstantInterval &a, const ConstantInterval &b)
int64_t lcm(int64_t, int64_t)
The least common multiple of two integers.
This file defines the class FunctionDAG, which is our representation of a Halide pipeline,...
signed __INT64_TYPE__ int64_t
signed __INT32_TYPE__ int32_t
void * memcpy(void *s1, const void *s2, size_t n)
A fragment of Halide syntax.
Definition Expr.h:258
The sum of two expressions.
Definition IR.h:56
Logical and - are both expressions true.
Definition IR.h:175
const Span & loops(int i, int j) const
BoundInfo(const Expr &e, const Node::Stage &consumer)
void expand_footprint(const Span *consumer_loop, Span *producer_required) const
vector< pair< BoundInfo, BoundInfo > > bounds
int get_loop_index_from_var(const std::string &var) const
void loop_nest_for_region(int stage_idx, const Span *computed, Span *loop) const
std::unique_ptr< BoundContents::Layout > bounds_memory_layout
void required_to_computed(const Span *required, Span *computed) const
void operator=(const FunctionDAG &other)=delete
FunctionDAG(const FunctionDAG &other)=delete
map< int, const Node * > stage_id_to_node_map
std::ostream & dump(std::ostream &os) const
FunctionDAG(const vector< Function > &outputs, const Target &target)
void operator+=(const OptionalRational &other)
Definition FunctionDAG.h:45
bool operator==(const OptionalRational &other) const
OptionalRational operator*(int64_t factor) const
Definition FunctionDAG.h:64
OptionalRational operator*(const OptionalRational &other) const
Definition FunctionDAG.h:72
A vector with 'lanes' elements, in which every element is 'value'.
Definition IR.h:259
A function call.
Definition IR.h:490
The actual IR nodes begin here.
Definition IR.h:30
The ratio of two expressions.
Definition IR.h:83
Is the first expression equal to the second.
Definition IR.h:121
Floating point constants.
Definition Expr.h:236
Is the first expression greater than or equal to the second.
Definition IR.h:166
Is the first expression greater than the second.
Definition IR.h:157
Integer constants.
Definition Expr.h:218
A class to represent ranges of Exprs.
Definition Interval.h:14
Intrusive shared pointers have a reference count (a RefCount object) stored in the class itself.
Is the first expression less than or equal to the second.
Definition IR.h:148
Is the first expression less than the second.
Definition IR.h:139
A let expression, like you might find in a functional language.
Definition IR.h:271
Load a value from a named symbol if predicate is true.
Definition IR.h:217
The greater of two values.
Definition IR.h:112
The lesser of two values.
Definition IR.h:103
The remainder of a / b.
Definition IR.h:94
The product of two expressions.
Definition IR.h:74
Is the first expression not equal to the second.
Definition IR.h:130
Logical not - true if the expression false.
Definition IR.h:193
Logical or - is at least one of the expression true.
Definition IR.h:184
A linear ramp vector node.
Definition IR.h:247
Reinterpret value as another type, without affecting any of the bits (on little-endian systems).
Definition IR.h:47
A ternary operator.
Definition IR.h:204
Construct a new vector by taking elements from another sequence of vectors.
Definition IR.h:884
String constants.
Definition Expr.h:245
The difference of two expressions.
Definition IR.h:65
Unsigned integer constants.
Definition Expr.h:227
A named variable.
Definition IR.h:801
Horizontally reduce a vector to a scalar or narrower vector using the given commutative and associati...
Definition IR.h:1012
A struct representing a target machine and os to generate code for.
Definition Target.h:19