Halide 21.0.0
Halide compiler and libraries
Loading...
Searching...
No Matches
State.h
Go to the documentation of this file.
1#ifndef STATE_H
2#define STATE_H
3
4#include "ASLog.h"
5#include "CostModel.h"
6#include "DefaultCostModel.h"
7#include "Featurization.h"
8#include "FunctionDAG.h"
9#include "LoopNest.h"
10#include "PerfectHashMap.h"
11#include <set>
12#include <unordered_set>
13#include <vector>
14
15namespace Halide {
16namespace Internal {
17namespace Autoscheduler {
18
19using std::map;
20using std::pair;
21using std::set;
22using std::string;
23using std::unordered_set;
24using std::vector;
25
27
29
31
32constexpr int kLocalMemoryLimit = 524288; // 512 KB
33
34// Stack memory limit = Total GPU Memory / (# of SMs * maximum threads per SM)
35// = 103232 bytes
36// Not all 103232 bytes will be free for allocations so reduce it by factor to
37// allow a buffer
39
41
43
45 void operator()(LoopNest *new_loop_nest) const {  // deliberate no-op: the freshly copied loop nest is used unmodified
46 }
47};
48
49template<typename PostCreateMutator>
50void deep_copy_loop_nest(LoopNest *new_loop_nest,
51 const LoopNest *new_loop_nest_parent,
52 const IntrusivePtr<const LoopNest> &existing_loop_nest,
53 const PostCreateMutator &post_create_mutator) {
54 new_loop_nest->copy_from(*existing_loop_nest);
55
56 for (std::size_t i = 0, N = new_loop_nest->children.size(); i < N; ++i) {
57 LoopNest *new_child = new LoopNest;
58 new_loop_nest->children[i] = new_child;
59 deep_copy_loop_nest(new_child, new_loop_nest, existing_loop_nest->children[i], post_create_mutator);
60 }
61
62 post_create_mutator(new_loop_nest);
63}
64
65using LoopNestMap = map<const LoopNest *, pair<const LoopNest *, int>>;
66
67template<typename PostCreateMutator>
69 const PostCreateMutator &post_create_mutator) {
70 LoopNest *new_loop_nest = new LoopNest;
71 deep_copy_loop_nest(new_loop_nest, nullptr, loop_nest, post_create_mutator);
72 return new_loop_nest;
73}
74
75struct State {
76 mutable RefCount ref_count;
79 double cost = 0;
80 std::vector<double> cost_per_stage;
82 int num_decisions_made = 0;
83 bool penalized = false;
84 string schedule_source;
85
86 State() = default;
87 State(const State &) = delete;
88 State(State &&) = delete;
89 void operator=(const State &) = delete;
90 void operator=(State &&) = delete;
91
92 uint64_t structural_hash(int depth) const;
93
94 // Compute the parent and depth of every loop nest node
96 const LoopNest *here,
97 int depth) const;
98
100 const LoopNest *a,
101 const LoopNest *b) const;
102
103 // We use the post_create_mutator so that the loop nests can be modified
104 // before they become IntrusivePtr<const LoopNest> as children and cannot be modified
105 template<typename PostCreateMutator>
106 LoopNest *create_feature_root(const PostCreateMutator &post_create_mutator) const {
107 LoopNest *new_root = new LoopNest;
108 deep_copy_loop_nest<PostCreateMutator>(new_root, nullptr, root, post_create_mutator);
109 return new_root;
110 }
111
113
115
119
120 void operator()(LoopNest *new_loop_nest) const;
121
122 // In phase 2, any compute_root loop marked 'none' will be split into
123 // blocks, threads, and serial loops. To enable the cost model to make a
124 // meaningful prediction on these pre-split loops, we assume a split into
125 // blocks and threads with a single full warp (if possible)
126 void split_compute_root_loops(LoopNest *loop_nest) const;
127
128 // If a loop nest does not have thread loops, split the outermost serial
129 // loops to create thread loops with extents 1
130 void add_outer_thread_loops(LoopNest *loop_nest) const;
131 };
132
134 const Target &target) const;
135
137 const LoopNest *loop,
138 LoopNest::Sites &site) const;
139
141 const Anderson2021Params &params,
142 const Target &target,
144 Statistics &stats,
145 bool verbose = false) const;
146
148 const Anderson2021Params &params,
149 const Target &target,
150 std::ostream &out) const;
151
152 bool contains_store_at(const set<const FunctionDAG::Node *> &outermost_store_at,
154
155 // For GPU, only allow store_at root or inside the outermost loop nest. Any
156 // store_ats further in will be hoisted and expanded, increasing the
157 // amount of shared memory required.
159
161
162 bool exceeds_serial_extents_limit(const Target &target) const;
163
165 const LoopNest *loop) const;
166
168 const Target &target) const;
169
171 const Target &target) const;
172
174 const Anderson2021Params &params,
175 const Target &target,
176 CostModel *cost_model,
177 Statistics &stats,
178 bool verbose = false);
179
180 // Make a child copy of this state. The loop nest is const (we
181 // make mutated copies of it, rather than mutating it), so we can
182 // continue to point to the same one and so this is a cheap
183 // operation.
185
186 void dump() const;
187
189
191 Stage &stage,
192 const vector<VarOrRVar> &parallel_vars,
193 const vector<int64_t> &parallel_extents,
194 const vector<int> &constant_extents) const;
195
197 Stage &stage,
198 const vector<VarOrRVar> &parallel_vars,
199 const vector<int64_t> &parallel_extents) const;
200
202 Stage &stage,
203 std::unordered_set<std::string> &new_serial_vars,
204 std::ostringstream &staged_funcs_schedule_source) const;
205
206 bool can_fuse_gpu(const vector<int64_t> &parallel_extents) const;
207
208 // Apply the schedule represented by this state to a Halide
209 // Pipeline. Also generate source code for the schedule for the
210 // user to copy-paste to freeze this schedule as permanent artifact.
212 const Anderson2021Params &params,
213 const Target &target);
214
218
220 const LoopNestMap &parent,
221 const FunctionDAG::Node &node,
222 const LoopNest *loop,
223 const LoopNest *root,
224 StageMap<int64_t> &total_shared_mem_alloc_sizes) const;
226 const LoopNest *loop) const;
227};
228
229// A priority queue of states, sorted according to increasing
230// cost. Never shrinks, to avoid reallocations.
231// Can't use std::priority_queue because it doesn't support unique_ptr.
233private:
234 struct CompareStates {
235 bool operator()(const IntrusivePtr<State> &a, const IntrusivePtr<State> &b) const {
236 return a->cost > b->cost;
237 }
238 };
239
240 std::vector<IntrusivePtr<State>> storage;
241 size_t sz = 0;
242
243public:
245 if (sz >= storage.size()) {
246 storage.resize(std::max(sz * 2, (size_t)64));
247 }
248 internal_assert(sz < storage.size()) << sz << " " << storage.size() << "\n";
249 storage[sz] = std::move(s);
250 sz++;
251 std::push_heap(storage.begin(), storage.begin() + sz, CompareStates{});
252 }
253
255 internal_assert(sz <= storage.size()) << sz << " " << storage.size() << "\n";
256 std::pop_heap(storage.begin(), storage.begin() + sz, CompareStates{});
257 sz--;
258 return std::move(storage[sz]);
259 }
260
262 return storage[0];
263 }
264
265 bool empty() const {  // true iff the queue currently holds no states
266 return sz == 0;
267 }
268
269 size_t size() const {  // number of states currently queued (not the vector's capacity)
270 return sz;
271 }
272
273 void swap(StateQueue &other) noexcept {
274 storage.swap(other.storage);
275 std::swap(sz, other.sz);
276 }
277
279 return storage[idx];
280 }
281
282 void resort() {  // rebuild the heap invariant after costs were mutated in place
283 std::make_heap(storage.begin(), storage.begin() + sz, CompareStates{});
284 }
285
286 void clear() {
287 for (size_t i = 0; i < sz; i++) {
288 storage[i] = IntrusivePtr<State>{};
289 }
290 sz = 0;
291 }
292};
293
294} // namespace Autoscheduler
295} // namespace Internal
296} // namespace Halide
297
298#endif // STATE_H
#define internal_assert(c)
Definition Error.h:218
const IntrusivePtr< State > & top()
Definition State.h:261
void emplace(IntrusivePtr< State > &&s)
Definition State.h:244
void swap(StateQueue &other) noexcept
Definition State.h:273
IntrusivePtr< State > operator[](int idx) const
Definition State.h:278
A class representing a reference count to be used with IntrusivePtr.
A single definition of a Func.
Definition Func.h:70
PerfectHashMap< FunctionDAG::Node::Stage, T > StageMap
Definition LoopNest.h:24
constexpr int kLocalMemoryLimit
Definition State.h:32
PerfectHashMap< FunctionDAG::Node, T > NodeMap
Definition LoopNest.h:21
map< const LoopNest *, pair< const LoopNest *, int > > LoopNestMap
Definition State.h:65
void deep_copy_loop_nest(LoopNest *new_loop_nest, const LoopNest *new_loop_nest_parent, const IntrusivePtr< const LoopNest > &existing_loop_nest, const PostCreateMutator &post_create_mutator)
Definition State.h:50
RefCount & ref_count(const T *t) noexcept
Because in this header we don't yet know how client classes store their RefCount (and we don't want t...
This file defines the class FunctionDAG, which is our representation of a Halide pipeline,...
unsigned __INT64_TYPE__ uint64_t
signed __INT64_TYPE__ int64_t
std::vector< IntrusivePtr< const LoopNest > > children
Definition LoopNest.h:42
void operator()(LoopNest *new_loop_nest) const
Definition State.h:45
void save_featurization(const FunctionDAG &dag, const Anderson2021Params &params, const Target &target, std::ostream &out) const
bool exceeds_serial_extents_limit(const Target &target) const
int64_t get_shared_mem_alloc_size(const LoopNest *block, const LoopNest *loop) const
const LoopNest * deepest_valid_compute_location(const Anderson2021Params &params, const LoopNestMap &parent, const FunctionDAG::Node &node, const LoopNest *loop, const LoopNest *root, StageMap< int64_t > &total_shared_mem_alloc_sizes) const
void update_always_consider_inline_options(const FunctionDAG::Node *node)
int64_t total_loop_extents_of_ancestors(const LoopNestMap &parent, const LoopNest *loop) const
bool can_fuse_gpu(const vector< int64_t > &parallel_extents) const
bool should_always_consider_inline(const FunctionDAG::Node *node) const
bool contains_store_at_further_in_than_outermost() const
void operator=(const State &)=delete
void compute_loop_nest_parents(LoopNestMap &p, const LoopNest *here, int depth) const
void fuse_gpu_blocks(LoopNest::StageScheduleState *state, Stage &stage, const vector< VarOrRVar > &parallel_vars, const vector< int64_t > &parallel_extents, const vector< int > &constant_extents) const
void set_gpu_store_site(const LoopNestMap &parent, const LoopNest *loop, LoopNest::Sites &site) const
bool mark_gpu_threads(LoopNest::StageScheduleState *state, Stage &stage, std::unordered_set< std::string > &new_serial_vars, std::ostringstream &staged_funcs_schedule_source) const
NodeMap< bool > always_consider_inline
Definition State.h:81
IntrusivePtr< const State > parent
Definition State.h:26
uint64_t structural_hash(int depth) const
IntrusivePtr< const LoopNest > root
Definition State.h:24
bool exceeds_shared_memory_limit(const Anderson2021Params &params, const Target &target) const
bool compute_featurization(const FunctionDAG &dag, const Anderson2021Params &params, const Target &target, StageMap< ScheduleFeatures > *features, Statistics &stats, bool verbose=false) const
bool contains_store_at(const set< const FunctionDAG::Node * > &outermost_store_at, const IntrusivePtr< const LoopNest > &parent) const
void mark_gpu_blocks(LoopNest::StageScheduleState *state, Stage &stage, const vector< VarOrRVar > &parallel_vars, const vector< int64_t > &parallel_extents) const
IntrusivePtr< const LoopNest > get_root_for_features(const Anderson2021Params &params, const Target &target) const
bool calculate_cost(const FunctionDAG &dag, const Anderson2021Params &params, const Target &target, CostModel *cost_model, Statistics &stats, bool verbose=false)
LoopNest * create_feature_root(const PostCreateMutator &post_create_mutator) const
Definition State.h:106
const LoopNest * deepest_common_ancestor(const LoopNestMap &parent, const LoopNest *a, const LoopNest *b) const
IntrusivePtr< State > make_child() const
bool exceeds_local_memory_limit(const Anderson2021Params &params, const Target &target) const
void add_to_always_consider_inline_options(const FunctionDAG::Node *node)
void apply_schedule(const FunctionDAG &dag, const Anderson2021Params &params, const Target &target)
std::vector< double > cost_per_stage
Definition State.h:80
Intrusive shared pointers have a reference count (a RefCount object) stored in the class itself.
A struct representing a target machine and os to generate code for.
Definition Target.h:19