Halide 19.0.0
Halide compiler and libraries
Loading...
Searching...
No Matches
State.h
Go to the documentation of this file.
1#ifndef STATE_H
2#define STATE_H
3
4#include "ASLog.h"
5#include "CostModel.h"
6#include "DefaultCostModel.h"
7#include "Featurization.h"
8#include "FunctionDAG.h"
9#include "LoopNest.h"
10#include "PerfectHashMap.h"
11#include <set>
12#include <unordered_set>
13#include <vector>
14
15namespace Halide {
16namespace Internal {
17namespace Autoscheduler {
18
19using std::map;
20using std::pair;
21using std::set;
22using std::string;
23using std::unordered_set;
24using std::vector;
25
27
29
31
32constexpr int kLocalMemoryLimit = 524288; // 512 KB
33
34// Stack memory limit = Total GPU Memory / (# of SMs * maximum threads per SM)
35// = 103232 bytes
36// Not all 103232 bytes will be free for allocations so reduce it by factor to
37// allow a buffer
39
41
43
45 void operator()(LoopNest *new_loop_nest) const {
46 }
47};
48
49template<typename PostCreateMutator>
50void deep_copy_loop_nest(LoopNest *new_loop_nest,
51 const LoopNest *new_loop_nest_parent,
52 const IntrusivePtr<const LoopNest> &existing_loop_nest,
53 const PostCreateMutator &post_create_mutator) {
54 new_loop_nest->copy_from(*existing_loop_nest);
55
56 for (std::size_t i = 0, N = new_loop_nest->children.size(); i < N; ++i) {
57 LoopNest *new_child = new LoopNest;
58 new_loop_nest->children[i] = new_child;
59 deep_copy_loop_nest(new_child, new_loop_nest, existing_loop_nest->children[i], post_create_mutator);
60 }
61
62 post_create_mutator(new_loop_nest);
63}
64
65using LoopNestMap = map<const LoopNest *, pair<const LoopNest *, int>>;
66
67template<typename PostCreateMutator>
69 const PostCreateMutator &post_create_mutator) {
70 LoopNest *new_loop_nest = new LoopNest;
71 deep_copy_loop_nest(new_loop_nest, nullptr, loop_nest, post_create_mutator);
72 return new_loop_nest;
73}
74
75struct State {
76 mutable RefCount ref_count;
79 double cost = 0;
80 std::vector<double> cost_per_stage;
82 int num_decisions_made = 0;
83 bool penalized = false;
84 string schedule_source;
85
86 State() = default;
87 State(const State &) = delete;
88 State(State &&) = delete;
89 void operator=(const State &) = delete;
90 void operator=(State &&) = delete;
91
92 uint64_t structural_hash(int depth) const;
93
94 // Compute the parent and depth of every loop nest node
96 const LoopNest *here,
97 int depth) const;
98
100 const LoopNest *a,
101 const LoopNest *b) const;
102
103 // We use the post_create_mutator so that the loop nests can be modified
104 // before they become IntrusivePtr<const LoopNest> as children and cannot be modified
105 template<typename PostCreateMutator>
106 LoopNest *create_feature_root(const PostCreateMutator &post_create_mutator) const {
107 LoopNest *new_root = new LoopNest;
108 deep_copy_loop_nest<PostCreateMutator>(new_root, nullptr, root, post_create_mutator);
109 return new_root;
110 }
111
113
115
119
120 void operator()(LoopNest *new_loop_nest) const;
121
122 // In phase 2, any compute_root loop marked 'none' will be split into
123 // blocks, threads, and serial loops. To enable the cost model to make a
124 // meaningful prediction on these pre-split loops, we assume a split into
125 // blocks and threads with a single full warp (if possible)
126 void split_compute_root_loops(LoopNest *loop_nest) const;
127
128 // If a loop nest does not have thread loops, split the outermost serial
129 // loops to create thread loops with extents 1
130 void add_outer_thread_loops(LoopNest *loop_nest) const;
131 };
132
134 const Target &target) const;
135
137 const LoopNest *loop,
138 LoopNest::Sites &site) const;
139
141 const Anderson2021Params &params,
142 const Target &target,
144 Statistics &stats,
145 bool verbose = false) const;
146
148 const Anderson2021Params &params,
149 const Target &target,
150 std::ostream &out) const;
151
152 bool contains_store_at(const set<const FunctionDAG::Node *> &outermost_store_at,
154
155 // For GPU, only allow store_at root or inside the outermost loop nest. Any
156 // store_ats further in will be hoisted and expanded, increasing the
157 // amount of shared memory required.
159
161
162 bool exceeds_serial_extents_limit(const Target &target) const;
163
165 const LoopNest *loop) const;
166
168 const Target &target) const;
169
171 const Target &target) const;
172
174 const Anderson2021Params &params,
175 const Target &target,
176 CostModel *cost_model,
177 Statistics &stats,
178 bool verbose = false);
179
180 // Make a child copy of this state. The loop nest is const (we
181 // make mutated copies of it, rather than mutating it), so we can
182 // continue to point to the same one and so this is a cheap
183 // operation.
185
186 void dump() const;
187
189
191 Stage &stage,
192 const vector<VarOrRVar> &parallel_vars,
193 const vector<int64_t> &parallel_extents,
194 const vector<int> &constant_extents) const;
195
197 Stage &stage,
198 const vector<VarOrRVar> &parallel_vars,
199 const vector<int64_t> &parallel_extents) const;
200
202 Stage &stage,
203 std::unordered_set<std::string> &new_serial_vars,
204 std::ostringstream &staged_funcs_schedule_source) const;
205
206 bool can_fuse_gpu(const vector<int64_t> &parallel_extents) const;
207
208 // Apply the schedule represented by this state to a Halide
209 // Pipeline. Also generate source code for the schedule for the
210 // user to copy-paste to freeze this schedule as permanent artifact.
212 const Anderson2021Params &params,
213 const Target &target);
214
218
220 const LoopNestMap &parent,
221 const FunctionDAG::Node &node,
222 const LoopNest *loop,
223 const LoopNest *root,
224 StageMap<int64_t> &total_shared_mem_alloc_sizes) const;
226 const LoopNest *loop) const;
227};
228
229// A priority queue of states, sorted according to increasing
230// cost. Never shrinks, to avoid reallocations.
231// Can't use std::priority_queue because it doesn't support unique_ptr.
233private:
234 struct CompareStates {
235 bool operator()(const IntrusivePtr<State> &a, const IntrusivePtr<State> &b) const {
236 return a->cost > b->cost;
237 }
238 };
239
240 std::vector<IntrusivePtr<State>> storage;
241 size_t sz = 0;
242
243public:
245 if (sz >= storage.size()) {
246 storage.resize(std::max(sz * 2, (size_t)64));
247 }
248 internal_assert(sz < storage.size()) << sz << " " << storage.size() << "\n";
249 storage[sz] = std::move(s);
250 sz++;
251 std::push_heap(storage.begin(), storage.begin() + sz, CompareStates{});
252 }
253
255 internal_assert(sz <= storage.size()) << sz << " " << storage.size() << "\n";
256 std::pop_heap(storage.begin(), storage.begin() + sz, CompareStates{});
257 sz--;
258 return std::move(storage[sz]);
259 }
260
262 return storage[0];
263 }
264
265 bool empty() const {
266 return sz == 0;
267 }
268
269 size_t size() const {
270 return sz;
271 }
272
273 void swap(StateQueue &other) noexcept {
274 storage.swap(other.storage);
275 std::swap(sz, other.sz);
276 }
277
279 return storage[idx];
280 }
281
282 void resort() {
283 std::make_heap(storage.begin(), storage.begin() + sz, CompareStates{});
284 }
285
286 void clear() {
287 for (size_t i = 0; i < sz; i++) {
288 storage[i] = IntrusivePtr<State>{};
289 }
290 sz = 0;
291 }
292};
293
294} // namespace Autoscheduler
295} // namespace Internal
296} // namespace Halide
297
298#endif // STATE_H
#define internal_assert(c)
Definition Errors.h:19
const IntrusivePtr< State > & top()
Definition State.h:261
void emplace(IntrusivePtr< State > &&s)
Definition State.h:244
void swap(StateQueue &other) noexcept
Definition State.h:273
IntrusivePtr< State > operator[](int idx) const
Definition State.h:278
A class representing a reference count to be used with IntrusivePtr.
A single definition of a Func.
Definition Func.h:69
constexpr int kLocalMemoryLimit
Definition State.h:32
map< const LoopNest *, pair< const LoopNest *, int > > LoopNestMap
Definition State.h:65
void deep_copy_loop_nest(LoopNest *new_loop_nest, const LoopNest *new_loop_nest_parent, const IntrusivePtr< const LoopNest > &existing_loop_nest, const PostCreateMutator &post_create_mutator)
Definition State.h:50
This file defines the class FunctionDAG, which is our representation of a Halide pipeline,...
@ Internal
Not visible externally, similar to 'static' linkage in C.
unsigned __INT64_TYPE__ uint64_t
signed __INT64_TYPE__ int64_t
std::vector< IntrusivePtr< const LoopNest > > children
Definition LoopNest.h:42
void operator()(LoopNest *new_loop_nest) const
Definition State.h:45
void save_featurization(const FunctionDAG &dag, const Anderson2021Params &params, const Target &target, std::ostream &out) const
bool exceeds_serial_extents_limit(const Target &target) const
int64_t get_shared_mem_alloc_size(const LoopNest *block, const LoopNest *loop) const
const LoopNest * deepest_valid_compute_location(const Anderson2021Params &params, const LoopNestMap &parent, const FunctionDAG::Node &node, const LoopNest *loop, const LoopNest *root, StageMap< int64_t > &total_shared_mem_alloc_sizes) const
void update_always_consider_inline_options(const FunctionDAG::Node *node)
int64_t total_loop_extents_of_ancestors(const LoopNestMap &parent, const LoopNest *loop) const
bool can_fuse_gpu(const vector< int64_t > &parallel_extents) const
bool should_always_consider_inline(const FunctionDAG::Node *node) const
bool contains_store_at_further_in_than_outermost() const
void operator=(const State &)=delete
void compute_loop_nest_parents(LoopNestMap &p, const LoopNest *here, int depth) const
void fuse_gpu_blocks(LoopNest::StageScheduleState *state, Stage &stage, const vector< VarOrRVar > &parallel_vars, const vector< int64_t > &parallel_extents, const vector< int > &constant_extents) const
void set_gpu_store_site(const LoopNestMap &parent, const LoopNest *loop, LoopNest::Sites &site) const
bool mark_gpu_threads(LoopNest::StageScheduleState *state, Stage &stage, std::unordered_set< std::string > &new_serial_vars, std::ostringstream &staged_funcs_schedule_source) const
NodeMap< bool > always_consider_inline
Definition State.h:81
IntrusivePtr< const State > parent
Definition State.h:26
uint64_t structural_hash(int depth) const
IntrusivePtr< const LoopNest > root
Definition State.h:24
bool exceeds_shared_memory_limit(const Anderson2021Params &params, const Target &target) const
bool compute_featurization(const FunctionDAG &dag, const Anderson2021Params &params, const Target &target, StageMap< ScheduleFeatures > *features, Statistics &stats, bool verbose=false) const
bool contains_store_at(const set< const FunctionDAG::Node * > &outermost_store_at, const IntrusivePtr< const LoopNest > &parent) const
void mark_gpu_blocks(LoopNest::StageScheduleState *state, Stage &stage, const vector< VarOrRVar > &parallel_vars, const vector< int64_t > &parallel_extents) const
IntrusivePtr< const LoopNest > get_root_for_features(const Anderson2021Params &params, const Target &target) const
bool calculate_cost(const FunctionDAG &dag, const Anderson2021Params &params, const Target &target, CostModel *cost_model, Statistics &stats, bool verbose=false)
LoopNest * create_feature_root(const PostCreateMutator &post_create_mutator) const
Definition State.h:106
const LoopNest * deepest_common_ancestor(const LoopNestMap &parent, const LoopNest *a, const LoopNest *b) const
IntrusivePtr< State > make_child() const
bool exceeds_local_memory_limit(const Anderson2021Params &params, const Target &target) const
void add_to_always_consider_inline_options(const FunctionDAG::Node *node)
void apply_schedule(const FunctionDAG &dag, const Anderson2021Params &params, const Target &target)
std::vector< double > cost_per_stage
Definition State.h:80
Intrusive shared pointers have a reference count (a RefCount object) stored in the class itself.
A struct representing a target machine and os to generate code for.
Definition Target.h:19