Halide
State.h
#ifndef STATE_H
#define STATE_H

#include "ASLog.h"
#include "CostModel.h"
#include "DefaultCostModel.h"
#include "Featurization.h"
#include "FunctionDAG.h"
#include "LoopNest.h"
#include "PerfectHashMap.h"
#include <set>
#include <unordered_set>
#include <vector>

namespace Halide {
namespace Internal {
namespace Autoscheduler {

using std::map;
using std::pair;
using std::set;
using std::string;
using std::unordered_set;
using std::vector;

bool verify_memoized_features();

bool is_memoize_blocks_enabled();

double get_stack_memory_adjustment_factor();

constexpr int kLocalMemoryLimit = 524288;  // 512 KB

// Stack memory limit = Total GPU Memory / (# of SMs * maximum threads per SM)
// = 103232 bytes.
// Not all 103232 bytes will be free for allocations, so reduce the limit by an
// adjustment factor to leave a buffer.
int64_t get_stack_memory_limit();
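
// Illustrative arithmetic (an assumption, not from the original header): for a
// hypothetical V100-class GPU with roughly 16 GB of device memory, 80 SMs, and
// 2048 resident threads per SM, the formula above gives approximately
// 16e9 / (80 * 2048) bytes, i.e. on the order of 100 KB of stack per thread.
// get_stack_memory_adjustment_factor() then scales this down to leave headroom.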

bool use_adjusted_tilings();

bool compute_root_and_inline_only();

// A mutator that leaves the copied loop nest unchanged.
struct NoOpMutator {
    void operator()(LoopNest *new_loop_nest) const {
    }
};

// Recursively copy an existing loop nest into new_loop_nest, then apply
// post_create_mutator to each newly created node. Children are visited before
// their parent, so each node is mutated while a mutable pointer to it is
// still available.
template<typename PostCreateMutator>
void deep_copy_loop_nest(LoopNest *new_loop_nest, const LoopNest *new_loop_nest_parent, const IntrusivePtr<const LoopNest> &existing_loop_nest, const PostCreateMutator &post_create_mutator) {
    new_loop_nest->copy_from(*existing_loop_nest);

    for (std::size_t i = 0, N = new_loop_nest->children.size(); i < N; ++i) {
        LoopNest *new_child = new LoopNest;
        new_loop_nest->children[i] = new_child;
        deep_copy_loop_nest(new_child, new_loop_nest, existing_loop_nest->children[i], post_create_mutator);
    }

    post_create_mutator(new_loop_nest);
}

// Overload that allocates and returns the root of the copied loop nest.
template<typename PostCreateMutator>
LoopNest *deep_copy_loop_nest(const IntrusivePtr<const LoopNest> &loop_nest, const PostCreateMutator &post_create_mutator) {
    LoopNest *new_loop_nest = new LoopNest;
    deep_copy_loop_nest(new_loop_nest, nullptr, loop_nest, post_create_mutator);
    return new_loop_nest;
}
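
// A minimal usage sketch (an illustration, not part of the original header):
// make an unmutated deep copy of a loop nest. The variable name "existing" is
// hypothetical.
//
//     IntrusivePtr<const LoopNest> existing = ...;  // e.g. State::root
//     LoopNest *copy = deep_copy_loop_nest(existing, NoOpMutator{});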

struct State {
    mutable RefCount ref_count;
    IntrusivePtr<const LoopNest> root;
    IntrusivePtr<const State> parent;
    double cost = 0;
    std::vector<double> cost_per_stage;
    NodeMap<bool> always_consider_inline;
    int num_decisions_made = 0;
    bool penalized = false;
    string schedule_source;

    State() = default;
    State(const State &) = delete;
    State(State &&) = delete;
    void operator=(const State &) = delete;
    void operator=(State &&) = delete;

    uint64_t structural_hash(int depth) const;

    // Compute the parent and depth of every loop nest node.
    void compute_loop_nest_parents(map<const LoopNest *, pair<const LoopNest *, int>> &p,
                                   const LoopNest *here, int depth) const;

    const LoopNest *deepest_common_ancestor(const map<const LoopNest *, pair<const LoopNest *, int>> &parent,
                                            const LoopNest *a, const LoopNest *b) const;

    // We use the post_create_mutator so that loop nests can be modified before
    // they become immutable IntrusivePtr<const LoopNest> children.
    template<typename PostCreateMutator>
    LoopNest *create_feature_root(const PostCreateMutator &post_create_mutator) const {
        LoopNest *new_root = new LoopNest;
        deep_copy_loop_nest<PostCreateMutator>(new_root, nullptr, root, post_create_mutator);
        return new_root;
    }

    bool has_loop_nest_without_thread_loops() const;

    bool has_compute_root_loops_without_blocks() const;

    struct FeatureLoopNestMutator {
        const Anderson2021Params &params;
        const Target &target;

        void operator()(LoopNest *new_loop_nest) const;

        // In phase 2, any compute_root loop marked 'none' will be split into
        // blocks, threads, and serial loops. To enable the cost model to make a
        // meaningful prediction on these pre-split loops, we assume a split into
        // blocks and threads with a single full warp (if possible).
        void split_compute_root_loops(LoopNest *loop_nest) const;

        // If a loop nest does not have thread loops, split the outermost serial
        // loops to create thread loops with extents of 1.
        void add_outer_thread_loops(LoopNest *loop_nest) const;
    };
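
    // Illustrative sketch (an assumption, not from the original header) of how
    // a mutator plugs into create_feature_root(); get_root_for_features() below
    // is presumably implemented along these lines:
    //
    //     FeatureLoopNestMutator mutator{params, target};
    //     IntrusivePtr<const LoopNest> feature_root{create_feature_root(mutator)};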

    IntrusivePtr<const LoopNest> get_root_for_features(const Anderson2021Params &params, const Target &target) const;

    void set_gpu_store_site(const map<const LoopNest *, pair<const LoopNest *, int>> &parent, const LoopNest *loop, LoopNest::Sites &site) const;

    bool compute_featurization(const FunctionDAG &dag, const Anderson2021Params &params, const Target &target, StageMap<ScheduleFeatures> *features, Statistics &stats, bool verbose = false) const;

    void save_featurization(const FunctionDAG &dag, const Anderson2021Params &params, const Target &target, std::ostream &out) const;
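
    // A usage sketch (an illustration, not from the original header): dump this
    // state's featurization to a binary file. The file name is hypothetical.
    //
    //     std::ofstream binfile("schedule.featurization", std::ios::binary);
    //     state->save_featurization(dag, params, target, binfile);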
    bool contains_store_at(const set<const FunctionDAG::Node *> &outermost_store_at, const IntrusivePtr<const LoopNest> &parent) const;

    // For GPU, only allow store_at root or inside the outermost loop nest. Any
    // store_ats further in will be hoisted and expanded, increasing the
    // amount of shared memory required.
    bool contains_store_at_further_in_than_outermost() const;

    bool has_dynamic_allocation_inside_thread() const;

    bool exceeds_serial_extents_limit(const Target &target) const;

    int64_t get_shared_mem_alloc_size(const LoopNest *block, const LoopNest *loop) const;

    bool exceeds_shared_memory_limit(const Anderson2021Params &params, const Target &target) const;

    bool exceeds_local_memory_limit(const Anderson2021Params &params, const Target &target) const;

    bool calculate_cost(const FunctionDAG &dag, const Anderson2021Params &params, const Target &target, CostModel *cost_model, Statistics &stats, bool verbose = false);

    // Make a child copy of this state. The loop nest is const (we
    // make mutated copies of it, rather than mutating it), so we can
    // continue to point to the same one, which makes this a cheap
    // operation.
    IntrusivePtr<State> make_child() const;

    void dump() const;

    void print_compute_locations() const;

    void fuse_gpu_blocks(LoopNest::StageScheduleState *state, Stage &stage, const vector<VarOrRVar> &parallel_vars, const vector<int64_t> &parallel_extents, const vector<int> &constant_extents) const;

    void mark_gpu_blocks(LoopNest::StageScheduleState *state, Stage &stage, const vector<VarOrRVar> &parallel_vars, const vector<int64_t> &parallel_extents) const;

    bool mark_gpu_threads(LoopNest::StageScheduleState *state, Stage &stage, std::unordered_set<std::string> &new_serial_vars, std::ostringstream &staged_funcs_schedule_source) const;

    bool can_fuse_gpu(const vector<int64_t> &parallel_extents) const;

    // Apply the schedule represented by this state to a Halide
    // Pipeline. Also generate source code for the schedule for the
    // user to copy-paste, to freeze this schedule as a permanent artifact.
    void apply_schedule(const FunctionDAG &dag, const Anderson2021Params &params, const Target &target);

    bool should_always_consider_inline(const FunctionDAG::Node *node) const;
    void add_to_always_consider_inline_options(const FunctionDAG::Node *node);
    void update_always_consider_inline_options(const FunctionDAG::Node *node);

    const LoopNest *deepest_valid_compute_location(const Anderson2021Params &params, const map<const LoopNest *, pair<const LoopNest *, int>> &parent, const FunctionDAG::Node &node, const LoopNest *loop, const LoopNest *root, StageMap<int64_t> &total_shared_mem_alloc_sizes) const;
    int64_t total_loop_extents_of_ancestors(const map<const LoopNest *, pair<const LoopNest *, int>> &parent, const LoopNest *loop) const;
};
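
// Illustrative sketch (an assumption, not from the original header) of how a
// search loop uses make_child(): children share the parent's loop nest until a
// scheduling decision replaces it. The surrounding names (dag, params, target,
// cost_model, stats) are assumed to be in scope.
//
//     IntrusivePtr<State> child = state->make_child();
//     child->num_decisions_made++;
//     // ... attach a mutated copy of the loop nest to child->root, then:
//     child->calculate_cost(dag, params, target, cost_model, stats);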

// A priority queue of states, sorted according to increasing
// cost. Never shrinks, to avoid reallocations.
// Can't use std::priority_queue because it doesn't support unique_ptr.
class StateQueue {
private:
    struct CompareStates {
        bool operator()(const IntrusivePtr<State> &a, const IntrusivePtr<State> &b) const {
            return a->cost > b->cost;
        }
    };

    std::vector<IntrusivePtr<State>> storage;
    size_t sz = 0;

public:
    void emplace(IntrusivePtr<State> &&s) {
        if (sz >= storage.size()) {
            storage.resize(std::max(sz * 2, (size_t)64));
        }
        internal_assert(sz < storage.size()) << sz << " " << storage.size() << "\n";
        storage[sz] = std::move(s);
        sz++;
        std::push_heap(storage.begin(), storage.begin() + sz, CompareStates{});
    }

    IntrusivePtr<State> pop() {
        internal_assert(sz <= storage.size()) << sz << " " << storage.size() << "\n";
        std::pop_heap(storage.begin(), storage.begin() + sz, CompareStates{});
        sz--;
        return std::move(storage[sz]);
    }

    const IntrusivePtr<State> &top() {
        return storage[0];
    }

    bool empty() const {
        return sz == 0;
    }

    size_t size() const {
        return sz;
    }

    void swap(StateQueue &other) {
        storage.swap(other.storage);
        std::swap(sz, other.sz);
    }

    IntrusivePtr<State> operator[](int idx) const {
        return storage[idx];
    }

    void resort() {
        std::make_heap(storage.begin(), storage.begin() + sz, CompareStates{});
    }

    void clear() {
        for (size_t i = 0; i < sz; i++) {
            storage[i] = IntrusivePtr<State>{};
        }
        sz = 0;
    }
};
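
// A minimal usage sketch (an illustration, not part of the original header):
// StateQueue acts as a min-heap over IntrusivePtr<State>, so pop() returns the
// lowest-cost state first. The expand() step is hypothetical.
//
//     StateQueue q;
//     q.emplace(std::move(initial_state));
//     while (!q.empty()) {
//         IntrusivePtr<State> best = q.pop();
//         // ... expand(best) and emplace its children ...
//     }
//
// If costs change in place (e.g. after states are penalized), call resort() to
// restore the heap invariant.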
247 
248 } // namespace Autoscheduler
249 } // namespace Internal
250 } // namespace Halide
251 
252 #endif // STATE_H