Halide
Tiling.h
Go to the documentation of this file.
1 #ifndef TILING_H
2 #define TILING_H
3 
4 #include <cstdint>
5 #include <vector>
6 
7 namespace Halide {
8 namespace Internal {
9 namespace Autoscheduler {
10 
11 bool all_ones(const std::vector<int64_t> &nums);
12 
13 bool equal_to_existing_size(const std::vector<int64_t> &s, const std::vector<int64_t> &nums);
14 
15 std::vector<std::vector<int64_t>> generate_serial_tilings(const std::vector<int64_t> &s, int d,
16  int last_d,
17  int vectorized_index,
18  const std::vector<int> &vec_dim_serial_sizes,
19  bool filter_small_outer_extents = false,
20  bool allow_inner_ones = false);
21 
22 // Given a multi-dimensional box of dimensionality d, generate a list
23 // of candidate tile sizes for it, logarithmically spacing the sizes
24 // using the given factor. If 'allow_splits' is false, every dimension
25 // must either be one, or the full extent of the box. This function is
26 // used to generate candidate tilings when tiling for
27 // producer-consumer fusion, or tiling for parallelism.
28 // inner_sizes is an optional vector of fixed sizes to choose from for the inner loop.
29 // It is used for GPU schedules when we split a 'none' loop into a parallel loop and a serial loop.
30 std::vector<std::vector<int64_t>> generate_tilings(const std::vector<int64_t> &s, int d, int factor,
31  bool allow_splits,
32  const std::vector<int> &inner_sizes = std::vector<int>());
33 
34 /** Moves the vectorized dimension first and also removes dimensions with size 1
35  to reflect the actual thread dimensions when loop nests are lowered. **/
36 void lowered_dims(const std::vector<int64_t> &size, int vector_loop_i, std::vector<int64_t> &lowered_size);
37 
38 // Creates tilings for GPU thread loops.
39 // The innermost thread loop is always the vectorized dim, and its extent is a multiple of 32.
40 // Other loop extents are sized to be powers of 2 such that the total extent is < 1024.
41 // Called either when we are creating a parallel -> (blocks, threads) loop when computing at root,
42 // or when we are creating a none -> (threads, SIMD) loop when computing at a serial loop.
43 // serial_inner is true when we're generating (thread, serial) tilings, and false when generating (block, thread) tilings.
44 // max_s holds the max gpu_thread counts of all siblings in each dimension. It is used to make sure the union of
45 // thread counts is under the 1024 threshold.
46 std::vector<std::vector<int64_t>> generate_gpu_tilings(const std::vector<std::vector<int64_t>> &stage_sizes,
47  const std::vector<std::vector<int>> &pure_dims,
48  const std::vector<int64_t> &max_s,
49  int d,
50  const std::vector<int> &vectorized_indices,
51  bool serial_inner,
52  bool is_compute_root_stage);
53 
54 } // namespace Autoscheduler
55 } // namespace Internal
56 } // namespace Halide
57 
58 #endif // TILING_H
Halide::Internal::Autoscheduler::equal_to_existing_size
bool equal_to_existing_size(const std::vector< int64_t > &s, const std::vector< int64_t > &nums)
Halide::Internal::Autoscheduler::all_ones
bool all_ones(const std::vector< int64_t > &nums)
Halide
This file defines the class FunctionDAG, which is our representation of a Halide pipeline,...
Definition: AbstractGenerator.h:19
Halide::LinkageType::Internal
@ Internal
Not visible externally, similar to 'static' linkage in C.
Halide::Internal::Autoscheduler::generate_tilings
std::vector< std::vector< int64_t > > generate_tilings(const vector< int64_t > &s, int d, int factor, bool allow_splits)
Halide::Internal::Autoscheduler::generate_serial_tilings
std::vector< std::vector< int64_t > > generate_serial_tilings(const std::vector< int64_t > &s, int d, int last_d, int vectorized_index, const std::vector< int > &vec_dim_serial_sizes, bool filter_small_outer_extents=false, bool allow_inner_ones=false)
Halide::Internal::Autoscheduler::generate_gpu_tilings
std::vector< std::vector< int64_t > > generate_gpu_tilings(const std::vector< std::vector< int64_t >> &stage_sizes, const std::vector< std::vector< int >> &pure_dims, const std::vector< int64_t > &max_s, int d, const std::vector< int > &vectorized_indices, bool serial_inner, bool is_compute_root_stage)
Halide::Internal::Autoscheduler::lowered_dims
void lowered_dims(const std::vector< int64_t > &size, int vector_loop_i, std::vector< int64_t > &lowered_size)
moves vectorized dimension first and also removes dimensions with size 1 to reflect actual thread dim...