Halide 19.0.0
Halide compiler and libraries
Loading...
Searching...
No Matches
Tiling.h
Go to the documentation of this file.
1#ifndef TILING_H
2#define TILING_H
3
4#include <cstdint>
5#include <vector>
6
7namespace Halide {
8namespace Internal {
9namespace Autoscheduler {
10
// NOTE(review): presumably returns true when every entry of 'nums' equals 1.
// Only the declaration is visible here -- confirm against the definition.
11bool all_ones(const std::vector<int64_t> &nums);
12
// NOTE(review): presumably reports whether the candidate sizes in 'nums' match
// the existing sizes in 's'. Only the declaration is visible here -- confirm
// the exact comparison against the definition.
13bool equal_to_existing_size(const std::vector<int64_t> &s,
14 const std::vector<int64_t> &nums);
15
// Returns a list of int64_t vectors; 'filter_small_outer_extents' and
// 'allow_inner_ones' both default to false.
// NOTE(review): each returned vector is presumably one candidate serial tiling
// of the box 's', by analogy with generate_tilings in this header. The meaning
// of 'd', 'last_d', 'vectorized_index' and 'vec_dim_serial_sizes' is not
// documented here -- confirm against the definition.
16std::vector<std::vector<int64_t>> generate_serial_tilings(const std::vector<int64_t> &s,
17 int d,
18 int last_d,
19 int vectorized_index,
20 const std::vector<int> &vec_dim_serial_sizes,
21 bool filter_small_outer_extents = false,
22 bool allow_inner_ones = false);
23
24// Given a multi-dimensional box of dimensionality d, generate a list
25// of candidate tile sizes for it, logarithmically spacing the sizes
26// using the given factor. If 'allow_splits' is false, every dimension
27// must either be one, or the full extent of the box. This function is
28// used to generate candidate tilings when tiling for
29// producer-consumer fusion, or tiling for parallelism.
30// 'inner_sizes' is an optional vector of fixed sizes to choose from for the inner loop (it defaults to an empty vector).
31// It is used for GPU schedules when we split a 'none' loop into a parallel loop and a serial loop.
32std::vector<std::vector<int64_t>> generate_tilings(const std::vector<int64_t> &s,
33 int d,
34 int factor,
35 bool allow_splits,
36 const std::vector<int> &inner_sizes = std::vector<int>());
37
38/** Moves the vectorized dimension first and also removes dimensions with size 1,
39 to reflect the actual thread dimensions when loop nests are lowered.
   NOTE(review): the result is presumably written to 'lowered_size', the only
   non-const parameter -- confirm against the definition. **/
40void lowered_dims(const std::vector<int64_t> &size,
41 int vector_loop_i,
42 std::vector<int64_t> &lowered_size);
43
44// Creates tilings for GPU thread loops.
45// The innermost thread loop is always the vectorized dim and its extent is a multiple of 32.
46// Other loop extents are sized to be powers of 2 such that the total extent is < 1024.
47// Called either when we are creating a parallel -> (blocks, threads) loop when computing at root,
48// OR when we are creating a none -> (threads, SIMD) loop when computing at a serial loop.
49// serial_inner is true when we're generating (thread, serial) tilings, and false when generating (block, thread) tilings.
50// max_s holds the max gpu_thread counts of all siblings in each dimension. It is used to make sure the union of
51// thread counts is under the 1024 threshold.
52std::vector<std::vector<int64_t>> generate_gpu_tilings(const std::vector<std::vector<int64_t>> &stage_sizes,
53 const std::vector<std::vector<int>> &pure_dims,
54 const std::vector<int64_t> &max_s,
55 int d,
56 const std::vector<int> &vectorized_indices,
57 bool serial_inner,
58 bool is_compute_root_stage);
59
60} // namespace Autoscheduler
61} // namespace Internal
62} // namespace Halide
63
64#endif // TILING_H
bool all_ones(const std::vector< int64_t > &nums)
bool equal_to_existing_size(const std::vector< int64_t > &s, const std::vector< int64_t > &nums)
void lowered_dims(const std::vector< int64_t > &size, int vector_loop_i, std::vector< int64_t > &lowered_size)
moves vectorized dimension first and also removes dimensions with size 1 to reflect actual thread dim...
std::vector< std::vector< int64_t > > generate_gpu_tilings(const std::vector< std::vector< int64_t > > &stage_sizes, const std::vector< std::vector< int > > &pure_dims, const std::vector< int64_t > &max_s, int d, const std::vector< int > &vectorized_indices, bool serial_inner, bool is_compute_root_stage)
std::vector< std::vector< int64_t > > generate_tilings(const vector< int64_t > &s, int d, int factor, bool allow_splits)
std::vector< std::vector< int64_t > > generate_serial_tilings(const std::vector< int64_t > &s, int d, int last_d, int vectorized_index, const std::vector< int > &vec_dim_serial_sizes, bool filter_small_outer_extents=false, bool allow_inner_ones=false)
This file defines the class FunctionDAG, which is our representation of a Halide pipeline,...
@ Internal
Not visible externally, similar to 'static' linkage in C.