Halide
tutorial/lesson_05_scheduling_1.cpp
// Halide tutorial lesson 5: Vectorize, parallelize, unroll and tile your code
// This lesson demonstrates how to manipulate the order in which you
// evaluate pixels in a Func, including vectorization,
// parallelization, unrolling, and tiling.
// On linux, you can compile and run it like so:
// g++ lesson_05*.cpp -g -I <path/to/Halide.h> -L <path/to/libHalide.so> -lHalide -lpthread -ldl -o lesson_05 -std=c++17
// LD_LIBRARY_PATH=<path/to/libHalide.so> ./lesson_05
// On os x:
// g++ lesson_05*.cpp -g -I <path/to/Halide.h> -L <path/to/libHalide.so> -lHalide -o lesson_05 -std=c++17
// DYLD_LIBRARY_PATH=<path/to/libHalide.dylib> ./lesson_05
// If you have the entire Halide source tree, you can also build it by
// running:
// make tutorial_lesson_05_scheduling_1
// in a shell with the current directory at the top of the halide
// source tree.
#include "Halide.h"
#include <algorithm>
#include <stdio.h>
using namespace Halide;
int main(int argc, char **argv) {
// We're going to define and schedule our gradient function in
// several different ways, and see what order pixels are computed
// in.
Var x("x"), y("y");
// First we observe the default ordering.
{
Func gradient("gradient");
gradient(x, y) = x + y;
gradient.trace_stores();
// By default we walk along the rows and then down the
// columns. This means x varies quickly, and y varies
// slowly. x is the column and y is the row, so this is a
// row-major traversal.
printf("Evaluating gradient row-major\n");
Buffer<int> output = gradient.realize({4, 4});
// See figures/lesson_05_row_major.gif for a visualization of
// what this did.
// The equivalent C is:
printf("Equivalent C:\n");
for (int y = 0; y < 4; y++) {
for (int x = 0; x < 4; x++) {
printf("Evaluating at x = %d, y = %d: %d\n", x, y, x + y);
}
}
printf("\n\n");
// Tracing is one useful way to understand what a schedule is
// doing. You can also ask Halide to print out pseudocode
// showing what loops Halide is generating:
printf("Pseudo-code for the schedule:\n");
gradient.print_loop_nest();
printf("\n");
// Because we're using the default ordering, it should print:
// compute gradient:
// for y:
// for x:
// gradient(...) = ...
}
// Reorder variables.
{
Func gradient("gradient_col_major");
gradient(x, y) = x + y;
gradient.trace_stores();
// If we reorder x and y, we can walk down the columns
// instead. The reorder call takes the arguments of the func,
// and sets a new nesting order for the for loops that are
// generated. The arguments are specified from the innermost
// loop out, so the following call puts y in the inner loop:
gradient.reorder(y, x);
// This means y (the row) will vary quickly, and x (the
// column) will vary slowly, so this is a column-major
// traversal.
printf("Evaluating gradient column-major\n");
Buffer<int> output = gradient.realize({4, 4});
// See figures/lesson_05_col_major.gif for a visualization of
// what this did.
printf("Equivalent C:\n");
for (int x = 0; x < 4; x++) {
for (int y = 0; y < 4; y++) {
printf("Evaluating at x = %d, y = %d: %d\n", x, y, x + y);
}
}
printf("\n");
// If we print pseudo-code for this schedule, we'll see that
// the loop over y is now inside the loop over x.
printf("Pseudo-code for the schedule:\n");
gradient.print_loop_nest();
printf("\n");
}
// Split a variable into two.
{
Func gradient("gradient_split");
gradient(x, y) = x + y;
gradient.trace_stores();
// The most powerful primitive scheduling operation you can do
// to a var is to split it into inner and outer sub-variables:
Var x_outer, x_inner;
gradient.split(x, x_outer, x_inner, 2);
// This breaks the loop over x into two nested loops: an outer
// one over x_outer, and an inner one over x_inner. The last
// argument to split was the "split factor". The inner loop
// runs from zero to the split factor. The outer loop runs
// from zero to the extent required of x (4 in this case)
// divided by the split factor. Within the loops, the old
// variable is defined to be outer * factor + inner. If the
// old loop started at a value other than zero, then that is
// also added within the loops.
printf("Evaluating gradient with x split into x_outer and x_inner \n");
Buffer<int> output = gradient.realize({4, 4});
printf("Equivalent C:\n");
for (int y = 0; y < 4; y++) {
for (int x_outer = 0; x_outer < 2; x_outer++) {
for (int x_inner = 0; x_inner < 2; x_inner++) {
int x = x_outer * 2 + x_inner;
printf("Evaluating at x = %d, y = %d: %d\n", x, y, x + y);
}
}
}
printf("\n");
printf("Pseudo-code for the schedule:\n");
gradient.print_loop_nest();
printf("\n");
// Note that the order of evaluation of pixels didn't actually
// change! Splitting by itself does nothing, but it does open
// up all of the scheduling possibilities that we will explore
// below.
}
// Fuse two variables into one.
{
Func gradient("gradient_fused");
gradient(x, y) = x + y;
// The opposite of splitting is 'fusing'. Fusing two variables
// merges the two loops into a single for loop over the
// product of the extents. Fusing is less important than
// splitting, but it also sees use (as we'll see later in this
// lesson). Like splitting, fusing by itself doesn't change
// the order of evaluation.
Var fused;
gradient.fuse(x, y, fused);
printf("Evaluating gradient with x and y fused\n");
Buffer<int> output = gradient.realize({4, 4});
printf("Equivalent C:\n");
for (int fused = 0; fused < 4 * 4; fused++) {
int y = fused / 4;
int x = fused % 4;
printf("Evaluating at x = %d, y = %d: %d\n", x, y, x + y);
}
printf("\n");
printf("Pseudo-code for the schedule:\n");
gradient.print_loop_nest();
printf("\n");
}
// Evaluating in tiles.
{
Func gradient("gradient_tiled");
gradient(x, y) = x + y;
gradient.trace_stores();
// Now that we can both split and reorder, we can do tiled
// evaluation. Let's split both x and y by a factor of four,
// and then reorder the vars to express a tiled traversal.
//
// A tiled traversal splits the domain into small rectangular
// tiles, and outermost iterates over the tiles, and within
// that iterates over the points within each tile. It can be
// good for performance if neighboring pixels use overlapping
// input data, for example in a blur. We can express a tiled
// traversal like so:
Var x_outer, x_inner, y_outer, y_inner;
gradient.split(x, x_outer, x_inner, 4);
gradient.split(y, y_outer, y_inner, 4);
gradient.reorder(x_inner, y_inner, x_outer, y_outer);
// This pattern is common enough that there's a shorthand for it:
// gradient.tile(x, y, x_outer, y_outer, x_inner, y_inner, 4, 4);
printf("Evaluating gradient in 4x4 tiles\n");
Buffer<int> output = gradient.realize({8, 8});
// See figures/lesson_05_tiled.gif for a visualization of this
// schedule.
printf("Equivalent C:\n");
for (int y_outer = 0; y_outer < 2; y_outer++) {
for (int x_outer = 0; x_outer < 2; x_outer++) {
for (int y_inner = 0; y_inner < 4; y_inner++) {
for (int x_inner = 0; x_inner < 4; x_inner++) {
int x = x_outer * 4 + x_inner;
int y = y_outer * 4 + y_inner;
printf("Evaluating at x = %d, y = %d: %d\n", x, y, x + y);
}
}
}
}
printf("\n");
printf("Pseudo-code for the schedule:\n");
gradient.print_loop_nest();
printf("\n");
}
// Evaluating in vectors.
{
Func gradient("gradient_in_vectors");
gradient(x, y) = x + y;
gradient.trace_stores();
// The nice thing about splitting is that it guarantees the
// inner variable runs from zero to the split factor. Most of
// the time the split-factor will be a compile-time constant,
// so we can replace the loop over the inner variable with a
// single vectorized computation. This time we'll split by a
// factor of four, because on X86 we can use SSE to compute in
// 4-wide vectors.
Var x_outer, x_inner;
gradient.split(x, x_outer, x_inner, 4);
gradient.vectorize(x_inner);
// Splitting and then vectorizing the inner variable is common
// enough that there's a short-hand for it. We could have also
// said:
//
// gradient.vectorize(x, 4);
//
// which is equivalent to:
//
// gradient.split(x, x, x_inner, 4);
// gradient.vectorize(x_inner);
//
// Note that in this case we reused the name 'x' as the new
// outer variable. Later scheduling calls that refer to x
// will refer to this new outer variable named x.
// This time we'll evaluate over an 8x4 box, so that we have
// more than one vector of work per scanline.
printf("Evaluating gradient with x_inner vectorized \n");
Buffer<int> output = gradient.realize({8, 4});
// See figures/lesson_05_vectors.gif for a visualization.
printf("Equivalent C:\n");
for (int y = 0; y < 4; y++) {
for (int x_outer = 0; x_outer < 2; x_outer++) {
// The loop over x_inner has gone away, and has been
// replaced by a vectorized version of the
// expression. On x86 processors, Halide generates SSE
// for all of this.
int x_vec[] = {x_outer * 4 + 0,
x_outer * 4 + 1,
x_outer * 4 + 2,
x_outer * 4 + 3};
int val[] = {x_vec[0] + y,
x_vec[1] + y,
x_vec[2] + y,
x_vec[3] + y};
printf("Evaluating at <%d, %d, %d, %d>, <%d, %d, %d, %d>:"
" <%d, %d, %d, %d>\n",
x_vec[0], x_vec[1], x_vec[2], x_vec[3],
y, y, y, y,
val[0], val[1], val[2], val[3]);
}
}
printf("\n");
printf("Pseudo-code for the schedule:\n");
gradient.print_loop_nest();
printf("\n");
}
// Unrolling a loop.
{
Func gradient("gradient_unroll");
gradient(x, y) = x + y;
gradient.trace_stores();
// If multiple pixels share overlapping data, it can make
// sense to unroll a computation so that shared values are
// only computed or loaded once. We do this similarly to how
// we expressed vectorizing. We split a dimension and then
// fully unroll the loop of the inner variable. Unrolling
// doesn't change the order in which things are evaluated.
Var x_outer, x_inner;
gradient.split(x, x_outer, x_inner, 2);
gradient.unroll(x_inner);
// The shorthand for this is:
// gradient.unroll(x, 2);
printf("Evaluating gradient unrolled by a factor of two\n");
Buffer<int> result = gradient.realize({4, 4});
printf("Equivalent C:\n");
for (int y = 0; y < 4; y++) {
for (int x_outer = 0; x_outer < 2; x_outer++) {
// Instead of a for loop over x_inner, we get two
// copies of the innermost statement.
{
int x_inner = 0;
int x = x_outer * 2 + x_inner;
printf("Evaluating at x = %d, y = %d: %d\n", x, y, x + y);
}
{
int x_inner = 1;
int x = x_outer * 2 + x_inner;
printf("Evaluating at x = %d, y = %d: %d\n", x, y, x + y);
}
}
}
printf("\n");
printf("Pseudo-code for the schedule:\n");
gradient.print_loop_nest();
printf("\n");
}
// Splitting by factors that don't divide the extent.
{
Func gradient("gradient_split_7x2");
gradient(x, y) = x + y;
gradient.trace_stores();
// Splitting guarantees that the inner loop runs from zero to
// the split factor, which is important for the uses we saw
// above. So what happens when the total extent we wish to
// evaluate x over isn't a multiple of the split factor? We'll
// split by a factor three, and we'll evaluate gradient over a
// 7x2 box instead of the 4x4 box we've been using.
Var x_outer, x_inner;
gradient.split(x, x_outer, x_inner, 3);
printf("Evaluating gradient over a 7x2 box with x split by three \n");
Buffer<int> output = gradient.realize({7, 2});
// See figures/lesson_05_split_7_by_3.gif for a visualization
// of what happened. Note that some points get evaluated more
// than once!
printf("Equivalent C:\n");
for (int y = 0; y < 2; y++) {
for (int x_outer = 0; x_outer < 3; x_outer++) { // Now runs from 0 to 2
for (int x_inner = 0; x_inner < 3; x_inner++) {
int x = x_outer * 3;
// Before we add x_inner, make sure we don't
// evaluate points outside of the 7x2 box. We'll
// clamp x to be at most 4 (7 minus the split
// factor).
if (x > 4) x = 4;
x += x_inner;
printf("Evaluating at x = %d, y = %d: %d\n", x, y, x + y);
}
}
}
printf("\n");
printf("Pseudo-code for the schedule:\n");
gradient.print_loop_nest();
printf("\n");
// If you read the output, you'll see that some coordinates
// were evaluated more than once. That's generally OK, because
// pure Halide functions have no side-effects, so it's safe to
// evaluate the same point multiple times. If you're calling
// out to C functions like we are, it's your responsibility to
// make sure you can handle the same point being evaluated
// multiple times.
// The general rule is: If we require x from x_min to x_min + x_extent, and
// we split by a factor 'factor', then:
//
// x_outer runs from 0 to (x_extent + factor - 1)/factor
// x_inner runs from 0 to factor
// x = min(x_outer * factor, x_extent - factor) + x_inner + x_min
//
// In our example, x_min was 0, x_extent was 7, and factor was 3.
// However, if you write a Halide function with an update
// definition (see lesson 9), then it is not safe to evaluate
// the same point multiple times, so we won't apply this
// trick. Instead the range of values computed will be rounded
// up to the next multiple of the split factor.
}
// Fusing, tiling, and parallelizing.
{
// We saw in the previous lesson that we can parallelize
// across a variable. Here we combine it with fusing and
// tiling to express a useful pattern - processing tiles in
// parallel.
// This is where fusing shines. Fusing helps when you want to
// parallelize across multiple dimensions without introducing
// nested parallelism. Nested parallelism (parallel for loops
// within parallel for loops) is supported by Halide, but
// often gives poor performance compared to fusing the
// parallel variables into a single parallel for loop.
Func gradient("gradient_fused_tiles");
gradient(x, y) = x + y;
gradient.trace_stores();
// First we'll tile, then we'll fuse the tile indices and
// parallelize across the combination.
Var x_outer, y_outer, x_inner, y_inner, tile_index;
gradient.tile(x, y, x_outer, y_outer, x_inner, y_inner, 4, 4);
gradient.fuse(x_outer, y_outer, tile_index);
gradient.parallel(tile_index);
// The scheduling calls all return a reference to the Func, so
// you can also chain them together into a single statement to
// make things slightly clearer:
//
// gradient
// .tile(x, y, x_outer, y_outer, x_inner, y_inner, 4, 4)
// .fuse(x_outer, y_outer, tile_index)
// .parallel(tile_index);
printf("Evaluating gradient tiles in parallel\n");
Buffer<int> output = gradient.realize({8, 8});
// The tiles should occur in arbitrary order, but within each
// tile the pixels will be traversed in row-major order. See
// figures/lesson_05_parallel_tiles.gif for a visualization.
printf("Equivalent (serial) C:\n");
// This outermost loop should be a parallel for loop, but that's hard in C.
for (int tile_index = 0; tile_index < 4; tile_index++) {
int y_outer = tile_index / 2;
int x_outer = tile_index % 2;
for (int y_inner = 0; y_inner < 4; y_inner++) {
for (int x_inner = 0; x_inner < 4; x_inner++) {
int y = y_outer * 4 + y_inner;
int x = x_outer * 4 + x_inner;
printf("Evaluating at x = %d, y = %d: %d\n", x, y, x + y);
}
}
}
printf("\n");
printf("Pseudo-code for the schedule:\n");
gradient.print_loop_nest();
printf("\n");
}
// Putting it all together.
{
// Are you ready? We're going to use all of the features above now.
Func gradient_fast("gradient_fast");
gradient_fast(x, y) = x + y;
// We'll process 64x64 tiles in parallel.
Var x_outer, y_outer, x_inner, y_inner, tile_index;
gradient_fast
.tile(x, y, x_outer, y_outer, x_inner, y_inner, 64, 64)
.fuse(x_outer, y_outer, tile_index)
.parallel(tile_index);
// We'll compute two scanlines at once while we walk across
// each tile. We'll also vectorize in x. The easiest way to
// express this is to recursively tile again within each tile
// into 4x2 subtiles, then vectorize the subtiles across x and
// unroll them across y:
Var x_inner_outer, y_inner_outer, x_vectors, y_pairs;
gradient_fast
.tile(x_inner, y_inner, x_inner_outer, y_inner_outer, x_vectors, y_pairs, 4, 2)
.vectorize(x_vectors)
.unroll(y_pairs);
// Note that we didn't do any explicit splitting or
// reordering. Those are the most important primitive
// operations, but mostly they are buried underneath tiling,
// vectorizing, or unrolling calls.
// Now let's evaluate this over a range which is not a
// multiple of the tile size.
// If you like you can turn on tracing, but it's going to
// produce a lot of printfs. Instead we'll compute the answer
// both in C and Halide and see if the answers match.
Buffer<int> result = gradient_fast.realize({350, 250});
// See figures/lesson_05_fast.mp4 for a visualization.
printf("Checking Halide result against equivalent C...\n");
for (int tile_index = 0; tile_index < 6 * 4; tile_index++) {
int y_outer = tile_index / 4;
int x_outer = tile_index % 4;
for (int y_inner_outer = 0; y_inner_outer < 64 / 2; y_inner_outer++) {
for (int x_inner_outer = 0; x_inner_outer < 64 / 4; x_inner_outer++) {
// We're vectorized across x
int x = std::min(x_outer * 64, 350 - 64) + x_inner_outer * 4;
int x_vec[4] = {x + 0,
x + 1,
x + 2,
x + 3};
// And we unrolled across y
int y_base = std::min(y_outer * 64, 250 - 64) + y_inner_outer * 2;
{
// y_pairs = 0
int y = y_base + 0;
int y_vec[4] = {y, y, y, y};
int val[4] = {x_vec[0] + y_vec[0],
x_vec[1] + y_vec[1],
x_vec[2] + y_vec[2],
x_vec[3] + y_vec[3]};
// Check the result.
for (int i = 0; i < 4; i++) {
if (result(x_vec[i], y_vec[i]) != val[i]) {
printf("There was an error at %d %d!\n",
x_vec[i], y_vec[i]);
return -1;
}
}
}
{
// y_pairs = 1
int y = y_base + 1;
int y_vec[4] = {y, y, y, y};
int val[4] = {x_vec[0] + y_vec[0],
x_vec[1] + y_vec[1],
x_vec[2] + y_vec[2],
x_vec[3] + y_vec[3]};
// Check the result.
for (int i = 0; i < 4; i++) {
if (result(x_vec[i], y_vec[i]) != val[i]) {
printf("There was an error at %d %d!\n",
x_vec[i], y_vec[i]);
return -1;
}
}
}
}
}
}
printf("\n");
printf("Pseudo-code for the schedule:\n");
gradient_fast.print_loop_nest();
printf("\n");
// Note that in the Halide version, the algorithm is specified
// once at the top, separately from the optimizations, and there
// aren't that many lines of code total. Compare this to the C
// version. There's more code (and it isn't even parallelized or
// vectorized properly). More annoyingly, the statement of the
// algorithm (the result is x plus y) is buried in multiple places
// within the mess. This C code is hard to write, hard to read,
// hard to debug, and hard to optimize further. This is why Halide
// exists.
}
printf("Success!\n");
return 0;
}
Halide::Var
A Halide variable, to be used when defining functions.
Definition: Var.h:19
Halide::Func::reorder
Func & reorder(const std::vector< VarOrRVar > &vars)
Reorder variables to have the given nesting order, from innermost out.
Halide::min
Expr min(const FuncRef &a, const FuncRef &b)
Explicit overloads of min and max for FuncRef.
Definition: Func.h:584
Halide::Func::vectorize
Func & vectorize(const VarOrRVar &var)
Mark a dimension to be computed all-at-once as a single vector.
Halide::Func::fuse
Func & fuse(const VarOrRVar &inner, const VarOrRVar &outer, const VarOrRVar &fused)
Join two dimensions into a single fused dimension.
Halide::Func::realize
Realization realize(std::vector< int32_t > sizes={}, const Target &target=Target(), const ParamMap &param_map=ParamMap::empty_map())
Evaluate this function over some rectangular domain and return the resulting buffer or buffers.
Halide
This file defines the class FunctionDAG, which is our representation of a Halide pipeline,...
Definition: AbstractGenerator.h:19
Halide::Buffer
A Halide::Buffer is a named shared reference to a Halide::Runtime::Buffer.
Definition: Argument.h:16
Halide::Func::parallel
Func & parallel(const VarOrRVar &var)
Mark a dimension to be traversed in parallel.
Halide::Func::tile
Func & tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &xo, const VarOrRVar &yo, const VarOrRVar &xi, const VarOrRVar &yi, const Expr &xfactor, const Expr &yfactor, TailStrategy tail=TailStrategy::Auto)
Split two dimensions at once by the given factors, and then reorder the resulting dimensions to be xi...
Halide::Func
A halide function.
Definition: Func.h:687
Halide::Func::unroll
Func & unroll(const VarOrRVar &var)
Mark a dimension to be completely unrolled.
Halide::Func::trace_stores
Func & trace_stores()
Trace all stores to the buffer backing this Func by emitting calls to halide_trace.
Halide::Func::print_loop_nest
void print_loop_nest()
Write out the loop nests specified by the schedule for this Function.
Halide::Func::split
Func & split(const VarOrRVar &old, const VarOrRVar &outer, const VarOrRVar &inner, const Expr &factor, TailStrategy tail=TailStrategy::Auto)
Split a dimension into inner and outer subdimensions with the given names, where the inner dimension ...