// Halide tutorial lesson 5: Vectorize, parallelize, unroll and tile your code// This lesson demonstrates how to manipulate the order in which you// evaluate pixels in a Func, including vectorization,// parallelization, unrolling, and tiling.// On linux, you can compile and run it like so:// g++ lesson_05*.cpp -g -I <path/to/Halide.h> -L <path/to/libHalide.so> -lHalide -lpthread -ldl -o lesson_05 -std=c++17// LD_LIBRARY_PATH=<path/to/libHalide.so> ./lesson_05// On os x:// g++ lesson_05*.cpp -g -I <path/to/Halide.h> -L <path/to/libHalide.so> -lHalide -o lesson_05 -std=c++17// DYLD_LIBRARY_PATH=<path/to/libHalide.dylib> ./lesson_05// If you have the entire Halide source tree, you can also build it by// running:// make tutorial_lesson_05_scheduling_1// in a shell with the current directory at the top of the halide// source tree.#include"Halide.h"#include <algorithm>#include <stdio.h>using namespace Halide;intmain(int argc,char**argv) {// We're going to define and schedule our gradient function in// several different ways, and see what order pixels are computed// in.
Var x("x"),y("y");// First we observe the default ordering.{
Func gradient("gradient");gradient(x, y) = x + y;
gradient.trace_stores();// By default we walk along the rows and then down the// columns. This means x varies quickly, and y varies// slowly. x is the column and y is the row, so this is a// row-major traversal.printf("Evaluating gradient row-major\n");
Buffer<int> output = gradient.realize({4,4});// Click to show output ...
> Begin pipeline gradient.0()
> Tag gradient.0() tag = "func_type_and_dim: 1 0 32 1 2 0 4 0 4"
> Store gradient.0(0, 0) = 0
> Store gradient.0(1, 0) = 1
> Store gradient.0(2, 0) = 2
> Store gradient.0(3, 0) = 3
> Store gradient.0(0, 1) = 1
> Store gradient.0(1, 1) = 2
> Store gradient.0(2, 1) = 3
> Store gradient.0(3, 1) = 4
> Store gradient.0(0, 2) = 2
> Store gradient.0(1, 2) = 3
> Store gradient.0(2, 2) = 4
> Store gradient.0(3, 2) = 5
> Store gradient.0(0, 3) = 3
> Store gradient.0(1, 3) = 4
> Store gradient.0(2, 3) = 5
> Store gradient.0(3, 3) = 6
> End pipeline gradient.0()
// See below for a visualization of// what this did.// The equivalent C is:printf("Equivalent C:\n");for(int y =0; y <4; y++) {for(int x =0; x <4; x++) {printf("Evaluating at x = %d, y = %d: %d\n", x, y, x + y);}}printf("\n\n");// Tracing is one useful way to understand what a schedule is// doing. You can also ask Halide to print out pseudocode// showing what loops Halide is generating:printf("Pseudo-code for the schedule:\n");
gradient.print_loop_nest();printf("\n");// Click to show output ...
> produce gradient:
> for y:
> for x:
> gradient(...) = ...
// Because we're using the default ordering, it should print:// compute gradient:// for y:// for x:// gradient(...) = ...}// Reorder variables.{
Func gradient("gradient_col_major");gradient(x, y) = x + y;
gradient.trace_stores();// If we reorder x and y, we can walk down the columns// instead. The reorder call takes the arguments of the func,// and sets a new nesting order for the for loops that are// generated. The arguments are specified from the innermost// loop out, so the following call puts y in the inner loop:
gradient.reorder(y, x);// This means y (the row) will vary quickly, and x (the// column) will vary slowly, so this is a column-major// traversal.printf("Evaluating gradient column-major\n");
Buffer<int> output = gradient.realize({4,4});// Click to show output ...
> Begin pipeline gradient_col_major.0()
> Tag gradient_col_major.0() tag = "func_type_and_dim: 1 0 32 1 2 0 4 0 4"
> Store gradient_col_major.0(0, 0) = 0
> Store gradient_col_major.0(0, 1) = 1
> Store gradient_col_major.0(0, 2) = 2
> Store gradient_col_major.0(0, 3) = 3
> Store gradient_col_major.0(1, 0) = 1
> Store gradient_col_major.0(1, 1) = 2
> Store gradient_col_major.0(1, 2) = 3
> Store gradient_col_major.0(1, 3) = 4
> Store gradient_col_major.0(2, 0) = 2
> Store gradient_col_major.0(2, 1) = 3
> Store gradient_col_major.0(2, 2) = 4
> Store gradient_col_major.0(2, 3) = 5
> Store gradient_col_major.0(3, 0) = 3
> Store gradient_col_major.0(3, 1) = 4
> Store gradient_col_major.0(3, 2) = 5
> Store gradient_col_major.0(3, 3) = 6
> End pipeline gradient_col_major.0()
// See below for a visualization of// what this did.printf("Equivalent C:\n");for(int x =0; x <4; x++) {for(int y =0; y <4; y++) {printf("Evaluating at x = %d, y = %d: %d\n", x, y, x + y);}}printf("\n");// If we print pseudo-code for this schedule, we'll see that// the loop over y is now inside the loop over x.printf("Pseudo-code for the schedule:\n");
gradient.print_loop_nest();printf("\n");// Click to show output ...
> produce gradient_col_major:
> for x:
> for y:
> gradient_col_major(...) = ...
}// Split a variable into two.{
Func gradient("gradient_split");gradient(x, y) = x + y;
gradient.trace_stores();// The most powerful primitive scheduling operation you can do// to a var is to split it into inner and outer sub-variables:
Var x_outer, x_inner;
gradient.split(x, x_outer, x_inner,2);// This breaks the loop over x into two nested loops: an outer// one over x_outer, and an inner one over x_inner. The last// argument to split was the "split factor". The inner loop// runs from zero to the split factor. The outer loop runs// from zero to the extent required of x (4 in this case)// divided by the split factor. Within the loops, the old// variable is defined to be outer * factor + inner. If the// old loop started at a value other than zero, then that is// also added within the loops.printf("Evaluating gradient with x split into x_outer and x_inner\n");
Buffer<int> output = gradient.realize({4,4});// Click to show output ...
> Begin pipeline gradient_split.0()
> Tag gradient_split.0() tag = "func_type_and_dim: 1 0 32 1 2 0 4 0 4"
> Store gradient_split.0(0, 0) = 0
> Store gradient_split.0(1, 0) = 1
> Store gradient_split.0(2, 0) = 2
> Store gradient_split.0(3, 0) = 3
> Store gradient_split.0(0, 1) = 1
> Store gradient_split.0(1, 1) = 2
> Store gradient_split.0(2, 1) = 3
> Store gradient_split.0(3, 1) = 4
> Store gradient_split.0(0, 2) = 2
> Store gradient_split.0(1, 2) = 3
> Store gradient_split.0(2, 2) = 4
> Store gradient_split.0(3, 2) = 5
> Store gradient_split.0(0, 3) = 3
> Store gradient_split.0(1, 3) = 4
> Store gradient_split.0(2, 3) = 5
> Store gradient_split.0(3, 3) = 6
> End pipeline gradient_split.0()
printf("Equivalent C:\n");for(int y =0; y <4; y++) {for(int x_outer =0; x_outer <2; x_outer++) {for(int x_inner =0; x_inner <2; x_inner++) {int x = x_outer *2+ x_inner;printf("Evaluating at x = %d, y = %d: %d\n", x, y, x + y);}}}printf("\n");printf("Pseudo-code for the schedule:\n");
gradient.print_loop_nest();printf("\n");// Click to show output ...
> produce gradient_split:
> for y:
> for x.x_outer:
> for x.x_inner in [0, 1]:
> gradient_split(...) = ...
// Note that the order of evaluation of pixels didn't actually// change! Splitting by itself does nothing, but it does open// up all of the scheduling possibilities that we will explore// below.}// Fuse two variables into one.{
Func gradient("gradient_fused");gradient(x, y) = x + y;// The opposite of splitting is 'fusing'. Fusing two variables// merges the two loops into a single for loop over the// product of the extents. Fusing is less important than// splitting, but it also sees use (as we'll see later in this// lesson). Like splitting, fusing by itself doesn't change// the order of evaluation.
Var fused;
gradient.fuse(x, y, fused);printf("Evaluating gradient with x and y fused\n");
Buffer<int> output = gradient.realize({4,4});printf("Equivalent C:\n");for(int fused =0; fused <4*4; fused++) {int y = fused /4;int x = fused %4;printf("Evaluating at x = %d, y = %d: %d\n", x, y, x + y);}printf("\n");printf("Pseudo-code for the schedule:\n");
gradient.print_loop_nest();printf("\n");// Click to show output ...
> produce gradient_fused:
> for x.fused:
> gradient_fused(...) = ...
}// Evaluating in tiles.{
Func gradient("gradient_tiled");gradient(x, y) = x + y;
gradient.trace_stores();// Now that we can both split and reorder, we can do tiled// evaluation. Let's split both x and y by a factor of four,// and then reorder the vars to express a tiled traversal.//// A tiled traversal splits the domain into small rectangular// tiles, and outermost iterates over the tiles, and within// that iterates over the points within each tile. It can be// good for performance if neighboring pixels use overlapping// input data, for example in a blur. We can express a tiled// traversal like so:
Var x_outer, x_inner, y_outer, y_inner;
gradient.split(x, x_outer, x_inner,4);
gradient.split(y, y_outer, y_inner,4);
gradient.reorder(x_inner, y_inner, x_outer, y_outer);// This pattern is common enough that there's a shorthand for it:// gradient.tile(x, y, x_outer, y_outer, x_inner, y_inner, 4, 4);printf("Evaluating gradient in 4x4 tiles\n");
Buffer<int> output = gradient.realize({8,8});// Click to show output ...
> Begin pipeline gradient_tiled.0()
> Tag gradient_tiled.0() tag = "func_type_and_dim: 1 0 32 1 2 0 8 0 8"
> Store gradient_tiled.0(0, 0) = 0
> Store gradient_tiled.0(1, 0) = 1
> Store gradient_tiled.0(2, 0) = 2
> Store gradient_tiled.0(3, 0) = 3
> Store gradient_tiled.0(0, 1) = 1
> Store gradient_tiled.0(1, 1) = 2
> Store gradient_tiled.0(2, 1) = 3
> Store gradient_tiled.0(3, 1) = 4
> Store gradient_tiled.0(0, 2) = 2
> Store gradient_tiled.0(1, 2) = 3
> Store gradient_tiled.0(2, 2) = 4
> Store gradient_tiled.0(3, 2) = 5
> Store gradient_tiled.0(0, 3) = 3
> Store gradient_tiled.0(1, 3) = 4
> Store gradient_tiled.0(2, 3) = 5
> Store gradient_tiled.0(3, 3) = 6
> Store gradient_tiled.0(4, 0) = 4
> Store gradient_tiled.0(5, 0) = 5
> Store gradient_tiled.0(6, 0) = 6
> Store gradient_tiled.0(7, 0) = 7
> Store gradient_tiled.0(4, 1) = 5
> Store gradient_tiled.0(5, 1) = 6
> Store gradient_tiled.0(6, 1) = 7
> Store gradient_tiled.0(7, 1) = 8
> Store gradient_tiled.0(4, 2) = 6
> Store gradient_tiled.0(5, 2) = 7
> Store gradient_tiled.0(6, 2) = 8
> Store gradient_tiled.0(7, 2) = 9
> Store gradient_tiled.0(4, 3) = 7
> Store gradient_tiled.0(5, 3) = 8
> Store gradient_tiled.0(6, 3) = 9
> Store gradient_tiled.0(7, 3) = 10
> Store gradient_tiled.0(0, 4) = 4
> Store gradient_tiled.0(1, 4) = 5
> Store gradient_tiled.0(2, 4) = 6
> Store gradient_tiled.0(3, 4) = 7
> Store gradient_tiled.0(0, 5) = 5
> Store gradient_tiled.0(1, 5) = 6
> Store gradient_tiled.0(2, 5) = 7
> Store gradient_tiled.0(3, 5) = 8
> Store gradient_tiled.0(0, 6) = 6
> Store gradient_tiled.0(1, 6) = 7
> Store gradient_tiled.0(2, 6) = 8
> Store gradient_tiled.0(3, 6) = 9
> Store gradient_tiled.0(0, 7) = 7
> Store gradient_tiled.0(1, 7) = 8
> Store gradient_tiled.0(2, 7) = 9
> Store gradient_tiled.0(3, 7) = 10
> Store gradient_tiled.0(4, 4) = 8
> Store gradient_tiled.0(5, 4) = 9
> Store gradient_tiled.0(6, 4) = 10
> Store gradient_tiled.0(7, 4) = 11
> Store gradient_tiled.0(4, 5) = 9
> Store gradient_tiled.0(5, 5) = 10
> Store gradient_tiled.0(6, 5) = 11
> Store gradient_tiled.0(7, 5) = 12
> Store gradient_tiled.0(4, 6) = 10
> Store gradient_tiled.0(5, 6) = 11
> Store gradient_tiled.0(6, 6) = 12
> Store gradient_tiled.0(7, 6) = 13
> Store gradient_tiled.0(4, 7) = 11
> Store gradient_tiled.0(5, 7) = 12
> Store gradient_tiled.0(6, 7) = 13
> Store gradient_tiled.0(7, 7) = 14
> End pipeline gradient_tiled.0()
// See below for a visualization of this// schedule.printf("Equivalent C:\n");for(int y_outer =0; y_outer <2; y_outer++) {for(int x_outer =0; x_outer <2; x_outer++) {for(int y_inner =0; y_inner <4; y_inner++) {for(int x_inner =0; x_inner <4; x_inner++) {int x = x_outer *4+ x_inner;int y = y_outer *4+ y_inner;printf("Evaluating at x = %d, y = %d: %d\n", x, y, x + y);}}}}printf("\n");printf("Pseudo-code for the schedule:\n");
gradient.print_loop_nest();printf("\n");// Click to show output ...
> produce gradient_tiled:
> for y.y_outer:
> for x.x_outer:
> for y.y_inner in [0, 3]:
> for x.x_inner in [0, 3]:
> gradient_tiled(...) = ...
}// Evaluating in vectors.{
Func gradient("gradient_in_vectors");gradient(x, y) = x + y;
gradient.trace_stores();// The nice thing about splitting is that it guarantees the// inner variable runs from zero to the split factor. Most of// the time the split-factor will be a compile-time constant,// so we can replace the loop over the inner variable with a// single vectorized computation. This time we'll split by a// factor of four, because on X86 we can use SSE to compute in// 4-wide vectors.
Var x_outer, x_inner;
gradient.split(x, x_outer, x_inner,4);
gradient.vectorize(x_inner);// Splitting and then vectorizing the inner variable is common// enough that there's a short-hand for it. We could have also// said://// gradient.vectorize(x, 4);//// which is equivalent to://// gradient.split(x, x, x_inner, 4);// gradient.vectorize(x_inner);//// Note that in this case we reused the name 'x' as the new// outer variable. Later scheduling calls that refer to x// will refer to this new outer variable named x.// This time we'll evaluate over an 8x4 box, so that we have// more than one vector of work per scanline.printf("Evaluating gradient with x_inner vectorized\n");
Buffer<int> output = gradient.realize({8,4});// Click to show output ...
// See below for a visualization.printf("Equivalent C:\n");for(int y =0; y <4; y++) {for(int x_outer =0; x_outer <2; x_outer++) {// The loop over x_inner has gone away, and has been// replaced by a vectorized version of the// expression. On x86 processors, Halide generates SSE// for all of this.int x_vec[] = {x_outer *4+0,
x_outer *4+1,
x_outer *4+2,
x_outer *4+3};int val[] = {x_vec[0] + y,
x_vec[1] + y,
x_vec[2] + y,
x_vec[3] + y};printf("Evaluating at <%d, %d, %d, %d>, <%d, %d, %d, %d>:"" <%d, %d, %d, %d>\n",
x_vec[0], x_vec[1], x_vec[2], x_vec[3],
y, y, y, y,
val[0], val[1], val[2], val[3]);}}printf("\n");printf("Pseudo-code for the schedule:\n");
gradient.print_loop_nest();printf("\n");// Click to show output ...
> produce gradient_in_vectors:
> for y:
> for x.x_outer:
> vectorized x.x_inner in [0, 3]:
> gradient_in_vectors(...) = ...
}// Unrolling a loop.{
Func gradient("gradient_unroll");gradient(x, y) = x + y;
gradient.trace_stores();// If multiple pixels share overlapping data, it can make// sense to unroll a computation so that shared values are// only computed or loaded once. We do this similarly to how// we expressed vectorizing. We split a dimension and then// fully unroll the loop of the inner variable. Unrolling// doesn't change the order in which things are evaluated.
Var x_outer, x_inner;
gradient.split(x, x_outer, x_inner,2);
gradient.unroll(x_inner);// The shorthand for this is:// gradient.unroll(x, 2);printf("Evaluating gradient unrolled by a factor of two\n");
Buffer<int> result = gradient.realize({4,4});// Click to show output ...
> Begin pipeline gradient_unroll.0()
> Tag gradient_unroll.0() tag = "func_type_and_dim: 1 0 32 1 2 0 4 0 4"
> Store gradient_unroll.0(0, 0) = 0
> Store gradient_unroll.0(1, 0) = 1
> Store gradient_unroll.0(2, 0) = 2
> Store gradient_unroll.0(3, 0) = 3
> Store gradient_unroll.0(0, 1) = 1
> Store gradient_unroll.0(1, 1) = 2
> Store gradient_unroll.0(2, 1) = 3
> Store gradient_unroll.0(3, 1) = 4
> Store gradient_unroll.0(0, 2) = 2
> Store gradient_unroll.0(1, 2) = 3
> Store gradient_unroll.0(2, 2) = 4
> Store gradient_unroll.0(3, 2) = 5
> Store gradient_unroll.0(0, 3) = 3
> Store gradient_unroll.0(1, 3) = 4
> Store gradient_unroll.0(2, 3) = 5
> Store gradient_unroll.0(3, 3) = 6
> End pipeline gradient_unroll.0()
printf("Equivalent C:\n");for(int y =0; y <4; y++) {for(int x_outer =0; x_outer <2; x_outer++) {// Instead of a for loop over x_inner, we get two// copies of the innermost statement.{int x_inner =0;int x = x_outer *2+ x_inner;printf("Evaluating at x = %d, y = %d: %d\n", x, y, x + y);}{int x_inner =1;int x = x_outer *2+ x_inner;printf("Evaluating at x = %d, y = %d: %d\n", x, y, x + y);}}}printf("\n");printf("Pseudo-code for the schedule:\n");
gradient.print_loop_nest();printf("\n");// Click to show output ...
> produce gradient_unroll:
> for y:
> for x.x_outer:
> unrolled x.x_inner in [0, 1]:
> gradient_unroll(...) = ...
}// Splitting by factors that don't divide the extent.{
Func gradient("gradient_split_7x2");gradient(x, y) = x + y;
gradient.trace_stores();// Splitting guarantees that the inner loop runs from zero to// the split factor, which is important for the uses we saw// above. So what happens when the total extent we wish to// evaluate x over isn't a multiple of the split factor? We'll// split by a factor three, and we'll evaluate gradient over a// 7x2 box instead of the 4x4 box we've been using.
Var x_outer, x_inner;
gradient.split(x, x_outer, x_inner,3);printf("Evaluating gradient over a 7x2 box with x split by three\n");
Buffer<int> output = gradient.realize({7,2});// Click to show output ...
> Begin pipeline gradient_split_7x2.0()
> Tag gradient_split_7x2.0() tag = "func_type_and_dim: 1 0 32 1 2 0 7 0 2"
> Store gradient_split_7x2.0(0, 0) = 0
> Store gradient_split_7x2.0(1, 0) = 1
> Store gradient_split_7x2.0(2, 0) = 2
> Store gradient_split_7x2.0(3, 0) = 3
> Store gradient_split_7x2.0(4, 0) = 4
> Store gradient_split_7x2.0(5, 0) = 5
> Store gradient_split_7x2.0(4, 0) = 4
> Store gradient_split_7x2.0(5, 0) = 5
> Store gradient_split_7x2.0(6, 0) = 6
> Store gradient_split_7x2.0(0, 1) = 1
> Store gradient_split_7x2.0(1, 1) = 2
> Store gradient_split_7x2.0(2, 1) = 3
> Store gradient_split_7x2.0(3, 1) = 4
> Store gradient_split_7x2.0(4, 1) = 5
> Store gradient_split_7x2.0(5, 1) = 6
> Store gradient_split_7x2.0(4, 1) = 5
> Store gradient_split_7x2.0(5, 1) = 6
> Store gradient_split_7x2.0(6, 1) = 7
> End pipeline gradient_split_7x2.0()
// See below for a visualization// of what happened. Note that some points get evaluated more// than once!printf("Equivalent C:\n");for(int y =0; y <2; y++) {for(int x_outer =0; x_outer <3; x_outer++) {// Now runs from 0 to 2for(int x_inner =0; x_inner <3; x_inner++) {int x = x_outer *3;// Before we add x_inner, make sure we don't// evaluate points outside of the 7x2 box. We'll// clamp x to be at most 4 (7 minus the split// factor).if(x >4) x =4;
x += x_inner;printf("Evaluating at x = %d, y = %d: %d\n", x, y, x + y);}}}printf("\n");printf("Pseudo-code for the schedule:\n");
gradient.print_loop_nest();printf("\n");// Click to show output ...
> produce gradient_split_7x2:
> for y:
> for x.x_outer:
> for x.x_inner in [0, 2]:
> gradient_split_7x2(...) = ...
// If you read the output, you'll see that some coordinates// were evaluated more than once. That's generally OK, because// pure Halide functions have no side-effects, so it's safe to// evaluate the same point multiple times. If you're calling// out to C functions like we are, it's your responsibility to// make sure you can handle the same point being evaluated// multiple times.// The general rule is: If we require x from x_min to x_min + x_extent, and// we split by a factor 'factor', then://// x_outer runs from 0 to (x_extent + factor - 1)/factor// x_inner runs from 0 to factor// x = min(x_outer * factor, x_extent - factor) + x_inner + x_min//// In our example, x_min was 0, x_extent was 7, and factor was 3.// However, if you write a Halide function with an update// definition (see lesson 9), then it is not safe to evaluate// the same point multiple times, so we won't apply this// trick. Instead the range of values computed will be rounded// up to the next multiple of the split factor.}// Fusing, tiling, and parallelizing.{// We saw in the previous lesson that we can parallelize// across a variable. Here we combine it with fusing and// tiling to express a useful pattern - processing tiles in// parallel.// This is where fusing shines. Fusing helps when you want to// parallelize across multiple dimensions without introducing// nested parallelism. Nested parallelism (parallel for loops// within parallel for loops) is supported by Halide, but// often gives poor performance compared to fusing the// parallel variables into a single parallel for loop.
Func gradient("gradient_fused_tiles");gradient(x, y) = x + y;
gradient.trace_stores();// First we'll tile, then we'll fuse the tile indices and// parallelize across the combination.
Var x_outer, y_outer, x_inner, y_inner, tile_index;
gradient.tile(x, y, x_outer, y_outer, x_inner, y_inner,4,4);
gradient.fuse(x_outer, y_outer, tile_index);
gradient.parallel(tile_index);// The scheduling calls all return a reference to the Func, so// you can also chain them together into a single statement to// make things slightly clearer://// gradient// .tile(x, y, x_outer, y_outer, x_inner, y_inner, 4, 4)// .fuse(x_outer, y_outer, tile_index)// .parallel(tile_index);printf("Evaluating gradient tiles in parallel\n");
Buffer<int> output = gradient.realize({8,8});// Click to show output ...
> Begin pipeline gradient_fused_tiles.0()
> Tag gradient_fused_tiles.0() tag = "func_type_and_dim: 1 0 32 1 2 0 8 0 8"
> Store gradient_fused_tiles.0(0, 0) = 0
> Store gradient_fused_tiles.0(1, 0) = 1
> Store gradient_fused_tiles.0(2, 0) = 2
> Store gradient_fused_tiles.0(3, 0) = 3
> Store gradient_fused_tiles.0(0, 1) = 1
> Store gradient_fused_tiles.0(1, 1) = 2
> Store gradient_fused_tiles.0(2, 1) = 3
> Store gradient_fused_tiles.0(3, 1) = 4
> Store gradient_fused_tiles.0(0, 2) = 2
> Store gradient_fused_tiles.0(1, 2) = 3
> Store gradient_fused_tiles.0(2, 2) = 4
> Store gradient_fused_tiles.0(3, 2) = 5
> Store gradient_fused_tiles.0(0, 3) = 3
> Store gradient_fused_tiles.0(1, 3) = 4
> Store gradient_fused_tiles.0(2, 3) = 5
> Store gradient_fused_tiles.0(3, 3) = 6
> Store gradient_fused_tiles.0(0, 4) = 4
> Store gradient_fused_tiles.0(1, 4) = 5
> Store gradient_fused_tiles.0(2, 4) = 6
> Store gradient_fused_tiles.0(3, 4) = 7
> Store gradient_fused_tiles.0(0, 5) = 5
> Store gradient_fused_tiles.0(1, 5) = 6
> Store gradient_fused_tiles.0(2, 5) = 7
> Store gradient_fused_tiles.0(3, 5) = 8
> Store gradient_fused_tiles.0(0, 6) = 6
> Store gradient_fused_tiles.0(1, 6) = 7
> Store gradient_fused_tiles.0(2, 6) = 8
> Store gradient_fused_tiles.0(3, 6) = 9
> Store gradient_fused_tiles.0(0, 7) = 7
> Store gradient_fused_tiles.0(1, 7) = 8
> Store gradient_fused_tiles.0(2, 7) = 9
> Store gradient_fused_tiles.0(3, 7) = 10
> Store gradient_fused_tiles.0(4, 0) = 4
> Store gradient_fused_tiles.0(5, 0) = 5
> Store gradient_fused_tiles.0(6, 0) = 6
> Store gradient_fused_tiles.0(4, 4) = 8
> Store gradient_fused_tiles.0(7, 0) = 7
> Store gradient_fused_tiles.0(4, 1) = 5
> Store gradient_fused_tiles.0(5, 4) = 9
> Store gradient_fused_tiles.0(5, 1) = 6
> Store gradient_fused_tiles.0(6, 4) = 10
> Store gradient_fused_tiles.0(6, 1) = 7
> Store gradient_fused_tiles.0(7, 4) = 11
> Store gradient_fused_tiles.0(7, 1) = 8
> Store gradient_fused_tiles.0(4, 5) = 9
> Store gradient_fused_tiles.0(4, 2) = 6
> Store gradient_fused_tiles.0(5, 5) = 10
> Store gradient_fused_tiles.0(5, 2) = 7
> Store gradient_fused_tiles.0(6, 5) = 11
> Store gradient_fused_tiles.0(6, 2) = 8
> Store gradient_fused_tiles.0(7, 5) = 12
> Store gradient_fused_tiles.0(7, 2) = 9
> Store gradient_fused_tiles.0(4, 6) = 10
> Store gradient_fused_tiles.0(4, 3) = 7
> Store gradient_fused_tiles.0(5, 6) = 11
> Store gradient_fused_tiles.0(5, 3) = 8
> Store gradient_fused_tiles.0(6, 6) = 12
> Store gradient_fused_tiles.0(6, 3) = 9
> Store gradient_fused_tiles.0(7, 6) = 13
> Store gradient_fused_tiles.0(7, 3) = 10
> Store gradient_fused_tiles.0(4, 7) = 11
> Store gradient_fused_tiles.0(5, 7) = 12
> Store gradient_fused_tiles.0(6, 7) = 13
> Store gradient_fused_tiles.0(7, 7) = 14
> End pipeline gradient_fused_tiles.0()
// The tiles should occur in arbitrary order, but within each// tile the pixels will be traversed in row-major order. See// below for a visualization.printf("Equivalent (serial) C:\n");// This outermost loop should be a parallel for loop, but that's hard in C.for(int tile_index =0; tile_index <4; tile_index++) {int y_outer = tile_index /2;int x_outer = tile_index %2;for(int y_inner =0; y_inner <4; y_inner++) {for(int x_inner =0; x_inner <4; x_inner++) {int y = y_outer *4+ y_inner;int x = x_outer *4+ x_inner;printf("Evaluating at x = %d, y = %d: %d\n", x, y, x + y);}}}printf("\n");printf("Pseudo-code for the schedule:\n");
gradient.print_loop_nest();printf("\n");// Click to show output ...
> produce gradient_fused_tiles:
> parallel x.x_outer.tile_index:
> for y.y_inner in [0, 3]:
> for x.x_inner in [0, 3]:
> gradient_fused_tiles(...) = ...
}// Putting it all together.{// Are you ready? We're going to use all of the features above now.
Func gradient_fast("gradient_fast");gradient_fast(x, y) = x + y;// We'll process 64x64 tiles in parallel.
Var x_outer, y_outer, x_inner, y_inner, tile_index;
gradient_fast
.tile(x, y, x_outer, y_outer, x_inner, y_inner,64,64).fuse(x_outer, y_outer, tile_index).parallel(tile_index);// We'll compute two scanlines at once while we walk across// each tile. We'll also vectorize in x. The easiest way to// express this is to recursively tile again within each tile// into 4x2 subtiles, then vectorize the subtiles across x and// unroll them across y:
Var x_inner_outer, y_inner_outer, x_vectors, y_pairs;
gradient_fast
.tile(x_inner, y_inner, x_inner_outer, y_inner_outer, x_vectors, y_pairs,4,2).vectorize(x_vectors).unroll(y_pairs);// Note that we didn't do any explicit splitting or// reordering. Those are the most important primitive// operations, but mostly they are buried underneath tiling,// vectorizing, or unrolling calls.// Now let's evaluate this over a range which is not a// multiple of the tile size.// If you like you can turn on tracing, but it's going to// produce a lot of printfs. Instead we'll compute the answer// both in C and Halide and see if the answers match.
Buffer<int> result = gradient_fast.realize({350,250});// See below for a visualization.printf("Checking Halide result against equivalent C...\n");for(int tile_index =0; tile_index <6*4; tile_index++) {int y_outer = tile_index /4;int x_outer = tile_index %4;for(int y_inner_outer =0; y_inner_outer <64/2; y_inner_outer++) {for(int x_inner_outer =0; x_inner_outer <64/4; x_inner_outer++) {// We're vectorized across xint x =std::min(x_outer *64,350-64) + x_inner_outer *4;int x_vec[4] = {x +0,
x +1,
x +2,
x +3};// And we unrolled across yint y_base =std::min(y_outer *64,250-64) + y_inner_outer *2;{// y_pairs = 0int y = y_base +0;int y_vec[4] = {y, y, y, y};int val[4] = {x_vec[0] + y_vec[0],
x_vec[1] + y_vec[1],
x_vec[2] + y_vec[2],
x_vec[3] + y_vec[3]};// Check the result.for(int i =0; i <4; i++) {if(result(x_vec[i], y_vec[i]) != val[i]) {printf("There was an error at %d %d!\n",
x_vec[i], y_vec[i]);return-1;}}}{// y_pairs = 1int y = y_base +1;int y_vec[4] = {y, y, y, y};int val[4] = {x_vec[0] + y_vec[0],
x_vec[1] + y_vec[1],
x_vec[2] + y_vec[2],
x_vec[3] + y_vec[3]};// Check the result.for(int i =0; i <4; i++) {if(result(x_vec[i], y_vec[i]) != val[i]) {printf("There was an error at %d %d!\n",
x_vec[i], y_vec[i]);return-1;}}}}}}printf("\n");printf("Pseudo-code for the schedule:\n");
gradient_fast.print_loop_nest();printf("\n");// Click to show output ...
> produce gradient_fast:
> parallel x.x_outer.tile_index:
> for y.y_inner.y_inner_outer in [0, 31]:
> for x.x_inner.x_inner_outer in [0, 15]:
> unrolled y.y_inner.y_pairs in [0, 1]:
> vectorized x.x_inner.x_vectors in [0, 3]:
> gradient_fast(...) = ...
// Note that in the Halide version, the algorithm is specified// once at the top, separately from the optimizations, and there// aren't that many lines of code total. Compare this to the C// version. There's more code (and it isn't even parallelized or// vectorized properly). More annoyingly, the statement of the// algorithm (the result is x plus y) is buried in multiple places// within the mess. This C code is hard to write, hard to read,// hard to debug, and hard to optimize further. This is why Halide// exists.}printf("Success!\n");return0;}