docs/tutorial_2lesson_05_scheduling_1_8cpp-example.html

// Halide tutorial lesson 5: Vectorize, parallelize, unroll and tile your code


// This lesson demonstrates how to manipulate the order in which you

// evaluate pixels in a Func, including vectorization,

// parallelization, unrolling, and tiling.


// On linux, you can compile and run it like so:

// g++ lesson_05*.cpp -g -I <path/to/Halide.h> -L <path/to/libHalide.so> -lHalide -lpthread -ldl -o lesson_05 -std=c++17

// LD_LIBRARY_PATH=<path/to/libHalide.so> ./lesson_05


// On os x:

// g++ lesson_05*.cpp -g -I <path/to/Halide.h> -L <path/to/libHalide.so> -lHalide -o lesson_05 -std=c++17

// DYLD_LIBRARY_PATH=<path/to/libHalide.dylib> ./lesson_05


// If you have the entire Halide source tree, you can also build it by

// running:

//    make tutorial_lesson_05_scheduling_1

// in a shell with the current directory at the top of the halide

// source tree.


#include "Halide.h"

#include <algorithm>

#include <stdio.h>

using namespace Halide;


int main(int argc, char **argv) {


    // We're going to define and schedule our gradient function in

    // several different ways, and see what order pixels are computed

    // in.


    Var x("x"), y("y");


    // First we observe the default ordering.

    {

        Func gradient("gradient");

        gradient(x, y) = x + y;

        gradient.trace_stores();


        // By default we walk along the rows and then down the

        // columns. This means x varies quickly, and y varies

        // slowly. x is the column and y is the row, so this is a

        // row-major traversal.

        printf("Evaluating gradient row-major\n");

        Buffer<int> output = gradient.realize({4, 4});


        // See figures/lesson_05_row_major.gif for a visualization of

        // what this did.


        // The equivalent C is:

        printf("Equivalent C:\n");

        for (int y = 0; y < 4; y++) {

            for (int x = 0; x < 4; x++) {

                printf("Evaluating at x = %d, y = %d: %d\n", x, y, x + y);

            }

        }

        printf("\n\n");


        // Tracing is one useful way to understand what a schedule is

        // doing. You can also ask Halide to print out pseudocode

        // showing what loops Halide is generating:

        printf("Pseudo-code for the schedule:\n");

        gradient.print_loop_nest();

        printf("\n");


        // Because we're using the default ordering, it should print:

        // compute gradient:

        //   for y:

        //     for x:

        //       gradient(...) = ...

    }


    // Reorder variables.

    {

        Func gradient("gradient_col_major");

        gradient(x, y) = x + y;

        gradient.trace_stores();


        // If we reorder x and y, we can walk down the columns

        // instead. The reorder call takes the arguments of the func,

        // and sets a new nesting order for the for loops that are

        // generated. The arguments are specified from the innermost

        // loop out, so the following call puts y in the inner loop:

        gradient.reorder(y, x);


        // This means y (the row) will vary quickly, and x (the

        // column) will vary slowly, so this is a column-major

        // traversal.


        printf("Evaluating gradient column-major\n");

        Buffer<int> output = gradient.realize({4, 4});


        // See figures/lesson_05_col_major.gif for a visualization of

        // what this did.


        printf("Equivalent C:\n");

        for (int x = 0; x < 4; x++) {

            for (int y = 0; y < 4; y++) {

                printf("Evaluating at x = %d, y = %d: %d\n", x, y, x + y);

            }

        }

        printf("\n");


        // If we print pseudo-code for this schedule, we'll see that

        // the loop over y is now inside the loop over x.

        printf("Pseudo-code for the schedule:\n");

        gradient.print_loop_nest();

        printf("\n");

    }


    // Split a variable into two.

    {

        Func gradient("gradient_split");

        gradient(x, y) = x + y;

        gradient.trace_stores();


        // The most powerful primitive scheduling operation you can do

        // to a var is to split it into inner and outer sub-variables:

        Var x_outer, x_inner;

        gradient.split(x, x_outer, x_inner, 2);


        // This breaks the loop over x into two nested loops: an outer

        // one over x_outer, and an inner one over x_inner. The last

        // argument to split was the "split factor". The inner loop

        // runs from zero to the split factor. The outer loop runs

        // from zero to the extent required of x (4 in this case)

        // divided by the split factor. Within the loops, the old

        // variable is defined to be outer * factor + inner. If the

        // old loop started at a value other than zero, then that is

        // also added within the loops.


        printf("Evaluating gradient with x split into x_outer and x_inner \n");

        Buffer<int> output = gradient.realize({4, 4});


        printf("Equivalent C:\n");

        for (int y = 0; y < 4; y++) {

            for (int x_outer = 0; x_outer < 2; x_outer++) {

                for (int x_inner = 0; x_inner < 2; x_inner++) {

                    int x = x_outer * 2 + x_inner;

                    printf("Evaluating at x = %d, y = %d: %d\n", x, y, x + y);

                }

            }

        }

        printf("\n");


        printf("Pseudo-code for the schedule:\n");

        gradient.print_loop_nest();

        printf("\n");


        // Note that the order of evaluation of pixels didn't actually

        // change! Splitting by itself does nothing, but it does open

        // up all of the scheduling possibilities that we will explore

        // below.

    }


    // Fuse two variables into one.

    {

        Func gradient("gradient_fused");

        gradient(x, y) = x + y;


        // The opposite of splitting is 'fusing'. Fusing two variables

        // merges the two loops into a single for loop over the

        // product of the extents. Fusing is less important than

        // splitting, but it also sees use (as we'll see later in this

        // lesson). Like splitting, fusing by itself doesn't change

        // the order of evaluation.

        Var fused;

        gradient.fuse(x, y, fused);


        printf("Evaluating gradient with x and y fused\n");

        Buffer<int> output = gradient.realize({4, 4});


        printf("Equivalent C:\n");

        for (int fused = 0; fused < 4 * 4; fused++) {

            int y = fused / 4;

            int x = fused % 4;

            printf("Evaluating at x = %d, y = %d: %d\n", x, y, x + y);

        }

        printf("\n");


        printf("Pseudo-code for the schedule:\n");

        gradient.print_loop_nest();

        printf("\n");

    }


    // Evaluating in tiles.

    {

        Func gradient("gradient_tiled");

        gradient(x, y) = x + y;

        gradient.trace_stores();


        // Now that we can both split and reorder, we can do tiled

        // evaluation. Let's split both x and y by a factor of four,

        // and then reorder the vars to express a tiled traversal.

        //

        // A tiled traversal splits the domain into small rectangular

        // tiles, and outermost iterates over the tiles, and within

        // that iterates over the points within each tile. It can be

        // good for performance if neighboring pixels use overlapping

        // input data, for example in a blur. We can express a tiled

        // traversal like so:

        Var x_outer, x_inner, y_outer, y_inner;

        gradient.split(x, x_outer, x_inner, 4);

        gradient.split(y, y_outer, y_inner, 4);

        gradient.reorder(x_inner, y_inner, x_outer, y_outer);


        // This pattern is common enough that there's a shorthand for it:

        // gradient.tile(x, y, x_outer, y_outer, x_inner, y_inner, 4, 4);


        printf("Evaluating gradient in 4x4 tiles\n");

        Buffer<int> output = gradient.realize({8, 8});


        // See figures/lesson_05_tiled.gif for a visualization of this

        // schedule.


        printf("Equivalent C:\n");

        for (int y_outer = 0; y_outer < 2; y_outer++) {

            for (int x_outer = 0; x_outer < 2; x_outer++) {

                for (int y_inner = 0; y_inner < 4; y_inner++) {

                    for (int x_inner = 0; x_inner < 4; x_inner++) {

                        int x = x_outer * 4 + x_inner;

                        int y = y_outer * 4 + y_inner;

                        printf("Evaluating at x = %d, y = %d: %d\n", x, y, x + y);

                    }

                }

            }

        }

        printf("\n");


        printf("Pseudo-code for the schedule:\n");

        gradient.print_loop_nest();

        printf("\n");

    }


    // Evaluating in vectors.

    {

        Func gradient("gradient_in_vectors");

        gradient(x, y) = x + y;

        gradient.trace_stores();


        // The nice thing about splitting is that it guarantees the

        // inner variable runs from zero to the split factor. Most of

        // the time the split-factor will be a compile-time constant,

        // so we can replace the loop over the inner variable with a

        // single vectorized computation. This time we'll split by a

        // factor of four, because on X86 we can use SSE to compute in

        // 4-wide vectors.

        Var x_outer, x_inner;

        gradient.split(x, x_outer, x_inner, 4);

        gradient.vectorize(x_inner);


        // Splitting and then vectorizing the inner variable is common

        // enough that there's a short-hand for it. We could have also

        // said:

        //

        // gradient.vectorize(x, 4);

        //

        // which is equivalent to:

        //

        // gradient.split(x, x, x_inner, 4);

        // gradient.vectorize(x_inner);

        //

        // Note that in this case we reused the name 'x' as the new

        // outer variable. Later scheduling calls that refer to x

        // will refer to this new outer variable named x.


        // This time we'll evaluate over an 8x4 box, so that we have

        // more than one vector of work per scanline.

        printf("Evaluating gradient with x_inner vectorized \n");

        Buffer<int> output = gradient.realize({8, 4});


        // See figures/lesson_05_vectors.gif for a visualization.


        printf("Equivalent C:\n");

        for (int y = 0; y < 4; y++) {

            for (int x_outer = 0; x_outer < 2; x_outer++) {

                // The loop over x_inner has gone away, and has been

                // replaced by a vectorized version of the

                // expression. On x86 processors, Halide generates SSE

                // for all of this.

                int x_vec[] = {x_outer * 4 + 0,

                               x_outer * 4 + 1,

                               x_outer * 4 + 2,

                               x_outer * 4 + 3};

                int val[] = {x_vec[0] + y,

                             x_vec[1] + y,

                             x_vec[2] + y,

                             x_vec[3] + y};

                printf("Evaluating at <%d, %d, %d, %d>, <%d, %d, %d, %d>:"

                       " <%d, %d, %d, %d>\n",

                       x_vec[0], x_vec[1], x_vec[2], x_vec[3],

                       y, y, y, y,

                       val[0], val[1], val[2], val[3]);

            }

        }

        printf("\n");


        printf("Pseudo-code for the schedule:\n");

        gradient.print_loop_nest();

        printf("\n");

    }


    // Unrolling a loop.

    {

        Func gradient("gradient_unroll");

        gradient(x, y) = x + y;

        gradient.trace_stores();


        // If multiple pixels share overlapping data, it can make

        // sense to unroll a computation so that shared values are

        // only computed or loaded once. We do this similarly to how

        // we expressed vectorizing. We split a dimension and then

        // fully unroll the loop of the inner variable. Unrolling

        // doesn't change the order in which things are evaluated.

        Var x_outer, x_inner;

        gradient.split(x, x_outer, x_inner, 2);

        gradient.unroll(x_inner);


        // The shorthand for this is:

        // gradient.unroll(x, 2);


        printf("Evaluating gradient unrolled by a factor of two\n");

        Buffer<int> result = gradient.realize({4, 4});


        printf("Equivalent C:\n");

        for (int y = 0; y < 4; y++) {

            for (int x_outer = 0; x_outer < 2; x_outer++) {

                // Instead of a for loop over x_inner, we get two

                // copies of the innermost statement.

                {

                    int x_inner = 0;

                    int x = x_outer * 2 + x_inner;

                    printf("Evaluating at x = %d, y = %d: %d\n", x, y, x + y);

                }

                {

                    int x_inner = 1;

                    int x = x_outer * 2 + x_inner;

                    printf("Evaluating at x = %d, y = %d: %d\n", x, y, x + y);

                }

            }

        }

        printf("\n");


        printf("Pseudo-code for the schedule:\n");

        gradient.print_loop_nest();

        printf("\n");

    }


    // Splitting by factors that don't divide the extent.

    {

        Func gradient("gradient_split_7x2");

        gradient(x, y) = x + y;

        gradient.trace_stores();


        // Splitting guarantees that the inner loop runs from zero to

        // the split factor, which is important for the uses we saw

        // above. So what happens when the total extent we wish to

        // evaluate x over isn't a multiple of the split factor? We'll

        // split by a factor three, and we'll evaluate gradient over a

        // 7x2 box instead of the 4x4 box we've been using.

        Var x_outer, x_inner;

        gradient.split(x, x_outer, x_inner, 3);


        printf("Evaluating gradient over a 7x2 box with x split by three \n");

        Buffer<int> output = gradient.realize({7, 2});


        // See figures/lesson_05_split_7_by_3.gif for a visualization

        // of what happened. Note that some points get evaluated more

        // than once!


        printf("Equivalent C:\n");

        for (int y = 0; y < 2; y++) {

            for (int x_outer = 0; x_outer < 3; x_outer++) {  // Now runs from 0 to 2

                for (int x_inner = 0; x_inner < 3; x_inner++) {

                    int x = x_outer * 3;

                    // Before we add x_inner, make sure we don't

                    // evaluate points outside of the 7x2 box. We'll

                    // clamp x to be at most 4 (7 minus the split

                    // factor).

                    if (x > 4) x = 4;

                    x += x_inner;

                    printf("Evaluating at x = %d, y = %d: %d\n", x, y, x + y);

                }

            }

        }

        printf("\n");


        printf("Pseudo-code for the schedule:\n");

        gradient.print_loop_nest();

        printf("\n");


        // If you read the output, you'll see that some coordinates

        // were evaluated more than once. That's generally OK, because

        // pure Halide functions have no side-effects, so it's safe to

        // evaluate the same point multiple times. If you're calling

        // out to C functions like we are, it's your responsibility to

        // make sure you can handle the same point being evaluated

        // multiple times.


        // The general rule is: If we require x from x_min to x_min + x_extent, and

        // we split by a factor 'factor', then:

        //

        // x_outer runs from 0 to (x_extent + factor - 1)/factor

        // x_inner runs from 0 to factor

        // x = min(x_outer * factor, x_extent - factor) + x_inner + x_min

        //

        // In our example, x_min was 0, x_extent was 7, and factor was 3.


        // However, if you write a Halide function with an update

        // definition (see lesson 9), then it is not safe to evaluate

        // the same point multiple times, so we won't apply this

        // trick. Instead the range of values computed will be rounded

        // up to the next multiple of the split factor.

    }


    // Fusing, tiling, and parallelizing.

    {

        // We saw in the previous lesson that we can parallelize

        // across a variable. Here we combine it with fusing and

        // tiling to express a useful pattern - processing tiles in

        // parallel.


        // This is where fusing shines. Fusing helps when you want to

        // parallelize across multiple dimensions without introducing

        // nested parallelism. Nested parallelism (parallel for loops

        // within parallel for loops) is supported by Halide, but

        // often gives poor performance compared to fusing the

        // parallel variables into a single parallel for loop.


        Func gradient("gradient_fused_tiles");

        gradient(x, y) = x + y;

        gradient.trace_stores();


        // First we'll tile, then we'll fuse the tile indices and

        // parallelize across the combination.

        Var x_outer, y_outer, x_inner, y_inner, tile_index;

        gradient.tile(x, y, x_outer, y_outer, x_inner, y_inner, 4, 4);

        gradient.fuse(x_outer, y_outer, tile_index);

        gradient.parallel(tile_index);


        // The scheduling calls all return a reference to the Func, so

        // you can also chain them together into a single statement to

        // make things slightly clearer:

        //

        // gradient

        //     .tile(x, y, x_outer, y_outer, x_inner, y_inner, 4, 4)

        //     .fuse(x_outer, y_outer, tile_index)

        //     .parallel(tile_index);


        printf("Evaluating gradient tiles in parallel\n");

        Buffer<int> output = gradient.realize({8, 8});


        // The tiles should occur in arbitrary order, but within each

        // tile the pixels will be traversed in row-major order. See

        // figures/lesson_05_parallel_tiles.gif for a visualization.


        printf("Equivalent (serial) C:\n");

        // This outermost loop should be a parallel for loop, but that's hard in C.

        for (int tile_index = 0; tile_index < 4; tile_index++) {

            int y_outer = tile_index / 2;

            int x_outer = tile_index % 2;

            for (int y_inner = 0; y_inner < 4; y_inner++) {

                for (int x_inner = 0; x_inner < 4; x_inner++) {

                    int y = y_outer * 4 + y_inner;

                    int x = x_outer * 4 + x_inner;

                    printf("Evaluating at x = %d, y = %d: %d\n", x, y, x + y);

                }

            }

        }

        printf("\n");


        printf("Pseudo-code for the schedule:\n");

        gradient.print_loop_nest();

        printf("\n");

    }


    // Putting it all together.

    {

        // Are you ready? We're going to use all of the features above now.

        Func gradient_fast("gradient_fast");

        gradient_fast(x, y) = x + y;


        // We'll process 64x64 tiles in parallel.

        Var x_outer, y_outer, x_inner, y_inner, tile_index;

        gradient_fast

            .tile(x, y, x_outer, y_outer, x_inner, y_inner, 64, 64)

            .fuse(x_outer, y_outer, tile_index)

            .parallel(tile_index);


        // We'll compute two scanlines at once while we walk across

        // each tile. We'll also vectorize in x. The easiest way to

        // express this is to recursively tile again within each tile

        // into 4x2 subtiles, then vectorize the subtiles across x and

        // unroll them across y:

        Var x_inner_outer, y_inner_outer, x_vectors, y_pairs;

        gradient_fast

            .tile(x_inner, y_inner, x_inner_outer, y_inner_outer, x_vectors, y_pairs, 4, 2)

            .vectorize(x_vectors)

            .unroll(y_pairs);


        // Note that we didn't do any explicit splitting or

        // reordering. Those are the most important primitive

        // operations, but mostly they are buried underneath tiling,

        // vectorizing, or unrolling calls.


        // Now let's evaluate this over a range which is not a

        // multiple of the tile size.


        // If you like you can turn on tracing, but it's going to

        // produce a lot of printfs. Instead we'll compute the answer

        // both in C and Halide and see if the answers match.

        Buffer<int> result = gradient_fast.realize({350, 250});


        // See figures/lesson_05_fast.mp4 for a visualization.


        printf("Checking Halide result against equivalent C...\n");

        for (int tile_index = 0; tile_index < 6 * 4; tile_index++) {

            int y_outer = tile_index / 4;

            int x_outer = tile_index % 4;

            for (int y_inner_outer = 0; y_inner_outer < 64 / 2; y_inner_outer++) {

                for (int x_inner_outer = 0; x_inner_outer < 64 / 4; x_inner_outer++) {

                    // We're vectorized across x

                    int x = std::min(x_outer * 64, 350 - 64) + x_inner_outer * 4;

                    int x_vec[4] = {x + 0,

                                    x + 1,

                                    x + 2,

                                    x + 3};


                    // And we unrolled across y

                    int y_base = std::min(y_outer * 64, 250 - 64) + y_inner_outer * 2;

                    {

                        // y_pairs = 0

                        int y = y_base + 0;

                        int y_vec[4] = {y, y, y, y};

                        int val[4] = {x_vec[0] + y_vec[0],

                                      x_vec[1] + y_vec[1],

                                      x_vec[2] + y_vec[2],

                                      x_vec[3] + y_vec[3]};


                        // Check the result.

                        for (int i = 0; i < 4; i++) {

                            if (result(x_vec[i], y_vec[i]) != val[i]) {

                                printf("There was an error at %d %d!\n",

                                       x_vec[i], y_vec[i]);

                                return -1;

                            }

                        }

                    }

                    {

                        // y_pairs = 1

                        int y = y_base + 1;

                        int y_vec[4] = {y, y, y, y};

                        int val[4] = {x_vec[0] + y_vec[0],

                                      x_vec[1] + y_vec[1],

                                      x_vec[2] + y_vec[2],

                                      x_vec[3] + y_vec[3]};


                        // Check the result.

                        for (int i = 0; i < 4; i++) {

                            if (result(x_vec[i], y_vec[i]) != val[i]) {

                                printf("There was an error at %d %d!\n",

                                       x_vec[i], y_vec[i]);

                                return -1;

                            }

                        }

                    }

                }

            }

        }

        printf("\n");


        printf("Pseudo-code for the schedule:\n");

        gradient_fast.print_loop_nest();

        printf("\n");


        // Note that in the Halide version, the algorithm is specified

        // once at the top, separately from the optimizations, and there

        // aren't that many lines of code total. Compare this to the C

        // version. There's more code (and it isn't even parallelized or

        // vectorized properly). More annoyingly, the statement of the

        // algorithm (the result is x plus y) is buried in multiple places

        // within the mess. This C code is hard to write, hard to read,

        // hard to debug, and hard to optimize further. This is why Halide

        // exists.

    }


    printf("Success!\n");

    return 0;

}

Halide::Buffer
A Halide::Buffer is a named shared reference to a Halide::Runtime::Buffer.
Definition RDom.h:21

Halide::Func
A halide function.
Definition Func.h:700

Halide::Func::print_loop_nest
void print_loop_nest()
Write out the loop nests specified by the schedule for this Function.

Halide::Func::trace_stores
Func & trace_stores()
Trace all stores to the buffer backing this Func by emitting calls to halide_trace.

Halide::Func::reorder
Func & reorder(const std::vector< VarOrRVar > &vars)
Reorder variables to have the given nesting order, from innermost out.

Halide::Func::split
Func & split(const VarOrRVar &old, const VarOrRVar &outer, const VarOrRVar &inner, const Expr &factor, TailStrategy tail=TailStrategy::Auto)
Split a dimension into inner and outer subdimensions with the given names, where the inner dimension ...

Halide::Func::tile
Func & tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &xo, const VarOrRVar &yo, const VarOrRVar &xi, const VarOrRVar &yi, const Expr &xfactor, const Expr &yfactor, TailStrategy tail=TailStrategy::Auto)
Split two dimensions at once by the given factors, and then reorder the resulting dimensions to be xi...

Halide::Func::unroll
Func & unroll(const VarOrRVar &var)
Mark a dimension to be completely unrolled.

Halide::Func::realize
Realization realize(std::vector< int32_t > sizes={}, const Target &target=Target())
Evaluate this function over some rectangular domain and return the resulting buffer or buffers.

Halide::Func::parallel
Func & parallel(const VarOrRVar &var)
Mark a dimension to be traversed in parallel.

Halide::Func::vectorize
Func & vectorize(const VarOrRVar &var)
Mark a dimension to be computed all-at-once as a single vector.

Halide::Func::fuse
Func & fuse(const VarOrRVar &inner, const VarOrRVar &outer, const VarOrRVar &fused)
Join two dimensions into a single fused dimension.

Halide::Var
A Halide variable, to be used when defining functions.
Definition Var.h:19

Halide
This file defines the class FunctionDAG, which is our representation of a Halide pipeline,...
Definition AbstractGenerator.h:19