00   Introduction
01   Getting started with Funcs, Vars, and Exprs
02   Processing images
03   Inspecting the generated code
04   Debugging with tracing, print, and print_when
05   Vectorize, parallelize, unroll and tile your code
06   Realizing Funcs over arbitrary domains
07   Multi-stage pipelines
08   Scheduling multi-stage pipelines
09   Multi-pass Funcs, update definitions, and reductions
10   AOT compilation part 1
10   AOT compilation part 2
11   Cross-compilation
12   Using the GPU
13   Tuples
14   The Halide type system
15   Generators part 1
15   Generators part 2
16   RGB images and memory layouts part 1
16   RGB images and memory layouts part 2
17   Reductions over non-rectangular domains
18   Factoring an associative reduction using rfactor
19   Wrapper Funcs
20   Cloning Funcs
21   Auto-Scheduler part 1
21   Auto-Scheduler part 2
// Halide tutorial lesson 12: Using the GPU

// This lesson demonstrates how to use Halide to run code on a GPU using
// OpenCL (or Metal on OS X).

// On linux, you can compile and run it like so:
// g++ lesson_12*.cpp -g -std=c++11 -I ../include -I ../tools -L ../bin -lHalide `libpng-config --cflags --ldflags` -ljpeg -lpthread -ldl -o lesson_12
// LD_LIBRARY_PATH=../bin ./lesson_12

// On os x:
// g++ lesson_12*.cpp -g -std=c++11 -I ../include -I ../tools -L ../bin -lHalide `libpng-config --cflags --ldflags` -ljpeg -o lesson_12
// DYLD_LIBRARY_PATH=../bin ./lesson_12

// If you have the entire Halide source tree, you can also build it by
// running:
//    make tutorial_lesson_12_using_the_gpu
// in a shell with the current directory at the top of the halide
// source tree.

#include "Halide.h"
#include <stdio.h>
#include <stdlib.h>
using namespace Halide;

// Include some support code for loading pngs.
#include "halide_image_io.h"
using namespace Halide::Tools;

// Include a clock to do performance testing.
#include "clock.h"

// Halide Vars shared by the pipeline definition and its schedules.
// They are declared in the same order as before, one group per role.

// Coordinates of the image Funcs (column, row, color channel).
Var x, y, c;
// Index into the look-up table; ii is not referenced in the code shown here.
Var i, ii;
// Outer/inner variables produced when the image loops are tiled.
Var xo, yo, xi, yi;

// We're going to want to schedule a pipeline in several ways, so we
// define the pipeline in a class so that we can recreate it several
// times with different schedules.
class MyPipeline {
    Func lut, padded, padded16, sharpen, curved;
    Buffer<uint8_t> input;

    MyPipeline(Buffer<uint8_t> in) : input(in) {
        // For this lesson, we'll use a two-stage pipeline that sharpens
        // and then applies a look-up-table (LUT).

        // First we'll define the LUT. It will be a gamma curve.

        lut(i) = cast<uint8_t>(clamp(pow(i / 255.0f, 1.2f) * 255.0f, 0, 255));

        // Augment the input with a boundary condition.
        padded(x, y, c) = input(clamp(x, 0, input.width()-1),
                                clamp(y, 0, input.height()-1), c);

        // Cast it to 16-bit to do the math.
        padded16(x, y, c) = cast<uint16_t>(padded(x, y, c));

        // Next we sharpen it with a five-tap filter.
        sharpen(x, y, c) = (padded16(x, y, c) * 2-
                            (padded16(x - 1, y, c) +
                             padded16(x, y - 1, c) +
                             padded16(x + 1, y, c) +
                             padded16(x, y + 1, c)) / 4);

        // Then apply the LUT.
        curved(x, y, c) = lut(sharpen(x, y, c));

    // Now we define methods that give our pipeline several different
    // schedules.
    void schedule_for_cpu() {
        // Compute the look-up-table ahead of time.

        // Compute color channels innermost. Promise that there will
        // be three of them and unroll across them.
        curved.reorder(c, x, y)
              .bound(c, 0, 3)

        // Look-up-tables don't vectorize well, so just parallelize
        // curved in slices of 16 scanlines.
        Var yo, yi;
        curved.split(y, yo, yi, 16)

        // Compute sharpen as needed per scanline of curved.
        sharpen.compute_at(curved, yi);

        // Vectorize the sharpen. It's 16-bit so we'll vectorize it 8-wide.
        sharpen.vectorize(x, 8);

        // Compute the padded input as needed per scanline of curved,
        // reusing previous values computed within the same strip of
        // 16 scanlines.
        padded.store_at(curved, yo)
              .compute_at(curved, yi);

        // Also vectorize the padding. It's 8-bit, so we'll vectorize
        // 16-wide.
        padded.vectorize(x, 16);

        // JIT-compile the pipeline for the CPU.

    // Now a schedule that uses CUDA or OpenCL.
    void schedule_for_gpu() {
        // We make the decision about whether to use the GPU for each
        // Func independently. If you have one Func computed on the
        // CPU, and the next computed on the GPU, Halide will do the
        // copy-to-gpu under the hood. For this pipeline, there's no
        // reason to use the CPU for any of the stages. Halide will
        // copy the input image to the GPU the first time we run the
        // pipeline, and leave it there to reuse on subsequent runs.

        // As before, we'll compute the LUT once at the start of the
        // pipeline.

        // Let's compute the look-up-table using the GPU in 16-wide
        // one-dimensional thread blocks. First we split the index
        // into blocks of size 16:
        Var block, thread;
        lut.split(i, block, thread, 16);
        // Then we tell cuda that our Vars 'block' and 'thread'
        // correspond to CUDA's notions of blocks and threads, or
        // OpenCL's notions of thread groups and threads.

        // This is a very common scheduling pattern on the GPU, so
        // there's a shorthand for it:

        // lut.gpu_tile(i, block, thread, 16);

        // Func::gpu_tile behaves the same as Func::tile, except that
        // it also specifies that the tile coordinates correspond to
        // GPU blocks, and the coordinates within each tile correspond
        // to GPU threads.

        // Compute color channels innermost. Promise that there will
        // be three of them and unroll across them.
        curved.reorder(c, x, y)
              .bound(c, 0, 3)

        // Compute curved in 2D 8x8 tiles using the GPU.
        curved.gpu_tile(x, y, xo, yo, xi, yi, 8, 8);

        // This is equivalent to:
        // curved.tile(x, y, xo, yo, xi, yi, 8, 8)
        //       .gpu_blocks(xo, yo)
        //       .gpu_threads(xi, yi);

        // We'll leave sharpen as inlined into curved.

        // Compute the padded input as needed per GPU block, storing
        // the intermediate result in shared memory. In the schedule
        // above xo corresponds to GPU blocks.
        padded.compute_at(curved, xo);

        // Use the GPU threads for the x and y coordinates of the
        // padded input.
        padded.gpu_threads(x, y);

        // JIT-compile the pipeline for the GPU. CUDA, OpenCL, or
        // Metal are not enabled by default. We have to construct a
        // Target object, enable one of them, and then pass that
        // target object to compile_jit. Otherwise your CPU will very
        // slowly pretend it's a GPU, and use one thread per output
        // pixel.

        // Start with a target suitable for the machine you're running
        // this on.
        Target target = get_host_target();

        // Then enable OpenCL or Metal, depending on which platform
        // we're on. OS X doesn't update its OpenCL drivers, so they
        // tend to be broken. CUDA would also be a fine choice on
        // machines with NVidia GPUs.
        if (target.os == Target::OSX) {
        } else {

        // Uncomment the next line and comment out the lines above to
        // try CUDA instead.
        // target.set_feature(Target::CUDA);

        // If you want to see all of the OpenCL, Metal, or CUDA API
        // calls done by the pipeline, you can also enable the Debug
        // flag. This is helpful for figuring out which stages are
        // slow, or when CPU -> GPU copies happen. It hurts
        // performance though, so we'll leave it commented out.
        // target.set_feature(Target::Debug);


    void test_performance() {
        // Test the performance of the scheduled MyPipeline.

        Buffer<uint8_t> output(input.width(), input.height(), input.channels());

        // Run the filter once to initialize any GPU runtime state.

        // Now take the best of 3 runs for timing.
        double best_time = 0.0;
        for (int i = 0; i < 3; i++) {

            double t1 = current_time();

            // Run the filter 100 times.
            for (int j = 0; j < 100; j++) {

            // Force any GPU code to finish by copying the buffer back to the CPU.

            double t2 = current_time();

            double elapsed = (t2 - t1)/100;
            if (i == 0 || elapsed < best_time) {
                best_time = elapsed;

        printf("%1.4f milliseconds\n", best_time);

    void test_correctness(Buffer<uint8_t> reference_output) {
        Buffer<uint8_t> output =
            curved.realize(input.width(), input.height(), input.channels());

        // Check against the reference output.
        for (int c = 0; c < input.channels(); c++) {
            for (int y = 0; y < input.height(); y++) {
                for (int x = 0; x < input.width(); x++) {
                    if (output(x, y, c) != reference_output(x, y, c)) {
                        printf("Mismatch between output (%d) and "
                               "reference output (%d) at %d, %d, %d\n",
                               output(x, y, c),
                               reference_output(x, y, c),
                               x, y, c);


// Forward declaration; the definition is at the bottom of this file.
bool have_opencl_or_metal();

int main(int argc, char **argv) {
    // Load an input image.
    Buffer<uint8_t> input = load_image("images/rgb.png");

    // Allocated an image that will store the correct output
    Buffer<uint8_t> reference_output(input.width(), input.height(), input.channels());

    printf("Testing performance on CPU:\n");
    MyPipeline p1(input);

    if (have_opencl_or_metal()) {
        printf("Testing performance on GPU:\n");
        MyPipeline p2(input);
    } else {
        printf("Not testing performance on GPU, "
               "because I can't find the opencl library\n");

    return 0;

// A helper function to check if OpenCL (or, on Apple platforms, Metal)
// seems to exist on this machine. It only tries to load the library and
// reports whether the load succeeded; it does not check that a usable
// device is present. The library handle is intentionally not released.

#ifdef _WIN32
#include <windows.h>
#else
#include <dlfcn.h>
#endif

bool have_opencl_or_metal() {
#ifdef _WIN32
    // Use the explicit ANSI entry point so the narrow string literal is
    // valid whether or not UNICODE is defined.
    return LoadLibraryA("OpenCL.dll") != nullptr;
#elif __APPLE__
    return dlopen("/System/Library/Frameworks/Metal.framework/Versions/Current/Metal", RTLD_LAZY) != nullptr;
#else
    return dlopen("libOpenCL.so", RTLD_LAZY) != nullptr;
#endif
}