// Halide tutorial lesson 3: Inspecting the generated code
// This lesson demonstrates how to inspect what the Halide compiler is producing.
// On Linux, you can compile and run it like so:
// g++ lesson_03*.cpp -g -I <path/to/Halide.h> -L <path/to/libHalide.so> -lHalide -lpthread -ldl -o lesson_03 -std=c++17
// LD_LIBRARY_PATH=<path/to/libHalide.so> ./lesson_03
// On OS X:
// g++ lesson_03*.cpp -g -I <path/to/Halide.h> -L <path/to/libHalide.so> -lHalide -o lesson_03 -std=c++17
// DYLD_LIBRARY_PATH=<path/to/libHalide.dylib> ./lesson_03
// If you have the entire Halide source tree, you can also build it by
// running:
// make tutorial_lesson_03_debugging_1
// in a shell with the current directory at the top of the halide
// source tree.
#include "Halide.h"
#include <stdio.h>
// This time we'll just import the entire Halide namespace
using namespace Halide;
int main(int argc, char **argv) {
// We'll start by defining the simple single-stage imaging
// pipeline from lesson 1.
// This lesson will be about debugging, but unfortunately in C++,
// objects don't know their own names, which makes it hard for us
// to understand the generated code. To get around this, you can
// pass a string to the Func and Var constructors to give them a
// name for debugging purposes.
Func gradient("gradient");
Var x("x"), y("y");
gradient(x, y) = x + y;
// Realize the function to produce an output image. We'll keep it
// very small for this lesson.
Buffer<int> output = gradient.realize({8, 8});
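// As a quick sanity check (the same verification loop used in
// lesson 1), confirm that every pixel really is x + y before we
// start poking at the compiler.
for (int j = 0; j < output.height(); j++) {
    for (int i = 0; i < output.width(); i++) {
        if (output(i, j) != i + j) {
            printf("Something went wrong!\n");
            return -1;
        }
    }
}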
// The realize call above compiled and ran the pipeline. Try running this
// lesson with the environment variable HL_DEBUG_CODEGEN set to
// 1. It will print out the various stages of compilation, and a
// pseudocode representation of the final pipeline.
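// For example, on Linux (an illustrative invocation; adjust the
// library path to your build, as in the compile commands above):
// HL_DEBUG_CODEGEN=1 LD_LIBRARY_PATH=<path/to/libHalide.so> ./lesson_03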
// The output from one run looked like the following. The exact
// set of lowering passes and the target string will vary with
// your Halide version and machine:
> Creating initial loop nests...
> Injecting realization of { gradient }
> Skipping injecting memoization...
> Injecting tracing...
> Adding checks for parameters
> Computing bounds of each function's value
> Clamping unsafe data-dependent accesses
> Performing computation bounds inference...
> Removing extern loops...
> Performing sliding window optimization...
> Uniquifying variable names...
> Simplifying...
> Simplifying correlated differences...
> Performing allocation bounds inference...
> Adding checks for images
> Removing code that depends on undef values...
> Performing storage folding optimization...
> Injecting debug_to_file calls...
> Injecting prefetches...
> Discarding safe promises...
> Dynamically skipping stages...
> Forking asynchronous producers...
> Destructuring tuple-valued realizations...
> Bounding small realizations...
> Performing storage flattening...
> Adding atomic mutex allocation...
> Unpacking buffer arguments...
> Skipping rewriting memoized allocations...
> Simplifying...
> Reduce prefetch dimension...
> Simplifying correlated differences...
> Unrolling...
> Vectorizing...
> Detecting vector interleavings...
> Partitioning loops to simplify boundary conditions...
> Staging strided loads...
> Trimming loops to the region over which they do something...
> Rebasing loops to zero...
> Hoisting loop invariant if statements...
> Injecting early frees...
> Simplifying correlated differences...
> Bounding small allocations...
> Simplifying...
> Lowering unsafe promises...
> Flattening nested ramps...
> Removing dead allocations and moving loop invariant code...
> Finding intrinsics...
> Hoisting prefetches...
> Lowering after final simplification:
> assert(reinterpret((struct halide_buffer_t *)gradient.buffer) != (uint64)0, halide_error_buffer_argument_is_null("gradient"))
> let gradient = (void *)_halide_buffer_get_host((struct halide_buffer_t *)gradient.buffer)
> let gradient.type = (uint32)_halide_buffer_get_type((struct halide_buffer_t *)gradient.buffer)
> let gradient.device_dirty = (uint1)_halide_buffer_get_device_dirty((struct halide_buffer_t *)gradient.buffer)
> let gradient.dimensions = _halide_buffer_get_dimensions((struct halide_buffer_t *)gradient.buffer)
> let gradient.min.0 = _halide_buffer_get_min((struct halide_buffer_t *)gradient.buffer, 0)
> let gradient.extent.0 = _halide_buffer_get_extent((struct halide_buffer_t *)gradient.buffer, 0)
> let gradient.stride.0 = _halide_buffer_get_stride((struct halide_buffer_t *)gradient.buffer, 0)
> let gradient.min.1 = _halide_buffer_get_min((struct halide_buffer_t *)gradient.buffer, 1)
> let gradient.extent.1 = _halide_buffer_get_extent((struct halide_buffer_t *)gradient.buffer, 1)
> let gradient.stride.1 = _halide_buffer_get_stride((struct halide_buffer_t *)gradient.buffer, 1)
> if ((uint1)_halide_buffer_is_bounds_query((struct halide_buffer_t *)gradient.buffer)) {
> (struct halide_buffer_t *)_halide_buffer_init((struct halide_buffer_t *)gradient.buffer, (struct halide_dimension_t *)_halide_buffer_get_shape((struct halide_buffer_t *)gradient.buffer), reinterpret<(void *)>((uint64)0), (uint64)0, reinterpret<(struct halide_device_interface_t *)>((uint64)0), 0, 32, 2, (struct halide_dimension_t *)make_struct(gradient.min.0, gradient.extent.0, 1, 0, gradient.min.1, gradient.extent.1, gradient.extent.0, 0), (uint64)0)
> }
> if (!(uint1)_halide_buffer_is_bounds_query((struct halide_buffer_t *)gradient.buffer)) {
> assert(gradient.type == (uint32)73728, halide_error_bad_type("Output buffer gradient", gradient.type, (uint32)73728))
> assert(gradient.dimensions == 2, halide_error_bad_dimensions("Output buffer gradient", gradient.dimensions, 2))
> assert(0 <= gradient.extent.0, halide_error_buffer_extents_negative("Output buffer gradient", 0, gradient.extent.0))
> assert(0 <= gradient.extent.1, halide_error_buffer_extents_negative("Output buffer gradient", 1, gradient.extent.1))
> assert(gradient.stride.0 == 1, halide_error_constraint_violated("gradient.stride.0", gradient.stride.0, "1", 1))
> let gradient.total_extent.1 = int64(gradient.extent.1)*int64(gradient.extent.0)
> assert(uint64(gradient.extent.0) <= (uint64)2147483647, halide_error_buffer_allocation_too_large("gradient", uint64(gradient.extent.0), (uint64)2147483647))
> assert((uint64)abs(int64(gradient.extent.1)*int64(gradient.stride.1)) <= (uint64)2147483647, halide_error_buffer_allocation_too_large("gradient", (uint64)abs(int64(gradient.extent.1)*int64(gradient.stride.1)), (uint64)2147483647))
> assert(gradient.total_extent.1 <= (int64)2147483647, halide_error_buffer_extents_too_large("gradient", gradient.total_extent.1, (int64)2147483647))
> assert(!gradient.device_dirty, halide_error_device_dirty_with_no_device_support("Output buffer gradient"))
> assert(gradient != reinterpret<(void *)>((uint64)0), halide_error_host_is_null("Output buffer gradient"))
> produce gradient {
> let t2 = 0 - (gradient.min.1*gradient.stride.1)
> let t1 = gradient.min.0 + gradient.min.1
> for (gradient.s0.y.rebased, 0, gradient.extent.1) {
> let t4 = ((gradient.min.1 + gradient.s0.y.rebased)*gradient.stride.1) + t2
> let t3 = gradient.s0.y.rebased + t1
> for (gradient.s0.x.rebased, 0, gradient.extent.0) {
> gradient[gradient.s0.x.rebased + t4] = gradient.s0.x.rebased + t3
> }
> }
> }
> }
> Skipping Hexagon offload...
> Skipping GPU offload...
> Lowering Parallel Tasks...
> Target triple of initial module: x86_64--linux-gnu
> Generating llvm bitcode...
> Generating llvm bitcode prolog for function gradient...
> Generating llvm bitcode for function gradient...
> JIT compiling shared runtime for x86-64-linux-tune_znver3-avx-avx2-f16c-fma-jit-sse41-user_context
> JIT compiling gradient for x86-64-linux-tune_znver3-avx-avx2-f16c-fma-jit-sse41-user_context
// If you set HL_DEBUG_CODEGEN to a higher number, you can see
// more and more details of how Halide compiles your pipeline.
// Setting HL_DEBUG_CODEGEN=2 shows the Halide code at each stage
// of compilation, and also the llvm bitcode we generate at the
// end.
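// For example (illustrative; Halide's debug output goes to
// stderr, so you may want to capture it in a file):
// HL_DEBUG_CODEGEN=2 ./lesson_03 2> codegen_log.txt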
// Halide will also output an HTML version of this output, which
// supports syntax highlighting and code-folding, so it can be
// nicer to read for large pipelines. Open gradient.stmt.html with
// your browser after running this tutorial.
gradient.compile_to_lowered_stmt("gradient.stmt.html", {}, HTML);
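// If you'd rather have plain text (easier to diff when comparing
// schedules), the same method accepts Text, the other
// StmtOutputFormat value. A sketch:
// gradient.compile_to_lowered_stmt("gradient.stmt", {}, Text);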
// You can usually figure out what code Halide is generating using
// this pseudocode. In the next lesson we'll see how to snoop on
// Halide at runtime.
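// For a quicker, schedule-level summary, we can also ask Halide
// to print just the loop structure of the Func. This is the same
// helper the later lessons use to visualize schedules.
gradient.print_loop_nest();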
printf("Success!\n");
return 0;
}