#include <stdio.h>
#include "Halide.h"
#include "clock.h"
#include "halide_image_io.h"
using namespace Halide::Tools;
Var x, y, c, i, ii, xo, yo, xi, yi;
class MyPipeline {
public:
Func lut, padded, padded16, sharpen, curved;
: input(in) {
lut(i) = cast<uint8_t>(clamp(pow(i / 255.0f, 1.2f) * 255.0f, 0, 255));
padded(x, y, c) = input(clamp(x, 0, input.width() - 1),
clamp(y, 0, input.height() - 1), c);
padded16(x, y, c) = cast<uint16_t>(padded(x, y, c));
sharpen(x, y, c) = (padded16(x, y, c) * 2 -
(padded16(x - 1, y, c) +
padded16(x, y - 1, c) +
padded16(x + 1, y, c) +
padded16(x, y + 1, c)) /
4);
curved(x, y, c) = lut(sharpen(x, y, c));
}
void schedule_for_cpu() {
curved.
split(y, yo, yi, 16)
Target target = get_host_target();
}
bool schedule_for_gpu() {
Target target = find_gpu_target();
return false;
}
lut.
split(i, block, thread, 16);
curved.
gpu_tile(x, y, xo, yo, xi, yi, 8, 8);
printf(
"Target: %s\n", target.
to_string().c_str());
return true;
}
void test_performance() {
double best_time = 0.0;
for (int i = 0; i < 3; i++) {
double t1 = current_time();
for (int j = 0; j < 100; j++) {
}
output.copy_to_host();
double t2 = current_time();
double elapsed = (t2 - t1) / 100;
if (i == 0 || elapsed < best_time) {
best_time = elapsed;
}
}
printf("%1.4f milliseconds\n", best_time);
}
curved.
realize({input.width(), input.height(), input.channels()});
for (int c = 0; c < input.channels(); c++) {
for (int y = 0; y < input.height(); y++) {
for (int x = 0; x < input.width(); x++) {
if (output(x, y, c) != reference_output(x, y, c)) {
printf("Mismatch between output (%d) and "
"reference output (%d) at %d, %d, %d\n",
output(x, y, c),
reference_output(x, y, c),
x, y, c);
exit(1);
}
}
}
}
}
};
int main(int argc, char **argv) {
Buffer<uint8_t> reference_output(input.width(), input.height(), input.channels());
printf("Running pipeline on CPU:\n");
MyPipeline p1(input);
p1.schedule_for_cpu();
p1.curved.realize(reference_output);
printf("Running pipeline on GPU:\n");
MyPipeline p2(input);
bool has_gpu_target = p2.schedule_for_gpu();
if (has_gpu_target) {
printf("Testing GPU correctness:\n");
p2.test_correctness(reference_output);
} else {
printf("No GPU target available on the host\n");
}
printf("Testing performance on CPU:\n");
p1.test_performance();
if (has_gpu_target) {
printf("Testing performance on GPU:\n");
p2.test_performance();
}
return 0;
}
std::vector<Target::Feature> features_to_try;
if (target.
os == Target::Windows) {
if (sizeof(void*) == 8) {
features_to_try.push_back(Target::D3D12Compute);
}
features_to_try.push_back(Target::OpenCL);
}
else if (target.
os == Target::OSX) {
features_to_try.push_back(Target::Metal);
} else {
features_to_try.push_back(Target::OpenCL);
}
return new_target;
}
}
printf("Requested GPU(s) are not supported. (Do you have the proper hardware and/or driver installed?)\n");
return target;
}
A Halide::Buffer is a named shared reference to a Halide::Runtime::Buffer.
Func & gpu_blocks(const VarOrRVar &block_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Tell Halide that the following dimensions correspond to GPU block indices.
void compile_jit(const Target &target=get_jit_target_from_environment())
Eagerly jit compile the function to machine code.
Func & reorder(const std::vector< VarOrRVar > &vars)
Reorder variables to have the given nesting order, from innermost out.
Func & split(const VarOrRVar &old, const VarOrRVar &outer, const VarOrRVar &inner, const Expr &factor, TailStrategy tail=TailStrategy::Auto)
Split a dimension into inner and outer subdimensions with the given names, where the inner dimension ...
Func & compute_root()
Compute all of this function once ahead of time.
Func & unroll(const VarOrRVar &var)
Mark a dimension to be completely unrolled.
Func & store_at(const Func &f, const Var &var)
Allocate storage for this function within f's loop over var.
Realization realize(std::vector< int32_t > sizes={}, const Target &target=Target())
Evaluate this function over some rectangular domain and return the resulting buffer or buffers.
Func & gpu_threads(const VarOrRVar &thread_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Tell Halide that the following dimensions correspond to GPU thread indices.
Func & parallel(const VarOrRVar &var)
Mark a dimension to be traversed in parallel.
Func & vectorize(const VarOrRVar &var)
Mark a dimension to be computed all-at-once as a single vector.
Func & bound(const Var &var, Expr min, Expr extent)
Statically declare that the range over which a function should be evaluated is given by the second an...
Func & compute_at(const Func &f, const Var &var)
Compute this function as needed for each unique value of the given var for the given calling function...
Func & gpu_tile(const VarOrRVar &x, const VarOrRVar &bx, const VarOrRVar &tx, const Expr &x_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Short-hand for tiling a domain and mapping the tile indices to GPU block indices and the coordinates ...
A Halide variable, to be used when defining functions.
This file defines the class FunctionDAG, which is our representation of a Halide pipeline,...
Target get_host_target()
Return the target corresponding to the host machine.
bool host_supports_target_device(const Target &t)
This attempts to sniff whether a given Target (and its implied DeviceAPI) is usable on the current ho...
A struct representing a target machine and os to generate code for.
bool has_gpu_feature() const
Is a fully feature GPU compute runtime enabled? I.e.
enum Halide::Target::OS os
std::string to_string() const
Convert the Target into a string form that can be reconstituted by merge_string(),...
Feature
Optional features a target can have.
Target with_feature(Feature f) const
Return a copy of the target with the given feature set.