about | summary | refs | log | tree | commit | diff
path: root/final/Bitcode/Benchmarks/Halide/blur/driver.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'final/Bitcode/Benchmarks/Halide/blur/driver.cpp')
-rw-r--r-- final/Bitcode/Benchmarks/Halide/blur/driver.cpp 119
1 files changed, 119 insertions, 0 deletions
diff --git a/final/Bitcode/Benchmarks/Halide/blur/driver.cpp b/final/Bitcode/Benchmarks/Halide/blur/driver.cpp
new file mode 100644
index 00000000..61a7696b
--- /dev/null
+++ b/final/Bitcode/Benchmarks/Halide/blur/driver.cpp
@@ -0,0 +1,119 @@
+#include <emmintrin.h>
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+
+#include "../common/benchmark.h"
+#include "../common/halide_image.h"
+#include "../common/halide_image_io.h"
+#include "halide_blur.h"
+
+using namespace Halide::Tools;
+
+// Result of the most recent benchmark(...) call. Each blur variant below
+// assigns to it, and main() copies it out right after calling each variant.
+double t;
+
+// Straightforward scalar two-pass box blur.
+//
+// Pass 1 averages each pixel with its two right-hand neighbours into an
+// intermediate image; pass 2 averages each intermediate value with the two
+// values below it. The returned image is 8 columns narrower and 2 rows
+// shorter than `in`. Both passes run inside benchmark(10, 1, ...), and the
+// value benchmark returns is stored in the global `t`.
+Image<uint16_t> blur(Image<uint16_t> in) {
+    Image<uint16_t> horizontal(in.width() - 8, in.height());
+    Image<uint16_t> result(in.width() - 8, in.height() - 2);
+
+    t = benchmark(10, 1, [&]() {
+        // Pass 1: horizontal 3-tap average.
+        for (int row = 0; row < horizontal.height(); ++row) {
+            for (int col = 0; col < horizontal.width(); ++col) {
+                const int row_sum = in(col, row) + in(col + 1, row) + in(col + 2, row);
+                horizontal(col, row) = row_sum / 3;
+            }
+        }
+
+        // Pass 2: vertical 3-tap average over the intermediate image.
+        for (int row = 0; row < result.height(); ++row) {
+            for (int col = 0; col < result.width(); ++col) {
+                const int col_sum = horizontal(col, row) + horizontal(col, row + 1) + horizontal(col, row + 2);
+                result(col, row) = col_sum / 3;
+            }
+        }
+    });
+
+    return result;
+}
+
+
+// Hand-written SSE2 version of the two-pass box blur, tiled into blocks of
+// 128 columns by 32 rows. The output is 8 columns narrower and 2 rows shorter
+// than `in`. The work runs inside benchmark(10, 1, ...), and the value
+// benchmark returns is stored in the global `t`.
+//
+// NOTE(review): the tile loops step by whole 128x32 tiles with no remainder
+// handling, so this appears to assume out.width() is a multiple of 128 and
+// out.height() a multiple of 32 (true for the 6408x4802 input built in main).
+// NOTE(review): _mm_load_si128/_mm_store_si128 require 16-byte-aligned
+// addresses; presumably Image rows satisfy that at these sizes - confirm.
+Image<uint16_t> blur_fast(Image<uint16_t> in) {
+ Image<uint16_t> out(in.width()-8, in.height()-2);
+
+ t = benchmark(10, 1, [&]() {
+ // 21846 is 2^16/3 rounded up; _mm_mulhi_epi16 keeps the high 16 bits of
+ // the signed 16x16 product, so multiplying by it approximates dividing by 3.
+ __m128i one_third = _mm_set1_epi16(21846);
+#pragma omp parallel for
+ for (int yTile = 0; yTile < out.height(); yTile += 32) {
+ // Declared inside the loop body, so each strip iteration has its own copies.
+ __m128i a, b, c, sum, avg;
+ // Horizontally blurred rows of one tile: 128/8 vectors per row, for the
+ // 32 output rows plus the 2 extra rows the vertical pass reads below them.
+ __m128i tmp[(128/8) * (32 + 2)];
+ for (int xTile = 0; xTile < out.width(); xTile += 128) {
+ // Pass 1: horizontal 3-tap average of 34 input rows into tmp.
+ __m128i *tmpPtr = tmp;
+ for (int y = 0; y < 32+2; y++) {
+ const uint16_t *inPtr = &(in(xTile, yTile+y));
+ for (int x = 0; x < 128; x += 8) {
+ // a uses the aligned load; b and c start 1 and 2 pixels to the
+ // right, so they use the unaligned load.
+ a = _mm_load_si128((__m128i*)(inPtr));
+ b = _mm_loadu_si128((__m128i*)(inPtr+1));
+ c = _mm_loadu_si128((__m128i*)(inPtr+2));
+ sum = _mm_add_epi16(_mm_add_epi16(a, b), c);
+ avg = _mm_mulhi_epi16(sum, one_third);
+ _mm_store_si128(tmpPtr++, avg);
+ inPtr+=8;
+ }
+ }
+ // Pass 2: vertical 3-tap average of tmp into the output tile.
+ tmpPtr = tmp;
+ for (int y = 0; y < 32; y++) {
+ __m128i *outPtr = (__m128i *)(&(out(xTile, yTile+y)));
+ for (int x = 0; x < 128; x += 8) {
+ // One tmp row is 128/8 vectors, so these offsets address the same
+ // columns two rows down (a), one row down (b) and the current row (c).
+ a = _mm_load_si128(tmpPtr+(2*128)/8);
+ b = _mm_load_si128(tmpPtr+128/8);
+ c = _mm_load_si128(tmpPtr++);
+ sum = _mm_add_epi16(_mm_add_epi16(a, b), c);
+ avg = _mm_mulhi_epi16(sum, one_third);
+ _mm_store_si128(outPtr++, avg);
+ }
+ }
+ }
+ }
+ });
+
+ return out;
+}
+
+// Runs the ahead-of-time compiled halide_blur pipeline on `in` and returns an
+// output image 8 columns narrower and 2 rows shorter than the input. The
+// value returned by benchmark(10, 1, ...) is stored in the global `t`.
+Image<uint16_t> blur_halide(Image<uint16_t> in) {
+    Image<uint16_t> result(in.width() - 8, in.height() - 2);
+
+    // Computes the same output region as blur_fast (i.e., boundary conditions
+    // are still handled sloppily).
+    auto run_pipeline = [&]() {
+        halide_blur(in, result);
+    };
+
+    // Untimed first call so that Halide runtime initialisation is not measured.
+    run_pipeline();
+
+    t = benchmark(10, 1, run_pipeline);
+
+    return result;
+}
+
+// Benchmark driver: builds a pseudo-random 6408x4802 image, times the scalar,
+// SSE2 and Halide blurs, prints the three timings, and cross-checks the
+// interior of the three results.
+//
+// Returns 0 when all three implementations agree on every compared pixel and
+// 1 otherwise, so a verification failure is visible in the exit status as well
+// as in the printed "difference at" lines.
+int main(int argc, char **argv) {
+
+    // Every blur produces a (width-8) x (height-2) image, i.e. 6400x4800 here,
+    // which divides evenly into the 128x32 tiles blur_fast iterates over.
+    Image<uint16_t> input(6408, 4802);
+
+    // Mask to 12 bits: the sum of three pixels is then at most 3*4095 = 12285,
+    // which fits in uint16_t and stays positive in the signed 16-bit lanes that
+    // blur_fast multiplies with _mm_mulhi_epi16.
+    for (int y = 0; y < input.height(); y++) {
+        for (int x = 0; x < input.width(); x++) {
+            input(x, y) = rand() & 0xfff;
+        }
+    }
+
+    // Each variant leaves its timing in the global `t`; copy it out before the
+    // next variant overwrites it.
+    Image<uint16_t> blurry = blur(input);
+    double slow_time = t;
+
+    Image<uint16_t> speedy = blur_fast(input);
+    double fast_time = t;
+
+    Image<uint16_t> halide = blur_halide(input);
+    double halide_time = t;
+
+    printf("times: %f %f %f\n", slow_time, fast_time, halide_time);
+
+    // Compare only pixels at least 64 away from the input's edges, using the
+    // scalar result as the reference.
+    bool mismatch = false;
+    for (int y = 64; y < input.height() - 64; y++) {
+        for (int x = 64; x < input.width() - 64; x++) {
+            if (blurry(x, y) != speedy(x, y) || blurry(x, y) != halide(x, y)) {
+                printf("difference at (%d,%d): %d %d %d\n", x, y, blurry(x, y), speedy(x, y), halide(x, y));
+                mismatch = true;
+            }
+        }
+    }
+
+    // Report verification failure through the exit status.
+    return mismatch ? 1 : 0;
+}