small fixes and performance improvments

parent 10557c97
...@@ -6,7 +6,7 @@ OPENMPFLAGS= -fopenmp=libomp ...@@ -6,7 +6,7 @@ OPENMPFLAGS= -fopenmp=libomp
#CLFLAGS= -D CL_TARGET_OPENCL_VERSION=100 -lOpenCL #CLFLAGS= -D CL_TARGET_OPENCL_VERSION=100 -lOpenCL
CFLAGS := $(CFLAGS) -O3 -mavx -march=native CFLAGS := $(CFLAGS) -Ofast -mavx -march=native
CFLAGS := $(CFLAGS) $(OPENMPFLAGS) $(WARNING_FLAGS) CFLAGS := $(CFLAGS) $(OPENMPFLAGS) $(WARNING_FLAGS)
LDLIBS = $(shell pkg-config --libs sdl2 SDL2_ttf SDL2_image) LDLIBS = $(shell pkg-config --libs sdl2 SDL2_ttf SDL2_image)
DEBUG_FLAGS = -g -DDEBUG -pg -O0 DEBUG_FLAGS = -g -DDEBUG -pg -O0
......
...@@ -31,46 +31,42 @@ ...@@ -31,46 +31,42 @@
* 3.1 x86intrin.h DONE * 3.1 x86intrin.h DONE
*/ */
int int mandelbrot_avx(Complex c, unsigned max_iterations) {
mandelbrot_avx(Complex c, unsigned max_iterations) __m256d cr = _mm256_set1_pd(c.real);
{ __m256d ci = _mm256_set1_pd(c.imag);
__m256 cr = _mm256_set1_pd(c.real);
__m256 ci = _mm256_set1_pd(c.imag); __m256d zr = cr;
__m256d zi = ci;
__m256 zr = cr;
__m256 zi = ci; __m256d threshold = _mm256_set1_pd(4.0);
__m256d one = _mm256_set1_pd(1.0);
__m256 threshold = _mm256_set1_pd(4);
int k = 1;
unsigned int k = 1; __m256d mk = _mm256_set1_pd(k);
__m256 mk = _mm256_set1_pd(k); while (k < max_iterations) {
__m256 one = _mm256_set1_pd(1); // Compute z = z^2 + c
while (++k < max_iterations){ __m256d zr2 = _mm256_mul_pd(zr, zr);
/* Compute z1 from z0 */ __m256d zi2 = _mm256_mul_pd(zi, zi);
__m256 zr2 = _mm256_mul_pd(zr, zr); __m256d zrzi = _mm256_mul_pd(zr, zi);
__m256 zi2 = _mm256_mul_pd(zi, zi);
__m256 zrzi = _mm256_mul_pd(zr, zi);
/* zr1 = zr0 * zr0 - zi0 * zi0 + cr */ /* zr1 = zr0 * zr0 - zi0 * zi0 + cr */
/* zi1 = zr0 * zi0 + zr0 * zi0 + ci */ /* zi1 = zr0 * zi0 + zr0 * zi0 + ci */
zr = _mm256_add_pd(_mm256_sub_pd(zr2, zi2), cr); zr = _mm256_add_pd(_mm256_sub_pd(zr2, zi2), cr);
zi = _mm256_add_pd(_mm256_add_pd(zrzi, zrzi), ci); zi = _mm256_add_pd(_mm256_add_pd(zrzi, zrzi), ci);
/* Increment k */ // Check for escape
zr2 = _mm256_mul_pd(zr, zr); __m256d mag2 = _mm256_add_pd(zr2, zi2);
zi2 = _mm256_mul_pd(zi, zi); __m256d mask = _mm256_cmp_pd(mag2, threshold, _CMP_LT_OS);
__m256 mag2 = _mm256_add_pd(zr2, zi2);
__m256 mask = _mm256_cmp_pd(mag2, threshold, _CMP_LT_OS);
mk = _mm256_add_pd(_mm256_and_pd(mask, one), mk); mk = _mm256_add_pd(_mm256_and_pd(mask, one), mk);
/* Early bailout? */ if (_mm256_testz_pd(mask, _mm256_set1_pd(-1))) {
if (_mm256_testz_pd(mask, _mm256_set1_pd(-1)))
break; break;
}
k++;
} }
return k; return k;
} }
int int
mandelbrot(Complex c, unsigned int max_iterations) { mandelbrot(Complex c, unsigned int max_iterations) {
Complex z = {0.0, 0.0}; Complex z = {0.0, 0.0};
...@@ -100,16 +96,36 @@ typedef struct { ...@@ -100,16 +96,36 @@ typedef struct {
int size; int size;
}Array; }Array;
void Color
calculate_set(Array *arr, int height, int width, ViewInfo view, short unsigned use_avx, unsigned int max_iterations) calculate_color(unsigned int iterations, unsigned int max_iterations)
{ {
int (*mandelbrot_func)(Complex C, unsigned); Color cl;
if (iterations == max_iterations){
cl.r = 0;
cl.g = 0;
cl.b = 0;
cl.a = 0;
} else {
double t = (double)iterations / max_iterations;
t = 0.5 + 0.5 * tan(log(t + 0.0001) * 3.0);
cl.r = 255 * t * 0.5;
cl.g = 255 * t * 0.4;
cl.b = 255 * t * 0.3;
cl.a = 255;
}
return cl;
}
if (use_avx == 0) void
mandelbrot_func = mandelbrot; calculate_set(Array *arr, int height, int width, ViewInfo view, short unsigned use_avx, short unsigned use_omp, unsigned int max_iterations)
else {
mandelbrot_func = mandelbrot_avx; int (*mandelbrot_func)(Complex, unsigned) = use_avx == 0 ? mandelbrot : mandelbrot_avx;
if (use_omp == 0)
goto no_omp;
#pragma omp parallel for schedule(static, 16)
for (int y = 0; y < height; y++) { for (int y = 0; y < height; y++) {
for (int x = 0; x < width; x++) { for (int x = 0; x < width; x++) {
long double real = view.x_min + (x * (view.x_max - view.x_min)) / width; long double real = view.x_min + (x * (view.x_max - view.x_min)) / width;
...@@ -118,35 +134,12 @@ calculate_set(Array *arr, int height, int width, ViewInfo view, short unsigned u ...@@ -118,35 +134,12 @@ calculate_set(Array *arr, int height, int width, ViewInfo view, short unsigned u
unsigned int iterations = mandelbrot_func(c, max_iterations); unsigned int iterations = mandelbrot_func(c, max_iterations);
if (iterations == max_iterations){ arr->pointer[y * width + x] = calculate_color(iterations, max_iterations);
arr->pointer[y * width + x].r = 0;
arr->pointer[y * width + x].g = 0;
arr->pointer[y * width + x].b = 0;
arr->pointer[y * width + x].a = 0;
} else {
double t = (double)iterations / max_iterations;
t = 0.5 + 0.5 * tan(log(t + 0.0001) * 3.0);
arr->pointer[y * width + x].r = 255 * t * 0.5;
arr->pointer[y * width + x].g = 255 * t * 0.4;
arr->pointer[y * width + x].b = 255 * t;
arr->pointer[y * width + x].a = 255;
}
} }
} }
} return;
void no_omp:
calculate_set_omp(Array *arr, int height, int width, ViewInfo view, short unsigned use_avx, unsigned int max_iterations)
{
int (*mandelbrot_func)(Complex, unsigned);
if (use_avx == 0)
mandelbrot_func = mandelbrot;
else
mandelbrot_func = mandelbrot_avx;
#pragma omp parallel for schedule(guided)
for (int y = 0; y < height; y++) { for (int y = 0; y < height; y++) {
for (int x = 0; x < width; x++) { for (int x = 0; x < width; x++) {
long double real = view.x_min + (x * (view.x_max - view.x_min)) / width; long double real = view.x_min + (x * (view.x_max - view.x_min)) / width;
...@@ -154,21 +147,8 @@ calculate_set_omp(Array *arr, int height, int width, ViewInfo view, short unsign ...@@ -154,21 +147,8 @@ calculate_set_omp(Array *arr, int height, int width, ViewInfo view, short unsign
Complex c = {real, imag}; Complex c = {real, imag};
unsigned int iterations = mandelbrot_func(c, max_iterations); unsigned int iterations = mandelbrot_func(c, max_iterations);
if (iterations == max_iterations){ arr->pointer[y * width + x] = calculate_color(iterations, max_iterations);
arr->pointer[y * width + x].r = 0;
arr->pointer[y * width + x].g = 0;
arr->pointer[y * width + x].b = 0;
arr->pointer[y * width + x].a = 0;
} else {
double t = (double)iterations / max_iterations;
t = 0.5 + 0.5 * tan(log(t + 0.0001) * 3.0);
arr->pointer[y * width + x].r = 255 * t * 0.5;
arr->pointer[y * width + x].g = 255 * t * 0.4;
arr->pointer[y * width + x].b = 255 * t;
arr->pointer[y * width + x].a = 255;
}
} }
} }
} }
...@@ -284,14 +264,11 @@ parse_opt (int key, char *arg, struct argp_state *state) ...@@ -284,14 +264,11 @@ parse_opt (int key, char *arg, struct argp_state *state)
static struct argp argp = {options, parse_opt, NULL, NULL, NULL, NULL, NULL}; static struct argp argp = {options, parse_opt, NULL, NULL, NULL, NULL, NULL};
void void
render_cl(Array *arr, SDL_Texture *texture, SDL_Renderer *renderer, int width, int height, ViewInfo *view, render_cl(Array *arr, SDL_Texture *texture, SDL_Renderer *renderer, int width, int height, ViewInfo *view,
short unsigned use_avx, short unsigned use_omp, short unsigned *needs_recalc, unsigned int max_iterations) { short unsigned use_avx, short unsigned use_omp, short unsigned *needs_recalc, unsigned int max_iterations) {
void (*calculate_set_func)(Array *arr, int height, int width,
ViewInfo view, short unsigned use_avx, unsigned max_iterations);
calculate_set_func = use_omp ? calculate_set_omp : calculate_set;
SDL_SetRenderDrawColor(renderer, 0, 0, 50, 255); SDL_SetRenderDrawColor(renderer, 0, 0, 50, 255);
SDL_RenderClear(renderer); SDL_RenderClear(renderer);
...@@ -301,7 +278,7 @@ render_cl(Array *arr, SDL_Texture *texture, SDL_Renderer *renderer, int width, i ...@@ -301,7 +278,7 @@ render_cl(Array *arr, SDL_Texture *texture, SDL_Renderer *renderer, int width, i
arr->size = height * width * sizeof(Color); arr->size = height * width * sizeof(Color);
} }
if (*needs_recalc){ if (*needs_recalc){
calculate_set_func(arr, height, width, *view, use_avx, max_iterations); calculate_set(arr, height, width, *view, use_avx, use_omp, max_iterations);
*needs_recalc = 0; *needs_recalc = 0;
} }
...@@ -417,6 +394,11 @@ main(int argc, char **argv) ...@@ -417,6 +394,11 @@ main(int argc, char **argv)
int running = 1; int running = 1;
// draw the first frame
render_cl(&screen, app.texture, app.renderer, app.win_width, app.win_height, &view, args.use_avx, args.use_omp,
&app.needs_recalc, args.max_iterations);
SDL_RenderPresent(app.renderer);
while (running) while (running)
{ {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment