small fixes and performance improvments

parent 10557c97
......@@ -6,7 +6,7 @@ OPENMPFLAGS= -fopenmp=libomp
#CLFLAGS= -D CL_TARGET_OPENCL_VERSION=100 -lOpenCL
CFLAGS := $(CFLAGS) -O3 -mavx -march=native
CFLAGS := $(CFLAGS) -Ofast -mavx -march=native
CFLAGS := $(CFLAGS) $(OPENMPFLAGS) $(WARNING_FLAGS)
LDLIBS = $(shell pkg-config --libs sdl2 SDL2_ttf SDL2_image)
DEBUG_FLAGS = -g -DDEBUG -pg -O0
......
......@@ -31,46 +31,42 @@
* 3.1 x86intrin.h DONE
*/
int
mandelbrot_avx(Complex c, unsigned max_iterations)
{
__m256 cr = _mm256_set1_pd(c.real);
__m256 ci = _mm256_set1_pd(c.imag);
__m256 zr = cr;
__m256 zi = ci;
__m256 threshold = _mm256_set1_pd(4);
unsigned int k = 1;
__m256 mk = _mm256_set1_pd(k);
__m256 one = _mm256_set1_pd(1);
while (++k < max_iterations){
/* Compute z1 from z0 */
__m256 zr2 = _mm256_mul_pd(zr, zr);
__m256 zi2 = _mm256_mul_pd(zi, zi);
__m256 zrzi = _mm256_mul_pd(zr, zi);
int mandelbrot_avx(Complex c, unsigned max_iterations) {
__m256d cr = _mm256_set1_pd(c.real);
__m256d ci = _mm256_set1_pd(c.imag);
__m256d zr = cr;
__m256d zi = ci;
__m256d threshold = _mm256_set1_pd(4.0);
__m256d one = _mm256_set1_pd(1.0);
int k = 1;
__m256d mk = _mm256_set1_pd(k);
while (k < max_iterations) {
// Compute z = z^2 + c
__m256d zr2 = _mm256_mul_pd(zr, zr);
__m256d zi2 = _mm256_mul_pd(zi, zi);
__m256d zrzi = _mm256_mul_pd(zr, zi);
/* zr1 = zr0 * zr0 - zi0 * zi0 + cr */
/* zi1 = zr0 * zi0 + zr0 * zi0 + ci */
/* zi1 = zr0 * zi0 + zr0 * zi0 + ci */
zr = _mm256_add_pd(_mm256_sub_pd(zr2, zi2), cr);
zi = _mm256_add_pd(_mm256_add_pd(zrzi, zrzi), ci);
/* Increment k */
zr2 = _mm256_mul_pd(zr, zr);
zi2 = _mm256_mul_pd(zi, zi);
__m256 mag2 = _mm256_add_pd(zr2, zi2);
__m256 mask = _mm256_cmp_pd(mag2, threshold, _CMP_LT_OS);
// Check for escape
__m256d mag2 = _mm256_add_pd(zr2, zi2);
__m256d mask = _mm256_cmp_pd(mag2, threshold, _CMP_LT_OS);
mk = _mm256_add_pd(_mm256_and_pd(mask, one), mk);
/* Early bailout? */
if (_mm256_testz_pd(mask, _mm256_set1_pd(-1)))
if (_mm256_testz_pd(mask, _mm256_set1_pd(-1))) {
break;
}
k++;
}
return k;
}
int
mandelbrot(Complex c, unsigned int max_iterations) {
Complex z = {0.0, 0.0};
......@@ -100,16 +96,36 @@ typedef struct {
int size;
}Array;
void
calculate_set(Array *arr, int height, int width, ViewInfo view, short unsigned use_avx, unsigned int max_iterations)
Color
calculate_color(unsigned int iterations, unsigned int max_iterations)
{
int (*mandelbrot_func)(Complex C, unsigned);
Color cl;
if (iterations == max_iterations){
cl.r = 0;
cl.g = 0;
cl.b = 0;
cl.a = 0;
} else {
double t = (double)iterations / max_iterations;
t = 0.5 + 0.5 * tan(log(t + 0.0001) * 3.0);
cl.r = 255 * t * 0.5;
cl.g = 255 * t * 0.4;
cl.b = 255 * t * 0.3;
cl.a = 255;
}
return cl;
}
if (use_avx == 0)
mandelbrot_func = mandelbrot;
else
mandelbrot_func = mandelbrot_avx;
void
calculate_set(Array *arr, int height, int width, ViewInfo view, short unsigned use_avx, short unsigned use_omp, unsigned int max_iterations)
{
int (*mandelbrot_func)(Complex, unsigned) = use_avx == 0 ? mandelbrot : mandelbrot_avx;
if (use_omp == 0)
goto no_omp;
#pragma omp parallel for schedule(static, 16)
for (int y = 0; y < height; y++) {
for (int x = 0; x < width; x++) {
long double real = view.x_min + (x * (view.x_max - view.x_min)) / width;
......@@ -118,35 +134,12 @@ calculate_set(Array *arr, int height, int width, ViewInfo view, short unsigned u
unsigned int iterations = mandelbrot_func(c, max_iterations);
if (iterations == max_iterations){
arr->pointer[y * width + x].r = 0;
arr->pointer[y * width + x].g = 0;
arr->pointer[y * width + x].b = 0;
arr->pointer[y * width + x].a = 0;
} else {
double t = (double)iterations / max_iterations;
t = 0.5 + 0.5 * tan(log(t + 0.0001) * 3.0);
arr->pointer[y * width + x].r = 255 * t * 0.5;
arr->pointer[y * width + x].g = 255 * t * 0.4;
arr->pointer[y * width + x].b = 255 * t;
arr->pointer[y * width + x].a = 255;
}
arr->pointer[y * width + x] = calculate_color(iterations, max_iterations);
}
}
}
return;
void
calculate_set_omp(Array *arr, int height, int width, ViewInfo view, short unsigned use_avx, unsigned int max_iterations)
{
int (*mandelbrot_func)(Complex, unsigned);
if (use_avx == 0)
mandelbrot_func = mandelbrot;
else
mandelbrot_func = mandelbrot_avx;
#pragma omp parallel for schedule(guided)
no_omp:
for (int y = 0; y < height; y++) {
for (int x = 0; x < width; x++) {
long double real = view.x_min + (x * (view.x_max - view.x_min)) / width;
......@@ -154,21 +147,8 @@ calculate_set_omp(Array *arr, int height, int width, ViewInfo view, short unsign
Complex c = {real, imag};
unsigned int iterations = mandelbrot_func(c, max_iterations);
if (iterations == max_iterations){
arr->pointer[y * width + x].r = 0;
arr->pointer[y * width + x].g = 0;
arr->pointer[y * width + x].b = 0;
arr->pointer[y * width + x].a = 0;
} else {
double t = (double)iterations / max_iterations;
t = 0.5 + 0.5 * tan(log(t + 0.0001) * 3.0);
arr->pointer[y * width + x].r = 255 * t * 0.5;
arr->pointer[y * width + x].g = 255 * t * 0.4;
arr->pointer[y * width + x].b = 255 * t;
arr->pointer[y * width + x].a = 255;
}
arr->pointer[y * width + x] = calculate_color(iterations, max_iterations);
}
}
}
......@@ -284,14 +264,11 @@ parse_opt (int key, char *arg, struct argp_state *state)
static struct argp argp = {options, parse_opt, NULL, NULL, NULL, NULL, NULL};
void
render_cl(Array *arr, SDL_Texture *texture, SDL_Renderer *renderer, int width, int height, ViewInfo *view,
short unsigned use_avx, short unsigned use_omp, short unsigned *needs_recalc, unsigned int max_iterations) {
void (*calculate_set_func)(Array *arr, int height, int width,
ViewInfo view, short unsigned use_avx, unsigned max_iterations);
calculate_set_func = use_omp ? calculate_set_omp : calculate_set;
SDL_SetRenderDrawColor(renderer, 0, 0, 50, 255);
SDL_RenderClear(renderer);
......@@ -301,7 +278,7 @@ render_cl(Array *arr, SDL_Texture *texture, SDL_Renderer *renderer, int width, i
arr->size = height * width * sizeof(Color);
}
if (*needs_recalc){
calculate_set_func(arr, height, width, *view, use_avx, max_iterations);
calculate_set(arr, height, width, *view, use_avx, use_omp, max_iterations);
*needs_recalc = 0;
}
......@@ -417,6 +394,11 @@ main(int argc, char **argv)
int running = 1;
// draw the first frame
render_cl(&screen, app.texture, app.renderer, app.win_width, app.win_height, &view, args.use_avx, args.use_omp,
&app.needs_recalc, args.max_iterations);
SDL_RenderPresent(app.renderer);
while (running)
{
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment