mandelbrot_avx func

parent 41403d09
CFLAGS = -Wno-unused-command-line-argument -Wno-missing-field-initializers -Wall -Wextra -Wpedantic -lm
CFLAGS = -Wno-unused-command-line-argument -Wno-missing-field-initializers -Wall -Wextra -Wpedantic -lm -mavx
OPENMPFLAGS= -fopenmp=libomp
#OPENMPFLAGS= -fopenmp
#CLFLAGS= -D CL_TARGET_OPENCL_VERSION=100 -lOpenCL
......
No preview for this file type
......@@ -5,6 +5,7 @@
#include <SDL2/SDL_surface.h>
#include <SDL2/SDL_timer.h>
#include <SDL2/SDL_video.h>
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>
#include <SDL2/SDL.h>
......@@ -24,24 +25,57 @@
* 2.1 texture tendering DONE
* 2.2 fix colors ???
* 3. employ SSE2
* 3.1 x86intrin.h
* 3.2 inline assembly
* 3.1 x86intrin.h DONE
*/
int
mandelbrot_avx(Complex c)
{
__m256 cr = _mm256_set1_ps(c.real);
__m256 ci = _mm256_set1_ps(c.imag);
__m256 zr = cr;
__m256 zi = ci;
__m256 threshold = _mm256_set1_ps(4);
int k = 1;
__m256 mk = _mm256_set1_ps(k);
__m256 one = _mm256_set1_ps(1);
while (++k < MAX_ITERATIONS){
/* Compute z1 from z0 */
__m256 zr2 = _mm256_mul_ps(zr, zr);
__m256 zi2 = _mm256_mul_ps(zi, zi);
__m256 zrzi = _mm256_mul_ps(zr, zi);
/* zr1 = zr0 * zr0 - zi0 * zi0 + cr */
/* zi1 = zr0 * zi0 + zr0 * zi0 + ci */
zr = _mm256_add_ps(_mm256_sub_ps(zr2, zi2), cr);
zi = _mm256_add_ps(_mm256_add_ps(zrzi, zrzi), ci);
/* Increment k */
zr2 = _mm256_mul_ps(zr, zr);
zi2 = _mm256_mul_ps(zi, zi);
__m256 mag2 = _mm256_add_ps(zr2, zi2);
__m256 mask = _mm256_cmp_ps(mag2, threshold, _CMP_LT_OS);
mk = _mm256_add_ps(_mm256_and_ps(mask, one), mk);
/* Early bailout? */
if (_mm256_testz_ps(mask, _mm256_set1_ps(-1)))
break;
}
return k;
}
int
mandelbrot(Complex c) {
// builb check
//if ((c.real + 1.0) * (c.real + 1.0) + c.imag * c.imag <= 0.25)
// return MAX_ITERATIONS;
Complex z = {0.0, 0.0};
int i;
for (i = 0; i < MAX_ITERATIONS; i++) {
double temp_real = z.real * z.real - z.imag * z.imag + c.real;
double temp_imag = 2 * z.real * z.imag + c.imag;
long double temp_real = z.real * z.real - z.imag * z.imag + c.real;
long double temp_imag = 2 * z.real * z.imag + c.imag;
z.real = temp_real;
z.imag = temp_imag;
......@@ -70,11 +104,11 @@ calculate_set(Array *arr, int height, int width, ViewInfo view)
#pragma omp parallel for schedule(guided)
for (int y = 0; y < height; y++) {
for (int x = 0; x < width; x++) {
double real = view.x_min + (x * (view.x_max - view.x_min)) / width;
double imag = view.y_min + (y * (view.y_max - view.y_min)) / height;
long double real = view.x_min + (x * (view.x_max - view.x_min)) / width;
long double imag = view.y_min + (y * (view.y_max - view.y_min)) / height;
Complex c = {real, imag};
int iterations = mandelbrot(c);
int iterations = mandelbrot_avx(c);
if (iterations == MAX_ITERATIONS){
arr->pointer[y * width + x].r = 0;
......
......@@ -9,8 +9,8 @@
#define MAX_ITERATIONS 250
typedef struct {
double real;
double imag;
long double real;
long double imag;
} Complex;
typedef struct{
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment