The Quake III Arena inverse square root hack only outperforms math.h 1/sqrt(X) when compiled with optimizations enabled:
float q_sqrt(float x) {
float x2 = x * 0.5F;
int i = *( int* )&x; // evil floating point bit hack
i = 0x5f3759df - (i >> 1); // what the fuck?
x = *( float* )&i;
x = x * ( 1.5F - ( (x2 * x * x) ) ); //1st iteration
//y = y * ( 1.5F - ( (x2 * y * y) ) ); //2nd iteration, can be removed
return x;
}
To test how fast 1/sqrt(x) runs compared to q_sqrt(x):
//qtest.c
#include
#include
#include
#include
/*
Implementation of 1/sqrt(x) used in tue quake III game
*/
float q_sqrt(float x) {
float x2 = x * 0.5F;
int i = *( int* )&x; // evil floating point bit hack
i = 0x5f3759df - (i >> 1); // what the fuck?
x = *( float* )&i;
x = x * ( 1.5F - ( (x2 * x * x) ) ); //1st iteration
//y = y * ( 1.5F - ( (x2 * y * y) ) ); //2nd iteration, can be removed
return x;
}
int main(int argc, char *argv[]) {
struct timespec start, stop;
//Will work on floats in the range [0,100]
float maxn = 100;
//Work on 10000 random floats or as many as user provides
size_t num = 10000;
//Bogus
float ans = 0;
//Measure nanoseconds
size_t ns = 0;
if (argc > 1)
num = atoll(argv[1]);
if (num <= 0) return -1;
//Compute "num" random floats
float *vecs = malloc(num * sizeof(float));
if (!vecs) return -1;
for (int i = 0; i < num; i++)
vecs[i] = maxn * ( (float)rand() / (float)RAND_MAX );
fprintf(stderr, "Measuring 1/sqrt(x)\n");
clock_gettime( CLOCK_REALTIME, &start);
for (size_t i = 0; i < num; i++)
ans += 1 / sqrt(vecs[i]);
clock_gettime( CLOCK_REALTIME, &stop);
ns = ( stop.tv_sec - start.tv_sec ) * 1E9 + ( stop.tv_nsec - start.tv_nsec );
fprintf(stderr, "1/sqrt(x) took %.6f nanosecods\n", (double)ns/num );
fprintf(stderr, "Measuring q_sqrt(x)\n");
clock_gettime( CLOCK_REALTIME, &start);
for (size_t i = 0; i < num; i++)
ans += q_sqrt(vecs[i]);
clock_gettime( CLOCK_REALTIME, &stop);
ns = ( stop.tv_sec - start.tv_sec ) * 1E9 + ( stop.tv_nsec - start.tv_nsec );
fprintf(stderr, "q_sqrt(x) took %.6f nanosecods\n", (double)ns/num );
//Side by side
//for (size_t i = 0; i < num; i++)
// fprintf(stdout, "%.6f\t%.6f\n", 1/sqrt(vecs[i]),q_sqrt(vecs[i]));
free(vecs);
}
On an AMD Ryzen 7 3700X I get:
gcc -Wall -pedantic -o qtest qtest.c -lm
./qtest
Measuring 1/sqrt(x)
1/sqrt(x) took 4.470000 nanosecods
Measuring q_sqrt(x)
q_sqrt(x) took 4.859000 nanosecods
gcc -Wall -pedantic -O1 -o qtest qtest.c -lm
./qtest
Measuring 1/sqrt(x)
1/sqrt(x) took 0.378000 nanosecods
Measuring q_sqrt(x)
q_sqrt(x) took 0.497000 nanosecods
gcc -Wall -pedantic -O2 -o qtest qtest.c -lm
qtest.c: In function ‘q_sqrt’:
qtest.c:11:14: warning: dereferencing type-punned pointer will break strict-aliasing rules [-Wstrict-aliasing]
11 | int i = *( int* )&x; // evil floating point bit hack
|
qtest.c:13:10: warning: dereferencing type-punned pointer will break strict-aliasing rules [-Wstrict-aliasing]
13 | x = *( float* )&i;
|
./qtest
Measuring 1/sqrt(x)
1/sqrt(x) took 0.500000 nanosecods
Measuring q_sqrt(x)
q_sqrt(x) took 0.002000 nanosecods
My expectation was that q_sqrt(x) works better than 1/sqrt(X). So either libm is better optimized or my CPU is equipped with a hardware solution for sqrt(X) (CPUs have changed since the quick inverse root hack).
What optimizations would the compiler be applying to make it so much faster? Maybe my benchmark is ill conceived.