Quake III Arena inverse square root hack only works when compiled with optimizations enabled

05:41 21 Jan 2021

The Quake III Arena inverse square root hack only outperforms math.h 1/sqrt(X) when compiled with optimizations enabled:

float q_sqrt(float x) {
    float x2 = x * 0.5F;
    int i = *( int* )&x;                  // evil floating point bit hack
    i = 0x5f3759df - (i >> 1);            // what the fuck?
    x = *( float* )&i;
    x = x * ( 1.5F - ( (x2 * x * x) ) );  //1st iteration
  //y = y * ( 1.5F - ( (x2 * y * y) ) );  //2nd iteration, can be removed
    return x;
}

To test how fast 1/sqrt(x) runs compared to q_sqrt(x):

//qtest.c
#include 
#include 
#include 
#include 

/*
Implementation of 1/sqrt(x) used in tue quake III game
*/
float q_sqrt(float x) {
    float x2 = x * 0.5F;
    int i = *( int* )&x;                  // evil floating point bit hack
    i = 0x5f3759df - (i >> 1);            // what the fuck?
    x = *( float* )&i;
    x = x * ( 1.5F - ( (x2 * x * x) ) );  //1st iteration
  //y = y * ( 1.5F - ( (x2 * y * y) ) );  //2nd iteration, can be removed
    return x;
}


int main(int argc, char *argv[]) {
    struct timespec start, stop;
    //Will work on floats in the range [0,100]
    float maxn = 100;
    //Work on 10000 random floats or as many as user provides
    size_t num = 10000;
    //Bogus
    float ans = 0;
    //Measure nanoseconds
    size_t ns = 0;
    if (argc > 1)
        num = atoll(argv[1]);
    if (num <= 0) return -1;
    //Compute "num" random floats
    float *vecs = malloc(num * sizeof(float));
    if (!vecs) return -1;
    for (int i = 0; i < num; i++)
        vecs[i] = maxn * ( (float)rand() / (float)RAND_MAX );

    fprintf(stderr, "Measuring 1/sqrt(x)\n");
    clock_gettime( CLOCK_REALTIME, &start);
    for (size_t i = 0; i < num; i++)
        ans += 1 / sqrt(vecs[i]);
    clock_gettime( CLOCK_REALTIME, &stop);
    ns = ( stop.tv_sec - start.tv_sec ) * 1E9 + ( stop.tv_nsec - start.tv_nsec );
    fprintf(stderr, "1/sqrt(x) took %.6f nanosecods\n", (double)ns/num );


    fprintf(stderr, "Measuring q_sqrt(x)\n");
    clock_gettime( CLOCK_REALTIME, &start);
    for (size_t i = 0; i < num; i++)
        ans += q_sqrt(vecs[i]);
    clock_gettime( CLOCK_REALTIME, &stop);
    ns = ( stop.tv_sec - start.tv_sec ) * 1E9 + ( stop.tv_nsec - start.tv_nsec );
    fprintf(stderr, "q_sqrt(x) took %.6f nanosecods\n", (double)ns/num );

    //Side by side
  //for (size_t i = 0; i < num; i++)
  //    fprintf(stdout, "%.6f\t%.6f\n", 1/sqrt(vecs[i]),q_sqrt(vecs[i]));
    free(vecs);
}

On an AMD Ryzen 7 3700X I get:

gcc -Wall -pedantic -o qtest qtest.c -lm
./qtest
Measuring 1/sqrt(x)
1/sqrt(x) took 4.470000 nanosecods
Measuring q_sqrt(x)
q_sqrt(x) took 4.859000 nanosecods


gcc -Wall -pedantic -O1 -o qtest qtest.c -lm
./qtest
Measuring 1/sqrt(x)
1/sqrt(x) took 0.378000 nanosecods
Measuring q_sqrt(x)
q_sqrt(x) took 0.497000 nanosecods


gcc -Wall -pedantic -O2 -o qtest qtest.c -lm
qtest.c: In function ‘q_sqrt’:
qtest.c:11:14: warning: dereferencing type-punned pointer will break strict-aliasing rules [-Wstrict-aliasing]
  11 |     int i = *( int* )&x;                  // evil floating point bit hack
     |
qtest.c:13:10: warning: dereferencing type-punned pointer will break strict-aliasing rules [-Wstrict-aliasing]
  13 |     x = *( float* )&i;
     |
./qtest
Measuring 1/sqrt(x)
1/sqrt(x) took 0.500000 nanosecods
Measuring q_sqrt(x)
q_sqrt(x) took 0.002000 nanosecods

My expectation was that q_sqrt(x) works better than 1/sqrt(X). So either libm is better optimized or my CPU is equipped with a hardware solution for sqrt(X) (CPUs have changed since the quick inverse root hack).

What optimizations would the compiler be applying to make it so much faster? Maybe my benchmark is ill conceived.

c sqrt quake

Your Answer

Privacy & Cookie Consent