golang and C with SIMD instructions
04:41 24 Feb 2025

I am quite new to Go, and I am doing some experiments with Go and cgo. I wanted to see whether Go can benefit from calling a C routine that uses SIMD operations, using a simple test. However, what I saw is that the plain Go implementation is faster than the SIMD-optimized C code called from Go through cgo. I have heard that there is a penalty for calling C from Go, but I couldn't find any detailed information. Here are my samples; any advice would be helpful.

golang - cgo test:

package cutils

/*
#cgo CFLAGS: -mavx2 -Wall -O0
#include 
#include 
// foo returns the __FILE__ string of the translation unit it is compiled in.
const char* foo(void) {
    const char* path = __FILE__;
    return path;
}

#define FLOATS_IN_AVX_REG 8

// simdAdd writes a[k] + b[k] into out[k] for every k in [0, len).
// Whole groups of FLOATS_IN_AVX_REG floats are added with 256-bit unaligned
// AVX loads/stores; the leftover elements are added one at a time.
void simdAdd(float* out,  float* a,  float* b, unsigned long len) {
    // Largest multiple of FLOATS_IN_AVX_REG that does not exceed len.
    const unsigned long vecEnd = len - (len % FLOATS_IN_AVX_REG);
    unsigned long pos = 0;

    while (pos < vecEnd) {
        const __m256 lhs = _mm256_loadu_ps(a + pos);
        const __m256 rhs = _mm256_loadu_ps(b + pos);
        _mm256_storeu_ps(out + pos, _mm256_add_ps(lhs, rhs));
        pos += FLOATS_IN_AVX_REG;
    }

    // Scalar tail for the elements that do not fill a full register.
    while (pos < len) {
        out[pos] = a[pos] + b[pos];
        pos++;
    }
}

*/
import "C"
import (
    "fmt"
    "unsafe"
)
 

// NoSimdAdd returns a new slice of length l whose i-th element is a[i] + b[i].
//
// Only the first min(l, len(a), len(b)) elements are computed; any remaining
// elements of the result keep their zero value. Clamping the loop bound keeps
// the function from indexing past the end of res or b when the arguments
// disagree in length.
func NoSimdAdd(a, b []float32, l uint32) []float32 {
	res := make([]float32, l)

	// Number of elements that exist in all three slices.
	n := len(res)
	if len(a) < n {
		n = len(a)
	}
	if len(b) < n {
		n = len(b)
	}

	// Reslice everything to the common length once so each index in the loop
	// is known to be in range.
	a, b, out := a[:n], b[:n], res[:n]
	for i := range out {
		out[i] = a[i] + b[i]
	}
	return res
}

// SIMDadd returns a new slice of length l holding a[i] + b[i], computed by the
// C function simdAdd through cgo.
//
// The element count handed to C is clamped to min(l, len(a), len(b)) so the C
// code is never asked to touch memory beyond the end of a Go slice; elements
// past that count keep their zero value. When the count is zero the cgo call
// is skipped, because taking &s[0] of an empty slice panics.
func SIMDadd(a, b []float32, l uint32) []float32 {
	res := make([]float32, l)

	// Number of elements that exist in all three slices.
	n := len(res)
	if len(a) < n {
		n = len(a)
	}
	if len(b) < n {
		n = len(b)
	}
	if n == 0 {
		return res
	}

	C.simdAdd(
		(*C.float)(unsafe.Pointer(&res[0])),
		(*C.float)(unsafe.Pointer(&a[0])),
		(*C.float)(unsafe.Pointer(&b[0])),
		C.ulong(n))
	return res
}

And the main test:

package main

import (
    "ex1/files/cutils"
    "fmt"
    "math/rand/v2"
    "time"
)

func genFArray(s uint32) []float32 {
    r := make([]float32, s)
    for i := range s {
        r[i] = 1.0 * rand.Float32() * (10.0 - 1.0)
    }
    return r
}

const (
    TEST_SIZE = 16
    LOOP_CNT  = 1000000
)

func main() {

    a := genFArray(1 << TEST_SIZE)
    start := time.Now()
    for i := 0; i < LOOP_CNT; i++ {
        cutils.SIMDadd(a, a, 1<

The original C program, which demonstrates the benefit of the SIMD optimization, is shown here as well:

#include 
#include 
#include 
#include                 // for gettimeofday()

#define FLOATS_IN_AVX_REG 8
#define ARR_SIZE (1 << 16)
#define LOOP_CNT 1000000

void plain_add(float* out,  float* a,  float* b, unsigned long len);

void simd_add(float* out,  float* a,  float* b, unsigned long len) ;

float func_Uniform(float) ;


// Benchmark driver: fills an input array with values from func_Uniform(1.0),
// then times LOOP_CNT calls of simd_add that add the array to itself, and
// prints the elapsed wall-clock time in milliseconds.
int main(void) {
    float input[ARR_SIZE];
    float output[ARR_SIZE];
    struct timeval begin, end;
    double elapsedMs;
    unsigned long idx, iter;

    idx = 0;
    while (idx < ARR_SIZE) {
        input[idx] = func_Uniform(1.0);
        idx++;
    }
    printf("Start test\r\n");

    gettimeofday(&begin, NULL);
    for (iter = 0; iter < LOOP_CNT; iter++) {
        simd_add(output, input, input, ARR_SIZE);
    }
    gettimeofday(&end, NULL);

    // Whole seconds converted to ms, plus the microsecond remainder converted to ms.
    elapsedMs = (end.tv_sec - begin.tv_sec) * 1000.0
              + (end.tv_usec - begin.tv_usec) / 1000.0;
    printf("Time elapsed %f ms.\n", elapsedMs);
    return 0;
}

// func_Uniform returns rand() divided by RAND_MAX (both converted to float),
// multiplied by a.
float func_Uniform(float a) {
    const float fraction = (float)rand() / (float)(RAND_MAX);
    return fraction * a;
}

// plain_add writes a[k] + b[k] into out[k] for every k in [0, len),
// one element at a time.
void plain_add(float* out,  float* a,  float* b, unsigned long len)
{
    unsigned long k = 0;
    while (k < len) {
        out[k] = a[k] + b[k];
        k++;
    }
}


// simd_add writes a[k] + b[k] into out[k] for every k in [0, len).
// Full groups of FLOATS_IN_AVX_REG floats go through 256-bit unaligned AVX
// loads, one vector add and an unaligned store; whatever is left over is
// handled by a scalar loop.
void simd_add(float* out,  float* a,  float* b, unsigned long len) {
    const unsigned long fullGroups = len / FLOATS_IN_AVX_REG;
    const unsigned long vecEnd = fullGroups * FLOATS_IN_AVX_REG;
    unsigned long pos;

    for (pos = 0; pos < vecEnd; pos += FLOATS_IN_AVX_REG) {
        const __m256 lhs = _mm256_loadu_ps(a + pos);
        const __m256 rhs = _mm256_loadu_ps(b + pos);
        const __m256 sum = _mm256_add_ps(lhs, rhs);
        _mm256_storeu_ps(out + pos, sum);
    }

    // Scalar tail: pos continues from where the vector loop stopped.
    while (pos < len) {
        out[pos] = a[pos] + b[pos];
        pos++;
    }
}

So, in the pure C program, there is a noticeable difference between the SIMD and non-SIMD versions:

no simd
Start test
Time elapsed 206796.292000 ms.

simd
Start test
Time elapsed 84095.521000 ms.

In the Go experiment, however, the timings vary from run to run, but the SIMD version is slower.

Thanks.

[EDIT]: I've changed the test a bit and saw some improvement. Also, instead of using `go run .`, I've built the binary so that all the code is statically linked, hoping for the best. I've also moved the loop into C by passing the loop count directly, changing the C code to:

// simdAddTest calls simdAdd(out, a, b, len) loopcnt times in a row.
void simdAddTest(float* out, float* a, float* b, unsigned long len, unsigned long loopcnt) {
    unsigned long remaining = loopcnt;
    while (remaining > 0) {
        simdAdd(out, a, b, len);
        remaining--;
    }
}

and call it from golang only one time:

// SIMDAdd2 returns a new slice of length l holding a[i] + b[i], computed by
// the C function simdAddTest, which is called once through cgo and is passed
// cnt as its loop count.
//
// The element count handed to C is clamped to min(l, len(a), len(b)) so the C
// code is never asked to touch memory beyond the end of a Go slice; elements
// past that count keep their zero value. When the count is zero the cgo call
// is skipped, because taking &s[0] of an empty slice panics.
func SIMDAdd2(a, b []float32, l uint32, cnt uint32) []float32 {
	res := make([]float32, l)

	// Number of elements that exist in all three slices.
	n := len(res)
	if len(a) < n {
		n = len(a)
	}
	if len(b) < n {
		n = len(b)
	}
	if n == 0 {
		return res
	}

	C.simdAddTest(
		(*C.float)(unsafe.Pointer(&res[0])),
		(*C.float)(unsafe.Pointer(&a[0])),
		(*C.float)(unsafe.Pointer(&b[0])),
		C.ulong(n),
		C.ulong(cnt))
	return res
}

Now the outcome is a bit more reasonable:

NO SIMD 59.71s elapsed
SIMD 41.10s elapsed

There is a difference of around 10-15 seconds between the SIMD and non-SIMD vector add, which is somewhat OK, but I am not sure whether that is the best I can squeeze out of it.

go simd cgo