I am quite new to Go, and I am running some experiments with Go and cgo. I wanted to see whether Go can benefit from calling a C function that uses SIMD operations in a simple test. What I saw, however, is that the plain Go version is faster than the SIMD-optimized C code called from Go through cgo. I have heard that calling C from Go carries a penalty, but I couldn't find any detailed information about it. Here are my samples; any advice would be helpful.
golang - cgo test:
package cutils
/*
#cgo CFLAGS: -mavx2 -Wall -O0
#include
#include
// foo returns the path of the source file it was compiled from, as given by
// the __FILE__ macro.
const char* foo(void) {
    const char* sourcePath = __FILE__;
    return sourcePath;
}
#define FLOATS_IN_AVX_REG 8
void simdAdd(float* out, float* a, float* b, unsigned long len) {
const unsigned long vectSize = (len / FLOATS_IN_AVX_REG) * FLOATS_IN_AVX_REG ;
unsigned long i=0;
for(i=0; i < vectSize; i += FLOATS_IN_AVX_REG) {
__m256 regA = _mm256_loadu_ps(a+i);
__m256 regB = _mm256_loadu_ps(b+i);
__m256 res = _mm256_add_ps(regA, regB);
_mm256_storeu_ps(out+i, res);
}
for(; i < len; i++) {
out[i] = a[i] + b[i];
}
}
*/
import "C"
import (
"fmt"
"unsafe"
)
// NoSimdAdd returns a new slice of length l holding the element-wise sum of a
// and b, computed with a plain Go loop.
//
// Only the first min(l, len(a), len(b)) elements are summed; the rest of the
// result stays zero. Clamping keeps the loop from indexing past the end of the
// result or of the shorter input, which previously caused a panic.
func NoSimdAdd(a, b []float32, l uint32) []float32 {
	res := make([]float32, l)
	n := min(len(res), len(a), len(b))

	// Reslicing everything to n lets the compiler prove the indexes are in range.
	a, b, out := a[:n], b[:n], res[:n]
	for i := range out {
		out[i] = a[i] + b[i]
	}
	return res
}
// SIMDadd returns a new slice of length l holding the element-wise sum of a
// and b, computed by the C function simdAdd through cgo.
//
// Only the first min(l, len(a), len(b)) elements are summed; the rest of the
// result stays zero. The count handed to C is clamped because the C code has
// no knowledge of the Go slice lengths and would otherwise read past the end
// of a shorter input.
func SIMDadd(a, b []float32, l uint32) []float32 {
	res := make([]float32, l)
	n := min(len(res), len(a), len(b))
	if n == 0 {
		// Nothing to add, and taking &x[0] of an empty slice would panic.
		return res
	}
	C.simdAdd(
		(*C.float)(unsafe.Pointer(&res[0])),
		(*C.float)(unsafe.Pointer(&a[0])),
		(*C.float)(unsafe.Pointer(&b[0])),
		C.ulong(n))
	return res
}
And the main test:
package main
import (
"ex1/files/cutils"
"fmt"
"math/rand/v2"
"time"
)
func genFArray(s uint32) []float32 {
r := make([]float32, s)
for i := range s {
r[i] = 1.0 * rand.Float32() * (10.0 - 1.0)
}
return r
}
const (
	// LOOP_CNT is how many times the benchmark loop calls the add function.
	LOOP_CNT = 1_000_000

	// TEST_SIZE is the base-2 logarithm of the test array length; the array
	// holds 1 << TEST_SIZE elements.
	TEST_SIZE = 16
)
func main() {
a := genFArray(1 << TEST_SIZE)
start := time.Now()
for i := 0; i < LOOP_CNT; i++ {
cutils.SIMDadd(a, a, 1<
The original C program that proves SIMD optimization is also here:
#include
#include
#include
#include // for gettimeofday()
#define FLOATS_IN_AVX_REG 8
#define ARR_SIZE (1 << 16)
#define LOOP_CNT 1000000
void plain_add(float* out, float* a, float* b, unsigned long len);
void simd_add(float* out, float* a, float* b, unsigned long len) ;
float func_Uniform(float) ;
// Benchmark driver: fills an array with pseudo-random floats, adds it to
// itself LOOP_CNT times with simd_add, and prints the wall-clock time taken.
int main(void) {
    float arr1[ARR_SIZE];
    float result[ARR_SIZE];
    struct timeval begin, end;
    double elapsedMs;
    unsigned long idx = 0;
    unsigned long pass;

    // Fill the input with values produced by func_Uniform(1.0).
    while (idx < ARR_SIZE) {
        arr1[idx] = func_Uniform(1.0);
        idx++;
    }

    printf("Start test\r\n");

    gettimeofday(&begin, NULL);
    for (pass = 0; pass < LOOP_CNT; ++pass) {
        simd_add(result, arr1, arr1, ARR_SIZE);
    }
    gettimeofday(&end, NULL);

    // Whole seconds converted to ms, plus the microsecond remainder converted to ms.
    elapsedMs = (end.tv_sec - begin.tv_sec) * 1000.0;
    elapsedMs += (end.tv_usec - begin.tv_usec) / 1000.0;

    printf("Time elapsed %f ms.\n", elapsedMs);
    return 0;
}
// func_Uniform returns a pseudo-random value obtained by scaling
// rand() / RAND_MAX (a fraction between 0 and 1) by a.
float func_Uniform(float a) {
    const float fraction = (float)rand() / (float)(RAND_MAX);
    return fraction * a;
}
// plain_add writes the element-wise sum a[k] + b[k] into out[k] for the first
// len elements, one element at a time.
void plain_add(float* out, float* a, float* b, unsigned long len)
{
    unsigned long pos = 0;

    while (pos < len) {
        out[pos] = a[pos] + b[pos];
        ++pos;
    }
}
// simd_add writes the element-wise sum a[k] + b[k] into out[k] for the first
// len elements.
//
// Full groups of FLOATS_IN_AVX_REG floats are added with one 256-bit AVX
// operation each, using unaligned loads and stores. The elements left over
// after the last full group are added one at a time.
void simd_add(float* out, float* a, float* b, unsigned long len) {
    // Number of trailing elements that do not fill a whole AVX register.
    const unsigned long tail = len % FLOATS_IN_AVX_REG;
    const unsigned long simdEnd = len - tail;
    unsigned long pos = 0;

    while (pos < simdEnd) {
        const __m256 lhs = _mm256_loadu_ps(a + pos);
        const __m256 rhs = _mm256_loadu_ps(b + pos);
        _mm256_storeu_ps(out + pos, _mm256_add_ps(lhs, rhs));
        pos += FLOATS_IN_AVX_REG;
    }

    // Scalar clean-up for the remaining tail.
    while (pos < len) {
        out[pos] = a[pos] + b[pos];
        pos++;
    }
}
So the C program tested with SIMD vs no SIMD has a difference that is noticeable:
no simd
Start test
Time elapsed 206796.292000 ms.
simd
Start test
Time elapsed 84095.521000 ms.
The Go experiment, however, shows timings that vary from run to run, but the SIMD version is always the slower one.
Thanks.
[EDIT]: I've changed the test a bit and saw some improvement. Instead of using go run . I now build the binary, so that all the code can be statically linked, and hope for the best. I've also passed the loop count to C directly and changed the C code to:
// simdAddTest runs simdAdd(out, a, b, len) loopcnt times in a row, so that a
// whole benchmark needs only a single call from the Go side.
void simdAddTest(float* out, float* a, float* b, unsigned long len, unsigned long loopcnt) {
    unsigned long pass = 0;

    while (pass < loopcnt) {
        simdAdd(out, a, b, len);
        pass++;
    }
}
and call it from golang only one time:
// SIMDAdd2 returns a new slice of length l holding the element-wise sum of a
// and b. The addition is repeated cnt times inside the C function simdAddTest,
// so the whole benchmark costs a single cgo call.
//
// Only the first min(l, len(a), len(b)) elements are summed; the rest of the
// result stays zero. The count handed to C is clamped because the C code has
// no knowledge of the Go slice lengths and would otherwise read past the end
// of a shorter input.
func SIMDAdd2(a, b []float32, l uint32, cnt uint32) []float32 {
	res := make([]float32, l)
	n := min(len(res), len(a), len(b))
	if n == 0 {
		// Nothing to add, and taking &x[0] of an empty slice would panic.
		return res
	}
	C.simdAddTest(
		(*C.float)(unsafe.Pointer(&res[0])),
		(*C.float)(unsafe.Pointer(&a[0])),
		(*C.float)(unsafe.Pointer(&b[0])),
		C.ulong(n),
		C.ulong(cnt))
	return res
}
Now the outcome is a bit more reasonable:
NO SIMD 59.71s elapsed
SIMD 41.10s elapsed
That is a difference of around 10-15 seconds between the SIMD and the non-SIMD vector add, which is reasonably OK, but I am still not sure whether this is the best I can squeeze out of it.