/* simd.c (SIMD) VC++ : cl.exe /O2 simd.c gcc : gcc -O2 -mavx simd.c -o simd Usage: > simd <1|2> [sse|avx] */ #include #include #include #include #include static void vadd(int simd, int n, const float a[], const float b[], float c[]) { int ne = 0; if (simd == 1) { // SSE ne = (n / 4) * 4; for (int i = 0; i < ne; i += 4) { _mm_storeu_ps(c + i, _mm_add_ps( _mm_loadu_ps(a + i), _mm_loadu_ps(b + i))); } } else if (simd == 2) { // AVX ne = (n / 8) * 8; for (int i = 0; i < ne; i += 8) { _mm256_storeu_ps(c + i, _mm256_add_ps( _mm256_loadu_ps(a + i), _mm256_loadu_ps(b + i))); } } for (int i = ne; i < n; i++) { c[i] = a[i] + b[i]; } } static float sdot(int simd, int n, const float a[], const float b[]) { int ne = 0; float sum = 0; if (simd == 1) { // SSE float fsum[4]; __m128 vsum; ne = (n / 4) * 4; vsum = _mm_setzero_ps(); for (int i = 0; i < ne; i += 4) { vsum = _mm_add_ps(vsum, _mm_mul_ps(_mm_loadu_ps(a + i), _mm_loadu_ps(b + i))); } _mm_storeu_ps(fsum, vsum); sum = fsum[0] + fsum[1] + fsum[2] + fsum[3]; } else if (simd == 2) { // AVX float fsum[8]; __m256 vsum; ne = (n / 8) * 8; vsum = _mm256_setzero_ps(); for (int i = 0; i < ne; i += 8) { vsum = _mm256_add_ps(vsum, _mm256_mul_ps(_mm256_loadu_ps(a + i), _mm256_loadu_ps(b + i))); } _mm256_storeu_ps(fsum, vsum); sum = fsum[0] + fsum[1] + fsum[2] + fsum[3] + fsum[4] + fsum[5] + fsum[6] + fsum[7]; } for (int i = ne; i < n; i++) { sum += a[i] * b[i]; } return sum; } int main(int argc, char **argv) { int calc = 1; int simd = 0; int n = 1000; int nloop = 1000; float *a = NULL, *b = NULL, *c = NULL; char *str[3] = {"none", "SSE", "AVX"}; // arguments if (argc >= 4) { calc = atoi(argv[1]); n = atoi(argv[2]); nloop = atoi(argv[3]); } if (argc >= 5) { if (!strcmp(argv[4], "sse")) { simd = 1; } else if (!strcmp(argv[4], "avx")) { simd = 2; } } // alloc size_t size = n * sizeof(float); if (calc == 1) { a = (float *)malloc(size); b = (float *)malloc(size); c = (float *)malloc(size); } else if (calc == 2) { a = (float *)malloc(size); b = (float *)malloc(size); } // setup problem for (int i = 0; i < n; i++) { a[i] = i + 1.0f; b[i] = i + 1.0f; } // timer clock_t t0 = clock(); // calculation double result = 0, exact = 0; if (calc == 1) { // vadd for (int loop = 0; loop < nloop; loop++) { vadd(simd, n, a, b, c); } result = 0; for (int i = 0; i < n; i++) { result += c[i]; } exact = (double)n * (n + 1); } else if (calc == 2) { // sdot double sum = 0; for (int loop = 0; loop < nloop; loop++) { sum += sdot(simd, n, a, b); } result = sum / nloop; exact = (double)n * (n + 1) * (2 * n + 1) / 6.0; } // timer clock_t t1 = clock(); double cpu = (double)(t1 - t0) / CLOCKS_PER_SEC; // output printf("%d N=%d L=%d %.6e(%.6e) err=%.1e %.3f[sec] %s\n", calc, n, nloop, result, exact, fabs(result - exact) / exact, cpu, str[simd]); // free if (calc == 1) { free(a); free(b); free(c); } else if (calc == 2) { free(a); free(b); } return 0; }