/* omp.c (OpenMP) VC++ : cl.exe /O2 /openmp omp.c gcc : gcc -O2 -fopenmp omp.c -o omp Usage : > omp {1|2} [sse|avx] */ #include #include #include #include #include #ifdef _OPENMP #include #endif #ifndef _WIN32 #include #endif static inline void vadd(int n, const float a[], const float b[], float c[]) { int i; #ifdef _OPENMP #pragma omp parallel for //schedule(static, 1) #endif for (i = 0; i < n; i++) { c[i] = a[i] + b[i]; } } static inline double sdot(int n, const float a[], const float b[]) { double sum = 0; int i; #ifdef _OPENMP #pragma omp parallel for reduction(+:sum) #endif for (i = 0; i < n; i++) { sum += a[i] * b[i]; } return sum; } static inline double sdot_simd(int n, const float a[], const float b[], int nthread, int simd) { double *sum_thread = (double *)malloc(nthread * sizeof(double)); memset(sum_thread, 0, nthread * sizeof(double)); #ifdef _OPENMP #pragma omp parallel #endif { int ithread = 0; #ifdef _OPENMP ithread = omp_get_thread_num(); #endif const int lsimd = 4 * simd; // = 4/8 const int lthread = (n + (nthread - 1)) / nthread; const int ns = ithread * lthread; const int ne = ((ns + lthread) > n) ? n : (ns + lthread); const int nc = ns + ((ne - ns) / lsimd) * lsimd; if (simd == 1) { // SSE float fsum[4]; __m128 vsum = _mm_setzero_ps(); for (int i = ns; i < nc; i += lsimd) { vsum = _mm_add_ps(vsum, _mm_mul_ps(_mm_loadu_ps(a + i), _mm_loadu_ps(b + i))); } _mm_storeu_ps(fsum, vsum); sum_thread[ithread] = fsum[0] + fsum[1] + fsum[2] + fsum[3]; } else if (simd == 2) { // AVX float fsum[8]; __m256 vsum = _mm256_setzero_ps(); for (int i = ns; i < nc; i += lsimd) { vsum = _mm256_add_ps(vsum, _mm256_mul_ps(_mm256_loadu_ps(a + i), _mm256_loadu_ps(b + i))); } _mm256_storeu_ps(fsum, vsum); sum_thread[ithread] = fsum[0] + fsum[1] + fsum[2] + fsum[3] + fsum[4] + fsum[5] + fsum[6] + fsum[7]; } for (int i = nc; i < ne; i++) { sum_thread[ithread] += a[i] * b[i]; } } double sum = 0; for (int ithread = 0; ithread < nthread; ithread++) { sum += sum_thread[ithread]; } return sum; } int main(int argc, char **argv) { int calc = 1; int nthread = 1; int n = 1000; int nloop = 1000; int simd = 0; float *a = NULL, *b = NULL, *c = NULL; #ifdef _WIN32 clock_t t0, t1; #else struct timeval t0, t1; #endif // arguments if (argc >= 5) { calc = atoi(argv[1]); n = atoi(argv[2]); nloop = atoi(argv[3]); nthread = atoi(argv[4]); } if (argc >= 6) { if (!strcmp(argv[5], "sse")) { simd = 1; } else if (!strcmp(argv[5], "avx")) { simd = 2; } } #ifdef _OPENMP // threads omp_set_num_threads(nthread); #endif // alloc size_t size = n * sizeof(float); if (calc == 1) { a = (float *)malloc(size); b = (float *)malloc(size); c = (float *)malloc(size); } else if (calc == 2) { a = (float *)malloc(size); b = (float *)malloc(size); } // setup problem for (int i = 0; i < n; i++) { a[i] = i + 1.0f; b[i] = i + 1.0f; } // timer #ifdef _WIN32 t0 = clock(); #else gettimeofday(&t0, NULL); #endif // calculation double sum = 0; if (calc == 1) { // add vector for (int loop = 0; loop < nloop; loop++) { vadd(n, a, b, c); } for (int i = 0; i < n; i++) { sum += c[i]; } } else if (calc == 2) { // dot product for (int loop = 0; loop < nloop; loop++) { if (!simd) { sum += sdot(n, a, b); } else { sum += sdot_simd(n, a, b, nthread, simd); } } sum /= nloop; } // timer double cpu = 0; #ifdef _WIN32 t1 = clock(); cpu = (double)(t1 - t0) / CLOCKS_PER_SEC; #else gettimeofday(&t1, NULL); cpu = (t1.tv_sec - t0.tv_sec) + 1e-6 * (t1.tv_usec - t0.tv_usec); #endif // output double exact = (calc == 1) ? (double)n * (n + 1) : (double)n * (n + 1) * (2 * n + 1) / 6.0; printf("%d N=%d L=%d thread=%d %.6e(%.6e) err=%.1e %.3f[sec]\n", calc, n, nloop, nthread, sum, exact, fabs(sum - exact) / exact, cpu); // free if (calc == 1) { free(a); free(b); free(c); } else if (calc == 2) { free(a); free(b); } return 0; }