/*
cuda_vadd.cu (CUDA)

Element-wise vector addition c[i] = a[i] + b[i], runnable on the GPU or on
the CPU, used as a small timing/verification example.

Compile,Link:
> nvcc -O2 -o cuda_vadd cuda_vadd.cu cuda_memory.cu

Usage:
> cuda_vadd [-gpu|-cpu] [-hdm|-um] [-device <device>] [<n>] [<nloop>]

  -gpu / -cpu       run the calculation on the GPU (default) or on the CPU
  -hdm / -um        sets the flag passed to cuda_malloc() and shown as
                    "U.M." in the device banner: -hdm = off (default),
                    -um = on
  -device <device>  CUDA device number (default 0)
  <n> <nloop>       vector length and repeat count (default 1000 1000).
                    Positional rule: the last free argument is <nloop>,
                    the second-to-last free argument is <n>.
*/

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

// Abort with a readable message when a CUDA runtime call fails.
#define CUDA_CHECK(call)                                                   \
	do {                                                                   \
		cudaError_t err_ = (call);                                         \
		if (err_ != cudaSuccess) {                                         \
			fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
				cudaGetErrorString(err_));                                 \
			exit(EXIT_FAILURE);                                            \
		}                                                                  \
	} while (0)

// GPU/CPU : add two scalars and store the result through c
__host__ __device__
static void vadd_calc(float a, float b, float *c)
{
	*c = a + b;
}

// GPU : one thread per element.
// Expects a 1D grid of 1D blocks; threads with index >= n do nothing,
// so the grid may be rounded up past n.
__global__
static void vadd_gpu(int n, const float *a, const float *b, float *c)
{
	int tid = threadIdx.x + (blockIdx.x * blockDim.x);
	if (tid < n) {
		vadd_calc(a[tid], b[tid], &c[tid]);
	}
}

// CPU : serial loop over all n elements
static void vadd_cpu(int n, const float *a, const float *b, float *c)
{
	for (int i = 0; i < n; i++) {
		vadd_calc(a[i], b[i], &c[i]);
	}
}

// GPU/CPU : dispatch one vector addition to the GPU kernel or the CPU loop.
// The GPU launch is asynchronous; only launch-configuration errors are
// checked here, the caller synchronizes.
static void vadd(int gpu, int n, const float *a, const float *b, float *c)
{
	// nothing to do, and a zero-block launch would be an invalid configuration
	if (n <= 0) {
		return;
	}

	if (gpu) {
		int block = 256;
		int grid = (n + block - 1) / block;  // ceil(n / block)
		vadd_gpu<<<grid, block>>>(n, a, b, c);
		CUDA_CHECK(cudaGetLastError());
	}
	else {
		vadd_cpu(n, a, b, c);
	}
}

// memory helpers, implemented in cuda_memory.cu
extern void cuda_malloc(int, int, void **, size_t);
extern void cuda_free(int, void *);
extern void cuda_memcpy(int, void *, const void *, size_t, cudaMemcpyKind);

// Parse the command line, run nloop vector additions of length n,
// then print the checksum, its relative error and the elapsed time.
int main(int argc, char **argv)
{
	int gpu = 1;
	int um = 0;
	int device = 0;
	int n = 1000;
	int nloop = 1000;
	float *a = NULL, *b = NULL, *c = NULL;
	clock_t t0 = 0, t1 = 0;

	// arguments
	// An index loop is used so that a trailing "-device" without a value
	// cannot run past the end of argv.
	for (int i = 1; i < argc; i++) {
		const char *arg = argv[i];
		const int remaining = argc - i;  // arguments left, including this one
		if (!strcmp(arg, "-gpu")) {
			gpu = 1;
		}
		else if (!strcmp(arg, "-cpu")) {
			gpu = 0;
		}
		else if (!strcmp(arg, "-hdm")) {
			um = 0;
		}
		else if (!strcmp(arg, "-um")) {
			um = 1;
		}
		else if (!strcmp(arg, "-device")) {
			if (i + 1 < argc) {
				device = atoi(argv[++i]);
			}
		}
		else if (remaining == 2) {
			n = atoi(arg);
		}
		else if (remaining == 1) {
			nloop = atoi(arg);
		}
	}

	// n < 1 would give a 0/0 relative error; nloop < 1 would leave the
	// output vector unwritten before it is summed.
	if ((n < 1) || (nloop < 1)) {
		fprintf(stderr, "*** n and nloop must be positive (n=%d, nloop=%d)\n", n, nloop);
		return EXIT_FAILURE;
	}

	// GPU info and set device
	if (gpu) {
		int ndevice = 0;
		cudaDeviceProp prop;
		CUDA_CHECK(cudaGetDeviceCount(&ndevice));
		if (ndevice < 1) {
			fprintf(stderr, "*** no CUDA device found\n");
			return EXIT_FAILURE;
		}
		// clamp the requested device number into [0, ndevice - 1]
		if (device >= ndevice) device = ndevice - 1;
		if (device < 0) device = 0;
		CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
		printf("GPU-%d : %s, C.C.%d.%d, U.M.%s\n",
			device, prop.name, prop.major, prop.minor, (um ? "ON" : "OFF"));
		CUDA_CHECK(cudaSetDevice(device));
	}

	// alloc device or managed memory
	size_t size = (size_t)n * sizeof(float);
	cuda_malloc(gpu, um, (void **)&a, size);
	cuda_malloc(gpu, um, (void **)&b, size);
	cuda_malloc(gpu, um, (void **)&c, size);
	// NOTE(review): assumes cuda_malloc leaves the pointer untouched (NULL)
	// on failure - confirm against cuda_memory.cu
	if ((a == NULL) || (b == NULL) || (c == NULL)) {
		fprintf(stderr, "*** memory allocation error (%lu bytes)\n", (unsigned long)size);
		return EXIT_FAILURE;
	}

	// alloc host memory
	float *h_a = (float *)malloc(size);
	float *h_b = (float *)malloc(size);
	float *h_c = (float *)malloc(size);
	if ((h_a == NULL) || (h_b == NULL) || (h_c == NULL)) {
		fprintf(stderr, "*** host memory allocation error (%lu bytes)\n", (unsigned long)size);
		return EXIT_FAILURE;
	}

	// setup problem : a[i] + b[i] = 2i + 1, so the total is n^2
	for (int i = 0; i < n; i++) {
		h_a[i] = (float)i;
		h_b[i] = (float)(i + 1);
	}

	// copy host to device
	cuda_memcpy(gpu, a, h_a, size, cudaMemcpyHostToDevice);
	cuda_memcpy(gpu, b, h_b, size, cudaMemcpyHostToDevice);

	// timer
	t0 = clock();

	// calculation
	for (int loop = 0; loop < nloop; loop++) {
		vadd(gpu, n, a, b, c);
	}

	// kernel launches are asynchronous : wait before stopping the timer,
	// this also reports errors raised while the kernels were running
	if (gpu) CUDA_CHECK(cudaDeviceSynchronize());

	// timer
	t1 = clock();

	// copy device to host
	cuda_memcpy(gpu, h_c, c, size, cudaMemcpyDeviceToHost);

	// sum
	double sum = 0;
	for (int i = 0; i < n; i++) {
		sum += h_c[i];
	}

	// output
	double exact = (double)n * n;
	double sec = (double)(t1 - t0) / CLOCKS_PER_SEC;
	printf("N=%d L=%d %.6e(%.6e) %.1e %s[sec]=%.3f\n",
		n, nloop, sum, exact, fabs((sum - exact) / exact), (gpu ? "GPU" : "CPU"), sec);

	// free
	free(h_a);
	free(h_b);
	free(h_c);
	cuda_free(gpu, a);
	cuda_free(gpu, b);
	cuda_free(gpu, c);

	return 0;
}