/*
    cuda_mpi_vadd.cu (CUDA + MPI)

    Compile, Link:
    > nvcc -O2 -D_MPI -o cuda_mpi_vadd cuda_mpi_vadd.cu cuda_memory.cu msmpi.lib

    Usage:
    > mpiexec -n <nproc> cuda_mpi_vadd [-gpu|-cpu] [-hdm|-um]
    > mpiexec -hosts <nhost> <host1> [<host2> ...] cuda_mpi_vadd [-gpu|-cpu] [-hdm|-um]
*/

// GPU/CPU
__host__ __device__
static void vadd_calc(float a, float b, float *c)
{
    *c = a + b;
}

// GPU
__global__
static void vadd_gpu(int n, const float *a, const float *b, float *c)
{
    int tid = threadIdx.x + (blockIdx.x * blockDim.x);
    if (tid < n) {
        vadd_calc(a[tid], b[tid], &c[tid]);
    }
}

// CPU
static void vadd_cpu(int n, const float *a, const float *b, float *c)
{
    for (int i = 0; i < n; i++) {
        vadd_calc(a[i], b[i], &c[i]);
    }
}

// GPU/CPU
static void vadd(int gpu, int n, const float *a, const float *b, float *c)
{
    if (gpu) {
        int block = 256;
        int grid = (n + (block - 1)) / block;
        vadd_gpu<<<grid, block>>>(n, a, b, c);
    }
    else {
        vadd_cpu(n, a, b, c);
    }
}

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <time.h>
#ifdef _MPI
#include <mpi.h>
#endif

static int device_number(int, int, int []);

extern void cuda_malloc(int, int, void **, size_t);
extern void cuda_free(int, void *);
extern void cuda_memcpy(int, void *, const void *, size_t, cudaMemcpyKind);

int main(int argc, char **argv)
{
    int gpu = 1;
    int um = 0;
    int comm_size = 1;
    int comm_rank = 0;
    int n = 1000;
    int nloop = 1000;
    int nhost = 1;
    int *ndevice = NULL;
    float *a, *b, *c;
    clock_t t0 = 0, t1 = 0;

    // initialize (MPI)
#ifdef _MPI
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &comm_size);
    MPI_Comm_rank(MPI_COMM_WORLD, &comm_rank);
#endif

    // arguments
    while (--argc) {
        argv++;
        if (!strcmp(*argv, "-hosts")) {
            if (--argc) {
                nhost = atoi(*++argv);
                if (nhost < 1) nhost = 1;
                ndevice = (int *)malloc(nhost * sizeof(int));
                for (int ihost = 0; ihost < nhost; ihost++) {
                    if (argc > 1) {
                        ndevice[ihost] = atoi(*++argv);
                        argc--;
                    }
                    else {
                        ndevice[ihost] = 1;
                    }
                }
            }
        }
        else if (!strcmp(*argv, "-gpu")) {
            gpu = 1;
        }
        else if (!strcmp(*argv, "-cpu")) {
            gpu = 0;
        }
        else if (!strcmp(*argv, "-hdm")) {
            um = 0;
        }
        else if (!strcmp(*argv, "-um")) {
            um = 1;
        }
        else if (argc == 2) {
            n = atoi(*argv);
        }
        else if (argc == 1) {
            nloop = atoi(*argv);
        }
    }

    // GPU device
    if (gpu) {
        // rank -> device number
        int device = device_number(comm_rank, nhost, ndevice);

        // GPU info
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, device);
        printf("Rank-%d GPU-%d : %s, C.C.%d.%d, U.M.%s\n",
            comm_rank, device, prop.name, prop.major, prop.minor, (um ?
"ON" : "OFF")); fflush(stdout); // set device cudaSetDevice(device); } // global size : n // local size : l_n int l_n = (n + (comm_size - 1)) / comm_size; // alloc device memory size_t size = l_n * sizeof(float); cuda_malloc(gpu, um, (void **)&a, size); cuda_malloc(gpu, um, (void **)&b, size); cuda_malloc(gpu, um, (void **)&c, size); // alloc host memory float *h_a = (float *)malloc(size); float *h_b = (float *)malloc(size); // setup problem for (int i = 0; i < l_n; i++) { int gid = (comm_rank * l_n) + i; if (gid < n) { h_a[i] = gid; h_b[i] = gid + 1; } } // copy host to device cuda_memcpy(gpu, a, h_a, size, cudaMemcpyHostToDevice); cuda_memcpy(gpu, b, h_b, size, cudaMemcpyHostToDevice); // timer #ifdef _MPI MPI_Barrier(MPI_COMM_WORLD); #endif if (comm_rank == 0) { t0 = clock(); } // calculation for (int loop = 0; loop < nloop; loop++) { vadd(gpu, l_n, a, b, c); } if (gpu) cudaDeviceSynchronize(); // timer #ifdef _MPI MPI_Barrier(MPI_COMM_WORLD); #endif if (comm_rank == 0) { t1 = clock(); } // copy device to host float *h_c = (float *)malloc(size); cuda_memcpy(gpu, h_c, c, size, cudaMemcpyDeviceToHost); // local sum double sum = 0; for (int i = 0; i < l_n; i++) { sum += h_c[i]; } // reduction (MPI) #ifdef _MPI double l_sum = sum; MPI_Reduce(&l_sum, &sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); #endif // output if (comm_rank == 0) { double exact = (double)n * n; double sec = (double)(t1 - t0) / CLOCKS_PER_SEC; printf("N=%d L=%d %.6e(%.6e) err=%.1e %d%s %.3f[sec]\n", n, nloop, sum, exact, fabs((sum - exact) / exact), comm_size, (gpu ? "GPU" : "CPU"), sec); fflush(stdout); } #ifdef _MPI MPI_Finalize(); #endif // free free(h_a); free(h_b); free(h_c); cuda_free(gpu, a); cuda_free(gpu, b); cuda_free(gpu, c); return 0; } // rank -> device number static int device_number(int comm_rank, int nhost, int ndevice[]) { int device = 0; if (nhost <= 1) { // single node device = comm_rank; } else { // cluster device = -1; int rank = -1; for (int ihost = 0; ihost < nhost; ihost++) { for (int idevice = 0; idevice < ndevice[ihost]; idevice++) { if (++rank == comm_rank) { device = idevice; break; } } if (device >= 0) { break; } } if (device < 0) device = 0; } int num_device; cudaGetDeviceCount(&num_device); if (device >= num_device) device = num_device - 1; return device; }