/* ocl_mpi_vadd.c (OpenCL + MPI) Compile + Link: > cl.exe /O2 ocl_mpi_vadd.c OpenCL.lib msmpi.lib Usage: > mpiexec -n ocl_mpi_vadd ... */ #include #include #include #include #ifdef __APPLE__ #include #else #include #endif #ifdef _MPI #include #endif #define MAX_PLATFORMS (10) #define MAX_DEVICES (10) #define MAX_SOURCE_SIZE (100000) static int setup_ocl(cl_uint, cl_uint, char *); static void vadd_calc(void); static void vadd(void); cl_command_queue Queue; cl_kernel k_vadd; int N; float *h_A, *h_B, *h_C; cl_mem d_A, d_B, d_C; int OCL; int main(int argc, char **argv) { int *platform = NULL; int *device = NULL; int comm_size = 1; int comm_rank = 0; int n = 1000; int nloop = 1000; clock_t t0 = 0, t1 = 0; #ifdef _MPI MPI_Status status; int g_ret; #endif // initialize (MPI) #ifdef _MPI MPI_Init(&argc, &argv); MPI_Comm_size(MPI_COMM_WORLD, &comm_size); MPI_Comm_rank(MPI_COMM_WORLD, &comm_rank); #endif // arguments platform = (int *)calloc(comm_size, sizeof(int)); device = (int *)calloc(comm_size, sizeof(int)); if (argc >= 3 + 2 * comm_size) { n = atoi(argv[1]); nloop = atoi(argv[2]); for (int i = 0; i < comm_size; i++) { platform[i] = atoi(argv[3 + 2 * i]); device[i] = atoi(argv[4 + 2 * i]); } } OCL = (platform[0] >= 0); // local size : N N = (n + (comm_size - 1)) / comm_size; // alloc host memory size_t size = N * sizeof(float); h_A = (float *)malloc(size); h_B = (float *)malloc(size); h_C = (float *)malloc(size); // setup problem for (int i = 0; i < N; i++) { int gid = (comm_rank * N) + i; if (gid < n) { h_A[i] = gid + 1.0f; h_B[i] = gid + 1.0f; } } // setup OpenCL if (OCL) { char msg[BUFSIZ]; int ret = setup_ocl((cl_uint)platform[comm_rank], (cl_uint)device[comm_rank], msg); #ifdef _MPI // show device name if (comm_rank == 0) { printf("%d: %s\n", 0, msg); for (int i = 1; i < comm_size; i++) { MPI_Recv(msg, BUFSIZ, MPI_CHAR, i, 0, MPI_COMM_WORLD, &status); printf("%d: %s\n", i, msg); } fflush(stdout); } else { MPI_Send(msg, BUFSIZ, MPI_CHAR, 0, 0, MPI_COMM_WORLD); } // error check MPI_Allreduce(&ret, &g_ret, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD); if (g_ret) { MPI_Finalize(); exit(1); } #else printf("%s\n", msg); if (ret) { exit(1); } #endif } // timer if (comm_rank == 0) { t0 = clock(); } // copy host to device if (OCL) { clEnqueueWriteBuffer(Queue, d_A, CL_TRUE, 0, size, h_A, 0, NULL, NULL); clEnqueueWriteBuffer(Queue, d_B, CL_TRUE, 0, size, h_B, 0, NULL, NULL); } // run for (int loop = 0; loop < nloop; loop++) { vadd_calc(); } // copy device to host if (OCL) { clEnqueueReadBuffer(Queue, d_C, CL_TRUE, 0, size, h_C, 0, NULL, NULL); } // timer if (comm_rank == 0) { t1 = clock(); } // sum double sum = 0; for (int i = 0; i < N; i++) { sum += h_C[i]; } // reduction (MPI) #ifdef _MPI double l_sum = sum; MPI_Reduce(&l_sum, &sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); #endif // output if (comm_rank == 0) { const double exact = n * (n + 1.0); const double sec = (double)(t1 - t0) / CLOCKS_PER_SEC; printf("N=%d L=%d %.6e(%.6e) err=%.1e %dGPUs %.3f[sec]\n", n, nloop, sum, exact, fabs((sum - exact) / exact), comm_size, sec); fflush(stdout); } #ifdef _MPI MPI_Finalize(); #endif // release if (OCL) { clReleaseMemObject(d_A); clReleaseMemObject(d_B); clReleaseMemObject(d_C); clReleaseKernel(k_vadd); clReleaseCommandQueue(Queue); } // free free(h_A); free(h_B); free(h_C); return 0; } // setup OpenCL static int setup_ocl(cl_uint platform, cl_uint device, char *msg) { cl_context context = NULL; cl_program program = NULL; cl_platform_id platform_id[MAX_PLATFORMS]; cl_device_id device_id[MAX_DEVICES]; FILE *fp; char *source_str; char str[BUFSIZ]; size_t source_size, ret_size, size; cl_uint num_platforms, num_devices; cl_int ret; // alloc source_str = (char *)malloc(MAX_SOURCE_SIZE * sizeof(char)); // platform clGetPlatformIDs(MAX_PLATFORMS, platform_id, &num_platforms); if (platform >= num_platforms) { sprintf(msg, "error : platform = %d (limit = %d)", platform, num_platforms - 1); return 1; } // device clGetDeviceIDs(platform_id[platform], CL_DEVICE_TYPE_ALL, MAX_DEVICES, device_id, &num_devices); if (device >= num_devices) { sprintf(msg, "error : device = %d (limit = %d)", device, num_devices - 1); return 1; } // device name (option) clGetDeviceInfo(device_id[device], CL_DEVICE_NAME, sizeof(str), str, &ret_size); sprintf(msg, "%s (platform = %d, device = %d)", str, platform, device); // context context = clCreateContext(NULL, 1, &device_id[device], NULL, NULL, &ret); // command queue Queue = clCreateCommandQueue(context, device_id[device], 0, &ret); // source if ((fp = fopen("vadd.cl", "r")) == NULL) { sprintf(msg, "kernel source open error"); return 1; } source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp); fclose(fp); // program program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); if (ret != CL_SUCCESS) { sprintf(msg, "clCreateProgramWithSource() error"); return 1; } // build if (clBuildProgram(program, 1, &device_id[device], NULL, NULL, NULL) != CL_SUCCESS) { sprintf(msg, "clBuildProgram() error"); exit(1); } // kernel k_vadd = clCreateKernel(program, "vadd", &ret); if (ret != CL_SUCCESS) { sprintf(msg, "clCreateKernel() error"); return 1; } // memory object size = N * sizeof(float); d_A = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &ret); d_B = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &ret); d_C = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &ret); // release clReleaseProgram(program); clReleaseContext(context); // free free(source_str); return 0; } // entry point static void vadd_calc(void) { if (OCL) { size_t global_item_size, local_item_size; // args clSetKernelArg(k_vadd, 0, sizeof(cl_mem), (void *)&d_A); clSetKernelArg(k_vadd, 1, sizeof(cl_mem), (void *)&d_B); clSetKernelArg(k_vadd, 2, sizeof(cl_mem), (void *)&d_C); clSetKernelArg(k_vadd, 3, sizeof(int), (void *)&N); // work item local_item_size = 256; global_item_size = ((N + local_item_size - 1) / local_item_size) * local_item_size; // run clEnqueueNDRangeKernel(Queue, k_vadd, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL); } else { // non-OpenCL code vadd(); } } // non-OpenCL code static void vadd(void) { for (int i = 0; i < N; i++) { h_C[i] = h_A[i] + h_B[i]; } }