/* * render_opencl.c * * Created on: 26.01.2018 * Author: Superleo1810 */ #include "render_opencl.h" void init_opencl(OpenCLConfig *config) { x_min_s = -2.0; x_max_s = 1.0; y_min_s = -1.0; y_max_s = 1.0; x_min = x_min_s; x_max = x_max_s; y_min = y_min_s; y_max = y_max_s; config_opencl = config; output = (cl_uint *) malloc((config_opencl->width) * (config_opencl->height) * sizeof(cl_uchar4)); context = NULL; width_cl = config_opencl->width; // Leave it in, cl needs different endianness cl_platform_id platform_id; cl_uint ret_num_devices; cl_uint ret_num_platforms; size_t device_list_size; cl_program program; size_t cl_src_sz; clGetPlatformIDs(1, &platform_id, &ret_num_platforms); clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices); context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret); ret = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &device_list_size); devices = (cl_device_id *)malloc(device_list_size); num_devices = (cl_uint)(device_list_size/sizeof(cl_device_id)); clGetContextInfo(context, CL_CONTEXT_DEVICES, device_list_size, devices, NULL); FILE *fp; char *cl_src, *path, *flags = (char *)malloc(200 * sizeof(char)); flags[0] = '\0'; switch(config_opencl->fpu) { case OPENCL_FPU_32: switch(config_opencl->set_func) { case SFUNC_JULIA: path = "cl/julia32.cl"; break; case SFUNC_MANDELBROT: default: path = "cl/mandelbrot32.cl"; break; } break; case OPENCL_FPU_64: switch(config_opencl->set_func) { case SFUNC_JULIA: path = "cl/julia64.cl"; break; case SFUNC_MANDELBROT: default: path = "cl/mandelbrot64.cl"; break; } int khrFP64 = 0; int amdFP64 = 0; for (cl_uint i = 0; i < num_devices; i++) { char deviceExtensions[8192]; ret = clGetDeviceInfo(devices[i], CL_DEVICE_EXTENSIONS, sizeof(deviceExtensions), deviceExtensions, 0); if (strstr(deviceExtensions, "cl_khr_fp64")) { khrFP64++; } else { if (strstr(deviceExtensions, "cl_amd_fp64")) { amdFP64++; } } } if (khrFP64 == num_devices) { flags = strcat(flags, "-D KHR_DP_EXTENSION "); } else if (amdFP64 == num_devices) { flags = strcat(flags, ""); } break; case OPENCL_FPU_128: printf("128 bit precision not implemented yet\n"); break; } cl_src = (char *)malloc(MAX_SOURCE_SIZE * sizeof(char)); fp = fopen(path, "r"); cl_src_sz = fread(cl_src, 1, MAX_SOURCE_SIZE, fp); fclose(fp); for (cl_uint i = 0; i < num_devices; i++) { cl_command_queue_properties prop = 0; // if (sampleArgs->timing) // { // prop |= CL_QUEUE_PROFILING_ENABLE; // } commandQueue[i] = clCreateCommandQueue(context, devices[i], prop, &ret); outputBuffer[i] = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, (sizeof(cl_uint) * (config_opencl->width) * (config_opencl->height)) / num_devices, NULL, &ret); } if (config_opencl->fma) { flags = strcat(flags, "-D MUL_ADD=fma "); } else { flags = strcat(flags, "-D MUL_ADD=mad "); } program = clCreateProgramWithSource(context, 1, (const char **)&cl_src, (const size_t *)&cl_src_sz, &ret); ret = clBuildProgram(program, num_devices, devices, flags, NULL, NULL); for (cl_uint i = 0; i < num_devices; i++) { kernel_vector[i] = clCreateKernel(program, "calculate", &ret); } } void render_opencl(void) { cl_event events[MAX_DEVICES]; cl_int eventStatus = CL_QUEUED; size_t globalThreads[1]; size_t localThreads[1]; size_t kernelWorkGroupSize; cl_kernel kernel; cl_double y_max_t; cl_float y_max_t_f; cl_double x_delta = ((x_max - x_min) / (double) config_opencl->width); cl_double y_delta = -((y_max - y_min) / (double) config_opencl->height); cl_float x_delta_f = (float) x_delta; cl_float y_delta_f = (float) y_delta; cl_float x_min_f = (float) x_min; globalThreads[0] = ((config_opencl->width) * (config_opencl->height)) / num_devices; localThreads[0] = 256; globalThreads[0] >>= 2; for (cl_uint i = 0; i < num_devices; i++) { kernel = kernel_vector[i]; ret = clGetKernelWorkGroupInfo(kernel, devices[i], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernelWorkGroupSize, 0); if ((cl_uint) (localThreads[0]) > kernelWorkGroupSize) { localThreads[0] = kernelWorkGroupSize; } y_max_t = (((y_min + y_max) / 2.0) + (y_max - y_min) / 2.0 - ((double) i * (y_max - y_min)) / (double) num_devices); ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &outputBuffer[i]); y_max_t_f = (float) y_max_t; //printf("x_delta: %f, y_delta: %f, x_delta_f: %f, y_delta_f: %f, x_min_f: %f, y_max_t: %f, y_max_t_f: %f\n", x_delta, y_delta, x_delta_f, y_delta_f, x_min_f, y_max_t, y_max_t_f); switch (config_opencl->fpu) { case OPENCL_FPU_32: // lel ret = clSetKernelArg(kernel, 1, sizeof(cl_float), (void *) &x_min_f); ret = clSetKernelArg(kernel, 2, sizeof(cl_float), (void *) &y_max_t_f); ret = clSetKernelArg(kernel, 3, sizeof(cl_float), (void *) &x_delta_f); ret = clSetKernelArg(kernel, 4, sizeof(cl_float), (void *) &y_delta_f); break; case OPENCL_FPU_64: ret = clSetKernelArg(kernel, 1, sizeof(cl_double), (void *) &x_min); ret = clSetKernelArg(kernel, 2, sizeof(cl_double), (void *) &y_max_t); ret = clSetKernelArg(kernel, 3, sizeof(cl_double), (void *) &x_delta); ret = clSetKernelArg(kernel, 4, sizeof(cl_double), (void *) &y_delta); break; case OPENCL_FPU_128: break; } ret = clSetKernelArg(kernel, 5, sizeof(cl_uint), (void *) &config_opencl->iterations); ret = clSetKernelArg(kernel, 6, sizeof(cl_int), (void *) &width_cl); ret = clEnqueueNDRangeKernel(commandQueue[i], kernel, 1, NULL, globalThreads, localThreads, 0, NULL, &events[i]); } for (cl_uint i = 0; i < num_devices; i++) { ret = clFlush(commandQueue[i]); } for (cl_uint i = 0; i < num_devices; i++) { ret = clWaitForEvents(1, &events[num_devices - i - 1]); ret = clReleaseEvent(events[num_devices - i - 1]); } for (cl_uint i = 0; i < num_devices; i++) { ret = clEnqueueReadBuffer(commandQueue[i], outputBuffer[i], CL_FALSE, 0, (config_opencl->width * config_opencl->height * sizeof(cl_int)) / num_devices, config_opencl->arr + (config_opencl->width * config_opencl->height / num_devices) * i, 0, NULL, &events[i]); } for (cl_uint i = 0; i < num_devices; i++) { ret = clFlush(commandQueue[i]); } for (cl_uint i = 0; i < num_devices; i++) { ret = clWaitForEvents(1, &events[num_devices - i - 1]); ret = clReleaseEvent(events[num_devices - i - 1]); } glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); glBindTexture(GL_TEXTURE_2D, config_opencl->tex); glEnable(GL_TEXTURE_2D); glBegin(GL_QUADS); glTexCoord2i(0, 0); glVertex2i(0, 0); glTexCoord2i(0, 1); glVertex2i(0, config_opencl->height); glTexCoord2i(1, 1); glVertex2i(config_opencl->width, config_opencl->height); glTexCoord2i(1, 0); glVertex2i(config_opencl->width, 0); glEnd(); glDisable(GL_TEXTURE_2D); glBindTexture(GL_TEXTURE_2D, 0); glutSwapBuffers(); } void idle_opencl(void) { static int t_old; int t = 0, delta = 0; do { t = glutGet(GLUT_ELAPSED_TIME); delta = t - t_old; } while (delta < 16); // TODO: Hardcoded FPS t_old = t; //glGenTextures(1, &tex); glBindTexture(GL_TEXTURE_2D, config_opencl->tex); //glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); //glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, config_opencl->width, config_opencl->height, 0, GL_RGBA, GL_UNSIGNED_BYTE, config_opencl->arr); glBindTexture(GL_TEXTURE_2D, 0); cl_ft += (config_opencl->speed * (delta / 1000.0)); x_min = x_min_s + config_opencl->zoom_func(cl_ft, (d64) 2.0 + config_opencl->to_x); y_min = y_min_s + config_opencl->zoom_func(cl_ft, (d64) 1.0 + config_opencl->to_y); x_max = x_max_s - config_opencl->zoom_func(cl_ft, (d64) 1.0 - config_opencl->to_x); y_max = y_max_s - config_opencl->zoom_func(cl_ft, (d64) 1.0 - config_opencl->to_y); glutPostRedisplay(); }