33#include <libFreeWRL.h>
36#include "RenderFuncs.h"
37#include "../vrml_parser/Structs.h"
39#include "../main/headers.h"
41#include "LinearAlgebra.h"
45#include "../opencl/OpenCL_Utils.h"
49static const char* collide_non_walk_kernel;
50static const char* collide_non_walk_kernel_headers;
52#define FLOAT_TOLERANCE 0.00000001
65cl_platform_id cpPlatform = NULL;
66cl_device_id* cdDevices = NULL;
67cl_uint uiTargetDevice = 0;
83bool bGLinteropSupported =
false;
84cl_uint uiNumDevsUsed = 1;
85bool bGLinterop =
false;
124bool collision_initGPUCollide (
struct sCollisionGPU* initme) {
127 p = (ppOpenCL_Utils)tg->OpenCL_Utils.prv;
131 size_t kernel_wg_size;
138 kp[0] = (
char *)collide_non_walk_kernel_headers;
139 kp[1] = (
char *)collide_non_walk_kernel;
141 initme->CollideGPU_program = clCreateProgramWithSource(p->CL_context, 2, (
const char **) kp, NULL, &err);
144 if (!initme->CollideGPU_program || (err != CL_SUCCESS)) {
145 printCLError(
"clCreateProgramWithSource",err);
157 err = clBuildProgram(initme->CollideGPU_program, 1, &(p->CL_device_id), opts, NULL, NULL);
161 if (err != CL_SUCCESS) {
165 ConsoleMessage(
"Error: Failed to build program executable\n");
166 printCLError(
"clBuildProgram",err);
167 err = clGetProgramBuildInfo(initme->CollideGPU_program, p->CL_device_id, CL_PROGRAM_BUILD_LOG,
168 sizeof(buffer), buffer, &len);
169 TEST_ERR(
"clGetProgramBuildInfo",err);
170 ConsoleMessage (
"error string len %d\n",(
int)len);
171 ConsoleMessage(
"%s\n", buffer);
176 initme->CollideGPU_kernel = clCreateKernel(initme->CollideGPU_program,
"compute_collide", &err);
179 if (!initme->CollideGPU_kernel || (err != CL_SUCCESS)) {
180 printCLError(
"clCreateKernel",err);
187 err = clGetKernelWorkGroupInfo (initme->CollideGPU_kernel, p->CL_device_id,
188 CL_KERNEL_WORK_GROUP_SIZE,
sizeof(
size_t), &kernel_wg_size, &rvlen);
190 if (err!=CL_SUCCESS) {
191 printCLError(
"clGetKernelWorkGroupInfo",err);
196 if (kernel_wg_size < p->CL_default_workgroup_size) initme->CollideGPU_workgroup_size = kernel_wg_size;
197 else initme->CollideGPU_workgroup_size = p->CL_default_workgroup_size;
200 ConsoleMessage (
"MAX_WORK_GROUP_SIZE %d\n",kernel_wg_size);
201 ConsoleMessage (
"We are going to set our workgroup size to %d\n",wg_size);
221int printedOnce = TRUE;
224struct point_XYZ run_non_walk_collide_program(GLuint vertex_vbo, GLuint index_vbo, float *modelMat,
int ntri,
225 int face_ccw,
int face_flags,
float avatar_radius) {
229 size_t local_work_size;
230 size_t global_work_size;
233 double maxdisp = 0.0;
234 struct point_XYZ dispv, maxdispv = {0,0,0};
236 struct sCollisionGPU* me = GPUCollisionInfo();
240 p = (ppOpenCL_Utils)tg->OpenCL_Utils.prv;
243 if (me->CollideGPU_returnValues.n < ntri) {
244 if (me->CollideGPU_returnValues.n != 0) {
245 err = clReleaseMemObject(me->CollideGPU_output_buffer);
246 TEST_ERR(
"clReleaseMemObject",err);
249 me->CollideGPU_output_buffer = clCreateBuffer(p->CL_context, CL_MEM_WRITE_ONLY,
sizeof(
struct SFColorRGBA) * ntri,
252 if (me->CollideGPU_matrix_buffer == NULL) {
253 me->CollideGPU_matrix_buffer = clCreateBuffer(p->CL_context, CL_MEM_READ_ONLY, sizeof (cl_float16), NULL, NULL);
256 if (!(me->CollideGPU_output_buffer) || !(me->CollideGPU_matrix_buffer)) {
257 printCLError(
"clCreateBuffer",10000);
260 me->CollideGPU_output_size = ntri;
261 me->CollideGPU_returnValues.p = REALLOC(me->CollideGPU_returnValues.p,
sizeof(
struct SFColorRGBA) *ntri);
262 me->CollideGPU_returnValues.n = ntri;
266 err = clEnqueueWriteBuffer(p->CL_queue, me->CollideGPU_matrix_buffer, CL_TRUE, 0,
sizeof(cl_float16), modelMat, 0, NULL, NULL);
267 TEST_ERR(
"clEnqueueWriteBuffer",err);
270 me->CollideGPU_vertex_buffer=clCreateFromGLBuffer(p->CL_context, CL_MEM_READ_ONLY, vertex_vbo, &err);
271 if (err != CL_SUCCESS) {
272 printCLError(
"clCreateFromGLBuffer",err);
277 me->CollideGPU_index_buffer = clCreateFromGLBuffer(p->CL_context, CL_MEM_READ_ONLY, index_vbo, &err);
278 if (err != CL_SUCCESS) {
279 printCLError(
"clCreateFromGLBuffer",err);
284 count = (
unsigned int) ntri;
286 err = clSetKernelArg(me->CollideGPU_kernel, 0,
sizeof(cl_mem), &me->CollideGPU_output_buffer);
287 TEST_ERR(
"clSetKernelArg",err);
289 err =clSetKernelArg(me->CollideGPU_kernel, 1,
sizeof(
unsigned int), &count);
290 TEST_ERR(
"clSetKernelArg",err);
292 err =clSetKernelArg(me->CollideGPU_kernel, 2, sizeof (cl_mem), &me->CollideGPU_matrix_buffer);
293 TEST_ERR(
"clSetKernelArg",err);
295 err =clSetKernelArg(me->CollideGPU_kernel, 3, sizeof (cl_mem), &me->CollideGPU_vertex_buffer);
296 TEST_ERR(
"clSetKernelArg",err);
298 err =clSetKernelArg(me->CollideGPU_kernel, 4, sizeof (cl_mem), &me->CollideGPU_index_buffer);
299 TEST_ERR(
"clSetKernelArg",err);
301 err =clSetKernelArg(me->CollideGPU_kernel, 5,
sizeof(
int), &face_ccw);
302 TEST_ERR(
"clSetKernelArg",err);
304 err =clSetKernelArg(me->CollideGPU_kernel, 6,
sizeof(
int), &face_flags);
305 TEST_ERR(
"clSetKernelArg",err);
307 err =clSetKernelArg(me->CollideGPU_kernel, 7,
sizeof(
int), &avatar_radius);
308 TEST_ERR(
"clSetKernelArg",err);
310 err =clSetKernelArg(me->CollideGPU_kernel, 8,
sizeof(
int), &ntri);
311 TEST_ERR(
"clSetKernelArg",err);
314 #define MYWG (me->CollideGPU_workgroup_size)
317 global_work_size = (size_t) (ntri) / MYWG;
318 else global_work_size = 0;
321 global_work_size += 1;
324 global_work_size *= MYWG;
328 local_work_size = MYWG;
333 cl_context myContext;
334 cl_device_id myDevice;
338 err = clGetCommandQueueInfo(p->CL_queue, CL_QUEUE_CONTEXT,
sizeof(myContext), &myContext, NULL);
339printf (
"queue context when commandqueue created %p, should be %p\n", myContext,p->CL_context);
340 err = clGetCommandQueueInfo(p->CL_queue, CL_QUEUE_DEVICE,
sizeof(myDevice), &myDevice, NULL);
341printf (
"queue Device when commandqueue created %p, should be %p\n", myDevice,p->CL_device_id);
342 err = clGetCommandQueueInfo(p->CL_queue, CL_QUEUE_REFERENCE_COUNT,
sizeof(myReference), &myReference, NULL);
343printf (
"queue Reference when commandqueue created %d\n", myReference);
347 err = clEnqueueNDRangeKernel(p->CL_queue, me->CollideGPU_kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL);
349 if (err != CL_SUCCESS) {
350 printCLError(
"clEnqueueNDRangeKernel",err);
358 err = clFlush(p->CL_queue);
359 if (err != CL_SUCCESS) {
360 printCLError(
"clFlush",err);
364 err = clFinish(p->CL_queue);
365 if (err != CL_SUCCESS) {
366 printCLError(
"clFinish",err);
381 err = clEnqueueReadBuffer (p->CL_queue, me->CollideGPU_output_buffer,
383 me->CollideGPU_returnValues.p, 0, NULL, NULL);
385 if (err != CL_SUCCESS) {
386 printCLError(
"clEnqueueReadBuffer",err);
391 for (i=0; i < ntri; i++) {
401 if (me->CollideGPU_returnValues.p[i].c[3] > 1.0) {
405 dispv.x = me->CollideGPU_returnValues.p[i].c[0];
406 dispv.y = me->CollideGPU_returnValues.p[i].c[1];
407 dispv.z = me->CollideGPU_returnValues.p[i].c[2];
415 disp = vecdot (&dispv,&dispv);
416 if ((disp > FLOAT_TOLERANCE) && (disp>maxdisp)) {
430#ifdef GL_ES_VERSION_2_0
431static const char* collide_non_walk_kernel_headers =
" \
432//#pragma OPENCL EXTENSION cl_khr_fp64 : enable \n\
433#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable \n\
434//#pragma OPENCL EXTENSION CL_APPLE_gl_sharing : enable \n\
435//#pragma OPENCL EXTENSION CL_KHR_gl_sharing : enable \n\
436//#pragma OPENCL EXTENSION cl_khr_select_fprounding_mode : enable \n\
451static const char* collide_non_walk_kernel_headers =
" \
452#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable \n\
460static const char* collide_non_walk_kernel =
" \
462/********************************************************************************/ \n\
464/* Collide kernel for fly and examine modes */ \n\
466/********************************************************************************/ \n\
467/* Function prototypes */ \n\
468float4 closest_point_on_plane(float4 point_a, float4 point_b, float4 point_c); \n\
470/* start the collide process. \n\
4721) transform the vertex. \n\
4732) calculate normal \n\
4743) if triangle is visible to us, get ready for collide calcs \n\
479#define DOUGS_FLOAT_TOLERANCE 0.00000001 \n\
480#define FLOAT_TOLERANCE 0.0000001f \n\
481#define PR_DOUBLESIDED 0x01 \n\
482#define PR_FRONTFACING 0x02 /* overrides effect of doublesided. */ \n\
483#define PR_BACKFACING 0x04 /* overrides effect of doublesided, all normals are reversed. */ \n\
485/********************************************************************************/ \n\
488#define APPROX (a, b) (fabs(a-b) < FLOAT_TOLERANCE) \n\
489#define VECSCALE(v,s) (float4)(v.x*s, v.y*s, v.z*s, 0.0f) \n\
490#define VECLENGTH(v) (float)sqrt((float)dot((float4)v,(float4)v)) \n\
494/********************************************************************************/ \n\
496/* Three vertices; find the closest one which intersects the Z plane; */ \n\
497/* either we choose a Vertex, on an edge, or fabricate one in the */ \n\
498/* middle of the triangle somewhere. */ \n\
500/* Adapted from \"Real time Collision Detection\", Christer Ericson. */ \n\
502/********************************************************************************/ \n\
505float4 closest_point_on_plane(float4 point_a, float4 point_b, float4 point_c) { \n\
506 float4 vector_ab = (point_b - point_a); // b - a \n\
507 float4 vector_ac = (point_c - point_a); // c - a \n\
508 float4 vector_bc = (point_c - point_b); // c - b \n\
509 float4 vector_ba = (point_a - point_b); // a - b \n\
510 float4 vector_ca = (point_a - point_c); // a - c \n\
511 float4 vector_cb = (point_b - point_c); // b - c \n\
514 // we have moved points, so our bounding sphere is at (0,0,0) so p = (0,0,0) \n\
515 float4 vector_ap = point_a * (float4)(-1.0f, -1.0f, -1.0f, -1.0f); // p - a \n\
516 float4 vector_bp = point_b * (float4)(-1.0f, -1.0f, -1.0f, -1.0f); // p - b \n\
517 float4 vector_cp = point_c * (float4)(-1.0f, -1.0f, -1.0f, -1.0f); // p - c \n\
518 #define vector_pa point_a /* a - p */ \n\
519 #define vector_pb point_b /* b - p */ \n\
520 #define vector_pc point_c /* c - p */ \n\
522 // Step 2. Compute parametric position s for projection P' of P on AB, \n\
523 // P' = A + s*AB, s = snom/(snom+sdenom) \n\
525 float snom = dot(vector_ap, vector_ab); // (p - a, ab); \n\
526 float sdenom = dot(vector_bp, vector_ba); // (p - b, a - b); \n\
529 // Compute parametric position t for projection P' of P on AC, \n\
530 // P' = A + t*AC, s = tnom/(tnom+tdenom) \n\
531 float tnom = dot(vector_ap, vector_ac); // (p - a, ac); \n\
532 float tdenom = dot(vector_cp, vector_ca); // (p - c, a - c); \n\
535 if (snom <= 0.0f && tnom <= 0.0f) { \n\
540 // Compute parametric position u for projection P' of P on BC, \n\
541 // P' = B + u*BC, u = unom/(unom+udenom) \n\
542 float unom = dot(vector_bp, vector_bc); //(p - b, bc) \n\
543 float udenom = dot(vector_cp, vector_cb); // (p - c, b - c); \n\
546 if (sdenom <= 0.0f && unom <= 0.0f) { \n\
550 if (tdenom <= 0.0f && udenom <= 0.0f) { \n\
556 // P is outside (or on) AB if the triple scalar product [N PA PB] <= 0 \n\
561 n = cross(vector_ab, vector_ac); // (b - a, c - a); \n\
562 tmp = cross(vector_pa, vector_pb); // veccross (a-p, b-p); \n\
564 // vc = dot(n, veccross(a - p, b - p)); \n\
565 vc = dot(n, tmp); \n\
568 // If P outside AB and within feature region of AB, \n\
569 // return projection of P onto AB \n\
570 if (vc <= 0.0f && snom >= 0.0f && sdenom >= 0.0f) { \n\
571 return point_a + snom / (snom + sdenom) * vector_ab; \n\
577 // P is outside (or on) BC if the triple scalar product [N PB PC] <= 0 \n\
578 tmp = cross (vector_pb, vector_pc); \n\
580 float va = dot(n, tmp); // Cross(b - p, c - p)); \n\
582 // If P outside BC and within feature region of BC, \n\
583 // return projection of P onto BC \n\
584 if (va <= 0.0f && unom >= 0.0f && udenom >= 0.0f) { \n\
585 return point_b + unom / (unom + udenom) * vector_bc; \n\
589 // P is outside (or on) CA if the triple scalar product [N PC PA] <= 0 \n\
590 tmp = cross (vector_pc, vector_pa); \n\
592 float vb = dot(n, tmp); // Cross(c - p, a - p)); \n\
593 // If P outside CA and within feature region of CA, \n\
594 // return projection of P onto CA \n\
595 if (vb <= 0.0f && tnom >= 0.0f && tdenom >= 0.0f) { \n\
596 return point_a + tnom / (tnom + tdenom) * vector_ac; \n\
600 // P must project inside face region. Compute Q using barycentric coordinates \n\
601 float u = va / (va + vb + vc); \n\
602 float v = vb / (va + vb + vc); \n\
603 float w = 1.0f - u - v; // = vc / (va + vb + vc) \n\
604 float4 u4 = (float4)(u); \n\
605 float4 v4 = (float4)(v); \n\
606 float4 w4 = (float4)(w); \n\
608 //return u * point_a + v * point_b + w * point_c; \n\
609 float4 rv = mad(point_a,u4,mad(point_b,v4,point_c*w4)); \n\
613/********************************************************************************/ \n\
615 __kernel void compute_collide ( \n\
616 __global float4 *output, /* 0 */ \n\
617 const unsigned int count, /* 1 */ \n\
618 __global float *mymat, /* 2 */ \n\
619 __global float *my_vertex, /* 3 */ \n\
620 __global short *my_cindex, /* 4 */ \n\
621 const int face_ccw, /* 5 */ \n\
622 const int face_flags, /* 6 */ \n\
623 const float avatar_radius, /* 7 */ \n\
624 const int ntri /* 8 */ \n\
627 /* which index this instantation is working on */ \n\
628 int i_am_canadian = get_global_id(0); \n\
629 if (i_am_canadian >= ntri) return; /* allows for workgroup size sizes */ \n\
631 /* vertices for this triangle */ \n\
632 /* transformed by matrix */ \n\
637 /* starting index in my_vertex of this vertex */ \n\
638 /* we work in triangles; each triangle has 3 vertices */ \n\
639 #define COORD_1 (my_cindex[i_am_canadian*3+0]*3) \n\
640 #define COORD_2 (my_cindex[i_am_canadian*3+1]*3) \n\
641 #define COORD_3 (my_cindex[i_am_canadian*3+2]*3) \n\
643 /* do matrix transform, 4 floats wide. */ \n\
644 float4 matColumn1 = (float4)(convert_float(mymat[0]),convert_float(mymat[1]),convert_float(mymat[2]),0.0f); \n\
645 float4 matColumn2 = (float4)(convert_float(mymat[4]),convert_float(mymat[5]),convert_float(mymat[6]),0.0f); \n\
646 float4 matColumn3 = (float4)(convert_float(mymat[8]),convert_float(mymat[9]),convert_float(mymat[10]),0.0f); \n\
647 float4 matColumn4 = (float4)(convert_float(mymat[12]),convert_float(mymat[13]),convert_float(mymat[14]),0.0f); \n\
649 /* first vertex */ \n\
650 float4 Vertex_X = (float4)(my_vertex[COORD_1+0]); \n\
651 float4 Vertex_Y = (float4)(my_vertex[COORD_1+1]); \n\
652 float4 Vertex_Z = (float4)(my_vertex[COORD_1+2]); \n\
653 tv1 = mad(matColumn1,Vertex_X,mad(matColumn2,Vertex_Y,mad(matColumn3,Vertex_Z,matColumn4))); \n\
655 /* second vertex */ \n\
656 Vertex_X = (float4)(my_vertex[COORD_2+0]); \n\
657 Vertex_Y = (float4)(my_vertex[COORD_2+1]); \n\
658 Vertex_Z = (float4)(my_vertex[COORD_2+2]); \n\
659 tv2 = mad(matColumn1,Vertex_X,mad(matColumn2,Vertex_Y,mad(matColumn3,Vertex_Z,matColumn4))); \n\
661 /* third vertex */ \n\
662 Vertex_X = (float4)(my_vertex[COORD_3+0]); \n\
663 Vertex_Y = (float4)(my_vertex[COORD_3+1]); \n\
664 Vertex_Z = (float4)(my_vertex[COORD_3+2]); \n\
665 tv3 = mad(matColumn1,Vertex_X,mad(matColumn2,Vertex_Y,mad(matColumn3,Vertex_Z,matColumn4))); \n\
668 /* calculate normal for face from transformed vertices */ \n\
669 /* this replicates polynormalf for opencl */ \n\
670 #define VEC_DIST_1 (tv2-tv1) \n\
671 #define VEC_DIST_2 (tv3-tv1) \n\
672 float4 norm = normalize(cross(VEC_DIST_1,VEC_DIST_2)); \n\
674 /* from polyrep_disp_rec2, see that function for full comments */ \n\
675 bool frontfacing; \n\
677 /* how we view it from the avatar */ \n\
678 if (face_ccw) frontfacing = (dot(norm,tv1) < 0); \n\
679 else frontfacing = (dot(norm,tv1) >= 0); \n\
681 /* now, is solid false, or ccw or ccw winded triangle? */ \n\
682 /* if we should do this triangle, the if statement is true */ \n\
684 bool should_do_this_triangle = \n\
685 ((frontfacing && !(face_flags & PR_DOUBLESIDED) ) \n\
686 || ( (face_flags & PR_DOUBLESIDED) && !(face_flags & (PR_FRONTFACING | PR_BACKFACING) ) ) \n\
687 || (frontfacing && (face_flags & PR_FRONTFACING)) \n\
688 || (!frontfacing && (face_flags & PR_BACKFACING)) ); \n\
691 if (!should_do_this_triangle) { \n\
692 output[i_am_canadian] = (float4)(0.0f,0.0f,0.0f,0.0f); \n\
698 /* if we are down to here, we have to do this triangle */ \n\
700 if(!frontfacing) { /*can only be here in DoubleSided mode*/ \n\
701 /*reverse polygon orientation, and do calculations*/ \n\
702 norm = VECSCALE(norm,-1.0f); \n\
705 /********************************************************************************/ \n\
707 /* Collide Kernel Step 2: do hit calculations */ \n\
708 /* replicate Dougs get_poly_min_disp_with_sphere function */ \n\
710 /********************************************************************************/ \n\
712 float4 closest_point = closest_point_on_plane(tv1,tv2,tv3); \n\
714 float get_poly_mindisp = dot(closest_point,closest_point); \n\
716 if (get_poly_mindisp > (avatar_radius * avatar_radius)) { \n\
717 output[i_am_canadian] = (float4)(0.0f,0.0f,0.0f,0.0f); \n\
721 /* do we have a movement here? */ \n\
722 if (VECLENGTH(closest_point) > FLOAT_TOLERANCE) { \n\
723 float poly_min_rt = sqrt(get_poly_mindisp); \n\
724 float sFactor = (avatar_radius -poly_min_rt) /VECLENGTH(closest_point); \n\
726 float4 result = VECSCALE(closest_point,sFactor); \n\
727 /* copy over the result */ \n\
728 result.w = 100.0f; /* flag that this is a good one */ \n\
729 output[i_am_canadian] = result; \n\
734 /* if we are down to here, we can just return zero */ \n\
735 output[i_am_canadian] = (float4)(0.0f,0.0f,0.0f,0.0f); \n\