FreeWRL / FreeX3D 4.3.0
CollisionGPU.c
1/*
2
3
4Render the children of nodes.
5
6*/
7
8/****************************************************************************
9 This file is part of the FreeWRL/FreeX3D Distribution.
10
11 Copyright 2009 CRC Canada. (http://www.crc.gc.ca)
12
13 FreeWRL/FreeX3D is free software: you can redistribute it and/or modify
14 it under the terms of the GNU Lesser Public License as published by
15 the Free Software Foundation, either version 3 of the License, or
16 (at your option) any later version.
17
18 FreeWRL/FreeX3D is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
22
23 You should have received a copy of the GNU General Public License
24 along with FreeWRL/FreeX3D. If not, see <http://www.gnu.org/licenses/>.
25****************************************************************************/
26
27
28#include <config.h>
29#include <system.h>
30#include <display.h>
31#include <internal.h>
32
33#include <libFreeWRL.h>
34
35#include "Viewer.h"
36#include "RenderFuncs.h"
37#include "../vrml_parser/Structs.h"
38
39#include "../main/headers.h"
40
41#include "LinearAlgebra.h"
42#include "Collision.h"
43// OLD_IPHONE_AQUA #if !defined(_ANDROID) && !defined(IPHONE)
44#if !defined(_ANDROID)
45#include "../opencl/OpenCL_Utils.h"
46#endif
47#ifdef HAVE_OPENCL
48
49static const char* collide_non_walk_kernel;
50static const char* collide_non_walk_kernel_headers;
51
52#define FLOAT_TOLERANCE 0.00000001
53
54/********************************************************************************/
55/* */
56/* Collide kernel, generic structures, etc */
57/* */
58/********************************************************************************/
59
60/********************************************************************************/
61/* */
62/* */
63/********************************************************************************/
64#ifdef _MSC_VER
65cl_platform_id cpPlatform = NULL; // OpenCL platform
66cl_device_id* cdDevices = NULL; // device list
67cl_uint uiTargetDevice = 0; // Default Device to compute on
68
69
70
71cl_int ciErrNum; // Error code var
72enum LOGMODES
73{
74 LOGCONSOLE = 1, // bit to signal "log to console"
75 LOGFILE = 2, // bit to signal "log to file"
76 LOGBOTH = 3, // convenience union of first 2 bits to signal "log to both"
77 APPENDMODE = 4, // bit to set "file append" mode instead of "replace mode" on open
78 MASTER = 8, // bit to signal master .csv log output
79 ERRORMSG = 16, // bit to signal "pre-pend Error"
80 CLOSELOG = 32 // bit to close log file, if open, after any requested file write
81};
82int bQATest = 0; // false = normal GL loop, true = run No-GL test sequence
83bool bGLinteropSupported = false; // state var for GL interop supported or not
84cl_uint uiNumDevsUsed = 1; // Number of devices used in this sample
85bool bGLinterop = false; // state var for GL interop or not
86/*
87void Cleanup(int iExitCode)
88{
89 // Cleanup allocated objects
90 //shrLog("\nStarting Cleanup...\n\n");
91 if(program)clReleaseProgram(program);
92 if(context)clReleaseContext(context);
93 if(cdDevices)free(cdDevices);
94
95 // Cleanup GL objects if used
96 if (!bQATest)
97 {
98 //DeInitGL();
99 }
100
101 // finalize logs and leave
102 //shrLog("%s\n\n", iExitCode == 0 ? "PASSED" : "FAILED");
103 if ((bQATest))
104 {
105 // shrLogEx(LOGBOTH | CLOSELOG, 0, "oclBoxFilter.exe Exiting...\n");
106 }
107 else
108 {
109 //shrLogEx(LOGBOTH | CLOSELOG, 0, "oclBoxFilter.exe Exiting...\nPress <Enter> to Quit\n");
110 #ifdef WIN32
111 getchar();
112 #endif
113 }
114 exit (iExitCode);
115}
116void (*pCleanup)(int) = &Cleanup;
117*/
118#define shrLog printf
119#endif //_MSC_VER
120
121
122
123
124bool collision_initGPUCollide (struct sCollisionGPU* initme) {
125 ppOpenCL_Utils p;
126 ttglobal tg = gglobal();
127 p = (ppOpenCL_Utils)tg->OpenCL_Utils.prv;
128
129 char *kp[2];
130 cl_int err;
131 size_t kernel_wg_size;
132 size_t rvlen;
133
134 //printf ("called initGPUCollide, p is %p\n",p);
135 //printf ("called initGPUCollide, context is %p\n",p->CL_context);
136 //printf ("called initGPUCollide, initme is %p\n",initme);
137
138 kp[0] = (char *)collide_non_walk_kernel_headers;
139 kp[1] = (char *)collide_non_walk_kernel;
140
141 initme->CollideGPU_program = clCreateProgramWithSource(p->CL_context, 2, (const char **) kp, NULL, &err);
142 //printf ("past the clCreateProgramWithSource call\n");
143
144 if (!initme->CollideGPU_program || (err != CL_SUCCESS)) {
145 printCLError("clCreateProgramWithSource",err);
146 return FALSE;
147 }
148
149 // build the compute program executable
150 //char *opts = "-Werror -cl-single-precision-constant -cl-nv-verbose -g -cl-opt-disable -cl-strict-aliasing";
151 //char *opts = "-Werror -cl-single-precision-constant -cl-opt-disable -cl-strict-aliasing";
152 //err = clBuildProgram(p->program, 0, NULL, opts, NULL, NULL);
153 //ConsoleMessage ("calling clBuildProgram with program %p\n",p->program);
154
155 // build the program, hard code in devices to 1 device, with the device list, no options
156 char *opts = NULL;
157 err = clBuildProgram(initme->CollideGPU_program, 1, &(p->CL_device_id), opts, NULL, NULL);
158 //printf ("past the clBuildProgram call\n");
159
160 //ConsoleMessage ("called clBuildProgram error %d\n",err);
161 if (err != CL_SUCCESS) {
162 size_t len;
163 char buffer[16384];
164
165 ConsoleMessage("Error: Failed to build program executable\n");
166 printCLError("clBuildProgram",err);
167 err = clGetProgramBuildInfo(initme->CollideGPU_program, p->CL_device_id, CL_PROGRAM_BUILD_LOG,
168 sizeof(buffer), buffer, &len);
169 TEST_ERR("clGetProgramBuildInfo",err);
170 ConsoleMessage ("error string len %d\n",(int)len);
171 ConsoleMessage("%s\n", buffer);
172 return FALSE;
173 }
174
175 // create the compute kernel
176 initme->CollideGPU_kernel = clCreateKernel(initme->CollideGPU_program, "compute_collide", &err);
177 // printf ("kernel is %p\n",initme->CollideGPU_kernel);
178
179 if (!initme->CollideGPU_kernel || (err != CL_SUCCESS)) {
180 printCLError("clCreateKernel",err);
181 return FALSE;
182 }
183
184
185 // Kernel Workgroup size
186 // rv = clGetDeviceInfo (p->device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &wg_size, &rvlen);
187 err = clGetKernelWorkGroupInfo (initme->CollideGPU_kernel, p->CL_device_id,
188 CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernel_wg_size, &rvlen);
189
190 if (err!=CL_SUCCESS) {
191 printCLError( "clGetKernelWorkGroupInfo",err);
192 return FALSE;
193 }
194
195 // try the smaller of the two
196 if (kernel_wg_size < p->CL_default_workgroup_size) initme->CollideGPU_workgroup_size = kernel_wg_size;
197 else initme->CollideGPU_workgroup_size = p->CL_default_workgroup_size;
198
199 #ifdef GPU_DEBUG
200 ConsoleMessage ("MAX_WORK_GROUP_SIZE %d\n",kernel_wg_size);
201 ConsoleMessage ("We are going to set our workgroup size to %d\n",wg_size);
202
203
204/*
2051. Get workGroupSize from clGetDeviceInfo with CL_DEVICE_mum of two values and use that value as your optimal workGroupSize
2062. Get KernelWorkGroupSize from from clGetKernelWorkGroupInfo with CL_KERNEL_WORK_GPOUP_SIZE
2073. Get minimum of two values and use that value as your optimal workGroupSize
208*/
209
210 #endif // GPU_DEBUG
211
212 return TRUE;
213}
214
215
216/********************************************************************************/
217/* */
218/* */
219/********************************************************************************/
220
221int printedOnce = TRUE;
222
223
224struct point_XYZ run_non_walk_collide_program(GLuint vertex_vbo, GLuint index_vbo, float *modelMat,int ntri,
225 int face_ccw, int face_flags, float avatar_radius) {
226
227 int i;
228 cl_int err;
229 size_t local_work_size;
230 size_t global_work_size;
231 unsigned int count;
232
233 double maxdisp = 0.0;
234 struct point_XYZ dispv, maxdispv = {0,0,0};
235
236 struct sCollisionGPU* me = GPUCollisionInfo();
237
238 ppOpenCL_Utils p;
239 ttglobal tg = gglobal();
240 p = (ppOpenCL_Utils)tg->OpenCL_Utils.prv;
241
242 // enough space for rv?
243 if (me->CollideGPU_returnValues.n < ntri) {
244 if (me->CollideGPU_returnValues.n != 0) {
245 err = clReleaseMemObject(me->CollideGPU_output_buffer);
246 TEST_ERR("clReleaseMemObject",err);
247 }
248
249 me->CollideGPU_output_buffer = clCreateBuffer(p->CL_context, CL_MEM_WRITE_ONLY, sizeof(struct SFColorRGBA) * ntri,
250 NULL, NULL);
251
252 if (me->CollideGPU_matrix_buffer == NULL) {
253 me->CollideGPU_matrix_buffer = clCreateBuffer(p->CL_context, CL_MEM_READ_ONLY, sizeof (cl_float16), NULL, NULL);
254 }
255
256 if (!(me->CollideGPU_output_buffer) || !(me->CollideGPU_matrix_buffer)) {
257 printCLError("clCreateBuffer",10000);
258 }
259
260 me->CollideGPU_output_size = ntri;
261 me->CollideGPU_returnValues.p = REALLOC(me->CollideGPU_returnValues.p, sizeof(struct SFColorRGBA) *ntri);
262 me->CollideGPU_returnValues.n = ntri;
263 }
264
265 // update the current matrix transform
266 err = clEnqueueWriteBuffer(p->CL_queue, me->CollideGPU_matrix_buffer, CL_TRUE, 0, sizeof(cl_float16), modelMat, 0, NULL, NULL);
267 TEST_ERR("clEnqueueWriteBuffer",err);
268
269 // lets get the openGL vertex buffer here
270 me->CollideGPU_vertex_buffer=clCreateFromGLBuffer(p->CL_context, CL_MEM_READ_ONLY, vertex_vbo, &err);
271 if (err != CL_SUCCESS) {
272 printCLError("clCreateFromGLBuffer",err);
273 return maxdispv;
274 }
275
276 // and the coordinate index buffer
277 me->CollideGPU_index_buffer = clCreateFromGLBuffer(p->CL_context, CL_MEM_READ_ONLY, index_vbo, &err);
278 if (err != CL_SUCCESS) {
279 printCLError("clCreateFromGLBuffer",err);
280 return maxdispv;
281 }
282
283 // set the args values
284 count = (unsigned int) ntri;
285
286 err = clSetKernelArg(me->CollideGPU_kernel, 0, sizeof(cl_mem), &me->CollideGPU_output_buffer);
287 TEST_ERR("clSetKernelArg",err);
288
289 err =clSetKernelArg(me->CollideGPU_kernel, 1, sizeof(unsigned int), &count);
290 TEST_ERR("clSetKernelArg",err);
291
292 err =clSetKernelArg(me->CollideGPU_kernel, 2, sizeof (cl_mem), &me->CollideGPU_matrix_buffer);
293 TEST_ERR("clSetKernelArg",err);
294
295 err =clSetKernelArg(me->CollideGPU_kernel, 3, sizeof (cl_mem), &me->CollideGPU_vertex_buffer);
296 TEST_ERR("clSetKernelArg",err);
297
298 err =clSetKernelArg(me->CollideGPU_kernel, 4, sizeof (cl_mem), &me->CollideGPU_index_buffer);
299 TEST_ERR("clSetKernelArg",err);
300
301 err =clSetKernelArg(me->CollideGPU_kernel, 5, sizeof(int), &face_ccw);
302 TEST_ERR("clSetKernelArg",err);
303
304 err =clSetKernelArg(me->CollideGPU_kernel, 6, sizeof(int), &face_flags);
305 TEST_ERR("clSetKernelArg",err);
306
307 err =clSetKernelArg(me->CollideGPU_kernel, 7, sizeof(int), &avatar_radius);
308 TEST_ERR("clSetKernelArg",err);
309
310 err =clSetKernelArg(me->CollideGPU_kernel, 8, sizeof(int), &ntri);
311 TEST_ERR("clSetKernelArg",err);
312
313 // global work group size
314 #define MYWG (me->CollideGPU_workgroup_size)
315 // find out how many "blocks" we can have
316 if (MYWG > 0)
317 global_work_size = (size_t) (ntri) / MYWG;
318 else global_work_size = 0;
319
320 // add 1 to it, because we have to round up
321 global_work_size += 1;
322
323 // now, global_work_size will be an exact multiple of local_work_size
324 global_work_size *= MYWG;
325
326 //ConsoleMessage ("global_work_size is %d %x right now...\n",global_work_size, global_work_size);
327
328 local_work_size = MYWG;
329 //ConsoleMessage ("local_work_size %d\n",local_work_size);
330 //ConsoleMessage ("ntri %d, global_work_size %d, local_work_size %d\n",ntri,global_work_size,local_work_size);
331
332if (!printedOnce) {
333 cl_context myContext;
334 cl_device_id myDevice;
335 cl_uint myReference;
336
337
338 err = clGetCommandQueueInfo(p->CL_queue, CL_QUEUE_CONTEXT, sizeof(myContext), &myContext, NULL);
339printf ("queue context when commandqueue created %p, should be %p\n", myContext,p->CL_context);
340 err = clGetCommandQueueInfo(p->CL_queue, CL_QUEUE_DEVICE, sizeof(myDevice), &myDevice, NULL);
341printf ("queue Device when commandqueue created %p, should be %p\n", myDevice,p->CL_device_id);
342 err = clGetCommandQueueInfo(p->CL_queue, CL_QUEUE_REFERENCE_COUNT, sizeof(myReference), &myReference, NULL);
343printf ("queue Reference when commandqueue created %d\n", myReference);
344
345}
346
347 err = clEnqueueNDRangeKernel(p->CL_queue, me->CollideGPU_kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL);
348if (!printedOnce) {
349 if (err != CL_SUCCESS) {
350 printCLError("clEnqueueNDRangeKernel",err);
351 return maxdispv;
352 }
353printedOnce = TRUE;
354}
355
356#ifdef TRY_FLUSH
357 // wait for things to finish
358 err = clFlush(p->CL_queue);
359 if (err != CL_SUCCESS) {
360 printCLError("clFlush",err);
361 return maxdispv;
362 }
363
364 err = clFinish(p->CL_queue);
365 if (err != CL_SUCCESS) {
366 printCLError("clFinish",err);
367 return maxdispv;
368 }
369#endif
370
371 // get the data
372
373 // get the data
374 /* working code has:
375 err = clEnqueueReadBuffer (me->queue, me->output_buffer,
376 CL_TRUE, 0, sizeof(struct SFColorRGBA) * ntri,
377 me->collide_rvs.p, 0, NULL, NULL);
378*/
379
380
381 err = clEnqueueReadBuffer (p->CL_queue, me->CollideGPU_output_buffer,
382 CL_TRUE, 0, sizeof(struct SFColorRGBA) * ntri,
383 me->CollideGPU_returnValues.p, 0, NULL, NULL);
384
385 if (err != CL_SUCCESS) {
386 printCLError("clEnqueueReadBuffer",err);
387 return maxdispv;
388 }
389
390
391 for (i=0; i < ntri; i++) {
392 /* XXX float to double conversion; make a vecdotf for speed */
393 double disp;
394 /* printf ("i %d rv %f %f %f %f\n",i,me->CollideGPU_returnValues.p[i].c[0],
395 me->CollideGPU_returnValues.p[i].c[1],me->CollideGPU_returnValues.p[i].c[2],
396 me->CollideGPU_returnValues.p[i].c[3]);
397 */
398 // we use the last float to indicate whether to bother here; saves us
399 // doing unneeded calculations here
400
401 if (me->CollideGPU_returnValues.p[i].c[3] > 1.0) {
402 //ConsoleMessage ("possibly triangle %d has some stuff for us\n",i);
403
404
405 dispv.x = me->CollideGPU_returnValues.p[i].c[0];
406 dispv.y = me->CollideGPU_returnValues.p[i].c[1];
407 dispv.z = me->CollideGPU_returnValues.p[i].c[2];
408 //ConsoleMessage ("GPU collide tri %d, disp %f %f %f\n",i,dispv.x,dispv.y,dispv.z);
409
410 /*keep result only if:
411 displacement is positive
412 displacement is smaller than minimum displacement up to date
413 */
414
415 disp = vecdot (&dispv,&dispv);
416 if ((disp > FLOAT_TOLERANCE) && (disp>maxdisp)) {
417 maxdisp = disp;
418 maxdispv = dispv;
419 }
420 }
421
422 }
423
424
425 //ConsoleMessage ("OpenCL ntri %d - at end of opencl, maxdispv %f %f %f\n",ntri, maxdispv.x, maxdispv.y, maxdispv.z);
426
427 return maxdispv;
428}
429
430#ifdef GL_ES_VERSION_2_0
431static const char* collide_non_walk_kernel_headers = " \
432//#pragma OPENCL EXTENSION cl_khr_fp64 : enable \n\
433#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable \n\
434//#pragma OPENCL EXTENSION CL_APPLE_gl_sharing : enable \n\
435//#pragma OPENCL EXTENSION CL_KHR_gl_sharing : enable \n\
436//#pragma OPENCL EXTENSION cl_khr_select_fprounding_mode : enable \n\
437";
438#else
439
440// OLD_IPHONE_AQUA #if defined (TARGET_AQUA)
441// OLD_IPHONE_AQUA static const char* collide_non_walk_kernel_headers = " \
442// OLD_IPHONE_AQUA //#pragma OPENCL EXTENSION cl_khr_fp64 : enable \n\
443// OLD_IPHONE_AQUA #pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable \n\
444// OLD_IPHONE_AQUA #pragma OPENCL EXTENSION CL_APPLE_gl_sharing : enable \n\
445// OLD_IPHONE_AQUA #pragma OPENCL EXTENSION CL_KHR_gl_sharing : enable \n\
446// OLD_IPHONE_AQUA #pragma OPENCL EXTENSION cl_khr_select_fprounding_mode : enable \n\
447// OLD_IPHONE_AQUA ";
448// OLD_IPHONE_AQUA #else
449
450// this seems to be ok on AMD drivers under Linux
451static const char* collide_non_walk_kernel_headers = " \
452#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable \n\
453";
454// OLD_IPHONE_AQUA #endif // AQUA
455
456#endif
457
458
459
460static const char* collide_non_walk_kernel = " \
461 \n\
462/********************************************************************************/ \n\
463/* */ \n\
464/* Collide kernel for fly and examine modes */ \n\
465/* */ \n\
466/********************************************************************************/ \n\
467/* Function prototypes */ \n\
468float4 closest_point_on_plane(float4 point_a, float4 point_b, float4 point_c); \n\
469 \n\
470/* start the collide process. \n\
471 \n\
4721) transform the vertex. \n\
4732) calculate normal \n\
4743) if triangle is visible to us, get ready for collide calcs \n\
475 \n\
476*/ \n\
477 \n\
478 \n\
479#define DOUGS_FLOAT_TOLERANCE 0.00000001 \n\
480#define FLOAT_TOLERANCE 0.0000001f \n\
481#define PR_DOUBLESIDED 0x01 \n\
482#define PR_FRONTFACING 0x02 /* overrides effect of doublesided. */ \n\
483#define PR_BACKFACING 0x04 /* overrides effect of doublesided, all normals are reversed. */ \n\
484 \n\
485/********************************************************************************/ \n\
486 \n\
487 \n\
488#define APPROX (a, b) (fabs(a-b) < FLOAT_TOLERANCE) \n\
489#define VECSCALE(v,s) (float4)(v.x*s, v.y*s, v.z*s, 0.0f) \n\
490#define VECLENGTH(v) (float)sqrt((float)dot((float4)v,(float4)v)) \n\
491 \n\
492 \n\
493 \n\
494/********************************************************************************/ \n\
495/* */ \n\
496/* Three vertices; find the closest one which intersects the Z plane; */ \n\
497/* either we choose a Vertex, on an edge, or fabricate one in the */ \n\
498/* middle of the triangle somewhere. */ \n\
499/* */ \n\
500/* Adapted from \"Real time Collision Detection\", Christer Ericson. */ \n\
501/* */ \n\
502/********************************************************************************/ \n\
503 \n\
504 \n\
505float4 closest_point_on_plane(float4 point_a, float4 point_b, float4 point_c) { \n\
506 float4 vector_ab = (point_b - point_a); // b - a \n\
507 float4 vector_ac = (point_c - point_a); // c - a \n\
508 float4 vector_bc = (point_c - point_b); // c - b \n\
509 float4 vector_ba = (point_a - point_b); // a - b \n\
510 float4 vector_ca = (point_a - point_c); // a - c \n\
511 float4 vector_cb = (point_b - point_c); // b - c \n\
512 \n\
513 \n\
514 // we have moved points, so our bounding sphere is at (0,0,0) so p = (0,0,0) \n\
515 float4 vector_ap = point_a * (float4)(-1.0f, -1.0f, -1.0f, -1.0f); // p - a \n\
516 float4 vector_bp = point_b * (float4)(-1.0f, -1.0f, -1.0f, -1.0f); // p - b \n\
517 float4 vector_cp = point_c * (float4)(-1.0f, -1.0f, -1.0f, -1.0f); // p - c \n\
518 #define vector_pa point_a /* a - p */ \n\
519 #define vector_pb point_b /* b - p */ \n\
520 #define vector_pc point_c /* c - p */ \n\
521 \n\
522 // Step 2. Compute parametric position s for projection P' of P on AB, \n\
523 // P' = A + s*AB, s = snom/(snom+sdenom) \n\
524 \n\
525 float snom = dot(vector_ap, vector_ab); // (p - a, ab); \n\
526 float sdenom = dot(vector_bp, vector_ba); // (p - b, a - b); \n\
527 \n\
528 // Step 3. \n\
529 // Compute parametric position t for projection P' of P on AC, \n\
530 // P' = A + t*AC, s = tnom/(tnom+tdenom) \n\
531 float tnom = dot(vector_ap, vector_ac); // (p - a, ac); \n\
532 float tdenom = dot(vector_cp, vector_ca); // (p - c, a - c); \n\
533 \n\
534 // Step 4. \n\
535 if (snom <= 0.0f && tnom <= 0.0f) { \n\
536 return point_a; \n\
537 } \n\
538 \n\
539 // Step 5. \n\
540 // Compute parametric position u for projection P' of P on BC, \n\
541 // P' = B + u*BC, u = unom/(unom+udenom) \n\
542 float unom = dot(vector_bp, vector_bc); //(p - b, bc) \n\
543 float udenom = dot(vector_cp, vector_cb); // (p - c, b - c); \n\
544 \n\
545 // Step 6. \n\
546 if (sdenom <= 0.0f && unom <= 0.0f) { \n\
547 return point_b; \n\
548 } \n\
549 \n\
550 if (tdenom <= 0.0f && udenom <= 0.0f) { \n\
551 return point_c; \n\
552 } \n\
553 \n\
554 \n\
555 // Step 7. \n\
556 // P is outside (or on) AB if the triple scalar product [N PA PB] <= 0 \n\
557 float4 n; \n\
558 float4 tmp; \n\
559 float vc; \n\
560 \n\
561 n = cross(vector_ab, vector_ac); // (b - a, c - a); \n\
562 tmp = cross(vector_pa, vector_pb); // veccross (a-p, b-p); \n\
563 \n\
564 // vc = dot(n, veccross(a - p, b - p)); \n\
565 vc = dot(n, tmp); \n\
566 \n\
567 \n\
568 // If P outside AB and within feature region of AB, \n\
569 // return projection of P onto AB \n\
570 if (vc <= 0.0f && snom >= 0.0f && sdenom >= 0.0f) { \n\
571 return point_a + snom / (snom + sdenom) * vector_ab; \n\
572 } \n\
573 \n\
574 \n\
575 \n\
576 // Step 8. \n\
577 // P is outside (or on) BC if the triple scalar product [N PB PC] <= 0 \n\
578 tmp = cross (vector_pb, vector_pc); \n\
579 \n\
580 float va = dot(n, tmp); // Cross(b - p, c - p)); \n\
581 \n\
582 // If P outside BC and within feature region of BC, \n\
583 // return projection of P onto BC \n\
584 if (va <= 0.0f && unom >= 0.0f && udenom >= 0.0f) { \n\
585 return point_b + unom / (unom + udenom) * vector_bc; \n\
586 } \n\
587 \n\
588 // Step 9. \n\
589 // P is outside (or on) CA if the triple scalar product [N PC PA] <= 0 \n\
590 tmp = cross (vector_pc, vector_pa); \n\
591 \n\
592 float vb = dot(n, tmp); // Cross(c - p, a - p)); \n\
593 // If P outside CA and within feature region of CA, \n\
594 // return projection of P onto CA \n\
595 if (vb <= 0.0f && tnom >= 0.0f && tdenom >= 0.0f) { \n\
596 return point_a + tnom / (tnom + tdenom) * vector_ac; \n\
597 } \n\
598 \n\
599 // 10. \n\
600 // P must project inside face region. Compute Q using barycentric coordinates \n\
601 float u = va / (va + vb + vc); \n\
602 float v = vb / (va + vb + vc); \n\
603 float w = 1.0f - u - v; // = vc / (va + vb + vc) \n\
604 float4 u4 = (float4)(u); \n\
605 float4 v4 = (float4)(v); \n\
606 float4 w4 = (float4)(w); \n\
607 \n\
608 //return u * point_a + v * point_b + w * point_c; \n\
609 float4 rv = mad(point_a,u4,mad(point_b,v4,point_c*w4)); \n\
610 return rv; \n\
611} \n\
612 \n\
613/********************************************************************************/ \n\
614 \n\
615 __kernel void compute_collide ( \n\
616 __global float4 *output, /* 0 */ \n\
617 const unsigned int count, /* 1 */ \n\
618 __global float *mymat, /* 2 */ \n\
619 __global float *my_vertex, /* 3 */ \n\
620 __global short *my_cindex, /* 4 */ \n\
621 const int face_ccw, /* 5 */ \n\
622 const int face_flags, /* 6 */ \n\
623 const float avatar_radius, /* 7 */ \n\
624 const int ntri /* 8 */ \n\
625 ) { \n\
626 \n\
627 /* which index this instantation is working on */ \n\
628 int i_am_canadian = get_global_id(0); \n\
629 if (i_am_canadian >= ntri) return; /* allows for workgroup size sizes */ \n\
630 \n\
631 /* vertices for this triangle */ \n\
632 /* transformed by matrix */ \n\
633 float4 tv1; \n\
634 float4 tv2; \n\
635 float4 tv3; \n\
636 \n\
637 /* starting index in my_vertex of this vertex */ \n\
638 /* we work in triangles; each triangle has 3 vertices */ \n\
639 #define COORD_1 (my_cindex[i_am_canadian*3+0]*3) \n\
640 #define COORD_2 (my_cindex[i_am_canadian*3+1]*3) \n\
641 #define COORD_3 (my_cindex[i_am_canadian*3+2]*3) \n\
642 \n\
643 /* do matrix transform, 4 floats wide. */ \n\
644 float4 matColumn1 = (float4)(convert_float(mymat[0]),convert_float(mymat[1]),convert_float(mymat[2]),0.0f); \n\
645 float4 matColumn2 = (float4)(convert_float(mymat[4]),convert_float(mymat[5]),convert_float(mymat[6]),0.0f); \n\
646 float4 matColumn3 = (float4)(convert_float(mymat[8]),convert_float(mymat[9]),convert_float(mymat[10]),0.0f); \n\
647 float4 matColumn4 = (float4)(convert_float(mymat[12]),convert_float(mymat[13]),convert_float(mymat[14]),0.0f); \n\
648 \n\
649 /* first vertex */ \n\
650 float4 Vertex_X = (float4)(my_vertex[COORD_1+0]); \n\
651 float4 Vertex_Y = (float4)(my_vertex[COORD_1+1]); \n\
652 float4 Vertex_Z = (float4)(my_vertex[COORD_1+2]); \n\
653 tv1 = mad(matColumn1,Vertex_X,mad(matColumn2,Vertex_Y,mad(matColumn3,Vertex_Z,matColumn4))); \n\
654 \n\
655 /* second vertex */ \n\
656 Vertex_X = (float4)(my_vertex[COORD_2+0]); \n\
657 Vertex_Y = (float4)(my_vertex[COORD_2+1]); \n\
658 Vertex_Z = (float4)(my_vertex[COORD_2+2]); \n\
659 tv2 = mad(matColumn1,Vertex_X,mad(matColumn2,Vertex_Y,mad(matColumn3,Vertex_Z,matColumn4))); \n\
660 \n\
661 /* third vertex */ \n\
662 Vertex_X = (float4)(my_vertex[COORD_3+0]); \n\
663 Vertex_Y = (float4)(my_vertex[COORD_3+1]); \n\
664 Vertex_Z = (float4)(my_vertex[COORD_3+2]); \n\
665 tv3 = mad(matColumn1,Vertex_X,mad(matColumn2,Vertex_Y,mad(matColumn3,Vertex_Z,matColumn4))); \n\
666 \n\
667 \n\
668 /* calculate normal for face from transformed vertices */ \n\
669 /* this replicates polynormalf for opencl */ \n\
670 #define VEC_DIST_1 (tv2-tv1) \n\
671 #define VEC_DIST_2 (tv3-tv1) \n\
672 float4 norm = normalize(cross(VEC_DIST_1,VEC_DIST_2)); \n\
673 \n\
674 /* from polyrep_disp_rec2, see that function for full comments */ \n\
675 bool frontfacing; \n\
676 \n\
677 /* how we view it from the avatar */ \n\
678 if (face_ccw) frontfacing = (dot(norm,tv1) < 0); \n\
679 else frontfacing = (dot(norm,tv1) >= 0); \n\
680 \n\
681 /* now, is solid false, or ccw or ccw winded triangle? */ \n\
682 /* if we should do this triangle, the if statement is true */ \n\
683 \n\
684 bool should_do_this_triangle = \n\
685 ((frontfacing && !(face_flags & PR_DOUBLESIDED) ) \n\
686 || ( (face_flags & PR_DOUBLESIDED) && !(face_flags & (PR_FRONTFACING | PR_BACKFACING) ) ) \n\
687 || (frontfacing && (face_flags & PR_FRONTFACING)) \n\
688 || (!frontfacing && (face_flags & PR_BACKFACING)) ); \n\
689 \n\
690 \n\
691 if (!should_do_this_triangle) { \n\
692 output[i_am_canadian] = (float4)(0.0f,0.0f,0.0f,0.0f); \n\
693\
694 return; \n\
695 } \n\
696 \n\
697 \n\
698 /* if we are down to here, we have to do this triangle */ \n\
699 \n\
700 if(!frontfacing) { /*can only be here in DoubleSided mode*/ \n\
701 /*reverse polygon orientation, and do calculations*/ \n\
702 norm = VECSCALE(norm,-1.0f); \n\
703 } \n\
704 \n\
705 /********************************************************************************/ \n\
706 /* */ \n\
707 /* Collide Kernel Step 2: do hit calculations */ \n\
708 /* replicate Dougs get_poly_min_disp_with_sphere function */ \n\
709 /* */ \n\
710 /********************************************************************************/ \n\
711 \n\
712 float4 closest_point = closest_point_on_plane(tv1,tv2,tv3); \n\
713 \n\
714 float get_poly_mindisp = dot(closest_point,closest_point); \n\
715 \n\
716 if (get_poly_mindisp > (avatar_radius * avatar_radius)) { \n\
717 output[i_am_canadian] = (float4)(0.0f,0.0f,0.0f,0.0f); \n\
718 return; \n\
719 } \n\
720 \n\
721 /* do we have a movement here? */ \n\
722 if (VECLENGTH(closest_point) > FLOAT_TOLERANCE) { \n\
723 float poly_min_rt = sqrt(get_poly_mindisp); \n\
724 float sFactor = (avatar_radius -poly_min_rt) /VECLENGTH(closest_point); \n\
725 \n\
726 float4 result = VECSCALE(closest_point,sFactor); \n\
727 /* copy over the result */ \n\
728 result.w = 100.0f; /* flag that this is a good one */ \n\
729 output[i_am_canadian] = result; \n\
730 return; \n\
731 } \n\
732 \n\
733 \n\
734 /* if we are down to here, we can just return zero */ \n\
735 output[i_am_canadian] = (float4)(0.0f,0.0f,0.0f,0.0f); \n\
736} \n\
737";
738
739#endif //HAVE_OPENCL