diff --git a/Extras/BulletMultiThreaded/CMakeLists.txt b/Extras/BulletMultiThreaded/CMakeLists.txt
index 3ae2a40fe..dc602b5b1 100644
--- a/Extras/BulletMultiThreaded/CMakeLists.txt
+++ b/Extras/BulletMultiThreaded/CMakeLists.txt
@@ -21,6 +21,8 @@ ADD_LIBRARY(LibBulletMultiThreaded
 		SpuSampleTaskProcess.h
 		SpuSampleTaskProcess.cpp
 
+		SpuCollisionObjectWrapper.cpp 
+		SpuCollisionObjectWrapper.h 
 		SpuCollisionTaskProcess.h
 		SpuCollisionTaskProcess.cpp
 		SpuGatheringCollisionDispatcher.h
@@ -39,15 +41,20 @@ ADD_LIBRARY(LibBulletMultiThreaded
 		SpuNarrowPhaseCollisionTask/SpuVoronoiSimplexSolver.h
 		SpuNarrowPhaseCollisionTask/SpuGjkPairDetector.cpp
 		SpuNarrowPhaseCollisionTask/SpuGjkPairDetector.h
-		SpuNarrowPhaseCollisionTask/SpuLocalSupport.h
+		SpuNarrowPhaseCollisionTask/SpuCollisionShapes.cpp
+		SpuNarrowPhaseCollisionTask/SpuCollisionShapes.h
 
 		SpuParallelSolver.cpp
 		SpuParallelSolver.h
 		SpuSolverTask/SpuParallellSolverTask.cpp
 		SpuSolverTask/SpuParallellSolverTask.h
 
-#		SpuRaycastTaskProcess.cpp
-#		SpuRaycastTaskProcess.h
-#		SpuRaycastTask/SpuRaycastTask.cpp
-#		SpuRaycastTask/SpuRaycastTask.h
+		SpuBatchRaycaster.cpp
+		SpuBatchRaycaster.h
+		SpuRaycastTaskProcess.cpp
+		SpuRaycastTaskProcess.h
+		SpuRaycastTask/SpuRaycastTask.cpp
+		SpuRaycastTask/SpuRaycastTask.h
+		SpuRaycastTask/SpuSubSimplexConvexCast.cpp
+		SpuRaycastTask/SpuSubSimplexConvexCast.h
 )
diff --git a/Extras/BulletMultiThreaded/SequentialThreadSupport.cpp b/Extras/BulletMultiThreaded/SequentialThreadSupport.cpp
index b7158cd61..32447299e 100644
--- a/Extras/BulletMultiThreaded/SequentialThreadSupport.cpp
+++ b/Extras/BulletMultiThreaded/SequentialThreadSupport.cpp
@@ -55,7 +55,6 @@ void SequentialThreadSupport::sendRequest(uint32_t uiCommand, uint32_t uiArgumen
 
 }
 
-
 ///check for messages from SPUs
 void SequentialThreadSupport::waitForResponse(unsigned int *puiArgument0, unsigned int *puiArgument1)
 {
@@ -65,8 +64,6 @@ void SequentialThreadSupport::waitForResponse(unsigned int *puiArgument0, unsign
 	*puiArgument1 = spuStatus.m_status;
 }
 
-
-
 void SequentialThreadSupport::startThreads(SequentialThreadConstructionInfo& threadConstructionInfo)
 {
 	m_activeSpuStatus.resize(1);
@@ -78,7 +75,7 @@ void SequentialThreadSupport::startThreads(SequentialThreadConstructionInfo& thr
 	spuStatus.m_status = 0;
 	spuStatus.m_lsMemory = threadConstructionInfo.m_lsMemoryFunc();
 	spuStatus.m_userThreadFunc = threadConstructionInfo.m_userThreadFunc;
-	printf("STS: Created local store at %p for function %p\n",spuStatus.m_lsMemory, spuStatus.m_userThreadFunc);
+	printf("STS: Created local store at %p for task %s\n", spuStatus.m_lsMemory, threadConstructionInfo.m_uniqueName);
 }
 
 void SequentialThreadSupport::startSPU()
diff --git a/Extras/BulletMultiThreaded/SpuBatchRaycaster.cpp b/Extras/BulletMultiThreaded/SpuBatchRaycaster.cpp
index d03944a26..dd7c76ca0 100644
--- a/Extras/BulletMultiThreaded/SpuBatchRaycaster.cpp
+++ b/Extras/BulletMultiThreaded/SpuBatchRaycaster.cpp
@@ -39,7 +39,7 @@ void
 SpuBatchRaycaster::addRay (const btVector3& rayFrom, const btVector3& rayTo)
 {
 	SpuRaycastTaskWorkUnitOut workUnitOut;
-	workUnitOut.hitFraction = 0.99;
+	workUnitOut.hitFraction = 1.0;
 	workUnitOut.hitNormal = btVector3(0.0, 1.0, 0.0);
 
 	rayBatchOutput.push_back (workUnitOut);
diff --git a/Extras/BulletMultiThreaded/SpuCollisionObjectWrapper.h b/Extras/BulletMultiThreaded/SpuCollisionObjectWrapper.h
index 3b069a34a..840c0e4b3 100644
--- a/Extras/BulletMultiThreaded/SpuCollisionObjectWrapper.h
+++ b/Extras/BulletMultiThreaded/SpuCollisionObjectWrapper.h
@@ -16,7 +16,7 @@ subject to the following restrictions:
 #include "PlatformDefinitions.h"
 #include "BulletCollision/CollisionDispatch/btCollisionObject.h"
 
-class SpuCollisionObjectWrapper
+ATTRIBUTE_ALIGNED16(class) SpuCollisionObjectWrapper
 {
 protected:
 	int m_shapeType;
diff --git a/Extras/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuCollisionShapes.cpp b/Extras/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuCollisionShapes.cpp
index 344e5c9c0..a190ec7f1 100644
--- a/Extras/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuCollisionShapes.cpp
+++ b/Extras/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuCollisionShapes.cpp
@@ -1,221 +1,221 @@
-/*
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
-
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-*/
-
-#include "SpuCollisionShapes.h"
-
-btPoint3 localGetSupportingVertexWithoutMargin(int shapeType, void* shape, btVector3& localDir,struct	SpuConvexPolyhedronVertexData* convexVertexData)//, int *featureIndex)
-{
-    switch (shapeType)
-    {
-    case SPHERE_SHAPE_PROXYTYPE:
-        {
-            return btPoint3(0,0,0);
-        }
-	case BOX_SHAPE_PROXYTYPE:
-		{
-//			spu_printf("SPU: getSupport BOX_SHAPE_PROXYTYPE\n");
-			btConvexInternalShape* convexShape = (btConvexInternalShape*)shape;
-			const btVector3& halfExtents = convexShape->getImplicitShapeDimensions();
-			
-			return btPoint3(
-				localDir.getX() < 0.0f ? -halfExtents.x() : halfExtents.x(),
-							localDir.getY() < 0.0f ? -halfExtents.y() : halfExtents.y(),
-							localDir.getZ() < 0.0f ? -halfExtents.z() : halfExtents.z());
-		}
-
-	case TRIANGLE_SHAPE_PROXYTYPE:
-		{
-
-			btVector3 dir(localDir.getX(),localDir.getY(),localDir.getZ());
-			btVector3* vertices = (btVector3*)shape;
-			btVector3 dots(dir.dot(vertices[0]), dir.dot(vertices[1]), dir.dot(vertices[2]));
-	  		btVector3 sup = vertices[dots.maxAxis()];
-			return btPoint3(sup.getX(),sup.getY(),sup.getZ());
-			break;
-		}
-
-	case CYLINDER_SHAPE_PROXYTYPE:
-		{
-			btCylinderShape* cylShape = (btCylinderShape*)shape;
-
-			//mapping of halfextents/dimension onto radius/height depends on how cylinder local orientation is (upAxis)
-
-			btVector3 halfExtents = cylShape->getImplicitShapeDimensions();
-			btVector3 v(localDir.getX(),localDir.getY(),localDir.getZ());
-			
-			int cylinderUpAxis = cylShape->getUpAxis();
-			int XX(1),YY(0),ZZ(2);
-
-			switch (cylinderUpAxis)
-			{
-			case 0:
-				{
-					XX = 1;
-					YY = 0;
-					ZZ = 2;
-					break;
-				}
-			case 1:
-				{
-					XX = 0;
-					YY = 1;
-					ZZ = 2;
-				break;
-				}
-			case 2:
-				{
-					XX = 0;
-					YY = 2;
-					ZZ = 1;
-					break;
-				}
-			default:
-				btAssert(0);
-				//printf("SPU:localGetSupportingVertexWithoutMargin unknown Cylinder up-axis\n");
-			};
-
-			btScalar radius = halfExtents[XX];
-			btScalar halfHeight = halfExtents[cylinderUpAxis];
-
-			btVector3 tmp;
-			btScalar d ;
-
-			btScalar s = btSqrt(v[XX] * v[XX] + v[ZZ] * v[ZZ]);
-			if (s != btScalar(0.0))
-			{
-				d = radius / s;  
-				tmp[XX] = v[XX] * d;
-				tmp[YY] = v[YY] < 0.0 ? -halfHeight : halfHeight;
-				tmp[ZZ] = v[ZZ] * d;
-				return btPoint3(tmp.getX(),tmp.getY(),tmp.getZ());
-			}
-			else
-			{
-				tmp[XX] = radius;
-				tmp[YY] = v[YY] < 0.0 ? -halfHeight : halfHeight;
-				tmp[ZZ] = btScalar(0.0);
-				return btPoint3(tmp.getX(),tmp.getY(),tmp.getZ());
-			}
-		}
-
-	case CAPSULE_SHAPE_PROXYTYPE:
-	{
-		//spu_printf("SPU: todo: getSupport CAPSULE_SHAPE_PROXYTYPE\n");
-		btVector3 vec0(localDir.getX(),localDir.getY(),localDir.getZ());
-
-		btConvexInternalShape* cnvxShape = (btConvexInternalShape*)shape;
-		btVector3 halfExtents = cnvxShape->getImplicitShapeDimensions();
-		btScalar halfHeight = halfExtents.getY();
-		btScalar radius = halfExtents.getX();
-		btVector3 supVec(0,0,0);
-
-		btScalar maxDot(btScalar(-1e30));
-
-		btVector3 vec = vec0;
-		btScalar lenSqr = vec.length2();
-		if (lenSqr < btScalar(0.0001))
-		{
-			vec.setValue(1,0,0);
-		} else
-		{
-			btScalar rlen = btScalar(1.) / btSqrt(lenSqr );
-			vec *= rlen;
-		}
-		btVector3 vtx;
-		btScalar newDot;
-		{
-			btVector3 pos(0,halfHeight,0);
-			vtx = pos +vec*(radius);
-			newDot = vec.dot(vtx);
-			if (newDot > maxDot)
-			{
-				maxDot = newDot;
-				supVec = vtx;
-			}
-		}
-		{
-			btVector3 pos(0,-halfHeight,0);
-			vtx = pos +vec*(radius);
-			newDot = vec.dot(vtx);
-			if (newDot > maxDot)
-			{
-				maxDot = newDot;
-				supVec = vtx;
-			}
-		}
-		return btPoint3(supVec.getX(),supVec.getY(),supVec.getZ());
-		break;
-	};
-
-	case CONVEX_HULL_SHAPE_PROXYTYPE:
-		{
-			//spu_printf("SPU: todo: getSupport CONVEX_HULL_SHAPE_PROXYTYPE\n");
-
-		
-
-			btPoint3* points = 0;
-			int numPoints = 0;
-			points = convexVertexData->gConvexPoints;
-			numPoints = convexVertexData->gNumConvexPoints;
-
-		//	spu_printf("numPoints = %d\n",numPoints);
-
-			btVector3 supVec(btScalar(0.),btScalar(0.),btScalar(0.));
-			btScalar newDot,maxDot = btScalar(-1e30);
-
-			btVector3 vec0(localDir.getX(),localDir.getY(),localDir.getZ());
-			btVector3 vec = vec0;
-			btScalar lenSqr = vec.length2();
-			if (lenSqr < btScalar(0.0001))
-			{
-				vec.setValue(1,0,0);
-			} else
-			{
-				btScalar rlen = btScalar(1.) / btSqrt(lenSqr );
-				vec *= rlen;
-			}
-
-
-			for (int i=0;i<numPoints;i++)
-			{
-				btPoint3 vtx = points[i];// * m_localScaling;
-
-				newDot = vec.dot(vtx);
-				if (newDot > maxDot)
-				{
-					maxDot = newDot;
-					supVec = vtx;
-				}
-			}
-			return btPoint3(supVec.getX(),supVec.getY(),supVec.getZ());
-
-			break;
-		};
-
-    default:
-
-		//spu_printf("SPU:(type %i) missing support function\n",shapeType);
-
-		
-#if __ASSERT
-        spu_printf("localGetSupportingVertexWithoutMargin() - Unsupported bound type: %d.\n", shapeType);
-#endif // __ASSERT
-        return btPoint3(0.f, 0.f, 0.f);
-    }
-}
-
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include "SpuCollisionShapes.h"
+
+btPoint3 localGetSupportingVertexWithoutMargin(int shapeType, void* shape, btVector3& localDir,struct	SpuConvexPolyhedronVertexData* convexVertexData)//, int *featureIndex)
+{	// Returns the support point of 'shape' along localDir for the given shapeType; unhandled types return the origin.
+    switch (shapeType)
+    {
+    case SPHERE_SHAPE_PROXYTYPE:
+        {
+            return btPoint3(0,0,0);	// NOTE(review): only the centre is returned; the radius is presumably added as margin elsewhere - confirm
+        }
+	case BOX_SHAPE_PROXYTYPE:
+		{
+//			spu_printf("SPU: getSupport BOX_SHAPE_PROXYTYPE\n");
+			btConvexInternalShape* convexShape = (btConvexInternalShape*)shape;
+			const btVector3& halfExtents = convexShape->getImplicitShapeDimensions();
+			// per axis: a negative direction component selects -halfExtent, anything else +halfExtent
+			return btPoint3(
+				localDir.getX() < 0.0f ? -halfExtents.x() : halfExtents.x(),
+							localDir.getY() < 0.0f ? -halfExtents.y() : halfExtents.y(),
+							localDir.getZ() < 0.0f ? -halfExtents.z() : halfExtents.z());
+		}
+
+	case TRIANGLE_SHAPE_PROXYTYPE:
+		{
+			// 'shape' is read as three consecutive btVector3 vertices; the one with the largest dot product against dir wins
+			btVector3 dir(localDir.getX(),localDir.getY(),localDir.getZ());
+			btVector3* vertices = (btVector3*)shape;
+			btVector3 dots(dir.dot(vertices[0]), dir.dot(vertices[1]), dir.dot(vertices[2]));
+	  		btVector3 sup = vertices[dots.maxAxis()];
+			return btPoint3(sup.getX(),sup.getY(),sup.getZ());
+			break;	// not reached: follows the return above
+		}
+
+	case CYLINDER_SHAPE_PROXYTYPE:
+		{
+			btCylinderShape* cylShape = (btCylinderShape*)shape;
+
+			//how the half extents map onto radius/height depends on the cylinder's local up axis (upAxis)
+
+			btVector3 halfExtents = cylShape->getImplicitShapeDimensions();
+			btVector3 v(localDir.getX(),localDir.getY(),localDir.getZ());
+			
+			int cylinderUpAxis = cylShape->getUpAxis();
+			int XX(1),YY(0),ZZ(2);	// YY is the up axis, XX/ZZ span the cross-section; these initial values equal the upAxis==0 case
+
+			switch (cylinderUpAxis)
+			{
+			case 0:
+				{
+					XX = 1;
+					YY = 0;
+					ZZ = 2;
+					break;
+				}
+			case 1:
+				{
+					XX = 0;
+					YY = 1;
+					ZZ = 2;
+				break;
+				}
+			case 2:
+				{
+					XX = 0;
+					YY = 2;
+					ZZ = 1;
+					break;
+				}
+			default:
+				btAssert(0);	// any other up axis is unexpected; if btAssert compiles to nothing the initial XX/YY/ZZ are used
+				//printf("SPU:localGetSupportingVertexWithoutMargin unknown Cylinder up-axis\n");
+			};
+
+			btScalar radius = halfExtents[XX];
+			btScalar halfHeight = halfExtents[cylinderUpAxis];
+
+			btVector3 tmp;
+			btScalar d ;
+
+			btScalar s = btSqrt(v[XX] * v[XX] + v[ZZ] * v[ZZ]);	// length of the direction projected onto the cross-section plane
+			if (s != btScalar(0.0))
+			{
+				d = radius / s;  
+				tmp[XX] = v[XX] * d;
+				tmp[YY] = v[YY] < 0.0 ? -halfHeight : halfHeight;
+				tmp[ZZ] = v[ZZ] * d;
+				return btPoint3(tmp.getX(),tmp.getY(),tmp.getZ());
+			}
+			else
+			{	// direction has no cross-section component: use the rim point along XX
+				tmp[XX] = radius;
+				tmp[YY] = v[YY] < 0.0 ? -halfHeight : halfHeight;
+				tmp[ZZ] = btScalar(0.0);
+				return btPoint3(tmp.getX(),tmp.getY(),tmp.getZ());
+			}
+		}
+
+	case CAPSULE_SHAPE_PROXYTYPE:
+	{
+		//spu_printf("SPU: todo: getSupport CAPSULE_SHAPE_PROXYTYPE\n");
+		btVector3 vec0(localDir.getX(),localDir.getY(),localDir.getZ());
+
+		btConvexInternalShape* cnvxShape = (btConvexInternalShape*)shape;
+		btVector3 halfExtents = cnvxShape->getImplicitShapeDimensions();
+		btScalar halfHeight = halfExtents.getY();	// NOTE(review): the segment is always placed on the local Y axis here - confirm other capsule axes are not expected
+		btScalar radius = halfExtents.getX();
+		btVector3 supVec(0,0,0);
+
+		btScalar maxDot(btScalar(-1e30));
+
+		btVector3 vec = vec0;
+		btScalar lenSqr = vec.length2();
+		if (lenSqr < btScalar(0.0001))
+		{	// near-zero direction: substitute the +X axis instead of normalising
+			vec.setValue(1,0,0);
+		} else
+		{
+			btScalar rlen = btScalar(1.) / btSqrt(lenSqr );
+			vec *= rlen;
+		}
+		btVector3 vtx;
+		btScalar newDot;
+		{	// candidate 1: point at distance 'radius' along vec from the end at +halfHeight on Y
+			btVector3 pos(0,halfHeight,0);
+			vtx = pos +vec*(radius);
+			newDot = vec.dot(vtx);
+			if (newDot > maxDot)
+			{
+				maxDot = newDot;
+				supVec = vtx;
+			}
+		}
+		{	// candidate 2: point at distance 'radius' along vec from the end at -halfHeight on Y
+			btVector3 pos(0,-halfHeight,0);
+			vtx = pos +vec*(radius);
+			newDot = vec.dot(vtx);
+			if (newDot > maxDot)
+			{
+				maxDot = newDot;
+				supVec = vtx;
+			}
+		}
+		return btPoint3(supVec.getX(),supVec.getY(),supVec.getZ());
+		break;	// not reached: follows the return above
+	};
+
+	case CONVEX_HULL_SHAPE_PROXYTYPE:
+		{
+			//spu_printf("SPU: todo: getSupport CONVEX_HULL_SHAPE_PROXYTYPE\n");
+
+		
+
+			btPoint3* points = 0;
+			int numPoints = 0;
+			points = convexVertexData->gConvexPoints;	// NOTE(review): convexVertexData is dereferenced unchecked; callers must pass it for hull shapes
+			numPoints = convexVertexData->gNumConvexPoints;
+
+		//	spu_printf("numPoints = %d\n",numPoints);
+
+			btVector3 supVec(btScalar(0.),btScalar(0.),btScalar(0.));
+			btScalar newDot,maxDot = btScalar(-1e30);
+
+			btVector3 vec0(localDir.getX(),localDir.getY(),localDir.getZ());
+			btVector3 vec = vec0;
+			btScalar lenSqr = vec.length2();
+			if (lenSqr < btScalar(0.0001))
+			{	// near-zero direction: substitute the +X axis instead of normalising
+				vec.setValue(1,0,0);
+			} else
+			{
+				btScalar rlen = btScalar(1.) / btSqrt(lenSqr );
+				vec *= rlen;
+			}
+
+
+			for (int i=0;i<numPoints;i++)
+			{	// linear scan for the vertex with the largest projection on vec; with zero points supVec stays at the origin
+				btPoint3 vtx = points[i];// * m_localScaling;
+
+				newDot = vec.dot(vtx);
+				if (newDot > maxDot)
+				{
+					maxDot = newDot;
+					supVec = vtx;
+				}
+			}
+			return btPoint3(supVec.getX(),supVec.getY(),supVec.getZ());
+
+			break;	// not reached: follows the return above
+		};
+
+    default:
+
+		//spu_printf("SPU:(type %i) missing support function\n",shapeType);
+
+		
+#if __ASSERT
+        spu_printf("localGetSupportingVertexWithoutMargin() - Unsupported bound type: %d.\n", shapeType);
+#endif // __ASSERT
+        return btPoint3(0.f, 0.f, 0.f);	// unhandled shape type: fall back to the origin
+    }
+}
+
 void computeAabb (btVector3& aabbMin, btVector3& aabbMax, btConvexInternalShape* convexShape, ppu_address_t convexShapePtr, int shapeType, btTransform xform)
 {
 	//calculate the aabb, given the types...
@@ -390,7 +390,6 @@ void dmaConvexVertexData (SpuConvexPolyhedronVertexData* convexVertexData, btCon
 	register int dmaSize = convexVertexData->gNumConvexPoints*sizeof(btPoint3);
 	ppu_address_t pointsPPU = (ppu_address_t) convexShapeSPU->getPoints();
 	cellDmaGet(&convexVertexData->g_convexPointBuffer[0], pointsPPU  , dmaSize, DMA_TAG(2), 0, 0);
-	
 }
 
 void dmaCollisionShape (void* collisionShapeLocation, ppu_address_t collisionShapePtr, uint32_t dmaTag, int shapeType)
@@ -422,6 +421,7 @@ void dmaCompoundSubShapes (CompoundShape_LocalStoreMemory* compoundShapeLocation
 	}
 }
 
+
 void	spuWalkStacklessQuantizedTree(btNodeOverlapCallback* nodeCallback,unsigned short int* quantizedQueryAabbMin,unsigned short int* quantizedQueryAabbMax,const btQuantizedBvhNode* rootNode,int startNodeIndex,int endNodeIndex)
 {
 
diff --git a/Extras/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuContactResult.cpp b/Extras/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuContactResult.cpp
index 31b05123d..7ebc54da9 100644
--- a/Extras/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuContactResult.cpp
+++ b/Extras/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuContactResult.cpp
@@ -1,36 +1,36 @@
-/*
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
-
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-*/
-
-#include "SpuContactResult.h"
-
-
-//#define DEBUG_SPU_COLLISION_DETECTION 1
-
-
-SpuContactResult::SpuContactResult()
-{
-	m_manifoldAddress = 0;
-	m_spuManifold = NULL;
-	m_RequiresWriteBack = false;
-}
-
- SpuContactResult::~SpuContactResult()
-{
-	g_manifoldDmaExport.swapBuffers();
-}
-
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include "SpuContactResult.h"
+
+
+//#define DEBUG_SPU_COLLISION_DETECTION 1
+
+
+SpuContactResult::SpuContactResult()
+{	// Starts with no manifold bound and no write-back pending.
+	m_manifoldAddress = 0;
+	m_spuManifold = NULL;
+	m_RequiresWriteBack = false;
+}
+
+ SpuContactResult::~SpuContactResult()
+{
+	g_manifoldDmaExport.swapBuffers();	// NOTE(review): presumably waits for a still-pending manifold DMA put before the object goes away - confirm in the double-buffer implementation
+}
+
  	///User can override this material combiner by implementing gContactAddedCallback and setting body0->m_collisionFlags |= btCollisionObject::customMaterialCallback;
 inline btScalar	calculateCombinedFriction(btScalar friction0,btScalar friction1)
 {
@@ -50,179 +50,179 @@ inline btScalar	calculateCombinedRestitution(btScalar restitution0,btScalar rest
 {
 	return restitution0*restitution1;
 }
-
-
-
- void	SpuContactResult::setContactInfo(btPersistentManifold* spuManifold, uint64_t	manifoldAddress,const btTransform& worldTrans0,const btTransform& worldTrans1, btScalar restitution0,btScalar restitution1, btScalar friction0,btScalar friction1, bool isSwapped)
- {
-	//spu_printf("SpuContactResult::setContactInfo ManifoldAddress: %lu\n", manifoldAddress);
-	m_rootWorldTransform0 = worldTrans0;
-	m_rootWorldTransform1 = worldTrans1;
-	m_manifoldAddress = manifoldAddress;    
-	m_spuManifold = spuManifold;
-
-	m_combinedFriction = calculateCombinedFriction(friction0,friction1);
-	m_combinedRestitution = calculateCombinedRestitution(restitution0,restitution1);
-	m_isSwapped = isSwapped;
- }
-
- void SpuContactResult::setShapeIdentifiers(int partId0,int index0,	int partId1,int index1)
- {
-	
- }
-	
-
-
- ///return true if it requires a dma transfer back
-bool ManifoldResultAddContactPoint(const btVector3& normalOnBInWorld,
-								   const btVector3& pointInWorld,
-								   float depth,
-								   btPersistentManifold* manifoldPtr,
-								   btTransform& transA,
-								   btTransform& transB,
-									btScalar	combinedFriction,
-									btScalar	combinedRestitution,
-								   bool isSwapped)
-{
-	
-	float contactTreshold = manifoldPtr->getContactBreakingThreshold();
-
-	//spu_printf("SPU: add contactpoint, depth:%f, contactTreshold %f, manifoldPtr %llx\n",depth,contactTreshold,manifoldPtr);
-
-#ifdef DEBUG_SPU_COLLISION_DETECTION
-	spu_printf("SPU: contactTreshold %f\n",contactTreshold);
-#endif //DEBUG_SPU_COLLISION_DETECTION
-	if (depth > manifoldPtr->getContactBreakingThreshold())
-		return false;
-
-	//provide inverses or just calculate?
-	btTransform transAInv = transA.inverse();//m_body0->m_cachedInvertedWorldTransform;
-	btTransform transBInv= transB.inverse();//m_body1->m_cachedInvertedWorldTransform;
-
-	btVector3 pointA;
-	btVector3 localA;
-	btVector3 localB;
-	btVector3 normal;
-
-	if (isSwapped)
-	{
-		normal = normalOnBInWorld * -1;
-		pointA = pointInWorld + normal * depth;
-		localA = transAInv(pointA );
-		localB = transBInv(pointInWorld);
-		/*localA = transBInv(pointA );
-		localB = transAInv(pointInWorld);*/
-	}
-	else
-	{
-		normal = normalOnBInWorld;
-		pointA = pointInWorld + normal * depth;
-		localA = transAInv(pointA );
-		localB = transBInv(pointInWorld);
-	}
-
-	btManifoldPoint newPt(localA,localB,normal,depth);
-
-	int insertIndex = manifoldPtr->getCacheEntry(newPt);
-	if (insertIndex >= 0)
-	{
-//		manifoldPtr->replaceContactPoint(newPt,insertIndex);
-//		return true;
-
-#ifdef DEBUG_SPU_COLLISION_DETECTION
-		spu_printf("SPU: same contact detected, nothing done\n");
-#endif //DEBUG_SPU_COLLISION_DETECTION
-		// This is not needed, just use the old info! saves a DMA transfer as well
-	} else
-	{
-
-		newPt.m_combinedFriction = combinedFriction;
-		newPt.m_combinedRestitution = combinedRestitution;
-
-		/*
-		//potential TODO: SPU callbacks, either immediate (local on the SPU), or deferred
-		//User can override friction and/or restitution
-		if (gContactAddedCallback &&
-			//and if either of the two bodies requires custom material
-			 ((m_body0->m_collisionFlags & btCollisionObject::customMaterialCallback) ||
-			   (m_body1->m_collisionFlags & btCollisionObject::customMaterialCallback)))
-		{
-			//experimental feature info, for per-triangle material etc.
-			(*gContactAddedCallback)(newPt,m_body0,m_partId0,m_index0,m_body1,m_partId1,m_index1);
-		}
-		*/
-		manifoldPtr->AddManifoldPoint(newPt);
-		return true;
-
-	}
-	return false;
-	
-}
-
-
-void SpuContactResult::writeDoubleBufferedManifold(btPersistentManifold* lsManifold, btPersistentManifold* mmManifold)
-{
-    memcpy(g_manifoldDmaExport.getFront(),lsManifold,sizeof(btPersistentManifold));
-
-    g_manifoldDmaExport.swapBuffers();
-    uint64_t mmAddr = (uint32_t)mmManifold;
-    g_manifoldDmaExport.backBufferDmaPut(mmAddr, sizeof(btPersistentManifold), DMA_TAG(9));
-	// Should there be any kind of wait here?  What if somebody tries to use this tag again?  What if we call this function again really soon?
-	//no, the swapBuffers does the wait
-}
-
-void SpuContactResult::addContactPoint(const btVector3& normalOnBInWorld,const btPoint3& pointInWorld,float depth)
-{
-	//spu_printf("*** SpuContactResult::addContactPoint: depth = %f\n",depth);
-
-#ifdef DEBUG_SPU_COLLISION_DETECTION
- //   int sman = sizeof(rage::phManifold);
-//	spu_printf("sizeof_manifold = %i\n",sman);
-#endif //DEBUG_SPU_COLLISION_DETECTION
-
-	btPersistentManifold* localManifold = m_spuManifold;
-
-	btVector3	normalB(normalOnBInWorld.getX(),normalOnBInWorld.getY(),normalOnBInWorld.getZ());
-	btVector3	pointWrld(pointInWorld.getX(),pointInWorld.getY(),pointInWorld.getZ());
-
-	//process the contact point
-	const bool retVal = ManifoldResultAddContactPoint(normalB,
-		pointWrld,
-		depth,
-		localManifold,
-		m_rootWorldTransform0,
-		m_rootWorldTransform1,
-		m_combinedFriction,
-		m_combinedRestitution,
-		m_isSwapped);
-	m_RequiresWriteBack = m_RequiresWriteBack || retVal;
-}
-
-void SpuContactResult::flush()
-{
-
-	if (m_spuManifold && m_spuManifold->getNumContacts())
-	{
-		m_spuManifold->refreshContactPoints(m_rootWorldTransform0,m_rootWorldTransform1);
-		m_RequiresWriteBack = true;
-	}
-
-
-	if (m_RequiresWriteBack)
-	{
-#ifdef DEBUG_SPU_COLLISION_DETECTION
-		spu_printf("SPU: Start SpuContactResult::flush (Put) DMA\n");
-		spu_printf("Num contacts:%d\n", m_spuManifold->getNumContacts());
-		spu_printf("Manifold address: %llu\n", m_manifoldAddress);
-#endif //DEBUG_SPU_COLLISION_DETECTION
-	//	spu_printf("writeDoubleBufferedManifold\n");
-		writeDoubleBufferedManifold(m_spuManifold, (btPersistentManifold*)m_manifoldAddress);
-#ifdef DEBUG_SPU_COLLISION_DETECTION
-		spu_printf("SPU: Finished (Put) DMA\n");
-#endif //DEBUG_SPU_COLLISION_DETECTION
-	}
-	m_spuManifold = NULL;
-	m_RequiresWriteBack = false;
-}
-
-
+
+
+
+ void	SpuContactResult::setContactInfo(btPersistentManifold* spuManifold, ppu_address_t	manifoldAddress,const btTransform& worldTrans0,const btTransform& worldTrans1, btScalar restitution0,btScalar restitution1, btScalar friction0,btScalar friction1, bool isSwapped)
+ {	// Captures the per-pair state that addContactPoint() and flush() use afterwards.
+	//spu_printf("SpuContactResult::setContactInfo ManifoldAddress: %lu\n", manifoldAddress);
+	m_rootWorldTransform0 = worldTrans0;
+	m_rootWorldTransform1 = worldTrans1;
+	m_manifoldAddress = manifoldAddress;    	// address that flush() hands to writeDoubleBufferedManifold() as the write-back target
+	m_spuManifold = spuManifold;	// manifold that addContactPoint() adds points to
+	// friction and restitution are combined once here and later copied onto each newly added contact point
+	m_combinedFriction = calculateCombinedFriction(friction0,friction1);
+	m_combinedRestitution = calculateCombinedRestitution(restitution0,restitution1);
+	m_isSwapped = isSwapped;
+ }
+
+ void SpuContactResult::setShapeIdentifiers(int partId0,int index0,	int partId1,int index1)
+ {
+	// No-op: the part/index identifiers are neither stored nor used by this implementation.
+ }
+	
+
+
+ ///Adds a contact to manifoldPtr unless it is beyond the breaking threshold or already cached; returns true only when a new point was added (i.e. it requires a dma transfer back)
+bool ManifoldResultAddContactPoint(const btVector3& normalOnBInWorld,
+								   const btVector3& pointInWorld,
+								   float depth,
+								   btPersistentManifold* manifoldPtr,
+								   btTransform& transA,
+								   btTransform& transB,
+									btScalar	combinedFriction,
+									btScalar	combinedRestitution,
+								   bool isSwapped)
+{
+	
+	float contactTreshold = manifoldPtr->getContactBreakingThreshold();	// NOTE(review): only read by the debug print below; the comparison further down fetches the threshold again
+
+	//spu_printf("SPU: add contactpoint, depth:%f, contactTreshold %f, manifoldPtr %llx\n",depth,contactTreshold,manifoldPtr);
+
+#ifdef DEBUG_SPU_COLLISION_DETECTION
+	spu_printf("SPU: contactTreshold %f\n",contactTreshold);
+#endif //DEBUG_SPU_COLLISION_DETECTION
+	if (depth > manifoldPtr->getContactBreakingThreshold())
+		return false;
+
+	//provide inverses or just calculate?
+	btTransform transAInv = transA.inverse();//m_body0->m_cachedInvertedWorldTransform;
+	btTransform transBInv= transB.inverse();//m_body1->m_cachedInvertedWorldTransform;
+
+	btVector3 pointA;
+	btVector3 localA;
+	btVector3 localB;
+	btVector3 normal;
+	// both branches use the same formulas; the swapped case only negates the normal first (which also shifts pointA)
+	if (isSwapped)
+	{
+		normal = normalOnBInWorld * -1;
+		pointA = pointInWorld + normal * depth;
+		localA = transAInv(pointA );
+		localB = transBInv(pointInWorld);
+		/*localA = transBInv(pointA );
+		localB = transAInv(pointInWorld);*/
+	}
+	else
+	{
+		normal = normalOnBInWorld;
+		pointA = pointInWorld + normal * depth;
+		localA = transAInv(pointA );
+		localB = transBInv(pointInWorld);
+	}
+
+	btManifoldPoint newPt(localA,localB,normal,depth);
+
+	int insertIndex = manifoldPtr->getCacheEntry(newPt);	// a value >= 0 is treated below as "this contact is already cached"
+	if (insertIndex >= 0)
+	{
+//		manifoldPtr->replaceContactPoint(newPt,insertIndex);
+//		return true;
+
+#ifdef DEBUG_SPU_COLLISION_DETECTION
+		spu_printf("SPU: same contact detected, nothing done\n");
+#endif //DEBUG_SPU_COLLISION_DETECTION
+		// This is not needed, just use the old info! saves a DMA transfer as well
+	} else
+	{
+
+		newPt.m_combinedFriction = combinedFriction;
+		newPt.m_combinedRestitution = combinedRestitution;
+
+		/*
+		//potential TODO: SPU callbacks, either immediate (local on the SPU), or deferred
+		//User can override friction and/or restitution
+		if (gContactAddedCallback &&
+			//and if either of the two bodies requires custom material
+			 ((m_body0->m_collisionFlags & btCollisionObject::customMaterialCallback) ||
+			   (m_body1->m_collisionFlags & btCollisionObject::customMaterialCallback)))
+		{
+			//experimental feature info, for per-triangle material etc.
+			(*gContactAddedCallback)(newPt,m_body0,m_partId0,m_index0,m_body1,m_partId1,m_index1);
+		}
+		*/
+		manifoldPtr->AddManifoldPoint(newPt);
+		return true;
+
+	}
+	return false;
+	
+}
+
+
+void SpuContactResult::writeDoubleBufferedManifold(btPersistentManifold* lsManifold, btPersistentManifold* mmManifold)
+{	// Copies lsManifold into the export double buffer and starts a DMA put of that copy to the address carried in mmManifold.
+    memcpy(g_manifoldDmaExport.getFront(),lsManifold,sizeof(btPersistentManifold));
+    // swap so that backBufferDmaPut() below transfers the copy just written (presumably front and back exchange roles - confirm)
+    g_manifoldDmaExport.swapBuffers();
+    uint64_t mmAddr = (ppu_address_t)mmManifold;	// cast at address width (not uint32_t) so a wider main-memory address is not truncated
+    g_manifoldDmaExport.backBufferDmaPut(mmAddr, sizeof(btPersistentManifold), DMA_TAG(9));
+	// Should there be any kind of wait here?  What if somebody tries to use this tag again?  What if we call this function again really soon?
+	//no, the swapBuffers does the wait
+}
+
+void SpuContactResult::addContactPoint(const btVector3& normalOnBInWorld,const btPoint3& pointInWorld,float depth)
+{
+	//spu_printf("*** SpuContactResult::addContactPoint: depth = %f\n",depth);
+	// Forwards the contact to ManifoldResultAddContactPoint() together with the state captured by setContactInfo().
+#ifdef DEBUG_SPU_COLLISION_DETECTION
+ //   int sman = sizeof(rage::phManifold);
+//	spu_printf("sizeof_manifold = %i\n",sman);
+#endif //DEBUG_SPU_COLLISION_DETECTION
+
+	btPersistentManifold* localManifold = m_spuManifold;
+	// component-wise copies of the inputs into plain btVector3 values
+	btVector3	normalB(normalOnBInWorld.getX(),normalOnBInWorld.getY(),normalOnBInWorld.getZ());
+	btVector3	pointWrld(pointInWorld.getX(),pointInWorld.getY(),pointInWorld.getZ());
+
+	//process the contact point
+	const bool retVal = ManifoldResultAddContactPoint(normalB,
+		pointWrld,
+		depth,
+		localManifold,
+		m_rootWorldTransform0,
+		m_rootWorldTransform1,
+		m_combinedFriction,
+		m_combinedRestitution,
+		m_isSwapped);
+	m_RequiresWriteBack = m_RequiresWriteBack || retVal;	// sticky: once a point was added the flag stays set until flush() clears it
+}
+
+void SpuContactResult::flush()
+{
+	// Refreshes the bound manifold, writes it back to main memory when required, then unbinds it and clears the flag.
+	if (m_spuManifold && m_spuManifold->getNumContacts())
+	{
+		m_spuManifold->refreshContactPoints(m_rootWorldTransform0,m_rootWorldTransform1);
+		m_RequiresWriteBack = true;
+	}
+	// NOTE(review): the write-back below reads through m_spuManifold without a null check; it relies on the flag
+	// only ever being set while a manifold is bound (addContactPoint() and the branch above) - confirm.
+	if (m_RequiresWriteBack)
+	{
+#ifdef DEBUG_SPU_COLLISION_DETECTION
+		spu_printf("SPU: Start SpuContactResult::flush (Put) DMA\n");
+		spu_printf("Num contacts:%d\n", m_spuManifold->getNumContacts());
+		spu_printf("Manifold address: %llu\n", (unsigned long long)m_manifoldAddress);	// explicit cast: ppu_address_t need not be 64 bits wide, %llu requires unsigned long long
+#endif //DEBUG_SPU_COLLISION_DETECTION
+	//	spu_printf("writeDoubleBufferedManifold\n");
+		writeDoubleBufferedManifold(m_spuManifold, (btPersistentManifold*)m_manifoldAddress);
+#ifdef DEBUG_SPU_COLLISION_DETECTION
+		spu_printf("SPU: Finished (Put) DMA\n");
+#endif //DEBUG_SPU_COLLISION_DETECTION
+	}
+	m_spuManifold = NULL;
+	m_RequiresWriteBack = false;
+}
+
+
diff --git a/Extras/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuContactResult.h b/Extras/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuContactResult.h
index fb69a5516..072212e34 100644
--- a/Extras/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuContactResult.h
+++ b/Extras/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuContactResult.h
@@ -35,10 +35,10 @@ subject to the following restrictions:
 
 struct SpuCollisionPairInput
 {
-	uint64_t m_collisionShapes[2];
+	ppu_address_t m_collisionShapes[2];
 	void*	m_spuCollisionShapes[2];
 
-	uint64_t m_persistentManifoldPtr;
+	ppu_address_t m_persistentManifoldPtr;
 	btVector3	m_primitiveDimensions0;
 	btVector3	m_primitiveDimensions1;
 	int		m_shapeType0;
@@ -50,9 +50,6 @@ struct SpuCollisionPairInput
 	btTransform m_worldTransform1;
 	
 	bool	m_isSwapped;
-
-
-	
 };
 
 
@@ -68,7 +65,7 @@ struct SpuClosestPointInput
     btTransform m_transformB;
     float	m_maximumDistanceSquared;
     class	btStackAlloc* m_stackAlloc;
-	struct SpuConvexPolyhedronVertexData* m_convexVertexData;
+	struct SpuConvexPolyhedronVertexData* m_convexVertexData[2];
 };
 
 ///SpuContactResult exports the contact points using double-buffered DMA transfers, only when needed
@@ -77,7 +74,7 @@ class SpuContactResult
 {
     btTransform		m_rootWorldTransform0;
 	btTransform		m_rootWorldTransform1;
-	uint64_t	m_manifoldAddress;
+	ppu_address_t	m_manifoldAddress;
 
     btPersistentManifold* m_spuManifold;
 	bool m_RequiresWriteBack;
@@ -99,7 +96,7 @@ class SpuContactResult
 
 		virtual void setShapeIdentifiers(int partId0,int index0,	int partId1,int index1);
 
-		void	setContactInfo(btPersistentManifold* spuManifold, uint64_t	manifoldAddress,const btTransform& worldTrans0,const btTransform& worldTrans1, btScalar restitution0,btScalar restitution1, btScalar friction0,btScalar friction01, bool isSwapped);
+		void	setContactInfo(btPersistentManifold* spuManifold, ppu_address_t	manifoldAddress,const btTransform& worldTrans0,const btTransform& worldTrans1, btScalar restitution0,btScalar restitution1, btScalar friction0,btScalar friction01, bool isSwapped);
 
 
         void writeDoubleBufferedManifold(btPersistentManifold* lsManifold, btPersistentManifold* mmManifold);
diff --git a/Extras/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuConvexPenetrationDepthSolver.h b/Extras/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuConvexPenetrationDepthSolver.h
index 6152851f2..2a18fa2ba 100644
--- a/Extras/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuConvexPenetrationDepthSolver.h
+++ b/Extras/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuConvexPenetrationDepthSolver.h
@@ -39,7 +39,8 @@ public:
             btTransform& transA,const btTransform& transB,
 			btVector3& v, btPoint3& pa, btPoint3& pb,
 			class btIDebugDraw* debugDraw,btStackAlloc* stackAlloc,
-			struct SpuConvexPolyhedronVertexData* convexVertexData
+			struct SpuConvexPolyhedronVertexData* convexVertexDataA,
+			struct SpuConvexPolyhedronVertexData* convexVertexDataB
 			) const = 0;
 
 
diff --git a/Extras/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.cpp b/Extras/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.cpp
index 11c8227f4..516b632b6 100644
--- a/Extras/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.cpp
+++ b/Extras/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.cpp
@@ -26,7 +26,7 @@
 #include "SpuGjkPairDetector.h"
 #include "SpuVoronoiSimplexSolver.h"
 
-#include "SpuLocalSupport.h" //definition of SpuConvexPolyhedronVertexData
+#include "SpuCollisionShapes.h" //definition of SpuConvexPolyhedronVertexData
 
 #ifdef __CELLOS_LV2__
 ///Software caching from the IBM Cell SDK, it reduces 25% SPU time for our test cases
@@ -92,16 +92,11 @@ int g_CacheHits=0;
 #include <stdio.h>
 #endif
 
-#define MAX_SHAPE_SIZE 256
-
 //int gNumConvexPoints0=0;
 
-
-
 ///Make sure no destructors are called on this memory
 struct	CollisionTask_LocalStoreMemory
 {
-
 	ATTRIBUTE_ALIGNED16(char	bufferProxy0[16]);
 	ATTRIBUTE_ALIGNED16(char	bufferProxy1[16]);
 
@@ -138,41 +133,16 @@ struct	CollisionTask_LocalStoreMemory
 	}
 	btPersistentManifold	gPersistentManifold;
 
-	ATTRIBUTE_ALIGNED16(char	gCollisionShape0[MAX_SHAPE_SIZE]);
-	ATTRIBUTE_ALIGNED16(char	gCollisionShape1[MAX_SHAPE_SIZE]);
+	CollisionShape_LocalStoreMemory gCollisionShapes[2];
 
 	ATTRIBUTE_ALIGNED16(int	spuIndices[16]);
 
-	//ATTRIBUTE_ALIGNED16(btOptimizedBvh	gOptimizedBvh);
-	ATTRIBUTE_ALIGNED16(char gOptimizedBvh[sizeof(btOptimizedBvh)+16]);
-	btOptimizedBvh*	getOptimizedBvh()
-	{
-		return (btOptimizedBvh*) gOptimizedBvh;
-	}
-
-	ATTRIBUTE_ALIGNED16(btTriangleIndexVertexArray	gTriangleMeshInterfaceStorage);
-	btTriangleIndexVertexArray*	gTriangleMeshInterfacePtr;
-	///only a single mesh part for now, we can add support for multiple parts, but quantized trees don't support this at the moment 
-	ATTRIBUTE_ALIGNED16(btIndexedMesh	gIndexMesh);
-
-#define MAX_SPU_SUBTREE_HEADERS 32
-	//1024
-	ATTRIBUTE_ALIGNED16(btBvhSubtreeInfo	gSubtreeHeaders[MAX_SPU_SUBTREE_HEADERS]);
-	ATTRIBUTE_ALIGNED16(btQuantizedBvhNode	gSubtreeNodes[MAX_SUBTREE_SIZE_IN_BYTES/sizeof(btQuantizedBvhNode)]);
-
-	SpuConvexPolyhedronVertexData convexVertexData;
-
-	// Compound data
-#define MAX_SPU_COMPOUND_SUBSHAPES 16
-	ATTRIBUTE_ALIGNED16(btCompoundShapeChild gSubshapes[MAX_SPU_COMPOUND_SUBSHAPES*2]);
-	ATTRIBUTE_ALIGNED16(char gSubshapeShape[MAX_SPU_COMPOUND_SUBSHAPES*2][MAX_SHAPE_SIZE]);
-	
+	bvhMeshShape_LocalStoreMemory bvhShapeData;
+	SpuConvexPolyhedronVertexData convexVertexData[2];
+	CompoundShape_LocalStoreMemory compoundShapeData[2];
 };
 
 
-
-
-
 #if defined(__CELLOS_LV2__) || defined(USE_LIBSPE2) 
 
 ATTRIBUTE_ALIGNED16(CollisionTask_LocalStoreMemory	gLocalStoreMemory);
@@ -189,73 +159,8 @@ void* createCollisionLocalStoreMemory()
 
 #endif
 
-
 void	ProcessSpuConvexConvexCollision(SpuCollisionPairInput* wuInput, CollisionTask_LocalStoreMemory* lsMemPtr, SpuContactResult& spuContacts);
 
-#define USE_BRANCHFREE_TEST 1
-#ifdef USE_BRANCHFREE_TEST
-SIMD_FORCE_INLINE unsigned int spuTestQuantizedAabbAgainstQuantizedAabb(unsigned short int* aabbMin1,unsigned short int* aabbMax1,const unsigned short int* aabbMin2,const unsigned short int* aabbMax2)
-{		
-	return btSelect((unsigned)((aabbMin1[0] <= aabbMax2[0]) & (aabbMax1[0] >= aabbMin2[0])
-		& (aabbMin1[2] <= aabbMax2[2]) & (aabbMax1[2] >= aabbMin2[2])
-		& (aabbMin1[1] <= aabbMax2[1]) & (aabbMax1[1] >= aabbMin2[1])),
-		1, 0);
-}
-#else
-
-unsigned int spuTestQuantizedAabbAgainstQuantizedAabb(const unsigned short int* aabbMin1,const unsigned short int* aabbMax1,const unsigned short int* aabbMin2,const unsigned short int*  aabbMax2)
-{
-	unsigned int overlap = 1;
-	overlap = (aabbMin1[0] > aabbMax2[0] || aabbMax1[0] < aabbMin2[0]) ? 0 : overlap;
-	overlap = (aabbMin1[2] > aabbMax2[2] || aabbMax1[2] < aabbMin2[2]) ? 0 : overlap;
-	overlap = (aabbMin1[1] > aabbMax2[1] || aabbMax1[1] < aabbMin2[1]) ? 0 : overlap;
-	return overlap;
-}
-#endif
-
-
-
-void	spuWalkStacklessQuantizedTree(btNodeOverlapCallback* nodeCallback,unsigned short int* quantizedQueryAabbMin,unsigned short int* quantizedQueryAabbMax,const btQuantizedBvhNode* rootNode,int startNodeIndex,int endNodeIndex)
-{
-
-	int curIndex = startNodeIndex;
-	int walkIterations = 0;
-	int subTreeSize = endNodeIndex - startNodeIndex;
-
-	int escapeIndex;
-
-	unsigned int aabbOverlap, isLeafNode;
-
-	while (curIndex < endNodeIndex)
-	{
-		//catch bugs in tree data
-		assert (walkIterations < subTreeSize);
-
-		walkIterations++;
-		aabbOverlap = spuTestQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,rootNode->m_quantizedAabbMin,rootNode->m_quantizedAabbMax);
-		isLeafNode = rootNode->isLeafNode();
-
-		if (isLeafNode && aabbOverlap)
-		{
-			//printf("overlap with node %d\n",rootNode->getTriangleIndex());
-			nodeCallback->processNode(0,rootNode->getTriangleIndex());
-			//			spu_printf("SPU: overlap detected with triangleIndex:%d\n",rootNode->getTriangleIndex());
-		} 
-
-		if (aabbOverlap || isLeafNode)
-		{
-			rootNode++;
-			curIndex++;
-		} else
-		{
-			escapeIndex = rootNode->getEscapeIndex();
-			rootNode += escapeIndex;
-			curIndex += escapeIndex;
-		}
-	}
-
-}
-
 
 SIMD_FORCE_INLINE void small_cache_read(void* buffer, ppu_address_t ea, size_t size)
 {
@@ -271,7 +176,6 @@ SIMD_FORCE_INLINE void small_cache_read(void* buffer, ppu_address_t ea, size_t s
 #endif
 }
 
-
 SIMD_FORCE_INLINE void small_cache_read_triple(	void* ls0, ppu_address_t ea0,
 												void* ls1, ppu_address_t ea1,
 												void* ls2, ppu_address_t ea2,
@@ -326,7 +230,7 @@ class spuNodeCallback : public btNodeOverlapCallback
 
 	ATTRIBUTE_ALIGNED16(btVector3	spuTriangleVertices[3]);
 	ATTRIBUTE_ALIGNED16(btScalar	spuUnscaledVertex[4]);
-	ATTRIBUTE_ALIGNED16(int	spuIndices[16]);
+	//ATTRIBUTE_ALIGNED16(int	spuIndices[16]);
 
 
 public:
@@ -346,7 +250,7 @@ public:
 
 
 
-		int* indexBasePtr = (int*)(m_lsMemPtr->gIndexMesh.m_triangleIndexBase+triangleIndex*m_lsMemPtr->gIndexMesh.m_triangleIndexStride);
+		int* indexBasePtr = (int*)(m_lsMemPtr->bvhShapeData.gIndexMesh.m_triangleIndexBase+triangleIndex*m_lsMemPtr->bvhShapeData.gIndexMesh.m_triangleIndexStride);
 
 		small_cache_read_triple(&m_lsMemPtr->spuIndices[0],(ppu_address_t)&indexBasePtr[0],
 								&m_lsMemPtr->spuIndices[1],(ppu_address_t)&indexBasePtr[1],
@@ -358,13 +262,13 @@ public:
 		//		spu_printf("SPU index2=%d ,",spuIndices[2]);
 		//		spu_printf("SPU: indexBasePtr=%llx\n",indexBasePtr);
 
-		const btVector3& meshScaling = m_lsMemPtr->gTriangleMeshInterfacePtr->getScaling();
+		const btVector3& meshScaling = m_lsMemPtr->bvhShapeData.gTriangleMeshInterfacePtr->getScaling();
 		for (int j=2;btLikely( j>=0 );j--)
 		{
 			int graphicsindex = m_lsMemPtr->spuIndices[j];
 
 			//			spu_printf("SPU index=%d ,",graphicsindex);
-			btScalar* graphicsbasePtr = (btScalar*)(m_lsMemPtr->gIndexMesh.m_vertexBase+graphicsindex*m_lsMemPtr->gIndexMesh.m_vertexStride);
+			btScalar* graphicsbasePtr = (btScalar*)(m_lsMemPtr->bvhShapeData.gIndexMesh.m_vertexBase+graphicsindex*m_lsMemPtr->bvhShapeData.gIndexMesh.m_vertexStride);
 			//			spu_printf("SPU graphicsbasePtr=%llx\n",graphicsbasePtr);
 
 
@@ -405,38 +309,18 @@ public:
 };
 
 
-
-
 ////////////////////////
 /// Convex versus Concave triangle mesh collision detection (handles concave triangle mesh versus sphere, box, cylinder, triangle, cone, convex polyhedron etc)
 ///////////////////
 void	ProcessConvexConcaveSpuCollision(SpuCollisionPairInput* wuInput, CollisionTask_LocalStoreMemory* lsMemPtr, SpuContactResult& spuContacts)
 {
 	//order: first collision shape is convex, second concave. m_isSwapped is true, if the original order was opposite
-
-
 	register int dmaSize;
 	register ppu_address_t	dmaPpuAddress2;
 
 	btBvhTriangleMeshShape*	trimeshShape = (btBvhTriangleMeshShape*)wuInput->m_spuCollisionShapes[1];
 	//need the mesh interface, for access to triangle vertices
-	
-	dmaSize = sizeof(btTriangleIndexVertexArray);
-	dmaPpuAddress2 = reinterpret_cast<ppu_address_t>(trimeshShape->getMeshInterface());
-	//	spu_printf("trimeshShape->getMeshInterface() == %llx\n",dmaPpuAddress2);
-	lsMemPtr->gTriangleMeshInterfacePtr = (btTriangleIndexVertexArray*)cellDmaGetReadOnly(&lsMemPtr->gTriangleMeshInterfaceStorage, dmaPpuAddress2  , dmaSize, DMA_TAG(1), 0, 0);
-	//cellDmaWaitTagStatusAll(DMA_MASK(1));
-	
-
-	///now DMA over the BVH
-	
-	dmaSize = sizeof(btOptimizedBvh);
-	dmaPpuAddress2 = reinterpret_cast<ppu_address_t>(trimeshShape->getOptimizedBvh());
-	//spu_printf("trimeshShape->getOptimizedBvh() == %llx\n",dmaPpuAddress2);
-	cellDmaGet(&lsMemPtr->gOptimizedBvh, dmaPpuAddress2  , dmaSize, DMA_TAG(2), 0, 0);
-	//cellDmaWaitTagStatusAll(DMA_MASK(2));
-	cellDmaWaitTagStatusAll(DMA_MASK(1) | DMA_MASK(2));
-	
+	dmaBvhShapeData (&lsMemPtr->bvhShapeData, trimeshShape);
 
 	btVector3 aabbMin(-1,-400,-1);
 	btVector3 aabbMax(1,400,1);
@@ -446,82 +330,9 @@ void	ProcessConvexConcaveSpuCollision(SpuCollisionPairInput* wuInput, CollisionT
 	btTransform convexInTriangleSpace;
 	convexInTriangleSpace = wuInput->m_worldTransform1.inverse() * wuInput->m_worldTransform0;
 	btConvexInternalShape* convexShape = (btConvexInternalShape*)wuInput->m_spuCollisionShapes[0];
-	//calculate the aabb, given the types...
-	switch (wuInput->m_shapeType0)
-	{
-	case CYLINDER_SHAPE_PROXYTYPE:
 
-	case BOX_SHAPE_PROXYTYPE:
-		{
-			float margin=convexShape->getMarginNV();
-			btVector3 halfExtents = convexShape->getImplicitShapeDimensions();
-			btTransform& t = convexInTriangleSpace;
-			btMatrix3x3 abs_b = t.getBasis().absolute();  
-			btPoint3 center = t.getOrigin();
-			btVector3 extent = btVector3(abs_b[0].dot(halfExtents),
-				abs_b[1].dot(halfExtents),
-				abs_b[2].dot(halfExtents));
-			extent += btVector3(margin,margin,margin);
-			aabbMin = center - extent;
-			aabbMax = center + extent;
-			break;
-		}
+	computeAabb (aabbMin, aabbMax, convexShape, wuInput->m_collisionShapes[0], wuInput->m_shapeType0, convexInTriangleSpace);
 
-	case CAPSULE_SHAPE_PROXYTYPE:
-		{
-			float margin=convexShape->getMarginNV();
-			btVector3 halfExtents = convexShape->getImplicitShapeDimensions();
-			//add the radius to y-axis to get full height
-			btScalar radius = halfExtents[0];
-			halfExtents[1] += radius;
-			btTransform& t = convexInTriangleSpace;
-			btMatrix3x3 abs_b = t.getBasis().absolute();  
-			btPoint3 center = t.getOrigin();
-			btVector3 extent = btVector3(abs_b[0].dot(halfExtents),
-				abs_b[1].dot(halfExtents),
-				abs_b[2].dot(halfExtents));
-			extent += btVector3(margin,margin,margin);
-			aabbMin = center - extent;
-			aabbMax = center + extent;
-			break;
-		}
-
-
-	case SPHERE_SHAPE_PROXYTYPE:
-		{
-			float radius = convexShape->getImplicitShapeDimensions().getX();// * convexShape->getLocalScaling().getX();
-			float margin = radius + convexShape->getMarginNV();
-			btTransform& t = convexInTriangleSpace;
-			const btVector3& center = t.getOrigin();
-			btVector3 extent(margin,margin,margin);
-			aabbMin = center - extent;
-			aabbMax = center + extent;
-			break;
-		}
-	case CONVEX_HULL_SHAPE_PROXYTYPE:
-		{
-			dmaSize = sizeof(btConvexHullShape);
-			dmaPpuAddress2 = wuInput->m_collisionShapes[0];
-			ATTRIBUTE_ALIGNED16(char convexHullShape0[sizeof(btConvexHullShape)]);
-
-			cellDmaGet(&convexHullShape0, dmaPpuAddress2  , dmaSize, DMA_TAG(1), 0, 0);
-			cellDmaWaitTagStatusAll(DMA_MASK(1));
-			btConvexHullShape* localPtr = (btConvexHullShape*)&convexHullShape0;
-			btTransform& t = convexInTriangleSpace;
-
-			btScalar margin = convexShape->getMarginNV();
-
-			localPtr->getNonvirtualAabb(t,aabbMin,aabbMax,margin);
-
-			//spu_printf("SPU convex aabbMin=%f,%f,%f=\n",aabbMin.getX(),aabbMin.getY(),aabbMin.getZ());
-			//spu_printf("SPU convex aabbMax=%f,%f,%f=\n",aabbMax.getX(),aabbMax.getY(),aabbMax.getZ());
-
-			break;
-		}
-
-	default:
-		spu_printf("SPU: unsupported shapetype %d in AABB calculation\n");
-	};
 
 	//CollisionShape* triangleShape = static_cast<btCollisionShape*>(triBody->m_collisionShape);
 	//convexShape->getAabb(convexInTriangleSpace,m_aabbMin,m_aabbMax);
@@ -531,51 +342,38 @@ void	ProcessConvexConcaveSpuCollision(SpuCollisionPairInput* wuInput, CollisionT
 	//	aabbMax += extra;
 	//	aabbMin -= extra;
 
-
-
 	///quantize query AABB
 	unsigned short int quantizedQueryAabbMin[3];
 	unsigned short int quantizedQueryAabbMax[3];
-	lsMemPtr->getOptimizedBvh()->quantizeWithClamp(quantizedQueryAabbMin,aabbMin);
-	lsMemPtr->getOptimizedBvh()->quantizeWithClamp(quantizedQueryAabbMax,aabbMax);
+	lsMemPtr->bvhShapeData.getOptimizedBvh()->quantizeWithClamp(quantizedQueryAabbMin,aabbMin);
+	lsMemPtr->bvhShapeData.getOptimizedBvh()->quantizeWithClamp(quantizedQueryAabbMax,aabbMax);
 
-	QuantizedNodeArray&	nodeArray = lsMemPtr->getOptimizedBvh()->getQuantizedNodeArray();
+	QuantizedNodeArray&	nodeArray = lsMemPtr->bvhShapeData.getOptimizedBvh()->getQuantizedNodeArray();
 	//spu_printf("SPU: numNodes = %d\n",nodeArray.size());
 
-	BvhSubtreeInfoArray& subTrees = lsMemPtr->getOptimizedBvh()->getSubtreeInfoArray();
+	BvhSubtreeInfoArray& subTrees = lsMemPtr->bvhShapeData.getOptimizedBvh()->getSubtreeInfoArray();
 
 	spuNodeCallback	nodeCallback(wuInput,lsMemPtr,spuContacts);
-	IndexedMeshArray&	indexArray = lsMemPtr->gTriangleMeshInterfacePtr->getIndexedMeshArray();
+	IndexedMeshArray&	indexArray = lsMemPtr->bvhShapeData.gTriangleMeshInterfacePtr->getIndexedMeshArray();
 	//spu_printf("SPU:indexArray.size() = %d\n",indexArray.size());
 
-
 	//	spu_printf("SPU: numSubTrees = %d\n",subTrees.size());
 	//not likely to happen
 	if (subTrees.size() && indexArray.size() == 1)
 	{
 		///DMA in the index info
-		
-		dmaSize = sizeof(btIndexedMesh);
-		dmaPpuAddress2 = reinterpret_cast<ppu_address_t>(&indexArray[0]);
-		cellDmaGet(&lsMemPtr->gIndexMesh, dmaPpuAddress2  , dmaSize, DMA_TAG(1), 0, 0);
+		dmaBvhIndexedMesh (&lsMemPtr->bvhShapeData.gIndexMesh, indexArray, 0 /* index into indexArray */, 1 /* dmaTag */);
 		cellDmaWaitTagStatusAll(DMA_MASK(1));
 		
-
-		//spu_printf("SPU gIndexMesh dma finished\n");
-
 		//display the headers
 		int numBatch = subTrees.size();
 		for (int i=0;i<numBatch;)
 		{
-
 // BEN: TODO - can reorder DMA transfers for less stall
 			int remaining = subTrees.size() - i;
 			int nextBatch = remaining < MAX_SPU_SUBTREE_HEADERS ? remaining : MAX_SPU_SUBTREE_HEADERS;
 			
-			dmaSize = nextBatch* sizeof(btBvhSubtreeInfo);
-			dmaPpuAddress2 = reinterpret_cast<ppu_address_t>(&subTrees[i]);
-			//				spu_printf("&subtree[i]=%llx, dmaSize = %d\n",dmaPpuAddress2,dmaSize);
-			cellDmaGet(&lsMemPtr->gSubtreeHeaders[0], dmaPpuAddress2  , dmaSize, DMA_TAG(1), 0, 0);
+			dmaBvhSubTreeHeaders (&lsMemPtr->bvhShapeData.gSubtreeHeaders[0], (ppu_address_t)(&subTrees[i]), nextBatch, 1);
 			cellDmaWaitTagStatusAll(DMA_MASK(1));
 			
 
@@ -583,7 +381,7 @@ void	ProcessConvexConcaveSpuCollision(SpuCollisionPairInput* wuInput, CollisionT
 
 			for (int j=0;j<nextBatch;j++)
 			{
-				const btBvhSubtreeInfo& subtree = lsMemPtr->gSubtreeHeaders[j];
+				const btBvhSubtreeInfo& subtree = lsMemPtr->bvhShapeData.gSubtreeHeaders[j];
 
 				unsigned int overlap = spuTestQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,subtree.m_quantizedAabbMin,subtree.m_quantizedAabbMax);
 				if (overlap)
@@ -591,23 +389,15 @@ void	ProcessConvexConcaveSpuCollision(SpuCollisionPairInput* wuInput, CollisionT
 					btAssert(subtree.m_subtreeSize);
 
 					//dma the actual nodes of this subtree
-					
-					dmaSize = subtree.m_subtreeSize* sizeof(btQuantizedBvhNode);
-					dmaPpuAddress2 = reinterpret_cast<ppu_address_t>(&nodeArray[subtree.m_rootNodeIndex]);
-					cellDmaGet(&lsMemPtr->gSubtreeNodes[0], dmaPpuAddress2  , dmaSize, DMA_TAG(2), 0, 0);
+					dmaBvhSubTreeNodes (&lsMemPtr->bvhShapeData.gSubtreeNodes[0], subtree, nodeArray, 2);
 					cellDmaWaitTagStatusAll(DMA_MASK(2));
-					
-
-
 
+					/* Walk this subtree */
 					spuWalkStacklessQuantizedTree(&nodeCallback,quantizedQueryAabbMin,quantizedQueryAabbMax,
-						&lsMemPtr->gSubtreeNodes[0],
+						&lsMemPtr->bvhShapeData.gSubtreeNodes[0],
 						0,
 						subtree.m_subtreeSize);
-
 				}
-
-
 				//				spu_printf("subtreeSize = %d\n",gSubtreeHeaders[j].m_subtreeSize);
 			}
 
@@ -619,73 +409,10 @@ void	ProcessConvexConcaveSpuCollision(SpuCollisionPairInput* wuInput, CollisionT
 		}
 
 		//pre-fetch first tree, then loop and double buffer
-
-
-
 	}
 
 }
 
-///getShapeTypeSize could easily be optimized, but it is not likely a bottleneck
-SIMD_FORCE_INLINE int		getShapeTypeSize(int shapeType)
-{
-
-
-	switch (shapeType)
-	{
-	case CYLINDER_SHAPE_PROXYTYPE:
-		{
-			int shapeSize = sizeof(btCylinderShape);
-			btAssert(shapeSize < MAX_SHAPE_SIZE);
-			return shapeSize;
-		}
-	case BOX_SHAPE_PROXYTYPE:
-		{
-			int shapeSize = sizeof(btBoxShape);
-			btAssert(shapeSize < MAX_SHAPE_SIZE);
-			return shapeSize;
-		}
-	case SPHERE_SHAPE_PROXYTYPE:
-		{
-			int shapeSize = sizeof(btSphereShape);
-			btAssert(shapeSize < MAX_SHAPE_SIZE);
-			return shapeSize;
-		}
-	case TRIANGLE_MESH_SHAPE_PROXYTYPE:
-		{
-			int shapeSize = sizeof(btBvhTriangleMeshShape);
-			btAssert(shapeSize < MAX_SHAPE_SIZE);
-			return shapeSize;
-		}
-	case CAPSULE_SHAPE_PROXYTYPE:
-		{
-			int shapeSize = sizeof(btCapsuleShape);
-			btAssert(shapeSize < MAX_SHAPE_SIZE);
-			return shapeSize;
-		}
-
-	case CONVEX_HULL_SHAPE_PROXYTYPE:
-		{
-			int shapeSize = sizeof(btConvexHullShape);
-			btAssert(shapeSize < MAX_SHAPE_SIZE);
-			return shapeSize;
-		}
-
-	case COMPOUND_SHAPE_PROXYTYPE:
-		{
-			int shapeSize = sizeof(btCompoundShape);
-			btAssert(shapeSize < MAX_SHAPE_SIZE);
-			return shapeSize;
-		}
-
-	default:
-		btAssert(0);
-		//unsupported shapetype, please add here
-		return 0;
-	}
-}
-
-
 
 
 ////////////////////////
@@ -693,8 +420,6 @@ SIMD_FORCE_INLINE int		getShapeTypeSize(int shapeType)
 ///////////////////
 void	ProcessSpuConvexConvexCollision(SpuCollisionPairInput* wuInput, CollisionTask_LocalStoreMemory* lsMemPtr, SpuContactResult& spuContacts)
 {
-
-	
 	register int dmaSize;
 	register ppu_address_t	dmaPpuAddress2;
 	
@@ -705,12 +430,8 @@ void	ProcessSpuConvexConvexCollision(SpuCollisionPairInput* wuInput, CollisionTa
 	//CollisionShape* shape1 = (CollisionShape*)wuInput->m_collisionShapes[1];
 	btPersistentManifold* manifold = (btPersistentManifold*)wuInput->m_persistentManifoldPtr;
 
-
-
 	bool genericGjk = true;
 
-
-
 	if (genericGjk)
 	{
 		//try generic GJK
@@ -718,8 +439,6 @@ void	ProcessSpuConvexConvexCollision(SpuCollisionPairInput* wuInput, CollisionTa
 		SpuVoronoiSimplexSolver vsSolver;
 		SpuMinkowskiPenetrationDepthSolver	penetrationSolver;
 
-
-
 		///DMA in the vertices for convex shapes
 		ATTRIBUTE_ALIGNED16(char convexHullShape0[sizeof(btConvexHullShape)]);
 		ATTRIBUTE_ALIGNED16(char convexHullShape1[sizeof(btConvexHullShape)]);
@@ -735,12 +454,8 @@ void	ProcessSpuConvexConvexCollision(SpuCollisionPairInput* wuInput, CollisionTa
 			//cellDmaWaitTagStatusAll(DMA_MASK(1));
 		}
 
-		
-		
 		if ( btLikely( wuInput->m_shapeType1 == CONVEX_HULL_SHAPE_PROXYTYPE ) )
 		{
-
-
 			//	spu_printf("SPU: DMA btConvexHullShape\n");
 			dmaSize = sizeof(btConvexHullShape);
 			dmaPpuAddress2 = wuInput->m_collisionShapes[1];
@@ -748,68 +463,31 @@ void	ProcessSpuConvexConvexCollision(SpuCollisionPairInput* wuInput, CollisionTa
 			//cellDmaWaitTagStatusAll(DMA_MASK(1));
 		}
 		
-		
-
 		if ( btLikely( wuInput->m_shapeType0 == CONVEX_HULL_SHAPE_PROXYTYPE ) )
 		{		
-
 			cellDmaWaitTagStatusAll(DMA_MASK(1));
-			btConvexHullShape* localPtr = (btConvexHullShape*)&convexHullShape0;
-
-			lsMemPtr->convexVertexData.gNumConvexPoints0 = localPtr->getNumPoints();
-			if (lsMemPtr->convexVertexData.gNumConvexPoints0>MAX_NUM_SPU_CONVEX_POINTS)
-			{
-				btAssert(0);
-				spu_printf("SPU: Error: MAX_NUM_SPU_CONVEX_POINTS(%d) exceeded: %d\n",MAX_NUM_SPU_CONVEX_POINTS,lsMemPtr->convexVertexData.gNumConvexPoints0);
-				return;
-			}
-			
-			dmaSize = lsMemPtr->convexVertexData.gNumConvexPoints0*sizeof(btPoint3);
-			dmaPpuAddress2 = (ppu_address_t) localPtr->getPoints();
-			cellDmaGet(&lsMemPtr->convexVertexData.g_convexPointBuffer0, dmaPpuAddress2  , dmaSize, DMA_TAG(2), 0, 0);
-
-			lsMemPtr->convexVertexData.gSpuConvexShapePtr0 = wuInput->m_spuCollisionShapes[0];
-			
-
+			dmaConvexVertexData (&lsMemPtr->convexVertexData[0], (btConvexHullShape*)&convexHullShape0);
+			lsMemPtr->convexVertexData[0].gSpuConvexShapePtr = wuInput->m_spuCollisionShapes[0];
 		}
 
 			
 		if ( btLikely( wuInput->m_shapeType1 == CONVEX_HULL_SHAPE_PROXYTYPE ) )
 		{
-			
 			cellDmaWaitTagStatusAll(DMA_MASK(1));
-			btConvexHullShape* localPtr = (btConvexHullShape*)&convexHullShape1;
-
-			lsMemPtr->convexVertexData.gNumConvexPoints1 = localPtr->getNumPoints();
-			if (lsMemPtr->convexVertexData.gNumConvexPoints1>MAX_NUM_SPU_CONVEX_POINTS)
-			{
-				btAssert(0);
-				spu_printf("SPU: Error: MAX_NUM_SPU_CONVEX_POINTS(%d) exceeded: %d\n",MAX_NUM_SPU_CONVEX_POINTS,lsMemPtr->convexVertexData.gNumConvexPoints1);
-				return;
-			}
-			
-			
-			dmaSize = lsMemPtr->convexVertexData.gNumConvexPoints1*sizeof(btPoint3);
-			dmaPpuAddress2 = (ppu_address_t) localPtr->getPoints();
-			cellDmaGet(&lsMemPtr->convexVertexData.g_convexPointBuffer1, dmaPpuAddress2  , dmaSize, DMA_TAG(2), 0, 0);
-
-			lsMemPtr->convexVertexData.gSpuConvexShapePtr1 = wuInput->m_spuCollisionShapes[1];
-			
-
+			dmaConvexVertexData (&lsMemPtr->convexVertexData[1], (btConvexHullShape*)&convexHullShape1);
+			lsMemPtr->convexVertexData[1].gSpuConvexShapePtr = wuInput->m_spuCollisionShapes[1];
 		}
 
 		if ( btLikely( wuInput->m_shapeType0 == CONVEX_HULL_SHAPE_PROXYTYPE ) )
 		{		
 			cellDmaWaitTagStatusAll(DMA_MASK(2));
-			
-			lsMemPtr->convexVertexData.gConvexPoints0 = &lsMemPtr->convexVertexData.g_convexPointBuffer0[0];
+			lsMemPtr->convexVertexData[0].gConvexPoints = &lsMemPtr->convexVertexData[0].g_convexPointBuffer[0];
 		}
 
 		if ( btLikely( wuInput->m_shapeType1 == CONVEX_HULL_SHAPE_PROXYTYPE ) )
 		{
-			cellDmaWaitTagStatusAll(DMA_MASK(2));
-			
-			lsMemPtr->convexVertexData.gConvexPoints1 = &lsMemPtr->convexVertexData.g_convexPointBuffer1[0];
+			cellDmaWaitTagStatusAll(DMA_MASK(2));		
+			lsMemPtr->convexVertexData[1].gConvexPoints = &lsMemPtr->convexVertexData[1].g_convexPointBuffer[0];
 		}
 
 
@@ -821,7 +499,8 @@ void	ProcessSpuConvexConvexCollision(SpuCollisionPairInput* wuInput, CollisionTa
 		float marginB = wuInput->m_collisionMargin1;
 
 		SpuClosestPointInput	cpInput;
-		cpInput.m_convexVertexData = &lsMemPtr->convexVertexData;
+		cpInput.m_convexVertexData[0] = &lsMemPtr->convexVertexData[0];
+		cpInput.m_convexVertexData[1] = &lsMemPtr->convexVertexData[1];
 		cpInput.m_transformA = wuInput->m_worldTransform0;
 		cpInput.m_transformB = wuInput->m_worldTransform1;
 		float sumMargin = (marginA+marginB+lsMemPtr->gPersistentManifold.getContactBreakingThreshold());
@@ -858,27 +537,18 @@ SIMD_FORCE_INLINE void	dmaAndSetupCollisionObjects(SpuCollisionPairInput& collis
 	register int dmaSize;
 	register ppu_address_t	dmaPpuAddress2;
 		
-	
-		dmaSize = sizeof(btCollisionObject);
-		dmaPpuAddress2 = /*collisionPairInput.m_isSwapped ? (ppu_address_t)lsMem.gProxyPtr1->m_clientObject :*/ (ppu_address_t)lsMem.gProxyPtr0->m_clientObject;
-		cellDmaGet(&lsMem.gColObj0, dmaPpuAddress2  , dmaSize, DMA_TAG(1), 0, 0);		
-	
-	
-		dmaSize = sizeof(btCollisionObject);
-		dmaPpuAddress2 = /*collisionPairInput.m_isSwapped ? (ppu_address_t)lsMem.gProxyPtr0->m_clientObject :*/ (ppu_address_t)lsMem.gProxyPtr1->m_clientObject;
-		cellDmaGet(&lsMem.gColObj1, dmaPpuAddress2  , dmaSize, DMA_TAG(2), 0, 0);		
-	
+	dmaSize = sizeof(btCollisionObject);
+	dmaPpuAddress2 = /*collisionPairInput.m_isSwapped ? (ppu_address_t)lsMem.gProxyPtr1->m_clientObject :*/ (ppu_address_t)lsMem.gProxyPtr0->m_clientObject;
+	cellDmaGet(&lsMem.gColObj0, dmaPpuAddress2  , dmaSize, DMA_TAG(1), 0, 0);		
 
+	dmaSize = sizeof(btCollisionObject);
+	dmaPpuAddress2 = /*collisionPairInput.m_isSwapped ? (ppu_address_t)lsMem.gProxyPtr0->m_clientObject :*/ (ppu_address_t)lsMem.gProxyPtr1->m_clientObject;
+	cellDmaGet(&lsMem.gColObj1, dmaPpuAddress2  , dmaSize, DMA_TAG(2), 0, 0);		
+	
 	cellDmaWaitTagStatusAll(DMA_MASK(1) | DMA_MASK(2));
 
 	collisionPairInput.m_worldTransform0 = lsMem.getColObj0()->getWorldTransform();
 	collisionPairInput.m_worldTransform1 = lsMem.getColObj1()->getWorldTransform();
-
-
-
-#ifdef DEBUG_SPU_COLLISION_DETECTION
-#endif //DEBUG_SPU_COLLISION_DETECTION
-
 }
 
 
@@ -894,26 +564,11 @@ void	handleCollisionPair(SpuCollisionPairInput& collisionPairInput, CollisionTas
 	if (btBroadphaseProxy::isConvex(collisionPairInput.m_shapeType0) 
 		&& btBroadphaseProxy::isConvex(collisionPairInput.m_shapeType1))
 	{
-
-		//dmaAndSetupCollisionObjects(collisionPairInput, lsMem);
-
 		if (dmaShapes)
 		{
-			
-				dmaSize = getShapeTypeSize(collisionPairInput.m_shapeType0);
-				//uint64_t	dmaPpuAddress2 = (uint64_t)lsMem.gColObj0.getCollisionShape();
-				dmaPpuAddress2 = collisionShape0Ptr;
-				cellDmaGet(collisionShape0Loc, dmaPpuAddress2  , dmaSize, DMA_TAG(1), 0, 0);
-				//cellDmaWaitTagStatusAll(DMA_MASK(1));
-			
-			
-				dmaSize = getShapeTypeSize(collisionPairInput.m_shapeType1);
-				dmaPpuAddress2 = collisionShape1Ptr;
-				cellDmaGet(collisionShape1Loc, dmaPpuAddress2  , dmaSize, DMA_TAG(2), 0, 0);
-				//cellDmaWaitTagStatusAll(DMA_MASK(2));
-				
-				cellDmaWaitTagStatusAll(DMA_MASK(1) | DMA_MASK(2));
-			
+			dmaCollisionShape (collisionShape0Loc, collisionShape0Ptr, 1, collisionPairInput.m_shapeType0);
+			dmaCollisionShape (collisionShape1Loc, collisionShape1Ptr, 2, collisionPairInput.m_shapeType1);
+			cellDmaWaitTagStatusAll(DMA_MASK(1) | DMA_MASK(2));
 		}
 
 		btConvexInternalShape* spuConvexShape0 = (btConvexInternalShape*)collisionShape0Loc;
@@ -935,82 +590,41 @@ void	handleCollisionPair(SpuCollisionPairInput& collisionPairInput, CollisionTas
 	{
 		//snPause();
 
+		dmaCollisionShape (collisionShape0Loc, collisionShape0Ptr, 1, collisionPairInput.m_shapeType0);
+		dmaCollisionShape (collisionShape1Loc, collisionShape1Ptr, 2, collisionPairInput.m_shapeType1);
+		cellDmaWaitTagStatusAll(DMA_MASK(1) | DMA_MASK(2));
+
 		// Both are compounds, do N^2 CD for now
 		// TODO: add some AABB-based pruning
-		
-			dmaSize = getShapeTypeSize(collisionPairInput.m_shapeType0);
-			dmaPpuAddress2 = collisionShape0Ptr;
-			cellDmaGet(collisionShape0Loc, dmaPpuAddress2  , dmaSize, DMA_TAG(1), 0, 0);
-			//cellDmaWaitTagStatusAll(DMA_MASK(1));
-		
-		
-			dmaSize = getShapeTypeSize(collisionPairInput.m_shapeType1);
-			dmaPpuAddress2 = collisionShape1Ptr;
-			cellDmaGet(collisionShape1Loc, dmaPpuAddress2  , dmaSize, DMA_TAG(2), 0, 0);
-			//cellDmaWaitTagStatusAll(DMA_MASK(2));
-			
-			cellDmaWaitTagStatusAll(DMA_MASK(1) | DMA_MASK(2));
-		
-
+	
 		btCompoundShape* spuCompoundShape0 = (btCompoundShape*)collisionShape0Loc;
 		btCompoundShape* spuCompoundShape1 = (btCompoundShape*)collisionShape1Loc;
 
+		dmaCompoundShapeInfo (&lsMem.compoundShapeData[0], spuCompoundShape0, 1);
+		dmaCompoundShapeInfo (&lsMem.compoundShapeData[1], spuCompoundShape1, 2);
+		cellDmaWaitTagStatusAll(DMA_MASK(1) | DMA_MASK(2));
+		
+
+		dmaCompoundSubShapes (&lsMem.compoundShapeData[0], spuCompoundShape0, 1);
+		cellDmaWaitTagStatusAll(DMA_MASK(1));
+		dmaCompoundSubShapes (&lsMem.compoundShapeData[1], spuCompoundShape1, 1);
+		cellDmaWaitTagStatusAll(DMA_MASK(1));
+
 		int childShapeCount0 = spuCompoundShape0->getNumChildShapes();
 		int childShapeCount1 = spuCompoundShape1->getNumChildShapes();
 
-		// dma the first list of child shapes
-		
-			dmaSize = childShapeCount0 * sizeof(btCompoundShapeChild);
-			dmaPpuAddress2 = (ppu_address_t)spuCompoundShape0->getChildList();
-			cellDmaGet(lsMem.gSubshapes, dmaPpuAddress2, dmaSize, DMA_TAG(1), 0, 0);
-			//cellDmaWaitTagStatusAll(DMA_MASK(1));
-		
-
-		// dma the second list of child shapes
-		
-			dmaSize = childShapeCount1 * sizeof(btCompoundShapeChild);
-			dmaPpuAddress2 = (ppu_address_t)spuCompoundShape1->getChildList();
-			cellDmaGet(&lsMem.gSubshapes[MAX_SPU_COMPOUND_SUBSHAPES], dmaPpuAddress2, dmaSize, DMA_TAG(2), 0, 0);
-			//cellDmaWaitTagStatusAll(DMA_MASK(2));
-			cellDmaWaitTagStatusAll(DMA_MASK(1) | DMA_MASK(2));
-		
-
-			int i;
-
-		// DMA all the subshapes 
-		for ( i = 0; i < childShapeCount0; ++i)
-		{
-			btCompoundShapeChild& childShape = lsMem.gSubshapes[i];
-
-			dmaSize = getShapeTypeSize(childShape.m_childShapeType);
-			dmaPpuAddress2 = (ppu_address_t)childShape.m_childShape;
-			cellDmaGet(lsMem.gSubshapeShape[i], dmaPpuAddress2, dmaSize, DMA_TAG(1), 0, 0);
-			//cellDmaWaitTagStatusAll(DMA_MASK(1));
-		}
-		cellDmaWaitTagStatusAll(DMA_MASK(1));
-
-		for ( i = 0; i < childShapeCount1; ++i)
-		{
-			btCompoundShapeChild& childShape = lsMem.gSubshapes[MAX_SPU_COMPOUND_SUBSHAPES+i];
-
-			dmaSize = getShapeTypeSize(childShape.m_childShapeType);
-			dmaPpuAddress2 = (ppu_address_t)childShape.m_childShape;
-
-			cellDmaGet(lsMem.gSubshapeShape[MAX_SPU_COMPOUND_SUBSHAPES+i], dmaPpuAddress2, dmaSize, DMA_TAG(1), 0, 0);
-			//cellDmaWaitTagStatusAll(DMA_MASK(1));
-		}
-		cellDmaWaitTagStatusAll(DMA_MASK(1));
-
 		// Start the N^2
-		for ( i = 0; i < childShapeCount0; ++i)
+		for (int i = 0; i < childShapeCount0; ++i)
 		{
-			btCompoundShapeChild& childShape0 = lsMem.gSubshapes[i];
+			btCompoundShapeChild& childShape0 = lsMem.compoundShapeData[0].gSubshapes[i];
 
 			for (int j = 0; j < childShapeCount1; ++j)
 			{
-				btCompoundShapeChild& childShape1 = lsMem.gSubshapes[MAX_SPU_COMPOUND_SUBSHAPES+j];
+				btCompoundShapeChild& childShape1 = lsMem.compoundShapeData[1].gSubshapes[j];
 
+				/* Create a new collision pair input struct using the two child shapes */
 				SpuCollisionPairInput cinput (collisionPairInput);
+
 				cinput.m_worldTransform0 = collisionPairInput.m_worldTransform0 * childShape0.m_transform;
 				cinput.m_shapeType0 = childShape0.m_childShapeType;
 				cinput.m_collisionMargin0 = childShape0.m_childMargin;
@@ -1018,10 +632,10 @@ void	handleCollisionPair(SpuCollisionPairInput& collisionPairInput, CollisionTas
 				cinput.m_worldTransform1 = collisionPairInput.m_worldTransform1 * childShape1.m_transform;
 				cinput.m_shapeType1 = childShape1.m_childShapeType;
 				cinput.m_collisionMargin1 = childShape1.m_childMargin;
-
+				/* Recursively call handleCollisionPair () with new collision pair input */
 				handleCollisionPair(cinput, lsMem, spuContacts,			
-					(ppu_address_t)childShape0.m_childShape, lsMem.gSubshapeShape[i], 
-					(ppu_address_t)childShape1.m_childShape, lsMem.gSubshapeShape[MAX_SPU_COMPOUND_SUBSHAPES+i], false);
+					(ppu_address_t)childShape0.m_childShape, lsMem.compoundShapeData[0].gSubshapeShape[i], 
+					(ppu_address_t)childShape1.m_childShape, lsMem.compoundShapeData[1].gSubshapeShape[j], false); // bug fix: changed index to j.
 			}
 		}
 	}
@@ -1029,55 +643,32 @@ void	handleCollisionPair(SpuCollisionPairInput& collisionPairInput, CollisionTas
 	{
 		//snPause();
 		
-			dmaSize = getShapeTypeSize(collisionPairInput.m_shapeType0);
-			dmaPpuAddress2 = collisionShape0Ptr;
-			cellDmaGet(collisionShape0Loc, dmaPpuAddress2  , dmaSize, DMA_TAG(1), 0, 0);
-			//cellDmaWaitTagStatusAll(DMA_MASK(1));
-		
-		
-			dmaSize = getShapeTypeSize(collisionPairInput.m_shapeType1);
-			dmaPpuAddress2 = collisionShape1Ptr;
-			cellDmaGet(collisionShape1Loc, dmaPpuAddress2  , dmaSize, DMA_TAG(2), 0, 0);
-//			cellDmaWaitTagStatusAll(DMA_MASK(2));
-			cellDmaWaitTagStatusAll(DMA_MASK(1) | DMA_MASK(2));
-		
+		dmaCollisionShape (collisionShape0Loc, collisionShape0Ptr, 1, collisionPairInput.m_shapeType0);
+		dmaCollisionShape (collisionShape1Loc, collisionShape1Ptr, 2, collisionPairInput.m_shapeType1);
+		cellDmaWaitTagStatusAll(DMA_MASK(1) | DMA_MASK(2));
 
 		// object 0 compound, object 1 non-compound
 		btCompoundShape* spuCompoundShape = (btCompoundShape*)collisionShape0Loc;
+		dmaCompoundShapeInfo (&lsMem.compoundShapeData[0], spuCompoundShape, 1);
+		cellDmaWaitTagStatusAll(DMA_MASK(1));
 
 		int childShapeCount = spuCompoundShape->getNumChildShapes();
 
-		// dma the list of child shapes
-		
-			dmaSize = childShapeCount * sizeof(btCompoundShapeChild);
-
-			dmaPpuAddress2 = (ppu_address_t)spuCompoundShape->getChildList();
-
-			cellDmaGet(lsMem.gSubshapes, dmaPpuAddress2, dmaSize, DMA_TAG(1), 0, 0);
-			cellDmaWaitTagStatusAll(DMA_MASK(1));
-		
-
 		for (int i = 0; i < childShapeCount; ++i)
 		{
-			btCompoundShapeChild& childShape = lsMem.gSubshapes[i];
+			btCompoundShapeChild& childShape = lsMem.compoundShapeData[0].gSubshapes[i];
 
 			// Dma the child shape
+			dmaCollisionShape (&lsMem.compoundShapeData[0].gSubshapeShape[i], (ppu_address_t)childShape.m_childShape, 1, childShape.m_childShapeType);
+			cellDmaWaitTagStatusAll(DMA_MASK(1));
 			
-				dmaSize = getShapeTypeSize(childShape.m_childShapeType);
-				dmaPpuAddress2 = (ppu_address_t)childShape.m_childShape;
-
-				cellDmaGet(lsMem.gSubshapeShape[i], dmaPpuAddress2, dmaSize, DMA_TAG(1), 0, 0);
-				cellDmaWaitTagStatusAll(DMA_MASK(1));
-			
-
 			SpuCollisionPairInput cinput (collisionPairInput);
 			cinput.m_worldTransform0 = collisionPairInput.m_worldTransform0 * childShape.m_transform;
 			cinput.m_shapeType0 = childShape.m_childShapeType;
 			cinput.m_collisionMargin0 = childShape.m_childMargin;
 
-
 			handleCollisionPair(cinput, lsMem, spuContacts,			
-				(ppu_address_t)childShape.m_childShape, lsMem.gSubshapeShape[i], 
+				(ppu_address_t)childShape.m_childShape, lsMem.compoundShapeData[0].gSubshapeShape[i], 
 				collisionShape1Ptr, collisionShape1Loc, false);
 		}
 	}
@@ -1085,57 +676,30 @@ void	handleCollisionPair(SpuCollisionPairInput& collisionPairInput, CollisionTas
 	{
 		//snPause();
 		
-			dmaSize = getShapeTypeSize(collisionPairInput.m_shapeType0);
-			dmaPpuAddress2 = collisionShape0Ptr;
-			cellDmaGet(collisionShape0Loc, dmaPpuAddress2  , dmaSize, DMA_TAG(1), 0, 0);
-			//cellDmaWaitTagStatusAll(DMA_MASK(1));
-		
-		
-			dmaSize = getShapeTypeSize(collisionPairInput.m_shapeType1);
-			dmaPpuAddress2 = collisionShape1Ptr;
-
-			cellDmaGet(collisionShape1Loc, dmaPpuAddress2  , dmaSize, DMA_TAG(2), 0, 0);
-			//cellDmaWaitTagStatusAll(DMA_MASK(2));
-			cellDmaWaitTagStatusAll(DMA_MASK(1) | DMA_MASK(2));
-		
-
+		dmaCollisionShape (collisionShape0Loc, collisionShape0Ptr, 1, collisionPairInput.m_shapeType0);
+		dmaCollisionShape (collisionShape1Loc, collisionShape1Ptr, 2, collisionPairInput.m_shapeType1);
+		cellDmaWaitTagStatusAll(DMA_MASK(1) | DMA_MASK(2));
 		// object 0 non-compound, object 1 compound
 		btCompoundShape* spuCompoundShape = (btCompoundShape*)collisionShape1Loc;
-
+		dmaCompoundShapeInfo (&lsMem.compoundShapeData[0], spuCompoundShape, 1);
+		cellDmaWaitTagStatusAll(DMA_MASK(1));
+		
 		int childShapeCount = spuCompoundShape->getNumChildShapes();
 
-		// dma the list of child shapes
-		
-			dmaSize = childShapeCount * sizeof(btCompoundShapeChild);
-
-			dmaPpuAddress2 = (ppu_address_t)spuCompoundShape->getChildList();
-
-			cellDmaGet(lsMem.gSubshapes, dmaPpuAddress2, dmaSize, DMA_TAG(1), 0, 0);
-			cellDmaWaitTagStatusAll(DMA_MASK(1));
-		
-
 		for (int i = 0; i < childShapeCount; ++i)
 		{
-			btCompoundShapeChild& childShape = lsMem.gSubshapes[i];
-
+			btCompoundShapeChild& childShape = lsMem.compoundShapeData[0].gSubshapes[i];
 			// Dma the child shape
-			
-				dmaSize = getShapeTypeSize(childShape.m_childShapeType);
-				dmaPpuAddress2 = (ppu_address_t)childShape.m_childShape;
-
-				cellDmaGet(lsMem.gSubshapeShape[i], dmaPpuAddress2, dmaSize, DMA_TAG(1), 0, 0);
-				cellDmaWaitTagStatusAll(DMA_MASK(1));
-			
+			dmaCollisionShape (&lsMem.compoundShapeData[0].gSubshapeShape[i], (ppu_address_t)childShape.m_childShape, 1, childShape.m_childShapeType);
+			cellDmaWaitTagStatusAll(DMA_MASK(1));
 
 			SpuCollisionPairInput cinput (collisionPairInput);
 			cinput.m_worldTransform1 = collisionPairInput.m_worldTransform1 * childShape.m_transform;
 			cinput.m_shapeType1 = childShape.m_childShapeType;
 			cinput.m_collisionMargin1 = childShape.m_childMargin;
-
 			handleCollisionPair(cinput, lsMem, spuContacts,
 				collisionShape0Ptr, collisionShape0Loc, 
-				(ppu_address_t)childShape.m_childShape, lsMem.gSubshapeShape[i], false);
-
+				(ppu_address_t)childShape.m_childShape, lsMem.compoundShapeData[0].gSubshapeShape[i], false);
 		}
 		
 	}
@@ -1166,29 +730,11 @@ void	handleCollisionPair(SpuCollisionPairInput& collisionPairInput, CollisionTas
 		}
 		if (handleConvexConcave)
 		{
-
 			if (dmaShapes)
 			{
-				///dma and initialize the convex object
-				
-					dmaSize = getShapeTypeSize(collisionPairInput.m_shapeType0);
-					//uint64_t	dmaPpuAddress2 = (uint64_t)lsMem.gColObj0.getCollisionShape();
-
-					dmaPpuAddress2 = collisionShape0Ptr;
-
-					cellDmaGet(collisionShape0Loc, dmaPpuAddress2  , dmaSize, DMA_TAG(1), 0, 0);
-					//cellDmaWaitTagStatusAll(DMA_MASK(1));
-				
-				///dma and initialize the concave object
-				
-					dmaSize = getShapeTypeSize(collisionPairInput.m_shapeType1);
-
-					dmaPpuAddress2 = collisionShape1Ptr;
-
-					cellDmaGet(collisionShape1Loc, dmaPpuAddress2  , dmaSize, DMA_TAG(2), 0, 0);
-					//cellDmaWaitTagStatusAll(DMA_MASK(2));
-					cellDmaWaitTagStatusAll(DMA_MASK(1) | DMA_MASK(2));
-				
+				dmaCollisionShape (collisionShape0Loc, collisionShape0Ptr, 1, collisionPairInput.m_shapeType0);
+				dmaCollisionShape (collisionShape1Loc, collisionShape1Ptr, 2, collisionPairInput.m_shapeType1);
+				cellDmaWaitTagStatusAll(DMA_MASK(1) | DMA_MASK(2));
 			}
 			
 			btConvexInternalShape* spuConvexShape0 = (btConvexInternalShape*)collisionShape0Loc;
@@ -1210,7 +756,6 @@ void	handleCollisionPair(SpuCollisionPairInput& collisionPairInput, CollisionTas
 }
 
 
-
 void	processCollisionTask(void* userPtr, void* lsMemPtr)
 {
 
@@ -1225,7 +770,7 @@ void	processCollisionTask(void* userPtr, void* lsMemPtr)
 
 	////////////////////
 
-	uint64_t dmaInPtr = taskDesc.inPtr;
+	ppu_address_t dmaInPtr = taskDesc.inPtr;
 	unsigned int numPages = taskDesc.numPages;
 	unsigned int numOnLastPage = taskDesc.numOnLastPage;
 
@@ -1336,7 +881,7 @@ void	processCollisionTask(void* userPtr, void* lsMemPtr)
 						lsMem.gProxyPtr0 = (btBroadphaseProxy*) lsMem.bufferProxy0;
 						stallingUnalignedDmaSmallGet(lsMem.gProxyPtr0, dmaPpuAddress2  , dmaSize);
 
-						collisionPairInput.m_persistentManifoldPtr = (uint64_t) lsMem.gSpuContactManifoldAlgo.getContactManifoldPtr();
+						collisionPairInput.m_persistentManifoldPtr = (ppu_address_t) lsMem.gSpuContactManifoldAlgo.getContactManifoldPtr();
 						collisionPairInput.m_isSwapped = false;
 						
 						
@@ -1387,10 +932,10 @@ void	processCollisionTask(void* userPtr, void* lsMemPtr)
 								dmaAndSetupCollisionObjects(collisionPairInput, lsMem);
 
 								handleCollisionPair(collisionPairInput, lsMem, spuContacts, 
-									(ppu_address_t)lsMem.getColObj0()->getCollisionShape(), lsMem.gCollisionShape0,
-									(ppu_address_t)lsMem.getColObj1()->getCollisionShape(), lsMem.gCollisionShape1);
+									(ppu_address_t)lsMem.getColObj0()->getCollisionShape(), &lsMem.gCollisionShapes[0].collisionShape,
+									(ppu_address_t)lsMem.getColObj1()->getCollisionShape(), &lsMem.gCollisionShapes[1].collisionShape);
 
-							}		
+							}
 						}
 
 					}
diff --git a/Extras/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.h b/Extras/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.h
index 05262d85e..34e60ef94 100644
--- a/Extras/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.h
+++ b/Extras/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.h
@@ -23,11 +23,11 @@ subject to the following restrictions:
 ///Task Description for SPU collision detection
 struct SpuGatherAndProcessPairsTaskDesc 
 {
-	uint64_t	inPtr;//m_pairArrayPtr;
+	ppu_address_t	inPtr;//m_pairArrayPtr;
 	//mutex variable
 	uint32_t	m_someMutexVariableInMainMemory;
 
-	uint64_t	m_dispatcher;
+	ppu_address_t	m_dispatcher;
 
 	uint32_t	numOnLastPage;
 
diff --git a/Extras/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuGjkPairDetector.cpp b/Extras/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuGjkPairDetector.cpp
index b598e4f23..4d6d6d92e 100644
--- a/Extras/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuGjkPairDetector.cpp
+++ b/Extras/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuGjkPairDetector.cpp
@@ -15,7 +15,7 @@ subject to the following restrictions:
 
 #include "SpuGjkPairDetector.h"
 #include "SpuConvexPenetrationDepthSolver.h"
-#include "SpuLocalSupport.h"
+#include "SpuCollisionShapes.h"
 
 
 
@@ -106,8 +106,8 @@ void SpuGjkPairDetector::getClosestPoints(const SpuClosestPointInput& input,SpuC
 //			btVector3 pInA = m_minkowskiA->localGetSupportingVertexWithoutMargin(seperatingAxisInA);
 //			btVector3 qInB = m_minkowskiB->localGetSupportingVertexWithoutMargin(seperatingAxisInB);
 
-			btVector3 pInA  = localGetSupportingVertexWithoutMargin(m_shapeTypeA, m_minkowskiA, seperatingAxisInA,input.m_convexVertexData);//, &featureIndexA);
-			btVector3 qInB  = localGetSupportingVertexWithoutMargin(m_shapeTypeB, m_minkowskiB, seperatingAxisInB,input.m_convexVertexData);//, &featureIndexB);
+			btVector3 pInA  = localGetSupportingVertexWithoutMargin(m_shapeTypeA, m_minkowskiA, seperatingAxisInA,input.m_convexVertexData[0]);//, &featureIndexA);
+			btVector3 qInB  = localGetSupportingVertexWithoutMargin(m_shapeTypeB, m_minkowskiB, seperatingAxisInB,input.m_convexVertexData[1]);//, &featureIndexB);
 
 
 			btPoint3  pWorld = localTransA(pInA);	
@@ -250,7 +250,7 @@ void SpuGjkPairDetector::getClosestPoints(const SpuClosestPointInput& input,SpuC
                     marginA, marginB,
 					localTransA,localTransB,
 					m_cachedSeparatingAxis, tmpPointOnA, tmpPointOnB,
-					0,input.m_stackAlloc,input.m_convexVertexData
+					0,input.m_stackAlloc,input.m_convexVertexData[0], input.m_convexVertexData[1]
 					);
 
 				if (isValid2)
diff --git a/Extras/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuLocalSupport.h b/Extras/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuLocalSupport.h
index 7ad95dd7f..8b89de03f 100644
--- a/Extras/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuLocalSupport.h
+++ b/Extras/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuLocalSupport.h
@@ -16,233 +16,4 @@ subject to the following restrictions:
 
 
 
-#include "BulletCollision/BroadphaseCollision/btBroadphaseProxy.h"
-#include "BulletCollision/CollisionShapes/btConvexInternalShape.h"
-#include "BulletCollision/CollisionShapes/btCylinderShape.h"
-
-#define MAX_NUM_SPU_CONVEX_POINTS 128
-
-struct	SpuConvexPolyhedronVertexData
-{
-	void*	gSpuConvexShapePtr0;
-	void*	gSpuConvexShapePtr1;
-	btPoint3* gConvexPoints0;
-	btPoint3* gConvexPoints1;
-	int gNumConvexPoints0;
-	int gNumConvexPoints1;
-	ATTRIBUTE_ALIGNED16(btPoint3	g_convexPointBuffer0[MAX_NUM_SPU_CONVEX_POINTS]);
-	ATTRIBUTE_ALIGNED16(btPoint3	g_convexPointBuffer1[MAX_NUM_SPU_CONVEX_POINTS]);
-
-};
-
-
-inline btPoint3 localGetSupportingVertexWithoutMargin(int shapeType, void* shape, btVector3& localDir,struct	SpuConvexPolyhedronVertexData* convexVertexData)//, int *featureIndex)
-{
-    switch (shapeType)
-    {
-    case SPHERE_SHAPE_PROXYTYPE:
-        {
-            return btPoint3(0,0,0);
-        }
-	case BOX_SHAPE_PROXYTYPE:
-		{
-//			spu_printf("SPU: getSupport BOX_SHAPE_PROXYTYPE\n");
-			btConvexInternalShape* convexShape = (btConvexInternalShape*)shape;
-			const btVector3& halfExtents = convexShape->getImplicitShapeDimensions();
-			
-			return btPoint3(
-				localDir.getX() < 0.0f ? -halfExtents.x() : halfExtents.x(),
-							localDir.getY() < 0.0f ? -halfExtents.y() : halfExtents.y(),
-							localDir.getZ() < 0.0f ? -halfExtents.z() : halfExtents.z());
-		}
-
-	case TRIANGLE_SHAPE_PROXYTYPE:
-		{
-
-			btVector3 dir(localDir.getX(),localDir.getY(),localDir.getZ());
-			btVector3* vertices = (btVector3*)shape;
-			btVector3 dots(dir.dot(vertices[0]), dir.dot(vertices[1]), dir.dot(vertices[2]));
-	  		btVector3 sup = vertices[dots.maxAxis()];
-			return btPoint3(sup.getX(),sup.getY(),sup.getZ());
-			break;
-		}
-
-	case CYLINDER_SHAPE_PROXYTYPE:
-		{
-			btCylinderShape* cylShape = (btCylinderShape*)shape;
-
-			//mapping of halfextents/dimension onto radius/height depends on how cylinder local orientation is (upAxis)
-
-			btVector3 halfExtents = cylShape->getImplicitShapeDimensions();
-			btVector3 v(localDir.getX(),localDir.getY(),localDir.getZ());
-			
-			int cylinderUpAxis = cylShape->getUpAxis();
-			int XX(1),YY(0),ZZ(2);
-
-			switch (cylinderUpAxis)
-			{
-			case 0:
-				{
-					XX = 1;
-					YY = 0;
-					ZZ = 2;
-					break;
-				}
-			case 1:
-				{
-					XX = 0;
-					YY = 1;
-					ZZ = 2;
-				break;
-				}
-			case 2:
-				{
-					XX = 0;
-					YY = 2;
-					ZZ = 1;
-					break;
-				}
-			default:
-				btAssert(0);
-				//printf("SPU:localGetSupportingVertexWithoutMargin unknown Cylinder up-axis\n");
-			};
-
-			btScalar radius = halfExtents[XX];
-			btScalar halfHeight = halfExtents[cylinderUpAxis];
-
-			btVector3 tmp;
-			btScalar d ;
-
-			btScalar s = btSqrt(v[XX] * v[XX] + v[ZZ] * v[ZZ]);
-			if (s != btScalar(0.0))
-			{
-				d = radius / s;  
-				tmp[XX] = v[XX] * d;
-				tmp[YY] = v[YY] < 0.0 ? -halfHeight : halfHeight;
-				tmp[ZZ] = v[ZZ] * d;
-				return btPoint3(tmp.getX(),tmp.getY(),tmp.getZ());
-			}
-			else
-			{
-				tmp[XX] = radius;
-				tmp[YY] = v[YY] < 0.0 ? -halfHeight : halfHeight;
-				tmp[ZZ] = btScalar(0.0);
-				return btPoint3(tmp.getX(),tmp.getY(),tmp.getZ());
-			}
-		}
-
-	case CAPSULE_SHAPE_PROXYTYPE:
-	{
-		//spu_printf("SPU: todo: getSupport CAPSULE_SHAPE_PROXYTYPE\n");
-		btVector3 vec0(localDir.getX(),localDir.getY(),localDir.getZ());
-
-		btConvexInternalShape* cnvxShape = (btConvexInternalShape*)shape;
-		btVector3 halfExtents = cnvxShape->getImplicitShapeDimensions();
-		btScalar halfHeight = halfExtents.getY();
-		btScalar radius = halfExtents.getX();
-		btVector3 supVec(0,0,0);
-
-		btScalar maxDot(btScalar(-1e30));
-
-		btVector3 vec = vec0;
-		btScalar lenSqr = vec.length2();
-		if (lenSqr < btScalar(0.0001))
-		{
-			vec.setValue(1,0,0);
-		} else
-		{
-			btScalar rlen = btScalar(1.) / btSqrt(lenSqr );
-			vec *= rlen;
-		}
-		btVector3 vtx;
-		btScalar newDot;
-		{
-			btVector3 pos(0,halfHeight,0);
-			vtx = pos +vec*(radius);
-			newDot = vec.dot(vtx);
-			if (newDot > maxDot)
-			{
-				maxDot = newDot;
-				supVec = vtx;
-			}
-		}
-		{
-			btVector3 pos(0,-halfHeight,0);
-			vtx = pos +vec*(radius);
-			newDot = vec.dot(vtx);
-			if (newDot > maxDot)
-			{
-				maxDot = newDot;
-				supVec = vtx;
-			}
-		}
-		return btPoint3(supVec.getX(),supVec.getY(),supVec.getZ());
-		break;
-	};
-
-	case CONVEX_HULL_SHAPE_PROXYTYPE:
-		{
-			//spu_printf("SPU: todo: getSupport CONVEX_HULL_SHAPE_PROXYTYPE\n");
-
-		
-
-			btPoint3* points = 0;
-			int numPoints = 0;
-			if (shape==convexVertexData->gSpuConvexShapePtr0)
-			{
-				points = convexVertexData->gConvexPoints0;
-				numPoints = convexVertexData->gNumConvexPoints0;
-			}
-			if (shape == convexVertexData->gSpuConvexShapePtr1)
-			{
-				points = convexVertexData->gConvexPoints1;
-				numPoints = convexVertexData->gNumConvexPoints1;
-			}
-
-		//	spu_printf("numPoints = %d\n",numPoints);
-
-			btVector3 supVec(btScalar(0.),btScalar(0.),btScalar(0.));
-			btScalar newDot,maxDot = btScalar(-1e30);
-
-			btVector3 vec0(localDir.getX(),localDir.getY(),localDir.getZ());
-			btVector3 vec = vec0;
-			btScalar lenSqr = vec.length2();
-			if (lenSqr < btScalar(0.0001))
-			{
-				vec.setValue(1,0,0);
-			} else
-			{
-				btScalar rlen = btScalar(1.) / btSqrt(lenSqr );
-				vec *= rlen;
-			}
-
-
-			for (int i=0;i<numPoints;i++)
-			{
-				btPoint3 vtx = points[i];// * m_localScaling;
-
-				newDot = vec.dot(vtx);
-				if (newDot > maxDot)
-				{
-					maxDot = newDot;
-					supVec = vtx;
-				}
-			}
-			return btPoint3(supVec.getX(),supVec.getY(),supVec.getZ());
-
-			break;
-		};
-
-    default:
-
-		//spu_printf("SPU:(type %i) missing support function\n",shapeType);
-
-		
-#if __ASSERT
-        spu_printf("localGetSupportingVertexWithoutMargin() - Unsupported bound type: %d.\n", shapeType);
-#endif // __ASSERT
-        return btPoint3(0.f, 0.f, 0.f);
-    }
-}
-
 
diff --git a/Extras/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuMinkowskiPenetrationDepthSolver.cpp b/Extras/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuMinkowskiPenetrationDepthSolver.cpp
index 553269f21..cc39afb71 100644
--- a/Extras/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuMinkowskiPenetrationDepthSolver.cpp
+++ b/Extras/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuMinkowskiPenetrationDepthSolver.cpp
@@ -20,7 +20,7 @@ subject to the following restrictions:
 #include "SpuPreferredPenetrationDirections.h"
 
 
-#include "SpuLocalSupport.h"
+#include "SpuCollisionShapes.h"
 
 #define NUM_UNITSPHERE_POINTS 42
 static btVector3	sPenetrationDirections[NUM_UNITSPHERE_POINTS+MAX_PREFERRED_PENETRATION_DIRECTIONS*2] = 
@@ -74,7 +74,8 @@ bool SpuMinkowskiPenetrationDepthSolver::calcPenDepth( SpuVoronoiSimplexSolver&
             btTransform& transA,const btTransform& transB,
 			btVector3& v, btPoint3& pa, btPoint3& pb,
 			class btIDebugDraw* debugDraw,btStackAlloc* stackAlloc,
-			struct SpuConvexPolyhedronVertexData* convexVertexData
+			struct SpuConvexPolyhedronVertexData* convexVertexDataA,
+			struct SpuConvexPolyhedronVertexData* convexVertexDataB
 			) const
 {
 
@@ -241,8 +242,8 @@ bool SpuMinkowskiPenetrationDepthSolver::calcPenDepth( SpuVoronoiSimplexSolver&
 		seperatingAxisInA = (-norm)* transA.getBasis();
 		seperatingAxisInB = norm* transB.getBasis();
 
-		pInA = localGetSupportingVertexWithoutMargin(shapeTypeA, convexA, seperatingAxisInA,convexVertexData);//, NULL);
-		qInB = localGetSupportingVertexWithoutMargin(shapeTypeB, convexB, seperatingAxisInB,convexVertexData);//, NULL);
+		pInA = localGetSupportingVertexWithoutMargin(shapeTypeA, convexA, seperatingAxisInA,convexVertexDataA);//, NULL);
+		qInB = localGetSupportingVertexWithoutMargin(shapeTypeB, convexB, seperatingAxisInB,convexVertexDataB);//, NULL);
 
 	//	pInA = convexA->localGetSupportingVertexWithoutMargin(seperatingAxisInA);
 	//	qInB = convexB->localGetSupportingVertexWithoutMargin(seperatingAxisInB);
@@ -299,7 +300,8 @@ bool SpuMinkowskiPenetrationDepthSolver::calcPenDepth( SpuVoronoiSimplexSolver&
 	
 
 	SpuClosestPointInput input;
-	input.m_convexVertexData = convexVertexData;
+	input.m_convexVertexData[0] = convexVertexDataA;
+	input.m_convexVertexData[1] = convexVertexDataB;
 	btVector3 newOrg = transA.getOrigin() + offset;
 
 	btTransform displacedTrans = transA;
diff --git a/Extras/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuMinkowskiPenetrationDepthSolver.h b/Extras/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuMinkowskiPenetrationDepthSolver.h
index c862713b1..6193741ae 100644
--- a/Extras/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuMinkowskiPenetrationDepthSolver.h
+++ b/Extras/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuMinkowskiPenetrationDepthSolver.h
@@ -35,7 +35,8 @@ public:
             btTransform& transA,const btTransform& transB,
 			btVector3& v, btPoint3& pa, btPoint3& pb,
 			class btIDebugDraw* debugDraw,btStackAlloc* stackAlloc,
-			struct SpuConvexPolyhedronVertexData* convexVertexData
+			struct SpuConvexPolyhedronVertexData* convexVertexDataA,
+			struct SpuConvexPolyhedronVertexData* convexVertexDataB
 			) const;
 
 
diff --git a/Extras/BulletMultiThreaded/SpuRaycastTask/SpuRaycastTask.cpp b/Extras/BulletMultiThreaded/SpuRaycastTask/SpuRaycastTask.cpp
index f7c1d7dad..c640ca9c8 100644
--- a/Extras/BulletMultiThreaded/SpuRaycastTask/SpuRaycastTask.cpp
+++ b/Extras/BulletMultiThreaded/SpuRaycastTask/SpuRaycastTask.cpp
@@ -1,10 +1,21 @@
-#include <stdio.h>
+
 
 #include "SpuRaycastTask.h"
 #include "SpuCollisionObjectWrapper.h"
 #include "SpuNarrowPhaseCollisionTask/SpuCollisionShapes.h"
+#include "SpuSubSimplexConvexCast.h"
+#include "LinearMath/btAabbUtil2.h"
 
 
+/* Future optimization strategies:
+1. Prune with a bounding-box test before loading shape data.
+2. When traversing a BVH tree, do the traversal once for the entire batch of rays.
+*/
+
+/* Future work:
+1. Support additional query modes (first hit, etc.) rather than just closest hit.
+2. Support compound objects.
+*/
 
 struct RaycastTask_LocalStoreMemory
 {
@@ -14,7 +25,7 @@ struct RaycastTask_LocalStoreMemory
 		return (btCollisionObject*) gColObj;
 	}
 
-	SpuCollisionObjectWrapper gCollisionObjectWrapper;
+	ATTRIBUTE_ALIGNED16(SpuCollisionObjectWrapper gCollisionObjectWrapper);
 	SpuCollisionObjectWrapper* getCollisionObjectWrapper ()
 	{
 		return &gCollisionObjectWrapper;
@@ -41,7 +52,7 @@ void* createRaycastLocalStoreMemory()
 }
 #endif
 
-void GatherCollisionObjectAndShapeData (RaycastGatheredObjectData& gatheredObjectData, RaycastTask_LocalStoreMemory& lsMem, ppu_address_t objectWrapper)
+void GatherCollisionObjectAndShapeData (RaycastGatheredObjectData* gatheredObjectData, RaycastTask_LocalStoreMemory* lsMemPtr, ppu_address_t objectWrapper)
 {
 	register int dmaSize;
 	register ppu_address_t	dmaPpuAddress2;
@@ -49,27 +60,32 @@ void GatherCollisionObjectAndShapeData (RaycastGatheredObjectData& gatheredObjec
 	/* DMA Collision object wrapper into local store */
 	dmaSize = sizeof(SpuCollisionObjectWrapper);
 	dmaPpuAddress2 = objectWrapper;
-	cellDmaGet(&lsMem.gCollisionObjectWrapper, dmaPpuAddress2, dmaSize, DMA_TAG(1), 0, 0);
+	cellDmaGet(&lsMemPtr->gCollisionObjectWrapper, dmaPpuAddress2, dmaSize, DMA_TAG(1), 0, 0);
 	cellDmaWaitTagStatusAll(DMA_MASK(1));
 
 	/* DMA Collision object into local store */
 	dmaSize = sizeof(btCollisionObject);
-	dmaPpuAddress2 = lsMem.getCollisionObjectWrapper()->getCollisionObjectPtr();
-	cellDmaGet(&lsMem.gColObj, dmaPpuAddress2  , dmaSize, DMA_TAG(2), 0, 0);
+	dmaPpuAddress2 = lsMemPtr->getCollisionObjectWrapper()->getCollisionObjectPtr();
+	cellDmaGet(&lsMemPtr->gColObj, dmaPpuAddress2  , dmaSize, DMA_TAG(2), 0, 0);
 	cellDmaWaitTagStatusAll(DMA_MASK(2));
 	
 	/* Gather information about collision object and shape */
-	gatheredObjectData.m_worldTransform = lsMem.getColObj()->getWorldTransform();
-	gatheredObjectData.m_collisionMargin = lsMem.getCollisionObjectWrapper()->getCollisionMargin ();
-	gatheredObjectData.m_shapeType = lsMem.getCollisionObjectWrapper()->getShapeType ();
-	gatheredObjectData.m_collisionShape = (ppu_address_t)lsMem.getColObj()->getCollisionShape();
-	gatheredObjectData.m_spuCollisionShape = (void*)&lsMem.gCollisionShape.collisionShape[0];
+	gatheredObjectData->m_worldTransform = lsMemPtr->getColObj()->getWorldTransform();
+	gatheredObjectData->m_collisionMargin = lsMemPtr->getCollisionObjectWrapper()->getCollisionMargin ();
+	gatheredObjectData->m_shapeType = lsMemPtr->getCollisionObjectWrapper()->getShapeType ();
+	gatheredObjectData->m_collisionShape = (ppu_address_t)lsMemPtr->getColObj()->getCollisionShape();
+	gatheredObjectData->m_spuCollisionShape = (void*)&lsMemPtr->gCollisionShape.collisionShape;
 
 	/* DMA shape data */
-	dmaCollisionShape (gatheredObjectData.m_spuCollisionShape, gatheredObjectData.m_collisionShape, 1, gatheredObjectData.m_shapeType);
+	dmaCollisionShape (gatheredObjectData->m_spuCollisionShape, gatheredObjectData->m_collisionShape, 1, gatheredObjectData->m_shapeType);
 	cellDmaWaitTagStatusAll(DMA_MASK(1));
-	btConvexInternalShape* spuConvexShape = (btConvexInternalShape*)gatheredObjectData.m_spuCollisionShape;
-	gatheredObjectData.m_primitiveDimensions = spuConvexShape->getImplicitShapeDimensions ();
+	if (btBroadphaseProxy::isConvex (gatheredObjectData->m_shapeType))
+	{
+		btConvexInternalShape* spuConvexShape = (btConvexInternalShape*)gatheredObjectData->m_spuCollisionShape;
+		gatheredObjectData->m_primitiveDimensions = spuConvexShape->getImplicitShapeDimensions ();
+	} else {
+		gatheredObjectData->m_primitiveDimensions = btVector3(1.0, 1.0, 1.0);
+	}
 }
 
 void dmaLoadRayOutput (ppu_address_t rayOutputAddr, SpuRaycastTaskWorkUnitOut* rayOutput, uint32_t dmaTag)
@@ -82,6 +98,366 @@ void dmaStoreRayOutput (ppu_address_t rayOutputAddr, const SpuRaycastTaskWorkUni
 	cellDmaLargePut (rayOutput, rayOutputAddr, sizeof(*rayOutput), DMA_TAG(dmaTag), 0, 0);
 }
 
+#if 0
+SIMD_FORCE_INLINE void small_cache_read(void* buffer, ppu_address_t ea, size_t size)
+{
+#if USE_SOFTWARE_CACHE
+	// Check for alignment requirements. We need to make sure the entire request fits within one cache line,
+	// so the first and last bytes should fall on the same cache line
+	btAssert((ea & ~SPE_CACHELINE_MASK) == ((ea + size - 1) & ~SPE_CACHELINE_MASK));
+
+	void* ls = spe_cache_read(ea);
+	memcpy(buffer, ls, size);
+#else
+	stallingUnalignedDmaSmallGet(buffer,ea,size);
+#endif
+}
+#endif
+
+void small_cache_read_triple(	void* ls0, ppu_address_t ea0,
+												void* ls1, ppu_address_t ea1,
+												void* ls2, ppu_address_t ea2,
+												size_t size)
+{
+		btAssert(size<16);
+		ATTRIBUTE_ALIGNED16(char	tmpBuffer0[32]);
+		ATTRIBUTE_ALIGNED16(char	tmpBuffer1[32]);
+		ATTRIBUTE_ALIGNED16(char	tmpBuffer2[32]);
+
+		uint32_t i;
+		
+
+		///make sure last 4 bits are the same, for cellDmaSmallGet
+		char* localStore0 = (char*)ls0;
+		uint32_t last4BitsOffset = ea0 & 0x0f;
+		char* tmpTarget0 = tmpBuffer0 + last4BitsOffset;
+		tmpTarget0 = (char*)cellDmaSmallGetReadOnly(tmpTarget0,ea0,size,DMA_TAG(1),0,0);
+
+
+		char* localStore1 = (char*)ls1;
+		last4BitsOffset = ea1 & 0x0f;
+		char* tmpTarget1 = tmpBuffer1 + last4BitsOffset;
+		tmpTarget1 = (char*)cellDmaSmallGetReadOnly(tmpTarget1,ea1,size,DMA_TAG(1),0,0);
+		
+		char* localStore2 = (char*)ls2;
+		last4BitsOffset = ea2 & 0x0f;
+		char* tmpTarget2 = tmpBuffer2 + last4BitsOffset;
+		tmpTarget2 = (char*)cellDmaSmallGetReadOnly(tmpTarget2,ea2,size,DMA_TAG(1),0,0);
+		
+		
+		cellDmaWaitTagStatusAll( DMA_MASK(1) );
+
+		//this is slowish, perhaps memcpy on SPU is smarter?
+		for (i=0; btLikely( i<size );i++)
+		{
+			localStore0[i] = tmpTarget0[i];
+			localStore1[i] = tmpTarget1[i];
+			localStore2[i] = tmpTarget2[i];
+		}
+}
+
+void performRaycastAgainstConvex (RaycastGatheredObjectData* gatheredObjectData, const SpuRaycastTaskWorkUnit& workUnit, SpuRaycastTaskWorkUnitOut* workUnitOut, RaycastTask_LocalStoreMemory* lsMemPtr);
+
+class spuRaycastNodeCallback : public btNodeOverlapCallback
+{
+	RaycastGatheredObjectData* m_gatheredObjectData;
+	const SpuRaycastTaskWorkUnit& m_workUnit;
+	SpuRaycastTaskWorkUnitOut* m_workUnitOut;
+	RaycastTask_LocalStoreMemory* m_lsMemPtr;
+
+	ATTRIBUTE_ALIGNED16(btVector3	spuTriangleVertices[3]);
+	ATTRIBUTE_ALIGNED16(btScalar	spuUnscaledVertex[4]);
+	//ATTRIBUTE_ALIGNED16(int	spuIndices[16]);
+public:
+	spuRaycastNodeCallback(RaycastGatheredObjectData* gatheredObjectData,const SpuRaycastTaskWorkUnit& workUnit, SpuRaycastTaskWorkUnitOut* workUnitOut, RaycastTask_LocalStoreMemory* lsMemPtr)
+		: m_gatheredObjectData(gatheredObjectData),
+		  m_workUnit(workUnit),
+		  m_workUnitOut(workUnitOut),
+		  m_lsMemPtr (lsMemPtr)
+	{
+	}
+
+	virtual void processNode(int subPart, int triangleIndex)
+	{
+		///Build a triangle shape from the fetched vertices and raycast against it via performRaycastAgainstConvex
+		///DMA the vertices, can benefit from software caching
+
+		//		spu_printf("processNode with triangleIndex %d\n",triangleIndex);
+
+		int* indexBasePtr = (int*)(m_lsMemPtr->bvhShapeData.gIndexMesh.m_triangleIndexBase+triangleIndex*m_lsMemPtr->bvhShapeData.gIndexMesh.m_triangleIndexStride);
+		
+		small_cache_read_triple(&m_lsMemPtr->spuIndices[0],(ppu_address_t)&indexBasePtr[0],
+								&m_lsMemPtr->spuIndices[1],(ppu_address_t)&indexBasePtr[1],
+								&m_lsMemPtr->spuIndices[2],(ppu_address_t)&indexBasePtr[2],
+								sizeof(int));
+		//printf("%d %d %d\n", m_lsMemPtr->spuIndices[0], m_lsMemPtr->spuIndices[1], m_lsMemPtr->spuIndices[2]);
+		//		spu_printf("SPU index0=%d ,",spuIndices[0]);
+		//		spu_printf("SPU index1=%d ,",spuIndices[1]);
+		//		spu_printf("SPU index2=%d ,",spuIndices[2]);
+		//		spu_printf("SPU: indexBasePtr=%llx\n",indexBasePtr);
+
+		const btVector3& meshScaling = m_lsMemPtr->bvhShapeData.gTriangleMeshInterfacePtr->getScaling();
+	
+		for (int j=2;btLikely( j>=0 );j--)
+		{
+			int graphicsindex = m_lsMemPtr->spuIndices[j];
+
+						//spu_printf("SPU index=%d ,",graphicsindex);
+			btScalar* graphicsbasePtr = (btScalar*)(m_lsMemPtr->bvhShapeData.gIndexMesh.m_vertexBase+graphicsindex*m_lsMemPtr->bvhShapeData.gIndexMesh.m_vertexStride);
+			
+			//			spu_printf("SPU graphicsbasePtr=%llx\n",graphicsbasePtr);
+
+
+			///handle un-aligned vertices...
+
+			//another DMA for each vertex
+			small_cache_read_triple(&spuUnscaledVertex[0],(ppu_address_t)&graphicsbasePtr[0],
+									&spuUnscaledVertex[1],(ppu_address_t)&graphicsbasePtr[1],
+									&spuUnscaledVertex[2],(ppu_address_t)&graphicsbasePtr[2],
+									sizeof(btScalar));
+			
+			//printf("%f %f %f\n", spuUnscaledVertex[0],spuUnscaledVertex[1],spuUnscaledVertex[2]);
+			spuTriangleVertices[j] = btVector3(
+				spuUnscaledVertex[0]*meshScaling.getX(),
+				spuUnscaledVertex[1]*meshScaling.getY(),
+				spuUnscaledVertex[2]*meshScaling.getZ());
+
+				//spu_printf("SPU:triangle vertices:%f,%f,%f\n",spuTriangleVertices[j].x(),spuTriangleVertices[j].y(),spuTriangleVertices[j].z());
+		}
+		
+		RaycastGatheredObjectData triangleGatheredObjectData (*m_gatheredObjectData);
+		triangleGatheredObjectData.m_shapeType = TRIANGLE_SHAPE_PROXYTYPE;
+		triangleGatheredObjectData.m_spuCollisionShape = &spuTriangleVertices[0];
+
+		//printf("%f %f %f\n", spuTriangleVertices[0][0],spuTriangleVertices[0][1],spuTriangleVertices[0][2]);
+		//printf("%f %f %f\n", spuTriangleVertices[1][0],spuTriangleVertices[1][1],spuTriangleVertices[1][2]);
+		//printf("%f %f %f\n", spuTriangleVertices[2][0],spuTriangleVertices[2][1],spuTriangleVertices[2][2]);
+		SpuRaycastTaskWorkUnitOut out;
+		out.hitFraction = 1.0;
+
+		performRaycastAgainstConvex (&triangleGatheredObjectData, m_workUnit, &out, m_lsMemPtr);
+		/* XXX: For now only take the closest hit */
+		if (out.hitFraction < m_workUnitOut->hitFraction)
+		{
+			m_workUnitOut->hitFraction = out.hitFraction;
+			m_workUnitOut->hitNormal = out.hitNormal;
+		}
+	}
+
+};
+
+void	spuWalkStacklessQuantizedTreeAgainstRay(RaycastTask_LocalStoreMemory* lsMemPtr, btNodeOverlapCallback* nodeCallback,const btVector3& raySource, const btVector3& rayTarget,unsigned short int* quantizedQueryAabbMin,unsigned short int* quantizedQueryAabbMax,const btQuantizedBvhNode* rootNode, int startNodeIndex,int endNodeIndex)
+{
+
+	int curIndex = startNodeIndex;
+	int walkIterations = 0;
+	int subTreeSize = endNodeIndex - startNodeIndex;
+
+	int escapeIndex;
+
+	unsigned int boxBoxOverlap, rayBoxOverlap;
+	unsigned int isLeafNode;
+#define RAYAABB2
+#ifdef RAYAABB2
+	btScalar lambda_max = 1.0;
+	btVector3 rayFrom = raySource;
+	btVector3 rayDirection = (rayTarget-raySource);
+	rayDirection.normalize ();
+	lambda_max = rayDirection.dot(rayTarget-raySource);
+	rayDirection[0] = btScalar(1.0) / rayDirection[0];
+	rayDirection[1] = btScalar(1.0) / rayDirection[1];
+	rayDirection[2] = btScalar(1.0) / rayDirection[2];
+	unsigned int sign[3] = { rayDirection[0] < 0.0, rayDirection[1] < 0.0, rayDirection[2] < 0.0};
+#endif
+
+	while (curIndex < endNodeIndex)
+	{
+		//catch bugs in tree data
+		assert (walkIterations < subTreeSize);
+
+		walkIterations++;
+		boxBoxOverlap = spuTestQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,rootNode->m_quantizedAabbMin,rootNode->m_quantizedAabbMax);
+		isLeafNode = rootNode->isLeafNode();
+
+		rayBoxOverlap = 0;
+		btScalar param = 1.0;
+		btVector3 normal;
+		if (boxBoxOverlap)
+		{
+			btVector3 bounds[2];
+			bounds[0] = lsMemPtr->bvhShapeData.getOptimizedBvh()->unQuantize(rootNode->m_quantizedAabbMin);
+			bounds[1] = lsMemPtr->bvhShapeData.getOptimizedBvh()->unQuantize(rootNode->m_quantizedAabbMax);
+#ifdef RAYAABB2
+			rayBoxOverlap = btRayAabb2 (raySource, rayDirection, sign, bounds, param, 0.0, lambda_max);
+#else
+			rayBoxOverlap = btRayAabb(raySource, rayTarget, bounds[0], bounds[1], param, normal);
+#endif
+		}
+
+		if (isLeafNode && rayBoxOverlap)
+		{
+			//printf("overlap with node %d\n",rootNode->getTriangleIndex());
+			nodeCallback->processNode(0,rootNode->getTriangleIndex());
+			//			spu_printf("SPU: overlap detected with triangleIndex:%d\n",rootNode->getTriangleIndex());
+		} 
+
+		if (rayBoxOverlap || isLeafNode)
+		{
+			rootNode++;
+			curIndex++;
+		} else
+		{
+			escapeIndex = rootNode->getEscapeIndex();
+			rootNode += escapeIndex;
+			curIndex += escapeIndex;
+		}
+	}
+
+}
+
+void performRaycastAgainstConcave (RaycastGatheredObjectData* gatheredObjectData, const SpuRaycastTaskWorkUnit& workUnit, SpuRaycastTaskWorkUnitOut* workUnitOut, RaycastTask_LocalStoreMemory* lsMemPtr)
+{
+	//the gathered shape is treated as a btBvhTriangleMeshShape; the ray is moved into the mesh's local space before the BVH is walked
+	register int dmaSize;
+	register ppu_address_t	dmaPpuAddress2;
+
+	btBvhTriangleMeshShape*	trimeshShape = (btBvhTriangleMeshShape*)gatheredObjectData->m_spuCollisionShape;
+
+	//need the mesh interface, for access to triangle vertices
+	dmaBvhShapeData (&(lsMemPtr->bvhShapeData), trimeshShape);
+
+	btVector3 aabbMin;
+	btVector3 aabbMax;
+
+	/* Calculate the AABB for the ray in the triangle mesh shape */
+	btTransform rayInTriangleSpace;
+	rayInTriangleSpace = gatheredObjectData->m_worldTransform.inverse();
+
+	btVector3 rayFromInTriangleSpace = rayInTriangleSpace(workUnit.rayFrom);
+	btVector3 rayToInTriangleSpace = rayInTriangleSpace(workUnit.rayTo);
+
+	aabbMin = rayFromInTriangleSpace;
+	aabbMin.setMin (rayToInTriangleSpace);
+	aabbMax = rayFromInTriangleSpace;
+	aabbMax.setMax (rayToInTriangleSpace);
+
+	unsigned short int quantizedQueryAabbMin[3];
+	unsigned short int quantizedQueryAabbMax[3];
+	lsMemPtr->bvhShapeData.getOptimizedBvh()->quantizeWithClamp(quantizedQueryAabbMin,aabbMin);
+	lsMemPtr->bvhShapeData.getOptimizedBvh()->quantizeWithClamp(quantizedQueryAabbMax,aabbMax);
+
+	QuantizedNodeArray&	nodeArray = lsMemPtr->bvhShapeData.getOptimizedBvh()->getQuantizedNodeArray();
+	//spu_printf("SPU: numNodes = %d\n",nodeArray.size());
+
+	BvhSubtreeInfoArray& subTrees = lsMemPtr->bvhShapeData.getOptimizedBvh()->getSubtreeInfoArray();	
+
+	spuRaycastNodeCallback nodeCallback (gatheredObjectData, workUnit, workUnitOut, lsMemPtr);
+	
+	IndexedMeshArray&	indexArray = lsMemPtr->bvhShapeData.gTriangleMeshInterfacePtr->getIndexedMeshArray();
+
+	//spu_printf("SPU:indexArray.size() = %d\n",indexArray.size());
+	//	spu_printf("SPU: numSubTrees = %d\n",subTrees.size());
+	//only handled when there are subtree headers and exactly one indexed mesh; otherwise no ray test is performed
+	if (subTrees.size() && indexArray.size() == 1)
+	{
+		///DMA in the index info
+		dmaBvhIndexedMesh (&lsMemPtr->bvhShapeData.gIndexMesh, indexArray, 0 /* index into indexArray */, 1 /* dmaTag */);
+		cellDmaWaitTagStatusAll(DMA_MASK(1));
+		
+		//process the subtree headers in batches of at most MAX_SPU_SUBTREE_HEADERS
+		int numBatch = subTrees.size();
+		for (int i=0;i<numBatch;)
+		{
+// BEN: TODO - can reorder DMA transfers for less stall
+			int remaining = subTrees.size() - i;
+			int nextBatch = remaining < MAX_SPU_SUBTREE_HEADERS ? remaining : MAX_SPU_SUBTREE_HEADERS;
+			
+			dmaBvhSubTreeHeaders (&lsMemPtr->bvhShapeData.gSubtreeHeaders[0], (ppu_address_t)(&subTrees[i]), nextBatch, 1);
+			cellDmaWaitTagStatusAll(DMA_MASK(1));
+			
+
+			//			spu_printf("nextBatch = %d\n",nextBatch);
+
+			for (int j=0;j<nextBatch;j++)
+			{
+				const btBvhSubtreeInfo& subtree = lsMemPtr->bvhShapeData.gSubtreeHeaders[j];
+				
+				unsigned int overlap = spuTestQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,subtree.m_quantizedAabbMin,subtree.m_quantizedAabbMax);
+				if (overlap)
+				{
+					btAssert(subtree.m_subtreeSize);
+
+					//dma the actual nodes of this subtree
+					dmaBvhSubTreeNodes (&lsMemPtr->bvhShapeData.gSubtreeNodes[0], subtree, nodeArray, 2);
+
+					cellDmaWaitTagStatusAll(DMA_MASK(2));
+
+					/* Walk this subtree */
+					spuWalkStacklessQuantizedTreeAgainstRay(lsMemPtr, &nodeCallback,rayFromInTriangleSpace, rayToInTriangleSpace, quantizedQueryAabbMin,quantizedQueryAabbMax,
+						&lsMemPtr->bvhShapeData.gSubtreeNodes[0],
+						0,
+						subtree.m_subtreeSize);
+				}
+				//				spu_printf("subtreeSize = %d\n",gSubtreeHeaders[j].m_subtreeSize);
+			}
+
+			//	unsigned short int	m_quantizedAabbMin[3];
+			//	unsigned short int	m_quantizedAabbMax[3];
+			//	int			m_rootNodeIndex;
+			//	int			m_subtreeSize;
+			i+=nextBatch;
+		}
+
+		//pre-fetch first tree, then loop and double buffer
+	}
+}
+
+void performRaycastAgainstCompound (RaycastGatheredObjectData* gatheredObjectData, const SpuRaycastTaskWorkUnit& workUnit, SpuRaycastTaskWorkUnitOut* workUnitOut, RaycastTask_LocalStoreMemory* lsMemPtr)
+{
+	spu_printf ("Currently no support for ray. vs compound objects. Support coming soon.\n");
+}
+
+void
+performRaycastAgainstConvex (RaycastGatheredObjectData* gatheredObjectData, const SpuRaycastTaskWorkUnit& workUnit, SpuRaycastTaskWorkUnitOut* workUnitOut, RaycastTask_LocalStoreMemory* lsMemPtr)
+{
+	SpuVoronoiSimplexSolver simplexSolver;
+
+	btTransform rayFromTrans, rayToTrans;
+	rayFromTrans.setIdentity ();
+	rayFromTrans.setOrigin (workUnit.rayFrom);
+	rayToTrans.setIdentity ();
+	rayToTrans.setOrigin (workUnit.rayTo);
+
+	SpuCastResult result;
+
+	/* Load the vertex data if the shape is a convex hull */
+	/* XXX: We might be loading the shape twice */
+	ATTRIBUTE_ALIGNED16(char convexHullShape[sizeof(btConvexHullShape)]);
+	if (gatheredObjectData->m_shapeType == CONVEX_HULL_SHAPE_PROXYTYPE)
+	{
+		register int dmaSize;
+		register ppu_address_t	dmaPpuAddress2;
+		dmaSize = sizeof(btConvexHullShape);
+		dmaPpuAddress2 = gatheredObjectData->m_collisionShape;
+		cellDmaGet(&convexHullShape, dmaPpuAddress2, dmaSize, DMA_TAG(1), 0, 0);
+		cellDmaWaitTagStatusAll(DMA_MASK(1));
+		dmaConvexVertexData (&lsMemPtr->convexVertexData, (btConvexHullShape*)&convexHullShape);
+		cellDmaWaitTagStatusAll(DMA_MASK(2)); // dmaConvexVertexData uses dma channel 2!
+		lsMemPtr->convexVertexData.gSpuConvexShapePtr = gatheredObjectData->m_spuCollisionShape;
+		lsMemPtr->convexVertexData.gConvexPoints = &lsMemPtr->convexVertexData.g_convexPointBuffer[0];
+	}
+
+	/* performRaycast */
+	SpuSubsimplexRayCast caster (gatheredObjectData->m_spuCollisionShape, &lsMemPtr->convexVertexData, gatheredObjectData->m_shapeType, 0.0, &simplexSolver);
+	bool r = caster.calcTimeOfImpact (rayFromTrans, rayToTrans, gatheredObjectData->m_worldTransform, gatheredObjectData->m_worldTransform,result);
+
+	if (r)
+	{
+		workUnitOut->hitFraction = result.m_fraction;
+		workUnitOut->hitNormal = result.m_normal;
+	}
+}
+
 void	processRaycastTask(void* userPtr, void* lsMemory)
 {
 	RaycastTask_LocalStoreMemory* localMemory = (RaycastTask_LocalStoreMemory*)lsMemory;
@@ -95,22 +471,36 @@ void	processRaycastTask(void* userPtr, void* lsMemory)
 	for (int objectId = 0; objectId < taskDesc.numSpuCollisionObjectWrappers; objectId++)
 	{
 		RaycastGatheredObjectData gatheredObjectData;
-		GatherCollisionObjectAndShapeData (gatheredObjectData, *localMemory, (ppu_address_t)&cows[objectId]);
+		GatherCollisionObjectAndShapeData (&gatheredObjectData, localMemory, (ppu_address_t)&cows[objectId]);
 		/* load initial collision shape */
 		for (int rayId = 0; rayId < taskDesc.numWorkUnits; rayId++)
 		{
-			SpuRaycastTaskWorkUnitOut rayOut;
-
-			dmaLoadRayOutput ((ppu_address_t)taskDesc.workUnits[rayId].output, &rayOut, 1);
+			const SpuRaycastTaskWorkUnit& workUnit = taskDesc.workUnits[rayId];
+			ATTRIBUTE_ALIGNED16(SpuRaycastTaskWorkUnitOut workUnitOut);
+			dmaLoadRayOutput ((ppu_address_t)workUnit.output, &workUnitOut, 1);
 			cellDmaWaitTagStatusAll(DMA_MASK(1));
-			
-			float t = (float)rayId/(float)taskDesc.numWorkUnits;
-			/* performRaycast */
-			rayOut.hitFraction = 0.1f * t;
-			rayOut.hitNormal = btVector3(1.0, 0.0, 0.0);
+
+			SpuRaycastTaskWorkUnitOut tWorkUnitOut;
+			tWorkUnitOut.hitFraction = 1.0;
+
+			if (btBroadphaseProxy::isConvex (gatheredObjectData.m_shapeType))
+			{
+				//performRaycastAgainstConvex (&gatheredObjectData, workUnit, &tWorkUnitOut, localMemory);
+			} else if (btBroadphaseProxy::isCompound (gatheredObjectData.m_shapeType)) {
+				performRaycastAgainstCompound (&gatheredObjectData, workUnit, &tWorkUnitOut, localMemory);
+			} else if (btBroadphaseProxy::isConcave (gatheredObjectData.m_shapeType)) {
+				performRaycastAgainstConcave (&gatheredObjectData, workUnit, &tWorkUnitOut, localMemory);
+			}
+
+			/* XXX Only support taking the closest hit for now */
+			if (tWorkUnitOut.hitFraction < workUnitOut.hitFraction)
+			{
+				workUnitOut.hitFraction = tWorkUnitOut.hitFraction;
+				workUnitOut.hitNormal = tWorkUnitOut.hitNormal;
+			}
 
 			/* write ray cast data back */
-			dmaStoreRayOutput ((ppu_address_t)taskDesc.workUnits[rayId].output, &rayOut, 1);
+			dmaStoreRayOutput ((ppu_address_t)workUnit.output, &workUnitOut, 1);
 			cellDmaWaitTagStatusAll(DMA_MASK(1));
 		}
 	}
diff --git a/Extras/BulletMultiThreaded/SpuRaycastTask/SpuRaycastTask.h b/Extras/BulletMultiThreaded/SpuRaycastTask/SpuRaycastTask.h
index 0eb1b5d8b..682c25c89 100644
--- a/Extras/BulletMultiThreaded/SpuRaycastTask/SpuRaycastTask.h
+++ b/Extras/BulletMultiThreaded/SpuRaycastTask/SpuRaycastTask.h
@@ -16,7 +16,7 @@ struct RaycastGatheredObjectData
 	btTransform	m_worldTransform;
 };
 
-struct SpuRaycastTaskWorkUnitOut
+ATTRIBUTE_ALIGNED16(struct) SpuRaycastTaskWorkUnitOut
 {
 	btVector3 hitNormal; /* out */
 	btScalar hitFraction; /* out */
@@ -24,14 +24,14 @@ struct SpuRaycastTaskWorkUnitOut
 };
 
 /* Perform a raycast on collision object */
-struct SpuRaycastTaskWorkUnit
+ATTRIBUTE_ALIGNED16(struct) SpuRaycastTaskWorkUnit
 {
 	btVector3 rayFrom; /* in */
 	btVector3 rayTo; /* in */
 	SpuRaycastTaskWorkUnitOut* output; /* out */
 };
 
-#define SPU_RAYCAST_WORK_UNITS_PER_TASK 16
+#define SPU_RAYCAST_WORK_UNITS_PER_TASK 4
 
 struct SpuRaycastTaskDesc
 {
diff --git a/Extras/BulletMultiThreaded/SpuRaycastTask/SpuSubSimplexConvexCast.cpp b/Extras/BulletMultiThreaded/SpuRaycastTask/SpuSubSimplexConvexCast.cpp
index 8cb60a770..e8f93ed4d 100644
--- a/Extras/BulletMultiThreaded/SpuRaycastTask/SpuSubSimplexConvexCast.cpp
+++ b/Extras/BulletMultiThreaded/SpuRaycastTask/SpuSubSimplexConvexCast.cpp
@@ -13,19 +13,17 @@ subject to the following restrictions:
 3. This notice may not be removed or altered from any source distribution.
 */
 
-
 #include "SpuSubSimplexConvexCast.h"
-#include "SpuNarrowPhaseCollisionTask/SpuCollisionShapes.h"
+
 
 #include "BulletCollision/CollisionShapes/btConvexShape.h"
 #include "BulletCollision/CollisionShapes/btMinkowskiSumShape.h"
 #include "BulletCollision/NarrowPhaseCollision/btSimplexSolverInterface.h"
 
 
-SpuSubsimplexConvexCast::SpuSubsimplexConvexCast (const void* convexA,
-												  const void* convexB,
-												  SpuVoronoiSimplexSolver* simplexSolver)
-	:m_simplexSolver(simplexSolver), m_convexA(convexA),m_convexB(convexB)
+SpuSubsimplexRayCast::SpuSubsimplexRayCast (void* shapeB, SpuConvexPolyhedronVertexData* convexDataB, int shapeTypeB, float marginB,
+										    SpuVoronoiSimplexSolver* simplexSolver)
+	:m_simplexSolver(simplexSolver), m_shapeB(shapeB), m_convexDataB(convexDataB), m_shapeTypeB(shapeTypeB), m_marginB(marginB)
 {
 }
 
@@ -37,27 +35,33 @@ SpuSubsimplexConvexCast::SpuSubsimplexConvexCast (const void* convexA,
 #define MAX_ITERATIONS 32
 #endif
 
-bool	SpuSubsimplexConvexCast::calcTimeOfImpact(const btTransform& fromA,
-												  const btTransform& toA,
-												  const btTransform& fromB,
-												  const btTransform& toB,
-												  SpuCastResult& result)
+/* Returns the support point of the Minkowski sum
+ * MSUM(Pellet, ConvexShape).
+ * The pellet contributes the zero vector, so the result is the shape's transformed support point.
+ */
+btVector3 supportPoint (btTransform xform, int shapeType, const void* shape, SpuConvexPolyhedronVertexData* convexVertexData, btVector3 seperatingAxis)
 {
-	//localGetSupportingVertexWithoutMargin(m_shapeTypeA, m_minkowskiA, seperatingAxisInA,input.m_convexVertexData[0]);
-#if 0
-	btMinkowskiSumShape combi(m_convexA,m_convexB);
-	btMinkowskiSumShape* convex = &combi;
+	btVector3 SupportPellet = btVector3(0.0, 0.0, 0.0);
+	btVector3 rotatedSeperatingAxis = seperatingAxis * xform.getBasis();
+	btVector3 SupportShape = xform(localGetSupportingVertexWithoutMargin(shapeType, (void*)shape, rotatedSeperatingAxis, convexVertexData));
+	return SupportPellet + SupportShape;
+}
 
+bool	SpuSubsimplexRayCast::calcTimeOfImpact(const btTransform& fromRay,
+											   const btTransform& toRay,
+											   const btTransform& fromB,
+											   const btTransform& toB,
+											   SpuCastResult& result)
+{
 	btTransform	rayFromLocalA;
 	btTransform	rayToLocalA;
 
-	rayFromLocalA = fromA.inverse()* fromB;
-	rayToLocalA = toA.inverse()* toB;
-
+	rayFromLocalA = fromRay.inverse()* fromB;
+	rayToLocalA = toRay.inverse()* toB;
 
 	m_simplexSolver->reset();
-
-	convex->setTransformB(btTransform(rayFromLocalA.getBasis()));
+	
+	btTransform bXform = btTransform(rayFromLocalA.getBasis());
 
 	//btScalar radius = btScalar(0.01);
 
@@ -69,8 +73,7 @@ bool	SpuSubsimplexConvexCast::calcTimeOfImpact(const btTransform& fromA,
 	btVector3 r = -(rayToLocalA.getOrigin()-rayFromLocalA.getOrigin());
 	btVector3 x = s;
 	btVector3 v;
-	btVector3 arbitraryPoint = convex->localGetSupportingVertex(r);
-	
+	btVector3 arbitraryPoint = supportPoint(bXform, m_shapeTypeB, m_shapeB, m_convexDataB, r);
 	v = x - arbitraryPoint;
 
 	int maxIter = MAX_ITERATIONS;
@@ -82,7 +85,6 @@ bool	SpuSubsimplexConvexCast::calcTimeOfImpact(const btTransform& fromA,
 
 	btScalar lastLambda = lambda;
 
-
 	btScalar dist2 = v.length2();
 #ifdef BT_USE_DOUBLE_PRECISION
 	btScalar epsilon = btScalar(0.0001);
@@ -94,8 +96,8 @@ bool	SpuSubsimplexConvexCast::calcTimeOfImpact(const btTransform& fromA,
 	
 	while ( (dist2 > epsilon) && maxIter--)
 	{
-		p = convex->localGetSupportingVertex( v);
-		 w = x - p;
+		p = supportPoint(bXform, m_shapeTypeB, m_shapeB, m_convexDataB, v);
+		w = x - p;
 
 		btScalar VdotW = v.dot(w);
 
@@ -136,8 +138,6 @@ bool	SpuSubsimplexConvexCast::calcTimeOfImpact(const btTransform& fromA,
 	result.m_fraction = lambda;
 	result.m_normal = n;
 
-#endif
-
 	return true;
 }
 
diff --git a/Extras/BulletMultiThreaded/SpuRaycastTask/SpuSubSimplexConvexCast.h b/Extras/BulletMultiThreaded/SpuRaycastTask/SpuSubSimplexConvexCast.h
index f539722c7..81321648e 100644
--- a/Extras/BulletMultiThreaded/SpuRaycastTask/SpuSubSimplexConvexCast.h
+++ b/Extras/BulletMultiThreaded/SpuRaycastTask/SpuSubSimplexConvexCast.h
@@ -14,43 +14,47 @@ subject to the following restrictions:
 */
 
 
-#ifndef SPU_SUBSIMPLEX_CONVEX_CAST_H
-#define SPU_SUBSIMPLEX_CONVEX_CAST_H
+#ifndef SPU_SUBSIMPLEX_RAY_CAST_H
+#define SPU_SUBSIMPLEX_RAY_CAST_H
 
 #include "SpuNarrowPhaseCollisionTask/SpuVoronoiSimplexSolver.h"
+#include "SpuNarrowPhaseCollisionTask/SpuCollisionShapes.h"
 #include "SpuRaycastTask.h"
 
 class btConvexShape;
 
 struct SpuCastResult
 {
+	float m_fraction;
+	btVector3 m_normal;
 };
 
 /// btSubsimplexConvexCast implements Gino van den Bergens' paper
 ///"Ray Casting against bteral Convex Objects with Application to Continuous Collision Detection"
 /// GJK based Ray Cast, optimized version
 /// Objects should not start in overlap, otherwise results are not defined.
-class SpuSubsimplexConvexCast
+class SpuSubsimplexRayCast
 {
 	SpuVoronoiSimplexSolver* m_simplexSolver;
-	const void*				 m_convexA;
-	const void*				 m_convexB;
-	RaycastGatheredObjectData* m_dataB;
-public:
+	void* m_shapeB;
+	SpuConvexPolyhedronVertexData* m_convexDataB;
+	int m_shapeTypeB;
+	float m_marginB;
 
-	SpuSubsimplexConvexCast (const void* shapeA,
-							 const void* shapeB,
-							 SpuVoronoiSimplexSolver* simplexSolver);
+public:
+	SpuSubsimplexRayCast (void* shapeB, SpuConvexPolyhedronVertexData* convexDataB, int shapeTypeB, float marginB,
+						  SpuVoronoiSimplexSolver* simplexSolver);
 
 	//virtual ~btSubsimplexConvexCast();
+
 	///SimsimplexConvexCast calculateTimeOfImpact calculates the time of impact+normal for the linear cast (sweep) between two moving objects.
 	///Precondition is that objects should not penetration/overlap at the start from the interval. Overlap can be tested using btGjkPairDetector.
-	bool calcTimeOfImpact(const btTransform& fromA,
-						  const btTransform& toA,
+	bool calcTimeOfImpact(const btTransform& fromRay,
+						  const btTransform& toRay,
 						  const btTransform& fromB,
 						  const btTransform& toB,
 						  SpuCastResult& result);
 
 };
 
-#endif //SUBSIMPLEX_CONVEX_CAST_H
+#endif //SPU_SUBSIMPLEX_RAY_CAST_H
diff --git a/Extras/BulletMultiThreaded/SpuRaycastTaskProcess.cpp b/Extras/BulletMultiThreaded/SpuRaycastTaskProcess.cpp
index 9c453d377..964348f59 100644
--- a/Extras/BulletMultiThreaded/SpuRaycastTaskProcess.cpp
+++ b/Extras/BulletMultiThreaded/SpuRaycastTaskProcess.cpp
@@ -1,172 +1,172 @@
-/*
-Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
-
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-*/
-
-
-#include "SpuRaycastTaskProcess.h"
-
-SpuRaycastTaskProcess::SpuRaycastTaskProcess(class	btThreadSupportInterface*	threadInterface, unsigned int	maxNumOutstandingTasks)
-:m_threadInterface(threadInterface),
-m_maxNumOutstandingTasks(maxNumOutstandingTasks)
-{
-	m_workUnitTaskBuffers = (unsigned char *)0;
-	m_taskBusy.resize(m_maxNumOutstandingTasks);
-	m_spuRaycastTaskDesc.resize(m_maxNumOutstandingTasks);
-
-	for (int i = 0; i < m_maxNumOutstandingTasks; i++)
-	{
-		m_taskBusy[i] = false;
-	}
-	m_numBusyTasks = 0;
-	m_currentTask = 0;
-	m_currentWorkUnitInTask = 0;
-
-	m_threadInterface->startSPU();
-
-	//printf("sizeof vec_float4: %d\n", sizeof(vec_float4));
-	//printf("sizeof SpuGatherAndProcessWorkUnitInput: %d\n", sizeof(SpuGatherAndProcessWorkUnitInput));
-
-}
-
-SpuRaycastTaskProcess::~SpuRaycastTaskProcess()
-{
-	
-	if (m_workUnitTaskBuffers != 0)
-	{
-		btAlignedFree(m_workUnitTaskBuffers);
-		m_workUnitTaskBuffers = 0;
-	}
-	
-	m_threadInterface->stopSPU();	
-}
-
-
-
-void SpuRaycastTaskProcess::initialize2(void* spuCollisionObjectsWrappers, int numSpuCollisionObjectWrappers)
-{
-	m_spuCollisionObjectWrappers = spuCollisionObjectsWrappers;
-	m_numSpuCollisionObjectWrappers = numSpuCollisionObjectWrappers;
-	for (int i = 0; i < m_maxNumOutstandingTasks; i++)
-	{
-		m_taskBusy[i] = false;
-	}
-	m_numBusyTasks = 0;
-	m_currentTask = 0;
-	m_currentWorkUnitInTask = 0;
-
-#ifdef DEBUG_SpuRaycastTaskProcess
-	m_initialized = true;
-#endif
-}
-
-
-void SpuRaycastTaskProcess::issueTask2()
-{
-	m_taskBusy[m_currentTask] = true;
-	m_numBusyTasks++;
-
-	SpuRaycastTaskDesc& taskDesc = m_spuRaycastTaskDesc[m_currentTask];
-
-	taskDesc.taskId = m_currentTask;
-	m_threadInterface->sendRequest(1, (uint32_t) &taskDesc,m_currentTask);
-	//printf("send thread requested for task %d\n", m_currentTask);
-	// if all tasks busy, wait for spu event to clear the task.
-	if (m_numBusyTasks >= m_maxNumOutstandingTasks)
-	{
-		unsigned int taskId;
-		unsigned int outputSize;
-
-		m_threadInterface->waitForResponse(&taskId, &outputSize);
-
-		//printf("PPU: after issue, received event: %u %d\n", taskId, outputSize);
-
-		m_taskBusy[taskId] = false;
-
-		m_numBusyTasks--;
-	} else {
-		//printf("Sent request, not enough busy tasks\n");
-	}
-}
-
-void SpuRaycastTaskProcess::addWorkToTask(SpuRaycastTaskWorkUnit workunit)
-{
-	m_spuRaycastTaskDesc[m_currentTask].workUnits[m_currentWorkUnitInTask] = workunit;
-	m_currentWorkUnitInTask++;
-	if (m_currentWorkUnitInTask == SPU_RAYCAST_WORK_UNITS_PER_TASK)
-	{
-		m_spuRaycastTaskDesc[m_currentTask].numWorkUnits = m_currentWorkUnitInTask;
-		m_spuRaycastTaskDesc[m_currentTask].numSpuCollisionObjectWrappers = m_numSpuCollisionObjectWrappers;
-		m_spuRaycastTaskDesc[m_currentTask].spuCollisionObjectsWrappers = m_spuCollisionObjectWrappers;
-		//printf("Task buffer full, issuing\n");
-		issueTask2 ();
-		//printf("Returned from issueTask2()\n");
-		m_currentWorkUnitInTask = 0;
-
-		// find new task buffer
-		for (unsigned int i = 0; i < m_maxNumOutstandingTasks; i++)
-		{
-			if (!m_taskBusy[i])
-			{
-				m_currentTask = i;
-				//init the task data
-				break;
-			}
-		}
-		//printf("next task = %d\n", m_currentTask);
-	}
-}
-
-
-void 
-SpuRaycastTaskProcess::flush2()
-{
-#ifdef DEBUG_SPU_TASK_SCHEDULING
-	printf("\nSpuRaycastTaskProcess::flush()\n");
-#endif //DEBUG_SPU_TASK_SCHEDULING
-	
-	// if there's a partially filled task buffer, submit that task
-	//printf("Flushing... %d remaining\n", m_currentWorkUnitInTask);
-	if (m_currentWorkUnitInTask > 0)
-	{
-		m_spuRaycastTaskDesc[m_currentTask].numWorkUnits = m_currentWorkUnitInTask;
-		m_spuRaycastTaskDesc[m_currentTask].numSpuCollisionObjectWrappers = m_numSpuCollisionObjectWrappers;
-		m_spuRaycastTaskDesc[m_currentTask].spuCollisionObjectsWrappers = m_spuCollisionObjectWrappers;
-		issueTask2();
-		m_currentWorkUnitInTask = 0;
-	}
-
-
-	// all tasks are issued, wait for all tasks to be complete
-	while(m_numBusyTasks > 0)
-	{
-	  // Consolidating SPU code
-	  unsigned int taskId;
-	  unsigned int outputSize;
-	  
-	  //printf("Busy tasks... %d\n", m_numBusyTasks);
-
-	  {
-			// SPURS support.
-			m_threadInterface->waitForResponse(&taskId, &outputSize);
-		}
-
-		//printf("PPU: flushing, received event: %u %d\n", taskId, outputSize);
-
-		//postProcess(taskId, outputSize);
-
-		m_taskBusy[taskId] = false;
-
-		m_numBusyTasks--;
-	}
-}
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include "SpuRaycastTaskProcess.h"
+
+
+/// Sizes the task bookkeeping for maxNumOutstandingTasks slots, marks every slot
+/// idle, and starts the SPU side through threadInterface->startSPU().
+SpuRaycastTaskProcess::SpuRaycastTaskProcess(class	btThreadSupportInterface*	threadInterface, unsigned int	maxNumOutstandingTasks)
+:m_threadInterface(threadInterface),
+m_maxNumOutstandingTasks(maxNumOutstandingTasks)
+{
+	m_workUnitTaskBuffers = (unsigned char *)0;
+	// Defined values until initialize2() runs; they are copied into task descriptors.
+	m_spuCollisionObjectWrappers = 0;
+	m_numSpuCollisionObjectWrappers = 0;
+	m_taskBusy.resize(m_maxNumOutstandingTasks);
+	m_spuRaycastTaskDesc.resize(m_maxNumOutstandingTasks);
+	// Unsigned index, matching the slot search in addWorkToTask().
+	for (unsigned int i = 0; i < m_maxNumOutstandingTasks; i++)
+	{
+		m_taskBusy[i] = false;
+	}
+	m_numBusyTasks = 0;
+	m_currentTask = 0;
+	m_currentWorkUnitInTask = 0;
+	m_threadInterface->startSPU();
+}
+
+/// Releases the work-unit buffer, if one is held, and then stops the SPU side.
+SpuRaycastTaskProcess::~SpuRaycastTaskProcess()
+{
+	const bool ownsBuffer = (m_workUnitTaskBuffers != 0);
+	if (ownsBuffer)
+	{
+		btAlignedFree(m_workUnitTaskBuffers);
+		m_workUnitTaskBuffers = 0;
+	}
+	m_threadInterface->stopSPU();
+}
+
+
+
+/// Stores the collision-object wrapper array and its count, then marks every
+/// task slot idle and rewinds the current task and work-unit cursors to zero.
+void SpuRaycastTaskProcess::initialize2(void* spuCollisionObjectsWrappers, int numSpuCollisionObjectWrappers)
+{
+	m_spuCollisionObjectWrappers = spuCollisionObjectsWrappers;
+	m_numSpuCollisionObjectWrappers = numSpuCollisionObjectWrappers;
+	// Unsigned index, matching the slot search in addWorkToTask().
+	for (unsigned int i = 0; i < m_maxNumOutstandingTasks; i++)
+		m_taskBusy[i] = false;
+	m_numBusyTasks = 0;
+	m_currentTask = 0;
+	m_currentWorkUnitInTask = 0;
+#ifdef DEBUG_SpuRaycastTaskProcess
+	m_initialized = true;
+#endif
+}
+
+
+/// Marks the current task slot busy and sends its descriptor to the thread
+/// interface. When every slot is then in flight, blocks in waitForResponse()
+/// until one task reports back and releases that task's slot.
+void SpuRaycastTaskProcess::issueTask2()
+{
+	m_taskBusy[m_currentTask] = true;
+	m_numBusyTasks++;
+
+	SpuRaycastTaskDesc& desc = m_spuRaycastTaskDesc[m_currentTask];
+	desc.taskId = m_currentTask;
+	// NOTE(review): the descriptor address is narrowed to uint32_t here; confirm
+	// that pointers fit in 32 bits on every platform this is built for.
+	m_threadInterface->sendRequest(1, (uint32_t) &desc,m_currentTask);
+
+	// Slots still available: nothing to wait for.
+	if (m_numBusyTasks < m_maxNumOutstandingTasks)
+	{
+		return;
+	}
+
+	// All slots are busy: wait for an SPU event to clear one task.
+	unsigned int finishedTask;
+	unsigned int outputSize;
+	m_threadInterface->waitForResponse(&finishedTask, &outputSize);
+	m_taskBusy[finishedTask] = false;
+	m_numBusyTasks--;
+}
+
+/// Appends workunit to the current task buffer; when the buffer reaches
+/// SPU_RAYCAST_WORK_UNITS_PER_TASK units, issues it and selects the first idle slot.
+void SpuRaycastTaskProcess::addWorkToTask(SpuRaycastTaskWorkUnit workunit)
+{
+	m_spuRaycastTaskDesc[m_currentTask].workUnits[m_currentWorkUnitInTask] = workunit;
+	++m_currentWorkUnitInTask;
+	if (m_currentWorkUnitInTask != SPU_RAYCAST_WORK_UNITS_PER_TASK)
+	{
+		return;	// buffer still has room
+	}
+
+	// Buffer full: complete the descriptor and hand the task over.
+	SpuRaycastTaskDesc& fullTask = m_spuRaycastTaskDesc[m_currentTask];
+	fullTask.numWorkUnits = m_currentWorkUnitInTask;
+	fullTask.numSpuCollisionObjectWrappers = m_numSpuCollisionObjectWrappers;
+	fullTask.spuCollisionObjectsWrappers = m_spuCollisionObjectWrappers;
+	issueTask2 ();
+	m_currentWorkUnitInTask = 0;
+	// Pick the lowest-numbered idle slot as the next task buffer.
+	unsigned int slot = 0;
+	while (slot < m_maxNumOutstandingTasks && m_taskBusy[slot])
+	{
+		slot++;
+	}
+	if (slot < m_maxNumOutstandingTasks)
+		m_currentTask = slot;
+}
+
+
+/// Issues a partially filled task buffer, if there is one, and then blocks
+/// in waitForResponse() once per outstanding task.
+/// Waiting ends once m_numBusyTasks is no longer positive.
+void 
+SpuRaycastTaskProcess::flush2()
+{
+#ifdef DEBUG_SPU_TASK_SCHEDULING
+	printf("\nSpuRaycastTaskProcess::flush()\n");
+#endif //DEBUG_SPU_TASK_SCHEDULING
+
+	// Step 1: submit whatever work units are pending in the current task buffer.
+	//printf("Flushing... %d remaining\n", m_currentWorkUnitInTask);
+	const bool hasPendingWork = (m_currentWorkUnitInTask > 0);
+	if (hasPendingWork)
+	{
+		SpuRaycastTaskDesc& pending = m_spuRaycastTaskDesc[m_currentTask];
+
+		pending.numWorkUnits = m_currentWorkUnitInTask;
+		pending.numSpuCollisionObjectWrappers = m_numSpuCollisionObjectWrappers;
+		pending.spuCollisionObjectsWrappers = m_spuCollisionObjectWrappers;
+
+		issueTask2();
+		m_currentWorkUnitInTask = 0;
+	}
+
+	// Step 2: everything is issued; drain responses until no task is in flight.
+	while (m_numBusyTasks > 0)
+	{
+		//printf("Busy tasks... %d\n", m_numBusyTasks);
+		unsigned int finishedTask;
+		unsigned int outputSize;
+
+		m_threadInterface->waitForResponse(&finishedTask, &outputSize);
+		//printf("PPU: flushing, received event: %u %d\n", finishedTask, outputSize);
+
+		//postProcess(finishedTask, outputSize);
+
+		// The reported task's slot can be reused from here on.
+		m_taskBusy[finishedTask] = false;
+		m_numBusyTasks--;
+	}
+}
diff --git a/Extras/BulletMultiThreaded/SpuSolverTask/SpuParallellSolverTask.cpp b/Extras/BulletMultiThreaded/SpuSolverTask/SpuParallellSolverTask.cpp
index fff0e66d2..73fd3643a 100644
--- a/Extras/BulletMultiThreaded/SpuSolverTask/SpuParallellSolverTask.cpp
+++ b/Extras/BulletMultiThreaded/SpuSolverTask/SpuParallellSolverTask.cpp
@@ -17,6 +17,7 @@ Written by: Marten Svanfeldt
 
 #define IN_PARALLELL_SOLVER 1
 
+
 #include "SpuParallellSolverTask.h"
 #include "BulletDynamics/Dynamics/btRigidBody.h"
 #include "BulletCollision/NarrowPhaseCollision/btPersistentManifold.h"