Clean up CL/CUDA example harnesses.

Refactor the CL/CUDA-specific initialization code into examples/common/clDeviceContext and cudaDeviceContext, and update the examples to use those classes. Also: - remove the CL/CUDA tests from osd_regression; the tests for those kernels will be covered by glImaging. - update CUDA initialization to use the GL-interoperable device if available. - remove the CL specialization from glShareTopology, following the same pattern we used in the previous OsdGLMesh refactoring (something is still strange with the XFB kernels, though). - fix file permissions.
2024-09-19 14:20:00 +00:00 · 2015-04-28 15:46:37 -07:00 · 2015-04-28 15:46:37 -07:00 · 82a0513326
commit 82a0513326
parent 99f1b57ba5
27 changed files with 494 additions and 408 deletions
--- a/examples/common/CMakeLists.txt
+++ b/examples/common/CMakeLists.txt
@ -36,8 +36,6 @@ set(EXAMPLES_COMMON_SOURCE_FILES
 )

 set(EXAMPLES_COMMON_HEADER_FILES
-    clInit.h
-    cudaInit.h
    font_image.h
    hdr_reader.h
    hud.h
@ -86,10 +84,29 @@ if(DXSDK_FOUND)

 endif()

-if( OPENCL_FOUND )
+if(OPENCL_FOUND)
    include_directories("${OPENCL_INCLUDE_DIRS}")
+
+    list(APPEND EXAMPLES_COMMON_SOURCE_FILES
+        clDeviceContext.cpp
+    )
+    list(APPEND EXAMPLES_COMMON_HEADER_FILES
+        clDeviceContext.h
+    )
 endif()

+if(CUDA_FOUND)
+    include_directories("${CUDA_INCLUDE_DIRS}")
+
+    list(APPEND EXAMPLES_COMMON_SOURCE_FILES
+        cudaDeviceContext.cpp
+    )
+    list(APPEND EXAMPLES_COMMON_HEADER_FILES
+        cudaDeviceContext.h
+    )
+endif()
+
+
 include_directories(
    "${PROJECT_SOURCE_DIR}/opensubdiv"
    "${CMAKE_CURRENT_BINARY_DIR}"
@ -106,6 +123,6 @@ add_library(examples_common_obj
    OBJECT
        ${EXAMPLES_COMMON_SOURCE_FILES}
        ${EXAMPLES_COMMON_HEADER_FILES}
-        ${INC_FILES}        
+        ${INC_FILES}
 )

--- a/examples/common/clDeviceContext.cpp
+++ b/examples/common/clDeviceContext.cpp
@ -1,5 +1,5 @@
 //
-//   Copyright 2013 Pixar
+//   Copyright 2015 Pixar
 //
 //   Licensed under the Apache License, Version 2.0 (the "Apache License")
 //   with the following modification; you may not use this file except in
@ -22,8 +22,7 @@
 //   language governing permissions and limitations under the Apache License.
 //

-#ifndef OSD_EXAMPLE_CL_INIT_H
-#define OSD_EXAMPLE_CL_INIT_H
+#include "clDeviceContext.h"

 #if defined(_WIN32)
    #include <windows.h>
@ -33,33 +32,44 @@
    #include <GL/glx.h>
 #endif

-#include "osd/opencl.h"
-
 #include <cstdio>
+#include <cstring>
 #include <string>

-static inline bool HAS_CL_VERSION_1_1 () {
-#ifdef OPENSUBDIV_HAS_OPENCL
-     #ifdef OPENSUBDIV_HAS_CLEW
-        static bool clewInitialized = false;
-        static bool clewLoadSuccess;
-        if (not clewInitialized) {
-            clewInitialized = true;
-            clewLoadSuccess = clewInit() == CLEW_SUCCESS;
-            if (not clewLoadSuccess) {
-                fprintf(stderr, "Loading OpenCL failed.\n");
-            }
-        }
-        return clewLoadSuccess;
-    #endif
-    return true;
-#else
-    return false;
-#endif
+CLDeviceContext::CLDeviceContext() :
+    _clContext(NULL), _clCommandQueue(NULL) {
 }

-static bool initCL(cl_context *clContext, cl_command_queue *clQueue)
-{
+// Releases whichever OpenCL handles were created: the command queue first,
+// then the context.  Handles that are still NULL are skipped.
+CLDeviceContext::~CLDeviceContext() {
+
+    if (_clCommandQueue != NULL) {
+        clReleaseCommandQueue(_clCommandQueue);
+    }
+    if (_clContext != NULL) {
+        clReleaseContext(_clContext);
+    }
+}
+
+// Reports whether the OpenCL entry points can be used.
+//
+// When built with OPENSUBDIV_HAS_CLEW, clewInit() is called on the first
+// invocation and its outcome is cached for later calls; a failed load is
+// reported on stderr.  Without CLEW this always returns true.  Note that no
+// OpenCL version is actually queried here.
+/*static*/
+bool
+CLDeviceContext::HAS_CL_VERSION_1_1 () {
+
+#ifdef OPENSUBDIV_HAS_CLEW
+    static bool clewInitialized = false;
+    static bool clewLoadSuccess = false;
+    if (not clewInitialized) {
+        clewInitialized = true;
+        clewLoadSuccess = clewInit() == CLEW_SUCCESS;
+        if (not clewLoadSuccess) {
+            fprintf(stderr, "Loading OpenCL failed.\n");
+        }
+    }
+    return clewLoadSuccess;
+#else
+    // Keep exactly one return per configuration so the CLEW build does not
+    // carry an unreachable statement.
+    return true;
+#endif
+}
+
+bool
+CLDeviceContext::Initialize() {
+
 #ifdef OPENSUBDIV_HAS_CLEW
    if (!clGetPlatformIDs) {
        printf("Error clGetPlatformIDs function not bound.\n");
@ -117,21 +127,21 @@ static bool initCL(cl_context *clContext, cl_command_queue *clQueue)
    int clDeviceUsed = 0;

 #if defined(__APPLE__)
-    *clContext = clCreateContext(props, 0, NULL, clLogMessagesToStdoutAPPLE, NULL, &ciErrNum);
+    _clContext = clCreateContext(props, 0, NULL, clLogMessagesToStdoutAPPLE, NULL, &ciErrNum);
    if (ciErrNum != CL_SUCCESS) {
        printf("Error %d in clCreateContext\n", ciErrNum);
        return false;
    }

    size_t devicesSize = 0;
-    clGetGLContextInfoAPPLE(*clContext, kCGLContext, CL_CGL_DEVICES_FOR_SUPPORTED_VIRTUAL_SCREENS_APPLE, 0, NULL, &devicesSize);
+    clGetGLContextInfoAPPLE(_clContext, kCGLContext, CL_CGL_DEVICES_FOR_SUPPORTED_VIRTUAL_SCREENS_APPLE, 0, NULL, &devicesSize);
    int numDevices = int(devicesSize / sizeof(cl_device_id));
    if (numDevices == 0) {
        printf("No sharable devices.\n");
        return false;
    }
    cl_device_id *clDevices = new cl_device_id[numDevices];
-    clGetGLContextInfoAPPLE(*clContext, kCGLContext, CL_CGL_DEVICES_FOR_SUPPORTED_VIRTUAL_SCREENS_APPLE, numDevices * sizeof(cl_device_id), clDevices, NULL);
+    clGetGLContextInfoAPPLE(_clContext, kCGLContext, CL_CGL_DEVICES_FOR_SUPPORTED_VIRTUAL_SCREENS_APPLE, numDevices * sizeof(cl_device_id), clDevices, NULL);
 #else

    // get the number of GPU devices available to the platform
@ -190,7 +200,8 @@ static bool initCL(cl_context *clContext, cl_command_queue *clQueue)
        return false;
    }

-    *clContext = clCreateContext(props, 1, &clDevices[clDeviceUsed], NULL, NULL, &ciErrNum);
+    _clContext = clCreateContext(props, 1, &clDevices[clDeviceUsed],
+                                 NULL, NULL, &ciErrNum);
    if (ciErrNum != CL_SUCCESS) {
        printf("Error %d in clCreateContext\n", ciErrNum);
        delete[] clDevices;
@ -198,7 +209,8 @@ static bool initCL(cl_context *clContext, cl_command_queue *clQueue)
    }
 #endif

-    *clQueue = clCreateCommandQueue(*clContext, clDevices[clDeviceUsed], 0, &ciErrNum);
+    _clCommandQueue = clCreateCommandQueue(_clContext, clDevices[clDeviceUsed],
+                                    0, &ciErrNum);
    delete[] clDevices;
    if (ciErrNum != CL_SUCCESS) {
        printf("Error %d in clCreateCommandQueue\n", ciErrNum);
@ -207,10 +219,3 @@ static bool initCL(cl_context *clContext, cl_command_queue *clQueue)
    return true;
 }

-static void uninitCL(cl_context clContext, cl_command_queue clQueue)
-{
-    clReleaseCommandQueue(clQueue);
-    clReleaseContext(clContext);
-}
-
-#endif // OSD_EXAMPLE_CL_INIT_H
--- a/examples/common/clDeviceContext.h
+++ b/examples/common/clDeviceContext.h
@ -0,0 +1,57 @@
+//
+//   Copyright 2015 Pixar
+//
+//   Licensed under the Apache License, Version 2.0 (the "Apache License")
+//   with the following modification; you may not use this file except in
+//   compliance with the Apache License and the following modification to it:
+//   Section 6. Trademarks. is deleted and replaced with:
+//
+//   6. Trademarks. This License does not grant permission to use the trade
+//      names, trademarks, service marks, or product names of the Licensor
+//      and its affiliates, except as required to comply with Section 4(c) of
+//      the License and to reproduce the content of the NOTICE file.
+//
+//   You may obtain a copy of the Apache License at
+//
+//       http://www.apache.org/licenses/LICENSE-2.0
+//
+//   Unless required by applicable law or agreed to in writing, software
+//   distributed under the Apache License with the above modification is
+//   distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+//   KIND, either express or implied. See the Apache License for the specific
+//   language governing permissions and limitations under the Apache License.
+//
+
+#ifndef OSD_EXAMPLES_COMMON_CL_DEVICE_CONTEXT_H
+#define OSD_EXAMPLES_COMMON_CL_DEVICE_CONTEXT_H
+
+#include "osd/opencl.h"
+
+// Owns the OpenCL context and command queue used by the examples.
+//
+// Both handles start out NULL, are created by Initialize() and are released
+// by the destructor.
+class CLDeviceContext {
+public:
+    CLDeviceContext();
+    ~CLDeviceContext();
+
+    // Reports whether OpenCL can be used (see the definition for details).
+    static bool HAS_CL_VERSION_1_1 ();
+
+    // Creates the context and the command queue.  Returns false on failure.
+    bool Initialize();
+
+    // True once a context has been created.
+    bool IsInitialized() const {
+        return (_clContext != NULL);
+    }
+
+    // Accessors for the raw handles (NULL until Initialize() creates them).
+    cl_context GetContext() const {
+        return _clContext;
+    }
+    cl_command_queue GetCommandQueue() const {
+        return _clCommandQueue;
+    }
+
+private:
+    // Not copyable: the destructor releases the handles, so a copy would
+    // release them a second time.  Declared but intentionally not defined.
+    CLDeviceContext(CLDeviceContext const &);
+    CLDeviceContext & operator=(CLDeviceContext const &);
+
+    cl_context _clContext;
+    cl_command_queue _clCommandQueue;
+};
+
+
+
+#endif  // OSD_EXAMPLES_COMMON_CL_DEVICE_CONTEXT_H
--- a/examples/common/cudaDeviceContext.cpp
+++ b/examples/common/cudaDeviceContext.cpp
@ -0,0 +1,137 @@
+//
+//   Copyright 2015 Pixar
+//
+//   Licensed under the Apache License, Version 2.0 (the "Apache License")
+//   with the following modification; you may not use this file except in
+//   compliance with the Apache License and the following modification to it:
+//   Section 6. Trademarks. is deleted and replaced with:
+//
+//   6. Trademarks. This License does not grant permission to use the trade
+//      names, trademarks, service marks, or product names of the Licensor
+//      and its affiliates, except as required to comply with Section 4(c) of
+//      the License and to reproduce the content of the NOTICE file.
+//
+//   You may obtain a copy of the Apache License at
+//
+//       http://www.apache.org/licenses/LICENSE-2.0
+//
+//   Unless required by applicable law or agreed to in writing, software
+//   distributed under the Apache License with the above modification is
+//   distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+//   KIND, either express or implied. See the Apache License for the specific
+//   language governing permissions and limitations under the Apache License.
+//
+
+#include "cudaDeviceContext.h"
+
+#if defined(_WIN32)
+    #include <windows.h>
+#elif defined(__APPLE__)
+    #include <OpenGL/OpenGL.h>
+#else
+    #include <X11/Xlib.h>
+    #include <GL/glx.h>
+#endif
+
+#include <cstdio>
+#include <cuda.h>
+#include <cuda_runtime_api.h>
+#include <cuda_gl_interop.h>
+
+#define message(fmt, ...)
+//#define message(fmt, ...)  fprintf(stderr, fmt, __VA_ARGS__)
+#define error(fmt, ...)  fprintf(stderr, fmt, __VA_ARGS__)
+
+// -----------------------------------------------------------------------
+#if CUDA_VERSION < 5000
+// Picks the CUDA device ordinal that is handed to cudaGLSetGLDevice().
+//
+// Variant compiled when CUDA_VERSION < 5000: Windows and OS X always use
+// device 0, X11 uses the default screen number of the display.
+static int _GetCudaDeviceForCurrentGLContext()
+{
+#if defined(_WIN32)
+
+    return 0;
+
+#elif defined(__APPLE__)
+
+    return 0;
+
+#else  // X11
+    // If we don't have a current GL context, then choose the device which
+    // matches the default screen of the default X11 display.
+    Display * display = glXGetCurrentDisplay();
+    if (not display) {
+        display = XOpenDisplay(NULL);
+        if (display) {
+            int screen = DefaultScreen(display);
+            XCloseDisplay(display);
+            message("CUDA init using device for default screen: %d\n", screen);
+            return screen;
+        }
+        return 0;
+    }
+
+    // We can't use the new interop API, so use the device
+    // corresponding to the default screen of the current GL display.
+    int screen = DefaultScreen(display);
+    message("CUDA init using device for screen: %d\n", screen);
+    return screen;
+#endif  // X11
+}
+
+#else   // CUDA_VERSION >= 5000 ------------------------------------------
+// Picks the CUDA device ordinal that is handed to cudaGLSetGLDevice().
+//
+// Variant compiled when CUDA_VERSION >= 5000: asks cudaGLGetDevices() for
+// the device used by the current GL context and falls back to device 0 when
+// the query fails or does not report exactly one device.
+static int _GetCudaDeviceForCurrentGLContext()
+{
+    // Find and use the CUDA device for the current GL context
+    unsigned int interopDeviceCount = 0;
+    int interopDevices[1];
+    cudaError_t status = cudaGLGetDevices(&interopDeviceCount, interopDevices,
+                                          1,  cudaGLDeviceListCurrentFrame);
+    // Treat any failure (not only cudaErrorNoDevice) as "no interop device".
+    if (status != cudaSuccess or interopDeviceCount != 1) {
+        message("CUDA no interop devices found.\n");
+        return 0;
+    }
+    int device = interopDevices[0];
+
+#if defined(_WIN32)
+    return device;
+
+#elif defined(__APPLE__)
+    return device;
+
+#else  // X11
+    // The screen comparison below is advisory only, so skip it when there
+    // is no current GL display instead of dereferencing a NULL Display.
+    Display * display = glXGetCurrentDisplay();
+    if (not display) {
+        return device;
+    }
+    int screen = DefaultScreen(display);
+    if (device != screen) {
+        error("The CUDA interop device (%d) does not match "
+              "the screen used by the current GL context (%d), "
+              "which may cause slow performance on systems "
+              "with multiple GPU devices.\n", device, screen);
+    }
+    message("CUDA init using device for current GL context: %d\n", device);
+    return device;
+#endif
+}
+#endif   // CUDA_VERSION -----------------------------------------------
+
+// A new context starts out uninitialized; Initialize() sets the flag.
+CudaDeviceContext::CudaDeviceContext()
+    : _initialized(false) {
+}
+
+// Resets the CUDA device, but only if Initialize() marked this context as
+// initialized; otherwise the reset would touch a device this object never
+// set up.
+CudaDeviceContext::~CudaDeviceContext() {
+    if (_initialized) {
+        cudaDeviceReset();
+    }
+}
+
+// Selects the CUDA device to use together with the current GL context.
+//
+// Returns false when the device count cannot be queried, when no CUDA device
+// is present, or when the chosen device cannot be selected.  The context is
+// only marked as initialized when every step succeeded.
+bool
+CudaDeviceContext::Initialize() {
+
+    // see if any cuda device is available.
+    int deviceCount = 0;
+    cudaError_t status = cudaGetDeviceCount(&deviceCount);
+    message("CUDA device count: %d\n", deviceCount);
+    if (status != cudaSuccess or deviceCount <= 0) {
+        return false;
+    }
+
+    // Do not claim success if the runtime rejected the device selection.
+    status = cudaGLSetGLDevice(_GetCudaDeviceForCurrentGLContext());
+    if (status != cudaSuccess) {
+        error("CUDA failed to select the GL device: %s\n",
+              cudaGetErrorString(status));
+        return false;
+    }
+    _initialized = true;
+    return true;
+}
--- a/examples/common/cudaDeviceContext.h
+++ b/examples/common/cudaDeviceContext.h
@ -0,0 +1,43 @@
+//
+//   Copyright 2013 Pixar
+//
+//   Licensed under the Apache License, Version 2.0 (the "Apache License")
+//   with the following modification; you may not use this file except in
+//   compliance with the Apache License and the following modification to it:
+//   Section 6. Trademarks. is deleted and replaced with:
+//
+//   6. Trademarks. This License does not grant permission to use the trade
+//      names, trademarks, service marks, or product names of the Licensor
+//      and its affiliates, except as required to comply with Section 4(c) of
+//      the License and to reproduce the content of the NOTICE file.
+//
+//   You may obtain a copy of the Apache License at
+//
+//       http://www.apache.org/licenses/LICENSE-2.0
+//
+//   Unless required by applicable law or agreed to in writing, software
+//   distributed under the Apache License with the above modification is
+//   distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+//   KIND, either express or implied. See the Apache License for the specific
+//   language governing permissions and limitations under the Apache License.
+//
+
+#ifndef OSD_EXAMPLES_COMMON_CUDA_DEVICE_CONTEXT_H
+#define OSD_EXAMPLES_COMMON_CUDA_DEVICE_CONTEXT_H
+
+// Selects the CUDA device used by the examples; the destructor calls
+// cudaDeviceReset().
+class CudaDeviceContext {
+public:
+    CudaDeviceContext();
+    ~CudaDeviceContext();
+
+    // Chooses the CUDA device for the current GL context.  Returns false
+    // when no CUDA device is available.
+    bool Initialize();
+
+    // True once Initialize() has succeeded.
+    bool IsInitialized() const {
+        return _initialized;
+    }
+
+private:
+    // Not copyable: the destructor calls cudaDeviceReset(), so a copy going
+    // out of scope would reset the device under the original.  Declared but
+    // intentionally not defined.
+    CudaDeviceContext(CudaDeviceContext const &);
+    CudaDeviceContext & operator=(CudaDeviceContext const &);
+
+    bool _initialized;
+};
+
+#endif  // OSD_EXAMPLES_COMMON_CUDA_DEVICE_CONTEXT_H
--- a/examples/common/cudaInit.h
+++ b/examples/common/cudaInit.h
@ -1,111 +0,0 @@
-//
-//   Copyright 2013 Pixar
-//
-//   Licensed under the Apache License, Version 2.0 (the "Apache License")
-//   with the following modification; you may not use this file except in
-//   compliance with the Apache License and the following modification to it:
-//   Section 6. Trademarks. is deleted and replaced with:
-//
-//   6. Trademarks. This License does not grant permission to use the trade
-//      names, trademarks, service marks, or product names of the Licensor
-//      and its affiliates, except as required to comply with Section 4(c) of
-//      the License and to reproduce the content of the NOTICE file.
-//
-//   You may obtain a copy of the Apache License at
-//
-//       http://www.apache.org/licenses/LICENSE-2.0
-//
-//   Unless required by applicable law or agreed to in writing, software
-//   distributed under the Apache License with the above modification is
-//   distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-//   KIND, either express or implied. See the Apache License for the specific
-//   language governing permissions and limitations under the Apache License.
-//
-
-#ifndef OSD_CUDA_INIT_H
-#define OSD_CUDA_INIT_H
-
-#include <algorithm>
-#include <cstdio>
-
-// From "NVIDIA GPU Computing SDK 4.2/C/common/inc/cutil_inline_runtime.h":
-
-// Beginning of GPU Architecture definitions
-inline int _ConvertSMVer2Cores_local(int major, int minor)
-{
-    // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM
-    typedef struct {
-        int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version
-        int Cores;
-    } sSMtoCores;
-
-    sSMtoCores nGpuArchCoresPerSM[] =
-    { { 0x10,  8 }, // Tesla Generation (SM 1.0) G80 class
-      { 0x11,  8 }, // Tesla Generation (SM 1.1) G8x class
-      { 0x12,  8 }, // Tesla Generation (SM 1.2) G9x class
-      { 0x13,  8 }, // Tesla Generation (SM 1.3) GT200 class
-      { 0x20, 32 }, // Fermi Generation (SM 2.0) GF100 class
-      { 0x21, 48 }, // Fermi Generation (SM 2.1) GF10x class
-      { 0x30, 192}, // Fermi Generation (SM 3.0) GK10x class
-      {   -1, -1 }
-    };
-
-    int index = 0;
-    while (nGpuArchCoresPerSM[index].SM != -1) {
-        if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor) ) {
-            return nGpuArchCoresPerSM[index].Cores;
-        }
-        index++;
-    }
-    printf("MapSMtoCores undefined SMversion %d.%d!\n", major, minor);
-    return -1;
-}
-// end of GPU Architecture definitions
-
-// This function returns the best GPU (with maximum GFLOPS)
-inline int cutGetMaxGflopsDeviceId()
-{
-    int current_device   = 0, sm_per_multiproc = 0;
-    int max_compute_perf = 0, max_perf_device  = 0;
-    int device_count     = 0, best_SM_arch     = 0;
-    cudaDeviceProp deviceProp;
-
-    cudaGetDeviceCount( &device_count );
-    // Find the best major SM Architecture GPU device
-    while ( current_device < device_count ) {
-        cudaGetDeviceProperties( &deviceProp, current_device );
-        if (deviceProp.major > 0 && deviceProp.major < 9999) {
-            best_SM_arch = std::max(best_SM_arch, deviceProp.major);
-        }
-        current_device++;
-    }
-
-    // Find the best CUDA capable GPU device
-    current_device = 0;
-    while( current_device < device_count ) {
-        cudaGetDeviceProperties( &deviceProp, current_device );
-        if (deviceProp.major == 9999 && deviceProp.minor == 9999) {
-            sm_per_multiproc = 1;
-        } else {
-            sm_per_multiproc = _ConvertSMVer2Cores_local(deviceProp.major, deviceProp.minor);
-        }
-        int compute_perf  = deviceProp.multiProcessorCount * sm_per_multiproc * deviceProp.clockRate;
-        if( compute_perf  > max_compute_perf ) {
-            // If we find GPU with SM major > 2, search only these
-            if ( best_SM_arch > 2 ) {
-                // If our device==dest_SM_arch, choose this, or else pass
-                if (deviceProp.major == best_SM_arch) {
-                    max_compute_perf  = compute_perf;
-                    max_perf_device   = current_device;
-                }
-            } else {
-                max_compute_perf  = compute_perf;
-                max_perf_device   = current_device;
-            }
-        }
-        ++current_device;
-    }
-    return max_perf_device;
-}
-
-#endif //OSD_CUDA_INIT_H
--- a/examples/common/d3d11_hud.cpp
+++ b/examples/common/d3d11_hud.cpp
--- a/examples/glImaging/CMakeLists.txt
+++ b/examples/glImaging/CMakeLists.txt
@ -34,7 +34,6 @@ set(SHADER_FILES

 set(SOURCE_FILES
    glImaging.cpp
-    ../common/patchColors.cpp
 )

 set(PLATFORM_LIBRARIES
@ -59,11 +58,23 @@ _stringify("${SHADER_FILES}" INC_FILES)

 include_directories("${CMAKE_CURRENT_BINARY_DIR}")

-_add_possibly_cuda_executable(glImaging
+# optional dependency - enables screenshots
+# XXX: this is actually unnecessary for this test since glImaging
+# use stb_image_write, however, examples_common_obj has libpng
+# dependency so we need to add here. We'll remove the libpng dependency soon.
+find_package(PNG)
+if (PNG_FOUND)
+    include_directories("${PNG_INCLUDE_DIRS}")
+    list(APPEND PLATFORM_LIBRARIES "${PNG_LIBRARIES}")
+    add_definitions(-DOPENSUBDIV_HAS_PNG)
+endif()
+
+_add_glfw_executable(glImaging
    "${SOURCE_FILES}"
    "${SHADER_FILES}"
    "${INC_FILES}"
    $<TARGET_OBJECTS:regression_common_obj>
+    $<TARGET_OBJECTS:examples_common_obj>
 )

 add_dependencies(glImaging blarg )
--- a/examples/glImaging/glImaging.cpp
+++ b/examples/glImaging/glImaging.cpp
@ -66,15 +66,9 @@
    #include <osd/clComputeContext.h>
    #include <osd/clComputeController.h>

-    #include "../common/clInit.h"
+    #include "../common/clDeviceContext.h"

-    struct CLContext {
-        cl_context GetContext() const { return clContext; }
-        cl_command_queue GetCommandQueue() const { return clQueue; }
-        cl_context clContext;
-        cl_command_queue clQueue;
-    };
-    CLContext g_clContext;
+    CLDeviceContext g_clDeviceContext;
    OpenSubdiv::Osd::CLComputeController *g_clComputeController = NULL;
 #endif

@ -86,7 +80,8 @@
    #include <cuda_runtime_api.h>
    #include <cuda_gl_interop.h>

-    #include "../common/cudaInit.h"
+    #include "../common/cudaDeviceContext.h"
+    CudaDeviceContext g_cudaDeviceContext;

    OpenSubdiv::Osd::CudaComputeController *g_cudaComputeController = NULL;
 #endif
@ -297,17 +292,18 @@ createOsdMesh(std::string const &kernel,
    } else if(kernel == "CL") {
        if (not g_clComputeController) {
            g_clComputeController = new Osd::CLComputeController(
-                g_clContext.clContext, g_clContext.clQueue);
+                g_clDeviceContext.GetContext(),
+                g_clDeviceContext.GetCommandQueue());
        }
        return new Osd::Mesh<Osd::CLGLVertexBuffer,
            Osd::CLComputeController,
            Osd::GLDrawContext,
-            CLContext>(
+            CLDeviceContext>(
                g_clComputeController,
                refiner,
                numVertexElements,
                numVaryingElements,
-                level, bits, &g_clContext);
+                level, bits, &g_clDeviceContext);
 #endif
 #ifdef OPENSUBDIV_HAS_CUDA
    } else if(kernel == "CUDA") {
@ -726,15 +722,22 @@ int main(int argc, char ** argv) {
        // prep GPU kernel
 #ifdef OPENSUBDIV_HAS_OPENCL
        if (kernel == "CL") {
-            if (initCL(&g_clContext.clContext, &g_clContext.clQueue) == false) {
-                std::cout << "Error in initializing OpenCL\n";
-                exit(1);
+            if (g_clDeviceContext.IsInitialized() == false) {
+                if (g_clDeviceContext.Initialize() == false) {
+                    std::cout << "Error in initializing OpenCL\n";
+                    exit(1);
+                }
            }
        }
 #endif
 #ifdef OPENSUBDIV_HAS_CUDA
        if (kernel == "CUDA") {
-            cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() );
+            if (g_cudaDeviceContext.IsInitialized() == false) {
+                if (g_cudaDeviceContext.Initialize() == false) {
+                    std::cout << "Error in initializing Cuda\n";
+                    exit(1);
+                }
+            }
        }
 #endif
        for (size_t i = 0; i < g_shapes.size(); ++i) {
@ -754,12 +757,6 @@ int main(int argc, char ** argv) {

            glfwSwapBuffers(window);
        }
-
-#ifdef OPENSUBDIV_HAS_OPENCL
-        if (kernel == "CL") {
-            uninitCL(g_clContext.clContext, g_clContext.clQueue);
-        }
-#endif
    }

    return 0;
--- a/examples/glPtexViewer/glPtexViewer.cpp
+++ b/examples/glPtexViewer/glPtexViewer.cpp
@ -80,16 +80,9 @@ OpenSubdiv::Osd::CpuComputeController * g_cpuComputeController = NULL;
    #include <osd/clComputeContext.h>
    #include <osd/clComputeController.h>

-    #include "../common/clInit.h"
-
-    struct CLContext {
-        cl_context GetContext() const { return clContext; }
-        cl_command_queue GetCommandQueue() const { return clQueue; }
-        cl_context clContext;
-        cl_command_queue clQueue;
-    };
-    CLContext g_clContext;
+    #include "../common/clDeviceContext.h"

+    CLDeviceContext g_clDeviceContext;
    OpenSubdiv::Osd::CLComputeController * g_clComputeController = NULL;
 #endif

@ -101,9 +94,9 @@ OpenSubdiv::Osd::CpuComputeController * g_cpuComputeController = NULL;
    #include <cuda_runtime_api.h>
    #include <cuda_gl_interop.h>

-    #include "../common/cudaInit.h"
+    #include "../common/cudaDeviceContext.h"

-    bool g_cudaInitialized = false;
+    CudaDeviceContext g_cudaDeviceContext;
    OpenSubdiv::Osd::CudaComputeController * g_cudaComputeController = NULL;
 #endif

@ -1088,17 +1081,18 @@ createOsdMesh(int level, int kernel) {
    } else if (kernel == kCL) {
        if (not g_clComputeController) {
            g_clComputeController = new OpenSubdiv::Osd::CLComputeController(
-                g_clContext.clContext, g_clContext.clQueue);
+                g_clDeviceContext.GetContext(),
+                g_clDeviceContext.GetCommandQueue());
        }
        g_mesh = new OpenSubdiv::Osd::Mesh<OpenSubdiv::Osd::CLGLVertexBuffer,
                                         OpenSubdiv::Osd::CLComputeController,
                                         OpenSubdiv::Osd::GLDrawContext,
-                                         CLContext>(
+                                         CLDeviceContext>(
                                                g_clComputeController,
                                                refiner,
                                                numVertexElements,
                                                numVaryingElements,
-                                                level, bits, &g_clContext);
+                                                level, bits, &g_clDeviceContext);
 #endif
 #ifdef OPENSUBDIV_HAS_CUDA
    } else if (kernel == kCUDA) {
@ -2042,12 +2036,10 @@ void uninitGL() {

 #ifdef OPENSUBDIV_HAS_OPENCL
    delete g_clComputeController;
-    uninitCL(g_clContext.clContext, g_clContext.clQueue);
 #endif

 #ifdef OPENSUBDIV_HAS_CUDA
    delete g_cudaComputeController;
-    cudaDeviceReset();
 #endif

 #ifdef OPENSUBDIV_HAS_GLSL_TRANSFORM_FEEDBACK
@ -2092,13 +2084,21 @@ callbackKernel(int k) {
    g_kernel = k;

 #ifdef OPENSUBDIV_HAS_OPENCL
-    if (g_kernel == kCL and g_clContext.clContext == NULL) {
+    if (g_kernel == kCL and (not g_clDeviceContext.IsInitialized())) {
        // Initialize OpenCL
-        if (initCL(&g_clContext.clContext, &g_clContext.clQueue) == false) {
+        if (g_clDeviceContext.Initialize() == false) {
            printf("Error in initializing OpenCL\n");
            exit(1);
        }
    }
+#endif
+#ifdef OPENSUBDIV_HAS_CUDA
+    if (g_kernel == kCUDA and (not g_cudaDeviceContext.IsInitialized())) {
+        if (g_cudaDeviceContext.Initialize() == false) {
+            printf("Error in initializing Cuda\n");
+            exit(1);
+        }
+    }
 #endif
    createOsdMesh(g_level, g_kernel);
 }
@ -2474,12 +2474,6 @@ int main(int argc, char ** argv) {
    // activate feature adaptive tessellation if OSD supports it
    g_adaptive = OpenSubdiv::Osd::GLDrawContext::SupportsAdaptiveTessellation();

-#if OPENSUBDIV_HAS_CUDA
-    // Note: This function randomly crashes with linux 5.0-dev driver.
-    // cudaGetDeviceProperties overrun stack..?
-    cudaGLSetGLDevice(cutGetMaxGflopsDeviceId());
-#endif
-
    int windowWidth = g_width, windowHeight = g_height;

    // window size might not match framebuffer size on a high DPI display
@ -2541,7 +2535,7 @@ int main(int argc, char ** argv) {
    g_hud.AddPullDownButton(compute_pulldown, "CUDA", kCUDA);
 #endif
 #ifdef OPENSUBDIV_HAS_OPENCL
-    if (HAS_CL_VERSION_1_1()) {
+    if (CLDeviceContext::HAS_CL_VERSION_1_1()) {
        g_hud.AddPullDownButton(compute_pulldown, "OpenCL", kCL);
    }
 #endif
--- a/examples/glShareTopology/glShareTopology.cpp
+++ b/examples/glShareTopology/glShareTopology.cpp
@ -52,48 +52,50 @@ GLFWmonitor* g_primary=0;
 #include <osd/cpuGLVertexBuffer.h>
 #include <osd/cpuComputeContext.h>
 #include <osd/cpuComputeController.h>
+OpenSubdiv::Osd::CpuComputeController *g_cpuComputeController = NULL;

 #ifdef OPENSUBDIV_HAS_OPENMP
    #include <osd/ompComputeController.h>
+    OpenSubdiv::Osd::OmpComputeController *g_ompComputeController = NULL;
 #endif

 #ifdef OPENSUBDIV_HAS_TBB
    #include <osd/tbbComputeController.h>
+    OpenSubdiv::Osd::TbbComputeController *g_tbbComputeController = NULL;
 #endif

 #ifdef OPENSUBDIV_HAS_OPENCL
    #include <osd/clGLVertexBuffer.h>
    #include <osd/clComputeContext.h>
    #include <osd/clComputeController.h>
+    OpenSubdiv::Osd::CLComputeController *g_clComputeController = NULL;

-    #include "../common/clInit.h"
-
-    cl_context g_clContext;
-    cl_command_queue g_clQueue;
+    #include "../common/clDeviceContext.h"
+    CLDeviceContext g_clDeviceContext;
 #endif

 #ifdef OPENSUBDIV_HAS_CUDA
    #include <osd/cudaGLVertexBuffer.h>
    #include <osd/cudaComputeContext.h>
    #include <osd/cudaComputeController.h>
+    OpenSubdiv::Osd::CudaComputeController *g_cudaComputeController = NULL;

-    #include <cuda_runtime_api.h>
-    #include <cuda_gl_interop.h>
-
-    #include "../common/cudaInit.h"
-    bool g_cudaInitialized = false;
+    #include "../common/cudaDeviceContext.h"
+    CudaDeviceContext g_cudaDeviceContext;
 #endif

 #ifdef OPENSUBDIV_HAS_GLSL_TRANSFORM_FEEDBACK
    #include <osd/glslTransformFeedbackComputeContext.h>
    #include <osd/glslTransformFeedbackComputeController.h>
    #include <osd/glVertexBuffer.h>
+    OpenSubdiv::Osd::GLSLTransformFeedbackComputeController *g_glslXFBComputeController = NULL;
 #endif

 #ifdef OPENSUBDIV_HAS_GLSL_COMPUTE
    #include <osd/glslComputeContext.h>
    #include <osd/glslComputeController.h>
    #include <osd/glVertexBuffer.h>
+    OpenSubdiv::Osd::GLSLComputeController *g_glslComputeController = NULL;
 #endif


@ -155,16 +157,18 @@ private:
    int _numVertices;                // # of vertices of single instance
 };

-template <class VERTEX_BUFFER>
+template <class VERTEX_BUFFER, class DEVICE_CONTEXT>
 class Instances : public InstancesBase {
 public:
    Instances(int numInstances,
              Osd::VertexBufferDescriptor const &vertexDesc,
              Osd::VertexBufferDescriptor const &varyingDesc,
              bool interleaved,
-              int numVertices) :
+              int numVertices,
+              DEVICE_CONTEXT *deviceContext) :
        InstancesBase(vertexDesc, varyingDesc, numVertices),
-        _vertexBuffer(NULL), _varyingBuffer(NULL), _interleaved(interleaved) {
+        _vertexBuffer(NULL), _varyingBuffer(NULL), _interleaved(interleaved),
+        _deviceContext(deviceContext) {

        if (interleaved) {
            assert(vertexDesc.stride == varyingDesc.stride);
@ -206,11 +210,12 @@ public:
    }

    VERTEX_BUFFER *createVertexBuffer(int numElements, int numVertices) {
-        return VERTEX_BUFFER::Create(numElements, numVertices);
+        return VERTEX_BUFFER::Create(numElements, numVertices, _deviceContext);
    }
-    void updateVertexBuffer(VERTEX_BUFFER *vertexBuffer, const float *src, int startVertex,
+    void updateVertexBuffer(VERTEX_BUFFER *vertexBuffer,
+                            const float *src, int startVertex,
                            int numVertices) {
-        vertexBuffer->UpdateData(src, startVertex, numVertices);
+        vertexBuffer->UpdateData(src, startVertex, numVertices, _deviceContext);
    }

    VERTEX_BUFFER *GetVertexBuffer() const { return _vertexBuffer; }
@ -220,6 +225,7 @@ private:
    VERTEX_BUFFER *_vertexBuffer;
    VERTEX_BUFFER *_varyingBuffer;
    bool _interleaved;
+    DEVICE_CONTEXT *_deviceContext;
 };

 // ---------------------------------------------------------------------------
@ -282,19 +288,27 @@ private:
    std::vector<float> _restPosition;
 };

-template <class COMPUTE_CONTROLLER, class VERTEX_BUFFER>
+template <class COMPUTE_CONTROLLER, class VERTEX_BUFFER,
+          class DEVICE_CONTEXT=void>
 class Topology : public TopologyBase {

 public:

+    typedef COMPUTE_CONTROLLER ComputeController;
    typedef typename COMPUTE_CONTROLLER::ComputeContext ComputeContext;
+    typedef DEVICE_CONTEXT DeviceContext;

-    Topology(Far::PatchTables const * patchTables,
-        Far::StencilTables const * vertexStencils,
-            Far::StencilTables const * varyingStencils)
-                : TopologyBase(patchTables) {
+    Topology(ComputeController * computeController,
+             Far::PatchTables const * patchTables,
+             Far::StencilTables const * vertexStencils,
+             Far::StencilTables const * varyingStencils,
+             DeviceContext * deviceContext = NULL)
+        : TopologyBase(patchTables),
+          _computeController(computeController),
+          _deviceContext(deviceContext) {

-        _computeContext = ComputeContext::Create(vertexStencils, varyingStencils);
+        _computeContext = ComputeContext::Create(
+            vertexStencils, varyingStencils, deviceContext);

        _numVertices = vertexStencils->GetNumStencils() +
            vertexStencils->GetNumControlVertices();
@ -311,8 +325,8 @@ public:
        Osd::VertexBufferDescriptor const &globalVaryingDesc =
            instance->GetVaryingDesc();

-        Instances<VERTEX_BUFFER> *typedInstance =
-            static_cast<Instances<VERTEX_BUFFER> *>(instance);
+        Instances<VERTEX_BUFFER, DEVICE_CONTEXT> *typedInstance =
+            static_cast<Instances<VERTEX_BUFFER, DEVICE_CONTEXT> *>(instance);

        for (int i = 0; i < numInstances; ++i) {

@ -326,11 +340,11 @@ public:
                globalVaryingDesc.length,
                globalVaryingDesc.stride);

-            _computeController.Compute(_computeContext,
-                                      typedInstance->GetVertexBuffer(),
-                                      typedInstance->GetVaryingBuffer(),
-                                      &vertexDesc,
-                                      &varyingDesc);
+            _computeController->Compute(_computeContext,
+                                        typedInstance->GetVertexBuffer(),
+                                        typedInstance->GetVaryingBuffer(),
+                                        &vertexDesc,
+                                        &varyingDesc);
        }
    }

@ -340,65 +354,29 @@ public:
        Osd::VertexBufferDescriptor const &varyingDesc,
        bool interleaved) {

-        return new Instances<VERTEX_BUFFER>(numInstances,
-                                            vertexDesc,
-                                            varyingDesc,
-                                            interleaved,
-                                            _numVertices);
+        return new Instances<VERTEX_BUFFER, DEVICE_CONTEXT>(
+            numInstances, vertexDesc, varyingDesc,
+            interleaved, _numVertices, _deviceContext);
    }

    virtual void Synchronize() {
-        _computeController.Synchronize();
+        _computeController->Synchronize();
    }

    virtual void UpdateVertexTexture(InstancesBase *instances) {
-        Instances<VERTEX_BUFFER> *typedInstance =
-            static_cast<Instances<VERTEX_BUFFER> *>(instances);
+        Instances<VERTEX_BUFFER, DEVICE_CONTEXT> *typedInstance =
+            static_cast<Instances<VERTEX_BUFFER, DEVICE_CONTEXT> *>(instances);
        GetDrawContext()->UpdateVertexTexture(typedInstance->GetVertexBuffer());

        updateVertexBufferStride(typedInstance->GetVertexBuffer()->GetNumElements());
    }

 private:
-    COMPUTE_CONTROLLER _computeController;
+    ComputeController *_computeController;
    ComputeContext *_computeContext;
+    DeviceContext *_deviceContext;
 };

-// ---------------------------------------------------------------------------
-
-// CL specializations
-#ifdef OPENSUBDIV_HAS_OPENCL
-
-template<> Osd::CLGLVertexBuffer *
-Instances<Osd::CLGLVertexBuffer>::createVertexBuffer(
-    int numElements, int numVertices) {
-    return Osd::CLGLVertexBuffer::Create(
-        numElements, numVertices, g_clContext);
-}
-
-template<> void
-Instances<Osd::CLGLVertexBuffer>::updateVertexBuffer(
-    Osd::CLGLVertexBuffer *vertexBuffer,
-    const float *src, int startVertex, int numVertices) {
-    vertexBuffer->UpdateData(src, startVertex, numVertices, g_clQueue);
-}
-
-template<>
-Topology<Osd::CLComputeController, Osd::CLGLVertexBuffer>::
-Topology(Far::PatchTables const * patchTables,
-    Far::StencilTables const * vertexStencils, Far::StencilTables const * varyingStencils) :
-        TopologyBase(patchTables), _computeController(g_clContext, g_clQueue) {
-
-    _computeContext = ComputeContext::Create(vertexStencils, varyingStencils, g_clContext);
-
-    _numVertices = vertexStencils->GetNumStencils() +
-        vertexStencils->GetNumControlVertices();
-}
-#endif
-
-// ---------------------------------------------------------------------------
-
-
 TopologyBase *g_topology = NULL;
 InstancesBase *g_instances = NULL;

@ -616,7 +594,8 @@ createOsdMesh( const std::string &shapeStr, int level, Scheme scheme=kCatmark )
    bool doAdaptive = (g_adaptive!=0 and scheme==kCatmark);

    if (doAdaptive) {
-        refiner->RefineAdaptive(Far::TopologyRefiner::AdaptiveOptions(level));
+        Far::TopologyRefiner::AdaptiveOptions options(level);
+        refiner->RefineAdaptive(options);
    } else {
        Far::TopologyRefiner::UniformOptions options(level);
        options.fullTopologyInLastLevel = true;
@ -638,45 +617,82 @@ createOsdMesh( const std::string &shapeStr, int level, Scheme scheme=kCatmark )
        assert(vertexStencils);
    }

-    Far::PatchTables const * patchTables =
-        Far::PatchTablesFactory::Create(*refiner);
+    Far::PatchTables const * patchTables = NULL;
+    {
+        Far::PatchTablesFactory::Options poptions(level);
+        poptions.SetEndCapType(
+            Far::PatchTablesFactory::Options::ENDCAP_LEGACY_GREGORY);
+        patchTables = Far::PatchTablesFactory::Create(*refiner, poptions);
+    }


    // create partitioned patcharray
    TopologyBase *topology = NULL;

    if (g_kernel == kCPU) {
+        if (not g_cpuComputeController)
+            g_cpuComputeController = new Osd::CpuComputeController();
        topology = new Topology<Osd::CpuComputeController,
-            Osd::CpuGLVertexBuffer>(patchTables, vertexStencils, varyingStencils);
+            Osd::CpuGLVertexBuffer>(g_cpuComputeController,
+                                    patchTables,
+                                    vertexStencils, varyingStencils);
 #ifdef OPENSUBDIV_HAS_OPENMP
    } else if (g_kernel == kOPENMP) {
+        if (not g_ompComputeController)
+            g_ompComputeController = new Osd::OmpComputeController();
        topology = new Topology<Osd::OmpComputeController,
-            Osd::CpuGLVertexBuffer>(patchTables, vertexStencils, varyingStencils);
+            Osd::CpuGLVertexBuffer>(g_ompComputeController,
+                                    patchTables,
+                                    vertexStencils, varyingStencils);
 #endif
 #ifdef OPENSUBDIV_HAS_TBB
    } else if (g_kernel == kTBB) {
+        if (not g_tbbComputeController)
+            g_tbbComputeController = new Osd::TbbComputeController();
        topology = new Topology<Osd::TbbComputeController,
-            Osd::CpuGLVertexBuffer>(patchTables, vertexStencils, varyingStencils);
+            Osd::CpuGLVertexBuffer>(g_tbbComputeController,
+                                    patchTables,
+                                    vertexStencils, varyingStencils);
 #endif
 #ifdef OPENSUBDIV_HAS_CUDA
    } else if (g_kernel == kCUDA) {
+        if (not g_cudaComputeController)
+            g_cudaComputeController = new Osd::CudaComputeController();
        topology = new Topology<Osd::CudaComputeController,
-            Osd::CudaGLVertexBuffer>(patchTables, vertexStencils, varyingStencils);
+            Osd::CudaGLVertexBuffer>(g_cudaComputeController,
+                                     patchTables,
+                                     vertexStencils, varyingStencils);
 #endif
 #ifdef OPENSUBDIV_HAS_OPENCL
    } else if (g_kernel == kCL) {
+        if (not g_clComputeController)
+            g_clComputeController = new Osd::CLComputeController(
+                g_clDeviceContext.GetContext(),
+                g_clDeviceContext.GetCommandQueue());
        topology = new Topology<Osd::CLComputeController,
-            Osd::CLGLVertexBuffer>(patchTables, vertexStencils, varyingStencils);
+            Osd::CLGLVertexBuffer,
+            CLDeviceContext>(g_clComputeController,
+                             patchTables,
+                             vertexStencils, varyingStencils,
+                             &g_clDeviceContext);
 #endif
 #ifdef OPENSUBDIV_HAS_GLSL_TRANSFORM_FEEDBACK
    } else if (g_kernel == kGLSL) {
+        if (not g_glslXFBComputeController)
+            g_glslXFBComputeController = new Osd::GLSLTransformFeedbackComputeController();
        topology = new Topology<Osd::GLSLTransformFeedbackComputeController,
-            Osd::GLVertexBuffer>(patchTables, vertexStencils, varyingStencils);
+            Osd::GLVertexBuffer>(g_glslXFBComputeController,
+                                 patchTables,
+                                 vertexStencils, varyingStencils);
 #endif
 #ifdef OPENSUBDIV_HAS_GLSL_COMPUTE
    } else if (g_kernel == kGLSLCompute) {
+        if (not g_glslComputeController)
+            g_glslComputeController = new Osd::GLSLComputeController();
        topology = new Topology<Osd::GLSLComputeController,
-            Osd::GLVertexBuffer>(patchTables, vertexStencils, varyingStencils);
+            Osd::GLVertexBuffer>(g_glslComputeController,
+                                 patchTables,
+                                 vertexStencils, varyingStencils);
 #endif
    } else {
    }
@ -1257,10 +1273,27 @@ uninitGL() {
    if (g_topology)
        delete g_topology;

-#ifdef OPENSUBDIV_HAS_OPENCL
-    uninitCL(g_clContext, g_clQueue);
+    delete g_cpuComputeController;
+
+#ifdef OPENSUBDIV_HAS_OPENMP
+    delete g_ompComputeController;
 #endif

+#ifdef OPENSUBDIV_HAS_TBB
+    delete g_tbbComputeController;
+#endif
+#ifdef OPENSUBDIV_HAS_OPENCL
+    delete g_clComputeController;
+#endif
+#ifdef OPENSUBDIV_HAS_CUDA
+    delete g_cudaComputeController;
+#endif
+#ifdef OPENSUBDIV_HAS_GLSL_TRANSFORM_FEEDBACK
+    delete g_glslXFBComputeController;
+#endif
+#ifdef OPENSUBDIV_HAS_GLSL_COMPUTE
+    delete g_glslComputeController;
+#endif
 }

 //------------------------------------------------------------------------------
@ -1363,8 +1396,8 @@ callbackKernel(int k) {
    g_kernel = k;

 #ifdef OPENSUBDIV_HAS_OPENCL
-    if (g_kernel == kCL and g_clContext == NULL) {
-        if (initCL(&g_clContext, &g_clQueue) == false) {
+    if (g_kernel == kCL and (not g_clDeviceContext.IsInitialized())) {
+        if (g_clDeviceContext.Initialize() == false) {
            printf("Error in initializing OpenCL\n");
            exit(1);
        }
@ -1372,9 +1405,11 @@ callbackKernel(int k) {
 #endif

 #ifdef OPENSUBDIV_HAS_CUDA
-    if (g_kernel == kCUDA and g_cudaInitialized == false) {
-        g_cudaInitialized = true;
-        cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() );
+    if (g_kernel == kCUDA and (not g_cudaDeviceContext.IsInitialized())) {
+        if (g_cudaDeviceContext.Initialize() == false) {
+            printf("Error in initializing Cuda\n");
+            exit(1);
+        }
    }
 #endif

@ -1457,7 +1492,7 @@ initHUD() {
    g_hud.AddPullDownButton(compute_pulldown, "CUDA", kCUDA);
 #endif
 #ifdef OPENSUBDIV_HAS_OPENCL
-    if (HAS_CL_VERSION_1_1()) {
+    if (CLDeviceContext::HAS_CL_VERSION_1_1()) {
        g_hud.AddPullDownButton(compute_pulldown, "OpenCL", kCL);
    }
 #endif
--- a/examples/glViewer/glViewer.cpp
+++ b/examples/glViewer/glViewer.cpp
@ -67,16 +67,9 @@ OpenSubdiv::Osd::CpuComputeController *g_cpuComputeController = NULL;
    #include <osd/clComputeContext.h>
    #include <osd/clComputeController.h>

-    #include "../common/clInit.h"
-
-    struct CLContext {
-        cl_context GetContext() const { return clContext; }
-        cl_command_queue GetCommandQueue() const { return clQueue; }
-        cl_context clContext;
-        cl_command_queue clQueue;
-    };
-    CLContext g_clContext;
+    #include "../common/clDeviceContext.h"

+    CLDeviceContext g_clDeviceContext;
    OpenSubdiv::Osd::CLComputeController *g_clComputeController = NULL;
 #endif

@ -88,9 +81,9 @@ OpenSubdiv::Osd::CpuComputeController *g_cpuComputeController = NULL;
    #include <cuda_runtime_api.h>
    #include <cuda_gl_interop.h>

-    #include "../common/cudaInit.h"
+    #include "../common/cudaDeviceContext.h"

-    bool g_cudaInitialized = false;
+    CudaDeviceContext g_cudaDeviceContext;
    OpenSubdiv::Osd::CudaComputeController *g_cudaComputeController = NULL;
 #endif

@ -601,17 +594,18 @@ createOsdMesh(ShapeDesc const & shapeDesc, int level, int kernel, Scheme scheme=
    } else if(kernel == kCL) {
        if (not g_clComputeController) {
            g_clComputeController = new OpenSubdiv::Osd::CLComputeController(
-                g_clContext.clContext, g_clContext.clQueue);
+                g_clDeviceContext.GetContext(),
+                g_clDeviceContext.GetCommandQueue());
        }
        g_mesh = new OpenSubdiv::Osd::Mesh<OpenSubdiv::Osd::CLGLVertexBuffer,
                                         OpenSubdiv::Osd::CLComputeController,
                                         OpenSubdiv::Osd::GLDrawContext,
-                                         CLContext>(
+                                         CLDeviceContext>(
                                                g_clComputeController,
                                                refiner,
                                                numVertexElements,
                                                numVaryingElements,
-                                                level, bits, &g_clContext);
+                                                level, bits, &g_clDeviceContext);
 #endif
 #ifdef OPENSUBDIV_HAS_CUDA
    } else if(kernel == kCUDA) {
@ -1406,11 +1400,9 @@ uninitGL() {
 #endif
 #ifdef OPENSUBDIV_HAS_OPENCL
    delete g_clComputeController;
-    uninitCL(g_clContext.clContext, g_clContext.clQueue);
 #endif
 #ifdef OPENSUBDIV_HAS_CUDA
    delete g_cudaComputeController;
-    cudaDeviceReset();
 #endif
 #ifdef OPENSUBDIV_HAS_GLSL_TRANSFORM_FEEDBACK
    delete g_glslTransformFeedbackComputeController;
@ -1495,17 +1487,19 @@ callbackKernel(int k) {
    g_kernel = k;

 #ifdef OPENSUBDIV_HAS_OPENCL
-    if (g_kernel == kCL and g_clContext.clContext == NULL) {
-        if (initCL(&g_clContext.clContext, &g_clContext.clQueue) == false) {
+    if (g_kernel == kCL and (not g_clDeviceContext.IsInitialized())) {
+        if (g_clDeviceContext.Initialize() == false) {
            printf("Error in initializing OpenCL\n");
            exit(1);
        }
    }
 #endif
 #ifdef OPENSUBDIV_HAS_CUDA
-    if (g_kernel == kCUDA and g_cudaInitialized == false) {
-        g_cudaInitialized = true;
-        cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() );
+    if (g_kernel == kCUDA and (not g_cudaDeviceContext.IsInitialized())) {
+        if (g_cudaDeviceContext.Initialize() == false) {
+            printf("Error in initializing Cuda\n");
+            exit(1);
+        }
    }
 #endif

@ -1629,7 +1623,7 @@ initHUD() {
    g_hud.AddPullDownButton(compute_pulldown, "CUDA", kCUDA);
 #endif
 #ifdef OPENSUBDIV_HAS_OPENCL
-    if (HAS_CL_VERSION_1_1()) {
+    if (CLDeviceContext::HAS_CL_VERSION_1_1()) {
        g_hud.AddPullDownButton(compute_pulldown, "OpenCL", kCL);
    }
 #endif
--- a/opensubdiv/osd/clD3D11VertexBuffer.cpp
+++ b/opensubdiv/osd/clD3D11VertexBuffer.cpp
--- a/opensubdiv/osd/clD3D11VertexBuffer.h
+++ b/opensubdiv/osd/clD3D11VertexBuffer.h
--- a/opensubdiv/osd/cpuComputeContext.cpp
+++ b/opensubdiv/osd/cpuComputeContext.cpp
--- a/opensubdiv/osd/cpuD3D11VertexBuffer.h
+++ b/opensubdiv/osd/cpuD3D11VertexBuffer.h
--- a/opensubdiv/osd/cpuGLVertexBuffer.cpp
+++ b/opensubdiv/osd/cpuGLVertexBuffer.cpp
--- a/opensubdiv/osd/cudaComputeContext.cpp
+++ b/opensubdiv/osd/cudaComputeContext.cpp
--- a/opensubdiv/osd/cudaD3D11VertexBuffer.cpp
+++ b/opensubdiv/osd/cudaD3D11VertexBuffer.cpp
@ -51,7 +51,7 @@ CudaD3D11VertexBuffer *
 CudaD3D11VertexBuffer::Create(int numElements, int numVertices,
                              ID3D11DeviceContext *deviceContext) {
    CudaD3D11VertexBuffer *instance =
-        new CudaD3D11VertexBuffer(numElements, numVertices, device);
+        new CudaD3D11VertexBuffer(numElements, numVertices);

    ID3D11Device *device;
    deviceContext->GetDevice(&device);
--- a/opensubdiv/osd/cudaGLVertexBuffer.cpp
+++ b/opensubdiv/osd/cudaGLVertexBuffer.cpp
--- a/opensubdiv/osd/d3d11DrawContext.h
+++ b/opensubdiv/osd/d3d11DrawContext.h
--- a/opensubdiv/osd/d3d11Mesh.h
+++ b/opensubdiv/osd/d3d11Mesh.h
--- a/opensubdiv/osd/glVertexBuffer.cpp
+++ b/opensubdiv/osd/glVertexBuffer.cpp
--- a/opensubdiv/osd/glslComputeContext.cpp
+++ b/opensubdiv/osd/glslComputeContext.cpp
--- a/opensubdiv/osd/glslTransformFeedbackComputeContext.cpp
+++ b/opensubdiv/osd/glslTransformFeedbackComputeContext.cpp
--- a/regression/osd_regression/CMakeLists.txt
+++ b/regression/osd_regression/CMakeLists.txt
@ -42,15 +42,7 @@ if ( GLEW_FOUND )
    list(APPEND PLATFORM_LIBRARIES "${GLEW_LIBRARY}")
 endif()

-if ( OPENCL_FOUND )
-    list(APPEND PLATFORM_LIBRARIES
-        "${OPENCL_LIBRARIES}"
-    )
-    include_directories( "${OPENCL_INCLUDE_DIRS}" )
-endif()
-
-
-_add_possibly_cuda_executable(osd_regression
+_add_executable(osd_regression
    "${SOURCE_FILES}"
    $<TARGET_OBJECTS:regression_common_obj>
 )
--- a/regression/osd_regression/main.cpp
+++ b/regression/osd_regression/main.cpp
@ -55,19 +55,6 @@ GLFWwindow* g_window=0;

 #include <far/stencilTablesFactory.h>

-#ifdef OPENSUBDIV_HAS_CUDA
-#endif
-
-#ifdef OPENSUBDIV_HAS_OPENCL
-    #include <osd/clComputeContext.h>
-    #include <osd/clComputeController.h>
-    #include <osd/clGLVertexBuffer.h>
-    static cl_context g_clContext;
-    static cl_command_queue g_clQueue;
-    #include "../../examples/common/clInit.h" // XXXX TODO move file out of examples
-#endif
-
-
 #include "../../regression/common/cmp_utils.h"
 #include "../../regression/common/hbr_utils.h"
 #include "../../regression/common/vtr_utils.h"
@ -91,14 +78,12 @@ using namespace OpenSubdiv;
 enum BackendType {
    kBackendCPU   = 0, // raw CPU
    kBackendCPUGL = 1, // CPU with GL-backed buffer
-    kBackendCL    = 2, // OpenCL
    kBackendCount
 };

 static const char* g_BackendNames[kBackendCount] = {
    "CPU",
    "CPUGL",
-    "CL",
 };

 static int g_Backend = -1;
@ -350,54 +335,6 @@ checkMeshCPUGL(FarTopologyRefiner *refiner,
    return result;
 }

-
-//------------------------------------------------------------------------------
-static int 
-checkMeshCL( FarTopologyRefiner *refiner,
-             const std::vector<xyzVV>& coarseverts,
-             xyzmesh * refmesh) {
-        
-#ifdef OPENSUBDIV_HAS_OPENCL
-
-    static Osd::CLComputeController *controller = 
-        new Osd::CLComputeController(g_clContext, g_clQueue);
-
-    Far::StencilTables const *vertexStencils;
-    Far::StencilTables const *varyingStencils;
-    buildStencilTables(*refiner, &vertexStencils, &varyingStencils);
-    Osd::CLComputeContext *context = Osd::CLComputeContext::Create(
-        vertexStencils, varyingStencils, g_clContext);
-
-    Osd::CLGLVertexBuffer *vb = 
-        Osd::CLGLVertexBuffer::Create(3, refiner->GetNumVerticesTotal(), 
-            g_clContext);
-    
-    vb->UpdateData( coarseverts[0].GetPos(), 0, (int)coarseverts.size(),
-        g_clQueue );
-
-    controller->Compute( context, vb );
-
-    // read data back from CL buffer
-    size_t dataSize = vb->GetNumVertices() * vb->GetNumElements();
-    float* data = new float[dataSize];
-    
-    clEnqueueReadBuffer (g_clQueue, vb->BindCLBuffer(g_clQueue), CL_TRUE, 0, dataSize * sizeof(float), data, 0, NULL, NULL);
-    
-    int result = checkVertexBuffer(
-        *refiner, refmesh, data, vb->GetNumElements());
-    
-    delete[] data;
-    delete context;
-    delete vertexStencils;
-    delete varyingStencils;
-    delete vb;
-    
-    return result;
-#else
-    return 0;
-#endif
-}
-
 //------------------------------------------------------------------------------
 static int 
 checkMesh( char const * msg, std::string const & shape, int levels, Scheme scheme, int backend ) {
@ -422,9 +359,6 @@ checkMesh( char const * msg, std::string const & shape, int levels, Scheme schem
        case kBackendCPUGL: 
            result = checkMeshCPUGL(refiner, vtrVertexData, refmesh); 
            break;
-        case kBackendCL: 
-            result = checkMeshCL(refiner, vtrVertexData, refmesh);
-            break;
    }

    delete refmesh;
@ -438,18 +372,6 @@ int checkBackend(int backend, int levels) {

    printf("*** checking backend : %s\n", g_BackendNames[backend]);

-    if (backend == kBackendCL) {
-#ifdef OPENSUBDIV_HAS_OPENCL
-        if (initCL(&g_clContext, &g_clQueue) == false) {
-            printf("  Cannot initialize OpenCL, skipping...\n");
-            return 0;
-        }
-#else
-        printf("  No OpenCL available, skipping...\n");
-        return 0;
-#endif
-    }
-
    int total = 0;

 #define test_catmark_edgeonly
@ -652,13 +574,6 @@ int checkBackend(int backend, int levels) {
    total += checkMesh( "test_bilinear_cube", bilinear_cube, levels, kBilinear, backend );
 #endif

-
-    if (backend == kBackendCL) {
-#ifdef OPENSUBDIV_HAS_OPENCL
-        uninitCL(g_clContext, g_clQueue);
-#endif
-    }
-
    return total;
 }