OpenGL interop is simplified when the cl_context is not created by SKC.

Added GEN9 HotSort kernels so the hs_cl_gen9 lib and hs_bench_cl app can be built.

Bug: skia:
Change-Id: I5b21d33499a6ec3524f39a51443981802b722c8b
Reviewed-on: https://skia-review.googlesource.com/136608
Commit-Queue: Allan MacKinnon <allanmac@google.com>
Reviewed-by: Mike Reed <reed@google.com>
Reviewed-by: Mike Klein <mtklein@google.com>
Allan MacKinnon 2018-06-21 09:09:56 -07:00 committed by Skia Commit-Bot
parent 867ce8fc8e
commit c110e7941e
36 changed files with 20693 additions and 10527 deletions
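
The heart of the change is visible in the demo and context-creation diffs below: the application now locates the platform and device, builds the GL-sharing cl_context itself, and hands both to the new skc_context_create_cl() entry point instead of letting SKC construct the context internally. Condensed from those diffs (Windows/WGL interop path; cl() and cl_ok() are the error-check macros from assert_cl.h, and the GL handles are assumed to come from the current WGL context):

// The application -- not SKC -- owns the cl_context.
cl_platform_id platform_id_cl;
cl_device_id   device_id_cl;

cl(FindIdsByName("Intel","Graphics",
                 &platform_id_cl,&device_id_cl,
                 0,NULL,NULL,
                 true));

cl_context_properties context_properties_cl[] = {
  CL_CONTEXT_PLATFORM, (cl_context_properties)platform_id_cl,
  CL_GL_CONTEXT_KHR,   (cl_context_properties)wglGetCurrentContext(), // assumed source of hGLRC
  CL_WGL_HDC_KHR,      (cl_context_properties)wglGetCurrentDC(),
  0
};

cl_int     cl_err;
cl_context context_cl = clCreateContext(context_properties_cl,
                                        1,&device_id_cl,
                                        NULL,NULL,
                                        &cl_err); cl_ok(cl_err);

// SKC simply wraps the externally created context and device.
skc_context_t context;
skc_err       err = skc_context_create_cl(&context,context_cl,device_id_cl);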

View File

@ -20,7 +20,7 @@
#include "find_cl.h"
#include "assert_cl.h"
#include "macros.h"
#include "../macros.h"
//
// search platforms and devices for a match

View File

@ -38,9 +38,9 @@
//
#if defined(_MSC_VER)
#define ALLOCA(n) _alloca(n)
#define ALLOCA(n) _alloca(n)
#else
#define ALLOCA(n) alloca(n)
#define ALLOCA(n) alloca(n)
#endif
//
//

File diff suppressed because it is too large

View File

@ -0,0 +1,122 @@
//
// Copyright 2016 Google Inc.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//
#ifndef HS_CL_ONCE
#define HS_CL_ONCE
#define HS_LANES_PER_WARP_LOG2 3
#define HS_LANES_PER_WARP (1 << HS_LANES_PER_WARP_LOG2)
#define HS_BS_WARPS 16
#define HS_BS_WARPS_LOG2_RU 4
#define HS_BC_WARPS_LOG2_MAX 4
#define HS_FM_BLOCKS_LOG2_MIN 1
#define HS_HM_BLOCKS_LOG2_MIN 1
#define HS_KEYS_PER_LANE 16
#define HS_REG_LAST(c) c##16
#define HS_KEY_WORDS 2
#define HS_KEY_TYPE ulong
#define HS_EMPTY
#define HS_SLAB_ROWS() \
HS_SLAB_ROW( 1, 0 ) \
HS_SLAB_ROW( 2, 1 ) \
HS_SLAB_ROW( 3, 2 ) \
HS_SLAB_ROW( 4, 3 ) \
HS_SLAB_ROW( 5, 4 ) \
HS_SLAB_ROW( 6, 5 ) \
HS_SLAB_ROW( 7, 6 ) \
HS_SLAB_ROW( 8, 7 ) \
HS_SLAB_ROW( 9, 8 ) \
HS_SLAB_ROW( 10, 9 ) \
HS_SLAB_ROW( 11, 10 ) \
HS_SLAB_ROW( 12, 11 ) \
HS_SLAB_ROW( 13, 12 ) \
HS_SLAB_ROW( 14, 13 ) \
HS_SLAB_ROW( 15, 14 ) \
HS_SLAB_ROW( 16, 15 ) \
HS_EMPTY
#define HS_TRANSPOSE_SLAB() \
HS_TRANSPOSE_STAGE( 1 ) \
HS_TRANSPOSE_STAGE( 2 ) \
HS_TRANSPOSE_STAGE( 3 ) \
HS_TRANSPOSE_BLEND( r, s, 1, 2, 1 ) \
HS_TRANSPOSE_BLEND( r, s, 1, 4, 3 ) \
HS_TRANSPOSE_BLEND( r, s, 1, 6, 5 ) \
HS_TRANSPOSE_BLEND( r, s, 1, 8, 7 ) \
HS_TRANSPOSE_BLEND( r, s, 1, 10, 9 ) \
HS_TRANSPOSE_BLEND( r, s, 1, 12, 11 ) \
HS_TRANSPOSE_BLEND( r, s, 1, 14, 13 ) \
HS_TRANSPOSE_BLEND( r, s, 1, 16, 15 ) \
HS_TRANSPOSE_BLEND( s, t, 2, 3, 1 ) \
HS_TRANSPOSE_BLEND( s, t, 2, 4, 2 ) \
HS_TRANSPOSE_BLEND( s, t, 2, 7, 5 ) \
HS_TRANSPOSE_BLEND( s, t, 2, 8, 6 ) \
HS_TRANSPOSE_BLEND( s, t, 2, 11, 9 ) \
HS_TRANSPOSE_BLEND( s, t, 2, 12, 10 ) \
HS_TRANSPOSE_BLEND( s, t, 2, 15, 13 ) \
HS_TRANSPOSE_BLEND( s, t, 2, 16, 14 ) \
HS_TRANSPOSE_BLEND( t, u, 3, 5, 1 ) \
HS_TRANSPOSE_BLEND( t, u, 3, 6, 2 ) \
HS_TRANSPOSE_BLEND( t, u, 3, 7, 3 ) \
HS_TRANSPOSE_BLEND( t, u, 3, 8, 4 ) \
HS_TRANSPOSE_BLEND( t, u, 3, 13, 9 ) \
HS_TRANSPOSE_BLEND( t, u, 3, 14, 10 ) \
HS_TRANSPOSE_BLEND( t, u, 3, 15, 11 ) \
HS_TRANSPOSE_BLEND( t, u, 3, 16, 12 ) \
HS_TRANSPOSE_REMAP( u, 1, 1 ) \
HS_TRANSPOSE_REMAP( u, 2, 3 ) \
HS_TRANSPOSE_REMAP( u, 3, 5 ) \
HS_TRANSPOSE_REMAP( u, 4, 7 ) \
HS_TRANSPOSE_REMAP( u, 5, 9 ) \
HS_TRANSPOSE_REMAP( u, 6, 11 ) \
HS_TRANSPOSE_REMAP( u, 7, 13 ) \
HS_TRANSPOSE_REMAP( u, 8, 15 ) \
HS_TRANSPOSE_REMAP( u, 9, 2 ) \
HS_TRANSPOSE_REMAP( u, 10, 4 ) \
HS_TRANSPOSE_REMAP( u, 11, 6 ) \
HS_TRANSPOSE_REMAP( u, 12, 8 ) \
HS_TRANSPOSE_REMAP( u, 13, 10 ) \
HS_TRANSPOSE_REMAP( u, 14, 12 ) \
HS_TRANSPOSE_REMAP( u, 15, 14 ) \
HS_TRANSPOSE_REMAP( u, 16, 16 ) \
HS_EMPTY
#define HS_FM_BLOCKS_LOG2_1 0
#define HS_FM_BLOCKS_LOG2_2 1
#define HS_FM_BLOCKS_LOG2_3 2
#define HS_FM_BLOCKS_LOG2_4 3
#define HS_FM_BLOCKS_LOG2_5 4
#define HS_FM_BLOCKS_LOG2_6 5
#define HS_HM_BLOCKS_LOG2_5 0
#define HS_FM_BLOCKS_LOG2_7 6
#define HS_HM_BLOCKS_LOG2_6 1
#define HS_FM_BLOCKS_LOG2_8 7
#define HS_HM_BLOCKS_LOG2_7 2
#define HS_FM_BLOCKS_LOG2_9 8
#define HS_HM_BLOCKS_LOG2_8 3
#define HS_FM_BLOCKS_LOG2_10 9
#define HS_HM_BLOCKS_LOG2_9 4
#define HS_FM_BLOCKS_LOG2_11 10
#define HS_HM_BLOCKS_LOG2_10 5
#define HS_FM_BLOCKS_LOG2_12 11
#define HS_HM_BLOCKS_LOG2_11 6
#define HS_FM_BLOCKS_LOG2_13 12
#define HS_HM_BLOCKS_LOG2_12 7
#define HS_FM_BLOCKS_LOG2_14 13
#define HS_HM_BLOCKS_LOG2_13 8
#define HS_FM_BLOCKS_LOG2_15 14
#define HS_HM_BLOCKS_LOG2_14 9
#define HS_FM_BLOCKS_LOG2_16 15
#define HS_HM_BLOCKS_LOG2_15 10
#endif
//
//
//
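
In this GEN9 configuration HotSort operates on 64-bit (ulong) keys arranged in "slabs" of 8 lanes (HS_LANES_PER_WARP) by 16 keys per lane (HS_KEYS_PER_LANE). A small host-side sketch, purely for illustration, of the slab geometry these macros imply:

// Illustration only: slab geometry implied by the GEN9 HotSort config above.
#include <stdio.h>

#define HS_LANES_PER_WARP_LOG2 3
#define HS_LANES_PER_WARP      (1 << HS_LANES_PER_WARP_LOG2)   // 8 lanes per subgroup
#define HS_KEYS_PER_LANE       16                              // 16 keys held per lane
#define HS_KEY_WORDS           2                               // 2 x 32-bit words = ulong keys

int main(void)
{
  int const keys_per_slab  = HS_LANES_PER_WARP * HS_KEYS_PER_LANE;  // 128 keys
  int const bytes_per_slab = keys_per_slab * HS_KEY_WORDS * 4;      // 1024 bytes

  printf("slab = %d keys, %d bytes\n",keys_per_slab,bytes_per_slab);

  return 0;
}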

View File

@ -0,0 +1,199 @@
//
// Copyright 2016 Google Inc.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//
#ifndef HS_CL_MACROS_ONCE
#define HS_CL_MACROS_ONCE
//
//
//
#include "hs_cl.h"
//
// Inter-lane compare exchange
//
// default
#define HS_CMP_XCHG_V0(a,b) \
{ \
HS_KEY_TYPE const t = min(a,b); \
b = max(a,b); \
a = t; \
}
// super slow
#define HS_CMP_XCHG_V1(a,b) \
{ \
HS_KEY_TYPE const tmp = a; \
a = (a < b) ? a : b; \
b ^= a ^ tmp; \
}
// best
#define HS_CMP_XCHG_V2(a,b) \
if (a >= b) { \
HS_KEY_TYPE const t = a; \
a = b; \
b = t; \
}
// good
#define HS_CMP_XCHG_V3(a,b) \
{ \
int const ge = a >= b; \
HS_KEY_TYPE const t = a; \
a = ge ? b : a; \
b = ge ? t : b; \
}
//
//
//
#if (HS_KEY_WORDS == 1)
#define HS_CMP_XCHG(a,b) HS_CMP_XCHG_V0(a,b)
#elif (HS_KEY_WORDS == 2)
#define HS_CMP_XCHG(a,b) HS_CMP_XCHG_V2(a,b)
#endif
//
// Conditional inter-subgroup flip/half compare exchange
//
#define HS_CMP_FLIP(i,a,b) \
{ \
HS_KEY_TYPE const ta = intel_sub_group_shuffle(a,flip_lane_idx); \
HS_KEY_TYPE const tb = intel_sub_group_shuffle(b,flip_lane_idx); \
a = HS_COND_MIN_MAX(t_lt,a,tb); \
b = HS_COND_MIN_MAX(t_lt,b,ta); \
}
#define HS_CMP_HALF(i,a) \
{ \
HS_KEY_TYPE const ta = intel_sub_group_shuffle(a,half_lane_idx); \
a = HS_COND_MIN_MAX(t_lt,a,ta); \
}
//
// The device's comparison operator might return what we actually
// want. For example, it appears GEN 'cmp' returns {true:-1,false:0}.
//
#define HS_CMP_IS_ZERO_ONE
#ifdef HS_CMP_IS_ZERO_ONE
// OpenCL requires a {true: +1, false: 0} scalar result
// (a < b) -> { +1, 0 } -> NEGATE -> { 0, 0xFFFFFFFF }
#define HS_LTE_TO_MASK(a,b) (HS_KEY_TYPE)(-(a <= b))
#define HS_CMP_TO_MASK(a) (HS_KEY_TYPE)(-a)
#else
// However, OpenCL requires { -1, 0 } for vectors
// (a < b) -> { 0xFFFFFFFF, 0 }
#define HS_LTE_TO_MASK(a,b) (a <= b) // FIXME for uint64
#define HS_CMP_TO_MASK(a) (a)
#endif
//
// The flip/half comparisons rely on a "conditional min/max":
//
// - if the flag is false, return min(a,b)
// - otherwise, return max(a,b)
//
// What's a little surprising is that sequence (1) is faster than (2)
// for 32-bit keys.
//
// I suspect either a code generation problem or that the sequence
// maps well to the GEN instruction set.
//
// We mostly care about 64-bit keys and unsurprisingly sequence (2) is
// fastest for this wider type.
//
// this is what you would normally use
#define HS_COND_MIN_MAX_V0(lt,a,b) ((a <= b) ^ lt) ? b : a
// this seems to be faster for 32-bit keys
#define HS_COND_MIN_MAX_V1(lt,a,b) (lt ? b : a) ^ ((a ^ b) & HS_LTE_TO_MASK(a,b))
//
//
//
#if (HS_KEY_WORDS == 1)
#define HS_COND_MIN_MAX(lt,a,b) HS_COND_MIN_MAX_V1(lt,a,b)
#elif (HS_KEY_WORDS == 2)
#define HS_COND_MIN_MAX(lt,a,b) HS_COND_MIN_MAX_V0(lt,a,b)
#endif
//
// This snarl of macros is for transposing a "slab" of sorted elements
// into linear order.
//
// This can occur as the last step in hs_sort() or via a custom kernel
// that inspects the slab and then transposes and stores it to memory.
//
// The slab format can be inspected more efficiently than a linear
// arrangement.
//
// The prime example is detecting when adjacent keys (in sort order)
// have differing high order bits ("key changes"). The index of each
// change is recorded to an auxiliary array.
//
// A post-processing step like this needs to be able to navigate the
// slab and eventually transpose and store the slab in linear order.
//
#define HS_TRANSPOSE_REG(prefix,row) prefix##row
#define HS_TRANSPOSE_DECL(prefix,row) HS_KEY_TYPE const HS_TRANSPOSE_REG(prefix,row)
#define HS_TRANSPOSE_DELTA(level) (HS_LANES_PER_WARP + (1 << (level-1)))
#define HS_TRANSPOSE_IF(level) ((get_sub_group_local_id() >> (level - 1)) & 1)
#define HS_TRANSPOSE_LL(level) HS_TRANSPOSE_IF(level) ? 0 : HS_TRANSPOSE_DELTA(level)
#define HS_TRANSPOSE_UR(level) HS_TRANSPOSE_IF(level) ? HS_TRANSPOSE_DELTA(level) : 0
#define HS_TRANSPOSE_DELTA_LL(level) delta_ll_##level
#define HS_TRANSPOSE_DELTA_UR(level) delta_ur_##level
#define HS_TRANSPOSE_STAGE(level) \
uint const HS_TRANSPOSE_DELTA_LL(level) = HS_TRANSPOSE_LL(level); \
uint const HS_TRANSPOSE_DELTA_UR(level) = HS_TRANSPOSE_UR(level);
#define HS_TRANSPOSE_BLEND(prefix_prev,prefix_curr,level,row_ll,row_ur) \
HS_TRANSPOSE_DECL(prefix_curr,row_ll) = \
intel_sub_group_shuffle_down(HS_TRANSPOSE_REG(prefix_prev,row_ll), \
HS_TRANSPOSE_REG(prefix_prev,row_ur), \
HS_TRANSPOSE_DELTA_LL(level)); \
HS_TRANSPOSE_DECL(prefix_curr,row_ur) = \
intel_sub_group_shuffle_up(HS_TRANSPOSE_REG(prefix_prev,row_ll), \
HS_TRANSPOSE_REG(prefix_prev,row_ur), \
HS_TRANSPOSE_DELTA_UR(level)); \
// #define HS_TRANSPOSE_LOAD(row) \
// HS_TRANSPOSE_DECL(0,row) = (vout + gmem_idx)[(row-1) << HS_LANES_PER_WARP_LOG2];
#define HS_TRANSPOSE_REMAP(prefix,row_from,row_to) \
(vout + gmem_idx)[(row_to-1) << HS_LANES_PER_WARP_LOG2] = \
HS_TRANSPOSE_REG(prefix,row_from);
//
// undefine these if you want to override
//
#define HS_TRANSPOSE_PREAMBLE()
#define HS_TRANSPOSE_BODY()
//
//
//
#endif
//
//
//
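
For orientation, HS_CMP_XCHG_V2 above -- the variant selected for 64-bit keys -- is just a conditional swap that leaves its two operands in ascending order; it is the basic sorting-network step the HotSort slab kernels are built from. A host-side equivalent, for illustration only:

// Host-side illustration of the HS_CMP_XCHG_V2 semantics: on return, *a <= *b.
typedef unsigned long long hs_key_t;   // stands in for HS_KEY_TYPE (ulong)

static void hs_cmp_xchg(hs_key_t * const a, hs_key_t * const b)
{
  if (*a >= *b)
    {
      hs_key_t const t = *a;

      *a = *b;
      *b = t;
    }
}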

View File

@ -28,23 +28,20 @@
//
skc_err
skc_context_create(skc_context_t * context,
char const * target_platform_substring,
char const * target_device_substring,
intptr_t context_properties[])
skc_context_create_cl(skc_context_t * context,
cl_context context_cl,
cl_device_id device_id_cl)
{
(*context) = malloc(sizeof(**context));
//
// FIXME -- don't directly grab a CL runtime but for now just create
// the CL_12 runtime here
// FIXME -- we'll clean up context creation by platform later. For
// now, just create a CL_12 context.
//
skc_err err;
err = skc_runtime_cl_12_create(*context,
target_platform_substring,
target_device_substring,
context_properties);
err = skc_runtime_cl_12_create(*context,context_cl,device_id_cl);
return err;
}

View File

@ -21,6 +21,11 @@
#include <stdlib.h>
#include <conio.h>
#include "skc_create_cl.h"
#include "common/cl/find_cl.h"
#include "common/cl/assert_cl.h"
#include "svg/svg_doc.h"
#include "svg2skc/svg2skc.h"
#include "svg2skc/transform_stack.h"
@ -49,7 +54,7 @@ skc_runtime_cl_12_debug(struct skc_context * const context);
//
//
static
static
void
is_render_complete(skc_surface_t surface,
skc_styling_t styling,
@ -67,9 +72,9 @@ int
main(int argc, char** argv)
{
//
//
//
if (argc <= 1)
//
if (argc <= 1)
{
fprintf(stderr,"-- missing filename\n");
return EXIT_FAILURE; // no filename
@ -94,6 +99,18 @@ main(int argc, char** argv)
skc_interop_init(&window);
//
// find platform and device by name
//
cl_platform_id platform_id_cl;
cl_device_id device_id_cl;
cl(FindIdsByName("Intel","Graphics",
&platform_id_cl,
&device_id_cl,
0,NULL,NULL,
true));
//
// get GL and device contexts
//
@ -101,22 +118,31 @@ main(int argc, char** argv)
HDC hDC = wglGetCurrentDC();
//
// create the CL context
//
//
cl_context_properties context_properties[] =
cl_context_properties context_properties_cl[] =
{
CL_CONTEXT_PLATFORM, (cl_context_properties)-1,
CL_CONTEXT_PLATFORM, (cl_context_properties)platform_id_cl,
CL_GL_CONTEXT_KHR, (cl_context_properties)hGLRC,
CL_WGL_HDC_KHR, (cl_context_properties)hDC,
0
};
cl_int cl_err;
cl_context context_cl = clCreateContext(context_properties_cl,
1,
&device_id_cl,
NULL,
NULL,
&cl_err); cl_ok(cl_err);
//
// create context
// create SKC context
//
skc_context_t context;
skc_err err = skc_context_create(&context,"Intel","Graphics",context_properties);
skc_err err = skc_context_create_cl(&context,
context_cl,
device_id_cl);
//
// associate
@ -136,14 +162,14 @@ main(int argc, char** argv)
skc_raster_builder_t raster_builder;
err = skc_raster_builder_create(context,&raster_builder);
//
// create a composition
//
skc_composition_t composition;
err = skc_composition_create(context,&composition);
//
// create a styling instance
//
@ -154,7 +180,7 @@ main(int argc, char** argv)
svg_doc_layer_count(svg_doc),
1000,
2 * 1024 * 1024);
//
// create a surface
//
@ -191,7 +217,7 @@ main(int argc, char** argv)
skc_transform_stack_restore(ts,ts_save);
// decode layers -- places rasters
svg_doc_layers_decode(svg_doc,rasters,composition,styling,true/*is_srgb*/);
svg_doc_layers_decode(svg_doc,rasters,composition,styling,true/*is_srgb*/);
// seal the composition
skc_composition_seal(composition);
@ -244,7 +270,7 @@ main(int argc, char** argv)
// unseal the composition
skc_composition_unseal(composition,true);
}
//
// dispose of mundane resources
//

View File

@ -106,7 +106,7 @@ skc_allocator_device_create(struct skc_runtime * const runtime)
&runtime->allocator.device.temp.suballocator,
"DEVICE",
runtime->config->suballocator.device.subbufs,
runtime->cl.base_align,
runtime->cl.align_bytes,
runtime->config->suballocator.device.size);
#ifndef NDEBUG

View File

@ -12,7 +12,6 @@
//
//
#include "runtime_cl.h"
#include "block_pool_cl.h"
//
@ -52,8 +51,8 @@ struct skc_config
union skc_block_pool_size block_pool;
struct {
skc_cq_type_e type;
skc_uint size;
cl_command_queue_properties cq_props;
skc_uint size;
} cq_pool;
struct {

View File

@ -7,17 +7,18 @@
*/
//
//
// squelch OpenCL 1.2 deprecation warning
//
#ifndef NDEBUG
#include <stdio.h>
#ifndef CL_USE_DEPRECATED_OPENCL_1_2_APIS
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
#endif
//
//
//
#include <stdio.h>
#include <string.h>
//
@ -25,6 +26,7 @@
//
#include "runtime_cl_12.h"
#include "common/cl/assert_cl.h"
//
// This implementation is probably excessive.
@ -40,21 +42,77 @@
//
//
void
skc_cq_pool_create(struct skc_runtime * const runtime,
struct skc_cq_pool * const pool,
skc_uint const type,
skc_uint const size)
{
pool->type = type;
pool->size = size + 1; // an empty spot
pool->reads = 0;
pool->writes = size;
pool->cq = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,pool->size * sizeof(*pool->cq));
static
cl_command_queue
skc_runtime_cl_12_create_cq(struct skc_runtime * const runtime,
struct skc_cq_pool * const pool)
{
cl_command_queue cq;
#if 1
//
// <= OpenCL 1.2
//
cl_int cl_err;
cq = clCreateCommandQueue(runtime->cl.context,
runtime->cl.device_id,
pool->cq_props,
&cl_err); cl_ok(cl_err);
#else
if (runtime_cl->version.major < 2)
{
//
// <= OpenCL 1.2
//
cl_int cl_err;
cq = clCreateCommandQueue(runtime_cl->context,
runtime_cl->device_id,
(cl_command_queue_properties)type,
&cl_err); cl_ok(cl_err);
}
else
{
//
// >= OpenCL 2.0
//
cl_int cl_err;
cl_queue_properties const queue_properties[] = {
CL_QUEUE_PROPERTIES,(cl_queue_properties)type,0
};
cq = clCreateCommandQueueWithProperties(runtime_cl->context,
runtime_cl->device_id,
queue_properties,
&cl_err); cl_ok(cl_err);
}
#endif
return cq;
}
//
//
//
void
skc_cq_pool_create(struct skc_runtime * const runtime,
struct skc_cq_pool * const pool,
cl_command_queue_properties const cq_props,
skc_uint const size)
{
pool->size = size + 1; // an empty spot
pool->reads = 0;
pool->writes = size;
pool->cq_props = cq_props;
pool->cq = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,
pool->size * sizeof(*pool->cq));
for (skc_uint ii=0; ii<size; ii++)
pool->cq[ii] = skc_runtime_cl_12_create_cq(runtime,pool);
for (skc_uint ii=0; ii<size; ii++) {
pool->cq[ii] = skc_runtime_cl_create_cq(&runtime->cl,pool->type);
}
pool->cq[size] = NULL;
}
@ -77,7 +135,7 @@ skc_cq_pool_dispose(struct skc_runtime * const runtime,
//
//
static
static
void
skc_cq_pool_write(struct skc_cq_pool * const pool,
cl_command_queue cq)
@ -109,14 +167,14 @@ skc_cq_pool_expand(struct skc_runtime * const runtime,
pool->writes = expand;
for (skc_uint ii=0; ii<expand; ii++)
pool->cq[ii] = skc_runtime_cl_create_cq(&runtime->cl,pool->type);
pool->cq[ii] = skc_runtime_cl_12_create_cq(runtime,pool);
}
//
//
//
static
static
cl_command_queue
skc_cq_pool_read(struct skc_runtime * const runtime,
struct skc_cq_pool * const pool)
@ -141,7 +199,7 @@ skc_runtime_acquire_cq_in_order(struct skc_runtime * const runtime)
}
void
skc_runtime_release_cq_in_order(struct skc_runtime * const runtime,
skc_runtime_release_cq_in_order(struct skc_runtime * const runtime,
cl_command_queue cq)
{
skc_cq_pool_write(&runtime->cq_pool,cq);
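
The command queue pool above is a plain ring buffer: it is created with one spare slot so an empty ring (reads == writes) can be distinguished from a full one, acquiring pops at reads, releasing pushes at writes, and skc_cq_pool_expand() refills the ring with freshly created queues when it runs dry. A minimal sketch of that ring discipline -- the helper names and the modulo wrap-around are illustrative, not lifted from the implementation:

// Illustrative ring-buffer discipline for the cq pool (assumes modulo wrap).
static cl_command_queue
cq_ring_acquire(struct skc_cq_pool * const pool)
{
  return pool->cq[pool->reads++ % pool->size];
}

static void
cq_ring_release(struct skc_cq_pool * const pool,
                cl_command_queue           cq)
{
  pool->cq[pool->writes++ % pool->size] = cq;
}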

View File

@ -20,11 +20,12 @@
struct skc_cq_pool
{
skc_cq_type_e type;
skc_uint size;
skc_uint reads;
skc_uint writes;
cl_command_queue * cq;
cl_command_queue * cq;
cl_command_queue_properties cq_props;
skc_uint size;
skc_uint reads;
skc_uint writes;
};
//
@ -32,10 +33,10 @@ struct skc_cq_pool
//
void
skc_cq_pool_create(struct skc_runtime * const runtime,
struct skc_cq_pool * const pool,
skc_uint const type,
skc_uint const size);
skc_cq_pool_create(struct skc_runtime * const runtime,
struct skc_cq_pool * const pool,
cl_command_queue_properties const cq_props,
skc_uint const size);
void
skc_cq_pool_dispose(struct skc_runtime * const runtime,

View File

@ -77,6 +77,10 @@ cl_kernel
skc_device_acquire_kernel(struct skc_device * const device,
skc_device_kernel_id const type);
void
skc_device_release_kernel(struct skc_device * const device,
cl_kernel kernel);
//
// grid shape can vary greatly by target platform
//

View File

@ -1,64 +1,64 @@
/*
* Copyright 2017 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can
* be found in the LICENSE file.
*
*/
//
//
//
#include "device_cl_12.h"
//
// BEST TO RUN THESE ON AN OUT-OF-ORDER CQ
//
__kernel
SKC_BP_INIT_IDS_KERNEL_ATTRIBS
void
skc_kernel_block_pool_init_ids(__global uint * const ids, uint const bp_size)
{
uint const gid = get_global_id(0);
//
// FIXME -- TUNE FOR ARCH -- evaluate if it's much faster to
// accomplish this with fewer threads and using either IPC and/or
// vector stores -- it should be on certain architectures!
//
//
// initialize pool with sequence
//
if (gid < bp_size)
ids[gid] = gid * SKC_DEVICE_SUBBLOCKS_PER_BLOCK;
}
//
//
//
__kernel
SKC_BP_INIT_ATOMICS_KERNEL_ATTRIBS
void
skc_kernel_block_pool_init_atomics(__global uint * const bp_atomics, uint const bp_size)
{
// the version test is to squelch a bug with the Intel OpenCL CPU
// compiler declaring it supports the cl_intel_subgroups extension
#if defined(cl_intel_subgroups) || defined (cl_khr_subgroups)
uint const tid = get_sub_group_local_id();
#else
uint const tid = get_local_id(0);
#endif
//
// launch two threads and store [ 0, bp_size ]
//
bp_atomics[tid] = tid * bp_size;
}
//
//
//
/*
* Copyright 2017 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can
* be found in the LICENSE file.
*
*/
//
//
//
#include "kernel_cl_12.h"
//
// BEST TO RUN THESE ON AN OUT-OF-ORDER CQ
//
__kernel
SKC_BP_INIT_IDS_KERNEL_ATTRIBS
void
skc_kernel_block_pool_init_ids(__global uint * const ids, uint const bp_size)
{
uint const gid = get_global_id(0);
//
// FIXME -- TUNE FOR ARCH -- evaluate if it's much faster to
// accomplish this with fewer threads and using either IPC and/or
// vector stores -- it should be on certain architectures!
//
//
// initialize pool with sequence
//
if (gid < bp_size)
ids[gid] = gid * SKC_DEVICE_SUBBLOCKS_PER_BLOCK;
}
//
//
//
__kernel
SKC_BP_INIT_ATOMICS_KERNEL_ATTRIBS
void
skc_kernel_block_pool_init_atomics(__global uint * const bp_atomics, uint const bp_size)
{
// the version test is to squelch a bug with the Intel OpenCL CPU
// compiler declaring it supports the cl_intel_subgroups extension
#if defined(cl_intel_subgroups) || defined (cl_khr_subgroups)
uint const tid = get_sub_group_local_id();
#else
uint const tid = get_local_id(0);
#endif
//
// launch two threads and store [ 0, bp_size ]
//
bp_atomics[tid] = tid * bp_size;
}
//
//
//
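
Host-side, these two kernels are one-shot initializers: the ids kernel wants one work-item per block pool entry, while the atomics kernel only needs two work-items to store { 0, bp_size } into the read/write counters. A hedged sketch of how they might be enqueued with the plain OpenCL API -- the kernel and buffer variable names are placeholders, and the global sizes would really be padded to the work-group multiples baked into the *_KERNEL_ATTRIBS:

// Illustrative launches only -- 'k_init_ids', 'k_init_atomics', 'ids_mem',
// 'atomics_mem' and 'cq' are placeholder handles, and 'cq' is assumed to be
// the out-of-order command queue recommended above.
cl_uint const bp_size = 1u << 20; // example block pool entry count

cl(SetKernelArg(k_init_ids,0,sizeof(ids_mem),&ids_mem));
cl(SetKernelArg(k_init_ids,1,sizeof(bp_size),&bp_size));

size_t const global_ids = bp_size; // one work-item per pool id
cl(EnqueueNDRangeKernel(cq,k_init_ids,1,NULL,&global_ids,NULL,0,NULL,NULL));

cl(SetKernelArg(k_init_atomics,0,sizeof(atomics_mem),&atomics_mem));
cl(SetKernelArg(k_init_atomics,1,sizeof(bp_size),&bp_size));

size_t const global_atomics = 2; // two work-items store { 0, bp_size }
cl(EnqueueNDRangeKernel(cq,k_init_atomics,1,NULL,&global_atomics,NULL,0,NULL,NULL));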

View File

@ -19,6 +19,7 @@
#include "config_cl.h"
#include "runtime_cl_12.h"
#include "kernel_cl_12.h"
#include "device_cl_12.h"
#include "hs/cl/hs_cl_launcher.h"
@ -124,9 +125,9 @@ struct skc_config const config =
.cq_pool = {
#ifndef NDEBUG
.type = SKC_CQ_TYPE_IN_ORDER_PROFILING,
.cq_props = CL_QUEUE_PROFILING_ENABLE,
#else
.type = 0,
.cq_props = 0,
#endif
.size = 8
},
@ -841,6 +842,14 @@ skc_device_acquire_kernel(struct skc_device * const device,
return kernel;
}
void
skc_device_release_kernel(struct skc_device * const device,
cl_kernel kernel)
{
cl(ReleaseKernel(kernel));
}
//
// INITIALIZE KERNEL ARGS
//
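
The newly added skc_device_release_kernel() pairs with the existing skc_device_acquire_kernel(), so callers can bracket a kernel's lifetime acquire/release style. Illustrative usage (the kernel id value here is a placeholder):

// Acquire, use, and release a kernel instance.
cl_kernel kernel = skc_device_acquire_kernel(device,kernel_id /* some skc_device_kernel_id */);

// ... clSetKernelArg() / clEnqueueNDRangeKernel() on an acquired cq ...

skc_device_release_kernel(device,kernel); // wraps clReleaseKernel()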

View File

@ -1,309 +1,309 @@
/*
* Copyright 2017 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can
* be found in the LICENSE file.
*
*/
//
//
//
#include "block.h"
#include "path.h"
#include "common.h"
#include "atomic_cl.h"
#include "raster_builder_cl_12.h"
#include "device_cl_12.h"
//
//
//
#define SKC_FILLS_EXPAND_SUBGROUP_SIZE_MASK (SKC_FILLS_EXPAND_SUBGROUP_SIZE - 1)
#define SKC_FILLS_EXPAND_ELEMS_PER_BLOCK (SKC_DEVICE_BLOCK_WORDS / SKC_FILLS_EXPAND_ELEM_WORDS)
#define SKC_FILLS_EXPAND_ELEMS_PER_SUBBLOCK (SKC_DEVICE_SUBBLOCK_WORDS / SKC_FILLS_EXPAND_ELEM_WORDS)
#define SKC_FILLS_EXPAND_ELEMS_PER_THREAD (SKC_FILLS_EXPAND_ELEMS_PER_BLOCK / SKC_FILLS_EXPAND_SUBGROUP_SIZE)
//
//
//
#define SKC_FILLS_EXPAND_X (SKC_DEVICE_BLOCK_WORDS / SKC_FILLS_EXPAND_SUBGROUP_SIZE)
//
//
//
#if ( SKC_FILLS_EXPAND_X == 1 )
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_1()
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 0
#elif ( SKC_FILLS_EXPAND_X == 2 )
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_2()
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 1
#elif ( SKC_FILLS_EXPAND_X == 4 )
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_4()
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 3
#elif ( SKC_FILLS_EXPAND_X == 8 )
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_8()
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 7
#elif ( SKC_FILLS_EXPAND_X == 16)
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_16()
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 15
#else
#error "MISSING SKC_FILLS_EXPAND_X"
#endif
//
// Fill and rasterize cmds only differ in their first word semantics
//
union skc_cmd_expand
{
union skc_cmd_fill fill;
union skc_cmd_rasterize rasterize;
};
//
//
//
union skc_path_elem
{
skc_uint u32;
skc_float f32;
};
//
// COMPILE-TIME AND RUN-TIME MACROS
//
#define SKC_ELEM_IN_RANGE(X,I) \
(skc_bool)SKC_GTE_MACRO(X,(I ) * SKC_FILLS_EXPAND_SUBGROUP_SIZE) && \
(skc_bool)SKC_LT_MACRO(X,(I+1) * SKC_FILLS_EXPAND_SUBGROUP_SIZE)
#define SKC_ELEM_GTE(X,I) \
SKC_GTE_MACRO(X,(I+1) * SKC_FILLS_EXPAND_SUBGROUP_SIZE)
//
// FIXME -- slate these for replacement
//
#define SKC_BROADCAST(E,S,I) \
sub_group_broadcast(E##I.u32,S - I * SKC_FILLS_EXPAND_SUBGROUP_SIZE)
#define SKC_BROADCAST_LAST_HELPER(E,I) \
sub_group_broadcast(E##I.u32,SKC_FILLS_EXPAND_SUBGROUP_SIZE - 1)
#define SKC_BROADCAST_LAST(E,I) \
SKC_BROADCAST_LAST_HELPER(E,I)
//
//
//
void
skc_cmds_out_append(__global union skc_cmd_rasterize * const cmds_out,
skc_uint * const out_idx,
union skc_cmd_expand * const cmd,
union skc_path_elem const e,
skc_uint const e_idx)
{
//
// FIXME -- we can append a large number of nodeword indices to a
// local SMEM queue and flush when full. It may or may not be a
// performance win on some architectures.
//
skc_bool const is_elem = SKC_TAGGED_BLOCK_ID_GET_TAG(e.u32) < SKC_BLOCK_ID_TAG_PATH_NEXT;
skc_uint const offset = sub_group_scan_inclusive_add(is_elem ? 1 : 0);
cmd->rasterize.nodeword = e_idx;
if (is_elem) {
cmds_out[*out_idx + offset] = cmd->rasterize;
}
*out_idx += sub_group_broadcast(offset,SKC_FILLS_EXPAND_SUBGROUP_SIZE-1);
}
//
//
//
__kernel
SKC_FILLS_EXPAND_KERNEL_ATTRIBS
void
skc_kernel_fills_expand(__global union skc_path_elem const * const blocks,
__global skc_uint volatile * const atomics,
__global skc_block_id_t const * const map,
__global union skc_cmd_fill const * const cmds_in,
__global union skc_cmd_rasterize * const cmds_out)
{
//
// Need to harmonize the way we determine a subgroup's id. In this
// kernel it's not as important because no local memory is being
// used. Although the device/mask calc to determine subgroup and
// lanes is still proper, we might want to make it clearer that
// we're working with subgroups by using the subgroup API.
//
// every subgroup/simd that will work on the block loads the same command
//
#if (__OPENCL_VERSION__ < 200)
skc_uint const cmd_stride = get_num_sub_groups();
#else
skc_uint const cmd_stride = get_enqueued_num_sub_groups(); // 2.0 supports non-uniform workgroups
#endif
skc_uint cmd_idx = get_group_id(0) * cmd_stride + get_sub_group_id();
// load fill command -- we reuse y component
union skc_cmd_expand cmd = { .fill = cmds_in[cmd_idx] };
// get the path header block from the map
skc_block_id_t id = map[cmd.fill.path];
#if 0
if (get_sub_group_local_id() == 0)
printf("expand[%u] = %u\n",cmd_idx,id);
#endif
//
// blindly load all of the head elements into registers
//
skc_uint head_idx = id * SKC_FILLS_EXPAND_ELEMS_PER_SUBBLOCK + get_sub_group_local_id();
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
union skc_path_elem h##I = blocks[head_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE];
SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();
//
// pick out count.nodes and count.prims from the header
//
skc_uint count_nodes, count_prims;
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_NODES,I)) { \
count_nodes = SKC_BROADCAST(h,SKC_PATH_HEAD_OFFSET_NODES,I); \
} \
if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_PRIMS,I)) { \
count_prims = SKC_BROADCAST(h,SKC_PATH_HEAD_OFFSET_PRIMS,I); \
}
SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();
//
// debug of path head
//
#if 0
skc_uint count_blocks;
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_BLOCKS,I)) { \
count_blocks = SKC_BROADCAST(h,SKC_PATH_HEAD_OFFSET_BLOCKS,I); \
}
SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();
if (get_sub_group_local_id() == 0)
printf("path header = { %5u, %5u, %5u }\n",
count_blocks,count_nodes,count_prims);
#endif
//
// acquire slots in the expanded cmd extent
//
// decrement prim_idx by 1 so we can use inclusive warp scan later
//
skc_uint out_idx = 0;
if (get_sub_group_local_id() == 0) {
out_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP
(atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_CMDS,count_prims) - 1;
}
out_idx = sub_group_broadcast(out_idx,0);
//
// process ids trailing the path header
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (!SKC_ELEM_GTE(SKC_PATH_HEAD_OFFSET_IDS,I)) { \
if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_IDS,I)) { \
if (get_sub_group_local_id() + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE < SKC_PATH_HEAD_OFFSET_IDS) { \
h##I.u32 = SKC_TAGGED_BLOCK_ID_INVALID; \
} \
} \
skc_cmds_out_append(cmds_out,&out_idx,&cmd,h##I, \
head_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE); \
}
SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();
//
// we're done if it was just the header
//
if (count_nodes == 0)
return;
//
// otherwise, process the nodes
//
//
// get id of next node
//
id = SKC_TAGGED_BLOCK_ID_GET_ID(SKC_BROADCAST_LAST(h,SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST));
//
// the following blocks are nodes
//
while (true)
{
// get index of each element
skc_uint node_idx = id * SKC_FILLS_EXPAND_ELEMS_PER_SUBBLOCK + get_sub_group_local_id();
//
// blindly load all of the node elements into registers
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
union skc_path_elem const n##I = blocks[node_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE];
SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();
//
// append all valid ids
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
skc_cmds_out_append(cmds_out,&out_idx,&cmd,n##I, \
node_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE);
SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();
// any more nodes?
if (--count_nodes == 0)
return;
//
// get id of next node
//
id = SKC_TAGGED_BLOCK_ID_GET_ID(SKC_BROADCAST_LAST(n,SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST));
}
}
//
//
//
/*
* Copyright 2017 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can
* be found in the LICENSE file.
*
*/
//
//
//
#include "block.h"
#include "path.h"
#include "common.h"
#include "atomic_cl.h"
#include "raster_builder_cl_12.h"
#include "kernel_cl_12.h"
//
//
//
#define SKC_FILLS_EXPAND_SUBGROUP_SIZE_MASK (SKC_FILLS_EXPAND_SUBGROUP_SIZE - 1)
#define SKC_FILLS_EXPAND_ELEMS_PER_BLOCK (SKC_DEVICE_BLOCK_WORDS / SKC_FILLS_EXPAND_ELEM_WORDS)
#define SKC_FILLS_EXPAND_ELEMS_PER_SUBBLOCK (SKC_DEVICE_SUBBLOCK_WORDS / SKC_FILLS_EXPAND_ELEM_WORDS)
#define SKC_FILLS_EXPAND_ELEMS_PER_THREAD (SKC_FILLS_EXPAND_ELEMS_PER_BLOCK / SKC_FILLS_EXPAND_SUBGROUP_SIZE)
//
//
//
#define SKC_FILLS_EXPAND_X (SKC_DEVICE_BLOCK_WORDS / SKC_FILLS_EXPAND_SUBGROUP_SIZE)
//
//
//
#if ( SKC_FILLS_EXPAND_X == 1 )
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_1()
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 0
#elif ( SKC_FILLS_EXPAND_X == 2 )
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_2()
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 1
#elif ( SKC_FILLS_EXPAND_X == 4 )
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_4()
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 3
#elif ( SKC_FILLS_EXPAND_X == 8 )
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_8()
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 7
#elif ( SKC_FILLS_EXPAND_X == 16)
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_16()
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 15
#else
#error "MISSING SKC_FILLS_EXPAND_X"
#endif
//
// Fill and rasterize cmds only differ in their first word semantics
//
union skc_cmd_expand
{
union skc_cmd_fill fill;
union skc_cmd_rasterize rasterize;
};
//
//
//
union skc_path_elem
{
skc_uint u32;
skc_float f32;
};
//
// COMPILE-TIME AND RUN-TIME MACROS
//
#define SKC_ELEM_IN_RANGE(X,I) \
(skc_bool)SKC_GTE_MACRO(X,(I ) * SKC_FILLS_EXPAND_SUBGROUP_SIZE) && \
(skc_bool)SKC_LT_MACRO(X,(I+1) * SKC_FILLS_EXPAND_SUBGROUP_SIZE)
#define SKC_ELEM_GTE(X,I) \
SKC_GTE_MACRO(X,(I+1) * SKC_FILLS_EXPAND_SUBGROUP_SIZE)
//
// FIXME -- slate these for replacement
//
#define SKC_BROADCAST(E,S,I) \
sub_group_broadcast(E##I.u32,S - I * SKC_FILLS_EXPAND_SUBGROUP_SIZE)
#define SKC_BROADCAST_LAST_HELPER(E,I) \
sub_group_broadcast(E##I.u32,SKC_FILLS_EXPAND_SUBGROUP_SIZE - 1)
#define SKC_BROADCAST_LAST(E,I) \
SKC_BROADCAST_LAST_HELPER(E,I)
//
//
//
void
skc_cmds_out_append(__global union skc_cmd_rasterize * const cmds_out,
skc_uint * const out_idx,
union skc_cmd_expand * const cmd,
union skc_path_elem const e,
skc_uint const e_idx)
{
//
// FIXME -- we can append a large number of nodeword indices to a
// local SMEM queue and flush when full. It may or may not be a
// performance win on some architectures.
//
skc_bool const is_elem = SKC_TAGGED_BLOCK_ID_GET_TAG(e.u32) < SKC_BLOCK_ID_TAG_PATH_NEXT;
skc_uint const offset = sub_group_scan_inclusive_add(is_elem ? 1 : 0);
cmd->rasterize.nodeword = e_idx;
if (is_elem) {
cmds_out[*out_idx + offset] = cmd->rasterize;
}
*out_idx += sub_group_broadcast(offset,SKC_FILLS_EXPAND_SUBGROUP_SIZE-1);
}
//
//
//
__kernel
SKC_FILLS_EXPAND_KERNEL_ATTRIBS
void
skc_kernel_fills_expand(__global union skc_path_elem const * const blocks,
__global skc_uint volatile * const atomics,
__global skc_block_id_t const * const map,
__global union skc_cmd_fill const * const cmds_in,
__global union skc_cmd_rasterize * const cmds_out)
{
//
// Need to harmonize the way we determine a subgroup's id. In this
// kernel it's not as important because no local memory is being
// used. Although the device/mask calc to determine subgroup and
// lanes is still proper, we might want to make it clearer that
// we're working with subgroups by using the subgroup API.
//
// every subgroup/simd that will work on the block loads the same command
//
#if (__OPENCL_VERSION__ < 200)
skc_uint const cmd_stride = get_num_sub_groups();
#else
skc_uint const cmd_stride = get_enqueued_num_sub_groups(); // 2.0 supports non-uniform workgroups
#endif
skc_uint cmd_idx = get_group_id(0) * cmd_stride + get_sub_group_id();
// load fill command -- we reuse y component
union skc_cmd_expand cmd = { .fill = cmds_in[cmd_idx] };
// get the path header block from the map
skc_block_id_t id = map[cmd.fill.path];
#if 0
if (get_sub_group_local_id() == 0)
printf("expand[%u] = %u\n",cmd_idx,id);
#endif
//
// blindly load all of the head elements into registers
//
skc_uint head_idx = id * SKC_FILLS_EXPAND_ELEMS_PER_SUBBLOCK + get_sub_group_local_id();
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
union skc_path_elem h##I = blocks[head_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE];
SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();
//
// pick out count.nodes and count.prims from the header
//
skc_uint count_nodes, count_prims;
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_NODES,I)) { \
count_nodes = SKC_BROADCAST(h,SKC_PATH_HEAD_OFFSET_NODES,I); \
} \
if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_PRIMS,I)) { \
count_prims = SKC_BROADCAST(h,SKC_PATH_HEAD_OFFSET_PRIMS,I); \
}
SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();
//
// debug of path head
//
#if 0
skc_uint count_blocks;
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_BLOCKS,I)) { \
count_blocks = SKC_BROADCAST(h,SKC_PATH_HEAD_OFFSET_BLOCKS,I); \
}
SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();
if (get_sub_group_local_id() == 0)
printf("path header = { %5u, %5u, %5u }\n",
count_blocks,count_nodes,count_prims);
#endif
//
// acquire slots in the expanded cmd extent
//
// decrement prim_idx by 1 so we can use inclusive warp scan later
//
skc_uint out_idx = 0;
if (get_sub_group_local_id() == 0) {
out_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP
(atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_CMDS,count_prims) - 1;
}
out_idx = sub_group_broadcast(out_idx,0);
//
// process ids trailing the path header
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (!SKC_ELEM_GTE(SKC_PATH_HEAD_OFFSET_IDS,I)) { \
if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_IDS,I)) { \
if (get_sub_group_local_id() + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE < SKC_PATH_HEAD_OFFSET_IDS) { \
h##I.u32 = SKC_TAGGED_BLOCK_ID_INVALID; \
} \
} \
skc_cmds_out_append(cmds_out,&out_idx,&cmd,h##I, \
head_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE); \
}
SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();
//
// we're done if it was just the header
//
if (count_nodes == 0)
return;
//
// otherwise, process the nodes
//
//
// get id of next node
//
id = SKC_TAGGED_BLOCK_ID_GET_ID(SKC_BROADCAST_LAST(h,SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST));
//
// the following blocks are nodes
//
while (true)
{
// get index of each element
skc_uint node_idx = id * SKC_FILLS_EXPAND_ELEMS_PER_SUBBLOCK + get_sub_group_local_id();
//
// blindly load all of the node elements into registers
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
union skc_path_elem const n##I = blocks[node_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE];
SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();
//
// append all valid ids
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
skc_cmds_out_append(cmds_out,&out_idx,&cmd,n##I, \
node_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE);
SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();
// any more nodes?
if (--count_nodes == 0)
return;
//
// get id of next node
//
id = SKC_TAGGED_BLOCK_ID_GET_ID(SKC_BROADCAST_LAST(n,SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST));
}
}
//
//
//
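
Both this kernel and the reclaim kernels below lean on the SKC_EXPAND_N() X-macro family: each SKC_EXPAND_X(I,...) instantiation is stamped out once per register row, so the per-row loads and appends are fully unrolled at compile time. A stripped-down illustration of the idiom -- the real SKC_EXPAND_X carries the extra I,S,C,P,R parameters, and 'blocks', 'idx', 'SUBGROUP_SIZE' and 'sum' here are assumed to exist in the surrounding scope:

#define MY_EXPAND_4() MY_EXPAND_X(0) MY_EXPAND_X(1) MY_EXPAND_X(2) MY_EXPAND_X(3)

// stamp out one load per register "row"
#undef  MY_EXPAND_X
#define MY_EXPAND_X(I) uint const h##I = blocks[idx + I * SUBGROUP_SIZE];
MY_EXPAND_4()

// reuse the same expansion with a different per-row body
#undef  MY_EXPAND_X
#define MY_EXPAND_X(I) sum += h##I;
MY_EXPAND_4()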

File diff suppressed because it is too large

View File

@ -1,390 +1,390 @@
/*
* Copyright 2017 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can
* be found in the LICENSE file.
*
*/
//
// FIXME -- a pre-allocation step could load the path header quads and
// total up the number of blocks in the workgroup or subgroup
// minimizing the number of later atomics adds.
//
#include "block.h"
#include "path.h"
#include "common.h"
#include "atomic_cl.h"
#include "block_pool_cl.h"
#include "device_cl_12.h"
//
//
//
#define SKC_PATHS_RECLAIM_SUBGROUP_SIZE_MASK (SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1)
#define SKC_PATHS_RECLAIM_SUBGROUP_ELEMS (SKC_PATHS_RECLAIM_SUBGROUP_SIZE * SKC_PATHS_RECLAIM_LOCAL_ELEMS)
#define SKC_PATHS_RECLAIM_X (SKC_DEVICE_BLOCK_WORDS / SKC_PATHS_RECLAIM_SUBGROUP_ELEMS)
//
//
//
#if ( SKC_PATHS_RECLAIM_X == 1 )
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_1()
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 0
#elif ( SKC_PATHS_RECLAIM_X == 2 )
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_2()
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 1
#elif ( SKC_PATHS_RECLAIM_X == 4 )
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_4()
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 3
#elif ( SKC_PATHS_RECLAIM_X == 8 )
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_8()
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 7
#elif ( SKC_PATHS_RECLAIM_X == 16)
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_16()
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 15
#else
#error "MISSING SKC_PATHS_RECLAIM_X"
#endif
//
// FIXME -- slate these for replacement
//
#define SKC_BROADCAST(E,S,I) \
sub_group_broadcast(E,S - I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE)
#define SKC_BROADCAST_LAST_HELPER(E,I) \
sub_group_broadcast(E,SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1)
#define SKC_BROADCAST_LAST(E,I) \
SKC_BROADCAST_LAST_HELPER(E,I)
//
// COMPILE-TIME PREDICATES
//
#define SKC_PATHS_RECLAIM_ELEM_GTE(X,I) \
SKC_GTE_MACRO(X,(I+1) * SKC_PATHS_RECLAIM_SUBGROUP_SIZE)
#define SKC_PATHS_RECLAIM_ELEM_IN_RANGE(X,I) \
(skc_bool)SKC_GTE_MACRO(X, I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE) && \
(skc_bool)SKC_LT_MACRO(X,(I+1) * SKC_PATHS_RECLAIM_SUBGROUP_SIZE)
#define SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I) \
SKC_PATHS_RECLAIM_ELEM_GTE(SKC_PATH_HEAD_WORDS,I)
#define SKC_PATHS_RECLAIM_PARTIALLY_HEADER(I) \
SKC_PATHS_RECLAIM_ELEM_IN_RANGE(SKC_PATH_HEAD_WORDS,I)
//
// RUN-TIME PREDICATES
//
#define SKC_PATHS_RECLAIM_IS_HEADER(I) \
(get_sub_group_local_id() + I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE < SKC_PATH_HEAD_WORDS)
//
// FIXME -- THIS BITFIELD SCAN APPROACH CAN BE PARAMETERIZED FOR ALL
// POSSIBLE PRACTICAL POWER-OF-TWO SUBGROUP AND SUBBLOCKS-PER-BLOCK
// COMBOS (NOT NECESSARILY POW2)
//
// FOR WIDER SUBGROUPS WITH BIG BLOCKS, WE WILL WANT TO USE A VECTOR
// UINT TYPE INSTEAD OF A ULONG.
//
#define SKC_PATHS_RECLAIM_PACKED_COUNT_BITS SKC_PATHS_RECLAIM_SUBGROUP_SIZE_LOG2
#define SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE skc_uint
//
//
//
#define SKC_PATHS_RECLAIM_PACKED_COUNT_MASK SKC_BITS_TO_MASK(SKC_PATHS_RECLAIM_PACKED_COUNT_BITS)
#define SKC_PATHS_RECLAIM_PACKED_COUNT_IS_BLOCK(E,I) \
(((E) & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) \
? 0 : (1u << SKC_PATHS_RECLAIM_PACKED_COUNT_BITS * I))
#define SKC_PATHS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(S,C) \
S = sub_group_scan_exclusive_add(C)
#define SKC_PATHS_RECLAIM_PACKED_COUNT_GET(C,I) \
(((C) >> (SKC_PATHS_RECLAIM_PACKED_COUNT_BITS * I)) & SKC_PATHS_RECLAIM_PACKED_COUNT_MASK)
//
//
//
struct skc_reclaim
{
skc_path_h aN[SKC_RECLAIM_ARRAY_SIZE];
};
__kernel
SKC_PATHS_RECLAIM_KERNEL_ATTRIBS
void
skc_kernel_paths_reclaim(__global skc_block_id_t * const bp_ids, // block pool ids ring
__global skc_uint * const bp_elems, // block pool blocks
__global skc_uint volatile * const bp_atomics, // read/write atomics
skc_uint const bp_mask, // pow2 modulo mask for block pool ring
__global skc_block_id_t const * const map, // path host-to-device map
struct skc_reclaim const reclaim) // array of host path ids
{
#if (__OPENCL_VERSION__ < 200)
skc_uint const reclaim_stride = get_num_sub_groups();
#else
skc_uint const reclaim_stride = get_enqueued_num_sub_groups(); // 2.0 supports non-uniform workgroups
#endif
skc_uint reclaim_idx = get_group_id(0) * reclaim_stride + get_sub_group_id();
#if 0
//
// NOTE -- FOR NOW, THIS KERNEL ALWAYS LAUNCHES FIXED SIZE GRIDS BUT
// WE MIGHT WANT TO HAVE THE GRID LIMIT ITSELF TO A FRACTIONAL
// MULTIPROCESSOR IN ORDER TO MINIMIZE THE IMPACT OF A LARGE
// RECLAMATION JOB ON THE REST OF THE PIPELINE.
//
for (; reclaim_idx < SKC_RECLAIM_ARRAY_SIZE; reclaim_idx+=reclaim_stride)
#endif
{
// get host path id
skc_path_h const path = reclaim.aN[reclaim_idx];
// get the path header block from the map
skc_block_id_t id = map[path];
//
// blindly load all of the head elements into registers
//
skc_uint const head_idx = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id();
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
skc_uint h##I = bp_elems[head_idx + I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE];
SKC_PATHS_RECLAIM_BLOCK_EXPAND();
//
// pick out count.nodes and count.prims from the header
//
skc_uint count_blocks, count_nodes;
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (SKC_PATHS_RECLAIM_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_BLOCKS,I)) { \
count_blocks = SKC_BROADCAST(h##I,SKC_PATH_HEAD_OFFSET_BLOCKS,I); \
} \
if (SKC_PATHS_RECLAIM_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_NODES,I)) { \
count_nodes = SKC_BROADCAST(h##I,SKC_PATH_HEAD_OFFSET_NODES,I); \
}
SKC_PATHS_RECLAIM_BLOCK_EXPAND();
#if 0
if (get_sub_group_local_id() == 0) {
printf("reclaim paths: %9u / %5u / %5u\n",path,count_blocks,count_nodes);
}
#endif
//
// acquire a span in the block pool ids ring for reclaimed ids
//
// FIXME count_blocks and atomic add can be done in same lane
//
skc_uint bp_ids_base = 0;
if (get_sub_group_local_id() == 0) {
bp_ids_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP(bp_atomics+SKC_BP_ATOMIC_OFFSET_WRITES,count_blocks);
#if 0
printf("paths: bp_ids_base = %u\n",bp_ids_base);
#endif
}
bp_ids_base = sub_group_broadcast(bp_ids_base,0);
//
// shift away the tagged block id's tag
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) { \
h##I = h##I >> SKC_TAGGED_BLOCK_ID_BITS_TAG; \
}
SKC_PATHS_RECLAIM_BLOCK_EXPAND();
//
// swap current id with next
//
if (get_sub_group_local_id() == SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1)
{
skc_block_id_t const next = SKC_CONCAT(h,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST);
SKC_CONCAT(h,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST) = id;
id = next;
}
//
// - we'll skip subgroups that are entirely header
//
// - but we need to mark any header elements that partially fill
// a subgroup as invalid tagged block ids
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) { \
if (SKC_PATHS_RECLAIM_PARTIALLY_HEADER(I)) { \
if (SKC_PATHS_RECLAIM_IS_HEADER(I)) { \
h##I = SKC_TAGGED_BLOCK_ID_INVALID; \
} \
} \
}
SKC_PATHS_RECLAIM_BLOCK_EXPAND();
{
//
// count reclaimable blocks in each lane
//
SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 );
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) { \
packed_count |= SKC_PATHS_RECLAIM_PACKED_COUNT_IS_BLOCK(h##I,I); \
}
SKC_PATHS_RECLAIM_BLOCK_EXPAND();
//
// scan to find index of each block
//
SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 );
SKC_PATHS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count);
//
// store blocks back to ring
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) { \
skc_uint const index = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \
skc_uint const count = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \
skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask; \
if (count > 0) { \
bp_ids[bp_ids_idx] = h##I; \
} \
skc_uint const total = index + count; \
bp_ids_base += sub_group_broadcast(total,SKC_PATHS_RECLAIM_SUBGROUP_SIZE-1); \
}
SKC_PATHS_RECLAIM_BLOCK_EXPAND();
// printf("P %7u ! %u\n",bp_ids_idx,h##I);
}
//
// we're done if it was just the header
//
if (count_nodes == 0)
return;
//
// otherwise, walk the nodes
//
do {
// id of next block is in last lane
id = sub_group_broadcast(id,SKC_PATHS_RECLAIM_SUBGROUP_SIZE-1);
// get index of each element
skc_uint const node_idx = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id();
//
// blindly load all of the node elements into registers
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
skc_uint n##I = bp_elems[node_idx + I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE];
SKC_PATHS_RECLAIM_BLOCK_EXPAND();
//
// shift away the tagged block id's tag
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
n##I = n##I >> SKC_TAGGED_BLOCK_ID_BITS_TAG;
SKC_PATHS_RECLAIM_BLOCK_EXPAND();
//
// swap current id with next
//
if (get_sub_group_local_id() == SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1)
{
skc_block_id_t const next = SKC_CONCAT(n,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST);
SKC_CONCAT(n,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST) = id;
id = next;
}
//
// count reclaimable blocks in each lane
//
SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 );
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
packed_count |= SKC_PATHS_RECLAIM_PACKED_COUNT_IS_BLOCK(n##I,I);
SKC_PATHS_RECLAIM_BLOCK_EXPAND();
//
// scan to find index of each block
//
SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 );
SKC_PATHS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count);
//
// store blocks back to ring
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) { \
skc_uint const index = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \
skc_uint const count = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \
skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask; \
if (count > 0) { \
bp_ids[bp_ids_idx] = n##I; \
} \
skc_uint const total = index + count; \
bp_ids_base += sub_group_broadcast(total,SKC_PATHS_RECLAIM_SUBGROUP_SIZE-1); \
}
SKC_PATHS_RECLAIM_BLOCK_EXPAND();
// printf("P %7u ! %u\n",bp_ids_idx,n##I);
// any more nodes?
} while (--count_nodes > 0);
}
}
//
//
//
/*
* Copyright 2017 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can
* be found in the LICENSE file.
*
*/
//
// FIXME -- a pre-allocation step could load the path header quads and
// total up the number of blocks in the workgroup or subgroup
// minimizing the number of later atomics adds.
//
#include "block.h"
#include "path.h"
#include "common.h"
#include "atomic_cl.h"
#include "block_pool_cl.h"
#include "kernel_cl_12.h"
//
//
//
#define SKC_PATHS_RECLAIM_SUBGROUP_SIZE_MASK (SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1)
#define SKC_PATHS_RECLAIM_SUBGROUP_ELEMS (SKC_PATHS_RECLAIM_SUBGROUP_SIZE * SKC_PATHS_RECLAIM_LOCAL_ELEMS)
#define SKC_PATHS_RECLAIM_X (SKC_DEVICE_BLOCK_WORDS / SKC_PATHS_RECLAIM_SUBGROUP_ELEMS)
//
//
//
#if ( SKC_PATHS_RECLAIM_X == 1 )
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_1()
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 0
#elif ( SKC_PATHS_RECLAIM_X == 2 )
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_2()
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 1
#elif ( SKC_PATHS_RECLAIM_X == 4 )
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_4()
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 3
#elif ( SKC_PATHS_RECLAIM_X == 8 )
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_8()
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 7
#elif ( SKC_PATHS_RECLAIM_X == 16)
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_16()
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 15
#else
#error "MISSING SKC_PATHS_RECLAIM_X"
#endif
//
// FIXME -- slate these for replacement
//
#define SKC_BROADCAST(E,S,I) \
sub_group_broadcast(E,S - I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE)
#define SKC_BROADCAST_LAST_HELPER(E,I) \
sub_group_broadcast(E,SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1)
#define SKC_BROADCAST_LAST(E,I) \
SKC_BROADCAST_LAST_HELPER(E,I)
//
// COMPILE-TIME PREDICATES
//
#define SKC_PATHS_RECLAIM_ELEM_GTE(X,I) \
SKC_GTE_MACRO(X,(I+1) * SKC_PATHS_RECLAIM_SUBGROUP_SIZE)
#define SKC_PATHS_RECLAIM_ELEM_IN_RANGE(X,I) \
(skc_bool)SKC_GTE_MACRO(X, I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE) && \
(skc_bool)SKC_LT_MACRO(X,(I+1) * SKC_PATHS_RECLAIM_SUBGROUP_SIZE)
#define SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I) \
SKC_PATHS_RECLAIM_ELEM_GTE(SKC_PATH_HEAD_WORDS,I)
#define SKC_PATHS_RECLAIM_PARTIALLY_HEADER(I) \
SKC_PATHS_RECLAIM_ELEM_IN_RANGE(SKC_PATH_HEAD_WORDS,I)
//
// RUN-TIME PREDICATES
//
#define SKC_PATHS_RECLAIM_IS_HEADER(I) \
(get_sub_group_local_id() + I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE < SKC_PATH_HEAD_WORDS)
//
// FIXME -- THIS BITFIELD SCAN APPROACH CAN BE PARAMETERIZED FOR ALL
// POSSIBLE PRACTICAL POWER-OF-TWO SUBGROUP AND SUBBLOCKS-PER-BLOCK
// COMBOS (NOT NECESSARILY POW2)
//
// FOR WIDER SUBGROUPS WITH BIG BLOCKS, WE WILL WANT TO USE A VECTOR
// UINT TYPE INSTEAD OF A ULONG.
//
#define SKC_PATHS_RECLAIM_PACKED_COUNT_BITS SKC_PATHS_RECLAIM_SUBGROUP_SIZE_LOG2
#define SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE skc_uint
//
//
//
#define SKC_PATHS_RECLAIM_PACKED_COUNT_MASK SKC_BITS_TO_MASK(SKC_PATHS_RECLAIM_PACKED_COUNT_BITS)
#define SKC_PATHS_RECLAIM_PACKED_COUNT_IS_BLOCK(E,I) \
(((E) & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) \
? 0 : (1u << SKC_PATHS_RECLAIM_PACKED_COUNT_BITS * I))
#define SKC_PATHS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(S,C) \
S = sub_group_scan_exclusive_add(C)
#define SKC_PATHS_RECLAIM_PACKED_COUNT_GET(C,I) \
(((C) >> (SKC_PATHS_RECLAIM_PACKED_COUNT_BITS * I)) & SKC_PATHS_RECLAIM_PACKED_COUNT_MASK)
//
//
//
struct skc_reclaim
{
skc_path_h aN[SKC_RECLAIM_ARRAY_SIZE];
};
__kernel
SKC_PATHS_RECLAIM_KERNEL_ATTRIBS
void
skc_kernel_paths_reclaim(__global skc_block_id_t * const bp_ids, // block pool ids ring
__global skc_uint * const bp_elems, // block pool blocks
__global skc_uint volatile * const bp_atomics, // read/write atomics
skc_uint const bp_mask, // pow2 modulo mask for block pool ring
__global skc_block_id_t const * const map, // path host-to-device map
struct skc_reclaim const reclaim) // array of host path ids
{
#if (__OPENCL_VERSION__ < 200)
skc_uint const reclaim_stride = get_num_sub_groups();
#else
skc_uint const reclaim_stride = get_enqueued_num_sub_groups(); // 2.0 supports non-uniform workgroups
#endif
skc_uint reclaim_idx = get_group_id(0) * reclaim_stride + get_sub_group_id();
#if 0
//
// NOTE -- FOR NOW, THIS KERNEL ALWAYS LAUNCHES FIXED SIZE GRIDS BUT
// WE MIGHT WANT TO HAVE THE GRID LIMIT ITSELF TO A FRACTIONAL
// MULTIPROCESSOR IN ORDER TO MINIMIZE THE IMPACT OF A LARGE
// RECLAMATION JOB ON THE REST OF THE PIPELINE.
//
for (; reclaim_idx < SKC_RECLAIM_ARRAY_SIZE; reclaim_idx+=reclaim_stride)
#endif
{
// get host path id
skc_path_h const path = reclaim.aN[reclaim_idx];
// get the path header block from the map
skc_block_id_t id = map[path];
//
// blindly load all of the head elements into registers
//
skc_uint const head_idx = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id();
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
skc_uint h##I = bp_elems[head_idx + I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE];
SKC_PATHS_RECLAIM_BLOCK_EXPAND();
//
// pick out count.nodes and count.prims from the header
//
skc_uint count_blocks, count_nodes;
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (SKC_PATHS_RECLAIM_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_BLOCKS,I)) { \
count_blocks = SKC_BROADCAST(h##I,SKC_PATH_HEAD_OFFSET_BLOCKS,I); \
} \
if (SKC_PATHS_RECLAIM_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_NODES,I)) { \
count_nodes = SKC_BROADCAST(h##I,SKC_PATH_HEAD_OFFSET_NODES,I); \
}
SKC_PATHS_RECLAIM_BLOCK_EXPAND();
#if 0
if (get_sub_group_local_id() == 0) {
printf("reclaim paths: %9u / %5u / %5u\n",path,count_blocks,count_nodes);
}
#endif
//
// acquire a span in the block pool ids ring for reclaimed ids
//
// FIXME count_blocks and atomic add can be done in same lane
//
skc_uint bp_ids_base = 0;
if (get_sub_group_local_id() == 0) {
bp_ids_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP(bp_atomics+SKC_BP_ATOMIC_OFFSET_WRITES,count_blocks);
#if 0
printf("paths: bp_ids_base = %u\n",bp_ids_base);
#endif
}
bp_ids_base = sub_group_broadcast(bp_ids_base,0);
//
// shift away the tagged block id's tag
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) { \
h##I = h##I >> SKC_TAGGED_BLOCK_ID_BITS_TAG; \
}
SKC_PATHS_RECLAIM_BLOCK_EXPAND();
//
// swap current id with next
//
if (get_sub_group_local_id() == SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1)
{
skc_block_id_t const next = SKC_CONCAT(h,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST);
SKC_CONCAT(h,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST) = id;
id = next;
}
//
// - we'll skip subgroups that are entirely header
//
// - but we need to mark any header elements that partially fill
// a subgroup as invalid tagged block ids
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) { \
if (SKC_PATHS_RECLAIM_PARTIALLY_HEADER(I)) { \
if (SKC_PATHS_RECLAIM_IS_HEADER(I)) { \
h##I = SKC_TAGGED_BLOCK_ID_INVALID; \
} \
} \
}
SKC_PATHS_RECLAIM_BLOCK_EXPAND();
{
//
// count reclaimable blocks in each lane
//
SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 );
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) { \
packed_count |= SKC_PATHS_RECLAIM_PACKED_COUNT_IS_BLOCK(h##I,I); \
}
SKC_PATHS_RECLAIM_BLOCK_EXPAND();
//
// scan to find index of each block
//
SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 );
SKC_PATHS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count);
//
// store blocks back to ring
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) { \
skc_uint const index = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \
skc_uint const count = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \
skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask; \
if (count > 0) { \
bp_ids[bp_ids_idx] = h##I; \
} \
skc_uint const total = index + count; \
bp_ids_base += sub_group_broadcast(total,SKC_PATHS_RECLAIM_SUBGROUP_SIZE-1); \
}
SKC_PATHS_RECLAIM_BLOCK_EXPAND();
// printf("P %7u ! %u\n",bp_ids_idx,h##I);
}
//
// we're done if it was just the header
//
if (count_nodes == 0)
return;
//
// otherwise, walk the nodes
//
do {
// id of next block is in last lane
id = sub_group_broadcast(id,SKC_PATHS_RECLAIM_SUBGROUP_SIZE-1);
// get index of each element
skc_uint const node_idx = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id();
//
// blindly load all of the node elements into registers
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
skc_uint n##I = bp_elems[node_idx + I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE];
SKC_PATHS_RECLAIM_BLOCK_EXPAND();
//
// shift away the tagged block id's tag
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
n##I = n##I >> SKC_TAGGED_BLOCK_ID_BITS_TAG;
SKC_PATHS_RECLAIM_BLOCK_EXPAND();
//
// swap current id with next
//
if (get_sub_group_local_id() == SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1)
{
skc_block_id_t const next = SKC_CONCAT(n,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST);
SKC_CONCAT(n,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST) = id;
id = next;
}
//
// count reclaimable blocks in each lane
//
SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 );
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
packed_count |= SKC_PATHS_RECLAIM_PACKED_COUNT_IS_BLOCK(n##I,I);
SKC_PATHS_RECLAIM_BLOCK_EXPAND();
//
// scan to find index of each block
//
SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 );
SKC_PATHS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count);
//
// store blocks back to ring
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) { \
skc_uint const index = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \
skc_uint const count = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \
skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask; \
if (count > 0) { \
bp_ids[bp_ids_idx] = n##I; \
} \
skc_uint const total = index + count; \
bp_ids_base += sub_group_broadcast(total,SKC_PATHS_RECLAIM_SUBGROUP_SIZE-1); \
}
SKC_PATHS_RECLAIM_BLOCK_EXPAND();
// printf("P %7u ! %u\n",bp_ids_idx,n##I);
// any more nodes?
} while (--count_nodes > 0);
}
}
//
//
//

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -1,144 +1,144 @@
/*
* Copyright 2017 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can
* be found in the LICENSE file.
*
*/
//
//
//
#include "tile.h"
#include "raster.h"
#include "atomic_cl.h"
#include "block_pool_cl.h"
#include "raster_builder_cl_12.h"
#include "device_cl_12.h"
//
// There is a fixed-size meta table per raster cohort that we use to
// perform a mostly coalesced sizing and allocation of blocks.
//
// This code is simple and fast.
//
__kernel
SKC_RASTERS_ALLOC_KERNEL_ATTRIBS
void
skc_kernel_rasters_alloc(__global SKC_ATOMIC_UINT volatile * const bp_atomics,
__global skc_block_id_t const * const bp_ids,
skc_uint const bp_mask, // pow2 modulo mask for block pool ring
__global skc_block_id_t * const map,
__global skc_uint * const metas,
__global skc_uint const * const raster_ids, // FIXME -- CONSTANT
skc_uint const count)
{
// access to the meta extent is linear
skc_uint const gid = get_global_id(0);
skc_bool const is_active = gid < count;
//
// init with defaults for all lanes
//
union skc_raster_cohort_meta_inout meta = { .in.u32v4 = { 0, 0, 0, 0 } };
skc_uint raster_id = SKC_UINT_MAX;
skc_uint extra_blocks = 0;
if (is_active)
{
// load meta_in
meta.in.u32v4 = vload4(gid,metas);
// load raster_id as early as possible
raster_id = raster_ids[gid];
#if 0
printf("%3u + %5u, %5u, %5u, %5u\n",
gid,
meta.in.blocks,
meta.in.offset,
meta.in.pk,
meta.in.rk);
#endif
// how many blocks will the ttpb blocks consume?
extra_blocks = ((meta.in.pk * SKC_TILE_RATIO + SKC_DEVICE_SUBBLOCKS_PER_BLOCK - SKC_TILE_RATIO) /
SKC_DEVICE_SUBBLOCKS_PER_BLOCK);
// total keys
meta.out.keys += meta.in.pk;
// how many blocks do we need to store the keys in the head and trailing nodes?
skc_uint const hn = ((SKC_RASTER_HEAD_DWORDS + meta.out.keys + SKC_RASTER_NODE_DWORDS - 2) /
(SKC_RASTER_NODE_DWORDS - 1));
// increment blocks
extra_blocks += hn;
// how many nodes trail the head?
meta.out.nodes = hn - 1;
// update blocks
meta.out.blocks += extra_blocks;
#if 0
printf("%3u - %5u, %5u, %5u, %5u\n",
gid,
meta.out.blocks,
meta.out.offset,
meta.out.nodes,
meta.out.keys);
#endif
}
//
// allocate blocks from block pool
//
// first perform a prefix sum on the subgroup to reduce atomic
// operation traffic
//
// note this idiom can be implemented with vectors, subgroups or
// workgroups
//
skc_uint const prefix = SKC_RASTERS_ALLOC_INCLUSIVE_ADD(extra_blocks);
skc_uint reads = 0;
// last lane performs the block pool allocation with an atomic increment
if (SKC_RASTERS_ALLOC_LOCAL_ID() == SKC_RASTERS_ALLOC_GROUP_SIZE - 1) {
reads = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(bp_atomics+SKC_BP_ATOMIC_OFFSET_READS,prefix); // ring_reads
}
// broadcast block pool base to all lanes
reads = SKC_RASTERS_ALLOC_BROADCAST(reads,SKC_RASTERS_ALLOC_GROUP_SIZE - 1);
// update base for each lane
reads += prefix - extra_blocks;
//
// store meta header
//
if (is_active)
{
// store headers back to meta extent
vstore4(meta.out.u32v4,gid,metas);
// store reads
metas[SKC_RASTER_COHORT_META_OFFSET_READS + gid] = reads;
// get block_id of each raster head
skc_block_id_t const block_id = bp_ids[reads & bp_mask];
// update map
map[raster_id] = block_id;
#if 0
printf("alloc: %u / %u\n",raster_id,block_id);
#endif
}
}
//
//
//
/*
* Copyright 2017 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can
* be found in the LICENSE file.
*
*/
//
//
//
#include "tile.h"
#include "raster.h"
#include "atomic_cl.h"
#include "block_pool_cl.h"
#include "raster_builder_cl_12.h"
#include "kernel_cl_12.h"
//
// There is a fixed-size meta table per raster cohort that we use to
// perform a mostly coalesced sizing and allocation of blocks.
//
// This code is simple and fast.
//
__kernel
SKC_RASTERS_ALLOC_KERNEL_ATTRIBS
void
skc_kernel_rasters_alloc(__global SKC_ATOMIC_UINT volatile * const bp_atomics,
__global skc_block_id_t const * const bp_ids,
skc_uint const bp_mask, // pow2 modulo mask for block pool ring
__global skc_block_id_t * const map,
__global skc_uint * const metas,
__global skc_uint const * const raster_ids, // FIXME -- CONSTANT
skc_uint const count)
{
// access to the meta extent is linear
skc_uint const gid = get_global_id(0);
skc_bool const is_active = gid < count;
//
// init with defaults for all lanes
//
union skc_raster_cohort_meta_inout meta = { .in.u32v4 = { 0, 0, 0, 0 } };
skc_uint raster_id = SKC_UINT_MAX;
skc_uint extra_blocks = 0;
if (is_active)
{
// load meta_in
meta.in.u32v4 = vload4(gid,metas);
// load raster_id as early as possible
raster_id = raster_ids[gid];
#if 0
printf("%3u + %5u, %5u, %5u, %5u\n",
gid,
meta.in.blocks,
meta.in.offset,
meta.in.pk,
meta.in.rk);
#endif
// how many blocks will the ttpb blocks consume?
extra_blocks = ((meta.in.pk * SKC_TILE_RATIO + SKC_DEVICE_SUBBLOCKS_PER_BLOCK - SKC_TILE_RATIO) /
SKC_DEVICE_SUBBLOCKS_PER_BLOCK);
// total keys
meta.out.keys += meta.in.pk;
// how many blocks do we need to store the keys in the head and trailing nodes?
skc_uint const hn = ((SKC_RASTER_HEAD_DWORDS + meta.out.keys + SKC_RASTER_NODE_DWORDS - 2) /
(SKC_RASTER_NODE_DWORDS - 1));
// increment blocks
extra_blocks += hn;
// how many nodes trail the head?
meta.out.nodes = hn - 1;
// update blocks
meta.out.blocks += extra_blocks;
#if 0
printf("%3u - %5u, %5u, %5u, %5u\n",
gid,
meta.out.blocks,
meta.out.offset,
meta.out.nodes,
meta.out.keys);
#endif
}
//
// allocate blocks from block pool
//
// first perform a prefix sum on the subgroup to reduce atomic
// operation traffic
//
// note this idiom can be implemented with vectors, subgroups or
// workgroups
//
skc_uint const prefix = SKC_RASTERS_ALLOC_INCLUSIVE_ADD(extra_blocks);
skc_uint reads = 0;
// last lane performs the block pool allocation with an atomic increment
if (SKC_RASTERS_ALLOC_LOCAL_ID() == SKC_RASTERS_ALLOC_GROUP_SIZE - 1) {
reads = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(bp_atomics+SKC_BP_ATOMIC_OFFSET_READS,prefix); // ring_reads
}
// broadcast block pool base to all lanes
reads = SKC_RASTERS_ALLOC_BROADCAST(reads,SKC_RASTERS_ALLOC_GROUP_SIZE - 1);
// update base for each lane
reads += prefix - extra_blocks;
//
// store meta header
//
if (is_active)
{
// store headers back to meta extent
vstore4(meta.out.u32v4,gid,metas);
// store reads
metas[SKC_RASTER_COHORT_META_OFFSET_READS + gid] = reads;
// get block_id of each raster head
skc_block_id_t const block_id = bp_ids[reads & bp_mask];
// update map
map[raster_id] = block_id;
#if 0
printf("alloc: %u / %u\n",raster_id,block_id);
#endif
}
}
//
//
//
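The comment block in skc_kernel_rasters_alloc describes reducing atomic traffic by scanning the per-lane block demand across the subgroup and letting one lane perform a single atomic add. A minimal sketch of that idiom as a standalone kernel, assuming subgroup built-ins (cl_khr_subgroups or cl_intel_subgroups); sketch_subgroup_alloc, pool_next, demand and base_out are illustrative names, not part of the pipeline above:

__kernel void
sketch_subgroup_alloc(__global uint volatile * const pool_next, // running allocation cursor
                      __global uint const    * const demand,    // blocks wanted per work-item
                      __global uint          * const base_out)  // first block granted per work-item
{
  uint const gid  = get_global_id(0);
  uint const want = demand[gid];
  // per-lane running total of requests within the subgroup
  uint const sum  = sub_group_scan_inclusive_add(want);
  // the last lane performs the only atomic for the whole subgroup
  uint base = 0;
  if (get_sub_group_local_id() == get_sub_group_size() - 1)
    base = atomic_add(pool_next,sum);
  // broadcast the subgroup's base back to every lane
  base = sub_group_broadcast(base,get_sub_group_size() - 1);
  // each lane's span starts at the base plus its exclusive prefix
  base_out[gid] = base + sum - want;
}

As the original comment notes, the same idiom can be expressed with vectors or workgroup scans; the subgroup form shown here is just one instantiation.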

View File

@ -1,442 +1,442 @@
/*
* Copyright 2017 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can
* be found in the LICENSE file.
*
*/
//
//
//
#include "tile.h"
#include "block.h"
#include "raster.h"
#include "common.h"
#include "atomic_cl.h"
#include "block_pool_cl.h"
#include "device_cl_12.h"
//
//
//
#define SKC_RASTERS_RECLAIM_SUBGROUP_SIZE_MASK (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1)
#define SKC_RASTERS_RECLAIM_SUBGROUP_WORDS (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE * SKC_RASTERS_RECLAIM_LOCAL_ELEMS)
#define SKC_RASTERS_RECLAIM_X (SKC_DEVICE_BLOCK_DWORDS / SKC_RASTERS_RECLAIM_SUBGROUP_WORDS)
//
//
//
#if ( SKC_RASTERS_RECLAIM_X == 1 )
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_1()
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 0
#elif ( SKC_RASTERS_RECLAIM_X == 2 )
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_2()
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 1
#elif ( SKC_RASTERS_RECLAIM_X == 4 )
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_4()
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 3
#elif ( SKC_RASTERS_RECLAIM_X == 8 )
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_8()
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 7
#elif ( SKC_RASTERS_RECLAIM_X == 16)
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_16()
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 15
#else
#error "MISSING SKC_RASTERS_RECLAIM_X"
#endif
#if ( SKC_PREFIX_SUBGROUP_SIZE == SKC_RASTERS_RECLAIM_SUBGROUP_SIZE )
#define SKC_RASTERS_RECLAIM_STRIDE_H(L) (L)
#define SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) (I * 2 * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
#define SKC_RASTERS_RECLAIM_STRIDE_V_HI(I) (SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) + SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
#elif ( SKC_PREFIX_SUBGROUP_SIZE > SKC_RASTERS_RECLAIM_SUBGROUP_SIZE ) // same as above when ratio equals 1
#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO (SKC_PREFIX_SUBGROUP_SIZE / SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK (SKC_RASTERS_RECLAIM_SUBGROUP_RATIO - 1)
#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_SCALE(I) ((I / SKC_RASTERS_RECLAIM_SUBGROUP_RATIO) * 2 * SKC_RASTERS_RECLAIM_SUBGROUP_RATIO + \
(I & SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK))
#define SKC_RASTERS_RECLAIM_STRIDE_H(L) (L)
#define SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) (SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_SCALE(I) * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
#define SKC_RASTERS_RECLAIM_STRIDE_V_HI(I) (SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) + SKC_RASTERS_RECLAIM_SUBGROUP_RATIO * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
#elif ( SKC_PREFIX_SUBGROUP_SIZE < SKC_RASTERS_RECLAIM_SUBGROUP_SIZE ) // same as above when ratio equals 1
#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE / SKC_PREFIX_SUBGROUP_SIZE)
#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE / SKC_RASTERS_RECLAIM_SUBGROUP_RATIO - 1) // equal to prefix subgroup mask
#define SKC_RASTERS_RECLAIM_STRIDE_H(L) (((L) & ~SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK) * 2 + ((L) & SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK))
#define SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) (I * 2 * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
#define SKC_RASTERS_RECLAIM_STRIDE_V_HI(I) (SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) + SKC_RASTERS_RECLAIM_SUBGROUP_SIZE / SKC_RASTERS_RECLAIM_SUBGROUP_RATIO)
#endif
//
// FIXME -- slate these for replacement
//
#define SKC_BROADCAST(E,S,I) \
sub_group_broadcast(E,S - I * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
#define SKC_BROADCAST_LAST_HELPER(E,I) \
sub_group_broadcast(E,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1)
#define SKC_BROADCAST_LAST(E,I) \
SKC_BROADCAST_LAST_HELPER(E,I)
//
// COMPILE-TIME PREDICATES
//
#define SKC_RASTERS_RECLAIM_ELEM_GTE(X,I) \
SKC_GTE_MACRO(X,(I+1) * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
#define SKC_RASTERS_RECLAIM_ELEM_IN_RANGE(X,I) \
(skc_bool)SKC_GTE_MACRO(X, I * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) && \
(skc_bool)SKC_LT_MACRO(X,(I+1) * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
#define SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I) \
SKC_RASTERS_RECLAIM_ELEM_GTE(SKC_RASTER_HEAD_DWORDS,I)
#define SKC_RASTERS_RECLAIM_PARTIALLY_HEADER(I) \
SKC_RASTERS_RECLAIM_ELEM_IN_RANGE(SKC_RASTER_HEAD_DWORDS,I)
//
// RUN-TIME PREDICATES
//
#define SKC_RASTERS_RECLAIM_IS_HEADER(I) \
(get_sub_group_local_id() + I * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE < SKC_RASTER_HEAD_DWORDS)
//
// FIXME -- THIS BITFIELD SCAN APPROACH CAN BE PARAMETERIZED FOR ALL
// POSSIBLE PRACTICAL POWER-OF-TWO SUBGROUP AND SUBBLOCKS-PER-BLOCK
// COMBOS (NOT NECESSARILY POW2)
//
// FOR WIDER SUBGROUPS WITH BIG BLOCKS, WE WILL WANT TO USE A VECTOR
// UINT TYPE INSTEAD OF A ULONG.
//
#define SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS SKC_RASTERS_RECLAIM_SUBGROUP_SIZE_LOG2
#define SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE skc_uint
//
//
//
#define SKC_RASTERS_RECLAIM_PACKED_COUNT_MASK SKC_BITS_TO_MASK(SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS)
#define SKC_RASTERS_RECLAIM_PACKED_COUNT_IS_BLOCK(E,I) \
(((E) & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) \
? 0 : (1u << SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS * I))
#define SKC_RASTERS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(S,C) \
S = sub_group_scan_exclusive_add(C)
#define SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(C,I) \
(((C) >> (SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS * I)) & SKC_RASTERS_RECLAIM_PACKED_COUNT_MASK)
//
//
//
struct skc_reclaim
{
skc_raster_h aN[SKC_RECLAIM_ARRAY_SIZE];
};
__kernel
SKC_RASTERS_RECLAIM_KERNEL_ATTRIBS
void
skc_kernel_rasters_reclaim(__global skc_block_id_t * const bp_ids, // block pool ids ring
__global skc_uint * const bp_elems, // block pool blocks
__global skc_uint volatile * const bp_atomics, // read/write atomics
skc_uint const bp_mask, // pow2 modulo mask for block pool ring
__global skc_block_id_t const * const map, // raster host-to-device map
struct skc_reclaim const reclaim) // array of host raster ids
{
#if (__OPENCL_VERSION__ < 200)
skc_uint const reclaim_stride = get_num_sub_groups();
#else
skc_uint const reclaim_stride = get_enqueued_num_sub_groups(); // 2.0 supports non-uniform workgroups
#endif
skc_uint reclaim_idx = get_group_id(0) * reclaim_stride + get_sub_group_id();
#if 0
//
// NOTE -- FOR NOW, THIS KERNEL ALWAYS LAUNCHES FIXED SIZE GRIDS BUT
// WE MIGHT WANT TO HAVE THE GRID LIMIT ITSELF TO A FRACTIONAL
// MULTIPROCESSOR IN ORDER TO MINIMIZE THE IMPACT OF A LARGE
// RECLAMATION JOB ON THE REST OF THE PIPELINE.
//
for (; reclaim_idx < SKC_RECLAIM_ARRAY_SIZE; reclaim_idx+=reclaim_stride)
#endif
{
// get host raster id
skc_raster_h const raster = reclaim.aN[reclaim_idx];
// get block id of raster header
skc_block_id_t id = map[raster];
//
// load all of the head block ttxk.lo keys into registers
//
// FIXME -- this pattern lends itself to using the higher
// performance Intel GEN block load instructions
//
skc_uint const head_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_RASTERS_RECLAIM_STRIDE_H(get_sub_group_local_id());
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
skc_uint h##I = bp_elems[head_id + SKC_RASTERS_RECLAIM_STRIDE_V_LO(I)];
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
//
// pick out count.nodes and count.prims from the header
//
// load raster header counts -- we only need the blocks and
// nodes words; the keys are doublewords.
//
// FIXME -- this can be made portable with compile-time macro expansion
//
skc_uint count_blocks = sub_group_broadcast(h0,0); // SKC_RASTER_HEAD_OFFSET_COUNTS_NODES
skc_uint count_nodes = sub_group_broadcast(h0,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_KEYS
#if 0
if (get_sub_group_local_id() == 0) {
printf("reclaim rasters: %u / %u / %5u / %5u\n",raster,id,count_blocks,count_nodes);
}
#endif
//
// acquire a span in the block pool ids ring for reclaimed ids
//
skc_uint bp_ids_base = 0;
if (get_sub_group_local_id() == 0) {
bp_ids_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP(bp_atomics+SKC_BP_ATOMIC_OFFSET_WRITES,count_blocks);
}
bp_ids_base = sub_group_broadcast(bp_ids_base,0);
//
// mask off everything but the block id
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \
h##I = h##I & SKC_TTXK_LO_MASK_ID; \
}
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
//
// swap current id with next
//
if (get_sub_group_local_id() == SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1)
{
skc_block_id_t const next = SKC_CONCAT(h,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST);
SKC_CONCAT(h,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST) = id;
id = next;
#if 0
printf("rasters next = %u\n",id);
#endif
}
#if 0
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
printf("%08X %u\n",h##I,h##I);
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
#endif
#if 0
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \
printf("%08X\n",h##I); \
}
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
#endif
//
// - we'll skip subgroups that are entirely header
//
// - but we need to mark any header elements that partially fill
// a subgroup as subblocks
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \
if (SKC_RASTERS_RECLAIM_PARTIALLY_HEADER(I)) { \
if (SKC_RASTERS_RECLAIM_IS_HEADER(I)) { \
h##I = SKC_UINT_MAX; \
} \
} \
}
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
{
//
// count reclaimable blocks in each lane
//
SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 );
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \
packed_count |= SKC_RASTERS_RECLAIM_PACKED_COUNT_IS_BLOCK(h##I,I); \
}
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
//
// scan to find index of each block
//
SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 );
SKC_RASTERS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count);
//
// store blocks back to ring
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \
skc_uint const index = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \
skc_uint const count = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \
skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask; \
if (count > 0) { \
bp_ids[bp_ids_idx] = h##I; \
} \
skc_uint const total = index + count; \
bp_ids_base += sub_group_broadcast(total,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE-1); \
}
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
}
// printf("R %7u ! %u\n",bp_ids_idx,h##I);
//
// we're done if it was just the header
//
if (count_nodes == 0)
return;
//
// otherwise, walk the nodes
//
do {
// id of next block is in last lane
id = sub_group_broadcast(id,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE-1);
//
// load all of the node block ttxk.lo keys into registers
//
// FIXME -- this pattern lends itself to using the higher
// performance Intel GEN block load instructions
//
skc_uint const node_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_RASTERS_RECLAIM_STRIDE_H(get_sub_group_local_id());
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
skc_uint n##I = bp_elems[node_id + SKC_RASTERS_RECLAIM_STRIDE_V_LO(I)];
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
//
// mask off everything but the block id
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
n##I = n##I & SKC_TTXK_LO_MASK_ID;
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
//
// swap current id with next
//
if (get_sub_group_local_id() == SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1)
{
skc_block_id_t const next = SKC_CONCAT(n,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST);
SKC_CONCAT(n,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST) = id;
id = next;
#if 0
printf("rasters next = %u\n",id);
#endif
}
#if 0
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
printf("%08X %u\n",n##I,n##I);
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
#endif
//
// count reclaimable blocks in each lane
//
SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 );
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
packed_count |= SKC_RASTERS_RECLAIM_PACKED_COUNT_IS_BLOCK(n##I,I);
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
//
// scan to find index of each block
//
SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 );
SKC_RASTERS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count);
//
// store blocks back to ring
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) { \
skc_uint const index = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \
skc_uint const count = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \
skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask; \
if (count > 0) { \
bp_ids[bp_ids_idx] = n##I; \
} \
skc_uint const total = index + count; \
bp_ids_base += sub_group_broadcast(total,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE-1); \
}
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
// printf("R %7u ! %u\n",bp_ids_idx,n##I);
// any more nodes?
} while (--count_nodes > 0);
}
}
//
//
//
/*
* Copyright 2017 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can
* be found in the LICENSE file.
*
*/
//
//
//
#include "tile.h"
#include "block.h"
#include "raster.h"
#include "common.h"
#include "atomic_cl.h"
#include "block_pool_cl.h"
#include "kernel_cl_12.h"
//
//
//
#define SKC_RASTERS_RECLAIM_SUBGROUP_SIZE_MASK (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1)
#define SKC_RASTERS_RECLAIM_SUBGROUP_WORDS (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE * SKC_RASTERS_RECLAIM_LOCAL_ELEMS)
#define SKC_RASTERS_RECLAIM_X (SKC_DEVICE_BLOCK_DWORDS / SKC_RASTERS_RECLAIM_SUBGROUP_WORDS)
//
//
//
#if ( SKC_RASTERS_RECLAIM_X == 1 )
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_1()
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 0
#elif ( SKC_RASTERS_RECLAIM_X == 2 )
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_2()
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 1
#elif ( SKC_RASTERS_RECLAIM_X == 4 )
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_4()
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 3
#elif ( SKC_RASTERS_RECLAIM_X == 8 )
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_8()
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 7
#elif ( SKC_RASTERS_RECLAIM_X == 16)
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_16()
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 15
#else
#error "MISSING SKC_RASTERS_RECLAIM_X"
#endif
#if ( SKC_PREFIX_SUBGROUP_SIZE == SKC_RASTERS_RECLAIM_SUBGROUP_SIZE )
#define SKC_RASTERS_RECLAIM_STRIDE_H(L) (L)
#define SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) (I * 2 * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
#define SKC_RASTERS_RECLAIM_STRIDE_V_HI(I) (SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) + SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
#elif ( SKC_PREFIX_SUBGROUP_SIZE > SKC_RASTERS_RECLAIM_SUBGROUP_SIZE ) // same as above when ratio equals 1
#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO (SKC_PREFIX_SUBGROUP_SIZE / SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK (SKC_RASTERS_RECLAIM_SUBGROUP_RATIO - 1)
#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_SCALE(I) ((I / SKC_RASTERS_RECLAIM_SUBGROUP_RATIO) * 2 * SKC_RASTERS_RECLAIM_SUBGROUP_RATIO + \
(I & SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK))
#define SKC_RASTERS_RECLAIM_STRIDE_H(L) (L)
#define SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) (SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_SCALE(I) * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
#define SKC_RASTERS_RECLAIM_STRIDE_V_HI(I) (SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) + SKC_RASTERS_RECLAIM_SUBGROUP_RATIO * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
#elif ( SKC_PREFIX_SUBGROUP_SIZE < SKC_RASTERS_RECLAIM_SUBGROUP_SIZE ) // same as above when ratio equals 1
#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE / SKC_PREFIX_SUBGROUP_SIZE)
#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE / SKC_RASTERS_RECLAIM_SUBGROUP_RATIO - 1) // equal to prefix subgroup mask
#define SKC_RASTERS_RECLAIM_STRIDE_H(L) (((L) & ~SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK) * 2 + ((L) & SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK))
#define SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) (I * 2 * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
#define SKC_RASTERS_RECLAIM_STRIDE_V_HI(I) (SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) + SKC_RASTERS_RECLAIM_SUBGROUP_SIZE / SKC_RASTERS_RECLAIM_SUBGROUP_RATIO)
#endif
//
// FIXME -- slate these for replacement
//
#define SKC_BROADCAST(E,S,I) \
sub_group_broadcast(E,S - I * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
#define SKC_BROADCAST_LAST_HELPER(E,I) \
sub_group_broadcast(E,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1)
#define SKC_BROADCAST_LAST(E,I) \
SKC_BROADCAST_LAST_HELPER(E,I)
//
// COMPILE-TIME PREDICATES
//
#define SKC_RASTERS_RECLAIM_ELEM_GTE(X,I) \
SKC_GTE_MACRO(X,(I+1) * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
#define SKC_RASTERS_RECLAIM_ELEM_IN_RANGE(X,I) \
(skc_bool)SKC_GTE_MACRO(X, I * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) && \
(skc_bool)SKC_LT_MACRO(X,(I+1) * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
#define SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I) \
SKC_RASTERS_RECLAIM_ELEM_GTE(SKC_RASTER_HEAD_DWORDS,I)
#define SKC_RASTERS_RECLAIM_PARTIALLY_HEADER(I) \
SKC_RASTERS_RECLAIM_ELEM_IN_RANGE(SKC_RASTER_HEAD_DWORDS,I)
//
// RUN-TIME PREDICATES
//
#define SKC_RASTERS_RECLAIM_IS_HEADER(I) \
(get_sub_group_local_id() + I * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE < SKC_RASTER_HEAD_DWORDS)
//
// FIXME -- THIS BITFIELD SCAN APPROACH CAN BE PARAMETERIZED FOR ALL
// POSSIBLE PRACTICAL POWER-OF-TWO SUBGROUP AND SUBBLOCKS-PER-BLOCK
// COMBOS (NOT NECESSARILY POW2)
//
// FOR WIDER SUBGROUPS WITH BIG BLOCKS, WE WILL WANT TO USE A VECTOR
// UINT TYPE INSTEAD OF A ULONG.
//
#define SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS SKC_RASTERS_RECLAIM_SUBGROUP_SIZE_LOG2
#define SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE skc_uint
//
//
//
#define SKC_RASTERS_RECLAIM_PACKED_COUNT_MASK SKC_BITS_TO_MASK(SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS)
#define SKC_RASTERS_RECLAIM_PACKED_COUNT_IS_BLOCK(E,I) \
(((E) & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) \
? 0 : (1u << SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS * I))
#define SKC_RASTERS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(S,C) \
S = sub_group_scan_exclusive_add(C)
#define SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(C,I) \
(((C) >> (SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS * I)) & SKC_RASTERS_RECLAIM_PACKED_COUNT_MASK)
//
//
//
struct skc_reclaim
{
skc_raster_h aN[SKC_RECLAIM_ARRAY_SIZE];
};
__kernel
SKC_RASTERS_RECLAIM_KERNEL_ATTRIBS
void
skc_kernel_rasters_reclaim(__global skc_block_id_t * const bp_ids, // block pool ids ring
__global skc_uint * const bp_elems, // block pool blocks
__global skc_uint volatile * const bp_atomics, // read/write atomics
skc_uint const bp_mask, // pow2 modulo mask for block pool ring
__global skc_block_id_t const * const map, // raster host-to-device map
struct skc_reclaim const reclaim) // array of host raster ids
{
#if (__OPENCL_VERSION__ < 200)
skc_uint const reclaim_stride = get_num_sub_groups();
#else
skc_uint const reclaim_stride = get_enqueued_num_sub_groups(); // 2.0 supports non-uniform workgroups
#endif
skc_uint reclaim_idx = get_group_id(0) * reclaim_stride + get_sub_group_id();
#if 0
//
// NOTE -- FOR NOW, THIS KERNEL ALWAYS LAUNCHES FIXED SIZE GRIDS BUT
// WE MIGHT WANT TO HAVE THE GRID LIMIT ITSELF TO A FRACTIONAL
// MULTIPROCESSOR IN ORDER TO MINIMIZE THE IMPACT OF A LARGE
// RECLAMATION JOB ON THE REST OF THE PIPELINE.
//
for (; reclaim_idx < SKC_RECLAIM_ARRAY_SIZE; reclaim_idx+=reclaim_stride)
#endif
{
// get host raster id
skc_raster_h const raster = reclaim.aN[reclaim_idx];
// get block id of raster header
skc_block_id_t id = map[raster];
//
// load all of the head block ttxk.lo keys into registers
//
// FIXME -- this pattern lends itself to using the higher
// performance Intel GEN block load instructions
//
skc_uint const head_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_RASTERS_RECLAIM_STRIDE_H(get_sub_group_local_id());
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
skc_uint h##I = bp_elems[head_id + SKC_RASTERS_RECLAIM_STRIDE_V_LO(I)];
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
//
// pick out count.nodes and count.prims from the header
//
// load raster header counts -- we only need the blocks and
// nodes words; the keys are doublewords.
//
// FIXME -- this can be made portable with compile-time macro expansion
//
skc_uint count_blocks = sub_group_broadcast(h0,0); // SKC_RASTER_HEAD_OFFSET_COUNTS_NODES
skc_uint count_nodes = sub_group_broadcast(h0,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_KEYS
#if 0
if (get_sub_group_local_id() == 0) {
printf("reclaim rasters: %u / %u / %5u / %5u\n",raster,id,count_blocks,count_nodes);
}
#endif
//
// acquire a span in the block pool ids ring for reclaimed ids
//
skc_uint bp_ids_base = 0;
if (get_sub_group_local_id() == 0) {
bp_ids_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP(bp_atomics+SKC_BP_ATOMIC_OFFSET_WRITES,count_blocks);
}
bp_ids_base = sub_group_broadcast(bp_ids_base,0);
//
// mask off everything but the block id
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \
h##I = h##I & SKC_TTXK_LO_MASK_ID; \
}
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
//
// swap current id with next
//
if (get_sub_group_local_id() == SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1)
{
skc_block_id_t const next = SKC_CONCAT(h,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST);
SKC_CONCAT(h,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST) = id;
id = next;
#if 0
printf("rasters next = %u\n",id);
#endif
}
#if 0
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
printf("%08X %u\n",h##I,h##I);
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
#endif
#if 0
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \
printf("%08X\n",h##I); \
}
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
#endif
//
// - we'll skip subgroups that are entirely header
//
// - but we need to mark any header elements that partially fill
// a subgroup as subblocks
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \
if (SKC_RASTERS_RECLAIM_PARTIALLY_HEADER(I)) { \
if (SKC_RASTERS_RECLAIM_IS_HEADER(I)) { \
h##I = SKC_UINT_MAX; \
} \
} \
}
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
{
//
// count reclaimable blocks in each lane
//
SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 );
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \
packed_count |= SKC_RASTERS_RECLAIM_PACKED_COUNT_IS_BLOCK(h##I,I); \
}
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
//
// scan to find index of each block
//
SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 );
SKC_RASTERS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count);
//
// store blocks back to ring
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \
skc_uint const index = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \
skc_uint const count = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \
skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask; \
if (count > 0) { \
bp_ids[bp_ids_idx] = h##I; \
} \
skc_uint const total = index + count; \
bp_ids_base += sub_group_broadcast(total,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE-1); \
}
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
}
// printf("R %7u ! %u\n",bp_ids_idx,h##I);
//
// we're done if it was just the header
//
if (count_nodes == 0)
return;
//
// otherwise, walk the nodes
//
do {
// id of next block is in last lane
id = sub_group_broadcast(id,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE-1);
//
// load all of the node block ttxk.lo keys into registers
//
// FIXME -- this pattern lends itself to using the higher
// performance Intel GEN block load instructions
//
skc_uint const node_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_RASTERS_RECLAIM_STRIDE_H(get_sub_group_local_id());
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
skc_uint n##I = bp_elems[node_id + SKC_RASTERS_RECLAIM_STRIDE_V_LO(I)];
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
//
// mask off everything but the block id
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
n##I = n##I & SKC_TTXK_LO_MASK_ID;
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
//
// swap current id with next
//
if (get_sub_group_local_id() == SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1)
{
skc_block_id_t const next = SKC_CONCAT(n,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST);
SKC_CONCAT(n,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST) = id;
id = next;
#if 0
printf("rasters next = %u\n",id);
#endif
}
#if 0
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
printf("%08X %u\n",n##I,n##I);
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
#endif
//
// count reclaimable blocks in each lane
//
SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 );
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
packed_count |= SKC_RASTERS_RECLAIM_PACKED_COUNT_IS_BLOCK(n##I,I);
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
//
// scan to find index of each block
//
SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 );
SKC_RASTERS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count);
//
// store blocks back to ring
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) { \
skc_uint const index = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \
skc_uint const count = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \
skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask; \
if (count > 0) { \
bp_ids[bp_ids_idx] = n##I; \
} \
skc_uint const total = index + count; \
bp_ids_base += sub_group_broadcast(total,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE-1); \
}
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
// printf("R %7u ! %u\n",bp_ids_idx,n##I);
// any more nodes?
} while (--count_nodes > 0);
}
}
//
//
//
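The SKC_RASTERS_RECLAIM_PACKED_COUNT_* machinery above packs one small counter per register row into a single uint, SKC_RASTERS_RECLAIM_SUBGROUP_SIZE_LOG2 bits per field, so that a single subgroup exclusive scan ranks every row at once. A minimal sketch of the trick, assuming an 8-lane subgroup, two register rows per lane, and illustrative buffer names; the fields cannot carry into each other because an exclusive sum over at most 7 lanes of 0/1 counts fits in 3 bits:

__kernel __attribute__((intel_reqd_sub_group_size(8)))
void
sketch_packed_scan(__global uint const * const is_block, // 0 or 1 per element, 16 per subgroup
                   __global uint       * const rank_out) // rank of each block within its row
{
  uint const lane = get_sub_group_local_id();
  uint const base = (get_global_id(0) - lane) * 2;  // 16 elements per 8-lane subgroup
  // pack one 3-bit count per register row into a single word
  uint packed_count = 0;
  packed_count |= (is_block[base + lane    ] & 1);      // row 0 -> bits 0..2
  packed_count |= (is_block[base + lane + 8] & 1) << 3; // row 1 -> bits 3..5
  // one exclusive scan ranks both rows across the subgroup at once
  uint const packed_index = sub_group_scan_exclusive_add(packed_count);
  // unpack the per-row ranks (only meaningful where is_block is 1)
  rank_out[base + lane    ] = (packed_index     ) & 0x7;
  rank_out[base + lane + 8] = (packed_index >> 3) & 0x7;
}

The reclaim kernels then turn each rank into a ring slot and advance the ring base by each row's total, broadcast from the last lane, before storing the next row.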

File diff suppressed because it is too large Load Diff

View File

@ -1,130 +1,130 @@
/*
* Copyright 2018 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can
* be found in the LICENSE file.
*
*/
//
// NOTE THAT THE SEGMENT TTCK KERNEL IS ENTIRELY DEPENDENT ON THE
// LAYOUT OF THE TTCK KEY. IF THE TTCK KEY IS ALTERED THEN THIS
// KERNEL WILL NEED TO BE UPDATED
//
#include "tile.h"
#include "atomic_cl.h"
#include "device_cl_12.h"
//
//
//
#define HS_KEYS_PER_SLAB (HS_KEYS_PER_LANE * HS_LANES_PER_WARP)
#define HS_LANE_MASK (HS_LANES_PER_WARP - 1)
//
//
//
#define SKC_YX_NEQ(row,prev) \
(((as_uint2(r##row).hi ^ as_uint2(r##prev).hi) & SKC_TTCK_HI_MASK_YX) != 0)
//
//
//
__kernel
__attribute__((intel_reqd_sub_group_size(HS_LANES_PER_WARP)))
void
skc_kernel_segment_ttck(__global HS_KEY_TYPE * SKC_RESTRICT const vout,
__global uint * SKC_RESTRICT const indices,
__global SKC_ATOMIC_UINT volatile * SKC_RESTRICT const atomics)
{
uint const global_id = get_global_id(0);
uint const gmem_base = (global_id >> HS_LANES_PER_WARP_LOG2) * HS_KEYS_PER_SLAB;
uint const gmem_idx = gmem_base + (global_id & HS_LANE_MASK);
uint const lane_idx = gmem_base + (global_id & HS_LANE_MASK) * HS_KEYS_PER_LANE;
//
// LOAD ALL THE ROWS
//
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
HS_KEY_TYPE const r##row = (vout + gmem_idx)[prev * HS_LANES_PER_WARP];
HS_SLAB_ROWS();
//
// LOAD LAST REGISTER FROM COLUMN TO LEFT
//
uint diffs = 0;
uint2 r0 = r1;
if (gmem_base > 0) {
// if this is any slab but the first then broadcast
// load the last key in the previous slab
r0.hi = as_uint2(vout[gmem_base - 1]).hi;
} else if (get_sub_group_local_id() == 0) {
// if this is the first lane in the first slab
diffs = 1;
}
// now shuffle in the last key from the column to the left
r0.hi = intel_sub_group_shuffle_up(r0.hi,as_uint2(HS_REG_LAST(r)).hi,1);
//
// FIND ALL DIFFERENCES IN SLAB
//
uint valid = 0;
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
valid |= ((r##row != SKC_ULONG_MAX) << prev);
HS_SLAB_ROWS();
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
diffs |= (SKC_YX_NEQ(row,prev) << prev);
HS_SLAB_ROWS();
//
// SUM UP THE DIFFERENCES
//
uint const valid_diffs = valid & diffs;
uint const count = popcount(valid_diffs);
uint const inclusive = sub_group_scan_inclusive_add(count);
uint const exclusive = inclusive - count;
//
// RESERVE SPACE IN THE INDICES ARRAY
//
uint next = 0;
if (get_sub_group_local_id() == HS_LANES_PER_WARP-1)
next = atomic_add(atomics+1,inclusive); // FIXME -- need a symbolic offset
// distribute base across subgroup
next = exclusive + sub_group_broadcast(next,HS_LANES_PER_WARP-1);
//
// STORE THE INDICES
//
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
if (valid_diffs & (1 << prev)) \
indices[next++] = lane_idx + prev;
HS_SLAB_ROWS();
//
// TRANSPOSE THE SLAB AND STORE IT
//
HS_TRANSPOSE_SLAB();
}
//
//
//
/*
* Copyright 2018 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can
* be found in the LICENSE file.
*
*/
//
// NOTE THAT THE SEGMENT TTCK KERNEL IS ENTIRELY DEPENDENT ON THE
// LAYOUT OF THE TTCK KEY. IF THE TTCK KEY IS ALTERED THEN THIS
// KERNEL WILL NEED TO BE UPDATED
//
#include "tile.h"
#include "atomic_cl.h"
#include "kernel_cl_12.h"
//
//
//
#define HS_KEYS_PER_SLAB (HS_KEYS_PER_LANE * HS_LANES_PER_WARP)
#define HS_LANE_MASK (HS_LANES_PER_WARP - 1)
//
//
//
#define SKC_YX_NEQ(row,prev) \
(((as_uint2(r##row).hi ^ as_uint2(r##prev).hi) & SKC_TTCK_HI_MASK_YX) != 0)
//
//
//
__kernel
__attribute__((intel_reqd_sub_group_size(HS_LANES_PER_WARP)))
void
skc_kernel_segment_ttck(__global HS_KEY_TYPE * SKC_RESTRICT const vout,
__global uint * SKC_RESTRICT const indices,
__global SKC_ATOMIC_UINT volatile * SKC_RESTRICT const atomics)
{
uint const global_id = get_global_id(0);
uint const gmem_base = (global_id >> HS_LANES_PER_WARP_LOG2) * HS_KEYS_PER_SLAB;
uint const gmem_idx = gmem_base + (global_id & HS_LANE_MASK);
uint const lane_idx = gmem_base + (global_id & HS_LANE_MASK) * HS_KEYS_PER_LANE;
//
// LOAD ALL THE ROWS
//
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
HS_KEY_TYPE const r##row = (vout + gmem_idx)[prev * HS_LANES_PER_WARP];
HS_SLAB_ROWS();
//
// LOAD LAST REGISTER FROM COLUMN TO LEFT
//
uint diffs = 0;
uint2 r0 = r1;
if (gmem_base > 0) {
// if this is any slab but the first then broadcast
// load the last key in the previous slab
r0.hi = as_uint2(vout[gmem_base - 1]).hi;
} else if (get_sub_group_local_id() == 0) {
// if this is the first lane in the first slab
diffs = 1;
}
// now shuffle in the last key from the column to the left
r0.hi = intel_sub_group_shuffle_up(r0.hi,as_uint2(HS_REG_LAST(r)).hi,1);
//
// FIND ALL DIFFERENCES IN SLAB
//
uint valid = 0;
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
valid |= ((r##row != SKC_ULONG_MAX) << prev);
HS_SLAB_ROWS();
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
diffs |= (SKC_YX_NEQ(row,prev) << prev);
HS_SLAB_ROWS();
//
// SUM UP THE DIFFERENCES
//
uint const valid_diffs = valid & diffs;
uint const count = popcount(valid_diffs);
uint const inclusive = sub_group_scan_inclusive_add(count);
uint const exclusive = inclusive - count;
//
// RESERVE SPACE IN THE INDICES ARRAY
//
uint next = 0;
if (get_sub_group_local_id() == HS_LANES_PER_WARP-1)
next = atomic_add(atomics+1,inclusive); // FIXME -- need a symbolic offset
// distribute base across subgroup
next = exclusive + sub_group_broadcast(next,HS_LANES_PER_WARP-1);
//
// STORE THE INDICES
//
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
if (valid_diffs & (1 << prev)) \
indices[next++] = lane_idx + prev;
HS_SLAB_ROWS();
//
// TRANSPOSE THE SLAB AND STORE IT
//
HS_TRANSPOSE_SLAB();
}
//
//
//
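skc_kernel_segment_ttck finds y/x boundaries by comparing each key against the key to its left (shuffled in from the neighboring lane, or loaded from the previous slab), ranks the boundaries with a scan, and reserves space in the indices array with one atomic per subgroup. A stripped-down sketch of that flow for a plain sorted uint array, assuming the cl_intel_subgroups shuffle and illustrative names (sketch_segment_starts, keys, starts, counter):

__kernel __attribute__((intel_reqd_sub_group_size(8)))
void
sketch_segment_starts(__global uint const    * const keys,    // sorted keys
                      __global uint          * const starts,  // index of each run's first key
                      __global uint volatile * const counter) // output cursor
{
  uint const gid  = get_global_id(0);
  uint const lane = get_sub_group_local_id();
  uint const key  = keys[gid];
  // every lane broadcast-loads the key just left of this subgroup's span,
  // then a shuffle hands each lane its immediate left neighbor's key
  uint left = (gid >= lane + 1) ? keys[gid - lane - 1] : key;
  left      = intel_sub_group_shuffle_up(left,key,1);
  uint const diff = (gid == 0) || (key != left);
  // rank the boundaries and reserve a span with a single atomic
  uint const incl = sub_group_scan_inclusive_add(diff);
  uint next = 0;
  if (lane == get_sub_group_size() - 1)
    next = atomic_add(counter,incl);
  next = sub_group_broadcast(next,get_sub_group_size() - 1) + incl - diff;
  if (diff)
    starts[next] = gid;
}

The real kernel works on 16 keys per lane, masks out invalid (SKC_ULONG_MAX) keys, and compares only the y/x bits of the TTCK key, but the reserve-and-scatter structure is the same.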

View File

@ -1,394 +1,394 @@
/*
* Copyright 2018 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can
* be found in the LICENSE file.
*
*/
//
// NOTE THAT THE SEGMENT TTRK KERNEL IS ENTIRELY DEPENDENT ON THE
// LAYOUT OF THE TTRK KEY. IF THE TTRK KEY IS ALTERED THEN THIS
// KERNEL WILL NEED TO BE UPDATED
//
#include "tile.h"
#include "raster_builder_cl_12.h" // need meta_in structure
#include "device_cl_12.h"
//
//
//
#define HS_KEYS_PER_SLAB (HS_KEYS_PER_LANE * HS_LANES_PER_WARP)
#define HS_LANE_MASK (HS_LANES_PER_WARP - 1)
//
// THE BEST TYPE TO ZERO SMEM
//
#define SKC_ZERO_TYPE ulong
#define SKC_ZERO_WORDS 2
//
// THE ORDER OF COMPONENTS IS:
//
// 0: blocks
// 1: offset
// 2: pk
// 3: rk
//
#if (HS_KEYS_PER_SLAB < 256)
#define SKC_META_TYPE uint
#define SKC_META_WORDS 1
#define SKC_COMPONENT_TYPE uchar
#else
#define SKC_META_TYPE uint2
#define SKC_META_WORDS 2
#define SKC_COMPONENT_TYPE ushort
#endif
//
//
//
#if ( SKC_TTRK_HI_BITS_COHORT <= 8)
#define SKC_COHORT_TYPE uchar
#else
#define SKC_COHORT_TYPE ushort
#endif
//
//
//
#define SKC_COHORT_ID(row) \
as_uint2(r##row).hi >> SKC_TTRK_HI_OFFSET_COHORT
//
// FIXME -- THIS WILL BREAK IF EITHER THE YX BITS OR OFFSET ARE CHANGED
//
#define SKC_IS_BLOCK(row) \
((as_uint2(r##row).lo & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0)
#define SKC_YX(row,prev) \
(as_uint2(r##row).hi ^ as_uint2(r##prev).hi)
#define SKC_IS_PK(row,prev) \
((uint)(SKC_YX(row,prev) - 1) < SKC_TTRK_HI_MASK_X)
//
// COHORT SIZE IS ALWAYS A POWER-OF-TWO
// SUBGROUP SIZE IS ALWAYS A POWER-OF-TWO
//
// COHORT SIZE >= SUBGROUP SIZE
//
#define SKC_COHORT_SIZE (1<<SKC_TTRK_HI_BITS_COHORT)
#define SKC_ZERO_RATIO (SKC_ZERO_WORDS / SKC_META_WORDS)
#define SKC_META_ZERO_COUNT (SKC_COHORT_SIZE * sizeof(SKC_META_TYPE) / sizeof(SKC_ZERO_TYPE))
#define SKC_META_ZERO_REM (SKC_META_ZERO_COUNT & SKC_BITS_TO_MASK(HS_LANES_PER_WARP_LOG2))
#define SKC_META_COMPONENTS 4
#define SKC_META_COMPONENT_COUNT (SKC_COHORT_SIZE * sizeof(SKC_META_TYPE) / sizeof(SKC_COMPONENT_TYPE))
//
//
//
__kernel
__attribute__((intel_reqd_sub_group_size(HS_LANES_PER_WARP)))
void
skc_kernel_segment_ttrk(__global HS_KEY_TYPE * SKC_RESTRICT const vout,
__global uint * SKC_RESTRICT const metas)
{
__local union
{
SKC_META_TYPE volatile m[SKC_COHORT_SIZE];
SKC_ZERO_TYPE z[SKC_META_ZERO_COUNT];
SKC_COMPONENT_TYPE c[SKC_META_COMPONENT_COUNT];
} shared;
uint const global_id = get_global_id(0);
uint const gmem_base = (global_id >> HS_LANES_PER_WARP_LOG2) * HS_KEYS_PER_SLAB;
uint const gmem_idx = gmem_base + (global_id & HS_LANE_MASK);
uint const gmem_off = (global_id & HS_LANE_MASK) * HS_KEYS_PER_LANE;
//
// LOAD ALL THE ROWS
//
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
HS_KEY_TYPE const r##row = (vout + gmem_idx)[prev * HS_LANES_PER_WARP];
HS_SLAB_ROWS();
//
// LOAD LAST REGISTER FROM COLUMN TO LEFT
//
uint diffs = 0;
uint2 r0 = 0;
if (gmem_base > 0) {
// if this is any slab but the first then broadcast
// load the last key in the previous slab
r0.hi = as_uint2(vout[gmem_base - 1]).hi;
} else {
// otherwise broadcast the first key in the first slab
r0.hi = sub_group_broadcast(as_uint2(r1).hi,0);
// and mark it as an implicit diff
if (get_sub_group_local_id() == 0)
diffs = 1;
}
// now shuffle in the last key from the column to the left
r0.hi = intel_sub_group_shuffle_up(r0.hi,as_uint2(HS_REG_LAST(r)).hi,1);
// shift away y/x
SKC_COHORT_TYPE const c0 = r0.hi >> SKC_TTRK_HI_OFFSET_COHORT;
//
// EXTRACT ALL COHORT IDS EARLY...
//
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
SKC_COHORT_TYPE c##row = SKC_COHORT_ID(row);
HS_SLAB_ROWS();
//
// DEBUG
//
#if 0
if (gmem_base == HS_KEYS_PER_SLAB * 7)
{
if (get_sub_group_local_id() == 0)
printf("\n%llX ",as_ulong(r0));
else
printf("%llX ",as_ulong(r0));
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
if (get_sub_group_local_id() == 0) \
printf("\n%llX ",r##row); \
else \
printf("%llX ",r##row);
HS_SLAB_ROWS();
}
#endif
//
// CAPTURE ALL CONDITIONS WE CARE ABOUT
//
// Diffs must be captured before cohorts
//
uint valid = 0;
uint blocks = 0;
uint pks = 0;
SKC_COHORT_TYPE c_max = 0;
//
// FIXME -- IT'S UNCLEAR IF SHIFTING THE CONDITION CODE VS. AN
// EXPLICIT PREDICATE WILL GENERATE THE SAME CODE
//
#if 0
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
diffs |= ((c##row != c##prev) << prev);
HS_SLAB_ROWS();
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
blocks |= (SKC_IS_BLOCK(row) << prev);
HS_SLAB_ROWS();
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
pks |= (SKC_IS_PK(row,prev) << prev);
HS_SLAB_ROWS();
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
valid |= ((r##row != SKC_ULONG_MAX) << prev);
HS_SLAB_ROWS();
#else
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
if (c##row != c##prev) \
diffs |= 1<<prev;
HS_SLAB_ROWS();
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
if (SKC_IS_BLOCK(row)) \
blocks |= 1<<prev;
HS_SLAB_ROWS();
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
if (SKC_IS_PK(row,prev)) \
pks |= 1<<prev;
HS_SLAB_ROWS();
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
if (r##row != SKC_ULONG_MAX) { \
valid |= 1<<prev; \
c_max = max(c_max,c##row); \
}
HS_SLAB_ROWS();
#endif
//
// TRANSPOSE THE SLAB AND STORE IT
//
HS_TRANSPOSE_SLAB();
// the min cohort is the first key in the slab
uint const c_min = sub_group_broadcast(c1,0);
// the max cohort is the max across all lanes
c_max = sub_group_reduce_max(c_max);
#if 0 // REMOVE ME LATER
if (get_sub_group_local_id() == 0)
printf("%3u : ( %3u , %3u )\n",
get_global_id(0)>>HS_LANES_PER_WARP_LOG2,c_min,c_max);
#endif
//
// ZERO SMEM
//
// zero only the meta info for the cohort ids found in this slab
//
#if (SKC_ZERO_WORDS >= SKC_META_WORDS)
uint zz = ((c_min / SKC_ZERO_RATIO) & ~HS_LANE_MASK) + get_sub_group_local_id();
uint const zz_max = (c_max + SKC_ZERO_RATIO - 1) / SKC_ZERO_RATIO;
for (; zz<=zz_max; zz+=HS_LANES_PER_WARP)
shared.z[zz] = 0;
#else
// ERROR -- it's highly unlikely that the zero type is smaller than
// the meta type
#error("Unsupported right now...")
#endif
//
// ACCUMULATE AND STORE META INFO
//
uint const valid_blocks = valid & blocks;
uint const valid_pks = valid & pks & ~diffs;
SKC_META_TYPE meta = ( 0 );
#define SKC_META_LOCAL_ADD(meta) \
atomic_add(shared.m+HS_REG_LAST(c),meta);
#define SKC_META_LOCAL_STORE(meta,prev) \
shared.m[c##prev] = meta;
// note this is purposefully off by +1
#define SKC_META_RESET(meta,curr) \
meta = ((gmem_off + curr) << 8);
#if 0
// FIXME -- this can be tweaked to shift directly
#define SKC_META_ADD(meta,prev,blocks,pks,rks) \
meta += ((((blocks >> prev) & 1) ) | \
(((pks >> prev) & 1) << 16) | \
(((rks >> prev) & 1) << 24));
#else
#define SKC_META_ADD(meta,prev,blocks,pks,rks) \
if (blocks & (1<<prev)) \
meta += 1; \
if (pks & (1<<prev)) \
meta += 1<<16; \
if (rks & (1<<prev)) \
meta += 1<<24;
#endif
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
if (diffs & (1<<prev)) { \
SKC_META_LOCAL_STORE(meta,prev); \
SKC_META_RESET(meta,row); \
} \
SKC_META_ADD(meta,prev, \
valid_blocks, \
valid_pks, \
valid);
HS_SLAB_ROWS();
//
// ATOMICALLY ADD THE CARRIED OUT METAS
//
#if 0 // BUG
if ((valid & (1<<(HS_KEYS_PER_LANE-1))) && (meta != 0))
SKC_META_LOCAL_ADD(meta);
#else
if (meta != 0)
SKC_META_LOCAL_ADD(meta);
#endif
//
// NOW ATOMICALLY ADD ALL METAS TO THE GLOBAL META TABLE
//
// convert the slab offset to an extent offset
bool const is_offset = (get_sub_group_local_id() & 3) == 1;
uint const adjust = is_offset ? gmem_base - 1 : 0;
//
// only process the meta components found in this slab
//
uint const cc_min = c_min * SKC_META_COMPONENTS;
uint const cc_max = c_max * SKC_META_COMPONENTS + SKC_META_COMPONENTS - 1;
uint cc = (cc_min & ~HS_LANE_MASK) + get_sub_group_local_id();
if ((cc >= cc_min) && (cc <= cc_max))
{
uint const c = shared.c[cc];
if (c != 0)
atomic_add(metas+cc,c+adjust);
}
cc += HS_LANES_PER_WARP;
for (; cc<=cc_max; cc+=HS_LANES_PER_WARP)
{
uint const c = shared.c[cc];
if (c != 0)
atomic_add(metas+cc,c+adjust);
}
}
//
//
//
/*
* Copyright 2018 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can
* be found in the LICENSE file.
*
*/
//
// NOTE THAT THE SEGMENT TTRK KERNEL IS ENTIRELY DEPENDENT ON THE
// LAYOUT OF THE TTRK KEY. IF THE TTRK KEY IS ALTERED THEN THIS
// KERNEL WILL NEED TO BE UPDATED
//
#include "tile.h"
#include "raster_builder_cl_12.h" // need meta_in structure
#include "kernel_cl_12.h"
//
//
//
#define HS_KEYS_PER_SLAB (HS_KEYS_PER_LANE * HS_LANES_PER_WARP)
#define HS_LANE_MASK (HS_LANES_PER_WARP - 1)
//
// THE BEST TYPE TO ZERO SMEM
//
#define SKC_ZERO_TYPE ulong
#define SKC_ZERO_WORDS 2
//
// THE ORDER OF COMPONENTS IS:
//
// 0: blocks
// 1: offset
// 2: pk
// 3: rk
//
#if (HS_KEYS_PER_SLAB < 256)
#define SKC_META_TYPE uint
#define SKC_META_WORDS 1
#define SKC_COMPONENT_TYPE uchar
#else
#define SKC_META_TYPE uint2
#define SKC_META_WORDS 2
#define SKC_COMPONENT_TYPE ushort
#endif
//
//
//
#if ( SKC_TTRK_HI_BITS_COHORT <= 8)
#define SKC_COHORT_TYPE uchar
#else
#define SKC_COHORT_TYPE ushort
#endif
//
//
//
#define SKC_COHORT_ID(row) \
as_uint2(r##row).hi >> SKC_TTRK_HI_OFFSET_COHORT
//
// FIXME -- THIS WILL BREAK IF EITHER THE YX BITS OR OFFSET ARE CHANGED
//
#define SKC_IS_BLOCK(row) \
((as_uint2(r##row).lo & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0)
#define SKC_YX(row,prev) \
(as_uint2(r##row).hi ^ as_uint2(r##prev).hi)
#define SKC_IS_PK(row,prev) \
((uint)(SKC_YX(row,prev) - 1) < SKC_TTRK_HI_MASK_X)
//
// COHORT SIZE IS ALWAYS A POWER-OF-TWO
// SUBGROUP SIZE IS ALWAYS A POWER-OF-TWO
//
// COHORT SIZE >= SUBGROUP SIZE
//
#define SKC_COHORT_SIZE (1<<SKC_TTRK_HI_BITS_COHORT)
#define SKC_ZERO_RATIO (SKC_ZERO_WORDS / SKC_META_WORDS)
#define SKC_META_ZERO_COUNT (SKC_COHORT_SIZE * sizeof(SKC_META_TYPE) / sizeof(SKC_ZERO_TYPE))
#define SKC_META_ZERO_REM (SKC_META_ZERO_COUNT & SKC_BITS_TO_MASK(HS_LANES_PER_WARP_LOG2))
#define SKC_META_COMPONENTS 4
#define SKC_META_COMPONENT_COUNT (SKC_COHORT_SIZE * sizeof(SKC_META_TYPE) / sizeof(SKC_COMPONENT_TYPE))
//
//
//
__kernel
__attribute__((intel_reqd_sub_group_size(HS_LANES_PER_WARP)))
void
skc_kernel_segment_ttrk(__global HS_KEY_TYPE * SKC_RESTRICT const vout,
__global uint * SKC_RESTRICT const metas)
{
__local union
{
SKC_META_TYPE volatile m[SKC_COHORT_SIZE];
SKC_ZERO_TYPE z[SKC_META_ZERO_COUNT];
SKC_COMPONENT_TYPE c[SKC_META_COMPONENT_COUNT];
} shared;
uint const global_id = get_global_id(0);
uint const gmem_base = (global_id >> HS_LANES_PER_WARP_LOG2) * HS_KEYS_PER_SLAB;
uint const gmem_idx = gmem_base + (global_id & HS_LANE_MASK);
uint const gmem_off = (global_id & HS_LANE_MASK) * HS_KEYS_PER_LANE;
//
// LOAD ALL THE ROWS
//
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
HS_KEY_TYPE const r##row = (vout + gmem_idx)[prev * HS_LANES_PER_WARP];
HS_SLAB_ROWS();
//
// LOAD LAST REGISTER FROM COLUMN TO LEFT
//
uint diffs = 0;
uint2 r0 = 0;
if (gmem_base > 0) {
// if this is any slab but the first then broadcast
// load the last key in the previous slab
r0.hi = as_uint2(vout[gmem_base - 1]).hi;
} else {
// otherwise broadcast the first key in the first slab
r0.hi = sub_group_broadcast(as_uint2(r1).hi,0);
// and mark it as an implicit diff
if (get_sub_group_local_id() == 0)
diffs = 1;
}
// now shuffle in the last key from the column to the left
r0.hi = intel_sub_group_shuffle_up(r0.hi,as_uint2(HS_REG_LAST(r)).hi,1);
// shift away y/x
SKC_COHORT_TYPE const c0 = r0.hi >> SKC_TTRK_HI_OFFSET_COHORT;
//
// EXTRACT ALL COHORT IDS EARLY...
//
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
SKC_COHORT_TYPE c##row = SKC_COHORT_ID(row);
HS_SLAB_ROWS();
//
// DEBUG
//
#if 0
if (gmem_base == HS_KEYS_PER_SLAB * 7)
{
if (get_sub_group_local_id() == 0)
printf("\n%llX ",as_ulong(r0));
else
printf("%llX ",as_ulong(r0));
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
if (get_sub_group_local_id() == 0) \
printf("\n%llX ",r##row); \
else \
printf("%llX ",r##row);
HS_SLAB_ROWS();
}
#endif
//
// CAPTURE ALL CONDITIONS WE CARE ABOUT
//
// Diffs must be captured before cohorts
//
uint valid = 0;
uint blocks = 0;
uint pks = 0;
SKC_COHORT_TYPE c_max = 0;
//
// FIXME -- IT'S UNCLEAR IF SHIFTING THE CONDITION CODE VS. AN
// EXPLICIT PREDICATE WILL GENERATE THE SAME CODE
//
#if 0
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
diffs |= ((c##row != c##prev) << prev);
HS_SLAB_ROWS();
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
blocks |= (SKC_IS_BLOCK(row) << prev);
HS_SLAB_ROWS();
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
pks |= (SKC_IS_PK(row,prev) << prev);
HS_SLAB_ROWS();
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
valid |= ((r##row != SKC_ULONG_MAX) << prev);
HS_SLAB_ROWS();
#else
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
if (c##row != c##prev) \
diffs |= 1<<prev;
HS_SLAB_ROWS();
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
if (SKC_IS_BLOCK(row)) \
blocks |= 1<<prev;
HS_SLAB_ROWS();
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
if (SKC_IS_PK(row,prev)) \
pks |= 1<<prev;
HS_SLAB_ROWS();
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
if (r##row != SKC_ULONG_MAX) { \
valid |= 1<<prev; \
c_max = max(c_max,c##row); \
}
HS_SLAB_ROWS();
#endif
//
// TRANSPOSE THE SLAB AND STORE IT
//
HS_TRANSPOSE_SLAB();
// the min cohort is the first key in the slab
uint const c_min = sub_group_broadcast(c1,0);
// the max cohort is the max across all lanes
c_max = sub_group_reduce_max(c_max);
#if 0 // REMOVE ME LATER
if (get_sub_group_local_id() == 0)
printf("%3u : ( %3u , %3u )\n",
get_global_id(0)>>HS_LANES_PER_WARP_LOG2,c_min,c_max);
#endif
//
// ZERO SMEM
//
// zero only the meta info for the cohort ids found in this slab
//
#if (SKC_ZERO_WORDS >= SKC_META_WORDS)
uint zz = ((c_min / SKC_ZERO_RATIO) & ~HS_LANE_MASK) + get_sub_group_local_id();
uint const zz_max = (c_max + SKC_ZERO_RATIO - 1) / SKC_ZERO_RATIO;
for (; zz<=zz_max; zz+=HS_LANES_PER_WARP)
shared.z[zz] = 0;
#else
// ERROR -- it's highly unlikely that the zero type is smaller than
// the meta type
#error("Unsupported right now...")
#endif
//
// ACCUMULATE AND STORE META INFO
//
uint const valid_blocks = valid & blocks;
uint const valid_pks = valid & pks & ~diffs;
SKC_META_TYPE meta = ( 0 );
#define SKC_META_LOCAL_ADD(meta) \
atomic_add(shared.m+HS_REG_LAST(c),meta);
#define SKC_META_LOCAL_STORE(meta,prev) \
shared.m[c##prev] = meta;
// note this is purposefully off by +1
#define SKC_META_RESET(meta,curr) \
meta = ((gmem_off + curr) << 8);
#if 0
// FIXME -- this can be tweaked to shift directly
#define SKC_META_ADD(meta,prev,blocks,pks,rks) \
meta += ((((blocks >> prev) & 1) ) | \
(((pks >> prev) & 1) << 16) | \
(((rks >> prev) & 1) << 24));
#else
#define SKC_META_ADD(meta,prev,blocks,pks,rks) \
if (blocks & (1<<prev)) \
meta += 1; \
if (pks & (1<<prev)) \
meta += 1<<16; \
if (rks & (1<<prev)) \
meta += 1<<24;
#endif
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
if (diffs & (1<<prev)) { \
SKC_META_LOCAL_STORE(meta,prev); \
SKC_META_RESET(meta,row); \
} \
SKC_META_ADD(meta,prev, \
valid_blocks, \
valid_pks, \
valid);
HS_SLAB_ROWS();
//
// ATOMICALLY ADD THE CARRIED OUT METAS
//
#if 0 // BUG
if ((valid & (1<<(HS_KEYS_PER_LANE-1))) && (meta != 0))
SKC_META_LOCAL_ADD(meta);
#else
if (meta != 0)
SKC_META_LOCAL_ADD(meta);
#endif
//
// NOW ATOMICALLY ADD ALL METAS TO THE GLOBAL META TABLE
//
// convert the slab offset to an extent offset
bool const is_offset = (get_sub_group_local_id() & 3) == 1;
uint const adjust = is_offset ? gmem_base - 1 : 0;
//
// only process the meta components found in this slab
//
uint const cc_min = c_min * SKC_META_COMPONENTS;
uint const cc_max = c_max * SKC_META_COMPONENTS + SKC_META_COMPONENTS - 1;
uint cc = (cc_min & ~HS_LANE_MASK) + get_sub_group_local_id();
if ((cc >= cc_min) && (cc <= cc_max))
{
uint const c = shared.c[cc];
if (c != 0)
atomic_add(metas+cc,c+adjust);
}
cc += HS_LANES_PER_WARP;
for (; cc<=cc_max; cc+=HS_LANES_PER_WARP)
{
uint const c = shared.c[cc];
if (c != 0)
atomic_add(metas+cc,c+adjust);
}
}
//
//
//
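The SKC_META_ADD and SKC_META_LOCAL_* macros above keep several small counters (blocks, pk, rk) in separate fields of one word so that a cohort's whole meta record can be updated with a single atomic when the cohort id changes. A serialized, per-work-item sketch of the same bookkeeping, assuming each work-item owns a cohort-sorted run of keys and that metas has one slot per possible cohort id; SKETCH_KEYS_PER_ITEM, the 40-bit cohort position and the low-bits block test are illustrative, not the real TTRK layout, and only the block and rk counters are shown:

#define SKETCH_KEYS_PER_ITEM 16

__kernel void
sketch_meta_accumulate(__global ulong const   * const keys,  // sorted so equal cohorts are adjacent
                       __global uint volatile * const metas) // one packed word per cohort
{
  uint const base = get_global_id(0) * SKETCH_KEYS_PER_ITEM;
  uint       meta = 0;                            // packed { blocks : bits 0..7, rk : bits 24..31 }
  uint       prev = (uint)(keys[base] >> 40);     // illustrative cohort field
  for (uint ii=0; ii<SKETCH_KEYS_PER_ITEM; ii++)
    {
      ulong const key = keys[base + ii];
      uint  const c   = (uint)(key >> 40);
      if (c != prev)                              // cohort changed -- flush and reset
        {
          atomic_add(metas + prev,meta);
          meta = 0;
          prev = c;
        }
      if ((key & 0x7) == 0)                       // illustrative "first subblock of a block" test
        meta += 1;
      meta += 1 << 24;                            // every key in the run counts as an rk
    }
  atomic_add(metas + prev,meta);                  // flush the carried-out meta, as in the kernel above
}

The real kernel does this per lane across its slab registers, flushes into __local memory on cohort diffs, and then adds the per-component totals into the global meta table.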

View File

@ -1,362 +0,0 @@
/*
* Copyright 2017 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can
* be found in the LICENSE file.
*
*/
//
//
//
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <assert.h>
//
//
//
#include "runtime_cl.h"
#include "common/cl/assert_cl.h"
//
//
//
static bool is_verbose = true;
//
// FIXME -- all variable length device queries need to start querying
// the parameter's return size before getting its value
//
// FIXME -- this is now handled by the common/cl/find.* routine
//
union skc_cl_device_version {
struct {
cl_uchar opencl_space[7]; // "OpenCL_"
cl_uchar major;
cl_uchar dot;
cl_uchar minor;
#if 1 // Intel NEO requires at least 16 bytes
cl_uchar space;
cl_uchar vendor[32];
#endif
};
struct {
cl_uchar aN[];
};
};
typedef cl_bitfield cl_diagnostic_verbose_level_intel;
#define CL_CONTEXT_SHOW_DIAGNOSTICS_INTEL 0x4106
#define CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL 0x2
#define CL_CONTEXT_DIAGNOSTICS_LEVEL_GOOD_INTEL 0x1
#define CL_CONTEXT_DIAGNOSTICS_LEVEL_NEUTRAL_INTEL 0x4
static
void
CL_CALLBACK
skc_context_callback(char const * error, void const * info, size_t size, void * user)
{
if (info != NULL )
{
fprintf(stderr,"%s\n",error);
}
}
//
//
//
skc_err
skc_runtime_cl_create(struct skc_runtime_cl * const runtime_cl,
char const * const target_platform_substring,
char const * const target_device_substring,
cl_context_properties context_properties[])
{
skc_err err = SKC_ERR_SUCCESS;
//
// search available devices for a match
//
#define PLATFORM_IDS_MAX 16
#define DEVICE_IDS_MAX 16
#define PLATFORM_NAME_SIZE_MAX 64
#define DEVICE_NAME_SIZE_MAX 64
#define DRIVER_VERSION_SIZE_MAX 64
cl_int cl_err;
cl_platform_id platform_ids[PLATFORM_IDS_MAX];
cl_device_id device_ids [PLATFORM_IDS_MAX][DEVICE_IDS_MAX];
cl_uint platform_count;
cl_uint device_count[PLATFORM_IDS_MAX];
cl_uint platform_idx = UINT32_MAX, device_idx = UINT32_MAX;
bool match = false; // find _first_ match
//
// get number of platforms
//
cl(GetPlatformIDs(PLATFORM_IDS_MAX,platform_ids,&platform_count));
//
// search platforms
//
for (cl_uint ii=0; ii<platform_count; ii++)
{
char platform_name[PLATFORM_NAME_SIZE_MAX];
cl(GetPlatformInfo(platform_ids[ii],
CL_PLATFORM_NAME,
sizeof(platform_name),
platform_name,
NULL));
if (!match && (strstr(platform_name,target_platform_substring) != NULL))
{
platform_idx = ii;
}
if (is_verbose) {
fprintf(stdout,"%2u: %s\n",ii,platform_name);
}
cl_err = clGetDeviceIDs(platform_ids[ii],
CL_DEVICE_TYPE_ALL,
DEVICE_IDS_MAX,
device_ids[ii],
device_count+ii);
if (cl_err != CL_DEVICE_NOT_FOUND)
cl_ok(cl_err);
for (cl_uint jj=0; jj<device_count[ii]; jj++)
{
char device_name[DEVICE_NAME_SIZE_MAX];
union skc_cl_device_version device_version;
cl_uint device_align_bits;
char driver_version[DRIVER_VERSION_SIZE_MAX];
cl(GetDeviceInfo(device_ids[ii][jj],
CL_DEVICE_NAME,
sizeof(device_name),
device_name,
NULL));
// FIXME -- some of these variable length parameters should
// use the "size the param before reading" idiom
cl(GetDeviceInfo(device_ids[ii][jj],
CL_DEVICE_VERSION,
sizeof(device_version),
device_version.aN,
NULL));
cl(GetDeviceInfo(device_ids[ii][jj],
CL_DEVICE_MEM_BASE_ADDR_ALIGN,
sizeof(device_align_bits),
&device_align_bits,
NULL));
cl_uint const base_align = device_align_bits / 8; // bytes
cl(GetDeviceInfo(device_ids[ii][jj],
CL_DRIVER_VERSION,
sizeof(driver_version),
driver_version,
NULL));
if (!match && (platform_idx == ii) && (strstr(device_name,target_device_substring) != NULL))
{
match = true;
device_idx = jj;
runtime_cl->version.major = device_version.major - 48;
runtime_cl->version.minor = device_version.minor - 48;
runtime_cl->base_align = base_align;
if (is_verbose) {
fprintf(stdout," >>>");
}
}
else if (is_verbose)
{
fprintf(stdout," ");
}
if (is_verbose) {
fprintf(stdout,
" %1u: %s [ %s ] [ %s ] [ %u ]\n",
jj,
device_name,
device_version.aN,
driver_version,
base_align);
}
}
}
if (is_verbose) {
fprintf(stdout,"\n");
}
//
// get target platform and device
//
if (platform_idx >= platform_count)
{
fprintf(stderr,"no match for target platform substring %s\n",target_platform_substring);
exit(EXIT_FAILURE);
}
if (device_idx >= device_count[platform_idx])
{
fprintf(stderr,"no match for target device substring %s\n",target_device_substring);
exit(EXIT_FAILURE);
}
runtime_cl->platform_id = platform_ids[platform_idx];
runtime_cl->device_id = device_ids [platform_idx][device_idx];
//
// create context
//
#if 0
cl_context_properties context_properties[] =
{
CL_CONTEXT_PLATFORM,(cl_context_properties)runtime_cl->platform_id,
0
};
#else
context_properties[1] = (cl_context_properties)runtime_cl->platform_id;
#endif
runtime_cl->context = clCreateContext(context_properties,
1,
&runtime_cl->device_id,
skc_context_callback,
NULL,
&cl_err);
cl_ok(cl_err);
//
// get device name, driver version, unified memory flag, SVM capabilities and printf buffer size
//
if (is_verbose)
{
char device_name[DEVICE_NAME_SIZE_MAX];
char driver_version[DRIVER_VERSION_SIZE_MAX];
cl_bool device_is_unified;
cl_device_svm_capabilities svm_caps;
size_t printf_buffer_size;
cl(GetDeviceInfo(runtime_cl->device_id,
CL_DEVICE_NAME,
sizeof(device_name),
device_name,
NULL));
cl(GetDeviceInfo(runtime_cl->device_id,
CL_DRIVER_VERSION,
sizeof(driver_version),
driver_version,
NULL));
cl(GetDeviceInfo(runtime_cl->device_id,
CL_DEVICE_HOST_UNIFIED_MEMORY,
sizeof(device_is_unified),
&device_is_unified,
NULL));
cl(GetDeviceInfo(runtime_cl->device_id,
CL_DEVICE_SVM_CAPABILITIES,
sizeof(svm_caps),
&svm_caps,
0));
cl(GetDeviceInfo(runtime_cl->device_id,
CL_DEVICE_PRINTF_BUFFER_SIZE,
sizeof(printf_buffer_size),
&printf_buffer_size,
NULL));
fprintf(stderr,
"CL_DEVICE_SVM_COARSE_GRAIN_BUFFER %c\n"
"CL_DEVICE_SVM_FINE_GRAIN_BUFFER %c\n"
"CL_DEVICE_SVM_FINE_GRAIN_SYSTEM %c\n"
"CL_DEVICE_SVM_ATOMICS %c\n"
"CL_DEVICE_PRINTF_BUFFER_SIZE %zu\n\n",
svm_caps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER ? '*' : '-',
svm_caps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER ? '*' : '-',
svm_caps & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM ? '*' : '-',
svm_caps & CL_DEVICE_SVM_ATOMICS ? '*' : '-',
printf_buffer_size);
}
return err;
}
//
//
//
skc_err
skc_runtime_cl_dispose(struct skc_runtime_cl * const runtime_cl)
{
// FIXME
printf("%s incomplete!\n",__func__);
return SKC_ERR_SUCCESS;
}
//
//
//
cl_command_queue
skc_runtime_cl_create_cq(struct skc_runtime_cl * const runtime_cl, skc_cq_type_e const type)
{
cl_command_queue cq;
if (runtime_cl->version.major < 2)
{
//
// <= OpenCL 1.2
//
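// clCreateCommandQueue() is deprecated as of OpenCL 2.0 but is the only
// queue constructor available on 1.x platforms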
cl_int cl_err;
cq = clCreateCommandQueue(runtime_cl->context,
runtime_cl->device_id,
(cl_command_queue_properties)type,
&cl_err); cl_ok(cl_err);
}
else
{
//
// >= OpenCL 2.0
//
cl_int cl_err;
cl_queue_properties const queue_properties[] = {
CL_QUEUE_PROPERTIES,(cl_queue_properties)type,0
};
cq = clCreateCommandQueueWithProperties(runtime_cl->context,
runtime_cl->device_id,
queue_properties,
&cl_err); cl_ok(cl_err);
}
return cq;
}
//
//
//
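//
// Illustrative sketch only (not part of the original source): how a
// caller might bring up the API above. The "Intel"/"Graphics" strings
// are placeholders -- any substring of the desired platform/device
// name will match.
//
#if 0
static void
example_bring_up(void)
{
  struct skc_runtime_cl rt;

  // slot [1] is patched with the matched platform id by create()
  cl_context_properties props[] = { CL_CONTEXT_PLATFORM, 0, 0 };

  skc_err err = skc_runtime_cl_create(&rt,"Intel","Graphics",props);

  if (err == SKC_ERR_SUCCESS)
    {
      cl_command_queue cq = skc_runtime_cl_create_cq(&rt,SKC_CQ_TYPE_IN_ORDER);

      // ... enqueue work on cq ...

      clReleaseCommandQueue(cq);
      skc_runtime_cl_dispose(&rt);
    }
}
#endif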

@ -1,79 +0,0 @@
/*
* Copyright 2017 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can
* be found in the LICENSE file.
*
*/
#pragma once
//
// squelch OpenCL 1.2 deprecation warning
//
#ifndef CL_USE_DEPRECATED_OPENCL_1_2_APIS
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
#endif
#include <CL/opencl.h>
//
//
//
#include "skc.h"
//
// Minimal OpenCL state needed by the runtime to get started
//
struct skc_runtime_cl
{
cl_platform_id platform_id;
cl_device_id device_id;
cl_context context;
struct {
cl_uint major;
cl_uint minor;
} version; // sometimes we need to know this at runtime
cl_uint base_align; // base address alignment for subbuffer origins
};
//
//
//
typedef enum skc_cq_type_e {
SKC_CQ_TYPE_IN_ORDER = 0,
SKC_CQ_TYPE_OUT_OF_ORDER = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,
SKC_CQ_TYPE_IN_ORDER_PROFILING = (SKC_CQ_TYPE_IN_ORDER | CL_QUEUE_PROFILING_ENABLE),
SKC_CQ_TYPE_OUT_OF_ORDER_PROFILING = (SKC_CQ_TYPE_OUT_OF_ORDER | CL_QUEUE_PROFILING_ENABLE),
} skc_cq_type_e;
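//
// note: these enumerators are plain cl_command_queue_properties bit
// combinations and can be passed directly to queue creation
//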
//
// safely creates a generic OpenCL target in very few lines
//
skc_err
skc_runtime_cl_create(struct skc_runtime_cl * const runtime_cl,
char const * const target_platform_substring,
char const * const target_device_substring,
cl_context_properties context_properties[]);
skc_err
skc_runtime_cl_dispose(struct skc_runtime_cl * const runtime_cl);
//
// create a command queue with the non-deprecated function
//
cl_command_queue
skc_runtime_cl_create_cq(struct skc_runtime_cl * const runtime_cl, skc_cq_type_e const type);
//
//
//

@ -24,7 +24,6 @@
#include "grid.h"
#include "common/cl/assert_cl.h"
#include "config_cl.h"
#include "runtime_cl.h"
#include "runtime_cl_12.h"
#include "export_cl_12.h"
@ -32,7 +31,7 @@
//
//
static
void
skc_block_pool_create(struct skc_runtime * const runtime, cl_command_queue cq)
{
@ -42,7 +41,7 @@ skc_block_pool_create(struct skc_runtime * const runtime, cl_command_queue cq)
// create block extent
skc_extent_pdrw_alloc(runtime,
&runtime->block_pool.blocks,
runtime->block_pool.size->pool_size *
runtime->config->block.bytes);
// allocate block pool ids
@ -85,7 +84,7 @@ skc_block_pool_create(struct skc_runtime * const runtime, cl_command_queue cq)
cl(ReleaseKernel(k1));
}
static
void
skc_block_pool_dispose(struct skc_runtime * const runtime)
{
@ -106,7 +105,7 @@ skc_runtime_yield(struct skc_runtime * const runtime)
}
static
void
skc_runtime_wait(struct skc_runtime * const runtime)
{
skc_scheduler_wait(runtime->scheduler);
@ -118,18 +117,26 @@ skc_runtime_wait(struct skc_runtime * const runtime)
skc_err
skc_runtime_cl_12_create(struct skc_context * const context,
char const * const target_platform_substring,
char const * const target_device_substring,
cl_context_properties context_properties[])
cl_context context_cl,
cl_device_id device_id_cl)
{
// allocate the runtime
struct skc_runtime * const runtime = malloc(sizeof(*runtime));
// acquire OpenCL ids and context for target device
skc_err err = skc_runtime_cl_create(&runtime->cl,
target_platform_substring,
target_device_substring,
context_properties);
// save off CL objects
runtime->cl.context = context_cl;
runtime->cl.device_id = device_id_cl;
// query device alignment
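// (CL_DEVICE_MEM_BASE_ADDR_ALIGN is reported in bits -- it's stored
// here in bytes because sub-buffer origins must honor it)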
cl_uint align_bits;
cl(GetDeviceInfo(device_id_cl,
CL_DEVICE_MEM_BASE_ADDR_ALIGN,
sizeof(align_bits),
&align_bits,
NULL));
runtime->cl.align_bytes = align_bits / 8;
// create device
skc_device_create(runtime);
@ -149,7 +156,7 @@ skc_runtime_cl_12_create(struct skc_context * const context,
// initialize cq pool
skc_cq_pool_create(runtime,
&runtime->cq_pool,
runtime->config->cq_pool.type,
runtime->config->cq_pool.cq_props,
runtime->config->cq_pool.size);
// acquire in-order cq
@ -176,7 +183,7 @@ skc_runtime_cl_12_create(struct skc_context * const context,
context->yield = skc_runtime_yield;
context->wait = skc_runtime_wait;
context->path_builder = skc_path_builder_cl_12_create;
context->path_retain = skc_runtime_path_host_retain;
context->path_release = skc_runtime_path_host_release;
@ -189,7 +196,7 @@ skc_runtime_cl_12_create(struct skc_context * const context,
context->composition = skc_composition_cl_12_create;
context->styling = skc_styling_cl_12_create;
context->surface = skc_surface_cl_12_create;
// block on pool creation
@ -198,7 +205,7 @@ skc_runtime_cl_12_create(struct skc_context * const context,
// dispose of in-order cq
skc_runtime_release_cq_in_order(runtime,cq);
return err;
return SKC_ERR_SUCCESS;
};
//
@ -227,7 +234,7 @@ skc_runtime_cl_12_dispose(struct skc_context * const context)
skc_block_pool_dispose(context->runtime);
// skc_handle_pool_dispose(context->runtime);
return SKC_ERR_SUCCESS;
}
@ -253,12 +260,12 @@ skc_runtime_cl_12_debug(struct skc_context * const context)
return;
QueryPerformanceCounter(&EndingTime);
LARGE_INTEGER ElapsedMicroseconds, Frequency;
ElapsedMicroseconds.QuadPart = EndingTime.QuadPart - StartingTime.QuadPart;
QueryPerformanceFrequency(&Frequency);
double const msecs_total = 1000.0 * ElapsedMicroseconds.QuadPart / Frequency.QuadPart;
double const msecs_frame = msecs_total / SKC_FRAMES;
@ -268,7 +275,7 @@ skc_runtime_cl_12_debug(struct skc_context * const context)
#endif
struct skc_runtime * const runtime = context->runtime;
// acquire in-order cq
cl_command_queue cq = skc_runtime_acquire_cq_in_order(runtime);
@ -311,4 +318,3 @@ skc_runtime_cl_12_debug(struct skc_context * const context)
//
//
//

@ -12,8 +12,8 @@
//
//
#include "skc.h"
#include "runtime.h"
#include "runtime_cl.h"
#include "cq_pool_cl.h"
#include "handle_pool_cl_12.h"
#include "block_pool_cl_12.h"
@ -31,7 +31,11 @@ struct skc_runtime
//
// state visible to device
//
struct skc_runtime_cl cl;
struct {
cl_context context;
cl_device_id device_id;
cl_uint align_bytes;
} cl;
struct {
struct skc_allocator_host host;
@ -63,9 +67,8 @@ struct skc_runtime
skc_err
skc_runtime_cl_12_create(struct skc_context * const context,
char const * const target_platform_substring,
char const * const target_device_substring,
cl_context_properties context_properties[]);
cl_context context_cl,
cl_device_id device_id_cl);
skc_err
skc_runtime_cl_12_dispose(struct skc_context * const context);

@ -69,7 +69,7 @@ float const skc_transform_identity[8] =
0.0f, 0.0f // w0 w1 1 <-- always 1
};
float const * const skc_transform_identity_ptr = skc_transform_identity;
// float const * const skc_transform_identity_ptr = skc_transform_identity;
//
// DEFAULT RASTER CLIP
@ -82,7 +82,7 @@ float const skc_raster_clip_default[4] =
+FLT_MAX, +FLT_MAX // upper right corner of bounding box
};
float const * const skc_raster_clip_default_ptr = skc_raster_clip_default;
// float const * const skc_raster_clip_default_ptr = skc_raster_clip_default;
#endif

@ -9,125 +9,18 @@
#ifndef SKC_ONCE_SKC
#define SKC_ONCE_SKC
//
// FIXME -- get rid of these here
//
#include <stdint.h>
#include <stdbool.h>
//
//
//
#include "skc_styling.h" // FIXME -- skc_styling
// #include "skc_err.h"
//
// FIXME -- move errors to an skc prefixed include
//
typedef enum skc_err {
SKC_ERR_SUCCESS = 0,
SKC_ERR_API_BASE = 10000,
SKC_ERR_NOT_IMPLEMENTED = SKC_ERR_API_BASE,
SKC_ERR_POOL_EMPTY,
SKC_ERR_CONDVAR_WAIT,
SKC_ERR_LAYER_ID_INVALID,
SKC_ERR_LAYER_NOT_EMPTY,
SKC_ERR_TRANSFORM_WEAKREF_INVALID,
SKC_ERR_STROKE_STYLE_WEAKREF_INVALID,
SKC_ERR_COMMAND_NOT_READY,
SKC_ERR_COMMAND_NOT_COMPLETED,
SKC_ERR_COMMAND_NOT_STARTED,
SKC_ERR_COMMAND_NOT_READY_OR_COMPLETED,
SKC_ERR_COMPOSITION_SEALED,
SKC_ERR_STYLING_SEALED,
SKC_ERR_HANDLE_INVALID,
SKC_ERR_HANDLE_OVERFLOW,
SKC_ERR_COUNT
} skc_err;
//
// SPINEL TYPES
//
typedef struct skc_context * skc_context_t;
typedef struct skc_path_builder * skc_path_builder_t;
typedef struct skc_raster_builder * skc_raster_builder_t;
typedef struct skc_composition * skc_composition_t;
typedef struct skc_styling * skc_styling_t;
typedef struct skc_surface * skc_surface_t;
#if 0
typedef struct skc_interop * skc_interop_t;
typedef uint32_t skc_interop_surface_t;
#endif
typedef uint32_t skc_path_t;
typedef uint32_t skc_raster_t;
typedef uint32_t skc_layer_id;
typedef uint32_t skc_group_id;
typedef uint32_t skc_styling_cmd_t;
typedef uint64_t skc_weakref_t;
typedef skc_weakref_t skc_transform_weakref_t;
typedef skc_weakref_t skc_raster_clip_weakref_t;
//
// FIXME -- bury all of this
//
#define SKC_STYLING_CMDS(...) _countof(__VA_ARGS__),__VA_ARGS__
#define SKC_GROUP_IDS(...) _countof(__VA_ARGS__),__VA_ARGS__
//
//
//
#define SKC_PATH_INVALID UINT32_MAX
#define SKC_RASTER_INVALID UINT32_MAX
#define SKC_WEAKREF_INVALID UINT64_MAX
//
// TRANSFORM LAYOUT: { sx shx tx shy sy ty w0 w1 }
//
extern float const * const skc_transform_identity_ptr; // { 1, 0, 0, 0, 1, 0, 0, 0 }
//
// RASTER CLIP LAYOUT: { x0, y0, x1, y1 }
//
extern float const * const skc_raster_clip_default_ptr;
#include "skc_err.h"
#include "skc_types.h"
#include "skc_styling.h"
//
// CONTEXT
//
skc_err
skc_context_create(skc_context_t * context,
char const * target_platform_substring,
char const * target_device_substring,
intptr_t context_properties[]);
skc_err
skc_context_retain(skc_context_t context);
@ -137,31 +30,6 @@ skc_context_release(skc_context_t context);
skc_err
skc_context_reset(skc_context_t context);
//
// COORDINATED EXTERNAL OPERATIONS
//
/*
Examples include:
- Transforming an intermediate layer with a blur, sharpen, rotation or scaling kernel.
- Subpixel antialiasing using neighboring pixel color and coverage data.
- Performing a blit from one region to another region on a surface.
- Blitting from one surface to another.
- Loading and processing from one region and storing to another region.
- Rendezvousing with an external pipeline.
*/
//
//
//
bool
skc_context_yield(skc_context_t context);
void
skc_context_wait(skc_context_t context);
//
// PATH BUILDER
//
@ -485,6 +353,31 @@ skc_surface_render(skc_surface_t surface,
void * data,
void * fb); // FIXME FIXME
//
// COORDINATED EXTERNAL OPERATIONS
//
// Examples include:
//
// - Transforming an intermediate layer with a blur, sharpen, rotation or scaling kernel.
// - Subpixel antialiasing using neighboring pixel color and coverage data.
// - Performing a blit from one region to another region on a surface.
// - Blitting from one surface to another.
// - Loading and processing from one region and storing to another region.
// - Rendezvousing with an external pipeline.
//
// FORTHCOMING...
//
// SCHEDULER
//
bool
skc_context_yield(skc_context_t context);
void
skc_context_wait(skc_context_t context);
//
//
//

@ -0,0 +1,70 @@
/*
* Copyright 2017 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can
* be found in the LICENSE file.
*
*/
#ifndef SKC_ONCE_SKC_CREATE_CL
#define SKC_ONCE_SKC_CREATE_CL
//
//
//
#ifdef __APPLE__
#include "OpenCL/opencl.h"
#else
#include "CL/opencl.h"
#endif
//
//
//
#include "skc.h"
//
// CONTEXT CREATION
//
skc_err
skc_context_create_cl(skc_context_t * context,
cl_context context_cl,
cl_device_id device_id_cl);
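//
// Illustrative sketch only: how an application that owns the OpenCL
// objects might call the entry point above. Platform/device selection
// and any GL-interop context properties are the caller's
// responsibility; the names below are placeholders.
//
#if 0
static skc_err
example_context_create(skc_context_t * context,
                       cl_platform_id  platform_id,
                       cl_device_id    device_id)
{
  cl_context_properties props[] = {
    CL_CONTEXT_PLATFORM, (cl_context_properties)platform_id,
    0
  };

  cl_int     cl_err;
  cl_context context_cl = clCreateContext(props,1,&device_id,NULL,NULL,&cl_err);

  if (cl_err != CL_SUCCESS)
    return SKC_ERR_NOT_IMPLEMENTED; // placeholder error mapping

  return skc_context_create_cl(context,context_cl,device_id);
}
#endif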
//
// FIXME -- SPECIALIZE SURFACE RENDER
//
#if 0
//
// SURFACE RENDER
//
typedef void (*skc_surface_render_pfn_notify)(skc_surface_t surface,
skc_styling_t styling,
skc_composition_t composition,
void * data);
skc_err
skc_surface_render(skc_surface_t surface,
uint32_t const clip[4],
skc_styling_t styling,
skc_composition_t composition,
skc_surface_render_pfn_notify notify,
void * data,
void * fb); // FIXME FIXME
#endif
//
//
//
#endif
//
//
//

src/compute/skc/skc_err.h
@ -0,0 +1,58 @@
/*
* Copyright 2018 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can
* be found in the LICENSE file.
*
*/
#ifndef SKC_ONCE_SKC_ERR
#define SKC_ONCE_SKC_ERR
//
//
//
typedef enum skc_err {
SKC_ERR_SUCCESS = 0,
SKC_ERR_API_BASE = 10000,
SKC_ERR_NOT_IMPLEMENTED = SKC_ERR_API_BASE,
SKC_ERR_POOL_EMPTY,
SKC_ERR_CONDVAR_WAIT,
SKC_ERR_LAYER_ID_INVALID,
SKC_ERR_LAYER_NOT_EMPTY,
SKC_ERR_TRANSFORM_WEAKREF_INVALID,
SKC_ERR_STROKE_STYLE_WEAKREF_INVALID,
SKC_ERR_COMMAND_NOT_READY,
SKC_ERR_COMMAND_NOT_COMPLETED,
SKC_ERR_COMMAND_NOT_STARTED,
SKC_ERR_COMMAND_NOT_READY_OR_COMPLETED,
SKC_ERR_COMPOSITION_SEALED,
SKC_ERR_STYLING_SEALED,
SKC_ERR_HANDLE_INVALID,
SKC_ERR_HANDLE_OVERFLOW,
SKC_ERR_COUNT
} skc_err;
//
//
//
#endif
//
//
//

@ -79,6 +79,13 @@ typedef enum skc_styling_gradient_type_e {
} skc_styling_gradient_type_e;
//
// FIXME -- bury all of this once we stabilize styling
//
#define SKC_STYLING_CMDS(...) _countof(__VA_ARGS__),__VA_ARGS__
#define SKC_GROUP_IDS(...) _countof(__VA_ARGS__),__VA_ARGS__
//
//
//

@ -0,0 +1,73 @@
/*
* Copyright 2018 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can
* be found in the LICENSE file.
*
*/
#ifndef SKC_ONCE_SKC_TYPES
#define SKC_ONCE_SKC_TYPES
//
//
//
#include <stdint.h>
#include <stdbool.h>
//
//
//
typedef struct skc_context * skc_context_t;
typedef struct skc_path_builder * skc_path_builder_t;
typedef struct skc_raster_builder * skc_raster_builder_t;
typedef struct skc_composition * skc_composition_t;
typedef struct skc_styling * skc_styling_t;
typedef struct skc_surface * skc_surface_t;
typedef uint32_t skc_path_t;
typedef uint32_t skc_raster_t;
typedef uint32_t skc_layer_id;
typedef uint32_t skc_group_id;
typedef uint32_t skc_styling_cmd_t;
typedef uint64_t skc_weakref_t;
typedef skc_weakref_t skc_transform_weakref_t;
typedef skc_weakref_t skc_raster_clip_weakref_t;
#if 0
typedef struct skc_interop * skc_interop_t;
typedef uint32_t skc_interop_surface_t;
#endif
//
//
//
#define SKC_PATH_INVALID UINT32_MAX
#define SKC_RASTER_INVALID UINT32_MAX
#define SKC_WEAKREF_INVALID UINT64_MAX
//
// TRANSFORM LAYOUT: { sx shx tx shy sy ty w0 w1 }
//
//
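// (presumably a row-major projective matrix whose implicit last row is
// { w0 w1 1 }, i.e.
//
//   x' = (sx*x + shx*y + tx) / (w0*x + w1*y + 1)
//   y' = (shy*x + sy*y + ty) / (w0*x + w1*y + 1) )
//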
// RASTER CLIP LAYOUT: { x0, y0, x1, y1 }
//
//
//
//
#endif
//
//
//