OpenGL interop is simplified when the cl_context is not created by SKC.
Added GEN9 HotSort kernels so the hs_cl_gen9 lib and hs_bench_cl app can be built. Bug: skia: Change-Id: I5b21d33499a6ec3524f39a51443981802b722c8b Reviewed-on: https://skia-review.googlesource.com/136608 Commit-Queue: Allan MacKinnon <allanmac@google.com> Reviewed-by: Mike Reed <reed@google.com> Reviewed-by: Mike Klein <mtklein@google.com>
This commit is contained in:
parent
867ce8fc8e
commit
c110e7941e
@ -20,7 +20,7 @@
|
||||
|
||||
#include "find_cl.h"
|
||||
#include "assert_cl.h"
|
||||
#include "macros.h"
|
||||
#include "../macros.h"
|
||||
|
||||
//
|
||||
// search platforms and devices for a match
|
||||
|
@ -38,9 +38,9 @@
|
||||
//
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#define ALLOCA(n) _alloca(n)
|
||||
#define ALLOCA(n) _alloca(n)
|
||||
#else
|
||||
#define ALLOCA(n) alloca(n)
|
||||
#define ALLOCA(n) alloca(n)
|
||||
#endif
|
||||
//
|
||||
//
|
||||
|
10082
src/compute/hs/cl/gen9/hs_cl.cl
Normal file
10082
src/compute/hs/cl/gen9/hs_cl.cl
Normal file
File diff suppressed because it is too large
Load Diff
122
src/compute/hs/cl/gen9/hs_cl.h
Normal file
122
src/compute/hs/cl/gen9/hs_cl.h
Normal file
@ -0,0 +1,122 @@
|
||||
//
|
||||
// Copyright 2016 Google Inc.
|
||||
//
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
//
|
||||
|
||||
#ifndef HS_CL_ONCE
|
||||
#define HS_CL_ONCE
|
||||
|
||||
#define HS_LANES_PER_WARP_LOG2 3
|
||||
#define HS_LANES_PER_WARP (1 << HS_LANES_PER_WARP_LOG2)
|
||||
#define HS_BS_WARPS 16
|
||||
#define HS_BS_WARPS_LOG2_RU 4
|
||||
#define HS_BC_WARPS_LOG2_MAX 4
|
||||
#define HS_FM_BLOCKS_LOG2_MIN 1
|
||||
#define HS_HM_BLOCKS_LOG2_MIN 1
|
||||
#define HS_KEYS_PER_LANE 16
|
||||
#define HS_REG_LAST(c) c##16
|
||||
#define HS_KEY_WORDS 2
|
||||
#define HS_KEY_TYPE ulong
|
||||
#define HS_EMPTY
|
||||
|
||||
#define HS_SLAB_ROWS() \
|
||||
HS_SLAB_ROW( 1, 0 ) \
|
||||
HS_SLAB_ROW( 2, 1 ) \
|
||||
HS_SLAB_ROW( 3, 2 ) \
|
||||
HS_SLAB_ROW( 4, 3 ) \
|
||||
HS_SLAB_ROW( 5, 4 ) \
|
||||
HS_SLAB_ROW( 6, 5 ) \
|
||||
HS_SLAB_ROW( 7, 6 ) \
|
||||
HS_SLAB_ROW( 8, 7 ) \
|
||||
HS_SLAB_ROW( 9, 8 ) \
|
||||
HS_SLAB_ROW( 10, 9 ) \
|
||||
HS_SLAB_ROW( 11, 10 ) \
|
||||
HS_SLAB_ROW( 12, 11 ) \
|
||||
HS_SLAB_ROW( 13, 12 ) \
|
||||
HS_SLAB_ROW( 14, 13 ) \
|
||||
HS_SLAB_ROW( 15, 14 ) \
|
||||
HS_SLAB_ROW( 16, 15 ) \
|
||||
HS_EMPTY
|
||||
|
||||
#define HS_TRANSPOSE_SLAB() \
|
||||
HS_TRANSPOSE_STAGE( 1 ) \
|
||||
HS_TRANSPOSE_STAGE( 2 ) \
|
||||
HS_TRANSPOSE_STAGE( 3 ) \
|
||||
HS_TRANSPOSE_BLEND( r, s, 1, 2, 1 ) \
|
||||
HS_TRANSPOSE_BLEND( r, s, 1, 4, 3 ) \
|
||||
HS_TRANSPOSE_BLEND( r, s, 1, 6, 5 ) \
|
||||
HS_TRANSPOSE_BLEND( r, s, 1, 8, 7 ) \
|
||||
HS_TRANSPOSE_BLEND( r, s, 1, 10, 9 ) \
|
||||
HS_TRANSPOSE_BLEND( r, s, 1, 12, 11 ) \
|
||||
HS_TRANSPOSE_BLEND( r, s, 1, 14, 13 ) \
|
||||
HS_TRANSPOSE_BLEND( r, s, 1, 16, 15 ) \
|
||||
HS_TRANSPOSE_BLEND( s, t, 2, 3, 1 ) \
|
||||
HS_TRANSPOSE_BLEND( s, t, 2, 4, 2 ) \
|
||||
HS_TRANSPOSE_BLEND( s, t, 2, 7, 5 ) \
|
||||
HS_TRANSPOSE_BLEND( s, t, 2, 8, 6 ) \
|
||||
HS_TRANSPOSE_BLEND( s, t, 2, 11, 9 ) \
|
||||
HS_TRANSPOSE_BLEND( s, t, 2, 12, 10 ) \
|
||||
HS_TRANSPOSE_BLEND( s, t, 2, 15, 13 ) \
|
||||
HS_TRANSPOSE_BLEND( s, t, 2, 16, 14 ) \
|
||||
HS_TRANSPOSE_BLEND( t, u, 3, 5, 1 ) \
|
||||
HS_TRANSPOSE_BLEND( t, u, 3, 6, 2 ) \
|
||||
HS_TRANSPOSE_BLEND( t, u, 3, 7, 3 ) \
|
||||
HS_TRANSPOSE_BLEND( t, u, 3, 8, 4 ) \
|
||||
HS_TRANSPOSE_BLEND( t, u, 3, 13, 9 ) \
|
||||
HS_TRANSPOSE_BLEND( t, u, 3, 14, 10 ) \
|
||||
HS_TRANSPOSE_BLEND( t, u, 3, 15, 11 ) \
|
||||
HS_TRANSPOSE_BLEND( t, u, 3, 16, 12 ) \
|
||||
HS_TRANSPOSE_REMAP( u, 1, 1 ) \
|
||||
HS_TRANSPOSE_REMAP( u, 2, 3 ) \
|
||||
HS_TRANSPOSE_REMAP( u, 3, 5 ) \
|
||||
HS_TRANSPOSE_REMAP( u, 4, 7 ) \
|
||||
HS_TRANSPOSE_REMAP( u, 5, 9 ) \
|
||||
HS_TRANSPOSE_REMAP( u, 6, 11 ) \
|
||||
HS_TRANSPOSE_REMAP( u, 7, 13 ) \
|
||||
HS_TRANSPOSE_REMAP( u, 8, 15 ) \
|
||||
HS_TRANSPOSE_REMAP( u, 9, 2 ) \
|
||||
HS_TRANSPOSE_REMAP( u, 10, 4 ) \
|
||||
HS_TRANSPOSE_REMAP( u, 11, 6 ) \
|
||||
HS_TRANSPOSE_REMAP( u, 12, 8 ) \
|
||||
HS_TRANSPOSE_REMAP( u, 13, 10 ) \
|
||||
HS_TRANSPOSE_REMAP( u, 14, 12 ) \
|
||||
HS_TRANSPOSE_REMAP( u, 15, 14 ) \
|
||||
HS_TRANSPOSE_REMAP( u, 16, 16 ) \
|
||||
HS_EMPTY
|
||||
|
||||
#define HS_FM_BLOCKS_LOG2_1 0
|
||||
#define HS_FM_BLOCKS_LOG2_2 1
|
||||
#define HS_FM_BLOCKS_LOG2_3 2
|
||||
#define HS_FM_BLOCKS_LOG2_4 3
|
||||
#define HS_FM_BLOCKS_LOG2_5 4
|
||||
#define HS_FM_BLOCKS_LOG2_6 5
|
||||
#define HS_HM_BLOCKS_LOG2_5 0
|
||||
#define HS_FM_BLOCKS_LOG2_7 6
|
||||
#define HS_HM_BLOCKS_LOG2_6 1
|
||||
#define HS_FM_BLOCKS_LOG2_8 7
|
||||
#define HS_HM_BLOCKS_LOG2_7 2
|
||||
#define HS_FM_BLOCKS_LOG2_9 8
|
||||
#define HS_HM_BLOCKS_LOG2_8 3
|
||||
#define HS_FM_BLOCKS_LOG2_10 9
|
||||
#define HS_HM_BLOCKS_LOG2_9 4
|
||||
#define HS_FM_BLOCKS_LOG2_11 10
|
||||
#define HS_HM_BLOCKS_LOG2_10 5
|
||||
#define HS_FM_BLOCKS_LOG2_12 11
|
||||
#define HS_HM_BLOCKS_LOG2_11 6
|
||||
#define HS_FM_BLOCKS_LOG2_13 12
|
||||
#define HS_HM_BLOCKS_LOG2_12 7
|
||||
#define HS_FM_BLOCKS_LOG2_14 13
|
||||
#define HS_HM_BLOCKS_LOG2_13 8
|
||||
#define HS_FM_BLOCKS_LOG2_15 14
|
||||
#define HS_HM_BLOCKS_LOG2_14 9
|
||||
#define HS_FM_BLOCKS_LOG2_16 15
|
||||
#define HS_HM_BLOCKS_LOG2_15 10
|
||||
|
||||
#endif
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
199
src/compute/hs/cl/gen9/hs_cl_macros.h
Normal file
199
src/compute/hs/cl/gen9/hs_cl_macros.h
Normal file
@ -0,0 +1,199 @@
|
||||
//
|
||||
// Copyright 2016 Google Inc.
|
||||
//
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
//
|
||||
|
||||
#ifndef HS_CL_MACROS_ONCE
|
||||
#define HS_CL_MACROS_ONCE
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#include "hs_cl.h"
|
||||
|
||||
//
|
||||
// Inter-lane compare exchange
|
||||
//
|
||||
|
||||
// default
|
||||
#define HS_CMP_XCHG_V0(a,b) \
|
||||
{ \
|
||||
HS_KEY_TYPE const t = min(a,b); \
|
||||
b = max(a,b); \
|
||||
a = t; \
|
||||
}
|
||||
|
||||
// super slow
|
||||
#define HS_CMP_XCHG_V1(a,b) \
|
||||
{ \
|
||||
HS_KEY_TYPE const tmp = a; \
|
||||
a = (a < b) ? a : b; \
|
||||
b ^= a ^ tmp; \
|
||||
}
|
||||
|
||||
// best
|
||||
#define HS_CMP_XCHG_V2(a,b) \
|
||||
if (a >= b) { \
|
||||
HS_KEY_TYPE const t = a; \
|
||||
a = b; \
|
||||
b = t; \
|
||||
}
|
||||
|
||||
// good
|
||||
#define HS_CMP_XCHG_V3(a,b) \
|
||||
{ \
|
||||
int const ge = a >= b; \
|
||||
HS_KEY_TYPE const t = a; \
|
||||
a = ge ? b : a; \
|
||||
b = ge ? t : b; \
|
||||
}
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#if (HS_KEY_WORDS == 1)
|
||||
#define HS_CMP_XCHG(a,b) HS_CMP_XCHG_V0(a,b)
|
||||
#elif (HS_KEY_WORDS == 2)
|
||||
#define HS_CMP_XCHG(a,b) HS_CMP_XCHG_V2(a,b)
|
||||
#endif
|
||||
|
||||
//
|
||||
// Conditional inter-subgroup flip/half compare exchange
|
||||
//
|
||||
|
||||
#define HS_CMP_FLIP(i,a,b) \
|
||||
{ \
|
||||
HS_KEY_TYPE const ta = intel_sub_group_shuffle(a,flip_lane_idx); \
|
||||
HS_KEY_TYPE const tb = intel_sub_group_shuffle(b,flip_lane_idx); \
|
||||
a = HS_COND_MIN_MAX(t_lt,a,tb); \
|
||||
b = HS_COND_MIN_MAX(t_lt,b,ta); \
|
||||
}
|
||||
|
||||
#define HS_CMP_HALF(i,a) \
|
||||
{ \
|
||||
HS_KEY_TYPE const ta = intel_sub_group_shuffle(a,half_lane_idx); \
|
||||
a = HS_COND_MIN_MAX(t_lt,a,ta); \
|
||||
}
|
||||
|
||||
//
|
||||
// The device's comparison operator might return what we actually
|
||||
// want. For example, it appears GEN 'cmp' returns {true:-1,false:0}.
|
||||
//
|
||||
|
||||
#define HS_CMP_IS_ZERO_ONE
|
||||
|
||||
#ifdef HS_CMP_IS_ZERO_ONE
|
||||
// OpenCL requires a {true: +1, false: 0} scalar result
|
||||
// (a < b) -> { +1, 0 } -> NEGATE -> { 0, 0xFFFFFFFF }
|
||||
#define HS_LTE_TO_MASK(a,b) (HS_KEY_TYPE)(-(a <= b))
|
||||
#define HS_CMP_TO_MASK(a) (HS_KEY_TYPE)(-a)
|
||||
#else
|
||||
// However, OpenCL requires { -1, 0 } for vectors
|
||||
// (a < b) -> { 0xFFFFFFFF, 0 }
|
||||
#define HS_LTE_TO_MASK(a,b) (a <= b) // FIXME for uint64
|
||||
#define HS_CMP_TO_MASK(a) (a)
|
||||
#endif
|
||||
|
||||
//
|
||||
// The flip/half comparisons rely on a "conditional min/max":
|
||||
//
|
||||
// - if the flag is false, return min(a,b)
|
||||
// - otherwise, return max(a,b)
|
||||
//
|
||||
// What's a little surprising is that sequence (1) is faster than (2)
|
||||
// for 32-bit keys.
|
||||
//
|
||||
// I suspect either a code generation problem or that the sequence
|
||||
// maps well to the GEN instruction set.
|
||||
//
|
||||
// We mostly care about 64-bit keys and unsurprisingly sequence (2) is
|
||||
// fastest for this wider type.
|
||||
//
|
||||
|
||||
// this is what you would normally use
|
||||
#define HS_COND_MIN_MAX_V0(lt,a,b) ((a <= b) ^ lt) ? b : a
|
||||
|
||||
// this seems to be faster for 32-bit keys
|
||||
#define HS_COND_MIN_MAX_V1(lt,a,b) (lt ? b : a) ^ ((a ^ b) & HS_LTE_TO_MASK(a,b))
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#if (HS_KEY_WORDS == 1)
|
||||
#define HS_COND_MIN_MAX(lt,a,b) HS_COND_MIN_MAX_V1(lt,a,b)
|
||||
#elif (HS_KEY_WORDS == 2)
|
||||
#define HS_COND_MIN_MAX(lt,a,b) HS_COND_MIN_MAX_V0(lt,a,b)
|
||||
#endif
|
||||
|
||||
//
|
||||
// This snarl of macros is for transposing a "slab" of sorted elements
|
||||
// into linear order.
|
||||
//
|
||||
// This can occur as the last step in hs_sort() or via a custom kernel
|
||||
// that inspects the slab and then transposes and stores it to memory.
|
||||
//
|
||||
// The slab format can be inspected more efficiently than a linear
|
||||
// arrangement.
|
||||
//
|
||||
// The prime example is detecting when adjacent keys (in sort order)
|
||||
// have differing high order bits ("key changes"). The index of each
|
||||
// change is recorded to an auxilary array.
|
||||
//
|
||||
// A post-processing step like this needs to be able to navigate the
|
||||
// slab and eventually transpose and store the slab in linear order.
|
||||
//
|
||||
|
||||
#define HS_TRANSPOSE_REG(prefix,row) prefix##row
|
||||
#define HS_TRANSPOSE_DECL(prefix,row) HS_KEY_TYPE const HS_TRANSPOSE_REG(prefix,row)
|
||||
|
||||
#define HS_TRANSPOSE_DELTA(level) (HS_LANES_PER_WARP + (1 << (level-1)))
|
||||
#define HS_TRANSPOSE_IF(level) ((get_sub_group_local_id() >> (level - 1)) & 1)
|
||||
|
||||
#define HS_TRANSPOSE_LL(level) HS_TRANSPOSE_IF(level) ? 0 : HS_TRANSPOSE_DELTA(level)
|
||||
#define HS_TRANSPOSE_UR(level) HS_TRANSPOSE_IF(level) ? HS_TRANSPOSE_DELTA(level) : 0
|
||||
|
||||
#define HS_TRANSPOSE_DELTA_LL(level) delta_ll_##level
|
||||
#define HS_TRANSPOSE_DELTA_UR(level) delta_ur_##level
|
||||
|
||||
#define HS_TRANSPOSE_STAGE(level) \
|
||||
uint const HS_TRANSPOSE_DELTA_LL(level) = HS_TRANSPOSE_LL(level); \
|
||||
uint const HS_TRANSPOSE_DELTA_UR(level) = HS_TRANSPOSE_UR(level);
|
||||
|
||||
#define HS_TRANSPOSE_BLEND(prefix_prev,prefix_curr,level,row_ll,row_ur) \
|
||||
HS_TRANSPOSE_DECL(prefix_curr,row_ll) = \
|
||||
intel_sub_group_shuffle_down(HS_TRANSPOSE_REG(prefix_prev,row_ll), \
|
||||
HS_TRANSPOSE_REG(prefix_prev,row_ur), \
|
||||
HS_TRANSPOSE_DELTA_LL(level)); \
|
||||
HS_TRANSPOSE_DECL(prefix_curr,row_ur) = \
|
||||
intel_sub_group_shuffle_up(HS_TRANSPOSE_REG(prefix_prev,row_ll), \
|
||||
HS_TRANSPOSE_REG(prefix_prev,row_ur), \
|
||||
HS_TRANSPOSE_DELTA_UR(level)); \
|
||||
|
||||
// #define HS_TRANSPOSE_LOAD(row) \
|
||||
// HS_TRANSPOSE_DECL(0,row) = (vout + gmem_idx)[(row-1) << HS_LANES_PER_WARP_LOG2];
|
||||
|
||||
#define HS_TRANSPOSE_REMAP(prefix,row_from,row_to) \
|
||||
(vout + gmem_idx)[(row_to-1) << HS_LANES_PER_WARP_LOG2] = \
|
||||
HS_TRANSPOSE_REG(prefix,row_from);
|
||||
|
||||
//
|
||||
// undefine these if you want to override
|
||||
//
|
||||
|
||||
#define HS_TRANSPOSE_PREAMBLE()
|
||||
#define HS_TRANSPOSE_BODY()
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#endif
|
||||
|
||||
//
|
||||
//
|
||||
//
|
@ -28,23 +28,20 @@
|
||||
//
|
||||
|
||||
skc_err
|
||||
skc_context_create(skc_context_t * context,
|
||||
char const * target_platform_substring,
|
||||
char const * target_device_substring,
|
||||
intptr_t context_properties[])
|
||||
skc_context_create_cl(skc_context_t * context,
|
||||
cl_context context_cl,
|
||||
cl_device_id device_id_cl)
|
||||
{
|
||||
(*context) = malloc(sizeof(**context));
|
||||
|
||||
//
|
||||
// FIXME -- don't directly grab a CL runtime but for now juts create
|
||||
// the CL_12 runtime here
|
||||
// FIXME -- we'll clean up context creation by platform later. For
|
||||
// now, just create a CL_12 context.
|
||||
//
|
||||
skc_err err;
|
||||
|
||||
err = skc_runtime_cl_12_create(*context,
|
||||
target_platform_substring,
|
||||
target_device_substring,
|
||||
context_properties);
|
||||
err = skc_runtime_cl_12_create(*context,context_cl,device_id_cl);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
|
@ -21,6 +21,11 @@
|
||||
#include <stdlib.h>
|
||||
#include <conio.h>
|
||||
|
||||
#include "skc_create_cl.h"
|
||||
|
||||
#include "common/cl/find_cl.h"
|
||||
#include "common/cl/assert_cl.h"
|
||||
|
||||
#include "svg/svg_doc.h"
|
||||
#include "svg2skc/svg2skc.h"
|
||||
#include "svg2skc/transform_stack.h"
|
||||
@ -49,7 +54,7 @@ skc_runtime_cl_12_debug(struct skc_context * const context);
|
||||
//
|
||||
//
|
||||
|
||||
static
|
||||
static
|
||||
void
|
||||
is_render_complete(skc_surface_t surface,
|
||||
skc_styling_t styling,
|
||||
@ -67,9 +72,9 @@ int
|
||||
main(int argc, char** argv)
|
||||
{
|
||||
//
|
||||
//
|
||||
//
|
||||
if (argc <= 1)
|
||||
//
|
||||
if (argc <= 1)
|
||||
{
|
||||
fprintf(stderr,"-- missing filename\n");
|
||||
return EXIT_FAILURE; // no filename
|
||||
@ -94,6 +99,18 @@ main(int argc, char** argv)
|
||||
|
||||
skc_interop_init(&window);
|
||||
|
||||
//
|
||||
// find platform and device by name
|
||||
//
|
||||
cl_platform_id platform_id_cl;
|
||||
cl_device_id device_id_cl;
|
||||
|
||||
cl(FindIdsByName("Intel","Graphics",
|
||||
&platform_id_cl,
|
||||
&device_id_cl,
|
||||
0,NULL,NULL,
|
||||
true));
|
||||
|
||||
//
|
||||
// get GL and device contexts
|
||||
//
|
||||
@ -101,22 +118,31 @@ main(int argc, char** argv)
|
||||
HDC hDC = wglGetCurrentDC();
|
||||
|
||||
//
|
||||
// create the CL context
|
||||
//
|
||||
//
|
||||
cl_context_properties context_properties[] =
|
||||
cl_context_properties context_properties_cl[] =
|
||||
{
|
||||
CL_CONTEXT_PLATFORM, (cl_context_properties)-1,
|
||||
CL_CONTEXT_PLATFORM, (cl_context_properties)platform_id_cl,
|
||||
CL_GL_CONTEXT_KHR, (cl_context_properties)hGLRC,
|
||||
CL_WGL_HDC_KHR, (cl_context_properties)hDC,
|
||||
0
|
||||
};
|
||||
|
||||
|
||||
cl_int cl_err;
|
||||
cl_context context_cl = clCreateContext(context_properties_cl,
|
||||
1,
|
||||
&device_id_cl,
|
||||
NULL,
|
||||
NULL,
|
||||
&cl_err); cl_ok(cl_err);
|
||||
//
|
||||
// create context
|
||||
// create SKC context
|
||||
//
|
||||
skc_context_t context;
|
||||
|
||||
skc_err err = skc_context_create(&context,"Intel","Graphics",context_properties);
|
||||
skc_err err = skc_context_create_cl(&context,
|
||||
context_cl,
|
||||
device_id_cl);
|
||||
|
||||
//
|
||||
// associate
|
||||
@ -136,14 +162,14 @@ main(int argc, char** argv)
|
||||
skc_raster_builder_t raster_builder;
|
||||
|
||||
err = skc_raster_builder_create(context,&raster_builder);
|
||||
|
||||
|
||||
//
|
||||
// create a composition
|
||||
//
|
||||
skc_composition_t composition;
|
||||
|
||||
err = skc_composition_create(context,&composition);
|
||||
|
||||
|
||||
//
|
||||
// create a styling instance
|
||||
//
|
||||
@ -154,7 +180,7 @@ main(int argc, char** argv)
|
||||
svg_doc_layer_count(svg_doc),
|
||||
1000,
|
||||
2 * 1024 * 1024);
|
||||
|
||||
|
||||
//
|
||||
// create a surface
|
||||
//
|
||||
@ -191,7 +217,7 @@ main(int argc, char** argv)
|
||||
skc_transform_stack_restore(ts,ts_save);
|
||||
|
||||
// decode layers -- places rasters
|
||||
svg_doc_layers_decode(svg_doc,rasters,composition,styling,true/*is_srgb*/);
|
||||
svg_doc_layers_decode(svg_doc,rasters,composition,styling,true/*is_srgb*/);
|
||||
|
||||
// seal the composition
|
||||
skc_composition_seal(composition);
|
||||
@ -244,7 +270,7 @@ main(int argc, char** argv)
|
||||
// unseal the composition
|
||||
skc_composition_unseal(composition,true);
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// dispose of mundane resources
|
||||
//
|
||||
|
@ -106,7 +106,7 @@ skc_allocator_device_create(struct skc_runtime * const runtime)
|
||||
&runtime->allocator.device.temp.suballocator,
|
||||
"DEVICE",
|
||||
runtime->config->suballocator.device.subbufs,
|
||||
runtime->cl.base_align,
|
||||
runtime->cl.align_bytes,
|
||||
runtime->config->suballocator.device.size);
|
||||
|
||||
#ifndef NDEBUG
|
||||
|
@ -12,7 +12,6 @@
|
||||
//
|
||||
//
|
||||
|
||||
#include "runtime_cl.h"
|
||||
#include "block_pool_cl.h"
|
||||
|
||||
//
|
||||
@ -52,8 +51,8 @@ struct skc_config
|
||||
union skc_block_pool_size block_pool;
|
||||
|
||||
struct {
|
||||
skc_cq_type_e type;
|
||||
skc_uint size;
|
||||
cl_command_queue_properties cq_props;
|
||||
skc_uint size;
|
||||
} cq_pool;
|
||||
|
||||
struct {
|
||||
|
@ -7,17 +7,18 @@
|
||||
*/
|
||||
|
||||
//
|
||||
//
|
||||
// squelch OpenCL 1.2 deprecation warning
|
||||
//
|
||||
|
||||
#ifndef NDEBUG
|
||||
#include <stdio.h>
|
||||
#ifndef CL_USE_DEPRECATED_OPENCL_1_2_APIS
|
||||
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
|
||||
#endif
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
//
|
||||
@ -25,6 +26,7 @@
|
||||
//
|
||||
|
||||
#include "runtime_cl_12.h"
|
||||
#include "common/cl/assert_cl.h"
|
||||
|
||||
//
|
||||
// This implementation is probably excessive.
|
||||
@ -40,21 +42,77 @@
|
||||
//
|
||||
//
|
||||
|
||||
void
|
||||
skc_cq_pool_create(struct skc_runtime * const runtime,
|
||||
struct skc_cq_pool * const pool,
|
||||
skc_uint const type,
|
||||
skc_uint const size)
|
||||
{
|
||||
pool->type = type;
|
||||
pool->size = size + 1; // an empty spot
|
||||
pool->reads = 0;
|
||||
pool->writes = size;
|
||||
pool->cq = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,pool->size * sizeof(*pool->cq));
|
||||
static
|
||||
cl_command_queue
|
||||
skc_runtime_cl_12_create_cq(struct skc_runtime * const runtime,
|
||||
struct skc_cq_pool * const pool)
|
||||
|
||||
{
|
||||
cl_command_queue cq;
|
||||
|
||||
#if 1
|
||||
//
|
||||
// <= OpenCL 1.2
|
||||
//
|
||||
cl_int cl_err;
|
||||
|
||||
cq = clCreateCommandQueue(runtime->cl.context,
|
||||
runtime->cl.device_id,
|
||||
pool->cq_props,
|
||||
&cl_err); cl_ok(cl_err);
|
||||
#else
|
||||
if (runtime_cl->version.major < 2)
|
||||
{
|
||||
//
|
||||
// <= OpenCL 1.2
|
||||
//
|
||||
cl_int cl_err;
|
||||
|
||||
cq = clCreateCommandQueue(runtime_cl->context,
|
||||
runtime_cl->device_id,
|
||||
(cl_command_queue_properties)type,
|
||||
&cl_err); cl_ok(cl_err);
|
||||
}
|
||||
else
|
||||
{
|
||||
//
|
||||
// >= OpenCL 2.0
|
||||
//
|
||||
cl_int cl_err;
|
||||
cl_queue_properties const queue_properties[] = {
|
||||
CL_QUEUE_PROPERTIES,(cl_queue_properties)type,0
|
||||
};
|
||||
|
||||
cq = clCreateCommandQueueWithProperties(runtime_cl->context,
|
||||
runtime_cl->device_id,
|
||||
queue_properties,
|
||||
&cl_err); cl_ok(cl_err);
|
||||
}
|
||||
#endif
|
||||
|
||||
return cq;
|
||||
}
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
void
|
||||
skc_cq_pool_create(struct skc_runtime * const runtime,
|
||||
struct skc_cq_pool * const pool,
|
||||
cl_command_queue_properties const cq_props,
|
||||
skc_uint const size)
|
||||
{
|
||||
pool->size = size + 1; // an empty spot
|
||||
pool->reads = 0;
|
||||
pool->writes = size;
|
||||
|
||||
pool->cq_props = cq_props;
|
||||
pool->cq = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,
|
||||
pool->size * sizeof(*pool->cq));
|
||||
for (skc_uint ii=0; ii<size; ii++)
|
||||
pool->cq[ii] = skc_runtime_cl_12_create_cq(runtime,pool);
|
||||
|
||||
for (skc_uint ii=0; ii<size; ii++) {
|
||||
pool->cq[ii] = skc_runtime_cl_create_cq(&runtime->cl,pool->type);
|
||||
}
|
||||
pool->cq[size] = NULL;
|
||||
}
|
||||
|
||||
@ -77,7 +135,7 @@ skc_cq_pool_dispose(struct skc_runtime * const runtime,
|
||||
//
|
||||
//
|
||||
|
||||
static
|
||||
static
|
||||
void
|
||||
skc_cq_pool_write(struct skc_cq_pool * const pool,
|
||||
cl_command_queue cq)
|
||||
@ -109,14 +167,14 @@ skc_cq_pool_expand(struct skc_runtime * const runtime,
|
||||
pool->writes = expand;
|
||||
|
||||
for (skc_uint ii=0; ii<expand; ii++)
|
||||
pool->cq[ii] = skc_runtime_cl_create_cq(&runtime->cl,pool->type);
|
||||
pool->cq[ii] = skc_runtime_cl_12_create_cq(runtime,pool);
|
||||
}
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
static
|
||||
static
|
||||
cl_command_queue
|
||||
skc_cq_pool_read(struct skc_runtime * const runtime,
|
||||
struct skc_cq_pool * const pool)
|
||||
@ -141,7 +199,7 @@ skc_runtime_acquire_cq_in_order(struct skc_runtime * const runtime)
|
||||
}
|
||||
|
||||
void
|
||||
skc_runtime_release_cq_in_order(struct skc_runtime * const runtime,
|
||||
skc_runtime_release_cq_in_order(struct skc_runtime * const runtime,
|
||||
cl_command_queue cq)
|
||||
{
|
||||
skc_cq_pool_write(&runtime->cq_pool,cq);
|
||||
|
@ -20,11 +20,12 @@
|
||||
|
||||
struct skc_cq_pool
|
||||
{
|
||||
skc_cq_type_e type;
|
||||
skc_uint size;
|
||||
skc_uint reads;
|
||||
skc_uint writes;
|
||||
cl_command_queue * cq;
|
||||
cl_command_queue * cq;
|
||||
cl_command_queue_properties cq_props;
|
||||
|
||||
skc_uint size;
|
||||
skc_uint reads;
|
||||
skc_uint writes;
|
||||
};
|
||||
|
||||
//l
|
||||
@ -32,10 +33,10 @@ struct skc_cq_pool
|
||||
//
|
||||
|
||||
void
|
||||
skc_cq_pool_create(struct skc_runtime * const runtime,
|
||||
struct skc_cq_pool * const pool,
|
||||
skc_uint const type,
|
||||
skc_uint const size);
|
||||
skc_cq_pool_create(struct skc_runtime * const runtime,
|
||||
struct skc_cq_pool * const pool,
|
||||
cl_command_queue_properties const cq_props,
|
||||
skc_uint const size);
|
||||
|
||||
void
|
||||
skc_cq_pool_dispose(struct skc_runtime * const runtime,
|
||||
|
@ -77,6 +77,10 @@ cl_kernel
|
||||
skc_device_acquire_kernel(struct skc_device * const device,
|
||||
skc_device_kernel_id const type);
|
||||
|
||||
void
|
||||
skc_device_release_kernel(struct skc_device * const device,
|
||||
cl_kernel kernel);
|
||||
|
||||
//
|
||||
// grid shape can vary greatly by target platform
|
||||
//
|
||||
|
@ -1,64 +1,64 @@
|
||||
/*
|
||||
* Copyright 2017 Google Inc.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license that can
|
||||
* be found in the LICENSE file.
|
||||
*
|
||||
*/
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#include "device_cl_12.h"
|
||||
|
||||
//
|
||||
// BEST TO RUN THESE ON AN OUT-OF-ORDER CQ
|
||||
//
|
||||
|
||||
__kernel
|
||||
SKC_BP_INIT_IDS_KERNEL_ATTRIBS
|
||||
void
|
||||
skc_kernel_block_pool_init_ids(__global uint * const ids, uint const bp_size)
|
||||
{
|
||||
uint const gid = get_global_id(0);
|
||||
|
||||
//
|
||||
// FIXME -- TUNE FOR ARCH -- evaluate if it's much faster to
|
||||
// accomplish this with fewer threads and using either IPC and/or
|
||||
// vector stores -- it should be on certain architectures!
|
||||
//
|
||||
|
||||
//
|
||||
// initialize pool with sequence
|
||||
//
|
||||
if (gid < bp_size)
|
||||
ids[gid] = gid * SKC_DEVICE_SUBBLOCKS_PER_BLOCK;
|
||||
}
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
__kernel
|
||||
SKC_BP_INIT_ATOMICS_KERNEL_ATTRIBS
|
||||
void
|
||||
skc_kernel_block_pool_init_atomics(__global uint * const bp_atomics, uint const bp_size)
|
||||
{
|
||||
// the version test is to squelch a bug with the Intel OpenCL CPU
|
||||
// compiler declaring it supports the cl_intel_subgroups extension
|
||||
#if defined(cl_intel_subgroups) || defined (cl_khr_subgroups)
|
||||
uint const tid = get_sub_group_local_id();
|
||||
#else
|
||||
uint const tid = get_local_id(0);
|
||||
#endif
|
||||
|
||||
//
|
||||
// launch two threads and store [ 0, bp_size ]
|
||||
//
|
||||
bp_atomics[tid] = tid * bp_size;
|
||||
}
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
/*
|
||||
* Copyright 2017 Google Inc.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license that can
|
||||
* be found in the LICENSE file.
|
||||
*
|
||||
*/
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#include "kernel_cl_12.h"
|
||||
|
||||
//
|
||||
// BEST TO RUN THESE ON AN OUT-OF-ORDER CQ
|
||||
//
|
||||
|
||||
__kernel
|
||||
SKC_BP_INIT_IDS_KERNEL_ATTRIBS
|
||||
void
|
||||
skc_kernel_block_pool_init_ids(__global uint * const ids, uint const bp_size)
|
||||
{
|
||||
uint const gid = get_global_id(0);
|
||||
|
||||
//
|
||||
// FIXME -- TUNE FOR ARCH -- evaluate if it's much faster to
|
||||
// accomplish this with fewer threads and using either IPC and/or
|
||||
// vector stores -- it should be on certain architectures!
|
||||
//
|
||||
|
||||
//
|
||||
// initialize pool with sequence
|
||||
//
|
||||
if (gid < bp_size)
|
||||
ids[gid] = gid * SKC_DEVICE_SUBBLOCKS_PER_BLOCK;
|
||||
}
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
__kernel
|
||||
SKC_BP_INIT_ATOMICS_KERNEL_ATTRIBS
|
||||
void
|
||||
skc_kernel_block_pool_init_atomics(__global uint * const bp_atomics, uint const bp_size)
|
||||
{
|
||||
// the version test is to squelch a bug with the Intel OpenCL CPU
|
||||
// compiler declaring it supports the cl_intel_subgroups extension
|
||||
#if defined(cl_intel_subgroups) || defined (cl_khr_subgroups)
|
||||
uint const tid = get_sub_group_local_id();
|
||||
#else
|
||||
uint const tid = get_local_id(0);
|
||||
#endif
|
||||
|
||||
//
|
||||
// launch two threads and store [ 0, bp_size ]
|
||||
//
|
||||
bp_atomics[tid] = tid * bp_size;
|
||||
}
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
@ -19,6 +19,7 @@
|
||||
#include "config_cl.h"
|
||||
#include "runtime_cl_12.h"
|
||||
|
||||
#include "kernel_cl_12.h"
|
||||
#include "device_cl_12.h"
|
||||
|
||||
#include "hs/cl/hs_cl_launcher.h"
|
||||
@ -124,9 +125,9 @@ struct skc_config const config =
|
||||
|
||||
.cq_pool = {
|
||||
#ifndef NDEBUG
|
||||
.type = SKC_CQ_TYPE_IN_ORDER_PROFILING,
|
||||
.cq_props = CL_QUEUE_PROFILING_ENABLE,
|
||||
#else
|
||||
.type = 0,
|
||||
.cq_props = 0,
|
||||
#endif
|
||||
.size = 8
|
||||
},
|
||||
@ -841,6 +842,14 @@ skc_device_acquire_kernel(struct skc_device * const device,
|
||||
return kernel;
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
skc_device_release_kernel(struct skc_device * const device,
|
||||
cl_kernel kernel)
|
||||
{
|
||||
cl(ReleaseKernel(kernel));
|
||||
}
|
||||
|
||||
//
|
||||
// INITIALIZE KERNEL ARGS
|
||||
//
|
||||
|
@ -1,309 +1,309 @@
|
||||
/*
|
||||
* Copyright 2017 Google Inc.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license that can
|
||||
* be found in the LICENSE file.
|
||||
*
|
||||
*/
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#include "block.h"
|
||||
#include "path.h"
|
||||
#include "common.h"
|
||||
#include "atomic_cl.h"
|
||||
#include "raster_builder_cl_12.h"
|
||||
#include "device_cl_12.h"
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#define SKC_FILLS_EXPAND_SUBGROUP_SIZE_MASK (SKC_FILLS_EXPAND_SUBGROUP_SIZE - 1)
|
||||
|
||||
#define SKC_FILLS_EXPAND_ELEMS_PER_BLOCK (SKC_DEVICE_BLOCK_WORDS / SKC_FILLS_EXPAND_ELEM_WORDS)
|
||||
#define SKC_FILLS_EXPAND_ELEMS_PER_SUBBLOCK (SKC_DEVICE_SUBBLOCK_WORDS / SKC_FILLS_EXPAND_ELEM_WORDS)
|
||||
|
||||
#define SKC_FILLS_EXPAND_ELEMS_PER_THREAD (SKC_FILLS_EXPAND_ELEMS_PER_BLOCK / SKC_FILLS_EXPAND_SUBGROUP_SIZE)
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#define SKC_FILLS_EXPAND_X (SKC_DEVICE_BLOCK_WORDS / SKC_FILLS_EXPAND_SUBGROUP_SIZE)
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#if ( SKC_FILLS_EXPAND_X == 1 )
|
||||
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_1()
|
||||
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 0
|
||||
|
||||
#elif ( SKC_FILLS_EXPAND_X == 2 )
|
||||
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_2()
|
||||
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 1
|
||||
|
||||
#elif ( SKC_FILLS_EXPAND_X == 4 )
|
||||
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_4()
|
||||
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 3
|
||||
|
||||
#elif ( SKC_FILLS_EXPAND_X == 8 )
|
||||
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_8()
|
||||
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 7
|
||||
|
||||
#elif ( SKC_FILLS_EXPAND_X == 16)
|
||||
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_16()
|
||||
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 15
|
||||
|
||||
#else
|
||||
#error "MISSING SKC_FILLS_EXPAND_X"
|
||||
#endif
|
||||
|
||||
//
|
||||
// Fill and rasterize cmds only differ in their first word semantics
|
||||
//
|
||||
|
||||
union skc_cmd_expand
|
||||
{
|
||||
union skc_cmd_fill fill;
|
||||
union skc_cmd_rasterize rasterize;
|
||||
};
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
union skc_path_elem
|
||||
{
|
||||
skc_uint u32;
|
||||
skc_float f32;
|
||||
};
|
||||
|
||||
//
|
||||
// COMPILE-TIME AND RUN-TIME MACROS
|
||||
//
|
||||
|
||||
#define SKC_ELEM_IN_RANGE(X,I) \
|
||||
(skc_bool)SKC_GTE_MACRO(X,(I ) * SKC_FILLS_EXPAND_SUBGROUP_SIZE) && \
|
||||
(skc_bool)SKC_LT_MACRO(X,(I+1) * SKC_FILLS_EXPAND_SUBGROUP_SIZE)
|
||||
|
||||
#define SKC_ELEM_GTE(X,I) \
|
||||
SKC_GTE_MACRO(X,(I+1) * SKC_FILLS_EXPAND_SUBGROUP_SIZE)
|
||||
|
||||
//
|
||||
// FIXME -- slate these for replacement
|
||||
//
|
||||
|
||||
#define SKC_BROADCAST(E,S,I) \
|
||||
sub_group_broadcast(E##I.u32,S - I * SKC_FILLS_EXPAND_SUBGROUP_SIZE)
|
||||
|
||||
#define SKC_BROADCAST_LAST_HELPER(E,I) \
|
||||
sub_group_broadcast(E##I.u32,SKC_FILLS_EXPAND_SUBGROUP_SIZE - 1)
|
||||
|
||||
#define SKC_BROADCAST_LAST(E,I) \
|
||||
SKC_BROADCAST_LAST_HELPER(E,I)
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
void
|
||||
skc_cmds_out_append(__global union skc_cmd_rasterize * const cmds_out,
|
||||
skc_uint * const out_idx,
|
||||
union skc_cmd_expand * const cmd,
|
||||
union skc_path_elem const e,
|
||||
skc_uint const e_idx)
|
||||
{
|
||||
//
|
||||
// FIXME -- we can append a large number of nodeword indices to a
|
||||
// local SMEM queue and flush when full. It may or may not be a
|
||||
// performance win on some architectures.
|
||||
//
|
||||
skc_bool const is_elem = SKC_TAGGED_BLOCK_ID_GET_TAG(e.u32) < SKC_BLOCK_ID_TAG_PATH_NEXT;
|
||||
skc_uint const offset = sub_group_scan_inclusive_add(is_elem ? 1 : 0);
|
||||
|
||||
cmd->rasterize.nodeword = e_idx;
|
||||
|
||||
if (is_elem) {
|
||||
cmds_out[*out_idx + offset] = cmd->rasterize;
|
||||
}
|
||||
|
||||
*out_idx += sub_group_broadcast(offset,SKC_FILLS_EXPAND_SUBGROUP_SIZE-1);
|
||||
}
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
//
// Expand a ring of fill commands into per-nodeword rasterize
// commands.  One subgroup handles one fill command: it walks the
// referenced path's header block and its chain of node blocks,
// appending a rasterize command for every valid tagged block id.
//
__kernel
SKC_FILLS_EXPAND_KERNEL_ATTRIBS
void
skc_kernel_fills_expand(__global union skc_path_elem  const * const blocks,   // path element block pool
                        __global skc_uint    volatile       * const atomics,  // raster cohort atomics (cmd slot allocator)
                        __global skc_block_id_t       const * const map,      // path host-to-device map
                        __global union skc_cmd_fill   const * const cmds_in,  // input fill commands
                        __global union skc_cmd_rasterize    * const cmds_out) // output rasterize commands
{
  //
  // Need to harmonize the way we determine a subgroup's id.  In this
  // kernel it's not as important because no local memory is being
  // used.  Although the device/mask calc to determine subgroup and
  // lanes is still proper, we might want to make it clearer that
  // we're working with subgroups by using the subgroup API.
  //
  // every subgroup/simd that will work on the block loads the same command
  //
#if (__OPENCL_VERSION__ < 200)
  skc_uint const cmd_stride = get_num_sub_groups();
#else
  skc_uint const cmd_stride = get_enqueued_num_sub_groups(); // 2.0 supports non-uniform workgroups
#endif
  skc_uint cmd_idx = get_group_id(0) * cmd_stride + get_sub_group_id();

  // load fill command -- we reuse y component
  union skc_cmd_expand cmd = { .fill = cmds_in[cmd_idx] };

  // get the path header block from the map
  skc_block_id_t id = map[cmd.fill.path];

#if 0
  if (get_sub_group_local_id() == 0)
    printf("expand[%u] = %u\n",cmd_idx,id);
#endif

  //
  // blindly load all of the head elements into registers
  //
  skc_uint head_idx = id * SKC_FILLS_EXPAND_ELEMS_PER_SUBBLOCK + get_sub_group_local_id();

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                         \
  union skc_path_elem h##I = blocks[head_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE];

  SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();

  //
  // pick out count.nodes and count.prims from the header
  //
  skc_uint count_nodes, count_prims;

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                         \
  if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_NODES,I)) {                \
    count_nodes = SKC_BROADCAST(h,SKC_PATH_HEAD_OFFSET_NODES,I);        \
  }                                                                     \
  if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_PRIMS,I)) {                \
    count_prims = SKC_BROADCAST(h,SKC_PATH_HEAD_OFFSET_PRIMS,I);        \
  }

  SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();

  //
  // debug of path head
  //
#if 0
  skc_uint count_blocks;

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                         \
  if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_BLOCKS,I)) {               \
    count_blocks = SKC_BROADCAST(h,SKC_PATH_HEAD_OFFSET_BLOCKS,I);      \
  }

  SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();

  if (get_sub_group_local_id() == 0)
    printf("path header = { %5u, %5u, %5u }\n",
           count_blocks,count_nodes,count_prims);
#endif

  //
  // acquire slots in the expanded cmd extent
  //
  // decrement prim_idx by 1 so we can use inclusive warp scan later
  //
  skc_uint out_idx = 0;

  if (get_sub_group_local_id() == 0) {
    out_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP
      (atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_CMDS,count_prims) - 1;
  }

  out_idx = sub_group_broadcast(out_idx,0);

  //
  // process ids trailing the path header
  //
  // header words sharing a register row with the trailing ids are
  // first overwritten with SKC_TAGGED_BLOCK_ID_INVALID (presumably
  // excluded by the append's tag test -- confirm against block.h)
  //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                         \
  if (!SKC_ELEM_GTE(SKC_PATH_HEAD_OFFSET_IDS,I)) {                      \
    if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_IDS,I)) {                \
      if (get_sub_group_local_id() + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE < SKC_PATH_HEAD_OFFSET_IDS) { \
        h##I.u32 = SKC_TAGGED_BLOCK_ID_INVALID;                         \
      }                                                                 \
    }                                                                   \
    skc_cmds_out_append(cmds_out,&out_idx,&cmd,h##I,                    \
                        head_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE); \
  }

  SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();

  //
  // we're done if it was just the header
  //
  if (count_nodes == 0)
    return;

  //
  // otherwise, process the nodes
  //

  //
  // get id of next node -- held by the last element of this block
  //
  id = SKC_TAGGED_BLOCK_ID_GET_ID(SKC_BROADCAST_LAST(h,SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST));

  //
  // the following blocks are nodes
  //
  while (true)
    {
      // get index of each element
      skc_uint node_idx = id * SKC_FILLS_EXPAND_ELEMS_PER_SUBBLOCK + get_sub_group_local_id();

      //
      // blindly load all of the node elements into registers
      //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                         \
      union skc_path_elem const n##I = blocks[node_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE];

      SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();

      //
      // append all valid ids
      //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                         \
      skc_cmds_out_append(cmds_out,&out_idx,&cmd,n##I,                  \
                          node_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE);

      SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();

      // any more nodes?
      if (--count_nodes == 0)
        return;

      //
      // get id of next node
      //
      id = SKC_TAGGED_BLOCK_ID_GET_ID(SKC_BROADCAST_LAST(n,SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST));
    }
}
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
/*
|
||||
* Copyright 2017 Google Inc.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license that can
|
||||
* be found in the LICENSE file.
|
||||
*
|
||||
*/
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#include "block.h"
|
||||
#include "path.h"
|
||||
#include "common.h"
|
||||
#include "atomic_cl.h"
|
||||
#include "raster_builder_cl_12.h"
|
||||
#include "kernel_cl_12.h"
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#define SKC_FILLS_EXPAND_SUBGROUP_SIZE_MASK (SKC_FILLS_EXPAND_SUBGROUP_SIZE - 1)
|
||||
|
||||
#define SKC_FILLS_EXPAND_ELEMS_PER_BLOCK (SKC_DEVICE_BLOCK_WORDS / SKC_FILLS_EXPAND_ELEM_WORDS)
|
||||
#define SKC_FILLS_EXPAND_ELEMS_PER_SUBBLOCK (SKC_DEVICE_SUBBLOCK_WORDS / SKC_FILLS_EXPAND_ELEM_WORDS)
|
||||
|
||||
#define SKC_FILLS_EXPAND_ELEMS_PER_THREAD (SKC_FILLS_EXPAND_ELEMS_PER_BLOCK / SKC_FILLS_EXPAND_SUBGROUP_SIZE)
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#define SKC_FILLS_EXPAND_X (SKC_DEVICE_BLOCK_WORDS / SKC_FILLS_EXPAND_SUBGROUP_SIZE)
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#if ( SKC_FILLS_EXPAND_X == 1 )
|
||||
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_1()
|
||||
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 0
|
||||
|
||||
#elif ( SKC_FILLS_EXPAND_X == 2 )
|
||||
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_2()
|
||||
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 1
|
||||
|
||||
#elif ( SKC_FILLS_EXPAND_X == 4 )
|
||||
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_4()
|
||||
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 3
|
||||
|
||||
#elif ( SKC_FILLS_EXPAND_X == 8 )
|
||||
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_8()
|
||||
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 7
|
||||
|
||||
#elif ( SKC_FILLS_EXPAND_X == 16)
|
||||
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_16()
|
||||
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 15
|
||||
|
||||
#else
|
||||
#error "MISSING SKC_FILLS_EXPAND_X"
|
||||
#endif
|
||||
|
||||
//
|
||||
// Fill and rasterize cmds only differ in their first word semantics
|
||||
//
|
||||
|
||||
union skc_cmd_expand
{
  union skc_cmd_fill      fill;      // input view:  fill command loaded from cmds_in
  union skc_cmd_rasterize rasterize; // output view: rasterize command stored to cmds_out
};
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
//
// A single 32-bit path block element viewed either as an integer
// (tagged block id / header word) or as a float payload.
//
union skc_path_elem
{
  skc_uint  u32; // tagged block id or header word (tag inspected via SKC_TAGGED_BLOCK_ID_GET_TAG)
  skc_float f32; // floating-point view of the same 32-bit word
};
|
||||
|
||||
//
|
||||
// COMPILE-TIME AND RUN-TIME MACROS
|
||||
//
|
||||
|
||||
#define SKC_ELEM_IN_RANGE(X,I) \
|
||||
(skc_bool)SKC_GTE_MACRO(X,(I ) * SKC_FILLS_EXPAND_SUBGROUP_SIZE) && \
|
||||
(skc_bool)SKC_LT_MACRO(X,(I+1) * SKC_FILLS_EXPAND_SUBGROUP_SIZE)
|
||||
|
||||
#define SKC_ELEM_GTE(X,I) \
|
||||
SKC_GTE_MACRO(X,(I+1) * SKC_FILLS_EXPAND_SUBGROUP_SIZE)
|
||||
|
||||
//
|
||||
// FIXME -- slate these for replacement
|
||||
//
|
||||
|
||||
#define SKC_BROADCAST(E,S,I) \
|
||||
sub_group_broadcast(E##I.u32,S - I * SKC_FILLS_EXPAND_SUBGROUP_SIZE)
|
||||
|
||||
#define SKC_BROADCAST_LAST_HELPER(E,I) \
|
||||
sub_group_broadcast(E##I.u32,SKC_FILLS_EXPAND_SUBGROUP_SIZE - 1)
|
||||
|
||||
#define SKC_BROADCAST_LAST(E,I) \
|
||||
SKC_BROADCAST_LAST_HELPER(E,I)
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
void
skc_cmds_out_append(__global union skc_cmd_rasterize * const cmds_out,
                    skc_uint                         * const out_idx,
                    union skc_cmd_expand             * const cmd,
                    union skc_path_elem        const         e,
                    skc_uint                   const         e_idx)
{
  //
  // Append a rasterize command for element 'e' to the output extent
  // iff 'e' carries a valid (pre-node-link) tagged block id.
  //
  // Subgroup-cooperative: each lane presents its own element.  An
  // inclusive scan over the per-lane validity flags assigns each valid
  // lane a slot; because the scan is inclusive the first valid lane
  // lands at offset 1, which pairs with the caller acquiring *out_idx
  // with a "- 1" bias.
  //
  // NOTE(review): all lanes of the subgroup must reach this call
  // (subgroup-uniform control flow) or the scan/broadcast results are
  // undefined -- confirm at call sites.
  //
  // FIXME -- we can append a large number of nodeword indices to a
  // local SMEM queue and flush when full.  It may or may not be a
  // performance win on some architectures.
  //
  skc_bool const is_elem = SKC_TAGGED_BLOCK_ID_GET_TAG(e.u32) < SKC_BLOCK_ID_TAG_PATH_NEXT;
  skc_uint const offset  = sub_group_scan_inclusive_add(is_elem ? 1 : 0);

  // patch the first word of the carried fill command with this
  // element's nodeword index -- the remaining words pass through
  cmd->rasterize.nodeword = e_idx;

  if (is_elem) {
    cmds_out[*out_idx + offset] = cmd->rasterize;
  }

  // advance the shared output cursor by the subgroup's total appended
  // count (held by the last lane's inclusive-scan result)
  *out_idx += sub_group_broadcast(offset,SKC_FILLS_EXPAND_SUBGROUP_SIZE-1);
}
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
//
// Expand a ring of fill commands into per-nodeword rasterize
// commands.  One subgroup handles one fill command: it walks the
// referenced path's header block and its chain of node blocks,
// appending a rasterize command for every valid tagged block id.
//
__kernel
SKC_FILLS_EXPAND_KERNEL_ATTRIBS
void
skc_kernel_fills_expand(__global union skc_path_elem  const * const blocks,   // path element block pool
                        __global skc_uint    volatile       * const atomics,  // raster cohort atomics (cmd slot allocator)
                        __global skc_block_id_t       const * const map,      // path host-to-device map
                        __global union skc_cmd_fill   const * const cmds_in,  // input fill commands
                        __global union skc_cmd_rasterize    * const cmds_out) // output rasterize commands
{
  //
  // Need to harmonize the way we determine a subgroup's id.  In this
  // kernel it's not as important because no local memory is being
  // used.  Although the device/mask calc to determine subgroup and
  // lanes is still proper, we might want to make it clearer that
  // we're working with subgroups by using the subgroup API.
  //
  // every subgroup/simd that will work on the block loads the same command
  //
#if (__OPENCL_VERSION__ < 200)
  skc_uint const cmd_stride = get_num_sub_groups();
#else
  skc_uint const cmd_stride = get_enqueued_num_sub_groups(); // 2.0 supports non-uniform workgroups
#endif
  skc_uint cmd_idx = get_group_id(0) * cmd_stride + get_sub_group_id();

  // load fill command -- we reuse y component
  union skc_cmd_expand cmd = { .fill = cmds_in[cmd_idx] };

  // get the path header block from the map
  skc_block_id_t id = map[cmd.fill.path];

#if 0
  if (get_sub_group_local_id() == 0)
    printf("expand[%u] = %u\n",cmd_idx,id);
#endif

  //
  // blindly load all of the head elements into registers
  //
  skc_uint head_idx = id * SKC_FILLS_EXPAND_ELEMS_PER_SUBBLOCK + get_sub_group_local_id();

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                         \
  union skc_path_elem h##I = blocks[head_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE];

  SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();

  //
  // pick out count.nodes and count.prims from the header
  //
  skc_uint count_nodes, count_prims;

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                         \
  if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_NODES,I)) {                \
    count_nodes = SKC_BROADCAST(h,SKC_PATH_HEAD_OFFSET_NODES,I);        \
  }                                                                     \
  if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_PRIMS,I)) {                \
    count_prims = SKC_BROADCAST(h,SKC_PATH_HEAD_OFFSET_PRIMS,I);        \
  }

  SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();

  //
  // debug of path head
  //
#if 0
  skc_uint count_blocks;

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                         \
  if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_BLOCKS,I)) {               \
    count_blocks = SKC_BROADCAST(h,SKC_PATH_HEAD_OFFSET_BLOCKS,I);      \
  }

  SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();

  if (get_sub_group_local_id() == 0)
    printf("path header = { %5u, %5u, %5u }\n",
           count_blocks,count_nodes,count_prims);
#endif

  //
  // acquire slots in the expanded cmd extent
  //
  // decrement prim_idx by 1 so we can use inclusive warp scan later
  //
  skc_uint out_idx = 0;

  if (get_sub_group_local_id() == 0) {
    out_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP
      (atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_CMDS,count_prims) - 1;
  }

  out_idx = sub_group_broadcast(out_idx,0);

  //
  // process ids trailing the path header
  //
  // header words sharing a register row with the trailing ids are
  // first overwritten with SKC_TAGGED_BLOCK_ID_INVALID (presumably
  // excluded by the append's tag test -- confirm against block.h)
  //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                         \
  if (!SKC_ELEM_GTE(SKC_PATH_HEAD_OFFSET_IDS,I)) {                      \
    if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_IDS,I)) {                \
      if (get_sub_group_local_id() + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE < SKC_PATH_HEAD_OFFSET_IDS) { \
        h##I.u32 = SKC_TAGGED_BLOCK_ID_INVALID;                         \
      }                                                                 \
    }                                                                   \
    skc_cmds_out_append(cmds_out,&out_idx,&cmd,h##I,                    \
                        head_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE); \
  }

  SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();

  //
  // we're done if it was just the header
  //
  if (count_nodes == 0)
    return;

  //
  // otherwise, process the nodes
  //

  //
  // get id of next node -- held by the last element of this block
  //
  id = SKC_TAGGED_BLOCK_ID_GET_ID(SKC_BROADCAST_LAST(h,SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST));

  //
  // the following blocks are nodes
  //
  while (true)
    {
      // get index of each element
      skc_uint node_idx = id * SKC_FILLS_EXPAND_ELEMS_PER_SUBBLOCK + get_sub_group_local_id();

      //
      // blindly load all of the node elements into registers
      //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                         \
      union skc_path_elem const n##I = blocks[node_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE];

      SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();

      //
      // append all valid ids
      //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                         \
      skc_cmds_out_append(cmds_out,&out_idx,&cmd,n##I,                  \
                          node_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE);

      SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();

      // any more nodes?
      if (--count_nodes == 0)
        return;

      //
      // get id of next node
      //
      id = SKC_TAGGED_BLOCK_ID_GET_ID(SKC_BROADCAST_LAST(n,SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST));
    }
}
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -1,390 +1,390 @@
|
||||
/*
|
||||
* Copyright 2017 Google Inc.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license that can
|
||||
* be found in the LICENSE file.
|
||||
*
|
||||
*/
|
||||
|
||||
//
|
||||
// FIXME -- a pre-allocation step could load the path header quads and
|
||||
// total up the number of blocks in the workgroup or subgroup
|
||||
// minimizing the number of later atomics adds.
|
||||
//
|
||||
|
||||
#include "block.h"
|
||||
#include "path.h"
|
||||
#include "common.h"
|
||||
#include "atomic_cl.h"
|
||||
#include "block_pool_cl.h"
|
||||
#include "device_cl_12.h"
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#define SKC_PATHS_RECLAIM_SUBGROUP_SIZE_MASK (SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1)
|
||||
|
||||
#define SKC_PATHS_RECLAIM_SUBGROUP_ELEMS (SKC_PATHS_RECLAIM_SUBGROUP_SIZE * SKC_PATHS_RECLAIM_LOCAL_ELEMS)
|
||||
|
||||
#define SKC_PATHS_RECLAIM_X (SKC_DEVICE_BLOCK_WORDS / SKC_PATHS_RECLAIM_SUBGROUP_ELEMS)
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#if ( SKC_PATHS_RECLAIM_X == 1 )
|
||||
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_1()
|
||||
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 0
|
||||
|
||||
#elif ( SKC_PATHS_RECLAIM_X == 2 )
|
||||
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_2()
|
||||
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 1
|
||||
|
||||
#elif ( SKC_PATHS_RECLAIM_X == 4 )
|
||||
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_4()
|
||||
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 3
|
||||
|
||||
#elif ( SKC_PATHS_RECLAIM_X == 8 )
|
||||
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_8()
|
||||
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 7
|
||||
|
||||
#elif ( SKC_PATHS_RECLAIM_X == 16)
|
||||
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_16()
|
||||
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 15
|
||||
|
||||
#else
|
||||
#error "MISSING SKC_PATHS_RECLAIM_X"
|
||||
#endif
|
||||
|
||||
//
|
||||
// FIXME -- slate these for replacement
|
||||
//
|
||||
|
||||
#define SKC_BROADCAST(E,S,I) \
|
||||
sub_group_broadcast(E,S - I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE)
|
||||
|
||||
#define SKC_BROADCAST_LAST_HELPER(E,I) \
|
||||
sub_group_broadcast(E,SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1)
|
||||
|
||||
#define SKC_BROADCAST_LAST(E,I) \
|
||||
SKC_BROADCAST_LAST_HELPER(E,I)
|
||||
|
||||
//
|
||||
// COMPILE-TIME PREDICATES
|
||||
//
|
||||
|
||||
#define SKC_PATHS_RECLAIM_ELEM_GTE(X,I) \
|
||||
SKC_GTE_MACRO(X,(I+1) * SKC_PATHS_RECLAIM_SUBGROUP_SIZE)
|
||||
|
||||
#define SKC_PATHS_RECLAIM_ELEM_IN_RANGE(X,I) \
|
||||
(skc_bool)SKC_GTE_MACRO(X, I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE) && \
|
||||
(skc_bool)SKC_LT_MACRO(X,(I+1) * SKC_PATHS_RECLAIM_SUBGROUP_SIZE)
|
||||
|
||||
#define SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I) \
|
||||
SKC_PATHS_RECLAIM_ELEM_GTE(SKC_PATH_HEAD_WORDS,I)
|
||||
|
||||
#define SKC_PATHS_RECLAIM_PARTIALLY_HEADER(I) \
|
||||
SKC_PATHS_RECLAIM_ELEM_IN_RANGE(SKC_PATH_HEAD_WORDS,I)
|
||||
|
||||
//
|
||||
// RUN-TIME PREDICATES
|
||||
//
|
||||
|
||||
#define SKC_PATHS_RECLAIM_IS_HEADER(I) \
|
||||
(get_sub_group_local_id() + I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE < SKC_PATH_HEAD_WORDS)
|
||||
|
||||
//
|
||||
// FIXME -- THIS BITFIELD SCAN APPROACH CAN BE PARAMETERIZED FOR ALL
|
||||
// POSSIBLE PRACTICAL POWER-OF-TWO SUBGROUP AND SUBBLOCKS-PER-BLOCK
|
||||
// COMBOS (NOT NECESSARILY POW2)
|
||||
//
|
||||
// FOR WIDER SUBGROUPS WITH BIG BLOCKS, WE WILL WANT TO USE A VECTOR
|
||||
// UINT TYPE INSTEAD OF A ULONG.
|
||||
//
|
||||
|
||||
#define SKC_PATHS_RECLAIM_PACKED_COUNT_BITS SKC_PATHS_RECLAIM_SUBGROUP_SIZE_LOG2
|
||||
#define SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE skc_uint
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#define SKC_PATHS_RECLAIM_PACKED_COUNT_MASK SKC_BITS_TO_MASK(SKC_PATHS_RECLAIM_PACKED_COUNT_BITS)
|
||||
|
||||
#define SKC_PATHS_RECLAIM_PACKED_COUNT_IS_BLOCK(E,I) \
|
||||
(((E) & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) \
|
||||
? 0 : (1u << SKC_PATHS_RECLAIM_PACKED_COUNT_BITS * I))
|
||||
|
||||
#define SKC_PATHS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(S,C) \
|
||||
S = sub_group_scan_exclusive_add(C)
|
||||
|
||||
#define SKC_PATHS_RECLAIM_PACKED_COUNT_GET(C,I) \
|
||||
(((C) >> (SKC_PATHS_RECLAIM_PACKED_COUNT_BITS * I)) & SKC_PATHS_RECLAIM_PACKED_COUNT_MASK)
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
//
// Batch of host path handles to reclaim -- one entry is consumed per
// subgroup (indexed by reclaim_idx in the kernel below).
//
struct skc_reclaim
{
  skc_path_h aN[SKC_RECLAIM_ARRAY_SIZE]; // host path ids
};
|
||||
|
||||
//
// Reclaim the device blocks of a batch of host paths.  One subgroup
// walks one path's header block and its chain of node blocks, strips
// the tag from each tagged block id, and returns whole-block ids to
// the block pool ids ring.
//
__kernel
SKC_PATHS_RECLAIM_KERNEL_ATTRIBS
void
skc_kernel_paths_reclaim(__global skc_block_id_t          * const bp_ids,     // block pool ids ring
                         __global skc_uint                * const bp_elems,   // block pool blocks
                         __global skc_uint       volatile * const bp_atomics, // read/write atomics
                         skc_uint                   const         bp_mask,    // pow2 modulo mask for block pool ring
                         __global skc_block_id_t    const * const map,        // path host-to-device map
                         struct skc_reclaim         const         reclaim)    // array of host path ids
{
#if (__OPENCL_VERSION__ < 200)
  skc_uint const reclaim_stride = get_num_sub_groups();
#else
  skc_uint const reclaim_stride = get_enqueued_num_sub_groups(); // 2.0 supports non-uniform workgroups
#endif
  skc_uint reclaim_idx = get_group_id(0) * reclaim_stride + get_sub_group_id();

#if 0
  //
  // NOTE -- FOR NOW, THIS KERNEL ALWAYS LAUNCHES FIXED SIZE GRIDS BUT
  // WE MIGHT WANT TO HAVE THE GRID LIMIT ITSELF TO A FRACTIONAL
  // MULTIPROCESSOR IN ORDER TO MINIMIZE THE IMPACT OF A LARGE
  // RECLAMATION JOB ON THE REST OF THE PIPELINE.
  //
  for (; reclaim_idx < SKC_RECLAIM_ARRAY_SIZE; reclaim_idx+=reclaim_stride)
#endif
  {
    // get host path id
    skc_path_h const path = reclaim.aN[reclaim_idx];

    // get the path header block from the map
    skc_block_id_t id = map[path];

    //
    // blindly load all of the head elements into registers
    //
    skc_uint const head_idx = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id();

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                         \
    skc_uint h##I = bp_elems[head_idx + I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE];

    SKC_PATHS_RECLAIM_BLOCK_EXPAND();

    //
    // pick out count.blocks and count.nodes from the header
    //
    skc_uint count_blocks, count_nodes;

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                         \
    if (SKC_PATHS_RECLAIM_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_BLOCKS,I)) { \
      count_blocks = SKC_BROADCAST(h##I,SKC_PATH_HEAD_OFFSET_BLOCKS,I); \
    }                                                                   \
    if (SKC_PATHS_RECLAIM_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_NODES,I)) { \
      count_nodes = SKC_BROADCAST(h##I,SKC_PATH_HEAD_OFFSET_NODES,I);   \
    }

    SKC_PATHS_RECLAIM_BLOCK_EXPAND();

#if 0
    if (get_sub_group_local_id() == 0) {
      printf("reclaim paths: %9u / %5u / %5u\n",path,count_blocks,count_nodes);
    }
#endif

    //
    // acquire a span in the block pool ids ring for reclaimed ids
    //
    // FIXME count_blocks and atomic add can be done in same lane
    //
    skc_uint bp_ids_base = 0;

    if (get_sub_group_local_id() == 0) {
      bp_ids_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP(bp_atomics+SKC_BP_ATOMIC_OFFSET_WRITES,count_blocks);

#if 0
      printf("paths: bp_ids_base = %u\n",bp_ids_base);
#endif
    }

    bp_ids_base = sub_group_broadcast(bp_ids_base,0);

    //
    // shift away the tagged block id's tag
    //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                         \
    if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) {                        \
      h##I = h##I >> SKC_TAGGED_BLOCK_ID_BITS_TAG;                      \
    }

    SKC_PATHS_RECLAIM_BLOCK_EXPAND();

    //
    // swap current id with next -- the last element holds the link to
    // the next node block; park the current block's id there so it is
    // reclaimed along with the other ids in this block
    //
    if (get_sub_group_local_id() == SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1)
      {
        skc_block_id_t const next = SKC_CONCAT(h,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST);

        SKC_CONCAT(h,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST) = id;

        id = next;
      }

    //
    // - we'll skip subgroups that are entirely header
    //
    // - but we need to mark any header elements that partially fill
    //   a subgroup as invalid tagged block ids
    //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                         \
    if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) {                        \
      if (SKC_PATHS_RECLAIM_PARTIALLY_HEADER(I)) {                      \
        if (SKC_PATHS_RECLAIM_IS_HEADER(I)) {                           \
          h##I = SKC_TAGGED_BLOCK_ID_INVALID;                           \
        }                                                               \
      }                                                                 \
    }

    SKC_PATHS_RECLAIM_BLOCK_EXPAND();

    {
      //
      // count reclaimable blocks in each lane -- each register row's
      // flag occupies its own bitfield within the packed accumulator
      //
      SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 );

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                         \
      if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) {                      \
        packed_count |= SKC_PATHS_RECLAIM_PACKED_COUNT_IS_BLOCK(h##I,I); \
      }

      SKC_PATHS_RECLAIM_BLOCK_EXPAND();

      //
      // scan to find index of each block
      //
      SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 );

      SKC_PATHS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count);

      //
      // store blocks back to ring
      //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                         \
      if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) {                      \
        skc_uint const index      = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \
        skc_uint const count      = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \
        skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask;    \
        if (count > 0) {                                                \
          bp_ids[bp_ids_idx] = h##I;                                    \
        }                                                               \
        skc_uint const total = index + count;                           \
        bp_ids_base += sub_group_broadcast(total,SKC_PATHS_RECLAIM_SUBGROUP_SIZE-1); \
      }

      SKC_PATHS_RECLAIM_BLOCK_EXPAND();

      // printf("P %7u ! %u\n",bp_ids_idx,h##I);
    }

    //
    // we're done if it was just the header
    //
    if (count_nodes == 0)
      return;

    //
    // otherwise, walk the nodes
    //
    do {
      // id of next block is in last lane
      id = sub_group_broadcast(id,SKC_PATHS_RECLAIM_SUBGROUP_SIZE-1);

      // get index of each element
      skc_uint const node_idx = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id();

      //
      // blindly load all of the node elements into registers
      //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                         \
      skc_uint n##I = bp_elems[node_idx + I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE];

      SKC_PATHS_RECLAIM_BLOCK_EXPAND();

      //
      // shift away the tagged block id's tag
      //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                         \
      n##I = n##I >> SKC_TAGGED_BLOCK_ID_BITS_TAG;

      SKC_PATHS_RECLAIM_BLOCK_EXPAND();

      //
      // swap current id with next
      //
      if (get_sub_group_local_id() == SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1)
        {
          skc_block_id_t const next = SKC_CONCAT(n,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST);

          SKC_CONCAT(n,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST) = id;

          id = next;
        }

      //
      // count reclaimable blocks in each lane
      //
      SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 );

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                         \
      packed_count |= SKC_PATHS_RECLAIM_PACKED_COUNT_IS_BLOCK(n##I,I);

      SKC_PATHS_RECLAIM_BLOCK_EXPAND();

      //
      // scan to find index of each block
      //
      SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 );

      SKC_PATHS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count);

      //
      // store blocks back to ring
      //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) {                                       \
        skc_uint const index      = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \
        skc_uint const count      = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \
        skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask;    \
        if (count > 0) {                                                \
          bp_ids[bp_ids_idx] = n##I;                                    \
        }                                                               \
        skc_uint const total = index + count;                           \
        bp_ids_base += sub_group_broadcast(total,SKC_PATHS_RECLAIM_SUBGROUP_SIZE-1); \
      }

      SKC_PATHS_RECLAIM_BLOCK_EXPAND();

      // printf("P %7u ! %u\n",bp_ids_idx,n##I);

      // any more nodes?
    } while (--count_nodes > 0);
  }
}
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
/*
|
||||
* Copyright 2017 Google Inc.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license that can
|
||||
* be found in the LICENSE file.
|
||||
*
|
||||
*/
|
||||
|
||||
//
|
||||
// FIXME -- a pre-allocation step could load the path header quads and
|
||||
// total up the number of blocks in the workgroup or subgroup
|
||||
// minimizing the number of later atomics adds.
|
||||
//
|
||||
|
||||
#include "block.h"
|
||||
#include "path.h"
|
||||
#include "common.h"
|
||||
#include "atomic_cl.h"
|
||||
#include "block_pool_cl.h"
|
||||
#include "kernel_cl_12.h"
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#define SKC_PATHS_RECLAIM_SUBGROUP_SIZE_MASK (SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1)
|
||||
|
||||
#define SKC_PATHS_RECLAIM_SUBGROUP_ELEMS (SKC_PATHS_RECLAIM_SUBGROUP_SIZE * SKC_PATHS_RECLAIM_LOCAL_ELEMS)
|
||||
|
||||
#define SKC_PATHS_RECLAIM_X (SKC_DEVICE_BLOCK_WORDS / SKC_PATHS_RECLAIM_SUBGROUP_ELEMS)
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#if ( SKC_PATHS_RECLAIM_X == 1 )
|
||||
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_1()
|
||||
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 0
|
||||
|
||||
#elif ( SKC_PATHS_RECLAIM_X == 2 )
|
||||
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_2()
|
||||
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 1
|
||||
|
||||
#elif ( SKC_PATHS_RECLAIM_X == 4 )
|
||||
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_4()
|
||||
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 3
|
||||
|
||||
#elif ( SKC_PATHS_RECLAIM_X == 8 )
|
||||
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_8()
|
||||
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 7
|
||||
|
||||
#elif ( SKC_PATHS_RECLAIM_X == 16)
|
||||
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_16()
|
||||
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 15
|
||||
|
||||
#else
|
||||
#error "MISSING SKC_PATHS_RECLAIM_X"
|
||||
#endif
|
||||
|
||||
//
|
||||
// FIXME -- slate these for replacement
|
||||
//
|
||||
|
||||
#define SKC_BROADCAST(E,S,I) \
|
||||
sub_group_broadcast(E,S - I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE)
|
||||
|
||||
#define SKC_BROADCAST_LAST_HELPER(E,I) \
|
||||
sub_group_broadcast(E,SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1)
|
||||
|
||||
#define SKC_BROADCAST_LAST(E,I) \
|
||||
SKC_BROADCAST_LAST_HELPER(E,I)
|
||||
|
||||
//
|
||||
// COMPILE-TIME PREDICATES
|
||||
//
|
||||
|
||||
#define SKC_PATHS_RECLAIM_ELEM_GTE(X,I) \
|
||||
SKC_GTE_MACRO(X,(I+1) * SKC_PATHS_RECLAIM_SUBGROUP_SIZE)
|
||||
|
||||
#define SKC_PATHS_RECLAIM_ELEM_IN_RANGE(X,I) \
|
||||
(skc_bool)SKC_GTE_MACRO(X, I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE) && \
|
||||
(skc_bool)SKC_LT_MACRO(X,(I+1) * SKC_PATHS_RECLAIM_SUBGROUP_SIZE)
|
||||
|
||||
#define SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I) \
|
||||
SKC_PATHS_RECLAIM_ELEM_GTE(SKC_PATH_HEAD_WORDS,I)
|
||||
|
||||
#define SKC_PATHS_RECLAIM_PARTIALLY_HEADER(I) \
|
||||
SKC_PATHS_RECLAIM_ELEM_IN_RANGE(SKC_PATH_HEAD_WORDS,I)
|
||||
|
||||
//
|
||||
// RUN-TIME PREDICATES
|
||||
//
|
||||
|
||||
#define SKC_PATHS_RECLAIM_IS_HEADER(I) \
|
||||
(get_sub_group_local_id() + I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE < SKC_PATH_HEAD_WORDS)
|
||||
|
||||
//
|
||||
// FIXME -- THIS BITFIELD SCAN APPROACH CAN BE PARAMETERIZED FOR ALL
|
||||
// POSSIBLE PRACTICAL POWER-OF-TWO SUBGROUP AND SUBBLOCKS-PER-BLOCK
|
||||
// COMBOS (NOT NECESSARILY POW2)
|
||||
//
|
||||
// FOR WIDER SUBGROUPS WITH BIG BLOCKS, WE WILL WANT TO USE A VECTOR
|
||||
// UINT TYPE INSTEAD OF A ULONG.
|
||||
//
|
||||
|
||||
#define SKC_PATHS_RECLAIM_PACKED_COUNT_BITS SKC_PATHS_RECLAIM_SUBGROUP_SIZE_LOG2
|
||||
#define SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE skc_uint
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#define SKC_PATHS_RECLAIM_PACKED_COUNT_MASK SKC_BITS_TO_MASK(SKC_PATHS_RECLAIM_PACKED_COUNT_BITS)
|
||||
|
||||
#define SKC_PATHS_RECLAIM_PACKED_COUNT_IS_BLOCK(E,I) \
|
||||
(((E) & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) \
|
||||
? 0 : (1u << SKC_PATHS_RECLAIM_PACKED_COUNT_BITS * I))
|
||||
|
||||
#define SKC_PATHS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(S,C) \
|
||||
S = sub_group_scan_exclusive_add(C)
|
||||
|
||||
#define SKC_PATHS_RECLAIM_PACKED_COUNT_GET(C,I) \
|
||||
(((C) >> (SKC_PATHS_RECLAIM_PACKED_COUNT_BITS * I)) & SKC_PATHS_RECLAIM_PACKED_COUNT_MASK)
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
//
// Batch of host path handles to reclaim -- one entry is consumed per
// subgroup (indexed by reclaim_idx in the kernel below).
//
struct skc_reclaim
{
  skc_path_h aN[SKC_RECLAIM_ARRAY_SIZE]; // host path ids
};
|
||||
|
||||
__kernel
|
||||
SKC_PATHS_RECLAIM_KERNEL_ATTRIBS
|
||||
void
|
||||
skc_kernel_paths_reclaim(__global skc_block_id_t * const bp_ids, // block pool ids ring
|
||||
__global skc_uint * const bp_elems, // block pool blocks
|
||||
__global skc_uint volatile * const bp_atomics, // read/write atomics
|
||||
skc_uint const bp_mask, // pow2 modulo mask for block pool ring
|
||||
__global skc_block_id_t const * const map, // path host-to-device map
|
||||
struct skc_reclaim const reclaim) // array of host path ids
|
||||
{
|
||||
#if (__OPENCL_VERSION__ < 200)
|
||||
skc_uint const reclaim_stride = get_num_sub_groups();
|
||||
#else
|
||||
skc_uint const reclaim_stride = get_enqueued_num_sub_groups(); // 2.0 supports non-uniform workgroups
|
||||
#endif
|
||||
skc_uint reclaim_idx = get_group_id(0) * reclaim_stride + get_sub_group_id();
|
||||
|
||||
#if 0
|
||||
//
|
||||
// NOTE -- FOR NOW, THIS KERNEL ALWAYS LAUNCHES FIXED SIZE GRIDS BUT
|
||||
// WE MIGHT WANT TO HAVE THE GRID LIMIT ITSELF TO A FRACTIONAL
|
||||
// MULTIPROCESSOR IN ORDER TO MINIMIZE THE IMPACT OF A LARGE
|
||||
// RECLAMATION JOB ON THE REST OF THE PIPELINE.
|
||||
//
|
||||
for (; reclaim_idx < SKC_RECLAIM_ARRAY_SIZE; reclaim_idx+=reclaim_stride)
|
||||
#endif
|
||||
{
|
||||
// get host path id
|
||||
skc_path_h const path = reclaim.aN[reclaim_idx];
|
||||
|
||||
// get the path header block from the map
|
||||
skc_block_id_t id = map[path];
|
||||
|
||||
//
|
||||
// blindly load all of the head elements into registers
|
||||
//
|
||||
skc_uint const head_idx = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id();
|
||||
|
||||
#undef SKC_EXPAND_X
|
||||
#define SKC_EXPAND_X(I,S,C,P,R) \
|
||||
skc_uint h##I = bp_elems[head_idx + I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE];
|
||||
|
||||
SKC_PATHS_RECLAIM_BLOCK_EXPAND();
|
||||
|
||||
//
|
||||
// pick out count.nodes and count.prims from the header
|
||||
//
|
||||
skc_uint count_blocks, count_nodes;
|
||||
|
||||
#undef SKC_EXPAND_X
|
||||
#define SKC_EXPAND_X(I,S,C,P,R) \
|
||||
if (SKC_PATHS_RECLAIM_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_BLOCKS,I)) { \
|
||||
count_blocks = SKC_BROADCAST(h##I,SKC_PATH_HEAD_OFFSET_BLOCKS,I); \
|
||||
} \
|
||||
if (SKC_PATHS_RECLAIM_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_NODES,I)) { \
|
||||
count_nodes = SKC_BROADCAST(h##I,SKC_PATH_HEAD_OFFSET_NODES,I); \
|
||||
}
|
||||
|
||||
SKC_PATHS_RECLAIM_BLOCK_EXPAND();
|
||||
|
||||
#if 0
|
||||
if (get_sub_group_local_id() == 0) {
|
||||
printf("reclaim paths: %9u / %5u / %5u\n",path,count_blocks,count_nodes);
|
||||
}
|
||||
#endif
|
||||
|
||||
//
|
||||
// acquire a span in the block pool ids ring for reclaimed ids
|
||||
//
|
||||
// FIXME count_blocks and atomic add can be done in same lane
|
||||
//
|
||||
skc_uint bp_ids_base = 0;
|
||||
|
||||
if (get_sub_group_local_id() == 0) {
|
||||
bp_ids_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP(bp_atomics+SKC_BP_ATOMIC_OFFSET_WRITES,count_blocks);
|
||||
|
||||
#if 0
|
||||
printf("paths: bp_ids_base = %u\n",bp_ids_base);
|
||||
#endif
|
||||
}
|
||||
|
||||
bp_ids_base = sub_group_broadcast(bp_ids_base,0);
|
||||
|
||||
//
|
||||
// shift away the tagged block id's tag
|
||||
//
|
||||
#undef SKC_EXPAND_X
|
||||
#define SKC_EXPAND_X(I,S,C,P,R) \
|
||||
if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) { \
|
||||
h##I = h##I >> SKC_TAGGED_BLOCK_ID_BITS_TAG; \
|
||||
}
|
||||
|
||||
SKC_PATHS_RECLAIM_BLOCK_EXPAND();
|
||||
|
||||
//
|
||||
// swap current id with next
|
||||
//
|
||||
if (get_sub_group_local_id() == SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1)
|
||||
{
|
||||
skc_block_id_t const next = SKC_CONCAT(h,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST);
|
||||
|
||||
SKC_CONCAT(h,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST) = id;
|
||||
|
||||
id = next;
|
||||
}
|
||||
|
||||
//
|
||||
// - we'll skip subgroups that are entirely header
|
||||
//
|
||||
// - but we need to mark any header elements that partially fill
|
||||
// a subgroup as invalid tagged block ids
|
||||
//
|
||||
#undef SKC_EXPAND_X
|
||||
#define SKC_EXPAND_X(I,S,C,P,R) \
|
||||
if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) { \
|
||||
if (SKC_PATHS_RECLAIM_PARTIALLY_HEADER(I)) { \
|
||||
if (SKC_PATHS_RECLAIM_IS_HEADER(I)) { \
|
||||
h##I = SKC_TAGGED_BLOCK_ID_INVALID; \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
SKC_PATHS_RECLAIM_BLOCK_EXPAND();
|
||||
|
||||
{
|
||||
//
|
||||
// count reclaimable blocks in each lane
|
||||
//
|
||||
SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 );
|
||||
|
||||
#undef SKC_EXPAND_X
|
||||
#define SKC_EXPAND_X(I,S,C,P,R) \
|
||||
if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) { \
|
||||
packed_count |= SKC_PATHS_RECLAIM_PACKED_COUNT_IS_BLOCK(h##I,I); \
|
||||
}
|
||||
|
||||
SKC_PATHS_RECLAIM_BLOCK_EXPAND();
|
||||
|
||||
//
|
||||
// scan to find index of each block
|
||||
//
|
||||
SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 );
|
||||
|
||||
SKC_PATHS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count);
|
||||
|
||||
//
|
||||
// store blocks back to ring
|
||||
//
|
||||
#undef SKC_EXPAND_X
|
||||
#define SKC_EXPAND_X(I,S,C,P,R) \
|
||||
if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) { \
|
||||
skc_uint const index = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \
|
||||
skc_uint const count = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \
|
||||
skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask; \
|
||||
if (count > 0) { \
|
||||
bp_ids[bp_ids_idx] = h##I; \
|
||||
} \
|
||||
skc_uint const total = index + count; \
|
||||
bp_ids_base += sub_group_broadcast(total,SKC_PATHS_RECLAIM_SUBGROUP_SIZE-1); \
|
||||
}
|
||||
|
||||
SKC_PATHS_RECLAIM_BLOCK_EXPAND();
|
||||
|
||||
// printf("P %7u ! %u\n",bp_ids_idx,h##I);
|
||||
}
|
||||
|
||||
//
|
||||
// we're done if it was just the header
|
||||
//
|
||||
if (count_nodes == 0)
|
||||
return;
|
||||
|
||||
//
|
||||
// otherwise, walk the nodes
|
||||
//
|
||||
do {
|
||||
// id of next block is in last lane
|
||||
id = sub_group_broadcast(id,SKC_PATHS_RECLAIM_SUBGROUP_SIZE-1);
|
||||
|
||||
// get index of each element
|
||||
skc_uint const node_idx = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id();
|
||||
|
||||
//
|
||||
// blindly load all of the node elements into registers
|
||||
//
|
||||
#undef SKC_EXPAND_X
|
||||
#define SKC_EXPAND_X(I,S,C,P,R) \
|
||||
skc_uint n##I = bp_elems[node_idx + I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE];
|
||||
|
||||
SKC_PATHS_RECLAIM_BLOCK_EXPAND();
|
||||
|
||||
//
|
||||
// shift away the tagged block id's tag
|
||||
//
|
||||
#undef SKC_EXPAND_X
|
||||
#define SKC_EXPAND_X(I,S,C,P,R) \
|
||||
n##I = n##I >> SKC_TAGGED_BLOCK_ID_BITS_TAG;
|
||||
|
||||
SKC_PATHS_RECLAIM_BLOCK_EXPAND();
|
||||
|
||||
//
|
||||
// swap current id with next
|
||||
//
|
||||
if (get_sub_group_local_id() == SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1)
|
||||
{
|
||||
skc_block_id_t const next = SKC_CONCAT(n,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST);
|
||||
|
||||
SKC_CONCAT(n,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST) = id;
|
||||
|
||||
id = next;
|
||||
}
|
||||
|
||||
//
|
||||
// count reclaimable blocks in each lane
|
||||
//
|
||||
SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 );
|
||||
|
||||
#undef SKC_EXPAND_X
|
||||
#define SKC_EXPAND_X(I,S,C,P,R) \
|
||||
packed_count |= SKC_PATHS_RECLAIM_PACKED_COUNT_IS_BLOCK(n##I,I);
|
||||
|
||||
SKC_PATHS_RECLAIM_BLOCK_EXPAND();
|
||||
|
||||
//
|
||||
// scan to find index of each block
|
||||
//
|
||||
SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 );
|
||||
|
||||
SKC_PATHS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count);
|
||||
|
||||
//
|
||||
// store blocks back to ring
|
||||
//
|
||||
#undef SKC_EXPAND_X
|
||||
#define SKC_EXPAND_X(I,S,C,P,R) { \
|
||||
skc_uint const index = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \
|
||||
skc_uint const count = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \
|
||||
skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask; \
|
||||
if (count > 0) { \
|
||||
bp_ids[bp_ids_idx] = n##I; \
|
||||
} \
|
||||
skc_uint const total = index + count; \
|
||||
bp_ids_base += sub_group_broadcast(total,SKC_PATHS_RECLAIM_SUBGROUP_SIZE-1); \
|
||||
}
|
||||
|
||||
SKC_PATHS_RECLAIM_BLOCK_EXPAND();
|
||||
|
||||
// printf("P %7u ! %u\n",bp_ids_idx,n##I);
|
||||
|
||||
// any more nodes?
|
||||
} while (--count_nodes > 0);
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -1,144 +1,144 @@
|
||||
/*
|
||||
* Copyright 2017 Google Inc.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license that can
|
||||
* be found in the LICENSE file.
|
||||
*
|
||||
*/
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#include "tile.h"
|
||||
#include "raster.h"
|
||||
#include "atomic_cl.h"
|
||||
#include "block_pool_cl.h"
|
||||
#include "raster_builder_cl_12.h"
|
||||
#include "device_cl_12.h"
|
||||
|
||||
//
|
||||
// There is a fixed-size meta table per raster cohort that we use to
|
||||
// peform a mostly coalesced sizing and allocation of blocks.
|
||||
//
|
||||
// This code is simple and fast.
|
||||
//
|
||||
|
||||
__kernel
|
||||
SKC_RASTERS_ALLOC_KERNEL_ATTRIBS
|
||||
void
|
||||
skc_kernel_rasters_alloc(__global SKC_ATOMIC_UINT volatile * const bp_atomics,
|
||||
__global skc_block_id_t const * const bp_ids,
|
||||
skc_uint const bp_mask, // pow2 modulo mask for block pool ring
|
||||
__global skc_block_id_t * const map,
|
||||
__global skc_uint * const metas,
|
||||
__global skc_uint const * const raster_ids, // FIXME -- CONSTANT
|
||||
skc_uint const count)
|
||||
{
|
||||
// access to the meta extent is linear
|
||||
skc_uint const gid = get_global_id(0);
|
||||
skc_bool const is_active = gid < count;
|
||||
|
||||
//
|
||||
// init with defaults for all lanes
|
||||
//
|
||||
union skc_raster_cohort_meta_inout meta = { .in.u32v4 = { 0, 0, 0, 0 } };
|
||||
skc_uint raster_id = SKC_UINT_MAX;
|
||||
skc_uint extra_blocks = 0;
|
||||
|
||||
if (is_active)
|
||||
{
|
||||
// load meta_in
|
||||
meta.in.u32v4 = vload4(gid,metas);
|
||||
|
||||
// load raster_id as early as possible
|
||||
raster_id = raster_ids[gid];
|
||||
|
||||
#if 0
|
||||
printf("%3u + %5u, %5u, %5u, %5u\n",
|
||||
gid,
|
||||
meta.in.blocks,
|
||||
meta.in.offset,
|
||||
meta.in.pk,
|
||||
meta.in.rk);
|
||||
#endif
|
||||
|
||||
// how many blocks will the ttpb blocks consume?
|
||||
extra_blocks = ((meta.in.pk * SKC_TILE_RATIO + SKC_DEVICE_SUBBLOCKS_PER_BLOCK - SKC_TILE_RATIO) /
|
||||
SKC_DEVICE_SUBBLOCKS_PER_BLOCK);
|
||||
|
||||
// total keys
|
||||
meta.out.keys += meta.in.pk;
|
||||
|
||||
// how many blocks do we need to store the keys in the head and trailing nodes?
|
||||
skc_uint const hn = ((SKC_RASTER_HEAD_DWORDS + meta.out.keys + SKC_RASTER_NODE_DWORDS - 2) /
|
||||
(SKC_RASTER_NODE_DWORDS - 1));
|
||||
// increment blocks
|
||||
extra_blocks += hn;
|
||||
|
||||
// how many nodes trail the head?
|
||||
meta.out.nodes = hn - 1;
|
||||
|
||||
// update blocks
|
||||
meta.out.blocks += extra_blocks;
|
||||
|
||||
#if 0
|
||||
printf("%3u - %5u, %5u, %5u, %5u\n",
|
||||
gid,
|
||||
meta.out.blocks,
|
||||
meta.out.offset,
|
||||
meta.out.nodes,
|
||||
meta.out.keys);
|
||||
#endif
|
||||
}
|
||||
|
||||
//
|
||||
// allocate blocks from block pool
|
||||
//
|
||||
// first perform a prefix sum on the subgroup to reduce atomic
|
||||
// operation traffic
|
||||
//
|
||||
// note this idiom can be implemented with vectors, subgroups or
|
||||
// workgroups
|
||||
//
|
||||
|
||||
skc_uint const prefix = SKC_RASTERS_ALLOC_INCLUSIVE_ADD(extra_blocks);
|
||||
skc_uint reads = 0;
|
||||
|
||||
// last lane performs the block pool allocation with an atomic increment
|
||||
if (SKC_RASTERS_ALLOC_LOCAL_ID() == SKC_RASTERS_ALLOC_GROUP_SIZE - 1) {
|
||||
reads = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(bp_atomics+SKC_BP_ATOMIC_OFFSET_READS,prefix); // ring_reads
|
||||
}
|
||||
|
||||
// broadcast block pool base to all lanes
|
||||
reads = SKC_RASTERS_ALLOC_BROADCAST(reads,SKC_RASTERS_ALLOC_GROUP_SIZE - 1);
|
||||
|
||||
// update base for each lane
|
||||
reads += prefix - extra_blocks;
|
||||
|
||||
//
|
||||
// store meta header
|
||||
//
|
||||
if (is_active)
|
||||
{
|
||||
// store headers back to meta extent
|
||||
vstore4(meta.out.u32v4,gid,metas);
|
||||
|
||||
// store reads
|
||||
metas[SKC_RASTER_COHORT_META_OFFSET_READS + gid] = reads;
|
||||
|
||||
// get block_id of each raster head
|
||||
skc_block_id_t const block_id = bp_ids[reads & bp_mask];
|
||||
|
||||
// update map
|
||||
map[raster_id] = block_id;
|
||||
|
||||
#if 0
|
||||
printf("alloc: %u / %u\n",raster_id,block_id);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
/*
|
||||
* Copyright 2017 Google Inc.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license that can
|
||||
* be found in the LICENSE file.
|
||||
*
|
||||
*/
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#include "tile.h"
|
||||
#include "raster.h"
|
||||
#include "atomic_cl.h"
|
||||
#include "block_pool_cl.h"
|
||||
#include "raster_builder_cl_12.h"
|
||||
#include "kernel_cl_12.h"
|
||||
|
||||
//
|
||||
// There is a fixed-size meta table per raster cohort that we use to
|
||||
// peform a mostly coalesced sizing and allocation of blocks.
|
||||
//
|
||||
// This code is simple and fast.
|
||||
//
|
||||
|
||||
__kernel
|
||||
SKC_RASTERS_ALLOC_KERNEL_ATTRIBS
|
||||
void
|
||||
skc_kernel_rasters_alloc(__global SKC_ATOMIC_UINT volatile * const bp_atomics,
|
||||
__global skc_block_id_t const * const bp_ids,
|
||||
skc_uint const bp_mask, // pow2 modulo mask for block pool ring
|
||||
__global skc_block_id_t * const map,
|
||||
__global skc_uint * const metas,
|
||||
__global skc_uint const * const raster_ids, // FIXME -- CONSTANT
|
||||
skc_uint const count)
|
||||
{
|
||||
// access to the meta extent is linear
|
||||
skc_uint const gid = get_global_id(0);
|
||||
skc_bool const is_active = gid < count;
|
||||
|
||||
//
|
||||
// init with defaults for all lanes
|
||||
//
|
||||
union skc_raster_cohort_meta_inout meta = { .in.u32v4 = { 0, 0, 0, 0 } };
|
||||
skc_uint raster_id = SKC_UINT_MAX;
|
||||
skc_uint extra_blocks = 0;
|
||||
|
||||
if (is_active)
|
||||
{
|
||||
// load meta_in
|
||||
meta.in.u32v4 = vload4(gid,metas);
|
||||
|
||||
// load raster_id as early as possible
|
||||
raster_id = raster_ids[gid];
|
||||
|
||||
#if 0
|
||||
printf("%3u + %5u, %5u, %5u, %5u\n",
|
||||
gid,
|
||||
meta.in.blocks,
|
||||
meta.in.offset,
|
||||
meta.in.pk,
|
||||
meta.in.rk);
|
||||
#endif
|
||||
|
||||
// how many blocks will the ttpb blocks consume?
|
||||
extra_blocks = ((meta.in.pk * SKC_TILE_RATIO + SKC_DEVICE_SUBBLOCKS_PER_BLOCK - SKC_TILE_RATIO) /
|
||||
SKC_DEVICE_SUBBLOCKS_PER_BLOCK);
|
||||
|
||||
// total keys
|
||||
meta.out.keys += meta.in.pk;
|
||||
|
||||
// how many blocks do we need to store the keys in the head and trailing nodes?
|
||||
skc_uint const hn = ((SKC_RASTER_HEAD_DWORDS + meta.out.keys + SKC_RASTER_NODE_DWORDS - 2) /
|
||||
(SKC_RASTER_NODE_DWORDS - 1));
|
||||
// increment blocks
|
||||
extra_blocks += hn;
|
||||
|
||||
// how many nodes trail the head?
|
||||
meta.out.nodes = hn - 1;
|
||||
|
||||
// update blocks
|
||||
meta.out.blocks += extra_blocks;
|
||||
|
||||
#if 0
|
||||
printf("%3u - %5u, %5u, %5u, %5u\n",
|
||||
gid,
|
||||
meta.out.blocks,
|
||||
meta.out.offset,
|
||||
meta.out.nodes,
|
||||
meta.out.keys);
|
||||
#endif
|
||||
}
|
||||
|
||||
//
|
||||
// allocate blocks from block pool
|
||||
//
|
||||
// first perform a prefix sum on the subgroup to reduce atomic
|
||||
// operation traffic
|
||||
//
|
||||
// note this idiom can be implemented with vectors, subgroups or
|
||||
// workgroups
|
||||
//
|
||||
|
||||
skc_uint const prefix = SKC_RASTERS_ALLOC_INCLUSIVE_ADD(extra_blocks);
|
||||
skc_uint reads = 0;
|
||||
|
||||
// last lane performs the block pool allocation with an atomic increment
|
||||
if (SKC_RASTERS_ALLOC_LOCAL_ID() == SKC_RASTERS_ALLOC_GROUP_SIZE - 1) {
|
||||
reads = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(bp_atomics+SKC_BP_ATOMIC_OFFSET_READS,prefix); // ring_reads
|
||||
}
|
||||
|
||||
// broadcast block pool base to all lanes
|
||||
reads = SKC_RASTERS_ALLOC_BROADCAST(reads,SKC_RASTERS_ALLOC_GROUP_SIZE - 1);
|
||||
|
||||
// update base for each lane
|
||||
reads += prefix - extra_blocks;
|
||||
|
||||
//
|
||||
// store meta header
|
||||
//
|
||||
if (is_active)
|
||||
{
|
||||
// store headers back to meta extent
|
||||
vstore4(meta.out.u32v4,gid,metas);
|
||||
|
||||
// store reads
|
||||
metas[SKC_RASTER_COHORT_META_OFFSET_READS + gid] = reads;
|
||||
|
||||
// get block_id of each raster head
|
||||
skc_block_id_t const block_id = bp_ids[reads & bp_mask];
|
||||
|
||||
// update map
|
||||
map[raster_id] = block_id;
|
||||
|
||||
#if 0
|
||||
printf("alloc: %u / %u\n",raster_id,block_id);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
@ -1,442 +1,442 @@
|
||||
/*
|
||||
* Copyright 2017 Google Inc.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license that can
|
||||
* be found in the LICENSE file.
|
||||
*
|
||||
*/
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#include "tile.h"
|
||||
#include "block.h"
|
||||
#include "raster.h"
|
||||
#include "common.h"
|
||||
#include "atomic_cl.h"
|
||||
#include "block_pool_cl.h"
|
||||
#include "device_cl_12.h"
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#define SKC_RASTERS_RECLAIM_SUBGROUP_SIZE_MASK (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1)
|
||||
|
||||
#define SKC_RASTERS_RECLAIM_SUBGROUP_WORDS (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE * SKC_RASTERS_RECLAIM_LOCAL_ELEMS)
|
||||
|
||||
#define SKC_RASTERS_RECLAIM_X (SKC_DEVICE_BLOCK_DWORDS / SKC_RASTERS_RECLAIM_SUBGROUP_WORDS)
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#if ( SKC_RASTERS_RECLAIM_X == 1 )
|
||||
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_1()
|
||||
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 0
|
||||
|
||||
#elif ( SKC_RASTERS_RECLAIM_X == 2 )
|
||||
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_2()
|
||||
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 1
|
||||
|
||||
#elif ( SKC_RASTERS_RECLAIM_X == 4 )
|
||||
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_4()
|
||||
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 3
|
||||
|
||||
#elif ( SKC_RASTERS_RECLAIM_X == 8 )
|
||||
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_8()
|
||||
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 7
|
||||
|
||||
#elif ( SKC_RASTERS_RECLAIM_X == 16)
|
||||
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_16()
|
||||
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 15
|
||||
|
||||
#else
|
||||
#error "MISSING SKC_RASTERS_RECLAIM_X"
|
||||
#endif
|
||||
|
||||
#if ( SKC_PREFIX_SUBGROUP_SIZE == SKC_RASTERS_RECLAIM_SUBGROUP_SIZE )
|
||||
|
||||
#define SKC_RASTERS_RECLAIM_STRIDE_H(L) (L)
|
||||
#define SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) (I * 2 * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
|
||||
#define SKC_RASTERS_RECLAIM_STRIDE_V_HI(I) (SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) + SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
|
||||
|
||||
#elif ( SKC_PREFIX_SUBGROUP_SIZE > SKC_RASTERS_RECLAIM_SUBGROUP_SIZE ) // same as above when ratio equals 1
|
||||
|
||||
#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO (SKC_PREFIX_SUBGROUP_SIZE / SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
|
||||
#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK (SKC_RASTERS_RECLAIM_SUBGROUP_RATIO - 1)
|
||||
#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_SCALE(I) ((I / SKC_RASTERS_RECLAIM_SUBGROUP_RATIO) * 2 * SKC_RASTERS_RECLAIM_SUBGROUP_RATIO + \
|
||||
(I & SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK))
|
||||
|
||||
#define SKC_RASTERS_RECLAIM_STRIDE_H(L) (L)
|
||||
#define SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) (SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_SCALE(I) * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
|
||||
#define SKC_RASTERS_RECLAIM_STRIDE_V_HI(I) (SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) + SKC_RASTERS_RECLAIM_SUBGROUP_RATIO * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
|
||||
|
||||
#elif ( SKC_PREFIX_SUBGROUP_SIZE < SKC_RASTERS_RECLAIM_SUBGROUP_SIZE ) // same as above when ratio equals 1
|
||||
|
||||
#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE / SKC_PREFIX_SUBGROUP_SIZE)
|
||||
#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE / SKC_RASTERS_RECLAIM_SUBGROUP_RATIO - 1) // equal to prefix subgroup mask
|
||||
|
||||
#define SKC_RASTERS_RECLAIM_STRIDE_H(L) (((L) & ~SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK) * 2 + ((L) & SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK))
|
||||
#define SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) (I * 2 * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
|
||||
#define SKC_RASTERS_RECLAIM_STRIDE_V_HI(I) (SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) + SKC_RASTERS_RECLAIM_SUBGROUP_SIZE / SKC_RASTERS_RECLAIM_SUBGROUP_RATIO)
|
||||
|
||||
#endif
|
||||
|
||||
//
|
||||
// FIXME -- slate these for replacement
|
||||
//
|
||||
|
||||
#define SKC_BROADCAST(E,S,I) \
|
||||
sub_group_broadcast(E,S - I * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
|
||||
|
||||
#define SKC_BROADCAST_LAST_HELPER(E,I) \
|
||||
sub_group_broadcast(E,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1)
|
||||
|
||||
#define SKC_BROADCAST_LAST(E,I) \
|
||||
SKC_BROADCAST_LAST_HELPER(E,I)
|
||||
|
||||
//
|
||||
// COMPILE-TIME PREDICATES
|
||||
//
|
||||
|
||||
#define SKC_RASTERS_RECLAIM_ELEM_GTE(X,I) \
|
||||
SKC_GTE_MACRO(X,(I+1) * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
|
||||
|
||||
#define SKC_RASTERS_RECLAIM_ELEM_IN_RANGE(X,I) \
|
||||
(skc_bool)SKC_GTE_MACRO(X, I * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) && \
|
||||
(skc_bool)SKC_LT_MACRO(X,(I+1) * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
|
||||
|
||||
#define SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I) \
|
||||
SKC_RASTERS_RECLAIM_ELEM_GTE(SKC_RASTER_HEAD_DWORDS,I)
|
||||
|
||||
#define SKC_RASTERS_RECLAIM_PARTIALLY_HEADER(I) \
|
||||
SKC_RASTERS_RECLAIM_ELEM_IN_RANGE(SKC_RASTER_HEAD_DWORDS,I)
|
||||
|
||||
//
|
||||
// RUN-TIME PREDICATES
|
||||
//
|
||||
|
||||
#define SKC_RASTERS_RECLAIM_IS_HEADER(I) \
|
||||
(get_sub_group_local_id() + I * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE < SKC_RASTER_HEAD_DWORDS)
|
||||
|
||||
//
|
||||
// FIXME -- THIS BITFIELD SCAN APPROACH CAN BE PARAMETERIZED FOR ALL
|
||||
// POSSIBLE PRACTICAL POWER-OF-TWO SUBGROUP AND SUBBLOCKS-PER-BLOCK
|
||||
// COMBOS (NOT NECESSARILY POW2)
|
||||
//
|
||||
// FOR WIDER SUBGROUPS WITH BIG BLOCKS, WE WILL WANT TO USE A VECTOR
|
||||
// UINT TYPE INSTEAD OF A ULONG.
|
||||
//
|
||||
|
||||
#define SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS SKC_RASTERS_RECLAIM_SUBGROUP_SIZE_LOG2
|
||||
#define SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE skc_uint
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#define SKC_RASTERS_RECLAIM_PACKED_COUNT_MASK SKC_BITS_TO_MASK(SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS)
|
||||
|
||||
#define SKC_RASTERS_RECLAIM_PACKED_COUNT_IS_BLOCK(E,I) \
|
||||
(((E) & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) \
|
||||
? 0 : (1u << SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS * I))
|
||||
|
||||
#define SKC_RASTERS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(S,C) \
|
||||
S = sub_group_scan_exclusive_add(C)
|
||||
|
||||
#define SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(C,I) \
|
||||
(((C) >> (SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS * I)) & SKC_RASTERS_RECLAIM_PACKED_COUNT_MASK)
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
struct skc_reclaim
|
||||
{
|
||||
skc_raster_h aN[SKC_RECLAIM_ARRAY_SIZE];
|
||||
};
|
||||
|
||||
__kernel
|
||||
SKC_RASTERS_RECLAIM_KERNEL_ATTRIBS
|
||||
void
|
||||
skc_kernel_rasters_reclaim(__global skc_block_id_t * const bp_ids, // block pool ids ring
|
||||
__global skc_uint * const bp_elems, // block pool blocks
|
||||
__global skc_uint volatile * const bp_atomics, // read/write atomics
|
||||
skc_uint const bp_mask, // pow2 modulo mask for block pool ring
|
||||
__global skc_block_id_t const * const map, // raster host-to-device map
|
||||
struct skc_reclaim const reclaim) // array of host raster ids
|
||||
{
|
||||
#if (__OPENCL_VERSION__ < 200)
|
||||
skc_uint const reclaim_stride = get_num_sub_groups();
|
||||
#else
|
||||
skc_uint const reclaim_stride = get_enqueued_num_sub_groups(); // 2.0 supports non-uniform workgroups
|
||||
#endif
|
||||
skc_uint reclaim_idx = get_group_id(0) * reclaim_stride + get_sub_group_id();
|
||||
|
||||
#if 0
|
||||
//
|
||||
// NOTE -- FOR NOW, THIS KERNEL ALWAYS LAUNCHES FIXED SIZE GRIDS BUT
|
||||
// WE MIGHT WANT TO HAVE THE GRID LIMIT ITSELF TO A FRACTIONAL
|
||||
// MULTIPROCESSOR IN ORDER TO MINIMIZE THE IMPACT OF A LARGE
|
||||
// RECLAMATION JOB ON THE REST OF THE PIPELINE.
|
||||
//
|
||||
for (; reclaim_idx < SKC_RECLAIM_ARRAY_SIZE; reclaim_idx+=reclaim_stride)
|
||||
#endif
|
||||
{
|
||||
// get host raster id
|
||||
skc_raster_h const raster = reclaim.aN[reclaim_idx];
|
||||
|
||||
// get block id of raster header
|
||||
skc_block_id_t id = map[raster];
|
||||
|
||||
//
|
||||
// load all of the head block ttxk.lo keys into registers
|
||||
//
|
||||
// FIXME -- this pattern lends itself to using the higher
|
||||
// performance Intel GEN block load instructions
|
||||
//
|
||||
skc_uint const head_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_RASTERS_RECLAIM_STRIDE_H(get_sub_group_local_id());
|
||||
|
||||
#undef SKC_EXPAND_X
|
||||
#define SKC_EXPAND_X(I,S,C,P,R) \
|
||||
skc_uint h##I = bp_elems[head_id + SKC_RASTERS_RECLAIM_STRIDE_V_LO(I)];
|
||||
|
||||
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
|
||||
|
||||
//
|
||||
// pick out count.nodes and count.prims from the header
|
||||
//
|
||||
// load raster header counts -- we only need the blocks and
|
||||
// nodes words the keys are doublewords.
|
||||
//
|
||||
// FIXME -- this can be made portable with compile-time macro expansion
|
||||
//
|
||||
skc_uint count_blocks = sub_group_broadcast(h0,0); // SKC_RASTER_HEAD_OFFSET_COUNTS_NODES
|
||||
skc_uint count_nodes = sub_group_broadcast(h0,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_KEYS
|
||||
|
||||
#if 0
|
||||
if (get_sub_group_local_id() == 0) {
|
||||
printf("reclaim rasters: %u / %u / %5u / %5u\n",raster,id,count_blocks,count_nodes);
|
||||
}
|
||||
#endif
|
||||
//
|
||||
// acquire a span in the block pool ids ring for reclaimed ids
|
||||
//
|
||||
skc_uint bp_ids_base = 0;
|
||||
|
||||
if (get_sub_group_local_id() == 0) {
|
||||
bp_ids_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP(bp_atomics+SKC_BP_ATOMIC_OFFSET_WRITES,count_blocks);
|
||||
}
|
||||
|
||||
bp_ids_base = sub_group_broadcast(bp_ids_base,0);
|
||||
|
||||
//
|
||||
// mask off everything but the block id
|
||||
//
|
||||
#undef SKC_EXPAND_X
|
||||
#define SKC_EXPAND_X(I,S,C,P,R) \
|
||||
if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \
|
||||
h##I = h##I & SKC_TTXK_LO_MASK_ID; \
|
||||
}
|
||||
|
||||
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
|
||||
|
||||
//
|
||||
// swap current id with next
|
||||
//
|
||||
if (get_sub_group_local_id() == SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1)
|
||||
{
|
||||
skc_block_id_t const next = SKC_CONCAT(h,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST);
|
||||
|
||||
SKC_CONCAT(h,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST) = id;
|
||||
|
||||
id = next;
|
||||
#if 0
|
||||
printf("rasters next = %u\n",id);
|
||||
#endif
|
||||
}
|
||||
|
||||
#if 0
|
||||
#undef SKC_EXPAND_X
|
||||
#define SKC_EXPAND_X(I,S,C,P,R) \
|
||||
printf("%08X %u\n",h##I,h##I);
|
||||
|
||||
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
#undef SKC_EXPAND_X
|
||||
#define SKC_EXPAND_X(I,S,C,P,R) \
|
||||
if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \
|
||||
printf("%08X\n",h##I); \
|
||||
}
|
||||
|
||||
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
|
||||
#endif
|
||||
|
||||
//
|
||||
// - we'll skip subgroups that are entirely header
|
||||
//
|
||||
// - but we need to mark any header elements that partially fill
|
||||
// a subgroup as subblocks
|
||||
//
|
||||
#undef SKC_EXPAND_X
|
||||
#define SKC_EXPAND_X(I,S,C,P,R) \
|
||||
if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \
|
||||
if (SKC_RASTERS_RECLAIM_PARTIALLY_HEADER(I)) { \
|
||||
if (SKC_RASTERS_RECLAIM_IS_HEADER(I)) { \
|
||||
h##I = SKC_UINT_MAX; \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
|
||||
|
||||
{
|
||||
//
|
||||
// count reclaimable blocks in each lane
|
||||
//
|
||||
SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 );
|
||||
|
||||
#undef SKC_EXPAND_X
|
||||
#define SKC_EXPAND_X(I,S,C,P,R) \
|
||||
if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \
|
||||
packed_count |= SKC_RASTERS_RECLAIM_PACKED_COUNT_IS_BLOCK(h##I,I); \
|
||||
}
|
||||
|
||||
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
|
||||
|
||||
//
|
||||
// scan to find index of each block
|
||||
//
|
||||
SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 );
|
||||
|
||||
SKC_RASTERS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count);
|
||||
|
||||
//
|
||||
// store blocks back to ring
|
||||
//
|
||||
#undef SKC_EXPAND_X
|
||||
#define SKC_EXPAND_X(I,S,C,P,R) \
|
||||
if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \
|
||||
skc_uint const index = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \
|
||||
skc_uint const count = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \
|
||||
skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask; \
|
||||
if (count > 0) { \
|
||||
bp_ids[bp_ids_idx] = h##I; \
|
||||
} \
|
||||
skc_uint const total = index + count; \
|
||||
bp_ids_base += sub_group_broadcast(total,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE-1); \
|
||||
}
|
||||
|
||||
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
|
||||
}
|
||||
|
||||
// printf("R %7u ! %u\n",bp_ids_idx,h##I);
|
||||
|
||||
//
|
||||
// we're done if it was just the header
|
||||
//
|
||||
if (count_nodes == 0)
|
||||
return;
|
||||
|
||||
//
|
||||
// otherwise, walk the nodes
|
||||
//
|
||||
do {
|
||||
// id of next block is in last lane
|
||||
id = sub_group_broadcast(id,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE-1);
|
||||
|
||||
//
|
||||
// load all of the node block ttxk.lo keys into registers
|
||||
//
|
||||
// FIXME -- this pattern lends itself to using the higher
|
||||
// performance Intel GEN block load instructions
|
||||
//
|
||||
skc_uint const node_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_RASTERS_RECLAIM_STRIDE_H(get_sub_group_local_id());
|
||||
|
||||
#undef SKC_EXPAND_X
|
||||
#define SKC_EXPAND_X(I,S,C,P,R) \
|
||||
skc_uint n##I = bp_elems[node_id + SKC_RASTERS_RECLAIM_STRIDE_V_LO(I)];
|
||||
|
||||
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
|
||||
|
||||
//
|
||||
// mask off everything but the block id
|
||||
//
|
||||
#undef SKC_EXPAND_X
|
||||
#define SKC_EXPAND_X(I,S,C,P,R) \
|
||||
n##I = n##I & SKC_TTXK_LO_MASK_ID;
|
||||
|
||||
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
|
||||
|
||||
//
|
||||
// swap current id with next
|
||||
//
|
||||
if (get_sub_group_local_id() == SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1)
|
||||
{
|
||||
skc_block_id_t const next = SKC_CONCAT(n,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST);
|
||||
|
||||
SKC_CONCAT(n,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST) = id;
|
||||
|
||||
id = next;
|
||||
#if 0
|
||||
printf("rasters next = %u\n",id);
|
||||
#endif
|
||||
}
|
||||
|
||||
#if 0
|
||||
#undef SKC_EXPAND_X
|
||||
#define SKC_EXPAND_X(I,S,C,P,R) \
|
||||
printf("%08X %u\n",n##I,n##I);
|
||||
|
||||
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
|
||||
#endif
|
||||
|
||||
//
|
||||
// count reclaimable blocks in each lane
|
||||
//
|
||||
SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 );
|
||||
|
||||
#undef SKC_EXPAND_X
|
||||
#define SKC_EXPAND_X(I,S,C,P,R) \
|
||||
packed_count |= SKC_RASTERS_RECLAIM_PACKED_COUNT_IS_BLOCK(n##I,I);
|
||||
|
||||
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
|
||||
|
||||
//
|
||||
// scan to find index of each block
|
||||
//
|
||||
SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 );
|
||||
|
||||
SKC_RASTERS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count);
|
||||
|
||||
//
|
||||
// store blocks back to ring
|
||||
//
|
||||
#undef SKC_EXPAND_X
|
||||
#define SKC_EXPAND_X(I,S,C,P,R) { \
|
||||
skc_uint const index = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \
|
||||
skc_uint const count = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \
|
||||
skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask; \
|
||||
if (count > 0) { \
|
||||
bp_ids[bp_ids_idx] = n##I; \
|
||||
} \
|
||||
skc_uint const total = index + count; \
|
||||
bp_ids_base += sub_group_broadcast(total,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE-1); \
|
||||
}
|
||||
|
||||
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
|
||||
|
||||
// printf("R %7u ! %u\n",bp_ids_idx,n##I);
|
||||
|
||||
// any more nodes?
|
||||
} while (--count_nodes > 0);
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
/*
|
||||
* Copyright 2017 Google Inc.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license that can
|
||||
* be found in the LICENSE file.
|
||||
*
|
||||
*/
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#include "tile.h"
|
||||
#include "block.h"
|
||||
#include "raster.h"
|
||||
#include "common.h"
|
||||
#include "atomic_cl.h"
|
||||
#include "block_pool_cl.h"
|
||||
#include "kernel_cl_12.h"
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#define SKC_RASTERS_RECLAIM_SUBGROUP_SIZE_MASK (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1)
|
||||
|
||||
#define SKC_RASTERS_RECLAIM_SUBGROUP_WORDS (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE * SKC_RASTERS_RECLAIM_LOCAL_ELEMS)
|
||||
|
||||
#define SKC_RASTERS_RECLAIM_X (SKC_DEVICE_BLOCK_DWORDS / SKC_RASTERS_RECLAIM_SUBGROUP_WORDS)
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#if ( SKC_RASTERS_RECLAIM_X == 1 )
|
||||
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_1()
|
||||
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 0
|
||||
|
||||
#elif ( SKC_RASTERS_RECLAIM_X == 2 )
|
||||
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_2()
|
||||
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 1
|
||||
|
||||
#elif ( SKC_RASTERS_RECLAIM_X == 4 )
|
||||
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_4()
|
||||
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 3
|
||||
|
||||
#elif ( SKC_RASTERS_RECLAIM_X == 8 )
|
||||
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_8()
|
||||
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 7
|
||||
|
||||
#elif ( SKC_RASTERS_RECLAIM_X == 16)
|
||||
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_16()
|
||||
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 15
|
||||
|
||||
#else
|
||||
#error "MISSING SKC_RASTERS_RECLAIM_X"
|
||||
#endif
|
||||
|
||||
#if ( SKC_PREFIX_SUBGROUP_SIZE == SKC_RASTERS_RECLAIM_SUBGROUP_SIZE )
|
||||
|
||||
#define SKC_RASTERS_RECLAIM_STRIDE_H(L) (L)
|
||||
#define SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) (I * 2 * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
|
||||
#define SKC_RASTERS_RECLAIM_STRIDE_V_HI(I) (SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) + SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
|
||||
|
||||
#elif ( SKC_PREFIX_SUBGROUP_SIZE > SKC_RASTERS_RECLAIM_SUBGROUP_SIZE ) // same as above when ratio equals 1
|
||||
|
||||
#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO (SKC_PREFIX_SUBGROUP_SIZE / SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
|
||||
#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK (SKC_RASTERS_RECLAIM_SUBGROUP_RATIO - 1)
|
||||
#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_SCALE(I) ((I / SKC_RASTERS_RECLAIM_SUBGROUP_RATIO) * 2 * SKC_RASTERS_RECLAIM_SUBGROUP_RATIO + \
|
||||
(I & SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK))
|
||||
|
||||
#define SKC_RASTERS_RECLAIM_STRIDE_H(L) (L)
|
||||
#define SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) (SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_SCALE(I) * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
|
||||
#define SKC_RASTERS_RECLAIM_STRIDE_V_HI(I) (SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) + SKC_RASTERS_RECLAIM_SUBGROUP_RATIO * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
|
||||
|
||||
#elif ( SKC_PREFIX_SUBGROUP_SIZE < SKC_RASTERS_RECLAIM_SUBGROUP_SIZE ) // same as above when ratio equals 1
|
||||
|
||||
#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE / SKC_PREFIX_SUBGROUP_SIZE)
|
||||
#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE / SKC_RASTERS_RECLAIM_SUBGROUP_RATIO - 1) // equal to prefix subgroup mask
|
||||
|
||||
#define SKC_RASTERS_RECLAIM_STRIDE_H(L) (((L) & ~SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK) * 2 + ((L) & SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK))
|
||||
#define SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) (I * 2 * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
|
||||
#define SKC_RASTERS_RECLAIM_STRIDE_V_HI(I) (SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) + SKC_RASTERS_RECLAIM_SUBGROUP_SIZE / SKC_RASTERS_RECLAIM_SUBGROUP_RATIO)
|
||||
|
||||
#endif
|
||||
|
||||
//
|
||||
// FIXME -- slate these for replacement
|
||||
//
|
||||
|
||||
#define SKC_BROADCAST(E,S,I) \
|
||||
sub_group_broadcast(E,S - I * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
|
||||
|
||||
#define SKC_BROADCAST_LAST_HELPER(E,I) \
|
||||
sub_group_broadcast(E,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1)
|
||||
|
||||
#define SKC_BROADCAST_LAST(E,I) \
|
||||
SKC_BROADCAST_LAST_HELPER(E,I)
|
||||
|
||||
//
|
||||
// COMPILE-TIME PREDICATES
|
||||
//
|
||||
|
||||
#define SKC_RASTERS_RECLAIM_ELEM_GTE(X,I) \
|
||||
SKC_GTE_MACRO(X,(I+1) * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
|
||||
|
||||
#define SKC_RASTERS_RECLAIM_ELEM_IN_RANGE(X,I) \
|
||||
(skc_bool)SKC_GTE_MACRO(X, I * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) && \
|
||||
(skc_bool)SKC_LT_MACRO(X,(I+1) * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
|
||||
|
||||
#define SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I) \
|
||||
SKC_RASTERS_RECLAIM_ELEM_GTE(SKC_RASTER_HEAD_DWORDS,I)
|
||||
|
||||
#define SKC_RASTERS_RECLAIM_PARTIALLY_HEADER(I) \
|
||||
SKC_RASTERS_RECLAIM_ELEM_IN_RANGE(SKC_RASTER_HEAD_DWORDS,I)
|
||||
|
||||
//
|
||||
// RUN-TIME PREDICATES
|
||||
//
|
||||
|
||||
#define SKC_RASTERS_RECLAIM_IS_HEADER(I) \
|
||||
(get_sub_group_local_id() + I * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE < SKC_RASTER_HEAD_DWORDS)
|
||||
|
||||
//
|
||||
// FIXME -- THIS BITFIELD SCAN APPROACH CAN BE PARAMETERIZED FOR ALL
|
||||
// POSSIBLE PRACTICAL POWER-OF-TWO SUBGROUP AND SUBBLOCKS-PER-BLOCK
|
||||
// COMBOS (NOT NECESSARILY POW2)
|
||||
//
|
||||
// FOR WIDER SUBGROUPS WITH BIG BLOCKS, WE WILL WANT TO USE A VECTOR
|
||||
// UINT TYPE INSTEAD OF A ULONG.
|
||||
//
|
||||
|
||||
#define SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS SKC_RASTERS_RECLAIM_SUBGROUP_SIZE_LOG2
|
||||
#define SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE skc_uint
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#define SKC_RASTERS_RECLAIM_PACKED_COUNT_MASK SKC_BITS_TO_MASK(SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS)
|
||||
|
||||
#define SKC_RASTERS_RECLAIM_PACKED_COUNT_IS_BLOCK(E,I) \
|
||||
(((E) & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) \
|
||||
? 0 : (1u << SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS * I))
|
||||
|
||||
#define SKC_RASTERS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(S,C) \
|
||||
S = sub_group_scan_exclusive_add(C)
|
||||
|
||||
#define SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(C,I) \
|
||||
(((C) >> (SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS * I)) & SKC_RASTERS_RECLAIM_PACKED_COUNT_MASK)
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
struct skc_reclaim
|
||||
{
|
||||
skc_raster_h aN[SKC_RECLAIM_ARRAY_SIZE];
|
||||
};
|
||||
|
||||
__kernel
|
||||
SKC_RASTERS_RECLAIM_KERNEL_ATTRIBS
|
||||
void
|
||||
skc_kernel_rasters_reclaim(__global skc_block_id_t * const bp_ids, // block pool ids ring
|
||||
__global skc_uint * const bp_elems, // block pool blocks
|
||||
__global skc_uint volatile * const bp_atomics, // read/write atomics
|
||||
skc_uint const bp_mask, // pow2 modulo mask for block pool ring
|
||||
__global skc_block_id_t const * const map, // raster host-to-device map
|
||||
struct skc_reclaim const reclaim) // array of host raster ids
|
||||
{
|
||||
#if (__OPENCL_VERSION__ < 200)
|
||||
skc_uint const reclaim_stride = get_num_sub_groups();
|
||||
#else
|
||||
skc_uint const reclaim_stride = get_enqueued_num_sub_groups(); // 2.0 supports non-uniform workgroups
|
||||
#endif
|
||||
skc_uint reclaim_idx = get_group_id(0) * reclaim_stride + get_sub_group_id();
|
||||
|
||||
#if 0
|
||||
//
|
||||
// NOTE -- FOR NOW, THIS KERNEL ALWAYS LAUNCHES FIXED SIZE GRIDS BUT
|
||||
// WE MIGHT WANT TO HAVE THE GRID LIMIT ITSELF TO A FRACTIONAL
|
||||
// MULTIPROCESSOR IN ORDER TO MINIMIZE THE IMPACT OF A LARGE
|
||||
// RECLAMATION JOB ON THE REST OF THE PIPELINE.
|
||||
//
|
||||
for (; reclaim_idx < SKC_RECLAIM_ARRAY_SIZE; reclaim_idx+=reclaim_stride)
|
||||
#endif
|
||||
{
|
||||
// get host raster id
|
||||
skc_raster_h const raster = reclaim.aN[reclaim_idx];
|
||||
|
||||
// get block id of raster header
|
||||
skc_block_id_t id = map[raster];
|
||||
|
||||
//
|
||||
// load all of the head block ttxk.lo keys into registers
|
||||
//
|
||||
// FIXME -- this pattern lends itself to using the higher
|
||||
// performance Intel GEN block load instructions
|
||||
//
|
||||
skc_uint const head_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_RASTERS_RECLAIM_STRIDE_H(get_sub_group_local_id());
|
||||
|
||||
#undef SKC_EXPAND_X
|
||||
#define SKC_EXPAND_X(I,S,C,P,R) \
|
||||
skc_uint h##I = bp_elems[head_id + SKC_RASTERS_RECLAIM_STRIDE_V_LO(I)];
|
||||
|
||||
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
|
||||
|
||||
//
|
||||
// pick out count.nodes and count.prims from the header
|
||||
//
|
||||
// load raster header counts -- we only need the blocks and
|
||||
// nodes words the keys are doublewords.
|
||||
//
|
||||
// FIXME -- this can be made portable with compile-time macro expansion
|
||||
//
|
||||
skc_uint count_blocks = sub_group_broadcast(h0,0); // SKC_RASTER_HEAD_OFFSET_COUNTS_NODES
|
||||
skc_uint count_nodes = sub_group_broadcast(h0,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_KEYS
|
||||
|
||||
#if 0
|
||||
if (get_sub_group_local_id() == 0) {
|
||||
printf("reclaim rasters: %u / %u / %5u / %5u\n",raster,id,count_blocks,count_nodes);
|
||||
}
|
||||
#endif
|
||||
//
|
||||
// acquire a span in the block pool ids ring for reclaimed ids
|
||||
//
|
||||
skc_uint bp_ids_base = 0;
|
||||
|
||||
if (get_sub_group_local_id() == 0) {
|
||||
bp_ids_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP(bp_atomics+SKC_BP_ATOMIC_OFFSET_WRITES,count_blocks);
|
||||
}
|
||||
|
||||
bp_ids_base = sub_group_broadcast(bp_ids_base,0);
|
||||
|
||||
//
|
||||
// mask off everything but the block id
|
||||
//
|
||||
#undef SKC_EXPAND_X
|
||||
#define SKC_EXPAND_X(I,S,C,P,R) \
|
||||
if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \
|
||||
h##I = h##I & SKC_TTXK_LO_MASK_ID; \
|
||||
}
|
||||
|
||||
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
|
||||
|
||||
//
|
||||
// swap current id with next
|
||||
//
|
||||
if (get_sub_group_local_id() == SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1)
|
||||
{
|
||||
skc_block_id_t const next = SKC_CONCAT(h,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST);
|
||||
|
||||
SKC_CONCAT(h,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST) = id;
|
||||
|
||||
id = next;
|
||||
#if 0
|
||||
printf("rasters next = %u\n",id);
|
||||
#endif
|
||||
}
|
||||
|
||||
#if 0
|
||||
#undef SKC_EXPAND_X
|
||||
#define SKC_EXPAND_X(I,S,C,P,R) \
|
||||
printf("%08X %u\n",h##I,h##I);
|
||||
|
||||
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
#undef SKC_EXPAND_X
|
||||
#define SKC_EXPAND_X(I,S,C,P,R) \
|
||||
if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \
|
||||
printf("%08X\n",h##I); \
|
||||
}
|
||||
|
||||
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
|
||||
#endif
|
||||
|
||||
//
|
||||
// - we'll skip subgroups that are entirely header
|
||||
//
|
||||
// - but we need to mark any header elements that partially fill
|
||||
// a subgroup as subblocks
|
||||
//
|
||||
#undef SKC_EXPAND_X
|
||||
#define SKC_EXPAND_X(I,S,C,P,R) \
|
||||
if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \
|
||||
if (SKC_RASTERS_RECLAIM_PARTIALLY_HEADER(I)) { \
|
||||
if (SKC_RASTERS_RECLAIM_IS_HEADER(I)) { \
|
||||
h##I = SKC_UINT_MAX; \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
|
||||
|
||||
{
|
||||
//
|
||||
// count reclaimable blocks in each lane
|
||||
//
|
||||
SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 );
|
||||
|
||||
#undef SKC_EXPAND_X
|
||||
#define SKC_EXPAND_X(I,S,C,P,R) \
|
||||
if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \
|
||||
packed_count |= SKC_RASTERS_RECLAIM_PACKED_COUNT_IS_BLOCK(h##I,I); \
|
||||
}
|
||||
|
||||
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
|
||||
|
||||
//
|
||||
// scan to find index of each block
|
||||
//
|
||||
SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 );
|
||||
|
||||
SKC_RASTERS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count);
|
||||
|
||||
//
|
||||
// store blocks back to ring
|
||||
//
|
||||
#undef SKC_EXPAND_X
|
||||
#define SKC_EXPAND_X(I,S,C,P,R) \
|
||||
if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \
|
||||
skc_uint const index = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \
|
||||
skc_uint const count = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \
|
||||
skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask; \
|
||||
if (count > 0) { \
|
||||
bp_ids[bp_ids_idx] = h##I; \
|
||||
} \
|
||||
skc_uint const total = index + count; \
|
||||
bp_ids_base += sub_group_broadcast(total,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE-1); \
|
||||
}
|
||||
|
||||
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
|
||||
}
|
||||
|
||||
// printf("R %7u ! %u\n",bp_ids_idx,h##I);
|
||||
|
||||
//
|
||||
// we're done if it was just the header
|
||||
//
|
||||
if (count_nodes == 0)
|
||||
return;
|
||||
|
||||
//
|
||||
// otherwise, walk the nodes
|
||||
//
|
||||
do {
|
||||
// id of next block is in last lane
|
||||
id = sub_group_broadcast(id,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE-1);
|
||||
|
||||
//
|
||||
// load all of the node block ttxk.lo keys into registers
|
||||
//
|
||||
// FIXME -- this pattern lends itself to using the higher
|
||||
// performance Intel GEN block load instructions
|
||||
//
|
||||
skc_uint const node_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_RASTERS_RECLAIM_STRIDE_H(get_sub_group_local_id());
|
||||
|
||||
#undef SKC_EXPAND_X
|
||||
#define SKC_EXPAND_X(I,S,C,P,R) \
|
||||
skc_uint n##I = bp_elems[node_id + SKC_RASTERS_RECLAIM_STRIDE_V_LO(I)];
|
||||
|
||||
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
|
||||
|
||||
//
|
||||
// mask off everything but the block id
|
||||
//
|
||||
#undef SKC_EXPAND_X
|
||||
#define SKC_EXPAND_X(I,S,C,P,R) \
|
||||
n##I = n##I & SKC_TTXK_LO_MASK_ID;
|
||||
|
||||
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
|
||||
|
||||
//
|
||||
// swap current id with next
|
||||
//
|
||||
if (get_sub_group_local_id() == SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1)
|
||||
{
|
||||
skc_block_id_t const next = SKC_CONCAT(n,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST);
|
||||
|
||||
SKC_CONCAT(n,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST) = id;
|
||||
|
||||
id = next;
|
||||
#if 0
|
||||
printf("rasters next = %u\n",id);
|
||||
#endif
|
||||
}
|
||||
|
||||
#if 0
|
||||
#undef SKC_EXPAND_X
|
||||
#define SKC_EXPAND_X(I,S,C,P,R) \
|
||||
printf("%08X %u\n",n##I,n##I);
|
||||
|
||||
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
|
||||
#endif
|
||||
|
||||
//
|
||||
// count reclaimable blocks in each lane
|
||||
//
|
||||
SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 );
|
||||
|
||||
#undef SKC_EXPAND_X
|
||||
#define SKC_EXPAND_X(I,S,C,P,R) \
|
||||
packed_count |= SKC_RASTERS_RECLAIM_PACKED_COUNT_IS_BLOCK(n##I,I);
|
||||
|
||||
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
|
||||
|
||||
//
|
||||
// scan to find index of each block
|
||||
//
|
||||
SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 );
|
||||
|
||||
SKC_RASTERS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count);
|
||||
|
||||
//
|
||||
// store blocks back to ring
|
||||
//
|
||||
#undef SKC_EXPAND_X
|
||||
#define SKC_EXPAND_X(I,S,C,P,R) { \
|
||||
skc_uint const index = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \
|
||||
skc_uint const count = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \
|
||||
skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask; \
|
||||
if (count > 0) { \
|
||||
bp_ids[bp_ids_idx] = n##I; \
|
||||
} \
|
||||
skc_uint const total = index + count; \
|
||||
bp_ids_base += sub_group_broadcast(total,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE-1); \
|
||||
}
|
||||
|
||||
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
|
||||
|
||||
// printf("R %7u ! %u\n",bp_ids_idx,n##I);
|
||||
|
||||
// any more nodes?
|
||||
} while (--count_nodes > 0);
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -1,130 +1,130 @@
|
||||
/*
|
||||
* Copyright 2018 Google Inc.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license that can
|
||||
* be found in the LICENSE file.
|
||||
*
|
||||
*/
|
||||
|
||||
//
|
||||
// NOTE THAT THE SEGMENT TTCK KERNEL IS ENTIRELY DEPENDENT ON THE
|
||||
// LAYOUT OF THE TTCK KEY. IF THE TTCK KEY IS ALTERED THEN THIS
|
||||
// KERNEL WILL NEED TO BE UPDATED
|
||||
//
|
||||
|
||||
#include "tile.h"
|
||||
#include "atomic_cl.h"
|
||||
#include "device_cl_12.h"
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#define HS_KEYS_PER_SLAB (HS_KEYS_PER_LANE * HS_LANES_PER_WARP)
|
||||
#define HS_LANE_MASK (HS_LANES_PER_WARP - 1)
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#define SKC_YX_NEQ(row,prev) \
|
||||
(((as_uint2(r##row).hi ^ as_uint2(r##prev).hi) & SKC_TTCK_HI_MASK_YX) != 0)
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
__kernel
|
||||
__attribute__((intel_reqd_sub_group_size(HS_LANES_PER_WARP)))
|
||||
void
|
||||
skc_kernel_segment_ttck(__global HS_KEY_TYPE * SKC_RESTRICT const vout,
|
||||
__global uint * SKC_RESTRICT const indices,
|
||||
__global SKC_ATOMIC_UINT volatile * SKC_RESTRICT const atomics)
|
||||
{
|
||||
uint const global_id = get_global_id(0);
|
||||
uint const gmem_base = (global_id >> HS_LANES_PER_WARP_LOG2) * HS_KEYS_PER_SLAB;
|
||||
uint const gmem_idx = gmem_base + (global_id & HS_LANE_MASK);
|
||||
uint const lane_idx = gmem_base + (global_id & HS_LANE_MASK) * HS_KEYS_PER_LANE;
|
||||
|
||||
//
|
||||
// LOAD ALL THE ROWS
|
||||
//
|
||||
#undef HS_SLAB_ROW
|
||||
#define HS_SLAB_ROW(row,prev) \
|
||||
HS_KEY_TYPE const r##row = (vout + gmem_idx)[prev * HS_LANES_PER_WARP];
|
||||
|
||||
HS_SLAB_ROWS();
|
||||
|
||||
//
|
||||
// LOAD LAST REGISTER FROM COLUMN TO LEFT
|
||||
//
|
||||
uint diffs = 0;
|
||||
uint2 r0 = r1;
|
||||
|
||||
if (gmem_base > 0) {
|
||||
// if this is the first key in any slab but the first then it
|
||||
// broadcast loads the last key in previous slab
|
||||
r0.hi = as_uint2(vout[gmem_base - 1]).hi;
|
||||
} else if (get_sub_group_local_id() == 0) {
|
||||
// if this is the first lane in the first slab
|
||||
diffs = 1;
|
||||
}
|
||||
|
||||
// now shuffle in the last key from the column to the left
|
||||
r0.hi = intel_sub_group_shuffle_up(r0.hi,as_uint2(HS_REG_LAST(r)).hi,1);
|
||||
|
||||
//
|
||||
// FIND ALL DIFFERENCES IN SLAB
|
||||
//
|
||||
uint valid = 0;
|
||||
|
||||
#undef HS_SLAB_ROW
|
||||
#define HS_SLAB_ROW(row,prev) \
|
||||
valid |= ((r##row != SKC_ULONG_MAX) << prev);
|
||||
|
||||
HS_SLAB_ROWS();
|
||||
|
||||
#undef HS_SLAB_ROW
|
||||
#define HS_SLAB_ROW(row,prev) \
|
||||
diffs |= (SKC_YX_NEQ(row,prev) << prev);
|
||||
|
||||
HS_SLAB_ROWS();
|
||||
|
||||
//
|
||||
// SUM UP THE DIFFERENCES
|
||||
//
|
||||
uint const valid_diffs = valid & diffs;
|
||||
uint const count = popcount(valid_diffs);
|
||||
uint const inclusive = sub_group_scan_inclusive_add(count);
|
||||
uint const exclusive = inclusive - count;
|
||||
|
||||
//
|
||||
// RESERVE SPACE IN THE INDICES ARRAY
|
||||
//
|
||||
uint next = 0;
|
||||
|
||||
if (get_sub_group_local_id() == HS_LANES_PER_WARP-1)
|
||||
next = atomic_add(atomics+1,inclusive); // FIXME -- need a symbolic offset
|
||||
|
||||
// distribute base across subgroup
|
||||
next = exclusive + sub_group_broadcast(next,HS_LANES_PER_WARP-1);
|
||||
|
||||
//
|
||||
// STORE THE INDICES
|
||||
//
|
||||
#undef HS_SLAB_ROW
|
||||
#define HS_SLAB_ROW(row,prev) \
|
||||
if (valid_diffs & (1 << prev)) \
|
||||
indices[next++] = lane_idx + prev;
|
||||
|
||||
HS_SLAB_ROWS();
|
||||
|
||||
//
|
||||
// TRANSPOSE THE SLAB AND STORE IT
|
||||
//
|
||||
HS_TRANSPOSE_SLAB();
|
||||
}
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
/*
|
||||
* Copyright 2018 Google Inc.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license that can
|
||||
* be found in the LICENSE file.
|
||||
*
|
||||
*/
|
||||
|
||||
//
|
||||
// NOTE THAT THE SEGMENT TTCK KERNEL IS ENTIRELY DEPENDENT ON THE
|
||||
// LAYOUT OF THE TTCK KEY. IF THE TTCK KEY IS ALTERED THEN THIS
|
||||
// KERNEL WILL NEED TO BE UPDATED
|
||||
//
|
||||
|
||||
#include "tile.h"
|
||||
#include "atomic_cl.h"
|
||||
#include "kernel_cl_12.h"
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#define HS_KEYS_PER_SLAB (HS_KEYS_PER_LANE * HS_LANES_PER_WARP)
|
||||
#define HS_LANE_MASK (HS_LANES_PER_WARP - 1)
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#define SKC_YX_NEQ(row,prev) \
|
||||
(((as_uint2(r##row).hi ^ as_uint2(r##prev).hi) & SKC_TTCK_HI_MASK_YX) != 0)
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
__kernel
|
||||
__attribute__((intel_reqd_sub_group_size(HS_LANES_PER_WARP)))
|
||||
void
|
||||
skc_kernel_segment_ttck(__global HS_KEY_TYPE * SKC_RESTRICT const vout,
|
||||
__global uint * SKC_RESTRICT const indices,
|
||||
__global SKC_ATOMIC_UINT volatile * SKC_RESTRICT const atomics)
|
||||
{
|
||||
uint const global_id = get_global_id(0);
|
||||
uint const gmem_base = (global_id >> HS_LANES_PER_WARP_LOG2) * HS_KEYS_PER_SLAB;
|
||||
uint const gmem_idx = gmem_base + (global_id & HS_LANE_MASK);
|
||||
uint const lane_idx = gmem_base + (global_id & HS_LANE_MASK) * HS_KEYS_PER_LANE;
|
||||
|
||||
//
|
||||
// LOAD ALL THE ROWS
|
||||
//
|
||||
#undef HS_SLAB_ROW
|
||||
#define HS_SLAB_ROW(row,prev) \
|
||||
HS_KEY_TYPE const r##row = (vout + gmem_idx)[prev * HS_LANES_PER_WARP];
|
||||
|
||||
HS_SLAB_ROWS();
|
||||
|
||||
//
|
||||
// LOAD LAST REGISTER FROM COLUMN TO LEFT
|
||||
//
|
||||
uint diffs = 0;
|
||||
uint2 r0 = r1;
|
||||
|
||||
if (gmem_base > 0) {
|
||||
// if this is the first key in any slab but the first then it
|
||||
// broadcast loads the last key in previous slab
|
||||
r0.hi = as_uint2(vout[gmem_base - 1]).hi;
|
||||
} else if (get_sub_group_local_id() == 0) {
|
||||
// if this is the first lane in the first slab
|
||||
diffs = 1;
|
||||
}
|
||||
|
||||
// now shuffle in the last key from the column to the left
|
||||
r0.hi = intel_sub_group_shuffle_up(r0.hi,as_uint2(HS_REG_LAST(r)).hi,1);
|
||||
|
||||
//
|
||||
// FIND ALL DIFFERENCES IN SLAB
|
||||
//
|
||||
uint valid = 0;
|
||||
|
||||
#undef HS_SLAB_ROW
|
||||
#define HS_SLAB_ROW(row,prev) \
|
||||
valid |= ((r##row != SKC_ULONG_MAX) << prev);
|
||||
|
||||
HS_SLAB_ROWS();
|
||||
|
||||
#undef HS_SLAB_ROW
|
||||
#define HS_SLAB_ROW(row,prev) \
|
||||
diffs |= (SKC_YX_NEQ(row,prev) << prev);
|
||||
|
||||
HS_SLAB_ROWS();
|
||||
|
||||
//
|
||||
// SUM UP THE DIFFERENCES
|
||||
//
|
||||
uint const valid_diffs = valid & diffs;
|
||||
uint const count = popcount(valid_diffs);
|
||||
uint const inclusive = sub_group_scan_inclusive_add(count);
|
||||
uint const exclusive = inclusive - count;
|
||||
|
||||
//
|
||||
// RESERVE SPACE IN THE INDICES ARRAY
|
||||
//
|
||||
uint next = 0;
|
||||
|
||||
if (get_sub_group_local_id() == HS_LANES_PER_WARP-1)
|
||||
next = atomic_add(atomics+1,inclusive); // FIXME -- need a symbolic offset
|
||||
|
||||
// distribute base across subgroup
|
||||
next = exclusive + sub_group_broadcast(next,HS_LANES_PER_WARP-1);
|
||||
|
||||
//
|
||||
// STORE THE INDICES
|
||||
//
|
||||
#undef HS_SLAB_ROW
|
||||
#define HS_SLAB_ROW(row,prev) \
|
||||
if (valid_diffs & (1 << prev)) \
|
||||
indices[next++] = lane_idx + prev;
|
||||
|
||||
HS_SLAB_ROWS();
|
||||
|
||||
//
|
||||
// TRANSPOSE THE SLAB AND STORE IT
|
||||
//
|
||||
HS_TRANSPOSE_SLAB();
|
||||
}
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
@ -1,394 +1,394 @@
|
||||
/*
|
||||
* Copyright 2018 Google Inc.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license that can
|
||||
* be found in the LICENSE file.
|
||||
*
|
||||
*/
|
||||
|
||||
//
|
||||
// NOTE THAT THE SEGMENT TTRK KERNEL IS ENTIRELY DEPENDENT ON THE
|
||||
// LAYOUT OF THE TTRK KEY. IF THE TTRK KEY IS ALTERED THEN THIS
|
||||
// KERNEL WILL NEED TO BE UPDATED
|
||||
//
|
||||
|
||||
#include "tile.h"
|
||||
#include "raster_builder_cl_12.h" // need meta_in structure
|
||||
#include "device_cl_12.h"
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#define HS_KEYS_PER_SLAB (HS_KEYS_PER_LANE * HS_LANES_PER_WARP)
|
||||
#define HS_LANE_MASK (HS_LANES_PER_WARP - 1)
|
||||
|
||||
//
|
||||
// THE BEST TYPE TO ZERO SMEM
|
||||
//
|
||||
|
||||
#define SKC_ZERO_TYPE ulong
|
||||
#define SKC_ZERO_WORDS 2
|
||||
|
||||
//
|
||||
// THE ORDER OF COMPONENTS IS:
|
||||
//
|
||||
// 0: blocks
|
||||
// 1: offset
|
||||
// 2: pk
|
||||
// 3: rk
|
||||
//
|
||||
|
||||
#if (HS_KEYS_PER_SLAB < 256)
|
||||
|
||||
#define SKC_META_TYPE uint
|
||||
#define SKC_META_WORDS 1
|
||||
|
||||
#define SKC_COMPONENT_TYPE uchar
|
||||
|
||||
#else
|
||||
|
||||
#define SKC_META_TYPE uint2
|
||||
#define SKC_META_WORDS 2
|
||||
|
||||
#define SKC_COMPONENT_TYPE ushort
|
||||
|
||||
#endif
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#if ( SKC_TTRK_HI_BITS_COHORT <= 8)
|
||||
#define SKC_COHORT_TYPE uchar
|
||||
#else
|
||||
#define SKC_COHORT_TYPE ushort
|
||||
#endif
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#define SKC_COHORT_ID(row) \
|
||||
as_uint2(r##row).hi >> SKC_TTRK_HI_OFFSET_COHORT
|
||||
|
||||
//
|
||||
// FIXME -- THIS WILL BREAK IF EITHER THE YX BITS OR OFFSET ARE CHANGED
|
||||
//
|
||||
|
||||
#define SKC_IS_BLOCK(row) \
|
||||
((as_uint2(r##row).lo & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0)
|
||||
|
||||
#define SKC_YX(row,prev) \
|
||||
(as_uint2(r##row).hi ^ as_uint2(r##prev).hi)
|
||||
|
||||
#define SKC_IS_PK(row,prev) \
|
||||
((uint)(SKC_YX(row,prev) - 1) < SKC_TTRK_HI_MASK_X)
|
||||
|
||||
//
|
||||
// COHORT SIZE IS ALWAYS A POWER-OF-TWO
|
||||
// SUBGROUP SIZE IS ALWAYS A POWER-OF-TWO
|
||||
//
|
||||
// COHORT SIZE >= SUBGROUP SIZE
|
||||
//
|
||||
|
||||
#define SKC_COHORT_SIZE (1<<SKC_TTRK_HI_BITS_COHORT)
|
||||
|
||||
#define SKC_ZERO_RATIO (SKC_ZERO_WORDS / SKC_META_WORDS)
|
||||
#define SKC_META_ZERO_COUNT (SKC_COHORT_SIZE * sizeof(SKC_META_TYPE) / sizeof(SKC_ZERO_TYPE))
|
||||
#define SKC_META_ZERO_REM (SKC_META_ZERO_COUNT & SKC_BITS_TO_MASK(HS_LANES_PER_WARP_LOG2))
|
||||
|
||||
#define SKC_META_COMPONENTS 4
|
||||
#define SKC_META_COMPONENT_COUNT (SKC_COHORT_SIZE * sizeof(SKC_META_TYPE) / sizeof(SKC_COMPONENT_TYPE))
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
__kernel
|
||||
__attribute__((intel_reqd_sub_group_size(HS_LANES_PER_WARP)))
|
||||
void
|
||||
skc_kernel_segment_ttrk(__global HS_KEY_TYPE * SKC_RESTRICT const vout,
|
||||
__global uint * SKC_RESTRICT const metas)
|
||||
{
|
||||
__local union
|
||||
{
|
||||
SKC_META_TYPE volatile m[SKC_COHORT_SIZE];
|
||||
SKC_ZERO_TYPE z[SKC_META_ZERO_COUNT];
|
||||
SKC_COMPONENT_TYPE c[SKC_META_COMPONENT_COUNT];
|
||||
} shared;
|
||||
|
||||
uint const global_id = get_global_id(0);
|
||||
uint const gmem_base = (global_id >> HS_LANES_PER_WARP_LOG2) * HS_KEYS_PER_SLAB;
|
||||
uint const gmem_idx = gmem_base + (global_id & HS_LANE_MASK);
|
||||
uint const gmem_off = (global_id & HS_LANE_MASK) * HS_KEYS_PER_LANE;
|
||||
|
||||
//
|
||||
// LOAD ALL THE ROWS
|
||||
//
|
||||
#undef HS_SLAB_ROW
|
||||
#define HS_SLAB_ROW(row,prev) \
|
||||
HS_KEY_TYPE const r##row = (vout + gmem_idx)[prev * HS_LANES_PER_WARP];
|
||||
|
||||
HS_SLAB_ROWS();
|
||||
|
||||
//
|
||||
// LOAD LAST REGISTER FROM COLUMN TO LEFT
|
||||
//
|
||||
uint diffs = 0;
|
||||
uint2 r0 = 0;
|
||||
|
||||
if (gmem_base > 0) {
|
||||
// if this is the first key in any slab but the first then it
|
||||
// broadcast loads the last key in previous slab
|
||||
r0.hi = as_uint2(vout[gmem_base - 1]).hi;
|
||||
} else {
|
||||
// otherwise broadcast the first key in the first slab
|
||||
r0.hi = sub_group_broadcast(as_uint2(r1).hi,0);
|
||||
// and mark it as an implicit diff
|
||||
if (get_sub_group_local_id() == 0)
|
||||
diffs = 1;
|
||||
}
|
||||
|
||||
// now shuffle in the last key from the column to the left
|
||||
r0.hi = intel_sub_group_shuffle_up(r0.hi,as_uint2(HS_REG_LAST(r)).hi,1);
|
||||
|
||||
// shift away y/x
|
||||
SKC_COHORT_TYPE const c0 = r0.hi >> SKC_TTRK_HI_OFFSET_COHORT;
|
||||
|
||||
//
|
||||
// EXTRACT ALL COHORT IDS EARLY...
|
||||
//
|
||||
#undef HS_SLAB_ROW
|
||||
#define HS_SLAB_ROW(row,prev) \
|
||||
SKC_COHORT_TYPE c##row = SKC_COHORT_ID(row);
|
||||
|
||||
HS_SLAB_ROWS();
|
||||
|
||||
//
|
||||
// DEBUG
|
||||
//
|
||||
#if 0
|
||||
if (gmem_base == HS_KEYS_PER_SLAB * 7)
|
||||
{
|
||||
if (get_sub_group_local_id() == 0)
|
||||
printf("\n%llX ",as_ulong(r0));
|
||||
else
|
||||
printf("%llX ",as_ulong(r0));
|
||||
#undef HS_SLAB_ROW
|
||||
#define HS_SLAB_ROW(row,prev) \
|
||||
if (get_sub_group_local_id() == 0) \
|
||||
printf("\n%llX ",r##row); \
|
||||
else \
|
||||
printf("%llX ",r##row);
|
||||
|
||||
HS_SLAB_ROWS();
|
||||
}
|
||||
#endif
|
||||
|
||||
//
|
||||
// CAPTURE ALL CONDITIONS WE CARE ABOUT
|
||||
//
|
||||
// Diffs must be captured before cohorts
|
||||
//
|
||||
uint valid = 0;
|
||||
uint blocks = 0;
|
||||
uint pks = 0;
|
||||
SKC_COHORT_TYPE c_max = 0;
|
||||
|
||||
//
|
||||
// FIXME -- IT'S UNCLEAR IF SHIFTING THE CONDITION CODE VS. AN
|
||||
// EXPLICIT PREDICATE WILL GENERATE THE SAME CODE
|
||||
//
|
||||
#if 0
|
||||
|
||||
#undef HS_SLAB_ROW
|
||||
#define HS_SLAB_ROW(row,prev) \
|
||||
diffs |= ((c##row != c##prev) << prev);
|
||||
|
||||
HS_SLAB_ROWS();
|
||||
|
||||
#undef HS_SLAB_ROW
|
||||
#define HS_SLAB_ROW(row,prev) \
|
||||
blocks |= (SKC_IS_BLOCK(row) << prev);
|
||||
|
||||
HS_SLAB_ROWS();
|
||||
|
||||
#undef HS_SLAB_ROW
|
||||
#define HS_SLAB_ROW(row,prev) \
|
||||
pks |= SKC_IS_PK(row,prev) << prev);
|
||||
|
||||
HS_SLAB_ROWS();
|
||||
|
||||
#undef HS_SLAB_ROW
|
||||
#define HS_SLAB_ROW(row,prev) \
|
||||
valid |= ((r##row != SKC_ULONG_MAX) << prev);
|
||||
|
||||
HS_SLAB_ROWS();
|
||||
|
||||
#else
|
||||
|
||||
#undef HS_SLAB_ROW
|
||||
#define HS_SLAB_ROW(row,prev) \
|
||||
if (c##row != c##prev) \
|
||||
diffs |= 1<<prev;
|
||||
|
||||
HS_SLAB_ROWS();
|
||||
|
||||
#undef HS_SLAB_ROW
|
||||
#define HS_SLAB_ROW(row,prev) \
|
||||
if (SKC_IS_BLOCK(row)) \
|
||||
blocks |= 1<<prev;
|
||||
|
||||
HS_SLAB_ROWS();
|
||||
|
||||
#undef HS_SLAB_ROW
|
||||
#define HS_SLAB_ROW(row,prev) \
|
||||
if (SKC_IS_PK(row,prev)) \
|
||||
pks |= 1<<prev;
|
||||
|
||||
HS_SLAB_ROWS();
|
||||
|
||||
#undef HS_SLAB_ROW
|
||||
#define HS_SLAB_ROW(row,prev) \
|
||||
if (r##row != SKC_ULONG_MAX) { \
|
||||
valid |= 1<<prev; \
|
||||
c_max = max(c_max,c##row); \
|
||||
}
|
||||
|
||||
HS_SLAB_ROWS();
|
||||
|
||||
#endif
|
||||
|
||||
//
|
||||
// TRANSPOSE THE SLAB AND STORE IT
|
||||
//
|
||||
HS_TRANSPOSE_SLAB();
|
||||
|
||||
// the min cohort is the first key in the slab
|
||||
uint const c_min = sub_group_broadcast(c1,0);
|
||||
|
||||
// the max cohort is the max across all lanes
|
||||
c_max = sub_group_reduce_max(c_max);
|
||||
|
||||
#if 0 // REMOVE ME LATER
|
||||
if (get_sub_group_local_id() == 0)
|
||||
printf("%3u : ( %3u , %3u )\n",
|
||||
get_global_id(0)>>HS_LANES_PER_WARP_LOG2,c_min,c_max);
|
||||
#endif
|
||||
|
||||
//
|
||||
// ZERO SMEM
|
||||
//
|
||||
// zero only the meta info for the cohort ids found in this slab
|
||||
//
|
||||
#if (SKC_ZERO_WORDS >= SKC_META_WORDS)
|
||||
uint zz = ((c_min / SKC_ZERO_RATIO) & ~HS_LANE_MASK) + get_sub_group_local_id();
|
||||
uint const zz_max = (c_max + SKC_ZERO_RATIO - 1) / SKC_ZERO_RATIO;
|
||||
|
||||
for (; zz<=zz_max; zz+=HS_LANES_PER_WARP)
|
||||
shared.z[zz] = 0;
|
||||
#else
|
||||
// ERROR -- it's highly unlikely that the zero type is smaller than
|
||||
// the meta type
|
||||
#error("Unsupported right now...")
|
||||
#endif
|
||||
|
||||
//
|
||||
// ACCUMULATE AND STORE META INFO
|
||||
//
|
||||
uint const valid_blocks = valid & blocks;
|
||||
uint const valid_pks = valid & pks & ~diffs;
|
||||
SKC_META_TYPE meta = ( 0 );
|
||||
|
||||
#define SKC_META_LOCAL_ADD(meta) \
|
||||
atomic_add(shared.m+HS_REG_LAST(c),meta);
|
||||
|
||||
#define SKC_META_LOCAL_STORE(meta,prev) \
|
||||
shared.m[c##prev] = meta;
|
||||
|
||||
// note this is purposefully off by +1
|
||||
#define SKC_META_RESET(meta,curr) \
|
||||
meta = ((gmem_off + curr) << 8);
|
||||
|
||||
#if 0
|
||||
|
||||
// FIXME -- this can be tweaked to shift directly
|
||||
#define SKC_META_ADD(meta,prev,blocks,pks,rks) \
|
||||
meta += ((((blocks >> prev) & 1) ) | \
|
||||
(((pks >> prev) & 1) << 16) | \
|
||||
(((rks >> prev) & 1) << 24));
|
||||
|
||||
#else
|
||||
|
||||
#define SKC_META_ADD(meta,prev,blocks,pks,rks) \
|
||||
if (blocks & (1<<prev)) \
|
||||
meta += 1; \
|
||||
if (pks & (1<<prev)) \
|
||||
meta += 1<<16; \
|
||||
if (rks & (1<<prev)) \
|
||||
meta += 1<<24;
|
||||
|
||||
#endif
|
||||
|
||||
#undef HS_SLAB_ROW
|
||||
#define HS_SLAB_ROW(row,prev) \
|
||||
if (diffs & (1<<prev)) { \
|
||||
SKC_META_LOCAL_STORE(meta,prev); \
|
||||
SKC_META_RESET(meta,row); \
|
||||
} \
|
||||
SKC_META_ADD(meta,prev, \
|
||||
valid_blocks, \
|
||||
valid_pks, \
|
||||
valid);
|
||||
|
||||
HS_SLAB_ROWS();
|
||||
|
||||
//
|
||||
// ATOMICALLY ADD THE CARRIED OUT METAS
|
||||
//
|
||||
#if 0 // BUG
|
||||
if ((valid & (1<<(HS_KEYS_PER_LANE-1))) && (meta != 0))
|
||||
SKC_META_LOCAL_ADD(meta);
|
||||
#else
|
||||
if (meta != 0)
|
||||
SKC_META_LOCAL_ADD(meta);
|
||||
#endif
|
||||
|
||||
//
|
||||
// NOW ATOMICALLY ADD ALL METAS TO THE GLOBAL META TABLE
|
||||
//
|
||||
|
||||
// convert the slab offset to an extent offset
|
||||
bool const is_offset = (get_sub_group_local_id() & 3) == 1;
|
||||
uint const adjust = is_offset ? gmem_base - 1 : 0;
|
||||
|
||||
//
|
||||
// only process the meta components found in this slab
|
||||
//
|
||||
uint const cc_min = c_min * SKC_META_COMPONENTS;
|
||||
uint const cc_max = c_max * SKC_META_COMPONENTS + SKC_META_COMPONENTS - 1;
|
||||
uint cc = (cc_min & ~HS_LANE_MASK) + get_sub_group_local_id();
|
||||
|
||||
if ((cc >= cc_min) && (cc <= cc_max))
|
||||
{
|
||||
uint const c = shared.c[cc];
|
||||
|
||||
if (c != 0)
|
||||
atomic_add(metas+cc,c+adjust);
|
||||
}
|
||||
|
||||
cc += HS_LANES_PER_WARP;
|
||||
|
||||
for (; cc<=cc_max; cc+=HS_LANES_PER_WARP)
|
||||
{
|
||||
uint const c = shared.c[cc];
|
||||
|
||||
if (c != 0)
|
||||
atomic_add(metas+cc,c+adjust);
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
/*
|
||||
* Copyright 2018 Google Inc.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license that can
|
||||
* be found in the LICENSE file.
|
||||
*
|
||||
*/
|
||||
|
||||
//
|
||||
// NOTE THAT THE SEGMENT TTRK KERNEL IS ENTIRELY DEPENDENT ON THE
|
||||
// LAYOUT OF THE TTRK KEY. IF THE TTRK KEY IS ALTERED THEN THIS
|
||||
// KERNEL WILL NEED TO BE UPDATED
|
||||
//
|
||||
|
||||
#include "tile.h"
|
||||
#include "raster_builder_cl_12.h" // need meta_in structure
|
||||
#include "kernel_cl_12.h"
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#define HS_KEYS_PER_SLAB (HS_KEYS_PER_LANE * HS_LANES_PER_WARP)
|
||||
#define HS_LANE_MASK (HS_LANES_PER_WARP - 1)
|
||||
|
||||
//
|
||||
// THE BEST TYPE TO ZERO SMEM
|
||||
//
|
||||
|
||||
#define SKC_ZERO_TYPE ulong
|
||||
#define SKC_ZERO_WORDS 2
|
||||
|
||||
//
|
||||
// THE ORDER OF COMPONENTS IS:
|
||||
//
|
||||
// 0: blocks
|
||||
// 1: offset
|
||||
// 2: pk
|
||||
// 3: rk
|
||||
//
|
||||
|
||||
#if (HS_KEYS_PER_SLAB < 256)
|
||||
|
||||
#define SKC_META_TYPE uint
|
||||
#define SKC_META_WORDS 1
|
||||
|
||||
#define SKC_COMPONENT_TYPE uchar
|
||||
|
||||
#else
|
||||
|
||||
#define SKC_META_TYPE uint2
|
||||
#define SKC_META_WORDS 2
|
||||
|
||||
#define SKC_COMPONENT_TYPE ushort
|
||||
|
||||
#endif
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#if ( SKC_TTRK_HI_BITS_COHORT <= 8)
|
||||
#define SKC_COHORT_TYPE uchar
|
||||
#else
|
||||
#define SKC_COHORT_TYPE ushort
|
||||
#endif
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#define SKC_COHORT_ID(row) \
|
||||
as_uint2(r##row).hi >> SKC_TTRK_HI_OFFSET_COHORT
|
||||
|
||||
//
|
||||
// FIXME -- THIS WILL BREAK IF EITHER THE YX BITS OR OFFSET ARE CHANGED
|
||||
//
|
||||
|
||||
#define SKC_IS_BLOCK(row) \
|
||||
((as_uint2(r##row).lo & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0)
|
||||
|
||||
#define SKC_YX(row,prev) \
|
||||
(as_uint2(r##row).hi ^ as_uint2(r##prev).hi)
|
||||
|
||||
#define SKC_IS_PK(row,prev) \
|
||||
((uint)(SKC_YX(row,prev) - 1) < SKC_TTRK_HI_MASK_X)
|
||||
|
||||
//
|
||||
// COHORT SIZE IS ALWAYS A POWER-OF-TWO
|
||||
// SUBGROUP SIZE IS ALWAYS A POWER-OF-TWO
|
||||
//
|
||||
// COHORT SIZE >= SUBGROUP SIZE
|
||||
//
|
||||
|
||||
#define SKC_COHORT_SIZE (1<<SKC_TTRK_HI_BITS_COHORT)
|
||||
|
||||
#define SKC_ZERO_RATIO (SKC_ZERO_WORDS / SKC_META_WORDS)
|
||||
#define SKC_META_ZERO_COUNT (SKC_COHORT_SIZE * sizeof(SKC_META_TYPE) / sizeof(SKC_ZERO_TYPE))
|
||||
#define SKC_META_ZERO_REM (SKC_META_ZERO_COUNT & SKC_BITS_TO_MASK(HS_LANES_PER_WARP_LOG2))
|
||||
|
||||
#define SKC_META_COMPONENTS 4
|
||||
#define SKC_META_COMPONENT_COUNT (SKC_COHORT_SIZE * sizeof(SKC_META_TYPE) / sizeof(SKC_COMPONENT_TYPE))
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
__kernel
|
||||
__attribute__((intel_reqd_sub_group_size(HS_LANES_PER_WARP)))
|
||||
void
|
||||
skc_kernel_segment_ttrk(__global HS_KEY_TYPE * SKC_RESTRICT const vout,
|
||||
__global uint * SKC_RESTRICT const metas)
|
||||
{
|
||||
__local union
|
||||
{
|
||||
SKC_META_TYPE volatile m[SKC_COHORT_SIZE];
|
||||
SKC_ZERO_TYPE z[SKC_META_ZERO_COUNT];
|
||||
SKC_COMPONENT_TYPE c[SKC_META_COMPONENT_COUNT];
|
||||
} shared;
|
||||
|
||||
uint const global_id = get_global_id(0);
|
||||
uint const gmem_base = (global_id >> HS_LANES_PER_WARP_LOG2) * HS_KEYS_PER_SLAB;
|
||||
uint const gmem_idx = gmem_base + (global_id & HS_LANE_MASK);
|
||||
uint const gmem_off = (global_id & HS_LANE_MASK) * HS_KEYS_PER_LANE;
|
||||
|
||||
//
|
||||
// LOAD ALL THE ROWS
|
||||
//
|
||||
#undef HS_SLAB_ROW
|
||||
#define HS_SLAB_ROW(row,prev) \
|
||||
HS_KEY_TYPE const r##row = (vout + gmem_idx)[prev * HS_LANES_PER_WARP];
|
||||
|
||||
HS_SLAB_ROWS();
|
||||
|
||||
//
|
||||
// LOAD LAST REGISTER FROM COLUMN TO LEFT
|
||||
//
|
||||
uint diffs = 0;
|
||||
uint2 r0 = 0;
|
||||
|
||||
if (gmem_base > 0) {
|
||||
// if this is the first key in any slab but the first then it
|
||||
// broadcast loads the last key in previous slab
|
||||
r0.hi = as_uint2(vout[gmem_base - 1]).hi;
|
||||
} else {
|
||||
// otherwise broadcast the first key in the first slab
|
||||
r0.hi = sub_group_broadcast(as_uint2(r1).hi,0);
|
||||
// and mark it as an implicit diff
|
||||
if (get_sub_group_local_id() == 0)
|
||||
diffs = 1;
|
||||
}
|
||||
|
||||
// now shuffle in the last key from the column to the left
|
||||
r0.hi = intel_sub_group_shuffle_up(r0.hi,as_uint2(HS_REG_LAST(r)).hi,1);
|
||||
|
||||
// shift away y/x
|
||||
SKC_COHORT_TYPE const c0 = r0.hi >> SKC_TTRK_HI_OFFSET_COHORT;
|
||||
|
||||
//
|
||||
// EXTRACT ALL COHORT IDS EARLY...
|
||||
//
|
||||
#undef HS_SLAB_ROW
|
||||
#define HS_SLAB_ROW(row,prev) \
|
||||
SKC_COHORT_TYPE c##row = SKC_COHORT_ID(row);
|
||||
|
||||
HS_SLAB_ROWS();
|
||||
|
||||
//
|
||||
// DEBUG
|
||||
//
|
||||
#if 0
|
||||
if (gmem_base == HS_KEYS_PER_SLAB * 7)
|
||||
{
|
||||
if (get_sub_group_local_id() == 0)
|
||||
printf("\n%llX ",as_ulong(r0));
|
||||
else
|
||||
printf("%llX ",as_ulong(r0));
|
||||
#undef HS_SLAB_ROW
|
||||
#define HS_SLAB_ROW(row,prev) \
|
||||
if (get_sub_group_local_id() == 0) \
|
||||
printf("\n%llX ",r##row); \
|
||||
else \
|
||||
printf("%llX ",r##row);
|
||||
|
||||
HS_SLAB_ROWS();
|
||||
}
|
||||
#endif
|
||||
|
||||
//
|
||||
// CAPTURE ALL CONDITIONS WE CARE ABOUT
|
||||
//
|
||||
// Diffs must be captured before cohorts
|
||||
//
|
||||
uint valid = 0;
|
||||
uint blocks = 0;
|
||||
uint pks = 0;
|
||||
SKC_COHORT_TYPE c_max = 0;
|
||||
|
||||
//
|
||||
// FIXME -- IT'S UNCLEAR IF SHIFTING THE CONDITION CODE VS. AN
|
||||
// EXPLICIT PREDICATE WILL GENERATE THE SAME CODE
|
||||
//
|
||||
#if 0
|
||||
|
||||
#undef HS_SLAB_ROW
|
||||
#define HS_SLAB_ROW(row,prev) \
|
||||
diffs |= ((c##row != c##prev) << prev);
|
||||
|
||||
HS_SLAB_ROWS();
|
||||
|
||||
#undef HS_SLAB_ROW
|
||||
#define HS_SLAB_ROW(row,prev) \
|
||||
blocks |= (SKC_IS_BLOCK(row) << prev);
|
||||
|
||||
HS_SLAB_ROWS();
|
||||
|
||||
#undef HS_SLAB_ROW
|
||||
#define HS_SLAB_ROW(row,prev) \
|
||||
pks |= SKC_IS_PK(row,prev) << prev);
|
||||
|
||||
HS_SLAB_ROWS();
|
||||
|
||||
#undef HS_SLAB_ROW
|
||||
#define HS_SLAB_ROW(row,prev) \
|
||||
valid |= ((r##row != SKC_ULONG_MAX) << prev);
|
||||
|
||||
HS_SLAB_ROWS();
|
||||
|
||||
#else
|
||||
|
||||
#undef HS_SLAB_ROW
|
||||
#define HS_SLAB_ROW(row,prev) \
|
||||
if (c##row != c##prev) \
|
||||
diffs |= 1<<prev;
|
||||
|
||||
HS_SLAB_ROWS();
|
||||
|
||||
#undef HS_SLAB_ROW
|
||||
#define HS_SLAB_ROW(row,prev) \
|
||||
if (SKC_IS_BLOCK(row)) \
|
||||
blocks |= 1<<prev;
|
||||
|
||||
HS_SLAB_ROWS();
|
||||
|
||||
#undef HS_SLAB_ROW
|
||||
#define HS_SLAB_ROW(row,prev) \
|
||||
if (SKC_IS_PK(row,prev)) \
|
||||
pks |= 1<<prev;
|
||||
|
||||
HS_SLAB_ROWS();
|
||||
|
||||
#undef HS_SLAB_ROW
|
||||
#define HS_SLAB_ROW(row,prev) \
|
||||
if (r##row != SKC_ULONG_MAX) { \
|
||||
valid |= 1<<prev; \
|
||||
c_max = max(c_max,c##row); \
|
||||
}
|
||||
|
||||
HS_SLAB_ROWS();
|
||||
|
||||
#endif
|
||||
|
||||
//
|
||||
// TRANSPOSE THE SLAB AND STORE IT
|
||||
//
|
||||
HS_TRANSPOSE_SLAB();
|
||||
|
||||
// the min cohort is the first key in the slab
|
||||
uint const c_min = sub_group_broadcast(c1,0);
|
||||
|
||||
// the max cohort is the max across all lanes
|
||||
c_max = sub_group_reduce_max(c_max);
|
||||
|
||||
#if 0 // REMOVE ME LATER
|
||||
if (get_sub_group_local_id() == 0)
|
||||
printf("%3u : ( %3u , %3u )\n",
|
||||
get_global_id(0)>>HS_LANES_PER_WARP_LOG2,c_min,c_max);
|
||||
#endif
|
||||
|
||||
//
|
||||
// ZERO SMEM
|
||||
//
|
||||
// zero only the meta info for the cohort ids found in this slab
|
||||
//
|
||||
#if (SKC_ZERO_WORDS >= SKC_META_WORDS)
|
||||
uint zz = ((c_min / SKC_ZERO_RATIO) & ~HS_LANE_MASK) + get_sub_group_local_id();
|
||||
uint const zz_max = (c_max + SKC_ZERO_RATIO - 1) / SKC_ZERO_RATIO;
|
||||
|
||||
for (; zz<=zz_max; zz+=HS_LANES_PER_WARP)
|
||||
shared.z[zz] = 0;
|
||||
#else
|
||||
// ERROR -- it's highly unlikely that the zero type is smaller than
|
||||
// the meta type
|
||||
#error("Unsupported right now...")
|
||||
#endif
|
||||
|
||||
//
|
||||
// ACCUMULATE AND STORE META INFO
|
||||
//
|
||||
uint const valid_blocks = valid & blocks;
|
||||
uint const valid_pks = valid & pks & ~diffs;
|
||||
SKC_META_TYPE meta = ( 0 );
|
||||
|
||||
#define SKC_META_LOCAL_ADD(meta) \
|
||||
atomic_add(shared.m+HS_REG_LAST(c),meta);
|
||||
|
||||
#define SKC_META_LOCAL_STORE(meta,prev) \
|
||||
shared.m[c##prev] = meta;
|
||||
|
||||
// note this is purposefully off by +1
|
||||
#define SKC_META_RESET(meta,curr) \
|
||||
meta = ((gmem_off + curr) << 8);
|
||||
|
||||
#if 0
|
||||
|
||||
// FIXME -- this can be tweaked to shift directly
|
||||
#define SKC_META_ADD(meta,prev,blocks,pks,rks) \
|
||||
meta += ((((blocks >> prev) & 1) ) | \
|
||||
(((pks >> prev) & 1) << 16) | \
|
||||
(((rks >> prev) & 1) << 24));
|
||||
|
||||
#else
|
||||
|
||||
#define SKC_META_ADD(meta,prev,blocks,pks,rks) \
|
||||
if (blocks & (1<<prev)) \
|
||||
meta += 1; \
|
||||
if (pks & (1<<prev)) \
|
||||
meta += 1<<16; \
|
||||
if (rks & (1<<prev)) \
|
||||
meta += 1<<24;
|
||||
|
||||
#endif
|
||||
|
||||
#undef HS_SLAB_ROW
|
||||
#define HS_SLAB_ROW(row,prev) \
|
||||
if (diffs & (1<<prev)) { \
|
||||
SKC_META_LOCAL_STORE(meta,prev); \
|
||||
SKC_META_RESET(meta,row); \
|
||||
} \
|
||||
SKC_META_ADD(meta,prev, \
|
||||
valid_blocks, \
|
||||
valid_pks, \
|
||||
valid);
|
||||
|
||||
HS_SLAB_ROWS();
|
||||
|
||||
//
|
||||
// ATOMICALLY ADD THE CARRIED OUT METAS
|
||||
//
|
||||
#if 0 // BUG
|
||||
if ((valid & (1<<(HS_KEYS_PER_LANE-1))) && (meta != 0))
|
||||
SKC_META_LOCAL_ADD(meta);
|
||||
#else
|
||||
if (meta != 0)
|
||||
SKC_META_LOCAL_ADD(meta);
|
||||
#endif
|
||||
|
||||
//
|
||||
// NOW ATOMICALLY ADD ALL METAS TO THE GLOBAL META TABLE
|
||||
//
|
||||
|
||||
// convert the slab offset to an extent offset
|
||||
bool const is_offset = (get_sub_group_local_id() & 3) == 1;
|
||||
uint const adjust = is_offset ? gmem_base - 1 : 0;
|
||||
|
||||
//
|
||||
// only process the meta components found in this slab
|
||||
//
|
||||
uint const cc_min = c_min * SKC_META_COMPONENTS;
|
||||
uint const cc_max = c_max * SKC_META_COMPONENTS + SKC_META_COMPONENTS - 1;
|
||||
uint cc = (cc_min & ~HS_LANE_MASK) + get_sub_group_local_id();
|
||||
|
||||
if ((cc >= cc_min) && (cc <= cc_max))
|
||||
{
|
||||
uint const c = shared.c[cc];
|
||||
|
||||
if (c != 0)
|
||||
atomic_add(metas+cc,c+adjust);
|
||||
}
|
||||
|
||||
cc += HS_LANES_PER_WARP;
|
||||
|
||||
for (; cc<=cc_max; cc+=HS_LANES_PER_WARP)
|
||||
{
|
||||
uint const c = shared.c[cc];
|
||||
|
||||
if (c != 0)
|
||||
atomic_add(metas+cc,c+adjust);
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
@ -1,362 +0,0 @@
|
||||
/*
|
||||
* Copyright 2017 Google Inc.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license that can
|
||||
* be found in the LICENSE file.
|
||||
*
|
||||
*/
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
#include <assert.h>
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#include "runtime_cl.h"
|
||||
#include "common/cl/assert_cl.h"
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
static is_verbose = true;
|
||||
|
||||
//
|
||||
// FIXME -- all variable length device queries need to start querying
|
||||
// the parameter's return size before getting its value
|
||||
//
|
||||
// FIXME -- this is now handled by the common/cl/find.* routine
|
||||
//
|
||||
|
||||
union skc_cl_device_version {
|
||||
struct {
|
||||
cl_uchar opencl_space[7]; // "OpenCL_"
|
||||
cl_uchar major;
|
||||
cl_uchar dot;
|
||||
cl_uchar minor;
|
||||
#if 1 // Intel NEO requires at least 16 bytes
|
||||
cl_uchar space;
|
||||
cl_uchar vendor[32];
|
||||
#endif
|
||||
};
|
||||
struct {
|
||||
cl_uchar aN[];
|
||||
};
|
||||
};
|
||||
|
||||
typedef cl_bitfield cl_diagnostic_verbose_level_intel;
|
||||
|
||||
#define CL_CONTEXT_SHOW_DIAGNOSTICS_INTEL 0x4106
|
||||
#define CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL 0x2
|
||||
#define CL_CONTEXT_DIAGNOSTICS_LEVEL_GOOD_INTEL 0x1
|
||||
#define CL_CONTEXT_DIAGNOSTICS_LEVEL_NEUTRAL_INTEL 0x4
|
||||
|
||||
static
|
||||
void
|
||||
CL_CALLBACK
|
||||
skc_context_callback(char const * error, void const * info, size_t size, void * user)
|
||||
{
|
||||
if (info != NULL )
|
||||
{
|
||||
fprintf(stderr,"%s\n",error);
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
skc_err
|
||||
skc_runtime_cl_create(struct skc_runtime_cl * const runtime_cl,
|
||||
char const * const target_platform_substring,
|
||||
char const * const target_device_substring,
|
||||
cl_context_properties context_properties[])
|
||||
{
|
||||
skc_err err = SKC_ERR_SUCCESS;
|
||||
|
||||
//
|
||||
// search available devices for a match
|
||||
//
|
||||
#define PLATFORM_IDS_MAX 16
|
||||
#define DEVICE_IDS_MAX 16
|
||||
#define PLATFORM_NAME_SIZE_MAX 64
|
||||
#define DEVICE_NAME_SIZE_MAX 64
|
||||
#define DRIVER_VERSION_SIZE_MAX 64
|
||||
|
||||
cl_int cl_err;
|
||||
|
||||
cl_platform_id platform_ids[PLATFORM_IDS_MAX];
|
||||
cl_device_id device_ids [PLATFORM_IDS_MAX][DEVICE_IDS_MAX];
|
||||
|
||||
cl_uint platform_count;
|
||||
cl_uint device_count[PLATFORM_IDS_MAX];
|
||||
|
||||
cl_uint platform_idx = UINT32_MAX, device_idx = UINT32_MAX;
|
||||
|
||||
bool match = false; // find _first_ match
|
||||
|
||||
//
|
||||
// get number of platforms
|
||||
//
|
||||
cl(GetPlatformIDs(PLATFORM_IDS_MAX,platform_ids,&platform_count));
|
||||
|
||||
//
|
||||
// search platforms
|
||||
//
|
||||
for (cl_uint ii=0; ii<platform_count; ii++)
|
||||
{
|
||||
char platform_name[PLATFORM_NAME_SIZE_MAX];
|
||||
|
||||
cl(GetPlatformInfo(platform_ids[ii],
|
||||
CL_PLATFORM_NAME,
|
||||
sizeof(platform_name),
|
||||
platform_name,
|
||||
NULL));
|
||||
|
||||
if (!match && (strstr(platform_name,target_platform_substring) != NULL))
|
||||
{
|
||||
platform_idx = ii;
|
||||
}
|
||||
|
||||
if (is_verbose) {
|
||||
fprintf(stdout,"%2u: %s\n",ii,platform_name);
|
||||
}
|
||||
|
||||
cl_err = clGetDeviceIDs(platform_ids[ii],
|
||||
CL_DEVICE_TYPE_ALL,
|
||||
DEVICE_IDS_MAX,
|
||||
device_ids[ii],
|
||||
device_count+ii);
|
||||
|
||||
if (cl_err != CL_DEVICE_NOT_FOUND)
|
||||
cl_ok(cl_err);
|
||||
|
||||
for (cl_uint jj=0; jj<device_count[ii]; jj++)
|
||||
{
|
||||
char device_name[DEVICE_NAME_SIZE_MAX];
|
||||
union skc_cl_device_version device_version;
|
||||
cl_uint device_align_bits;
|
||||
char driver_version[DRIVER_VERSION_SIZE_MAX];
|
||||
|
||||
cl(GetDeviceInfo(device_ids[ii][jj],
|
||||
CL_DEVICE_NAME,
|
||||
sizeof(device_name),
|
||||
device_name,
|
||||
NULL));
|
||||
|
||||
// FIXME -- some of these variable length parameters should
|
||||
// use the "size the param before reading" idiom
|
||||
cl(GetDeviceInfo(device_ids[ii][jj],
|
||||
CL_DEVICE_VERSION,
|
||||
sizeof(device_version),
|
||||
device_version.aN,
|
||||
NULL));
|
||||
|
||||
cl(GetDeviceInfo(device_ids[ii][jj],
|
||||
CL_DEVICE_MEM_BASE_ADDR_ALIGN,
|
||||
sizeof(device_align_bits),
|
||||
&device_align_bits,
|
||||
NULL));
|
||||
|
||||
cl_uint const base_align = device_align_bits / 8; // bytes
|
||||
|
||||
cl(GetDeviceInfo(device_ids[ii][jj],
|
||||
CL_DRIVER_VERSION,
|
||||
sizeof(driver_version),
|
||||
driver_version,
|
||||
NULL));
|
||||
|
||||
if (!match && (platform_idx == ii) && (strstr(device_name,target_device_substring) != NULL))
|
||||
{
|
||||
match = true;
|
||||
device_idx = jj;
|
||||
|
||||
runtime_cl->version.major = device_version.major - 48;
|
||||
runtime_cl->version.minor = device_version.minor - 48;
|
||||
runtime_cl->base_align = base_align;
|
||||
|
||||
if (is_verbose) {
|
||||
fprintf(stdout," >>>");
|
||||
}
|
||||
}
|
||||
else if (is_verbose)
|
||||
{
|
||||
fprintf(stdout," ");
|
||||
}
|
||||
|
||||
if (is_verbose) {
|
||||
fprintf(stdout,
|
||||
" %1u: %s [ %s ] [ %s ] [ %u ]\n",
|
||||
jj,
|
||||
device_name,
|
||||
device_version.aN,
|
||||
driver_version,
|
||||
base_align);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (is_verbose) {
|
||||
fprintf(stdout,"\n");
|
||||
}
|
||||
|
||||
//
|
||||
// get target platform and device
|
||||
//
|
||||
if (platform_idx >= platform_count)
|
||||
{
|
||||
fprintf(stderr,"no match for target platform substring %s\n",target_platform_substring);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
if (device_idx >= device_count[platform_idx])
|
||||
{
|
||||
fprintf(stderr,"no match for target device substring %s\n",target_device_substring);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
runtime_cl->platform_id = platform_ids[platform_idx];
|
||||
runtime_cl->device_id = device_ids [platform_idx][device_idx];
|
||||
|
||||
//
|
||||
// create context
|
||||
//
|
||||
|
||||
#if 0
|
||||
cl_context_properties context_properties[] =
|
||||
{
|
||||
CL_CONTEXT_PLATFORM,(cl_context_properties)runtime_cl->platform_id,
|
||||
0
|
||||
};
|
||||
#else
|
||||
context_properties[1] = (cl_context_properties)runtime_cl->platform_id;
|
||||
#endif
|
||||
|
||||
runtime_cl->context = clCreateContext(context_properties,
|
||||
1,
|
||||
&runtime_cl->device_id,
|
||||
skc_context_callback,
|
||||
NULL,
|
||||
&cl_err);
|
||||
cl_ok(cl_err);
|
||||
|
||||
//
|
||||
// get device name, driver version, and unified memory flag
|
||||
//
|
||||
if (is_verbose)
|
||||
{
|
||||
char device_name[DEVICE_NAME_SIZE_MAX];
|
||||
char driver_version[DRIVER_VERSION_SIZE_MAX];
|
||||
cl_bool device_is_unified;
|
||||
cl_device_svm_capabilities svm_caps;
|
||||
size_t printf_buffer_size;
|
||||
|
||||
cl(GetDeviceInfo(runtime_cl->device_id,
|
||||
CL_DEVICE_NAME,
|
||||
sizeof(device_name),
|
||||
device_name,
|
||||
NULL));
|
||||
|
||||
cl(GetDeviceInfo(runtime_cl->device_id,
|
||||
CL_DRIVER_VERSION,
|
||||
sizeof(driver_version),
|
||||
driver_version,
|
||||
NULL));
|
||||
|
||||
cl(GetDeviceInfo(runtime_cl->device_id,
|
||||
CL_DEVICE_HOST_UNIFIED_MEMORY,
|
||||
sizeof(device_is_unified),
|
||||
&device_is_unified,
|
||||
NULL));
|
||||
|
||||
cl(GetDeviceInfo(runtime_cl->device_id,
|
||||
CL_DEVICE_SVM_CAPABILITIES,
|
||||
sizeof(svm_caps),
|
||||
&svm_caps,
|
||||
0));
|
||||
|
||||
cl(GetDeviceInfo(runtime_cl->device_id,
|
||||
CL_DEVICE_PRINTF_BUFFER_SIZE,
|
||||
sizeof(printf_buffer_size),
|
||||
&printf_buffer_size,
|
||||
NULL));
|
||||
|
||||
fprintf(stderr,
|
||||
"CL_DEVICE_SVM_COARSE_GRAIN_BUFFER %c\n"
|
||||
"CL_DEVICE_SVM_FINE_GRAIN_BUFFER %c\n"
|
||||
"CL_DEVICE_SVM_FINE_GRAIN_SYSTEM %c\n"
|
||||
"CL_DEVICE_SVM_ATOMICS %c\n"
|
||||
"CL_DEVICE_PRINTF_BUFFER_SIZE %zu\n\n",
|
||||
svm_caps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER ? '*' : '-',
|
||||
svm_caps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER ? '*' : '-',
|
||||
svm_caps & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM ? '*' : '-',
|
||||
svm_caps & CL_DEVICE_SVM_ATOMICS ? '*' : '-',
|
||||
printf_buffer_size);
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
skc_err
|
||||
skc_runtime_cl_dispose(struct skc_runtime_cl * const runtime_cl)
|
||||
{
|
||||
// FIXME
|
||||
printf("%s incomplete!\n",__func__);
|
||||
|
||||
return SKC_ERR_SUCCESS;
|
||||
}
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
cl_command_queue
|
||||
skc_runtime_cl_create_cq(struct skc_runtime_cl * const runtime_cl, skc_cq_type_e const type)
|
||||
{
|
||||
cl_command_queue cq;
|
||||
|
||||
if (runtime_cl->version.major < 2)
|
||||
{
|
||||
//
|
||||
// <= OpenCL 1.2
|
||||
//
|
||||
cl_int cl_err;
|
||||
|
||||
cq = clCreateCommandQueue(runtime_cl->context,
|
||||
runtime_cl->device_id,
|
||||
(cl_command_queue_properties)type,
|
||||
&cl_err); cl_ok(cl_err);
|
||||
}
|
||||
else
|
||||
{
|
||||
//
|
||||
// >= OpenCL 2.0
|
||||
//
|
||||
cl_int cl_err;
|
||||
cl_queue_properties const queue_properties[] = {
|
||||
CL_QUEUE_PROPERTIES,(cl_queue_properties)type,0
|
||||
};
|
||||
|
||||
cq = clCreateCommandQueueWithProperties(runtime_cl->context,
|
||||
runtime_cl->device_id,
|
||||
queue_properties,
|
||||
&cl_err); cl_ok(cl_err);
|
||||
}
|
||||
|
||||
return cq;
|
||||
}
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
@ -1,79 +0,0 @@
|
||||
/*
|
||||
* Copyright 2017 Google Inc.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license that can
|
||||
* be found in the LICENSE file.
|
||||
*
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
//
|
||||
// squelch OpenCL 1.2 deprecation warning
|
||||
//
|
||||
|
||||
#ifndef CL_USE_DEPRECATED_OPENCL_1_2_APIS
|
||||
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
|
||||
#endif
|
||||
|
||||
#include <CL/opencl.h>
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#include "skc.h"
|
||||
|
||||
//
|
||||
// Minimal OpenCL state needed by the runtime to get started
|
||||
//
|
||||
|
||||
struct skc_runtime_cl
|
||||
{
|
||||
cl_platform_id platform_id;
|
||||
cl_device_id device_id;
|
||||
cl_context context;
|
||||
|
||||
struct {
|
||||
cl_uint major;
|
||||
cl_uint minor;
|
||||
} version; // sometimes we need to know this at runtime
|
||||
|
||||
cl_uint base_align; // base address alignment for subbuffer origins
|
||||
};
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
typedef enum skc_cq_type_e {
|
||||
SKC_CQ_TYPE_IN_ORDER = 0,
|
||||
SKC_CQ_TYPE_OUT_OF_ORDER = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,
|
||||
SKC_CQ_TYPE_IN_ORDER_PROFILING = (SKC_CQ_TYPE_IN_ORDER | CL_QUEUE_PROFILING_ENABLE),
|
||||
SKC_CQ_TYPE_OUT_OF_ORDER_PROFILING = (SKC_CQ_TYPE_OUT_OF_ORDER | CL_QUEUE_PROFILING_ENABLE),
|
||||
} skc_cq_type_e;
|
||||
|
||||
//
|
||||
// safely creates a generic OpenCL target in very few lines
|
||||
//
|
||||
|
||||
skc_err
|
||||
skc_runtime_cl_create(struct skc_runtime_cl * const runtime_cl,
|
||||
char const * const target_platform_substring,
|
||||
char const * const target_device_substring,
|
||||
cl_context_properties context_properties[]);
|
||||
|
||||
skc_err
|
||||
skc_runtime_cl_dispose(struct skc_runtime_cl * const runtime_cl);
|
||||
|
||||
//
|
||||
// create a command queue with the non-deprecated function
|
||||
//
|
||||
|
||||
cl_command_queue
|
||||
skc_runtime_cl_create_cq(struct skc_runtime_cl * const runtime_cl, skc_cq_type_e const type);
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
@ -24,7 +24,6 @@
|
||||
#include "grid.h"
|
||||
#include "common/cl/assert_cl.h"
|
||||
#include "config_cl.h"
|
||||
#include "runtime_cl.h"
|
||||
#include "runtime_cl_12.h"
|
||||
#include "export_cl_12.h"
|
||||
|
||||
@ -32,7 +31,7 @@
|
||||
//
|
||||
//
|
||||
|
||||
static
|
||||
static
|
||||
void
|
||||
skc_block_pool_create(struct skc_runtime * const runtime, cl_command_queue cq)
|
||||
{
|
||||
@ -42,7 +41,7 @@ skc_block_pool_create(struct skc_runtime * const runtime, cl_command_queue cq)
|
||||
// create block extent
|
||||
skc_extent_pdrw_alloc(runtime,
|
||||
&runtime->block_pool.blocks,
|
||||
runtime->block_pool.size->pool_size *
|
||||
runtime->block_pool.size->pool_size *
|
||||
runtime->config->block.bytes);
|
||||
|
||||
// allocate block pool ids
|
||||
@ -85,7 +84,7 @@ skc_block_pool_create(struct skc_runtime * const runtime, cl_command_queue cq)
|
||||
cl(ReleaseKernel(k1));
|
||||
}
|
||||
|
||||
static
|
||||
static
|
||||
void
|
||||
skc_block_pool_dispose(struct skc_runtime * const runtime)
|
||||
{
|
||||
@ -106,7 +105,7 @@ skc_runtime_yield(struct skc_runtime * const runtime)
|
||||
}
|
||||
|
||||
static
|
||||
void
|
||||
void
|
||||
skc_runtime_wait(struct skc_runtime * const runtime)
|
||||
{
|
||||
skc_scheduler_wait(runtime->scheduler);
|
||||
@ -118,18 +117,26 @@ skc_runtime_wait(struct skc_runtime * const runtime)
|
||||
|
||||
skc_err
|
||||
skc_runtime_cl_12_create(struct skc_context * const context,
|
||||
char const * const target_platform_substring,
|
||||
char const * const target_device_substring,
|
||||
cl_context_properties context_properties[])
|
||||
cl_context context_cl,
|
||||
cl_device_id device_id_cl)
|
||||
{
|
||||
// allocate the runtime
|
||||
struct skc_runtime * const runtime = malloc(sizeof(*runtime));
|
||||
|
||||
// acquire OpenCL ids and context for target device
|
||||
skc_err err = skc_runtime_cl_create(&runtime->cl,
|
||||
target_platform_substring,
|
||||
target_device_substring,
|
||||
context_properties);
|
||||
// save off CL objects
|
||||
runtime->cl.context = context_cl;
|
||||
runtime->cl.device_id = device_id_cl;
|
||||
|
||||
// query device alignment
|
||||
cl_uint align_bits;
|
||||
|
||||
cl(GetDeviceInfo(device_id_cl,
|
||||
CL_DEVICE_MEM_BASE_ADDR_ALIGN,
|
||||
sizeof(align_bits),
|
||||
&align_bits,
|
||||
NULL));
|
||||
|
||||
runtime->cl.align_bytes = align_bits / 8;
|
||||
|
||||
// create device
|
||||
skc_device_create(runtime);
|
||||
@ -149,7 +156,7 @@ skc_runtime_cl_12_create(struct skc_context * const context,
|
||||
// initialize cq pool
|
||||
skc_cq_pool_create(runtime,
|
||||
&runtime->cq_pool,
|
||||
runtime->config->cq_pool.type,
|
||||
runtime->config->cq_pool.cq_props,
|
||||
runtime->config->cq_pool.size);
|
||||
|
||||
// acquire in-order cq
|
||||
@ -176,7 +183,7 @@ skc_runtime_cl_12_create(struct skc_context * const context,
|
||||
|
||||
context->yield = skc_runtime_yield;
|
||||
context->wait = skc_runtime_wait;
|
||||
|
||||
|
||||
context->path_builder = skc_path_builder_cl_12_create;
|
||||
context->path_retain = skc_runtime_path_host_retain;
|
||||
context->path_release = skc_runtime_path_host_release;
|
||||
@ -189,7 +196,7 @@ skc_runtime_cl_12_create(struct skc_context * const context,
|
||||
|
||||
context->composition = skc_composition_cl_12_create;
|
||||
context->styling = skc_styling_cl_12_create;
|
||||
|
||||
|
||||
context->surface = skc_surface_cl_12_create;
|
||||
|
||||
// block on pool creation
|
||||
@ -198,7 +205,7 @@ skc_runtime_cl_12_create(struct skc_context * const context,
|
||||
// dispose of in-order cq
|
||||
skc_runtime_release_cq_in_order(runtime,cq);
|
||||
|
||||
return err;
|
||||
return SKC_ERR_SUCCESS;
|
||||
};
|
||||
|
||||
//
|
||||
@ -227,7 +234,7 @@ skc_runtime_cl_12_dispose(struct skc_context * const context)
|
||||
skc_block_pool_dispose(context->runtime);
|
||||
|
||||
// skc_handle_pool_dispose(context->runtime);
|
||||
|
||||
|
||||
return SKC_ERR_SUCCESS;
|
||||
}
|
||||
|
||||
@ -253,12 +260,12 @@ skc_runtime_cl_12_debug(struct skc_context * const context)
|
||||
return;
|
||||
|
||||
QueryPerformanceCounter(&EndingTime);
|
||||
|
||||
|
||||
LARGE_INTEGER ElapsedMicroseconds, Frequency;
|
||||
|
||||
ElapsedMicroseconds.QuadPart = EndingTime.QuadPart - StartingTime.QuadPart;
|
||||
|
||||
QueryPerformanceFrequency(&Frequency);
|
||||
QueryPerformanceFrequency(&Frequency);
|
||||
|
||||
double const msecs_total = 1000.0 * ElapsedMicroseconds.QuadPart / Frequency.QuadPart;
|
||||
double const msecs_frame = msecs_total / SKC_FRAMES;
|
||||
@ -268,7 +275,7 @@ skc_runtime_cl_12_debug(struct skc_context * const context)
|
||||
#endif
|
||||
|
||||
struct skc_runtime * const runtime = context->runtime;
|
||||
|
||||
|
||||
// acquire out-of-order cq
|
||||
cl_command_queue cq = skc_runtime_acquire_cq_in_order(runtime);
|
||||
|
||||
@ -311,4 +318,3 @@ skc_runtime_cl_12_debug(struct skc_context * const context)
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
|
@ -12,8 +12,8 @@
|
||||
//
|
||||
//
|
||||
|
||||
#include "skc.h"
|
||||
#include "runtime.h"
|
||||
#include "runtime_cl.h"
|
||||
#include "cq_pool_cl.h"
|
||||
#include "handle_pool_cl_12.h"
|
||||
#include "block_pool_cl_12.h"
|
||||
@ -31,7 +31,11 @@ struct skc_runtime
|
||||
//
|
||||
// state visible to device
|
||||
//
|
||||
struct skc_runtime_cl cl;
|
||||
struct {
|
||||
cl_context context;
|
||||
cl_device_id device_id;
|
||||
cl_uint align_bytes;
|
||||
} cl;
|
||||
|
||||
struct {
|
||||
struct skc_allocator_host host;
|
||||
@ -63,9 +67,8 @@ struct skc_runtime
|
||||
|
||||
skc_err
|
||||
skc_runtime_cl_12_create(struct skc_context * const context,
|
||||
char const * const target_platform_substring,
|
||||
char const * const target_device_substring,
|
||||
cl_context_properties context_properties[]);
|
||||
cl_context context_cl,
|
||||
cl_device_id device_id_cl);
|
||||
|
||||
skc_err
|
||||
skc_runtime_cl_12_dispose(struct skc_context * const context);
|
||||
|
@ -69,7 +69,7 @@ float const skc_transform_identity[8] =
|
||||
0.0f, 0.0f // w0 w1 1 <-- always 1
|
||||
};
|
||||
|
||||
float const * const skc_transform_identity_ptr = skc_transform_identity;
|
||||
// float const * const skc_transform_identity_ptr = skc_transform_identity;
|
||||
|
||||
//
|
||||
// DEFAULT RASTER CLIP
|
||||
@ -82,7 +82,7 @@ float const skc_raster_clip_default[4] =
|
||||
+FLT_MAX, +FLT_MAX // upper right corner of bounding box
|
||||
};
|
||||
|
||||
float const * const skc_raster_clip_default_ptr = skc_raster_clip_default;
|
||||
// float const * const skc_raster_clip_default_ptr = skc_raster_clip_default;
|
||||
|
||||
#endif
|
||||
|
||||
|
@ -9,125 +9,18 @@
|
||||
#ifndef SKC_ONCE_SKC
|
||||
#define SKC_ONCE_SKC
|
||||
|
||||
//
|
||||
// FIXME -- get rid of these here
|
||||
//
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#include "skc_styling.h" // FIXME -- skc_styling
|
||||
// #include "skc_err.h"
|
||||
|
||||
//
|
||||
// FIXME -- move errors to an skc prefixed include
|
||||
//
|
||||
|
||||
typedef enum skc_err {
|
||||
|
||||
SKC_ERR_SUCCESS = 0,
|
||||
|
||||
SKC_ERR_API_BASE = 10000,
|
||||
|
||||
SKC_ERR_NOT_IMPLEMENTED = SKC_ERR_API_BASE,
|
||||
|
||||
SKC_ERR_POOL_EMPTY,
|
||||
|
||||
SKC_ERR_CONDVAR_WAIT,
|
||||
|
||||
SKC_ERR_LAYER_ID_INVALID,
|
||||
SKC_ERR_LAYER_NOT_EMPTY,
|
||||
|
||||
SKC_ERR_TRANSFORM_WEAKREF_INVALID,
|
||||
SKC_ERR_STROKE_STYLE_WEAKREF_INVALID,
|
||||
|
||||
SKC_ERR_COMMAND_NOT_READY,
|
||||
SKC_ERR_COMMAND_NOT_COMPLETED,
|
||||
SKC_ERR_COMMAND_NOT_STARTED,
|
||||
|
||||
SKC_ERR_COMMAND_NOT_READY_OR_COMPLETED,
|
||||
|
||||
SKC_ERR_COMPOSITION_SEALED,
|
||||
SKC_ERR_STYLING_SEALED,
|
||||
|
||||
SKC_ERR_HANDLE_INVALID,
|
||||
SKC_ERR_HANDLE_OVERFLOW,
|
||||
|
||||
SKC_ERR_COUNT
|
||||
|
||||
} skc_err;
|
||||
|
||||
//
|
||||
// SPINEL TYPES
|
||||
//
|
||||
|
||||
typedef struct skc_context * skc_context_t;
|
||||
typedef struct skc_path_builder * skc_path_builder_t;
|
||||
typedef struct skc_raster_builder * skc_raster_builder_t;
|
||||
|
||||
typedef struct skc_composition * skc_composition_t;
|
||||
typedef struct skc_styling * skc_styling_t;
|
||||
|
||||
typedef struct skc_surface * skc_surface_t;
|
||||
|
||||
#if 0
|
||||
typedef struct skc_interop * skc_interop_t;
|
||||
typedef uint32_t skc_interop_surface_t;
|
||||
#endif
|
||||
|
||||
typedef uint32_t skc_path_t;
|
||||
typedef uint32_t skc_raster_t;
|
||||
|
||||
typedef uint32_t skc_layer_id;
|
||||
typedef uint32_t skc_group_id;
|
||||
|
||||
typedef uint32_t skc_styling_cmd_t;
|
||||
|
||||
typedef uint64_t skc_weakref_t;
|
||||
typedef skc_weakref_t skc_transform_weakref_t;
|
||||
typedef skc_weakref_t skc_raster_clip_weakref_t;
|
||||
|
||||
//
|
||||
// FIXME -- bury all of this
|
||||
//
|
||||
|
||||
#define SKC_STYLING_CMDS(...) _countof(__VA_ARGS__),__VA_ARGS__
|
||||
#define SKC_GROUP_IDS(...) _countof(__VA_ARGS__),__VA_ARGS__
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#define SKC_PATH_INVALID UINT32_MAX
|
||||
#define SKC_RASTER_INVALID UINT32_MAX
|
||||
#define SKC_WEAKREF_INVALID UINT64_MAX
|
||||
|
||||
//
|
||||
// TRANSFORM LAYOUT: { sx shx tx shy sy ty w0 w1 }
|
||||
//
|
||||
|
||||
extern float const * const skc_transform_identity_ptr; // { 1, 0, 0, 0, 1, 0, 0, 0 }
|
||||
|
||||
//
|
||||
// RASTER CLIP LAYOUT: { x0, y0, x1, y1 }
|
||||
//
|
||||
|
||||
extern float const * const skc_raster_clip_default_ptr;
|
||||
#include "skc_err.h"
|
||||
#include "skc_types.h"
|
||||
#include "skc_styling.h"
|
||||
|
||||
//
|
||||
// CONTEXT
|
||||
//
|
||||
|
||||
skc_err
|
||||
skc_context_create(skc_context_t * context,
|
||||
char const * target_platform_substring,
|
||||
char const * target_device_substring,
|
||||
intptr_t context_properties[]);
|
||||
|
||||
skc_err
|
||||
skc_context_retain(skc_context_t context);
|
||||
|
||||
@ -137,31 +30,6 @@ skc_context_release(skc_context_t context);
|
||||
skc_err
|
||||
skc_context_reset(skc_context_t context);
|
||||
|
||||
//
|
||||
// COORDINATED EXTERNAL OPERATIONS
|
||||
//
|
||||
|
||||
/*
|
||||
Examples include:
|
||||
|
||||
- Transforming an intermediate layer with a blur, sharpen, rotation or scaling kernel.
|
||||
- Subpixel antialiasing using neighboring pixel color and coverage data.
|
||||
- Performing a blit from one region to another region on a surface.
|
||||
- Blitting from one surface to another.
|
||||
- Loading and processing from one region and storing to another region.
|
||||
- Rendezvousing with an external pipeline.
|
||||
*/
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
bool
|
||||
skc_context_yield(skc_context_t context);
|
||||
|
||||
void
|
||||
skc_context_wait(skc_context_t context);
|
||||
|
||||
//
|
||||
// PATH BUILDER
|
||||
//
|
||||
@ -485,6 +353,31 @@ skc_surface_render(skc_surface_t surface,
|
||||
void * data,
|
||||
void * fb); // FIXME FIXME
|
||||
|
||||
//
|
||||
// COORDINATED EXTERNAL OPERATIONS
|
||||
//
|
||||
// Examples include:
|
||||
//
|
||||
// - Transforming an intermediate layer with a blur, sharpen, rotation or scaling kernel.
|
||||
// - Subpixel antialiasing using neighboring pixel color and coverage data.
|
||||
// - Performing a blit from one region to another region on a surface.
|
||||
// - Blitting from one surface to another.
|
||||
// - Loading and processing from one region and storing to another region.
|
||||
// - Rendezvousing with an external pipeline.
|
||||
//
|
||||
|
||||
// FORTHCOMING...
|
||||
|
||||
//
|
||||
// SCHEDULER
|
||||
//
|
||||
|
||||
bool
|
||||
skc_context_yield(skc_context_t context);
|
||||
|
||||
void
|
||||
skc_context_wait(skc_context_t context);
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
70
src/compute/skc/skc_create_cl.h
Normal file
70
src/compute/skc/skc_create_cl.h
Normal file
@ -0,0 +1,70 @@
|
||||
/*
|
||||
* Copyright 2017 Google Inc.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license that can
|
||||
* be found in the LICENSE file.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef SKC_ONCE_SKC_CREATE_CL
|
||||
#define SKC_ONCE_SKC_CREATE_CL
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#ifdef __APPLE__
|
||||
#include "OpenCL/opencl.h"
|
||||
#else
|
||||
#include "CL/opencl.h"
|
||||
#endif
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#include "skc.h"
|
||||
|
||||
//
|
||||
// CONTEXT CREATION
|
||||
//
|
||||
|
||||
skc_err
|
||||
skc_context_create_cl(skc_context_t * context,
|
||||
cl_context context_cl,
|
||||
cl_device_id device_id_cl);
|
||||
|
||||
//
|
||||
// FIXME -- SPECIALIZE SURFACE RENDER
|
||||
//
|
||||
|
||||
#if 0
|
||||
|
||||
//
|
||||
// SURFACE RENDER
|
||||
//
|
||||
|
||||
typedef void (*skc_surface_render_pfn_notify)(skc_surface_t surface,
|
||||
skc_styling_t styling,
|
||||
skc_composition_t composition,
|
||||
void * data);
|
||||
skc_err
|
||||
skc_surface_render(skc_surface_t surface,
|
||||
uint32_t const clip[4],
|
||||
skc_styling_t styling,
|
||||
skc_composition_t composition,
|
||||
skc_surface_render_pfn_notify notify,
|
||||
void * data,
|
||||
void * fb); // FIXME FIXME
|
||||
|
||||
#endif
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#endif
|
||||
|
||||
//
|
||||
//
|
||||
//
|
58
src/compute/skc/skc_err.h
Normal file
58
src/compute/skc/skc_err.h
Normal file
@ -0,0 +1,58 @@
|
||||
/*
|
||||
* Copyright 2018 Google Inc.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license that can
|
||||
* be found in the LICENSE file.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef SKC_ONCE_SKC_ERR
|
||||
#define SKC_ONCE_SKC_ERR
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
typedef enum skc_err {
|
||||
|
||||
SKC_ERR_SUCCESS = 0,
|
||||
|
||||
SKC_ERR_API_BASE = 10000,
|
||||
|
||||
SKC_ERR_NOT_IMPLEMENTED = SKC_ERR_API_BASE,
|
||||
|
||||
SKC_ERR_POOL_EMPTY,
|
||||
|
||||
SKC_ERR_CONDVAR_WAIT,
|
||||
|
||||
SKC_ERR_LAYER_ID_INVALID,
|
||||
SKC_ERR_LAYER_NOT_EMPTY,
|
||||
|
||||
SKC_ERR_TRANSFORM_WEAKREF_INVALID,
|
||||
SKC_ERR_STROKE_STYLE_WEAKREF_INVALID,
|
||||
|
||||
SKC_ERR_COMMAND_NOT_READY,
|
||||
SKC_ERR_COMMAND_NOT_COMPLETED,
|
||||
SKC_ERR_COMMAND_NOT_STARTED,
|
||||
|
||||
SKC_ERR_COMMAND_NOT_READY_OR_COMPLETED,
|
||||
|
||||
SKC_ERR_COMPOSITION_SEALED,
|
||||
SKC_ERR_STYLING_SEALED,
|
||||
|
||||
SKC_ERR_HANDLE_INVALID,
|
||||
SKC_ERR_HANDLE_OVERFLOW,
|
||||
|
||||
SKC_ERR_COUNT
|
||||
|
||||
} skc_err;
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#endif
|
||||
|
||||
//
|
||||
//
|
||||
//
|
@ -79,6 +79,13 @@ typedef enum skc_styling_gradient_type_e {
|
||||
|
||||
} skc_styling_gradient_type_e;
|
||||
|
||||
//
|
||||
// FIXME -- bury all of this once we stabilize styling
|
||||
//
|
||||
|
||||
#define SKC_STYLING_CMDS(...) _countof(__VA_ARGS__),__VA_ARGS__
|
||||
#define SKC_GROUP_IDS(...) _countof(__VA_ARGS__),__VA_ARGS__
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
73
src/compute/skc/skc_types.h
Normal file
73
src/compute/skc/skc_types.h
Normal file
@ -0,0 +1,73 @@
|
||||
/*
|
||||
* Copyright 2018 Google Inc.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license that can
|
||||
* be found in the LICENSE file.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef SKC_ONCE_SKC_TYPES
|
||||
#define SKC_ONCE_SKC_TYPES
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
typedef struct skc_context * skc_context_t;
|
||||
typedef struct skc_path_builder * skc_path_builder_t;
|
||||
typedef struct skc_raster_builder * skc_raster_builder_t;
|
||||
|
||||
typedef struct skc_composition * skc_composition_t;
|
||||
typedef struct skc_styling * skc_styling_t;
|
||||
|
||||
typedef struct skc_surface * skc_surface_t;
|
||||
|
||||
typedef uint32_t skc_path_t;
|
||||
typedef uint32_t skc_raster_t;
|
||||
|
||||
typedef uint32_t skc_layer_id;
|
||||
typedef uint32_t skc_group_id;
|
||||
|
||||
typedef uint32_t skc_styling_cmd_t;
|
||||
|
||||
typedef uint64_t skc_weakref_t;
|
||||
typedef skc_weakref_t skc_transform_weakref_t;
|
||||
typedef skc_weakref_t skc_raster_clip_weakref_t;
|
||||
|
||||
#if 0
|
||||
typedef struct skc_interop * skc_interop_t;
|
||||
typedef uint32_t skc_interop_surface_t;
|
||||
#endif
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#define SKC_PATH_INVALID UINT32_MAX
|
||||
#define SKC_RASTER_INVALID UINT32_MAX
|
||||
#define SKC_WEAKREF_INVALID UINT64_MAX
|
||||
|
||||
//
|
||||
// TRANSFORM LAYOUT: { sx shx tx shy sy ty w0 w1 }
|
||||
//
|
||||
|
||||
//
|
||||
// RASTER CLIP LAYOUT: { x0, y0, x1, y1 }
|
||||
//
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
#endif
|
||||
|
||||
//
|
||||
//
|
||||
//
|
Loading…
Reference in New Issue
Block a user