OpenGL interop is simplified when the cl_context is not created by SKC.

Added GEN9 HotSort kernels so the hs_cl_gen9 lib and hs_bench_cl app can be built.

Bug: skia:
Change-Id: I5b21d33499a6ec3524f39a51443981802b722c8b
Reviewed-on: https://skia-review.googlesource.com/136608
Commit-Queue: Allan MacKinnon <allanmac@google.com>
Reviewed-by: Mike Reed <reed@google.com>
Reviewed-by: Mike Klein <mtklein@google.com>
Allan MacKinnon 2018-06-21 09:09:56 -07:00 committed by Skia Commit-Bot
parent 867ce8fc8e
commit c110e7941e
36 changed files with 20693 additions and 10527 deletions
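
The heart of the change is visible in the demo and context-creation diffs below: the application now locates the platform and device, builds the GL-sharing cl_context itself, and hands both to the new skc_context_create_cl() entry point instead of letting SKC construct the context internally. Condensed from those diffs (Windows/WGL interop path; cl() and cl_ok() are the error-check macros from assert_cl.h, and the GL handles are assumed to come from the current WGL context):

// The application -- not SKC -- owns the cl_context.
cl_platform_id platform_id_cl;
cl_device_id   device_id_cl;

cl(FindIdsByName("Intel","Graphics",
                 &platform_id_cl,&device_id_cl,
                 0,NULL,NULL,
                 true));

cl_context_properties context_properties_cl[] = {
  CL_CONTEXT_PLATFORM, (cl_context_properties)platform_id_cl,
  CL_GL_CONTEXT_KHR,   (cl_context_properties)wglGetCurrentContext(), // assumed source of hGLRC
  CL_WGL_HDC_KHR,      (cl_context_properties)wglGetCurrentDC(),
  0
};

cl_int     cl_err;
cl_context context_cl = clCreateContext(context_properties_cl,
                                        1,&device_id_cl,
                                        NULL,NULL,
                                        &cl_err); cl_ok(cl_err);

// SKC simply wraps the externally created context and device.
skc_context_t context;
skc_err       err = skc_context_create_cl(&context,context_cl,device_id_cl);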

View File

@ -20,7 +20,7 @@
#include "find_cl.h"
#include "assert_cl.h"
#include "macros.h"
#include "../macros.h"
//
// search platforms and devices for a match

View File

@ -38,9 +38,9 @@
//
#if defined(_MSC_VER)
#define ALLOCA(n) _alloca(n)
#define ALLOCA(n) _alloca(n)
#else
#define ALLOCA(n) alloca(n)
#define ALLOCA(n) alloca(n)
#endif
//
//

File diff suppressed because it is too large

View File

@ -0,0 +1,122 @@
//
// Copyright 2016 Google Inc.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//
#ifndef HS_CL_ONCE
#define HS_CL_ONCE
#define HS_LANES_PER_WARP_LOG2 3
#define HS_LANES_PER_WARP (1 << HS_LANES_PER_WARP_LOG2)
#define HS_BS_WARPS 16
#define HS_BS_WARPS_LOG2_RU 4
#define HS_BC_WARPS_LOG2_MAX 4
#define HS_FM_BLOCKS_LOG2_MIN 1
#define HS_HM_BLOCKS_LOG2_MIN 1
#define HS_KEYS_PER_LANE 16
#define HS_REG_LAST(c) c##16
#define HS_KEY_WORDS 2
#define HS_KEY_TYPE ulong
#define HS_EMPTY
#define HS_SLAB_ROWS() \
HS_SLAB_ROW( 1, 0 ) \
HS_SLAB_ROW( 2, 1 ) \
HS_SLAB_ROW( 3, 2 ) \
HS_SLAB_ROW( 4, 3 ) \
HS_SLAB_ROW( 5, 4 ) \
HS_SLAB_ROW( 6, 5 ) \
HS_SLAB_ROW( 7, 6 ) \
HS_SLAB_ROW( 8, 7 ) \
HS_SLAB_ROW( 9, 8 ) \
HS_SLAB_ROW( 10, 9 ) \
HS_SLAB_ROW( 11, 10 ) \
HS_SLAB_ROW( 12, 11 ) \
HS_SLAB_ROW( 13, 12 ) \
HS_SLAB_ROW( 14, 13 ) \
HS_SLAB_ROW( 15, 14 ) \
HS_SLAB_ROW( 16, 15 ) \
HS_EMPTY
#define HS_TRANSPOSE_SLAB() \
HS_TRANSPOSE_STAGE( 1 ) \
HS_TRANSPOSE_STAGE( 2 ) \
HS_TRANSPOSE_STAGE( 3 ) \
HS_TRANSPOSE_BLEND( r, s, 1, 2, 1 ) \
HS_TRANSPOSE_BLEND( r, s, 1, 4, 3 ) \
HS_TRANSPOSE_BLEND( r, s, 1, 6, 5 ) \
HS_TRANSPOSE_BLEND( r, s, 1, 8, 7 ) \
HS_TRANSPOSE_BLEND( r, s, 1, 10, 9 ) \
HS_TRANSPOSE_BLEND( r, s, 1, 12, 11 ) \
HS_TRANSPOSE_BLEND( r, s, 1, 14, 13 ) \
HS_TRANSPOSE_BLEND( r, s, 1, 16, 15 ) \
HS_TRANSPOSE_BLEND( s, t, 2, 3, 1 ) \
HS_TRANSPOSE_BLEND( s, t, 2, 4, 2 ) \
HS_TRANSPOSE_BLEND( s, t, 2, 7, 5 ) \
HS_TRANSPOSE_BLEND( s, t, 2, 8, 6 ) \
HS_TRANSPOSE_BLEND( s, t, 2, 11, 9 ) \
HS_TRANSPOSE_BLEND( s, t, 2, 12, 10 ) \
HS_TRANSPOSE_BLEND( s, t, 2, 15, 13 ) \
HS_TRANSPOSE_BLEND( s, t, 2, 16, 14 ) \
HS_TRANSPOSE_BLEND( t, u, 3, 5, 1 ) \
HS_TRANSPOSE_BLEND( t, u, 3, 6, 2 ) \
HS_TRANSPOSE_BLEND( t, u, 3, 7, 3 ) \
HS_TRANSPOSE_BLEND( t, u, 3, 8, 4 ) \
HS_TRANSPOSE_BLEND( t, u, 3, 13, 9 ) \
HS_TRANSPOSE_BLEND( t, u, 3, 14, 10 ) \
HS_TRANSPOSE_BLEND( t, u, 3, 15, 11 ) \
HS_TRANSPOSE_BLEND( t, u, 3, 16, 12 ) \
HS_TRANSPOSE_REMAP( u, 1, 1 ) \
HS_TRANSPOSE_REMAP( u, 2, 3 ) \
HS_TRANSPOSE_REMAP( u, 3, 5 ) \
HS_TRANSPOSE_REMAP( u, 4, 7 ) \
HS_TRANSPOSE_REMAP( u, 5, 9 ) \
HS_TRANSPOSE_REMAP( u, 6, 11 ) \
HS_TRANSPOSE_REMAP( u, 7, 13 ) \
HS_TRANSPOSE_REMAP( u, 8, 15 ) \
HS_TRANSPOSE_REMAP( u, 9, 2 ) \
HS_TRANSPOSE_REMAP( u, 10, 4 ) \
HS_TRANSPOSE_REMAP( u, 11, 6 ) \
HS_TRANSPOSE_REMAP( u, 12, 8 ) \
HS_TRANSPOSE_REMAP( u, 13, 10 ) \
HS_TRANSPOSE_REMAP( u, 14, 12 ) \
HS_TRANSPOSE_REMAP( u, 15, 14 ) \
HS_TRANSPOSE_REMAP( u, 16, 16 ) \
HS_EMPTY
#define HS_FM_BLOCKS_LOG2_1 0
#define HS_FM_BLOCKS_LOG2_2 1
#define HS_FM_BLOCKS_LOG2_3 2
#define HS_FM_BLOCKS_LOG2_4 3
#define HS_FM_BLOCKS_LOG2_5 4
#define HS_FM_BLOCKS_LOG2_6 5
#define HS_HM_BLOCKS_LOG2_5 0
#define HS_FM_BLOCKS_LOG2_7 6
#define HS_HM_BLOCKS_LOG2_6 1
#define HS_FM_BLOCKS_LOG2_8 7
#define HS_HM_BLOCKS_LOG2_7 2
#define HS_FM_BLOCKS_LOG2_9 8
#define HS_HM_BLOCKS_LOG2_8 3
#define HS_FM_BLOCKS_LOG2_10 9
#define HS_HM_BLOCKS_LOG2_9 4
#define HS_FM_BLOCKS_LOG2_11 10
#define HS_HM_BLOCKS_LOG2_10 5
#define HS_FM_BLOCKS_LOG2_12 11
#define HS_HM_BLOCKS_LOG2_11 6
#define HS_FM_BLOCKS_LOG2_13 12
#define HS_HM_BLOCKS_LOG2_12 7
#define HS_FM_BLOCKS_LOG2_14 13
#define HS_HM_BLOCKS_LOG2_13 8
#define HS_FM_BLOCKS_LOG2_15 14
#define HS_HM_BLOCKS_LOG2_14 9
#define HS_FM_BLOCKS_LOG2_16 15
#define HS_HM_BLOCKS_LOG2_15 10
#endif
//
//
//
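
In this GEN9 configuration HotSort operates on 64-bit (ulong) keys arranged in "slabs" of 8 lanes (HS_LANES_PER_WARP) by 16 keys per lane (HS_KEYS_PER_LANE). A small host-side sketch, purely for illustration, of the slab geometry these macros imply:

// Illustration only: slab geometry implied by the GEN9 HotSort config above.
#include <stdio.h>

#define HS_LANES_PER_WARP_LOG2 3
#define HS_LANES_PER_WARP      (1 << HS_LANES_PER_WARP_LOG2)   // 8 lanes per subgroup
#define HS_KEYS_PER_LANE       16                              // 16 keys held per lane
#define HS_KEY_WORDS           2                               // 2 x 32-bit words = ulong keys

int main(void)
{
  int const keys_per_slab  = HS_LANES_PER_WARP * HS_KEYS_PER_LANE;  // 128 keys
  int const bytes_per_slab = keys_per_slab * HS_KEY_WORDS * 4;      // 1024 bytes

  printf("slab = %d keys, %d bytes\n",keys_per_slab,bytes_per_slab);

  return 0;
}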

View File

@ -0,0 +1,199 @@
//
// Copyright 2016 Google Inc.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//
#ifndef HS_CL_MACROS_ONCE
#define HS_CL_MACROS_ONCE
//
//
//
#include "hs_cl.h"
//
// Inter-lane compare exchange
//
// default
#define HS_CMP_XCHG_V0(a,b) \
{ \
HS_KEY_TYPE const t = min(a,b); \
b = max(a,b); \
a = t; \
}
// super slow
#define HS_CMP_XCHG_V1(a,b) \
{ \
HS_KEY_TYPE const tmp = a; \
a = (a < b) ? a : b; \
b ^= a ^ tmp; \
}
// best
#define HS_CMP_XCHG_V2(a,b) \
if (a >= b) { \
HS_KEY_TYPE const t = a; \
a = b; \
b = t; \
}
// good
#define HS_CMP_XCHG_V3(a,b) \
{ \
int const ge = a >= b; \
HS_KEY_TYPE const t = a; \
a = ge ? b : a; \
b = ge ? t : b; \
}
//
//
//
#if (HS_KEY_WORDS == 1)
#define HS_CMP_XCHG(a,b) HS_CMP_XCHG_V0(a,b)
#elif (HS_KEY_WORDS == 2)
#define HS_CMP_XCHG(a,b) HS_CMP_XCHG_V2(a,b)
#endif
//
// Conditional inter-subgroup flip/half compare exchange
//
#define HS_CMP_FLIP(i,a,b) \
{ \
HS_KEY_TYPE const ta = intel_sub_group_shuffle(a,flip_lane_idx); \
HS_KEY_TYPE const tb = intel_sub_group_shuffle(b,flip_lane_idx); \
a = HS_COND_MIN_MAX(t_lt,a,tb); \
b = HS_COND_MIN_MAX(t_lt,b,ta); \
}
#define HS_CMP_HALF(i,a) \
{ \
HS_KEY_TYPE const ta = intel_sub_group_shuffle(a,half_lane_idx); \
a = HS_COND_MIN_MAX(t_lt,a,ta); \
}
//
// The device's comparison operator might return what we actually
// want. For example, it appears GEN 'cmp' returns {true:-1,false:0}.
//
#define HS_CMP_IS_ZERO_ONE
#ifdef HS_CMP_IS_ZERO_ONE
// OpenCL requires a {true: +1, false: 0} scalar result
// (a < b) -> { +1, 0 } -> NEGATE -> { 0, 0xFFFFFFFF }
#define HS_LTE_TO_MASK(a,b) (HS_KEY_TYPE)(-(a <= b))
#define HS_CMP_TO_MASK(a) (HS_KEY_TYPE)(-a)
#else
// However, OpenCL requires { -1, 0 } for vectors
// (a < b) -> { 0xFFFFFFFF, 0 }
#define HS_LTE_TO_MASK(a,b) (a <= b) // FIXME for uint64
#define HS_CMP_TO_MASK(a) (a)
#endif
//
// The flip/half comparisons rely on a "conditional min/max":
//
// - if the flag is false, return min(a,b)
// - otherwise, return max(a,b)
//
// What's a little surprising is that sequence (1) is faster than (2)
// for 32-bit keys.
//
// I suspect either a code generation problem or that the sequence
// maps well to the GEN instruction set.
//
// We mostly care about 64-bit keys and unsurprisingly sequence (2) is
// fastest for this wider type.
//
// this is what you would normally use
#define HS_COND_MIN_MAX_V0(lt,a,b) ((a <= b) ^ lt) ? b : a
// this seems to be faster for 32-bit keys
#define HS_COND_MIN_MAX_V1(lt,a,b) (lt ? b : a) ^ ((a ^ b) & HS_LTE_TO_MASK(a,b))
//
//
//
#if (HS_KEY_WORDS == 1)
#define HS_COND_MIN_MAX(lt,a,b) HS_COND_MIN_MAX_V1(lt,a,b)
#elif (HS_KEY_WORDS == 2)
#define HS_COND_MIN_MAX(lt,a,b) HS_COND_MIN_MAX_V0(lt,a,b)
#endif
//
// This snarl of macros is for transposing a "slab" of sorted elements
// into linear order.
//
// This can occur as the last step in hs_sort() or via a custom kernel
// that inspects the slab and then transposes and stores it to memory.
//
// The slab format can be inspected more efficiently than a linear
// arrangement.
//
// The prime example is detecting when adjacent keys (in sort order)
// have differing high order bits ("key changes"). The index of each
// change is recorded to an auxiliary array.
//
// A post-processing step like this needs to be able to navigate the
// slab and eventually transpose and store the slab in linear order.
//
#define HS_TRANSPOSE_REG(prefix,row) prefix##row
#define HS_TRANSPOSE_DECL(prefix,row) HS_KEY_TYPE const HS_TRANSPOSE_REG(prefix,row)
#define HS_TRANSPOSE_DELTA(level) (HS_LANES_PER_WARP + (1 << (level-1)))
#define HS_TRANSPOSE_IF(level) ((get_sub_group_local_id() >> (level - 1)) & 1)
#define HS_TRANSPOSE_LL(level) HS_TRANSPOSE_IF(level) ? 0 : HS_TRANSPOSE_DELTA(level)
#define HS_TRANSPOSE_UR(level) HS_TRANSPOSE_IF(level) ? HS_TRANSPOSE_DELTA(level) : 0
#define HS_TRANSPOSE_DELTA_LL(level) delta_ll_##level
#define HS_TRANSPOSE_DELTA_UR(level) delta_ur_##level
#define HS_TRANSPOSE_STAGE(level) \
uint const HS_TRANSPOSE_DELTA_LL(level) = HS_TRANSPOSE_LL(level); \
uint const HS_TRANSPOSE_DELTA_UR(level) = HS_TRANSPOSE_UR(level);
#define HS_TRANSPOSE_BLEND(prefix_prev,prefix_curr,level,row_ll,row_ur) \
HS_TRANSPOSE_DECL(prefix_curr,row_ll) = \
intel_sub_group_shuffle_down(HS_TRANSPOSE_REG(prefix_prev,row_ll), \
HS_TRANSPOSE_REG(prefix_prev,row_ur), \
HS_TRANSPOSE_DELTA_LL(level)); \
HS_TRANSPOSE_DECL(prefix_curr,row_ur) = \
intel_sub_group_shuffle_up(HS_TRANSPOSE_REG(prefix_prev,row_ll), \
HS_TRANSPOSE_REG(prefix_prev,row_ur), \
HS_TRANSPOSE_DELTA_UR(level)); \
// #define HS_TRANSPOSE_LOAD(row) \
// HS_TRANSPOSE_DECL(0,row) = (vout + gmem_idx)[(row-1) << HS_LANES_PER_WARP_LOG2];
#define HS_TRANSPOSE_REMAP(prefix,row_from,row_to) \
(vout + gmem_idx)[(row_to-1) << HS_LANES_PER_WARP_LOG2] = \
HS_TRANSPOSE_REG(prefix,row_from);
//
// undefine these if you want to override
//
#define HS_TRANSPOSE_PREAMBLE()
#define HS_TRANSPOSE_BODY()
//
//
//
#endif
//
//
//
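
For orientation, HS_CMP_XCHG_V2 above -- the variant selected for 64-bit keys -- is just a conditional swap that leaves its two operands in ascending order; it is the basic sorting-network step the HotSort slab kernels are built from. A host-side equivalent, for illustration only:

// Host-side illustration of the HS_CMP_XCHG_V2 semantics: on return, *a <= *b.
typedef unsigned long long hs_key_t;   // stands in for HS_KEY_TYPE (ulong)

static void hs_cmp_xchg(hs_key_t * const a, hs_key_t * const b)
{
  if (*a >= *b)
    {
      hs_key_t const t = *a;

      *a = *b;
      *b = t;
    }
}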

View File

@ -28,23 +28,20 @@
//
skc_err
skc_context_create(skc_context_t * context,
char const * target_platform_substring,
char const * target_device_substring,
intptr_t context_properties[])
skc_context_create_cl(skc_context_t * context,
cl_context context_cl,
cl_device_id device_id_cl)
{
(*context) = malloc(sizeof(**context));
//
// FIXME -- don't directly grab a CL runtime but for now just create
// the CL_12 runtime here
// FIXME -- we'll clean up context creation by platform later. For
// now, just create a CL_12 context.
//
skc_err err;
err = skc_runtime_cl_12_create(*context,
target_platform_substring,
target_device_substring,
context_properties);
err = skc_runtime_cl_12_create(*context,context_cl,device_id_cl);
return err;
}

View File

@ -21,6 +21,11 @@
#include <stdlib.h>
#include <conio.h>
#include "skc_create_cl.h"
#include "common/cl/find_cl.h"
#include "common/cl/assert_cl.h"
#include "svg/svg_doc.h"
#include "svg2skc/svg2skc.h"
#include "svg2skc/transform_stack.h"
@ -49,7 +54,7 @@ skc_runtime_cl_12_debug(struct skc_context * const context);
//
//
static
static
void
is_render_complete(skc_surface_t surface,
skc_styling_t styling,
@ -67,9 +72,9 @@ int
main(int argc, char** argv)
{
//
//
//
if (argc <= 1)
//
if (argc <= 1)
{
fprintf(stderr,"-- missing filename\n");
return EXIT_FAILURE; // no filename
@ -94,6 +99,18 @@ main(int argc, char** argv)
skc_interop_init(&window);
//
// find platform and device by name
//
cl_platform_id platform_id_cl;
cl_device_id device_id_cl;
cl(FindIdsByName("Intel","Graphics",
&platform_id_cl,
&device_id_cl,
0,NULL,NULL,
true));
//
// get GL and device contexts
//
@ -101,22 +118,31 @@ main(int argc, char** argv)
HDC hDC = wglGetCurrentDC();
//
// create the CL context
//
//
cl_context_properties context_properties[] =
cl_context_properties context_properties_cl[] =
{
CL_CONTEXT_PLATFORM, (cl_context_properties)-1,
CL_CONTEXT_PLATFORM, (cl_context_properties)platform_id_cl,
CL_GL_CONTEXT_KHR, (cl_context_properties)hGLRC,
CL_WGL_HDC_KHR, (cl_context_properties)hDC,
0
};
cl_int cl_err;
cl_context context_cl = clCreateContext(context_properties_cl,
1,
&device_id_cl,
NULL,
NULL,
&cl_err); cl_ok(cl_err);
//
// create context
// create SKC context
//
skc_context_t context;
skc_err err = skc_context_create(&context,"Intel","Graphics",context_properties);
skc_err err = skc_context_create_cl(&context,
context_cl,
device_id_cl);
//
// associate
@ -136,14 +162,14 @@ main(int argc, char** argv)
skc_raster_builder_t raster_builder;
err = skc_raster_builder_create(context,&raster_builder);
//
// create a composition
//
skc_composition_t composition;
err = skc_composition_create(context,&composition);
//
// create a styling instance
//
@ -154,7 +180,7 @@ main(int argc, char** argv)
svg_doc_layer_count(svg_doc),
1000,
2 * 1024 * 1024);
//
// create a surface
//
@ -191,7 +217,7 @@ main(int argc, char** argv)
skc_transform_stack_restore(ts,ts_save);
// decode layers -- places rasters
svg_doc_layers_decode(svg_doc,rasters,composition,styling,true/*is_srgb*/);
svg_doc_layers_decode(svg_doc,rasters,composition,styling,true/*is_srgb*/);
// seal the composition
skc_composition_seal(composition);
@ -244,7 +270,7 @@ main(int argc, char** argv)
// unseal the composition
skc_composition_unseal(composition,true);
}
//
// dispose of mundane resources
//

View File

@ -106,7 +106,7 @@ skc_allocator_device_create(struct skc_runtime * const runtime)
&runtime->allocator.device.temp.suballocator,
"DEVICE",
runtime->config->suballocator.device.subbufs,
runtime->cl.base_align,
runtime->cl.align_bytes,
runtime->config->suballocator.device.size);
#ifndef NDEBUG

View File

@ -12,7 +12,6 @@
//
//
#include "runtime_cl.h"
#include "block_pool_cl.h"
//
@ -52,8 +51,8 @@ struct skc_config
union skc_block_pool_size block_pool;
struct {
skc_cq_type_e type;
skc_uint size;
cl_command_queue_properties cq_props;
skc_uint size;
} cq_pool;
struct {

View File

@ -7,17 +7,18 @@
*/
//
//
// squelch OpenCL 1.2 deprecation warning
//
#ifndef NDEBUG
#include <stdio.h>
#ifndef CL_USE_DEPRECATED_OPENCL_1_2_APIS
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
#endif
//
//
//
#include <stdio.h>
#include <string.h>
//
@ -25,6 +26,7 @@
//
#include "runtime_cl_12.h"
#include "common/cl/assert_cl.h"
//
// This implementation is probably excessive.
@ -40,21 +42,77 @@
//
//
void
skc_cq_pool_create(struct skc_runtime * const runtime,
struct skc_cq_pool * const pool,
skc_uint const type,
skc_uint const size)
{
pool->type = type;
pool->size = size + 1; // an empty spot
pool->reads = 0;
pool->writes = size;
pool->cq = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,pool->size * sizeof(*pool->cq));
static
cl_command_queue
skc_runtime_cl_12_create_cq(struct skc_runtime * const runtime,
struct skc_cq_pool * const pool)
{
cl_command_queue cq;
#if 1
//
// <= OpenCL 1.2
//
cl_int cl_err;
cq = clCreateCommandQueue(runtime->cl.context,
runtime->cl.device_id,
pool->cq_props,
&cl_err); cl_ok(cl_err);
#else
if (runtime_cl->version.major < 2)
{
//
// <= OpenCL 1.2
//
cl_int cl_err;
cq = clCreateCommandQueue(runtime_cl->context,
runtime_cl->device_id,
(cl_command_queue_properties)type,
&cl_err); cl_ok(cl_err);
}
else
{
//
// >= OpenCL 2.0
//
cl_int cl_err;
cl_queue_properties const queue_properties[] = {
CL_QUEUE_PROPERTIES,(cl_queue_properties)type,0
};
cq = clCreateCommandQueueWithProperties(runtime_cl->context,
runtime_cl->device_id,
queue_properties,
&cl_err); cl_ok(cl_err);
}
#endif
return cq;
}
//
//
//
void
skc_cq_pool_create(struct skc_runtime * const runtime,
struct skc_cq_pool * const pool,
cl_command_queue_properties const cq_props,
skc_uint const size)
{
pool->size = size + 1; // an empty spot
pool->reads = 0;
pool->writes = size;
pool->cq_props = cq_props;
pool->cq = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,
pool->size * sizeof(*pool->cq));
for (skc_uint ii=0; ii<size; ii++)
pool->cq[ii] = skc_runtime_cl_12_create_cq(runtime,pool);
for (skc_uint ii=0; ii<size; ii++) {
pool->cq[ii] = skc_runtime_cl_create_cq(&runtime->cl,pool->type);
}
pool->cq[size] = NULL;
}
@ -77,7 +135,7 @@ skc_cq_pool_dispose(struct skc_runtime * const runtime,
//
//
static
static
void
skc_cq_pool_write(struct skc_cq_pool * const pool,
cl_command_queue cq)
@ -109,14 +167,14 @@ skc_cq_pool_expand(struct skc_runtime * const runtime,
pool->writes = expand;
for (skc_uint ii=0; ii<expand; ii++)
pool->cq[ii] = skc_runtime_cl_create_cq(&runtime->cl,pool->type);
pool->cq[ii] = skc_runtime_cl_12_create_cq(runtime,pool);
}
//
//
//
static
static
cl_command_queue
skc_cq_pool_read(struct skc_runtime * const runtime,
struct skc_cq_pool * const pool)
@ -141,7 +199,7 @@ skc_runtime_acquire_cq_in_order(struct skc_runtime * const runtime)
}
void
skc_runtime_release_cq_in_order(struct skc_runtime * const runtime,
skc_runtime_release_cq_in_order(struct skc_runtime * const runtime,
cl_command_queue cq)
{
skc_cq_pool_write(&runtime->cq_pool,cq);
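
The command queue pool above is a plain ring buffer: it is created with one spare slot so an empty ring (reads == writes) can be distinguished from a full one, acquiring pops at reads, releasing pushes at writes, and skc_cq_pool_expand() refills the ring with freshly created queues when it runs dry. A minimal sketch of that ring discipline -- the helper names and the modulo wrap-around are illustrative, not lifted from the implementation:

// Illustrative ring-buffer discipline for the cq pool (assumes modulo wrap).
static cl_command_queue
cq_ring_acquire(struct skc_cq_pool * const pool)
{
  return pool->cq[pool->reads++ % pool->size];
}

static void
cq_ring_release(struct skc_cq_pool * const pool,
                cl_command_queue           cq)
{
  pool->cq[pool->writes++ % pool->size] = cq;
}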

View File

@ -20,11 +20,12 @@
struct skc_cq_pool
{
skc_cq_type_e type;
skc_uint size;
skc_uint reads;
skc_uint writes;
cl_command_queue * cq;
cl_command_queue * cq;
cl_command_queue_properties cq_props;
skc_uint size;
skc_uint reads;
skc_uint writes;
};
//
@ -32,10 +33,10 @@ struct skc_cq_pool
//
void
skc_cq_pool_create(struct skc_runtime * const runtime,
struct skc_cq_pool * const pool,
skc_uint const type,
skc_uint const size);
skc_cq_pool_create(struct skc_runtime * const runtime,
struct skc_cq_pool * const pool,
cl_command_queue_properties const cq_props,
skc_uint const size);
void
skc_cq_pool_dispose(struct skc_runtime * const runtime,

View File

@ -77,6 +77,10 @@ cl_kernel
skc_device_acquire_kernel(struct skc_device * const device,
skc_device_kernel_id const type);
void
skc_device_release_kernel(struct skc_device * const device,
cl_kernel kernel);
//
// grid shape can vary greatly by target platform
//

View File

@ -1,64 +1,64 @@
/*
* Copyright 2017 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can
* be found in the LICENSE file.
*
*/
//
//
//
#include "device_cl_12.h"
//
// BEST TO RUN THESE ON AN OUT-OF-ORDER CQ
//
__kernel
SKC_BP_INIT_IDS_KERNEL_ATTRIBS
void
skc_kernel_block_pool_init_ids(__global uint * const ids, uint const bp_size)
{
uint const gid = get_global_id(0);
//
// FIXME -- TUNE FOR ARCH -- evaluate if it's much faster to
// accomplish this with fewer threads and using either IPC and/or
// vector stores -- it should be on certain architectures!
//
//
// initialize pool with sequence
//
if (gid < bp_size)
ids[gid] = gid * SKC_DEVICE_SUBBLOCKS_PER_BLOCK;
}
//
//
//
__kernel
SKC_BP_INIT_ATOMICS_KERNEL_ATTRIBS
void
skc_kernel_block_pool_init_atomics(__global uint * const bp_atomics, uint const bp_size)
{
// the version test is to squelch a bug with the Intel OpenCL CPU
// compiler declaring it supports the cl_intel_subgroups extension
#if defined(cl_intel_subgroups) || defined (cl_khr_subgroups)
uint const tid = get_sub_group_local_id();
#else
uint const tid = get_local_id(0);
#endif
//
// launch two threads and store [ 0, bp_size ]
//
bp_atomics[tid] = tid * bp_size;
}
//
//
//
/*
* Copyright 2017 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can
* be found in the LICENSE file.
*
*/
//
//
//
#include "kernel_cl_12.h"
//
// BEST TO RUN THESE ON AN OUT-OF-ORDER CQ
//
__kernel
SKC_BP_INIT_IDS_KERNEL_ATTRIBS
void
skc_kernel_block_pool_init_ids(__global uint * const ids, uint const bp_size)
{
uint const gid = get_global_id(0);
//
// FIXME -- TUNE FOR ARCH -- evaluate if it's much faster to
// accomplish this with fewer threads and using either IPC and/or
// vector stores -- it should be on certain architectures!
//
//
// initialize pool with sequence
//
if (gid < bp_size)
ids[gid] = gid * SKC_DEVICE_SUBBLOCKS_PER_BLOCK;
}
//
//
//
__kernel
SKC_BP_INIT_ATOMICS_KERNEL_ATTRIBS
void
skc_kernel_block_pool_init_atomics(__global uint * const bp_atomics, uint const bp_size)
{
// the version test is to squelch a bug with the Intel OpenCL CPU
// compiler declaring it supports the cl_intel_subgroups extension
#if defined(cl_intel_subgroups) || defined (cl_khr_subgroups)
uint const tid = get_sub_group_local_id();
#else
uint const tid = get_local_id(0);
#endif
//
// launch two threads and store [ 0, bp_size ]
//
bp_atomics[tid] = tid * bp_size;
}
//
//
//
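
Host-side, these two kernels are one-shot initializers: the ids kernel wants one work-item per block pool entry, while the atomics kernel only needs two work-items to store { 0, bp_size } into the read/write counters. A hedged sketch of how they might be enqueued with the plain OpenCL API -- the kernel and buffer variable names are placeholders, and the global sizes would really be padded to the work-group multiples baked into the *_KERNEL_ATTRIBS:

// Illustrative launches only -- 'k_init_ids', 'k_init_atomics', 'ids_mem',
// 'atomics_mem' and 'cq' are placeholder handles, and 'cq' is assumed to be
// the out-of-order command queue recommended above.
cl_uint const bp_size = 1u << 20; // example block pool entry count

cl(SetKernelArg(k_init_ids,0,sizeof(ids_mem),&ids_mem));
cl(SetKernelArg(k_init_ids,1,sizeof(bp_size),&bp_size));

size_t const global_ids = bp_size; // one work-item per pool id
cl(EnqueueNDRangeKernel(cq,k_init_ids,1,NULL,&global_ids,NULL,0,NULL,NULL));

cl(SetKernelArg(k_init_atomics,0,sizeof(atomics_mem),&atomics_mem));
cl(SetKernelArg(k_init_atomics,1,sizeof(bp_size),&bp_size));

size_t const global_atomics = 2; // two work-items store { 0, bp_size }
cl(EnqueueNDRangeKernel(cq,k_init_atomics,1,NULL,&global_atomics,NULL,0,NULL,NULL));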

View File

@ -19,6 +19,7 @@
#include "config_cl.h"
#include "runtime_cl_12.h"
#include "kernel_cl_12.h"
#include "device_cl_12.h"
#include "hs/cl/hs_cl_launcher.h"
@ -124,9 +125,9 @@ struct skc_config const config =
.cq_pool = {
#ifndef NDEBUG
.type = SKC_CQ_TYPE_IN_ORDER_PROFILING,
.cq_props = CL_QUEUE_PROFILING_ENABLE,
#else
.type = 0,
.cq_props = 0,
#endif
.size = 8
},
@ -841,6 +842,14 @@ skc_device_acquire_kernel(struct skc_device * const device,
return kernel;
}
void
skc_device_release_kernel(struct skc_device * const device,
cl_kernel kernel)
{
cl(ReleaseKernel(kernel));
}
//
// INITIALIZE KERNEL ARGS
//
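
The newly added skc_device_release_kernel() pairs with the existing skc_device_acquire_kernel(), so callers can bracket a kernel's lifetime acquire/release style. Illustrative usage (the kernel id value here is a placeholder):

// Acquire, use, and release a kernel instance.
cl_kernel kernel = skc_device_acquire_kernel(device,kernel_id /* some skc_device_kernel_id */);

// ... clSetKernelArg() / clEnqueueNDRangeKernel() on an acquired cq ...

skc_device_release_kernel(device,kernel); // wraps clReleaseKernel()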

View File

@ -1,309 +1,309 @@
/*
* Copyright 2017 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can
* be found in the LICENSE file.
*
*/
//
//
//
#include "block.h"
#include "path.h"
#include "common.h"
#include "atomic_cl.h"
#include "raster_builder_cl_12.h"
#include "device_cl_12.h"
//
//
//
#define SKC_FILLS_EXPAND_SUBGROUP_SIZE_MASK (SKC_FILLS_EXPAND_SUBGROUP_SIZE - 1)
#define SKC_FILLS_EXPAND_ELEMS_PER_BLOCK (SKC_DEVICE_BLOCK_WORDS / SKC_FILLS_EXPAND_ELEM_WORDS)
#define SKC_FILLS_EXPAND_ELEMS_PER_SUBBLOCK (SKC_DEVICE_SUBBLOCK_WORDS / SKC_FILLS_EXPAND_ELEM_WORDS)
#define SKC_FILLS_EXPAND_ELEMS_PER_THREAD (SKC_FILLS_EXPAND_ELEMS_PER_BLOCK / SKC_FILLS_EXPAND_SUBGROUP_SIZE)
//
//
//
#define SKC_FILLS_EXPAND_X (SKC_DEVICE_BLOCK_WORDS / SKC_FILLS_EXPAND_SUBGROUP_SIZE)
//
//
//
#if ( SKC_FILLS_EXPAND_X == 1 )
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_1()
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 0
#elif ( SKC_FILLS_EXPAND_X == 2 )
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_2()
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 1
#elif ( SKC_FILLS_EXPAND_X == 4 )
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_4()
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 3
#elif ( SKC_FILLS_EXPAND_X == 8 )
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_8()
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 7
#elif ( SKC_FILLS_EXPAND_X == 16)
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_16()
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 15
#else
#error "MISSING SKC_FILLS_EXPAND_X"
#endif
//
// Fill and rasterize cmds only differ in their first word semantics
//
union skc_cmd_expand
{
union skc_cmd_fill fill;
union skc_cmd_rasterize rasterize;
};
//
//
//
union skc_path_elem
{
skc_uint u32;
skc_float f32;
};
//
// COMPILE-TIME AND RUN-TIME MACROS
//
#define SKC_ELEM_IN_RANGE(X,I) \
(skc_bool)SKC_GTE_MACRO(X,(I ) * SKC_FILLS_EXPAND_SUBGROUP_SIZE) && \
(skc_bool)SKC_LT_MACRO(X,(I+1) * SKC_FILLS_EXPAND_SUBGROUP_SIZE)
#define SKC_ELEM_GTE(X,I) \
SKC_GTE_MACRO(X,(I+1) * SKC_FILLS_EXPAND_SUBGROUP_SIZE)
//
// FIXME -- slate these for replacement
//
#define SKC_BROADCAST(E,S,I) \
sub_group_broadcast(E##I.u32,S - I * SKC_FILLS_EXPAND_SUBGROUP_SIZE)
#define SKC_BROADCAST_LAST_HELPER(E,I) \
sub_group_broadcast(E##I.u32,SKC_FILLS_EXPAND_SUBGROUP_SIZE - 1)
#define SKC_BROADCAST_LAST(E,I) \
SKC_BROADCAST_LAST_HELPER(E,I)
//
//
//
void
skc_cmds_out_append(__global union skc_cmd_rasterize * const cmds_out,
skc_uint * const out_idx,
union skc_cmd_expand * const cmd,
union skc_path_elem const e,
skc_uint const e_idx)
{
//
// FIXME -- we can append a large number of nodeword indices to a
// local SMEM queue and flush when full. It may or may not be a
// performance win on some architectures.
//
skc_bool const is_elem = SKC_TAGGED_BLOCK_ID_GET_TAG(e.u32) < SKC_BLOCK_ID_TAG_PATH_NEXT;
skc_uint const offset = sub_group_scan_inclusive_add(is_elem ? 1 : 0);
cmd->rasterize.nodeword = e_idx;
if (is_elem) {
cmds_out[*out_idx + offset] = cmd->rasterize;
}
*out_idx += sub_group_broadcast(offset,SKC_FILLS_EXPAND_SUBGROUP_SIZE-1);
}
//
//
//
__kernel
SKC_FILLS_EXPAND_KERNEL_ATTRIBS
void
skc_kernel_fills_expand(__global union skc_path_elem const * const blocks,
__global skc_uint volatile * const atomics,
__global skc_block_id_t const * const map,
__global union skc_cmd_fill const * const cmds_in,
__global union skc_cmd_rasterize * const cmds_out)
{
//
// Need to harmonize the way we determine a subgroup's id. In this
// kernel it's not as important because no local memory is being
// used. Although the device/mask calc to determine subgroup and
// lanes is still proper, we might want to make it clearer that
// we're working with subgroups by using the subgroup API.
//
// every subgroup/simd that will work on the block loads the same command
//
#if (__OPENCL_VERSION__ < 200)
skc_uint const cmd_stride = get_num_sub_groups();
#else
skc_uint const cmd_stride = get_enqueued_num_sub_groups(); // 2.0 supports non-uniform workgroups
#endif
skc_uint cmd_idx = get_group_id(0) * cmd_stride + get_sub_group_id();
// load fill command -- we reuse y component
union skc_cmd_expand cmd = { .fill = cmds_in[cmd_idx] };
// get the path header block from the map
skc_block_id_t id = map[cmd.fill.path];
#if 0
if (get_sub_group_local_id() == 0)
printf("expand[%u] = %u\n",cmd_idx,id);
#endif
//
// blindly load all of the head elements into registers
//
skc_uint head_idx = id * SKC_FILLS_EXPAND_ELEMS_PER_SUBBLOCK + get_sub_group_local_id();
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
union skc_path_elem h##I = blocks[head_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE];
SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();
//
// pick out count.nodes and count.prims from the header
//
skc_uint count_nodes, count_prims;
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_NODES,I)) { \
count_nodes = SKC_BROADCAST(h,SKC_PATH_HEAD_OFFSET_NODES,I); \
} \
if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_PRIMS,I)) { \
count_prims = SKC_BROADCAST(h,SKC_PATH_HEAD_OFFSET_PRIMS,I); \
}
SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();
//
// debug of path head
//
#if 0
skc_uint count_blocks;
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_BLOCKS,I)) { \
count_blocks = SKC_BROADCAST(h,SKC_PATH_HEAD_OFFSET_BLOCKS,I); \
}
SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();
if (get_sub_group_local_id() == 0)
printf("path header = { %5u, %5u, %5u }\n",
count_blocks,count_nodes,count_prims);
#endif
//
// acquire slots in the expanded cmd extent
//
// decrement prim_idx by 1 so we can use inclusive warp scan later
//
skc_uint out_idx = 0;
if (get_sub_group_local_id() == 0) {
out_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP
(atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_CMDS,count_prims) - 1;
}
out_idx = sub_group_broadcast(out_idx,0);
//
// process ids trailing the path header
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (!SKC_ELEM_GTE(SKC_PATH_HEAD_OFFSET_IDS,I)) { \
if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_IDS,I)) { \
if (get_sub_group_local_id() + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE < SKC_PATH_HEAD_OFFSET_IDS) { \
h##I.u32 = SKC_TAGGED_BLOCK_ID_INVALID; \
} \
} \
skc_cmds_out_append(cmds_out,&out_idx,&cmd,h##I, \
head_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE); \
}
SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();
//
// we're done if it was just the header
//
if (count_nodes == 0)
return;
//
// otherwise, process the nodes
//
//
// get id of next node
//
id = SKC_TAGGED_BLOCK_ID_GET_ID(SKC_BROADCAST_LAST(h,SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST));
//
// the following blocks are nodes
//
while (true)
{
// get index of each element
skc_uint node_idx = id * SKC_FILLS_EXPAND_ELEMS_PER_SUBBLOCK + get_sub_group_local_id();
//
// blindly load all of the node elements into registers
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
union skc_path_elem const n##I = blocks[node_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE];
SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();
//
// append all valid ids
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
skc_cmds_out_append(cmds_out,&out_idx,&cmd,n##I, \
node_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE);
SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();
// any more nodes?
if (--count_nodes == 0)
return;
//
// get id of next node
//
id = SKC_TAGGED_BLOCK_ID_GET_ID(SKC_BROADCAST_LAST(n,SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST));
}
}
//
//
//
/*
* Copyright 2017 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can
* be found in the LICENSE file.
*
*/
//
//
//
#include "block.h"
#include "path.h"
#include "common.h"
#include "atomic_cl.h"
#include "raster_builder_cl_12.h"
#include "kernel_cl_12.h"
//
//
//
#define SKC_FILLS_EXPAND_SUBGROUP_SIZE_MASK (SKC_FILLS_EXPAND_SUBGROUP_SIZE - 1)
#define SKC_FILLS_EXPAND_ELEMS_PER_BLOCK (SKC_DEVICE_BLOCK_WORDS / SKC_FILLS_EXPAND_ELEM_WORDS)
#define SKC_FILLS_EXPAND_ELEMS_PER_SUBBLOCK (SKC_DEVICE_SUBBLOCK_WORDS / SKC_FILLS_EXPAND_ELEM_WORDS)
#define SKC_FILLS_EXPAND_ELEMS_PER_THREAD (SKC_FILLS_EXPAND_ELEMS_PER_BLOCK / SKC_FILLS_EXPAND_SUBGROUP_SIZE)
//
//
//
#define SKC_FILLS_EXPAND_X (SKC_DEVICE_BLOCK_WORDS / SKC_FILLS_EXPAND_SUBGROUP_SIZE)
//
//
//
#if ( SKC_FILLS_EXPAND_X == 1 )
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_1()
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 0
#elif ( SKC_FILLS_EXPAND_X == 2 )
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_2()
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 1
#elif ( SKC_FILLS_EXPAND_X == 4 )
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_4()
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 3
#elif ( SKC_FILLS_EXPAND_X == 8 )
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_8()
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 7
#elif ( SKC_FILLS_EXPAND_X == 16)
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND() SKC_EXPAND_16()
#define SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST 15
#else
#error "MISSING SKC_FILLS_EXPAND_X"
#endif
//
// Fill and rasterize cmds only differ in their first word semantics
//
union skc_cmd_expand
{
union skc_cmd_fill fill;
union skc_cmd_rasterize rasterize;
};
//
//
//
union skc_path_elem
{
skc_uint u32;
skc_float f32;
};
//
// COMPILE-TIME AND RUN-TIME MACROS
//
#define SKC_ELEM_IN_RANGE(X,I) \
(skc_bool)SKC_GTE_MACRO(X,(I ) * SKC_FILLS_EXPAND_SUBGROUP_SIZE) && \
(skc_bool)SKC_LT_MACRO(X,(I+1) * SKC_FILLS_EXPAND_SUBGROUP_SIZE)
#define SKC_ELEM_GTE(X,I) \
SKC_GTE_MACRO(X,(I+1) * SKC_FILLS_EXPAND_SUBGROUP_SIZE)
//
// FIXME -- slate these for replacement
//
#define SKC_BROADCAST(E,S,I) \
sub_group_broadcast(E##I.u32,S - I * SKC_FILLS_EXPAND_SUBGROUP_SIZE)
#define SKC_BROADCAST_LAST_HELPER(E,I) \
sub_group_broadcast(E##I.u32,SKC_FILLS_EXPAND_SUBGROUP_SIZE - 1)
#define SKC_BROADCAST_LAST(E,I) \
SKC_BROADCAST_LAST_HELPER(E,I)
//
//
//
void
skc_cmds_out_append(__global union skc_cmd_rasterize * const cmds_out,
skc_uint * const out_idx,
union skc_cmd_expand * const cmd,
union skc_path_elem const e,
skc_uint const e_idx)
{
//
// FIXME -- we can append a large number of nodeword indices to a
// local SMEM queue and flush when full. It may or may not be a
// performance win on some architectures.
//
skc_bool const is_elem = SKC_TAGGED_BLOCK_ID_GET_TAG(e.u32) < SKC_BLOCK_ID_TAG_PATH_NEXT;
skc_uint const offset = sub_group_scan_inclusive_add(is_elem ? 1 : 0);
cmd->rasterize.nodeword = e_idx;
if (is_elem) {
cmds_out[*out_idx + offset] = cmd->rasterize;
}
*out_idx += sub_group_broadcast(offset,SKC_FILLS_EXPAND_SUBGROUP_SIZE-1);
}
//
//
//
__kernel
SKC_FILLS_EXPAND_KERNEL_ATTRIBS
void
skc_kernel_fills_expand(__global union skc_path_elem const * const blocks,
__global skc_uint volatile * const atomics,
__global skc_block_id_t const * const map,
__global union skc_cmd_fill const * const cmds_in,
__global union skc_cmd_rasterize * const cmds_out)
{
//
// Need to harmonize the way we determine a subgroup's id. In this
// kernel it's not as important because no local memory is being
// used. Although the device/mask calc to determine subgroup and
// lanes is still proper, we might want to make it clearer that
// we're working with subgroups by using the subgroup API.
//
// every subgroup/simd that will work on the block loads the same command
//
#if (__OPENCL_VERSION__ < 200)
skc_uint const cmd_stride = get_num_sub_groups();
#else
skc_uint const cmd_stride = get_enqueued_num_sub_groups(); // 2.0 supports non-uniform workgroups
#endif
skc_uint cmd_idx = get_group_id(0) * cmd_stride + get_sub_group_id();
// load fill command -- we reuse y component
union skc_cmd_expand cmd = { .fill = cmds_in[cmd_idx] };
// get the path header block from the map
skc_block_id_t id = map[cmd.fill.path];
#if 0
if (get_sub_group_local_id() == 0)
printf("expand[%u] = %u\n",cmd_idx,id);
#endif
//
// blindly load all of the head elements into registers
//
skc_uint head_idx = id * SKC_FILLS_EXPAND_ELEMS_PER_SUBBLOCK + get_sub_group_local_id();
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
union skc_path_elem h##I = blocks[head_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE];
SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();
//
// pick out count.nodes and count.prims from the header
//
skc_uint count_nodes, count_prims;
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_NODES,I)) { \
count_nodes = SKC_BROADCAST(h,SKC_PATH_HEAD_OFFSET_NODES,I); \
} \
if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_PRIMS,I)) { \
count_prims = SKC_BROADCAST(h,SKC_PATH_HEAD_OFFSET_PRIMS,I); \
}
SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();
//
// debug of path head
//
#if 0
skc_uint count_blocks;
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_BLOCKS,I)) { \
count_blocks = SKC_BROADCAST(h,SKC_PATH_HEAD_OFFSET_BLOCKS,I); \
}
SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();
if (get_sub_group_local_id() == 0)
printf("path header = { %5u, %5u, %5u }\n",
count_blocks,count_nodes,count_prims);
#endif
//
// acquire slots in the expanded cmd extent
//
// decrement prim_idx by 1 so we can use inclusive warp scan later
//
skc_uint out_idx = 0;
if (get_sub_group_local_id() == 0) {
out_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP
(atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_CMDS,count_prims) - 1;
}
out_idx = sub_group_broadcast(out_idx,0);
//
// process ids trailing the path header
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (!SKC_ELEM_GTE(SKC_PATH_HEAD_OFFSET_IDS,I)) { \
if (SKC_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_IDS,I)) { \
if (get_sub_group_local_id() + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE < SKC_PATH_HEAD_OFFSET_IDS) { \
h##I.u32 = SKC_TAGGED_BLOCK_ID_INVALID; \
} \
} \
skc_cmds_out_append(cmds_out,&out_idx,&cmd,h##I, \
head_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE); \
}
SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();
//
// we're done if it was just the header
//
if (count_nodes == 0)
return;
//
// otherwise, process the nodes
//
//
// get id of next node
//
id = SKC_TAGGED_BLOCK_ID_GET_ID(SKC_BROADCAST_LAST(h,SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST));
//
// the following blocks are nodes
//
while (true)
{
// get index of each element
skc_uint node_idx = id * SKC_FILLS_EXPAND_ELEMS_PER_SUBBLOCK + get_sub_group_local_id();
//
// blindly load all of the node elements into registers
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
union skc_path_elem const n##I = blocks[node_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE];
SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();
//
// append all valid ids
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
skc_cmds_out_append(cmds_out,&out_idx,&cmd,n##I, \
node_idx + I * SKC_FILLS_EXPAND_SUBGROUP_SIZE);
SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND();
// any more nodes?
if (--count_nodes == 0)
return;
//
// get id of next node
//
id = SKC_TAGGED_BLOCK_ID_GET_ID(SKC_BROADCAST_LAST(n,SKC_FILLS_EXPAND_PATH_BLOCK_EXPAND_I_LAST));
}
}
//
//
//
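
Both this kernel and the reclaim kernels below lean on the SKC_EXPAND_N() X-macro family: each SKC_EXPAND_X(I,...) instantiation is stamped out once per register row, so the per-row loads and appends are fully unrolled at compile time. A stripped-down illustration of the idiom -- the real SKC_EXPAND_X carries the extra I,S,C,P,R parameters, and 'blocks', 'idx', 'SUBGROUP_SIZE' and 'sum' here are assumed to exist in the surrounding scope:

#define MY_EXPAND_4() MY_EXPAND_X(0) MY_EXPAND_X(1) MY_EXPAND_X(2) MY_EXPAND_X(3)

// stamp out one load per register "row"
#undef  MY_EXPAND_X
#define MY_EXPAND_X(I) uint const h##I = blocks[idx + I * SUBGROUP_SIZE];
MY_EXPAND_4()

// reuse the same expansion with a different per-row body
#undef  MY_EXPAND_X
#define MY_EXPAND_X(I) sum += h##I;
MY_EXPAND_4()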

File diff suppressed because it is too large

View File

@ -1,390 +1,390 @@
/*
* Copyright 2017 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can
* be found in the LICENSE file.
*
*/
//
// FIXME -- a pre-allocation step could load the path header quads and
// total up the number of blocks in the workgroup or subgroup
// minimizing the number of later atomics adds.
//
#include "block.h"
#include "path.h"
#include "common.h"
#include "atomic_cl.h"
#include "block_pool_cl.h"
#include "device_cl_12.h"
//
//
//
#define SKC_PATHS_RECLAIM_SUBGROUP_SIZE_MASK (SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1)
#define SKC_PATHS_RECLAIM_SUBGROUP_ELEMS (SKC_PATHS_RECLAIM_SUBGROUP_SIZE * SKC_PATHS_RECLAIM_LOCAL_ELEMS)
#define SKC_PATHS_RECLAIM_X (SKC_DEVICE_BLOCK_WORDS / SKC_PATHS_RECLAIM_SUBGROUP_ELEMS)
//
//
//
#if ( SKC_PATHS_RECLAIM_X == 1 )
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_1()
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 0
#elif ( SKC_PATHS_RECLAIM_X == 2 )
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_2()
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 1
#elif ( SKC_PATHS_RECLAIM_X == 4 )
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_4()
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 3
#elif ( SKC_PATHS_RECLAIM_X == 8 )
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_8()
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 7
#elif ( SKC_PATHS_RECLAIM_X == 16)
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_16()
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 15
#else
#error "MISSING SKC_PATHS_RECLAIM_X"
#endif
//
// FIXME -- slate these for replacement
//
#define SKC_BROADCAST(E,S,I) \
sub_group_broadcast(E,S - I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE)
#define SKC_BROADCAST_LAST_HELPER(E,I) \
sub_group_broadcast(E,SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1)
#define SKC_BROADCAST_LAST(E,I) \
SKC_BROADCAST_LAST_HELPER(E,I)
//
// COMPILE-TIME PREDICATES
//
#define SKC_PATHS_RECLAIM_ELEM_GTE(X,I) \
SKC_GTE_MACRO(X,(I+1) * SKC_PATHS_RECLAIM_SUBGROUP_SIZE)
#define SKC_PATHS_RECLAIM_ELEM_IN_RANGE(X,I) \
(skc_bool)SKC_GTE_MACRO(X, I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE) && \
(skc_bool)SKC_LT_MACRO(X,(I+1) * SKC_PATHS_RECLAIM_SUBGROUP_SIZE)
#define SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I) \
SKC_PATHS_RECLAIM_ELEM_GTE(SKC_PATH_HEAD_WORDS,I)
#define SKC_PATHS_RECLAIM_PARTIALLY_HEADER(I) \
SKC_PATHS_RECLAIM_ELEM_IN_RANGE(SKC_PATH_HEAD_WORDS,I)
//
// RUN-TIME PREDICATES
//
#define SKC_PATHS_RECLAIM_IS_HEADER(I) \
(get_sub_group_local_id() + I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE < SKC_PATH_HEAD_WORDS)
//
// FIXME -- THIS BITFIELD SCAN APPROACH CAN BE PARAMETERIZED FOR ALL
// POSSIBLE PRACTICAL POWER-OF-TWO SUBGROUP AND SUBBLOCKS-PER-BLOCK
// COMBOS (NOT NECESSARILY POW2)
//
// FOR WIDER SUBGROUPS WITH BIG BLOCKS, WE WILL WANT TO USE A VECTOR
// UINT TYPE INSTEAD OF A ULONG.
//
#define SKC_PATHS_RECLAIM_PACKED_COUNT_BITS SKC_PATHS_RECLAIM_SUBGROUP_SIZE_LOG2
#define SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE skc_uint
//
//
//
#define SKC_PATHS_RECLAIM_PACKED_COUNT_MASK SKC_BITS_TO_MASK(SKC_PATHS_RECLAIM_PACKED_COUNT_BITS)
#define SKC_PATHS_RECLAIM_PACKED_COUNT_IS_BLOCK(E,I) \
(((E) & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) \
? 0 : (1u << SKC_PATHS_RECLAIM_PACKED_COUNT_BITS * I))
#define SKC_PATHS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(S,C) \
S = sub_group_scan_exclusive_add(C)
#define SKC_PATHS_RECLAIM_PACKED_COUNT_GET(C,I) \
(((C) >> (SKC_PATHS_RECLAIM_PACKED_COUNT_BITS * I)) & SKC_PATHS_RECLAIM_PACKED_COUNT_MASK)
//
//
//
struct skc_reclaim
{
skc_path_h aN[SKC_RECLAIM_ARRAY_SIZE];
};
__kernel
SKC_PATHS_RECLAIM_KERNEL_ATTRIBS
void
skc_kernel_paths_reclaim(__global skc_block_id_t * const bp_ids, // block pool ids ring
__global skc_uint * const bp_elems, // block pool blocks
__global skc_uint volatile * const bp_atomics, // read/write atomics
skc_uint const bp_mask, // pow2 modulo mask for block pool ring
__global skc_block_id_t const * const map, // path host-to-device map
struct skc_reclaim const reclaim) // array of host path ids
{
#if (__OPENCL_VERSION__ < 200)
skc_uint const reclaim_stride = get_num_sub_groups();
#else
skc_uint const reclaim_stride = get_enqueued_num_sub_groups(); // 2.0 supports non-uniform workgroups
#endif
skc_uint reclaim_idx = get_group_id(0) * reclaim_stride + get_sub_group_id();
#if 0
//
// NOTE -- FOR NOW, THIS KERNEL ALWAYS LAUNCHES FIXED SIZE GRIDS BUT
// WE MIGHT WANT TO HAVE THE GRID LIMIT ITSELF TO A FRACTIONAL
// MULTIPROCESSOR IN ORDER TO MINIMIZE THE IMPACT OF A LARGE
// RECLAMATION JOB ON THE REST OF THE PIPELINE.
//
for (; reclaim_idx < SKC_RECLAIM_ARRAY_SIZE; reclaim_idx+=reclaim_stride)
#endif
{
// get host path id
skc_path_h const path = reclaim.aN[reclaim_idx];
// get the path header block from the map
skc_block_id_t id = map[path];
//
// blindly load all of the head elements into registers
//
skc_uint const head_idx = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id();
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
skc_uint h##I = bp_elems[head_idx + I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE];
SKC_PATHS_RECLAIM_BLOCK_EXPAND();
//
// pick out count.nodes and count.prims from the header
//
skc_uint count_blocks, count_nodes;
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (SKC_PATHS_RECLAIM_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_BLOCKS,I)) { \
count_blocks = SKC_BROADCAST(h##I,SKC_PATH_HEAD_OFFSET_BLOCKS,I); \
} \
if (SKC_PATHS_RECLAIM_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_NODES,I)) { \
count_nodes = SKC_BROADCAST(h##I,SKC_PATH_HEAD_OFFSET_NODES,I); \
}
SKC_PATHS_RECLAIM_BLOCK_EXPAND();
#if 0
if (get_sub_group_local_id() == 0) {
printf("reclaim paths: %9u / %5u / %5u\n",path,count_blocks,count_nodes);
}
#endif
//
// acquire a span in the block pool ids ring for reclaimed ids
//
// FIXME count_blocks and atomic add can be done in same lane
//
skc_uint bp_ids_base = 0;
if (get_sub_group_local_id() == 0) {
bp_ids_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP(bp_atomics+SKC_BP_ATOMIC_OFFSET_WRITES,count_blocks);
#if 0
printf("paths: bp_ids_base = %u\n",bp_ids_base);
#endif
}
bp_ids_base = sub_group_broadcast(bp_ids_base,0);
//
// shift away the tagged block id's tag
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) { \
h##I = h##I >> SKC_TAGGED_BLOCK_ID_BITS_TAG; \
}
SKC_PATHS_RECLAIM_BLOCK_EXPAND();
//
// swap current id with next
//
if (get_sub_group_local_id() == SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1)
{
skc_block_id_t const next = SKC_CONCAT(h,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST);
SKC_CONCAT(h,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST) = id;
id = next;
}
//
// - we'll skip subgroups that are entirely header
//
// - but we need to mark any header elements that partially fill
// a subgroup as invalid tagged block ids
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) { \
if (SKC_PATHS_RECLAIM_PARTIALLY_HEADER(I)) { \
if (SKC_PATHS_RECLAIM_IS_HEADER(I)) { \
h##I = SKC_TAGGED_BLOCK_ID_INVALID; \
} \
} \
}
SKC_PATHS_RECLAIM_BLOCK_EXPAND();
{
//
// count reclaimable blocks in each lane
//
SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 );
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) { \
packed_count |= SKC_PATHS_RECLAIM_PACKED_COUNT_IS_BLOCK(h##I,I); \
}
SKC_PATHS_RECLAIM_BLOCK_EXPAND();
//
// scan to find index of each block
//
SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 );
SKC_PATHS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count);
//
// store blocks back to ring
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) { \
skc_uint const index = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \
skc_uint const count = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \
skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask; \
if (count > 0) { \
bp_ids[bp_ids_idx] = h##I; \
} \
skc_uint const total = index + count; \
bp_ids_base += sub_group_broadcast(total,SKC_PATHS_RECLAIM_SUBGROUP_SIZE-1); \
}
SKC_PATHS_RECLAIM_BLOCK_EXPAND();
// printf("P %7u ! %u\n",bp_ids_idx,h##I);
}
//
// we're done if it was just the header
//
if (count_nodes == 0)
return;
//
// otherwise, walk the nodes
//
do {
// id of next block is in last lane
id = sub_group_broadcast(id,SKC_PATHS_RECLAIM_SUBGROUP_SIZE-1);
// get index of each element
skc_uint const node_idx = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id();
//
// blindly load all of the node elements into registers
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
skc_uint n##I = bp_elems[node_idx + I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE];
SKC_PATHS_RECLAIM_BLOCK_EXPAND();
//
// shift away the tagged block id's tag
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
n##I = n##I >> SKC_TAGGED_BLOCK_ID_BITS_TAG;
SKC_PATHS_RECLAIM_BLOCK_EXPAND();
//
// swap current id with next
//
if (get_sub_group_local_id() == SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1)
{
skc_block_id_t const next = SKC_CONCAT(n,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST);
SKC_CONCAT(n,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST) = id;
id = next;
}
//
// count reclaimable blocks in each lane
//
SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 );
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
packed_count |= SKC_PATHS_RECLAIM_PACKED_COUNT_IS_BLOCK(n##I,I);
SKC_PATHS_RECLAIM_BLOCK_EXPAND();
//
// scan to find index of each block
//
SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 );
SKC_PATHS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count);
//
// store blocks back to ring
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) { \
skc_uint const index = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \
skc_uint const count = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \
skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask; \
if (count > 0) { \
bp_ids[bp_ids_idx] = n##I; \
} \
skc_uint const total = index + count; \
bp_ids_base += sub_group_broadcast(total,SKC_PATHS_RECLAIM_SUBGROUP_SIZE-1); \
}
SKC_PATHS_RECLAIM_BLOCK_EXPAND();
// printf("P %7u ! %u\n",bp_ids_idx,n##I);
// any more nodes?
} while (--count_nodes > 0);
}
}
//
//
//
/*
* Copyright 2017 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can
* be found in the LICENSE file.
*
*/
//
// FIXME -- a pre-allocation step could load the path header quads and
// total up the number of blocks in the workgroup or subgroup
// minimizing the number of later atomics adds.
//
#include "block.h"
#include "path.h"
#include "common.h"
#include "atomic_cl.h"
#include "block_pool_cl.h"
#include "kernel_cl_12.h"
//
//
//
#define SKC_PATHS_RECLAIM_SUBGROUP_SIZE_MASK (SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1)
#define SKC_PATHS_RECLAIM_SUBGROUP_ELEMS (SKC_PATHS_RECLAIM_SUBGROUP_SIZE * SKC_PATHS_RECLAIM_LOCAL_ELEMS)
#define SKC_PATHS_RECLAIM_X (SKC_DEVICE_BLOCK_WORDS / SKC_PATHS_RECLAIM_SUBGROUP_ELEMS)
//
//
//
#if ( SKC_PATHS_RECLAIM_X == 1 )
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_1()
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 0
#elif ( SKC_PATHS_RECLAIM_X == 2 )
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_2()
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 1
#elif ( SKC_PATHS_RECLAIM_X == 4 )
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_4()
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 3
#elif ( SKC_PATHS_RECLAIM_X == 8 )
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_8()
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 7
#elif ( SKC_PATHS_RECLAIM_X == 16)
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_16()
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 15
#else
#error "MISSING SKC_PATHS_RECLAIM_X"
#endif
//
// FIXME -- slate these for replacement
//
#define SKC_BROADCAST(E,S,I) \
sub_group_broadcast(E,S - I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE)
#define SKC_BROADCAST_LAST_HELPER(E,I) \
sub_group_broadcast(E,SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1)
#define SKC_BROADCAST_LAST(E,I) \
SKC_BROADCAST_LAST_HELPER(E,I)
//
// COMPILE-TIME PREDICATES
//
#define SKC_PATHS_RECLAIM_ELEM_GTE(X,I) \
SKC_GTE_MACRO(X,(I+1) * SKC_PATHS_RECLAIM_SUBGROUP_SIZE)
#define SKC_PATHS_RECLAIM_ELEM_IN_RANGE(X,I) \
(skc_bool)SKC_GTE_MACRO(X, I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE) && \
(skc_bool)SKC_LT_MACRO(X,(I+1) * SKC_PATHS_RECLAIM_SUBGROUP_SIZE)
#define SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I) \
SKC_PATHS_RECLAIM_ELEM_GTE(SKC_PATH_HEAD_WORDS,I)
#define SKC_PATHS_RECLAIM_PARTIALLY_HEADER(I) \
SKC_PATHS_RECLAIM_ELEM_IN_RANGE(SKC_PATH_HEAD_WORDS,I)
//
// RUN-TIME PREDICATES
//
#define SKC_PATHS_RECLAIM_IS_HEADER(I) \
(get_sub_group_local_id() + I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE < SKC_PATH_HEAD_WORDS)
//
// FIXME -- THIS BITFIELD SCAN APPROACH CAN BE PARAMETERIZED FOR ALL
// POSSIBLE PRACTICAL POWER-OF-TWO SUBGROUP AND SUBBLOCKS-PER-BLOCK
// COMBOS (NOT NECESSARILY POW2)
//
// FOR WIDER SUBGROUPS WITH BIG BLOCKS, WE WILL WANT TO USE A VECTOR
// UINT TYPE INSTEAD OF A ULONG.
//
#define SKC_PATHS_RECLAIM_PACKED_COUNT_BITS SKC_PATHS_RECLAIM_SUBGROUP_SIZE_LOG2
#define SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE skc_uint
//
//
//
#define SKC_PATHS_RECLAIM_PACKED_COUNT_MASK SKC_BITS_TO_MASK(SKC_PATHS_RECLAIM_PACKED_COUNT_BITS)
#define SKC_PATHS_RECLAIM_PACKED_COUNT_IS_BLOCK(E,I) \
(((E) & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) \
? 0 : (1u << SKC_PATHS_RECLAIM_PACKED_COUNT_BITS * I))
#define SKC_PATHS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(S,C) \
S = sub_group_scan_exclusive_add(C)
#define SKC_PATHS_RECLAIM_PACKED_COUNT_GET(C,I) \
(((C) >> (SKC_PATHS_RECLAIM_PACKED_COUNT_BITS * I)) & SKC_PATHS_RECLAIM_PACKED_COUNT_MASK)
//
//
//
struct skc_reclaim
{
skc_path_h aN[SKC_RECLAIM_ARRAY_SIZE];
};
__kernel
SKC_PATHS_RECLAIM_KERNEL_ATTRIBS
void
skc_kernel_paths_reclaim(__global skc_block_id_t * const bp_ids, // block pool ids ring
__global skc_uint * const bp_elems, // block pool blocks
__global skc_uint volatile * const bp_atomics, // read/write atomics
skc_uint const bp_mask, // pow2 modulo mask for block pool ring
__global skc_block_id_t const * const map, // path host-to-device map
struct skc_reclaim const reclaim) // array of host path ids
{
#if (__OPENCL_VERSION__ < 200)
skc_uint const reclaim_stride = get_num_sub_groups();
#else
skc_uint const reclaim_stride = get_enqueued_num_sub_groups(); // 2.0 supports non-uniform workgroups
#endif
skc_uint reclaim_idx = get_group_id(0) * reclaim_stride + get_sub_group_id();
#if 0
//
// NOTE -- FOR NOW, THIS KERNEL ALWAYS LAUNCHES FIXED SIZE GRIDS BUT
// WE MIGHT WANT TO HAVE THE GRID LIMIT ITSELF TO A FRACTIONAL
// MULTIPROCESSOR IN ORDER TO MINIMIZE THE IMPACT OF A LARGE
// RECLAMATION JOB ON THE REST OF THE PIPELINE.
//
for (; reclaim_idx < SKC_RECLAIM_ARRAY_SIZE; reclaim_idx+=reclaim_stride)
#endif
{
// get host path id
skc_path_h const path = reclaim.aN[reclaim_idx];
// get the path header block from the map
skc_block_id_t id = map[path];
//
// blindly load all of the head elements into registers
//
skc_uint const head_idx = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id();
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
skc_uint h##I = bp_elems[head_idx + I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE];
SKC_PATHS_RECLAIM_BLOCK_EXPAND();
//
// pick out count.nodes and count.prims from the header
//
skc_uint count_blocks, count_nodes;
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (SKC_PATHS_RECLAIM_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_BLOCKS,I)) { \
count_blocks = SKC_BROADCAST(h##I,SKC_PATH_HEAD_OFFSET_BLOCKS,I); \
} \
if (SKC_PATHS_RECLAIM_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_NODES,I)) { \
count_nodes = SKC_BROADCAST(h##I,SKC_PATH_HEAD_OFFSET_NODES,I); \
}
SKC_PATHS_RECLAIM_BLOCK_EXPAND();
#if 0
if (get_sub_group_local_id() == 0) {
printf("reclaim paths: %9u / %5u / %5u\n",path,count_blocks,count_nodes);
}
#endif
//
// acquire a span in the block pool ids ring for reclaimed ids
//
// FIXME count_blocks and atomic add can be done in same lane
//
skc_uint bp_ids_base = 0;
if (get_sub_group_local_id() == 0) {
bp_ids_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP(bp_atomics+SKC_BP_ATOMIC_OFFSET_WRITES,count_blocks);
#if 0
printf("paths: bp_ids_base = %u\n",bp_ids_base);
#endif
}
bp_ids_base = sub_group_broadcast(bp_ids_base,0);
//
// shift away the tagged block id's tag
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) { \
h##I = h##I >> SKC_TAGGED_BLOCK_ID_BITS_TAG; \
}
SKC_PATHS_RECLAIM_BLOCK_EXPAND();
//
// swap current id with next
//
if (get_sub_group_local_id() == SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1)
{
skc_block_id_t const next = SKC_CONCAT(h,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST);
SKC_CONCAT(h,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST) = id;
id = next;
}
//
// - we'll skip subgroups that are entirely header
//
// - but we need to mark any header elements that partially fill
// a subgroup as invalid tagged block ids
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) { \
if (SKC_PATHS_RECLAIM_PARTIALLY_HEADER(I)) { \
if (SKC_PATHS_RECLAIM_IS_HEADER(I)) { \
h##I = SKC_TAGGED_BLOCK_ID_INVALID; \
} \
} \
}
SKC_PATHS_RECLAIM_BLOCK_EXPAND();
{
//
// count reclaimable blocks in each lane
//
SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 );
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) { \
packed_count |= SKC_PATHS_RECLAIM_PACKED_COUNT_IS_BLOCK(h##I,I); \
}
SKC_PATHS_RECLAIM_BLOCK_EXPAND();
//
// scan to find index of each block
//
SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 );
SKC_PATHS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count);
//
// store blocks back to ring
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) { \
skc_uint const index = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \
skc_uint const count = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \
skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask; \
if (count > 0) { \
bp_ids[bp_ids_idx] = h##I; \
} \
skc_uint const total = index + count; \
bp_ids_base += sub_group_broadcast(total,SKC_PATHS_RECLAIM_SUBGROUP_SIZE-1); \
}
SKC_PATHS_RECLAIM_BLOCK_EXPAND();
// printf("P %7u ! %u\n",bp_ids_idx,h##I);
}
//
// we're done if it was just the header
//
if (count_nodes == 0)
return;
//
// otherwise, walk the nodes
//
do {
// id of next block is in last lane
id = sub_group_broadcast(id,SKC_PATHS_RECLAIM_SUBGROUP_SIZE-1);
// get index of each element
skc_uint const node_idx = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id();
//
// blindly load all of the node elements into registers
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
skc_uint n##I = bp_elems[node_idx + I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE];
SKC_PATHS_RECLAIM_BLOCK_EXPAND();
//
// shift away the tagged block id's tag
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
n##I = n##I >> SKC_TAGGED_BLOCK_ID_BITS_TAG;
SKC_PATHS_RECLAIM_BLOCK_EXPAND();
//
// swap current id with next
//
if (get_sub_group_local_id() == SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1)
{
skc_block_id_t const next = SKC_CONCAT(n,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST);
SKC_CONCAT(n,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST) = id;
id = next;
}
//
// count reclaimable blocks in each lane
//
SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 );
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
packed_count |= SKC_PATHS_RECLAIM_PACKED_COUNT_IS_BLOCK(n##I,I);
SKC_PATHS_RECLAIM_BLOCK_EXPAND();
//
// scan to find index of each block
//
SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 );
SKC_PATHS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count);
//
// store blocks back to ring
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) { \
skc_uint const index = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \
skc_uint const count = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \
skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask; \
if (count > 0) { \
bp_ids[bp_ids_idx] = n##I; \
} \
skc_uint const total = index + count; \
bp_ids_base += sub_group_broadcast(total,SKC_PATHS_RECLAIM_SUBGROUP_SIZE-1); \
}
SKC_PATHS_RECLAIM_BLOCK_EXPAND();
// printf("P %7u ! %u\n",bp_ids_idx,n##I);
// any more nodes?
} while (--count_nodes > 0);
}
}
//
//
//

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -1,144 +1,144 @@
/*
* Copyright 2017 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can
* be found in the LICENSE file.
*
*/
//
//
//
#include "tile.h"
#include "raster.h"
#include "atomic_cl.h"
#include "block_pool_cl.h"
#include "raster_builder_cl_12.h"
#include "device_cl_12.h"
//
// There is a fixed-size meta table per raster cohort that we use to
// perform a mostly coalesced sizing and allocation of blocks.
//
// This code is simple and fast.
//
__kernel
SKC_RASTERS_ALLOC_KERNEL_ATTRIBS
void
skc_kernel_rasters_alloc(__global SKC_ATOMIC_UINT volatile * const bp_atomics,
__global skc_block_id_t const * const bp_ids,
skc_uint const bp_mask, // pow2 modulo mask for block pool ring
__global skc_block_id_t * const map,
__global skc_uint * const metas,
__global skc_uint const * const raster_ids, // FIXME -- CONSTANT
skc_uint const count)
{
// access to the meta extent is linear
skc_uint const gid = get_global_id(0);
skc_bool const is_active = gid < count;
//
// init with defaults for all lanes
//
union skc_raster_cohort_meta_inout meta = { .in.u32v4 = { 0, 0, 0, 0 } };
skc_uint raster_id = SKC_UINT_MAX;
skc_uint extra_blocks = 0;
if (is_active)
{
// load meta_in
meta.in.u32v4 = vload4(gid,metas);
// load raster_id as early as possible
raster_id = raster_ids[gid];
#if 0
printf("%3u + %5u, %5u, %5u, %5u\n",
gid,
meta.in.blocks,
meta.in.offset,
meta.in.pk,
meta.in.rk);
#endif
// how many blocks will the ttpb blocks consume?
extra_blocks = ((meta.in.pk * SKC_TILE_RATIO + SKC_DEVICE_SUBBLOCKS_PER_BLOCK - SKC_TILE_RATIO) /
SKC_DEVICE_SUBBLOCKS_PER_BLOCK);
// total keys
meta.out.keys += meta.in.pk;
// how many blocks do we need to store the keys in the head and trailing nodes?
skc_uint const hn = ((SKC_RASTER_HEAD_DWORDS + meta.out.keys + SKC_RASTER_NODE_DWORDS - 2) /
(SKC_RASTER_NODE_DWORDS - 1));
// increment blocks
extra_blocks += hn;
// how many nodes trail the head?
meta.out.nodes = hn - 1;
// update blocks
meta.out.blocks += extra_blocks;
#if 0
printf("%3u - %5u, %5u, %5u, %5u\n",
gid,
meta.out.blocks,
meta.out.offset,
meta.out.nodes,
meta.out.keys);
#endif
}
//
// allocate blocks from block pool
//
// first perform a prefix sum on the subgroup to reduce atomic
// operation traffic
//
// note this idiom can be implemented with vectors, subgroups or
// workgroups
//
skc_uint const prefix = SKC_RASTERS_ALLOC_INCLUSIVE_ADD(extra_blocks);
skc_uint reads = 0;
// last lane performs the block pool allocation with an atomic increment
if (SKC_RASTERS_ALLOC_LOCAL_ID() == SKC_RASTERS_ALLOC_GROUP_SIZE - 1) {
reads = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(bp_atomics+SKC_BP_ATOMIC_OFFSET_READS,prefix); // ring_reads
}
// broadcast block pool base to all lanes
reads = SKC_RASTERS_ALLOC_BROADCAST(reads,SKC_RASTERS_ALLOC_GROUP_SIZE - 1);
// update base for each lane
reads += prefix - extra_blocks;
//
// store meta header
//
if (is_active)
{
// store headers back to meta extent
vstore4(meta.out.u32v4,gid,metas);
// store reads
metas[SKC_RASTER_COHORT_META_OFFSET_READS + gid] = reads;
// get block_id of each raster head
skc_block_id_t const block_id = bp_ids[reads & bp_mask];
// update map
map[raster_id] = block_id;
#if 0
printf("alloc: %u / %u\n",raster_id,block_id);
#endif
}
}
//
//
//
/*
* Copyright 2017 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can
* be found in the LICENSE file.
*
*/
//
//
//
#include "tile.h"
#include "raster.h"
#include "atomic_cl.h"
#include "block_pool_cl.h"
#include "raster_builder_cl_12.h"
#include "kernel_cl_12.h"
//
// There is a fixed-size meta table per raster cohort that we use to
// perform a mostly coalesced sizing and allocation of blocks.
//
// This code is simple and fast.
//
__kernel
SKC_RASTERS_ALLOC_KERNEL_ATTRIBS
void
skc_kernel_rasters_alloc(__global SKC_ATOMIC_UINT volatile * const bp_atomics,
__global skc_block_id_t const * const bp_ids,
skc_uint const bp_mask, // pow2 modulo mask for block pool ring
__global skc_block_id_t * const map,
__global skc_uint * const metas,
__global skc_uint const * const raster_ids, // FIXME -- CONSTANT
skc_uint const count)
{
// access to the meta extent is linear
skc_uint const gid = get_global_id(0);
skc_bool const is_active = gid < count;
//
// init with defaults for all lanes
//
union skc_raster_cohort_meta_inout meta = { .in.u32v4 = { 0, 0, 0, 0 } };
skc_uint raster_id = SKC_UINT_MAX;
skc_uint extra_blocks = 0;
if (is_active)
{
// load meta_in
meta.in.u32v4 = vload4(gid,metas);
// load raster_id as early as possible
raster_id = raster_ids[gid];
#if 0
printf("%3u + %5u, %5u, %5u, %5u\n",
gid,
meta.in.blocks,
meta.in.offset,
meta.in.pk,
meta.in.rk);
#endif
// how many blocks will the ttpb blocks consume?
extra_blocks = ((meta.in.pk * SKC_TILE_RATIO + SKC_DEVICE_SUBBLOCKS_PER_BLOCK - SKC_TILE_RATIO) /
SKC_DEVICE_SUBBLOCKS_PER_BLOCK);
// total keys
meta.out.keys += meta.in.pk;
// how many blocks do we need to store the keys in the head and trailing nodes?
skc_uint const hn = ((SKC_RASTER_HEAD_DWORDS + meta.out.keys + SKC_RASTER_NODE_DWORDS - 2) /
(SKC_RASTER_NODE_DWORDS - 1));
// increment blocks
extra_blocks += hn;
// how many nodes trail the head?
meta.out.nodes = hn - 1;
// update blocks
meta.out.blocks += extra_blocks;
#if 0
printf("%3u - %5u, %5u, %5u, %5u\n",
gid,
meta.out.blocks,
meta.out.offset,
meta.out.nodes,
meta.out.keys);
#endif
}
//
// allocate blocks from block pool
//
// first perform a prefix sum on the subgroup to reduce atomic
// operation traffic
//
// note this idiom can be implemented with vectors, subgroups or
// workgroups
//
skc_uint const prefix = SKC_RASTERS_ALLOC_INCLUSIVE_ADD(extra_blocks);
skc_uint reads = 0;
// last lane performs the block pool allocation with an atomic increment
if (SKC_RASTERS_ALLOC_LOCAL_ID() == SKC_RASTERS_ALLOC_GROUP_SIZE - 1) {
reads = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(bp_atomics+SKC_BP_ATOMIC_OFFSET_READS,prefix); // ring_reads
}
// broadcast block pool base to all lanes
reads = SKC_RASTERS_ALLOC_BROADCAST(reads,SKC_RASTERS_ALLOC_GROUP_SIZE - 1);
// update base for each lane
reads += prefix - extra_blocks;
//
// store meta header
//
if (is_active)
{
// store headers back to meta extent
vstore4(meta.out.u32v4,gid,metas);
// store reads
metas[SKC_RASTER_COHORT_META_OFFSET_READS + gid] = reads;
// get block_id of each raster head
skc_block_id_t const block_id = bp_ids[reads & bp_mask];
// update map
map[raster_id] = block_id;
#if 0
printf("alloc: %u / %u\n",raster_id,block_id);
#endif
}
}
//
//
//
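The comment block in skc_kernel_rasters_alloc describes reducing atomic traffic by scanning the per-lane block demand across the subgroup and letting one lane perform a single atomic add. A minimal sketch of that idiom as a standalone kernel, assuming subgroup built-ins (cl_khr_subgroups or cl_intel_subgroups); sketch_subgroup_alloc, pool_next, demand and base_out are illustrative names, not part of the pipeline above:

__kernel void
sketch_subgroup_alloc(__global uint volatile * const pool_next, // running allocation cursor
                      __global uint const    * const demand,    // blocks wanted per work-item
                      __global uint          * const base_out)  // first block granted per work-item
{
  uint const gid  = get_global_id(0);
  uint const want = demand[gid];
  // per-lane running total of requests within the subgroup
  uint const sum  = sub_group_scan_inclusive_add(want);
  // the last lane performs the only atomic for the whole subgroup
  uint base = 0;
  if (get_sub_group_local_id() == get_sub_group_size() - 1)
    base = atomic_add(pool_next,sum);
  // broadcast the subgroup's base back to every lane
  base = sub_group_broadcast(base,get_sub_group_size() - 1);
  // each lane's span starts at the base plus its exclusive prefix
  base_out[gid] = base + sum - want;
}

As the original comment notes, the same idiom can be expressed with vectors or workgroup scans; the subgroup form shown here is just one instantiation.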

View File

@ -1,442 +1,442 @@
/*
* Copyright 2017 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can
* be found in the LICENSE file.
*
*/
//
//
//
#include "tile.h"
#include "block.h"
#include "raster.h"
#include "common.h"
#include "atomic_cl.h"
#include "block_pool_cl.h"
#include "device_cl_12.h"
//
//
//
#define SKC_RASTERS_RECLAIM_SUBGROUP_SIZE_MASK (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1)
#define SKC_RASTERS_RECLAIM_SUBGROUP_WORDS (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE * SKC_RASTERS_RECLAIM_LOCAL_ELEMS)
#define SKC_RASTERS_RECLAIM_X (SKC_DEVICE_BLOCK_DWORDS / SKC_RASTERS_RECLAIM_SUBGROUP_WORDS)
//
//
//
#if ( SKC_RASTERS_RECLAIM_X == 1 )
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_1()
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 0
#elif ( SKC_RASTERS_RECLAIM_X == 2 )
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_2()
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 1
#elif ( SKC_RASTERS_RECLAIM_X == 4 )
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_4()
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 3
#elif ( SKC_RASTERS_RECLAIM_X == 8 )
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_8()
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 7
#elif ( SKC_RASTERS_RECLAIM_X == 16)
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_16()
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 15
#else
#error "MISSING SKC_RASTERS_RECLAIM_X"
#endif
#if ( SKC_PREFIX_SUBGROUP_SIZE == SKC_RASTERS_RECLAIM_SUBGROUP_SIZE )
#define SKC_RASTERS_RECLAIM_STRIDE_H(L) (L)
#define SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) (I * 2 * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
#define SKC_RASTERS_RECLAIM_STRIDE_V_HI(I) (SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) + SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
#elif ( SKC_PREFIX_SUBGROUP_SIZE > SKC_RASTERS_RECLAIM_SUBGROUP_SIZE ) // same as above when ratio equals 1
#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO (SKC_PREFIX_SUBGROUP_SIZE / SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK (SKC_RASTERS_RECLAIM_SUBGROUP_RATIO - 1)
#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_SCALE(I) ((I / SKC_RASTERS_RECLAIM_SUBGROUP_RATIO) * 2 * SKC_RASTERS_RECLAIM_SUBGROUP_RATIO + \
(I & SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK))
#define SKC_RASTERS_RECLAIM_STRIDE_H(L) (L)
#define SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) (SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_SCALE(I) * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
#define SKC_RASTERS_RECLAIM_STRIDE_V_HI(I) (SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) + SKC_RASTERS_RECLAIM_SUBGROUP_RATIO * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
#elif ( SKC_PREFIX_SUBGROUP_SIZE < SKC_RASTERS_RECLAIM_SUBGROUP_SIZE ) // same as above when ratio equals 1
#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE / SKC_PREFIX_SUBGROUP_SIZE)
#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE / SKC_RASTERS_RECLAIM_SUBGROUP_RATIO - 1) // equal to prefix subgroup mask
#define SKC_RASTERS_RECLAIM_STRIDE_H(L) (((L) & ~SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK) * 2 + ((L) & SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK))
#define SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) (I * 2 * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
#define SKC_RASTERS_RECLAIM_STRIDE_V_HI(I) (SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) + SKC_RASTERS_RECLAIM_SUBGROUP_SIZE / SKC_RASTERS_RECLAIM_SUBGROUP_RATIO)
#endif
//
// FIXME -- slate these for replacement
//
#define SKC_BROADCAST(E,S,I) \
sub_group_broadcast(E,S - I * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
#define SKC_BROADCAST_LAST_HELPER(E,I) \
sub_group_broadcast(E,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1)
#define SKC_BROADCAST_LAST(E,I) \
SKC_BROADCAST_LAST_HELPER(E,I)
//
// COMPILE-TIME PREDICATES
//
#define SKC_RASTERS_RECLAIM_ELEM_GTE(X,I) \
SKC_GTE_MACRO(X,(I+1) * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
#define SKC_RASTERS_RECLAIM_ELEM_IN_RANGE(X,I) \
(skc_bool)SKC_GTE_MACRO(X, I * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) && \
(skc_bool)SKC_LT_MACRO(X,(I+1) * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
#define SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I) \
SKC_RASTERS_RECLAIM_ELEM_GTE(SKC_RASTER_HEAD_DWORDS,I)
#define SKC_RASTERS_RECLAIM_PARTIALLY_HEADER(I) \
SKC_RASTERS_RECLAIM_ELEM_IN_RANGE(SKC_RASTER_HEAD_DWORDS,I)
//
// RUN-TIME PREDICATES
//
#define SKC_RASTERS_RECLAIM_IS_HEADER(I) \
(get_sub_group_local_id() + I * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE < SKC_RASTER_HEAD_DWORDS)
//
// FIXME -- THIS BITFIELD SCAN APPROACH CAN BE PARAMETERIZED FOR ALL
// POSSIBLE PRACTICAL POWER-OF-TWO SUBGROUP AND SUBBLOCKS-PER-BLOCK
// COMBOS (NOT NECESSARILY POW2)
//
// FOR WIDER SUBGROUPS WITH BIG BLOCKS, WE WILL WANT TO USE A VECTOR
// UINT TYPE INSTEAD OF A ULONG.
//
#define SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS SKC_RASTERS_RECLAIM_SUBGROUP_SIZE_LOG2
#define SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE skc_uint
//
//
//
#define SKC_RASTERS_RECLAIM_PACKED_COUNT_MASK SKC_BITS_TO_MASK(SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS)
#define SKC_RASTERS_RECLAIM_PACKED_COUNT_IS_BLOCK(E,I) \
(((E) & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) \
? 0 : (1u << SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS * I))
#define SKC_RASTERS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(S,C) \
S = sub_group_scan_exclusive_add(C)
#define SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(C,I) \
(((C) >> (SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS * I)) & SKC_RASTERS_RECLAIM_PACKED_COUNT_MASK)
//
//
//
struct skc_reclaim
{
skc_raster_h aN[SKC_RECLAIM_ARRAY_SIZE];
};
__kernel
SKC_RASTERS_RECLAIM_KERNEL_ATTRIBS
void
skc_kernel_rasters_reclaim(__global skc_block_id_t * const bp_ids, // block pool ids ring
__global skc_uint * const bp_elems, // block pool blocks
__global skc_uint volatile * const bp_atomics, // read/write atomics
skc_uint const bp_mask, // pow2 modulo mask for block pool ring
__global skc_block_id_t const * const map, // raster host-to-device map
struct skc_reclaim const reclaim) // array of host raster ids
{
#if (__OPENCL_VERSION__ < 200)
skc_uint const reclaim_stride = get_num_sub_groups();
#else
skc_uint const reclaim_stride = get_enqueued_num_sub_groups(); // 2.0 supports non-uniform workgroups
#endif
skc_uint reclaim_idx = get_group_id(0) * reclaim_stride + get_sub_group_id();
#if 0
//
// NOTE -- FOR NOW, THIS KERNEL ALWAYS LAUNCHES FIXED SIZE GRIDS BUT
// WE MIGHT WANT TO HAVE THE GRID LIMIT ITSELF TO A FRACTIONAL
// MULTIPROCESSOR IN ORDER TO MINIMIZE THE IMPACT OF A LARGE
// RECLAMATION JOB ON THE REST OF THE PIPELINE.
//
for (; reclaim_idx < SKC_RECLAIM_ARRAY_SIZE; reclaim_idx+=reclaim_stride)
#endif
{
// get host raster id
skc_raster_h const raster = reclaim.aN[reclaim_idx];
// get block id of raster header
skc_block_id_t id = map[raster];
//
// load all of the head block ttxk.lo keys into registers
//
// FIXME -- this pattern lends itself to using the higher
// performance Intel GEN block load instructions
//
skc_uint const head_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_RASTERS_RECLAIM_STRIDE_H(get_sub_group_local_id());
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
skc_uint h##I = bp_elems[head_id + SKC_RASTERS_RECLAIM_STRIDE_V_LO(I)];
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
//
// pick out count.nodes and count.prims from the header
//
// load raster header counts -- we only need the blocks and
// nodes words; the keys are doublewords.
//
// FIXME -- this can be made portable with compile-time macro expansion
//
skc_uint count_blocks = sub_group_broadcast(h0,0); // SKC_RASTER_HEAD_OFFSET_COUNTS_NODES
skc_uint count_nodes = sub_group_broadcast(h0,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_KEYS
#if 0
if (get_sub_group_local_id() == 0) {
printf("reclaim rasters: %u / %u / %5u / %5u\n",raster,id,count_blocks,count_nodes);
}
#endif
//
// acquire a span in the block pool ids ring for reclaimed ids
//
skc_uint bp_ids_base = 0;
if (get_sub_group_local_id() == 0) {
bp_ids_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP(bp_atomics+SKC_BP_ATOMIC_OFFSET_WRITES,count_blocks);
}
bp_ids_base = sub_group_broadcast(bp_ids_base,0);
//
// mask off everything but the block id
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \
h##I = h##I & SKC_TTXK_LO_MASK_ID; \
}
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
//
// swap current id with next
//
if (get_sub_group_local_id() == SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1)
{
skc_block_id_t const next = SKC_CONCAT(h,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST);
SKC_CONCAT(h,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST) = id;
id = next;
#if 0
printf("rasters next = %u\n",id);
#endif
}
#if 0
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
printf("%08X %u\n",h##I,h##I);
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
#endif
#if 0
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \
printf("%08X\n",h##I); \
}
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
#endif
//
// - we'll skip subgroups that are entirely header
//
// - but we need to mark any header elements that partially fill
// a subgroup as subblocks
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \
if (SKC_RASTERS_RECLAIM_PARTIALLY_HEADER(I)) { \
if (SKC_RASTERS_RECLAIM_IS_HEADER(I)) { \
h##I = SKC_UINT_MAX; \
} \
} \
}
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
{
//
// count reclaimable blocks in each lane
//
SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 );
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \
packed_count |= SKC_RASTERS_RECLAIM_PACKED_COUNT_IS_BLOCK(h##I,I); \
}
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
//
// scan to find index of each block
//
SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 );
SKC_RASTERS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count);
//
// store blocks back to ring
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \
skc_uint const index = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \
skc_uint const count = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \
skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask; \
if (count > 0) { \
bp_ids[bp_ids_idx] = h##I; \
} \
skc_uint const total = index + count; \
bp_ids_base += sub_group_broadcast(total,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE-1); \
}
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
}
// printf("R %7u ! %u\n",bp_ids_idx,h##I);
//
// we're done if it was just the header
//
if (count_nodes == 0)
return;
//
// otherwise, walk the nodes
//
do {
// id of next block is in last lane
id = sub_group_broadcast(id,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE-1);
//
// load all of the node block ttxk.lo keys into registers
//
// FIXME -- this pattern lends itself to using the higher
// performance Intel GEN block load instructions
//
skc_uint const node_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_RASTERS_RECLAIM_STRIDE_H(get_sub_group_local_id());
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
skc_uint n##I = bp_elems[node_id + SKC_RASTERS_RECLAIM_STRIDE_V_LO(I)];
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
//
// mask off everything but the block id
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
n##I = n##I & SKC_TTXK_LO_MASK_ID;
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
//
// swap current id with next
//
if (get_sub_group_local_id() == SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1)
{
skc_block_id_t const next = SKC_CONCAT(n,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST);
SKC_CONCAT(n,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST) = id;
id = next;
#if 0
printf("rasters next = %u\n",id);
#endif
}
#if 0
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
printf("%08X %u\n",n##I,n##I);
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
#endif
//
// count reclaimable blocks in each lane
//
SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 );
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
packed_count |= SKC_RASTERS_RECLAIM_PACKED_COUNT_IS_BLOCK(n##I,I);
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
//
// scan to find index of each block
//
SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 );
SKC_RASTERS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count);
//
// store blocks back to ring
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) { \
skc_uint const index = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \
skc_uint const count = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \
skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask; \
if (count > 0) { \
bp_ids[bp_ids_idx] = n##I; \
} \
skc_uint const total = index + count; \
bp_ids_base += sub_group_broadcast(total,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE-1); \
}
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
// printf("R %7u ! %u\n",bp_ids_idx,n##I);
// any more nodes?
} while (--count_nodes > 0);
}
}
//
//
//
/*
* Copyright 2017 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can
* be found in the LICENSE file.
*
*/
//
//
//
#include "tile.h"
#include "block.h"
#include "raster.h"
#include "common.h"
#include "atomic_cl.h"
#include "block_pool_cl.h"
#include "kernel_cl_12.h"
//
//
//
#define SKC_RASTERS_RECLAIM_SUBGROUP_SIZE_MASK (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1)
#define SKC_RASTERS_RECLAIM_SUBGROUP_WORDS (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE * SKC_RASTERS_RECLAIM_LOCAL_ELEMS)
#define SKC_RASTERS_RECLAIM_X (SKC_DEVICE_BLOCK_DWORDS / SKC_RASTERS_RECLAIM_SUBGROUP_WORDS)
//
//
//
#if ( SKC_RASTERS_RECLAIM_X == 1 )
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_1()
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 0
#elif ( SKC_RASTERS_RECLAIM_X == 2 )
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_2()
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 1
#elif ( SKC_RASTERS_RECLAIM_X == 4 )
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_4()
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 3
#elif ( SKC_RASTERS_RECLAIM_X == 8 )
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_8()
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 7
#elif ( SKC_RASTERS_RECLAIM_X == 16)
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_16()
#define SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST 15
#else
#error "MISSING SKC_RASTERS_RECLAIM_X"
#endif
#if ( SKC_PREFIX_SUBGROUP_SIZE == SKC_RASTERS_RECLAIM_SUBGROUP_SIZE )
#define SKC_RASTERS_RECLAIM_STRIDE_H(L) (L)
#define SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) (I * 2 * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
#define SKC_RASTERS_RECLAIM_STRIDE_V_HI(I) (SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) + SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
#elif ( SKC_PREFIX_SUBGROUP_SIZE > SKC_RASTERS_RECLAIM_SUBGROUP_SIZE ) // same as above when ratio equals 1
#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO (SKC_PREFIX_SUBGROUP_SIZE / SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK (SKC_RASTERS_RECLAIM_SUBGROUP_RATIO - 1)
#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_SCALE(I) ((I / SKC_RASTERS_RECLAIM_SUBGROUP_RATIO) * 2 * SKC_RASTERS_RECLAIM_SUBGROUP_RATIO + \
(I & SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK))
#define SKC_RASTERS_RECLAIM_STRIDE_H(L) (L)
#define SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) (SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_SCALE(I) * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
#define SKC_RASTERS_RECLAIM_STRIDE_V_HI(I) (SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) + SKC_RASTERS_RECLAIM_SUBGROUP_RATIO * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
#elif ( SKC_PREFIX_SUBGROUP_SIZE < SKC_RASTERS_RECLAIM_SUBGROUP_SIZE ) // same as above when ratio equals 1
#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE / SKC_PREFIX_SUBGROUP_SIZE)
#define SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK (SKC_RASTERS_RECLAIM_SUBGROUP_SIZE / SKC_RASTERS_RECLAIM_SUBGROUP_RATIO - 1) // equal to prefix subgroup mask
#define SKC_RASTERS_RECLAIM_STRIDE_H(L) (((L) & ~SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK) * 2 + ((L) & SKC_RASTERS_RECLAIM_SUBGROUP_RATIO_MASK))
#define SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) (I * 2 * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
#define SKC_RASTERS_RECLAIM_STRIDE_V_HI(I) (SKC_RASTERS_RECLAIM_STRIDE_V_LO(I) + SKC_RASTERS_RECLAIM_SUBGROUP_SIZE / SKC_RASTERS_RECLAIM_SUBGROUP_RATIO)
#endif
//
// FIXME -- slate these for replacement
//
#define SKC_BROADCAST(E,S,I) \
sub_group_broadcast(E,S - I * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
#define SKC_BROADCAST_LAST_HELPER(E,I) \
sub_group_broadcast(E,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1)
#define SKC_BROADCAST_LAST(E,I) \
SKC_BROADCAST_LAST_HELPER(E,I)
//
// COMPILE-TIME PREDICATES
//
#define SKC_RASTERS_RECLAIM_ELEM_GTE(X,I) \
SKC_GTE_MACRO(X,(I+1) * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
#define SKC_RASTERS_RECLAIM_ELEM_IN_RANGE(X,I) \
(skc_bool)SKC_GTE_MACRO(X, I * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE) && \
(skc_bool)SKC_LT_MACRO(X,(I+1) * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)
#define SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I) \
SKC_RASTERS_RECLAIM_ELEM_GTE(SKC_RASTER_HEAD_DWORDS,I)
#define SKC_RASTERS_RECLAIM_PARTIALLY_HEADER(I) \
SKC_RASTERS_RECLAIM_ELEM_IN_RANGE(SKC_RASTER_HEAD_DWORDS,I)
//
// RUN-TIME PREDICATES
//
#define SKC_RASTERS_RECLAIM_IS_HEADER(I) \
(get_sub_group_local_id() + I * SKC_RASTERS_RECLAIM_SUBGROUP_SIZE < SKC_RASTER_HEAD_DWORDS)
//
// FIXME -- THIS BITFIELD SCAN APPROACH CAN BE PARAMETERIZED FOR ALL
// POSSIBLE PRACTICAL POWER-OF-TWO SUBGROUP AND SUBBLOCKS-PER-BLOCK
// COMBOS (NOT NECESSARILY POW2)
//
// FOR WIDER SUBGROUPS WITH BIG BLOCKS, WE WILL WANT TO USE A VECTOR
// UINT TYPE INSTEAD OF A ULONG.
//
#define SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS SKC_RASTERS_RECLAIM_SUBGROUP_SIZE_LOG2
#define SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE skc_uint
//
//
//
#define SKC_RASTERS_RECLAIM_PACKED_COUNT_MASK SKC_BITS_TO_MASK(SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS)
#define SKC_RASTERS_RECLAIM_PACKED_COUNT_IS_BLOCK(E,I) \
(((E) & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) \
? 0 : (1u << SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS * I))
#define SKC_RASTERS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(S,C) \
S = sub_group_scan_exclusive_add(C)
#define SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(C,I) \
(((C) >> (SKC_RASTERS_RECLAIM_PACKED_COUNT_BITS * I)) & SKC_RASTERS_RECLAIM_PACKED_COUNT_MASK)
//
//
//
struct skc_reclaim
{
skc_raster_h aN[SKC_RECLAIM_ARRAY_SIZE];
};
__kernel
SKC_RASTERS_RECLAIM_KERNEL_ATTRIBS
void
skc_kernel_rasters_reclaim(__global skc_block_id_t * const bp_ids, // block pool ids ring
__global skc_uint * const bp_elems, // block pool blocks
__global skc_uint volatile * const bp_atomics, // read/write atomics
skc_uint const bp_mask, // pow2 modulo mask for block pool ring
__global skc_block_id_t const * const map, // raster host-to-device map
struct skc_reclaim const reclaim) // array of host raster ids
{
#if (__OPENCL_VERSION__ < 200)
skc_uint const reclaim_stride = get_num_sub_groups();
#else
skc_uint const reclaim_stride = get_enqueued_num_sub_groups(); // 2.0 supports non-uniform workgroups
#endif
skc_uint reclaim_idx = get_group_id(0) * reclaim_stride + get_sub_group_id();
#if 0
//
// NOTE -- FOR NOW, THIS KERNEL ALWAYS LAUNCHES FIXED SIZE GRIDS BUT
// WE MIGHT WANT TO HAVE THE GRID LIMIT ITSELF TO A FRACTIONAL
// MULTIPROCESSOR IN ORDER TO MINIMIZE THE IMPACT OF A LARGE
// RECLAMATION JOB ON THE REST OF THE PIPELINE.
//
for (; reclaim_idx < SKC_RECLAIM_ARRAY_SIZE; reclaim_idx+=reclaim_stride)
#endif
{
// get host raster id
skc_raster_h const raster = reclaim.aN[reclaim_idx];
// get block id of raster header
skc_block_id_t id = map[raster];
//
// load all of the head block ttxk.lo keys into registers
//
// FIXME -- this pattern lends itself to using the higher
// performance Intel GEN block load instructions
//
skc_uint const head_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_RASTERS_RECLAIM_STRIDE_H(get_sub_group_local_id());
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
skc_uint h##I = bp_elems[head_id + SKC_RASTERS_RECLAIM_STRIDE_V_LO(I)];
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
//
// pick out count.nodes and count.prims from the header
//
// load raster header counts -- we only need the blocks and
// nodes words; the keys are doublewords.
//
// FIXME -- this can be made portable with compile-time macro expansion
//
skc_uint count_blocks = sub_group_broadcast(h0,0); // SKC_RASTER_HEAD_OFFSET_COUNTS_NODES
skc_uint count_nodes = sub_group_broadcast(h0,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_KEYS
#if 0
if (get_sub_group_local_id() == 0) {
printf("reclaim rasters: %u / %u / %5u / %5u\n",raster,id,count_blocks,count_nodes);
}
#endif
//
// acquire a span in the block pool ids ring for reclaimed ids
//
skc_uint bp_ids_base = 0;
if (get_sub_group_local_id() == 0) {
bp_ids_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP(bp_atomics+SKC_BP_ATOMIC_OFFSET_WRITES,count_blocks);
}
bp_ids_base = sub_group_broadcast(bp_ids_base,0);
//
// mask off everything but the block id
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \
h##I = h##I & SKC_TTXK_LO_MASK_ID; \
}
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
//
// swap current id with next
//
if (get_sub_group_local_id() == SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1)
{
skc_block_id_t const next = SKC_CONCAT(h,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST);
SKC_CONCAT(h,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST) = id;
id = next;
#if 0
printf("rasters next = %u\n",id);
#endif
}
#if 0
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
printf("%08X %u\n",h##I,h##I);
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
#endif
#if 0
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \
printf("%08X\n",h##I); \
}
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
#endif
//
// - we'll skip subgroups that are entirely header
//
// - but we need to mark any header elements that partially fill
// a subgroup as subblocks
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \
if (SKC_RASTERS_RECLAIM_PARTIALLY_HEADER(I)) { \
if (SKC_RASTERS_RECLAIM_IS_HEADER(I)) { \
h##I = SKC_UINT_MAX; \
} \
} \
}
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
{
//
// count reclaimable blocks in each lane
//
SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 );
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \
packed_count |= SKC_RASTERS_RECLAIM_PACKED_COUNT_IS_BLOCK(h##I,I); \
}
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
//
// scan to find index of each block
//
SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 );
SKC_RASTERS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count);
//
// store blocks back to ring
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
if (!SKC_RASTERS_RECLAIM_ENTIRELY_HEADER(I)) { \
skc_uint const index = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \
skc_uint const count = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \
skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask; \
if (count > 0) { \
bp_ids[bp_ids_idx] = h##I; \
} \
skc_uint const total = index + count; \
bp_ids_base += sub_group_broadcast(total,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE-1); \
}
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
}
// printf("R %7u ! %u\n",bp_ids_idx,h##I);
//
// we're done if it was just the header
//
if (count_nodes == 0)
return;
//
// otherwise, walk the nodes
//
do {
// id of next block is in last lane
id = sub_group_broadcast(id,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE-1);
//
// load all of the node block ttxk.lo keys into registers
//
// FIXME -- this pattern lends itself to using the higher
// performance Intel GEN block load instructions
//
skc_uint const node_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_RASTERS_RECLAIM_STRIDE_H(get_sub_group_local_id());
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
skc_uint n##I = bp_elems[node_id + SKC_RASTERS_RECLAIM_STRIDE_V_LO(I)];
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
//
// mask off everything but the block id
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
n##I = n##I & SKC_TTXK_LO_MASK_ID;
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
//
// swap current id with next
//
if (get_sub_group_local_id() == SKC_RASTERS_RECLAIM_SUBGROUP_SIZE - 1)
{
skc_block_id_t const next = SKC_CONCAT(n,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST);
SKC_CONCAT(n,SKC_RASTERS_RECLAIM_BLOCK_EXPAND_I_LAST) = id;
id = next;
#if 0
printf("rasters next = %u\n",id);
#endif
}
#if 0
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
printf("%08X %u\n",n##I,n##I);
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
#endif
//
// count reclaimable blocks in each lane
//
SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 );
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) \
packed_count |= SKC_RASTERS_RECLAIM_PACKED_COUNT_IS_BLOCK(n##I,I);
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
//
// scan to find index of each block
//
SKC_RASTERS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 );
SKC_RASTERS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count);
//
// store blocks back to ring
//
#undef SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) { \
skc_uint const index = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \
skc_uint const count = SKC_RASTERS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \
skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask; \
if (count > 0) { \
bp_ids[bp_ids_idx] = n##I; \
} \
skc_uint const total = index + count; \
bp_ids_base += sub_group_broadcast(total,SKC_RASTERS_RECLAIM_SUBGROUP_SIZE-1); \
}
SKC_RASTERS_RECLAIM_BLOCK_EXPAND();
// printf("R %7u ! %u\n",bp_ids_idx,n##I);
// any more nodes?
} while (--count_nodes > 0);
}
}
//
//
//
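The SKC_RASTERS_RECLAIM_PACKED_COUNT_* machinery above packs one small counter per register row into a single uint, SKC_RASTERS_RECLAIM_SUBGROUP_SIZE_LOG2 bits per field, so that a single subgroup exclusive scan ranks every row at once. A minimal sketch of the trick, assuming an 8-lane subgroup, two register rows per lane, and illustrative buffer names; the fields cannot carry into each other because an exclusive sum over at most 7 lanes of 0/1 counts fits in 3 bits:

__kernel __attribute__((intel_reqd_sub_group_size(8)))
void
sketch_packed_scan(__global uint const * const is_block, // 0 or 1 per element, 16 per subgroup
                   __global uint       * const rank_out) // rank of each block within its row
{
  uint const lane = get_sub_group_local_id();
  uint const base = (get_global_id(0) - lane) * 2;  // 16 elements per 8-lane subgroup
  // pack one 3-bit count per register row into a single word
  uint packed_count = 0;
  packed_count |= (is_block[base + lane    ] & 1);      // row 0 -> bits 0..2
  packed_count |= (is_block[base + lane + 8] & 1) << 3; // row 1 -> bits 3..5
  // one exclusive scan ranks both rows across the subgroup at once
  uint const packed_index = sub_group_scan_exclusive_add(packed_count);
  // unpack the per-row ranks (only meaningful where is_block is 1)
  rank_out[base + lane    ] = (packed_index     ) & 0x7;
  rank_out[base + lane + 8] = (packed_index >> 3) & 0x7;
}

The reclaim kernels then turn each rank into a ring slot and advance the ring base by each row's total, broadcast from the last lane, before storing the next row.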

File diff suppressed because it is too large Load Diff

View File

@ -1,130 +1,130 @@
/*
* Copyright 2018 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can
* be found in the LICENSE file.
*
*/
//
// NOTE THAT THE SEGMENT TTCK KERNEL IS ENTIRELY DEPENDENT ON THE
// LAYOUT OF THE TTCK KEY. IF THE TTCK KEY IS ALTERED THEN THIS
// KERNEL WILL NEED TO BE UPDATED
//
#include "tile.h"
#include "atomic_cl.h"
#include "device_cl_12.h"
//
//
//
#define HS_KEYS_PER_SLAB (HS_KEYS_PER_LANE * HS_LANES_PER_WARP)
#define HS_LANE_MASK (HS_LANES_PER_WARP - 1)
//
//
//
#define SKC_YX_NEQ(row,prev) \
(((as_uint2(r##row).hi ^ as_uint2(r##prev).hi) & SKC_TTCK_HI_MASK_YX) != 0)
//
//
//
__kernel
__attribute__((intel_reqd_sub_group_size(HS_LANES_PER_WARP)))
void
skc_kernel_segment_ttck(__global HS_KEY_TYPE * SKC_RESTRICT const vout,
__global uint * SKC_RESTRICT const indices,
__global SKC_ATOMIC_UINT volatile * SKC_RESTRICT const atomics)
{
uint const global_id = get_global_id(0);
uint const gmem_base = (global_id >> HS_LANES_PER_WARP_LOG2) * HS_KEYS_PER_SLAB;
uint const gmem_idx = gmem_base + (global_id & HS_LANE_MASK);
uint const lane_idx = gmem_base + (global_id & HS_LANE_MASK) * HS_KEYS_PER_LANE;
//
// LOAD ALL THE ROWS
//
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
HS_KEY_TYPE const r##row = (vout + gmem_idx)[prev * HS_LANES_PER_WARP];
HS_SLAB_ROWS();
//
// LOAD LAST REGISTER FROM COLUMN TO LEFT
//
uint diffs = 0;
uint2 r0 = r1;
if (gmem_base > 0) {
// if this is any slab but the first then broadcast
// load the last key in the previous slab
r0.hi = as_uint2(vout[gmem_base - 1]).hi;
} else if (get_sub_group_local_id() == 0) {
// if this is the first lane in the first slab
diffs = 1;
}
// now shuffle in the last key from the column to the left
r0.hi = intel_sub_group_shuffle_up(r0.hi,as_uint2(HS_REG_LAST(r)).hi,1);
//
// FIND ALL DIFFERENCES IN SLAB
//
uint valid = 0;
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
valid |= ((r##row != SKC_ULONG_MAX) << prev);
HS_SLAB_ROWS();
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
diffs |= (SKC_YX_NEQ(row,prev) << prev);
HS_SLAB_ROWS();
//
// SUM UP THE DIFFERENCES
//
uint const valid_diffs = valid & diffs;
uint const count = popcount(valid_diffs);
uint const inclusive = sub_group_scan_inclusive_add(count);
uint const exclusive = inclusive - count;
//
// RESERVE SPACE IN THE INDICES ARRAY
//
uint next = 0;
if (get_sub_group_local_id() == HS_LANES_PER_WARP-1)
next = atomic_add(atomics+1,inclusive); // FIXME -- need a symbolic offset
// distribute base across subgroup
next = exclusive + sub_group_broadcast(next,HS_LANES_PER_WARP-1);
//
// STORE THE INDICES
//
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
if (valid_diffs & (1 << prev)) \
indices[next++] = lane_idx + prev;
HS_SLAB_ROWS();
//
// TRANSPOSE THE SLAB AND STORE IT
//
HS_TRANSPOSE_SLAB();
}
//
//
//
/*
* Copyright 2018 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can
* be found in the LICENSE file.
*
*/
//
// NOTE THAT THE SEGMENT TTCK KERNEL IS ENTIRELY DEPENDENT ON THE
// LAYOUT OF THE TTCK KEY. IF THE TTCK KEY IS ALTERED THEN THIS
// KERNEL WILL NEED TO BE UPDATED
//
#include "tile.h"
#include "atomic_cl.h"
#include "kernel_cl_12.h"
//
//
//
#define HS_KEYS_PER_SLAB (HS_KEYS_PER_LANE * HS_LANES_PER_WARP)
#define HS_LANE_MASK (HS_LANES_PER_WARP - 1)
//
//
//
#define SKC_YX_NEQ(row,prev) \
(((as_uint2(r##row).hi ^ as_uint2(r##prev).hi) & SKC_TTCK_HI_MASK_YX) != 0)
//
//
//
__kernel
__attribute__((intel_reqd_sub_group_size(HS_LANES_PER_WARP)))
void
skc_kernel_segment_ttck(__global HS_KEY_TYPE * SKC_RESTRICT const vout,
__global uint * SKC_RESTRICT const indices,
__global SKC_ATOMIC_UINT volatile * SKC_RESTRICT const atomics)
{
uint const global_id = get_global_id(0);
uint const gmem_base = (global_id >> HS_LANES_PER_WARP_LOG2) * HS_KEYS_PER_SLAB;
uint const gmem_idx = gmem_base + (global_id & HS_LANE_MASK);
uint const lane_idx = gmem_base + (global_id & HS_LANE_MASK) * HS_KEYS_PER_LANE;
//
// LOAD ALL THE ROWS
//
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
HS_KEY_TYPE const r##row = (vout + gmem_idx)[prev * HS_LANES_PER_WARP];
HS_SLAB_ROWS();
//
// LOAD LAST REGISTER FROM COLUMN TO LEFT
//
uint diffs = 0;
uint2 r0 = r1;
if (gmem_base > 0) {
// if this is any slab but the first then broadcast
// load the last key in the previous slab
r0.hi = as_uint2(vout[gmem_base - 1]).hi;
} else if (get_sub_group_local_id() == 0) {
// if this is the first lane in the first slab
diffs = 1;
}
// now shuffle in the last key from the column to the left
r0.hi = intel_sub_group_shuffle_up(r0.hi,as_uint2(HS_REG_LAST(r)).hi,1);
//
// FIND ALL DIFFERENCES IN SLAB
//
uint valid = 0;
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
valid |= ((r##row != SKC_ULONG_MAX) << prev);
HS_SLAB_ROWS();
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
diffs |= (SKC_YX_NEQ(row,prev) << prev);
HS_SLAB_ROWS();
//
// SUM UP THE DIFFERENCES
//
uint const valid_diffs = valid & diffs;
uint const count = popcount(valid_diffs);
uint const inclusive = sub_group_scan_inclusive_add(count);
uint const exclusive = inclusive - count;
//
// RESERVE SPACE IN THE INDICES ARRAY
//
uint next = 0;
if (get_sub_group_local_id() == HS_LANES_PER_WARP-1)
next = atomic_add(atomics+1,inclusive); // FIXME -- need a symbolic offset
// distribute base across subgroup
next = exclusive + sub_group_broadcast(next,HS_LANES_PER_WARP-1);
//
// STORE THE INDICES
//
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
if (valid_diffs & (1 << prev)) \
indices[next++] = lane_idx + prev;
HS_SLAB_ROWS();
//
// TRANSPOSE THE SLAB AND STORE IT
//
HS_TRANSPOSE_SLAB();
}
//
//
//
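skc_kernel_segment_ttck finds y/x boundaries by comparing each key against the key to its left (shuffled in from the neighboring lane, or loaded from the previous slab), ranks the boundaries with a scan, and reserves space in the indices array with one atomic per subgroup. A stripped-down sketch of that flow for a plain sorted uint array, assuming the cl_intel_subgroups shuffle and illustrative names (sketch_segment_starts, keys, starts, counter):

__kernel __attribute__((intel_reqd_sub_group_size(8)))
void
sketch_segment_starts(__global uint const    * const keys,    // sorted keys
                      __global uint          * const starts,  // index of each run's first key
                      __global uint volatile * const counter) // output cursor
{
  uint const gid  = get_global_id(0);
  uint const lane = get_sub_group_local_id();
  uint const key  = keys[gid];
  // every lane broadcast-loads the key just left of this subgroup's span,
  // then a shuffle hands each lane its immediate left neighbor's key
  uint left = (gid >= lane + 1) ? keys[gid - lane - 1] : key;
  left      = intel_sub_group_shuffle_up(left,key,1);
  uint const diff = (gid == 0) || (key != left);
  // rank the boundaries and reserve a span with a single atomic
  uint const incl = sub_group_scan_inclusive_add(diff);
  uint next = 0;
  if (lane == get_sub_group_size() - 1)
    next = atomic_add(counter,incl);
  next = sub_group_broadcast(next,get_sub_group_size() - 1) + incl - diff;
  if (diff)
    starts[next] = gid;
}

The real kernel works on 16 keys per lane, masks out invalid (SKC_ULONG_MAX) keys, and compares only the y/x bits of the TTCK key, but the reserve-and-scatter structure is the same.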

View File

@ -1,394 +1,394 @@
/*
* Copyright 2018 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can
* be found in the LICENSE file.
*
*/
//
// NOTE THAT THE SEGMENT TTRK KERNEL IS ENTIRELY DEPENDENT ON THE
// LAYOUT OF THE TTRK KEY. IF THE TTRK KEY IS ALTERED THEN THIS
// KERNEL WILL NEED TO BE UPDATED
//
#include "tile.h"
#include "raster_builder_cl_12.h" // need meta_in structure
#include "device_cl_12.h"
//
//
//
#define HS_KEYS_PER_SLAB (HS_KEYS_PER_LANE * HS_LANES_PER_WARP)
#define HS_LANE_MASK (HS_LANES_PER_WARP - 1)
//
// THE BEST TYPE TO ZERO SMEM
//
#define SKC_ZERO_TYPE ulong
#define SKC_ZERO_WORDS 2
//
// THE ORDER OF COMPONENTS IS:
//
// 0: blocks
// 1: offset
// 2: pk
// 3: rk
//
#if (HS_KEYS_PER_SLAB < 256)
#define SKC_META_TYPE uint
#define SKC_META_WORDS 1
#define SKC_COMPONENT_TYPE uchar
#else
#define SKC_META_TYPE uint2
#define SKC_META_WORDS 2
#define SKC_COMPONENT_TYPE ushort
#endif
//
//
//
#if ( SKC_TTRK_HI_BITS_COHORT <= 8)
#define SKC_COHORT_TYPE uchar
#else
#define SKC_COHORT_TYPE ushort
#endif
//
//
//
#define SKC_COHORT_ID(row) \
as_uint2(r##row).hi >> SKC_TTRK_HI_OFFSET_COHORT
//
// FIXME -- THIS WILL BREAK IF EITHER THE YX BITS OR OFFSET ARE CHANGED
//
#define SKC_IS_BLOCK(row) \
((as_uint2(r##row).lo & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0)
#define SKC_YX(row,prev) \
(as_uint2(r##row).hi ^ as_uint2(r##prev).hi)
#define SKC_IS_PK(row,prev) \
((uint)(SKC_YX(row,prev) - 1) < SKC_TTRK_HI_MASK_X)
//
// COHORT SIZE IS ALWAYS A POWER-OF-TWO
// SUBGROUP SIZE IS ALWAYS A POWER-OF-TWO
//
// COHORT SIZE >= SUBGROUP SIZE
//
#define SKC_COHORT_SIZE (1<<SKC_TTRK_HI_BITS_COHORT)
#define SKC_ZERO_RATIO (SKC_ZERO_WORDS / SKC_META_WORDS)
#define SKC_META_ZERO_COUNT (SKC_COHORT_SIZE * sizeof(SKC_META_TYPE) / sizeof(SKC_ZERO_TYPE))
#define SKC_META_ZERO_REM (SKC_META_ZERO_COUNT & SKC_BITS_TO_MASK(HS_LANES_PER_WARP_LOG2))
#define SKC_META_COMPONENTS 4
#define SKC_META_COMPONENT_COUNT (SKC_COHORT_SIZE * sizeof(SKC_META_TYPE) / sizeof(SKC_COMPONENT_TYPE))
//
//
//
__kernel
__attribute__((intel_reqd_sub_group_size(HS_LANES_PER_WARP)))
void
skc_kernel_segment_ttrk(__global HS_KEY_TYPE * SKC_RESTRICT const vout,
__global uint * SKC_RESTRICT const metas)
{
__local union
{
SKC_META_TYPE volatile m[SKC_COHORT_SIZE];
SKC_ZERO_TYPE z[SKC_META_ZERO_COUNT];
SKC_COMPONENT_TYPE c[SKC_META_COMPONENT_COUNT];
} shared;
uint const global_id = get_global_id(0);
uint const gmem_base = (global_id >> HS_LANES_PER_WARP_LOG2) * HS_KEYS_PER_SLAB;
uint const gmem_idx = gmem_base + (global_id & HS_LANE_MASK);
uint const gmem_off = (global_id & HS_LANE_MASK) * HS_KEYS_PER_LANE;
//
// LOAD ALL THE ROWS
//
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
HS_KEY_TYPE const r##row = (vout + gmem_idx)[prev * HS_LANES_PER_WARP];
HS_SLAB_ROWS();
//
// LOAD LAST REGISTER FROM COLUMN TO LEFT
//
uint diffs = 0;
uint2 r0 = 0;
if (gmem_base > 0) {
// if this is any slab but the first then broadcast
// load the last key in the previous slab
r0.hi = as_uint2(vout[gmem_base - 1]).hi;
} else {
// otherwise broadcast the first key in the first slab
r0.hi = sub_group_broadcast(as_uint2(r1).hi,0);
// and mark it as an implicit diff
if (get_sub_group_local_id() == 0)
diffs = 1;
}
// now shuffle in the last key from the column to the left
r0.hi = intel_sub_group_shuffle_up(r0.hi,as_uint2(HS_REG_LAST(r)).hi,1);
// shift away y/x
SKC_COHORT_TYPE const c0 = r0.hi >> SKC_TTRK_HI_OFFSET_COHORT;
//
// EXTRACT ALL COHORT IDS EARLY...
//
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
SKC_COHORT_TYPE c##row = SKC_COHORT_ID(row);
HS_SLAB_ROWS();
//
// DEBUG
//
#if 0
if (gmem_base == HS_KEYS_PER_SLAB * 7)
{
if (get_sub_group_local_id() == 0)
printf("\n%llX ",as_ulong(r0));
else
printf("%llX ",as_ulong(r0));
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
if (get_sub_group_local_id() == 0) \
printf("\n%llX ",r##row); \
else \
printf("%llX ",r##row);
HS_SLAB_ROWS();
}
#endif
//
// CAPTURE ALL CONDITIONS WE CARE ABOUT
//
// Diffs must be captured before cohorts
//
uint valid = 0;
uint blocks = 0;
uint pks = 0;
SKC_COHORT_TYPE c_max = 0;
//
// FIXME -- IT'S UNCLEAR IF SHIFTING THE CONDITION CODE VS. AN
// EXPLICIT PREDICATE WILL GENERATE THE SAME CODE
//
#if 0
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
diffs |= ((c##row != c##prev) << prev);
HS_SLAB_ROWS();
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
blocks |= (SKC_IS_BLOCK(row) << prev);
HS_SLAB_ROWS();
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
pks |= (SKC_IS_PK(row,prev) << prev);
HS_SLAB_ROWS();
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
valid |= ((r##row != SKC_ULONG_MAX) << prev);
HS_SLAB_ROWS();
#else
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
if (c##row != c##prev) \
diffs |= 1<<prev;
HS_SLAB_ROWS();
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
if (SKC_IS_BLOCK(row)) \
blocks |= 1<<prev;
HS_SLAB_ROWS();
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
if (SKC_IS_PK(row,prev)) \
pks |= 1<<prev;
HS_SLAB_ROWS();
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
if (r##row != SKC_ULONG_MAX) { \
valid |= 1<<prev; \
c_max = max(c_max,c##row); \
}
HS_SLAB_ROWS();
#endif
//
// TRANSPOSE THE SLAB AND STORE IT
//
HS_TRANSPOSE_SLAB();
// the min cohort is the first key in the slab
uint const c_min = sub_group_broadcast(c1,0);
// the max cohort is the max across all lanes
c_max = sub_group_reduce_max(c_max);
#if 0 // REMOVE ME LATER
if (get_sub_group_local_id() == 0)
printf("%3u : ( %3u , %3u )\n",
get_global_id(0)>>HS_LANES_PER_WARP_LOG2,c_min,c_max);
#endif
//
// ZERO SMEM
//
// zero only the meta info for the cohort ids found in this slab
//
#if (SKC_ZERO_WORDS >= SKC_META_WORDS)
uint zz = ((c_min / SKC_ZERO_RATIO) & ~HS_LANE_MASK) + get_sub_group_local_id();
uint const zz_max = (c_max + SKC_ZERO_RATIO - 1) / SKC_ZERO_RATIO;
for (; zz<=zz_max; zz+=HS_LANES_PER_WARP)
shared.z[zz] = 0;
#else
// ERROR -- it's highly unlikely that the zero type is smaller than
// the meta type
#error("Unsupported right now...")
#endif
//
// ACCUMULATE AND STORE META INFO
//
uint const valid_blocks = valid & blocks;
uint const valid_pks = valid & pks & ~diffs;
SKC_META_TYPE meta = ( 0 );
#define SKC_META_LOCAL_ADD(meta) \
atomic_add(shared.m+HS_REG_LAST(c),meta);
#define SKC_META_LOCAL_STORE(meta,prev) \
shared.m[c##prev] = meta;
// note this is purposefully off by +1
#define SKC_META_RESET(meta,curr) \
meta = ((gmem_off + curr) << 8);
#if 0
// FIXME -- this can be tweaked to shift directly
#define SKC_META_ADD(meta,prev,blocks,pks,rks) \
meta += ((((blocks >> prev) & 1) ) | \
(((pks >> prev) & 1) << 16) | \
(((rks >> prev) & 1) << 24));
#else
#define SKC_META_ADD(meta,prev,blocks,pks,rks) \
if (blocks & (1<<prev)) \
meta += 1; \
if (pks & (1<<prev)) \
meta += 1<<16; \
if (rks & (1<<prev)) \
meta += 1<<24;
#endif
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
if (diffs & (1<<prev)) { \
SKC_META_LOCAL_STORE(meta,prev); \
SKC_META_RESET(meta,row); \
} \
SKC_META_ADD(meta,prev, \
valid_blocks, \
valid_pks, \
valid);
HS_SLAB_ROWS();
//
// ATOMICALLY ADD THE CARRIED OUT METAS
//
#if 0 // BUG
if ((valid & (1<<(HS_KEYS_PER_LANE-1))) && (meta != 0))
SKC_META_LOCAL_ADD(meta);
#else
if (meta != 0)
SKC_META_LOCAL_ADD(meta);
#endif
//
// NOW ATOMICALLY ADD ALL METAS TO THE GLOBAL META TABLE
//
// convert the slab offset to an extent offset
bool const is_offset = (get_sub_group_local_id() & 3) == 1;
uint const adjust = is_offset ? gmem_base - 1 : 0;
//
// only process the meta components found in this slab
//
uint const cc_min = c_min * SKC_META_COMPONENTS;
uint const cc_max = c_max * SKC_META_COMPONENTS + SKC_META_COMPONENTS - 1;
uint cc = (cc_min & ~HS_LANE_MASK) + get_sub_group_local_id();
if ((cc >= cc_min) && (cc <= cc_max))
{
uint const c = shared.c[cc];
if (c != 0)
atomic_add(metas+cc,c+adjust);
}
cc += HS_LANES_PER_WARP;
for (; cc<=cc_max; cc+=HS_LANES_PER_WARP)
{
uint const c = shared.c[cc];
if (c != 0)
atomic_add(metas+cc,c+adjust);
}
}
//
//
//
/*
* Copyright 2018 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can
* be found in the LICENSE file.
*
*/
//
// NOTE THAT THE SEGMENT TTRK KERNEL IS ENTIRELY DEPENDENT ON THE
// LAYOUT OF THE TTRK KEY. IF THE TTRK KEY IS ALTERED THEN THIS
// KERNEL WILL NEED TO BE UPDATED
//
#include "tile.h"
#include "raster_builder_cl_12.h" // need meta_in structure
#include "kernel_cl_12.h"
//
//
//
#define HS_KEYS_PER_SLAB (HS_KEYS_PER_LANE * HS_LANES_PER_WARP)
#define HS_LANE_MASK (HS_LANES_PER_WARP - 1)
//
// THE BEST TYPE TO ZERO SMEM
//
#define SKC_ZERO_TYPE ulong
#define SKC_ZERO_WORDS 2
//
// THE ORDER OF COMPONENTS IS:
//
// 0: blocks
// 1: offset
// 2: pk
// 3: rk
//
#if (HS_KEYS_PER_SLAB < 256)
#define SKC_META_TYPE uint
#define SKC_META_WORDS 1
#define SKC_COMPONENT_TYPE uchar
#else
#define SKC_META_TYPE uint2
#define SKC_META_WORDS 2
#define SKC_COMPONENT_TYPE ushort
#endif
//
//
//
#if ( SKC_TTRK_HI_BITS_COHORT <= 8)
#define SKC_COHORT_TYPE uchar
#else
#define SKC_COHORT_TYPE ushort
#endif
//
//
//
#define SKC_COHORT_ID(row) \
as_uint2(r##row).hi >> SKC_TTRK_HI_OFFSET_COHORT
//
// FIXME -- THIS WILL BREAK IF EITHER THE YX BITS OR OFFSET ARE CHANGED
//
#define SKC_IS_BLOCK(row) \
((as_uint2(r##row).lo & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0)
#define SKC_YX(row,prev) \
(as_uint2(r##row).hi ^ as_uint2(r##prev).hi)
#define SKC_IS_PK(row,prev) \
((uint)(SKC_YX(row,prev) - 1) < SKC_TTRK_HI_MASK_X)
//
// COHORT SIZE IS ALWAYS A POWER-OF-TWO
// SUBGROUP SIZE IS ALWAYS A POWER-OF-TWO
//
// COHORT SIZE >= SUBGROUP SIZE
//
#define SKC_COHORT_SIZE (1<<SKC_TTRK_HI_BITS_COHORT)
#define SKC_ZERO_RATIO (SKC_ZERO_WORDS / SKC_META_WORDS)
#define SKC_META_ZERO_COUNT (SKC_COHORT_SIZE * sizeof(SKC_META_TYPE) / sizeof(SKC_ZERO_TYPE))
#define SKC_META_ZERO_REM (SKC_META_ZERO_COUNT & SKC_BITS_TO_MASK(HS_LANES_PER_WARP_LOG2))
#define SKC_META_COMPONENTS 4
#define SKC_META_COMPONENT_COUNT (SKC_COHORT_SIZE * sizeof(SKC_META_TYPE) / sizeof(SKC_COMPONENT_TYPE))
//
//
//
__kernel
__attribute__((intel_reqd_sub_group_size(HS_LANES_PER_WARP)))
void
skc_kernel_segment_ttrk(__global HS_KEY_TYPE * SKC_RESTRICT const vout,
__global uint * SKC_RESTRICT const metas)
{
__local union
{
SKC_META_TYPE volatile m[SKC_COHORT_SIZE];
SKC_ZERO_TYPE z[SKC_META_ZERO_COUNT];
SKC_COMPONENT_TYPE c[SKC_META_COMPONENT_COUNT];
} shared;
uint const global_id = get_global_id(0);
uint const gmem_base = (global_id >> HS_LANES_PER_WARP_LOG2) * HS_KEYS_PER_SLAB;
uint const gmem_idx = gmem_base + (global_id & HS_LANE_MASK);
uint const gmem_off = (global_id & HS_LANE_MASK) * HS_KEYS_PER_LANE;
//
// LOAD ALL THE ROWS
//
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
HS_KEY_TYPE const r##row = (vout + gmem_idx)[prev * HS_LANES_PER_WARP];
HS_SLAB_ROWS();
//
// LOAD LAST REGISTER FROM COLUMN TO LEFT
//
uint diffs = 0;
uint2 r0 = 0;
if (gmem_base > 0) {
// if this is any slab but the first then broadcast
// load the last key in the previous slab
r0.hi = as_uint2(vout[gmem_base - 1]).hi;
} else {
// otherwise broadcast the first key in the first slab
r0.hi = sub_group_broadcast(as_uint2(r1).hi,0);
// and mark it as an implicit diff
if (get_sub_group_local_id() == 0)
diffs = 1;
}
// now shuffle in the last key from the column to the left
r0.hi = intel_sub_group_shuffle_up(r0.hi,as_uint2(HS_REG_LAST(r)).hi,1);
// shift away y/x
SKC_COHORT_TYPE const c0 = r0.hi >> SKC_TTRK_HI_OFFSET_COHORT;
//
// EXTRACT ALL COHORT IDS EARLY...
//
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
SKC_COHORT_TYPE c##row = SKC_COHORT_ID(row);
HS_SLAB_ROWS();
//
// DEBUG
//
#if 0
if (gmem_base == HS_KEYS_PER_SLAB * 7)
{
if (get_sub_group_local_id() == 0)
printf("\n%llX ",as_ulong(r0));
else
printf("%llX ",as_ulong(r0));
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
if (get_sub_group_local_id() == 0) \
printf("\n%llX ",r##row); \
else \
printf("%llX ",r##row);
HS_SLAB_ROWS();
}
#endif
//
// CAPTURE ALL CONDITIONS WE CARE ABOUT
//
// Diffs must be captured before cohorts
//
uint valid = 0;
uint blocks = 0;
uint pks = 0;
SKC_COHORT_TYPE c_max = 0;
//
// FIXME -- IT'S UNCLEAR IF SHIFTING THE CONDITION CODE VS. AN
// EXPLICIT PREDICATE WILL GENERATE THE SAME CODE
//
#if 0
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
diffs |= ((c##row != c##prev) << prev);
HS_SLAB_ROWS();
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
blocks |= (SKC_IS_BLOCK(row) << prev);
HS_SLAB_ROWS();
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
pks |= (SKC_IS_PK(row,prev) << prev);
HS_SLAB_ROWS();
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
valid |= ((r##row != SKC_ULONG_MAX) << prev);
HS_SLAB_ROWS();
#else
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
if (c##row != c##prev) \
diffs |= 1<<prev;
HS_SLAB_ROWS();
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
if (SKC_IS_BLOCK(row)) \
blocks |= 1<<prev;
HS_SLAB_ROWS();
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
if (SKC_IS_PK(row,prev)) \
pks |= 1<<prev;
HS_SLAB_ROWS();
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
if (r##row != SKC_ULONG_MAX) { \
valid |= 1<<prev; \
c_max = max(c_max,c##row); \
}
HS_SLAB_ROWS();
#endif
//
// TRANSPOSE THE SLAB AND STORE IT
//
HS_TRANSPOSE_SLAB();
// the min cohort is the first key in the slab
uint const c_min = sub_group_broadcast(c1,0);
// the max cohort is the max across all lanes
c_max = sub_group_reduce_max(c_max);
#if 0 // REMOVE ME LATER
if (get_sub_group_local_id() == 0)
printf("%3u : ( %3u , %3u )\n",
get_global_id(0)>>HS_LANES_PER_WARP_LOG2,c_min,c_max);
#endif
//
// ZERO SMEM
//
// zero only the meta info for the cohort ids found in this slab
//
#if (SKC_ZERO_WORDS >= SKC_META_WORDS)
uint zz = ((c_min / SKC_ZERO_RATIO) & ~HS_LANE_MASK) + get_sub_group_local_id();
uint const zz_max = (c_max + SKC_ZERO_RATIO - 1) / SKC_ZERO_RATIO;
for (; zz<=zz_max; zz+=HS_LANES_PER_WARP)
shared.z[zz] = 0;
#else
// ERROR -- it's highly unlikely that the zero type is smaller than
// the meta type
#error("Unsupported right now...")
#endif
//
// ACCUMULATE AND STORE META INFO
//
uint const valid_blocks = valid & blocks;
uint const valid_pks = valid & pks & ~diffs;
SKC_META_TYPE meta = ( 0 );
#define SKC_META_LOCAL_ADD(meta) \
atomic_add(shared.m+HS_REG_LAST(c),meta);
#define SKC_META_LOCAL_STORE(meta,prev) \
shared.m[c##prev] = meta;
// note this is purposefully off by +1
#define SKC_META_RESET(meta,curr) \
meta = ((gmem_off + curr) << 8);
#if 0
// FIXME -- this can be tweaked to shift directly
#define SKC_META_ADD(meta,prev,blocks,pks,rks) \
meta += ((((blocks >> prev) & 1) ) | \
(((pks >> prev) & 1) << 16) | \
(((rks >> prev) & 1) << 24));
#else
#define SKC_META_ADD(meta,prev,blocks,pks,rks) \
if (blocks & (1<<prev)) \
meta += 1; \
if (pks & (1<<prev)) \
meta += 1<<16; \
if (rks & (1<<prev)) \
meta += 1<<24;
#endif
#undef HS_SLAB_ROW
#define HS_SLAB_ROW(row,prev) \
if (diffs & (1<<prev)) { \
SKC_META_LOCAL_STORE(meta,prev); \
SKC_META_RESET(meta,row); \
} \
SKC_META_ADD(meta,prev, \
valid_blocks, \
valid_pks, \
valid);
HS_SLAB_ROWS();
//
// ATOMICALLY ADD THE CARRIED OUT METAS
//
#if 0 // BUG
if ((valid & (1<<(HS_KEYS_PER_LANE-1))) && (meta != 0))
SKC_META_LOCAL_ADD(meta);
#else
if (meta != 0)
SKC_META_LOCAL_ADD(meta);
#endif
//
// NOW ATOMICALLY ADD ALL METAS TO THE GLOBAL META TABLE
//
// convert the slab offset to an extent offset
bool const is_offset = (get_sub_group_local_id() & 3) == 1;
uint const adjust = is_offset ? gmem_base - 1 : 0;
//
// only process the meta components found in this slab
//
uint const cc_min = c_min * SKC_META_COMPONENTS;
uint const cc_max = c_max * SKC_META_COMPONENTS + SKC_META_COMPONENTS - 1;
uint cc = (cc_min & ~HS_LANE_MASK) + get_sub_group_local_id();
if ((cc >= cc_min) && (cc <= cc_max))
{
uint const c = shared.c[cc];
if (c != 0)
atomic_add(metas+cc,c+adjust);
}
cc += HS_LANES_PER_WARP;
for (; cc<=cc_max; cc+=HS_LANES_PER_WARP)
{
uint const c = shared.c[cc];
if (c != 0)
atomic_add(metas+cc,c+adjust);
}
}
//
//
//
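The SKC_META_ADD and SKC_META_LOCAL_* macros above keep several small counters (blocks, pk, rk) in separate fields of one word so that a cohort's whole meta record can be updated with a single atomic when the cohort id changes. A serialized, per-work-item sketch of the same bookkeeping, assuming each work-item owns a cohort-sorted run of keys and that metas has one slot per possible cohort id; SKETCH_KEYS_PER_ITEM, the 40-bit cohort position and the low-bits block test are illustrative, not the real TTRK layout, and only the block and rk counters are shown:

#define SKETCH_KEYS_PER_ITEM 16

__kernel void
sketch_meta_accumulate(__global ulong const   * const keys,  // sorted so equal cohorts are adjacent
                       __global uint volatile * const metas) // one packed word per cohort
{
  uint const base = get_global_id(0) * SKETCH_KEYS_PER_ITEM;
  uint       meta = 0;                            // packed { blocks : bits 0..7, rk : bits 24..31 }
  uint       prev = (uint)(keys[base] >> 40);     // illustrative cohort field
  for (uint ii=0; ii<SKETCH_KEYS_PER_ITEM; ii++)
    {
      ulong const key = keys[base + ii];
      uint  const c   = (uint)(key >> 40);
      if (c != prev)                              // cohort changed -- flush and reset
        {
          atomic_add(metas + prev,meta);
          meta = 0;
          prev = c;
        }
      if ((key & 0x7) == 0)                       // illustrative "first subblock of a block" test
        meta += 1;
      meta += 1 << 24;                            // every key in the run counts as an rk
    }
  atomic_add(metas + prev,meta);                  // flush the carried-out meta, as in the kernel above
}

The real kernel does this per lane across its slab registers, flushes into __local memory on cohort diffs, and then adds the per-component totals into the global meta table.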

View File

@ -1,362 +0,0 @@
/*
* Copyright 2017 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can
* be found in the LICENSE file.
*
*/
//
//
//
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <assert.h>
//
//
//
#include "runtime_cl.h"
#include "common/cl/assert_cl.h"
//
//
//
static bool is_verbose = true;
//
// FIXME -- all variable length device queries need to start querying
// the parameter's return size before getting its value
//
// FIXME -- this is now handled by the common/cl/find.* routine
//
union skc_cl_device_version {
struct {
cl_uchar opencl_space[7]; // "OpenCL_"
cl_uchar major;
cl_uchar dot;
cl_uchar minor;
#if 1 // Intel NEO requires at least 16 bytes
cl_uchar space;
cl_uchar vendor[32];
#endif
};
struct {
cl_uchar aN[];
};
};
typedef cl_bitfield cl_diagnostic_verbose_level_intel;
#define CL_CONTEXT_SHOW_DIAGNOSTICS_INTEL 0x4106
#define CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL 0x2
#define CL_CONTEXT_DIAGNOSTICS_LEVEL_GOOD_INTEL 0x1
#define CL_CONTEXT_DIAGNOSTICS_LEVEL_NEUTRAL_INTEL 0x4
static
void
CL_CALLBACK
skc_context_callback(char const * error, void const * info, size_t size, void * user)
{
if (info != NULL )
{
fprintf(stderr,"%s\n",error);
}
}
//
//
//
skc_err
skc_runtime_cl_create(struct skc_runtime_cl * const runtime_cl,
char const * const target_platform_substring,
char const * const target_device_substring,
cl_context_properties context_properties[])
{
skc_err err = SKC_ERR_SUCCESS;
//
// search available devices for a match
//
#define PLATFORM_IDS_MAX 16
#define DEVICE_IDS_MAX 16
#define PLATFORM_NAME_SIZE_MAX 64
#define DEVICE_NAME_SIZE_MAX 64
#define DRIVER_VERSION_SIZE_MAX 64
cl_int cl_err;
cl_platform_id platform_ids[PLATFORM_IDS_MAX];
cl_device_id device_ids [PLATFORM_IDS_MAX][DEVICE_IDS_MAX];
cl_uint platform_count;
cl_uint device_count[PLATFORM_IDS_MAX];
cl_uint platform_idx = UINT32_MAX, device_idx = UINT32_MAX;
bool match = false; // find _first_ match
//
// get number of platforms
//
cl(GetPlatformIDs(PLATFORM_IDS_MAX,platform_ids,&platform_count));
//
// search platforms
//
for (cl_uint ii=0; ii<platform_count; ii++)
{
char platform_name[PLATFORM_NAME_SIZE_MAX];
cl(GetPlatformInfo(platform_ids[ii],
CL_PLATFORM_NAME,
sizeof(platform_name),
platform_name,
NULL));
if (!match && (strstr(platform_name,target_platform_substring) != NULL))
{
platform_idx = ii;
}
if (is_verbose) {
fprintf(stdout,"%2u: %s\n",ii,platform_name);
}
cl_err = clGetDeviceIDs(platform_ids[ii],
CL_DEVICE_TYPE_ALL,
DEVICE_IDS_MAX,
device_ids[ii],
device_count+ii);
if (cl_err != CL_DEVICE_NOT_FOUND)
cl_ok(cl_err);
for (cl_uint jj=0; jj<device_count[ii]; jj++)
{
char device_name[DEVICE_NAME_SIZE_MAX];
union skc_cl_device_version device_version;
cl_uint device_align_bits;
char driver_version[DRIVER_VERSION_SIZE_MAX];
cl(GetDeviceInfo(device_ids[ii][jj],
CL_DEVICE_NAME,
sizeof(device_name),
device_name,
NULL));
// FIXME -- some of these variable length parameters should
// use the "size the param before reading" idiom
cl(GetDeviceInfo(device_ids[ii][jj],
CL_DEVICE_VERSION,
sizeof(device_version),
device_version.aN,
NULL));
cl(GetDeviceInfo(device_ids[ii][jj],
CL_DEVICE_MEM_BASE_ADDR_ALIGN,
sizeof(device_align_bits),
&device_align_bits,
NULL));
cl_uint const base_align = device_align_bits / 8; // bytes
cl(GetDeviceInfo(device_ids[ii][jj],
CL_DRIVER_VERSION,
sizeof(driver_version),
driver_version,
NULL));
if (!match && (platform_idx == ii) && (strstr(device_name,target_device_substring) != NULL))
{
match = true;
device_idx = jj;
runtime_cl->version.major = device_version.major - 48;
runtime_cl->version.minor = device_version.minor - 48;
runtime_cl->base_align = base_align;
if (is_verbose) {
fprintf(stdout," >>>");
}
}
else if (is_verbose)
{
fprintf(stdout," ");
}
if (is_verbose) {
fprintf(stdout,
" %1u: %s [ %s ] [ %s ] [ %u ]\n",
jj,
device_name,
device_version.aN,
driver_version,
base_align);
}
}
}
if (is_verbose) {
fprintf(stdout,"\n");
}
//
// get target platform and device
//
if (platform_idx >= platform_count)
{
fprintf(stderr,"no match for target platform substring %s\n",target_platform_substring);
exit(EXIT_FAILURE);
}
if (device_idx >= device_count[platform_idx])
{
fprintf(stderr,"no match for target device substring %s\n",target_device_substring);
exit(EXIT_FAILURE);
}
runtime_cl->platform_id = platform_ids[platform_idx];
runtime_cl->device_id = device_ids [platform_idx][device_idx];
//
// create context
//
#if 0
cl_context_properties context_properties[] =
{
CL_CONTEXT_PLATFORM,(cl_context_properties)runtime_cl->platform_id,
0
};
#else
context_properties[1] = (cl_context_properties)runtime_cl->platform_id;
#endif
runtime_cl->context = clCreateContext(context_properties,
1,
&runtime_cl->device_id,
skc_context_callback,
NULL,
&cl_err);
cl_ok(cl_err);
//
// get device name, driver version, unified memory flag, SVM capabilities and printf buffer size
//
if (is_verbose)
{
char device_name[DEVICE_NAME_SIZE_MAX];
char driver_version[DRIVER_VERSION_SIZE_MAX];
cl_bool device_is_unified;
cl_device_svm_capabilities svm_caps;
size_t printf_buffer_size;
cl(GetDeviceInfo(runtime_cl->device_id,
CL_DEVICE_NAME,
sizeof(device_name),
device_name,
NULL));
cl(GetDeviceInfo(runtime_cl->device_id,
CL_DRIVER_VERSION,
sizeof(driver_version),
driver_version,
NULL));
cl(GetDeviceInfo(runtime_cl->device_id,
CL_DEVICE_HOST_UNIFIED_MEMORY,
sizeof(device_is_unified),
&device_is_unified,
NULL));
cl(GetDeviceInfo(runtime_cl->device_id,
CL_DEVICE_SVM_CAPABILITIES,
sizeof(svm_caps),
&svm_caps,
0));
cl(GetDeviceInfo(runtime_cl->device_id,
CL_DEVICE_PRINTF_BUFFER_SIZE,
sizeof(printf_buffer_size),
&printf_buffer_size,
NULL));
fprintf(stderr,
"CL_DEVICE_SVM_COARSE_GRAIN_BUFFER %c\n"
"CL_DEVICE_SVM_FINE_GRAIN_BUFFER %c\n"
"CL_DEVICE_SVM_FINE_GRAIN_SYSTEM %c\n"
"CL_DEVICE_SVM_ATOMICS %c\n"
"CL_DEVICE_PRINTF_BUFFER_SIZE %zu\n\n",
svm_caps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER ? '*' : '-',
svm_caps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER ? '*' : '-',
svm_caps & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM ? '*' : '-',
svm_caps & CL_DEVICE_SVM_ATOMICS ? '*' : '-',
printf_buffer_size);
}
return err;
}
//
//
//
skc_err
skc_runtime_cl_dispose(struct skc_runtime_cl * const runtime_cl)
{
// FIXME
printf("%s incomplete!\n",__func__);
return SKC_ERR_SUCCESS;
}
//
//
//
cl_command_queue
skc_runtime_cl_create_cq(struct skc_runtime_cl * const runtime_cl, skc_cq_type_e const type)
{
cl_command_queue cq;
if (runtime_cl->version.major < 2)
{
//
// <= OpenCL 1.2
//
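// clCreateCommandQueue() is deprecated as of OpenCL 2.0 but is the only
// queue constructor available on 1.x platforms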
cl_int cl_err;
cq = clCreateCommandQueue(runtime_cl->context,
runtime_cl->device_id,
(cl_command_queue_properties)type,
&cl_err); cl_ok(cl_err);
}
else
{
//
// >= OpenCL 2.0
//
cl_int cl_err;
cl_queue_properties const queue_properties[] = {
CL_QUEUE_PROPERTIES,(cl_queue_properties)type,0
};
cq = clCreateCommandQueueWithProperties(runtime_cl->context,
runtime_cl->device_id,
queue_properties,
&cl_err); cl_ok(cl_err);
}
return cq;
}
//
//
//
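//
// Illustrative sketch only (not part of the original source): how a
// caller might bring up the API above. The "Intel"/"Graphics" strings
// are placeholders -- any substring of the desired platform/device
// name will match.
//
#if 0
static void
example_bring_up(void)
{
  struct skc_runtime_cl rt;

  // slot [1] is patched with the matched platform id by create()
  cl_context_properties props[] = { CL_CONTEXT_PLATFORM, 0, 0 };

  skc_err err = skc_runtime_cl_create(&rt,"Intel","Graphics",props);

  if (err == SKC_ERR_SUCCESS)
    {
      cl_command_queue cq = skc_runtime_cl_create_cq(&rt,SKC_CQ_TYPE_IN_ORDER);

      // ... enqueue work on cq ...

      clReleaseCommandQueue(cq);
      skc_runtime_cl_dispose(&rt);
    }
}
#endif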

@ -1,79 +0,0 @@
/*
* Copyright 2017 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can
* be found in the LICENSE file.
*
*/
#pragma once
//
// squelch OpenCL 1.2 deprecation warning
//
#ifndef CL_USE_DEPRECATED_OPENCL_1_2_APIS
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
#endif
#include <CL/opencl.h>
//
//
//
#include "skc.h"
//
// Minimal OpenCL state needed by the runtime to get started
//
struct skc_runtime_cl
{
cl_platform_id platform_id;
cl_device_id device_id;
cl_context context;
struct {
cl_uint major;
cl_uint minor;
} version; // sometimes we need to know this at runtime
cl_uint base_align; // base address alignment for subbuffer origins
};
//
//
//
typedef enum skc_cq_type_e {
SKC_CQ_TYPE_IN_ORDER = 0,
SKC_CQ_TYPE_OUT_OF_ORDER = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,
SKC_CQ_TYPE_IN_ORDER_PROFILING = (SKC_CQ_TYPE_IN_ORDER | CL_QUEUE_PROFILING_ENABLE),
SKC_CQ_TYPE_OUT_OF_ORDER_PROFILING = (SKC_CQ_TYPE_OUT_OF_ORDER | CL_QUEUE_PROFILING_ENABLE),
} skc_cq_type_e;
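//
// note: these enumerators are plain cl_command_queue_properties bit
// combinations and can be passed directly to queue creation
//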
//
// safely creates a generic OpenCL target in very few lines
//
skc_err
skc_runtime_cl_create(struct skc_runtime_cl * const runtime_cl,
char const * const target_platform_substring,
char const * const target_device_substring,
cl_context_properties context_properties[]);
skc_err
skc_runtime_cl_dispose(struct skc_runtime_cl * const runtime_cl);
//
// create a command queue with the non-deprecated function
//
cl_command_queue
skc_runtime_cl_create_cq(struct skc_runtime_cl * const runtime_cl, skc_cq_type_e const type);
//
//
//

@ -24,7 +24,6 @@
#include "grid.h"
#include "common/cl/assert_cl.h"
#include "config_cl.h"
#include "runtime_cl.h"
#include "runtime_cl_12.h"
#include "export_cl_12.h"
@ -32,7 +31,7 @@
//
//
static
void
skc_block_pool_create(struct skc_runtime * const runtime, cl_command_queue cq)
{
@ -42,7 +41,7 @@ skc_block_pool_create(struct skc_runtime * const runtime, cl_command_queue cq)
// create block extent
skc_extent_pdrw_alloc(runtime,
&runtime->block_pool.blocks,
runtime->block_pool.size->pool_size *
runtime->config->block.bytes);
// allocate block pool ids
@ -85,7 +84,7 @@ skc_block_pool_create(struct skc_runtime * const runtime, cl_command_queue cq)
cl(ReleaseKernel(k1));
}
static
void
skc_block_pool_dispose(struct skc_runtime * const runtime)
{
@ -106,7 +105,7 @@ skc_runtime_yield(struct skc_runtime * const runtime)
}
static
void
skc_runtime_wait(struct skc_runtime * const runtime)
{
skc_scheduler_wait(runtime->scheduler);
@ -118,18 +117,26 @@ skc_runtime_wait(struct skc_runtime * const runtime)
skc_err
skc_runtime_cl_12_create(struct skc_context * const context,
char const * const target_platform_substring,
char const * const target_device_substring,
cl_context_properties context_properties[])
cl_context context_cl,
cl_device_id device_id_cl)
{
// allocate the runtime
struct skc_runtime * const runtime = malloc(sizeof(*runtime));
// acquire OpenCL ids and context for target device
skc_err err = skc_runtime_cl_create(&runtime->cl,
target_platform_substring,
target_device_substring,
context_properties);
// save off CL objects
runtime->cl.context = context_cl;
runtime->cl.device_id = device_id_cl;
// query device alignment
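// (CL_DEVICE_MEM_BASE_ADDR_ALIGN is reported in bits -- it's stored
// here in bytes because sub-buffer origins must honor it)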
cl_uint align_bits;
cl(GetDeviceInfo(device_id_cl,
CL_DEVICE_MEM_BASE_ADDR_ALIGN,
sizeof(align_bits),
&align_bits,
NULL));
runtime->cl.align_bytes = align_bits / 8;
// create device
skc_device_create(runtime);
@ -149,7 +156,7 @@ skc_runtime_cl_12_create(struct skc_context * const context,
// initialize cq pool
skc_cq_pool_create(runtime,
&runtime->cq_pool,
runtime->config->cq_pool.type,
runtime->config->cq_pool.cq_props,
runtime->config->cq_pool.size);
// acquire in-order cq
@ -176,7 +183,7 @@ skc_runtime_cl_12_create(struct skc_context * const context,
context->yield = skc_runtime_yield;
context->wait = skc_runtime_wait;
context->path_builder = skc_path_builder_cl_12_create;
context->path_retain = skc_runtime_path_host_retain;
context->path_release = skc_runtime_path_host_release;
@ -189,7 +196,7 @@ skc_runtime_cl_12_create(struct skc_context * const context,
context->composition = skc_composition_cl_12_create;
context->styling = skc_styling_cl_12_create;
context->surface = skc_surface_cl_12_create;
// block on pool creation
@ -198,7 +205,7 @@ skc_runtime_cl_12_create(struct skc_context * const context,
// dispose of in-order cq
skc_runtime_release_cq_in_order(runtime,cq);
return err;
return SKC_ERR_SUCCESS;
};
//
@ -227,7 +234,7 @@ skc_runtime_cl_12_dispose(struct skc_context * const context)
skc_block_pool_dispose(context->runtime);
// skc_handle_pool_dispose(context->runtime);
return SKC_ERR_SUCCESS;
}
@ -253,12 +260,12 @@ skc_runtime_cl_12_debug(struct skc_context * const context)
return;
QueryPerformanceCounter(&EndingTime);
LARGE_INTEGER ElapsedMicroseconds, Frequency;
ElapsedMicroseconds.QuadPart = EndingTime.QuadPart - StartingTime.QuadPart;
QueryPerformanceFrequency(&Frequency);
double const msecs_total = 1000.0 * ElapsedMicroseconds.QuadPart / Frequency.QuadPart;
double const msecs_frame = msecs_total / SKC_FRAMES;
@ -268,7 +275,7 @@ skc_runtime_cl_12_debug(struct skc_context * const context)
#endif
struct skc_runtime * const runtime = context->runtime;
// acquire in-order cq
cl_command_queue cq = skc_runtime_acquire_cq_in_order(runtime);
@ -311,4 +318,3 @@ skc_runtime_cl_12_debug(struct skc_context * const context)
//
//
//

@ -12,8 +12,8 @@
//
//
#include "skc.h"
#include "runtime.h"
#include "runtime_cl.h"
#include "cq_pool_cl.h"
#include "handle_pool_cl_12.h"
#include "block_pool_cl_12.h"
@ -31,7 +31,11 @@ struct skc_runtime
//
// state visible to device
//
struct skc_runtime_cl cl;
struct {
cl_context context;
cl_device_id device_id;
cl_uint align_bytes;
} cl;
struct {
struct skc_allocator_host host;
@ -63,9 +67,8 @@ struct skc_runtime
skc_err
skc_runtime_cl_12_create(struct skc_context * const context,
char const * const target_platform_substring,
char const * const target_device_substring,
cl_context_properties context_properties[]);
cl_context context_cl,
cl_device_id device_id_cl);
skc_err
skc_runtime_cl_12_dispose(struct skc_context * const context);

@ -69,7 +69,7 @@ float const skc_transform_identity[8] =
0.0f, 0.0f // w0 w1 1 <-- always 1
};
float const * const skc_transform_identity_ptr = skc_transform_identity;
// float const * const skc_transform_identity_ptr = skc_transform_identity;
//
// DEFAULT RASTER CLIP
@ -82,7 +82,7 @@ float const skc_raster_clip_default[4] =
+FLT_MAX, +FLT_MAX // upper right corner of bounding box
};
float const * const skc_raster_clip_default_ptr = skc_raster_clip_default;
// float const * const skc_raster_clip_default_ptr = skc_raster_clip_default;
#endif

@ -9,125 +9,18 @@
#ifndef SKC_ONCE_SKC
#define SKC_ONCE_SKC
//
// FIXME -- get rid of these here
//
#include <stdint.h>
#include <stdbool.h>
//
//
//
#include "skc_styling.h" // FIXME -- skc_styling
// #include "skc_err.h"
//
// FIXME -- move errors to an skc prefixed include
//
typedef enum skc_err {
SKC_ERR_SUCCESS = 0,
SKC_ERR_API_BASE = 10000,
SKC_ERR_NOT_IMPLEMENTED = SKC_ERR_API_BASE,
SKC_ERR_POOL_EMPTY,
SKC_ERR_CONDVAR_WAIT,
SKC_ERR_LAYER_ID_INVALID,
SKC_ERR_LAYER_NOT_EMPTY,
SKC_ERR_TRANSFORM_WEAKREF_INVALID,
SKC_ERR_STROKE_STYLE_WEAKREF_INVALID,
SKC_ERR_COMMAND_NOT_READY,
SKC_ERR_COMMAND_NOT_COMPLETED,
SKC_ERR_COMMAND_NOT_STARTED,
SKC_ERR_COMMAND_NOT_READY_OR_COMPLETED,
SKC_ERR_COMPOSITION_SEALED,
SKC_ERR_STYLING_SEALED,
SKC_ERR_HANDLE_INVALID,
SKC_ERR_HANDLE_OVERFLOW,
SKC_ERR_COUNT
} skc_err;
//
// SPINEL TYPES
//
typedef struct skc_context * skc_context_t;
typedef struct skc_path_builder * skc_path_builder_t;
typedef struct skc_raster_builder * skc_raster_builder_t;
typedef struct skc_composition * skc_composition_t;
typedef struct skc_styling * skc_styling_t;
typedef struct skc_surface * skc_surface_t;
#if 0
typedef struct skc_interop * skc_interop_t;
typedef uint32_t skc_interop_surface_t;
#endif
typedef uint32_t skc_path_t;
typedef uint32_t skc_raster_t;
typedef uint32_t skc_layer_id;
typedef uint32_t skc_group_id;
typedef uint32_t skc_styling_cmd_t;
typedef uint64_t skc_weakref_t;
typedef skc_weakref_t skc_transform_weakref_t;
typedef skc_weakref_t skc_raster_clip_weakref_t;
//
// FIXME -- bury all of this
//
#define SKC_STYLING_CMDS(...) _countof(__VA_ARGS__),__VA_ARGS__
#define SKC_GROUP_IDS(...) _countof(__VA_ARGS__),__VA_ARGS__
//
//
//
#define SKC_PATH_INVALID UINT32_MAX
#define SKC_RASTER_INVALID UINT32_MAX
#define SKC_WEAKREF_INVALID UINT64_MAX
//
// TRANSFORM LAYOUT: { sx shx tx shy sy ty w0 w1 }
//
extern float const * const skc_transform_identity_ptr; // { 1, 0, 0, 0, 1, 0, 0, 0 }
//
// RASTER CLIP LAYOUT: { x0, y0, x1, y1 }
//
extern float const * const skc_raster_clip_default_ptr;
#include "skc_err.h"
#include "skc_types.h"
#include "skc_styling.h"
//
// CONTEXT
//
skc_err
skc_context_create(skc_context_t * context,
char const * target_platform_substring,
char const * target_device_substring,
intptr_t context_properties[]);
skc_err
skc_context_retain(skc_context_t context);
@ -137,31 +30,6 @@ skc_context_release(skc_context_t context);
skc_err
skc_context_reset(skc_context_t context);
//
// COORDINATED EXTERNAL OPERATIONS
//
/*
Examples include:
- Transforming an intermediate layer with a blur, sharpen, rotation or scaling kernel.
- Subpixel antialiasing using neighboring pixel color and coverage data.
- Performing a blit from one region to another region on a surface.
- Blitting from one surface to another.
- Loading and processing from one region and storing to another region.
- Rendezvousing with an external pipeline.
*/
//
//
//
bool
skc_context_yield(skc_context_t context);
void
skc_context_wait(skc_context_t context);
//
// PATH BUILDER
//
@ -485,6 +353,31 @@ skc_surface_render(skc_surface_t surface,
void * data,
void * fb); // FIXME FIXME
//
// COORDINATED EXTERNAL OPERATIONS
//
// Examples include:
//
// - Transforming an intermediate layer with a blur, sharpen, rotation or scaling kernel.
// - Subpixel antialiasing using neighboring pixel color and coverage data.
// - Performing a blit from one region to another region on a surface.
// - Blitting from one surface to another.
// - Loading and processing from one region and storing to another region.
// - Rendezvousing with an external pipeline.
//
// FORTHCOMING...
//
// SCHEDULER
//
bool
skc_context_yield(skc_context_t context);
void
skc_context_wait(skc_context_t context);
//
//
//

@ -0,0 +1,70 @@
/*
* Copyright 2017 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can
* be found in the LICENSE file.
*
*/
#ifndef SKC_ONCE_SKC_CREATE_CL
#define SKC_ONCE_SKC_CREATE_CL
//
//
//
#ifdef __APPLE__
#include "OpenCL/opencl.h"
#else
#include "CL/opencl.h"
#endif
//
//
//
#include "skc.h"
//
// CONTEXT CREATION
//
skc_err
skc_context_create_cl(skc_context_t * context,
cl_context context_cl,
cl_device_id device_id_cl);
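//
// Illustrative sketch only: how an application that owns the OpenCL
// objects might call the entry point above. Platform/device selection
// and any GL-interop context properties are the caller's
// responsibility; the names below are placeholders.
//
#if 0
static skc_err
example_context_create(skc_context_t * context,
                       cl_platform_id  platform_id,
                       cl_device_id    device_id)
{
  cl_context_properties props[] = {
    CL_CONTEXT_PLATFORM, (cl_context_properties)platform_id,
    0
  };

  cl_int     cl_err;
  cl_context context_cl = clCreateContext(props,1,&device_id,NULL,NULL,&cl_err);

  if (cl_err != CL_SUCCESS)
    return SKC_ERR_NOT_IMPLEMENTED; // placeholder error mapping

  return skc_context_create_cl(context,context_cl,device_id);
}
#endif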
//
// FIXME -- SPECIALIZE SURFACE RENDER
//
#if 0
//
// SURFACE RENDER
//
typedef void (*skc_surface_render_pfn_notify)(skc_surface_t surface,
skc_styling_t styling,
skc_composition_t composition,
void * data);
skc_err
skc_surface_render(skc_surface_t surface,
uint32_t const clip[4],
skc_styling_t styling,
skc_composition_t composition,
skc_surface_render_pfn_notify notify,
void * data,
void * fb); // FIXME FIXME
#endif
//
//
//
#endif
//
//
//

src/compute/skc/skc_err.h
@ -0,0 +1,58 @@
/*
* Copyright 2018 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can
* be found in the LICENSE file.
*
*/
#ifndef SKC_ONCE_SKC_ERR
#define SKC_ONCE_SKC_ERR
//
//
//
typedef enum skc_err {
SKC_ERR_SUCCESS = 0,
SKC_ERR_API_BASE = 10000,
SKC_ERR_NOT_IMPLEMENTED = SKC_ERR_API_BASE,
SKC_ERR_POOL_EMPTY,
SKC_ERR_CONDVAR_WAIT,
SKC_ERR_LAYER_ID_INVALID,
SKC_ERR_LAYER_NOT_EMPTY,
SKC_ERR_TRANSFORM_WEAKREF_INVALID,
SKC_ERR_STROKE_STYLE_WEAKREF_INVALID,
SKC_ERR_COMMAND_NOT_READY,
SKC_ERR_COMMAND_NOT_COMPLETED,
SKC_ERR_COMMAND_NOT_STARTED,
SKC_ERR_COMMAND_NOT_READY_OR_COMPLETED,
SKC_ERR_COMPOSITION_SEALED,
SKC_ERR_STYLING_SEALED,
SKC_ERR_HANDLE_INVALID,
SKC_ERR_HANDLE_OVERFLOW,
SKC_ERR_COUNT
} skc_err;
//
//
//
#endif
//
//
//

@ -79,6 +79,13 @@ typedef enum skc_styling_gradient_type_e {
} skc_styling_gradient_type_e;
//
// FIXME -- bury all of this once we stabilize styling
//
#define SKC_STYLING_CMDS(...) _countof(__VA_ARGS__),__VA_ARGS__
#define SKC_GROUP_IDS(...) _countof(__VA_ARGS__),__VA_ARGS__
//
//
//

@ -0,0 +1,73 @@
/*
* Copyright 2018 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can
* be found in the LICENSE file.
*
*/
#ifndef SKC_ONCE_SKC_TYPES
#define SKC_ONCE_SKC_TYPES
//
//
//
#include <stdint.h>
#include <stdbool.h>
//
//
//
typedef struct skc_context * skc_context_t;
typedef struct skc_path_builder * skc_path_builder_t;
typedef struct skc_raster_builder * skc_raster_builder_t;
typedef struct skc_composition * skc_composition_t;
typedef struct skc_styling * skc_styling_t;
typedef struct skc_surface * skc_surface_t;
typedef uint32_t skc_path_t;
typedef uint32_t skc_raster_t;
typedef uint32_t skc_layer_id;
typedef uint32_t skc_group_id;
typedef uint32_t skc_styling_cmd_t;
typedef uint64_t skc_weakref_t;
typedef skc_weakref_t skc_transform_weakref_t;
typedef skc_weakref_t skc_raster_clip_weakref_t;
#if 0
typedef struct skc_interop * skc_interop_t;
typedef uint32_t skc_interop_surface_t;
#endif
//
//
//
#define SKC_PATH_INVALID UINT32_MAX
#define SKC_RASTER_INVALID UINT32_MAX
#define SKC_WEAKREF_INVALID UINT64_MAX
//
// TRANSFORM LAYOUT: { sx shx tx shy sy ty w0 w1 }
//
//
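// (presumably a row-major projective matrix whose implicit last row is
// { w0 w1 1 }, i.e.
//
//   x' = (sx*x + shx*y + tx) / (w0*x + w1*y + 1)
//   y' = (shy*x + sy*y + ty) / (w0*x + w1*y + 1) )
//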
// RASTER CLIP LAYOUT: { x0, y0, x1, y1 }
//
//
//
//
#endif
//
//
//