//--------------------------------------------------------------------------------------
// File: BC7Encode.hlsl
//
// The Compute Shader for BC7 Encoder
//
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
//--------------------------------------------------------------------------------------
// REF_DEVICE: when defined, the entry points in this file issue a
// GroupMemoryBarrierWithGroupSync() after every step of their group-shared
// reductions, and TryMode456CS compiles out its early return for out-of-range
// block ids (the early return is only present when REF_DEVICE is NOT defined).
#define REF_DEVICE

#define CHAR_LENGTH 8
#define NCHANNELS 4

// NOTE(review): 98 matches the numeric value of DXGI_FORMAT_BC7_UNORM - confirm against the host code.
#define BC7_UNORM 98

// Extreme values used to seed running min / max end point searches.
#define MAX_UINT 0xFFFFFFFF
#define MIN_UINT 0
// Two-subset partition masks, one entry per partition id 0-63.
// Bit i of an entry is the subset (0 or 1) that pixel i of the 4x4 block belongs to
// (read as ( bits >> i ) & 0x01 by the mode 1/3/7 search).
static const uint candidateSectionBit [ 64 ] = //Associated to partition 0-63
{
0xCCCC , 0x8888 , 0xEEEE , 0xECC8 ,
0xC880 , 0xFEEC , 0xFEC8 , 0xEC80 ,
0xC800 , 0xFFEC , 0xFE80 , 0xE800 ,
0xFFE8 , 0xFF00 , 0xFFF0 , 0xF000 ,
0xF710 , 0x008E , 0x7100 , 0x08CE ,
0x008C , 0x7310 , 0x3100 , 0x8CCE ,
0x088C , 0x3110 , 0x6666 , 0x366C ,
0x17E8 , 0x0FF0 , 0x718E , 0x399C ,
0xaaaa , 0xf0f0 , 0x5a5a , 0x33cc ,
0x3c3c , 0x55aa , 0x9696 , 0xa55a ,
0x73ce , 0x13c8 , 0x324c , 0x3bdc ,
0x6996 , 0xc33c , 0x9966 , 0x660 ,
0x272 , 0x4e4 , 0x4e40 , 0x2720 ,
0xc936 , 0x936c , 0x39c6 , 0x639c ,
0x9336 , 0x9cc6 , 0x817e , 0xe718 ,
0xccf0 , 0xfcc , 0x7744 , 0xee22 ,
} ;
// Three-subset partition masks. Entry k belongs to partition id 64 + k
// (the table is indexed with partition - 64).
// Each pixel i owns two bits: ( bits2 >> ( i * 2 ) ) & 0x03 is its subset (0, 1 or 2).
static const uint candidateSectionBit2 [ 64 ] = //Associated to partition 64-127
{
0xaa685050 , 0x6a5a5040 , 0x5a5a4200 , 0x5450a0a8 ,
0xa5a50000 , 0xa0a05050 , 0x5555a0a0 , 0x5a5a5050 ,
0xaa550000 , 0xaa555500 , 0xaaaa5500 , 0x90909090 ,
0x94949494 , 0xa4a4a4a4 , 0xa9a59450 , 0x2a0a4250 ,
0xa5945040 , 0x0a425054 , 0xa5a5a500 , 0x55a0a0a0 ,
0xa8a85454 , 0x6a6a4040 , 0xa4a45000 , 0x1a1a0500 ,
0x0050a4a4 , 0xaaa59090 , 0x14696914 , 0x69691400 ,
0xa08585a0 , 0xaa821414 , 0x50a4a450 , 0x6a5a0200 ,
0xa9a58000 , 0x5090a0a8 , 0xa8a09050 , 0x24242424 ,
0x00aa5500 , 0x24924924 , 0x24499224 , 0x50a50a50 ,
0x500aa550 , 0xaaaa4444 , 0x66660000 , 0xa5a0a5a0 ,
0x50a050a0 , 0x69286928 , 0x44aaaa44 , 0x66666600 ,
0xaa444444 , 0x54a854a8 , 0x95809580 , 0x96969600 ,
0xa85454a8 , 0x80959580 , 0xaa141414 , 0x96960000 ,
0xaaaa1414 , 0xa05050a0 , 0xa0a5a5a0 , 0x96000000 ,
0x40804080 , 0xa9a8a9a8 , 0xaaaaaa44 , 0x2a4a5254 ,
} ;
// Fix-up (anchor) pixel indices per partition id.
// Entries 0-63 belong to the two-subset partitions: .x is the pixel index used as the
// anchor of subset 1 (the mode 1/3/7 search reads candidateFixUpIndex1D[ partition ].x);
// .y is a 0 placeholder there.
// Entries 64-127 belong to the three-subset partitions and carry two indices.
// NOTE(review): presumably .x / .y are the anchors of subsets 1 and 2 - the consumer is not
// visible in this part of the file; confirm.
static const uint2 candidateFixUpIndex1D [ 128 ] =
{
{ 15 , 0 } , { 15 , 0 } , { 15 , 0 } , { 15 , 0 } ,
{ 15 , 0 } , { 15 , 0 } , { 15 , 0 } , { 15 , 0 } ,
{ 15 , 0 } , { 15 , 0 } , { 15 , 0 } , { 15 , 0 } ,
{ 15 , 0 } , { 15 , 0 } , { 15 , 0 } , { 15 , 0 } ,
{ 15 , 0 } , { 2 , 0 } , { 8 , 0 } , { 2 , 0 } ,
{ 2 , 0 } , { 8 , 0 } , { 8 , 0 } , { 15 , 0 } ,
{ 2 , 0 } , { 8 , 0 } , { 2 , 0 } , { 2 , 0 } ,
{ 8 , 0 } , { 8 , 0 } , { 2 , 0 } , { 2 , 0 } ,
{ 15 , 0 } , { 15 , 0 } , { 6 , 0 } , { 8 , 0 } ,
{ 2 , 0 } , { 8 , 0 } , { 15 , 0 } , { 15 , 0 } ,
{ 2 , 0 } , { 8 , 0 } , { 2 , 0 } , { 2 , 0 } ,
{ 2 , 0 } , { 15 , 0 } , { 15 , 0 } , { 6 , 0 } ,
{ 6 , 0 } , { 2 , 0 } , { 6 , 0 } , { 8 , 0 } ,
{ 15 , 0 } , { 15 , 0 } , { 2 , 0 } , { 2 , 0 } ,
{ 15 , 0 } , { 15 , 0 } , { 15 , 0 } , { 15 , 0 } ,
{ 15 , 0 } , { 2 , 0 } , { 2 , 0 } , { 15 , 0 } ,
// For i < 64, candidateFixUpIndex1D[i][1] is a placeholder and should not be used.
{ 3 , 15 } , { 3 , 8 } , { 15 , 8 } , { 15 , 3 } ,
{ 8 , 15 } , { 3 , 15 } , { 15 , 3 } , { 15 , 8 } ,
{ 8 , 15 } , { 8 , 15 } , { 6 , 15 } , { 6 , 15 } ,
{ 6 , 15 } , { 5 , 15 } , { 3 , 15 } , { 3 , 8 } ,
{ 3 , 15 } , { 3 , 8 } , { 8 , 15 } , { 15 , 3 } ,
{ 3 , 15 } , { 3 , 8 } , { 6 , 15 } , { 10 , 8 } ,
{ 5 , 3 } , { 8 , 15 } , { 8 , 6 } , { 6 , 10 } ,
{ 8 , 15 } , { 5 , 15 } , { 15 , 10 } , { 15 , 8 } ,
{ 8 , 15 } , { 15 , 3 } , { 3 , 15 } , { 5 , 10 } ,
{ 6 , 10 } , { 10 , 8 } , { 8 , 9 } , { 15 , 10 } ,
{ 15 , 6 } , { 3 , 15 } , { 15 , 8 } , { 5 , 15 } ,
{ 15 , 3 } , { 15 , 6 } , { 15 , 6 } , { 15 , 8 } , // The spec does not mark the first fix-up index for this row; 15 was chosen for it and appears to be correct.
{ 3 , 15 } , { 15 , 3 } , { 5 , 15 } , { 5 , 15 } ,
{ 5 , 15 } , { 8 , 15 } , { 5 , 15 } , { 10 , 15 } ,
{ 5 , 15 } , { 10 , 15 } , { 8 , 15 } , { 13 , 15 } ,
{ 15 , 3 } , { 12 , 15 } , { 3 , 15 } , { 3 , 8 } ,
} ;
// Same data as candidateFixUpIndex1D, except that for entries i >= 64 the two indices of
// each pair are stored in ascending order (e.g. { 15 , 8 } becomes { 8 , 15 }).
// Entries 0-63 are identical to candidateFixUpIndex1D.
static const uint2 candidateFixUpIndex1DOrdered [ 128 ] = //Same with candidateFixUpIndex1D but order the result when i >= 64
{
{ 15 , 0 } , { 15 , 0 } , { 15 , 0 } , { 15 , 0 } ,
{ 15 , 0 } , { 15 , 0 } , { 15 , 0 } , { 15 , 0 } ,
{ 15 , 0 } , { 15 , 0 } , { 15 , 0 } , { 15 , 0 } ,
{ 15 , 0 } , { 15 , 0 } , { 15 , 0 } , { 15 , 0 } ,
{ 15 , 0 } , { 2 , 0 } , { 8 , 0 } , { 2 , 0 } ,
{ 2 , 0 } , { 8 , 0 } , { 8 , 0 } , { 15 , 0 } ,
{ 2 , 0 } , { 8 , 0 } , { 2 , 0 } , { 2 , 0 } ,
{ 8 , 0 } , { 8 , 0 } , { 2 , 0 } , { 2 , 0 } ,
{ 15 , 0 } , { 15 , 0 } , { 6 , 0 } , { 8 , 0 } ,
{ 2 , 0 } , { 8 , 0 } , { 15 , 0 } , { 15 , 0 } ,
{ 2 , 0 } , { 8 , 0 } , { 2 , 0 } , { 2 , 0 } ,
{ 2 , 0 } , { 15 , 0 } , { 15 , 0 } , { 6 , 0 } ,
{ 6 , 0 } , { 2 , 0 } , { 6 , 0 } , { 8 , 0 } ,
{ 15 , 0 } , { 15 , 0 } , { 2 , 0 } , { 2 , 0 } ,
{ 15 , 0 } , { 15 , 0 } , { 15 , 0 } , { 15 , 0 } ,
{ 15 , 0 } , { 2 , 0 } , { 2 , 0 } , { 15 , 0 } ,
// For i < 64, candidateFixUpIndex1DOrdered[i][1] is a placeholder and should not be used.
{ 3 , 15 } , { 3 , 8 } , { 8 , 15 } , { 3 , 15 } ,
{ 8 , 15 } , { 3 , 15 } , { 3 , 15 } , { 8 , 15 } ,
{ 8 , 15 } , { 8 , 15 } , { 6 , 15 } , { 6 , 15 } ,
{ 6 , 15 } , { 5 , 15 } , { 3 , 15 } , { 3 , 8 } ,
{ 3 , 15 } , { 3 , 8 } , { 8 , 15 } , { 3 , 15 } ,
{ 3 , 15 } , { 3 , 8 } , { 6 , 15 } , { 8 , 10 } ,
{ 3 , 5 } , { 8 , 15 } , { 6 , 8 } , { 6 , 10 } ,
{ 8 , 15 } , { 5 , 15 } , { 10 , 15 } , { 8 , 15 } ,
{ 8 , 15 } , { 3 , 15 } , { 3 , 15 } , { 5 , 10 } ,
{ 6 , 10 } , { 8 , 10 } , { 8 , 9 } , { 10 , 15 } ,
{ 6 , 15 } , { 3 , 15 } , { 8 , 15 } , { 5 , 15 } ,
{ 3 , 15 } , { 6 , 15 } , { 6 , 15 } , { 8 , 15 } , // The spec does not mark the first fix-up index for this row; 15 was chosen for it and appears to be correct.
{ 3 , 15 } , { 3 , 15 } , { 5 , 15 } , { 5 , 15 } ,
{ 5 , 15 } , { 8 , 15 } , { 5 , 15 } , { 10 , 15 } ,
{ 5 , 15 } , { 10 , 15 } , { 8 , 15 } , { 13 , 15 } ,
{ 3 , 15 } , { 12 , 15 } , { 3 , 15 } , { 3 , 8 } ,
} ;
//static const uint4x4 candidateRotation[4] =
//{
// {1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1},
// {0,0,0,1},{0,1,0,0},{0,0,1,0},{1,0,0,0},
// {1,0,0,0},{0,0,0,1},{0,0,1,0},{0,1,0,0},
// {1,0,0,0},{0,1,0,0},{0,0,0,1},{0,0,1,0}
//};
//static const uint2 candidateIndexPrec[8] = {{3,0},{3,0},{2,0},{2,0},
// {2,3}, //color index and alpha index can exchange
// {2,2},{4,4},{2,2}};
// Interpolation weights (out of 64) per index value, one row per index precision:
// row 0 = 4-bit indices (16 weights), row 1 = 3-bit indices (8 weights),
// row 2 = 2-bit indices (4 weights). Unused trailing slots are 0.
// A pixel is reconstructed as ( ( 64 - w ) * endPoint0 + w * endPoint1 + 32 ) >> 6.
static const uint aWeight [ 3 ] [ 16 ] = { { 0 , 4 , 9 , 13 , 17 , 21 , 26 , 30 , 34 , 38 , 43 , 47 , 51 , 55 , 60 , 64 } ,
{ 0 , 9 , 18 , 27 , 37 , 46 , 55 , 64 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 } ,
{ 0 , 21 , 43 , 64 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 } } ;
// Maps a position along the end point span, quantized to 0..63, to the index value to use.
// Rows use the same precision selector as aWeight
// (row 0 = 4-bit, row 1 = 3-bit, row 2 = 2-bit indices).
// The weight list above each row repeats the matching aWeight row for reference.
//4 bit index: 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64
static const uint aStep [ 3 ] [ 64 ] = { { 0 , 0 , 0 , 1 , 1 , 1 , 1 , 2 ,
2 , 2 , 2 , 2 , 3 , 3 , 3 , 3 ,
4 , 4 , 4 , 4 , 5 , 5 , 5 , 5 ,
6 , 6 , 6 , 6 , 6 , 7 , 7 , 7 ,
7 , 8 , 8 , 8 , 8 , 9 , 9 , 9 ,
9 , 10 , 10 , 10 , 10 , 10 , 11 , 11 ,
11 , 11 , 12 , 12 , 12 , 12 , 13 , 13 ,
13 , 13 , 14 , 14 , 14 , 14 , 15 , 15 } ,
//3 bit index: 0, 9, 18, 27, 37, 46, 55, 64
{ 0 , 0 , 0 , 0 , 0 , 1 , 1 , 1 ,
1 , 1 , 1 , 1 , 1 , 1 , 2 , 2 ,
2 , 2 , 2 , 2 , 2 , 2 , 2 , 3 ,
3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 ,
3 , 4 , 4 , 4 , 4 , 4 , 4 , 4 ,
4 , 4 , 5 , 5 , 5 , 5 , 5 , 5 ,
5 , 5 , 5 , 6 , 6 , 6 , 6 , 6 ,
6 , 6 , 6 , 6 , 7 , 7 , 7 , 7 } ,
//2 bit index: 0, 21, 43, 64
{ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
0 , 0 , 0 , 1 , 1 , 1 , 1 , 1 ,
1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
1 , 2 , 2 , 2 , 2 , 2 , 2 , 2 ,
2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 ,
2 , 2 , 2 , 2 , 2 , 2 , 3 , 3 ,
3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 } } ;
// Per-dispatch constants, bound to constant-buffer slot b0.
cbuffer cbCS : register ( b0 )
{
uint g_tex_width ; // NOTE(review): presumably the source texture width in texels - not read in this part of the file
uint g_num_block_x ; // number of 4x4 blocks per row; converts a linear block id into block x / y
uint g_format ; // NOTE(review): presumably the target format (cf. BC7_UNORM) - confirm against the host code
uint g_mode_id ; // BC7 mode evaluated by the current dispatch (tested against 1 / 3 / 7 and 0 by the entry points)
uint g_start_block_id ; // linear id of the first block handled by this dispatch
uint g_num_total_blocks ; // bound checked by TryMode456CS's early return when REF_DEVICE is not defined
float g_alpha_weight ; // weight applied to the alpha term in ComputeError
} ;
// Forward declarations of helpers defined later in the file.
// compress_endpointsN: per-mode end point quantization. endPoint is inout, and the entry
// points in this part of the file use the modified argument and ignore the return value.
// P carries the p-bit(s) for the modes that take one.
// block_packageN: per-mode packing of the final 128-bit block into `block`.
// NOTE(review): the definitions are outside this part of the file; the one-line summaries
// above are inferred from names and call sites - confirm against the definitions.
uint2x4 compress_endpoints0 ( inout uint2x4 endPoint , uint2 P ) ; //Mode = 0
uint2x4 compress_endpoints1 ( inout uint2x4 endPoint , uint2 P ) ; //Mode = 1
uint2x4 compress_endpoints2 ( inout uint2x4 endPoint ) ; //Mode = 2
uint2x4 compress_endpoints3 ( inout uint2x4 endPoint , uint2 P ) ; //Mode = 3
uint2x4 compress_endpoints7 ( inout uint2x4 endPoint , uint2 P ) ; //Mode = 7
uint2x4 compress_endpoints6 ( inout uint2x4 endPoint , uint2 P ) ; //Mode = 6
uint2x4 compress_endpoints4 ( inout uint2x4 endPoint ) ; //Mode = 4
uint2x4 compress_endpoints5 ( inout uint2x4 endPoint ) ; //Mode = 5
void block_package0 ( out uint4 block , uint partition , uint threadBase ) ; //Mode0
void block_package1 ( out uint4 block , uint partition , uint threadBase ) ; //Mode1
void block_package2 ( out uint4 block , uint partition , uint threadBase ) ; //Mode2
void block_package3 ( out uint4 block , uint partition , uint threadBase ) ; //Mode3
void block_package4 ( out uint4 block , uint rotation , uint index_selector , uint threadBase ) ; //Mode4
void block_package5 ( out uint4 block , uint rotation , uint threadBase ) ; //Mode5
void block_package6 ( out uint4 block , uint threadBase ) ; //Mode6
void block_package7 ( out uint4 block , uint partition , uint threadBase ) ; //Mode7
// Exchanges the contents of two uint4 values.
void swap(inout uint4 lhs, inout uint4 rhs)
{
    const uint4 previousLhs = lhs;
    lhs = rhs;
    rhs = previousLhs;
}
// Exchanges the contents of two uint3 values.
void swap(inout uint3 lhs, inout uint3 rhs)
{
    const uint3 previousLhs = lhs;
    lhs = rhs;
    rhs = previousLhs;
}
// Exchanges the contents of two uint scalars.
void swap(inout uint lhs, inout uint rhs)
{
    const uint previousLhs = lhs;
    lhs = rhs;
    rhs = previousLhs;
}
// Weighted dot product of two RGBA vectors: the RGB dot product plus the alpha product
// scaled by g_alpha_weight. Callers pass the same per-channel difference for both
// arguments, which yields a weighted squared error.
// The sum is evaluated in floating point and converted to uint on return.
uint ComputeError(in uint4 a, in uint4 b)
{
    const uint colorTerm = dot(a.rgb, b.rgb);
    const float alphaTerm = g_alpha_weight * a.a * b.a;
    return uint(colorTerm + alphaTerm);
}
// Rearranges the two vectors per component so that, on return, every component of `a`
// is >= the matching component of `b`. Callers use this before an unsigned subtraction
// (a - b) so that the difference cannot wrap around.
void Ensure_A_Is_Larger(inout uint4 a, inout uint4 b)
{
    const uint4 larger = max(a, b);
    const uint4 smaller = min(a, b);
    a = larger;
    b = smaller;
}
// Source image; the entry points Load() one texel per pixel of each 4x4 block (mip 0).
Texture2D g_Input : register ( t0 ) ;
// Per-block result of an earlier pass, read as ( error , mode , ... ); TryMode137CS keeps
// this entry whenever its error is not larger than the newly computed one.
StructuredBuffer < uint4 > g_InBuff : register ( t1 ) ;
// Per-block result written by the current pass, indexed by linear block id.
RWStructuredBuffer < uint4 > g_OutBuff : register ( u0 ) ;
// Threads per thread group (used in every numthreads attribute and as the size of shared_temp).
#define THREAD_GROUP_SIZE 64
// A compressed block covers BLOCK_SIZE_X x BLOCK_SIZE_Y = 4 x 4 pixels.
#define BLOCK_SIZE_Y 4
#define BLOCK_SIZE_X 4
#define BLOCK_SIZE (BLOCK_SIZE_Y * BLOCK_SIZE_X)
// Per-thread scratch record kept in group-shared memory.
struct BufferShared
{
uint4 pixel ; // source texel scaled to 0..255 per channel
uint error ; // error of this thread's candidate; 0xFFFFFFFF means "no candidate"
uint mode ; // BC7 mode of the candidate
uint partition ; // partition id of the candidate (multi-subset modes)
uint index_selector ; // mode 4 index selector of the candidate
uint rotation ; // component rotation for modes 4 / 5; reused to carry the p-bits for the other modes
uint4 endPoint_low ; // running per-channel minimum during the end point reduction
uint4 endPoint_high ; // running per-channel maximum during the end point reduction
uint4 endPoint_low_quantized ; // NOTE(review): not referenced in this part of the file - presumably used by later passes
uint4 endPoint_high_quantized ; // NOTE(review): not referenced in this part of the file - presumably used by later passes
} ;
// One scratch record per thread of the group.
groupshared BufferShared shared_temp [ THREAD_GROUP_SIZE ] ;
// Evaluates BC7 modes 4, 5 and 6 for 4x4 blocks and records the best candidate per block.
//
// Work distribution: 16 threads per block, 4 blocks per thread group.
//   threads  0..7   mode 4, one thread per (rotation, index selector) combination
//   threads  8..11  mode 5, one thread per rotation
//   threads 12..15  mode 6, one thread per combination of the two p-bits
//
// GI      : flattened thread index inside the group (0..THREAD_GROUP_SIZE-1)
// groupID : thread-group id; only .x is used
//
// Output (written by the first thread of each block):
//   g_OutBuff[blockID] = uint4(error, (index_selector << 31) | mode, 0, rotation)
//   where rotation holds the component rotation for modes 4 / 5 and the p-bits for mode 6.
[numthreads(THREAD_GROUP_SIZE, 1, 1)]
void TryMode456CS(uint GI : SV_GroupIndex, uint3 groupID : SV_GroupID) // mode 4 5 6 all have 1 subset per block, and fix-up index is always index 0
{
    // we process 4 BC blocks per thread group
    const uint MAX_USED_THREAD = 16;                                                // pixels in a BC (block compressed) block
    uint BLOCK_IN_GROUP = THREAD_GROUP_SIZE / MAX_USED_THREAD;                      // the number of BC blocks a thread group processes = 64 / 16 = 4
    uint blockInGroup = GI / MAX_USED_THREAD;                                       // what BC block this thread is on within this thread group
    uint blockID = g_start_block_id + groupID.x * BLOCK_IN_GROUP + blockInGroup;    // what global BC block this thread is on
    uint threadBase = blockInGroup * MAX_USED_THREAD;                               // the first id of the pixel in this BC block in this thread group
    uint threadInBlock = GI - threadBase;                                           // id of the pixel in this BC block

    // The early return is compiled out when REF_DEVICE is defined, because the
    // group-wide barriers below are only compiled in under REF_DEVICE.
#ifndef REF_DEVICE
    if (blockID >= g_num_total_blocks)
    {
        return;
    }
#endif

    uint block_y = blockID / g_num_block_x;
    uint block_x = blockID - block_y * g_num_block_x;
    uint base_x = block_x * BLOCK_SIZE_X;
    uint base_y = block_y * BLOCK_SIZE_Y;

    // Each thread loads one pixel of its block and seeds the min / max reduction with it.
    if (threadInBlock < 16)
    {
        shared_temp[GI].pixel = clamp(uint4(g_Input.Load(uint3(base_x + threadInBlock % 4, base_y + threadInBlock / 4, 0)) * 255), 0, 255);

        shared_temp[GI].endPoint_low = shared_temp[GI].pixel;
        shared_temp[GI].endPoint_high = shared_temp[GI].pixel;
    }
#ifdef REF_DEVICE
    GroupMemoryBarrierWithGroupSync();
#endif

    // Tree reduction (strides 8, 4, 2, 1): per-channel min / max of the block ends up
    // in shared_temp[threadBase].
    if (threadInBlock < 8)
    {
        shared_temp[GI].endPoint_low = min(shared_temp[GI].endPoint_low, shared_temp[GI + 8].endPoint_low);
        shared_temp[GI].endPoint_high = max(shared_temp[GI].endPoint_high, shared_temp[GI + 8].endPoint_high);
    }
#ifdef REF_DEVICE
    GroupMemoryBarrierWithGroupSync();
#endif
    if (threadInBlock < 4)
    {
        shared_temp[GI].endPoint_low = min(shared_temp[GI].endPoint_low, shared_temp[GI + 4].endPoint_low);
        shared_temp[GI].endPoint_high = max(shared_temp[GI].endPoint_high, shared_temp[GI + 4].endPoint_high);
    }
#ifdef REF_DEVICE
    GroupMemoryBarrierWithGroupSync();
#endif
    if (threadInBlock < 2)
    {
        shared_temp[GI].endPoint_low = min(shared_temp[GI].endPoint_low, shared_temp[GI + 2].endPoint_low);
        shared_temp[GI].endPoint_high = max(shared_temp[GI].endPoint_high, shared_temp[GI + 2].endPoint_high);
    }
#ifdef REF_DEVICE
    GroupMemoryBarrierWithGroupSync();
#endif
    if (threadInBlock < 1)
    {
        shared_temp[GI].endPoint_low = min(shared_temp[GI].endPoint_low, shared_temp[GI + 1].endPoint_low);
        shared_temp[GI].endPoint_high = max(shared_temp[GI].endPoint_high, shared_temp[GI + 1].endPoint_high);
    }
#ifdef REF_DEVICE
    GroupMemoryBarrierWithGroupSync();
#endif

    uint2x4 endPoint;
    endPoint[0] = shared_temp[threadBase].endPoint_low;
    endPoint[1] = shared_temp[threadBase].endPoint_high;

    uint error = 0xFFFFFFFF;
    uint mode = 0;
    uint index_selector = 0;
    uint rotation = 0;

    // indexPrec selects the aStep / aWeight row: 1 = 3-bit indices, 2 = 2-bit indices.
    // .x is used for the color index, .y for the alpha index.
    uint2 indexPrec;
    if (threadInBlock < 8) // all threads of threadInBlock < 8 will be working on trying out mode 4, since only mode 4 has index selector bit
    {
        if (0 == (threadInBlock & 1)) // thread 0, 2, 4, 6
        {
            //2 represents 2bit index precision; 1 represents 3bit index precision
            index_selector = 0;
            indexPrec = uint2(2, 1);
        }
        else                          // thread 1, 3, 5, 7
        {
            //2 represents 2bit index precision; 1 represents 3bit index precision
            index_selector = 1;
            indexPrec = uint2(1, 2);
        }
    }
    else
    {
        //2 represents 2bit index precision
        indexPrec = uint2(2, 2);
    }

    uint4 pixel_r;
    uint color_index;
    uint alpha_index;
    int4 span;
    int2 span_norm_sqr;
    int2 dotProduct;
    if (threadInBlock < 12) // Try mode 4 5 in threads 0..11
    {
        // mode 4 5 have component rotation: the alpha channel is exchanged with r, g or b
        if ((threadInBlock < 2) || (8 == threadInBlock))       // rotation = 0 in thread 0, 1 (mode 4) and 8 (mode 5)
        {
            rotation = 0;
        }
        else if ((threadInBlock < 4) || (9 == threadInBlock))  // rotation = 1 in thread 2, 3 (mode 4) and 9 (mode 5)
        {
            endPoint[0].ra = endPoint[0].ar;
            endPoint[1].ra = endPoint[1].ar;

            rotation = 1;
        }
        else if ((threadInBlock < 6) || (10 == threadInBlock)) // rotation = 2 in thread 4, 5 (mode 4) and 10 (mode 5)
        {
            endPoint[0].ga = endPoint[0].ag;
            endPoint[1].ga = endPoint[1].ag;

            rotation = 2;
        }
        else if ((threadInBlock < 8) || (11 == threadInBlock)) // rotation = 3 in thread 6, 7 (mode 4) and 11 (mode 5)
        {
            endPoint[0].ba = endPoint[0].ab;
            endPoint[1].ba = endPoint[1].ab;

            rotation = 3;
        }

        if (threadInBlock < 8)  // try mode 4 in threads 0..7
        {
            // mode 4 thread distribution
            // Thread           0   1   2   3   4   5   6   7
            // Rotation         0   0   1   1   2   2   3   3
            // Index selector   0   1   0   1   0   1   0   1

            mode = 4;
            compress_endpoints4(endPoint);
        }
        else                    // try mode 5 in threads 8..11
        {
            // mode 5 thread distribution
            // Thread    8  9  10  11
            // Rotation  0  1   2   3

            mode = 5;
            compress_endpoints5(endPoint);
        }

        // Pixel 0 of the block, rotated the same way as the end points.
        uint4 pixel = shared_temp[threadBase + 0].pixel;
        if (1 == rotation)
        {
            pixel.ra = pixel.ar;
        }
        else if (2 == rotation)
        {
            pixel.ga = pixel.ag;
        }
        else if (3 == rotation)
        {
            pixel.ba = pixel.ab;
        }

        span = endPoint[1] - endPoint[0];
        span_norm_sqr = uint2(dot(span.rgb, span.rgb), span.a * span.a);

        // in mode 4 5 6, end point 0 must be closer to pixel 0 than end point 1, because of the fix-up index is always index 0
        // TODO: this shouldn't be necessary here in error calculation
        // The test below compares squared distances of pixel 0 to both end points. It is
        // intended to be equivalent to the projection test used by the mode 6 branch
        // (dotProduct * 63.49999 > 32 * span_norm_sqr), applied to color and alpha separately.
        dotProduct = int2(dot(pixel.rgb - endPoint[0].rgb, pixel.rgb - endPoint[0].rgb), dot(pixel.rgb - endPoint[1].rgb, pixel.rgb - endPoint[1].rgb));
        if (dotProduct.x > dotProduct.y)
        {
            span.rgb = -span.rgb;
            swap(endPoint[0].rgb, endPoint[1].rgb);
        }
        dotProduct = int2(dot(pixel.a - endPoint[0].a, pixel.a - endPoint[0].a), dot(pixel.a - endPoint[1].a, pixel.a - endPoint[1].a));
        if (dotProduct.x > dotProduct.y)
        {
            span.a = -span.a;
            swap(endPoint[0].a, endPoint[1].a);
        }

        // Accumulate the reconstruction error over the 16 pixels of the block.
        error = 0;
        for (uint i = 0; i < 16; i++)
        {
            pixel = shared_temp[threadBase + i].pixel;
            if (1 == rotation)
            {
                pixel.ra = pixel.ar;
            }
            else if (2 == rotation)
            {
                pixel.ga = pixel.ag;
            }
            else if (3 == rotation)
            {
                pixel.ba = pixel.ab;
            }

            // Project the pixel onto the end point span and map the position to an index.
            dotProduct.x = dot(span.rgb, pixel.rgb - endPoint[0].rgb);
            color_index = (span_norm_sqr.x <= 0 /*endPoint[0] == endPoint[1]*/ || dotProduct.x <= 0 /*pixel == endPoint[0]*/) ? 0
                : ((dotProduct.x < span_norm_sqr.x) ? aStep[indexPrec.x][uint(dotProduct.x * 63.49999 / span_norm_sqr.x)] : aStep[indexPrec.x][63]);
            dotProduct.y = dot(span.a, pixel.a - endPoint[0].a);
            alpha_index = (span_norm_sqr.y <= 0 || dotProduct.y <= 0) ? 0
                : ((dotProduct.y < span_norm_sqr.y) ? aStep[indexPrec.y][uint(dotProduct.y * 63.49999 / span_norm_sqr.y)] : aStep[indexPrec.y][63]);

            // color_index and alpha_index are deliberately NOT swapped when index_selector
            // is set: the same indices must be used for the reconstruction below.

            pixel_r.rgb = ((64 - aWeight[indexPrec.x][color_index]) * endPoint[0].rgb +
                aWeight[indexPrec.x][color_index] * endPoint[1].rgb +
                32) >> 6;
            pixel_r.a = ((64 - aWeight[indexPrec.y][alpha_index]) * endPoint[0].a +
                aWeight[indexPrec.y][alpha_index] * endPoint[1].a +
                32) >> 6;

            // Unsigned |pixel_r - pixel| per channel.
            Ensure_A_Is_Larger(pixel_r, pixel);
            pixel_r -= pixel;

            // Undo the rotation so that g_alpha_weight is applied to the real alpha channel.
            if (1 == rotation)
            {
                pixel_r.ra = pixel_r.ar;
            }
            else if (2 == rotation)
            {
                pixel_r.ga = pixel_r.ag;
            }
            else if (3 == rotation)
            {
                pixel_r.ba = pixel_r.ab;
            }
            error += ComputeError(pixel_r, pixel_r);
        }
    }
    else if (threadInBlock < 16) // Try mode 6 in threads 12..15, since in mode 4 5 6, only mode 6 has p bit
    {
        uint p = threadInBlock - 12;

        // Bit 0 of p is the p-bit passed for end point 0, bit 1 the one for end point 1.
        compress_endpoints6(endPoint, uint2(p >> 0, p >> 1) & 1);

        uint4 pixel = shared_temp[threadBase + 0].pixel;

        span = endPoint[1] - endPoint[0];
        span_norm_sqr = dot(span, span);
        dotProduct = dot(span, pixel - endPoint[0]);
        // Swap the end points when pixel 0 projects past the middle of the span
        // (the fix-up index is always index 0).
        if (span_norm_sqr.x > 0 && dotProduct.x >= 0 && uint(dotProduct.x * 63.49999) > uint(32 * span_norm_sqr.x))
        {
            span = -span;
            swap(endPoint[0], endPoint[1]);
        }

        error = 0;
        for (uint i = 0; i < 16; i++)
        {
            pixel = shared_temp[threadBase + i].pixel;

            dotProduct.x = dot(span, pixel - endPoint[0]);
            color_index = (span_norm_sqr.x <= 0 || dotProduct.x <= 0) ? 0
                : ((dotProduct.x < span_norm_sqr.x) ? aStep[0][uint(dotProduct.x * 63.49999 / span_norm_sqr.x)] : aStep[0][63]);

            pixel_r = ((64 - aWeight[0][color_index]) * endPoint[0]
                + aWeight[0][color_index] * endPoint[1] + 32) >> 6;

            Ensure_A_Is_Larger(pixel_r, pixel);
            pixel_r -= pixel;
            error += ComputeError(pixel_r, pixel_r);
        }

        mode = 6;
        rotation = p;    // Borrow rotation for p
    }

    shared_temp[GI].error = error;
    shared_temp[GI].mode = mode;
    shared_temp[GI].index_selector = index_selector;
    shared_temp[GI].rotation = rotation;
#ifdef REF_DEVICE
    GroupMemoryBarrierWithGroupSync();
#endif

    // Tree reduction (strides 8, 4, 2, 1): keep the candidate with the smallest error.
    // On ties the lower-numbered thread's candidate is kept.
    if (threadInBlock < 8)
    {
        if (shared_temp[GI].error > shared_temp[GI + 8].error)
        {
            shared_temp[GI].error = shared_temp[GI + 8].error;
            shared_temp[GI].mode = shared_temp[GI + 8].mode;
            shared_temp[GI].index_selector = shared_temp[GI + 8].index_selector;
            shared_temp[GI].rotation = shared_temp[GI + 8].rotation;
        }
    }
#ifdef REF_DEVICE
    GroupMemoryBarrierWithGroupSync();
#endif
    if (threadInBlock < 4)
    {
        if (shared_temp[GI].error > shared_temp[GI + 4].error)
        {
            shared_temp[GI].error = shared_temp[GI + 4].error;
            shared_temp[GI].mode = shared_temp[GI + 4].mode;
            shared_temp[GI].index_selector = shared_temp[GI + 4].index_selector;
            shared_temp[GI].rotation = shared_temp[GI + 4].rotation;
        }
    }
#ifdef REF_DEVICE
    GroupMemoryBarrierWithGroupSync();
#endif
    if (threadInBlock < 2)
    {
        if (shared_temp[GI].error > shared_temp[GI + 2].error)
        {
            shared_temp[GI].error = shared_temp[GI + 2].error;
            shared_temp[GI].mode = shared_temp[GI + 2].mode;
            shared_temp[GI].index_selector = shared_temp[GI + 2].index_selector;
            shared_temp[GI].rotation = shared_temp[GI + 2].rotation;
        }
    }
#ifdef REF_DEVICE
    GroupMemoryBarrierWithGroupSync();
#endif
    if (threadInBlock < 1)
    {
        if (shared_temp[GI].error > shared_temp[GI + 1].error)
        {
            shared_temp[GI].error = shared_temp[GI + 1].error;
            shared_temp[GI].mode = shared_temp[GI + 1].mode;
            shared_temp[GI].index_selector = shared_temp[GI + 1].index_selector;
            shared_temp[GI].rotation = shared_temp[GI + 1].rotation;
        }

        g_OutBuff[blockID] = uint4(shared_temp[GI].error, (shared_temp[GI].index_selector << 31) | shared_temp[GI].mode,
            0, shared_temp[GI].rotation); // rotation is indeed rotation for mode 4 5. for mode 6, rotation is p bit
    }
}
// Evaluates one of the two-subset BC7 modes (1, 3 or 7, selected by g_mode_id) for a
// 4x4 block and keeps the result only if it beats the previous pass.
//
// Work distribution: one block per thread group; thread t evaluates partition t (0..63).
// For each partition every p-bit combination is tried, and the best p value is chosen
// independently for each of the two subsets.
//
// GI      : flattened thread index inside the group (0..THREAD_GROUP_SIZE-1)
// groupID : thread-group id; only .x is used
//
// Output (written by thread 0):
//   g_OutBuff[blockID] = uint4(error, mode, partition, p-bits) when error is smaller than
//   g_InBuff[blockID].x, otherwise g_InBuff[blockID] is copied through unchanged.
[numthreads(THREAD_GROUP_SIZE, 1, 1)]
void TryMode137CS(uint GI : SV_GroupIndex, uint3 groupID : SV_GroupID) // mode 1 3 7 all have 2 subsets per block
{
    const uint MAX_USED_THREAD = 64;
    uint BLOCK_IN_GROUP = THREAD_GROUP_SIZE / MAX_USED_THREAD;
    uint blockInGroup = GI / MAX_USED_THREAD;
    uint blockID = g_start_block_id + groupID.x * BLOCK_IN_GROUP + blockInGroup;
    uint threadBase = blockInGroup * MAX_USED_THREAD;
    uint threadInBlock = GI - threadBase;

    uint block_y = blockID / g_num_block_x;
    uint block_x = blockID - block_y * g_num_block_x;
    uint base_x = block_x * BLOCK_SIZE_X;
    uint base_y = block_y * BLOCK_SIZE_Y;

    // The first 16 threads load the 16 pixels of the block.
    if (threadInBlock < 16)
    {
        shared_temp[GI].pixel = clamp(uint4(g_Input.Load(uint3(base_x + threadInBlock % 4, base_y + threadInBlock / 4, 0)) * 255), 0, 255);
    }
    GroupMemoryBarrierWithGroupSync();

    shared_temp[GI].error = 0xFFFFFFFF;

    uint4 pixel_r;
    uint2x4 endPoint[2];        // endPoint[0..1 for subset id][0..1 for low and high in the subset]
    uint2x4 endPointBackup[2];
    uint color_index;
    if (threadInBlock < 64)
    {
        uint partition = threadInBlock;
        uint i; // loop index shared by the loops below, declared once in the enclosing scope

        // Per-subset bounding box (per-channel min / max) of the pixels.
        endPoint[0][0] = MAX_UINT;
        endPoint[0][1] = MIN_UINT;
        endPoint[1][0] = MAX_UINT;
        endPoint[1][1] = MIN_UINT;
        uint bits = candidateSectionBit[partition];
        for (i = 0; i < 16; i++)
        {
            uint4 pixel = shared_temp[threadBase + i].pixel;
            if (((bits >> i) & 0x01) == 1)
            {
                endPoint[1][0] = min(endPoint[1][0], pixel);
                endPoint[1][1] = max(endPoint[1][1], pixel);
            }
            else
            {
                endPoint[0][0] = min(endPoint[0][0], pixel);
                endPoint[0][1] = max(endPoint[0][1], pixel);
            }
        }

        endPointBackup[0] = endPoint[0];
        endPointBackup[1] = endPoint[1];

        uint max_p;
        if (1 == g_mode_id)
        {
            // in mode 1, there is only one p bit per subset
            max_p = 2;
        }
        else
        {
            // in mode 3 7, there are two p bits per subset, one for each end point
            max_p = 4;
        }

        uint final_p[2] = { 0, 0 };
        uint error[2] = { MAX_UINT, MAX_UINT };
        for (uint p = 0; p < max_p; p++)
        {
            // Every p value starts again from the unquantized end points.
            endPoint[0] = endPointBackup[0];
            endPoint[1] = endPointBackup[1];

            for (i = 0; i < 2; i++) // loop through 2 subsets
            {
                if (g_mode_id == 1)
                {
                    compress_endpoints1(endPoint[i], p);
                }
                else if (g_mode_id == 3)
                {
                    compress_endpoints3(endPoint[i], uint2(p, p >> 1) & 1);
                }
                else if (g_mode_id == 7)
                {
                    compress_endpoints7(endPoint[i], uint2(p, p >> 1) & 1);
                }
            }

            int4 span[2];
            span[0] = endPoint[0][1] - endPoint[0][0];
            span[1] = endPoint[1][1] - endPoint[1][0];

            // Only mode 7 takes alpha into account; the other modes ignore it in the projection.
            if (g_mode_id != 7)
            {
                span[0].w = span[1].w = 0;
            }

            int span_norm_sqr[2];
            span_norm_sqr[0] = dot(span[0], span[0]);
            span_norm_sqr[1] = dot(span[1], span[1]);

            // TODO: again, this shouldn't be necessary here in error calculation
            // Swap the end points of a subset when its anchor pixel projects past the
            // middle of the span. Subset 0 is anchored at pixel 0, subset 1 at the
            // fix-up index of the partition.
            int dotProduct = dot(span[0], shared_temp[threadBase + 0].pixel - endPoint[0][0]);
            if (span_norm_sqr[0] > 0 && dotProduct > 0 && uint(dotProduct * 63.49999) > uint(32 * span_norm_sqr[0]))
            {
                span[0] = -span[0];
                swap(endPoint[0][0], endPoint[0][1]);
            }
            dotProduct = dot(span[1], shared_temp[threadBase + candidateFixUpIndex1D[partition].x].pixel - endPoint[1][0]);
            if (span_norm_sqr[1] > 0 && dotProduct > 0 && uint(dotProduct * 63.49999) > uint(32 * span_norm_sqr[1]))
            {
                span[1] = -span[1];
                swap(endPoint[1][0], endPoint[1][1]);
            }

            // Row of aStep / aWeight to use.
            uint step_selector;
            if (g_mode_id != 1)
            {
                step_selector = 2;  // mode 3 7 have 2 bit index
            }
            else
            {
                step_selector = 1;  // mode 1 has 3 bit index
            }

            // Error of this p value, accumulated separately per subset.
            uint p_error[2] = { 0, 0 };
            for (i = 0; i < 16; i++)
            {
                uint subset_index = (bits >> i) & 0x01;

                if (subset_index == 1)
                {
                    dotProduct = dot(span[1], shared_temp[threadBase + i].pixel - endPoint[1][0]);
                    color_index = (span_norm_sqr[1] <= 0 || dotProduct <= 0) ? 0
                        : ((dotProduct < span_norm_sqr[1]) ? aStep[step_selector][uint(dotProduct * 63.49999 / span_norm_sqr[1])] : aStep[step_selector][63]);
                }
                else
                {
                    dotProduct = dot(span[0], shared_temp[threadBase + i].pixel - endPoint[0][0]);
                    color_index = (span_norm_sqr[0] <= 0 || dotProduct <= 0) ? 0
                        : ((dotProduct < span_norm_sqr[0]) ? aStep[step_selector][uint(dotProduct * 63.49999 / span_norm_sqr[0])] : aStep[step_selector][63]);
                }

                pixel_r = ((64 - aWeight[step_selector][color_index]) * endPoint[subset_index][0]
                    + aWeight[step_selector][color_index] * endPoint[subset_index][1] + 32) >> 6;
                if (g_mode_id != 7)
                {
                    // Alpha is not interpolated outside mode 7; it is reconstructed as 255.
                    pixel_r.a = 255;
                }

                // Unsigned |pixel_r - pixel| per channel.
                uint4 pixel = shared_temp[threadBase + i].pixel;
                Ensure_A_Is_Larger(pixel_r, pixel);
                pixel_r -= pixel;

                uint pixel_error = ComputeError(pixel_r, pixel_r);
                if (subset_index == 1)
                    p_error[1] += pixel_error;
                else
                    p_error[0] += pixel_error;
            }

            // Remember the best p value of each subset independently.
            for (i = 0; i < 2; i++)
            {
                if (p_error[i] < error[i])
                {
                    error[i] = p_error[i];
                    final_p[i] = p;
                }
            }
        }

        shared_temp[GI].error = error[0] + error[1];
        shared_temp[GI].mode = g_mode_id;
        shared_temp[GI].partition = partition;
        // mode 1 3 7 don't have rotation, we use rotation for p bits
        // mode 1:   one bit per subset  -> subset 1 in bit 1,     subset 0 in bit 0
        // mode 3 7: two bits per subset -> subset 1 in bits 3..2, subset 0 in bits 1..0
        if (g_mode_id == 1)
            shared_temp[GI].rotation = (final_p[1] << 1) | final_p[0];
        else
            shared_temp[GI].rotation = (final_p[1] << 2) | final_p[0];
    }
    GroupMemoryBarrierWithGroupSync();

    // Tree reduction (strides 32, 16, 8, 4, 2, 1): keep the partition with the smallest
    // error. On ties the lower-numbered thread's candidate is kept.
    if (threadInBlock < 32)
    {
        if (shared_temp[GI].error > shared_temp[GI + 32].error)
        {
            shared_temp[GI].error = shared_temp[GI + 32].error;
            shared_temp[GI].mode = shared_temp[GI + 32].mode;
            shared_temp[GI].partition = shared_temp[GI + 32].partition;
            shared_temp[GI].rotation = shared_temp[GI + 32].rotation;
        }
    }
#ifdef REF_DEVICE
    GroupMemoryBarrierWithGroupSync();
#endif
    if (threadInBlock < 16)
    {
        if (shared_temp[GI].error > shared_temp[GI + 16].error)
        {
            shared_temp[GI].error = shared_temp[GI + 16].error;
            shared_temp[GI].mode = shared_temp[GI + 16].mode;
            shared_temp[GI].partition = shared_temp[GI + 16].partition;
            shared_temp[GI].rotation = shared_temp[GI + 16].rotation;
        }
    }
#ifdef REF_DEVICE
    GroupMemoryBarrierWithGroupSync();
#endif
    if (threadInBlock < 8)
    {
        if (shared_temp[GI].error > shared_temp[GI + 8].error)
        {
            shared_temp[GI].error = shared_temp[GI + 8].error;
            shared_temp[GI].mode = shared_temp[GI + 8].mode;
            shared_temp[GI].partition = shared_temp[GI + 8].partition;
            shared_temp[GI].rotation = shared_temp[GI + 8].rotation;
        }
    }
#ifdef REF_DEVICE
    GroupMemoryBarrierWithGroupSync();
#endif
    if (threadInBlock < 4)
    {
        if (shared_temp[GI].error > shared_temp[GI + 4].error)
        {
            shared_temp[GI].error = shared_temp[GI + 4].error;
            shared_temp[GI].mode = shared_temp[GI + 4].mode;
            shared_temp[GI].partition = shared_temp[GI + 4].partition;
            shared_temp[GI].rotation = shared_temp[GI + 4].rotation;
        }
    }
#ifdef REF_DEVICE
    GroupMemoryBarrierWithGroupSync();
#endif
    if (threadInBlock < 2)
    {
        if (shared_temp[GI].error > shared_temp[GI + 2].error)
        {
            shared_temp[GI].error = shared_temp[GI + 2].error;
            shared_temp[GI].mode = shared_temp[GI + 2].mode;
            shared_temp[GI].partition = shared_temp[GI + 2].partition;
            shared_temp[GI].rotation = shared_temp[GI + 2].rotation;
        }
    }
#ifdef REF_DEVICE
    GroupMemoryBarrierWithGroupSync();
#endif
    if (threadInBlock < 1)
    {
        if (shared_temp[GI].error > shared_temp[GI + 1].error)
        {
            shared_temp[GI].error = shared_temp[GI + 1].error;
            shared_temp[GI].mode = shared_temp[GI + 1].mode;
            shared_temp[GI].partition = shared_temp[GI + 1].partition;
            shared_temp[GI].rotation = shared_temp[GI + 1].rotation;
        }

        // Keep the result of the previous pass unless this pass is strictly better.
        if (g_InBuff[blockID].x > shared_temp[GI].error)
        {
            g_OutBuff[blockID] = uint4(shared_temp[GI].error, shared_temp[GI].mode, shared_temp[GI].partition, shared_temp[GI].rotation); // mode 1 3 7 don't have rotation, we use rotation for p bits
        }
        else
        {
            g_OutBuff[blockID] = g_InBuff[blockID];
        }
    }
}
// Evaluates BC7 mode 0 or mode 2 (selected by g_mode_id) for 4x4 blocks.
//
// 64 threads cooperate on one block. Threads with threadInBlock < 16 load one
// pixel each into shared_temp. Every thread with threadInBlock < num_partitions
// then scores one 3-subset partition (ID threadInBlock + 64, i.e. an entry of
// candidateSectionBit2). A min-reduction over shared_temp picks the partition
// with the lowest error, and thread 0 writes it to g_OutBuff[blockID] only if
// it beats the error already recorded in g_InBuff[blockID].x; otherwise the
// previous record is copied through unchanged.
//
// Output record: uint4( error, g_mode_id, partition, packed p-bits ).
// The "rotation" field is borrowed to carry the chosen p value of each subset,
// two bits per subset: (p2 << 4) | (p1 << 2) | p0.
//
// The barriers wrapped in #ifdef REF_DEVICE are only compiled when REF_DEVICE
// is defined.
[numthreads( THREAD_GROUP_SIZE, 1, 1 )]
void TryMode02CS( uint GI : SV_GroupIndex, uint3 groupID : SV_GroupID ) // mode 0 2 have 3 subsets per block
{
    const uint MAX_USED_THREAD = 64;
    uint BLOCK_IN_GROUP = THREAD_GROUP_SIZE / MAX_USED_THREAD;
    uint blockInGroup = GI / MAX_USED_THREAD;
    uint blockID = g_start_block_id + groupID.x * BLOCK_IN_GROUP + blockInGroup;
    uint threadBase = blockInGroup * MAX_USED_THREAD;
    uint threadInBlock = GI - threadBase;

    uint block_y = blockID / g_num_block_x;
    uint block_x = blockID - block_y * g_num_block_x;
    uint base_x = block_x * BLOCK_SIZE_X;
    uint base_y = block_y * BLOCK_SIZE_Y;

    // One pixel per thread for the first 16 threads, scaled to 0..255.
    if (threadInBlock < 16)
    {
        shared_temp[GI].pixel = clamp(uint4(g_Input.Load(uint3(base_x + threadInBlock % 4, base_y + threadInBlock / 4, 0)) * 255), 0, 255);
    }
    GroupMemoryBarrierWithGroupSync();

    // Threads that do not evaluate a partition keep the worst possible error
    // so they never win the reduction below.
    shared_temp[GI].error = 0xFFFFFFFF;

    uint num_partitions;
    if (0 == g_mode_id)
    {
        num_partitions = 16;
    }
    else
    {
        num_partitions = 64;
    }

    uint4 pixel_r;
    uint2x4 endPoint[3];        // endPoint[0..2 for subset id][0..1 for low and high in the subset]
    uint2x4 endPointBackup[3];
    uint color_index[16];
    if (threadInBlock < num_partitions)
    {
        uint partition = threadInBlock + 64;

        endPoint[0][0] = MAX_UINT;
        endPoint[0][1] = MIN_UINT;
        endPoint[1][0] = MAX_UINT;
        endPoint[1][1] = MIN_UINT;
        endPoint[2][0] = MAX_UINT;
        endPoint[2][1] = MIN_UINT;

        // Two bits per pixel select the subset (0, 1 or 2) for this partition.
        uint bits2 = candidateSectionBit2[partition - 64];

        // Per-subset component-wise min/max of the block's pixels.
        for (uint i = 0; i < 16; i++)
        {
            uint4 pixel = shared_temp[threadBase + i].pixel;
            uint subset_index = (bits2 >> (i * 2)) & 0x03;
            if (subset_index == 2)
            {
                endPoint[2][0] = min(endPoint[2][0], pixel);
                endPoint[2][1] = max(endPoint[2][1], pixel);
            }
            else if (subset_index == 1)
            {
                endPoint[1][0] = min(endPoint[1][0], pixel);
                endPoint[1][1] = max(endPoint[1][1], pixel);
            }
            else
            {
                endPoint[0][0] = min(endPoint[0][0], pixel);
                endPoint[0][1] = max(endPoint[0][1], pixel);
            }
        }

        endPointBackup[0] = endPoint[0];
        endPointBackup[1] = endPoint[1];
        endPointBackup[2] = endPoint[2];

        // Mode 0 tries all four (low, high) p-bit combinations; otherwise a single pass.
        uint max_p;
        if (0 == g_mode_id)
        {
            max_p = 4;
        }
        else
        {
            max_p = 1;
        }

        // Best p value and error are tracked independently for each subset.
        uint final_p[3] = { 0, 0, 0 };
        uint error[3] = { MAX_UINT, MAX_UINT, MAX_UINT };
        for (uint p = 0; p < max_p; p++)
        {
            endPoint[0] = endPointBackup[0];
            endPoint[1] = endPointBackup[1];
            endPoint[2] = endPointBackup[2];

            for (i = 0; i < 3; i++)
            {
                if (0 == g_mode_id)
                {
                    // Bit 0 of p is the low endpoint's p-bit, bit 1 the high endpoint's.
                    compress_endpoints0(endPoint[i], uint2(p, p >> 1) & 1);
                }
                else
                {
                    compress_endpoints2(endPoint[i]);
                }
            }

            // Selects the aStep/aWeight table row: 1 for mode 0, 2 for mode 2.
            uint step_selector = 1 + (2 == g_mode_id);

            int4 span[3];
            span[0] = endPoint[0][1] - endPoint[0][0];
            span[1] = endPoint[1][1] - endPoint[1][0];
            span[2] = endPoint[2][1] - endPoint[2][0];
            span[0].w = span[1].w = span[2].w = 0;
            int span_norm_sqr[3];
            span_norm_sqr[0] = dot(span[0], span[0]);
            span_norm_sqr[1] = dot(span[1], span[1]);
            span_norm_sqr[2] = dot(span[2], span[2]);

            // TODO: again, this shouldn't be necessary here in error calculation
            // If a subset's fix-up pixel projects past the midpoint of the span,
            // swap that subset's endpoints.
            uint ci[3] = { 0, candidateFixUpIndex1D[partition].x, candidateFixUpIndex1D[partition].y };
            for (i = 0; i < 3; i++)
            {
                int dotProduct = dot(span[i], shared_temp[threadBase + ci[i]].pixel - endPoint[i][0]);
                if (span_norm_sqr[i] > 0 && dotProduct > 0 && uint(dotProduct * 63.49999) > uint(32 * span_norm_sqr[i]))
                {
                    span[i] = -span[i];
                    swap(endPoint[i][0], endPoint[i][1]);
                }
            }

            uint p_error[3] = { 0, 0, 0 };
            for (i = 0; i < 16; i++)
            {
                uint subset_index = (bits2 >> (i * 2)) & 0x03;
                if (subset_index == 2)
                {
                    int dotProduct = dot(span[2], shared_temp[threadBase + i].pixel - endPoint[2][0]);
                    color_index[i] = (span_norm_sqr[2] <= 0 || dotProduct <= 0) ? 0
                        : ((dotProduct < span_norm_sqr[2]) ? aStep[step_selector][uint(dotProduct * 63.49999 / span_norm_sqr[2])] : aStep[step_selector][63]);
                }
                else if (subset_index == 1)
                {
                    int dotProduct = dot(span[1], shared_temp[threadBase + i].pixel - endPoint[1][0]);
                    color_index[i] = (span_norm_sqr[1] <= 0 || dotProduct <= 0) ? 0
                        : ((dotProduct < span_norm_sqr[1]) ? aStep[step_selector][uint(dotProduct * 63.49999 / span_norm_sqr[1])] : aStep[step_selector][63]);
                }
                else
                {
                    int dotProduct = dot(span[0], shared_temp[threadBase + i].pixel - endPoint[0][0]);
                    color_index[i] = (span_norm_sqr[0] <= 0 || dotProduct <= 0) ? 0
                        : ((dotProduct < span_norm_sqr[0]) ? aStep[step_selector][uint(dotProduct * 63.49999 / span_norm_sqr[0])] : aStep[step_selector][63]);
                }

                // Reconstruct the pixel from the chosen index and measure its error.
                pixel_r = ((64 - aWeight[step_selector][color_index[i]]) * endPoint[subset_index][0]
                    + aWeight[step_selector][color_index[i]] * endPoint[subset_index][1] + 32) >> 6;
                pixel_r.a = 255;

                uint4 pixel = shared_temp[threadBase + i].pixel;
                Ensure_A_Is_Larger(pixel_r, pixel);
                pixel_r -= pixel;

                uint pixel_error = ComputeError(pixel_r, pixel_r);
                if (subset_index == 2)
                    p_error[2] += pixel_error;
                else if (subset_index == 1)
                    p_error[1] += pixel_error;
                else
                    p_error[0] += pixel_error;
            }

            for (i = 0; i < 3; i++)
            {
                if (p_error[i] < error[i])
                {
                    error[i] = p_error[i];
                    final_p[i] = p;    // Borrow rotation for p
                }
            }
        }

        shared_temp[GI].error = error[0] + error[1] + error[2];
        shared_temp[GI].partition = partition;
        shared_temp[GI].rotation = (final_p[2] << 4) | (final_p[1] << 2) | final_p[0];
    }
    GroupMemoryBarrierWithGroupSync();

    // Min-reduction over the 64 per-thread results of this block.
    if (threadInBlock < 32)
    {
        if (shared_temp[GI].error > shared_temp[GI + 32].error)
        {
            shared_temp[GI].error = shared_temp[GI + 32].error;
            shared_temp[GI].partition = shared_temp[GI + 32].partition;
            shared_temp[GI].rotation = shared_temp[GI + 32].rotation;
        }
    }
#ifdef REF_DEVICE
    GroupMemoryBarrierWithGroupSync();
#endif
    if (threadInBlock < 16)
    {
        if (shared_temp[GI].error > shared_temp[GI + 16].error)
        {
            shared_temp[GI].error = shared_temp[GI + 16].error;
            shared_temp[GI].partition = shared_temp[GI + 16].partition;
            shared_temp[GI].rotation = shared_temp[GI + 16].rotation;
        }
    }
#ifdef REF_DEVICE
    GroupMemoryBarrierWithGroupSync();
#endif
    if (threadInBlock < 8)
    {
        if (shared_temp[GI].error > shared_temp[GI + 8].error)
        {
            shared_temp[GI].error = shared_temp[GI + 8].error;
            shared_temp[GI].partition = shared_temp[GI + 8].partition;
            shared_temp[GI].rotation = shared_temp[GI + 8].rotation;
        }
    }
#ifdef REF_DEVICE
    GroupMemoryBarrierWithGroupSync();
#endif
    if (threadInBlock < 4)
    {
        if (shared_temp[GI].error > shared_temp[GI + 4].error)
        {
            shared_temp[GI].error = shared_temp[GI + 4].error;
            shared_temp[GI].partition = shared_temp[GI + 4].partition;
            shared_temp[GI].rotation = shared_temp[GI + 4].rotation;
        }
    }
#ifdef REF_DEVICE
    GroupMemoryBarrierWithGroupSync();
#endif
    if (threadInBlock < 2)
    {
        if (shared_temp[GI].error > shared_temp[GI + 2].error)
        {
            shared_temp[GI].error = shared_temp[GI + 2].error;
            shared_temp[GI].partition = shared_temp[GI + 2].partition;
            shared_temp[GI].rotation = shared_temp[GI + 2].rotation;
        }
    }
#ifdef REF_DEVICE
    GroupMemoryBarrierWithGroupSync();
#endif
    if (threadInBlock < 1)
    {
        if (shared_temp[GI].error > shared_temp[GI + 1].error)
        {
            shared_temp[GI].error = shared_temp[GI + 1].error;
            shared_temp[GI].partition = shared_temp[GI + 1].partition;
            shared_temp[GI].rotation = shared_temp[GI + 1].rotation;
        }

        // Keep whichever candidate is better: this pass or the one already recorded.
        if (g_InBuff[blockID].x > shared_temp[GI].error)
        {
            g_OutBuff[blockID] = uint4(shared_temp[GI].error, g_mode_id, shared_temp[GI].partition, shared_temp[GI].rotation); // rotation is actually p bit for mode 0. for mode 2, rotation is always 0
        }
        else
        {
            g_OutBuff[blockID] = g_InBuff[blockID];
        }
    }
}
// Produces the final BC7 block for each 4x4 tile from the mode/partition
// decision stored in g_InBuff[blockID]:
//   .y bits 0..30 = mode, .y bit 31 = index selector,
//   .z = partition, .w = rotation (modes 4/5) or packed p-bits (other modes).
//
// 16 threads cooperate on one block, one thread per pixel:
//   1. load the pixel (applying the channel rotation for modes 4/5),
//   2. for each subset (ii = 2, 1, 0) min/max-reduce the pixels belonging to
//      that subset; thread ii keeps the resulting endpoints,
//   3. threads 0..2 quantize their subset's endpoints and swap them when the
//      subset's anchor pixel would otherwise land in the upper half of the
//      index range,
//   4. every thread computes its pixel's colour (and, for modes 4/5, alpha)
//      index; these are parked in shared_temp[].error / shared_temp[].mode,
//   5. thread 0 packs everything with block_packageN and writes g_OutBuff.
//
// The barriers wrapped in #ifdef REF_DEVICE are only compiled when REF_DEVICE
// is defined; the early-out on blockID is only compiled when it is not.
[numthreads( THREAD_GROUP_SIZE, 1, 1 )]
void EncodeBlockCS( uint GI : SV_GroupIndex, uint3 groupID : SV_GroupID )
{
    const uint MAX_USED_THREAD = 16;
    uint BLOCK_IN_GROUP = THREAD_GROUP_SIZE / MAX_USED_THREAD;
    uint blockInGroup = GI / MAX_USED_THREAD;
    uint blockID = g_start_block_id + groupID.x * BLOCK_IN_GROUP + blockInGroup;
    uint threadBase = blockInGroup * MAX_USED_THREAD;
    uint threadInBlock = GI - threadBase;

#ifndef REF_DEVICE
    if (blockID >= g_num_total_blocks)
    {
        return;
    }
#endif

    uint block_y = blockID / g_num_block_x;
    uint block_x = blockID - block_y * g_num_block_x;
    uint base_x = block_x * BLOCK_SIZE_X;
    uint base_y = block_y * BLOCK_SIZE_Y;

    uint mode = g_InBuff[blockID].y & 0x7FFFFFFF;
    uint partition = g_InBuff[blockID].z;
    uint index_selector = (g_InBuff[blockID].y >> 31) & 1;
    uint rotation = g_InBuff[blockID].w;

    if (threadInBlock < 16)
    {
        uint4 pixel = clamp(uint4(g_Input.Load(uint3(base_x + threadInBlock % 4, base_y + threadInBlock / 4, 0)) * 255), 0, 255);

        // Modes 4 and 5 swap alpha with one colour channel according to rotation.
        if ((4 == mode) || (5 == mode))
        {
            if (1 == rotation)
            {
                pixel.ra = pixel.ar;
            }
            else if (2 == rotation)
            {
                pixel.ga = pixel.ag;
            }
            else if (3 == rotation)
            {
                pixel.ba = pixel.ab;
            }
        }

        shared_temp[GI].pixel = pixel;
    }
#ifdef REF_DEVICE
    GroupMemoryBarrierWithGroupSync();
#endif

    // bits  : 1 bit per pixel, used for the 2-subset modes (1, 3, 7).
    // bits2 : 2 bits per pixel, used for the 3-subset modes (0, 2).
    // NOTE(review): both tables are indexed unconditionally, so whichever one
    // does not match the block's mode is read out of range (partition >= 64
    // for bits, partition - 64 wrapping for bits2). The out-of-range value is
    // only consumed under mode checks that exclude it - confirm the target
    // compiler tolerates the read itself.
    uint bits = candidateSectionBit[partition];
    uint bits2 = candidateSectionBit2[partition - 64];

    uint2x4 ep;
    ep[0] = MAX_UINT;
    ep[1] = MIN_UINT;
    uint2x4 ep_quantized;
    [unroll]
    for (int ii = 2; ii >= 0; --ii)
    {
        if (threadInBlock < 16)
        {
            // Local candidate: this thread's pixel if it belongs to subset ii,
            // otherwise the identity for min/max.
            uint2x4 ep;
            ep[0] = MAX_UINT;
            ep[1] = MIN_UINT;

            uint4 pixel = shared_temp[GI].pixel;

            uint subset_index = (bits >> threadInBlock) & 0x01;
            uint subset_index2 = (bits2 >> (threadInBlock * 2)) & 0x03;
            if (0 == ii)
            {
                if ((0 == mode) || (2 == mode))
                {
                    if (0 == subset_index2)
                    {
                        ep[0] = ep[1] = pixel;
                    }
                }
                else if ((1 == mode) || (3 == mode) || (7 == mode))
                {
                    if (0 == subset_index)
                    {
                        ep[0] = ep[1] = pixel;
                    }
                }
                else if ((4 == mode) || (5 == mode) || (6 == mode))
                {
                    ep[0] = ep[1] = pixel;
                }
            }
            else if (1 == ii)
            {
                if ((0 == mode) || (2 == mode))
                {
                    if (1 == subset_index2)
                    {
                        ep[0] = ep[1] = pixel;
                    }
                }
                else if ((1 == mode) || (3 == mode) || (7 == mode))
                {
                    if (1 == subset_index)
                    {
                        ep[0] = ep[1] = pixel;
                    }
                }
            }
            else
            {
                if ((0 == mode) || (2 == mode))
                {
                    if (2 == subset_index2)
                    {
                        ep[0] = ep[1] = pixel;
                    }
                }
            }

            shared_temp[GI].endPoint_low = ep[0];
            shared_temp[GI].endPoint_high = ep[1];
        }
#ifdef REF_DEVICE
        GroupMemoryBarrierWithGroupSync();
#endif

        // Min/max reduction across the block's 16 threads.
        if (threadInBlock < 8)
        {
            shared_temp[GI].endPoint_low = min(shared_temp[GI].endPoint_low, shared_temp[GI + 8].endPoint_low);
            shared_temp[GI].endPoint_high = max(shared_temp[GI].endPoint_high, shared_temp[GI + 8].endPoint_high);
        }
#ifdef REF_DEVICE
        GroupMemoryBarrierWithGroupSync();
#endif
        if (threadInBlock < 4)
        {
            shared_temp[GI].endPoint_low = min(shared_temp[GI].endPoint_low, shared_temp[GI + 4].endPoint_low);
            shared_temp[GI].endPoint_high = max(shared_temp[GI].endPoint_high, shared_temp[GI + 4].endPoint_high);
        }
#ifdef REF_DEVICE
        GroupMemoryBarrierWithGroupSync();
#endif
        if (threadInBlock < 2)
        {
            shared_temp[GI].endPoint_low = min(shared_temp[GI].endPoint_low, shared_temp[GI + 2].endPoint_low);
            shared_temp[GI].endPoint_high = max(shared_temp[GI].endPoint_high, shared_temp[GI + 2].endPoint_high);
        }
#ifdef REF_DEVICE
        GroupMemoryBarrierWithGroupSync();
#endif
        if (threadInBlock < 1)
        {
            shared_temp[GI].endPoint_low = min(shared_temp[GI].endPoint_low, shared_temp[GI + 1].endPoint_low);
            shared_temp[GI].endPoint_high = max(shared_temp[GI].endPoint_high, shared_temp[GI + 1].endPoint_high);
        }
#ifdef REF_DEVICE
        GroupMemoryBarrierWithGroupSync();
#endif

        // Thread ii takes ownership of subset ii's endpoints (outer ep).
        if (ii == (int)threadInBlock)
        {
            ep[0] = shared_temp[threadBase].endPoint_low;
            ep[1] = shared_temp[threadBase].endPoint_high;
        }
    }

    if (threadInBlock < 3)
    {
        // Unpack this subset's p-bit(s) from the rotation field:
        // mode 1 stores one bit per subset, the other modes two (low, high).
        uint2 P;
        if (1 == mode)
        {
            P = (rotation >> threadInBlock) & 1;
        }
        else
        {
            P = uint2(rotation >> (threadInBlock * 2 + 0), rotation >> (threadInBlock * 2 + 1)) & 1;
        }

        if (0 == mode)
        {
            ep_quantized = compress_endpoints0(ep, P);
        }
        else if (1 == mode)
        {
            ep_quantized = compress_endpoints1(ep, P);
        }
        else if (2 == mode)
        {
            ep_quantized = compress_endpoints2(ep);
        }
        else if (3 == mode)
        {
            ep_quantized = compress_endpoints3(ep, P);
        }
        else if (4 == mode)
        {
            ep_quantized = compress_endpoints4(ep);
        }
        else if (5 == mode)
        {
            ep_quantized = compress_endpoints5(ep);
        }
        else if (6 == mode)
        {
            ep_quantized = compress_endpoints6(ep, P);
        }
        else //if (7 == mode)
        {
            ep_quantized = compress_endpoints7(ep, P);
        }

        int4 span = ep[1] - ep[0];
        if (mode < 4)
        {
            span.w = 0;
        }

        if ((4 == mode) || (5 == mode))
        {
            // Colour and alpha are handled independently; pixel 0 is the anchor for both.
            if (0 == threadInBlock)
            {
                int2 span_norm_sqr = uint2(dot(span.rgb, span.rgb), span.a * span.a);
                int2 dotProduct = int2(dot(span.rgb, shared_temp[threadBase + 0].pixel.rgb - ep[0].rgb), span.a * (shared_temp[threadBase + 0].pixel.a - ep[0].a));
                if (span_norm_sqr.x > 0 && dotProduct.x > 0 && uint(dotProduct.x * 63.49999) > uint(32 * span_norm_sqr.x))
                {
                    swap(ep[0].rgb, ep[1].rgb);
                    swap(ep_quantized[0].rgb, ep_quantized[1].rgb);
                }
                if (span_norm_sqr.y > 0 && dotProduct.y > 0 && uint(dotProduct.y * 63.49999) > uint(32 * span_norm_sqr.y))
                {
                    swap(ep[0].a, ep[1].a);
                    swap(ep_quantized[0].a, ep_quantized[1].a);
                }
            }
        }
        else //if ((0 == mode) || (2 == mode) || (1 == mode) || (3 == mode) || (7 == mode) || (6 == mode))
        {
            // p = index of the anchor pixel of this thread's subset.
            int p;
            if (0 == threadInBlock)
            {
                p = 0;
            }
            else if (1 == threadInBlock)
            {
                p = candidateFixUpIndex1D[partition].x;
            }
            else //if (2 == threadInBlock)
            {
                p = candidateFixUpIndex1D[partition].y;
            }

            int span_norm_sqr = dot(span, span);
            int dotProduct = dot(span, shared_temp[threadBase + p].pixel - ep[0]);
            if (span_norm_sqr > 0 && dotProduct > 0 && uint(dotProduct * 63.49999) > uint(32 * span_norm_sqr))
            {
                swap(ep[0], ep[1]);
                swap(ep_quantized[0], ep_quantized[1]);
            }
        }

        shared_temp[GI].endPoint_low = ep[0];
        shared_temp[GI].endPoint_high = ep[1];
        shared_temp[GI].endPoint_low_quantized = ep_quantized[0];
        shared_temp[GI].endPoint_high_quantized = ep_quantized[1];
    }
#ifdef REF_DEVICE
    GroupMemoryBarrierWithGroupSync();
#endif

    if (threadInBlock < 16)
    {
        uint color_index = 0;
        uint alpha_index = 0;

        uint2x4 ep;

        // Row of aStep to use for the colour (.x) and alpha (.y) indices.
        uint2 indexPrec;
        if ((0 == mode) || (1 == mode))
        {
            indexPrec = 1;
        }
        else if (6 == mode)
        {
            indexPrec = 0;
        }
        else if (4 == mode)
        {
            if (0 == index_selector)
            {
                indexPrec = uint2(2, 1);
            }
            else
            {
                indexPrec = uint2(1, 2);
            }
        }
        else
        {
            indexPrec = 2;
        }

        int subset_index;
        if ((0 == mode) || (2 == mode))
        {
            subset_index = (bits2 >> (threadInBlock * 2)) & 0x03;
        }
        else if ((1 == mode) || (3 == mode) || (7 == mode))
        {
            subset_index = (bits >> threadInBlock) & 0x01;
        }
        else
        {
            subset_index = 0;
        }

        // Subset s's endpoints were published by thread s above.
        ep[0] = shared_temp[threadBase + subset_index].endPoint_low;
        ep[1] = shared_temp[threadBase + subset_index].endPoint_high;

        int4 span = ep[1] - ep[0];
        if (mode < 4)
        {
            span.w = 0;
        }

        if ((4 == mode) || (5 == mode))
        {
            int2 span_norm_sqr;
            span_norm_sqr.x = dot(span.rgb, span.rgb);
            span_norm_sqr.y = span.a * span.a;

            int dotProduct = dot(span.rgb, shared_temp[threadBase + threadInBlock].pixel.rgb - ep[0].rgb);
            color_index = (span_norm_sqr.x <= 0 || dotProduct <= 0) ? 0
                : ((dotProduct < span_norm_sqr.x) ? aStep[indexPrec.x][uint(dotProduct * 63.49999 / span_norm_sqr.x)] : aStep[indexPrec.x][63]);
            dotProduct = dot(span.a, shared_temp[threadBase + threadInBlock].pixel.a - ep[0].a);
            alpha_index = (span_norm_sqr.y <= 0 || dotProduct <= 0) ? 0
                : ((dotProduct < span_norm_sqr.y) ? aStep[indexPrec.y][uint(dotProduct * 63.49999 / span_norm_sqr.y)] : aStep[indexPrec.y][63]);

            if (index_selector)
            {
                swap(color_index, alpha_index);
            }
        }
        else
        {
            int span_norm_sqr = dot(span, span);

            int dotProduct = dot(span, shared_temp[threadBase + threadInBlock].pixel - ep[0]);
            color_index = (span_norm_sqr <= 0 || dotProduct <= 0) ? 0
                : ((dotProduct < span_norm_sqr) ? aStep[indexPrec.x][uint(dotProduct * 63.49999 / span_norm_sqr)] : aStep[indexPrec.x][63]);
        }

        // The error/mode slots are reused as per-pixel index storage for packing.
        shared_temp[GI].error = color_index;
        shared_temp[GI].mode = alpha_index;
    }
#ifdef REF_DEVICE
    GroupMemoryBarrierWithGroupSync();
#endif

    if (0 == threadInBlock)
    {
        uint4 block;
        if (0 == mode)
        {
            block_package0(block, partition, threadBase);
        }
        else if (1 == mode)
        {
            block_package1(block, partition, threadBase);
        }
        else if (2 == mode)
        {
            block_package2(block, partition, threadBase);
        }
        else if (3 == mode)
        {
            block_package3(block, partition, threadBase);
        }
        else if (4 == mode)
        {
            block_package4(block, rotation, index_selector, threadBase);
        }
        else if (5 == mode)
        {
            block_package5(block, rotation, threadBase);
        }
        else if (6 == mode)
        {
            block_package6(block, threadBase);
        }
        else //if (7 == mode)
        {
            block_package7(block, partition, threadBase);
        }

        g_OutBuff[blockID] = block;
    }
}
//uint4 truncate_and_round( uint4 color, uint bits)
//{
// uint precisionMask = ((1 << bits) - 1) << (8 - bits);
// uint precisionHalf = (1 << (7-bits));
//
// uint4 truncated = color & precisionMask;
// uint4 rounded = min(255, color + precisionHalf) & precisionMask;
//
// uint4 truncated_bak = truncated = truncated | (truncated >> bits);
// uint4 rounded_bak = rounded = rounded | (rounded >> bits);
//
// uint4 color_bak = color;
//
// Ensure_A_Is_Larger( rounded, color );
// Ensure_A_Is_Larger( truncated, color_bak );
//
// if (dot(rounded - color, rounded - color) <
// dot(truncated - color_bak, truncated - color_bak))
// {
// return rounded_bak;
// }
// else
// {
// return truncated_bak;
// }
//}
// Rescales each channel of color to uPrec bits with rounding.
// (color << 8) + color multiplies the channel by 257, the result is scaled by
// the largest uPrec-bit value, and +32768 followed by >> 16 rounds to nearest.
// Presumably color channels are in 0..255 - the callers in this file pass
// clamped pixels; confirm for any new caller.
uint4 quantize( uint4 color, uint uPrec )
{
    return (((color << 8) + color) * ((1 << uPrec) - 1) + 32768) >> 16;
}
// Expands uPrec-bit channel values back to 8 bits: the value is moved to the
// top of the byte and its high bits are replicated into the vacated low bits.
uint4 unquantize( uint4 color, uint uPrec )
{
    const uint4 topAligned = color << (8 - uPrec);
    const uint4 replicated = topAligned >> uPrec;
    return topAligned | replicated;
}
// Mode 0 endpoint compression.
// For each endpoint j (0 = low, 1 = high): rgb is quantized to 5 bits, the
// lowest bit is replaced by P[j], and alpha is forced to 0xFF.
//   endPoint : in/out, overwritten with the value the quantized endpoint
//              expands back to (alpha = 0xFF).
//   P        : p-bit for the low (.x) and high (.y) endpoint.
// Returns the quantized endpoints shifted left by 3 so they sit in the top
// bits of a byte.
uint2x4 compress_endpoints0( inout uint2x4 endPoint, uint2 P )
{
    uint2x4 quantized;
    [unroll] for (uint j = 0; j < 2; j++)
    {
        quantized[j].rgb = quantize(endPoint[j].rgbb, 5).rgb & 0xFFFFFFFE;
        quantized[j].rgb |= P[j];
        quantized[j].a = 0xFF;

        endPoint[j].rgb = unquantize(quantized[j].rgbb, 5).rgb;
        endPoint[j].a = 0xFF;

        quantized[j] <<= 3;
    }
    return quantized;
}
// Mode 1 endpoint compression.
// For each endpoint j (0 = low, 1 = high): rgb is quantized to 7 bits, the
// lowest bit is replaced by P[j], and alpha is forced to 0xFF.
//   endPoint : in/out, overwritten with the value the quantized endpoint
//              expands back to (alpha = 0xFF).
//   P        : p-bit applied to the low (.x) and high (.y) endpoint.
// Returns the quantized endpoints shifted left by 1.
uint2x4 compress_endpoints1( inout uint2x4 endPoint, uint2 P )
{
    uint2x4 quantized;
    [unroll] for (uint j = 0; j < 2; j++)
    {
        quantized[j].rgb = quantize(endPoint[j].rgbb, 7).rgb & 0xFFFFFFFE;
        quantized[j].rgb |= P[j];
        quantized[j].a = 0xFF;

        endPoint[j].rgb = unquantize(quantized[j].rgbb, 7).rgb;
        endPoint[j].a = 0xFF;

        quantized[j] <<= 1;
    }
    return quantized;
}
// Mode 2 endpoint compression (no p-bits).
// For each endpoint j (0 = low, 1 = high): rgb is quantized to 5 bits and
// alpha is forced to 0xFF.
//   endPoint : in/out, overwritten with the value the quantized endpoint
//              expands back to (alpha = 0xFF).
// Returns the quantized endpoints shifted left by 3.
uint2x4 compress_endpoints2( inout uint2x4 endPoint )
{
    uint2x4 quantized;
    [unroll] for (uint j = 0; j < 2; j++)
    {
        quantized[j].rgb = quantize(endPoint[j].rgbb, 5).rgb;
        quantized[j].a = 0xFF;

        endPoint[j].rgb = unquantize(quantized[j].rgbb, 5).rgb;
        endPoint[j].a = 0xFF;

        quantized[j] <<= 3;
    }
    return quantized;
}
// Mode 3 endpoint compression.
// No rescaling is needed: for each endpoint j (0 = low, 1 = high) the lowest
// bit of rgb is replaced by P[j] and alpha is forced to 0xFF.
//   endPoint : in/out, overwritten with the same values that are returned.
//   P        : p-bit for the low (.x) and high (.y) endpoint.
// Returns the quantized endpoints (unshifted).
uint2x4 compress_endpoints3( inout uint2x4 endPoint, uint2 P )
{
    uint2x4 quantized;
    for (uint j = 0; j < 2; j++)
    {
        quantized[j].rgb = endPoint[j].rgb & 0xFFFFFFFE;
        quantized[j].rgb |= P[j];
        quantized[j].a = 0xFF;

        endPoint[j].rgb = quantized[j].rgb;
        endPoint[j].a = 0xFF;
    }
    return quantized;
}
// Mode 4 endpoint compression (no p-bits).
// For each endpoint j (0 = low, 1 = high): rgb is quantized to 5 bits and
// alpha to 6 bits.
//   endPoint : in/out, overwritten with the values the quantized endpoint
//              expands back to.
// Returns the quantized endpoints with rgb shifted left by 3 and alpha by 2.
uint2x4 compress_endpoints4( inout uint2x4 endPoint )
{
    uint2x4 quantized;
    [unroll] for (uint j = 0; j < 2; j++)
    {
        quantized[j].rgb = quantize(endPoint[j].rgbb, 5).rgb;
        quantized[j].a = quantize(endPoint[j].a, 6).r;

        endPoint[j].rgb = unquantize(quantized[j].rgbb, 5).rgb;
        endPoint[j].a = unquantize(quantized[j].a, 6).r;

        quantized[j].rgb <<= 3;
        quantized[j].a <<= 2;
    }
    return quantized;
}
// Mode 5 endpoint compression (no p-bits).
// For each endpoint j (0 = low, 1 = high): rgb is quantized to 7 bits; alpha
// is passed through untouched.
//   endPoint : in/out, rgb overwritten with the value the quantized endpoint
//              expands back to; alpha is left as is.
// Returns the quantized endpoints with rgb shifted left by 1.
uint2x4 compress_endpoints5( inout uint2x4 endPoint )
{
    uint2x4 quantized;
    [unroll] for (uint j = 0; j < 2; j++)
    {
        quantized[j].rgb = quantize(endPoint[j].rgbb, 7).rgb;
        quantized[j].a = endPoint[j].a;

        endPoint[j].rgb = unquantize(quantized[j].rgbb, 7).rgb;
        // endPoint[j].a   Alpha is full precision

        quantized[j].rgb <<= 1;
    }
    return quantized;
}
// Mode 6 endpoint compression.
// No rescaling is needed: for each endpoint j (0 = low, 1 = high) the lowest
// bit of all four channels is replaced by P[j].
//   endPoint : in/out, overwritten with the same values that are returned.
//   P        : p-bit for the low (.x) and high (.y) endpoint.
// Returns the quantized endpoints (unshifted).
uint2x4 compress_endpoints6( inout uint2x4 endPoint, uint2 P )
{
    uint2x4 quantized;
    for (uint j = 0; j < 2; j++)
    {
        quantized[j] = endPoint[j] & 0xFFFFFFFE;
        quantized[j] |= P[j];

        endPoint[j] = quantized[j];
    }
    return quantized;
}
// Mode 7 endpoint compression.
// For each endpoint j (0 = low, 1 = high): all four channels are quantized to
// 6 bits and the lowest bit is replaced by P[j].
//   endPoint : in/out, overwritten with the value the quantized endpoint
//              expands back to.
//   P        : p-bit for the low (.x) and high (.y) endpoint.
// Returns the quantized endpoints shifted left by 2.
uint2x4 compress_endpoints7( inout uint2x4 endPoint, uint2 P )
{
    uint2x4 quantized;
    [unroll] for (uint j = 0; j < 2; j++)
    {
        quantized[j] = quantize(endPoint[j], 6) & 0xFFFFFFFE;
        quantized[j] |= P[j];

        endPoint[j] = unquantize(quantized[j], 6);
    }
    return quantized << 2;
}
// Accessors used by the block_package* functions. Each expands to an
// expression that references a variable named threadBase, which must be in
// scope at the point of use.
//
// get_end_point_l / get_end_point_h: quantized low / high endpoint of a
// subset, read from the shared_temp slot at threadBase + subset.
// get_color_index / get_alpha_index: per-pixel indices. EncodeBlockCS parks
// the colour index in the .error field and the alpha index in the .mode field
// before packing, so these macros read those reused fields.
#define get_end_point_l(subset) shared_temp[threadBase + subset].endPoint_low_quantized
#define get_end_point_h(subset) shared_temp[threadBase + subset].endPoint_high_quantized
#define get_color_index(index) shared_temp[threadBase + index].error
#define get_alpha_index(index) shared_temp[threadBase + index].mode
// Packs a mode 0 block (3 subsets) into 128 bits.
//   block      : out, the packed block (x = bits 0..31 ... w = bits 96..127).
//   partition  : partition ID; (partition - 64) is stored starting at bit 1.
//   threadBase : first shared_temp slot of this block (used by the accessors).
//
// Endpoints arrive with the 4 colour bits in 0xF0 and the p-bit in 0x08.
// Order written: mode bit, partition, R of all six endpoints, G, B, the six
// p-bits, then the per-pixel indices. Pixel 0 and the two fix-up pixels
// (candidateFixUpIndex1DOrdered[partition][0..1]) occupy one bit less than the
// others, so the shift of every following index drops by one after each.
void block_package0( out uint4 block, uint partition, uint threadBase )
{
    block.x = 0x01 | ((partition - 64) << 1)
        | ((get_end_point_l(0).r & 0xF0) << 1)  | ((get_end_point_h(0).r & 0xF0) << 5)
        | ((get_end_point_l(1).r & 0xF0) << 9)  | ((get_end_point_h(1).r & 0xF0) << 13)
        | ((get_end_point_l(2).r & 0xF0) << 17) | ((get_end_point_h(2).r & 0xF0) << 21)
        | ((get_end_point_l(0).g & 0xF0) << 25);
    block.y = ((get_end_point_l(0).g & 0xF0) >> 7)  | ((get_end_point_h(0).g & 0xF0) >> 3)
        | ((get_end_point_l(1).g & 0xF0) << 1)  | ((get_end_point_h(1).g & 0xF0) << 5)
        | ((get_end_point_l(2).g & 0xF0) << 9)  | ((get_end_point_h(2).g & 0xF0) << 13)
        | ((get_end_point_l(0).b & 0xF0) << 17) | ((get_end_point_h(0).b & 0xF0) << 21)
        | ((get_end_point_l(1).b & 0xF0) << 25);
    block.z = ((get_end_point_l(1).b & 0xF0) >> 7)  | ((get_end_point_h(1).b & 0xF0) >> 3)
        | ((get_end_point_l(2).b & 0xF0) << 1)  | ((get_end_point_h(2).b & 0xF0) << 5)
        | ((get_end_point_l(0).r & 0x08) << 10) | ((get_end_point_h(0).r & 0x08) << 11)
        | ((get_end_point_l(1).r & 0x08) << 12) | ((get_end_point_h(1).r & 0x08) << 13)
        | ((get_end_point_l(2).r & 0x08) << 14) | ((get_end_point_h(2).r & 0x08) << 15)
        | (get_color_index(0) << 19);
    block.w = 0;

    uint i = 1;
    for (; i <= min(candidateFixUpIndex1DOrdered[partition][0], 4); i++)
    {
        block.z |= get_color_index(i) << (i * 3 + 18);
    }
    if (candidateFixUpIndex1DOrdered[partition][0] < 4) //i = 4
    {
        // First fix-up already passed: index 4 fits entirely in block.z.
        block.z |= get_color_index(4) << 29;
        i += 1;
    }
    else //i = 5
    {
        // Index 4 straddles the z/w boundary; its top bit goes to w bit 0.
        block.w |= (get_color_index(4) & 0x04) >> 2;
        for (; i <= candidateFixUpIndex1DOrdered[partition][0]; i++)
            block.w |= get_color_index(i) << (i * 3 - 14);
    }
    for (; i <= candidateFixUpIndex1DOrdered[partition][1]; i++)
    {
        block.w |= get_color_index(i) << (i * 3 - 15);
    }
    for (; i < 16; i++)
    {
        block.w |= get_color_index(i) << (i * 3 - 16);
    }
}
// Packs a mode 1 block (2 subsets) into 128 bits.
//   block      : out, the packed block (x = bits 0..31 ... w = bits 96..127).
//   partition  : partition ID, stored starting at bit 2.
//   threadBase : first shared_temp slot of this block (used by the accessors).
//
// Endpoints arrive with the 6 colour bits in 0xFC and the p-bit in 0x02; only
// the low endpoint's p-bit of each subset is written. Pixel 0 and the fix-up
// pixel (candidateFixUpIndex1DOrdered[partition][0]) occupy one bit less than
// the other indices, so the index shifts depend on where the fix-up pixel is;
// each branch below hard-codes the layout for one fix-up position.
// get_color_index(0) << 18 is OR'ed again in every branch; it is already part
// of block.z, so the repeat has no effect.
void block_package1( out uint4 block, uint partition, uint threadBase )
{
    block.x = 0x02 | (partition << 2)
        | ((get_end_point_l(0).r & 0xFC) << 6)  | ((get_end_point_h(0).r & 0xFC) << 12)
        | ((get_end_point_l(1).r & 0xFC) << 18) | ((get_end_point_h(1).r & 0xFC) << 24);
    block.y = ((get_end_point_l(0).g & 0xFC) >> 2)  | ((get_end_point_h(0).g & 0xFC) << 4)
        | ((get_end_point_l(1).g & 0xFC) << 10) | ((get_end_point_h(1).g & 0xFC) << 16)
        | ((get_end_point_l(0).b & 0xFC) << 22) | ((get_end_point_h(0).b & 0xFC) << 28);
    block.z = ((get_end_point_h(0).b & 0xFC) >> 4)  | ((get_end_point_l(1).b & 0xFC) << 2)
        | ((get_end_point_h(1).b & 0xFC) << 8)
        | ((get_end_point_l(0).r & 0x02) << 15) | ((get_end_point_l(1).r & 0x02) << 16)
        | (get_color_index(0) << 18);
    if (candidateFixUpIndex1DOrdered[partition][0] == 15)
    {
        block.w = (get_color_index(15) << 30) | (get_color_index(14) << 27) | (get_color_index(13) << 24) | (get_color_index(12) << 21) | (get_color_index(11) << 18) | (get_color_index(10) << 15)
            | (get_color_index(9) << 12) | (get_color_index(8) << 9) | (get_color_index(7) << 6) | (get_color_index(6) << 3) | get_color_index(5);
        block.z |= (get_color_index(4) << 29) | (get_color_index(3) << 26) | (get_color_index(2) << 23) | (get_color_index(1) << 20) | (get_color_index(0) << 18);
    }
    else if (candidateFixUpIndex1DOrdered[partition][0] == 2)
    {
        block.w = (get_color_index(15) << 29) | (get_color_index(14) << 26) | (get_color_index(13) << 23) | (get_color_index(12) << 20) | (get_color_index(11) << 17) | (get_color_index(10) << 14)
            | (get_color_index(9) << 11) | (get_color_index(8) << 8) | (get_color_index(7) << 5) | (get_color_index(6) << 2) | (get_color_index(5) >> 1);
        block.z |= (get_color_index(5) << 31) | (get_color_index(4) << 28) | (get_color_index(3) << 25) | (get_color_index(2) << 23) | (get_color_index(1) << 20) | (get_color_index(0) << 18);
    }
    else if (candidateFixUpIndex1DOrdered[partition][0] == 8)
    {
        block.w = (get_color_index(15) << 29) | (get_color_index(14) << 26) | (get_color_index(13) << 23) | (get_color_index(12) << 20) | (get_color_index(11) << 17) | (get_color_index(10) << 14)
            | (get_color_index(9) << 11) | (get_color_index(8) << 9) | (get_color_index(7) << 6) | (get_color_index(6) << 3) | get_color_index(5);
        block.z |= (get_color_index(4) << 29) | (get_color_index(3) << 26) | (get_color_index(2) << 23) | (get_color_index(1) << 20) | (get_color_index(0) << 18);
    }
    else //candidateFixUpIndex1DOrdered[partition] == 6
    {
        block.w = (get_color_index(15) << 29) | (get_color_index(14) << 26) | (get_color_index(13) << 23) | (get_color_index(12) << 20) | (get_color_index(11) << 17) | (get_color_index(10) << 14)
            | (get_color_index(9) << 11) | (get_color_index(8) << 8) | (get_color_index(7) << 5) | (get_color_index(6) << 3) | get_color_index(5);
        block.z |= (get_color_index(4) << 29) | (get_color_index(3) << 26) | (get_color_index(2) << 23) | (get_color_index(1) << 20) | (get_color_index(0) << 18);
    }
}
// Packs a mode 2 block (3 subsets, no p-bits) into 128 bits.
//   block      : out, the packed block (x = bits 0..31 ... w = bits 96..127).
//   partition  : partition ID; (partition - 64) is stored starting at bit 3.
//   threadBase : first shared_temp slot of this block (used by the accessors).
//
// Endpoints arrive with the 5 colour bits in 0xF8. Order written: mode bits,
// partition, R of all six endpoints, G, B, then the per-pixel indices. Pixel 0
// and the two fix-up pixels (candidateFixUpIndex1DOrdered[partition][0..1])
// occupy one bit less than the others, so the shift of every following index
// drops by one after each.
void block_package2( out uint4 block, uint partition, uint threadBase )
{
    block.x = 0x04 | ((partition - 64) << 3)
        | ((get_end_point_l(0).r & 0xF8) << 6)  | ((get_end_point_h(0).r & 0xF8) << 11)
        | ((get_end_point_l(1).r & 0xF8) << 16) | ((get_end_point_h(1).r & 0xF8) << 21)
        | ((get_end_point_l(2).r & 0xF8) << 26);
    block.y = ((get_end_point_l(2).r & 0xF8) >> 6)  | ((get_end_point_h(2).r & 0xF8) >> 1)
        | ((get_end_point_l(0).g & 0xF8) << 4)  | ((get_end_point_h(0).g & 0xF8) << 9)
        | ((get_end_point_l(1).g & 0xF8) << 14) | ((get_end_point_h(1).g & 0xF8) << 19)
        | ((get_end_point_l(2).g & 0xF8) << 24);
    block.z = ((get_end_point_h(2).g & 0xF8) >> 3)  | ((get_end_point_l(0).b & 0xF8) << 2)
        | ((get_end_point_h(0).b & 0xF8) << 7)  | ((get_end_point_l(1).b & 0xF8) << 12)
        | ((get_end_point_h(1).b & 0xF8) << 17) | ((get_end_point_l(2).b & 0xF8) << 22)
        | ((get_end_point_h(2).b & 0xF8) << 27);
    block.w = ((get_end_point_h(2).b & 0xF8) >> 5)
        | (get_color_index(0) << 3);

    uint i = 1;
    for (; i <= candidateFixUpIndex1DOrdered[partition][0]; i++)
    {
        block.w |= get_color_index(i) << (i * 2 + 2);
    }
    for (; i <= candidateFixUpIndex1DOrdered[partition][1]; i++)
    {
        block.w |= get_color_index(i) << (i * 2 + 1);
    }
    for (; i < 16; i++)
    {
        block.w |= get_color_index(i) << (i * 2);
    }
}
// Packs a mode 3 block (2 subsets) into 128 bits.
//   block      : out, the packed block (x = bits 0..31 ... w = bits 96..127).
//   partition  : partition ID, stored starting at bit 4.
//   threadBase : first shared_temp slot of this block (used by the accessors).
//
// Endpoints arrive with the 7 colour bits in 0xFE and the p-bit in 0x01.
// Order written: mode bits, partition, R of all four endpoints, G, B, the four
// p-bits, then the per-pixel indices. Pixel 0 and the fix-up pixel
// (candidateFixUpIndex1DOrdered[partition][0]) occupy one bit less than the
// others, so the index shift drops by one after the fix-up pixel.
void block_package3( out uint4 block, uint partition, uint threadBase )
{
    block.x = 0x08 | (partition << 4)
        | ((get_end_point_l(0).r & 0xFE) << 9)  | ((get_end_point_h(0).r & 0xFE) << 16)
        | ((get_end_point_l(1).r & 0xFE) << 23) | ((get_end_point_h(1).r & 0xFE) << 30);
    block.y = ((get_end_point_h(1).r & 0xFE) >> 2)  | ((get_end_point_l(0).g & 0xFE) << 5)
        | ((get_end_point_h(0).g & 0xFE) << 12) | ((get_end_point_l(1).g & 0xFE) << 19)
        | ((get_end_point_h(1).g & 0xFE) << 26);
    block.z = ((get_end_point_h(1).g & 0xFE) >> 6)  | ((get_end_point_l(0).b & 0xFE) << 1)
        | ((get_end_point_h(0).b & 0xFE) << 8)  | ((get_end_point_l(1).b & 0xFE) << 15)
        | ((get_end_point_h(1).b & 0xFE) << 22)
        | ((get_end_point_l(0).r & 0x01) << 30) | ((get_end_point_h(0).r & 0x01) << 31);
    block.w = ((get_end_point_l(1).r & 0x01) << 0)  | ((get_end_point_h(1).r & 0x01) << 1)
        | (get_color_index(0) << 2);

    uint i = 1;
    for (; i <= candidateFixUpIndex1DOrdered[partition][0]; i++)
    {
        block.w |= get_color_index(i) << (i * 2 + 1);
    }
    for (; i < 16; i++)
    {
        block.w |= get_color_index(i) << (i * 2);
    }
}
// Builds the 128-bit output block tagged with mode bit 0x10 (BC7 mode 4, going by
// the function name and the file header).
//   block          : out - x/y/z/w receive bits 0-31 / 32-63 / 64-95 / 96-127.
//   rotation       : low two bits are stored starting at bit 5.
//   index_selector : low bit is stored at bit 7.
//   threadBase     : not referenced by name in this body; presumably consumed by the
//                    get_* accessors - confirm at their definitions.
void block_package4( out uint4 block, uint rotation, uint index_selector, uint threadBase )
{
    // ---- bits 0-31: header, 5-bit (mask 0xF8) red/green end points, start of low blue
    uint word0 = 0x10;
    word0 |= ( rotation & 3 ) << 5;
    word0 |= ( index_selector & 1 ) << 7;
    word0 |= ( get_end_point_l(0).r & 0xF8 ) << 5;
    word0 |= ( get_end_point_h(0).r & 0xF8 ) << 10;
    word0 |= ( get_end_point_l(0).g & 0xF8 ) << 15;
    word0 |= ( get_end_point_h(0).g & 0xF8 ) << 20;
    word0 |= ( get_end_point_l(0).b & 0xF8 ) << 25;

    // ---- bits 32-63: rest of blue, 6-bit (mask 0xFC) alpha end points, color indices 0-7
    uint word1 = ( get_end_point_l(0).b & 0xF8 ) >> 7;
    word1 |= ( get_end_point_h(0).b & 0xF8 ) >> 2;
    word1 |= ( get_end_point_l(0).a & 0xFC ) << 4;
    word1 |= ( get_end_point_h(0).a & 0xFC ) << 10;
    word1 |= ( get_color_index(0) & 1 ) << 18;          // only the low bit of index 0 is kept

    // ---- bits 64-95: rest of color index 7, color indices 8-15, alpha indices 0-5
    uint word2 = get_color_index(7) >> 1;

    // Color indices 1-7 land in word1 at bit 2*c+17, indices 8-15 in word2 at bit 2*c+1.
    for ( uint c = 1; c < 8; c++ )
    {
        word1 |= get_color_index(c)     << ( 2 * c + 17 );
        word2 |= get_color_index(c + 8) << ( 2 * c + 1 );
    }
    word2 |= get_color_index(8) << 1;
    word2 |= ( get_alpha_index(0) & 3 ) << 17;          // only the low two bits of alpha index 0 are kept

    // ---- bits 96-127: rest of alpha index 5, alpha indices 6-15
    uint word3 = get_alpha_index(5) >> 1;

    // Alpha indices 1-5 start at bit 3*a+16 of word2; 6-15 start at bit 3*a-16 of word3.
    for ( uint a = 1; a < 6; a++ )
    {
        word2 |= get_alpha_index(a) << ( 3 * a + 16 );
    }
    for ( uint b = 6; b < 16; b++ )
    {
        word3 |= get_alpha_index(b) << ( 3 * b - 16 );
    }

    block.x = word0;
    block.y = word1;
    block.z = word2;
    block.w = word3;
}
// Builds the 128-bit output block tagged with mode bit 0x20 (BC7 mode 5, going by
// the function name and the file header).
//   block      : out - x/y/z/w receive bits 0-31 / 32-63 / 64-95 / 96-127.
//   rotation   : low two bits are stored starting at bit 6.
//   threadBase : not referenced by name in this body; presumably consumed by the
//                get_* accessors - confirm at their definitions.
void block_package5( out uint4 block, uint rotation, uint threadBase )
{
    // The red end point starts at bit 8, so rotation is limited to two bits here;
    // a larger value would otherwise overwrite the end-point field.
    block.x = 0x20 | ( ( rotation & 3 ) << 6 )
        | ( ( get_end_point_l(0).r & 0xFE ) <<  7 ) | ( ( get_end_point_h(0).r & 0xFE ) << 14 )
        | ( ( get_end_point_l(0).g & 0xFE ) << 21 ) | ( ( get_end_point_h(0).g & 0xFE ) << 28 );

    // Rest of high green, 7-bit (mask 0xFE) blue end points, then the alpha end
    // points, which are written unmasked. High alpha straddles y and z.
    block.y = ( ( get_end_point_h(0).g & 0xFE ) >>  4 ) | ( ( get_end_point_l(0).b & 0xFE ) <<  3 )
        | ( ( get_end_point_h(0).b & 0xFE ) << 10 )
        | ( get_end_point_l(0).a << 18 ) | ( get_end_point_h(0).a << 26 );

    // z: rest of high alpha, color index 0 at bit 2, color index k at bit 2*k+1.
    // w: rest of color index 15, alpha index 0 at bit 1, alpha index k at bit 2*k.
    uint word_z = ( get_end_point_h(0).a >> 6 ) | ( get_color_index(0) << 2 );
    uint word_w = ( get_color_index(15) >> 1 ) | ( get_alpha_index(0) << 1 );
    for ( uint k = 1; k < 16; k++ )
    {
        word_z |= get_color_index(k) << ( 2 * k + 1 );
        word_w |= get_alpha_index(k) << ( 2 * k );
    }

    block.z = word_z;
    block.w = word_w;
}
// Builds the 128-bit output block tagged with mode bit 0x40 (BC7 mode 6, going by
// the function name and the file header).
//   block      : out - x/y/z/w receive bits 0-31 / 32-63 / 64-95 / 96-127.
//   threadBase : not referenced by name in this body; presumably consumed by the
//                get_* accessors - confirm at their definitions.
void block_package6( out uint4 block, uint threadBase )
{
    // Tag followed by the upper seven bits (mask 0xFE) of the red and green end points.
    uint head = 0x40;
    head |= ( get_end_point_l(0).r & 0xFE ) << 6;
    head |= ( get_end_point_h(0).r & 0xFE ) << 13;
    head |= ( get_end_point_l(0).g & 0xFE ) << 20;
    head |= ( get_end_point_h(0).g & 0xFE ) << 27;
    block.x = head;

    // Rest of high green, blue and alpha end points, and bit 0 of low red in the top bit.
    uint tail = ( get_end_point_h(0).g & 0xFE ) >> 5;
    tail |= ( get_end_point_l(0).b & 0xFE ) << 2;
    tail |= ( get_end_point_h(0).b & 0xFE ) << 9;
    tail |= ( get_end_point_l(0).a & 0xFE ) << 16;
    tail |= ( get_end_point_h(0).a & 0xFE ) << 23;
    tail |= ( get_end_point_l(0).r & 0x01 ) << 31;
    block.y = tail;

    // z starts with bit 0 of high red and color index 0 at bit 1; after that every
    // index k sits at bit 4*k of z (k = 1..7) or bit 4*(k-8) of w (k = 8..15).
    uint indices_lo = ( get_end_point_h(0).r & 0x01 ) | ( get_color_index(0) << 1 );
    uint indices_hi = get_color_index(8);
    for ( uint k = 1; k < 8; k++ )
    {
        indices_lo |= get_color_index(k)     << ( 4 * k );
        indices_hi |= get_color_index(k + 8) << ( 4 * k );
    }
    block.z = indices_lo;
    block.w = indices_hi;
}
// Builds the 128-bit output block tagged with mode bit 0x80 (BC7 mode 7, going by
// the function name and the file header).
//   block      : out - x/y/z/w receive bits 0-31 / 32-63 / 64-95 / 96-127 of the block;
//                fields that do not fit in one word continue in the next.
//   partition  : written starting at bit 8 and used to select a row of
//                candidateFixUpIndex1DOrdered.
//   threadBase : not referenced by name in this body; presumably consumed by the
//                get_end_point_* / get_color_index accessors - confirm at their definitions.
void block_package7( out uint4 block, uint partition, uint threadBase )
{
    // Tag, partition, then the upper five bits (mask 0xF8) of the red channel of
    // end points low/high 0 and low/high 1. The last field straddles x and y.
    block.x = 0x80 | ( partition << 8 )
        | ( ( get_end_point_l(0).r & 0xF8 ) << 11 ) | ( ( get_end_point_h(0).r & 0xF8 ) << 16 )
        | ( ( get_end_point_l(1).r & 0xF8 ) << 21 ) | ( ( get_end_point_h(1).r & 0xF8 ) << 26 );

    // Rest of high-1 red, the green channel, and the first two blue end points.
    block.y = ( ( get_end_point_h(1).r & 0xF8 ) >>  6 ) | ( ( get_end_point_l(0).g & 0xF8 ) >>  1 )
        | ( ( get_end_point_h(0).g & 0xF8 ) <<  4 ) | ( ( get_end_point_l(1).g & 0xF8 ) <<  9 )
        | ( ( get_end_point_h(1).g & 0xF8 ) << 14 ) | ( ( get_end_point_l(0).b & 0xF8 ) << 19 )
        | ( ( get_end_point_h(0).b & 0xF8 ) << 24 );

    // Remaining blue end points, the alpha channel, then bit 2 (mask 0x04) of the red
    // channel of end points low 0 and high 0 (NOTE(review): presumably the p-bits).
    block.z = ( ( get_end_point_l(1).b & 0xF8 ) >>  3 ) | ( ( get_end_point_h(1).b & 0xF8 ) <<  2 )
        | ( ( get_end_point_l(0).a & 0xF8 ) <<  7 ) | ( ( get_end_point_h(0).a & 0xF8 ) << 12 )
        | ( ( get_end_point_l(1).a & 0xF8 ) << 17 ) | ( ( get_end_point_h(1).a & 0xF8 ) << 22 )
        | ( ( get_end_point_l(0).r & 0x04 ) << 28 ) | ( ( get_end_point_h(0).r & 0x04 ) << 29 );

    // Bit 2 of red for end points low 1 and high 1, then color index 0 at bit 2.
    block.w = ( ( get_end_point_l(1).r & 0x04 ) >>  2 ) | ( ( get_end_point_h(1).r & 0x04 ) >>  1 )
        | ( get_color_index(0) << 2 );

    // Indices 1 .. fix-up index start at bit 2*i+1; the ones after it start at bit 2*i.
    // NOTE(review): this looks like index 0 and the fix-up index each taking one bit
    // less than the others - confirm against the BC7 format description.
    uint i = 1;
    for ( ; i <= candidateFixUpIndex1DOrdered[partition][0]; i++ )
    {
        block.w |= get_color_index(i) << ( i * 2 + 1 );
    }
    for ( ; i < 16; i++ )
    {
        block.w |= get_color_index(i) << ( i * 2 );
    }
}