diff --git a/DirectXTex/DirectXTex.h b/DirectXTex/DirectXTex.h
index 9d8c349..8e0169d 100644
--- a/DirectXTex/DirectXTex.h
+++ b/DirectXTex/DirectXTex.h
@@ -40,6 +40,7 @@
 #define _Out_writes_bytes_to_opt_(a,b)
 #define _Inout_updates_bytes_(exp)
 #define _Inout_updates_all_(exp)
+#define _Inout_updates_all_opt_(exp)
 #define _Outptr_
 #define _When_(a,b)
 #endif
diff --git a/DirectXTex/DirectXTexConvert.cpp b/DirectXTex/DirectXTexConvert.cpp
index 008d6d7..ff71dd6 100644
--- a/DirectXTex/DirectXTexConvert.cpp
+++ b/DirectXTex/DirectXTexConvert.cpp
@@ -2176,7 +2176,7 @@ static const ConvertData g_ConvertTable[] =
 {
     { DXGI_FORMAT_B5G5R5A1_UNORM,             5, CONVF_UNORM | CONVF_R | CONVF_G | CONVF_B | CONVF_A },
     { DXGI_FORMAT_B8G8R8A8_UNORM,             8, CONVF_UNORM | CONVF_BGR | CONVF_R | CONVF_G | CONVF_B | CONVF_A },
     { DXGI_FORMAT_B8G8R8X8_UNORM,             8, CONVF_UNORM | CONVF_BGR | CONVF_R | CONVF_G | CONVF_B },
-    { DXGI_FORMAT_R10G10B10_XR_BIAS_A2_UNORM, 10, CONVF_UNORM | CONVF_X2 | CONVF_R | CONVF_G | CONVF_B | CONVF_A },
+    { DXGI_FORMAT_R10G10B10_XR_BIAS_A2_UNORM, 10, CONVF_UNORM | CONVF_XR | CONVF_R | CONVF_G | CONVF_B | CONVF_A },
     { DXGI_FORMAT_B8G8R8A8_UNORM_SRGB,        8, CONVF_UNORM | CONVF_BGR | CONVF_R | CONVF_G | CONVF_B | CONVF_A },
     { DXGI_FORMAT_B8G8R8X8_UNORM_SRGB,        8, CONVF_UNORM | CONVF_BGR | CONVF_R | CONVF_G | CONVF_B },
     { DXGI_FORMAT_BC6H_UF16,                 16, CONVF_FLOAT | CONVF_BC | CONVF_R | CONVF_G | CONVF_B | CONVF_A },
@@ -2522,6 +2522,590 @@ void _ConvertScanline( XMVECTOR* pBuffer, size_t count, DXGI_FORMAT outFormat, D
 }
 
 
+//-------------------------------------------------------------------------------------
+// Dithering
+//-------------------------------------------------------------------------------------
+
+// 4X4X4 ordered dithering matrix
+static const float g_Dither[] =
+{
+    // (z & 3) + ( (y & 3) * 8) + (x & 3)
+     0.468750f, -0.031250f,  0.343750f, -0.156250f,  0.468750f, -0.031250f,  0.343750f, -0.156250f,
+    -0.281250f,  0.218750f, -0.406250f,  0.093750f, -0.281250f,  0.218750f, -0.406250f,  0.093750f,
+     0.281250f, -0.218750f,  0.406250f, -0.093750f,  0.281250f, -0.218750f,  0.406250f, -0.093750f,
+    -0.468750f,  0.031250f, -0.343750f,  0.156250f, -0.468750f,  0.031250f, -0.343750f,  0.156250f,
+};
+
+static const XMVECTORF32 g_Scale16pc   = { 65535.f, 65535.f, 65535.f, 65535.f };
+static const XMVECTORF32 g_Scale15pc   = { 32767.f, 32767.f, 32767.f, 32767.f };
+static const XMVECTORF32 g_Scale10pc   = { 1023.f,  1023.f,  1023.f,  3.f };
+static const XMVECTORF32 g_Scale8pc    = { 255.f,   255.f,   255.f,   255.f };
+static const XMVECTORF32 g_Scale7pc    = { 127.f,   127.f,   127.f,   127.f };
+static const XMVECTORF32 g_Scale565pc  = { 31.f,    63.f,    31.f,    1.f };
+static const XMVECTORF32 g_Scale5551pc = { 31.f,    31.f,    31.f,    1.f };
+static const XMVECTORF32 g_Scale4pc    = { 15.f,    15.f,    15.f,    15.f };
+
+static const XMVECTORF32 g_ErrorWeight3 = { 3.f/16.f, 3.f/16.f, 3.f/16.f, 3.f/16.f };
+static const XMVECTORF32 g_ErrorWeight5 = { 5.f/16.f, 5.f/16.f, 5.f/16.f, 5.f/16.f };
+static const XMVECTORF32 g_ErrorWeight1 = { 1.f/16.f, 1.f/16.f, 1.f/16.f, 1.f/16.f };
+static const XMVECTORF32 g_ErrorWeight7 = { 7.f/16.f, 7.f/16.f, 7.f/16.f, 7.f/16.f };
+
+#define STORE_SCANLINE( type, scalev, clampzero, norm, itype, mask, row, bgr ) \
+        if ( size >= sizeof(type) ) \
+        { \
+            type * __restrict dest = reinterpret_cast<type*>(pDestination); \
+            for( size_t i = 0; i < count; ++i ) \
+            { \
+                ptrdiff_t index = static_cast<ptrdiff_t>( ( row & 1 ) ? ( count - i - 1 ) : i ); \
+                ptrdiff_t delta = ( row & 1 ) ? -2 : 0; \
+ \
+                XMVECTOR v = sPtr[ index ]; \
+                if ( bgr ) { v = XMVectorSwizzle<2, 1, 0, 3>( v ); } \
+                if ( norm && clampzero ) v = XMVectorSaturate( v ); \
+                else if ( clampzero ) v = XMVectorClamp( v, g_XMZero, scalev ); \
+                else if ( norm ) v = XMVectorClamp( v, g_XMNegativeOne, g_XMOne ); \
+                else v = XMVectorClamp( v, -scalev + g_XMOne, scalev ); \
+                v = XMVectorAdd( v, vError ); \
+                if ( norm ) v = XMVectorMultiply( v, scalev ); \
+ \
+                XMVECTOR target; \
+                if ( pDiffusionErrors ) \
+                { \
+                    target = XMVectorRound( v ); \
+                    vError = XMVectorSubtract( v, target ); \
+                    if (norm) vError = XMVectorDivide( vError, scalev ); \
+ \
+                    /* Distribute error to next scanline and next pixel */ \
+                    pDiffusionErrors[ index-delta ]   += XMVectorMultiply( g_ErrorWeight3, vError ); \
+                    pDiffusionErrors[ index+1 ]       += XMVectorMultiply( g_ErrorWeight5, vError ); \
+                    pDiffusionErrors[ index+2+delta ] += XMVectorMultiply( g_ErrorWeight1, vError ); \
+                    vError = XMVectorMultiply( vError, g_ErrorWeight7 ); \
+                } \
+                else \
+                { \
+                    /* Apply ordered dither */ \
+                    target = XMVectorAdd( v, ordered[ index & 3 ] ); \
+                    target = XMVectorRound( target ); \
+                } \
+ \
+                target = XMVectorMin( scalev, target ); \
+                target = XMVectorMax( (clampzero) ? g_XMZero : ( -scalev + g_XMOne ), target ); \
+ \
+                XMFLOAT4A tmp; \
+                XMStoreFloat4A( &tmp, target ); \
+ \
+                auto dPtr = &dest[ index ]; \
+                dPtr->x = static_cast<itype>( tmp.x ) & mask; \
+                dPtr->y = static_cast<itype>( tmp.y ) & mask; \
+                dPtr->z = static_cast<itype>( tmp.z ) & mask; \
+                dPtr->w = static_cast<itype>( tmp.w ) & mask; \
+            } \
+            return true; \
+        } \
+        return false;
+
+#define STORE_SCANLINE2( type, scalev, clampzero, norm, itype, mask, row ) \
+        if ( size >= sizeof(type) ) \
+        { \
+            type * __restrict dest = reinterpret_cast<type*>(pDestination); \
+            for( size_t i = 0; i < count; ++i ) \
+            { \
+                ptrdiff_t index = static_cast<ptrdiff_t>( ( row & 1 ) ? ( count - i - 1 ) : i ); \
+                ptrdiff_t delta = ( row & 1 ) ? -2 : 0; \
+ \
+                XMVECTOR v = sPtr[ index ]; \
+                if ( norm && clampzero ) v = XMVectorSaturate( v ); \
+                else if ( clampzero ) v = XMVectorClamp( v, g_XMZero, scalev ); \
+                else if ( norm ) v = XMVectorClamp( v, g_XMNegativeOne, g_XMOne ); \
+                else v = XMVectorClamp( v, -scalev + g_XMOne, scalev ); \
+                v = XMVectorAdd( v, vError ); \
+                if ( norm ) v = XMVectorMultiply( v, scalev ); \
+ \
+                XMVECTOR target; \
+                if ( pDiffusionErrors ) \
+                { \
+                    target = XMVectorRound( v ); \
+                    vError = XMVectorSubtract( v, target ); \
+                    if (norm) vError = XMVectorDivide( vError, scalev ); \
+ \
+                    /* Distribute error to next scanline and next pixel */ \
+                    pDiffusionErrors[ index-delta ]   += XMVectorMultiply( g_ErrorWeight3, vError ); \
+                    pDiffusionErrors[ index+1 ]       += XMVectorMultiply( g_ErrorWeight5, vError ); \
+                    pDiffusionErrors[ index+2+delta ] += XMVectorMultiply( g_ErrorWeight1, vError ); \
+                    vError = XMVectorMultiply( vError, g_ErrorWeight7 ); \
+                } \
+                else \
+                { \
+                    /* Apply ordered dither */ \
+                    target = XMVectorAdd( v, ordered[ index & 3 ] ); \
+                    target = XMVectorRound( target ); \
+                } \
+ \
+                target = XMVectorMin( scalev, target ); \
+                target = XMVectorMax( (clampzero) ? g_XMZero : ( -scalev + g_XMOne ), target ); \
+ \
+                XMFLOAT4A tmp; \
+                XMStoreFloat4A( &tmp, target ); \
+ \
+                auto dPtr = &dest[ index ]; \
+                dPtr->x = static_cast<itype>( tmp.x ) & mask; \
+                dPtr->y = static_cast<itype>( tmp.y ) & mask; \
+            } \
+            return true; \
+        } \
+        return false;
+
+#define STORE_SCANLINE1( type, scalev, clampzero, norm, mask, row, selectw ) \
+        if ( size >= sizeof(type) ) \
+        { \
+            type * __restrict dest = reinterpret_cast<type*>(pDestination); \
+            for( size_t i = 0; i < count; ++i ) \
+            { \
+                ptrdiff_t index = static_cast<ptrdiff_t>( ( row & 1 ) ? ( count - i - 1 ) : i ); \
+                ptrdiff_t delta = ( row & 1 ) ? -2 : 0; \
+ \
+                XMVECTOR v = sPtr[ index ]; \
+                if ( norm && clampzero ) v = XMVectorSaturate( v ); \
+                else if ( clampzero ) v = XMVectorClamp( v, g_XMZero, scalev ); \
+                else if ( norm ) v = XMVectorClamp( v, g_XMNegativeOne, g_XMOne ); \
+                else v = XMVectorClamp( v, -scalev + g_XMOne, scalev ); \
+                v = XMVectorAdd( v, vError ); \
+                if ( norm ) v = XMVectorMultiply( v, scalev ); \
+ \
+                XMVECTOR target; \
+                if ( pDiffusionErrors ) \
+                { \
+                    target = XMVectorRound( v ); \
+                    vError = XMVectorSubtract( v, target ); \
+                    if (norm) vError = XMVectorDivide( vError, scalev ); \
+ \
+                    /* Distribute error to next scanline and next pixel */ \
+                    pDiffusionErrors[ index-delta ]   += XMVectorMultiply( g_ErrorWeight3, vError ); \
+                    pDiffusionErrors[ index+1 ]       += XMVectorMultiply( g_ErrorWeight5, vError ); \
+                    pDiffusionErrors[ index+2+delta ] += XMVectorMultiply( g_ErrorWeight1, vError ); \
+                    vError = XMVectorMultiply( vError, g_ErrorWeight7 ); \
+                } \
+                else \
+                { \
+                    /* Apply ordered dither */ \
+                    target = XMVectorAdd( v, ordered[ index & 3 ] ); \
+                    target = XMVectorRound( target ); \
+                } \
+ \
+                target = XMVectorMin( scalev, target ); \
+                target = XMVectorMax( (clampzero) ? g_XMZero : ( -scalev + g_XMOne ), target ); \
+ \
+                dest[ index ] = static_cast<type>( (selectw) ? XMVectorGetW( target ) : XMVectorGetX( target ) ) & mask; \
+            } \
+            return true; \
+        } \
+        return false;
+
+#pragma warning(push)
+#pragma warning( disable : 4127 )
+
+_Use_decl_annotations_
+bool _StoreScanlineDither( LPVOID pDestination, size_t size, DXGI_FORMAT format,
+                           XMVECTOR* pSource, size_t count, float threshold, size_t y, size_t z, XMVECTOR* pDiffusionErrors )
+{
+    assert( pDestination && size > 0 );
+    assert( pSource && count > 0 && (((uintptr_t)pSource & 0xF) == 0) );
+    assert( IsValid(format) && !IsVideo(format) && !IsTypeless(format) && !IsCompressed(format) );
+
+    XMVECTOR ordered[4];
+    if ( pDiffusionErrors )
+    {
+        // If pDiffusionErrors != 0, then this function performs error diffusion dithering (aka Floyd-Steinberg dithering)
+
+        // To avoid the need for another temporary scanline buffer, we allow this function to overwrite the source buffer in-place.
+        // Given the intended usage in the conversion routines, this is not a problem.
+
+        XMVECTOR* ptr = pSource;
+        const XMVECTOR* err = pDiffusionErrors + 1;
+        for( size_t i=0; i < count; ++i )
+        {
+            // Add contribution from previous scanline
+            XMVECTOR v = XMVectorAdd( *ptr, *err++ );
+            *ptr++ = v;
+        }
+
+        // Reset errors for next scanline
+        memset( pDiffusionErrors, 0, sizeof(XMVECTOR)*(count+2) );
+    }
+    else
+    {
+        // If pDiffusionErrors == 0, then this function performs ordered dithering
+
+        XMVECTOR dither = XMLoadFloat4( reinterpret_cast<const XMFLOAT4*>( g_Dither + (z & 3) + ( (y & 3) * 8 ) ) );
+
+        ordered[0] = XMVectorSplatX( dither );
+        ordered[1] = XMVectorSplatY( dither );
+        ordered[2] = XMVectorSplatZ( dither );
+        ordered[3] = XMVectorSplatW( dither );
+    }
+
+    const XMVECTOR* __restrict sPtr = pSource;
+    if ( !sPtr )
+        return false;
+
+    XMVECTOR vError = XMVectorZero();
+
+    switch( format )
+    {
+    case DXGI_FORMAT_R16G16B16A16_UNORM:
+        STORE_SCANLINE( XMUSHORTN4, g_Scale16pc, true, true, uint16_t, 0xFFFF, y, false )
+
+    case DXGI_FORMAT_R16G16B16A16_UINT:
+        STORE_SCANLINE( XMUSHORT4, g_Scale16pc, true, false, uint16_t, 0xFFFF, y, false )
+
+    case DXGI_FORMAT_R16G16B16A16_SNORM:
+        STORE_SCANLINE( XMSHORTN4, g_Scale15pc, false, true, int16_t, 0xFFFF, y, false )
+
+    case DXGI_FORMAT_R16G16B16A16_SINT:
+        STORE_SCANLINE( XMSHORT4, g_Scale15pc, false, false, int16_t, 0xFFFF, y, false )
+
+    case DXGI_FORMAT_R10G10B10A2_UNORM:
+        STORE_SCANLINE( XMUDECN4, g_Scale10pc, true, true, uint16_t, 0x3FF, y, false )
+
+    case DXGI_FORMAT_R10G10B10A2_UINT:
+        STORE_SCANLINE( XMUDEC4, g_Scale10pc, true, false, uint16_t, 0x3FF, y, false )
+
+    case DXGI_FORMAT_R10G10B10_XR_BIAS_A2_UNORM:
+        if ( size >= sizeof(XMUDEC4) )
+        {
+            static const XMVECTORF32 Scale = { 510.0f, 510.0f, 510.0f, 3.0f };
+            static const XMVECTORF32 Bias  = { 384.0f, 384.0f, 384.0f, 0.0f };
+            static const XMVECTORF32 MinXR = { -0.7529f, -0.7529f, -0.7529f, 0.f };
+            static const XMVECTORF32 MaxXR = { 1.2529f, 1.2529f, 1.2529f, 1.0f };
+
+            XMUDEC4 * __restrict dest = reinterpret_cast<XMUDEC4*>(pDestination);
+            for( size_t i = 0; i < count; ++i )
+            {
+                ptrdiff_t index = static_cast<ptrdiff_t>( ( y & 1 ) ? ( count - i - 1 ) : i );
+                ptrdiff_t delta = ( y & 1 ) ? -2 : 0;
+
+                XMVECTOR v = XMVectorClamp( sPtr[ index ], MinXR, MaxXR );
+                v = XMVectorMultiplyAdd( v, Scale, vError );
+
+                XMVECTOR target;
+                if ( pDiffusionErrors )
+                {
+                    target = XMVectorRound( v );
+                    vError = XMVectorSubtract( v, target );
+                    vError = XMVectorDivide( vError, Scale );
+
+                    // Distribute error to next scanline and next pixel
+                    pDiffusionErrors[ index-delta ]   += XMVectorMultiply( g_ErrorWeight3, vError );
+                    pDiffusionErrors[ index+1 ]       += XMVectorMultiply( g_ErrorWeight5, vError );
+                    pDiffusionErrors[ index+2+delta ] += XMVectorMultiply( g_ErrorWeight1, vError );
+                    vError = XMVectorMultiply( vError, g_ErrorWeight7 );
+                }
+                else
+                {
+                    // Apply ordered dither
+                    target = XMVectorAdd( v, ordered[ index & 3 ] );
+                    target = XMVectorRound( target );
+                }
+
+                target = XMVectorAdd( target, Bias );
+                target = XMVectorClamp( target, g_XMZero, g_Scale10pc );
+
+                XMFLOAT4A tmp;
+                XMStoreFloat4A( &tmp, target );
+
+                auto dPtr = &dest[ index ];
+                dPtr->x = static_cast<uint16_t>( tmp.x ) & 0x3FF;
+                dPtr->y = static_cast<uint16_t>( tmp.y ) & 0x3FF;
+                dPtr->z = static_cast<uint16_t>( tmp.z ) & 0x3FF;
+                dPtr->w = static_cast<uint16_t>( tmp.w );
+            }
+            return true;
+        }
+        return false;
+
+    case DXGI_FORMAT_R8G8B8A8_UNORM:
+    case DXGI_FORMAT_R8G8B8A8_UNORM_SRGB:
+        STORE_SCANLINE( XMUBYTEN4, g_Scale8pc, true, true, uint8_t, 0xFF, y, false )
+
+    case DXGI_FORMAT_R8G8B8A8_UINT:
+        STORE_SCANLINE( XMUBYTE4, g_Scale8pc, true, false, uint8_t, 0xFF, y, false )
+
+    case DXGI_FORMAT_R8G8B8A8_SNORM:
+        STORE_SCANLINE( XMBYTEN4, g_Scale7pc, false, true, int8_t, 0xFF, y, false )
+
+    case DXGI_FORMAT_R8G8B8A8_SINT:
+        STORE_SCANLINE( XMBYTE4, g_Scale7pc, false, false, int8_t, 0xFF, y, false )
+
+    case DXGI_FORMAT_R16G16_UNORM:
+        STORE_SCANLINE2( XMUSHORTN2, g_Scale16pc, true, true, uint16_t, 0xFFFF, y )
+
+    case DXGI_FORMAT_R16G16_UINT:
+        STORE_SCANLINE2( XMUSHORT2, g_Scale16pc, true, false, uint16_t, 0xFFFF, y )
+
+    case DXGI_FORMAT_R16G16_SNORM:
+        STORE_SCANLINE2( XMSHORTN2, g_Scale15pc, false, true, int16_t, 0xFFFF, y )
+
+    case DXGI_FORMAT_R16G16_SINT:
+        STORE_SCANLINE2( XMSHORT2, g_Scale15pc, false, false, int16_t, 0xFFFF, y )
+
+    case DXGI_FORMAT_D24_UNORM_S8_UINT:
+        if ( size >= sizeof(uint32_t) )
+        {
+            static const XMVECTORF32 Clamp  = { 1.f,        255.f, 0.f, 0.f };
+            static const XMVECTORF32 Scale  = { 16777215.f, 1.f,   0.f, 0.f };
+            static const XMVECTORF32 Scale2 = { 16777215.f, 255.f, 0.f, 0.f };
+
+            uint32_t * __restrict dest = reinterpret_cast<uint32_t*>(pDestination);
+            for( size_t i = 0; i < count; ++i )
+            {
+                ptrdiff_t index = static_cast<ptrdiff_t>( ( y & 1 ) ? ( count - i - 1 ) : i );
+                ptrdiff_t delta = ( y & 1 ) ? -2 : 0;
+
+                XMVECTOR v = XMVectorClamp( sPtr[ index ], g_XMZero, Clamp );
+                v = XMVectorAdd( v, vError );
+                v = XMVectorMultiply( v, Scale );
+
+                XMVECTOR target;
+                if ( pDiffusionErrors )
+                {
+                    target = XMVectorRound( v );
+                    vError = XMVectorSubtract( v, target );
+                    vError = XMVectorDivide( vError, Scale );
+
+                    // Distribute error to next scanline and next pixel
+                    pDiffusionErrors[ index-delta ]   += XMVectorMultiply( g_ErrorWeight3, vError );
+                    pDiffusionErrors[ index+1 ]       += XMVectorMultiply( g_ErrorWeight5, vError );
+                    pDiffusionErrors[ index+2+delta ] += XMVectorMultiply( g_ErrorWeight1, vError );
+                    vError = XMVectorMultiply( vError, g_ErrorWeight7 );
+                }
+                else
+                {
+                    // Apply ordered dither
+                    target = XMVectorAdd( v, ordered[ index & 3 ] );
+                    target = XMVectorRound( target );
+                }
+
+                target = XMVectorClamp( target, g_XMZero, Scale2 );
+
+                XMFLOAT4A tmp;
+                XMStoreFloat4A( &tmp, target );
+
+                auto dPtr = &dest[ index ];
+                *dPtr = (static_cast<uint32_t>( tmp.x ) & 0xFFFFFF)
+                        | ((static_cast<uint32_t>( tmp.y ) & 0xFF) << 24);
+            }
+            return true;
+        }
+        return false;
+
+    case DXGI_FORMAT_R8G8_UNORM:
+        STORE_SCANLINE2( XMUBYTEN2, g_Scale8pc, true, true, uint8_t, 0xFF, y )
+
+    case DXGI_FORMAT_R8G8_UINT:
+        STORE_SCANLINE2( XMUBYTE2, g_Scale8pc, true, false, uint8_t, 0xFF, y )
+
+    case DXGI_FORMAT_R8G8_SNORM:
+        STORE_SCANLINE2( XMBYTEN2, g_Scale7pc, false, true, int8_t, 0xFF, y )
+
+    case DXGI_FORMAT_R8G8_SINT:
+        STORE_SCANLINE2( XMBYTE2, g_Scale7pc, false, false, int8_t, 0xFF, y )
+
+    case DXGI_FORMAT_D16_UNORM:
+    case DXGI_FORMAT_R16_UNORM:
+        STORE_SCANLINE1( uint16_t, g_Scale16pc, true, true, 0xFFFF, y, false )
+
+    case DXGI_FORMAT_R16_UINT:
+        STORE_SCANLINE1( uint16_t, g_Scale16pc, true, false, 0xFFFF, y, false )
+
+    case DXGI_FORMAT_R16_SNORM:
+        STORE_SCANLINE1( int16_t, g_Scale15pc, false, true, 0xFFFF, y, false )
+
+    case DXGI_FORMAT_R16_SINT:
+        STORE_SCANLINE1( int16_t, g_Scale15pc, false, false, 0xFFFF, y, false )
+
+    case DXGI_FORMAT_R8_UNORM:
+        STORE_SCANLINE1( uint8_t, g_Scale8pc, true, true, 0xFF, y, false )
+
+    case DXGI_FORMAT_R8_UINT:
+        STORE_SCANLINE1( uint8_t, g_Scale8pc, true, false, 0xFF, y, false )
+
+    case DXGI_FORMAT_R8_SNORM:
+        STORE_SCANLINE1( int8_t, g_Scale7pc, false, true, 0xFF, y, false )
+
+    case DXGI_FORMAT_R8_SINT:
+        STORE_SCANLINE1( int8_t, g_Scale7pc, false, false, 0xFF, y, false )
+
+    case DXGI_FORMAT_A8_UNORM:
+        STORE_SCANLINE1( uint8_t, g_Scale8pc, true, true, 0xFF, y, true )
+
+    case DXGI_FORMAT_B5G6R5_UNORM:
+        if ( size >= sizeof(XMU565) )
+        {
+            XMU565 * __restrict dest = reinterpret_cast<XMU565*>(pDestination);
+            for( size_t i = 0; i < count; ++i )
+            {
+                ptrdiff_t index = static_cast<ptrdiff_t>( ( y & 1 ) ? ( count - i - 1 ) : i );
+                ptrdiff_t delta = ( y & 1 ) ? -2 : 0;
+
+                XMVECTOR v = XMVectorSwizzle<2, 1, 0, 3>( sPtr[ index ] );
+                v = XMVectorSaturate( v );
+                v = XMVectorAdd( v, vError );
+                v = XMVectorMultiply( v, g_Scale565pc );
+
+                XMVECTOR target;
+                if ( pDiffusionErrors )
+                {
+                    target = XMVectorRound( v );
+                    vError = XMVectorSubtract( v, target );
+                    vError = XMVectorDivide( vError, g_Scale565pc );
+
+                    // Distribute error to next scanline and next pixel
+                    pDiffusionErrors[ index-delta ]   += XMVectorMultiply( g_ErrorWeight3, vError );
+                    pDiffusionErrors[ index+1 ]       += XMVectorMultiply( g_ErrorWeight5, vError );
+                    pDiffusionErrors[ index+2+delta ] += XMVectorMultiply( g_ErrorWeight1, vError );
+                    vError = XMVectorMultiply( vError, g_ErrorWeight7 );
+                }
+                else
+                {
+                    // Apply ordered dither
+                    target = XMVectorAdd( v, ordered[ index & 3 ] );
+                    target = XMVectorRound( target );
+                }
+
+                target = XMVectorClamp( target, g_XMZero, g_Scale565pc );
+
+                XMFLOAT4A tmp;
+                XMStoreFloat4A( &tmp, target );
+
+                auto dPtr = &dest[ index ];
+                dPtr->x = static_cast<uint16_t>( tmp.x ) & 0x1F;
+                dPtr->y = static_cast<uint16_t>( tmp.y ) & 0x3F;
+                dPtr->z = static_cast<uint16_t>( tmp.z ) & 0x1F;
+            }
+            return true;
+        }
+        return false;
+
+    case DXGI_FORMAT_B5G5R5A1_UNORM:
+        if ( size >= sizeof(XMU555) )
+        {
+            XMU555 * __restrict dest = reinterpret_cast<XMU555*>(pDestination);
+            for( size_t i = 0; i < count; ++i )
+            {
+                ptrdiff_t index = static_cast<ptrdiff_t>( ( y & 1 ) ? ( count - i - 1 ) : i );
+                ptrdiff_t delta = ( y & 1 ) ? -2 : 0;
+
+                XMVECTOR v = XMVectorSwizzle<2, 1, 0, 3>( sPtr[ index ] );
+                v = XMVectorSaturate( v );
+                v = XMVectorAdd( v, vError );
+                v = XMVectorMultiply( v, g_Scale5551pc );
+
+                XMVECTOR target;
+                if ( pDiffusionErrors )
+                {
+                    target = XMVectorRound( v );
+                    vError = XMVectorSubtract( v, target );
+                    vError = XMVectorDivide( vError, g_Scale5551pc );
+
+                    // Distribute error to next scanline and next pixel
+                    pDiffusionErrors[ index-delta ]   += XMVectorMultiply( g_ErrorWeight3, vError );
+                    pDiffusionErrors[ index+1 ]       += XMVectorMultiply( g_ErrorWeight5, vError );
+                    pDiffusionErrors[ index+2+delta ] += XMVectorMultiply( g_ErrorWeight1, vError );
+                    vError = XMVectorMultiply( vError, g_ErrorWeight7 );
+                }
+                else
+                {
+                    // Apply ordered dither
+                    target = XMVectorAdd( v, ordered[ index & 3 ] );
+                    target = XMVectorRound( target );
+                }
+
+                target = XMVectorClamp( target, g_XMZero, g_Scale5551pc );
+
+                XMFLOAT4A tmp;
+                XMStoreFloat4A( &tmp, target );
+
+                auto dPtr = &dest[ index ];
+                dPtr->x = static_cast<uint16_t>( tmp.x ) & 0x1F;
+                dPtr->y = static_cast<uint16_t>( tmp.y ) & 0x1F;
+                dPtr->z = static_cast<uint16_t>( tmp.z ) & 0x1F;
+                dPtr->w = ( XMVectorGetW( target ) > threshold ) ? 1 : 0;
+            }
+            return true;
+        }
+        return false;
+
+    case DXGI_FORMAT_B8G8R8A8_UNORM:
+    case DXGI_FORMAT_B8G8R8A8_UNORM_SRGB:
+        STORE_SCANLINE( XMUBYTEN4, g_Scale8pc, true, true, uint8_t, 0xFF, y, true )
+
+    case DXGI_FORMAT_B8G8R8X8_UNORM:
+    case DXGI_FORMAT_B8G8R8X8_UNORM_SRGB:
+        if ( size >= sizeof(XMUBYTEN4) )
+        {
+            XMUBYTEN4 * __restrict dest = reinterpret_cast<XMUBYTEN4*>(pDestination);
+            for( size_t i = 0; i < count; ++i )
+            {
+                ptrdiff_t index = static_cast<ptrdiff_t>( ( y & 1 ) ? ( count - i - 1 ) : i );
+                ptrdiff_t delta = ( y & 1 ) ? -2 : 0;
+
+                XMVECTOR v = XMVectorSwizzle<2, 1, 0, 3>( sPtr[ index ] );
+                v = XMVectorSaturate( v );
+                v = XMVectorAdd( v, vError );
+                v = XMVectorMultiply( v, g_Scale8pc );
+
+                XMVECTOR target;
+                if ( pDiffusionErrors )
+                {
+                    target = XMVectorRound( v );
+                    vError = XMVectorSubtract( v, target );
+                    vError = XMVectorDivide( vError, g_Scale8pc );
+
+                    // Distribute error to next scanline and next pixel
+                    pDiffusionErrors[ index-delta ]   += XMVectorMultiply( g_ErrorWeight3, vError );
+                    pDiffusionErrors[ index+1 ]       += XMVectorMultiply( g_ErrorWeight5, vError );
+                    pDiffusionErrors[ index+2+delta ] += XMVectorMultiply( g_ErrorWeight1, vError );
+                    vError = XMVectorMultiply( vError, g_ErrorWeight7 );
+                }
+                else
+                {
+                    // Apply ordered dither
+                    target = XMVectorAdd( v, ordered[ index & 3 ] );
+                    target = XMVectorRound( target );
+                }
+
+                target = XMVectorClamp( target, g_XMZero, g_Scale8pc );
+
+                XMFLOAT4A tmp;
+                XMStoreFloat4A( &tmp, target );
+
+                auto dPtr = &dest[ index ];
+                dPtr->x = static_cast<uint8_t>( tmp.x ) & 0xFF;
+                dPtr->y = static_cast<uint8_t>( tmp.y ) & 0xFF;
+                dPtr->z = static_cast<uint8_t>( tmp.z ) & 0xFF;
+                dPtr->w = 0;
+            }
+            return true;
+        }
+        return false;
+
+#ifdef DXGI_1_2_FORMATS
+    case DXGI_FORMAT_B4G4R4A4_UNORM:
+        STORE_SCANLINE( XMUNIBBLE4, g_Scale4pc, true, true, uint8_t, 0xF, y, true )
+#endif
+
+    default:
+        return _StoreScanline( pDestination, size, format, pSource, count, threshold );
+    }
+}
+
+#pragma warning(pop)
+
+#undef STORE_SCANLINE
+#undef STORE_SCANLINE2
+#undef STORE_SCANLINE1
+
+
 //-------------------------------------------------------------------------------------
 // Selection logic for using WIC vs. our own routines
 //-------------------------------------------------------------------------------------
@@ -2674,32 +3258,82 @@ static HRESULT _ConvertUsingWIC( _In_ const Image& srcImage, _In_ const WICPixel
 //-------------------------------------------------------------------------------------
 // Convert the source image (not using WIC)
 //-------------------------------------------------------------------------------------
-static HRESULT _Convert( _In_ const Image& srcImage, _In_ DWORD filter, _In_ const Image& destImage, _In_ float threshold )
+static HRESULT _Convert( _In_ const Image& srcImage, _In_ DWORD filter, _In_ const Image& destImage, _In_ float threshold, _In_ size_t z )
 {
     assert( srcImage.width == destImage.width );
     assert( srcImage.height == destImage.height );
 
-    ScopedAlignedArrayXMVECTOR scanline( reinterpret_cast<XMVECTOR*>( _aligned_malloc( (sizeof(XMVECTOR)*srcImage.width), 16 ) ) );
-    if ( !scanline )
-        return E_OUTOFMEMORY;
-
     const uint8_t *pSrc = srcImage.pixels;
     uint8_t *pDest = destImage.pixels;
     if ( !pSrc || !pDest )
         return E_POINTER;
 
-    for( size_t h = 0; h < srcImage.height; ++h )
+    size_t width = srcImage.width;
+
+    if ( filter & TEX_FILTER_DITHER_DIFFUSION )
     {
-        if ( !_LoadScanline( scanline.get(), srcImage.width, pSrc, srcImage.rowPitch, srcImage.format ) )
-            return E_FAIL;
+        // Error diffusion dithering (aka Floyd-Steinberg dithering)
+        ScopedAlignedArrayXMVECTOR scanline( reinterpret_cast<XMVECTOR*>( _aligned_malloc( (sizeof(XMVECTOR)*(width*2 + 2)), 16 ) ) );
+        if ( !scanline )
+            return E_OUTOFMEMORY;
 
-        _ConvertScanline( scanline.get(), srcImage.width, destImage.format, srcImage.format, filter );
+        XMVECTOR* pDiffusionErrors = scanline.get() + width;
+        memset( pDiffusionErrors, 0, sizeof(XMVECTOR)*(width+2) );
 
-        if ( !_StoreScanline( pDest, destImage.rowPitch, destImage.format, scanline.get(), srcImage.width, threshold ) )
-            return E_FAIL;
+        for( size_t h = 0; h < srcImage.height; ++h )
+        {
+            if ( !_LoadScanline( scanline.get(), width, pSrc, srcImage.rowPitch, srcImage.format ) )
+                return E_FAIL;
 
-        pSrc += srcImage.rowPitch;
-        pDest += destImage.rowPitch;
+            _ConvertScanline( scanline.get(), width, destImage.format, srcImage.format, filter );
+
+            if ( !_StoreScanlineDither( pDest, destImage.rowPitch, destImage.format, scanline.get(), width, threshold, h, z, pDiffusionErrors ) )
+                return E_FAIL;
+
+            pSrc += srcImage.rowPitch;
+            pDest += destImage.rowPitch;
+        }
+    }
+    else
+    {
+        ScopedAlignedArrayXMVECTOR scanline( reinterpret_cast<XMVECTOR*>( _aligned_malloc( (sizeof(XMVECTOR)*width), 16 ) ) );
+        if ( !scanline )
+            return E_OUTOFMEMORY;
+
+        if ( filter & TEX_FILTER_DITHER )
+        {
+            // Ordered dithering
+            for( size_t h = 0; h < srcImage.height; ++h )
+            {
+                if ( !_LoadScanline( scanline.get(), width, pSrc, srcImage.rowPitch, srcImage.format ) )
+                    return E_FAIL;
+
+                _ConvertScanline( scanline.get(), width, destImage.format, srcImage.format, filter );
+
+                if ( !_StoreScanlineDither( pDest, destImage.rowPitch, destImage.format, scanline.get(), width, threshold, h, z, nullptr ) )
+                    return E_FAIL;
+
+                pSrc += srcImage.rowPitch;
+                pDest += destImage.rowPitch;
+            }
+        }
+        else
+        {
+            // No dithering
+            for( size_t h = 0; h < srcImage.height; ++h )
+            {
+                if ( !_LoadScanline( scanline.get(), width, pSrc, srcImage.rowPitch, srcImage.format ) )
+                    return E_FAIL;
+
+                _ConvertScanline( scanline.get(), width, destImage.format, srcImage.format, filter );
+
+                if ( !_StoreScanline( pDest, destImage.rowPitch, destImage.format, scanline.get(), width, threshold ) )
+                    return E_FAIL;
+
+                pSrc += srcImage.rowPitch;
+                pDest += destImage.rowPitch;
+            }
+        }
     }
 
     return S_OK;
@@ -2750,7 +3384,7 @@ HRESULT Convert( const Image& srcImage, DXGI_FORMAT format, DWORD filter, float
     }
     else
     {
-        hr = _Convert( srcImage, filter, *rimage, threshold );
+        hr = _Convert( srcImage, filter, *rimage, threshold, 0 );
     }
 
     if ( FAILED(hr) )
@@ -2805,43 +3439,106 @@ HRESULT Convert( const Image* srcImages, size_t nimages, const TexMetadata& meta
     WICPixelFormatGUID pfGUID, targetGUID;
     bool usewic = _UseWICConversion( filter, metadata.format, format, pfGUID, targetGUID );
 
-    for( size_t index=0; index < nimages; ++index )
+    switch (metadata.dimension)
     {
-        const Image& src = srcImages[ index ];
-        if ( src.format != metadata.format )
+    case TEX_DIMENSION_TEXTURE1D:
+    case TEX_DIMENSION_TEXTURE2D:
+        for( size_t index=0; index < nimages; ++index )
         {
-            result.Release();
-            return E_FAIL;
-        }
+            const Image& src = srcImages[ index ];
+            if ( src.format != metadata.format )
+            {
+                result.Release();
+                return E_FAIL;
+            }
 
 #ifdef _AMD64_
-        if ( (src.width > 0xFFFFFFFF) || (src.height > 0xFFFFFFFF) )
-            return E_FAIL;
+            if ( (src.width > 0xFFFFFFFF) || (src.height > 0xFFFFFFFF) )
+                return E_FAIL;
 #endif
 
-        const Image& dst = dest[ index ];
-        assert( dst.format == format );
+            const Image& dst = dest[ index ];
+            assert( dst.format == format );
 
-        if ( src.width != dst.width || src.height != dst.height )
-        {
-            result.Release();
-            return E_FAIL;
-        }
+            if ( src.width != dst.width || src.height != dst.height )
+            {
+                result.Release();
+                return E_FAIL;
+            }
 
-        if ( usewic )
-        {
-            hr = _ConvertUsingWIC( src, pfGUID, targetGUID, filter, threshold, dst );
-        }
-        else
-        {
-            hr = _Convert( src, filter, dst, threshold );
-        }
+            if ( usewic )
+            {
+                hr = _ConvertUsingWIC( src, pfGUID, targetGUID, filter, threshold, dst );
+            }
+            else
+            {
+                hr = _Convert( src, filter, dst, threshold, 0 );
+            }
 
-        if ( FAILED(hr) )
-        {
-            result.Release();
-            return hr;
+            if ( FAILED(hr) )
+            {
+                result.Release();
+                return hr;
+            }
         }
+        break;
+
+    case TEX_DIMENSION_TEXTURE3D:
+        {
+            size_t index = 0;
+            size_t d = metadata.depth;
+            for( size_t level = 0; level < metadata.mipLevels; ++level )
+            {
+                for( size_t slice = 0; slice < d; ++slice, ++index )
+                {
+                    if ( index >= nimages )
+                        return E_FAIL;
+
+                    const Image& src = srcImages[ index ];
+                    if ( src.format != metadata.format )
+                    {
+                        result.Release();
+                        return E_FAIL;
+                    }
+
+#ifdef _AMD64_
+                    if ( (src.width > 0xFFFFFFFF) || (src.height > 0xFFFFFFFF) )
+                        return E_FAIL;
+#endif
+
+                    const Image& dst = dest[ index ];
+                    assert( dst.format == format );
+
+                    if ( src.width != dst.width || src.height != dst.height )
+                    {
+                        result.Release();
+                        return E_FAIL;
+                    }
+
+                    if ( usewic )
+                    {
+                        hr = _ConvertUsingWIC( src, pfGUID, targetGUID, filter, threshold, dst );
+                    }
+                    else
+                    {
+                        hr = _Convert( src, filter, dst, threshold, slice );
+                    }
+
+                    if ( FAILED(hr) )
+                    {
+                        result.Release();
+                        return hr;
+                    }
+                }
+
+                if ( d > 1 )
+                    d >>= 1;
+            }
+        }
+        break;
+
+    default:
+        return E_FAIL;
     }
 
     return S_OK;
diff --git a/DirectXTex/DirectXTexP.h b/DirectXTex/DirectXTexP.h
index a17db22..d855075 100644
--- a/DirectXTex/DirectXTexP.h
+++ b/DirectXTex/DirectXTexP.h
@@ -150,7 +150,7 @@ namespace DirectX
         CONVF_STENCIL = 0x40,
         CONVF_SHAREDEXP = 0x80,
         CONVF_BGR = 0x100,
-        CONVF_X2 = 0x200,
+        CONVF_XR = 0x200,
         CONVF_PACKED = 0x400,
         CONVF_BC = 0x800,
         CONVF_R = 0x10000,
@@ -197,6 +197,11 @@ namespace DirectX
     bool _StoreScanlineLinear( LPVOID pDestination, _In_ size_t size, _In_ DXGI_FORMAT format,
                                _Inout_updates_all_(count) XMVECTOR* pSource, _In_ size_t count, _In_ DWORD flags );
 
+    _Success_(return != false)
+    bool _StoreScanlineDither( LPVOID pDestination, _In_ size_t size, _In_ DXGI_FORMAT format,
+                               _Inout_updates_all_(count) XMVECTOR* pSource, _In_ size_t count, _In_ float threshold, size_t y, size_t z,
+                               _Inout_updates_all_opt_(count+2) XMVECTOR* pDiffusionErrors );
+
     HRESULT _ConvertToR32G32B32A32( _In_ const Image& srcImage, _Inout_ ScratchImage& image );
 
     HRESULT _ConvertFromR32G32B32A32( _In_ const Image& srcImage, _In_ const Image& destImage );
diff --git a/Texconv/texconv.cpp b/Texconv/texconv.cpp
index 270bef3..c7a391c 100644
--- a/Texconv/texconv.cpp
+++ b/Texconv/texconv.cpp
@@ -172,15 +172,22 @@ SValue g_pFormats[] =
 
 SValue g_pFilters[] =
 {
-    { L"POINT",         TEX_FILTER_POINT },
-    { L"LINEAR",        TEX_FILTER_LINEAR },
-    { L"CUBIC",         TEX_FILTER_CUBIC },
-    { L"FANT",          TEX_FILTER_FANT },
-    { L"POINT_DITHER",  TEX_FILTER_POINT  | TEX_FILTER_DITHER_DIFFUSION },
-    { L"LINEAR_DITHER", TEX_FILTER_LINEAR | TEX_FILTER_DITHER_DIFFUSION },
-    { L"CUBIC_DITHER",  TEX_FILTER_CUBIC  | TEX_FILTER_DITHER_DIFFUSION },
-    { L"FANT_DITHER",   TEX_FILTER_FANT   | TEX_FILTER_DITHER_DIFFUSION },
-    { nullptr,          TEX_FILTER_DEFAULT }
+    { L"POINT",                     TEX_FILTER_POINT },
+    { L"LINEAR",                    TEX_FILTER_LINEAR },
+    { L"CUBIC",                     TEX_FILTER_CUBIC },
+    { L"FANT",                      TEX_FILTER_FANT },
+    { L"BOX",                       TEX_FILTER_BOX },
+    { L"POINT_DITHER",              TEX_FILTER_POINT  | TEX_FILTER_DITHER },
+    { L"LINEAR_DITHER",             TEX_FILTER_LINEAR | TEX_FILTER_DITHER },
+    { L"CUBIC_DITHER",              TEX_FILTER_CUBIC  | TEX_FILTER_DITHER },
+    { L"FANT_DITHER",               TEX_FILTER_FANT   | TEX_FILTER_DITHER },
+    { L"BOX_DITHER",                TEX_FILTER_BOX    | TEX_FILTER_DITHER },
+    { L"POINT_DITHER_DIFFUSION",    TEX_FILTER_POINT  | TEX_FILTER_DITHER_DIFFUSION },
+    { L"LINEAR_DITHER_DIFFUSION",   TEX_FILTER_LINEAR | TEX_FILTER_DITHER_DIFFUSION },
+    { L"CUBIC_DITHER_DIFFUSION",    TEX_FILTER_CUBIC  | TEX_FILTER_DITHER_DIFFUSION },
+    { L"FANT_DITHER_DIFFUSION",     TEX_FILTER_FANT   | TEX_FILTER_DITHER_DIFFUSION },
L"BOX_DITHER_DIFFUSION", TEX_FILTER_BOX | TEX_FILTER_DITHER_DIFFUSION }, + { nullptr, TEX_FILTER_DEFAULT } }; #define CODEC_DDS 0xFFFF0001