Merge branch 'convauto_getencoding' of https://github.com/pavel-t/wxWidgets
Add wxConvAuto::GetEncoding(), other improvements and bug fixes. See https://github.com/wxWidgets/wxWidgets/pull/2072
This commit is contained in:
commit
e21f8ba1a3
@ -78,6 +78,8 @@ public:
|
||||
|
||||
virtual size_t GetMBNulLen() const wxOVERRIDE { return m_conv->GetMBNulLen(); }
|
||||
|
||||
virtual bool IsUTF8() const wxOVERRIDE { return m_conv && m_conv->IsUTF8(); }
|
||||
|
||||
virtual wxMBConv *Clone() const wxOVERRIDE { return new wxConvAuto(*this); }
|
||||
|
||||
// return the BOM type of this buffer
|
||||
@ -91,6 +93,14 @@ public:
|
||||
return m_bomType;
|
||||
}
|
||||
|
||||
wxFontEncoding GetEncoding() const;
|
||||
|
||||
// Return true if the fall-back encoding is used
|
||||
bool IsUsingFallbackEncoding() const
|
||||
{
|
||||
return m_ownsConv && m_bomType == wxBOM_None;
|
||||
}
|
||||
|
||||
private:
|
||||
// common part of all ctors
|
||||
void Init()
|
||||
|
16
include/wx/private/unicode.h
Normal file
16
include/wx/private/unicode.h
Normal file
@ -0,0 +1,16 @@
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
// Name: wx/private/unicode.h
|
||||
// Purpose: Unicode private declarations
|
||||
// Author: Pavel Tyunin
|
||||
// Created: 2020-10-06
|
||||
// Copyright: (c) 2020 Pavel Tyunin
|
||||
// Licence: wxWindows licence
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef _WX_PRIVATE_UNICODEH__
|
||||
#define _WX_PRIVATE_UNICODEH__
|
||||
|
||||
// this table gives the length of the UTF-8 encoding from its first character:
|
||||
extern const unsigned char tableUtf8Lengths[256];
|
||||
|
||||
#endif // _WX_PRIVATE_UNICODEH__
|
@ -94,15 +94,15 @@ struct WXDLLIMPEXP_BASE wxStringOperationsUtf8
|
||||
return (c <= 0x7F) || (c >= 0xC2 && c <= 0xF4);
|
||||
}
|
||||
|
||||
// table of offsets to skip forward when iterating over UTF-8 sequence
|
||||
static const unsigned char ms_utf8IterTable[256];
|
||||
// returns offset to skip forward when iterating over UTF-8 sequence
|
||||
static unsigned char GetUTF8IterOffset(unsigned char c);
|
||||
|
||||
|
||||
template<typename Iterator>
|
||||
static void IncIter(Iterator& i)
|
||||
{
|
||||
wxASSERT( IsValidUtf8LeadByte(*i) );
|
||||
i += ms_utf8IterTable[(unsigned char)*i];
|
||||
i += GetUTF8IterOffset(*i);
|
||||
}
|
||||
|
||||
template<typename Iterator>
|
||||
@ -178,7 +178,7 @@ struct WXDLLIMPEXP_BASE wxStringOperationsUtf8
|
||||
static size_t GetUtf8CharLength(char c)
|
||||
{
|
||||
wxASSERT( IsValidUtf8LeadByte(c) );
|
||||
return ms_utf8IterTable[(unsigned char)c];
|
||||
return GetUTF8IterOffset(c);
|
||||
}
|
||||
|
||||
// decodes single UTF-8 character from UTF-8 string
|
||||
|
@ -146,6 +146,22 @@ public:
|
||||
*/
|
||||
wxBOM GetBOM() const;
|
||||
|
||||
/**
|
||||
Return the detected encoding
|
||||
|
||||
Returns @c wxFONTENCODING_MAX if called before the first use.
|
||||
|
||||
@since 3.1.5
|
||||
*/
|
||||
wxBOM GetEncoding() const;
|
||||
|
||||
/**
|
||||
Check if the fall-back encoding is used.
|
||||
|
||||
@since 3.1.5
|
||||
*/
|
||||
bool IsUsingFallbackEncoding() const;
|
||||
|
||||
/**
|
||||
Return a pointer to the characters that make up this BOM.
|
||||
|
||||
|
@ -23,6 +23,7 @@
|
||||
#endif
|
||||
|
||||
#include "wx/convauto.h"
|
||||
#include "wx/private/unicode.h"
|
||||
|
||||
// we use latin1 by default as it seems the least bad choice: the files we need
|
||||
// to detect input of don't always come from the user system (they are often
|
||||
@ -266,6 +267,23 @@ bool wxConvAuto::InitFromInput(const char *src, size_t len)
|
||||
return true;
|
||||
}
|
||||
|
||||
// checks if the input can be the beginning of a valid UTF-8 sequence
|
||||
static bool wxCanBeUTF8SequencePrefix(const char *src, size_t len)
|
||||
{
|
||||
size_t i = 0;
|
||||
unsigned char l = tableUtf8Lengths[(unsigned char)src[i]];
|
||||
if ( !l )
|
||||
return false; // invalid leading byte
|
||||
while ( --l )
|
||||
{
|
||||
if ( ++i == len )
|
||||
return true; // truncated sequence
|
||||
if ( (src[i] & 0xC0) != 0x80 )
|
||||
return false; // invalid continuation byte
|
||||
}
|
||||
return false; // complete sequence
|
||||
}
|
||||
|
||||
size_t
|
||||
wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen,
|
||||
const char *src, size_t srcLen) const
|
||||
@ -307,25 +325,28 @@ wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen,
|
||||
|
||||
// try to convert using the auto-detected encoding
|
||||
size_t rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
|
||||
if ( rc == wxCONV_FAILED && m_bomType == wxBOM_None )
|
||||
if ( rc == wxCONV_FAILED && m_bomType == wxBOM_None && !m_ownsConv )
|
||||
{
|
||||
// we may need more bytes before we can decode the input, don't switch
|
||||
// to the fall-back conversion in this case as it would prevent us from
|
||||
// decoding UTF-8 input when fed it byte by byte, as done by
|
||||
// wxTextInputStream, for example
|
||||
if ( srcLen < m_conv->GetMaxCharLen() )
|
||||
// up to 2 extra bytes are needed for inputs that start with null bytes
|
||||
// that look like the start of UTF-32BE BOM, but can be in UTF-8 too
|
||||
size_t nNull = 0;
|
||||
if ( srcLen != wxNO_LEN && srcLen >= 2 && !src[0] )
|
||||
nNull = ( src[1]? 1 : 2 );
|
||||
if ( srcLen < nNull + m_conv->GetMaxCharLen() &&
|
||||
wxCanBeUTF8SequencePrefix(src + nNull, srcLen - nNull) )
|
||||
return wxCONV_FAILED;
|
||||
|
||||
// if the conversion failed but we didn't really detect anything and
|
||||
// simply tried UTF-8 by default, retry it using the fall-back
|
||||
if ( m_encDefault == wxFONTENCODING_DEFAULT )
|
||||
self->m_encDefault = GetFallbackEncoding();
|
||||
if ( m_encDefault != wxFONTENCODING_MAX )
|
||||
{
|
||||
if ( m_ownsConv )
|
||||
delete m_conv;
|
||||
|
||||
self->m_conv = new wxCSConv(m_encDefault == wxFONTENCODING_DEFAULT
|
||||
? GetFallbackEncoding()
|
||||
: m_encDefault);
|
||||
self->m_conv = new wxCSConv(m_encDefault);
|
||||
self->m_ownsConv = true;
|
||||
|
||||
rc = m_conv->ToWChar(dst, dstLen, src, srcLen);
|
||||
@ -351,3 +372,32 @@ wxConvAuto::FromWChar(char *dst, size_t dstLen,
|
||||
|
||||
return m_conv->FromWChar(dst, dstLen, src, srcLen);
|
||||
}
|
||||
|
||||
// Map the current state of the converter to a wxFontEncoding value: the
// detected BOM decides if there is one, otherwise the answer depends on
// whether a converter exists and whether this object owns it.
wxFontEncoding wxConvAuto::GetEncoding() const
{
    switch ( m_bomType )
    {
        case wxBOM_UTF8:
            return wxFONTENCODING_UTF8;

        case wxBOM_UTF16LE:
            return wxFONTENCODING_UTF16LE;

        case wxBOM_UTF16BE:
            return wxFONTENCODING_UTF16BE;

        case wxBOM_UTF32LE:
            return wxFONTENCODING_UTF32LE;

        case wxBOM_UTF32BE:
            return wxFONTENCODING_UTF32BE;

        case wxBOM_Unknown:
        case wxBOM_None:
            // no converter has been set up yet
            if ( !m_conv )
                return wxFONTENCODING_MAX;

            // a converter owned by this object uses the default encoding
            if ( m_ownsConv )
                return m_encDefault;

            // NOTE(review): presumably the only converter used without
            // owning it is the UTF-8 one -- confirm against InitFromInput()
            return wxFONTENCODING_UTF8;
    }

    // all the enum elements are handled above, so this can only be reached
    // if m_bomType holds an unexpected value
    wxFAIL_MSG( "unknown BOM type" );
    return wxFONTENCODING_MAX;
}
|
||||
|
@ -46,6 +46,7 @@
|
||||
|
||||
#include "wx/encconv.h"
|
||||
#include "wx/fontmap.h"
|
||||
#include "wx/private/unicode.h"
|
||||
|
||||
#ifdef __DARWIN__
|
||||
#include "wx/osx/core/private/strconv_cf.h"
|
||||
@ -921,7 +922,7 @@ const wxUint32 wxUnicodePUA = 0x100000;
|
||||
const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
|
||||
|
||||
// this table gives the length of the UTF-8 encoding from its first character:
|
||||
const unsigned char tableUtf8Lengths[256] = {
|
||||
extern const unsigned char tableUtf8Lengths[256] = {
|
||||
// single-byte sequences (ASCII):
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00..0F
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10..1F
|
||||
|
@ -23,6 +23,8 @@
|
||||
#include "wx/stringops.h"
|
||||
#endif
|
||||
|
||||
#include "wx/private/unicode.h"
|
||||
|
||||
// ===========================================================================
|
||||
// implementation
|
||||
// ===========================================================================
|
||||
@ -97,40 +99,13 @@ wxWxCharBuffer wxStringOperationsWchar::EncodeNChars(size_t n, const wxUniChar&
|
||||
// UTF-8 sequences lengths
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
const unsigned char wxStringOperationsUtf8::ms_utf8IterTable[256] = {
|
||||
// single-byte sequences (ASCII):
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00..0F
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10..1F
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 20..2F
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 30..3F
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40..4F
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 50..5F
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 60..6F
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70..7F
|
||||
|
||||
// these are invalid, we use step 1 to skip
|
||||
// over them (should never happen):
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 80..8F
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 90..9F
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // A0..AF
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // B0..BF
|
||||
1, 1, // C0,C1
|
||||
|
||||
// two-byte sequences:
|
||||
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C2..CF
|
||||
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0..DF
|
||||
|
||||
// three-byte sequences:
|
||||
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0..EF
|
||||
|
||||
// four-byte sequences:
|
||||
4, 4, 4, 4, 4, // F0..F4
|
||||
|
||||
// these are invalid again (5- or 6-byte
|
||||
// sequences and sequences for code points
|
||||
// above U+10FFFF, as restricted by RFC 3629):
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // F5..FF
|
||||
};
|
||||
unsigned char wxStringOperationsUtf8::GetUTF8IterOffset(unsigned char c)
|
||||
{
|
||||
unsigned char l = tableUtf8Lengths[c];
|
||||
if ( !l ) //skip over invalid characters
|
||||
l = 1;
|
||||
return l;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// UTF-8 operations
|
||||
@ -166,7 +141,7 @@ bool wxStringOperationsUtf8::IsValidUtf8String(const char *str, size_t len)
|
||||
{
|
||||
// if the string is not NULL-terminated, verify we have enough
|
||||
// bytes in it left for current character's encoding:
|
||||
if ( c + ms_utf8IterTable[*c] > end )
|
||||
if ( c + GetUTF8IterOffset(*c) > end )
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -364,7 +339,7 @@ wxCharBuffer wxStringOperationsUtf8::EncodeNChars(size_t n, const wxUniChar& ch)
|
||||
{
|
||||
Utf8CharBuffer once(EncodeChar(ch));
|
||||
// GetUTF8IterOffset() gives the length of ch's encoding from its first byte:
|
||||
size_t len = ms_utf8IterTable[(unsigned char)once.data[0]];
|
||||
size_t len = GetUTF8IterOffset(once.data[0]);
|
||||
|
||||
wxCharBuffer buf(n * len);
|
||||
char *ptr = buf.data();
|
||||
|
@ -97,10 +97,11 @@ wxChar wxTextInputStream::GetChar()
|
||||
m_validEnd = 0;
|
||||
}
|
||||
|
||||
// We may need to decode up to 4 characters if we have input starting with
|
||||
// 3 BOM-like bytes, but not actually containing a BOM, as decoding it will
|
||||
// only succeed when 4 bytes are read -- and will yield 4 wide characters.
|
||||
wxChar wbuf[4];
|
||||
// We may need to decode up to 6 characters if we have input starting with
|
||||
// 2 null bytes (like in UTF-32BE BOM), and then 3 bytes that look like
|
||||
// the start of UTF-8 sequence, as decoding it will only succeed when
|
||||
// 6 bytes are read -- and will yield 6 wide characters.
|
||||
wxChar wbuf[6];
|
||||
for(size_t inlen = 0; inlen < sizeof(m_lastBytes); inlen++)
|
||||
{
|
||||
if ( inlen >= m_validEnd )
|
||||
@ -134,12 +135,13 @@ wxChar wxTextInputStream::GetChar()
|
||||
// one extra byte, the only explanation is that we were using a
|
||||
// wxConvAuto conversion recognizing the initial BOM and that
|
||||
// it couldn't detect the presence or absence of BOM so far,
|
||||
// but now finally has enough data to see that there is none.
|
||||
// As we must have fallen back to Latin-1 in this case, return
|
||||
// just the first byte and keep the other ones for the next
|
||||
// time.
|
||||
m_validBegin = 1;
|
||||
return wbuf[0];
|
||||
// but now finally has enough data to see that there is none, or
|
||||
// it was trying to decode the data as UTF-8 sequence, but now
|
||||
// recognized that it's not valid UTF-8 and switched to fallback.
|
||||
// We don't know how long the first character is, nor whether it is decoded
|
||||
// as 1 or 2 wchar_t characters, so we need to start again with 1 byte.
|
||||
inlen = -1;
|
||||
break;
|
||||
|
||||
#if SIZEOF_WCHAR_T == 2
|
||||
case 2:
|
||||
|
@ -15,6 +15,7 @@
|
||||
#endif
|
||||
|
||||
#include "wx/ustring.h"
|
||||
#include "wx/private/unicode.h"
|
||||
|
||||
#ifndef WX_PRECOMP
|
||||
#include "wx/crt.h"
|
||||
@ -67,41 +68,6 @@ wxUString &wxUString::assignFromAscii( const char *str, size_type n )
|
||||
// UTF-8
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
// this table gives the length of the UTF-8 encoding from its first character:
|
||||
const unsigned char tableUtf8Lengths[256] = {
|
||||
// single-byte sequences (ASCII):
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00..0F
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10..1F
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 20..2F
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 30..3F
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40..4F
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 50..5F
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 60..6F
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70..7F
|
||||
|
||||
// these are invalid:
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 80..8F
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 90..9F
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A0..AF
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B0..BF
|
||||
0, 0, // C0,C1
|
||||
|
||||
// two-byte sequences:
|
||||
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C2..CF
|
||||
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0..DF
|
||||
|
||||
// three-byte sequences:
|
||||
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0..EF
|
||||
|
||||
// four-byte sequences:
|
||||
4, 4, 4, 4, 4, // F0..F4
|
||||
|
||||
// these are invalid again (5- or 6-byte
|
||||
// sequences and sequences for code points
|
||||
// above U+10FFFF, as restricted by RFC 3629):
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // F5..FF
|
||||
};
|
||||
|
||||
wxUString &wxUString::assignFromUTF8( const char *str )
|
||||
{
|
||||
if (!str)
|
||||
|
@ -34,7 +34,9 @@ public:
|
||||
|
||||
private:
|
||||
CPPUNIT_TEST_SUITE( ConvAutoTestCase );
|
||||
CPPUNIT_TEST( Init );
|
||||
CPPUNIT_TEST( Empty );
|
||||
CPPUNIT_TEST( Encode );
|
||||
CPPUNIT_TEST( Short );
|
||||
CPPUNIT_TEST( None );
|
||||
CPPUNIT_TEST( UTF32LE );
|
||||
@ -42,22 +44,53 @@ private:
|
||||
CPPUNIT_TEST( UTF16LE );
|
||||
CPPUNIT_TEST( UTF16BE );
|
||||
CPPUNIT_TEST( UTF8 );
|
||||
CPPUNIT_TEST( UTF8NoBom );
|
||||
CPPUNIT_TEST( Fallback );
|
||||
CPPUNIT_TEST( FallbackMultibyte );
|
||||
CPPUNIT_TEST( FallbackShort );
|
||||
CPPUNIT_TEST( StreamUTF8NoBOM );
|
||||
CPPUNIT_TEST( StreamUTF8 );
|
||||
CPPUNIT_TEST( StreamUTF16LE );
|
||||
CPPUNIT_TEST( StreamUTF16BE );
|
||||
CPPUNIT_TEST( StreamUTF32LE );
|
||||
CPPUNIT_TEST( StreamUTF32BE );
|
||||
CPPUNIT_TEST( StreamFallback );
|
||||
CPPUNIT_TEST( StreamFallbackMultibyte );
|
||||
CPPUNIT_TEST_SUITE_END();
|
||||
|
||||
// expected converter state, UTF-8 without BOM by default
|
||||
struct ConvState
|
||||
{
|
||||
ConvState( wxBOM bom = wxBOM_None,
|
||||
wxFontEncoding enc = wxFONTENCODING_UTF8,
|
||||
bool fallback = false )
|
||||
: m_bom(bom), m_enc(enc), m_fallback(fallback) {}
|
||||
|
||||
void Check(const wxConvAuto& conv) const
|
||||
{
|
||||
CPPUNIT_ASSERT( conv.GetBOM() == m_bom );
|
||||
CPPUNIT_ASSERT( conv.GetEncoding() == m_enc );
|
||||
CPPUNIT_ASSERT( conv.IsUsingFallbackEncoding() == m_fallback );
|
||||
CPPUNIT_ASSERT( conv.IsUTF8() == (m_enc == wxFONTENCODING_UTF8) );
|
||||
}
|
||||
|
||||
wxBOM m_bom;
|
||||
wxFontEncoding m_enc;
|
||||
bool m_fallback;
|
||||
};
|
||||
|
||||
// real test function: check that converting the src multibyte string to
|
||||
// wide char using wxConvAuto yields wch as the first result
|
||||
//
|
||||
// the length of the string may need to be passed explicitly if it has
|
||||
// embedded NULs, otherwise it's not necessary
|
||||
void TestFirstChar(const char *src, wchar_t wch, size_t len = wxNO_LEN);
|
||||
void TestFirstChar(const char *src, wchar_t wch, size_t len = wxNO_LEN,
|
||||
ConvState st = ConvState(),
|
||||
wxFontEncoding fe = wxFONTENCODING_DEFAULT);
|
||||
|
||||
void Init();
|
||||
void Empty();
|
||||
void Encode();
|
||||
void Short();
|
||||
void None();
|
||||
void UTF32LE();
|
||||
@ -65,12 +98,17 @@ private:
|
||||
void UTF16LE();
|
||||
void UTF16BE();
|
||||
void UTF8();
|
||||
void UTF8NoBom();
|
||||
void Fallback();
|
||||
void FallbackMultibyte();
|
||||
void FallbackShort();
|
||||
|
||||
// test whether two lines of text are converted properly from a stream
|
||||
void TestTextStream(const char *src,
|
||||
size_t srclength,
|
||||
const wxString& line1,
|
||||
const wxString& line2);
|
||||
const wxString& line2,
|
||||
wxFontEncoding fe = wxFONTENCODING_DEFAULT);
|
||||
|
||||
void StreamUTF8NoBOM();
|
||||
void StreamUTF8();
|
||||
@ -78,6 +116,8 @@ private:
|
||||
void StreamUTF16BE();
|
||||
void StreamUTF32LE();
|
||||
void StreamUTF32BE();
|
||||
void StreamFallback();
|
||||
void StreamFallbackMultibyte();
|
||||
};
|
||||
|
||||
// register in the unnamed registry so that these tests are run by default
|
||||
@ -90,16 +130,36 @@ CPPUNIT_TEST_SUITE_NAMED_REGISTRATION(ConvAutoTestCase, "ConvAutoTestCase");
|
||||
// tests
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
void ConvAutoTestCase::TestFirstChar(const char *src, wchar_t wch, size_t len)
|
||||
void ConvAutoTestCase::TestFirstChar(const char *src, wchar_t wch, size_t len,
|
||||
ConvState st, wxFontEncoding fe)
|
||||
{
|
||||
wxWCharBuffer wbuf = wxConvAuto().cMB2WC(src, len, NULL);
|
||||
wxConvAuto conv(fe);
|
||||
wxWCharBuffer wbuf = conv.cMB2WC(src, len, NULL);
|
||||
CPPUNIT_ASSERT( wbuf );
|
||||
CPPUNIT_ASSERT_EQUAL( wch, *wbuf );
|
||||
st.Check(conv);
|
||||
}
|
||||
|
||||
void ConvAutoTestCase::Init()
|
||||
{
|
||||
ConvState(wxBOM_Unknown, wxFONTENCODING_MAX).Check(wxConvAuto());
|
||||
}
|
||||
|
||||
void ConvAutoTestCase::Empty()
|
||||
{
|
||||
CPPUNIT_ASSERT( !wxConvAuto().cMB2WC("") );
|
||||
wxConvAuto conv;
|
||||
CPPUNIT_ASSERT( !conv.cMB2WC("") );
|
||||
ConvState(wxBOM_Unknown, wxFONTENCODING_MAX).Check(conv);
|
||||
}
|
||||
|
||||
void ConvAutoTestCase::Encode()
|
||||
{
|
||||
wxConvAuto conv;
|
||||
wxString str = wxString::FromUTF8("\xd0\x9f\xe3\x81\x82");
|
||||
wxCharBuffer buf = conv.cWC2MB(str.wc_str());
|
||||
CPPUNIT_ASSERT( buf );
|
||||
CPPUNIT_ASSERT_EQUAL( str, wxString::FromUTF8(buf) );
|
||||
ConvState(wxBOM_Unknown, wxFONTENCODING_UTF8).Check(conv);
|
||||
}
|
||||
|
||||
void ConvAutoTestCase::Short()
|
||||
@ -114,38 +174,71 @@ void ConvAutoTestCase::None()
|
||||
|
||||
void ConvAutoTestCase::UTF32LE()
|
||||
{
|
||||
TestFirstChar("\xff\xfe\0\0A\0\0\0", wxT('A'), 8);
|
||||
TestFirstChar("\xff\xfe\0\0A\0\0\0", wxT('A'), 8, ConvState(wxBOM_UTF32LE, wxFONTENCODING_UTF32LE));
|
||||
}
|
||||
|
||||
void ConvAutoTestCase::UTF32BE()
|
||||
{
|
||||
TestFirstChar("\0\0\xfe\xff\0\0\0B", wxT('B'), 8);
|
||||
TestFirstChar("\0\0\xfe\xff\0\0\0B", wxT('B'), 8, ConvState(wxBOM_UTF32BE, wxFONTENCODING_UTF32BE));
|
||||
}
|
||||
|
||||
void ConvAutoTestCase::UTF16LE()
|
||||
{
|
||||
TestFirstChar("\xff\xfeZ\0", wxT('Z'), 4);
|
||||
TestFirstChar("\xff\xfeZ\0", wxT('Z'), 4, ConvState(wxBOM_UTF16LE, wxFONTENCODING_UTF16LE));
|
||||
}
|
||||
|
||||
void ConvAutoTestCase::UTF16BE()
|
||||
{
|
||||
TestFirstChar("\xfe\xff\0Y", wxT('Y'), 4);
|
||||
TestFirstChar("\xfe\xff\0Y", wxT('Y'), 4, ConvState(wxBOM_UTF16BE, wxFONTENCODING_UTF16BE));
|
||||
}
|
||||
|
||||
void ConvAutoTestCase::UTF8()
|
||||
{
|
||||
#ifdef wxHAVE_U_ESCAPE
|
||||
TestFirstChar("\xef\xbb\xbf\xd0\x9f", L'\u041f');
|
||||
TestFirstChar("\xef\xbb\xbf\xd0\x9f", L'\u041f', wxNO_LEN, ConvState(wxBOM_UTF8, wxFONTENCODING_UTF8));
|
||||
#endif
|
||||
}
|
||||
|
||||
void ConvAutoTestCase::UTF8NoBom()
|
||||
{
|
||||
#ifdef wxHAVE_U_ESCAPE
|
||||
TestFirstChar("\xd0\x9f\xe3\x81\x82", L'\u041f', wxNO_LEN, ConvState(wxBOM_None, wxFONTENCODING_UTF8));
|
||||
#endif
|
||||
}
|
||||
|
||||
void ConvAutoTestCase::Fallback()
|
||||
{
|
||||
#ifdef wxHAVE_U_ESCAPE
|
||||
TestFirstChar("\xbf", L'\u041f', wxNO_LEN,
|
||||
ConvState(wxBOM_None, wxFONTENCODING_ISO8859_5, true),
|
||||
wxFONTENCODING_ISO8859_5);
|
||||
#endif
|
||||
}
|
||||
|
||||
void ConvAutoTestCase::FallbackMultibyte()
|
||||
{
|
||||
#ifdef wxHAVE_U_ESCAPE
|
||||
TestFirstChar("\x84\x50", L'\u041f', wxNO_LEN,
|
||||
ConvState(wxBOM_None, wxFONTENCODING_CP932, true),
|
||||
wxFONTENCODING_CP932);
|
||||
#endif
|
||||
}
|
||||
|
||||
void ConvAutoTestCase::FallbackShort()
|
||||
{
|
||||
TestFirstChar("\x61\xc4", 'a', 2,
|
||||
ConvState(wxBOM_None, wxFONTENCODING_ISO8859_5, true),
|
||||
wxFONTENCODING_ISO8859_5);
|
||||
}
|
||||
|
||||
void ConvAutoTestCase::TestTextStream(const char *src,
|
||||
size_t srclength,
|
||||
const wxString& line1,
|
||||
const wxString& line2)
|
||||
const wxString& line2,
|
||||
wxFontEncoding fe)
|
||||
{
|
||||
wxMemoryInputStream instream(src, srclength);
|
||||
wxTextInputStream text(instream);
|
||||
wxTextInputStream text(instream, wxT(" \t"), wxConvAuto(fe));
|
||||
|
||||
CPPUNIT_ASSERT_EQUAL( line1, text.ReadLine() );
|
||||
CPPUNIT_ASSERT_EQUAL( line2, text.ReadLine() );
|
||||
@ -166,16 +259,8 @@ const wxString line2 = wxString::FromUTF8("\xce\xb2");
|
||||
|
||||
void ConvAutoTestCase::StreamUTF8NoBOM()
|
||||
{
|
||||
// currently this test doesn't work because without the BOM wxConvAuto
|
||||
// decides that the string is in Latin-1 after finding the first (but not
|
||||
// the two subsequent ones which are part of the same UTF-8 sequence!)
|
||||
// 8-bit character
|
||||
//
|
||||
// FIXME: we need to fix this at wxTextInputStream level, see #11570
|
||||
#if 0
|
||||
TestTextStream("\x61\xE3\x81\x82\x0A\xCE\xB2",
|
||||
7, line1, line2);
|
||||
#endif
|
||||
}
|
||||
|
||||
void ConvAutoTestCase::StreamUTF8()
|
||||
@ -210,4 +295,17 @@ void ConvAutoTestCase::StreamUTF32BE()
|
||||
20, line1, line2);
|
||||
}
|
||||
|
||||
void ConvAutoTestCase::StreamFallback()
|
||||
{
|
||||
TestTextStream("\x61\xbf\x0A\xe0",
|
||||
4, wxString::FromUTF8("a\xd0\x9f"), wxString::FromUTF8("\xd1\x80"),
|
||||
wxFONTENCODING_ISO8859_5);
|
||||
}
|
||||
|
||||
void ConvAutoTestCase::StreamFallbackMultibyte()
|
||||
{
|
||||
TestTextStream("\x61\x82\xa0\x0A\x83\xc0",
|
||||
6, line1, line2, wxFONTENCODING_CP932);
|
||||
}
|
||||
|
||||
#endif // wxUSE_UNICODE
|
||||
|
@ -324,6 +324,46 @@ TEST_CASE("wxTextInputStream::GetChar", "[text][input][stream][char]")
|
||||
REQUIRE( tis.GetChar() == 0x00 );
|
||||
CHECK( tis.GetInputStream().Eof() );
|
||||
}
|
||||
|
||||
// Two null bytes that look like the start of UTF-32BE BOM,
|
||||
// followed by 4 byte UTF-8 sequence.
|
||||
// Needs wxConvAuto to not switch to fallback on <6 bytes.
|
||||
SECTION("UTF8-with-nulls")
|
||||
{
|
||||
const wxUint8 buf[] = { 0x00, 0x00, 0xf0, 0x90, 0x8c, 0x98 };
|
||||
wxMemoryInputStream mis(buf, sizeof(buf));
|
||||
wxTextInputStream tis(mis);
|
||||
|
||||
wxCharTypeBuffer<wxChar> e = wxString::FromUTF8((char*)buf, sizeof(buf))
|
||||
.tchar_str<wxChar>();
|
||||
for ( size_t i = 0; i < e.length(); ++i )
|
||||
{
|
||||
INFO("i = " << i);
|
||||
REQUIRE( tis.GetChar() == e[i] );
|
||||
}
|
||||
REQUIRE( tis.GetChar() == 0x00 );
|
||||
CHECK( tis.GetInputStream().Eof() );
|
||||
}
|
||||
|
||||
// Two null bytes that look like the start of UTF-32BE BOM,
|
||||
// then 3 bytes that look like the start of UTF-8 sequence.
|
||||
// Needs 6 character output buffer in GetChar().
|
||||
SECTION("almost-UTF8-with-nulls")
|
||||
{
|
||||
const wxUint8 buf[] = { 0x00, 0x00, 0xf0, 0x90, 0x8c, 0xe0 };
|
||||
wxMemoryInputStream mis(buf, sizeof(buf));
|
||||
wxTextInputStream tis(mis);
|
||||
|
||||
wxCharTypeBuffer<wxChar> e = wxString((char*)buf, wxCSConv(wxFONTENCODING_ISO8859_1),
|
||||
sizeof(buf)).tchar_str<wxChar>();
|
||||
for ( size_t i = 0; i < e.length(); ++i )
|
||||
{
|
||||
INFO("i = " << i);
|
||||
REQUIRE( tis.GetChar() == e[i] );
|
||||
}
|
||||
REQUIRE( tis.GetChar() == 0x00 );
|
||||
CHECK( tis.GetInputStream().Eof() );
|
||||
}
|
||||
}
|
||||
|
||||
#endif // wxUSE_UNICODE
|
||||
|
Loading…
Reference in New Issue
Block a user