From 2b8fd103b709f8026cc95e01b47f7f21513e3c42 Mon Sep 17 00:00:00 2001 From: Pavel Tyunin Date: Wed, 13 Feb 2019 15:00:23 +0200 Subject: [PATCH 01/18] Add wxConvAuto::IsFallbackEncoding() --- include/wx/convauto.h | 6 ++++++ interface/wx/convauto.h | 7 +++++++ 2 files changed, 13 insertions(+) diff --git a/include/wx/convauto.h b/include/wx/convauto.h index 3da6c6adc4..caab99973f 100644 --- a/include/wx/convauto.h +++ b/include/wx/convauto.h @@ -91,6 +91,12 @@ public: return m_bomType; } + // Return true if the fall-back encoding is used + bool IsFallbackEncoding() const + { + return m_ownsConv && m_bomType == wxBOM_None; + } + private: // common part of all ctors void Init() diff --git a/interface/wx/convauto.h b/interface/wx/convauto.h index 7ddfb26927..90e769c32a 100644 --- a/interface/wx/convauto.h +++ b/interface/wx/convauto.h @@ -146,6 +146,13 @@ public: */ wxBOM GetBOM() const; + /** + Check if the fall-back encoding is used. + + @since 3.1.5 + */ + bool IsFallbackEncoding() const; + /** Return a pointer to the characters that makes up this BOM. 
From 307a97dadfe7a796aa6025ce2da2f6f8196db133 Mon Sep 17 00:00:00 2001 From: Pavel Tyunin Date: Sun, 20 Sep 2020 21:40:19 +0300 Subject: [PATCH 02/18] Add wxConvAuto::IsUTF8() --- include/wx/convauto.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/wx/convauto.h b/include/wx/convauto.h index caab99973f..23b21dc41c 100644 --- a/include/wx/convauto.h +++ b/include/wx/convauto.h @@ -78,6 +78,8 @@ public: virtual size_t GetMBNulLen() const wxOVERRIDE { return m_conv->GetMBNulLen(); } + virtual bool IsUTF8() const wxOVERRIDE { return m_conv && m_conv->IsUTF8(); } + virtual wxMBConv *Clone() const wxOVERRIDE { return new wxConvAuto(*this); } // return the BOM type of this buffer From 28823424e9c3d0a56aff76057c00b023ea04500a Mon Sep 17 00:00:00 2001 From: Pavel Tyunin Date: Tue, 22 Sep 2020 17:42:07 +0300 Subject: [PATCH 03/18] Add wxConvAuto::GetEncoding() --- include/wx/convauto.h | 2 ++ interface/wx/convauto.h | 9 +++++++++ src/common/convauto.cpp | 31 +++++++++++++++++++++++++++++++ 3 files changed, 42 insertions(+) diff --git a/include/wx/convauto.h b/include/wx/convauto.h index 23b21dc41c..57212ab1ad 100644 --- a/include/wx/convauto.h +++ b/include/wx/convauto.h @@ -93,6 +93,8 @@ public: return m_bomType; } + wxFontEncoding GetEncoding() const; + // Return true if the fall-back encoding is used bool IsFallbackEncoding() const { diff --git a/interface/wx/convauto.h b/interface/wx/convauto.h index 90e769c32a..788d6ec2b9 100644 --- a/interface/wx/convauto.h +++ b/interface/wx/convauto.h @@ -146,6 +146,15 @@ public: */ wxBOM GetBOM() const; + /** + Return the detected encoding + + Returns @c wxFONTENCODING_MAX if called before the first use. + + @since 3.1.5 + */ + wxFontEncoding GetEncoding() const; + /** Check if the fall-back encoding is used. 
diff --git a/src/common/convauto.cpp b/src/common/convauto.cpp index 952b4455f5..dca91eb59b 100644 --- a/src/common/convauto.cpp +++ b/src/common/convauto.cpp @@ -351,3 +351,34 @@ wxConvAuto::FromWChar(char *dst, size_t dstLen, return m_conv->FromWChar(dst, dstLen, src, srcLen); } + +wxFontEncoding wxConvAuto::GetEncoding() const +{ + switch ( m_bomType ) + { + case wxBOM_UTF32BE: + return wxFONTENCODING_UTF32BE; + case wxBOM_UTF32LE: + return wxFONTENCODING_UTF32LE; + case wxBOM_UTF16BE: + return wxFONTENCODING_UTF16BE; + case wxBOM_UTF16LE: + return wxFONTENCODING_UTF16LE; + case wxBOM_UTF8: + return wxFONTENCODING_UTF8; + + case wxBOM_Unknown: + case wxBOM_None: + if ( !m_conv ) + return wxFONTENCODING_MAX; + else if ( !m_ownsConv ) + return wxFONTENCODING_UTF8; + else if ( m_encDefault != wxFONTENCODING_DEFAULT ) + return m_encDefault; + else + return GetFallbackEncoding(); + } + + wxFAIL_MSG( "unknown BOM type" ); + return wxFONTENCODING_MAX; +} From 3676635231ec3dde41c86049e43e1f18a8a725b3 Mon Sep 17 00:00:00 2001 From: Pavel Tyunin Date: Wed, 23 Sep 2020 14:43:01 +0300 Subject: [PATCH 04/18] Check wxConvAuto state in tests --- tests/mbconv/convautotest.cpp | 43 +++++++++++++++++++++++++++-------- 1 file changed, 34 insertions(+), 9 deletions(-) diff --git a/tests/mbconv/convautotest.cpp b/tests/mbconv/convautotest.cpp index 6a5d5791d1..cde714b109 100644 --- a/tests/mbconv/convautotest.cpp +++ b/tests/mbconv/convautotest.cpp @@ -50,12 +50,33 @@ private: CPPUNIT_TEST( StreamUTF32BE ); CPPUNIT_TEST_SUITE_END(); + // expected converter state, UTF-8 without BOM by default + struct ConvState + { + ConvState( wxBOM bom = wxBOM_None, + wxFontEncoding enc = wxFONTENCODING_UTF8, + bool fallback = false ) + : m_bom(bom), m_enc(enc), m_fallback(fallback) {} + + void Check(const wxConvAuto& conv) const + { + CPPUNIT_ASSERT( conv.GetBOM() == m_bom ); + CPPUNIT_ASSERT( conv.GetEncoding() == m_enc ); + CPPUNIT_ASSERT( conv.IsFallbackEncoding() == m_fallback ); + 
CPPUNIT_ASSERT( conv.IsUTF8() == (m_enc == wxFONTENCODING_UTF8) ); + } + + wxBOM m_bom; + wxFontEncoding m_enc; + bool m_fallback; + }; + // real test function: check that converting the src multibyte string to // wide char using wxConvAuto yields wch as the first result // // the length of the string may need to be passed explicitly if it has // embedded NULs, otherwise it's not necessary - void TestFirstChar(const char *src, wchar_t wch, size_t len = wxNO_LEN); + void TestFirstChar(const char *src, wchar_t wch, size_t len = wxNO_LEN, ConvState st = ConvState()); void Empty(); void Short(); @@ -90,16 +111,20 @@ CPPUNIT_TEST_SUITE_NAMED_REGISTRATION(ConvAutoTestCase, "ConvAutoTestCase"); // tests // ---------------------------------------------------------------------------- -void ConvAutoTestCase::TestFirstChar(const char *src, wchar_t wch, size_t len) +void ConvAutoTestCase::TestFirstChar(const char *src, wchar_t wch, size_t len, ConvState st) { - wxWCharBuffer wbuf = wxConvAuto().cMB2WC(src, len, NULL); + wxConvAuto conv; + wxWCharBuffer wbuf = conv.cMB2WC(src, len, NULL); CPPUNIT_ASSERT( wbuf ); CPPUNIT_ASSERT_EQUAL( wch, *wbuf ); + st.Check(conv); } void ConvAutoTestCase::Empty() { - CPPUNIT_ASSERT( !wxConvAuto().cMB2WC("") ); + wxConvAuto conv; + CPPUNIT_ASSERT( !conv.cMB2WC("") ); + ConvState(wxBOM_Unknown, wxFONTENCODING_MAX).Check(conv); } void ConvAutoTestCase::Short() @@ -114,28 +139,28 @@ void ConvAutoTestCase::None() void ConvAutoTestCase::UTF32LE() { - TestFirstChar("\xff\xfe\0\0A\0\0\0", wxT('A'), 8); + TestFirstChar("\xff\xfe\0\0A\0\0\0", wxT('A'), 8, ConvState(wxBOM_UTF32LE, wxFONTENCODING_UTF32LE)); } void ConvAutoTestCase::UTF32BE() { - TestFirstChar("\0\0\xfe\xff\0\0\0B", wxT('B'), 8); + TestFirstChar("\0\0\xfe\xff\0\0\0B", wxT('B'), 8, ConvState(wxBOM_UTF32BE, wxFONTENCODING_UTF32BE)); } void ConvAutoTestCase::UTF16LE() { - TestFirstChar("\xff\xfeZ\0", wxT('Z'), 4); + TestFirstChar("\xff\xfeZ\0", wxT('Z'), 4, ConvState(wxBOM_UTF16LE, 
wxFONTENCODING_UTF16LE)); } void ConvAutoTestCase::UTF16BE() { - TestFirstChar("\xfe\xff\0Y", wxT('Y'), 4); + TestFirstChar("\xfe\xff\0Y", wxT('Y'), 4, ConvState(wxBOM_UTF16BE, wxFONTENCODING_UTF16BE)); } void ConvAutoTestCase::UTF8() { #ifdef wxHAVE_U_ESCAPE - TestFirstChar("\xef\xbb\xbf\xd0\x9f", L'\u041f'); + TestFirstChar("\xef\xbb\xbf\xd0\x9f", L'\u041f', wxNO_LEN, ConvState(wxBOM_UTF8, wxFONTENCODING_UTF8)); #endif } From 857950c62665948033c873d4e59731f07cc99370 Mon Sep 17 00:00:00 2001 From: Pavel Tyunin Date: Mon, 28 Sep 2020 21:02:46 +0300 Subject: [PATCH 05/18] Add more wxConvAuto test cases --- tests/mbconv/convautotest.cpp | 93 +++++++++++++++++++++++++++++------ 1 file changed, 79 insertions(+), 14 deletions(-) diff --git a/tests/mbconv/convautotest.cpp b/tests/mbconv/convautotest.cpp index cde714b109..91b940a35b 100644 --- a/tests/mbconv/convautotest.cpp +++ b/tests/mbconv/convautotest.cpp @@ -34,7 +34,9 @@ public: private: CPPUNIT_TEST_SUITE( ConvAutoTestCase ); + CPPUNIT_TEST( Init ); CPPUNIT_TEST( Empty ); + CPPUNIT_TEST( Encode ); CPPUNIT_TEST( Short ); CPPUNIT_TEST( None ); CPPUNIT_TEST( UTF32LE ); @@ -42,12 +44,17 @@ private: CPPUNIT_TEST( UTF16LE ); CPPUNIT_TEST( UTF16BE ); CPPUNIT_TEST( UTF8 ); + CPPUNIT_TEST( UTF8NoBom ); + CPPUNIT_TEST( Fallback ); + CPPUNIT_TEST( FallbackMultibyte ); CPPUNIT_TEST( StreamUTF8NoBOM ); CPPUNIT_TEST( StreamUTF8 ); CPPUNIT_TEST( StreamUTF16LE ); CPPUNIT_TEST( StreamUTF16BE ); CPPUNIT_TEST( StreamUTF32LE ); CPPUNIT_TEST( StreamUTF32BE ); + CPPUNIT_TEST( StreamFallback ); + CPPUNIT_TEST( StreamFallbackMultibyte ); CPPUNIT_TEST_SUITE_END(); // expected converter state, UTF-8 without BOM by default @@ -76,9 +83,13 @@ private: // // the length of the string may need to be passed explicitly if it has // embedded NULs, otherwise it's not necessary - void TestFirstChar(const char *src, wchar_t wch, size_t len = wxNO_LEN, ConvState st = ConvState()); + void TestFirstChar(const char *src, wchar_t wch, size_t len = 
wxNO_LEN, + ConvState st = ConvState(), + wxFontEncoding fe = wxFONTENCODING_DEFAULT); + void Init(); void Empty(); + void Encode(); void Short(); void None(); void UTF32LE(); @@ -86,12 +97,16 @@ private: void UTF16LE(); void UTF16BE(); void UTF8(); + void UTF8NoBom(); + void Fallback(); + void FallbackMultibyte(); // test whether two lines of text are converted properly from a stream void TestTextStream(const char *src, size_t srclength, const wxString& line1, - const wxString& line2); + const wxString& line2, + wxFontEncoding fe = wxFONTENCODING_DEFAULT); void StreamUTF8NoBOM(); void StreamUTF8(); @@ -99,6 +114,8 @@ private: void StreamUTF16BE(); void StreamUTF32LE(); void StreamUTF32BE(); + void StreamFallback(); + void StreamFallbackMultibyte(); }; // register in the unnamed registry so that these tests are run by default @@ -111,15 +128,21 @@ CPPUNIT_TEST_SUITE_NAMED_REGISTRATION(ConvAutoTestCase, "ConvAutoTestCase"); // tests // ---------------------------------------------------------------------------- -void ConvAutoTestCase::TestFirstChar(const char *src, wchar_t wch, size_t len, ConvState st) +void ConvAutoTestCase::TestFirstChar(const char *src, wchar_t wch, size_t len, + ConvState st, wxFontEncoding fe) { - wxConvAuto conv; + wxConvAuto conv(fe); wxWCharBuffer wbuf = conv.cMB2WC(src, len, NULL); CPPUNIT_ASSERT( wbuf ); CPPUNIT_ASSERT_EQUAL( wch, *wbuf ); st.Check(conv); } +void ConvAutoTestCase::Init() +{ + ConvState(wxBOM_Unknown, wxFONTENCODING_MAX).Check(wxConvAuto()); +} + void ConvAutoTestCase::Empty() { wxConvAuto conv; @@ -127,6 +150,16 @@ void ConvAutoTestCase::Empty() ConvState(wxBOM_Unknown, wxFONTENCODING_MAX).Check(conv); } +void ConvAutoTestCase::Encode() +{ + wxConvAuto conv; + wxString str = wxString::FromUTF8("\xd0\x9f\xe3\x81\x82"); + wxCharBuffer buf = conv.cWC2MB(str.wc_str()); + CPPUNIT_ASSERT( buf ); + CPPUNIT_ASSERT_EQUAL( str, wxString::FromUTF8(buf) ); + ConvState(wxBOM_Unknown, wxFONTENCODING_UTF8).Check(conv); +} + void 
ConvAutoTestCase::Short() { TestFirstChar("1", wxT('1')); @@ -164,13 +197,39 @@ void ConvAutoTestCase::UTF8() #endif } +void ConvAutoTestCase::UTF8NoBom() +{ +#ifdef wxHAVE_U_ESCAPE + TestFirstChar("\xd0\x9f\xe3\x81\x82", L'\u041f', wxNO_LEN, ConvState(wxBOM_None, wxFONTENCODING_UTF8)); +#endif +} + +void ConvAutoTestCase::Fallback() +{ +#ifdef wxHAVE_U_ESCAPE + TestFirstChar("\xbf", L'\u041f', wxNO_LEN, + ConvState(wxBOM_None, wxFONTENCODING_ISO8859_5, true), + wxFONTENCODING_ISO8859_5); +#endif +} + +void ConvAutoTestCase::FallbackMultibyte() +{ +#ifdef wxHAVE_U_ESCAPE + TestFirstChar("\x84\x50", L'\u041f', wxNO_LEN, + ConvState(wxBOM_None, wxFONTENCODING_CP932, true), + wxFONTENCODING_CP932); +#endif +} + void ConvAutoTestCase::TestTextStream(const char *src, size_t srclength, const wxString& line1, - const wxString& line2) + const wxString& line2, + wxFontEncoding fe) { wxMemoryInputStream instream(src, srclength); - wxTextInputStream text(instream); + wxTextInputStream text(instream, wxT(" \t"), wxConvAuto(fe)); CPPUNIT_ASSERT_EQUAL( line1, text.ReadLine() ); CPPUNIT_ASSERT_EQUAL( line2, text.ReadLine() ); @@ -191,16 +250,8 @@ const wxString line2 = wxString::FromUTF8("\xce\xb2"); void ConvAutoTestCase::StreamUTF8NoBOM() { - // currently this test doesn't work because without the BOM wxConvAuto - // decides that the string is in Latin-1 after finding the first (but not - // the two subsequent ones which are part of the same UTF-8 sequence!) 
- // 8-bit character - // - // FIXME: we need to fix this at wxTextInputStream level, see #11570 -#if 0 TestTextStream("\x61\xE3\x81\x82\x0A\xCE\xB2", 7, line1, line2); -#endif } void ConvAutoTestCase::StreamUTF8() @@ -235,4 +286,18 @@ void ConvAutoTestCase::StreamUTF32BE() 20, line1, line2); } +void ConvAutoTestCase::StreamFallback() +{ + // this only works if there are at least 3 bytes after the first non-ASCII character + TestTextStream("\x61\xbf\x0A\xe0\x7a", + 5, wxString::FromUTF8("a\xd0\x9f"), wxString::FromUTF8("\xd1\x80z"), + wxFONTENCODING_ISO8859_5); +} + +void ConvAutoTestCase::StreamFallbackMultibyte() +{ + TestTextStream("\x61\x82\xa0\x0A\x83\xc0", + 6, line1, line2, wxFONTENCODING_CP932); +} + #endif // wxUSE_UNICODE From 1e435d2347cf76337386da9fad43406e49f2ea2f Mon Sep 17 00:00:00 2001 From: Pavel Tyunin Date: Mon, 28 Sep 2020 21:58:52 +0300 Subject: [PATCH 06/18] Fix wxTextInputStream incorrectly decoding multibyte fallback encodings --- src/common/txtstrm.cpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/common/txtstrm.cpp b/src/common/txtstrm.cpp index c38f7c29ab..fc5e352b58 100644 --- a/src/common/txtstrm.cpp +++ b/src/common/txtstrm.cpp @@ -134,12 +134,13 @@ wxChar wxTextInputStream::GetChar() // one extra byte, the only explanation is that we were using a // wxConvAuto conversion recognizing the initial BOM and that // it couldn't detect the presence or absence of BOM so far, - // but now finally has enough data to see that there is none. - // As we must have fallen back to Latin-1 in this case, return - // just the first byte and keep the other ones for the next - // time. - m_validBegin = 1; - return wbuf[0]; + // but now finally has enough data to see that there is none, or + // it was trying to decode the data as UTF-8 sequence, but now + // recognized that it's not valid UTF-8 and switched to fallback. 
+ // We don't know how long is the first character or if it's decoded + // as 1 or 2 wchar_t characters, so we need to start with 1 byte again. + inlen = -1; + break; #if SIZEOF_WCHAR_T == 2 case 2: From bc838b4773ab3a93be5ff869a9357b72b9cf875a Mon Sep 17 00:00:00 2001 From: Pavel Tyunin Date: Mon, 28 Sep 2020 22:11:17 +0300 Subject: [PATCH 07/18] Do not delete and create the fallback conversion again when it fails --- src/common/convauto.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/common/convauto.cpp b/src/common/convauto.cpp index dca91eb59b..50c5a956c7 100644 --- a/src/common/convauto.cpp +++ b/src/common/convauto.cpp @@ -307,7 +307,7 @@ wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen, // try to convert using the auto-detected encoding size_t rc = m_conv->ToWChar(dst, dstLen, src, srcLen); - if ( rc == wxCONV_FAILED && m_bomType == wxBOM_None ) + if ( rc == wxCONV_FAILED && m_bomType == wxBOM_None && !m_ownsConv ) { // we may need more bytes before we can decode the input, don't switch // to the fall-back conversion in this case as it would prevent us from @@ -320,9 +320,6 @@ wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen, // simply tried UTF-8 by default, retry it using the fall-back if ( m_encDefault != wxFONTENCODING_MAX ) { - if ( m_ownsConv ) - delete m_conv; - self->m_conv = new wxCSConv(m_encDefault == wxFONTENCODING_DEFAULT ? 
GetFallbackEncoding() : m_encDefault); From b3eff48e28f4d1f7a7ea56f10522473cb150aff9 Mon Sep 17 00:00:00 2001 From: Pavel Tyunin Date: Tue, 29 Sep 2020 15:35:53 +0300 Subject: [PATCH 08/18] Switch to fallback earlier if the input is not valid UTF-8 prefix --- include/wx/strconv.h | 2 ++ src/common/convauto.cpp | 2 +- src/common/strconv.cpp | 20 ++++++++++++++++++++ tests/mbconv/convautotest.cpp | 5 ++--- 4 files changed, 25 insertions(+), 4 deletions(-) diff --git a/include/wx/strconv.h b/include/wx/strconv.h index c1b070d36a..21c5f136b1 100644 --- a/include/wx/strconv.h +++ b/include/wx/strconv.h @@ -387,6 +387,8 @@ private: int m_options; }; +bool wxIsUTF8Prefix(const char *src, size_t len); + // ---------------------------------------------------------------------------- // wxMBConvUTF16Base: for both LE and BE variants // ---------------------------------------------------------------------------- diff --git a/src/common/convauto.cpp b/src/common/convauto.cpp index 50c5a956c7..7b92d396f6 100644 --- a/src/common/convauto.cpp +++ b/src/common/convauto.cpp @@ -313,7 +313,7 @@ wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen, // to the fall-back conversion in this case as it would prevent us from // decoding UTF-8 input when fed it byte by byte, as done by // wxTextInputStream, for example - if ( srcLen < m_conv->GetMaxCharLen() ) + if ( srcLen < m_conv->GetMaxCharLen() && wxIsUTF8Prefix(src, srcLen) ) return wxCONV_FAILED; // if the conversion failed but we didn't really detect anything and diff --git a/src/common/strconv.cpp b/src/common/strconv.cpp index ba25dae157..04f6e451ec 100644 --- a/src/common/strconv.cpp +++ b/src/common/strconv.cpp @@ -1446,6 +1446,26 @@ size_t wxMBConvUTF8::FromWChar(char *buf, size_t n, return len; } +// checks if the input can be the beginning of a valid UTF-8 string +bool wxIsUTF8Prefix(const char *src, size_t len) +{ + unsigned char l; + for ( size_t i = 0; i < len; ++i ) + { + l = tableUtf8Lengths[(unsigned char)src[i]]; + if ( !l 
) + return false; // invalid leading byte + while ( --l ) + { + if ( ++i == len ) + return true; // truncated sequence + if ( (src[i] & 0xC0) != 0x80 ) + return false; // invalid continuation byte + } + } + return true; +} + // ============================================================================ // UTF-16 // ============================================================================ diff --git a/tests/mbconv/convautotest.cpp b/tests/mbconv/convautotest.cpp index 91b940a35b..789e7582aa 100644 --- a/tests/mbconv/convautotest.cpp +++ b/tests/mbconv/convautotest.cpp @@ -288,9 +288,8 @@ void ConvAutoTestCase::StreamUTF32BE() void ConvAutoTestCase::StreamFallback() { - // this only works if there are at least 3 bytes after the first non-ASCII character - TestTextStream("\x61\xbf\x0A\xe0\x7a", - 5, wxString::FromUTF8("a\xd0\x9f"), wxString::FromUTF8("\xd1\x80z"), + TestTextStream("\x61\xbf\x0A\xe0", + 4, wxString::FromUTF8("a\xd0\x9f"), wxString::FromUTF8("\xd1\x80"), wxFONTENCODING_ISO8859_5); } From 45adce85618f05cdffc08ef2df76547701cc17b2 Mon Sep 17 00:00:00 2001 From: Pavel Tyunin Date: Sat, 3 Oct 2020 18:21:18 +0300 Subject: [PATCH 09/18] Fix wxTextInputStream for some inputs starting with nulls --- src/common/convauto.cpp | 2 +- src/common/txtstrm.cpp | 9 +++---- tests/streams/textstreamtest.cpp | 40 ++++++++++++++++++++++++++++++++ 3 files changed, 46 insertions(+), 5 deletions(-) diff --git a/src/common/convauto.cpp b/src/common/convauto.cpp index 7b92d396f6..d5d6079b32 100644 --- a/src/common/convauto.cpp +++ b/src/common/convauto.cpp @@ -313,7 +313,7 @@ wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen, // to the fall-back conversion in this case as it would prevent us from // decoding UTF-8 input when fed it byte by byte, as done by // wxTextInputStream, for example - if ( srcLen < m_conv->GetMaxCharLen() && wxIsUTF8Prefix(src, srcLen) ) + if ( srcLen < 2 + m_conv->GetMaxCharLen() && wxIsUTF8Prefix(src, srcLen) ) return wxCONV_FAILED; // if the 
conversion failed but we didn't really detect anything and diff --git a/src/common/txtstrm.cpp b/src/common/txtstrm.cpp index fc5e352b58..1332bb3cf0 100644 --- a/src/common/txtstrm.cpp +++ b/src/common/txtstrm.cpp @@ -97,10 +97,11 @@ wxChar wxTextInputStream::GetChar() m_validEnd = 0; } - // We may need to decode up to 4 characters if we have input starting with - // 3 BOM-like bytes, but not actually containing a BOM, as decoding it will - // only succeed when 4 bytes are read -- and will yield 4 wide characters. - wxChar wbuf[4]; + // We may need to decode up to 6 characters if we have input starting with + // 2 null bytes (like in UTF-32BE BOM), and then 3 bytes that look like + // the start of UTF-8 sequence, as decoding it will only succeed when + // 6 bytes are read -- and will yield 6 wide characters. + wxChar wbuf[6]; for(size_t inlen = 0; inlen < sizeof(m_lastBytes); inlen++) { if ( inlen >= m_validEnd ) diff --git a/tests/streams/textstreamtest.cpp b/tests/streams/textstreamtest.cpp index edb6eaa8a2..c6497b23f2 100644 --- a/tests/streams/textstreamtest.cpp +++ b/tests/streams/textstreamtest.cpp @@ -324,6 +324,46 @@ TEST_CASE("wxTextInputStream::GetChar", "[text][input][stream][char]") REQUIRE( tis.GetChar() == 0x00 ); CHECK( tis.GetInputStream().Eof() ); } + + // Two null bytes that look like the start of UTF-32BE BOM, + // followed by 4 byte UTF-8 sequence. + // Needs wxConvAuto to not switch to fallback on <6 bytes. 
+ SECTION("UTF8-with-nulls") + { + const wxUint8 buf[] = { 0x00, 0x00, 0xf0, 0x90, 0x8c, 0x98 }; + wxMemoryInputStream mis(buf, sizeof(buf)); + wxTextInputStream tis(mis); + + wxCharTypeBuffer e = wxString::FromUTF8((char*)buf, sizeof(buf)) + .tchar_str(); + for ( size_t i = 0; i < e.length(); ++i ) + { + INFO("i = " << i); + REQUIRE( tis.GetChar() == e[i] ); + } + REQUIRE( tis.GetChar() == 0x00 ); + CHECK( tis.GetInputStream().Eof() ); + } + + // Two null bytes that look like the start of UTF-32BE BOM, + // then 3 bytes that look like the start of UTF-8 sequence. + // Needs 6 character output buffer in GetChar(). + SECTION("almost-UTF8-with-nulls") + { + const wxUint8 buf[] = { 0x00, 0x00, 0xf0, 0x90, 0x8c, 0xe0 }; + wxMemoryInputStream mis(buf, sizeof(buf)); + wxTextInputStream tis(mis); + + wxCharTypeBuffer e = wxString((char*)buf, wxCSConv(wxFONTENCODING_ISO8859_1), + sizeof(buf)).tchar_str(); + for ( size_t i = 0; i < e.length(); ++i ) + { + INFO("i = " << i); + REQUIRE( tis.GetChar() == e[i] ); + } + REQUIRE( tis.GetChar() == 0x00 ); + CHECK( tis.GetInputStream().Eof() ); + } } #endif // wxUSE_UNICODE From 4832565e1034b7b747d0252289698e2d47e73ce6 Mon Sep 17 00:00:00 2001 From: Pavel Tyunin Date: Mon, 5 Oct 2020 15:21:35 +0300 Subject: [PATCH 10/18] Rename IsFallbackEncoding() to IsUsingFallbackEncoding() --- include/wx/convauto.h | 2 +- interface/wx/convauto.h | 2 +- tests/mbconv/convautotest.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/wx/convauto.h b/include/wx/convauto.h index 57212ab1ad..d7ed45592f 100644 --- a/include/wx/convauto.h +++ b/include/wx/convauto.h @@ -96,7 +96,7 @@ public: wxFontEncoding GetEncoding() const; // Return true if the fall-back encoding is used - bool IsFallbackEncoding() const + bool IsUsingFallbackEncoding() const { return m_ownsConv && m_bomType == wxBOM_None; } diff --git a/interface/wx/convauto.h b/interface/wx/convauto.h index 788d6ec2b9..324b5b24e7 100644 --- a/interface/wx/convauto.h +++ 
b/interface/wx/convauto.h @@ -160,7 +160,7 @@ public: @since 3.1.5 */ - bool IsFallbackEncoding() const; + bool IsUsingFallbackEncoding() const; /** Return a pointer to the characters that makes up this BOM. diff --git a/tests/mbconv/convautotest.cpp b/tests/mbconv/convautotest.cpp index 789e7582aa..6eaa24b155 100644 --- a/tests/mbconv/convautotest.cpp +++ b/tests/mbconv/convautotest.cpp @@ -69,7 +69,7 @@ private: { CPPUNIT_ASSERT( conv.GetBOM() == m_bom ); CPPUNIT_ASSERT( conv.GetEncoding() == m_enc ); - CPPUNIT_ASSERT( conv.IsFallbackEncoding() == m_fallback ); + CPPUNIT_ASSERT( conv.IsUsingFallbackEncoding() == m_fallback ); CPPUNIT_ASSERT( conv.IsUTF8() == (m_enc == wxFONTENCODING_UTF8) ); } From 307a0916fd7581870d40b920d776dec7ff60e8e7 Mon Sep 17 00:00:00 2001 From: Pavel Tyunin Date: Mon, 5 Oct 2020 15:29:11 +0300 Subject: [PATCH 11/18] Add comment about 2 extra bytes in wxConvAuto::ToWChar --- src/common/convauto.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/common/convauto.cpp b/src/common/convauto.cpp index d5d6079b32..f63d015c83 100644 --- a/src/common/convauto.cpp +++ b/src/common/convauto.cpp @@ -313,6 +313,8 @@ wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen, // to the fall-back conversion in this case as it would prevent us from // decoding UTF-8 input when fed it byte by byte, as done by // wxTextInputStream, for example + // 2 extra bytes are needed for inputs that start with 1 or 2 null bytes + // that look like the start of UTF-32BE BOM, but can be in UTF-8 too if ( srcLen < 2 + m_conv->GetMaxCharLen() && wxIsUTF8Prefix(src, srcLen) ) return wxCONV_FAILED; From d2e7b5bdd1610b97732e7bcca4f3b4f2db2bd145 Mon Sep 17 00:00:00 2001 From: Pavel Tyunin Date: Mon, 5 Oct 2020 18:10:27 +0300 Subject: [PATCH 12/18] Fix GetEncoding() after global fallback encoding is changed Save global fallback encoding when switching to it, so GetEncoding() can still return the correct value if global fallback encoding is changed later. 
Also do not switch to wxFONTENCODING_MAX when it is set as global fallback encoding. --- src/common/convauto.cpp | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/common/convauto.cpp b/src/common/convauto.cpp index f63d015c83..840841f5b6 100644 --- a/src/common/convauto.cpp +++ b/src/common/convauto.cpp @@ -320,11 +320,11 @@ wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen, // if the conversion failed but we didn't really detect anything and // simply tried UTF-8 by default, retry it using the fall-back + if ( m_encDefault == wxFONTENCODING_DEFAULT ) + self->m_encDefault = GetFallbackEncoding(); if ( m_encDefault != wxFONTENCODING_MAX ) { - self->m_conv = new wxCSConv(m_encDefault == wxFONTENCODING_DEFAULT - ? GetFallbackEncoding() - : m_encDefault); + self->m_conv = new wxCSConv(m_encDefault); self->m_ownsConv = true; rc = m_conv->ToWChar(dst, dstLen, src, srcLen); @@ -372,10 +372,8 @@ wxFontEncoding wxConvAuto::GetEncoding() const return wxFONTENCODING_MAX; else if ( !m_ownsConv ) return wxFONTENCODING_UTF8; - else if ( m_encDefault != wxFONTENCODING_DEFAULT ) - return m_encDefault; else - return GetFallbackEncoding(); + return m_encDefault; } wxFAIL_MSG( "unknown BOM type" ); From 13700025be5cf2c8d9a7e5b99e2cbf934234f6a2 Mon Sep 17 00:00:00 2001 From: Pavel Tyunin Date: Tue, 6 Oct 2020 22:06:51 +0300 Subject: [PATCH 13/18] Add wx/private/unicode.h --- include/wx/private/unicode.h | 16 ++++++++++++++++ src/common/strconv.cpp | 3 ++- src/common/ustring.cpp | 36 +----------------------------------- 3 files changed, 19 insertions(+), 36 deletions(-) create mode 100644 include/wx/private/unicode.h diff --git a/include/wx/private/unicode.h b/include/wx/private/unicode.h new file mode 100644 index 0000000000..6c81c23504 --- /dev/null +++ b/include/wx/private/unicode.h @@ -0,0 +1,16 @@ +///////////////////////////////////////////////////////////////////////////// +// Name: wx/private/unicode.h +// Purpose: Unicode private declarations +// 
Author: Pavel Tyunin +// Created: 2020-10-06 +// Copyright: (c) 2020 Pavel Tyunin +// Licence: wxWindows licence +///////////////////////////////////////////////////////////////////////////// + +#ifndef _WX_PRIVATE_UNICODEH__ +#define _WX_PRIVATE_UNICODEH__ + +// this table gives the length of the UTF-8 encoding from its first character: +extern const unsigned char tableUtf8Lengths[256]; + +#endif // _WX_PRIVATE_UNICODEH__ diff --git a/src/common/strconv.cpp b/src/common/strconv.cpp index 04f6e451ec..d09b5ff2a4 100644 --- a/src/common/strconv.cpp +++ b/src/common/strconv.cpp @@ -46,6 +46,7 @@ #include "wx/encconv.h" #include "wx/fontmap.h" +#include "wx/private/unicode.h" #ifdef __DARWIN__ #include "wx/osx/core/private/strconv_cf.h" @@ -921,7 +922,7 @@ const wxUint32 wxUnicodePUA = 0x100000; const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256; // this table gives the length of the UTF-8 encoding from its first character: -const unsigned char tableUtf8Lengths[256] = { +extern const unsigned char tableUtf8Lengths[256] = { // single-byte sequences (ASCII): 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00..0F 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10..1F diff --git a/src/common/ustring.cpp b/src/common/ustring.cpp index 6e1768064b..531ee41b9c 100644 --- a/src/common/ustring.cpp +++ b/src/common/ustring.cpp @@ -15,6 +15,7 @@ #endif #include "wx/ustring.h" +#include "wx/private/unicode.h" #ifndef WX_PRECOMP #include "wx/crt.h" @@ -67,41 +68,6 @@ wxUString &wxUString::assignFromAscii( const char *str, size_type n ) // UTF-8 // ---------------------------------------------------------------------------- -// this table gives the length of the UTF-8 encoding from its first character: -const unsigned char tableUtf8Lengths[256] = { - // single-byte sequences (ASCII): - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00..0F - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10..1F - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 20..2F - 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 30..3F - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40..4F - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 50..5F - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 60..6F - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70..7F - - // these are invalid: - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 80..8F - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 90..9F - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A0..AF - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B0..BF - 0, 0, // C0,C1 - - // two-byte sequences: - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C2..CF - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0..DF - - // three-byte sequences: - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0..EF - - // four-byte sequences: - 4, 4, 4, 4, 4, // F0..F4 - - // these are invalid again (5- or 6-byte - // sequences and sequences for code points - // above U+10FFFF, as restricted by RFC 3629): - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // F5..FF -}; - wxUString &wxUString::assignFromUTF8( const char *str ) { if (!str) From 240fcee90ead3ab8207cc54f7e7c10303c6e1018 Mon Sep 17 00:00:00 2001 From: Pavel Tyunin Date: Tue, 6 Oct 2020 22:34:52 +0300 Subject: [PATCH 14/18] Move wxIsUTF8Prefix() to convauto.cpp --- include/wx/strconv.h | 2 -- src/common/convauto.cpp | 21 +++++++++++++++++++++ src/common/strconv.cpp | 20 -------------------- 3 files changed, 21 insertions(+), 22 deletions(-) diff --git a/include/wx/strconv.h b/include/wx/strconv.h index 21c5f136b1..c1b070d36a 100644 --- a/include/wx/strconv.h +++ b/include/wx/strconv.h @@ -387,8 +387,6 @@ private: int m_options; }; -bool wxIsUTF8Prefix(const char *src, size_t len); - // ---------------------------------------------------------------------------- // wxMBConvUTF16Base: for both LE and BE variants // ---------------------------------------------------------------------------- diff --git a/src/common/convauto.cpp b/src/common/convauto.cpp 
index 840841f5b6..9f3be27802 100644 --- a/src/common/convauto.cpp +++ b/src/common/convauto.cpp @@ -23,6 +23,7 @@ #endif #include "wx/convauto.h" +#include "wx/private/unicode.h" // we use latin1 by default as it seems the least bad choice: the files we need // to detect input of don't always come from the user system (they are often @@ -266,6 +267,26 @@ bool wxConvAuto::InitFromInput(const char *src, size_t len) return true; } +// checks if the input can be the beginning of a valid UTF-8 string +static bool wxIsUTF8Prefix(const char *src, size_t len) +{ + unsigned char l; + for ( size_t i = 0; i < len; ++i ) + { + l = tableUtf8Lengths[(unsigned char)src[i]]; + if ( !l ) + return false; // invalid leading byte + while ( --l ) + { + if ( ++i == len ) + return true; // truncated sequence + if ( (src[i] & 0xC0) != 0x80 ) + return false; // invalid continuation byte + } + } + return true; +} + size_t wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen, const char *src, size_t srcLen) const diff --git a/src/common/strconv.cpp b/src/common/strconv.cpp index d09b5ff2a4..23c2b0a545 100644 --- a/src/common/strconv.cpp +++ b/src/common/strconv.cpp @@ -1447,26 +1447,6 @@ size_t wxMBConvUTF8::FromWChar(char *buf, size_t n, return len; } -// checks if the input can be the beginning of a valid UTF-8 string -bool wxIsUTF8Prefix(const char *src, size_t len) -{ - unsigned char l; - for ( size_t i = 0; i < len; ++i ) - { - l = tableUtf8Lengths[(unsigned char)src[i]]; - if ( !l ) - return false; // invalid leading byte - while ( --l ) - { - if ( ++i == len ) - return true; // truncated sequence - if ( (src[i] & 0xC0) != 0x80 ) - return false; // invalid continuation byte - } - } - return true; -} - // ============================================================================ // UTF-16 // ============================================================================ From b536457e0787db7fae241b9e1525910307a5cdad Mon Sep 17 00:00:00 2001 From: Pavel Tyunin Date: Wed, 7 Oct 2020 15:41:15 
+0300 Subject: [PATCH 15/18] Use tableUtf8Lengths[] in stringops.cpp too --- include/wx/stringops.h | 8 +++---- src/common/stringops.cpp | 47 ++++++++++------------------------------ 2 files changed, 15 insertions(+), 40 deletions(-) diff --git a/include/wx/stringops.h b/include/wx/stringops.h index 150554f341..dd46e6616c 100644 --- a/include/wx/stringops.h +++ b/include/wx/stringops.h @@ -94,15 +94,15 @@ struct WXDLLIMPEXP_BASE wxStringOperationsUtf8 return (c <= 0x7F) || (c >= 0xC2 && c <= 0xF4); } - // table of offsets to skip forward when iterating over UTF-8 sequence - static const unsigned char ms_utf8IterTable[256]; + // returns offset to skip forward when iterating over UTF-8 sequence + static unsigned char GetUTF8IterOffset(unsigned char c); template static void IncIter(Iterator& i) { wxASSERT( IsValidUtf8LeadByte(*i) ); - i += ms_utf8IterTable[(unsigned char)*i]; + i += GetUTF8IterOffset(*i); } template @@ -178,7 +178,7 @@ struct WXDLLIMPEXP_BASE wxStringOperationsUtf8 static size_t GetUtf8CharLength(char c) { wxASSERT( IsValidUtf8LeadByte(c) ); - return ms_utf8IterTable[(unsigned char)c]; + return GetUTF8IterOffset(c); } // decodes single UTF-8 character from UTF-8 string diff --git a/src/common/stringops.cpp b/src/common/stringops.cpp index 85629406a3..7cedab7cc0 100644 --- a/src/common/stringops.cpp +++ b/src/common/stringops.cpp @@ -23,6 +23,8 @@ #include "wx/stringops.h" #endif +#include "wx/private/unicode.h" + // =========================================================================== // implementation // =========================================================================== @@ -97,40 +99,13 @@ wxWxCharBuffer wxStringOperationsWchar::EncodeNChars(size_t n, const wxUniChar& // UTF-8 sequences lengths // --------------------------------------------------------------------------- -const unsigned char wxStringOperationsUtf8::ms_utf8IterTable[256] = { - // single-byte sequences (ASCII): - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00..0F
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10..1F - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 20..2F - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 30..3F - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40..4F - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 50..5F - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 60..6F - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70..7F - - // these are invalid, we use step 1 to skip - // over them (should never happen): - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 80..8F - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 90..9F - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // A0..AF - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // B0..BF - 1, 1, // C0,C1 - - // two-byte sequences: - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C2..CF - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0..DF - - // three-byte sequences: - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0..EF - - // four-byte sequences: - 4, 4, 4, 4, 4, // F0..F4 - - // these are invalid again (5- or 6-byte - // sequences and sequences for code points - // above U+10FFFF, as restricted by RFC 3629): - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // F5..FF -}; +static unsigned char wxStringOperationsUtf8::GetUTF8IterOffset(unsigned char c) +{ + unsigned char l = tableUtf8Lengths[c]; + if ( !l ) //skip over invalid characters + l = 1; + return l; +} // --------------------------------------------------------------------------- // UTF-8 operations @@ -166,7 +141,7 @@ bool wxStringOperationsUtf8::IsValidUtf8String(const char *str, size_t len) { // if the string is not NULL-terminated, verify we have enough // bytes in it left for current character's encoding: - if ( c + ms_utf8IterTable[*c] > end ) + if ( c + GetUTF8IterOffset(*c) > end ) return false; } @@ -364,7 +339,7 @@ wxCharBuffer wxStringOperationsUtf8::EncodeNChars(size_t n, const wxUniChar& ch) { Utf8CharBuffer once(EncodeChar(ch)); // the IncIter() 
table can be used to determine the length of ch's encoding: - size_t len = ms_utf8IterTable[(unsigned char)once.data[0]]; + size_t len = GetUTF8IterOffset(once.data[0]); wxCharBuffer buf(n * len); char *ptr = buf.data(); From 1cbcf24832c978ea72f31542090d90456f0afefb Mon Sep 17 00:00:00 2001 From: Pavel Tyunin Date: Wed, 7 Oct 2020 17:02:06 +0300 Subject: [PATCH 16/18] Make leading nulls a special case to avoid breaking decoding some short strings in fallback encoding --- src/common/convauto.cpp | 7 +++++-- tests/mbconv/convautotest.cpp | 9 +++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/src/common/convauto.cpp b/src/common/convauto.cpp index 9f3be27802..8778295207 100644 --- a/src/common/convauto.cpp +++ b/src/common/convauto.cpp @@ -334,9 +334,12 @@ wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen, // to the fall-back conversion in this case as it would prevent us from // decoding UTF-8 input when fed it byte by byte, as done by // wxTextInputStream, for example - // 2 extra bytes are needed for inputs that start with 1 or 2 null bytes + // up to 2 extra bytes are needed for inputs that start with null bytes // that look like the start of UTF-32BE BOM, but can be in UTF-8 too - if ( srcLen < 2 + m_conv->GetMaxCharLen() && wxIsUTF8Prefix(src, srcLen) ) + size_t nNull = 0; + if ( srcLen != wxNO_LEN && srcLen >= 2 && !src[0] ) + nNull = ( src[1]? 
1 : 2 ); + if ( srcLen < nNull + m_conv->GetMaxCharLen() && wxIsUTF8Prefix(src, srcLen) ) return wxCONV_FAILED; // if the conversion failed but we didn't really detect anything and diff --git a/tests/mbconv/convautotest.cpp b/tests/mbconv/convautotest.cpp index 6eaa24b155..27839f93a5 100644 --- a/tests/mbconv/convautotest.cpp +++ b/tests/mbconv/convautotest.cpp @@ -47,6 +47,7 @@ private: CPPUNIT_TEST( UTF8NoBom ); CPPUNIT_TEST( Fallback ); CPPUNIT_TEST( FallbackMultibyte ); + CPPUNIT_TEST( FallbackShort ); CPPUNIT_TEST( StreamUTF8NoBOM ); CPPUNIT_TEST( StreamUTF8 ); CPPUNIT_TEST( StreamUTF16LE ); @@ -100,6 +101,7 @@ private: void UTF8NoBom(); void Fallback(); void FallbackMultibyte(); + void FallbackShort(); // test whether two lines of text are converted properly from a stream void TestTextStream(const char *src, @@ -222,6 +224,13 @@ void ConvAutoTestCase::FallbackMultibyte() #endif } +void ConvAutoTestCase::FallbackShort() +{ + TestFirstChar("\x61\x61\x61\xc4", 'a', 4, + ConvState(wxBOM_None, wxFONTENCODING_ISO8859_5, true), + wxFONTENCODING_ISO8859_5); +} + void ConvAutoTestCase::TestTextStream(const char *src, size_t srclength, const wxString& line1, From c9dd9e96a1bc834fad23435242bc83b793d987ed Mon Sep 17 00:00:00 2001 From: Pavel Tyunin Date: Wed, 7 Oct 2020 17:31:42 +0300 Subject: [PATCH 17/18] Allow decoding even shorter strings in fallback encoding Complete UTF-8 characters (except leading nulls) never appear in failed decoding attempts when the input is fed byte by byte. 
--- src/common/convauto.cpp | 30 ++++++++++++++---------------- tests/mbconv/convautotest.cpp | 2 +- 2 files changed, 15 insertions(+), 17 deletions(-) diff --git a/src/common/convauto.cpp b/src/common/convauto.cpp index 8778295207..708096de5d 100644 --- a/src/common/convauto.cpp +++ b/src/common/convauto.cpp @@ -267,24 +267,21 @@ bool wxConvAuto::InitFromInput(const char *src, size_t len) return true; } -// checks if the input can be the beginning of a valid UTF-8 string -static bool wxIsUTF8Prefix(const char *src, size_t len) +// checks if the input can be the beginning of a valid UTF-8 sequence +static bool wxCanBeUTF8SequencePrefix(const char *src, size_t len) { - unsigned char l; - for ( size_t i = 0; i < len; ++i ) + size_t i = 0; + unsigned char l = tableUtf8Lengths[(unsigned char)src[i]]; + if ( !l ) + return false; // invalid leading byte + while ( --l ) { - l = tableUtf8Lengths[(unsigned char)src[i]]; - if ( !l ) - return false; // invalid leading byte - while ( --l ) - { - if ( ++i == len ) - return true; // truncated sequence - if ( (src[i] & 0xC0) != 0x80 ) - return false; // invalid continuation byte - } + if ( ++i == len ) + return true; // truncated sequence + if ( (src[i] & 0xC0) != 0x80 ) + return false; // invalid continuation byte } - return true; + return false; // complete sequence } size_t @@ -339,7 +336,8 @@ wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen, size_t nNull = 0; if ( srcLen != wxNO_LEN && srcLen >= 2 && !src[0] ) nNull = ( src[1]? 
1 : 2 ); - if ( srcLen < nNull + m_conv->GetMaxCharLen() && wxIsUTF8Prefix(src, srcLen) ) + if ( srcLen < nNull + m_conv->GetMaxCharLen() && + wxCanBeUTF8SequencePrefix(src + nNull, srcLen - nNull) ) return wxCONV_FAILED; // if the conversion failed but we didn't really detect anything and diff --git a/tests/mbconv/convautotest.cpp b/tests/mbconv/convautotest.cpp index 27839f93a5..12e19c21ed 100644 --- a/tests/mbconv/convautotest.cpp +++ b/tests/mbconv/convautotest.cpp @@ -226,7 +226,7 @@ void ConvAutoTestCase::FallbackMultibyte() void ConvAutoTestCase::FallbackShort() { - TestFirstChar("\x61\x61\x61\xc4", 'a', 4, + TestFirstChar("\x61\xc4", 'a', 2, ConvState(wxBOM_None, wxFONTENCODING_ISO8859_5, true), wxFONTENCODING_ISO8859_5); } From 0ab3b4eac71d442cf56b14cdfd8bc1da9bc69bd7 Mon Sep 17 00:00:00 2001 From: Pavel Tyunin Date: Wed, 7 Oct 2020 17:53:43 +0300 Subject: [PATCH 18/18] Fix UTF-8 build --- src/common/stringops.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/stringops.cpp b/src/common/stringops.cpp index 7cedab7cc0..84017ae523 100644 --- a/src/common/stringops.cpp +++ b/src/common/stringops.cpp @@ -99,7 +99,7 @@ wxWxCharBuffer wxStringOperationsWchar::EncodeNChars(size_t n, const wxUniChar& // UTF-8 sequences lengths // --------------------------------------------------------------------------- -static unsigned char wxStringOperationsUtf8::GetUTF8IterOffset(unsigned char c) +unsigned char wxStringOperationsUtf8::GetUTF8IterOffset(unsigned char c) { unsigned char l = tableUtf8Lengths[c]; if ( !l ) //skip over invalid characters