From 038809c2f657f03d8688d7a75ca416878dfa0de7 Mon Sep 17 00:00:00 2001 From: Vadim Zeitlin Date: Thu, 27 Oct 2011 22:48:54 +0000 Subject: [PATCH] Make BOM-detection code in wxConvAuto public. Export GetBOM() and DetectBOM() functions. Also rename BOMType enum elements to use "wx" prefix now that they're public. Closes #13599. git-svn-id: https://svn.wxwidgets.org/svn/wx/wxWidgets/trunk@69571 c3d73ce0-8a6f-49c7-b76d-6d57e0e08775 --- include/wx/convauto.h | 43 +++++++++++-------- interface/wx/convauto.h | 94 ++++++++++++++++++++++++++++++++++++++++- src/common/convauto.cpp | 66 ++++++++++++++--------------- 3 files changed, 151 insertions(+), 52 deletions(-) diff --git a/include/wx/convauto.h b/include/wx/convauto.h index bc514cea3b..4c18dba601 100644 --- a/include/wx/convauto.h +++ b/include/wx/convauto.h @@ -18,6 +18,18 @@ // wxConvAuto: uses BOM to automatically detect input encoding // ---------------------------------------------------------------------------- +// All currently recognized BOM values. +enum wxBOM +{ + wxBOM_Unknown = -1, + wxBOM_None, + wxBOM_UTF32BE, + wxBOM_UTF32LE, + wxBOM_UTF16BE, + wxBOM_UTF16LE, + wxBOM_UTF8 +}; + class WXDLLIMPEXP_BASE wxConvAuto : public wxMBConv { public: @@ -69,29 +81,24 @@ public: virtual wxMBConv *Clone() const { return new wxConvAuto(*this); } -private: - // all currently recognized BOM values - enum BOMType - { - BOM_Unknown = -1, - BOM_None, - BOM_UTF32BE, - BOM_UTF32LE, - BOM_UTF16BE, - BOM_UTF16LE, - BOM_UTF8 - }; - // return the BOM type of this buffer - static BOMType DetectBOM(const char *src, size_t srcLen); + static wxBOM DetectBOM(const char *src, size_t srcLen); + wxBOM GetBOM() const + { + return m_bomType; + } + +private: // common part of all ctors void Init() { - // no need to initialize m_bomType and m_consumedBOM here, this will be - // done when m_conv is created + // We don't initialize m_encDefault here as different ctors do it + // differently. 
m_conv = NULL; + m_bomType = wxBOM_Unknown; m_ownsConv = false; + m_consumedBOM = false; } // initialize m_conv with the UTF-8 conversion @@ -102,7 +109,7 @@ private: } // create the correct conversion object for the given BOM type - void InitFromBOM(BOMType bomType); + void InitFromBOM(wxBOM bomType); // create the correct conversion object for the BOM present in the // beginning of the buffer @@ -128,7 +135,7 @@ private: wxFontEncoding m_encDefault; // our BOM type - BOMType m_bomType; + wxBOM m_bomType; // true if we allocated m_conv ourselves, false if we just use an existing // global conversion diff --git a/interface/wx/convauto.h b/interface/wx/convauto.h index fc8f1987a2..715d06c279 100644 --- a/interface/wx/convauto.h +++ b/interface/wx/convauto.h @@ -6,6 +6,74 @@ // Licence: wxWindows licence ///////////////////////////////////////////////////////////////////////////// +/** + Constants representing various BOM types. + + BOM is an abbreviation for "Byte Order Mark", a special Unicode character + which may be inserted into the beginning of a text stream to indicate its + encoding. + + @since 2.9.3 + */ +enum wxBOM +{ + /** + Unknown BOM. + + This is returned if BOM presence couldn't be determined and normally + happens because not enough bytes of input have been analysed. + */ + wxBOM_Unknown = -1, + + /** + No BOM. + + The stream doesn't contain BOM character at all. + */ + wxBOM_None, + + /** + UTF-32 big endian BOM. + + The stream is encoded in big endian variant of UTF-32. + */ + wxBOM_UTF32BE, + + /** + UTF-32 little endian BOM. + + The stream is encoded in little endian variant of UTF-32. + */ + wxBOM_UTF32LE, + + /** + UTF-16 big endian BOM. + + The stream is encoded in big endian variant of UTF-16. + */ + wxBOM_UTF16BE, + + /** + UTF-16 little endian BOM. + + The stream is encoded in little endian variant of UTF-16. + */ + wxBOM_UTF16LE, + + /** + UTF-8 BOM. + + The stream is encoded in UTF-8. 
+ + Notice that contrary to a popular belief, it's perfectly possible and, + in fact, common under Microsoft Windows systems, to have a BOM in an + UTF-8 stream: while it's not used to indicate the endianness of UTF-8 + stream (as it's byte-oriented), the BOM can still be useful just as an + unambiguous indicator of UTF-8 being used. + */ + wxBOM_UTF8 +}; + /** @class wxConvAuto @@ -66,6 +134,19 @@ public: */ wxConvAuto(wxFontEncoding enc = wxFONTENCODING_DEFAULT); + + /** + Return the detected BOM type. + + The BOM type is detected after sufficiently many initial bytes have + passed through this conversion object so it will always return + wxBOM_Unknown immediately after the object creation but may return a + different value later. + + @since 2.9.3 + */ + wxBOM GetBOM() const; + /** Disable the use of the fall back encoding: if the input doesn't have a BOM and is not valid UTF-8, the conversion will fail. @@ -92,5 +173,16 @@ public: @c wxFONTENCODING_DEFAULT can't be used here. */ static void SetFallbackEncoding(wxFontEncoding enc); -}; + /** + Return the BOM type of this buffer. + + This is a helper function which is normally only used internally by + wxConvAuto but provided for convenience of the code that wants to + detect the encoding of a stream by checking it for BOM presence on its + own. 
+ + @since 2.9.3 + */ + static wxBOM DetectBOM(const char *src, size_t srcLen); +}; diff --git a/src/common/convauto.cpp b/src/common/convauto.cpp index 8620d4e02e..7480754bb6 100644 --- a/src/common/convauto.cpp +++ b/src/common/convauto.cpp @@ -45,7 +45,7 @@ void wxConvAuto::SetFallbackEncoding(wxFontEncoding enc) } /* static */ -wxConvAuto::BOMType wxConvAuto::DetectBOM(const char *src, size_t srcLen) +wxBOM wxConvAuto::DetectBOM(const char *src, size_t srcLen) { // examine the buffer for BOM presence // @@ -65,14 +65,14 @@ wxConvAuto::BOMType wxConvAuto::DetectBOM(const char *src, size_t srcLen) switch ( srcLen ) { case 0: - return BOM_Unknown; + return wxBOM_Unknown; case 1: if ( src[0] == '\x00' || src[0] == '\xFF' || src[0] == '\xFE' || src[0] == '\xEF') { // this could be a BOM but we don't know yet - return BOM_Unknown; + return wxBOM_Unknown; } break; @@ -81,22 +81,22 @@ wxConvAuto::BOMType wxConvAuto::DetectBOM(const char *src, size_t srcLen) if ( src[0] == '\xEF' && src[1] == '\xBB' ) { if ( srcLen == 3 ) - return src[2] == '\xBF' ? BOM_UTF8 : BOM_None; + return src[2] == '\xBF' ? 
wxBOM_UTF8 : wxBOM_None; - return BOM_Unknown; + return wxBOM_Unknown; } if ( src[0] == '\xFE' && src[1] == '\xFF' ) - return BOM_UTF16BE; + return wxBOM_UTF16BE; if ( src[0] == '\xFF' && src[1] == '\xFE' ) { // if the next byte is 0, it could be an UTF-32LE BOM but if it // isn't we can be sure it's UTF-16LE if ( srcLen == 3 && src[2] != '\x00' ) - return BOM_UTF16LE; + return wxBOM_UTF16LE; - return BOM_Unknown; + return wxBOM_Unknown; } if ( src[0] == '\x00' && src[1] == '\x00' ) @@ -104,9 +104,9 @@ wxConvAuto::BOMType wxConvAuto::DetectBOM(const char *src, size_t srcLen) // this could only be UTF-32BE, check that the data we have so // far allows for it if ( srcLen == 3 && src[2] != '\xFE' ) - return BOM_None; + return wxBOM_None; - return BOM_Unknown; + return wxBOM_Unknown; } break; @@ -114,61 +114,61 @@ wxConvAuto::BOMType wxConvAuto::DetectBOM(const char *src, size_t srcLen) // we have at least 4 characters so we may finally decide whether // we have a BOM or not if ( src[0] == '\xEF' && src[1] == '\xBB' && src[2] == '\xBF' ) - return BOM_UTF8; + return wxBOM_UTF8; if ( src[0] == '\x00' && src[1] == '\x00' && src[2] == '\xFE' && src[3] == '\xFF' ) - return BOM_UTF32BE; + return wxBOM_UTF32BE; if ( src[0] == '\xFF' && src[1] == '\xFE' && src[2] == '\x00' && src[3] == '\x00' ) - return BOM_UTF32LE; + return wxBOM_UTF32LE; if ( src[0] == '\xFE' && src[1] == '\xFF' ) - return BOM_UTF16BE; + return wxBOM_UTF16BE; if ( src[0] == '\xFF' && src[1] == '\xFE' ) - return BOM_UTF16LE; + return wxBOM_UTF16LE; } - return BOM_None; + return wxBOM_None; } -void wxConvAuto::InitFromBOM(BOMType bomType) +void wxConvAuto::InitFromBOM(wxBOM bomType) { m_consumedBOM = false; switch ( bomType ) { - case BOM_Unknown: + case wxBOM_Unknown: wxFAIL_MSG( "shouldn't be called for this BOM type" ); break; - case BOM_None: + case wxBOM_None: // use the default break; - case BOM_UTF32BE: + case wxBOM_UTF32BE: m_conv = new wxMBConvUTF32BE; m_ownsConv = true; break; - case BOM_UTF32LE: + 
case wxBOM_UTF32LE: m_conv = new wxMBConvUTF32LE; m_ownsConv = true; break; - case BOM_UTF16BE: + case wxBOM_UTF16BE: m_conv = new wxMBConvUTF16BE; m_ownsConv = true; break; - case BOM_UTF16LE: + case wxBOM_UTF16LE: m_conv = new wxMBConvUTF16LE; m_ownsConv = true; break; - case BOM_UTF8: + case wxBOM_UTF8: InitWithUTF8(); break; @@ -191,25 +191,25 @@ void wxConvAuto::SkipBOM(const char **src, size_t *len) const int ofs; switch ( m_bomType ) { - case BOM_Unknown: + case wxBOM_Unknown: wxFAIL_MSG( "shouldn't be called for this BOM type" ); return; - case BOM_None: + case wxBOM_None: ofs = 0; break; - case BOM_UTF32BE: - case BOM_UTF32LE: + case wxBOM_UTF32BE: + case wxBOM_UTF32LE: ofs = 4; break; - case BOM_UTF16BE: - case BOM_UTF16LE: + case wxBOM_UTF16BE: + case wxBOM_UTF16LE: ofs = 2; break; - case BOM_UTF8: + case wxBOM_UTF8: ofs = 3; break; @@ -226,7 +226,7 @@ void wxConvAuto::SkipBOM(const char **src, size_t *len) const bool wxConvAuto::InitFromInput(const char *src, size_t len) { m_bomType = DetectBOM(src, len == wxNO_LEN ? strlen(src) : len); - if ( m_bomType == BOM_Unknown ) + if ( m_bomType == wxBOM_Unknown ) return false; InitFromBOM(m_bomType); @@ -275,7 +275,7 @@ wxConvAuto::ToWChar(wchar_t *dst, size_t dstLen, // try to convert using the auto-detected encoding size_t rc = m_conv->ToWChar(dst, dstLen, src, srcLen); - if ( rc == wxCONV_FAILED && m_bomType == BOM_None ) + if ( rc == wxCONV_FAILED && m_bomType == wxBOM_None ) { // if the conversion failed but we didn't really detect anything and // simply tried UTF-8 by default, retry it using the fall-back