An attempt at making the wxCSConv class useful. Uses iconv under Unix,

Internet codepages under Windows, wxEncodingConverter if all else fails.
Not really complete, not really optimized, nor really tested,
but I'll let you check whether it could be useful at least.


git-svn-id: https://svn.wxwidgets.org/svn/wx/wxWidgets/trunk@7771 c3d73ce0-8a6f-49c7-b76d-6d57e0e08775
This commit is contained in:
Ove Kaaven 2000-07-19 08:25:15 +00:00
parent 66b3ec7f9a
commit 1cd5241886

View File

@ -28,6 +28,7 @@
#pragma hdrstop
#endif
#include <errno.h>
#include <ctype.h>
#include <string.h>
#include <stdlib.h>
@ -36,9 +37,36 @@
#include <clib.h>
#endif
#ifdef HAVE_ICONV_H
#include <iconv.h>
#endif
#ifdef HAVE_LANGINFO_H
#include <langinfo.h>
#endif
#include "wx/debug.h"
#include "wx/strconv.h"
// Byte-order helpers for wide character buffers.
// BSWAP_UCS4/BSWAP_UCS2 byte-swap, in place, the first `len` elements of
// `str` (32-bit resp. 16-bit units). When WORDS_BIGENDIAN is defined they
// expand to nothing; otherwise WC_NEED_BSWAP is defined as well so that
// code can tell that a swap is really performed.
// The swapping variants expand to a brace-enclosed block and use their
// arguments unparenthesized, so pass plain identifiers and do not rely on
// them behaving like a single expression statement.
#ifdef WORDS_BIGENDIAN
#define BSWAP_UCS4(str, len)
#define BSWAP_UCS2(str, len)
#else
#define BSWAP_UCS4(str, len) { unsigned _c; for (_c=0; _c<len; _c++) str[_c]=wxUINT32_SWAP_ALWAYS(str[_c]); }
#define BSWAP_UCS2(str, len) { unsigned _c; for (_c=0; _c<len; _c++) str[_c]=wxUINT16_SWAP_ALWAYS(str[_c]); }
#define WC_NEED_BSWAP
#endif
// the UTF-32/UTF-16 names are plain aliases of the UCS-4/UCS-2 helpers
#define BSWAP_UTF32(str, len) BSWAP_UCS4(str, len)
#define BSWAP_UTF16(str, len) BSWAP_UCS2(str, len)
// Select, from the size of wchar_t, the charset name used for wchar_t data
// (WC_NAME), the matching swap helper (WC_BSWAP) and, for 2-byte wchar_t,
// the WC_UTF16 marker. Nothing is defined for any other wchar_t size.
#if SIZEOF_WCHAR_T == 4
#define WC_NAME "UCS4"
#define WC_BSWAP BSWAP_UCS4
#elif SIZEOF_WCHAR_T == 2
#define WC_NAME "UTF16"
#define WC_BSWAP BSWAP_UTF16
#define WC_UTF16
#endif
// ----------------------------------------------------------------------------
// globals
// ----------------------------------------------------------------------------
@ -51,6 +79,38 @@ WXDLLEXPORT_DATA(wxMBConv *) wxConvCurrent = &wxConvLibc;
#if wxUSE_WCHAR_T
// Encode a single value as UTF-16.
//   input  - the value to encode
//   output - receives the 16-bit unit(s); may be NULL to only query how
//            many units are needed
// Returns the number of 16-bit units (1 or 2), or (size_t)-1 when the
// value is 0x110000 or larger and therefore cannot be encoded.
static size_t encode_utf16(wxUint32 input,wxUint16*output)
{
  // out of range for UTF-16
  if (input>=0x110000)
    return (size_t)-1;

  // fits into one unit: stored unchanged
  if (input<=0xffff) {
    if (output)
      output[0] = input;
    return 1;
  }

  // needs a surrogate pair: high unit first, low unit second
  if (output) {
    output[0] = (input >> 10)+0xd7c0;
    output[1] = (input&0x3ff)+0xdc00;
  }
  return 2;
}
// Decode one value from a NUL-terminated UTF-16 sequence.
//   input  - points at the first 16-bit unit to decode
//   output - receives the decoded value; on error it receives the
//            offending first unit unchanged
// Returns the number of units consumed (1 or 2), or (size_t)-1 when the
// sequence is malformed: a low surrogate (0xdc00-0xdfff) in leading
// position, or a high surrogate that is not followed by a low surrogate.
// input[1] is only read after a high surrogate has been seen.
static size_t decode_utf16(wxUint16*input,wxUint32&output)
{
  const wxUint16 first = input[0];

  // anything outside the surrogate range stands for itself
  if ((first<0xd800) || (first>0xdfff)) {
    output = first;
    return 1;
  }

  // a sequence must not start with a low surrogate
  if (first>=0xdc00) {
    output = first;
    return (size_t)-1;
  }

  // the second unit must be a low surrogate; 0xdfff itself is valid
  const wxUint16 second = input[1];
  if ((second<0xdc00) || (second>0xdfff)) {
    output = first;
    return (size_t)-1;
  }

  output = ((wxUint32)(first - 0xd7c0) << 10) + (second - 0xdc00);
  return 2;
}
// ----------------------------------------------------------------------------
// wxMBConv
// ----------------------------------------------------------------------------
@ -187,7 +247,7 @@ size_t wxMBConvUTF7::WC2MB(char * WXUNUSED(buf),
WXDLLEXPORT_DATA(wxMBConvUTF8) wxConvUTF8;
// utf8_max[i] is the largest value that still fits into an (i+1)-byte
// UTF-8 sequence; the final 0xffffffff entry terminates the search loops
// that scan this table. Defined exactly once, with a fixed 32-bit element
// type (the table used to be declared a second time as unsigned long).
static wxUint32 utf8_max[]={0x7f,0x7ff,0xffff,0x1fffff,0x3ffffff,0x7fffffff,0xffffffff};
size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
@ -208,7 +268,7 @@ size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
return (size_t)-1;
} else {
unsigned ocnt=cnt-1;
unsigned long res=cc&(0x3f>>cnt);
wxUint32 res=cc&(0x3f>>cnt);
while (cnt--) {
cc = *psz++;
if ((cc&0xC0)!=0x80) {
@ -221,8 +281,16 @@ size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
// illegal UTF-8 encoding
return (size_t)-1;
}
#ifdef WC_UTF16
size_t pa = encode_utf16(res, buf);
if (pa == (size_t)-1)
return (size_t)-1;
if (buf) buf+=pa;
len+=pa;
#else
if (buf) *buf++=res;
len++;
#endif
}
}
}
@ -235,7 +303,13 @@ size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
size_t len = 0;
while (*psz && ((!buf) || (len<n))) {
unsigned long cc=(*psz++)&0x7fffffff;
wxUint32 cc;
#ifdef WC_UTF16
size_t pa = decode_utf16(psz,cc);
psz += (pa == (size_t)-1) ? 1 : pa;
#else
cc=(*psz++)&0x7fffffff;
#endif
unsigned cnt;
for (cnt=0; cc>utf8_max[cnt]; cnt++);
if (!cnt) {
@ -264,27 +338,217 @@ WXDLLEXPORT_DATA(wxCSConv) wxConvLocal((const wxChar *)NULL);
#include "wx/encconv.h"
#include "wx/fontmap.h"
// TODO: add some tables here
// - perhaps common encodings to common codepages (for Win32)
// - perhaps common encodings to objects ("UTF8" -> wxConvUTF8)
// - move wxEncodingConverter meat in here
#ifdef __WIN32__
#include "wx/msw/registry.h"
// Look up the Windows code page for a MIME charset name in the registry
// (HKEY_CLASSES_ROOT\MIME\Database\Charset\<name>).
//   name - charset name; NULL selects the current ANSI code page
// Returns the code page, or -1 when no entry could be resolved.
// NOTE(review): the registry database is presumably installed together
// with Internet Explorer - confirm on systems without it.
static long CharsetToCodepage(const wxChar *name)
{
  if (!name) return GetACP();
  long CP=-1;
  wxString cn(name);
  // Aliases may point at other aliases; follow only a bounded number of
  // links so that a cyclic alias chain in the registry cannot hang us.
  const int maxAliasHops = 16;
  for (int hop=0; hop<maxAliasHops; hop++) {
    wxString path(wxT("MIME\\Database\\Charset\\"));
    path += cn;
    wxRegKey key(wxRegKey::HKCR,path);
    /* two cases: either there's an AliasForCharset string,
     * or there are Codepage and InternetEncoding dwords.
     * The InternetEncoding gives us the actual encoding,
     * the Codepage just says which Windows character set to
     * use when displaying the data.
     */
    if (key.QueryValue(wxT("InternetEncoding"),&CP)) break;
    // no encoding, see if it's an alias
    if (!key.QueryValue(wxT("AliasForCharset"),cn)) break;
  }
  return CP;
}
#endif
// Common base of the character set back ends. It only remembers the
// charset name; both conversions fail and usable() reports FALSE, so a
// derived class has to override whatever it actually supports.
class wxCharacterSet
{
public:
  wxCharacterSet(const wxChar*name)
    : cname(name)
  {
  }

  virtual ~wxCharacterSet()
  {
  }

  // multibyte -> wide characters; always fails in the base class
  virtual size_t MB2WC(wchar_t* /*buf*/, const char* /*psz*/, size_t /*n*/)
  {
    return (size_t)-1;
  }

  // wide characters -> multibyte; always fails in the base class
  virtual size_t WC2MB(char* /*buf*/, const wchar_t* /*psz*/, size_t /*n*/)
  {
    return (size_t)-1;
  }

  // whether this object is able to convert anything at all
  virtual bool usable()
  {
    return FALSE;
  }

  // charset name as given to the constructor (pointer is stored as is)
  const wxChar*cname;
};
// Character set that forwards every conversion to an existing wxMBConv
// object. Without a converter both conversions fail with (size_t)-1 and
// usable() is false.
class ID_CharSet : public wxCharacterSet
{
public:
  ID_CharSet(const wxChar*name,wxMBConv*cnv)
    : wxCharacterSet(name), work(cnv)
  {
  }

  size_t MB2WC(wchar_t*buf, const char*psz, size_t n)
  {
    if (!work)
      return (size_t)-1;
    return work->MB2WC(buf,psz,n);
  }

  size_t WC2MB(char*buf, const wchar_t*psz, size_t n)
  {
    if (!work)
      return (size_t)-1;
    return work->WC2MB(buf,psz,n);
  }

  bool usable()
  {
    return NULL!=work;
  }

  // converter doing the real work (only the pointer is stored here)
  wxMBConv*work;
};
#ifdef HAVE_ICONV_H
// Character set back end built on iconv().
// Two conversion descriptors are opened lazily on first use: m2w converts
// from the named charset to WC_NAME, w2m converts the other way. Both are
// closed in the destructor. A descriptor that could not be opened stays
// (iconv_t)-1 and the corresponding conversion reports (size_t)-1.
class IC_CharSet : public wxCharacterSet
{
public:
  iconv_t m2w, w2m;
  IC_CharSet(const wxChar*name) : wxCharacterSet(name), m2w((iconv_t)-1), w2m((iconv_t)-1) {}
  ~IC_CharSet() {
    if (m2w!=(iconv_t)-1) iconv_close(m2w);
    if (w2m!=(iconv_t)-1) iconv_close(w2m);
  }
  // open the descriptors on demand; a failed open is retried on next use
  void LoadM2W() { if (m2w==(iconv_t)-1) m2w=iconv_open(WC_NAME,wxConvLibc.cWX2MB(cname)); }
  void LoadW2M() { if (w2m==(iconv_t)-1) w2m=iconv_open(wxConvLibc.cWX2MB(cname),WC_NAME); }

  // Convert the NUL-terminated multibyte string psz to wide characters.
  //   buf - destination for at most n wide characters, or NULL to only
  //         compute how many wide characters the result needs
  // Returns the number of wide characters produced, or (size_t)-1 if the
  // descriptor could not be opened or iconv() reported an error (which
  // includes a destination buffer that is too small).
  // No terminating NUL is written: only strlen(psz) bytes are converted.
  size_t MB2WC(wchar_t*buf, const char*psz, size_t n) {
    LoadM2W();
    // never hand an invalid descriptor to iconv()
    if (m2w==(iconv_t)-1) return (size_t)-1;
    // start from the initial conversion state for every string
    iconv(m2w,NULL,NULL,NULL,NULL);
    size_t inbuf = strlen(psz);
    size_t res, cres;
    if (buf) {
      // have destination buffer, convert there; iconv() advances the
      // output pointer, so give it a cursor and keep buf at the start
      char*out = (char*)buf;
      size_t outbuf = n*SIZEOF_WCHAR_T;
      cres = iconv(m2w,&psz,&inbuf,&out,&outbuf);
      res = n-(outbuf/SIZEOF_WCHAR_T);
      // convert the characters just written to native endianness
      WC_BSWAP(buf, res)
    } else {
      // no destination buffer... convert using temp buffer
      // to calculate destination buffer requirement
      wchar_t tbuf[8];
      res = 0;
      do {
        char*out = (char*)tbuf;
        size_t outbuf = 8*SIZEOF_WCHAR_T;
        cres = iconv(m2w,&psz,&inbuf,&out,&outbuf);
        res += 8-(outbuf/SIZEOF_WCHAR_T);
      } while ((cres==(size_t)-1) && (errno==E2BIG));
    }
    if (cres==(size_t)-1) return (size_t)-1;
    return res;
  }

  // Convert the NUL-terminated wide string psz to the multibyte charset.
  //   buf - destination for at most n bytes, or NULL to only compute how
  //         many bytes the result needs
  // Returns the number of bytes produced, or (size_t)-1 on failure
  // (descriptor not available, out of memory, or an iconv() error).
  size_t WC2MB(char*buf, const wchar_t*psz, size_t n) {
    LoadW2M();
    // never hand an invalid descriptor to iconv()
    if (w2m==(iconv_t)-1) return (size_t)-1;
#if defined(__BORLANDC__) && (__BORLANDC__ > 0x530)
    size_t inlen = std::wcslen(psz);
#else
    size_t inlen = ::wcslen(psz);
#endif
    // iconv() measures its input in bytes, not in wide characters
    size_t inbuf = inlen*SIZEOF_WCHAR_T;
    size_t res, cres;
#ifdef WC_NEED_BSWAP
    // need to copy to temp buffer to switch endianness
    // (doing WC_BSWAP twice on the original buffer won't help, as it
    // could be in read-only memory, or be accessed in some other thread)
    wchar_t*tmpbuf=(wchar_t*)malloc((inlen+1)*SIZEOF_WCHAR_T);
    if (!tmpbuf) return (size_t)-1;
    memcpy(tmpbuf,psz,(inlen+1)*SIZEOF_WCHAR_T);
    WC_BSWAP(tmpbuf, inlen)
    psz=tmpbuf;
#endif
    const char*in = (const char*)psz;
    // start from the initial conversion state for every string
    iconv(w2m,NULL,NULL,NULL,NULL);
    if (buf) {
      // have destination buffer, convert there
      size_t outbuf = n;
      cres = iconv(w2m,&in,&inbuf,&buf,&outbuf);
      res = n-outbuf;
    } else {
      // no destination buffer... convert using temp buffer
      // to calculate destination buffer requirement
      char tbuf[16];
      res = 0;
      do {
        char*out = tbuf;
        size_t outbuf = 16;
        cres = iconv(w2m,&in,&inbuf,&out,&outbuf);
        res += 16 - outbuf;
      } while ((cres==(size_t)-1) && (errno==E2BIG));
    }
#ifdef WC_NEED_BSWAP
    free(tmpbuf);
#endif
    if (cres==(size_t)-1) return (size_t)-1;
    return res;
  }

  // Always reports TRUE: the descriptors are only opened on first use, so
  // an unsupported charset shows up as (size_t)-1 from the conversions.
  bool usable() { return TRUE; }
};
#endif
#ifdef __WIN32__
// Character set back end using the Win32 code page conversion functions.
// The code page is resolved once, in the constructor, through
// CharsetToCodepage(); -1 means that no code page was found and makes the
// object unusable.
class CP_CharSet : public wxCharacterSet
{
public:
  long CodePage;
  CP_CharSet(const wxChar*name) : wxCharacterSet(name), CodePage(CharsetToCodepage(name)) {}

  // Multibyte -> wide. With buf==NULL only the required length is
  // computed. Returns the string length without the terminating NUL, or
  // (size_t)-1 if the API call failed.
  size_t MB2WC(wchar_t*buf, const char*psz, size_t n) {
    size_t len = MultiByteToWideChar(CodePage,0,psz,-1,buf,buf?n:0);
    // the source length is passed as -1, so the count reported by the API
    // includes the terminating NUL; the other converters in this file
    // report the length without it
    return len?(len-1):(size_t)-1;
  }

  // Wide -> multibyte, same conventions as MB2WC().
  size_t WC2MB(char*buf, const wchar_t*psz, size_t n) {
    size_t len = WideCharToMultiByte(CodePage,0,psz,-1,buf,buf?n:0,NULL,NULL);
    // see MB2WC(): do not count the terminating NUL
    return len?(len-1):(size_t)-1;
  }

  bool usable() { return CodePage!=-1; }
};
#endif
class EC_CharSet : public wxCharacterSet
{
public:
// temporarily just use wxEncodingConverter stuff,
// so that it works while a better implementation is built
wxFontEncoding enc;
wxEncodingConverter m2w, w2m;
wxCharacterSet(wxFontEncoding e) : enc(e)
EC_CharSet(const wxChar*name) : wxCharacterSet(name), enc(wxFONTENCODING_SYSTEM)
{
if (name) enc = wxTheFontMapper->CharsetToEncoding(name, FALSE);
m2w.Init(enc, wxFONTENCODING_UNICODE);
w2m.Init(wxFONTENCODING_UNICODE, enc);
}
size_t MB2WC(wchar_t*buf, const char*psz, size_t n) {
size_t inbuf = strlen(psz);
fprintf(stderr,"EC Convert to WC using %d\n",enc);
if (buf) m2w.Convert(psz,buf);
return inbuf;
}
size_t WC2MB(char*buf, const wchar_t*psz, size_t n) {
#if defined(__BORLANDC__) && (__BORLANDC__ > 0x530)
size_t inbuf = std::wcslen(psz);
#else
size_t inbuf = ::wcslen(psz);
#endif
fprintf(stderr,"EC Convert from WC using %d\n",enc);
if (buf) w2m.Convert(psz,buf);
return inbuf;
}
bool usable() { return (enc!=wxFONTENCODING_SYSTEM) && (enc!=wxFONTENCODING_DEFAULT); }
};
// Create the character set object used to convert to/from `name`.
// Back ends are tried in this order and the first usable one is returned:
//   1. the built-in UTF-8 converter, for "UTF8"/"UTF-8" (any case)
//   2. iconv, when HAVE_ICONV_H is defined (needs a non-NULL name)
//   3. Windows code pages, under __WIN32__ (accepts a NULL name)
//   4. wxEncodingConverter
// Returns a new object owned by the caller, or NULL when no back end is
// usable.
static wxCharacterSet *wxGetCharacterSet(const wxChar *name)
{
  wxCharacterSet *cset = NULL;
  if (name) {
    if (!wxStricmp(name, wxT("UTF8")) || !wxStricmp(name, wxT("UTF-8"))) {
      cset = new ID_CharSet(name, &wxConvUTF8);
    } else {
#ifdef HAVE_ICONV_H
      cset = new IC_CharSet(name); // may not take NULL
#endif
    }
  }
  if (cset) {
    if (cset->usable()) return cset;
    // forget the rejected candidate so that it cannot be deleted twice
    delete cset;
    cset = NULL;
  }
#ifdef __WIN32__
  cset = new CP_CharSet(name); // may take NULL
  if (cset->usable()) return cset;
  delete cset;
  cset = NULL;
#endif
  cset = new EC_CharSet(name);
  if (cset->usable()) return cset;
  delete cset;
  return NULL;
}
wxCSConv::wxCSConv(const wxChar *charset)
@ -315,10 +579,22 @@ void wxCSConv::LoadNow()
if (m_deferred) {
if (!m_name) {
#ifdef __UNIX__
wxChar *lang = wxGetenv(wxT("LC_ALL"));
if (!lang) lang = wxGetenv(wxT("LANG"));
wxChar *dot = lang ? wxStrchr(lang, wxT('.')) : (wxChar *)NULL;
if (dot) SetName(dot+1);
#if defined(HAVE_LANGINFO_H) && defined(CODESET)
// GNU libc provides current character set this way
char*alang = nl_langinfo(CODESET);
if (alang) SetName(wxConvLibc.cMB2WX(alang));
else
#endif
// if we can't get at the character set directly,
// try to see if it's in the environment variables
// (in most cases this won't work, but I was out of ideas)
{
wxChar *lang = wxGetenv(wxT("LC_ALL"));
if (!lang) lang = wxGetenv(wxT("LC_CTYPE"));
if (!lang) lang = wxGetenv(wxT("LANG"));
wxChar *dot = lang ? wxStrchr(lang, wxT('.')) : (wxChar *)NULL;
if (dot) SetName(dot+1);
}
#endif
}
m_cset = wxGetCharacterSet(m_name);
@ -329,39 +605,72 @@ void wxCSConv::LoadNow()
// Convert the NUL-terminated multibyte string psz to wide characters.
//   buf - destination for at most n wide characters, or NULL to only
//         compute the length of the result
// The work is delegated to the character set object when one could be
// created; otherwise the input is treated as latin-1 and every byte is
// widened directly.
// In the latin-1 path at most n elements are written, the terminating NUL
// only if it still fits, and the number of characters stored (without the
// NUL) is returned; with buf==NULL that is simply strlen(psz).
// NOTE(review): this is mbstowcs()-like truncation - confirm that no
// caller relies on a terminator being stored beyond n.
size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
  ((wxCSConv *)this)->LoadNow(); // discard constness
  if (m_cset)
    return m_cset->MB2WC(buf, psz, n);

  // latin-1 (direct)
  size_t len=strlen(psz);
  if (buf) {
    // never write past the n elements the caller provided
    if (len>n) len=n;
    for (size_t c=0; c<len; c++)
      buf[c] = (unsigned char)(psz[c]);
    if (len<n) buf[len] = 0;
  }
  return len;
}
// Convert the NUL-terminated wide string psz to multibyte characters.
//   buf - destination for at most n bytes, or NULL to only compute the
//         length of the result
// The work is delegated to the character set object when one could be
// created; otherwise the output is latin-1: characters above 0xff are
// replaced by '?', all others are narrowed directly.
// In the latin-1 path at most n bytes are written, the terminating NUL
// only if it still fits, and the number of characters stored (without the
// NUL) is returned; with buf==NULL that is simply wcslen(psz).
// NOTE(review): this is wcstombs()-like truncation - confirm that no
// caller relies on a terminator being stored beyond n.
size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{
  ((wxCSConv *)this)->LoadNow(); // discard constness
  if (m_cset)
    return m_cset->WC2MB(buf, psz, n);

  // latin-1 (direct)
#if defined(__BORLANDC__) && (__BORLANDC__ > 0x530)
  size_t len=std::wcslen(psz);
#else
  size_t len=::wcslen(psz);
#endif
  if (buf) {
    // never write past the n bytes the caller provided
    if (len>n) len=n;
    for (size_t c=0; c<len; c++)
      buf[c] = (psz[c]>0xff) ? '?' : psz[c];
    if (len<n) buf[len] = 0;
  }
  return len;
}
#ifdef HAVE_ICONV_H
// Converts multibyte text directly between two iconv character sets.
// The descriptor is opened in the constructor and closed in the
// destructor; if it could not be opened, Convert() fails.
class IC_CharSetConverter
{
public:
  iconv_t cnv;
  IC_CharSetConverter(IC_CharSet*from,IC_CharSet*to) {
    cnv=iconv_open(wxConvLibc.cWX2MB(to->cname),wxConvLibc.cWX2MB(from->cname));
  }
  ~IC_CharSetConverter() {
    if (cnv!=(iconv_t)-1) iconv_close(cnv);
  }

  // Convert the NUL-terminated string psz into buf (room for n bytes).
  // Returns the number of bytes written, or (size_t)-1 when the
  // descriptor is not available or iconv() reported an error.
  size_t Convert(char*buf, const char*psz, size_t n) {
    // never hand an invalid descriptor to iconv()
    if (cnv==(iconv_t)-1) return (size_t)-1;
    size_t inbuf = strlen(psz);
    size_t outbuf = n;
    size_t res = iconv(cnv,&psz,&inbuf,&buf,&outbuf);
    if (res==(size_t)-1) return (size_t)-1;
    return n-outbuf;
  }
};
#endif
// Converts multibyte text between the encodings of two EC_CharSet objects
// by means of a wxEncodingConverter set up in the constructor.
class EC_CharSetConverter
{
public:
  EC_CharSetConverter(EC_CharSet*from,EC_CharSet*to)
  {
    cnv.Init(from->enc,to->enc);
  }

  // Convert psz into buf when buf is given; the size argument is not
  // used. The result is always the length of the source string.
  size_t Convert(char*buf, const char*psz, size_t /*n*/)
  {
    const size_t srclen = strlen(psz);
    if (!buf)
      return srclen;
    cnv.Convert(psz,buf);
    return srclen;
  }

  wxEncodingConverter cnv;
};
#else // !wxUSE_WCHAR_T
// ----------------------------------------------------------------------------