fd8737d3f0
* On output: detect them and transcode them properly into UTF-8. * On input: create input records for both halves of the surrogate pair. I tested by copying-and-pasting U+20000 (D840 DC00) into a mintty bash shell. It works, but the Windows console thinks the character occupies four cells when it really occupies two, so the cursor position is wrong. When I press backspace, it doesn't delete the whole character -- instead, it replaces it with a '?', because it becomes an invalid surrogate pair. Still, the behavior seems like an improvement.
53 lines
1.5 KiB
C
Executable File
53 lines
1.5 KiB
C
Executable File
#ifndef UNICODE_ENCODING_H
|
|
#define UNICODE_ENCODING_H
|
|
|
|
// Encode the Unicode code point `code` as UTF-8 into `out`.
//
// The buffer must be at least 4 bytes in size; no NUL terminator is written.
//
// Returns the number of bytes written (1 to 4), or 0 on an encoding error.
// On error nothing is written to `out`.  An encoding error occurs when
// `code` is 0x110000 or greater, or when it lies in the range 0xD800..0xDFFF:
// those values are reserved for UTF-16 surrogates and have no valid UTF-8
// representation.
static inline int encodeUtf8(char *out, unsigned int code) {
    if (code < 0x80) {
        // 0xxxxxxx
        out[0] = (char)code;
        return 1;
    }
    if (code < 0x800) {
        // 110xxxxx 10xxxxxx
        out[0] = (char)(((code >> 6) & 0x1F) | 0xC0);
        out[1] = (char)(((code >> 0) & 0x3F) | 0x80);
        return 2;
    }
    if (code < 0x10000) {
        if (code >= 0xD800 && code <= 0xDFFF) {
            // Surrogate code points must never appear in UTF-8 output.
            return 0;
        }
        // 1110xxxx 10xxxxxx 10xxxxxx
        out[0] = (char)(((code >> 12) & 0x0F) | 0xE0);
        out[1] = (char)(((code >> 6) & 0x3F) | 0x80);
        out[2] = (char)(((code >> 0) & 0x3F) | 0x80);
        return 3;
    }
    if (code < 0x110000) {
        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        out[0] = (char)(((code >> 18) & 0x07) | 0xF0);
        out[1] = (char)(((code >> 12) & 0x3F) | 0x80);
        out[2] = (char)(((code >> 6) & 0x3F) | 0x80);
        out[3] = (char)(((code >> 0) & 0x3F) | 0x80);
        return 4;
    }
    // Encoding error: beyond the last Unicode code point (0x10FFFF).
    return 0;
}
|
|
|
|
// Encode the Unicode code point `code` as UTF-16 into `out`.
//
// The buffer must be at least 2 elements in size; no terminator is written.
//
// Returns the number of elements written: 1 for a code point below 0x10000,
// 2 (a high surrogate followed by a low surrogate) for a code point in
// 0x10000..0x10FFFF.  Returns 0 on an encoding error, in which case nothing
// is written to `out`.  An encoding error occurs when `code` is 0x110000 or
// greater, or when it lies in the surrogate range 0xD800..0xDFFF: emitting
// such a value as a single unit would produce an unpaired surrogate.
static inline int encodeUtf16(wchar_t *out, unsigned int code) {
    if (code < 0x10000) {
        if (code >= 0xD800 && code <= 0xDFFF) {
            // A surrogate is not a character and cannot be encoded by itself.
            return 0;
        }
        out[0] = (wchar_t)code;
        return 1;
    }
    if (code < 0x110000) {
        // Split the 20-bit offset above 0x10000 into two 10-bit halves.
        const unsigned int offset = code - 0x10000;
        out[0] = (wchar_t)(0xD800 | (offset >> 10));
        out[1] = (wchar_t)(0xDC00 | (offset & 0x3FF));
        return 2;
    }
    // Encoding error: beyond the last Unicode code point (0x10FFFF).
    return 0;
}
|
|
|
|
// Combine a UTF-16 surrogate pair into the code point it encodes.
//
// `ch1` is expected to be a high surrogate (0xD800..0xDBFF) and `ch2` a low
// surrogate (0xDC00..0xDFFF); for such input the result lies in
// 0x10000..0x10FFFF.  The arguments are NOT validated here, so the caller
// must check them first -- for any other input the returned value is
// meaningless.
//
// The arithmetic is done in unsigned int so that out-of-range input merely
// wraps around instead of left-shifting a negative signed value, which is
// undefined behavior.
static inline unsigned int decodeSurrogatePair(wchar_t ch1, wchar_t ch2) {
    const unsigned int high = (unsigned int)ch1 - 0xD800u;
    const unsigned int low = (unsigned int)ch2 - 0xDC00u;
    return (high << 10) + low + 0x10000u;
}
|
|
|
|
#endif // UNICODE_ENCODING_H
|