mirror of
https://sourceware.org/git/glibc.git
synced 2025-01-13 12:40:08 +00:00
8bcca1db3d
This change provides implementations for the mbrtoc8 and c8rtomb functions adopted for C++20 via WG21 P0482R6 and for C2X via WG14 N2653. It also provides the char8_t typedef from WG14 N2653. The mbrtoc8 and c8rtomb functions are declared in uchar.h in C2X mode or when the _GNU_SOURCE macro or C++20 __cpp_char8_t feature test macro is defined. The char8_t typedef is declared in uchar.h in C2X mode or when the _GNU_SOURCE macro is defined and the C++20 __cpp_char8_t feature test macro is not defined (if __cpp_char8_t is defined, then char8_t is a builtin type). Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
127 lines
3.5 KiB
C
127 lines
3.5 KiB
C
/* Multibyte to UTF-8 conversion.
|
|
Copyright (C) 2022 Free Software Foundation, Inc.
|
|
This file is part of the GNU C Library.
|
|
|
|
The GNU C Library is free software; you can redistribute it and/or
|
|
modify it under the terms of the GNU Lesser General Public
|
|
License as published by the Free Software Foundation; either
|
|
version 2.1 of the License, or (at your option) any later version.
|
|
|
|
The GNU C Library is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
Lesser General Public License for more details.
|
|
|
|
You should have received a copy of the GNU Lesser General Public
|
|
License along with the GNU C Library; if not, see
|
|
<https://www.gnu.org/licenses/>. */
|
|
|
|
#include <assert.h>
|
|
#include <dlfcn.h>
|
|
#include <errno.h>
|
|
#include <gconv.h>
|
|
#include <uchar.h>
|
|
#include <wcsmbsload.h>
|
|
|
|
#include <sysdep.h>
|
|
|
|
/* Compatibility fallback: if the headers included above did not define
   EILSEQ, alias it to EINVAL.  Nothing in the visible part of this file
   refers to EILSEQ directly.  */
#ifndef EILSEQ
|
|
# define EILSEQ EINVAL
|
|
#endif
|
|
|
|
|
|
/* Fallback conversion state.  mbrtoc8 substitutes this object whenever
   its PS argument is a null pointer, so every such call shares it.  As
   a file-scope static object it starts out zero-initialized.  */
|
|
static mbstate_t state;
|
|
|
|
size_t
|
|
mbrtoc8 (char8_t *pc8, const char *s, size_t n, mbstate_t *ps)
|
|
{
|
|
/* This implementation depends on the converter invoked by mbrtowc not
|
|
needing to retain state in either the top most bit of ps->__count or
|
|
in ps->__value between invocations. This implementation uses the
|
|
top most bit of ps->__count to indicate that trailing code units are
|
|
yet to be written and uses ps->__value to store those code units. */
|
|
|
|
if (ps == NULL)
|
|
ps = &state;
|
|
|
|
/* If state indicates that trailing code units are yet to be written, write
|
|
those first regardless of whether 's' is a null pointer. */
|
|
if (ps->__count & 0x80000000)
|
|
{
|
|
/* ps->__value.__wchb[3] stores the index of the next code unit to
|
|
write. Code units are stored in reverse order. */
|
|
size_t i = ps->__value.__wchb[3];
|
|
if (pc8 != NULL)
|
|
{
|
|
*pc8 = ps->__value.__wchb[i];
|
|
}
|
|
if (i == 0)
|
|
{
|
|
ps->__count &= 0x7fffffff;
|
|
ps->__value.__wch = 0;
|
|
}
|
|
else
|
|
--ps->__value.__wchb[3];
|
|
return -3;
|
|
}
|
|
|
|
if (s == NULL)
|
|
{
|
|
/* if 's' is a null pointer, behave as if a null pointer was passed for
|
|
'pc8', an empty string was passed for 's', and 1 passed for 'n'. */
|
|
pc8 = NULL;
|
|
s = "";
|
|
n = 1;
|
|
}
|
|
|
|
wchar_t wc;
|
|
size_t result;
|
|
|
|
result = mbrtowc (&wc, s, n, ps);
|
|
if (result <= n)
|
|
{
|
|
if (wc <= 0x7F)
|
|
{
|
|
if (pc8 != NULL)
|
|
*pc8 = wc;
|
|
}
|
|
else if (wc <= 0x7FF)
|
|
{
|
|
if (pc8 != NULL)
|
|
*pc8 = 0xC0 + ((wc >> 6) & 0x1F);
|
|
ps->__value.__wchb[0] = 0x80 + (wc & 0x3F);
|
|
ps->__value.__wchb[3] = 0;
|
|
ps->__count |= 0x80000000;
|
|
}
|
|
else if (wc <= 0xFFFF)
|
|
{
|
|
if (pc8 != NULL)
|
|
*pc8 = 0xE0 + ((wc >> 12) & 0x0F);
|
|
ps->__value.__wchb[1] = 0x80 + ((wc >> 6) & 0x3F);
|
|
ps->__value.__wchb[0] = 0x80 + (wc & 0x3F);
|
|
ps->__value.__wchb[3] = 1;
|
|
ps->__count |= 0x80000000;
|
|
}
|
|
else if (wc <= 0x10FFFF)
|
|
{
|
|
if (pc8 != NULL)
|
|
*pc8 = 0xF0 + ((wc >> 18) & 0x07);
|
|
ps->__value.__wchb[2] = 0x80 + ((wc >> 12) & 0x3F);
|
|
ps->__value.__wchb[1] = 0x80 + ((wc >> 6) & 0x3F);
|
|
ps->__value.__wchb[0] = 0x80 + (wc & 0x3F);
|
|
ps->__value.__wchb[3] = 2;
|
|
ps->__count |= 0x80000000;
|
|
}
|
|
}
|
|
if (result == 0 && wc != 0)
|
|
{
|
|
/* mbrtowc() never returns -3. When a MB sequence converts to multiple
|
|
WCs, no input is consumed when writing the subsequent WCs resulting
|
|
in a result of 0 even if a null character wasn't written. */
|
|
result = -3;
|
|
}
|
|
|
|
return result;
|
|
}
|