glibc/wcsmbs/c8rtomb.c

133 lines
3.5 KiB
C

/* UTF-8 to multibyte conversion.
Copyright (C) 2022-2023 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <errno.h>
#include <uchar.h>
#include <wchar.h>
/* This is the private state used if PS is NULL. */
static mbstate_t state;
size_t
c8rtomb (char *s, char8_t c8, mbstate_t *ps)
{
/* This implementation depends on the converter invoked by wcrtomb not
needing to retain state in either the top most bit of ps->__count or
in ps->__value between invocations. This implementation uses the
top most bit of ps->__count to indicate that trailing code units are
expected and uses ps->__value to store previously seen code units. */
wchar_t wc;
if (ps == NULL)
ps = &state;
if (s == NULL)
{
/* if 's' is a null pointer, behave as if u8'\0' was passed as 'c8'. If
this occurs for an incomplete code unit sequence, then an error will
be reported below. */
c8 = u8""[0];
}
if (! (ps->__count & 0x80000000))
{
/* Initial state. */
if ((c8 >= 0x80 && c8 <= 0xC1) || c8 >= 0xF5)
{
/* An invalid lead code unit. */
__set_errno (EILSEQ);
return -1;
}
if (c8 >= 0xC2)
{
/* A valid lead code unit. */
ps->__count |= 0x80000000;
ps->__value.__wchb[0] = c8;
ps->__value.__wchb[3] = 1;
return 0;
}
/* A single byte (ASCII) code unit. */
wc = c8;
}
else
{
char8_t cu1 = ps->__value.__wchb[0];
if (ps->__value.__wchb[3] == 1)
{
/* A single lead code unit was previously seen. */
if ((c8 < 0x80 || c8 > 0xBF)
|| (cu1 == 0xE0 && c8 < 0xA0)
|| (cu1 == 0xED && c8 > 0x9F)
|| (cu1 == 0xF0 && c8 < 0x90)
|| (cu1 == 0xF4 && c8 > 0x8F))
{
/* An invalid second code unit. */
__set_errno (EILSEQ);
return -1;
}
if (cu1 >= 0xE0)
{
/* A three or four code unit sequence. */
ps->__value.__wchb[1] = c8;
++ps->__value.__wchb[3];
return 0;
}
wc = ((cu1 & 0x1F) << 6)
+ (c8 & 0x3F);
}
else
{
char8_t cu2 = ps->__value.__wchb[1];
/* A three or four byte code unit sequence. */
if (c8 < 0x80 || c8 > 0xBF)
{
/* An invalid third or fourth code unit. */
__set_errno (EILSEQ);
return -1;
}
if (ps->__value.__wchb[3] == 2 && cu1 >= 0xF0)
{
/* A four code unit sequence. */
ps->__value.__wchb[2] = c8;
++ps->__value.__wchb[3];
return 0;
}
if (cu1 < 0xF0)
{
wc = ((cu1 & 0x0F) << 12)
+ ((cu2 & 0x3F) << 6)
+ (c8 & 0x3F);
}
else
{
char8_t cu3 = ps->__value.__wchb[2];
wc = ((cu1 & 0x07) << 18)
+ ((cu2 & 0x3F) << 12)
+ ((cu3 & 0x3F) << 6)
+ (c8 & 0x3F);
}
}
ps->__count &= 0x7fffffff;
ps->__value.__wch = 0;
}
return wcrtomb (s, wc, ps);
}