/*
******************************************************************************
*
*   Copyright (C) 1998-2001, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
******************************************************************************
*
* File ustring.h
*
* Modification History:
*
*   Date        Name        Description
*   12/07/98    bertrand    Creation.
******************************************************************************
*/

/*
 * NOTE(review): this copy of the file appears to have lost text in four
 * places (each one is marked with its own NOTE(review) comment below).  The
 * damaged fragments are reproduced exactly as found; they do not compile and
 * must be restored from a pristine copy before the file is built.
 */

#include "unicode/utypes.h"
#include "unicode/ustring.h"
#include "unicode/putil.h"
#include "unicode/ucnv.h"
#include "cstring.h"
#include "cwchar.h"
#include "cmemory.h"
#include "umutex.h"
#include "ustr_imp.h"
#include "ucln_cmn.h"

/* forward declarations of definitions for the shared default converter */

/* Not referenced by any code that is visible in this copy of the file. */
static UConverter *gDefaultConverter = NULL;

/* ANSI string.h - style functions ------------------------------------------ */

/* Not referenced by any code that is visible in this copy of the file. */
#define MAX_STRLEN 0x0FFFFFFF

/* ---- String searching functions ---- */

/*
 * Find the first occurrence of the code unit c in the NUL-terminated string s.
 *
 * Returns a pointer to the matching code unit, or NULL if there is none.
 * Because the final test is *s == c, searching for c == 0 yields a pointer
 * to the terminating NUL.
 */
U_CAPI UChar* U_EXPORT2
u_strchr(const UChar *s, UChar c)
{
    while (*s && *s != c) {
        ++s;
    }
    if (*s == c)
        return (UChar *)s;
    return NULL;
}

/* A Boyer-Moore algorithm would be better, but that would require a hashtable
   because UChar is so big. This algorithm doesn't use a lot of extra memory.
 */
/*
 * Find the first occurrence of the NUL-terminated string substring in the
 * NUL-terminated string s by trying every start position in turn.
 *
 * Returns s itself when substring is empty, a pointer to the start of the
 * first match otherwise, or NULL when there is no match.
 */
U_CAPI UChar * U_EXPORT2
u_strstr(const UChar *s, const UChar *substring) {

    UChar *strItr, *subItr;

    if (*substring == 0) {
        return (UChar *)s;
    }

    do {
        strItr = (UChar *)s;
        subItr = (UChar *)substring;

        /* Only one string iterator needs checking for null terminator */
        while ((*strItr != 0) && (*strItr == *subItr)) {
            strItr++;
            subItr++;
        }

        if (*subItr == 0) {             /* Was the end of the substring reached? */
            return (UChar *)s;
        }

        s++;
        /*
         * If the comparison ran into the terminator of s, no later start
         * position has enough code units left to match, so the search ends.
         */
    } while (*strItr != 0);           /* Was the end of the string reached? */

    return NULL;                      /* No match */
}

/*
 * Find a code point c in the NUL-terminated string s.
 *
 * Visible behavior: values below 0xd800 are delegated to u_strchr(); values
 * up to 0xdfff (surrogate code points) are searched for with u_strchr() and
 * each hit is then examined against its neighboring code unit.
 */
U_CAPI UChar * U_EXPORT2
u_strchr32(const UChar *s, UChar32 c) {
    if(c < 0xd800) {
        /* non-surrogate BMP code point */
        return u_strchr(s, (UChar)c);
    } else if(c <= 0xdfff) {
        /* surrogate code point */
        UChar *t;

        for(;;) {
            t = u_strchr(s, (UChar)c);
            if(t == NULL) {
                return NULL;
            }
            if(
                UTF_IS_SURROGATE_FIRST(*t) ?
                    UTF_IS_TRAIL(*(t+1)) :
                    (s 0) {
        /*
         * NOTE(review): source text is missing immediately before the
         * " 0) {" above.  The remainder of u_strchr32 and everything up to
         * this point has been lost.  Judging only by the surviving body
         * (locals dst, src and n; append src to the end of dst, at most n
         * code units, then NUL-terminate), what follows looks like the tail
         * of a bounded string-append function -- presumably u_strncat;
         * confirm against a pristine copy.
         */
        UChar *anchor = dst;       /* save a pointer to start of dst */

        while(*dst != 0) {         /* To end of first string          */
            ++dst;
        }
        while((*dst = *src) != 0) { /* copy string 2 over              */
            ++dst;
            if(--n == 0) {
                /* n code units were appended: terminate and stop */
                *dst = 0;
                break;
            }
            ++src;
        }

        return anchor;
    } else {
        return dst;
    }
}

/* ----- Text property functions --- */

/*
 * Compare two NUL-terminated strings in code unit order.
 *
 * Returns the difference between the first pair of differing code units
 * (or 0 if both strings end together), each widened to int32_t.
 */
U_CAPI int32_t   U_EXPORT2
u_strcmp(const UChar *s1,
    const UChar *s2)
{
    UChar  c1, c2;

    for(;;) {
        c1=*s1++;
        c2=*s2++;
        if (c1 != c2 || c1 == 0) {
            break;
        }
    }
    return (int32_t)c1 - (int32_t)c2;
}

/* rotate surrogates to the top to get code point order; assume c>=0xd800 */
/*
 * Values >= 0xe000 are moved down by 0x800 and the remaining values (the
 * surrogate range, given the precondition) are moved up by 0x2000.  The
 * macro modifies its argument in place and evaluates it more than once.
 */
#define UTF16FIXUP(c) {                  \
    if ((c) >= 0xe000) {                 \
        (c) -= 0x800;                    \
    } else {                             \
        (c) += 0x2000;                   \
    }                                    \
}

/* String compare in code point order - u_strcmp() compares in code unit order. */
/*
 * Returns 0 when the NUL-terminated strings are equal; otherwise the
 * difference between the first pair of differing code units after both have
 * been passed through UTF16FIXUP (only when both are >= 0xD800).
 */
U_CAPI int32_t U_EXPORT2
u_strcmpCodePointOrder(const UChar *s1, const UChar *s2) {
    UChar c1, c2;

    /* compare identical prefixes - they do not need to be fixed up */
    for(;;) {
        c1=*s1++;
        c2=*s2++;
        if (c1 != c2) {
            break;
        }
        if (c1 == 0) {
            return 0;
        }
    }

    /* if both values are in or above the surrogate range, Fix them up. */
    if (c1 >= 0xD800 && c2 >= 0xD800) {
        UTF16FIXUP(c1);
        UTF16FIXUP(c2);
    }

    /* now c1 and c2 are in UTF-32-compatible order */
    return (int32_t)c1-(int32_t)c2;
}

/*
 * Compare at most n code units of two NUL-terminated strings in code unit
 * order.
 *
 * Returns 0 when n <= 0.  Otherwise returns the difference of the first
 * differing pair of code units, or 0 if a terminating NUL is reached in
 * both strings or n code units compared equal.
 */
U_CAPI int32_t   U_EXPORT2
u_strncmp(const UChar     *s1,
     const UChar     *s2,
     int32_t     n)
{
    if(n > 0) {
        int32_t rc;
        for(;;) {
            rc = (int32_t)*s1 - (int32_t)*s2;
            if(rc != 0 || *s1 == 0 || --n == 0) {
                return rc;
            }
            ++s1;
            ++s2;
        }
    } else {
        return 0;
    }
}

/*
 * Bounded variant of u_strcmpCodePointOrder(): compares at most n code
 * units.  Returns 0 when n <= 0, when both strings end together, or when n
 * code units compared equal.
 */
U_CAPI int32_t U_EXPORT2
u_strncmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t n) {
    UChar c1, c2;

    if(n<=0) {
        return 0;
    }

    /* compare identical prefixes - they do not need to be fixed up */
    for(;;) {
        c1=*s1;
        c2=*s2;
        if(c1==c2) {
            if(c1==0 || --n==0) {
                return 0;
            }
            ++s1;
            ++s2;
        } else {
            break;
        }
    }

    /* c1!=c2, fix up each one if they're both in or above the surrogate range, then compare them */
    if (c1 >= 0xD800 && c2 >= 0xD800) {
        UTF16FIXUP(c1);
        UTF16FIXUP(c2);
    }

    /* now c1 and c2 are in UTF-32-compatible order */
    return (int32_t)c1-(int32_t)c2;
}

/*
 * Copy the NUL-terminated string src, including its terminator, to dst.
 * No bounds are checked.  Returns the original dst pointer.
 */
U_CAPI UChar* U_EXPORT2
u_strcpy(UChar     *dst,
    const UChar     *src)
{
    UChar *anchor = dst;            /* save a pointer to start of dst */

    while((*(dst++) = *(src++)) != 0) {     /* copy string 2 over              */
    }

    return anchor;
}

/*
 * Copy at most n code units from src to dst, stopping after a NUL has been
 * copied.  If n code units are copied without meeting a NUL, dst is left
 * unterminated; the rest of dst is never padded.  Nothing is written when
 * n <= 0.  Returns the original dst pointer.
 */
U_CAPI UChar*  U_EXPORT2
u_strncpy(UChar     *dst,
     const UChar     *src,
     int32_t     n)
{
    UChar *anchor = dst;            /* save a pointer to start of dst */

    /* copy string 2 over */
    while(n > 0 && (*(dst++) = *(src++)) != 0) {
        --n;
    }

    return anchor;
}

/*
 * Return the number of code units in the NUL-terminated string s, not
 * counting the terminator.  When wchar_t and UChar have the same size the
 * work is delegated to uprv_wcslen(); otherwise the string is scanned here.
 */
U_CAPI int32_t   U_EXPORT2
u_strlen(const UChar *s)
{
#if U_SIZEOF_WCHAR_T == U_SIZEOF_UCHAR
    return uprv_wcslen(s);
#else
    const UChar *t = s;
    while(*t != 0) {
      ++t;
    }
    return t - s;
#endif
}

/*
 * Count the code points in a UTF-16 string.
 *
 * s      - the string; NULL yields 0
 * length - number of code units, or -1 if s is NUL-terminated;
 *          values below -1 yield 0
 *
 * A lead surrogate immediately followed by a trail surrogate counts as one
 * code point; every other code unit counts as one code point by itself.
 */
U_CAPI int32_t U_EXPORT2
u_countChar32(const UChar *s, int32_t length) {
    int32_t count;

    if(s==NULL || length<-1) {
        return 0;
    }

    count=0;
    if(length>=0) {
        while(length>0) {
            ++count;
            /* length>=2 keeps the look-ahead inside the given range */
            if(UTF_IS_LEAD(*s) && length>=2 && UTF_IS_TRAIL(*(s+1))) {
                s+=2;
                length-=2;
            } else {
                ++s;
                --length;
            }
        }
    } else /* length==-1 */ {
        UChar c;

        for(;;) {
            if((c=*s++)==0) {
                break;
            }
            ++count;

            /*
             * sufficient to look ahead one because of UTF-16;
             * safe to look ahead one because at worst that would be the terminating NUL
             */
            if(UTF_IS_LEAD(c) && UTF_IS_TRAIL(*s)) {
                ++s;
            }
        }
    }
    return count;
}

/*
 * Copy count code units from src to dest through uprv_memcpy(); the byte
 * count passed on is count*U_SIZEOF_UCHAR.  count is not range-checked
 * here.  Returns whatever uprv_memcpy() returns, cast to UChar *.
 */
U_CAPI UChar * U_EXPORT2
u_memcpy(UChar *dest, const UChar *src, int32_t count) {
    return (UChar *)uprv_memcpy(dest, src, count*U_SIZEOF_UCHAR);
}

/*
 * Same as u_memcpy() but goes through uprv_memmove().
 * NOTE(review): presumably this is the overlap-safe variant -- confirm
 * against the definition of uprv_memmove.
 */
U_CAPI UChar * U_EXPORT2
u_memmove(UChar *dest, const UChar *src, int32_t count) {
    return (UChar *)uprv_memmove(dest, src, count*U_SIZEOF_UCHAR);
}

/*
 * Store the code unit c into the first count elements of dest.
 * Nothing is written when count <= 0.  Returns dest.
 */
U_CAPI UChar * U_EXPORT2
u_memset(UChar *dest, UChar c, int32_t count) {
    if(count > 0) {
        UChar *ptr = dest;
        UChar *limit = dest + count;

        while (ptr < limit) {
            *(ptr++) = c;
        }
    }
    return dest;
}

/*
 * Compare the first count code units of buf1 and buf2 in code unit order;
 * NUL has no special meaning.
 *
 * Returns the difference of the first differing pair (each value taken as
 * an unsigned 16-bit quantity), or 0 if all count units are equal or
 * count <= 0.
 */
U_CAPI int32_t U_EXPORT2
u_memcmp(UChar *buf1, UChar *buf2, int32_t count) {
    if(count > 0) {
        UChar *limit = buf1 + count;
        int32_t result;

        while (buf1 < limit) {
            result = (int32_t)(uint16_t)*buf1 - (int32_t)(uint16_t)*buf2;
            if (result != 0) {
                return result;
            }
            buf1++;
            buf2++;
        }
    }
    return 0;
}

/*
 * Compare the first count code units of s1 and s2 in code point order;
 * NUL has no special meaning.
 *
 * Returns 0 when count <= 0 or all count units are equal; otherwise the
 * difference of the first differing pair after UTF16FIXUP has been applied
 * (only when both values are >= 0xD800).
 */
U_CAPI int32_t U_EXPORT2
u_memcmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t count) {
    const UChar *limit;
    UChar c1, c2;

    if(count<=0) {
        return 0;
    }

    limit=s1+count;

    /* compare identical prefixes - they do not need to be fixed up */
    for(;;) {
        c1=*s1;
        c2=*s2;
        if(c1!=c2) {
            break;
        }
        ++s1;
        ++s2;
        if(s1==limit) {
            return 0;
        }
    }

    /* c1!=c2, fix up each one if they're both in or above the surrogate range, then compare them */
    if (c1 >= 0xD800 && c2 >= 0xD800) {
        UTF16FIXUP(c1);
        UTF16FIXUP(c2);
    }

    /* now c1 and c2 are in UTF-32-compatible order */
    return (int32_t)c1-(int32_t)c2;
}

/*
 * Find the first occurrence of the code unit ch within the first count
 * code units of src.  Returns a pointer to it, or NULL if it is not found
 * or count <= 0.
 */
U_CAPI UChar * U_EXPORT2
u_memchr(UChar *src, UChar ch, int32_t count) {
    if(count > 0) {
        UChar *ptr = src;
        UChar *limit = src + count;

        do {
            if (*ptr == ch) {
                return ptr;
            }
        } while (++ptr < limit);
    }
    return NULL;
}

/*
 * Find the code point ch within the first count code units of src.
 *
 * Returns NULL when count <= 0 or ch is outside 0..0x10ffff.  Values up to
 * 0xffff are delegated to u_memchr(); larger values are searched for as a
 * lead/trail pair of code units.
 */
U_CAPI UChar * U_EXPORT2
u_memchr32(UChar *src, UChar32 ch, int32_t count) {
    if(count<=0 || (uint32_t)ch>0x10ffff) {
        return NULL; /* no string, or illegal arguments */
    }

    if(ch<=0xffff) {
        return u_memchr(src, (UChar)ch, count); /* BMP, single UChar */
    } else if(count<2) {
        return NULL; /* too short for a surrogate pair */
    } else {
        const UChar *limit=src+count-1; /* -1 so that we do not need a separate check for the trail unit */
        UChar lead=UTF16_LEAD(ch), trail=UTF16_TRAIL(ch);

        do {
            if(*src==lead && *(src+1)==trail) {
                return src;
            }
        } while(++src= 0x0030 && c <= 0x0037) {
        /*
         * NOTE(review): source text is missing immediately before the
         * "= 0x0030" above.  The end of u_memchr32 and everything up to
         * this point has been lost.  The surviving statements map the
         * code units 0x0030..0x0037 ('0'..'7') to 0..7 and return -1
         * otherwise; this is presumably the tail of the _digit8() helper
         * that u_unescapeAt() calls -- confirm against a pristine copy.
         */
        return (int8_t)(c - 0x0030);
    }
    return -1;
}

/* Convert one hex digit to a numeric value 0..F, or -1 on failure */
/* Accepts 0x30..0x39 ('0'..'9'), 0x41..0x46 ('A'..'F') and 0x61..0x66 ('a'..'f'). */
static int8_t _digit16(UChar c) {
    if (c >= 0x0030 && c <= 0x0039) {
        return (int8_t)(c - 0x0030);
    }
    if (c >= 0x0041 && c <= 0x0046) {
        return (int8_t)(c - (0x0041 - 10));
    }
    if (c >= 0x0061 && c <= 0x0066) {
        return (int8_t)(c - (0x0061 - 10));
    }
    return -1;
}

/* Parse a single escape sequence.  Although this method deals in
 * UChars, it does not use C++ or UnicodeString.  This allows it to
 * be used from C contexts. */
/*
 * charAt  - callback that returns the code unit at a given offset
 * offset  - in/out: index of the first code unit after the backslash;
 *           advanced past the code units consumed here
 * length  - number of code units available through charAt
 * context - opaque pointer handed through to charAt
 *
 * Numeric escapes handled by the visible code:
 *   u  followed by exactly 4 hex digits
 *   U  followed by exactly 8 hex digits
 *   x  followed by 1 or 2 hex digits
 *   an octal digit, optionally followed by up to 2 more octal digits
 * Too few digits is an error (jump to the err label).
 */
U_CAPI UChar32 U_EXPORT2
u_unescapeAt(UNESCAPE_CHAR_AT charAt,
             int32_t *offset,
             int32_t length,
             void *context) {

    int32_t start = *offset;
    UChar c;
    UChar32 result = 0;
    int8_t n = 0;              /* number of digits consumed so far */
    int8_t minDig = 0;         /* 0 means "not a numeric escape" */
    int8_t maxDig = 0;
    int8_t bitsPerDigit = 4;   /* 4 for hex digits, 3 for octal digits */
    int8_t dig;
    int32_t i;

    /* Check that offset is in range */
    if (*offset < 0 || *offset >= length) {
        goto err;
    }

    /* Fetch first UChar after '\\' */
    c = charAt((*offset)++, context);

    /* Convert hexadecimal and octal escapes */
    switch (c) {
    case 0x0075 /*'u'*/:
        minDig = maxDig = 4;
        break;
    case 0x0055 /*'U'*/:
        minDig = maxDig = 8;
        break;
    case 0x0078 /*'x'*/:
        minDig = 1;
        maxDig = 2;
        break;
    default:
        dig = _digit8(c);
        if (dig >= 0) {
            minDig = 1;
            maxDig = 3;
            n = 1; /* Already have first octal digit */
            bitsPerDigit = 3;
            result = dig;
        }
        break;
    }
    if (minDig != 0) {
        /* Accumulate digits until maxDig is reached, input ends, or a non-digit is seen */
        while (*offset < length && n < maxDig) {
            c = charAt(*offset, context);
            dig = (int8_t)((bitsPerDigit == 3) ? _digit8(c) : _digit16(c));
            if (dig < 0) {
                break;
            }
            result = (result << bitsPerDigit) | dig;
            ++(*offset);
            ++n;
        }
        if (n < minDig) {
            goto err;
        }
        return result;
    }

    /* Convert C-style escapes in table */
    for (i=0; i destCapacity) {
        /*
         * NOTE(review): source text is missing immediately before the
         * " destCapacity) {" above.  The rest of u_unescapeAt (the escape
         * table lookup, the fallback case and the err: label that the
         * gotos above target) and everything up to this point has been
         * lost.  The surviving statements clamp srcLen to destCapacity and
         * then call u_charsToUChars(src, dest, srcLen); this is presumably
         * the tail of the _appendUChars() helper used by u_unescape() --
         * confirm against a pristine copy.
         */
        srcLen = destCapacity;
    }
    u_charsToUChars(src, dest, srcLen);
}

/* Do an invariant conversion of char* -> UChar*, with escape parsing */
/*
 * src          - NUL-terminated char string that may contain backslash escapes
 * dest         - output buffer, or NULL to only compute the required size
 * destCapacity - capacity of dest in UChars
 *
 * Runs of ordinary characters are handed to _appendUChars(); every escape
 * is decoded with u_unescapeAt().  The running length i keeps growing even
 * when the output no longer fits, so the return value (i + 1, that is, the
 * length including the terminating NUL) reports the size that is needed.
 * A NUL is stored only if it fits.
 *
 * Returns 0 if an escape could not be parsed; in that case dest is set to
 * the empty string when it is non-NULL and has room for at least one UChar.
 */
U_CAPI int32_t U_EXPORT2
u_unescape(const char *src, UChar *dest, int32_t destCapacity) {
    const char *segment = src;   /* start of the pending run of ordinary characters */
    int32_t i = 0;               /* output length so far, in UChars */
    char c;

    while ((c=*src) != 0) {
        /* '\\' intentionally written as compiler-specific
         * character constant to correspond to compiler-specific
         * char* constants. */
        if (c == '\\') {
            int32_t lenParsed = 0;
            UChar32 c32;
            /* flush the ordinary characters that precede the backslash */
            if (src != segment) {
                if (dest != NULL) {
                    _appendUChars(dest + i, destCapacity - i,
                                  segment, src - segment);
                }
                i += src - segment;
            }
            ++src; /* advance past '\\' */
            c32 = u_unescapeAt(_charPtr_charAt, &lenParsed, uprv_strlen(src), (void*)src);
            /* nothing consumed means that the escape could not be parsed */
            if (lenParsed == 0) {
                goto err;
            }
            src += lenParsed; /* advance past escape seq. */
            if (dest != NULL && UTF_CHAR_LENGTH(c32) <= (destCapacity - i)) {
                UTF_APPEND_CHAR_UNSAFE(dest, i, c32);
            } else {
                /* does not fit, or only counting: just advance the length */
                i += UTF_CHAR_LENGTH(c32);
            }
            segment = src;
        } else {
            ++src;
        }
    }
    /* flush the trailing run of ordinary characters */
    if (src != segment) {
        if (dest != NULL) {
            _appendUChars(dest + i, destCapacity - i,
                          segment, src - segment);
        }
        i += src - segment;
    }
    if (dest != NULL && i < destCapacity) {
        dest[i] = 0;
    }
    return i + 1; /* add 1 for zero term */

 err:
    if (dest != NULL && destCapacity > 0) {
        *dest = 0;
    }
    return 0;
}

/* C UGrowBuffer implementation --------------------------------------------- */

/*
 * Replace *pBuffer with a newly allocated buffer of reqCapacity UChars.
 *
 * context     - compared against *pBuffer: an old buffer equal to context
 *               is not freed (it is the caller's static buffer)
 * pBuffer     - in/out: current buffer; receives the new buffer, or NULL
 *               if the allocation failed
 * pCapacity   - out: reqCapacity on success, 0 on failure
 * reqCapacity - requested capacity in UChars
 * length      - number of UChars to copy from the old buffer (if > 0)
 *
 * Returns TRUE if the allocation succeeded.  Note that the old buffer is
 * released (unless it equals context) even when the allocation fails.
 */
U_CAPI UBool /* U_CALLCONV U_EXPORT2 */
u_growBufferFromStatic(void *context,
                       UChar **pBuffer, int32_t *pCapacity, int32_t reqCapacity,
                       int32_t length) {
    UChar *newBuffer=(UChar *)uprv_malloc(reqCapacity*U_SIZEOF_UCHAR);
    if(newBuffer!=NULL) {
        if(length>0) {
            uprv_memcpy(newBuffer, *pBuffer, length*U_SIZEOF_UCHAR);
        }
        *pCapacity=reqCapacity;
    } else {
        *pCapacity=0;
    }

    /* release the old pBuffer if it was not statically allocated */
    if(*pBuffer!=(UChar *)context) {
        uprv_free(*pBuffer);
    }

    *pBuffer=newBuffer;
    return (UBool)(newBuffer!=NULL);
}

/* NUL-termination of strings ----------------------------------------------- */

/**
 * NUL-terminate a string no matter what its type.
 * Set warning and error codes accordingly.
 */
/*
 * NOTE(review): source text is missing inside this macro, between
 * "else if(length" and "destCapacity" on the damaged line below.  The
 * branches that presumably NUL-terminate dest and set a warning code have
 * been lost; only the final branch, which sets U_BUFFER_OVERFLOW_ERROR,
 * survives.  Restore from a pristine copy.
 *
 * What is visible: the macro does nothing unless pErrorCode is non-NULL and
 * holds a success code, and it ignores a negative length.  It expands to a
 * bare if statement, so it must be used as a statement of its own.
 */
#define __TERMINATE_STRING(dest, destCapacity, length, pErrorCode)      \
    if(pErrorCode!=NULL && U_SUCCESS(*pErrorCode)) {                    \
        /* not a public function, so no complete argument checking */   \
                                                                        \
        if(length<0) {                                                  \
            /* assume that the caller handles this */                   \
        } else if(lengthdestCapacity */ {                              \
            /* even the string itself did not fit - set an error code */ \
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;                        \
        }                                                               \
    }

/*
 * Apply __TERMINATE_STRING to a UChar buffer.
 * Returns length unchanged.
 */
U_CAPI int32_t U_EXPORT2
u_terminateUChars(UChar *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) {
    __TERMINATE_STRING(dest, destCapacity, length, pErrorCode);
    return length;
}

/*
 * Apply __TERMINATE_STRING to a char buffer.
 * Returns length unchanged.
 */
U_CAPI int32_t U_EXPORT2
u_terminateChars(char *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) {
    __TERMINATE_STRING(dest, destCapacity, length, pErrorCode);
    return length;
}

/*
 * Apply __TERMINATE_STRING to a UChar32 buffer.
 * Returns length unchanged.
 */
U_CAPI int32_t U_EXPORT2
u_terminateUChar32s(UChar32 *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) {
    __TERMINATE_STRING(dest, destCapacity, length, pErrorCode);
    return length;
}

/*
 * Apply __TERMINATE_STRING to a wchar_t buffer.
 * Returns length unchanged.
 */
U_CAPI int32_t U_EXPORT2
u_terminateWChars(wchar_t *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) {
    __TERMINATE_STRING(dest, destCapacity, length, pErrorCode);
    return length;
}