2001-05-10 16:54:09 +00:00
|
|
|
/*
|
|
|
|
*******************************************************************************
|
|
|
|
*
|
|
|
|
* Copyright (C) 1998-2001, International Business Machines
|
|
|
|
* Corporation and others. All Rights Reserved.
|
|
|
|
*
|
|
|
|
*******************************************************************************
|
|
|
|
*
|
|
|
|
* File ucbuf.c
|
|
|
|
*
|
|
|
|
* Modification History:
|
|
|
|
*
|
|
|
|
* Date Name Description
|
|
|
|
* 05/10/01 Ram Creation.
|
|
|
|
*******************************************************************************
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include "unicode/utypes.h"
|
|
|
|
#include "unicode/ucnv.h"
|
2001-11-01 19:43:21 +00:00
|
|
|
#include "unicode/ucnv_err.h"
|
2001-05-10 16:54:09 +00:00
|
|
|
#include "filestrm.h"
|
2002-10-10 01:04:15 +00:00
|
|
|
#include "cstring.h"
|
2001-05-10 16:54:09 +00:00
|
|
|
#include "cmemory.h"
|
2002-10-10 01:04:15 +00:00
|
|
|
#include "ustrfmt.h"
|
2001-05-10 16:54:09 +00:00
|
|
|
#include "unicode/ustring.h"
|
2002-11-13 03:22:10 +00:00
|
|
|
#include "unicode/uchar.h"
|
2001-05-10 16:54:09 +00:00
|
|
|
#include "ucbuf.h"
|
2001-11-01 19:43:21 +00:00
|
|
|
#include <stdio.h>
|
2001-05-16 01:09:06 +00:00
|
|
|
|
2001-05-22 18:00:55 +00:00
|
|
|
/* Size, in bytes, of the on-stack chunk used to read the file in buffered mode. */
#define MAX_IN_BUF 1000

/* Capacity, in UChars, of the internal buffer allocated in buffered mode. */
#define MAX_U_BUF 1500

/* Size of the scratch arrays used to report the bytes around a conversion error. */
#define CONTEXT_LEN 15
|
2001-05-10 16:54:09 +00:00
|
|
|
|
2001-11-03 02:54:08 +00:00
|
|
|
struct UCHARBUF {
|
|
|
|
UChar* buffer;
|
|
|
|
UChar* currentPos;
|
|
|
|
UChar* bufLimit;
|
2002-10-10 01:04:15 +00:00
|
|
|
int32_t bufCapacity;
|
2001-11-03 02:54:08 +00:00
|
|
|
int32_t remaining;
|
2002-10-10 01:04:15 +00:00
|
|
|
int32_t signatureLength;
|
2001-11-03 02:54:08 +00:00
|
|
|
FileStream* in;
|
|
|
|
UConverter* conv;
|
|
|
|
UBool showWarning; /* makes this API not produce any errors */
|
2002-10-10 01:04:15 +00:00
|
|
|
UBool isBuffered;
|
2001-11-03 02:54:08 +00:00
|
|
|
};
|
|
|
|
|
2002-10-10 01:04:15 +00:00
|
|
|
U_CAPI UBool U_EXPORT2
|
|
|
|
ucbuf_autodetect_fs(FileStream* in, const char** cp, UConverter** conv, int32_t* signatureLength, UErrorCode* error){
|
2002-11-08 01:28:14 +00:00
|
|
|
char start[8];
|
|
|
|
int32_t numRead;
|
|
|
|
|
|
|
|
UChar target[1]={ 0 };
|
|
|
|
UChar* pTarget;
|
2002-11-12 01:50:37 +00:00
|
|
|
const char* pStart;
|
2001-09-28 00:29:40 +00:00
|
|
|
|
2002-11-08 01:28:14 +00:00
|
|
|
/* read a few bytes */
|
|
|
|
numRead=T_FileStream_read(in, start, sizeof(start));
|
2001-09-28 00:29:40 +00:00
|
|
|
|
2002-11-08 01:28:14 +00:00
|
|
|
*cp = ucnv_detectUnicodeSignature(start, numRead, signatureLength, error);
|
|
|
|
if(*cp==NULL){
|
|
|
|
/* unread the bytes already read */
|
|
|
|
while(numRead>0) {
|
|
|
|
T_FileStream_ungetc(start[--numRead], in);
|
|
|
|
}
|
2002-09-06 22:15:37 +00:00
|
|
|
|
2002-11-08 01:28:14 +00:00
|
|
|
*conv =NULL;
|
2001-09-28 00:29:40 +00:00
|
|
|
return FALSE;
|
|
|
|
}
|
|
|
|
|
2002-11-08 01:28:14 +00:00
|
|
|
/* open the converter for the detected Unicode charset */
|
|
|
|
*conv = ucnv_open(*cp,error);
|
|
|
|
|
|
|
|
/* convert and ignore initial U+FEFF, and the buffer overflow */
|
|
|
|
pTarget = target;
|
|
|
|
pStart = start;
|
|
|
|
ucnv_toUnicode(*conv, &pTarget, target+1, &pStart, start+*signatureLength, NULL, FALSE, error);
|
|
|
|
*signatureLength = pStart - start;
|
|
|
|
if(*error==U_BUFFER_OVERFLOW_ERROR) {
|
|
|
|
*error=U_ZERO_ERROR;
|
2001-09-28 00:29:40 +00:00
|
|
|
}
|
2002-10-10 01:04:15 +00:00
|
|
|
|
2002-11-08 01:28:14 +00:00
|
|
|
/* verify that we successfully read exactly U+FEFF */
|
|
|
|
if(U_SUCCESS(*error) && (pTarget!=(target+1) || target[0]!=0xfeff)) {
|
|
|
|
*error=U_INTERNAL_PROGRAM_ERROR;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* unread the bytes beyond what was consumed for U+FEFF */
|
|
|
|
while(numRead>*signatureLength) {
|
|
|
|
T_FileStream_ungetc(start[--numRead], in);
|
2002-02-28 17:23:53 +00:00
|
|
|
}
|
2002-10-10 01:04:15 +00:00
|
|
|
|
2002-11-08 01:28:14 +00:00
|
|
|
return TRUE;
|
2001-05-26 01:16:37 +00:00
|
|
|
}
|
2002-11-13 03:22:10 +00:00
|
|
|
static UBool ucbuf_isCPKnown(const char* cp){
|
|
|
|
if(ucnv_compareNames("UTF-8",cp)){
|
|
|
|
return TRUE;
|
|
|
|
}
|
|
|
|
if(ucnv_compareNames("UTF-16BE",cp)){
|
|
|
|
return TRUE;
|
|
|
|
}
|
|
|
|
if(ucnv_compareNames("UTF-16LE",cp)){
|
|
|
|
return TRUE;
|
|
|
|
}
|
|
|
|
if(ucnv_compareNames("UTF-16",cp)){
|
|
|
|
return TRUE;
|
|
|
|
}
|
|
|
|
if(ucnv_compareNames("UTF-32",cp)){
|
|
|
|
return TRUE;
|
|
|
|
}
|
|
|
|
if(ucnv_compareNames("UTF-32BE",cp)){
|
|
|
|
return TRUE;
|
|
|
|
}
|
|
|
|
if(ucnv_compareNames("UTF-32LE",cp)){
|
|
|
|
return TRUE;
|
|
|
|
}
|
|
|
|
if(ucnv_compareNames("UTF-32BE",cp)){
|
|
|
|
return TRUE;
|
|
|
|
}
|
|
|
|
if(ucnv_compareNames("SCSU",cp)){
|
|
|
|
return TRUE;
|
|
|
|
}
|
|
|
|
if(ucnv_compareNames("BOCU",cp)){
|
|
|
|
return TRUE;
|
|
|
|
}
|
|
|
|
if(ucnv_compareNames("UTF-7",cp)){
|
|
|
|
return TRUE;
|
|
|
|
}
|
|
|
|
return FALSE;
|
|
|
|
}
|
2001-05-26 01:16:37 +00:00
|
|
|
|
2002-11-08 01:28:14 +00:00
|
|
|
/*
 * Opens fileName for binary reading and looks for a Unicode signature at its
 * start (see ucbuf_autodetect_fs for the meaning of cp, conv and
 * signatureLength).
 *
 * Returns the open stream, positioned after the signature, when a signature
 * was recognised. Returns NULL when:
 *   - error is NULL or already indicates a failure,
 *   - conv, cp or fileName is NULL (U_ILLEGAL_ARGUMENT_ERROR),
 *   - the file cannot be opened (U_FILE_ACCESS_ERROR),
 *   - no signature was recognised; the stream is closed again and *conv is
 *     set to NULL.
 */
U_CAPI FileStream * U_EXPORT2
ucbuf_autodetect(const char* fileName, const char** cp,UConverter** conv, int32_t* signatureLength,UErrorCode* error){
    FileStream* stream;

    if(error==NULL || U_FAILURE(*error)){
        return NULL;
    }
    if(conv==NULL || cp==NULL || fileName==NULL){
        *error = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }

    /* open the file */
    stream = T_FileStream_open(fileName,"rb");
    if(stream == NULL){
        *error = U_FILE_ACCESS_ERROR;
        return NULL;
    }

    if(!ucbuf_autodetect_fs(stream,cp,conv,signatureLength,error)) {
        /* nothing detected: release everything that was acquired */
        ucnv_close(*conv);
        *conv = NULL;
        T_FileStream_close(stream);
        return NULL;
    }
    return stream;
}
|
|
|
|
|
2001-05-16 16:34:10 +00:00
|
|
|
/* fill the uchar buffer */
|
2002-02-28 01:42:40 +00:00
|
|
|
static UCHARBUF*
|
2002-10-10 01:04:15 +00:00
|
|
|
ucbuf_fillucbuf( UCHARBUF* buf,UErrorCode* error){
|
2001-05-10 21:43:01 +00:00
|
|
|
UChar* pTarget=NULL;
|
|
|
|
UChar* target=NULL;
|
|
|
|
const char* source=NULL;
|
2002-10-10 01:04:15 +00:00
|
|
|
char carr[MAX_IN_BUF] = {'\0'};
|
|
|
|
char* cbuf = carr;
|
|
|
|
int32_t inputRead=0;
|
|
|
|
int32_t outputWritten=0;
|
2002-05-23 22:10:23 +00:00
|
|
|
int32_t offset=0;
|
2001-11-01 19:43:21 +00:00
|
|
|
const char* sourceLimit =NULL;
|
2002-10-10 01:04:15 +00:00
|
|
|
int32_t cbufSize=0;
|
2001-05-22 18:00:55 +00:00
|
|
|
pTarget = buf->buffer;
|
|
|
|
/* check if we arrived here without exhausting the buffer*/
|
|
|
|
if(buf->currentPos<buf->bufLimit){
|
2002-05-23 22:10:23 +00:00
|
|
|
offset = (int32_t)(buf->bufLimit-buf->currentPos);
|
2001-05-22 18:00:55 +00:00
|
|
|
memmove(buf->buffer,buf->currentPos,offset* sizeof(UChar));
|
2001-05-10 21:43:01 +00:00
|
|
|
}
|
2001-05-22 18:00:55 +00:00
|
|
|
|
|
|
|
#if DEBUG
|
|
|
|
memset(pTarget+offset,0xff,sizeof(UChar)*(MAX_IN_BUF-offset));
|
|
|
|
#endif
|
2002-10-10 01:04:15 +00:00
|
|
|
if(buf->isBuffered){
|
|
|
|
cbufSize = MAX_IN_BUF;
|
|
|
|
/* read the file */
|
|
|
|
inputRead=T_FileStream_read(buf->in,cbuf,cbufSize-offset);
|
|
|
|
buf->remaining-=inputRead;
|
|
|
|
|
|
|
|
}else{
|
|
|
|
cbufSize = T_FileStream_size(buf->in);
|
|
|
|
cbuf = (char*)uprv_malloc(cbufSize);
|
|
|
|
inputRead= T_FileStream_read(buf->in,cbuf,cbufSize);
|
|
|
|
buf->remaining-=inputRead;
|
|
|
|
}
|
2001-05-10 21:43:01 +00:00
|
|
|
|
2002-02-28 01:42:40 +00:00
|
|
|
/* just to be sure...*/
|
2002-10-10 01:04:15 +00:00
|
|
|
if ( 0 == inputRead )
|
2002-02-28 01:42:40 +00:00
|
|
|
buf->remaining = 0;
|
|
|
|
|
2001-05-10 21:43:01 +00:00
|
|
|
target=pTarget;
|
|
|
|
/* convert the bytes */
|
|
|
|
if(buf->conv){
|
2001-11-01 19:43:21 +00:00
|
|
|
/* set the callback to stop */
|
|
|
|
UConverterToUCallback toUOldAction ;
|
|
|
|
void* toUOldContext;
|
2002-02-28 01:42:40 +00:00
|
|
|
void* toUNewContext=NULL;
|
2001-11-01 19:43:21 +00:00
|
|
|
ucnv_setToUCallBack(buf->conv,
|
|
|
|
UCNV_TO_U_CALLBACK_STOP,
|
|
|
|
toUNewContext,
|
|
|
|
&toUOldAction,
|
|
|
|
(const void**)&toUOldContext,
|
2002-10-10 01:04:15 +00:00
|
|
|
error);
|
2001-05-10 21:43:01 +00:00
|
|
|
/* since state is saved in the converter we add offset to source*/
|
|
|
|
target = pTarget+offset;
|
|
|
|
source = cbuf;
|
2002-10-10 01:04:15 +00:00
|
|
|
sourceLimit = source + inputRead;
|
|
|
|
ucnv_toUnicode(buf->conv,&target,target+(buf->bufCapacity-offset),
|
|
|
|
&source,sourceLimit,NULL,
|
|
|
|
(UBool)(buf->remaining==0),error);
|
2002-02-28 01:42:40 +00:00
|
|
|
|
2002-10-10 01:04:15 +00:00
|
|
|
if(U_FAILURE(*error)){
|
2001-11-01 19:43:21 +00:00
|
|
|
char context[CONTEXT_LEN];
|
|
|
|
char preContext[CONTEXT_LEN];
|
|
|
|
char postContext[CONTEXT_LEN];
|
|
|
|
int8_t len = CONTEXT_LEN;
|
|
|
|
int32_t start=0;
|
|
|
|
int32_t stop =0;
|
|
|
|
int32_t pos =0;
|
2002-02-28 01:42:40 +00:00
|
|
|
|
2001-11-03 02:54:08 +00:00
|
|
|
if( buf->showWarning==TRUE){
|
2002-02-28 01:42:40 +00:00
|
|
|
fprintf(stderr,"\n###WARNING: Encountered abnormal bytes while"
|
2001-11-03 02:54:08 +00:00
|
|
|
" converting input stream to target encoding: %s\n",
|
2002-10-10 01:04:15 +00:00
|
|
|
u_errorName(*error));
|
2001-11-03 02:54:08 +00:00
|
|
|
}
|
2001-11-01 19:43:21 +00:00
|
|
|
|
2002-10-10 01:04:15 +00:00
|
|
|
*error = U_ZERO_ERROR;
|
2001-11-01 19:43:21 +00:00
|
|
|
|
|
|
|
/* now get the context chars */
|
2002-10-10 01:04:15 +00:00
|
|
|
ucnv_getInvalidChars(buf->conv,context,&len,error);
|
2001-11-01 19:43:21 +00:00
|
|
|
context[len]= 0 ; /* null terminate the buffer */
|
2002-02-28 01:42:40 +00:00
|
|
|
|
2002-05-23 22:10:23 +00:00
|
|
|
pos = (int32_t)(source - cbuf - len);
|
2001-11-01 19:43:21 +00:00
|
|
|
|
|
|
|
/* for pre-context */
|
|
|
|
start = (pos <=CONTEXT_LEN)? 0 : (pos - (CONTEXT_LEN-1));
|
|
|
|
stop = pos-len;
|
2002-02-28 01:42:40 +00:00
|
|
|
|
2001-11-01 19:43:21 +00:00
|
|
|
memcpy(preContext,cbuf+start,stop-start);
|
|
|
|
/* null terminate the buffer */
|
|
|
|
preContext[stop-start] = 0;
|
2002-02-28 01:42:40 +00:00
|
|
|
|
2001-11-01 19:43:21 +00:00
|
|
|
/* for post-context */
|
|
|
|
start = pos+len;
|
2002-05-23 22:10:23 +00:00
|
|
|
stop = (int32_t)(((pos+CONTEXT_LEN)<= (sourceLimit-cbuf) )? (pos+(CONTEXT_LEN-1)) : (sourceLimit-cbuf));
|
2001-11-01 19:43:21 +00:00
|
|
|
|
|
|
|
memcpy(postContext,source,stop-start);
|
|
|
|
/* null terminate the buffer */
|
|
|
|
postContext[stop-start] = 0;
|
2002-02-28 01:42:40 +00:00
|
|
|
|
2001-11-03 02:54:08 +00:00
|
|
|
if(buf->showWarning ==TRUE){
|
|
|
|
/* print out the context */
|
|
|
|
fprintf(stderr,"\tPre-context: %s\n",preContext);
|
|
|
|
fprintf(stderr,"\tContext: %s\n",context);
|
|
|
|
fprintf(stderr,"\tPost-context: %s\n", postContext);
|
|
|
|
}
|
2002-02-28 01:42:40 +00:00
|
|
|
|
2001-11-01 22:55:16 +00:00
|
|
|
/* reset the converter */
|
|
|
|
ucnv_reset(buf->conv);
|
|
|
|
|
2002-02-28 01:42:40 +00:00
|
|
|
/* set the call back to substitute
|
2001-11-01 19:43:21 +00:00
|
|
|
* and restart conversion
|
|
|
|
*/
|
|
|
|
ucnv_setToUCallBack(buf->conv,
|
|
|
|
UCNV_TO_U_CALLBACK_SUBSTITUTE,
|
|
|
|
toUNewContext,
|
|
|
|
&toUOldAction,
|
|
|
|
(const void**)&toUOldContext,
|
2002-10-10 01:04:15 +00:00
|
|
|
error);
|
2001-11-01 19:43:21 +00:00
|
|
|
|
|
|
|
/* reset source and target start positions */
|
|
|
|
target = pTarget+offset;
|
|
|
|
source = cbuf;
|
2001-11-01 22:55:16 +00:00
|
|
|
|
2001-11-01 19:43:21 +00:00
|
|
|
/* re convert */
|
2002-10-10 01:04:15 +00:00
|
|
|
ucnv_toUnicode(buf->conv,&target,target+(buf->bufCapacity-offset),
|
2001-11-01 19:43:21 +00:00
|
|
|
&source,sourceLimit,NULL,
|
2002-10-10 01:04:15 +00:00
|
|
|
(UBool)(buf->remaining==0),error);
|
2002-02-28 01:42:40 +00:00
|
|
|
|
2001-11-01 19:43:21 +00:00
|
|
|
}
|
2002-10-10 01:04:15 +00:00
|
|
|
outputWritten = (int32_t)(target - pTarget);
|
2001-11-01 19:43:21 +00:00
|
|
|
|
|
|
|
|
|
|
|
#if DEBUG
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
target = pTarget;
|
|
|
|
for(i=0;i<numRead;i++){
|
|
|
|
/* printf("%c", (char)(*target++));*/
|
|
|
|
}
|
2001-05-10 21:43:01 +00:00
|
|
|
}
|
2001-11-01 19:43:21 +00:00
|
|
|
#endif
|
|
|
|
|
2001-05-10 21:43:01 +00:00
|
|
|
}else{
|
2002-10-10 01:04:15 +00:00
|
|
|
u_charsToUChars(cbuf,target+offset,inputRead);
|
|
|
|
outputWritten=((buf->remaining>cbufSize)? cbufSize:inputRead+offset);
|
2001-05-10 21:43:01 +00:00
|
|
|
}
|
|
|
|
buf->currentPos = pTarget;
|
2002-10-10 01:04:15 +00:00
|
|
|
buf->bufLimit=pTarget+outputWritten;
|
|
|
|
if(cbuf!=carr){
|
|
|
|
uprv_free(cbuf);
|
|
|
|
}
|
2001-05-10 21:43:01 +00:00
|
|
|
return buf;
|
2001-05-10 16:54:09 +00:00
|
|
|
}
|
|
|
|
|
2002-10-10 01:04:15 +00:00
|
|
|
|
|
|
|
|
2001-05-16 16:34:10 +00:00
|
|
|
/* get a UChar from the stream*/
|
2002-10-10 01:04:15 +00:00
|
|
|
U_CAPI int32_t U_EXPORT2
|
|
|
|
ucbuf_getc(UCHARBUF* buf,UErrorCode* error){
|
|
|
|
if(error==NULL || U_FAILURE(*error)){
|
|
|
|
return FALSE;
|
|
|
|
}
|
2001-05-22 17:36:43 +00:00
|
|
|
if(buf->currentPos>=buf->bufLimit){
|
2001-05-10 21:43:01 +00:00
|
|
|
if(buf->remaining==0){
|
|
|
|
return U_EOF;
|
|
|
|
}
|
2002-10-10 01:04:15 +00:00
|
|
|
buf=ucbuf_fillucbuf(buf,error);
|
|
|
|
if(U_FAILURE(*error)){
|
2001-05-10 21:43:01 +00:00
|
|
|
return U_EOF;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2001-05-22 17:36:43 +00:00
|
|
|
return *(buf->currentPos++);
|
2001-05-10 16:54:09 +00:00
|
|
|
}
|
|
|
|
|
2002-10-10 01:04:15 +00:00
|
|
|
/* get a UChar32 from the stream*/
|
|
|
|
U_CAPI int32_t U_EXPORT2
|
|
|
|
ucbuf_getc32(UCHARBUF* buf,UErrorCode* error){
|
|
|
|
int32_t retVal =U_EOF;
|
|
|
|
if(error==NULL || U_FAILURE(*error)){
|
|
|
|
return FALSE;
|
|
|
|
}
|
|
|
|
if(buf->currentPos+1>=buf->bufLimit){
|
|
|
|
if(buf->remaining==0){
|
|
|
|
return U_EOF;
|
|
|
|
}
|
|
|
|
buf=ucbuf_fillucbuf(buf,error);
|
|
|
|
if(U_FAILURE(*error)){
|
|
|
|
return U_EOF;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if(UTF_IS_LEAD(*(buf->currentPos))){
|
|
|
|
retVal=UTF16_GET_PAIR_VALUE(*(buf->currentPos++),*(buf->currentPos++));
|
|
|
|
}else{
|
|
|
|
retVal = *(buf->currentPos++);
|
|
|
|
}
|
|
|
|
return retVal;
|
|
|
|
}
|
2001-05-10 16:54:09 +00:00
|
|
|
|
|
|
|
/* u_unescapeAt() callback to return a UChar*/
|
2002-07-23 23:01:08 +00:00
|
|
|
static UChar U_CALLCONV
|
2001-05-10 21:43:01 +00:00
|
|
|
_charAt(int32_t offset, void *context) {
|
2001-05-10 16:54:09 +00:00
|
|
|
return ((UCHARBUF*) context)->currentPos[offset];
|
|
|
|
}
|
|
|
|
|
2001-05-16 16:34:10 +00:00
|
|
|
/* getc and escape it */
|
2002-10-10 01:04:15 +00:00
|
|
|
U_CAPI int32_t U_EXPORT2
|
|
|
|
ucbuf_getcx32(UCHARBUF* buf,UErrorCode* error) {
|
2001-05-10 16:54:09 +00:00
|
|
|
int32_t length;
|
|
|
|
int32_t offset;
|
2001-08-28 01:25:35 +00:00
|
|
|
UChar32 c32,c1,c2;
|
2002-10-10 01:04:15 +00:00
|
|
|
if(error==NULL || U_FAILURE(*error)){
|
|
|
|
return FALSE;
|
|
|
|
}
|
2001-05-10 16:54:09 +00:00
|
|
|
/* Fill the buffer if it is empty */
|
2001-08-28 01:25:35 +00:00
|
|
|
if (buf->currentPos >=buf->bufLimit-2) {
|
2002-10-10 01:04:15 +00:00
|
|
|
ucbuf_fillucbuf(buf,error);
|
2001-05-10 16:54:09 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Get the next character in the buffer */
|
|
|
|
if (buf->currentPos < buf->bufLimit) {
|
2001-05-26 01:16:37 +00:00
|
|
|
c1 = *(buf->currentPos)++;
|
2001-05-10 16:54:09 +00:00
|
|
|
} else {
|
2001-05-26 01:16:37 +00:00
|
|
|
c1 = U_EOF;
|
2001-05-10 16:54:09 +00:00
|
|
|
}
|
2002-02-28 01:42:40 +00:00
|
|
|
|
2001-09-20 02:35:51 +00:00
|
|
|
c2 = *(buf->currentPos);
|
2001-05-10 16:54:09 +00:00
|
|
|
|
|
|
|
/* If it isn't a backslash, return it */
|
2001-08-28 01:25:35 +00:00
|
|
|
if (c1 != 0x005C) {
|
2001-05-26 01:16:37 +00:00
|
|
|
return c1;
|
2001-05-10 16:54:09 +00:00
|
|
|
}
|
2002-02-28 01:42:40 +00:00
|
|
|
|
2001-05-10 16:54:09 +00:00
|
|
|
/* Determine the amount of data in the buffer */
|
2002-05-23 22:10:23 +00:00
|
|
|
length = (int32_t)(buf->bufLimit - buf->currentPos);
|
2002-02-28 01:42:40 +00:00
|
|
|
|
2001-05-10 21:43:01 +00:00
|
|
|
/* The longest escape sequence is \Uhhhhhhhh; make sure
|
2001-05-10 16:54:09 +00:00
|
|
|
we have at least that many characters */
|
|
|
|
if (length < 10) {
|
|
|
|
|
|
|
|
/* fill the buffer */
|
2002-10-10 01:04:15 +00:00
|
|
|
ucbuf_fillucbuf(buf,error);
|
2002-05-23 22:10:23 +00:00
|
|
|
length = (int32_t)(buf->bufLimit - buf->buffer);
|
2001-05-10 16:54:09 +00:00
|
|
|
}
|
2002-02-28 01:42:40 +00:00
|
|
|
|
2001-05-10 16:54:09 +00:00
|
|
|
/* Process the escape */
|
|
|
|
offset = 0;
|
|
|
|
c32 = u_unescapeAt(_charAt, &offset, length, (void*)buf);
|
|
|
|
|
2001-09-20 02:35:51 +00:00
|
|
|
/* check if u_unescapeAt unescaped and converted
|
|
|
|
* to c32 or not
|
|
|
|
*/
|
2002-09-06 22:13:15 +00:00
|
|
|
if(c32==0xFFFFFFFF){
|
2002-10-10 01:04:15 +00:00
|
|
|
*error= U_ILLEGAL_ESCAPE_SEQUENCE;
|
2002-09-06 22:13:15 +00:00
|
|
|
return c1;
|
|
|
|
}else if(c32!=c2){
|
2001-09-20 02:35:51 +00:00
|
|
|
/* Update the current buffer position */
|
|
|
|
buf->currentPos += offset;
|
|
|
|
}else{
|
2002-02-28 01:42:40 +00:00
|
|
|
/* unescaping failed so we just return
|
2001-09-20 02:35:51 +00:00
|
|
|
* c1 and not consume the buffer
|
|
|
|
* this is useful for rules with escapes
|
|
|
|
* in resouce bundles
|
|
|
|
* eg: \' \\ \"
|
|
|
|
*/
|
|
|
|
return c1;
|
|
|
|
}
|
2001-05-10 16:54:09 +00:00
|
|
|
|
|
|
|
return c32;
|
|
|
|
}
|
|
|
|
|
2001-05-16 01:09:06 +00:00
|
|
|
/*
 * Opens fileName ("-" selects the standard input stream) and returns a
 * UCHARBUF that reads it as UChars.
 *
 * fileName    - file to read
 * cp          - in/out: name of the charset. When *cp is NULL, empty, or
 *               accepted by ucbuf_isCPKnown(), a Unicode signature is looked
 *               for; ucbuf_autodetect_fs() then overwrites *cp with its
 *               result.
 * showWarning - TRUE to report conversion problems on stderr
 * buffered    - TRUE to read the file in chunks, FALSE to read it whole
 * error       - ICU error code
 *
 * Returns NULL and releases everything acquired so far when:
 *   - error is NULL or already indicates a failure,
 *   - cp or fileName is NULL (U_ILLEGAL_ARGUMENT_ERROR),
 *   - the file cannot be opened (U_FILE_ACCESS_ERROR),
 *   - memory cannot be allocated (U_MEMORY_ALLOCATION_ERROR),
 *   - signature detection or opening the converter fails.
 * The standard input stream is never closed on these failure paths because
 * this function did not open it.
 *
 * Otherwise the buffer is filled once and the result of ucbuf_fillucbuf()
 * is returned; the caller releases it with ucbuf_close().
 */
U_CAPI UCHARBUF* U_EXPORT2
ucbuf_open(const char* fileName,const char** cp,UBool showWarning, UBool buffered, UErrorCode* error){
    FileStream* in = NULL;
    UCHARBUF* buf = NULL;
    int32_t fileSize=0;
    UBool isStdin = FALSE;

    if(error==NULL || U_FAILURE(*error)){
        return NULL;
    }
    if(cp==NULL || fileName==NULL){
        *error = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }

    isStdin = (UBool)(uprv_strcmp(fileName, "-")==0);
    if (isStdin) {
        in = T_FileStream_stdin();
    }else{
        in = T_FileStream_open(fileName, "rb");
    }
    if(in==NULL){
        *error =U_FILE_ACCESS_ERROR;
        return NULL;
    }

    buf =(UCHARBUF*) uprv_malloc(sizeof(UCHARBUF));
    if(buf==NULL){
        if(!isStdin){
            T_FileStream_close(in);
        }
        *error = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    fileSize = T_FileStream_size(in);

    buf->in=in;
    buf->conv=NULL;
    buf->showWarning = showWarning;
    buf->isBuffered = buffered;
    buf->signatureLength=0;
    buf->buffer=NULL;
    buf->currentPos=NULL;
    buf->bufLimit=NULL;
    buf->bufCapacity=0;
    buf->remaining=0;

    if(*cp==NULL || **cp=='\0' || ucbuf_isCPKnown(*cp)/* to discard BOMs */){
        /* don't have code page name... try to autodetect */
        ucbuf_autodetect_fs(in,cp,&buf->conv,&buf->signatureLength,error);
    }
    if(U_SUCCESS(*error) && buf->conv==NULL) {
        buf->conv=ucnv_open(*cp,error);
    }
    if(U_FAILURE(*error)){
        ucnv_close(buf->conv);
        if(!isStdin){
            T_FileStream_close(in);
        }
        uprv_free(buf);
        return NULL;
    }

    if((buf->conv==NULL) && (buf->showWarning==TRUE)){
        fprintf(stderr,"###WARNING: No converter defined. Using codepage of system.\n");
    }

    buf->remaining=fileSize-buf->signatureLength;
    if(buf->isBuffered){
        buf->bufCapacity=MAX_U_BUF;
    }else{
        /* one UChar per byte of the file is enough to hold the whole text */
        buf->bufCapacity=buf->remaining+buf->signatureLength;
    }
    buf->buffer=(UChar*) uprv_malloc(U_SIZEOF_UCHAR * buf->bufCapacity);
    if (buf->buffer == NULL) {
        *error = U_MEMORY_ALLOCATION_ERROR;
        ucnv_close(buf->conv);
        if(!isStdin){
            T_FileStream_close(in);
        }
        uprv_free(buf);
        return NULL;
    }
    buf->currentPos=buf->buffer;
    buf->bufLimit=buf->buffer;

    return ucbuf_fillucbuf(buf,error);
}
|
|
|
|
|
2002-10-10 01:04:15 +00:00
|
|
|
|
|
|
|
|
2002-02-28 01:42:40 +00:00
|
|
|
/* TODO: this method will fail if at the
|
2001-05-22 15:54:03 +00:00
|
|
|
* begining of buffer and the uchar to unget
|
|
|
|
* is from the previous buffer. Need to implement
|
|
|
|
* system to take care of that situation.
|
2002-02-28 01:42:40 +00:00
|
|
|
*/
|
2001-05-16 01:09:06 +00:00
|
|
|
U_CAPI void U_EXPORT2
|
2002-10-10 01:04:15 +00:00
|
|
|
ucbuf_ungetc(int32_t c,UCHARBUF* buf){
|
2001-05-22 17:36:43 +00:00
|
|
|
/* decrement currentPos pointer
|
|
|
|
* if not at the begining of buffer
|
|
|
|
*/
|
2002-10-10 01:04:15 +00:00
|
|
|
UChar escaped[8] ={'\0'};
|
|
|
|
int32_t len =0;
|
|
|
|
if(c > 0xFFFF){
|
|
|
|
len = uprv_itou(escaped,c,16,8);
|
|
|
|
}else{
|
|
|
|
len=uprv_itou(escaped,c,16,4);
|
|
|
|
}
|
2001-05-10 21:43:01 +00:00
|
|
|
if(buf->currentPos!=buf->buffer){
|
2002-10-10 01:04:15 +00:00
|
|
|
if(*(buf->currentPos-1)==c){
|
|
|
|
buf->currentPos--;
|
|
|
|
}else if(u_strncmp(buf->currentPos-len,escaped,len) == 0){
|
|
|
|
while(--len>0){
|
|
|
|
buf->currentPos--;
|
|
|
|
}
|
|
|
|
}
|
2001-05-10 21:43:01 +00:00
|
|
|
}
|
2001-05-16 16:34:10 +00:00
|
|
|
}
|
2001-05-10 16:54:09 +00:00
|
|
|
|
2001-05-16 16:34:10 +00:00
|
|
|
/* frees the resources of UChar* buffer */
|
2002-02-28 01:42:40 +00:00
|
|
|
static void
|
2001-05-16 16:34:10 +00:00
|
|
|
ucbuf_closebuf(UCHARBUF* buf){
|
|
|
|
uprv_free(buf->buffer);
|
2001-05-21 19:38:13 +00:00
|
|
|
buf->buffer = NULL;
|
2001-05-10 16:54:09 +00:00
|
|
|
}
|
|
|
|
|
2001-05-16 16:34:10 +00:00
|
|
|
/* close the buf and release resources*/
|
2001-05-16 01:09:06 +00:00
|
|
|
U_CAPI void U_EXPORT2
|
2001-05-10 21:43:01 +00:00
|
|
|
ucbuf_close(UCHARBUF* buf){
|
2002-10-10 01:04:15 +00:00
|
|
|
if(buf!=NULL){
|
|
|
|
if(buf->conv){
|
|
|
|
ucnv_close(buf->conv);
|
|
|
|
}
|
|
|
|
buf->in=NULL;
|
|
|
|
buf->currentPos=NULL;
|
|
|
|
buf->bufLimit=NULL;
|
|
|
|
T_FileStream_close(buf->in);
|
|
|
|
ucbuf_closebuf(buf);
|
|
|
|
uprv_free(buf);
|
2001-05-10 21:43:01 +00:00
|
|
|
}
|
2001-05-10 16:54:09 +00:00
|
|
|
}
|
2001-05-16 16:34:10 +00:00
|
|
|
|
|
|
|
/* rewind the buf and file stream */
|
|
|
|
U_CAPI void U_EXPORT2
|
2002-10-10 01:04:15 +00:00
|
|
|
ucbuf_rewind(UCHARBUF* buf,UErrorCode* error){
|
|
|
|
if(error==NULL || U_FAILURE(*error)){
|
|
|
|
return;
|
|
|
|
}
|
2001-05-22 17:36:43 +00:00
|
|
|
if(buf){
|
|
|
|
buf->currentPos=buf->buffer;
|
|
|
|
buf->bufLimit=buf->buffer;
|
|
|
|
T_FileStream_rewind(buf->in);
|
2002-11-08 01:28:14 +00:00
|
|
|
buf->remaining=T_FileStream_size(buf->in)-buf->signatureLength;
|
|
|
|
|
|
|
|
ucnv_resetToUnicode(buf->conv);
|
|
|
|
if(buf->signatureLength>0) {
|
|
|
|
UChar target[1]={ 0 };
|
|
|
|
UChar* pTarget;
|
|
|
|
char start[8];
|
2002-11-12 01:50:37 +00:00
|
|
|
const char* pStart;
|
2002-11-08 01:28:14 +00:00
|
|
|
int32_t numRead;
|
|
|
|
|
|
|
|
/* read the signature bytes */
|
|
|
|
numRead=T_FileStream_read(buf->in, start, buf->signatureLength);
|
|
|
|
|
|
|
|
/* convert and ignore initial U+FEFF, and the buffer overflow */
|
|
|
|
pTarget = target;
|
|
|
|
pStart = start;
|
|
|
|
ucnv_toUnicode(buf->conv, &pTarget, target+1, &pStart, start+numRead, NULL, FALSE, error);
|
|
|
|
if(*error==U_BUFFER_OVERFLOW_ERROR) {
|
|
|
|
*error=U_ZERO_ERROR;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* verify that we successfully read exactly U+FEFF */
|
|
|
|
if(U_SUCCESS(*error) && (numRead!=buf->signatureLength || pTarget!=(target+1) || target[0]!=0xfeff)) {
|
|
|
|
*error=U_INTERNAL_PROGRAM_ERROR;
|
|
|
|
}
|
|
|
|
}
|
2001-05-22 17:36:43 +00:00
|
|
|
}
|
2001-05-16 16:34:10 +00:00
|
|
|
}
|
2002-10-10 01:04:15 +00:00
|
|
|
|
|
|
|
|
|
|
|
U_CAPI int32_t U_EXPORT2
|
|
|
|
ucbuf_size(UCHARBUF* buf){
|
|
|
|
if(buf){
|
|
|
|
if(buf->isBuffered){
|
|
|
|
return (T_FileStream_size(buf->in)-buf->signatureLength)/ucnv_getMinCharSize(buf->conv);
|
|
|
|
}else{
|
|
|
|
return buf->bufLimit-buf->buffer;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
U_CAPI const UChar* U_EXPORT2
|
|
|
|
ucbuf_getBuffer(UCHARBUF* buf,int32_t* len,UErrorCode* error){
|
|
|
|
if(error==NULL || U_FAILURE(*error)){
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
if(buf==NULL || len==NULL){
|
|
|
|
*error = U_ILLEGAL_ARGUMENT_ERROR;
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
*len = buf->bufLimit-buf->buffer;
|
|
|
|
return buf->buffer;
|
|
|
|
}
|
|
|
|
|
|
|
|
U_CAPI const char* U_EXPORT2
|
|
|
|
ucbuf_resolveFileName(const char* inputDir, const char* fileName, char* target, int32_t* len, UErrorCode* status){
|
2002-11-12 01:50:37 +00:00
|
|
|
int32_t requiredLen = 0;
|
|
|
|
int32_t dirlen = 0;
|
|
|
|
int32_t filelen = 0;
|
|
|
|
if(status==NULL || U_FAILURE(*status)){
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
if(inputDir == NULL || fileName == NULL || len==NULL || (target==NULL && *len>0)){
|
|
|
|
*status = U_ILLEGAL_ARGUMENT_ERROR;
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2002-10-10 01:04:15 +00:00
|
|
|
|
2002-11-12 01:50:37 +00:00
|
|
|
dirlen = (int32_t)uprv_strlen(inputDir);
|
|
|
|
filelen = (int32_t)uprv_strlen(fileName);
|
2002-10-10 01:04:15 +00:00
|
|
|
if(inputDir[dirlen-1] != U_FILE_SEP_CHAR) {
|
2002-11-12 01:50:37 +00:00
|
|
|
requiredLen = dirlen + filelen + 2;
|
|
|
|
if((*len < requiredLen) || target==NULL){
|
|
|
|
*len = requiredLen;
|
|
|
|
*status = U_BUFFER_OVERFLOW_ERROR;
|
|
|
|
return NULL;
|
|
|
|
}
|
2002-10-10 01:04:15 +00:00
|
|
|
|
|
|
|
target[0] = '\0';
|
|
|
|
/*
|
|
|
|
* append the input dir to openFileName if the first char in
|
|
|
|
* filename is not file seperation char and the last char input directory is not '.'.
|
|
|
|
* This is to support :
|
|
|
|
* genrb -s. /home/icu/data
|
|
|
|
* genrb -s. icu/data
|
|
|
|
* The user cannot mix notations like
|
|
|
|
* genrb -s. /icu/data --- the absolute path specified. -s redundant
|
|
|
|
* user should use
|
|
|
|
* genrb -s. icu/data --- start from CWD and look in icu/data dir
|
|
|
|
*/
|
|
|
|
if( (fileName[0] != U_FILE_SEP_CHAR) && (inputDir[dirlen-1] !='.')){
|
|
|
|
uprv_strcpy(target, inputDir);
|
|
|
|
target[dirlen] = U_FILE_SEP_CHAR;
|
|
|
|
}
|
|
|
|
target[dirlen + 1] = '\0';
|
|
|
|
} else {
|
2002-11-12 01:50:37 +00:00
|
|
|
requiredLen = dirlen + filelen + 1;
|
|
|
|
if((*len < requiredLen) || target==NULL){
|
|
|
|
*len = requiredLen;
|
|
|
|
*status = U_BUFFER_OVERFLOW_ERROR;
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
uprv_strcpy(target, inputDir);
|
2002-10-10 01:04:15 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
uprv_strcat(target, fileName);
|
2002-11-12 01:50:37 +00:00
|
|
|
return target;
|
2002-10-10 01:04:15 +00:00
|
|
|
}
|
|
|
|
U_CAPI const UChar* U_EXPORT2
|
|
|
|
ucbuf_readline(UCHARBUF* buf,int32_t* len,UErrorCode* err){
|
2002-11-12 01:50:37 +00:00
|
|
|
UChar* temp = buf->currentPos;
|
2002-10-10 01:04:15 +00:00
|
|
|
UChar* savePos =NULL;
|
2002-11-12 01:50:37 +00:00
|
|
|
UChar c=0x0000;
|
|
|
|
if(buf->isBuffered){
|
|
|
|
/* The input is buffered we have to do more
|
|
|
|
* for returning a pointer U_TRUNCATED_CHAR_FOUND
|
|
|
|
*/
|
|
|
|
for(;;){
|
|
|
|
c = *temp++;
|
|
|
|
if(buf->remaining==0){
|
|
|
|
*err = (UErrorCode) U_EOF;
|
|
|
|
}
|
|
|
|
if(temp>=buf->bufLimit && buf->currentPos == buf->buffer){
|
|
|
|
*err= U_TRUNCATED_CHAR_FOUND;
|
|
|
|
return NULL;
|
|
|
|
}else{
|
|
|
|
ucbuf_fillucbuf(buf,err);
|
|
|
|
if(U_FAILURE(*err)){
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
}
|
2002-11-13 20:55:52 +00:00
|
|
|
if (temp>=buf->bufLimit|| c == 0x0a || c==0x2028 || c==0x0085){ /* Unipad inserts 2028 line separators! */
|
2002-11-12 01:50:37 +00:00
|
|
|
*len = temp - buf->currentPos;
|
2002-10-10 01:04:15 +00:00
|
|
|
savePos = buf->currentPos;
|
2002-11-12 01:50:37 +00:00
|
|
|
buf->currentPos = temp;
|
|
|
|
return savePos;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}else{
|
|
|
|
/* we know that all input is read into the internal
|
|
|
|
* buffer so we can safely return pointers
|
|
|
|
*/
|
|
|
|
for(;;){
|
|
|
|
c = *temp++;
|
2002-11-13 20:55:52 +00:00
|
|
|
|
2002-11-12 01:50:37 +00:00
|
|
|
if(buf->currentPos==buf->bufLimit){
|
|
|
|
*err = (UErrorCode) U_EOF;
|
2002-10-10 01:04:15 +00:00
|
|
|
return NULL;
|
2002-11-12 01:50:37 +00:00
|
|
|
}
|
2002-11-13 20:55:52 +00:00
|
|
|
if (temp>=buf->bufLimit|| c == 0x0a || c==0x2028 || c==0x0085) { /* Unipad inserts 2028 line separators! */
|
2002-11-12 01:50:37 +00:00
|
|
|
*len = temp - buf->currentPos;
|
2002-10-10 01:04:15 +00:00
|
|
|
savePos = buf->currentPos;
|
2002-11-12 01:50:37 +00:00
|
|
|
buf->currentPos = temp;
|
|
|
|
return savePos;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return NULL;
|
2002-10-10 01:04:15 +00:00
|
|
|
}
|