2001-05-10 16:54:09 +00:00
|
|
|
/*
|
|
|
|
*******************************************************************************
|
|
|
|
*
|
|
|
|
* Copyright (C) 1998-2001, International Business Machines
|
|
|
|
* Corporation and others. All Rights Reserved.
|
|
|
|
*
|
|
|
|
*******************************************************************************
|
|
|
|
*
|
|
|
|
* File ucbuf.c
|
|
|
|
*
|
|
|
|
* Modification History:
|
|
|
|
*
|
|
|
|
* Date Name Description
|
|
|
|
* 05/10/01 Ram Creation.
|
|
|
|
*******************************************************************************
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include "unicode/utypes.h"
|
|
|
|
#include "unicode/ucnv.h"
|
|
|
|
#include "filestrm.h"
|
|
|
|
#include "cmemory.h"
|
|
|
|
#include "unicode/ustring.h"
|
|
|
|
#include "ucbuf.h"
|
2001-05-16 01:09:06 +00:00
|
|
|
|
2001-05-22 18:00:55 +00:00
|
|
|
/* Size, in bytes, of the raw chunk read from the file on each refill. */
#define MAX_IN_BUF 1000
|
|
|
|
/* Capacity, in UChars, of the decoded buffer allocated by ucbuf_open(). */
#define MAX_U_BUF 1500
|
2001-05-10 16:54:09 +00:00
|
|
|
|
2001-05-26 01:16:37 +00:00
|
|
|
/*
 * Detects a Unicode signature (BOM) at the current position of the stream
 * and consumes exactly that signature ("nrw" = no rewind).
 *
 * @param in      stream to inspect; on return it is positioned just past the
 *                signature, or where it started if no signature was found.
 * @param cp      receives the converter name implied by the signature, or ""
 *                when none was recognised. Never left unset.
 * @param numRead receives the number of signature bytes consumed (0, 2, 3
 *                or 4).
 * @return TRUE if a signature was recognised, FALSE otherwise.
 *
 * Recognised signatures:
 *   FF FE 00 00  UTF-32 little endian
 *   00 00 FE FF  UTF-32 big endian
 *   EF BB BF     UTF-8
 *   0E FE FF     SCSU
 *   FE FF        UTF-16 big endian
 *   FF FE        UTF-16 little endian
 *
 * NOTE(review): up to four bytes are pushed back with T_FileStream_ungetc().
 * If that is a thin wrapper over C ungetc(), only one byte of push-back is
 * guaranteed by the C standard -- confirm against the FileStream
 * implementation.
 */
static UBool ucbuf_autodetect_nrw(FileStream* in, const char** cp,int* numRead){
    char start[4]={'\0'};
    int bytesRead=0;
    int signatureLength=0;
    int i;

    *cp="";
    *numRead=0;

    if(T_FileStream_size(in)>0){
        bytesRead=(int)T_FileStream_read(in, start, 4);
        if(bytesRead<0){
            bytesRead=0;
        }
    }

    /* Only compare bytes that were really read; the zero-filled tail of
     * start[] must not be mistaken for the 00 00 of a UTF-32 signature. */
    if(bytesRead>=4 && start[0] == '\xFF' && start[1] == '\xFE' &&
                       start[2] == '\x00' && start[3] == '\x00'){
        *cp = "UTF32_LittleEndian";
        signatureLength = 4;
    }else if(bytesRead>=4 && start[0] == '\x00' && start[1] == '\x00' &&
                             start[2] == '\xFE' && start[3] == '\xFF'){
        *cp = "UTF32_BigEndian";
        signatureLength = 4;
    }else if(bytesRead>=3 && start[0] == '\xEF' && start[1] == '\xBB' &&
                             start[2] == '\xBF'){
        *cp = "UTF8";
        signatureLength = 3;
    }else if(bytesRead>=3 && start[0] == '\x0E' && start[1] == '\xFE' &&
                             start[2] == '\xFF'){
        *cp = "SCSU";
        signatureLength = 3;
    }else if(bytesRead>=2 && start[0] == '\xFE' && start[1] == '\xFF'){
        *cp = "UTF16_BigEndian";
        signatureLength = 2;
    }else if(bytesRead>=2 && start[0] == '\xFF' && start[1] == '\xFE'){
        *cp = "UTF16_LittleEndian";
        signatureLength = 2;
    }

    /* Return every byte that is not part of the signature, last byte first.
     * The unsigned char cast keeps 0xFF from turning into -1 where plain
     * char is signed. */
    for(i=bytesRead; i>signatureLength; i--){
        T_FileStream_ungetc((unsigned char)start[i-1],in);
    }

    *numRead=signatureLength;
    return (UBool)(signatureLength>0);
}
|
|
|
|
|
2001-05-16 16:34:10 +00:00
|
|
|
/* Autodetects UTF8, UTF-16-BigEndian and UTF-16-LittleEndian BOMs*/
|
2001-05-16 01:09:06 +00:00
|
|
|
U_CAPI UBool U_EXPORT2
|
2001-05-10 21:43:01 +00:00
|
|
|
ucbuf_autodetect(FileStream* in,const char** cp){
|
2001-05-10 16:54:09 +00:00
|
|
|
UBool autodetect = FALSE;
|
2001-05-26 01:16:37 +00:00
|
|
|
int numRead =0;
|
2001-05-31 21:36:09 +00:00
|
|
|
const char* tcp;
|
2001-05-26 01:16:37 +00:00
|
|
|
autodetect=ucbuf_autodetect_nrw(in,&tcp, &numRead);
|
|
|
|
*cp =tcp;
|
|
|
|
/* rewind the file Stream */
|
|
|
|
T_FileStream_rewind(in);
|
2001-05-10 16:54:09 +00:00
|
|
|
return autodetect;
|
|
|
|
}
|
|
|
|
|
2001-05-16 16:34:10 +00:00
|
|
|
/* fill the uchar buffer */
|
2001-05-16 01:09:06 +00:00
|
|
|
static UCHARBUF*
|
2001-05-10 21:43:01 +00:00
|
|
|
ucbuf_fillucbuf( UCHARBUF* buf,UErrorCode* err){
|
|
|
|
UChar* pTarget=NULL;
|
|
|
|
UChar* target=NULL;
|
|
|
|
const char* source=NULL;
|
2001-05-26 22:29:39 +00:00
|
|
|
char cbuf[MAX_IN_BUF] = {'\0'};
|
2001-05-10 21:43:01 +00:00
|
|
|
int numRead=0;
|
|
|
|
int offset=0;
|
|
|
|
|
2001-05-22 18:00:55 +00:00
|
|
|
pTarget = buf->buffer;
|
|
|
|
/* check if we arrived here without exhausting the buffer*/
|
|
|
|
if(buf->currentPos<buf->bufLimit){
|
|
|
|
offset= buf->bufLimit-buf->currentPos;
|
|
|
|
memmove(buf->buffer,buf->currentPos,offset* sizeof(UChar));
|
2001-05-10 21:43:01 +00:00
|
|
|
}
|
2001-05-22 18:00:55 +00:00
|
|
|
|
|
|
|
#if DEBUG
|
|
|
|
memset(pTarget+offset,0xff,sizeof(UChar)*(MAX_IN_BUF-offset));
|
|
|
|
#endif
|
2001-05-10 21:43:01 +00:00
|
|
|
|
|
|
|
/* read the file */
|
2001-05-22 18:00:55 +00:00
|
|
|
numRead=T_FileStream_read(buf->in,cbuf,MAX_IN_BUF-offset);
|
2001-05-10 21:43:01 +00:00
|
|
|
buf->remaining-=numRead;
|
|
|
|
|
|
|
|
target=pTarget;
|
|
|
|
/* convert the bytes */
|
|
|
|
if(buf->conv){
|
|
|
|
/* since state is saved in the converter we add offset to source*/
|
|
|
|
target = pTarget+offset;
|
|
|
|
source = cbuf;
|
2001-09-20 02:35:51 +00:00
|
|
|
ucnv_toUnicode(buf->conv,&target,target+(MAX_U_BUF-offset),
|
|
|
|
&source,source+numRead,NULL,
|
|
|
|
(UBool)(buf->remaining==0),err);
|
2001-05-10 21:43:01 +00:00
|
|
|
numRead= target-pTarget;
|
|
|
|
if(U_FAILURE(*err)){
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
}else{
|
|
|
|
u_charsToUChars(cbuf,target+offset,numRead);
|
2001-05-22 18:00:55 +00:00
|
|
|
numRead=((buf->remaining>MAX_IN_BUF)? MAX_IN_BUF:numRead+offset);
|
2001-05-10 21:43:01 +00:00
|
|
|
}
|
|
|
|
buf->currentPos = pTarget;
|
|
|
|
buf->bufLimit=pTarget+numRead;
|
|
|
|
return buf;
|
2001-05-10 16:54:09 +00:00
|
|
|
}
|
|
|
|
|
2001-05-16 16:34:10 +00:00
|
|
|
/* get a UChar from the stream*/
|
2001-05-16 01:09:06 +00:00
|
|
|
U_CAPI UChar32 U_EXPORT2
|
2001-05-10 21:43:01 +00:00
|
|
|
ucbuf_getc(UCHARBUF* buf,UErrorCode* err){
|
2001-05-22 17:36:43 +00:00
|
|
|
if(buf->currentPos>=buf->bufLimit){
|
2001-05-10 21:43:01 +00:00
|
|
|
if(buf->remaining==0){
|
|
|
|
return U_EOF;
|
|
|
|
}
|
|
|
|
buf=ucbuf_fillucbuf(buf,err);
|
|
|
|
if(U_FAILURE(*err)){
|
|
|
|
return U_EOF;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2001-05-22 17:36:43 +00:00
|
|
|
return *(buf->currentPos++);
|
2001-05-10 16:54:09 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* u_unescapeAt() callback to return a UChar*/
|
2001-05-10 21:43:01 +00:00
|
|
|
static UChar
|
|
|
|
_charAt(int32_t offset, void *context) {
|
2001-05-10 16:54:09 +00:00
|
|
|
return ((UCHARBUF*) context)->currentPos[offset];
|
|
|
|
}
|
|
|
|
|
2001-05-16 16:34:10 +00:00
|
|
|
/* getc and escape it */
|
2001-05-16 01:09:06 +00:00
|
|
|
U_CAPI UChar32 U_EXPORT2
|
2001-05-10 16:54:09 +00:00
|
|
|
ucbuf_getcx(UCHARBUF* buf,UErrorCode* err) {
|
|
|
|
int32_t length;
|
|
|
|
int32_t offset;
|
2001-08-28 01:25:35 +00:00
|
|
|
UChar32 c32,c1,c2;
|
|
|
|
|
2001-05-10 16:54:09 +00:00
|
|
|
/* Fill the buffer if it is empty */
|
2001-08-28 01:25:35 +00:00
|
|
|
if (buf->currentPos >=buf->bufLimit-2) {
|
2001-05-22 17:36:43 +00:00
|
|
|
ucbuf_fillucbuf(buf,err);
|
2001-05-10 16:54:09 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Get the next character in the buffer */
|
|
|
|
if (buf->currentPos < buf->bufLimit) {
|
2001-05-26 01:16:37 +00:00
|
|
|
c1 = *(buf->currentPos)++;
|
2001-05-10 16:54:09 +00:00
|
|
|
} else {
|
2001-05-26 01:16:37 +00:00
|
|
|
c1 = U_EOF;
|
2001-05-10 16:54:09 +00:00
|
|
|
}
|
2001-09-20 02:35:51 +00:00
|
|
|
|
|
|
|
c2 = *(buf->currentPos);
|
2001-05-10 16:54:09 +00:00
|
|
|
|
|
|
|
/* If it isn't a backslash, return it */
|
2001-08-28 01:25:35 +00:00
|
|
|
if (c1 != 0x005C) {
|
2001-05-26 01:16:37 +00:00
|
|
|
return c1;
|
2001-05-10 16:54:09 +00:00
|
|
|
}
|
2001-09-20 02:35:51 +00:00
|
|
|
|
2001-05-10 16:54:09 +00:00
|
|
|
/* Determine the amount of data in the buffer */
|
|
|
|
length = buf->bufLimit-buf->currentPos;
|
|
|
|
|
2001-05-10 21:43:01 +00:00
|
|
|
/* The longest escape sequence is \Uhhhhhhhh; make sure
|
2001-05-10 16:54:09 +00:00
|
|
|
we have at least that many characters */
|
|
|
|
if (length < 10) {
|
|
|
|
|
|
|
|
/* fill the buffer */
|
|
|
|
ucbuf_fillucbuf(buf,err);
|
|
|
|
length = buf->bufLimit-buf->buffer;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Process the escape */
|
|
|
|
offset = 0;
|
|
|
|
c32 = u_unescapeAt(_charAt, &offset, length, (void*)buf);
|
|
|
|
|
2001-09-20 02:35:51 +00:00
|
|
|
/* check if u_unescapeAt unescaped and converted
|
|
|
|
* to c32 or not
|
|
|
|
*/
|
|
|
|
if(c32!=c2){
|
|
|
|
/* Update the current buffer position */
|
|
|
|
buf->currentPos += offset;
|
|
|
|
}else{
|
|
|
|
/* unescaping failed so we just return
|
|
|
|
* c1 and not consume the buffer
|
|
|
|
* this is useful for rules with escapes
|
|
|
|
* in resouce bundles
|
|
|
|
* eg: \' \\ \"
|
|
|
|
*/
|
|
|
|
return c1;
|
|
|
|
}
|
2001-05-10 16:54:09 +00:00
|
|
|
|
|
|
|
return c32;
|
|
|
|
}
|
|
|
|
|
2001-05-16 16:34:10 +00:00
|
|
|
/* open a UCHARBUF */
|
2001-05-16 01:09:06 +00:00
|
|
|
U_CAPI UCHARBUF* U_EXPORT2
|
2001-05-26 01:16:37 +00:00
|
|
|
ucbuf_open(FileStream* in, UErrorCode* err){
|
2001-05-10 21:43:01 +00:00
|
|
|
|
|
|
|
UCHARBUF* buf =(UCHARBUF*) uprv_malloc(sizeof(UCHARBUF));
|
2001-05-31 21:36:09 +00:00
|
|
|
const char *cp;
|
2001-05-26 01:16:37 +00:00
|
|
|
int numRead =0;
|
2001-05-10 21:43:01 +00:00
|
|
|
if(U_FAILURE(*err)){
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
if(buf){
|
|
|
|
buf->in=in;
|
2001-05-26 01:16:37 +00:00
|
|
|
ucbuf_autodetect_nrw(in,&cp,&numRead);
|
|
|
|
buf->remaining=T_FileStream_size(in)-numRead;
|
2001-05-22 18:00:55 +00:00
|
|
|
buf->buffer=(UChar*) uprv_malloc(sizeof(UChar)* MAX_U_BUF);
|
|
|
|
if (buf->buffer == NULL) {
|
|
|
|
*err = U_MEMORY_ALLOCATION_ERROR;
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
buf->currentPos=buf->buffer;
|
|
|
|
buf->bufLimit=buf->buffer;
|
2001-05-10 21:43:01 +00:00
|
|
|
if(*cp!='\0'){
|
|
|
|
buf->conv=ucnv_open(cp,err);
|
|
|
|
}else{
|
|
|
|
buf->conv=NULL;
|
|
|
|
}
|
|
|
|
if(U_FAILURE(*err)){
|
2001-05-10 23:24:57 +00:00
|
|
|
fprintf(stderr, "Could not open codepage [%s]: %s\n", cp, u_errorName(*err));
|
2001-05-10 21:43:01 +00:00
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
buf=ucbuf_fillucbuf(buf,err);
|
|
|
|
return buf;
|
|
|
|
}else{
|
|
|
|
*err = U_MEMORY_ALLOCATION_ERROR;
|
|
|
|
return NULL;
|
|
|
|
}
|
2001-05-10 16:54:09 +00:00
|
|
|
}
|
|
|
|
|
2001-05-16 16:34:10 +00:00
|
|
|
/* TODO: this method will fail if at the
|
2001-05-22 15:54:03 +00:00
|
|
|
* begining of buffer and the uchar to unget
|
|
|
|
* is from the previous buffer. Need to implement
|
|
|
|
* system to take care of that situation.
|
|
|
|
*/
|
2001-05-16 01:09:06 +00:00
|
|
|
U_CAPI void U_EXPORT2
|
2001-05-10 21:43:01 +00:00
|
|
|
ucbuf_ungetc(UChar32 c,UCHARBUF* buf){
|
2001-05-22 17:36:43 +00:00
|
|
|
/* decrement currentPos pointer
|
|
|
|
* if not at the begining of buffer
|
|
|
|
*/
|
2001-05-10 21:43:01 +00:00
|
|
|
if(buf->currentPos!=buf->buffer){
|
|
|
|
buf->currentPos--;
|
|
|
|
}
|
2001-05-16 16:34:10 +00:00
|
|
|
}
|
2001-05-10 16:54:09 +00:00
|
|
|
|
2001-05-16 16:34:10 +00:00
|
|
|
/* frees the resources of UChar* buffer */
|
|
|
|
static void
|
|
|
|
ucbuf_closebuf(UCHARBUF* buf){
|
|
|
|
uprv_free(buf->buffer);
|
2001-05-21 19:38:13 +00:00
|
|
|
buf->buffer = NULL;
|
2001-05-10 16:54:09 +00:00
|
|
|
}
|
|
|
|
|
2001-05-16 16:34:10 +00:00
|
|
|
/* close the buf and release resources*/
|
2001-05-16 01:09:06 +00:00
|
|
|
U_CAPI void U_EXPORT2
|
2001-05-10 21:43:01 +00:00
|
|
|
ucbuf_close(UCHARBUF* buf){
|
|
|
|
if(buf->conv){
|
|
|
|
ucnv_close(buf->conv);
|
|
|
|
}
|
|
|
|
buf->in=NULL;
|
|
|
|
buf->currentPos=NULL;
|
|
|
|
buf->bufLimit=NULL;
|
|
|
|
ucbuf_closebuf(buf);
|
|
|
|
uprv_free(buf);
|
2001-05-10 16:54:09 +00:00
|
|
|
}
|
2001-05-16 16:34:10 +00:00
|
|
|
|
|
|
|
/* rewind the buf and file stream */
|
|
|
|
U_CAPI void U_EXPORT2
|
|
|
|
ucbuf_rewind(UCHARBUF* buf){
|
2001-05-22 17:36:43 +00:00
|
|
|
if(buf){
|
|
|
|
const char* cp="";
|
|
|
|
buf->currentPos=buf->buffer;
|
|
|
|
buf->bufLimit=buf->buffer;
|
|
|
|
ucnv_reset(buf->conv);
|
|
|
|
T_FileStream_rewind(buf->in);
|
|
|
|
ucbuf_autodetect(buf->in,&cp);
|
|
|
|
buf->remaining=T_FileStream_size(buf->in);
|
|
|
|
}
|
2001-05-16 16:34:10 +00:00
|
|
|
}
|