/*
*******************************************************************************
*
*   Copyright (C) 1998-2001, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*
* File ucbuf.c
*
* Modification History:
*
*   Date        Name        Description
*   05/10/01    Ram         Creation.
*******************************************************************************
*/

#include "unicode/utypes.h"
#include "unicode/ucnv.h"
#include "unicode/ucnv_err.h"
#include "filestrm.h"
#include "cmemory.h"
#include "unicode/ustring.h"
#include "ucbuf.h"
#include <stdio.h>
#include <string.h>

#define MAX_IN_BUF 1000   /* bytes read from the file per refill          */
#define MAX_U_BUF 1500    /* capacity of the UChar buffer, in UChars       */
#define CONTEXT_LEN 15    /* max bytes of context shown in diagnostics     */

/*
 * A buffered reader that turns the bytes of a FileStream into UChars.
 * The valid, unread data is the range [currentPos, bufLimit) inside buffer.
 */
struct UCHARBUF {
    UChar* buffer;            /* start of the MAX_U_BUF-sized UChar storage    */
    UChar* currentPos;        /* next UChar to hand out                        */
    UChar* bufLimit;          /* one past the last valid UChar                 */
    int32_t remaining;        /* bytes of the file not yet read                */
    int32_t signatureLength;  /* BOM bytes skipped by ucbuf_open (0 if none)   */
    FileStream* in;           /* input stream; not owned, never closed here    */
    UConverter* conv;         /* converter, or NULL for plain char->UChar copy */
    UBool showWarning;        /* TRUE: print conversion diagnostics to stderr  */
};

/*
 * Looks for a Unicode signature (BOM) at the current stream position.
 *
 * On return *cp names the detected charset ("" if none) and *numRead holds
 * the number of signature bytes that were consumed; every byte read beyond
 * the signature is pushed back onto the stream ("nrw" = no rewind).
 * Returns TRUE if a signature was recognized, FALSE otherwise (including
 * when the stream size is <= 0).
 */
static UBool ucbuf_autodetect_nrw(FileStream* in, const char** cp, int* numRead) {
    /* initial 0xa5 bytes: make sure that if we read <4 bytes we don't misdetect something */
    char start[4] = { '\xa5', '\xa5', '\xa5', '\xa5' };
    int cap = T_FileStream_size(in);
    UBool autodetect = TRUE;
    int signatureLength = 0;

    *numRead = 0;
    *cp = "";

    if (cap <= 0) {
        return FALSE;
    }

    *numRead = T_FileStream_read(in, start, 4); /* *numRead might be <4 */

    if (start[0] == '\xFE' && start[1] == '\xFF') {
        *cp = "UTF-16BE";
        signatureLength = 2;
    } else if (start[0] == '\xFF' && start[1] == '\xFE') {
        if (start[2] == '\x00' && start[3] == '\x00') {
            *cp = "UTF-32LE";
            signatureLength = 4;
        } else {
            *cp = "UTF-16LE";
            signatureLength = 2;
        }
    } else if (start[0] == '\xEF' && start[1] == '\xBB' && start[2] == '\xBF') {
        *cp = "UTF-8";
        signatureLength = 3;
    } else if (start[0] == '\x0E' && start[1] == '\xFE' && start[2] == '\xFF') {
        *cp = "SCSU";
        signatureLength = 3;
    } else if (start[0] == '\x00' && start[1] == '\x00' &&
               start[2] == '\xFE' && start[3] == '\xFF') {
        *cp = "UTF-32BE";
        signatureLength = 4;
    } else {
        signatureLength = 0;
        autodetect = FALSE;
    }

    /*
     * Push back everything that is not part of the signature, last byte
     * first.  The cast matters: with a signed char, byte 0xFF would be
     * passed as -1 (EOF), which ungetc() silently ignores.
     */
    while (signatureLength < *numRead) {
        T_FileStream_ungetc((unsigned char)start[--*numRead], in);
    }

    return autodetect;
}

/* Autodetects UTF-8, UTF-16 (BE/LE), UTF-32 (BE/LE) and SCSU signatures.
 * Stores the charset name in *cp ("" if none), rewinds the stream and
 * returns TRUE if a signature was found. */
U_CAPI UBool U_EXPORT2
ucbuf_autodetect(FileStream* in, const char** cp) {
    UBool autodetect = FALSE;
    int numRead = 0;
    const char* tcp;

    autodetect = ucbuf_autodetect_nrw(in, &tcp, &numRead);
    *cp = tcp;

    /* rewind the file Stream */
    T_FileStream_rewind(in);
    return autodetect;
}

/* Limits a diagnostic context length to what the context buffers can hold. */
static int32_t ucbuf_clampContextLen(int32_t count) {
    if (count < 0) {
        return 0;
    }
    if (count > CONTEXT_LEN) {
        return CONTEXT_LEN;
    }
    return count;
}

/*
 * Refills the UChar buffer from the stream.
 *
 * Any unread UChars are first moved to the front of the buffer, then up to
 * MAX_IN_BUF bytes are read and appended after them, either through the
 * converter or (without a converter) via u_charsToUChars().  If conversion
 * fails, a warning with byte context is printed when showWarning is TRUE
 * and the chunk is converted again with the substitution callback.
 * Always returns buf; *err carries the status of the last conversion.
 */
static UCHARBUF* ucbuf_fillucbuf(UCHARBUF* buf, UErrorCode* err) {
    UChar* pTarget = buf->buffer;
    UChar* target = NULL;
    const char* source = NULL;
    const char* sourceLimit = NULL;
    char cbuf[MAX_IN_BUF] = {'\0'};
    int numRead = 0;
    int toRead = 0;
    int offset = 0;

    /* check if we arrived here without exhausting the buffer */
    if (buf->currentPos < buf->bufLimit) {
        offset = (int)(buf->bufLimit - buf->currentPos);
        memmove(buf->buffer, buf->currentPos, offset * sizeof(UChar));
    }

#if DEBUG
    memset(pTarget + offset, 0xff, sizeof(UChar) * (MAX_IN_BUF - offset));
#endif

    /* read the file */
    toRead = MAX_IN_BUF - offset;
    if (toRead < 0) {
        toRead = 0;
    }
    numRead = T_FileStream_read(buf->in, cbuf, toRead);
    if (numRead < 0) {
        numRead = 0;
    }
    buf->remaining -= numRead;
    if (numRead < toRead) {
        /* Short read: nothing more will come from the stream.  Without this
         * a stream shorter than its reported size would never reach
         * remaining == 0 and readers would spin on an empty buffer. */
        buf->remaining = 0;
    }

    target = pTarget;

    /* convert the bytes */
    if (buf->conv) {
        UConverterToUCallback toUOldAction;
        void* toUOldContext;
        void* toUNewContext = NULL;

        /* set the callback to stop so that bad input is reported to us */
        ucnv_setToUCallBack(buf->conv,
                            UCNV_TO_U_CALLBACK_STOP,
                            toUNewContext,
                            &toUOldAction,
                            (const void**)&toUOldContext,
                            err);

        /* since state is saved in the converter we add offset to source */
        target = pTarget + offset;
        source = cbuf;
        sourceLimit = source + numRead;
        ucnv_toUnicode(buf->conv, &target, target + (MAX_U_BUF - offset),
                       &source, sourceLimit, NULL,
                       (UBool)(buf->remaining == 0), err);

        if (U_FAILURE(*err)) {
            /* one extra byte each for the terminating NUL */
            char context[CONTEXT_LEN + 1] = {'\0'};
            char preContext[CONTEXT_LEN + 1] = {'\0'};
            char postContext[CONTEXT_LEN + 1] = {'\0'};
            int8_t len = CONTEXT_LEN;
            int32_t start = 0;
            int32_t stop = 0;
            int32_t pos = 0;
            int32_t count = 0;
            /* separate status so a diagnostics failure cannot block the retry */
            UErrorCode contextErr = U_ZERO_ERROR;

            if (buf->showWarning == TRUE) {
                fprintf(stderr, "\n###WARNING: Encountered abnormal bytes while"
                                " converting input stream to target encoding: %s\n",
                        u_errorName(*err));
            }

            *err = U_ZERO_ERROR;

            /* now get the context chars */
            ucnv_getInvalidChars(buf->conv, context, &len, &contextErr);
            if (U_FAILURE(contextErr) || len < 0 || len > CONTEXT_LEN) {
                len = 0;
            }
            context[len] = 0; /* null terminate the buffer */

            /* may be negative when the bad bytes began in an earlier chunk */
            pos = (int32_t)(source - cbuf) - len;

            /* for pre-context */
            start = (pos <= CONTEXT_LEN) ? 0 : (pos - (CONTEXT_LEN - 1));
            stop = pos - len;
            count = ucbuf_clampContextLen(stop - start);
            memcpy(preContext, cbuf + start, (size_t)count);
            /* null terminate the buffer */
            preContext[count] = 0;

            /* for post-context */
            start = pos + len;
            stop = ((pos + CONTEXT_LEN) <= (sourceLimit - cbuf))
                       ? (pos + (CONTEXT_LEN - 1))
                       : (int32_t)(sourceLimit - cbuf);
            count = ucbuf_clampContextLen(stop - start);
            memcpy(postContext, source, (size_t)count);
            /* null terminate the buffer */
            postContext[count] = 0;

            if (buf->showWarning == TRUE) {
                /* print out the context */
                fprintf(stderr, "\tPre-context: %s\n", preContext);
                fprintf(stderr, "\tContext: %s\n", context);
                fprintf(stderr, "\tPost-context: %s\n", postContext);
            }

            /* reset the converter */
            ucnv_reset(buf->conv);

            /* set the call back to substitute
             * and restart conversion
             */
            ucnv_setToUCallBack(buf->conv,
                                UCNV_TO_U_CALLBACK_SUBSTITUTE,
                                toUNewContext,
                                &toUOldAction,
                                (const void**)&toUOldContext,
                                err);

            /* reset source and target start positions */
            target = pTarget + offset;
            source = cbuf;

            /* re convert */
            ucnv_toUnicode(buf->conv, &target, target + (MAX_U_BUF - offset),
                           &source, sourceLimit, NULL,
                           (UBool)(buf->remaining == 0), err);
        }
        /* total valid UChars: carried-over ones plus newly converted ones */
        numRead = (int)(target - pTarget);
    } else {
        /* no converter: copy the bytes over as UChars after the carried-over data */
        u_charsToUChars(cbuf, target + offset, numRead);
        numRead += offset;
    }

    buf->currentPos = pTarget;
    buf->bufLimit = pTarget + numRead;
    return buf;
}

/* Gets the next UChar from the stream.
 * Returns U_EOF when the input is exhausted or a refill sets a failure
 * code in *err. */
U_CAPI UChar32 U_EXPORT2
ucbuf_getc(UCHARBUF* buf, UErrorCode* err) {
    /* loop: a refill may legitimately yield no UChars (e.g. short read) */
    while (buf->currentPos >= buf->bufLimit) {
        if (buf->remaining == 0) {
            return U_EOF;
        }
        buf = ucbuf_fillucbuf(buf, err);
        if (U_FAILURE(*err)) {
            return U_EOF;
        }
    }
    return *(buf->currentPos++);
}

/* u_unescapeAt() callback to return a UChar */
static UChar _charAt(int32_t offset, void *context) {
    return ((UCHARBUF*) context)->currentPos[offset];
}

/*
 * Gets the next character and, if it is a backslash, tries to unescape the
 * sequence that follows.
 *
 * Returns the unescaped code point when u_unescapeAt() decoded a real
 * escape (the sequence is consumed).  When the character after the
 * backslash merely stands for itself (e.g. \' \\ \"), or the backslash is
 * the last character of the input, only the backslash is consumed and
 * returned.  Returns U_EOF at end of input.  If u_unescapeAt() reports a
 * malformed escape its error value is returned unchanged.
 */
U_CAPI UChar32 U_EXPORT2
ucbuf_getcx(UCHARBUF* buf, UErrorCode* err) {
    int32_t length;
    int32_t offset;
    UChar32 c32, c1, c2;

    /* Fill the buffer if it is (nearly) empty */
    if (buf->currentPos >= buf->bufLimit - 2) {
        ucbuf_fillucbuf(buf, err);
    }

    /* Get the next character in the buffer */
    if (buf->currentPos < buf->bufLimit) {
        c1 = *(buf->currentPos)++;
    } else {
        c1 = U_EOF;
    }

    /* If it isn't a backslash, return it */
    if (c1 != 0x005C) {
        return c1;
    }

    /* Determine the amount of data in the buffer */
    length = (int32_t)(buf->bufLimit - buf->currentPos);

    /* The longest escape sequence is \Uhhhhhhhh; make sure
       we have at least that many characters */
    if (length < 10) {
        /* fill the buffer; this moves the unread data to the front */
        ucbuf_fillucbuf(buf, err);
        length = (int32_t)(buf->bufLimit - buf->currentPos);
    }

    /* A backslash at the very end of the input: nothing to unescape, and
     * nothing valid to read at currentPos. */
    if (length <= 0) {
        return c1;
    }

    /* Read the follower only now, after any refill has repositioned it */
    c2 = *(buf->currentPos);

    /* Process the escape */
    offset = 0;
    c32 = u_unescapeAt(_charAt, &offset, length, (void*)buf);

    /* Decide whether u_unescapeAt really decoded an escape.  Comparing with
     * c2 alone misfires for sequences such as \u0075 whose value equals the
     * letter after the backslash; those consume more than one UChar. */
    if (c32 != c2 || offset > 1) {
        /* Update the current buffer position */
        buf->currentPos += offset;
        return c32;
    }

    /* unescaping failed so we just return
     * c1 and not consume the buffer
     * this is useful for rules with escapes
     * in resource bundles
     * eg: \' \\ \"
     */
    return c1;
}

/*
 * Opens a UCHARBUF on an already opened stream.
 *
 * in          - input stream; it is not closed by ucbuf_close().
 * cp          - charset name; NULL or "" requests signature autodetection.
 * showWarning - TRUE to print conversion diagnostics to stderr.
 * err         - in/out error code; nothing is done if it is already a failure.
 *
 * Returns the new buffer, already filled with the first chunk, or NULL if
 * memory could not be allocated or the converter could not be opened.  All
 * partially acquired resources are released before NULL is returned.
 */
U_CAPI UCHARBUF* U_EXPORT2
ucbuf_open(FileStream* in, const char* cp, UBool showWarning, UErrorCode* err) {
    UCHARBUF* buf = NULL;
    int numRead = 0;

    /* check before allocating so an early return cannot leak */
    if (U_FAILURE(*err)) {
        return NULL;
    }

    buf = (UCHARBUF*) uprv_malloc(sizeof(UCHARBUF));
    if (buf == NULL) {
        *err = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }

    buf->in = in;
    buf->conv = NULL;
    buf->buffer = NULL;
    buf->currentPos = NULL;
    buf->bufLimit = NULL;
    buf->remaining = 0;
    buf->signatureLength = 0;
    buf->showWarning = showWarning;

    if (cp == NULL || *cp == '\0') {
        /* don't have code page name... try to autodetect */
        if (ucbuf_autodetect_nrw(in, &cp, &numRead)) {
            buf->conv = ucnv_open(cp, err);
        }
    } else {
        buf->conv = ucnv_open(cp, err);
    }

    if ((buf->conv == NULL) && (buf->showWarning == TRUE)) {
        fprintf(stderr, "###WARNING: No converter defined. Using codepage of system.\n");
    }

    /* remember how many signature bytes were skipped so that
     * ucbuf_rewind() can skip them again */
    buf->signatureLength = numRead;
    buf->remaining = T_FileStream_size(in) - numRead;

    buf->buffer = (UChar*) uprv_malloc(sizeof(UChar) * MAX_U_BUF);
    if (buf->buffer == NULL) {
        *err = U_MEMORY_ALLOCATION_ERROR;
        ucbuf_close(buf);
        return NULL;
    }
    buf->currentPos = buf->buffer;
    buf->bufLimit = buf->buffer;

    if (U_FAILURE(*err)) {
        fprintf(stderr, "Could not open codepage [%s]: %s\n", cp, u_errorName(*err));
        ucbuf_close(buf);
        return NULL;
    }

    return ucbuf_fillucbuf(buf, err);
}

/* TODO: this method will fail if at the
 * beginning of buffer and the uchar to unget
 * is from the previous buffer. Need to implement
 * system to take care of that situation.
 *
 * Note that c itself is not stored: the call only steps the read
 * position back by one UChar.
 */
U_CAPI void U_EXPORT2
ucbuf_ungetc(UChar32 c, UCHARBUF* buf) {
    (void)c;
    /* decrement currentPos pointer
     * if not at the beginning of buffer
     */
    if (buf->currentPos != buf->buffer) {
        buf->currentPos--;
    }
}

/* frees the resources of UChar* buffer */
static void ucbuf_closebuf(UCHARBUF* buf) {
    if (buf->buffer != NULL) {
        uprv_free(buf->buffer);
    }
    buf->buffer = NULL;
}

/* Closes the buf and releases its converter and memory.  The underlying
 * stream is left open.  A NULL buf is ignored. */
U_CAPI void U_EXPORT2
ucbuf_close(UCHARBUF* buf) {
    if (buf == NULL) {
        return;
    }
    if (buf->conv) {
        ucnv_close(buf->conv);
    }
    buf->in = NULL;
    buf->currentPos = NULL;
    buf->bufLimit = NULL;
    ucbuf_closebuf(buf);
    uprv_free(buf);
}

/* Rewinds the buf and the file stream so that reading starts over at the
 * first character after the signature (if one was skipped at open time).
 * The buffer is emptied; it is refilled by the next read. */
U_CAPI void U_EXPORT2
ucbuf_rewind(UCHARBUF* buf) {
    if (buf) {
        buf->currentPos = buf->buffer;
        buf->bufLimit = buf->buffer;
        if (buf->conv) {
            ucnv_reset(buf->conv);
        }
        T_FileStream_rewind(buf->in);
        buf->remaining = T_FileStream_size(buf->in);

        /* Skip the same signature bytes ucbuf_open skipped; otherwise the
         * BOM would be delivered as content after a rewind. */
        if (buf->signatureLength > 0) {
            char signature[4];
            int skipped = T_FileStream_read(buf->in, signature, buf->signatureLength);
            if (skipped > 0) {
                buf->remaining -= skipped;
            }
        }
    }
}