ICU-1603 API for Unicode signature detection

X-SVN-Rev: 7402
This commit is contained in:
Ram Viswanadha 2002-01-08 01:05:57 +00:00
parent 8be8e7e5c4
commit 17881f0a1a
3 changed files with 297 additions and 2 deletions

View File

@ -1239,6 +1239,66 @@ ucnv_getInvalidUChars (const UConverter * converter,
}
}
/* Length in bytes of the longest signature recognized below (UTF-32BE/LE). */
#define SIG_MAX_LEN 4

/*
 * Detects a Unicode signature (byte order mark) at the start of a byte
 * sequence without consuming it.
 *
 * source          - bytes to inspect; must not be NULL.
 * sourceLength    - number of bytes in source, or -1 if source is
 *                   NUL-terminated. Values below -1 are rejected.
 * signatureLength - receives the number of signature bytes (2, 3 or 4),
 *                   or 0 if no signature is recognized; must not be NULL.
 * pErrorCode      - in/out error code. If it is NULL or already indicates a
 *                   failure, the function returns NULL and does nothing else.
 *                   Set to U_ILLEGAL_ARGUMENT_ERROR for invalid arguments;
 *                   *signatureLength is not written in that case.
 *
 * Returns "UTF-16BE", "UTF-16LE", "UTF-32LE", "UTF-8", "SCSU" or "UTF-32BE",
 * or NULL if none of these signatures is found.
 */
U_CAPI const char* U_EXPORT2
ucnv_detectUnicodeSignature( const char* source,
                             int32_t sourceLength,
                             int32_t* signatureLength,
                             UErrorCode* pErrorCode){
    /* initial 0xa5 bytes: make sure that if we read <4
     * bytes we don't misdetect something
     */
    char start[SIG_MAX_LEN]={ '\xa5', '\xa5', '\xa5', '\xa5' };
    int32_t i = 0;

    if((pErrorCode==NULL) || U_FAILURE(*pErrorCode)){
        return NULL;
    }
    if(source == NULL || signatureLength == NULL || sourceLength < -1){
        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }

    if(sourceLength==-1){
        /* NUL-terminated input: only the first SIG_MAX_LEN bytes matter, so
         * stop at the terminator or after SIG_MAX_LEN bytes, whichever comes
         * first, instead of measuring the whole string. This also avoids
         * narrowing a size_t string length into an int32_t.
         */
        while(i<SIG_MAX_LEN && source[i]!=0){
            start[i]=source[i];
            i++;
        }
    }else{
        while(i<sourceLength && i<SIG_MAX_LEN){
            start[i]=source[i];
            i++;
        }
    }

    if(start[0] == '\xFE' && start[1] == '\xFF') {
        *signatureLength=2;
        return "UTF-16BE";
    } else if(start[0] == '\xFF' && start[1] == '\xFE') {
        /* FF FE is a prefix of the UTF-32LE signature FF FE 00 00,
         * so the longer signature has to be tested first.
         */
        if(start[2] == '\x00' && start[3] =='\x00'){
            *signatureLength=4;
            return "UTF-32LE";
        } else {
            *signatureLength=2;
            return "UTF-16LE";
        }
    } else if(start[0] == '\xEF' && start[1] == '\xBB' && start[2] == '\xBF') {
        *signatureLength=3;
        return "UTF-8";
    }else if(start[0] == '\x0E' && start[1] == '\xFE' && start[2] == '\xFF'){
        *signatureLength=3;
        return "SCSU";
    }else if(start[0] == '\x00' && start[1] == '\x00' &&
             start[2] == '\xFE' && start[3]=='\xFF'){
        *signatureLength=4;
        return "UTF-32BE";
    }else{
        /* no signature recognized */
        *signatureLength=0;
        return NULL;
    }
}
/*
* Hey, Emacs, please set the following:
*

View File

@ -1108,14 +1108,56 @@ ucnv_isAmbiguous(const UConverter *cnv);
* mapping, FALSE otherwise.
* @stable
*/
U_CAPI void U_EXPORT2 ucnv_setFallback(UConverter *cnv, UBool usesFallback);
U_CAPI void U_EXPORT2
ucnv_setFallback(UConverter *cnv, UBool usesFallback);
/**
* Determines if the converter uses fallback mappings or not.
* @param cnv The converter to query.
* @return TRUE if the converter uses fallback, FALSE otherwise.
* @stable
*/
U_CAPI UBool U_EXPORT2 ucnv_usesFallback(const UConverter *cnv);
U_CAPI UBool U_EXPORT2
ucnv_usesFallback(const UConverter *cnv);
/**
* Detects Unicode signatures in the given byte stream. The signature bytes are not consumed,
* instead the number of bytes that make up the signature is returned. The conversion APIs
* do not discard signature bytes, so if the caller wishes to discard them, the caller should
* explicitly add code to do that after calling this function.
* <p>
* The signatures recognized are those of UTF-16BE (FE FF), UTF-16LE (FF FE),
* UTF-32BE (00 00 FE FF), UTF-32LE (FF FE 00 00), UTF-8 (EF BB BF) and SCSU (0E FE FF).
* <p>
* Note: when sourceLength is -1 the input ends at the first NUL byte. The UTF-32 signatures
* contain NUL bytes, so they cannot be detected in that mode: FF FE 00 00 is reported as
* UTF-16LE and 00 00 FE FF is not detected at all. Pass an explicit length to detect UTF-32.
* <p>
* Usage:
* @code
*     UErrorCode err = U_ZERO_ERROR;
*     char input[] = { '\xEF','\xBB', '\xBF','\x41','\x42','\x43' };
*     char* source = input;
*     int32_t signatureLength = 0;
*     const char* encoding = ucnv_detectUnicodeSignature(source,sizeof(input),&signatureLength,&err);
*     UConverter* conv = NULL;
*     if(encoding!=NULL && U_SUCCESS(err)){
*         // should signature be discarded ?
*         if (discardSignature){
*             source += signatureLength;
*         }
*         conv = ucnv_open(encoding, &err);
*         .... do the conversion ....
*     }
*
* @endcode
*
* @param source The source string in which the signature should be detected.
*               Must not be NULL.
* @param sourceLength Length of the input string, or -1 if NUL-terminated.
*                     Values less than -1 are invalid.
* @param signatureLength A pointer to int32_t to receive the number of bytes that make up the signature
* of the detected UTF. 0 if not detected. Must not be NULL.
* @param pErrorCode A pointer to receive information about any errors that may occur during detection.
* Must be a valid pointer to an error code value, which must not indicate a failure
* before the function call. Set to U_ILLEGAL_ARGUMENT_ERROR if source or
* signatureLength is NULL or if sourceLength is less than -1; signatureLength
* is not written in that case.
* @return The name of the encoding detected. NULL if encoding is not detected.
*/
U_CAPI const char* U_EXPORT2
ucnv_detectUnicodeSignature( const char* source,
int32_t sourceLength,
int32_t* signatureLength,
UErrorCode* pErrorCode);
#endif
/*_UCNV*/

View File

@ -29,6 +29,7 @@ static void TestJitterbug1293(void);
static void TestNewConvertWithBufferSizes(int32_t osize, int32_t isize) ;
static void TestConverterTypesAndStarters(void);
static void TestAmbiguous(void);
static void TestSignatureDetection();
static void TestUTF7(void);
static void TestUTF8(void);
static void TestUTF16BE(void);
@ -186,6 +187,7 @@ void addTestNewConvert(TestNode** root)
addTest(root, &TestOutBufSizes, "tsconv/nucnvtst/TestOutBufSizes");
addTest(root, &TestConverterTypesAndStarters, "tsconv/nucnvtst/TestConverterTypesAndStarters");
addTest(root, &TestAmbiguous, "tsconv/nucnvtst/TestAmbiguous");
addTest(root, &TestSignatureDetection, "tsconv/nucnvtst/TestSignatureDetection");
addTest(root, &TestUTF7, "tsconv/nucnvtst/TestUTF7");
addTest(root, &TestUTF8, "tsconv/nucnvtst/TestUTF8");
addTest(root, &TestUTF16BE, "tsconv/nucnvtst/TestUTF16BE");
@ -1320,6 +1322,197 @@ static void TestAmbiguous()
ucnv_close(ascii_cnv);
}
static void
TestSignatureDetection(){
/* with null terminated strings */
{
char* data[] = {
"\xFE\xFF\x00\x00", /* UTF-16BE */
"\xFF\xFE\x00\x00", /* UTF-16LE */
"\xEF\xBB\xBF\x00", /* UTF-8 */
"\x0E\xFE\xFF\x00", /* SCSU */
"\xFE\xFF", /* UTF-16BE */
"\xFF\xFE", /* UTF-16LE */
"\xEF\xBB\xBF", /* UTF-8 */
"\x0E\xFE\xFF", /* SCSU */
"\xFE\xFF\x41\x42", /* UTF-16BE */
"\xFF\xFE\x41\x41", /* UTF-16LE */
"\xEF\xBB\xBF\x41", /* UTF-8 */
"\x0E\xFE\xFF\x41", /* SCSU */
};
char* expected[] = {
"UTF-16BE",
"UTF-16LE",
"UTF-8",
"SCSU",
"UTF-16BE",
"UTF-16LE",
"UTF-8",
"SCSU",
"UTF-16BE",
"UTF-16LE",
"UTF-8",
"SCSU",
};
int32_t expectedLength[] ={
2,
2,
3,
3,
2,
2,
3,
3,
2,
2,
3,
3,
};
int i=0;
UErrorCode err;
int32_t signatureLength = -1;
char* source = NULL;
const char* enc = NULL;
for( ; i<sizeof(data)/sizeof(char*); i++){
err = U_ZERO_ERROR;
source = data[i];
enc = ucnv_detectUnicodeSignature(source, -1 , &signatureLength, &err);
if(U_FAILURE(err)){
log_err("ucnv_detectUnicodeSignature failed for source : %s at index :%i. Error: %s\n", source,i,u_errorName(err));
continue;
}
if(enc == NULL || strcmp(enc,expected[i]) !=0){
log_err("ucnv_detectUnicodeSignature failed for source : %s at index :%i. Expected: %s. Got: %s\n",source,i,expected[i],enc);
continue;
}
if(signatureLength != expectedLength[i]){
log_err("ucnv_detectUnicodeSignature failed for source : %s at index :%i.Expected Length: %i. Got length: %i\n",source,i,signatureLength,expectedLength[i]);
}
}
}
{
char* data[] = {
"\xFE\xFF\x00", /* UTF-16BE */
"\xFF\xFE\x00", /* UTF-16LE */
"\xEF\xBB\xBF\x00", /* UTF-8 */
"\x0E\xFE\xFF\x00", /* SCSU */
"\x00\x00\xFE\xFF", /* UTF-32BE */
"\xFF\xFE\x00\x00", /* UTF-32LE */
"\xFE\xFF", /* UTF-16BE */
"\xFF\xFE", /* UTF-16LE */
"\xEF\xBB\xBF", /* UTF-8 */
"\x0E\xFE\xFF", /* SCSU */
"\x00\x00\xFE\xFF", /* UTF-32BE */
"\xFF\xFE\x00\x00", /* UTF-32LE */
"\xFE\xFF\x41\x42", /* UTF-16BE */
"\xFF\xFE\x41\x41", /* UTF-16LE */
"\xEF\xBB\xBF\x41", /* UTF-8 */
"\x0E\xFE\xFF\x41", /* SCSU */
"\x00\x00\xFE\xFF\x41", /* UTF-32BE */
"\xFF\xFE\x00\x00\x42", /* UTF-32LE */
"\xFF\x41\x42" /* NULL */
};
int len[] = {
3,
3,
4,
4,
4,
4,
2,
2,
3,
3,
4,
4,
4,
4,
4,
4,
5,
5,
3
};
char* expected[] = {
"UTF-16BE",
"UTF-16LE",
"UTF-8",
"SCSU",
"UTF-32BE",
"UTF-32LE",
"UTF-16BE",
"UTF-16LE",
"UTF-8",
"SCSU",
"UTF-32BE",
"UTF-32LE",
"UTF-16BE",
"UTF-16LE",
"UTF-8",
"SCSU",
"UTF-32BE",
"UTF-32LE",
NULL
};
int32_t expectedLength[] ={
2,
2,
3,
3,
4,
4,
2,
2,
3,
3,
4,
4,
2,
2,
3,
3,
4,
4,
0
};
int i=0;
UErrorCode err;
int32_t signatureLength = -1;
int32_t sourceLength=-1;
char* source = NULL;
const char* enc = NULL;
for( ; i<sizeof(data)/sizeof(char*); i++){
err = U_ZERO_ERROR;
source = data[i];
sourceLength = len[i];
enc = ucnv_detectUnicodeSignature(source, sourceLength , &signatureLength, &err);
if(U_FAILURE(err)){
log_err("ucnv_detectUnicodeSignature test2 failed for source : %s at index :%i. Error: %s\n", source,i,u_errorName(err));
continue;
}
if(enc == NULL || strcmp(enc,expected[i]) !=0){
if(expected[i] !=NULL){
log_err("ucnv_detectUnicodeSignature test2 failed for source : %s at index :%i. Expected: %s. Got: %s\n",source,i,expected[i],enc);
continue;
}
}
if(signatureLength != expectedLength[i]){
log_err("ucnv_detectUnicodeSignature test2 failed for source : %s at index :%i.Expected Length: %i. Got length: %i\n",source,i,signatureLength,expectedLength[i]);
}
}
}
}
void
static TestUTF7() {
/* test input */