ICU-1603 API for Unicode signature detection
X-SVN-Rev: 7402
This commit is contained in:
parent
8be8e7e5c4
commit
17881f0a1a
@ -1239,6 +1239,66 @@ ucnv_getInvalidUChars (const UConverter * converter,
|
||||
}
|
||||
}
|
||||
|
||||
#define SIG_MAX_LEN 4
|
||||
|
||||
U_CAPI const char* U_EXPORT2
|
||||
ucnv_detectUnicodeSignature( const char* source,
|
||||
int32_t sourceLength,
|
||||
int32_t* signatureLength,
|
||||
UErrorCode* pErrorCode){
|
||||
|
||||
/* initial 0xa5 bytes: make sure that if we read <4
|
||||
* bytes we don't misdetect something
|
||||
*/
|
||||
char start[SIG_MAX_LEN]={ '\xa5', '\xa5', '\xa5', '\xa5' };
|
||||
int i = 0;
|
||||
|
||||
if((pErrorCode==NULL) || U_FAILURE(*pErrorCode)){
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if(source == NULL || signatureLength == NULL || sourceLength < -1){
|
||||
*pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if(sourceLength==-1){
|
||||
sourceLength=uprv_strlen(source);
|
||||
}
|
||||
|
||||
|
||||
while(i<sourceLength&& i<SIG_MAX_LEN){
|
||||
start[i]=source[i];
|
||||
i++;
|
||||
}
|
||||
|
||||
if(start[0] == '\xFE' && start[1] == '\xFF') {
|
||||
*signatureLength=2;
|
||||
return "UTF-16BE";
|
||||
} else if(start[0] == '\xFF' && start[1] == '\xFE') {
|
||||
if(start[2] == '\x00' && start[3] =='\x00'){
|
||||
*signatureLength=4;
|
||||
return "UTF-32LE";
|
||||
} else {
|
||||
*signatureLength=2;
|
||||
return "UTF-16LE";
|
||||
}
|
||||
} else if(start[0] == '\xEF' && start[1] == '\xBB' && start[2] == '\xBF') {
|
||||
*signatureLength=3;
|
||||
return "UTF-8";
|
||||
}else if(start[0] == '\x0E' && start[1] == '\xFE' && start[2] == '\xFF'){
|
||||
*signatureLength=3;
|
||||
return "SCSU";
|
||||
}else if(start[0] == '\x00' && start[1] == '\x00' &&
|
||||
start[2] == '\xFE' && start[3]=='\xFF'){
|
||||
*signatureLength=4;
|
||||
return "UTF-32BE";
|
||||
}else{
|
||||
*signatureLength=0;
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Hey, Emacs, please set the following:
|
||||
*
|
||||
|
@ -1108,14 +1108,56 @@ ucnv_isAmbiguous(const UConverter *cnv);
|
||||
* mapping, FALSE otherwise.
|
||||
* @stable
|
||||
*/
|
||||
U_CAPI void U_EXPORT2 ucnv_setFallback(UConverter *cnv, UBool usesFallback);
|
||||
U_CAPI void U_EXPORT2
|
||||
ucnv_setFallback(UConverter *cnv, UBool usesFallback);
|
||||
|
||||
/**
|
||||
* Determines if the converter uses fallback mappings or not.
|
||||
* @return TRUE if the converter uses fallback, FALSE otherwise.
|
||||
* @stable
|
||||
*/
|
||||
U_CAPI UBool U_EXPORT2 ucnv_usesFallback(const UConverter *cnv);
|
||||
U_CAPI UBool U_EXPORT2
|
||||
ucnv_usesFallback(const UConverter *cnv);
|
||||
|
||||
/**
|
||||
* Detects Unicode signatures in the given byte stream. The signature bytes are not consumed,
|
||||
* instead the number of bytes that make up the signature is returned. The conversion APIs
|
||||
* do not discard signature bytes, so if the caller wishes to discard them, the caller should
|
||||
* explicitly add code to do that after calling this function.
|
||||
* <p>
|
||||
* Usage:
|
||||
* @code
|
||||
* UErrorCode err = U_ZERO_ERROR;
|
||||
* char input[] = { '\xEF','\xBB', '\xBF','\x41','\x42','\x43' };
|
||||
* char* source = input;
|
||||
* int32_t signatureLength = 0;
|
||||
* const char* encoding = ucnv_detectUnicodeSignature(source,sizeof(input),&signatureLength,&err);
|
||||
* UConverter* conv = NULL;
|
||||
* if(encoding!=NULL && U_SUCCESS(err)){
|
||||
* // should signature be discarded ?
|
||||
* if (discardSignature){
|
||||
* source += signatureLength;
|
||||
* }
|
||||
* conv = ucnv_open(encoding, &err);
|
||||
* .... do the conversion ....
|
||||
* }
|
||||
*
|
||||
* @endcode
|
||||
*
|
||||
* @param source The source string in which the signature should be detected.
|
||||
* @param sourceLength Length of the input string, or -1 if NUL-terminated.
|
||||
* @param signatureLength A pointer to int32_t to receive the number of bytes that make up the signature
|
||||
* of the detected UTF. 0 if not detected.
|
||||
* @param pErrorCode A pointer to receive information about any errors that may occur during detection.
|
||||
* Must be a valid pointer to an error code value, which must not indicate a failure
|
||||
* before the function call.
|
||||
* @return The name of the encoding detected. NULL if encoding is not detected.
|
||||
*/
|
||||
U_CAPI const char* U_EXPORT2
|
||||
ucnv_detectUnicodeSignature( const char* source,
|
||||
int32_t sourceLength,
|
||||
int32_t* signatureLength,
|
||||
UErrorCode* pErrorCode);
|
||||
|
||||
#endif
|
||||
/*_UCNV*/
|
||||
|
@ -29,6 +29,7 @@ static void TestJitterbug1293(void);
|
||||
static void TestNewConvertWithBufferSizes(int32_t osize, int32_t isize) ;
|
||||
static void TestConverterTypesAndStarters(void);
|
||||
static void TestAmbiguous(void);
|
||||
static void TestSignatureDetection();
|
||||
static void TestUTF7(void);
|
||||
static void TestUTF8(void);
|
||||
static void TestUTF16BE(void);
|
||||
@ -186,6 +187,7 @@ void addTestNewConvert(TestNode** root)
|
||||
addTest(root, &TestOutBufSizes, "tsconv/nucnvtst/TestOutBufSizes");
|
||||
addTest(root, &TestConverterTypesAndStarters, "tsconv/nucnvtst/TestConverterTypesAndStarters");
|
||||
addTest(root, &TestAmbiguous, "tsconv/nucnvtst/TestAmbiguous");
|
||||
addTest(root, &TestSignatureDetection, "tsconv/nucnvtst/TestSignatureDetection");
|
||||
addTest(root, &TestUTF7, "tsconv/nucnvtst/TestUTF7");
|
||||
addTest(root, &TestUTF8, "tsconv/nucnvtst/TestUTF8");
|
||||
addTest(root, &TestUTF16BE, "tsconv/nucnvtst/TestUTF16BE");
|
||||
@ -1320,6 +1322,197 @@ static void TestAmbiguous()
|
||||
ucnv_close(ascii_cnv);
|
||||
}
|
||||
|
||||
static void
|
||||
TestSignatureDetection(){
|
||||
/* with null terminated strings */
|
||||
{
|
||||
char* data[] = {
|
||||
"\xFE\xFF\x00\x00", /* UTF-16BE */
|
||||
"\xFF\xFE\x00\x00", /* UTF-16LE */
|
||||
"\xEF\xBB\xBF\x00", /* UTF-8 */
|
||||
"\x0E\xFE\xFF\x00", /* SCSU */
|
||||
|
||||
"\xFE\xFF", /* UTF-16BE */
|
||||
"\xFF\xFE", /* UTF-16LE */
|
||||
"\xEF\xBB\xBF", /* UTF-8 */
|
||||
"\x0E\xFE\xFF", /* SCSU */
|
||||
|
||||
"\xFE\xFF\x41\x42", /* UTF-16BE */
|
||||
"\xFF\xFE\x41\x41", /* UTF-16LE */
|
||||
"\xEF\xBB\xBF\x41", /* UTF-8 */
|
||||
"\x0E\xFE\xFF\x41", /* SCSU */
|
||||
|
||||
};
|
||||
char* expected[] = {
|
||||
"UTF-16BE",
|
||||
"UTF-16LE",
|
||||
"UTF-8",
|
||||
"SCSU",
|
||||
|
||||
"UTF-16BE",
|
||||
"UTF-16LE",
|
||||
"UTF-8",
|
||||
"SCSU",
|
||||
|
||||
"UTF-16BE",
|
||||
"UTF-16LE",
|
||||
"UTF-8",
|
||||
"SCSU",
|
||||
|
||||
};
|
||||
int32_t expectedLength[] ={
|
||||
2,
|
||||
2,
|
||||
3,
|
||||
3,
|
||||
|
||||
2,
|
||||
2,
|
||||
3,
|
||||
3,
|
||||
|
||||
2,
|
||||
2,
|
||||
3,
|
||||
3,
|
||||
|
||||
};
|
||||
int i=0;
|
||||
UErrorCode err;
|
||||
int32_t signatureLength = -1;
|
||||
char* source = NULL;
|
||||
const char* enc = NULL;
|
||||
for( ; i<sizeof(data)/sizeof(char*); i++){
|
||||
err = U_ZERO_ERROR;
|
||||
source = data[i];
|
||||
enc = ucnv_detectUnicodeSignature(source, -1 , &signatureLength, &err);
|
||||
if(U_FAILURE(err)){
|
||||
log_err("ucnv_detectUnicodeSignature failed for source : %s at index :%i. Error: %s\n", source,i,u_errorName(err));
|
||||
continue;
|
||||
}
|
||||
if(enc == NULL || strcmp(enc,expected[i]) !=0){
|
||||
log_err("ucnv_detectUnicodeSignature failed for source : %s at index :%i. Expected: %s. Got: %s\n",source,i,expected[i],enc);
|
||||
continue;
|
||||
}
|
||||
if(signatureLength != expectedLength[i]){
|
||||
log_err("ucnv_detectUnicodeSignature failed for source : %s at index :%i.Expected Length: %i. Got length: %i\n",source,i,signatureLength,expectedLength[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
{
|
||||
char* data[] = {
|
||||
"\xFE\xFF\x00", /* UTF-16BE */
|
||||
"\xFF\xFE\x00", /* UTF-16LE */
|
||||
"\xEF\xBB\xBF\x00", /* UTF-8 */
|
||||
"\x0E\xFE\xFF\x00", /* SCSU */
|
||||
"\x00\x00\xFE\xFF", /* UTF-32BE */
|
||||
"\xFF\xFE\x00\x00", /* UTF-32LE */
|
||||
"\xFE\xFF", /* UTF-16BE */
|
||||
"\xFF\xFE", /* UTF-16LE */
|
||||
"\xEF\xBB\xBF", /* UTF-8 */
|
||||
"\x0E\xFE\xFF", /* SCSU */
|
||||
"\x00\x00\xFE\xFF", /* UTF-32BE */
|
||||
"\xFF\xFE\x00\x00", /* UTF-32LE */
|
||||
"\xFE\xFF\x41\x42", /* UTF-16BE */
|
||||
"\xFF\xFE\x41\x41", /* UTF-16LE */
|
||||
"\xEF\xBB\xBF\x41", /* UTF-8 */
|
||||
"\x0E\xFE\xFF\x41", /* SCSU */
|
||||
"\x00\x00\xFE\xFF\x41", /* UTF-32BE */
|
||||
"\xFF\xFE\x00\x00\x42", /* UTF-32LE */
|
||||
"\xFF\x41\x42" /* NULL */
|
||||
};
|
||||
int len[] = {
|
||||
3,
|
||||
3,
|
||||
4,
|
||||
4,
|
||||
4,
|
||||
4,
|
||||
2,
|
||||
2,
|
||||
3,
|
||||
3,
|
||||
4,
|
||||
4,
|
||||
4,
|
||||
4,
|
||||
4,
|
||||
4,
|
||||
5,
|
||||
5,
|
||||
3
|
||||
};
|
||||
|
||||
char* expected[] = {
|
||||
"UTF-16BE",
|
||||
"UTF-16LE",
|
||||
"UTF-8",
|
||||
"SCSU",
|
||||
"UTF-32BE",
|
||||
"UTF-32LE",
|
||||
"UTF-16BE",
|
||||
"UTF-16LE",
|
||||
"UTF-8",
|
||||
"SCSU",
|
||||
"UTF-32BE",
|
||||
"UTF-32LE",
|
||||
"UTF-16BE",
|
||||
"UTF-16LE",
|
||||
"UTF-8",
|
||||
"SCSU",
|
||||
"UTF-32BE",
|
||||
"UTF-32LE",
|
||||
NULL
|
||||
};
|
||||
int32_t expectedLength[] ={
|
||||
2,
|
||||
2,
|
||||
3,
|
||||
3,
|
||||
4,
|
||||
4,
|
||||
2,
|
||||
2,
|
||||
3,
|
||||
3,
|
||||
4,
|
||||
4,
|
||||
2,
|
||||
2,
|
||||
3,
|
||||
3,
|
||||
4,
|
||||
4,
|
||||
0
|
||||
};
|
||||
int i=0;
|
||||
UErrorCode err;
|
||||
int32_t signatureLength = -1;
|
||||
int32_t sourceLength=-1;
|
||||
char* source = NULL;
|
||||
const char* enc = NULL;
|
||||
for( ; i<sizeof(data)/sizeof(char*); i++){
|
||||
err = U_ZERO_ERROR;
|
||||
source = data[i];
|
||||
sourceLength = len[i];
|
||||
enc = ucnv_detectUnicodeSignature(source, sourceLength , &signatureLength, &err);
|
||||
if(U_FAILURE(err)){
|
||||
log_err("ucnv_detectUnicodeSignature test2 failed for source : %s at index :%i. Error: %s\n", source,i,u_errorName(err));
|
||||
continue;
|
||||
}
|
||||
if(enc == NULL || strcmp(enc,expected[i]) !=0){
|
||||
if(expected[i] !=NULL){
|
||||
log_err("ucnv_detectUnicodeSignature test2 failed for source : %s at index :%i. Expected: %s. Got: %s\n",source,i,expected[i],enc);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if(signatureLength != expectedLength[i]){
|
||||
log_err("ucnv_detectUnicodeSignature test2 failed for source : %s at index :%i.Expected Length: %i. Got length: %i\n",source,i,signatureLength,expectedLength[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
static TestUTF7() {
|
||||
/* test input */
|
||||
|
Loading…
Reference in New Issue
Block a user