2001-05-10 16:54:09 +00:00
|
|
|
/*
|
|
|
|
*******************************************************************************
|
|
|
|
*
|
|
|
|
* Copyright (C) 1998-2001, International Business Machines
|
|
|
|
* Corporation and others. All Rights Reserved.
|
|
|
|
*
|
|
|
|
*******************************************************************************
|
|
|
|
*
|
|
|
|
* File ucbuf.c
|
|
|
|
*
|
|
|
|
* Modification History:
|
|
|
|
*
|
|
|
|
* Date Name Description
|
|
|
|
* 05/10/01 Ram Creation.
|
2001-05-16 01:09:06 +00:00
|
|
|
*
|
|
|
|
* This API reads in files and returns UChars
|
2001-05-10 16:54:09 +00:00
|
|
|
*******************************************************************************
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include "unicode/utypes.h"
|
|
|
|
#include "unicode/ucnv.h"
|
|
|
|
#include "filestrm.h"
|
|
|
|
#include "cmemory.h"
|
|
|
|
#include <stdio.h>
|
|
|
|
|
|
|
|
#ifndef UCBUF_H
|
|
|
|
#define UCBUF_H 1
|
|
|
|
|
|
|
|
typedef struct UCHARBUF UCHARBUF;
|
2002-10-10 01:04:15 +00:00
|
|
|
/**
|
|
|
|
* End of file value
|
|
|
|
*/
|
2001-07-24 23:15:31 +00:00
|
|
|
#define U_EOF 0xFFFFFFFF
|
2002-10-10 01:04:15 +00:00
|
|
|
/**
|
|
|
|
* Error value if a sequence cannot be unescaped
|
|
|
|
*/
|
2001-07-24 23:15:31 +00:00
|
|
|
#define U_ERR 0xFFFFFFFE
|
2002-10-10 01:04:15 +00:00
|
|
|
|
|
|
|
typedef struct ULine ULine;
|
|
|
|
|
|
|
|
/**
 * Pairs a pointer to UChar data with an int32_t length.
 * NOTE(review): presumably describes one line of converted text; this header
 * does not show who fills in the struct or who owns the memory - confirm
 * against the implementation.
 */
struct ULine {
|
|
|
|
UChar *name; /* pointer to the UChar data */
|
|
|
|
int32_t len; /* presumably the number of UChars at name - TODO confirm */
|
|
|
|
};
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Opens a UCHARBUF on the file with the given name, using the given code page for conversion
|
|
|
|
* @param fileName Name of the file to open.
|
|
|
|
* @param codepage The encoding of the file stream to convert to Unicode.
|
|
|
|
* If *codepage is NULL on input the API will try to autodetect
|
|
|
|
* popular Unicode encodings
|
|
|
|
* @param showWarning Flag to print out warnings to STDOUT
|
|
|
|
* @param buffered If TRUE performs a buffered read of the input file. If FALSE reads
|
|
|
|
* the whole file into memory and converts it.
|
|
|
|
* @param err is a pointer to a valid <code>UErrorCode</code> value. If this value
|
|
|
|
* indicates a failure on entry, the function will immediately return.
|
|
|
|
* On exit the value will indicate the success of the operation.
|
|
|
|
* @return pointer to the newly opened UCHARBUF
|
|
|
|
*/
|
|
|
|
U_CAPI UCHARBUF* U_EXPORT2
|
|
|
|
ucbuf_open(const char* fileName,const char** codepage,UBool showWarning, UBool buffered, UErrorCode* err);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Gets a UTF-16 code unit at the current position from the converted buffer
|
|
|
|
* and increments the current position
|
|
|
|
* @param buf Pointer to UCHARBUF structure
|
|
|
|
* @param err is a pointer to a valid <code>UErrorCode</code> value. If this value
|
|
|
|
* indicates a failure on entry, the function will immediately return.
|
|
|
|
* On exit the value will indicate the success of the operation.
|
|
|
|
*/
|
|
|
|
U_CAPI int32_t U_EXPORT2
|
2001-05-10 21:43:01 +00:00
|
|
|
ucbuf_getc(UCHARBUF* buf,UErrorCode* err);
|
|
|
|
|
2002-10-10 01:04:15 +00:00
|
|
|
/**
|
|
|
|
* Gets a UTF-32 code point at the current position from the converted buffer
|
|
|
|
* and increments the current position
|
|
|
|
* @param buf Pointer to UCHARBUF structure
|
|
|
|
* @param err is a pointer to a valid <code>UErrorCode</code> value. If this value
|
|
|
|
* indicates a failure on entry, the function will immediately return.
|
|
|
|
* On exit the value will indicate the success of the operation.
|
|
|
|
*/
|
|
|
|
U_CAPI int32_t U_EXPORT2
|
|
|
|
ucbuf_getc32(UCHARBUF* buf,UErrorCode* err);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Gets a UTF-16 code unit at the current position from the converted buffer after
|
|
|
|
* unescaping and increments the current position. If the escape sequence is for UTF-32
|
|
|
|
* code point (\\Uxxxxxxxx) then a UTF-32 codepoint is returned
|
|
|
|
* @param buf Pointer to UCHARBUF structure
|
|
|
|
* @param err is a pointer to a valid <code>UErrorCode</code> value. If this value
|
|
|
|
* indicates a failure on entry, the function will immediately return.
|
|
|
|
* On exit the value will indicate the success of the operation.
|
|
|
|
*/
|
|
|
|
U_CAPI int32_t U_EXPORT2
|
|
|
|
ucbuf_getcx32(UCHARBUF* buf,UErrorCode* err);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Gets a pointer to the current position in the internal buffer and length of the line.
|
|
|
|
* It is imperative to make a copy of the returned buffer before performing operations on it.
|
|
|
|
* @param buf Pointer to UCHARBUF structure
|
|
|
|
* @param len Output param to receive the length of the returned buffer, up to the end of the line
|
|
|
|
* @param err is a pointer to a valid <code>UErrorCode</code> value. If this value
|
|
|
|
* indicates a failure on entry, the function will immediately return.
|
|
|
|
* On exit the value will indicate the success of the operation.
|
|
|
|
* Error: U_TRUNCATED_CHAR_FOUND
|
|
|
|
* @return Pointer to the internal buffer
|
|
|
|
*/
|
|
|
|
U_CAPI const UChar* U_EXPORT2
|
|
|
|
ucbuf_readline(UCHARBUF* buf,int32_t* len, UErrorCode* err);
|
|
|
|
|
2001-05-10 21:43:01 +00:00
|
|
|
|
2002-10-10 01:04:15 +00:00
|
|
|
/**
|
|
|
|
* Resets the buffers and the underlying file stream.
|
|
|
|
* @param buf Pointer to UCHARBUF structure
|
|
|
|
* @param err is a pointer to a valid <code>UErrorCode</code> value. If this value
|
|
|
|
* indicates a failure on entry, the function will immediately return.
|
|
|
|
* On exit the value will indicate the success of the operation.
|
|
|
|
*/
|
2001-05-16 01:09:06 +00:00
|
|
|
U_CAPI void U_EXPORT2
|
2002-10-10 01:04:15 +00:00
|
|
|
ucbuf_rewind(UCHARBUF* buf,UErrorCode* err);
|
2001-05-10 21:43:01 +00:00
|
|
|
|
2002-10-10 01:04:15 +00:00
|
|
|
/**
|
|
|
|
* Returns a pointer to the internal converted buffer
|
|
|
|
* @param buf Pointer to UCHARBUF structure
|
|
|
|
* @param len Pointer to int32_t to receive the length of the buffer
|
|
|
|
* @param err is a pointer to a valid <code>UErrorCode</code> value. If this value
|
|
|
|
* indicates a failure on entry, the function will immediately return.
|
|
|
|
* On exit the value will indicate the success of the operation.
|
|
|
|
* @return Pointer to internal UChar buffer
|
|
|
|
*/
|
|
|
|
U_CAPI const UChar* U_EXPORT2
|
|
|
|
ucbuf_getBuffer(UCHARBUF* buf,int32_t* len,UErrorCode* err);
|
2001-05-10 21:43:01 +00:00
|
|
|
|
2002-10-10 01:04:15 +00:00
|
|
|
/**
|
|
|
|
* Closes the UCHARBUF structure members and cleans up the malloc'ed memory
|
|
|
|
* @param buf Pointer to UCHARBUF structure
|
|
|
|
*/
|
2001-05-16 01:09:06 +00:00
|
|
|
U_CAPI void U_EXPORT2
|
2001-05-10 21:43:01 +00:00
|
|
|
ucbuf_close(UCHARBUF* buf);
|
|
|
|
|
2002-10-10 01:04:15 +00:00
|
|
|
/**
|
|
|
|
* Rewinds the buffer by one codepoint
|
|
|
|
*/
|
2001-05-16 01:09:06 +00:00
|
|
|
U_CAPI void U_EXPORT2
|
2002-10-10 01:04:15 +00:00
|
|
|
ucbuf_ungetc(int32_t ungetChar,UCHARBUF* buf);
|
|
|
|
|
2001-05-10 21:43:01 +00:00
|
|
|
|
2002-10-10 01:04:15 +00:00
|
|
|
/**
|
2002-11-08 01:28:14 +00:00
|
|
|
* Autodetects the encoding of the file stream. Only Unicode charsets are autodetected.
|
|
|
|
* Some Unicode charsets are stateful and need byte identifiers to be converted also to bring
|
2002-10-10 01:04:15 +00:00
|
|
|
* the converter to correct state for converting the rest of the stream. So the UConverter parameter
|
|
|
|
* is necessary.
|
2002-11-08 01:28:14 +00:00
|
|
|
* If the charset was autodetected, the caller must close both the input FileStream
|
|
|
|
* and the converter.
|
|
|
|
*
|
2002-10-10 01:04:15 +00:00
|
|
|
* @param fileName The name of the file to be opened and whose encoding is to be autodetected
|
2002-11-08 01:28:14 +00:00
|
|
|
* @param conv Output param to receive the opened converter if autodetected; NULL otherwise.
|
2002-10-10 01:04:15 +00:00
|
|
|
* @param cp Output param to receive the detected encoding
|
|
|
|
* @param error is a pointer to a valid <code>UErrorCode</code> value. If this value
|
|
|
|
* indicates a failure on entry, the function will immediately return.
|
|
|
|
* On exit the value will indicate the success of the operation.
|
2002-11-08 01:28:14 +00:00
|
|
|
* @return The input FileStream if its charset was autodetected; NULL otherwise.
|
2002-10-10 01:04:15 +00:00
|
|
|
*/
|
2002-11-08 01:28:14 +00:00
|
|
|
U_CAPI FileStream * U_EXPORT2
|
2002-10-10 01:04:15 +00:00
|
|
|
ucbuf_autodetect(const char* fileName, const char** cp,UConverter** conv, int32_t* signatureLength,UErrorCode* error);
|
|
|
|
|
|
|
|
/**
|
2002-11-08 01:28:14 +00:00
|
|
|
* Autodetects the encoding of the file stream. Only Unicode charsets are autodetected.
|
|
|
|
* Some Unicode charsets are stateful and need byte identifiers to be converted also to bring
|
2002-10-10 01:04:15 +00:00
|
|
|
* the converter to correct state for converting the rest of the stream. So the UConverter parameter
|
|
|
|
* is necessary.
|
2002-11-08 01:28:14 +00:00
|
|
|
* If the charset was autodetected, the caller must close the converter.
|
|
|
|
*
|
2002-10-10 01:04:15 +00:00
|
|
|
* @param in The file stream whose encoding is to be detected
|
2002-11-08 01:28:14 +00:00
|
|
|
* @param conv Output param to receive the opened converter if autodetected; NULL otherwise.
|
2002-10-10 01:04:15 +00:00
|
|
|
* @param cp Output param to receive the detected encoding
|
|
|
|
* @param error is a pointer to a valid <code>UErrorCode</code> value. If this value
|
|
|
|
* indicates a failure on entry, the function will immediately return.
|
|
|
|
* On exit the value will indicate the success of the operation.
|
2002-11-08 01:28:14 +00:00
|
|
|
* @return Boolean whether the Unicode charset was autodetected.
|
2002-10-10 01:04:15 +00:00
|
|
|
*/
|
|
|
|
|
|
|
|
U_CAPI UBool U_EXPORT2
|
|
|
|
ucbuf_autodetect_fs(FileStream* in, const char** cp, UConverter** conv, int32_t* signatureLength, UErrorCode* error);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Returns the approximate size in UChars required for converting the file to UChars
|
|
|
|
*/
|
|
|
|
U_CAPI int32_t U_EXPORT2
|
|
|
|
ucbuf_size(UCHARBUF* buf);
|
|
|
|
|
|
|
|
/**
 * Declaration of ucbuf_resolveFileName; the implementation is not part of this header.
 * NOTE(review): everything below marked "presumably" is inferred from the
 * parameter names only and must be confirmed against the implementation.
 * @param inputDir Directory argument; presumably the directory fileName is resolved against.
 * @param fileName File name argument.
 * @param target   Caller-supplied char buffer; presumably receives the resolved name.
 * @param len      Pointer to an int32_t; presumably the capacity of target on input
 *                 and/or the resulting length on output - TODO confirm.
 * @param status   Pointer to a UErrorCode value.
 * @return A const char pointer; presumably the resolved file name - TODO confirm.
 */
U_CAPI const char* U_EXPORT2
|
|
|
|
ucbuf_resolveFileName(const char* inputDir, const char* fileName, char* target, int32_t* len, UErrorCode* status);
|
2001-05-10 16:54:09 +00:00
|
|
|
|
|
|
|
#endif
|