4c3e3b8dff
X-SVN-Rev: 5642
416 lines
15 KiB
Java
416 lines
15 KiB
Java
/**
 *******************************************************************************
 * Copyright (C) 1996-2001, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/XMLParse.java,v $
 * $Date: 2001/08/31 00:19:16 $
 * $Revision: 1.2 $
 *
 *******************************************************************************
 */
|
|
|
|
package com.ibm.text.utility;
|
|
|
|
/**
 * Very dumb XML parser, designed for a restricted environment where the transmitter is
 * guaranteed to limit the types of XML files generated.
 *
 * RESTRICTIONS
 * Requires the document to be well-formed. Doesn't properly signal errors if it is not.
 * No DTDs, !DOCTYPE, !ATTLIST, !ELEMENT, ![, !NOTATION, !ENTITY, CDATA
 * No processing instructions
 * Does do character references, lt, gt, amp, apos, quot
 * The encoding is specified by the user, by using the right Reader
 * On creation, you supply a buffer for the textual elements. Use a buffer that is as large
 * as the largest possible piece of text (e.g. attribute value or element text) in the file.
 *
 * @author Mark Davis
 */
|
|
import java.io.*;
|
|
|
|
public final class XMLParse implements XMLParseTypes {
|
|
|
|
/** Create a parser.
|
|
*/
|
|
public XMLParse(Reader stream, char[] buffer) {
|
|
this.stream = stream;
|
|
this.buffer = buffer;
|
|
}
|
|
|
|
/** Create a parser.
|
|
*/
|
|
public XMLParse(String fileName, char[] buffer) throws FileNotFoundException {
|
|
stream = new BufferedReader(new FileReader(fileName),32*1024);
|
|
this.buffer = buffer;
|
|
}
|
|
|
|
/** Get the textual value associated with this item.
|
|
* Only valid for ELEMENT_TAG*, ATTRIBUTE*, TEXT.
|
|
*/
|
|
public String getValue() {
|
|
return String.valueOf(buffer, 0, bufferCount);
|
|
}
|
|
|
|
/** Get length of the textual value associated with this item.
|
|
* Only valid for ELEMENT_TAG*, ATTRIBUTE*, TEXT.
|
|
*/
|
|
public int getValueCount() {
|
|
return bufferCount;
|
|
}
|
|
|
|
/** Get the buffer that was passed in on creation.
|
|
*/
|
|
public char[] getValueArray() {
|
|
return buffer;
|
|
}
|
|
|
|
/** Get the "kind" of the last item (see XMLParseTypes)
|
|
*/
|
|
public int getKind() {
|
|
return kind;
|
|
}
|
|
|
|
/** Get the next element, returning a "Kind" (see XMLParseTypes)
|
|
*/
|
|
|
|
public byte next() {
|
|
|
|
char c = '\u0000';
|
|
char type = c;
|
|
|
|
while (c != 0xFFFF) {
|
|
try {
|
|
|
|
// First read the character. If there is a buffered char, use it instead
|
|
|
|
if (bufferChar != 0) {
|
|
c = bufferChar;
|
|
bufferChar = 0;
|
|
} else {
|
|
c = (char) stream.read();
|
|
}
|
|
|
|
// Now set the right type. Since we assume validity, anything but the syntax chars
|
|
// can be classed as IDENTIFIER
|
|
|
|
switch (c) {
|
|
case ' ': case '\r': case '\n': case '\t':
|
|
type = ' ';
|
|
break;
|
|
case '<': case '>': case '#': case ';': case '/': case '\'': case '"':
|
|
case '=': case '?': case '!': case '-':
|
|
type = c;
|
|
break;
|
|
case '&': // CR, either numerical or lt, gt, quot, amp, apos
|
|
|
|
// gather characters
|
|
|
|
int crCount = 0;
|
|
while (true) {
|
|
c = (char) stream.read();
|
|
if (c == ';') break;
|
|
crBuffer[crCount++] = c;
|
|
}
|
|
|
|
// parse it, and break into two pieces if necessary
|
|
|
|
int x = parseCR(crBuffer, crCount);
|
|
c = (char)x;
|
|
if (x > 0xFFFF) { // Supplementary
|
|
x -= 0x10000;
|
|
c = (char) (0xD800 + (x >> 10));
|
|
bufferChar = (char) (0xDC00 + (x & 0x3FF));
|
|
}
|
|
|
|
// Since we assume validity, any CRs are not syntax characters
|
|
|
|
type = IDENTIFIER; // everything else
|
|
break;
|
|
default:
|
|
type = IDENTIFIER; // everything else
|
|
break;
|
|
}
|
|
} catch (Exception e) {
|
|
c = '\uFFFF';
|
|
}
|
|
|
|
// We now have a character. Throw it at our little state machine
|
|
|
|
if (SHOW) System.out.println(c + ", " + type + ", " + stateNames[state]);
|
|
switch (state) {
|
|
case IN_TEXT:
|
|
if (type == '<') {
|
|
state = START_ELEMENT;
|
|
if (bufferCount != 0) {
|
|
kind = TEXT;
|
|
return kind;
|
|
}
|
|
break;
|
|
}
|
|
buffer[bufferCount++] = c;
|
|
break;
|
|
case START_ELEMENT: // must be either '/' or more than one ID char
|
|
bufferCount = 0;
|
|
switch (type) {
|
|
case '/':
|
|
elementType = ELEMENT_TAG_SLASH;
|
|
state = IN_ELEMENT;
|
|
break;
|
|
case '!':
|
|
buffer[bufferCount++] = c;
|
|
elementType = ELEMENT_TAG_COMMENT;
|
|
state = IN_COMMENT;
|
|
break;
|
|
case '?':
|
|
elementType = ELEMENT_TAG_QUESTION;
|
|
state = IN_ELEMENT;
|
|
break;
|
|
default:
|
|
elementType = ELEMENT_TAG;
|
|
buffer[bufferCount++] = c;
|
|
state = IN_ELEMENT;
|
|
break;
|
|
}
|
|
break;
|
|
case IN_COMMENT:
|
|
buffer[bufferCount++] = c;
|
|
if (type == '-') state = IN_COMMENT2;
|
|
else state = IN_COMMENT;
|
|
break;
|
|
case IN_COMMENT2:
|
|
buffer[bufferCount++] = c;
|
|
if (type == '-') state = IN_COMMENT3;
|
|
else state = IN_COMMENT;
|
|
break;
|
|
case IN_COMMENT3:
|
|
if (type == '>') {
|
|
kind = ELEMENT_TAG_COMMENT;
|
|
bufferChar = c;
|
|
state = IN_ATTRIBUTES;
|
|
elementType = END_ELEMENT_COMMENT;
|
|
return kind;
|
|
} else if (type != '-') {
|
|
state = IN_COMMENT;
|
|
}
|
|
buffer[bufferCount++] = c;
|
|
break;
|
|
case IN_ELEMENT:
|
|
if (type != IDENTIFIER) {
|
|
state = IN_ATTRIBUTES;
|
|
kind = elementType;
|
|
elementType = END_ELEMENT;
|
|
bufferChar = c;
|
|
return kind;
|
|
}
|
|
buffer[bufferCount++] = c;
|
|
break;
|
|
case IN_ATTRIBUTES:
|
|
bufferCount = 0;
|
|
if (type == '/') {
|
|
elementType = END_ELEMENT_SLASH;
|
|
} else if (type == '?') {
|
|
elementType = END_ELEMENT_QUESTION;
|
|
} else if (type == '>') {
|
|
state = IN_TEXT;
|
|
kind = elementType;
|
|
return kind;
|
|
} else if (type == IDENTIFIER) {
|
|
state = IN_ATTR;
|
|
buffer[bufferCount++] = c;
|
|
break;
|
|
}
|
|
break;
|
|
case IN_ATTR:
|
|
if (type != IDENTIFIER) {
|
|
state = START_VALUE;
|
|
kind = ATTRIBUTE_TAG;
|
|
return kind;
|
|
}
|
|
buffer[bufferCount++] = c;
|
|
break;
|
|
case START_VALUE: // must have <s>* = ( ' | " )
|
|
if (type == '\'' || type == '"') {
|
|
lastQuote = c;
|
|
state = IN_VALUE;
|
|
bufferCount = 0;
|
|
}
|
|
break;
|
|
case IN_VALUE: // only terminated by lastQuote
|
|
if (type == lastQuote) {
|
|
state = IN_ATTRIBUTES;
|
|
kind = ATTRIBUTE_VALUE;
|
|
return kind;
|
|
}
|
|
buffer[bufferCount++] = c;
|
|
break;
|
|
}
|
|
}
|
|
return DONE;
|
|
}
|
|
|
|
/** Utility for doing XML quotes. Flags control which characters are handled and how.
|
|
* (see XMLParseTypes for values)
|
|
*/
|
|
|
|
public static String quote(int c) {
|
|
return quote(c, 0);
|
|
}
|
|
|
|
/** Utility for doing XML quotes. Flags control which characters are handled and how.
|
|
* (see XMLParseTypes for values)
|
|
*/
|
|
|
|
public static String quote(int c, int flags) {
|
|
String result = quoteGuts(c, flags);
|
|
if (result != null) return result;
|
|
return String.valueOf((char)c);
|
|
}
|
|
|
|
/** Utility for doing XML quotes. Flags control which characters are handled and how.
|
|
* (see XMLParseTypes for values)
|
|
*/
|
|
|
|
public static String quote(String source) {
|
|
return quote(source, 0);
|
|
}
|
|
|
|
/** Utility for doing XML quotes. Flags control which characters are handled and how.
|
|
* (see XMLParseTypes for values)
|
|
*/
|
|
|
|
public static String quote(String source, int flags) {
|
|
StringBuffer result = new StringBuffer();
|
|
String temp;
|
|
for (int i = 0; i < source.length(); ++i) {
|
|
int c = UTF32.char32At(source, i);
|
|
if (c > 0xFFFF) ++i;
|
|
temp = quoteGuts(c, flags);
|
|
if (temp != null) result.append(temp);
|
|
else if (c <= 0xFFFF) result.append((char)c);
|
|
else result.append(source.substring(i-1,i+1)); // surrogates
|
|
}
|
|
return result.toString();
|
|
}
|
|
|
|
/** Parses inside of CR. buffer should not contain the initial '&', or final ';'
|
|
*/
|
|
static int parseCR(char[] crBuffer, int crCount) {
|
|
int c;
|
|
int start = 0;
|
|
if (crCount == 0) return -1;
|
|
switch (crBuffer[start++]) {
|
|
case 'l': c = '<'; break; // lt
|
|
case 'g': c = '>'; break; // gt
|
|
case 'q': c = '"'; break; // quot
|
|
case 'a': // &, '
|
|
if (crCount > start && crBuffer[start] == 'm') c = '&';
|
|
else c = '\'';
|
|
break;
|
|
case '#':
|
|
int radix = 10;
|
|
if (crCount > start && crBuffer[start] == 'x') {
|
|
radix = 16;
|
|
++start;
|
|
}
|
|
// Simple code for now. Could be sped up.
|
|
c = Integer.parseInt(String.valueOf(crBuffer,start,crCount-start), radix);
|
|
break;
|
|
default:
|
|
c = -1;
|
|
}
|
|
return c;
|
|
}
|
|
|
|
/** Utility for doing hex, padding with zeros
|
|
*/
|
|
|
|
static public String hex(long i, int places) {
|
|
String result = Long.toString(i, 16).toUpperCase();
|
|
if (result.length() < places) {
|
|
result = "0000000000000000".substring(result.length(),places) + result;
|
|
}
|
|
return result;
|
|
}
|
|
// =================== PRIVATES =================================
|
|
|
|
private static final char[] buf2 = new char[2];
|
|
|
|
private static final boolean SHOW = false;
|
|
|
|
private char[] buffer;
|
|
private int bufferCount;
|
|
private byte kind = TEXT;
|
|
|
|
private Reader stream;
|
|
private char[] crBuffer = new char[10];
|
|
private int state = IN_TEXT;
|
|
private byte elementType;
|
|
private char lastQuote;
|
|
private char bufferChar;
|
|
|
|
private static final byte IN_TEXT = 0, START_ELEMENT = 1, IN_ELEMENT = 2,
|
|
IN_ATTR = 3, START_VALUE = 4, IN_VALUE = 5, IN_ATTRIBUTES = 6,
|
|
IN_COMMENT = 7, IN_COMMENT2 = 8, IN_COMMENT3 = 9;
|
|
|
|
private static final String[] stateNames = {"IN_TEXT", "START_ELEMENT", "IN_ELEMENT",
|
|
"IN_ATTR", "START_VALUE", "IN_VALUE", "IN_ATTRIBUTES",
|
|
"IN_COMMENT", "IN_COMMENT2", "IN_COMMENT3"};
|
|
|
|
private static final char IDENTIFIER = 'a';
|
|
|
|
|
|
private static String quoteGuts(int c, int flags) {
|
|
String prefix = "&";
|
|
switch (c) {
|
|
case '<': return "<";
|
|
case '>': return ">";
|
|
case '&': return "&";
|
|
case '\'': return "'";
|
|
case '"': return """;
|
|
|
|
// Optionally fix TAB, CR, LF
|
|
|
|
case 0x09: case 0x0A: case 0x0D:
|
|
if ((flags & QUOTE_TABCRLF) == 0) return null;
|
|
break;
|
|
|
|
// Fix controls, non-characters, since XML can't handle
|
|
|
|
case 0x00: case 0x01: case 0x02: case 0x03: case 0x04: case 0x05: case 0x06: case 0x07:
|
|
case 0x08: case 0x0B: case 0x0C: case 0x0E: case 0x0F:
|
|
case 0x10: case 0x11: case 0x12: case 0x13: case 0x14: case 0x15: case 0x16: case 0x17:
|
|
case 0x18: case 0x19: case 0x1A: case 0x1B: case 0x1C: case 0x1D: case 0x1E: case 0x1F:
|
|
case 0x7F:
|
|
case 0xFFFE: case 0xFFFF:
|
|
prefix = "";
|
|
break;
|
|
|
|
// Optionally fix IE Bug characters
|
|
|
|
case 0xFF00: case 0xFF01: case 0xFF02: case 0xFF03: case 0xFF04: case 0xFF05: case 0xFF06: case 0xFF07:
|
|
case 0xFFF8: case 0xFFF9: case 0xFFFA: case 0xFFFB: case 0xFFFC: case 0xFFFD:
|
|
if ((flags & QUOTE_IEBUG) == 0) return null;
|
|
prefix = "";
|
|
break;
|
|
|
|
default:
|
|
if (c <= 0x7E) { // don't quote other ASCII
|
|
if ((flags & QUOTE_ASCII) == 0) return null;
|
|
} else if (0xD800 <= c && c <= 0xDFFF) {// fix surrogates, since XML can't handle
|
|
prefix = "";
|
|
} else if (c > 0xFFFF && (flags & QUOTE_IEBUG) != 0) {
|
|
prefix = "";
|
|
} else if ((flags & QUOTE_NON_ASCII) == 0) {
|
|
return null;
|
|
}
|
|
break;
|
|
}
|
|
if ((flags & QUOTE_DECIMAL) == 0) {
|
|
return prefix + "#x" + hex(c,1) + ";";
|
|
} else {
|
|
return prefix + "#" + Integer.toString(c) + ";";
|
|
}
|
|
}
|
|
} |