package com.ibm.text.utility;

import java.io.*;

/**
 * Very dumb XML parser, designed for restricted environments where the transmitter is
 * guaranteed to limit the types of XML files generated.
 *
 * <p>RESTRICTIONS
 * <ul>
 * <li>Requires the document to be well-formed. Does not properly signal errors if it is not:
 *     any I/O or parsing failure simply ends the token stream with DONE.</li>
 * <li>No DTDs, !DOCTYPE, !ATTLIST, !ELEMENT, ![, !NOTATION, !ENTITY, CDATA.</li>
 * <li>No processing instructions.</li>
 * <li>Does handle character references and lt, gt, amp, apos, quot.</li>
 * <li>Element and attribute names end at the first syntax character; this includes '-',
 *     so names containing '-' are not supported.</li>
 * <li>The encoding is specified by the user, by supplying the right Reader.</li>
 * <li>On creation, you supply a buffer for the textual elements. Use a buffer that is as
 *     large as the largest possible piece of text (e.g. attribute value or element text)
 *     in the file; the parser does not grow it.</li>
 * </ul>
 *
 * @author Mark Davis
 */
public final class XMLParse implements XMLParseTypes {

    /**
     * Create a parser that reads from an existing stream.
     *
     * @param stream source of the XML text, already decoding the right encoding
     * @param buffer caller-owned storage for the text of each item
     */
    public XMLParse(Reader stream, char[] buffer) {
        this.stream = stream;
        this.buffer = buffer;
    }

    /**
     * Create a parser that reads from a file through a 32K buffered FileReader.
     * Call {@link #close()} when done to release the file.
     *
     * @param fileName name of the file to open
     * @param buffer caller-owned storage for the text of each item
     * @throws FileNotFoundException if the file cannot be opened
     */
    public XMLParse(String fileName, char[] buffer) throws FileNotFoundException {
        stream = new BufferedReader(new FileReader(fileName), 32 * 1024);
        this.buffer = buffer;
    }

    /**
     * Close the underlying stream. Needed in particular when the parser was created
     * from a file name, since the parser then owns the only reference to the reader.
     *
     * @throws IOException if closing the stream fails
     */
    public void close() throws IOException {
        stream.close();
    }

    /**
     * Get the textual value associated with this item.
     * Only valid for ELEMENT_TAG*, ATTRIBUTE*, TEXT.
     */
    public String getValue() {
        return String.valueOf(buffer, 0, bufferCount);
    }

    /**
     * Get length of the textual value associated with this item.
     * Only valid for ELEMENT_TAG*, ATTRIBUTE*, TEXT.
     */
    public int getValueCount() {
        return bufferCount;
    }

    /** Get the buffer that was passed in on creation. */
    public char[] getValueArray() {
        return buffer;
    }

    /** Get the "kind" of the last item returned by next() (see XMLParseTypes). */
    public int getKind() {
        return kind;
    }

    /**
     * Get the next item, returning its "kind" (see XMLParseTypes).
     *
     * <p>Characters are read one at a time, classified, and fed to a small state machine.
     * When an item is complete its text is in the value buffer and its kind is returned.
     *
     * @return the kind of the item found, or DONE at end of input or after any read or
     *         parse failure (errors are deliberately not signalled)
     */
    public byte next() {
        char c = '\u0000';
        char type = c;
        while (c != END_OF_INPUT) {
            try {
                // First read the character. If there is a pushed-back char, use it instead.
                if (bufferChar != 0) {
                    c = bufferChar;
                    bufferChar = 0;
                } else {
                    c = (char) stream.read(); // end of stream (-1) becomes END_OF_INPUT
                }

                // Now set the right type. Since we assume validity, anything but the
                // syntax chars can be classed as IDENTIFIER.
                switch (c) {
                  case ' ': case '\r': case '\n': case '\t':
                    type = ' ';
                    break;
                  case '<': case '>': case '#': case ';': case '/': case '\'': case '"':
                  case '=': case '?': case '!': case '-':
                    type = c;
                    break;
                  case '&': {
                    // Character reference: numeric, or lt, gt, quot, amp, apos.
                    int x = parseCR(crBuffer, readReferenceBody());
                    c = (char) x;
                    if (x > 0xFFFF) {
                        // Supplementary: deliver the lead surrogate now and push back
                        // the trail surrogate for the next iteration.
                        x -= 0x10000;
                        c = (char) (0xD800 + (x >> 10));
                        bufferChar = (char) (0xDC00 + (x & 0x3FF));
                    }
                    // Since we assume validity, referenced chars are never syntax chars.
                    type = IDENTIFIER;
                    break;
                  }
                  default:
                    type = IDENTIFIER; // everything else
                    break;
                }
            } catch (Exception ignored) {
                // Deliberate: this parser does not signal errors. Any I/O failure or
                // malformed character reference is treated as end of input.
                c = END_OF_INPUT;
            }

            if (SHOW) System.out.println(c + ", " + type + ", " + stateNames[state]);

            // Do not feed the end marker to the state machine: it is not document text,
            // and after an exception 'type' is stale from the previous character.
            if (c == END_OF_INPUT) break;

            // We now have a character. Throw it at our little state machine.
            switch (state) {
              case IN_TEXT:
                if (type == '<') {
                    state = START_ELEMENT;
                    if (bufferCount != 0) {
                        kind = TEXT;
                        return kind;
                    }
                    break;
                }
                buffer[bufferCount++] = c;
                break;

              case START_ELEMENT:
                // must be either '/', '!', '?' or the first char of a name
                bufferCount = 0;
                switch (type) {
                  case '/':
                    elementType = ELEMENT_TAG_SLASH;
                    state = IN_ELEMENT;
                    break;
                  case '!':
                    buffer[bufferCount++] = c;
                    elementType = ELEMENT_TAG_COMMENT;
                    state = IN_COMMENT;
                    break;
                  case '?':
                    elementType = ELEMENT_TAG_QUESTION;
                    state = IN_ELEMENT;
                    break;
                  default:
                    elementType = ELEMENT_TAG;
                    buffer[bufferCount++] = c;
                    state = IN_ELEMENT;
                    break;
                }
                break;

              case IN_COMMENT: // no pending '-'
                buffer[bufferCount++] = c;
                if (type == '-') state = IN_COMMENT2;
                else state = IN_COMMENT;
                break;

              case IN_COMMENT2: // seen one '-'
                buffer[bufferCount++] = c;
                if (type == '-') state = IN_COMMENT3;
                else state = IN_COMMENT;
                break;

              case IN_COMMENT3: // seen "--"; a '>' now ends the comment
                if (type == '>') {
                    kind = ELEMENT_TAG_COMMENT;
                    bufferChar = c; // re-read '>' so the next call returns the end item
                    state = IN_ATTRIBUTES;
                    elementType = END_ELEMENT_COMMENT;
                    return kind;
                } else if (type != '-') {
                    state = IN_COMMENT;
                }
                buffer[bufferCount++] = c;
                break;

              case IN_ELEMENT:
                if (type != IDENTIFIER) {
                    // Name finished; report the tag and re-read the terminator later.
                    state = IN_ATTRIBUTES;
                    kind = elementType;
                    elementType = END_ELEMENT;
                    bufferChar = c;
                    return kind;
                }
                buffer[bufferCount++] = c;
                break;

              case IN_ATTRIBUTES:
                bufferCount = 0;
                if (type == '/') {
                    elementType = END_ELEMENT_SLASH;
                } else if (type == '?') {
                    elementType = END_ELEMENT_QUESTION;
                } else if (type == '>') {
                    state = IN_TEXT;
                    kind = elementType;
                    return kind;
                } else if (type == IDENTIFIER) {
                    state = IN_ATTR;
                    buffer[bufferCount++] = c;
                    break;
                }
                break;

              case IN_ATTR:
                if (type != IDENTIFIER) {
                    state = START_VALUE;
                    kind = ATTRIBUTE_TAG;
                    return kind;
                }
                buffer[bufferCount++] = c;
                break;

              case START_VALUE:
                // must have * = ( ' | " ); everything before the quote is skipped
                if (type == '\'' || type == '"') {
                    lastQuote = c;
                    state = IN_VALUE;
                    bufferCount = 0;
                }
                break;

              case IN_VALUE:
                // only terminated by lastQuote; a quote written as a character
                // reference has type IDENTIFIER and so does not terminate
                if (type == lastQuote) {
                    state = IN_ATTRIBUTES;
                    kind = ATTRIBUTE_VALUE;
                    return kind;
                }
                buffer[bufferCount++] = c;
                break;
            }
        }
        return DONE;
    }

    /**
     * Reads the body of a character reference (the part between '&' and ';') into
     * crBuffer. The '&' must already have been consumed; the ';' is consumed here.
     *
     * @return number of chars stored in crBuffer
     * @throws IOException if the stream ends before ';' or the body does not fit crBuffer
     */
    private int readReferenceBody() throws IOException {
        int count = 0;
        for (int ch = stream.read(); ch != ';'; ch = stream.read()) {
            if (ch < 0) {
                throw new EOFException("unterminated character reference");
            }
            if (count == crBuffer.length) {
                throw new IOException("character reference too long");
            }
            crBuffer[count++] = (char) ch;
        }
        return count;
    }

    /**
     * Utility for doing XML quotes of a single code point, with no flags set.
     *
     * @param c code point to quote
     * @return the quoted form, or the character itself if it needs no quoting
     */
    public static String quote(int c) {
        return quote(c, 0);
    }

    /**
     * Utility for doing XML quotes of a single code point. Flags control which
     * characters are handled and how (see XMLParseTypes for values).
     *
     * @param c code point to quote
     * @param flags combination of the QUOTE_* flags
     * @return the quoted form, or the character itself if it needs no quoting
     */
    public static String quote(int c, int flags) {
        String result = quoteGuts(c, flags);
        if (result != null) return result;
        if (c > 0xFFFF && c <= 0x10FFFF) {
            // Supplementary code point: a plain (char) cast would truncate it,
            // so return the surrogate pair instead.
            int offset = c - 0x10000;
            char[] pair = {
                (char) (0xD800 + (offset >> 10)),
                (char) (0xDC00 + (offset & 0x3FF))
            };
            return new String(pair);
        }
        return String.valueOf((char) c);
    }

    /**
     * Utility for doing XML quotes of a whole string, with no flags set.
     *
     * @param source text to quote
     * @return the text with every character that needs it replaced by its quoted form
     */
    public static String quote(String source) {
        return quote(source, 0);
    }

    /**
     * Utility for doing XML quotes of a whole string. Flags control which characters
     * are handled and how (see XMLParseTypes for values).
     *
     * @param source text to quote
     * @param flags combination of the QUOTE_* flags
     * @return the text with every character that needs it replaced by its quoted form
     */
    public static String quote(String source, int flags) {
        StringBuffer result = new StringBuffer();
        String temp;
        for (int i = 0; i < source.length(); ++i) {
            int c = UTF32.char32At(source, i);
            if (c > 0xFFFF) ++i; // code point took two chars; i is now on the trail one
            temp = quoteGuts(c, flags);
            if (temp != null) result.append(temp);
            else if (c <= 0xFFFF) result.append((char) c);
            else result.append(source.substring(i - 1, i + 1)); // copy both surrogates
        }
        return result.toString();
    }

    /**
     * Parses the inside of a character reference. The buffer should not contain the
     * initial '&amp;' or the final ';'. Named references are recognized by their first
     * letter only ('a' is disambiguated by a following 'm').
     *
     * @param crBuffer chars of the reference body
     * @param crCount number of valid chars in crBuffer
     * @return the code point, or -1 if the body is empty or not recognized
     * @throws NumberFormatException if a numeric reference has no valid digits
     */
    static int parseCR(char[] crBuffer, int crCount) {
        int c;
        int start = 0;
        if (crCount == 0) return -1;
        switch (crBuffer[start++]) {
          case 'l': c = '<'; break; // lt
          case 'g': c = '>'; break; // gt
          case 'q': c = '"'; break; // quot
          case 'a': // amp or apos
            if (crCount > start && crBuffer[start] == 'm') c = '&';
            else c = '\'';
            break;
          case '#':
            int radix = 10;
            if (crCount > start && crBuffer[start] == 'x') {
                radix = 16;
                ++start;
            }
            // Simple code for now. Could be sped up.
            c = Integer.parseInt(String.valueOf(crBuffer, start, crCount - start), radix);
            break;
          default:
            c = -1;
        }
        return c;
    }

    /**
     * Utility for doing hex, padding with zeros on the left.
     *
     * @param i value to format
     * @param places minimum number of characters in the result; any width is accepted
     * @return upper-case hexadecimal form of i, at least places characters long
     */
    static public String hex(long i, int places) {
        String result = Long.toString(i, 16).toUpperCase();
        int missing = places - result.length();
        if (missing <= 0) return result;
        StringBuffer padded = new StringBuffer(places);
        while (missing-- > 0) padded.append('0');
        return padded.append(result).toString();
    }

    // =================== PRIVATES =================================

    /** Set to true to trace every character, its type and the current state. */
    private static final boolean SHOW = false;

    /** What (char) Reader.read() yields at end of stream; also used for any failure. */
    private static final char END_OF_INPUT = '\uFFFF';

    private final char[] buffer;                 // caller-supplied text storage
    private int bufferCount;                     // number of valid chars in buffer
    private byte kind = TEXT;                    // kind of the last item returned
    private final Reader stream;
    private final char[] crBuffer = new char[10]; // body of the current character reference
    private int state = IN_TEXT;
    private byte elementType;                    // kind to report when the current tag part ends
    private char lastQuote;                      // quote char that opened the current value
    private char bufferChar;                     // one-char pushback; 0 means empty

    private static final byte IN_TEXT = 0, START_ELEMENT = 1, IN_ELEMENT = 2, IN_ATTR = 3,
        START_VALUE = 4, IN_VALUE = 5, IN_ATTRIBUTES = 6,
        IN_COMMENT = 7, IN_COMMENT2 = 8, IN_COMMENT3 = 9;

    /** Names of the states, indexed by state value; used for SHOW tracing only. */
    private static final String[] stateNames = {"IN_TEXT", "START_ELEMENT", "IN_ELEMENT",
        "IN_ATTR", "START_VALUE", "IN_VALUE", "IN_ATTRIBUTES",
        "IN_COMMENT", "IN_COMMENT2", "IN_COMMENT3"};

    /** Type assigned to every character that is not whitespace or a syntax character. */
    private static final char IDENTIFIER = 'a';

    /**
     * Core of the quote utilities.
     *
     * @param c code point to quote
     * @param flags combination of the QUOTE_* flags
     * @return the quoted form of c, or null if c should be emitted unchanged
     */
    private static String quoteGuts(int c, int flags) {
        String prefix = "&";
        switch (c) {
          // The five characters with predefined XML entities are always quoted.
          case '<': return "&lt;";
          case '>': return "&gt;";
          case '&': return "&amp;";
          case '\'': return "&apos;";
          case '"': return "&quot;";

          // Optionally fix TAB, CR, LF
          case 0x09: case 0x0A: case 0x0D:
            if ((flags & QUOTE_TABCRLF) == 0) return null;
            break;

          // Fix controls, non-characters, since XML can't handle.
          // NOTE(review): the prefix is empty here, so the result ("#x1;") is not a real
          // character reference - presumably intentional, confirm against callers.
          case 0x00: case 0x01: case 0x02: case 0x03: case 0x04: case 0x05: case 0x06: case 0x07:
          case 0x08: case 0x0B: case 0x0C: case 0x0E: case 0x0F:
          case 0x10: case 0x11: case 0x12: case 0x13: case 0x14: case 0x15: case 0x16: case 0x17:
          case 0x18: case 0x19: case 0x1A: case 0x1B: case 0x1C: case 0x1D: case 0x1E: case 0x1F:
          case 0x7F: case 0xFFFE: case 0xFFFF:
            prefix = "";
            break;

          // Optionally fix IE Bug characters
          case 0xFF00: case 0xFF01: case 0xFF02: case 0xFF03: case 0xFF04: case 0xFF05:
          case 0xFF06: case 0xFF07:
          case 0xFFF8: case 0xFFF9: case 0xFFFA: case 0xFFFB: case 0xFFFC: case 0xFFFD:
            if ((flags & QUOTE_IEBUG) == 0) return null;
            prefix = "";
            break;

          default:
            if (c <= 0x7E) { // don't quote other ASCII
                if ((flags & QUOTE_ASCII) == 0) return null;
            } else if (0xD800 <= c && c <= 0xDFFF) { // fix surrogates, since XML can't handle
                prefix = "";
            } else if (c > 0xFFFF && (flags & QUOTE_IEBUG) != 0) {
                prefix = "";
            } else if ((flags & QUOTE_NON_ASCII) == 0) {
                return null;
            }
            break;
        }
        // Numeric form: hexadecimal unless QUOTE_DECIMAL is set.
        if ((flags & QUOTE_DECIMAL) == 0) {
            return prefix + "#x" + hex(c, 1) + ";";
        } else {
            return prefix + "#" + Integer.toString(c) + ";";
        }
    }
}