scuffed-code/icu4c/source/samples/xml2txt/xml2txt.cpp

900 lines
25 KiB
C++
Raw Normal View History

#include "xml2txt.h"
// Set to true by Check() when it meets a DOCUMENT_TYPE_NODE; ProcessTxtFile()
// exits if it is still false after the check pass.
static bool DTDFLAG = false;
// Path of the text file to write; assigned in main() from CreateTxtName().
static char* gTxtFile;
// Path of the XML file to parse; assigned in main() from CreateFile().
static char* gXmlFile;
// Source directory taken from the --sourcedir option in main().
static const char *sourceDir;
// Destination directory: the --destdir option value or the current working
// directory (see main()).
static const char *destDir;
// NOTE(review): not referenced in the visible code; ProcessTxtFile() passes a
// literal true to setDoNamespaces() instead.
static bool gDoNamespaces = false;
// Passed to DOMParser::setDoSchema() in ProcessTxtFile().
static bool gDoSchema = false;
// Passed to DOMParser::setCreateEntityReferenceNodes() in ProcessTxtFile().
static bool gDoCreate = false;
// Output encoding name; filled in by ProcessTxtFile() from the XML declaration
// (falling back to "UTF-8") and released again before it returns.
static XMLCh* gEncodingName = 0;
// Unrepresentable-character policy handed to the XMLFormatter constructor.
static XMLFormatter::UnRepFlags gUnRepFlags = XMLFormatter::UnRep_CharRef;
// Validation scheme handed to DOMParser::setValidationScheme().
static DOMParser::ValSchemes gValScheme = DOMParser::Val_Auto;
// Formatter created per file in ProcessTxtFile() and used by the
// operator<<(ostream&, DOM_Node&) defined in this file.
static XMLFormatter* gFormatter = 0;
// Indices into options[] below; the order of the two lists must stay in sync
// because main() reads options[HELP], options[SOURCEDIR] and options[DESTDIR].
enum
{
HELP,
SOURCEDIR,
DESTDIR,
};
//#define UOPTION_TXT UOPTION_DEF("txt", 't', UOPT_NO_ARG)
//#define UOPTION_RES UOPTION_DEF("res", 'r', UOPT_NO_ARG)
// Command-line option table handed to u_parseArgs() in main().
UOption options[]={
UOPTION_HELP_H,
UOPTION_SOURCEDIR,
UOPTION_DESTDIR,
};
#ifdef XP_MAC_CONSOLE
#include <console.h>
#endif
// ---------------------------------------------------------------------------
//
//  usage()
//
//  Writes the command-line help text to standard output and flushes it.
//
// ---------------------------------------------------------------------------
void usage()
{
    // Synopsis and general description.
    cout << "\nUsage: XML2TXT [OPTIONS] [FILES]\n\n";
    cout << "This program is used to convert XML files to TXT files.\n";
    cout << "Please refer to the following options. Options are not \n";
    cout << "case sensitive.\n";

    // One line per supported option.
    cout << "Options:\n";
    cout << "\t-s or --sourcedir \t source directory for files followed by path, default is current directory.\n";
    cout << "\t-d or --destdir \t destination directory, followed by the path, default is current directory.\n";
    cout << "\t-h or -? or --help \t this usage text.\n";

    // Closing remark about the output encoding; endl adds the final newline
    // and flushes the stream.
    cout << "\nAttention: \n";
    cout << "\tThe text file's encoding is the same as the source file's.\n";
    cout << endl;
}
int main(int argC, char* argV[])
{
int retval = 0;
const char* arg=NULL;
try
{
XMLPlatformUtils::Initialize();
}
catch(const XMLException& toCatch)
{
cerr << "Error during Xerces-c Initialization.\n"
<< " Exception message:"
<< DOMString(toCatch.getMessage()) << endl;
return 1;
}
#ifdef XP_MAC_CONSOLE
argC = ccommand((char***)&argV);
#endif
argC = u_parseArgs(argC, argV, (int32_t)(sizeof(options)/sizeof(options[0])), options);
if(argC<0) {
cout << "error in command line argument" << argV[-argC] << endl;
}
// Watch for special case help request
if(argC<2 || options[HELP].doesOccur) {
usage();
return argC < 0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
}
if(options[SOURCEDIR].doesOccur) {
sourceDir = options[SOURCEDIR].value;
}
else {
#ifdef WIN32
destDir = _getcwd(NULL, 0);
#else
destDir = getcwd(NULL, 0);
#endif
}
if(options[DESTDIR].doesOccur) {
destDir = options[DESTDIR].value;
}
else {
#ifdef WIN32
destDir = _getcwd(NULL, 0);
#else
destDir = getcwd(NULL, 0);
#endif
}
for(int i = 1; i< argC; i++) {
arg = getLongPathname(argV[i]);
gXmlFile = CreateFile(arg, sourceDir);
gTxtFile = CreateTxtName(arg, destDir);
retval = ProcessTxtFile();
}
XMLPlatformUtils::Terminate();
return retval;
}
int ProcessTxtFile()
{
int retval = 0;
DOMParser* parser;
DOMTreeErrorReporter* errReporter;
parser = new DOMParser();
errReporter = new DOMTreeErrorReporter();
parser->setValidationScheme(gValScheme);
parser->setDoNamespaces(true);
parser->setDoSchema(gDoSchema);
parser->setErrorHandler(errReporter);
parser->setCreateEntityReferenceNodes(gDoCreate);
parser->setToCreateXMLDeclTypeNode(true);
//
// Parse the XML file, catching any XML exceptions that might propogate
// out of it.
//
bool errorsOccured = false;
try
{
parser->parse(gXmlFile);
int errorCount = parser->getErrorCount();
if (errorCount > 0)
errorsOccured = true;
}
catch (const XMLException& e)
{
cerr << "An error occured during parsing\n Message: "
<< DOMString(e.getMessage()) << endl;
errorsOccured = true;
}
catch (const DOM_DOMException& e)
{
cerr << "A DOM error occured during parsing\n DOMException code: "
<< e.code << endl;
errorsOccured = true;
}
catch (...)
{
cerr << "An error occured during parsing\n " << endl;
errorsOccured = true;
}
if(!errorsOccured && !errReporter->getSawErrors())
{
DOM_Node document = parser->getDocument();
Check(document); //if check fails, exit(0); else excute the following code
if(DTDFLAG == false){
cout << "DTD no assigned!" << endl;
exit(0);
}
}
// If the parse and doubt-check was successful, output the document data from the DOM tree
if (!errorsOccured && !errReporter->getSawErrors())
{
DOM_Node doc = parser->getDocument();
DOMPrintFormatTarget *formatTarget = new DOMPrintFormatTarget(gTxtFile);
if (gEncodingName == 0)
{
DOMString encNameStr("UTF-8");
DOM_Node aNode = doc.getFirstChild();
if (aNode.getNodeType() == DOM_Node::XML_DECL_NODE)
{
DOMString aStr = ((DOM_XMLDecl &)aNode).getEncoding();
if (aStr != "")
{
encNameStr = aStr;
}
}
unsigned int lent = encNameStr.length();
gEncodingName = new XMLCh[lent + 1];
XMLString::copyNString(gEncodingName, encNameStr.rawBuffer(), lent);
gEncodingName[lent] = 0;
}
try
{
gFormatter = new XMLFormatter(gEncodingName, formatTarget,
XMLFormatter::NoEscapes, gUnRepFlags);
ofstream ofile(gTxtFile, ios::trunc);
cout << doc;
}
catch (XMLException& e)
{
cerr << "An error occurred during creation of output transcoder. Msg is:"
<< endl
<< DOMString(e.getMessage()) << endl;
retval = 3;
}
delete formatTarget;
delete gFormatter;
}
delete errReporter;
delete parser;
parser = NULL;
errReporter = NULL;
delete gEncodingName;
gEncodingName=NULL;
return retval;
}
// ---------------------------------------------------------------------------
//  Check()
//
//  Validation pass that runs before the DOM tree is printed. Walks the tree
//  recursively: records a DOCTYPE in DTDFLAG, checks every element's
//  attributes and its value through the type-specific Check* helpers, and
//  calls ErrorReport() on the first violation found.
// ---------------------------------------------------------------------------
void Check( DOM_Node &document)
{
    // Name/value of the current node, plus the two attribute slots an element
    // may fill in below.
    DOMString nodeName = document.getNodeName();   // <tag name>, type
    DOMString nodeValue = document.getNodeValue(); // <tag content>
    DOMString attributeKey, attributeVal;          // (key/name)(val/filename)
    unsigned long lent = nodeValue.length();

    switch (document.getNodeType())
    {
    // Node kinds that need no checking.
    case DOM_Node::TEXT_NODE:
    case DOM_Node::PROCESSING_INSTRUCTION_NODE:
    case DOM_Node::ENTITY_REFERENCE_NODE:
    case DOM_Node::CDATA_SECTION_NODE:
    case DOM_Node::COMMENT_NODE:
    case DOM_Node::ENTITY_NODE:
    case DOM_Node::XML_DECL_NODE:
        break;

    case DOM_Node::DOCUMENT_TYPE_NODE:
        // Remember that the document declares a DTD.
        DTDFLAG = true;
        break;

    case DOM_Node::DOCUMENT_NODE:
    {
        for (DOM_Node kid = document.getFirstChild(); kid != 0; kid = kid.getNextSibling())
        {
            Check(kid);
        }
        break;
    }

    case DOM_Node::ELEMENT_NODE:
    {
        // Collect key/name and val/filename; any other attribute is an error.
        DOM_NamedNodeMap attributes = document.getAttributes();
        const int attrCount = attributes.getLength();
        int item_num = 0; // index of the val/filename attribute
        for (int idx = 0; idx < attrCount; idx++)
        {
            DOM_Node attribute = attributes.item(idx);
            DOMString attrName = attribute.getNodeName();
            if (attrName.equals("key") || attrName.equals("name"))
            {
                attributeKey = attribute.getNodeValue();
            }
            else if (attrName.equals("val") || attrName.equals("filename"))
            {
                attributeVal = attribute.getNodeValue();
                item_num = idx;
            }
            else
            {
                // call error report
                ErrorReport(document, 0);
            }
        }

        // Naming rules imposed by the parent container.
        DOMString parentName = document.getParentNode().getNodeName();
        if (parentName.equals("array") && attributeKey!=NULL)
        {
            ErrorReport(document, 1); // the element in the array has name
        }
        else if (parentName.equals("table") && attributeKey==NULL)
        {
            ErrorReport(document, 2); // element in a table has no name
        }

        // Type-specific checks, selected by element name.
        if (nodeName.equals("table"))
        {
            if (document.hasChildNodes())
            {
                // Start the name list with an unnamed tail node.
                ChildName* tail = new ChildName();
                tail->SetNext(NULL);
                ChildName* head = CheckNameDuplicate(document, tail);
                DelChildName(head);
            }
        }
        else if (nodeName.equals("str") || nodeName.equals("importBin"))
        {
            CheckEscape(attributes, attributeVal, item_num);
        }
        else if (nodeName.equals("intVector"))
        {
            DOMString ivstring = CheckIntvector(attributeVal, document);
            if (ivstring !=NULL)
                attributes.item(item_num).setNodeValue(ivstring);
        }
        else if (nodeName.equals("int"))
        {
            CheckInt(attributeVal, document);
        }
        else if (nodeName.equals("bin"))
        {
            CheckBin(attributeVal, document);
        }
        else if (!(nodeName.equals("array")
                   || nodeName.equals("resourceBundle")
                   || nodeName.equals("import")
                   || nodeName.equals("alias")))
        {
            // Not one of the known element names.
            ErrorReport(document, 6);
        }

        // Descend into the element's children.
        for (DOM_Node kid = document.getFirstChild(); kid != 0; kid = kid.getNextSibling())
        {
            Check(kid);
        }
        break;
    }

    default:
        cerr << "Unrecognized node type = "
             << (long)document.getNodeType() << endl;
    }
}
// ---------------------------------------------------------------------------
//  CheckEscape()
//
//  Replaces every double-quote character in attributeVal with the six
//  characters \u0022 and stores the result as the value of attribute number
//  item_num in attributes. An empty attributeVal leaves the attribute alone.
// ---------------------------------------------------------------------------
void CheckEscape(DOM_NamedNodeMap attributes, DOMString attributeVal, int item_num)
{
    const char Escape[7] = {'\\', 'u', '0', '0', '2', '2', '\0'};
    const XMLCh quote[] = {(unsigned short)0x22, (unsigned short) 0};

    const unsigned int len = attributeVal.length();
    if (len == 0)
    {
        return;
    }

    // Rebuild the value one character at a time. The original also called
    // transcode() on each character and never released or used the result;
    // that per-character leak is gone.
    DOMString toStr;
    for (unsigned int i = 0; i < len; i++)
    {
        DOMString fromStr = attributeVal.substringData(i, 1);
        if (fromStr.equals(quote))
        {
            toStr.appendData(Escape);
        }
        else
        {
            toStr.appendData(fromStr);
        }
    }
    attributes.item(item_num).setNodeValue(toStr);
}
// Returns the value of CNode's "key" attribute, or a default-constructed
// DOMString when the node has no attribute with that name.
DOMString getAttributeKey(DOM_Node CNode)
{
    DOMString keyValue;
    DOM_NamedNodeMap attrs = CNode.getAttributes();
    const int total = attrs.getLength();

    int idx = 0;
    while (idx < total)
    {
        DOM_Node current = attrs.item(idx);
        if (current.getNodeName().equals("key"))
        {
            keyValue = current.getNodeValue();
        }
        ++idx;
    }
    return keyValue;
}
// Frees every node of the singly linked ChildName list starting at cn.
// Passing NULL is a no-op (the original read cn->Next before any check).
void DelChildName(ChildName* cn)
{
    while (cn != NULL)
    {
        // Read the link before the node that holds it is destroyed.
        ChildName* following = cn->Next;
        delete cn;
        cn = following;
    }
}
ChildName* CheckNameDuplicate(DOM_Node document, ChildName* cn)
{
DOM_Node CNode = document.getFirstChild();
while(CNode!=NULL)
{
if(CNode.getNodeName().equals("string")||CNode.getNodeName().equals("bin")||CNode.getNodeName().equals("int")||CNode.getNodeName().equals("intvector")||CNode.getNodeName().equals("import")||CNode.getNodeName().equals("table")||CNode.getNodeName().equals("array"))
{
DOMString cname = getAttributeKey(CNode);
char* string = cname.transcode();
ChildName* temp = cn;
while(temp->Next!=NULL)
{
if(cname.equals(temp->Name))
{
DelChildName(cn);
ErrorReport(CNode, 5); //name duplication
}
temp = temp ->Next;
}
ChildName* childname = new ChildName();
childname->SetName(cname);
childname->SetNext(cn);
cn = childname;
}
CNode = CNode.getNextSibling();
}
return cn;
}
// Counts the direct children of `document` whose tag name is one of
// string, bin, int, intvector, import, table or array.
unsigned int GetCNodeNum(DOM_Node document)
{
    unsigned int counted = 0;
    for (DOM_Node kid = document.getFirstChild(); kid != NULL; kid = kid.getNextSibling())
    {
        DOMString kidName = kid.getNodeName();
        const bool isResource = kidName.equals("string")
                             || kidName.equals("bin")
                             || kidName.equals("int")
                             || kidName.equals("intvector")
                             || kidName.equals("import")
                             || kidName.equals("table")
                             || kidName.equals("array");
        if (isResource)
        {
            ++counted;
        }
    }
    return counted;
}
// ---------------------------------------------------------------------------
//  CheckBin()
//
//  Validates the value of a "bin" element: it must be empty or consist of an
//  even number of hexadecimal digits. Any other value is reported through
//  ErrorReport(document, 4).
// ---------------------------------------------------------------------------
void CheckBin(DOMString attributeVal, DOM_Node document)
{
    // transcode() hands back a buffer the caller has to release with delete[].
    char* string = attributeVal.transcode();
    const size_t count = strlen(string);

    // An odd number of digits cannot describe whole bytes.
    bool valid = (count % 2) == 0;

    // Test each character on its own as a one-character hexadecimal number;
    // strtoul must consume exactly that one character.
    char toConv[2] = {'\0', '\0'};
    for (size_t i = 0; valid && i < count; i++)
    {
        char* stopstring = NULL;
        toConv[0] = string[i];
        strtoul(toConv, &stopstring, 16);
        valid = (stopstring == toConv + 1);
    }

    // Release the buffer before reporting; the original never freed it.
    delete [] string;

    if (!valid)
    {
        ErrorReport(document, 4); // invalid bin value
    }
}
// ---------------------------------------------------------------------------
//  CheckInt()
//
//  Validates the value of an "int" element: strtoul() with base 0 must consume
//  the whole string, otherwise ErrorReport(document, 3) is called.
//  NOTE(review): as in the original, an empty value is accepted and overflow
//  is not detected.
// ---------------------------------------------------------------------------
void CheckInt(DOMString attributeVal, DOM_Node document)
{
    // transcode() hands back a buffer the caller has to release with delete[].
    char* string = attributeVal.transcode();

    char* stopstring = NULL;
    strtoul(string, &stopstring, 0);

    // Valid only if parsing stopped at the terminating NUL.
    const bool valid = (*stopstring == '\0');

    // Release the buffer before reporting; the original never freed it.
    delete [] string;

    if (!valid)
    {
        ErrorReport(document, 3); // invalid int value
    }
}
// ---------------------------------------------------------------------------
//  CheckIntvector()
//
//  Validates the space-separated integer list of an "intVector" element and
//  returns it with a comma inserted after each separating space
//  (e.g. "1 2 3" becomes "1 ,2 ,3"). Each item must be fully consumed by
//  strtoul() with base 0, otherwise ErrorReport(document, 3) is called.
//
//  Returns a null DOMString when attributeVal is null.
//
//  Fixes over the original: items are no longer copied into a fixed 32-byte
//  stack buffer, an empty value no longer reads string[-1], a single trailing
//  space no longer drops (and skips validation of) the last item, and the
//  transcoded buffer is released.
// ---------------------------------------------------------------------------
DOMString CheckIntvector(DOMString attributeVal, DOM_Node document)
{
    DOMString ivstring;
    if (attributeVal == NULL)
    {
        return NULL;
    }

    const char separator = (char)32; // the space character, as in the original

    // transcode() hands back a buffer the caller has to release with delete[].
    char* string = attributeVal.transcode();
    const int len = (int)strlen(string);

    // Scratch buffer big enough for any item of this value.
    char* item = new char[len + 1];
    bool valid = true;
    int begin = 0;

    // Every separator except one in the very last position closes an item.
    // The item is copied together with its separator, which is why strtoul()
    // has to stop exactly one character before the end of the copy.
    for (int i = 0; valid && i < len; i++)
    {
        if (string[i] == separator && i != (len - 1))
        {
            const int end = i + 1;
            const int itemLen = end - begin;
            memcpy(item, string + begin, itemLen);
            item[itemLen] = '\0';
            ivstring.appendData(item);
            ivstring.appendData(",");

            char* stopstring = NULL;
            strtoul(item, &stopstring, 0);
            if ((stopstring - item) != (itemLen - 1))
            {
                valid = false;
            }
            begin = end;
        }
    }

    // Last item: everything after the final separator, without one trailing
    // space if the value ends in one.
    if (valid)
    {
        int tailEnd = len;
        if (tailEnd > begin && string[tailEnd - 1] == separator)
        {
            tailEnd--;
        }
        const int tailLen = tailEnd - begin;
        if (tailLen > 0)
        {
            memcpy(item, string + begin, tailLen);
            item[tailLen] = '\0';
            ivstring.appendData(item);

            char* stopstring = NULL;
            strtoul(item, &stopstring, 0);
            if ((stopstring - item) != tailLen)
            {
                valid = false;
            }
        }
    }

    delete [] item;
    delete [] string;

    if (!valid)
    {
        ErrorReport(document, 3); // invalid int value
    }
    return ivstring;
}
// ---------------------------------------------------------------------------
//  ostream << DOM_Node
//
//  Writes a DOM node and, recursively, all of its children in text form.
//  The output goes through gFormatter; `target` is only handed on to the
//  recursive calls and returned.
// ---------------------------------------------------------------------------
ostream& operator<<(ostream& target, DOM_Node& toWrite)
{
    // Name/value of the current node, plus the two attribute slots an element
    // may fill in below.
    DOMString nodeName = toWrite.getNodeName();   // <tag name>, type
    DOMString nodeValue = toWrite.getNodeValue(); // <tag content>
    DOMString attributeKey, attributeVal;         // (key/name)(val/filename)
    unsigned long lent = nodeValue.length();

    switch (toWrite.getNodeType())
    {
    // Node kinds that produce no output.
    case DOM_Node::PROCESSING_INSTRUCTION_NODE:
    case DOM_Node::ENTITY_REFERENCE_NODE:
    case DOM_Node::CDATA_SECTION_NODE:
    case DOM_Node::COMMENT_NODE:
    case DOM_Node::ENTITY_NODE:
    case DOM_Node::XML_DECL_NODE:
        break;

    case DOM_Node::TEXT_NODE:
    {
        gFormatter->formatBuf(nodeValue.rawBuffer(),
                              lent, XMLFormatter::CharEscapes);
        break;
    }

    case DOM_Node::DOCUMENT_NODE:
    {
        for (DOM_Node kid = toWrite.getFirstChild(); kid != 0; kid = kid.getNextSibling())
        {
            target << kid;
        }
        break;
    }

    case DOM_Node::ELEMENT_NODE:
    {
        // Pick up key/name and val/filename; other attributes are ignored.
        DOM_NamedNodeMap attributes = toWrite.getAttributes();
        const int attrCount = attributes.getLength();
        for (int idx = 0; idx < attrCount; idx++)
        {
            DOM_Node attribute = attributes.item(idx);
            DOMString attrName = attribute.getNodeName();
            if (attrName.equals("key") || attrName.equals("name"))
            {
                attributeKey = attribute.getNodeValue();
            }
            else if (attrName.equals("val") || attrName.equals("filename"))
            {
                attributeVal = attribute.getNodeValue();
            }
        }

        // Print Out: the opening part depends on the element name. The root
        // resourceBundle prints only its name and gets no braces.
        const bool isBundle = nodeName.equals("resourceBundle");
        if (isBundle)
        {
            *gFormatter << attributeKey;
        }
        else if (nodeName.equals("bin") && attributeVal==NULL)
        {
            *gFormatter <<attributeKey << ":" << nodeName << chSpace<< "{" << chDoubleQuote <<attributeVal << chDoubleQuote;
        }
        else if (nodeName.equals("str"))
        {
            *gFormatter <<attributeKey << chSpace<< "{" << chDoubleQuote <<attributeVal << chDoubleQuote;
        }
        else if (nodeName.equals("intVector"))
        {
            *gFormatter <<attributeKey << ":" << "intvector" << chSpace<< "{" <<attributeVal ;
        }
        else if (nodeName.equals("importBin"))
        {
            *gFormatter <<attributeKey << ":" << "import" << chSpace<< "{" << chDoubleQuote <<attributeVal << chDoubleQuote ;
        }
        else
        {
            *gFormatter <<attributeKey << ":" << nodeName << chSpace<< "{" << attributeVal;
        }
        attributeKey = attributeVal = NULL;

        // Children first, then the closing brace (with or without children).
        for (DOM_Node kid = toWrite.getFirstChild(); kid != 0; kid = kid.getNextSibling())
        {
            target << kid;
        }
        if (!isBundle)
        {
            *gFormatter << "}";
        }
        break;
    }

    case DOM_Node::DOCUMENT_TYPE_NODE:
    {
        // Nothing is written for the document type.
        DOM_DocumentType doctype = (DOM_DocumentType &)toWrite;
        break;
    }

    default:
        cerr << "Unrecognized node type = " << (long)toWrite.getNodeType() << endl;
    }
    return target;
}
// ---------------------------------------------------------------------------
//  ErrorReport()
//
//  Prints the path from the document root down to toWrite, one
//  "==>name(attr ; attr ; )" segment per element, followed by the message for
//  ErrorType, and then ends the process.
//
//  ErrorType: 0 unknown attribute, 1 named element in an array, 2 unnamed
//  element in a table, 3 invalid integer, 4 invalid bin, 5 duplicate name in
//  a table, 6 unknown element name.
//
//  Changes over the original: error type 0 now gets a message (it used to
//  print the path only), and the caller's node is no longer rewritten while
//  walking up the tree.
// ---------------------------------------------------------------------------
void ErrorReport(DOM_Node& toWrite, int ErrorType){
    cout << "\nerror occurs at:\n";

    // Build the path back to front: begin at the offending node and prepend
    // one segment per ancestor until the node without a parent is reached.
    DOMString ErrorMsg;
    DOM_Node current = toWrite;
    while (current.getParentNode()!=NULL) {
        ErrorMsg.insertData(0, ")");
        DOM_NamedNodeMap attributes = current.getAttributes();
        const int attrCount = attributes.getLength();
        // Attributes are inserted last to first so that they read in
        // document order.
        for (int i = attrCount-1; i>=0; i--)
        {
            DOM_Node attribute = attributes.item(i);
            ErrorMsg.insertData(0, " ; ");
            ErrorMsg.insertData(0, attribute.getNodeValue());
        }
        ErrorMsg.insertData(0, "(");
        ErrorMsg.insertData(0, current.getNodeName());
        ErrorMsg.insertData(0, "==>");
        current = current.getParentNode();
    }
    ErrorMsg.appendData("\n");

    switch (ErrorType)
    {
    case 0:
        ErrorMsg.appendData("Invalid attribute name!\n");
        break;
    case 1:
        ErrorMsg.appendData("The element in the array can't have a name!\n");
        break;
    case 2:
        ErrorMsg.appendData("The element in the table should have a name!\n");
        break;
    case 3:
        ErrorMsg.appendData("Invalid integer value!\n");
        break;
    case 4:
        ErrorMsg.appendData("Invalid bin!\n");
        break;
    case 5:
        ErrorMsg.appendData("Name Duplication in the table!\n");
        break;
    case 6:
        ErrorMsg.appendData("Invalid element name! Remember to assign correct DTD file on the xml file.\n");
        break;
    default:
        // Unknown error type: the path alone is printed.
        break;
    }
    cout << ErrorMsg;

    // NOTE(review): exits with status 0 although an error was reported; kept
    // as-is for compatibility with existing callers.
    exit(0);
}
// ---------------------------------------------------------------------------
//  CreateTxtName()
//
//  Builds the output file name for arg inside Dir: the path CreateFile()
//  returns, with the extension of its file-name part replaced by ".txt"
//  (".txt" is appended when there is no extension).
//
//  Returns a buffer allocated with new[]; the caller owns it.
//
//  The original overwrote the last three characters unconditionally, which
//  wrote out of bounds for paths shorter than three characters and mangled
//  names whose extension was not exactly three characters long.
// ---------------------------------------------------------------------------
char* CreateTxtName(const char* arg, const char* Dir)
{
    char* path = CreateFile(arg, Dir);
    const size_t len = strlen(path);

    // Start of the file-name part, so that a '.' inside a directory name is
    // not mistaken for the extension separator.
    size_t nameStart = len;
    while (nameStart > 0 && path[nameStart - 1] != '\\' && path[nameStart - 1] != '/')
    {
        nameStart--;
    }

    // Length of everything before the last '.' of the file name.
    size_t stemLen = len;
    for (size_t i = len; i > nameStart; i--)
    {
        if (path[i - 1] == '.')
        {
            stemLen = i - 1;
            break;
        }
    }

    // Use a fresh buffer: the result may be longer than the input path.
    char* txtName = new char[stemLen + 5]; // stem + ".txt" + NUL
    memcpy(txtName, path, stemLen);
    strcpy(txtName + stemLen, ".txt");

    // CreateFile() allocates its result with new[].
    delete [] path;
    return txtName;
}
char* CreateFile(const char* arg, const char* Dir)
{ char* temp = new char[256];
char a[2]={'\\', '\0'};
char* currdir;
if(sourceDir!=NULL) {
strcpy(temp, Dir);
int len = strlen(temp);
if(temp[len - 1]!='\\')
strcat(temp, a);
strcat(temp, arg);
}
else {
char drive[_MAX_DRIVE];
char dir[_MAX_DIR];
char fname[_MAX_FNAME];
char ext[_MAX_EXT];
_splitpath(arg, drive, dir, fname, ext);
if(*drive == NULL && *dir == NULL) {
#ifdef WIN32
currdir = _getcwd(NULL, 0);
#else
currdir = getcwd(NULL, 0);
#endif
strcpy(temp, currdir);
strcat(temp, a);
}
strcat(temp, arg);
}
return temp;
}
// ---------------------------------------------------------------------------
//  ostream << DOMString
//
//  Writes a DOM string to a stream. The string is first transcoded to a
//  char* copy, which is released again once it has been written.
// ---------------------------------------------------------------------------
ostream& operator<< (ostream& target, const DOMString& s)
{
    // transcode() hands back a new[] buffer owned by the caller.
    char* const transcoded = s.transcode();
    target << transcoded;
    delete [] transcoded;
    return target;
}
// Writes a DOM string through an XMLFormatter. The characters are copied into
// a zero-terminated temporary buffer first; an empty string writes nothing.
XMLFormatter& operator<< (XMLFormatter& strm, const DOMString& s)
{
    const unsigned int charCount = s.length();
    if (charCount > 0)
    {
        XMLCh* terminated = new XMLCh[charCount + 1];
        XMLString::copyNString(terminated, s.rawBuffer(), charCount);
        terminated[charCount] = 0;
        strm << terminated;
        delete [] terminated;
    }
    return strm;
}