ICU-12766 wip escaping for utf-8 input

* create a new escaper tool - needs to be invoked in mh- files
* escapes to temporary _whatever.cpp files
* does NOT handle multi-line u"xxx" literals (or u'xxx'?)
* does not clean up the temporary _*.cpp files
* fix up a bunch of Makefiles to add -I$(srcdir) (doesn't hurt anyway)

X-SVN-Rev: 39787
This commit is contained in:
Steven R. Loomis 2017-03-13 23:19:33 +00:00
parent d9cdb7568c
commit 4890638e08
14 changed files with 269 additions and 45 deletions

View File

@ -140,6 +140,7 @@ $(LIBDIR) $(BINDIR):
## Recursive targets
all-recursive install-recursive clean-recursive distclean-recursive dist-recursive check-recursive check-exhaustive-recursive: $(LIBDIR) $(BINDIR)
@(cd tools/escapesrc && $(MAKE) RECURSIVE=YES $$local_target) || exit
@dot_seen=no; \
target=`echo $@ | sed s/-recursive//`; \
list='$(LOCAL_SUBDIRS)'; for subdir in $$list; do \

View File

@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for ICU 59.1
# Generated by GNU Autoconf 2.69 for ICU 59.1.
#
# Report bugs to <http://icu-project.org/bugs>.
#
@ -674,7 +674,6 @@ GENCCODE_ASSEMBLY
HAVE_MMAP
LIB_THREAD
U_HAVE_ATOMIC
U_HAVE_STD_STRING
ENABLE_RPATH
U_ENABLE_DYLOAD
U_HAVE_PLUGINS
@ -5958,37 +5957,6 @@ $as_echo "$as_me: Adding CXXFLAGS option -std=c++11" >&6;}
fi
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking if #include <string> works" >&5
$as_echo_n "checking if #include <string> works... " >&6; }
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
#include <string>
int
main ()
{
;
return 0;
}
_ACEOF
if ac_fn_cxx_try_compile "$LINENO"; then :
ac_cv_header_stdstring=yes
else
ac_cv_header_stdstring=no
fi
rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_header_stdstring" >&5
$as_echo "$ac_cv_header_stdstring" >&6; }
if test $ac_cv_header_stdstring = yes
then
U_HAVE_STD_STRING=1
else
U_HAVE_STD_STRING=0
CONFIG_CPPFLAGS="${CONFIG_CPPFLAGS} -DU_HAVE_STD_STRING=0"
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking if #include <atomic> works" >&5
$as_echo_n "checking if #include <atomic> works... " >&6; }
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
@ -7800,7 +7768,7 @@ echo "CXXFLAGS=$CXXFLAGS"
# output the Makefiles
ac_config_files="$ac_config_files icudefs.mk Makefile data/pkgdataMakefile config/Makefile.inc config/icu.pc config/pkgdataMakefile data/Makefile stubdata/Makefile common/Makefile i18n/Makefile layoutex/Makefile io/Makefile extra/Makefile extra/uconv/Makefile extra/uconv/pkgdataMakefile extra/scrptrun/Makefile tools/Makefile tools/ctestfw/Makefile tools/toolutil/Makefile tools/makeconv/Makefile tools/genrb/Makefile tools/genccode/Makefile tools/gencmn/Makefile tools/gencnval/Makefile tools/gendict/Makefile tools/gentest/Makefile tools/gennorm2/Makefile tools/genbrk/Makefile tools/gensprep/Makefile tools/icuinfo/Makefile tools/icupkg/Makefile tools/icuswap/Makefile tools/pkgdata/Makefile tools/tzcode/Makefile tools/gencfu/Makefile test/Makefile test/compat/Makefile test/testdata/Makefile test/testdata/pkgdataMakefile test/hdrtst/Makefile test/intltest/Makefile test/cintltst/Makefile test/iotest/Makefile test/letest/Makefile test/perf/Makefile test/perf/collationperf/Makefile test/perf/collperf/Makefile test/perf/collperf2/Makefile test/perf/dicttrieperf/Makefile test/perf/ubrkperf/Makefile test/perf/charperf/Makefile test/perf/convperf/Makefile test/perf/normperf/Makefile test/perf/DateFmtPerf/Makefile test/perf/howExpensiveIs/Makefile test/perf/strsrchperf/Makefile test/perf/unisetperf/Makefile test/perf/usetperf/Makefile test/perf/ustrperf/Makefile test/perf/utfperf/Makefile test/perf/utrie2perf/Makefile test/perf/leperf/Makefile samples/Makefile samples/date/Makefile samples/cal/Makefile samples/layout/Makefile"
ac_config_files="$ac_config_files icudefs.mk Makefile data/pkgdataMakefile config/Makefile.inc config/icu.pc config/pkgdataMakefile data/Makefile stubdata/Makefile common/Makefile i18n/Makefile layoutex/Makefile io/Makefile extra/Makefile extra/uconv/Makefile extra/uconv/pkgdataMakefile extra/scrptrun/Makefile tools/Makefile tools/ctestfw/Makefile tools/toolutil/Makefile tools/makeconv/Makefile tools/genrb/Makefile tools/genccode/Makefile tools/gencmn/Makefile tools/gencnval/Makefile tools/gendict/Makefile tools/gentest/Makefile tools/gennorm2/Makefile tools/genbrk/Makefile tools/gensprep/Makefile tools/icuinfo/Makefile tools/icupkg/Makefile tools/icuswap/Makefile tools/pkgdata/Makefile tools/tzcode/Makefile tools/gencfu/Makefile tools/escapesrc/Makefile test/Makefile test/compat/Makefile test/testdata/Makefile test/testdata/pkgdataMakefile test/hdrtst/Makefile test/intltest/Makefile test/cintltst/Makefile test/iotest/Makefile test/letest/Makefile test/perf/Makefile test/perf/collationperf/Makefile test/perf/collperf/Makefile test/perf/collperf2/Makefile test/perf/dicttrieperf/Makefile test/perf/ubrkperf/Makefile test/perf/charperf/Makefile test/perf/convperf/Makefile test/perf/normperf/Makefile test/perf/DateFmtPerf/Makefile test/perf/howExpensiveIs/Makefile test/perf/strsrchperf/Makefile test/perf/unisetperf/Makefile test/perf/usetperf/Makefile test/perf/ustrperf/Makefile test/perf/utfperf/Makefile test/perf/utrie2perf/Makefile test/perf/leperf/Makefile samples/Makefile samples/date/Makefile samples/cal/Makefile samples/layout/Makefile"
cat >confcache <<\_ACEOF
# This file is a shell script that caches the results of configure
@ -8545,6 +8513,7 @@ do
"tools/pkgdata/Makefile") CONFIG_FILES="$CONFIG_FILES tools/pkgdata/Makefile" ;;
"tools/tzcode/Makefile") CONFIG_FILES="$CONFIG_FILES tools/tzcode/Makefile" ;;
"tools/gencfu/Makefile") CONFIG_FILES="$CONFIG_FILES tools/gencfu/Makefile" ;;
"tools/escapesrc/Makefile") CONFIG_FILES="$CONFIG_FILES tools/escapesrc/Makefile" ;;
"test/Makefile") CONFIG_FILES="$CONFIG_FILES test/Makefile" ;;
"test/compat/Makefile") CONFIG_FILES="$CONFIG_FILES test/compat/Makefile" ;;
"test/testdata/Makefile") CONFIG_FILES="$CONFIG_FILES test/testdata/Makefile" ;;

View File

@ -44,7 +44,7 @@ CLEANFILES = *~ $(DEPS) $(ALL_MAN_FILES)
## Target information
TARGET = $(BINDIR)/$(TARGET_STUB_NAME)$(EXEEXT)
CPPFLAGS += -I$(top_srcdir)/common -I$(top_srcdir)/i18n -I$(srcdir)/../toolutil
CPPFLAGS += -I$(srcdir) -I$(top_srcdir)/common -I$(top_srcdir)/i18n -I$(srcdir)/../toolutil
CPPFLAGS += -DUNISTR_FROM_CHAR_EXPLICIT=explicit -DUNISTR_FROM_STRING_EXPLICIT=explicit
LIBS = $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M)

View File

@ -55,7 +55,7 @@ DYNAMICCXXFLAGS = $(SHAREDLIBCXXFLAGS)
CFLAGS += $(LIBCFLAGS)
CXXFLAGS += $(LIBCXXFLAGS)
CPPFLAGS += -I$(top_srcdir)/common -I$(top_srcdir)/i18n $(LIBCPPFLAGS) $(CPPFLAGSICUIO)
CPPFLAGS += -I$(srcdir) -I$(top_srcdir)/common -I$(top_srcdir)/i18n $(LIBCPPFLAGS) $(CPPFLAGSICUIO)
DEFS += -DU_IO_IMPLEMENTATION
LDFLAGS += $(LDFLAGSICUIO)
LIBS = $(LIBICUUC) $(LIBICUI18N) $(DEFAULT_LIBS)

View File

@ -34,7 +34,7 @@ BUILDDIR := $(BUILDDIR:test\\intltest/../../=)
# Simplify the path for Windows 98
BUILDDIR := $(BUILDDIR:TEST\\INTLTEST/../../=)
CPPFLAGS += -I$(top_srcdir)/common -I$(top_srcdir)/i18n -I$(top_srcdir)/tools/toolutil -I$(top_srcdir)/tools/ctestfw
CPPFLAGS += -I$(srcdir) -I$(top_srcdir)/common -I$(top_srcdir)/i18n -I$(top_srcdir)/tools/toolutil -I$(top_srcdir)/tools/ctestfw
CPPFLAGS += -DUNISTR_FROM_CHAR_EXPLICIT= -DUNISTR_FROM_STRING_EXPLICIT=
DEFS += -D'U_TOPSRCDIR="$(top_srcdir)/"' -D'U_TOPBUILDDIR="$(BUILDDIR)"'
LIBS = $(LIBCTESTFW) $(LIBICUI18N) $(LIBICUUC) $(LIBICUTOOLUTIL) $(DEFAULT_LIBS) $(LIB_M) $(LIB_THREAD)

View File

@ -34,7 +34,7 @@ BUILDDIR := $(BUILDDIR:test\\iotest/../../=)
# Simplify the path for Windows 98
BUILDDIR := $(BUILDDIR:TEST\\IOTEST/../../=)
CPPFLAGS += -I$(top_srcdir)/common -I$(top_srcdir)/i18n -I$(top_srcdir)/tools/ctestfw -I$(top_srcdir)/io
CPPFLAGS += -I$(srcdir) -I$(top_srcdir)/common -I$(top_srcdir)/i18n -I$(top_srcdir)/tools/ctestfw -I$(top_srcdir)/io
CPPFLAGS += -DUNISTR_FROM_CHAR_EXPLICIT= -DUNISTR_FROM_STRING_EXPLICIT=
DEFS += -D'U_TOPSRCDIR="$(top_srcdir)/"' -D'U_TOPBUILDDIR="$(BUILDDIR)"'
LIBS = $(LIBCTESTFW) $(LIBICUTOOLUTIL) $(LIBICUIO) $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M)

View File

@ -5,6 +5,8 @@
## others. All Rights Reserved.
## Steven R. Loomis
SKIP_ESCAPING=YES
## Source directory information
srcdir = @srcdir@
top_srcdir = @top_srcdir@

View File

@ -2,8 +2,256 @@
// License & terms of use: http://www.unicode.org/copyright.html
#include <stdio.h>
#include <string>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
int main(int argc, const char *argv[]) {
puts("Hi\n");
// with caution:
#include "unicode/utf8.h"
std::string prog;
/**
 * Write a one-line usage summary to stderr, using the global program name
 * both as the message prefix and as the command in the synopsis.
 */
void usage() {
  const char *name = prog.c_str();
  fprintf(stderr, "%s: usage: %s infile.cpp outfile.cpp\n", name, name);
}
/**
 * Remove the named output file, e.g. after a failed conversion.
 *
 * @param outfile path to remove; an empty name is a no-op
 * @return 0 if the file was removed, did not exist, or no name was given;
 *         1 if unlink() failed for any other reason (reported via perror)
 */
int cleanup(const std::string &outfile) {
  const char *path = outfile.c_str();
  if(!path || !*path) {
    return 0; // no name given - nothing to remove
  }
  if(unlink(path) == 0) {
    fprintf(stderr, "%s: deleted %s\n", prog.c_str(), path);
    return 0;
  }
  if(errno == ENOENT) {
    return 0; // File did not exist - no error.
  }
  perror("unlink");
  return 1;
}
/**
 * Tell whether any of the first len bytes of line has its high bit set,
 * i.e. is greater than 0x7F.
 *
 * @param line start of the buffer (bytes are examined as unsigned)
 * @param len  number of bytes to examine
 * @return true if at least one byte is outside the 7-bit ASCII range
 */
inline bool hasNonAscii(const char *line, size_t len) {
  const unsigned char *cur = reinterpret_cast<const unsigned char*>(line);
  const unsigned char *const end = cur + len;
  while(cur != end) {
    if((*cur & 0x80) != 0) {
      return true;
    }
    ++cur;
  }
  return false;
}
/**
 * Advance past leading whitespace (space, tab, newline, carriage return).
 *
 * @param p first character to examine
 * @param e one past the last character to examine
 * @return pointer to the first non-whitespace character in [p, e),
 *         or the position where scanning stopped if there is none
 */
inline const char *skipws(const char *p, const char *e) {
  while(p < e && (*p == ' ' || *p == '\t' || *p == '\n' || *p == '\r')) {
    ++p;
  }
  return p;
}
/**
 * Tell whether a line can be skipped by the escaper: it is blank, or its
 * first non-whitespace character starts a preprocessor directive ('#') or
 * a comment ("//" or "/" followed by "*").
 *
 * Only the start of the line is examined, so a line that opens with a
 * block comment and has code after it is still reported as a comment.
 *
 * @param line start of the line
 * @param len  number of characters in the line
 * @return true if the line is whitespace-only, a directive, or begins a comment
 */
inline bool isCommentOrEmpty(const char* line, size_t len) {
  const char *p = line;
  const char *e = line+len;
  p = skipws(p,e);
  if(p==e) {
    return true; // whitespace only
  }
  // p < e here: inspect the first non-whitespace character itself.
  // (Advancing before the test would look at the wrong character and could
  // read line[len] when the line holds a single non-blank character.)
  switch(*p) {
    case '#': return true; // #directive
    case '/':
      p++;
      if(p==e) return false; // single slash
      switch(*p) {
        case '/': // '/ /'
        case '*': // '/ *'
          return true; // start of comment
        default: return false; // something else
      }
    default: return false; // something else
  }
  /*NOTREACHED*/
}
/**
 * Escape the non-ASCII contents of one u"..." or u'...' literal in place.
 *
 * Starting after the opening quote, the literal is scanned up to the
 * matching closing quote. A backslash and the character following it are
 * left untouched. Every multi-byte UTF-8 sequence is replaced by \uXXXX
 * (code points up to U+FFFF) or \UXXXXXXXX (code points above U+FFFF,
 * string literals only).
 *
 * @param linestr the line being rewritten; modified in place
 * @param pos     index of the 'u' that introduces the literal
 * @return false = no err
 *         true = had err (not at a 'u', ill-formed UTF-8, or a code point
 *         above U+FFFF inside a u'...' character literal)
 */
bool fixAt(std::string &linestr, size_t pos) {
  if(linestr[pos] != 'u') {
    fprintf(stderr, "Not a 'u'?");
    return true;
  }

  char quote = linestr[pos+1];

  //printf("u%c…%c\n", quote, quote);

  for(pos += 2; pos < linestr.size(); pos++) {
    if(linestr[pos] == quote) return false; // end of quote
    if(linestr[pos] == '\\') {
      // Escape sequence (quoted quote, \u..., \\, or anything else):
      // step over the character after the backslash so that an escaped
      // quote is not taken for the end of the literal.
      pos++;
      continue;
    }

    // Proceed to decode utf-8
    const uint8_t *s = (const uint8_t*) (linestr.c_str());
    int32_t i = (int32_t)pos;
    int32_t length = (int32_t)linestr.size();
    UChar32 c;
    if(U8_IS_SINGLE((uint8_t)s[i])) continue; // single code point

    U8_NEXT(s, i, length, c);
    if(c<0) {
      // a negative result is treated as an ill-formed sequence
      fprintf(stderr, "Illegal utf-8 sequence %04X pos %d\n", (unsigned)c, (int)pos);
      return true;
    }

    size_t seqLen = (size_t)i - pos;

    // size_t values are cast explicitly: passing them to %d is undefined.
    printf("U+%04X pos %d [len %d]\n", (unsigned)c, (int)pos, (int)seqLen);

    char newSeq[16]; // large enough for "\\UXXXXXXXX" plus the terminator
    int newLen;
    if( c <= 0xFFFF) {
      newLen = snprintf(newSeq, sizeof(newSeq), "\\u%04X", (unsigned)c);
    } else if(quote == '"') {
      // In a char16_t string literal \UXXXXXXXX is encoded by the compiler
      // as a surrogate pair, so no manual surrogate math is needed.
      newLen = snprintf(newSeq, sizeof(newSeq), "\\U%08X", (unsigned)c);
    } else {
      fprintf(stderr, "%s: Error: U+%04X does not fit in a single char16_t character literal\n",
              prog.c_str(), (unsigned)c);
      return true;
    }
    linestr.replace(pos, seqLen, newSeq);
    // Leave pos on the last character of the escape; the loop's pos++ then
    // moves to the first character after it.
    pos += (size_t)(newLen - 1);
  }

  return false;
}
/**
 * Escape all u"..." and u'...' literals on one source line.
 *
 * Lines that are pure ASCII, blank, comments/directives, or that contain
 * neither u" nor u' are left untouched. Otherwise each literal prefix is
 * located from the end of the line towards the start and handed to fixAt().
 *
 * @param no      line number, used only for the progress message
 * @param linestr the line; rewritten in place
 * @return false = no err
 *         true = had err
 */
bool fixLine(int no, std::string &linestr) {
  const char *line = linestr.c_str();
  size_t len = linestr.size();

  // Quick Check: all ascii?
  if(!hasNonAscii(line, len)) {
    return false; // ASCII
  }

  if(isCommentOrEmpty(line, len)) {
    return false; // Comment or just empty
  }

  // std::string::find replaces the BSD-only strnstr().
  if(linestr.find("u'") == std::string::npos &&
     linestr.find("u\"") == std::string::npos) {
    return false; // Nothing to do. No u' or u" detected
  }

  // start from the end and find all u" cases
  size_t pos = linestr.size();
  while((pos = linestr.rfind("u\"", pos)) != std::string::npos) {
    printf("found doublequote at %d\n", (int)pos);
    if(fixAt(linestr, pos)) return true;
    // A match at column 0 is the last possible one. Decrementing would wrap
    // to npos and make rfind() report the same match forever.
    if(pos == 0) break;
    pos--;
  }

  // reset and find all u' cases
  pos = linestr.size();
  while((pos = linestr.rfind("u'", pos)) != std::string::npos) {
    printf("found singlequote at %d\n", (int)pos);
    if(fixAt(linestr, pos)) return true;
    if(pos == 0) break; // same wrap-around guard as above
    pos--;
  }

  fprintf(stderr, "%d - fixed\n", no);
  return false;
}
/**
 * Read one line from f into out, including its trailing '\n' if present.
 * Portable stand-in for the BSD-only fgetln().
 *
 * @return true if at least one character was read, false at end of file
 *         or on a read error (callers distinguish the two with ferror())
 */
static bool readInputLine(FILE *f, std::string &out) {
  out.clear();
  int ch;
  while((ch = getc(f)) != EOF) {
    out.push_back((char)ch);
    if(ch == '\n') {
      break;
    }
  }
  return !out.empty();
}

/**
 * Copy infile to outfile, escaping non-ASCII u"..." / u'...' literals
 * line by line via fixLine().
 *
 * The output starts with a #line directive naming the input file. On a
 * fixup, read, or write failure the partial output file is removed with
 * cleanup().
 *
 * @param infile  path of the source file to read
 * @param outfile path of the escaped file to write
 * @return 0 on success, 1 on any failure
 */
int convert(const std::string &infile, const std::string &outfile) {
  fprintf(stderr, "%s: %s -> %s\n", prog.c_str(), infile.c_str(), outfile.c_str());

  FILE *inf = fopen(infile.c_str(), "rb");
  if(!inf) {
    fprintf(stderr, "%s: could not open input file %s\n", prog.c_str(), infile.c_str());
    cleanup(outfile);
    return 1;
  }

  FILE *outf = fopen(outfile.c_str(), "w");
  if(!outf) {
    fprintf(stderr, "%s: could not open output file %s\n", prog.c_str(), outfile.c_str());
    fclose(inf);
    return 1;
  }

  // TODO: any platform variations of this?
  fprintf(outf, "#line 1 \"%s\"\n", infile.c_str());

  int no = 0;
  std::string linestr;
  while(readInputLine(inf, linestr)) {
    no++;
    if(fixLine(no, linestr)) {
      fclose(inf);
      fclose(outf);
      fprintf(stderr, "%s:%d: Fixup failed by %s\n", infile.c_str(), no, prog.c_str());
      cleanup(outfile);
      return 1;
    }
    // fixLine() may have changed the size, so use the current one.
    if(fwrite(linestr.c_str(), 1, linestr.size(), outf) != linestr.size()) {
      fclose(inf);
      fclose(outf);
      fprintf(stderr, "%s: short write to %s:%d\n", prog.c_str(), outfile.c_str(), no);
      cleanup(outfile);
      return 1;
    }
  }

  // The loop also ends on a read error; check before closing the stream.
  const bool readFailed = ferror(inf) != 0;
  fclose(inf);
  // Buffered output is flushed by fclose(), so a write error may only
  // show up here.
  const bool closeFailed = fclose(outf) != 0;

  if(readFailed) {
    fprintf(stderr, "%s: error reading %s\n", prog.c_str(), infile.c_str());
    cleanup(outfile);
    return 1;
  }
  if(closeFailed) {
    fprintf(stderr, "%s: error writing %s\n", prog.c_str(), outfile.c_str());
    cleanup(outfile);
    return 1;
  }
  return 0;
}
/**
 * Entry point: escapesrc infile.cpp outfile.cpp
 *
 * Records the program name for diagnostics, then either converts the input
 * file or prints the usage message when the argument count is wrong.
 *
 * @return the result of convert(), or 1 after printing usage
 */
int main(int argc, const char *argv[]) {
  prog = argv[0];

  if(argc == 3) {
    const std::string infile(argv[1]);
    const std::string outfile(argv[2]);
    return convert(infile, outfile);
  }

  usage();
  return 1;
}
#include "utf_impl.cpp"

View File

@ -24,7 +24,7 @@ CLEANFILES = *~ $(DEPS)
## Target information
TARGET = $(BINDIR)/$(TARGET_STUB_NAME)$(EXEEXT)
CPPFLAGS += -I$(top_srcdir)/common -I$(srcdir)/../toolutil
CPPFLAGS += -I$(srcdir) -I$(top_srcdir)/common -I$(srcdir)/../toolutil
LIBS = $(LIBICUTOOLUTIL) $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M)
OBJECTS = gennorm2.o n2builder.o

View File

@ -33,7 +33,7 @@ TARGET = $(BINDIR)/$(TARGET_STUB_NAME)$(EXEEXT)
# derb depends on icuio
@ICUIO_TRUE@DERB = $(BINDIR)/$(DERB_STUB_NAME)$(EXEEXT)
CPPFLAGS += -I$(top_srcdir)/common -I$(top_srcdir)/i18n -I$(srcdir)/../toolutil -I$(top_srcdir)/io
CPPFLAGS += -I$(srcdir) -I$(top_srcdir)/common -I$(top_srcdir)/i18n -I$(srcdir)/../toolutil -I$(top_srcdir)/io
CPPFLAGS += -DUNISTR_FROM_CHAR_EXPLICIT=explicit -DUNISTR_FROM_STRING_EXPLICIT=explicit
LIBS = $(LIBICUTOOLUTIL) $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M)

View File

@ -1,5 +1,7 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
char16_t CH = u'';
/*
*******************************************************************************
*
@ -217,6 +219,8 @@ extern int
main(int argc, char* argv[]) {
UErrorCode errorCode = U_ZERO_ERROR;
UBool didSomething = FALSE;
printf("U+%lx\n", CH); return 0;
/* preset then read command line options */
argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options);

View File

@ -28,7 +28,7 @@ CLEANFILES = *~ $(DEPS) $(MAN_FILES)
## Target information
TARGET = $(BINDIR)/$(TARGET_STUB_NAME)$(EXEEXT)
CPPFLAGS += -I$(top_srcdir)/common -I$(srcdir)/../toolutil
CPPFLAGS += -I$(srcdir) -I$(top_srcdir)/common -I$(srcdir)/../toolutil
LIBS = $(LIBICUTOOLUTIL) $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M)
OBJECTS = makeconv.o ucnvstat.o genmbcs.o gencnvex.o

View File

@ -32,7 +32,7 @@ endif
## Target information
TARGET = $(BINDIR)/$(TARGET_STUB_NAME)$(EXEEXT)
CPPFLAGS += -I$(top_srcdir)/common -I$(srcdir)/../toolutil
CPPFLAGS += -I$(srcdir) -I$(top_srcdir)/common -I$(srcdir)/../toolutil
DEFS += -DUDATA_SO_SUFFIX=\".$(SO)\" -DSTATIC_O=\"$(STATIC_O)\"
LIBS = $(LIBICUTOOLUTIL) $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M)

View File

@ -45,7 +45,7 @@ DYNAMICCXXFLAGS = $(SHAREDLIBCXXFLAGS)
CFLAGS += $(LIBCFLAGS)
CXXFLAGS += $(LIBCXXFLAGS)
CPPFLAGS += -I$(top_srcdir)/common -I$(top_srcdir)/i18n $(LIBCPPFLAGS)
CPPFLAGS += -I$(srcdir) -I$(top_srcdir)/common -I$(top_srcdir)/i18n $(LIBCPPFLAGS)
# from icuinfo
CPPFLAGS+= "-DU_BUILD=\"@build@\"" "-DU_HOST=\"@host@\"" "-DU_CC=\"@CC@\"" "-DU_CXX=\"@CXX@\""