#! /usr/bin/python -B

# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html

# Copyright (C) 2009-2011, International Business Machines Corporation, Google and Others.
# All rights reserved.

#
#  Provenance: this script was introduced by the patch
#     commit  e259adc9442502e524ffeb3ad042393c02d6030b
#     author  Norbert Runge
#     date    Tue, 18 Sep 2018 15:57:51 -0700
#     subject ICU-20119 Additional changes to UTF-8 checking script.
#     file    icu4c/source/tools/icu-file-utf8-check.py (new file, mode 100755)
#

#
#  Script to check that ICU source files contain only valid UTF-8 encoded text,
#  and that all files except '.txt', '.sln', '.targets' and '.vcxproj' files
#  do not contain a Byte Order Mark (BOM).
#
#  THIS SCRIPT DOES NOT WORK ON WINDOWS
#  It only works correctly on platforms where the native line ending is a plain \n
#
#  usage:
#     icu-file-utf8-check.py  [options]
#
#  options:
#     -h | --help    Print a usage line and exit.
#
#  The tool operates recursively on the directory from which it is run.
#  Only files from the ICU github repository are checked.
#  No changes are made to the repository or to the working copy;
#  problems are only reported on standard output.
#
#  The code is written so that it runs unchanged under Python 2.6+ and Python 3
#  (single-argument print calls, bytes literals, binary file reads).
#

import sys
import os
import os.path
import re
import getopt


# The three bytes that encode U+FEFF in UTF-8, i.e. the UTF-8 byte order mark.
UTF8_BOM = b"\xef\xbb\xbf"

# File name endings for which a leading BOM is tolerated.
BOM_ALLOWED_SUFFIXES = (".txt", ".sln", ".targets")

# Files whose names end in one of these extensions are treated as source code
# and must be valid UTF-8.
SOURCE_FILE_RE = re.compile(r".*\.(?:c|cpp|h|java)$")


#
#  runCommand          Run a shell command and return everything it wrote to stdout.
#    cmd:              the command line, passed to the shell by os.popen().
#
#  If the command fails, a message is written to stderr and the script exits
#  with a non-zero status.
#
def runCommand(cmd):
    output_file = os.popen(cmd)
    output_text = output_file.read()
    exit_status = output_file.close()
    if exit_status:
        sys.stderr.write('" %s " failed. Exiting.\n' % cmd)
        # close() reports a wait()-style status (exit code in the high byte).
        # Handing that straight to sys.exit() would let the OS truncate
        # e.g. 256 to 0 and report success, so extract the real exit code and
        # fall back to 1 when the child was killed by a signal.
        sys.exit(((exit_status >> 8) & 0xff) or 1)
    return output_text


#
#  usage               Print the one-line command synopsis.
#
def usage():
    print("usage: " + sys.argv[0] + " [-h | --help]")


#
#  _bom_allowed        True if a file with this name may start with a UTF-8 BOM.
#    file_name:        name of the file being checked.
#
def _bom_allowed(file_name):
    return file_name.endswith(BOM_ALLOWED_SUFFIXES) or ".vcxproj" in file_name


#
#  File check.         Check source code files for UTF-8 and all except text files for not containing a BOM
#    file_name:        name of a text file.
#    is_source:        Flag, set to True if file is a source code file (.c, .cpp, .h, .java).
#
#  Problems are reported on standard output; nothing is returned.
#
def check_file(file_name, is_source):
    # Read raw bytes: both checks are byte-level, and binary mode keeps the
    # data free of any newline or encoding translation.
    with open(file_name, 'rb') as f:
        data = f.read()

    if is_source:
        try:
            data.decode("UTF-8")
        except UnicodeDecodeError:
            print("Error: %s is a source code file but contains non-utf-8 bytes." % file_name)

    # Compare against the complete BOM rather than just the first byte, so that
    # empty files are handled and files that merely start with another
    # three-byte character (lead byte 0xEF) are not reported.
    if data.startswith(UTF8_BOM) and not _bom_allowed(file_name):
        print("Warning: file %s contains a UTF-8 BOM: " % file_name)

    return


#
#  main                Parse the command line, then check every file that
#                      'git ls-files' lists below the current directory.
#    argv:             command line arguments, without the program name.
#
#  Exits with status 2 on a command line error.
#
def main(argv):
    try:
        opts, args = getopt.getopt(argv, "h", ["help"])
    except getopt.GetoptError as err:
        # Report what getopt actually rejected; argv[0] is not necessarily
        # the offending option.
        print("error: %s" % err)
        usage()
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage()
            sys.exit()
    if args:
        print("unexpected command line argument")
        usage()
        # A usage error must not look like success to the caller.
        sys.exit(2)

    output = runCommand("git ls-files ")
    file_list = output.splitlines()

    for f in file_list:
        if os.path.isdir(f):
            print("Skipping dir " + f)
            continue
        if not os.path.isfile(f):
            print("Repository file not in working copy: " + f)
            continue

        check_file(f, SOURCE_FILE_RE.match(f) is not None)


if __name__ == "__main__":
    main(sys.argv[1:])