ICU-20119 Additional changes to UTF-8 checking script.
This commit is contained in:
parent
6f057afad6
commit
e259adc944
108
icu4c/source/tools/icu-file-utf8-check.py
Executable file
108
icu4c/source/tools/icu-file-utf8-check.py
Executable file
@ -0,0 +1,108 @@
|
||||
#! /usr/bin/python -B
|
||||
|
||||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
|
||||
# Copyright (C) 2009-2011, International Business Machines Corporation, Google and Others.
|
||||
# All rights reserved.
|
||||
|
||||
#
|
||||
# Script to check that ICU source files contain only valid UTF-8 encoded text,
|
||||
# and that all files except '.txt', '.sln' and '.targets' files, and files whose
# name contains '.vcxproj', do not contain a Byte Order Mark (BOM).
|
||||
#
|
||||
# THIS SCRIPT DOES NOT WORK ON WINDOWS
|
||||
# It only works correctly on platforms where the native line ending is a plain \n
|
||||
#
|
||||
# usage:
|
||||
# icu-file-utf8-check.py [options]
|
||||
#
|
||||
# options:
|
||||
# -h | --help Print a usage line and exit.
|
||||
#
|
||||
# The tool operates recursively on the directory from which it is run.
|
||||
# Only files from the ICU github repository are checked.
|
||||
# The script only reads files; neither the repository nor the working copy is modified.
|
||||
|
||||
import sys
|
||||
import os
|
||||
import os.path
|
||||
import re
|
||||
import getopt
|
||||
|
||||
|
||||
def runCommand(cmd):
    """Run a shell command and return everything it wrote to standard output.

    cmd: the command line, passed to the shell through os.popen().

    Returns the captured output as a string.

    If the command fails, a message is written to standard error and the
    script exits with the command's own exit code (or 1 when no exit code
    in the range 1..255 can be derived, e.g. the command died from a signal).
    """
    output_file = os.popen(cmd)
    try:
        output_text = output_file.read()
    finally:
        # close() reports the outcome: None on success, otherwise the
        # encoded wait status of the child process.
        exit_status = output_file.close()

    if exit_status:
        sys.stderr.write('" %s " failed. Exiting.\n' % cmd)
        # The wait status carries the exit code in its high byte.  Passing
        # the raw status to sys.exit() would be truncated to its low byte by
        # the OS, turning e.g. 256 (exit code 1) into a "successful" 0.
        exit_code = (exit_status >> 8) & 0xFF if exit_status > 0 else 0
        sys.exit(exit_code or 1)
    return output_text
|
||||
|
||||
|
||||
def usage():
    """Print a one-line usage summary to standard output."""
    # A single parenthesized argument prints identically under the Python 2
    # print statement and the Python 3 print function.
    print("usage: " + sys.argv[0] + " [-h | --help]")
|
||||
|
||||
|
||||
#
|
||||
# File check. Check source code files for UTF-8 and all except text files for not containing a BOM
|
||||
# file_name: name of a text file.
|
||||
# is_source: Flag, set to True if file is a source code file (.c, .cpp, .h, .java).
|
||||
#
|
||||
def check_file(file_name, is_source):
    """Check one file and print any problems found to standard output.

    file_name: path of the file to examine.
    is_source: True if the file is a source code file (.c, .cpp, .h, .java).
               Only such files are checked for being valid UTF-8.

    Independently of is_source, a warning is printed when the file starts
    with a UTF-8 byte order mark, unless its name ends in .txt, .sln or
    .targets, or contains ".vcxproj".

    Returns None.
    """
    # Read the raw bytes: text mode could decode or translate the content
    # before it gets validated.
    with open(file_name, 'rb') as f:
        content = f.read()

    if is_source:
        try:
            content.decode("UTF-8")
        except UnicodeDecodeError:
            print("Error: %s is a source code file but contains non-utf-8 bytes." % file_name)

    # Match the complete three-byte BOM rather than only its first byte, so
    # files that merely begin with another character encoded as 0xEF .. ..
    # are not reported.  startswith() is also safe on an empty file.
    if content.startswith(b"\xef\xbb\xbf"):
        bom_allowed = (file_name.endswith((".txt", ".sln", ".targets"))
                       or ".vcxproj" in file_name)
        if not bom_allowed:
            print("Warning: file %s contains a UTF-8 BOM: " % file_name)

    return
|
||||
|
||||
def main(argv):
    """Parse the command line, then check every file listed by 'git ls-files'.

    argv: the command-line arguments, without the program name.

    Exits with status 0 after printing the usage line for -h / --help, and
    with status 2 on an invalid option or an unexpected positional argument.
    Otherwise each listed path that is a regular file in the working copy is
    passed to check_file(); directories and missing files are reported and
    skipped.
    """
    try:
        opts, args = getopt.getopt(argv, "h", ["help"])
    except getopt.GetoptError as err:
        # Show getopt's own message: it names the option that was actually
        # rejected, which is not necessarily the first argument.
        print(err.msg)
        usage()
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage()
            sys.exit()
    if args:
        print("unexpected command line argument")
        usage()
        # A malformed command line is an error, so do not report success.
        sys.exit(2)

    file_list = runCommand("git ls-files ").splitlines()

    # File name suffixes that identify source code files.
    source_suffixes = (".c", ".cpp", ".h", ".java")

    for f in file_list:
        if os.path.isdir(f):
            print("Skipping dir " + f)
            continue
        if not os.path.isfile(f):
            print("Repository file not in working copy: " + f)
            continue

        check_file(f, f.endswith(source_suffixes))
|
||||
|
||||
# Script entry point: when the file is executed directly (not imported),
# hand the command-line arguments, minus the program name, to main().
if __name__ == "__main__":
    main(sys.argv[1:])
|
Loading…
Reference in New Issue
Block a user