icu/tools/scripts/icu-file-utf8-check.py
gnrunge 5a922544be ICU-21248 Adds line ending check to Travis CI check. The eol check fits
quite well into what the script is doing already.
Note that one file, 'testrunner.cmd', contains \r, which seems to
be acceptable since it is a Windows file.
2020-09-24 09:56:55 -07:00

140 lines
4.2 KiB
Python
Executable file

#! /usr/bin/python -B
# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (C) 2009-2011, International Business Machines Corporation, Google and Others.
# All rights reserved.
#
# Script to check that ICU source files contain only valid UTF-8 encoded text,
# and that all files except '.txt' files do not contain a Byte Order Mark (BOM).
#
# THIS SCRIPT DOES NOT WORK ON WINDOWS
# It only works correctly on platforms where the native line ending is a plain \n
#
# usage:
# icu-file-utf8-check.py [options]
#
# options:
# -h | --help Print a usage line and exit.
#
# The tool operates recursively on the directory from which it is run.
# Only files from the ICU github repository are checked.
# The script is read-only: it changes neither the repository nor the working copy.
# The script checks all tracked files and returns a non-zero exit code if any of
# them fails a check: non-UTF-8 bytes in a source file, an unexpected BOM, or a
# \r line ending.
from __future__ import print_function
import sys
import os
import os.path
import re
import getopt
# Directories (relative to the current working directory) whose git-tracked
# files are checked for UTF-8 and BOM. "." currently covers all of icu/.
# Modify as needed.
icu_directories_to_be_scanned = [
    ".",
]

# Files that may legitimately contain \r line endings and are therefore
# exempt from the line-ending check. If this list grows too long, consider
# moving it into a separate file.
ignore_cr_in_files = [
    "vendor/double-conversion/upstream/msvc/testrunner.cmd",
]
def runCommand(cmd):
    """Run a shell command and capture its standard output.

    cmd: the command line, passed to os.popen() and hence run through the shell.

    Returns a tuple (output_text, exit_status). exit_status is whatever
    close() on the pipe returns: None when the command exited successfully,
    a non-zero value otherwise.
    """
    pipe = os.popen(cmd)
    try:
        output_text = pipe.read()
    finally:
        # Always close the pipe, even if reading fails, so it is not leaked.
        # close() is also the only way to obtain the command's exit status.
        exit_status = pipe.close()
    return output_text, exit_status
def usage():
    """Print a one-line usage summary for this script to standard output."""
    program = sys.argv[0]
    print("usage: " + program + " [-h | --help]")
#
# File check. Check source code files for UTF-8 and all except text files for not containing a BOM
# file_name: name of a text file.
# is_source: Flag, set to True if file is a source code file (.c, .cpp, .h, .java).
#
def check_file(file_name, is_source):
    """Check one file for invalid UTF-8 and for an unexpected UTF-8 BOM.

    file_name: path of the file to check; it is read in binary mode.
    is_source: truthy if the file is a source code file. Only source files
               are required to decode as UTF-8.

    Every file, source or not, is checked for a leading UTF-8 BOM. A BOM is
    tolerated in names ending in .txt, .sln or .targets and in names
    containing ".vcxproj".

    Returns 0 if no problem was found, 1 otherwise. Each problem is reported
    on standard output.
    """
    rc = 0
    with open(file_name, 'rb') as f:
        content = f.read()

    if is_source:
        try:
            content.decode("UTF-8")
        except UnicodeDecodeError:
            print("Error: %s is a source code file but contains non-utf-8 bytes." % file_name)
            rc = 1

    # Compare against the complete three-byte BOM. startswith() is safe on an
    # empty file, and does not mistake other characters whose UTF-8 encoding
    # begins with 0xEF (U+F000..U+FFFF) for a BOM.
    if content.startswith(b"\xef\xbb\xbf"):
        bom_allowed = (file_name.endswith((".txt", ".sln", ".targets"))
                       or ".vcxproj" in file_name)
        if not bom_allowed:
            print("Warning: file %s contains a UTF-8 BOM: " % file_name)
            rc = 1
    return rc
def main(argv):
    """Check every git-tracked file under the configured directories.

    argv: command line arguments without the program name. Only -h / --help
          is accepted; any positional argument is an error.

    For each directory in icu_directories_to_be_scanned, the files listed by
    'git ls-files' are passed to check_file() (UTF-8 and BOM checks) and then
    searched for \\r characters with grep. Files named in ignore_cr_in_files
    are exempt from the \\r check.

    This function does not return: it prints the final status and calls
    sys.exit() with 0 if all checks passed, 1 if any file failed a check or
    'git ls-files' failed, and 2 on a command line error.
    """
    # shlex.quote exists on Python 3; Python 2 offers the same helper as
    # pipes.quote.
    try:
        from shlex import quote
    except ImportError:
        from pipes import quote

    exit_status = 0

    try:
        opts, args = getopt.getopt(argv, "h", ["help"])
    except getopt.GetoptError:
        print("unrecognized option: " + argv[0])
        usage()
        sys.exit(2)
    for opt, _ in opts:
        if opt in ("-h", "--help"):
            usage()
            sys.exit()
    if args:
        print("unexpected command line argument")
        usage()
        sys.exit(2)

    source_file_re = re.compile(".*((?:\\.c$)|(?:\\.cpp$)|(?:\\.h$)|(?:\\.java$))")
    git_cmd = "git ls-files DIR"
    for directory in icu_directories_to_be_scanned:
        print('Scanning ' + directory)
        cmd = git_cmd.replace("DIR", directory)
        output, rc = runCommand(cmd)
        if rc:
            # Without a file list there is nothing meaningful to check, so
            # actually stop here as the message announces.
            print('"', cmd, '" failed. Exiting.', file=sys.stderr)
            sys.exit(1)

        for f in output.splitlines():
            if os.path.isdir(f):
                print("Skipping dir " + f)
                continue
            if not os.path.isfile(f):
                print("Repository file not in working copy: " + f)
                continue

            source_file = source_file_re.match(f)
            if check_file(f, source_file) != 0:
                exit_status = 1

            # Lastly, check the line endings of the file.
            # grep exits with status 0 when it finds (and lists) a match, and
            # runCommand reports a zero exit status as None.
            # The file name is shell-quoted and placed after "--" so that
            # names with spaces, shell metacharacters or a leading '-' are
            # passed to grep intact.
            output, rc = runCommand("grep -rPIl \"\\r\" -- " + quote(f))
            if rc is None:
                if f not in ignore_cr_in_files:
                    print("File ", f, " has \\r line ending")
                    exit_status = 1

    print(exit_status)
    sys.exit(exit_status)
# Script entry point: hand everything after the program name to main().
if __name__ == "__main__":
    cli_args = sys.argv[1:]
    main(cli_args)