scuffed-code/tools/release/c/bomfix.py

#!/usr/bin/python -B
# -*- coding: utf-8 -*-

#
# Copyright (C) 2017 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
#
# Copyright © 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (C) 2011 IBM Corporation and Others. All Rights Reserved.
#
# Run this like so:
#  cd /path/to/ICU
#  python /path/to/thisscript/bomfix.py
#
# It will fix up any files whose svn:mime-type is "text/plain;charset=utf-8" but that have no BOM.

import os
import codecs
import subprocess

# Announce the run; the tree walked is always the current directory.
print("Fixing bom in .\n")

# os.walk() is lazy: nothing is scanned until the main loop iterates it.
tree = os.walk(".")

# Handed to my_propget() as its last argument, which that function ignores.
ctx = None

# The three-byte UTF-8 byte order mark that gets prepended to files.
bom = codecs.BOM_UTF8

# Tallies reported in the summary line at the end of the script.
nots = notutf8 = noprops = utf8 = fixed = tfiles = 0


# my own rewrite
def my_propget(prop, path, ignored_rev, ignored_recurs, ignored_ctx):
    """Return the value of Subversion property `prop` on `path`, or None.

    Runs ``svn pg <prop> <path>`` and returns whatever the command wrote
    (stdout and stderr combined), unmodified -- so a successful result
    normally still carries its trailing newline.

    Args:
        prop: name of the property to query, e.g. "svn:mime-type".
        path: path of the file to query.
        ignored_rev: accepted but unused.
        ignored_recurs: accepted but unused.
        ignored_ctx: accepted but unused.

    Returns:
        The command output on success. None when svn exits with an error
        whose output contains E200005 (not under version control) or
        W200017 (property not found).

    Raises:
        subprocess.CalledProcessError: svn failed with any other error;
            the output is printed before the exception is re-raised.
    """
    try:
        return subprocess.check_output(["svn", "pg", prop, path], stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as cpe:
        # Decode the error: both of these codes simply mean "no value here".
        if "E200005" in cpe.output or "W200017" in cpe.output:
            return None
        # Report against our own `path` argument rather than a global set by
        # the caller, so the function works no matter who calls it.
        print("On " + path + ":\n" + cpe.output + "\n")
        print("This error wasn't recognized by bomfix, sorry.")
        # Bare raise keeps the original traceback intact.
        raise

# Walk every file below the current directory and prepend a UTF-8 BOM to
# each one whose svn:mime-type is "text/plain;charset=utf-8" and which does
# not already start with one.
for path, dirs, files in tree:
    # Never touch Subversion's own bookkeeping directories.
    if "/.svn" in path:
        continue
    for name in files:
        tfiles += 1
        # Use a relative path.  The global name `fp` is kept on purpose.
        fp = path + "/" + name
        props = my_propget("svn:mime-type", fp, None, 0, ctx)
        if not props:
            # my_propget() returns None both for unversioned files and for
            # files without the property, so both are tallied here.
            noprops += 1
            continue

        if fp == "./LICENSE":
            print("Skipping: %s" % fp)
            continue

        # The value ends with "\n" because it is raw process output.
        if props != "text/plain;charset=utf-8\n":
            notutf8 += 1
            continue

        # fp is declared to be UTF-8.
        utf8 += 1

        # Peek at the first three bytes; the handle is closed again before
        # any decision is taken so that no path below can leak it.
        with open(fp, 'rb') as f:
            head = f.read(3)

        if not head:
            # Only an empty file yields no bytes at all.
            print(fp + ": could not read 3 bytes")
            continue
        if head == bom:
            # Already has a BOM; nothing to do.
            continue

        # Move the original aside, then rebuild it as BOM + original bytes.
        # The source is closed before the rename, which also keeps this
        # working on platforms that refuse to rename an open file.
        tmp = fp + ".tmp"
        os.rename(fp, tmp)
        with open(tmp, 'rb') as src, open(fp, 'wb') as dst:
            dst.write(bom)
            while True:
                chunk = src.read(2048)
                if not chunk:
                    break
                dst.write(chunk)
        os.remove(tmp)
        fixed += 1

        print(fp)


# NOTE(review): `nots` is never incremented by this loop, so the
# "not under svn" figure always shows its initial value.
print("%d files, %d not under svn, %d with no props, %d not utf8: %d utf8, %d fixed\n" % (tfiles,nots,noprops,notutf8,utf8,fixed))