summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Kim Grasman <kim.grasman@gmail.com> 2018-03-18 16:53:32 +0100
committer: Kim Gräsman <kim.grasman@gmail.com> 2018-04-08 19:03:35 +0200
commit: 5082fddccb3d5aabaace2208f1162029a27c0334 (patch)
tree: fa32d73e7b02bf721694d722fa71befc949382be
parent: eee0b0fc5113caf4834419bce28b96852115f5ea (diff)
Detect and preserve original file encoding
This uses a few simple heuristics to detect a file's encoding before rewriting its contents. All file I/O is now binary, and decoding/encoding is done explicitly based on the detected encoding. This covers a few common encodings (utf-8, ascii, windows-1250, windows-1252) and makes it easy to add more encodings should that become necessary. It should make fix_includes.py behave better under Python 3 with non-ASCII-encoded files.
-rwxr-xr-x fix_includes.py 45
-rwxr-xr-x fix_includes_test.py 21
2 files changed, 59 insertions, 7 deletions
diff --git a/fix_includes.py b/fix_includes.py
index 0dee667..37c535f 100755
--- a/fix_includes.py
+++ b/fix_includes.py
@@ -504,9 +504,11 @@ class LineInfo(object):
class FileInfo(object):
""" Details about a file's storage encoding """
DEFAULT_LINESEP = os.linesep
+ DEFAULT_ENCODING = 'utf-8'
- def __init__(self, linesep):
+ def __init__(self, linesep, encoding):
self.linesep = linesep
+ self.encoding = encoding
@staticmethod
def parse(filename):
@@ -515,7 +517,8 @@ class FileInfo(object):
content = f.read()
linesep = FileInfo.guess_linesep(content)
- return FileInfo(linesep)
+ encoding = FileInfo.guess_encoding(content)
+ return FileInfo(linesep, encoding)
@staticmethod
def guess_linesep(bytebuf):
@@ -529,11 +532,41 @@ class FileInfo(object):
return FileInfo.DEFAULT_LINESEP
+ @staticmethod
+ def guess_encoding(bytebuf):
+ """ Return approximate encoding for buffer.
+
+ This is heavily heuristic, and will return any supported encoding that can
+ describe the file without losing information, not necessarily the *right*
+ encoding. This is usually OK, because IWYU typically only adds ASCII content
+ (or content pulled from the file itself).
+ """
+ def try_decode(buf, encoding):
+ try:
+ buf.decode(encoding, errors='strict')
+ except UnicodeError:
+ return False
+ return True
+
+ # Special-case UTF-8 BOM
+ if bytebuf[0:3] == b'\xef\xbb\xbf':
+ if try_decode(bytebuf, 'utf-8'):
+ return 'utf-8'
+
+ encodings = ['ascii', 'utf-8', 'windows-1250', 'windows-1252']
+ for encoding in encodings:
+ if try_decode(bytebuf, encoding):
+ return encoding
+
+ return FileInfo.DEFAULT_ENCODING
+
def _ReadFile(filename, fileinfo):
"""Read from filename and return a list of file lines."""
try:
- return open(filename).read().splitlines()
+ with open(filename, 'rb') as f:
+ content = f.read()
+ return content.decode(fileinfo.encoding).splitlines()
except (IOError, OSError) as why:
print("Skipping '%s': %s" % (filename, why))
return None
@@ -542,10 +575,10 @@ def _ReadFile(filename, fileinfo):
def _WriteFile(filename, fileinfo, file_lines):
"""Write the given file-lines to the file."""
try:
- # Open file in binary mode to preserve line endings
with open(filename, 'wb') as f:
- f.write(fileinfo.linesep.join(file_lines))
- f.write(fileinfo.linesep)
+ content = fileinfo.linesep.join(file_lines) + fileinfo.linesep
+ content = content.encode(fileinfo.encoding)
+ f.write(content)
except (IOError, OSError) as why:
print("Error writing '%s': %s" % (filename, why))
diff --git a/fix_includes_test.py b/fix_includes_test.py
index 1915fe6..11e09f5 100755
--- a/fix_includes_test.py
+++ b/fix_includes_test.py
@@ -51,7 +51,7 @@ class FixIncludesBase(unittest.TestCase):
return self.before_map[filename]
def _ParseFileInfo(self, filename):
- return fix_includes.FileInfo('\n')
+ return fix_includes.FileInfo('\n', 'utf-8')
def _WriteFile(self, filename, fileinfo, contents):
return self.actual_after_contents.extend(contents)
@@ -3465,6 +3465,25 @@ class FileInfoTest(unittest.TestCase):
self.assertEqual(fix_includes.FileInfo.DEFAULT_LINESEP,
fix_includes.FileInfo.guess_linesep(buf))
+ def testEncodingASCII(self):
+ buf = b'abcdefgh'
+ self.assertEqual('ascii', fix_includes.FileInfo.guess_encoding(buf))
+
+ def testEncodingUTF8BOM(self):
+ buf = b'\xef\xbb\xbfSomeASCIIButWithTheBOM'
+ self.assertEqual('utf-8', fix_includes.FileInfo.guess_encoding(buf))
+
+ def testEncodingUTF8NoBOM(self):
+ # This is a recurring test input in Swedish, translates to "shrimp sandwich"
+ # and contains all three Swedish exotic characters.
+ buf = b'r\xc3\xa4ksm\xc3\xb6rg\xc3\xa5s'
+ self.assertEqual('utf-8', fix_includes.FileInfo.guess_encoding(buf))
+
+ def testEncodingISO8859_1(self):
+ # Yours truly
+ buf = b'Kim Gr\xe4sman'
+ self.assertEqual('windows-1250', fix_includes.FileInfo.guess_encoding(buf))
+
if __name__ == '__main__':
unittest.main()