diff options
author | Kim Grasman <kim.grasman@gmail.com> | 2018-03-18 16:53:32 +0100 |
---|---|---|
committer | Kim Gräsman <kim.grasman@gmail.com> | 2018-04-08 19:03:35 +0200 |
commit | 5082fddccb3d5aabaace2208f1162029a27c0334 (patch) | |
tree | fa32d73e7b02bf721694d722fa71befc949382be | |
parent | eee0b0fc5113caf4834419bce28b96852115f5ea (diff) |
Detect and preserve original file encoding
This uses a few simple heuristics to detect file encoding before
rewriting file contents.
All file I/O is now binary, and decoding/encoding is explicit, based on
the detected encoding.
Cover a few common encodings (utf-8, ascii, windows-1250, windows-1252),
and make it easy to add more encodings should it be necessary.
This should make fix_includes.py behave better under Python 3 with
non-ASCII-encoded files.
-rwxr-xr-x | fix_includes.py | 45 | ||||
-rwxr-xr-x | fix_includes_test.py | 21 |
2 files changed, 59 insertions, 7 deletions
diff --git a/fix_includes.py b/fix_includes.py index 0dee667..37c535f 100755 --- a/fix_includes.py +++ b/fix_includes.py @@ -504,9 +504,11 @@ class LineInfo(object): class FileInfo(object): """ Details about a file's storage encoding """ DEFAULT_LINESEP = os.linesep + DEFAULT_ENCODING = 'utf-8' - def __init__(self, linesep): + def __init__(self, linesep, encoding): self.linesep = linesep + self.encoding = encoding @staticmethod def parse(filename): @@ -515,7 +517,8 @@ class FileInfo(object): content = f.read() linesep = FileInfo.guess_linesep(content) - return FileInfo(linesep) + encoding = FileInfo.guess_encoding(content) + return FileInfo(linesep, encoding) @staticmethod def guess_linesep(bytebuf): @@ -529,11 +532,41 @@ class FileInfo(object): return FileInfo.DEFAULT_LINESEP + @staticmethod + def guess_encoding(bytebuf): + """ Return approximate encoding for buffer. + + This is heavily heuristic, and will return any supported encoding that can + describe the file without losing information, not necessarily the *right* + encoding. This is usually OK, because IWYU typically only adds ASCII content + (or content pulled from the file itself). 
+ """ + def try_decode(buf, encoding): + try: + buf.decode(encoding, errors='strict') + except UnicodeError: + return False + return True + + # Special-case UTF-8 BOM + if bytebuf[0:3] == b'\xef\xbb\xbf': + if try_decode(bytebuf, 'utf-8'): + return 'utf-8' + + encodings = ['ascii', 'utf-8', 'windows-1250', 'windows-1252'] + for encoding in encodings: + if try_decode(bytebuf, encoding): + return encoding + + return FileInfo.DEFAULT_ENCODING + def _ReadFile(filename, fileinfo): """Read from filename and return a list of file lines.""" try: - return open(filename).read().splitlines() + with open(filename, 'rb') as f: + content = f.read() + return content.decode(fileinfo.encoding).splitlines() except (IOError, OSError) as why: print("Skipping '%s': %s" % (filename, why)) return None @@ -542,10 +575,10 @@ def _ReadFile(filename, fileinfo): def _WriteFile(filename, fileinfo, file_lines): """Write the given file-lines to the file.""" try: - # Open file in binary mode to preserve line endings with open(filename, 'wb') as f: - f.write(fileinfo.linesep.join(file_lines)) - f.write(fileinfo.linesep) + content = fileinfo.linesep.join(file_lines) + fileinfo.linesep + content = content.encode(fileinfo.encoding) + f.write(content) except (IOError, OSError) as why: print("Error writing '%s': %s" % (filename, why)) diff --git a/fix_includes_test.py b/fix_includes_test.py index 1915fe6..11e09f5 100755 --- a/fix_includes_test.py +++ b/fix_includes_test.py @@ -51,7 +51,7 @@ class FixIncludesBase(unittest.TestCase): return self.before_map[filename] def _ParseFileInfo(self, filename): - return fix_includes.FileInfo('\n') + return fix_includes.FileInfo('\n', 'utf-8') def _WriteFile(self, filename, fileinfo, contents): return self.actual_after_contents.extend(contents) @@ -3465,6 +3465,25 @@ class FileInfoTest(unittest.TestCase): self.assertEqual(fix_includes.FileInfo.DEFAULT_LINESEP, fix_includes.FileInfo.guess_linesep(buf)) + def testEncodingASCII(self): + buf = b'abcdefgh' + 
self.assertEqual('ascii', fix_includes.FileInfo.guess_encoding(buf)) + + def testEncodingUTF8BOM(self): + buf = b'\xef\xbb\xbfSomeASCIIButWithTheBOM' + self.assertEqual('utf-8', fix_includes.FileInfo.guess_encoding(buf)) + + def testEncodingUTF8NoBOM(self): + # This is a recurring test input in Swedish, translates to "shrimp sandwich" + # and contains all three Swedish exotic characters. + buf = b'r\xc3\xa4ksm\xc3\xb6rg\xc3\xa5s' + self.assertEqual('utf-8', fix_includes.FileInfo.guess_encoding(buf)) + + def testEncodingISO8859_1(self): + # Yours truly + buf = b'Kim Gr\xe4sman' + self.assertEqual('windows-1250', fix_includes.FileInfo.guess_encoding(buf)) + if __name__ == '__main__': unittest.main() |