Detect and preserve original file encoding

This uses a few simple heuristics to detect a file's encoding before rewriting its contents. All file I/O is now binary, and decoding/encoding is explicit, based on the detected encoding. The heuristics cover a few common encodings (utf-8, ascii, windows-1250, windows-1252) and make it easy to add more encodings should that become necessary. This should make fix_includes.py behave better under Python 3 with non-ASCII-encoded files.
author: Kim Grasman <kim.grasman@gmail.com> 2018-03-18 16:53:32 +0100
committer: Kim Gräsman <kim.grasman@gmail.com> 2018-04-08 19:03:35 +0200
commit: 5082fddccb3d5aabaace2208f1162029a27c0334 (patch)
tree: fa32d73e7b02bf721694d722fa71befc949382be
parent: eee0b0fc5113caf4834419bce28b96852115f5ea (diff)
2 files changed, 59 insertions, 7 deletions
diff --git a/fix_includes.py b/fix_includes.py
index 0dee667..37c535f 100755
--- a/fix_includes.py
+++ b/fix_includes.py
@@ -504,9 +504,11 @@ class LineInfo(object):
 class FileInfo(object):
   """ Details about a file's storage encoding  """
   DEFAULT_LINESEP = os.linesep
+  DEFAULT_ENCODING = 'utf-8'
 
-  def __init__(self, linesep):
+  def __init__(self, linesep, encoding):
     self.linesep = linesep
+    self.encoding = encoding
 
   @staticmethod
   def parse(filename):
@@ -515,7 +517,8 @@ class FileInfo(object):
       content = f.read()
 
     linesep = FileInfo.guess_linesep(content)
-    return FileInfo(linesep)
+    encoding = FileInfo.guess_encoding(content)
+    return FileInfo(linesep, encoding)
 
   @staticmethod
   def guess_linesep(bytebuf):
@@ -529,11 +532,41 @@ class FileInfo(object):
 
     return FileInfo.DEFAULT_LINESEP
 
+  @staticmethod
+  def guess_encoding(bytebuf):
+    """ Return approximate encoding for buffer.
+
+    This is heavily heuristic, and will return any supported encoding that can
+    describe the file without losing information, not necessarily the *right*
+    encoding. This is usually OK, because IWYU typically only adds ASCII content
+    (or content pulled from the file itself).
+    """
+    def try_decode(buf, encoding):
+      try:
+        buf.decode(encoding, errors='strict')
+      except UnicodeError:
+        return False
+      return True
+
+    # Special-case UTF-8 BOM
+    if bytebuf[0:3] == b'\xef\xbb\xbf':
+      if try_decode(bytebuf, 'utf-8'):
+        return 'utf-8'
+
+    encodings = ['ascii', 'utf-8', 'windows-1250', 'windows-1252']
+    for encoding in encodings:
+      if try_decode(bytebuf, encoding):
+        return encoding
+
+    return FileInfo.DEFAULT_ENCODING
+
 
 def _ReadFile(filename, fileinfo):
   """Read from filename and return a list of file lines."""
   try:
-    return open(filename).read().splitlines()
+    with open(filename, 'rb') as f:
+      content = f.read()
+      return content.decode(fileinfo.encoding).splitlines()
   except (IOError, OSError) as why:
     print("Skipping '%s': %s" % (filename, why))
   return None
@@ -542,10 +575,10 @@ def _ReadFile(filename, fileinfo):
 def _WriteFile(filename, fileinfo, file_lines):
   """Write the given file-lines to the file."""
   try:
-    # Open file in binary mode to preserve line endings
     with open(filename, 'wb') as f:
-      f.write(fileinfo.linesep.join(file_lines))
-      f.write(fileinfo.linesep)
+      content = fileinfo.linesep.join(file_lines) + fileinfo.linesep
+      content = content.encode(fileinfo.encoding)
+      f.write(content)
   except (IOError, OSError) as why:
     print("Error writing '%s': %s" % (filename, why))
 
diff --git a/fix_includes_test.py b/fix_includes_test.py
index 1915fe6..11e09f5 100755
--- a/fix_includes_test.py
+++ b/fix_includes_test.py
@@ -51,7 +51,7 @@ class FixIncludesBase(unittest.TestCase):
     return self.before_map[filename]
 
   def _ParseFileInfo(self, filename):
-      return fix_includes.FileInfo('\n')
+      return fix_includes.FileInfo('\n', 'utf-8')
 
   def _WriteFile(self, filename, fileinfo, contents):
       return self.actual_after_contents.extend(contents)
@@ -3465,6 +3465,25 @@ class FileInfoTest(unittest.TestCase):
     self.assertEqual(fix_includes.FileInfo.DEFAULT_LINESEP,
                      fix_includes.FileInfo.guess_linesep(buf))
 
+  def testEncodingASCII(self):
+    buf = b'abcdefgh'
+    self.assertEqual('ascii', fix_includes.FileInfo.guess_encoding(buf))
+
+  def testEncodingUTF8BOM(self):
+    buf = b'\xef\xbb\xbfSomeASCIIButWithTheBOM'
+    self.assertEqual('utf-8', fix_includes.FileInfo.guess_encoding(buf))
+
+  def testEncodingUTF8NoBOM(self):
+    # This is a recurring test input in Swedish, translates to "shrimp sandwich"
+    # and contains all three Swedish exotic characters.
+    buf = b'r\xc3\xa4ksm\xc3\xb6rg\xc3\xa5s'
+    self.assertEqual('utf-8', fix_includes.FileInfo.guess_encoding(buf))
+
+  def testEncodingISO8859_1(self):
+    # Yours truly
+    buf = b'Kim Gr\xe4sman'
+    self.assertEqual('windows-1250', fix_includes.FileInfo.guess_encoding(buf))
+
 
 if __name__ == '__main__':
   unittest.main()
author	Kim Grasman <kim.grasman@gmail.com>	2018-03-18 16:53:32 +0100
committer	Kim Gräsman <kim.grasman@gmail.com>	2018-04-08 19:03:35 +0200
commit	5082fddccb3d5aabaace2208f1162029a27c0334 (patch)
tree	fa32d73e7b02bf721694d722fa71befc949382be
parent	eee0b0fc5113caf4834419bce28b96852115f5ea (diff)