diff --git a/dulwich/line_ending.py b/dulwich/line_ending.py new file mode 100644 index 00000000..c2a79f4f --- /dev/null +++ b/dulwich/line_ending.py @@ -0,0 +1,181 @@ +# line_ending.py -- Line ending conversion functions +# Copyright (C) 2018-2018 Boris Feld +# +# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU +# General Public License as public by the Free Software Foundation; version 2.0 +# or (at your option) any later version. You can redistribute it and/or +# modify it under the terms of either of these two licenses. +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# You should have received a copy of the licenses; if not, see +# for a copy of the GNU General Public License +# and for a copy of the Apache +# License, Version 2.0. +# +""" All line-ending related functions, from conversions to config processing + +Line-ending normalization is a complex beast. Here are some notes and details +about how it seems to work. + +The normalization is a two-fold process that happens at two moments: + +- When reading a file from the index to the working directory. For example + when doing a `git clone` or `git checkout` call. We call this process the + read filter in this module. +- When writing a file to the index from the working directory. For example + when doing a `git add` call. We call this process the write filter in this + module. + +One thing to know is that Git does line-ending normalization only on text +files. How does Git know that a file is text? We can either mark a file as a +text file, a binary file or ask Git to decide automatically. Git has a +heuristic to detect if a file is a text file or a binary file. 
It seems to be based +on the percentage of non-printable characters in files. + +The code for this heuristic is here: +https://git.kernel.org/pub/scm/git/git.git/tree/convert.c#n46 + +Dulwich has an implementation with a slightly different heuristic, the +`is_binary` function in `dulwich.patch`. + +The binary detection heuristic implementation is close to the one in JGit: +https://github.com/eclipse/jgit/blob/f6873ffe522bbc3536969a3a3546bf9a819b92bf/org.eclipse.jgit/src/org/eclipse/jgit/diff/RawText.java#L300 + +There are multiple variables that impact the normalization. + +First, a repository can contain a `.gitattributes` file (or more than one...) +that can further customize the operation on some file patterns, for example: + + *.txt text + +Force all `.txt` files to be treated as text files and to have their line +endings normalized. + + *.jpg -text + +Force all `.jpg` files to be treated as binary files and to not have their +line endings converted. + + *.vcproj text eol=crlf + +Force all `.vcproj` files to be treated as text files and to have their line +endings converted into `CRLF` in the working directory no matter the native +EOL of the platform. + + *.sh text eol=lf + +Force all `.sh` files to be treated as text files and to have their line +endings converted into `LF` in the working directory no matter the native +EOL of the platform. + +If the `eol` attribute is not defined, Git uses the `core.eol` configuration +value described later. + + * text=auto + +Force all files to be scanned by the text file heuristic detection and to have +their line endings normalized in case they are detected as text files. + +Git also has an obsolete attribute named `crlf` that can be translated to the +corresponding text attribute value. 
+ +Then there are some configuration options (that can be defined at the +repository or user level): + +- core.autocrlf +- core.eol + +`core.autocrlf` is taken into account for all files that don't have a `text` +attribute defined in `.gitattributes`; it takes three possible values: + + - `true`: This forces all files in the working directory to have CRLF + line-endings and converts line-endings to LF when writing to the index. + When autocrlf is set to true, the eol value is ignored. + - `input`: Quite similar to the `true` value but only forces the write + filter, i.e. new files added to the index will get their line-endings + converted to LF. + - `false` (default): No normalization is done. + +`core.eol` is the top-level configuration to define the line-ending to use +when applying the read filter. It takes three possible values: + + - `lf`: When normalization is done, force line-endings to be `LF` in the + working directory. + - `crlf`: When normalization is done, force line-endings to be `CRLF` in + the working directory. + - `native` (default): When normalization is done, force line-endings to be + the platform's native line ending. + +One thing to remember is that when line-ending normalization is done on a +file, Git always normalizes line-endings to `LF` when writing to the index. + +There are sources that seem to indicate that Git won't do line-ending +normalization when a file contains mixed line-endings. I think this logic +might be in the text / binary detection heuristic but I couldn't find it yet. 
# Sources:
# - https://git-scm.com/docs/git-config#git-config-coreeol
# - https://git-scm.com/docs/git-config#git-config-coreautocrlf
# - https://git-scm.com/docs/gitattributes#_checking_out_and_checking_in
# - https://adaptivepatchwork.com/2012/03/01/mind-the-end-of-your-line/

CRLF = b"\r\n"
LF = b"\n"

# Spellings that Git accepts for a boolean "true" configuration value.
# They are compared against the lower-cased configuration value.
_AUTOCRLF_TRUE_VALUES = (b"true", b"yes", b"on", b"1")


def _normalize_autocrlf(core_autocrlf):
    """Reduce a raw core.autocrlf value to b"true", b"input" or None

    Git parses this setting case-insensitively, so b"True" and b"INPUT" must
    select the same filters as their lower-case spellings.

    :param core_autocrlf: The raw configuration value, or None when unset
    :return: b"true", b"input", or None for any other value
    """
    try:
        value = core_autocrlf.lower()
    except AttributeError:
        # None (option unset) or a value without a `lower` method: no
        # conversion is requested.
        return None

    if value in _AUTOCRLF_TRUE_VALUES:
        return b"true"
    if value == b"input":
        return b"input"
    return None


def convert_crlf_to_lf(text_hunk):
    """Convert CRLF in text hunk into LF

    :param text_hunk: A bytes string representing a text hunk
    :return: The text hunk with the same type, with CRLF replaced into LF
    """
    return text_hunk.replace(CRLF, LF)


def convert_lf_to_crlf(text_hunk):
    """Convert LF in text hunk into CRLF

    Existing CRLF sequences are left as a single CRLF: they are first folded
    to LF so that the second replacement cannot turn them into CR CR LF.

    :param text_hunk: A bytes string representing a text hunk
    :return: The text hunk with the same type, with LF replaced into CRLF
    """
    # TODO find a more efficient way of doing it
    intermediary = text_hunk.replace(CRLF, LF)
    return intermediary.replace(LF, CRLF)


def get_checkout_filter_autocrlf(core_autocrlf):
    """ Returns the correct checkout filter base on autocrlf value

    :param core_autocrlf: The bytes configuration value of core.autocrlf.
        Valid values are: b'true', b'false' or b'input', in any letter case;
        b'yes', b'on' and b'1' are accepted as spellings of true.
    :return: Either None if no filter has to be applied or a function
        accepting a single argument, a binary text hunk
    """
    if _normalize_autocrlf(core_autocrlf) == b"true":
        return convert_lf_to_crlf

    # b"input" only affects checkin; everything else means no conversion.
    return None


def get_checkin_filter_autocrlf(core_autocrlf):
    """ Returns the correct checkin filter base on autocrlf value

    :param core_autocrlf: The bytes configuration value of core.autocrlf.
        Valid values are: b'true', b'false' or b'input', in any letter case;
        b'yes', b'on' and b'1' are accepted as spellings of true.
    :return: Either None if no filter has to be applied or a function
        accepting a single argument, a binary text hunk
    """
    if _normalize_autocrlf(core_autocrlf) in (b"true", b"input"):
        return convert_crlf_to_lf

    # Checking filter should never be `convert_lf_to_crlf`
    return None
"""Tests for Dulwich."""

import doctest
import os
import shutil
import subprocess
import sys
import tempfile


# If Python itself provides an exception, use that
import unittest
from unittest import (  # noqa: F401
    SkipTest,
    TestCase as _TestCase,
    skipIf,
    expectedFailure,
    )


class TestCase(_TestCase):
    """Base test case that points HOME at a dummy path while a test runs."""

    def setUp(self):
        """Remember the current HOME value and replace it with a dummy."""
        super(TestCase, self).setUp()
        # None means HOME was not set at all; an empty string is a real value.
        self._old_home = os.environ.get("HOME")
        os.environ["HOME"] = "/nonexistant"

    def tearDown(self):
        """Put HOME back exactly as setUp found it."""
        super(TestCase, self).tearDown()
        # Compare against None rather than truthiness so that an empty HOME
        # is restored instead of being deleted.
        if self._old_home is not None:
            os.environ["HOME"] = self._old_home
        else:
            # pop() tolerates a test that already removed HOME itself.
            os.environ.pop("HOME", None)


class BlackboxTestCase(TestCase):
    """Blackbox testing."""

    # TODO(jelmer): Include more possible binary paths.
    bin_directories = [os.path.abspath(os.path.join(
            os.path.dirname(__file__), "..", "..", "bin")), '/usr/bin',
            '/usr/local/bin']

    def bin_path(self, name):
        """Determine the full path of a binary.

        :param name: Name of the script
        :return: Full path
        :raise SkipTest: if the script is in none of `bin_directories`
        """
        for d in self.bin_directories:
            p = os.path.join(d, name)
            if os.path.isfile(p):
                return p
        raise SkipTest("Unable to find binary %s" % name)

    def run_command(self, name, args):
        """Run a Dulwich command.

        :param name: Name of the command, as it exists in bin/
        :param args: Arguments to the command
        :return: The `subprocess.Popen` object of the started process, with
            stdin, stdout and stderr connected to pipes
        """
        env = dict(os.environ)
        env["PYTHONPATH"] = os.pathsep.join(sys.path)

        # Since they don't have any extensions, Windows can't recognize
        # executablility of the Python files in /bin. Even then, we'd have to
        # expect the user to set up file associations for .py files.
        #
        # Save us from all that headache and call python with the bin script.
        argv = [sys.executable, self.bin_path(name)] + args
        return subprocess.Popen(
                argv,
                stdout=subprocess.PIPE,
                stdin=subprocess.PIPE, stderr=subprocess.PIPE,
                env=env)


def self_test_suite():
    """Load the `dulwich.tests.test_<name>` module for every listed name.

    :return: The suite returned by `TestLoader.loadTestsFromNames`
    """
    names = [
        'archive',
        'blackbox',
        'client',
        'config',
        'diff_tree',
        'fastexport',
        'file',
        'grafts',
        'greenthreads',
        'hooks',
        'ignore',
        'index',
        'line_ending',
        'lru_cache',
        'mailmap',
        'objects',
        'objectspec',
        'object_store',
        'missing_obj_finder',
        'pack',
        'patch',
        'porcelain',
        'protocol',
        'reflog',
        'refs',
        'repository',
        'server',
        'stash',
        'utils',
        'walk',
        'web',
        ]
    module_names = ['dulwich.tests.test_' + name for name in names]
    loader = unittest.TestLoader()
    return loader.loadTestsFromNames(module_names)


def tutorial_test_suite():
    """Build a doctest suite from the tutorial text files.

    Each doctest runs with a fresh temporary directory as its working
    directory; the directory is exposed to the doctest as `tempdir`.

    :return: The suite returned by `doctest.DocFileSuite`
    """
    import dulwich.client  # noqa: F401
    import dulwich.config  # noqa: F401
    import dulwich.index  # noqa: F401
    import dulwich.reflog  # noqa: F401
    import dulwich.repo  # noqa: F401
    import dulwich.server  # noqa: F401
    import dulwich.patch  # noqa: F401
    tutorial = [
        'introduction',
        'file-format',
        'repo',
        'object-store',
        'remote',
        'conclusion',
        ]
    tutorial_files = ["../../docs/tutorial/%s.txt" % name for name in tutorial]

    def setup(test):
        # Remember the current directory so that teardown can return to it.
        test.__old_cwd = os.getcwd()
        test.tempdir = tempfile.mkdtemp()
        test.globs.update({'tempdir': test.tempdir})
        os.chdir(test.tempdir)

    def teardown(test):
        # Leave the temporary directory before removing it.
        os.chdir(test.__old_cwd)
        shutil.rmtree(test.tempdir)
    return doctest.DocFileSuite(
            module_relative=True, package='dulwich.tests',
            setUp=setup, tearDown=teardown, *tutorial_files)
def nocompat_test_suite():
    """Combine the self tests, the tutorial doctests and the contrib tests.

    :return: A `unittest.TestSuite` holding the three groups, in that order
    """
    suite = unittest.TestSuite()
    for build in (self_test_suite, tutorial_test_suite):
        suite.addTests(build())
    from dulwich.contrib import test_suite as build_contrib_suite
    suite.addTests(build_contrib_suite())
    return suite


def compat_test_suite():
    """Wrap the `dulwich.tests.compat` tests in their own suite.

    :return: A `unittest.TestSuite` holding only the compat tests
    """
    suite = unittest.TestSuite()
    from dulwich.tests.compat import test_suite as build_compat_suite
    suite.addTests(build_compat_suite())
    return suite


def test_suite():
    """Build the complete suite: self, tutorial, compat and contrib tests.

    The tutorial doctests are left out when `sys.platform` is 'win32'.

    :return: A `unittest.TestSuite` holding every group, in that order
    """
    suite = unittest.TestSuite()
    suite.addTests(self_test_suite())
    running_on_win32 = sys.platform == 'win32'
    if not running_on_win32:
        suite.addTests(tutorial_test_suite())
    from dulwich.tests.compat import test_suite as build_compat_suite
    suite.addTests(build_compat_suite())
    from dulwich.contrib import test_suite as build_contrib_suite
    suite.addTests(build_contrib_suite())
    return suite
"""Tests for the line ending conversion."""

from dulwich.line_ending import (
    convert_crlf_to_lf,
    convert_lf_to_crlf,
    get_checkin_filter_autocrlf,
    get_checkout_filter_autocrlf,
)
from dulwich.tests import TestCase


class LineEndingConversion(TestCase):
    """Check both conversion helpers on plain, single and mixed endings."""

    def test_convert_crlf_to_lf_no_op(self):
        # A hunk without any line ending comes back unchanged.
        converted = convert_crlf_to_lf(b"foobar")
        self.assertEqual(converted, b"foobar")

    def test_convert_crlf_to_lf(self):
        converted = convert_crlf_to_lf(b"line1\r\nline2")
        self.assertEqual(converted, b"line1\nline2")

    def test_convert_crlf_to_lf_mixed(self):
        # The LF that is already present must be left alone.
        converted = convert_crlf_to_lf(b"line1\r\n\nline2")
        self.assertEqual(converted, b"line1\n\nline2")

    def test_convert_lf_to_crlf_no_op(self):
        # A hunk without any line ending comes back unchanged.
        converted = convert_lf_to_crlf(b"foobar")
        self.assertEqual(converted, b"foobar")

    def test_convert_lf_to_crlf(self):
        converted = convert_lf_to_crlf(b"line1\nline2")
        self.assertEqual(converted, b"line1\r\nline2")

    def test_convert_lf_to_crlf_mixed(self):
        # The CRLF that is already present must not gain a second CR.
        converted = convert_lf_to_crlf(b"line1\r\n\nline2")
        self.assertEqual(converted, b"line1\r\n\r\nline2")


class GetLineEndingAutocrlfFilters(TestCase):
    """Check which filter each core.autocrlf value selects."""

    def test_get_checkin_filter_autocrlf_default(self):
        selected = get_checkin_filter_autocrlf(b"false")
        self.assertIsNone(selected)

    def test_get_checkin_filter_autocrlf_true(self):
        selected = get_checkin_filter_autocrlf(b"true")
        self.assertIs(selected, convert_crlf_to_lf)

    def test_get_checkin_filter_autocrlf_input(self):
        selected = get_checkin_filter_autocrlf(b"input")
        self.assertIs(selected, convert_crlf_to_lf)

    def test_get_checkout_filter_autocrlf_default(self):
        selected = get_checkout_filter_autocrlf(b"false")
        self.assertIsNone(selected)

    def test_get_checkout_filter_autocrlf_true(self):
        selected = get_checkout_filter_autocrlf(b"true")
        self.assertIs(selected, convert_lf_to_crlf)

    def test_get_checkout_filter_autocrlf_input(self):
        # b"input" only enables the checkin filter.
        selected = get_checkout_filter_autocrlf(b"input")
        self.assertIsNone(selected)