diff --git a/MANIFEST.in b/MANIFEST.in index 1374aef..fe3bc0d 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,7 +1,7 @@ include README.md include Makefile include requirements*.txt include version.txt -recursive-include swh/loader/git/tests/data *.xz -recursive-include swh/loader/git/tests/resources/ * +recursive-include swh/loader/git/tests/data * +recursive-include swh/loader/git/tests/data *.bundle *.tgz recursive-include swh py.typed diff --git a/PKG-INFO b/PKG-INFO index 16997f4..747fa5e 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,101 +1,101 @@ Metadata-Version: 2.1 Name: swh.loader.git -Version: 0.2.0 +Version: 0.3.0 Summary: Software Heritage git loader Home-page: https://forge.softwareheritage.org/diffusion/DLDG/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-loader-git Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-loader-git/ Description: swh-loader-git ============== The Software Heritage Git Loader is a tool and a library to walk a local Git repository and inject into the SWH dataset all contained files that weren't known before. License ------- This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. See top-level LICENSE file for the full text of the GNU General Public License along with this program. Dependencies ------------ ### Runtime - python3 - python3-dulwich - python3-retrying - python3-swh.core - python3-swh.model - python3-swh.storage - python3-swh.scheduler ### Test - python3-nose Requirements ------------ - implementation language, Python3 - coding guidelines: conform to PEP8 - Git access: via dulwich Configuration ------------- You can run the loader from a remote origin (*loader*) or from an origin on disk (*from_disk*) directly by calling: ``` python3 -m swh.loader.git.{loader,from_disk} ``` ### Location Both tools expect a configuration file. 
Either one of the following location: - /etc/softwareheritage/ - ~/.config/swh/ - ~/.swh/ Note: Will call that location $SWH_CONFIG_PATH ### Configuration sample Respectively the loader from a remote (`git.yml`) and the loader from a disk (`git-disk.yml`), $SWH_CONFIG_PATH/loader/git{-disk}.yml: ``` storage: cls: remote args: url: http://localhost:5002/ ``` Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing diff --git a/requirements-swh.txt b/requirements-swh.txt index 2ee53a3..747453e 100644 --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,5 +1,5 @@ swh.core >= 0.0.7 -swh.loader.core >= 0.3.2 -swh.model >= 0.3.0 +swh.loader.core >= 0.5.2 +swh.model >= 0.4.0 swh.scheduler >= 0.0.39 -swh.storage >= 0.7.0 +swh.storage >= 0.10.0 diff --git a/requirements-test.txt b/requirements-test.txt index 00557c9..a037972 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1,3 +1,4 @@ pytest pytest-mock swh.scheduler[testing] +swh.storage[testing] diff --git a/swh.loader.git.egg-info/PKG-INFO b/swh.loader.git.egg-info/PKG-INFO index 16997f4..747fa5e 100644 --- a/swh.loader.git.egg-info/PKG-INFO +++ b/swh.loader.git.egg-info/PKG-INFO @@ -1,101 +1,101 @@ Metadata-Version: 2.1 Name: swh.loader.git -Version: 0.2.0 +Version: 0.3.0 Summary: Software Heritage git loader Home-page: https://forge.softwareheritage.org/diffusion/DLDG/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-loader-git Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-loader-git/ Description: swh-loader-git ============== The Software Heritage Git Loader is a tool and a library to walk a local Git repository and inject into the SWH dataset all contained files that weren't known before. License ------- This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. See top-level LICENSE file for the full text of the GNU General Public License along with this program. Dependencies ------------ ### Runtime - python3 - python3-dulwich - python3-retrying - python3-swh.core - python3-swh.model - python3-swh.storage - python3-swh.scheduler ### Test - python3-nose Requirements ------------ - implementation language, Python3 - coding guidelines: conform to PEP8 - Git access: via dulwich Configuration ------------- You can run the loader from a remote origin (*loader*) or from an origin on disk (*from_disk*) directly by calling: ``` python3 -m swh.loader.git.{loader,from_disk} ``` ### Location Both tools expect a configuration file. 
Either one of the following location: - /etc/softwareheritage/ - ~/.config/swh/ - ~/.swh/ Note: Will call that location $SWH_CONFIG_PATH ### Configuration sample Respectively the loader from a remote (`git.yml`) and the loader from a disk (`git-disk.yml`), $SWH_CONFIG_PATH/loader/git{-disk}.yml: ``` storage: cls: remote args: url: http://localhost:5002/ ``` Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing diff --git a/swh.loader.git.egg-info/SOURCES.txt b/swh.loader.git.egg-info/SOURCES.txt index 3c3ad69..df2a14b 100644 --- a/swh.loader.git.egg-info/SOURCES.txt +++ b/swh.loader.git.egg-info/SOURCES.txt @@ -1,34 +1,34 @@ MANIFEST.in Makefile README.md pyproject.toml requirements-swh.txt requirements-test.txt requirements.txt setup.cfg setup.py version.txt swh/__init__.py swh.loader.git.egg-info/PKG-INFO swh.loader.git.egg-info/SOURCES.txt swh.loader.git.egg-info/dependency_links.txt swh.loader.git.egg-info/entry_points.txt swh.loader.git.egg-info/requires.txt swh.loader.git.egg-info/top_level.txt swh/loader/__init__.py swh/loader/git/__init__.py swh/loader/git/converters.py swh/loader/git/from_disk.py swh/loader/git/loader.py swh/loader/git/py.typed swh/loader/git/tasks.py swh/loader/git/utils.py swh/loader/git/tests/__init__.py swh/loader/git/tests/conftest.py swh/loader/git/tests/test_converters.py swh/loader/git/tests/test_from_disk.py swh/loader/git/tests/test_loader.py swh/loader/git/tests/test_tasks.py swh/loader/git/tests/test_utils.py -swh/loader/git/tests/data/git-repos/example-submodule.fast-export.xz -swh/loader/git/tests/resources/testrepo.tgz \ No newline at end of file +swh/loader/git/tests/data/testrepo.tgz +swh/loader/git/tests/data/git-repos/example-submodule.bundle \ No newline at end of file diff --git a/swh.loader.git.egg-info/requires.txt b/swh.loader.git.egg-info/requires.txt index f0b7c4a..4f8f700 100644 --- a/swh.loader.git.egg-info/requires.txt +++ b/swh.loader.git.egg-info/requires.txt @@ -1,14 +1,15 @@ dulwich>=0.18.7 retrying vcversioner click swh.core>=0.0.7 -swh.loader.core>=0.3.2 -swh.model>=0.3.0 +swh.loader.core>=0.5.2 +swh.model>=0.4.0 swh.scheduler>=0.0.39 -swh.storage>=0.7.0 +swh.storage>=0.10.0 [testing] pytest pytest-mock swh.scheduler[testing] +swh.storage[testing] diff --git a/swh/loader/git/converters.py b/swh/loader/git/converters.py index be3abac..6b75214 100644 --- a/swh/loader/git/converters.py +++ b/swh/loader/git/converters.py @@ -1,189 +1,186 @@ # Copyright (C) 2015-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """Convert dulwich objects to dictionaries suitable for swh.storage""" -from typing import Any, Dict, Optional +from typing import Any, Dict, Optional, Tuple from swh.model.hashutil import DEFAULT_ALGORITHMS, hash_to_bytes, MultiHash from swh.model.model import ( BaseContent, Content, Directory, DirectoryEntry, ObjectType, Person, Release, Revision, RevisionType, SkippedContent, TargetType, Timestamp, TimestampWithTimezone, ) HASH_ALGORITHMS = DEFAULT_ALGORITHMS - {"sha1_git"} def dulwich_blob_to_content_id(blob) -> Dict[str, Any]: 
"""Convert a dulwich blob to a Software Heritage content id""" if blob.type_name != b"blob": raise ValueError("Argument is not a blob.") size = blob.raw_length() data = blob.as_raw_string() hashes = MultiHash.from_data(data, HASH_ALGORITHMS).digest() hashes["sha1_git"] = blob.sha().digest() hashes["length"] = size return hashes def dulwich_blob_to_content(blob, max_content_size=None) -> BaseContent: """Convert a dulwich blob to a Software Heritage content """ if blob.type_name != b"blob": raise ValueError("Argument is not a blob.") hashes = dulwich_blob_to_content_id(blob) if max_content_size is not None and hashes["length"] >= max_content_size: return SkippedContent(status="absent", reason="Content too large", **hashes,) else: return Content(data=blob.as_raw_string(), status="visible", **hashes,) def dulwich_tree_to_directory(tree, log=None) -> Directory: """Format a tree as a directory""" if tree.type_name != b"tree": raise ValueError("Argument is not a tree.") entries = [] entry_mode_map = { 0o040000: "dir", 0o160000: "rev", 0o100644: "file", 0o100755: "file", 0o120000: "file", } for entry in tree.iteritems(): entries.append( DirectoryEntry( type=entry_mode_map.get(entry.mode, "file"), perms=entry.mode, name=entry.path, target=hash_to_bytes(entry.sha.decode("ascii")), ) ) return Directory(id=tree.sha().digest(), entries=tuple(entries),) def parse_author(name_email: bytes) -> Person: """Parse an author line""" return Person.from_fullname(name_email) def dulwich_tsinfo_to_timestamp( timestamp, timezone, timezone_neg_utc ) -> TimestampWithTimezone: """Convert the dulwich timestamp information to a structure compatible with Software Heritage""" return TimestampWithTimezone( timestamp=Timestamp(seconds=int(timestamp), microseconds=0,), offset=timezone // 60, negative_utc=timezone_neg_utc if timezone == 0 else False, ) def dulwich_commit_to_revision(commit, log=None) -> Revision: if commit.type_name != b"commit": raise ValueError("Argument is not a commit.") git_metadata = [] if commit.encoding is not None: - git_metadata.append(["encoding", commit.encoding]) + git_metadata.append((b"encoding", commit.encoding)) if commit.mergetag: for mergetag in commit.mergetag: raw_string = mergetag.as_raw_string() assert raw_string.endswith(b"\n") - git_metadata.append(["mergetag", raw_string[:-1]]) + git_metadata.append((b"mergetag", raw_string[:-1])) if commit.extra: - git_metadata.extend([k.decode("utf-8"), v] for k, v in commit.extra) + git_metadata.extend((k, v) for k, v in commit.extra) if commit.gpgsig: - git_metadata.append(["gpgsig", commit.gpgsig]) + git_metadata.append((b"gpgsig", commit.gpgsig)) - if git_metadata: - metadata: Optional[Dict[str, Any]] = { - "extra_headers": git_metadata, - } - else: - metadata = None + extra_headers: Tuple[Tuple[bytes, bytes], ...] 
+ extra_headers = tuple(git_metadata) return Revision( id=commit.sha().digest(), author=parse_author(commit.author), date=dulwich_tsinfo_to_timestamp( commit.author_time, commit.author_timezone, commit._author_timezone_neg_utc, ), committer=parse_author(commit.committer), committer_date=dulwich_tsinfo_to_timestamp( commit.commit_time, commit.commit_timezone, commit._commit_timezone_neg_utc, ), type=RevisionType.GIT, directory=bytes.fromhex(commit.tree.decode()), message=commit.message, - metadata=metadata, + metadata=None, + extra_headers=extra_headers, synthetic=False, parents=tuple(bytes.fromhex(p.decode()) for p in commit.parents), ) DULWICH_TARGET_TYPES = { b"blob": TargetType.CONTENT, b"tree": TargetType.DIRECTORY, b"commit": TargetType.REVISION, b"tag": TargetType.RELEASE, } DULWICH_OBJECT_TYPES = { b"blob": ObjectType.CONTENT, b"tree": ObjectType.DIRECTORY, b"commit": ObjectType.REVISION, b"tag": ObjectType.RELEASE, } def dulwich_tag_to_release(tag, log=None) -> Release: if tag.type_name != b"tag": raise ValueError("Argument is not a tag.") target_type, target = tag.object if tag.tagger: author: Optional[Person] = parse_author(tag.tagger) if not tag.tag_time: date = None else: date = dulwich_tsinfo_to_timestamp( tag.tag_time, tag.tag_timezone, tag._tag_timezone_neg_utc, ) else: author = date = None return Release( id=tag.sha().digest(), author=author, date=date, name=tag.name, target=bytes.fromhex(target.decode()), target_type=DULWICH_OBJECT_TYPES[target_type.type_name], message=tag._message, metadata=None, synthetic=False, ) diff --git a/swh/loader/git/tests/__init__.py b/swh/loader/git/tests/__init__.py index c30ab7c..424c5fb 100644 --- a/swh/loader/git/tests/__init__.py +++ b/swh/loader/git/tests/__init__.py @@ -1,27 +1,4 @@ -# Copyright (C) 2018-2019 The Software Heritage developers +# Copyright (C) 2018-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information - -TEST_LOADER_CONFIG = { - "storage": { - "cls": "pipeline", - "steps": [ - {"cls": "filter"}, - { - "cls": "buffer", - "min_batch_size": { - "content": 10, - "content_bytes": 100 * 1024 * 1024, - "directory": 10, - "revision": 10, - "release": 10, - }, - }, - {"cls": "memory"}, - ], - }, - "max_content_size": 100 * 1024 * 1024, - "pack_size_bytes": 4 * 1024 * 1024 * 1024, - "save_data": False, -} diff --git a/swh/loader/git/tests/conftest.py b/swh/loader/git/tests/conftest.py index ca28974..78f3b50 100644 --- a/swh/loader/git/tests/conftest.py +++ b/swh/loader/git/tests/conftest.py @@ -1,15 +1,57 @@ -# Copyright (C) 2018-2019 The Software Heritage developers +# Copyright (C) 2018-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import os +import yaml + import pytest +from typing import Any, Dict + from swh.scheduler.tests.conftest import * # noqa +from swh.storage.tests.conftest import * # noqa + + +@pytest.fixture +def swh_loader_config(swh_storage_backend_config) -> Dict[str, Any]: + swh_storage_backend_config["journal_writer"] = {} + return { + "storage": { + "cls": "pipeline", + "steps": [ + {"cls": "filter"}, + { + "cls": "buffer", + "min_batch_size": { + "content": 10, + "content_bytes": 100 * 1024 * 1024, + "directory": 10, + "revision": 10, + "release": 10, + }, + }, 
+ swh_storage_backend_config, + ], + }, + "max_content_size": 100 * 1024 * 1024, + "pack_size_bytes": 4 * 1024 * 1024 * 1024, + "save_data": False, + } + + +@pytest.fixture +def swh_config(swh_loader_config, monkeypatch, tmp_path): + conffile = os.path.join(str(tmp_path), "loader.yml") + with open(conffile, "w") as f: + f.write(yaml.dump(swh_loader_config)) + monkeypatch.setenv("SWH_CONFIG_FILENAME", conffile) + return conffile @pytest.fixture(scope="session") # type: ignore # expected redefinition def celery_includes(): return [ "swh.loader.git.tasks", ] diff --git a/swh/loader/git/tests/data/git-repos/example-submodule.bundle b/swh/loader/git/tests/data/git-repos/example-submodule.bundle new file mode 100644 index 0000000..2fba12e Binary files /dev/null and b/swh/loader/git/tests/data/git-repos/example-submodule.bundle differ diff --git a/swh/loader/git/tests/data/git-repos/example-submodule.fast-export.xz b/swh/loader/git/tests/data/git-repos/example-submodule.fast-export.xz deleted file mode 100644 index 3c2adc8..0000000 Binary files a/swh/loader/git/tests/data/git-repos/example-submodule.fast-export.xz and /dev/null differ diff --git a/swh/loader/git/tests/resources/testrepo.tgz b/swh/loader/git/tests/data/testrepo.tgz similarity index 100% rename from swh/loader/git/tests/resources/testrepo.tgz rename to swh/loader/git/tests/data/testrepo.tgz diff --git a/swh/loader/git/tests/test_converters.py b/swh/loader/git/tests/test_converters.py index 849de2a..3ed7723 100644 --- a/swh/loader/git/tests/test_converters.py +++ b/swh/loader/git/tests/test_converters.py @@ -1,319 +1,436 @@ # Copyright (C) 2015-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import pytest import shutil import subprocess import tempfile import unittest import dulwich.repo from swh.model.hashutil import bytehex_to_hash, hash_to_bytes from swh.model.model import ( Content, Person, Release, Revision, RevisionType, ObjectType, Timestamp, TimestampWithTimezone, ) import swh.loader.git.converters as converters TEST_DATA = os.path.join(os.path.dirname(__file__), "data") +GPGSIG = ( + b"-----BEGIN PGP SIGNATURE-----\n" + b"\n" + b"iQJLBAABCAA1FiEEAOWDevQbOk/9ITMF6ImSleOlnUcFAl8EnS4XHGRhdmlkLmRv\n" + b"dWFyZEBzZGZhMy5vcmcACgkQ6ImSleOlnUdrqQ/8C5RO4NZ5Qr/dwAy2cPA7ktkY\n" + b"1oUjKtspQoPbC1X3MXVa1aWo9B3KuOMR2URw44RhMNFwjccLOhfss06E8p7CZr2H\n" + b"uR3CzdDw7i52jHLCL2M2ZMaPAEbQuHjXWiUWIUXz9So8YwpTyd2XQneyOC2RDDEI\n" + b"I2NVbmiMeDz33jJYPrQO0QayW+ErW+xgBF7N/qS9jFWsdV1ZNfn9NxkTH8UdGuAX\n" + b"583P+0tVC2DjXc6vORVhyFzyfn1A9wHosbtWI2Mpa+zezPjoPSkcyQAJu2GyOkMC\n" + b"YzSjJdQVqyovo+INkIf6PuUNdp41886BG/06xwT8fl4sVsyO51lNIfgH0DMwfTTB\n" + b"ZgThYnvvO7SrXDm3QzBTXkvAiHiFFl3iNyGkCyxvgVmaTntuFT+cP+HD/pCiGaC+\n" + b"jHzRwfUrmuLd/lLPyq3JXBibyjnfd3SVS+7q1NZHJ4WUmCboZ0+pfrEl65mEQ/Hz\n" + b"J1qCwQ/3SsTB77ANf6lLzGSowjjrtHcBTkTbFxR4ACUhiBbosyDKpHTM7fzGFGjo\n" + b"EIjohzrEnqR3bbyxJkK+nxoOByhIRdowgyeJ02I4neMyLJqcaup8NMWCddxqjaPt\n" + b"YobghnjaDqEd+suL/v83hbZUAZHNO3i1OZYGMqzp1WHikDPoTwGP76baqBoXi56T\n" + b"4WSpxCAJRDODHLk1HgU=\n" + b"=73wF" + b"\n" + b"-----END PGP SIGNATURE-----" +) + +MERGETAG = ( + b"object 9768d0b576dbaaecd80abedad6dfd0d72f1476da\n" + b"type commit\n" + b"tag v0.0.1\n" + b"tagger David Douard <david.douard@sdfa3.org> 1594138133 +0200\n" + b"\n" + b"v0.0.1\n" + b"-----BEGIN PGP SIGNATURE-----\n" + b"\n" +
b"iQJLBAABCAA1FiEEAOWDevQbOk/9ITMF6ImSleOlnUcFAl8EnhkXHGRhdmlkLmRv\n" + b"dWFyZEBzZGZhMy5vcmcACgkQ6ImSleOlnUcdzg//ZW9y2xU5JFQuUsBe/LfKrs+m\n" + b"0ohVInPKXwAfpB3+gn/XtTSLe+Nnr8+QEZyVRCUz2gpGZ2tNqRjhYLIX4x5KKlaV\n" + b"rfl/6Cy7zibsxxuzA1h7HylCs3IPsueQpznVHUwD9jQ5baGJSc2Lt1LufXTueHZJ\n" + b"Oc0oLiP5xCZcPqeX8R/4zUUImJZ1QrPeKmQ/3F+Iq62iWp7nWDp8PtwpykSiYlNf\n" + b"KrJM8omGvrlrWLtfPNUaQFClXwnwK1/HyNY2kYan6K5NtsIl2UX0LZ42GkRjJIrb\n" + b"q4TFIZWZ6xndtEhHEX6B8Q5TZV6sqPgNnfGpbhj8BDoZgjD0Y43fzfDiZ0Bl2tph\n" + b"tXaLg3SX/UUjFVzC1zkoQ2MR7+j8NVKauAsBINpKF4pMGsrsVRk8764pgO49iQ+S\n" + b"8JVCVV76dNNm1gd7BbhFAdIAiegBtsEF69niJBoHKYLlrT8E8hDkF/gk4IkimPqf\n" + b"UHtw/fPhVW3B4G2skd013NJGcnRj5oKtaM99d2Roxc3vhSRiTsoaM8BM9NDvLmJg\n" + b"35rWEOnet39iJIMCHk3AYaJl8QmUhllDdr6vygaBVeVEf27m2c3NzONmIKpWqa2J\n" + b"kTpF4cmzHYro34G7WuJ1bYvmLb6qWNQt9wd8RW+J1kVm5I8dkjPzLUougBpOd0YL\n" + b"Bl5UTQILbV4Tv8ZlmJM=\n" + b"=s1lv\n" + b"-----END PGP SIGNATURE-----" +) class SWHObjectType: """Dulwich lookalike ObjectType class """ def __init__(self, type_name): self.type_name = type_name class SWHTag: """Dulwich lookalike tag class """ def __init__( self, name, type_name, target, target_type, tagger, tag_time, tag_timezone, message, ): self.name = name self.type_name = type_name self.object = SWHObjectType(target_type), target self.tagger = tagger self._message = message self.tag_time = tag_time self.tag_timezone = tag_timezone self._tag_timezone_neg_utc = False def sha(self): from hashlib import sha1 return sha1() @pytest.mark.fs class TestConverters(unittest.TestCase): @classmethod def setUpClass(cls): super().setUpClass() cls.repo_path = tempfile.mkdtemp() - cls.repo = dulwich.repo.Repo.init_bare(cls.repo_path) - fast_export = os.path.join( - TEST_DATA, "git-repos", "example-submodule.fast-export.xz" - ) - - xz = subprocess.Popen( - ["xzcat"], stdin=open(fast_export, "rb"), stdout=subprocess.PIPE, - ) + bundle = os.path.join(TEST_DATA, "git-repos", "example-submodule.bundle") git = subprocess.Popen( - ["git", "fast-import", "--quiet"], stdin=xz.stdout, cwd=cls.repo_path, + ["git", "clone", "--quiet", "--bare", "--mirror", bundle, cls.repo_path], + cwd=TEST_DATA, ) # flush stdout of xz - xz.stdout.close() git.communicate() + cls.repo = dulwich.repo.Repo(cls.repo_path) @classmethod def tearDownClass(cls): super().tearDownClass() shutil.rmtree(cls.repo_path) def test_blob_to_content(self): content_id = b"28c6f4023d65f74e3b59a2dea3c4277ed9ee07b0" content = converters.dulwich_blob_to_content(self.repo[content_id]) expected_content = Content( sha1_git=bytehex_to_hash(content_id), sha1=hash_to_bytes("4850a3420a2262ff061cb296fb915430fa92301c"), sha256=hash_to_bytes( "fee7c8a485a10321ad94b64135073cb5" "5f22cb9f57fa2417d2adfb09d310adef" ), blake2s256=hash_to_bytes( "5d71873f42a137f6d89286e43677721e574" "1fa05ce4cd5e3c7ea7c44d4c2d10b" ), data=( b'[submodule "example-dependency"]\n' b"\tpath = example-dependency\n" b"\turl = https://github.com/githubtraining/" b"example-dependency.git\n" ), length=124, status="visible", ) self.assertEqual(content, expected_content) def test_convertion_wrong_input(self): class Something: type_name = b"something-not-the-right-type" m = { "blob": converters.dulwich_blob_to_content, "blob2": converters.dulwich_blob_to_content_id, "tree": converters.dulwich_tree_to_directory, "commit": converters.dulwich_tree_to_directory, "tag": converters.dulwich_tag_to_release, } for _callable in m.values(): with self.assertRaises(ValueError): _callable(Something()) def test_commit_to_revision(self): sha1 = 
b"9768d0b576dbaaecd80abedad6dfd0d72f1476da" revision = converters.dulwich_commit_to_revision(self.repo[sha1]) - expected_revision = Revision( id=hash_to_bytes("9768d0b576dbaaecd80abedad6dfd0d72f1476da"), directory=b"\xf0i\\./\xa7\xce\x9dW@#\xc3A7a\xa4s\xe5\x00\xca", type=RevisionType.GIT, committer=Person( name=b"Stefano Zacchiroli", fullname=b"Stefano Zacchiroli ", email=b"zack@upsilon.cc", ), author=Person( name=b"Stefano Zacchiroli", fullname=b"Stefano Zacchiroli ", email=b"zack@upsilon.cc", ), committer_date=TimestampWithTimezone( timestamp=Timestamp(seconds=1443083765, microseconds=0,), negative_utc=False, offset=120, ), message=b"add submodule dependency\n", metadata=None, + extra_headers=(), date=TimestampWithTimezone( timestamp=Timestamp(seconds=1443083765, microseconds=0,), negative_utc=False, offset=120, ), parents=(b"\xc3\xc5\x88q23`\x9f[\xbb\xb2\xd9\xe7\xf3\xfbJf\x0f?r",), synthetic=False, ) self.assertEqual(revision, expected_revision) + def test_commit_to_revision_with_extra_headers(self): + sha1 = b"322f5bc915e50fc25e85226b5a182bded0e98e4b" + + revision = converters.dulwich_commit_to_revision(self.repo[sha1]) + expected_revision = Revision( + id=hash_to_bytes(sha1.decode()), + directory=bytes.fromhex("f8ec06e4ed7b9fff4918a0241a48023143f30000"), + type=RevisionType.GIT, + committer=Person( + name=b"David Douard", + fullname=b"David Douard ", + email=b"david.douard@sdfa3.org", + ), + author=Person( + name=b"David Douard", + fullname=b"David Douard ", + email=b"david.douard@sdfa3.org", + ), + committer_date=TimestampWithTimezone( + timestamp=Timestamp(seconds=1594137902, microseconds=0,), + negative_utc=False, + offset=120, + ), + message=b"Am\xe9lioration du fichier READM\xa4\n", + metadata=None, + extra_headers=((b"encoding", b"ISO-8859-15"), (b"gpgsig", GPGSIG)), + date=TimestampWithTimezone( + timestamp=Timestamp(seconds=1594136900, microseconds=0,), + negative_utc=False, + offset=120, + ), + parents=(bytes.fromhex("c730509025c6e81947102b2d77bc4dc1cade9489"),), + synthetic=False, + ) + + assert revision == expected_revision + + def test_commit_to_revision_with_extra_headers_mergetag(self): + sha1 = b"3ab3da4bf0f81407be16969df09cd1c8af9ac703" + + revision = converters.dulwich_commit_to_revision(self.repo[sha1]) + expected_revision = Revision( + id=hash_to_bytes(sha1.decode()), + directory=bytes.fromhex("faa4b64a841ca3e3f07d6501caebda2e3e8e544e"), + type=RevisionType.GIT, + committer=Person( + name=b"David Douard", + fullname=b"David Douard ", + email=b"david.douard@sdfa3.org", + ), + author=Person( + name=b"David Douard", + fullname=b"David Douard ", + email=b"david.douard@sdfa3.org", + ), + committer_date=TimestampWithTimezone( + timestamp=Timestamp(seconds=1594138183, microseconds=0,), + negative_utc=False, + offset=120, + ), + message=b"Merge tag 'v0.0.1' into readme\n\nv0.0.1\n", + metadata=None, + extra_headers=((b"encoding", b"ISO-8859-15"), (b"mergetag", MERGETAG)), + date=TimestampWithTimezone( + timestamp=Timestamp(seconds=1594138183, microseconds=0,), + negative_utc=False, + offset=120, + ), + parents=( + bytes.fromhex("322f5bc915e50fc25e85226b5a182bded0e98e4b"), + bytes.fromhex("9768d0b576dbaaecd80abedad6dfd0d72f1476da"), + ), + synthetic=False, + ) + + assert revision == expected_revision + def test_author_line_to_author(self): # edge case out of the way with self.assertRaises(TypeError): converters.parse_author(None) tests = { b"a ": Person( name=b"a", email=b"b@c.com", fullname=b"a ", ), b"": Person( name=None, email=b"foo@bar.com", fullname=b"", ), b"malformed 
": Person( name=b"trailing", email=b"sp@c.e", fullname=b"trailing ", ), b"no": Person(name=b"no", email=b"sp@c.e", fullname=b"no",), b" <>": Person(name=None, email=None, fullname=b" <>",), b"something": Person(name=b"something", email=None, fullname=b"something"), } for author in sorted(tests): parsed_author = tests[author] self.assertEqual(parsed_author, converters.parse_author(author)) def test_dulwich_tag_to_release_no_author_no_date(self): target = b"641fb6e08ddb2e4fd096dcf18e80b894bf" message = b"some release message" tag = SWHTag( name=b"blah", type_name=b"tag", target=target, target_type=b"commit", message=message, tagger=None, tag_time=None, tag_timezone=None, ) # when actual_release = converters.dulwich_tag_to_release(tag) # then expected_release = Release( author=None, date=None, id=b"\xda9\xa3\xee^kK\r2U\xbf\xef\x95`\x18\x90\xaf\xd8\x07\t", message=message, metadata=None, name=b"blah", synthetic=False, target=hash_to_bytes(target.decode()), target_type=ObjectType.REVISION, ) self.assertEqual(actual_release, expected_release) def test_dulwich_tag_to_release_author_and_date(self): tagger = b"hey dude " target = b"641fb6e08ddb2e4fd096dcf18e80b894bf" message = b"some release message" import datetime date = datetime.datetime(2007, 12, 5, tzinfo=datetime.timezone.utc).timestamp() tag = SWHTag( name=b"blah", type_name=b"tag", target=target, target_type=b"commit", message=message, tagger=tagger, tag_time=date, tag_timezone=0, ) # when actual_release = converters.dulwich_tag_to_release(tag) # then expected_release = Release( author=Person( email=b"hello@mail.org", fullname=b"hey dude ", name=b"hey dude", ), date=TimestampWithTimezone( negative_utc=False, offset=0, timestamp=Timestamp(seconds=1196812800, microseconds=0,), ), id=b"\xda9\xa3\xee^kK\r2U\xbf\xef\x95`\x18\x90\xaf\xd8\x07\t", message=message, metadata=None, name=b"blah", synthetic=False, target=hash_to_bytes(target.decode()), target_type=ObjectType.REVISION, ) self.assertEqual(actual_release, expected_release) def test_dulwich_tag_to_release_author_no_date(self): # to reproduce bug T815 (fixed) tagger = b"hey dude " target = b"641fb6e08ddb2e4fd096dcf18e80b894bf" message = b"some release message" tag = SWHTag( name=b"blah", type_name=b"tag", target=target, target_type=b"commit", message=message, tagger=tagger, tag_time=None, tag_timezone=None, ) # when actual_release = converters.dulwich_tag_to_release(tag) # then expected_release = Release( author=Person( email=b"hello@mail.org", fullname=b"hey dude ", name=b"hey dude", ), date=None, id=b"\xda9\xa3\xee^kK\r2U\xbf\xef\x95`\x18\x90\xaf\xd8\x07\t", message=message, metadata=None, name=b"blah", synthetic=False, target=hash_to_bytes(target.decode()), target_type=ObjectType.REVISION, ) self.assertEqual(actual_release, expected_release) diff --git a/swh/loader/git/tests/test_from_disk.py b/swh/loader/git/tests/test_from_disk.py index f1291a6..cba35f9 100644 --- a/swh/loader/git/tests/test_from_disk.py +++ b/swh/loader/git/tests/test_from_disk.py @@ -1,422 +1,426 @@ # Copyright (C) 2018-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import copy import datetime import os.path import dulwich.repo +import pytest + +from unittest import TestCase from swh.model.model import Snapshot, SnapshotBranch, TargetType from swh.model.hashutil import hash_to_bytes +from swh.loader.git.from_disk import GitLoaderFromDisk, 
GitLoaderFromArchive -from swh.loader.core.tests import BaseLoaderTest -from swh.loader.tests.common import assert_last_visit_matches - -from swh.loader.git.from_disk import GitLoaderFromDisk as OrigGitLoaderFromDisk -from swh.loader.git.from_disk import GitLoaderFromArchive as OrigGitLoaderFromArchive - -from . import TEST_LOADER_CONFIG - - -class GitLoaderFromArchive(OrigGitLoaderFromArchive): - def project_name_from_archive(self, archive_path): - # We don't want the project name to be 'resources'. - return "testrepo" +from swh.loader.tests import ( + assert_last_visit_matches, + check_snapshot, + get_stats, + prepare_repository_from_archive, +) - def parse_config_file(self, *args, **kwargs): - return TEST_LOADER_CONFIG - - -CONTENT1 = { - "33ab5639bfd8e7b95eb1d8d0b87781d4ffea4d5d", # README v1 - "349c4ff7d21f1ec0eda26f3d9284c293e3425417", # README v2 - "799c11e348d39f1704022b8354502e2f81f3c037", # file1.txt - "4bdb40dfd6ec75cb730e678b5d7786e30170c5fb", # file2.txt -} SNAPSHOT_ID = "a23699280a82a043f8c0994cf1631b568f716f95" SNAPSHOT1 = { "id": SNAPSHOT_ID, "branches": { "HEAD": {"target": "refs/heads/master", "target_type": "alias",}, "refs/heads/master": { "target": "2f01f5ca7e391a2f08905990277faf81e709a649", "target_type": "revision", }, "refs/heads/branch1": { "target": "b0a77609903f767a2fd3d769904ef9ef68468b87", "target_type": "revision", }, "refs/heads/branch2": { "target": "bd746cd1913721b269b395a56a97baf6755151c2", "target_type": "revision", }, "refs/tags/branch2-after-delete": { "target": "bd746cd1913721b269b395a56a97baf6755151c2", "target_type": "revision", }, "refs/tags/branch2-before-delete": { "target": "1135e94ccf73b5f9bd6ef07b3fa2c5cc60bba69b", "target_type": "revision", }, }, } # directory hashes obtained with: # gco b6f40292c4e94a8f7e7b4aff50e6c7429ab98e2a # swh-hashtree --ignore '.git' --path . # gco 2f01f5ca7e391a2f08905990277faf81e709a649 # swh-hashtree --ignore '.git' --path . # gco bcdc5ebfde1a3cd6c96e0c2ea4eed19c13208777 # swh-hashtree --ignore '.git' --path . # gco 1135e94ccf73b5f9bd6ef07b3fa2c5cc60bba69b # swh-hashtree --ignore '.git' --path . # gco 79f65ac75f79dda6ff03d66e1242702ab67fb51c # swh-hashtree --ignore '.git' --path . # gco b0a77609903f767a2fd3d769904ef9ef68468b87 # swh-hashtree --ignore '.git' --path . # gco bd746cd1913721b269b395a56a97baf6755151c2 # swh-hashtree --ignore '.git' --path . 
REVISIONS1 = { "b6f40292c4e94a8f7e7b4aff50e6c7429ab98e2a": ( "40dbdf55dfd4065422462cc74a949254aefa972e" ), "2f01f5ca7e391a2f08905990277faf81e709a649": ( "e1d0d894835f91a0f887a4bc8b16f81feefdfbd5" ), "bcdc5ebfde1a3cd6c96e0c2ea4eed19c13208777": ( "b43724545b4759244bb54be053c690649161411c" ), "1135e94ccf73b5f9bd6ef07b3fa2c5cc60bba69b": ( "fbf70528223d263661b5ad4b80f26caf3860eb8e" ), "79f65ac75f79dda6ff03d66e1242702ab67fb51c": ( "5df34ec74d6f69072d9a0a6677d8efbed9b12e60" ), "b0a77609903f767a2fd3d769904ef9ef68468b87": ( "9ca0c7d6ffa3f9f0de59fd7912e08f11308a1338" ), "bd746cd1913721b269b395a56a97baf6755151c2": ( "e1d0d894835f91a0f887a4bc8b16f81feefdfbd5" ), } -class BaseGitLoaderFromDiskTest(BaseLoaderTest): - def setUp(self, archive_name, uncompress_archive, filename="testrepo"): - super().setUp( - archive_name=archive_name, - filename=filename, - prefix_tmp_folder_name="swh.loader.git.", - start_path=os.path.dirname(__file__), - uncompress_archive=uncompress_archive, - ) - - -class GitLoaderFromDiskTest(OrigGitLoaderFromDisk): - def parse_config_file(self, *args, **kwargs): - return TEST_LOADER_CONFIG - - -class BaseDirGitLoaderFromDiskTest(BaseGitLoaderFromDiskTest): - """Mixin base loader test to prepare the git - repository to uncompress, load and test the results. - - This sets up - - """ - - def setUp(self): - super().setUp("testrepo.tgz", uncompress_archive=True) - self.loader = GitLoaderFromDiskTest( - url=self.repo_url, - visit_date=datetime.datetime( - 2016, 5, 3, 15, 16, 32, tzinfo=datetime.timezone.utc - ), - directory=self.destination_path, - ) - self.storage = self.loader.storage - self.repo = dulwich.repo.Repo(self.destination_path) - - def load(self): - return self.loader.load() - - -class BaseGitLoaderFromArchiveTest(BaseGitLoaderFromDiskTest): - """Mixin base loader test to prepare the git - repository to uncompress, load and test the results. 
- - This sets up - - """ - - def setUp(self): - super().setUp("testrepo.tgz", uncompress_archive=False) - self.loader = GitLoaderFromArchive( - url=self.repo_url, - visit_date=datetime.datetime( - 2016, 5, 3, 15, 16, 32, tzinfo=datetime.timezone.utc - ), - archive_path=self.destination_path, - ) - self.storage = self.loader.storage - - def load(self): - return self.loader.load() - - -class GitLoaderFromDiskTests: +class CommonGitLoaderTests: """Common tests for all git loaders.""" def test_load(self): """Loads a simple repository (made available by `setUp()`), and checks everything was added in the storage.""" - res = self.load() - self.assertEqual(res["status"], "eventful", res) - - self.assertContentsContain(CONTENT1) - self.assertCountDirectories(7) - self.assertCountReleases(0) # FIXME: should be 2 after T2059 - self.assertCountRevisions(7) - self.assertCountSnapshots(1) - - self.assertRevisionsContain(REVISIONS1) + res = self.loader.load() - self.assertSnapshotEqual(SNAPSHOT1) - - self.assertEqual(self.loader.load_status(), {"status": "eventful"}) - self.assertEqual(self.loader.visit_status(), "full") + assert res == {"status": "eventful"} assert_last_visit_matches( - self.storage, + self.loader.storage, self.repo_url, status="full", type="git", snapshot=hash_to_bytes(SNAPSHOT1["id"]), ) + stats = get_stats(self.loader.storage) + assert stats == { + "content": 4, + "directory": 7, + "origin": 1, + "origin_visit": 1, + "person": 1, + "release": 0, + "revision": 7, + "skipped_content": 0, + "snapshot": 1, + } + + check_snapshot(SNAPSHOT1, self.loader.storage) + def test_load_unchanged(self): """Checks loading a repository a second time does not add any extra data.""" - res = self.load() - self.assertEqual(res["status"], "eventful") + res = self.loader.load() + assert res == {"status": "eventful"} assert_last_visit_matches( - self.storage, + self.loader.storage, self.repo_url, status="full", type="git", snapshot=hash_to_bytes(SNAPSHOT1["id"]), ) - res = self.load() - self.assertEqual(res["status"], "uneventful") - self.assertCountSnapshots(1) + stats0 = get_stats(self.loader.storage) + assert stats0 == { + "content": 4, + "directory": 7, + "origin": 1, + "origin_visit": 1, + "person": 1, + "release": 0, + "revision": 7, + "skipped_content": 0, + "snapshot": 1, + } + + res = self.loader.load() + assert res == {"status": "uneventful"} + stats1 = get_stats(self.loader.storage) + expected_stats = copy.deepcopy(stats0) + expected_stats["origin_visit"] += 1 + assert stats1 == expected_stats + + check_snapshot(SNAPSHOT1, self.loader.storage) assert_last_visit_matches( - self.storage, + self.loader.storage, self.repo_url, status="full", type="git", snapshot=hash_to_bytes(SNAPSHOT1["id"]), ) -class DirGitLoaderTest(BaseDirGitLoaderFromDiskTest, GitLoaderFromDiskTests): - """Tests for the GitLoaderFromDisk. Includes the common ones, and - add others that only work with a local dir.""" +class FullGitLoaderTests(CommonGitLoaderTests): + """Tests for GitLoader (from disk or not). Includes the common ones, and + add others that only work with a local dir. 
+ + """ def test_load_changed(self): """Loads a repository, makes some changes by adding files, commits, and merges, load it again, and check the storage contains everything it should.""" # Initial load - res = self.load() - self.assertEqual(res["status"], "eventful", res) + res = self.loader.load() + assert res == {"status": "eventful"} + + stats0 = get_stats(self.loader.storage) + assert stats0 == { + "content": 4, + "directory": 7, + "origin": 1, + "origin_visit": 1, + "person": 1, + "release": 0, + "revision": 7, + "skipped_content": 0, + "snapshot": 1, + } # Load with a new file + revision with open(os.path.join(self.destination_path, "hello.py"), "a") as fd: fd.write("print('Hello world')\n") self.repo.stage([b"hello.py"]) new_revision = self.repo.do_commit(b"Hello world\n").decode() new_dir = "85dae072a5aa9923ffa7a7568f819ff21bf49858" assert self.repo[new_revision.encode()].tree == new_dir.encode() revisions = REVISIONS1.copy() assert new_revision not in revisions revisions[new_revision] = new_dir - res = self.load() - self.assertEqual(res["status"], "eventful") + res = self.loader.load() + assert res == {"status": "eventful"} - self.assertCountContents(4 + 1) - self.assertCountDirectories(7 + 1) - self.assertCountReleases(0) # FIXME: should be 2 after T2059 - self.assertCountRevisions(7 + 1) - self.assertCountSnapshots(1 + 1) + stats1 = get_stats(self.loader.storage) + expected_stats = copy.deepcopy(stats0) + # did one new visit + expected_stats["origin_visit"] += 1 + # with one more of the following objects + expected_stats["person"] += 1 + expected_stats["content"] += 1 + expected_stats["directory"] += 1 + expected_stats["revision"] += 1 + # concluding into 1 new snapshot + expected_stats["snapshot"] += 1 - self.assertRevisionsContain(revisions) - - self.assertEqual(self.loader.load_status(), {"status": "eventful"}) - self.assertEqual(self.loader.visit_status(), "full") + assert stats1 == expected_stats visit_status = assert_last_visit_matches( - self.storage, self.repo_url, status="full", type="git" + self.loader.storage, self.repo_url, status="full", type="git" ) - self.assertIsNotNone(visit_status.snapshot) + assert visit_status.snapshot is not None snapshot_id = visit_status.snapshot - snapshot = self.storage.snapshot_get(snapshot_id) + snapshot = self.loader.storage.snapshot_get(snapshot_id) branches = snapshot["branches"] assert branches[b"HEAD"] == { "target": b"refs/heads/master", "target_type": "alias", } assert branches[b"refs/heads/master"] == { "target": hash_to_bytes(new_revision), "target_type": "revision", } # Merge branch1 into HEAD. 
current = self.repo[b"HEAD"] branch1 = self.repo[b"refs/heads/branch1"] merged_tree = dulwich.objects.Tree() for item in self.repo[current.tree].items(): merged_tree.add(*item) for item in self.repo[branch1.tree].items(): merged_tree.add(*item) merged_dir_id = "dab8a37df8db8666d4e277bef9a546f585b5bedd" assert merged_tree.id.decode() == merged_dir_id self.repo.object_store.add_object(merged_tree) merge_commit = self.repo.do_commit( b"merge.\n", tree=merged_tree.id, merge_heads=[branch1.id] ) assert merge_commit.decode() not in revisions revisions[merge_commit.decode()] = merged_tree.id.decode() - res = self.load() - self.assertEqual(res["status"], "eventful") - - self.assertCountContents(4 + 1) - self.assertCountDirectories(7 + 2) - self.assertCountReleases(0) # FIXME: should be 2 after T2059 - self.assertCountRevisions(7 + 2) - self.assertCountSnapshots(1 + 1 + 1) + res = self.loader.load() + assert res == {"status": "eventful"} - self.assertRevisionsContain(revisions) + stats2 = get_stats(self.loader.storage) + expected_stats = copy.deepcopy(stats1) + # one more visit + expected_stats["origin_visit"] += 1 + # with 1 new directory and revision + expected_stats["directory"] += 1 + expected_stats["revision"] += 1 + # concluding into 1 new snapshot + expected_stats["snapshot"] += 1 - self.assertEqual(self.loader.load_status(), {"status": "eventful"}) - self.assertEqual(self.loader.visit_status(), "full") + assert stats2 == expected_stats visit_status = assert_last_visit_matches( - self.storage, self.repo_url, status="full", type="git" + self.loader.storage, self.repo_url, status="full", type="git" ) - self.assertIsNotNone(visit_status.snapshot) + assert visit_status.snapshot is not None merge_snapshot_id = visit_status.snapshot assert merge_snapshot_id != snapshot_id - merge_snapshot = self.storage.snapshot_get(merge_snapshot_id) + merge_snapshot = self.loader.storage.snapshot_get(merge_snapshot_id) merge_branches = merge_snapshot["branches"] assert merge_branches[b"HEAD"] == { "target": b"refs/heads/master", "target_type": "alias", } assert merge_branches[b"refs/heads/master"] == { "target": hash_to_bytes(merge_commit.decode()), "target_type": "revision", } def test_load_filter_branches(self): filtered_branches = {b"refs/pull/42/merge"} unfiltered_branches = {b"refs/pull/42/head"} # Add branches to the repository on disk; some should be filtered by # the loader, some should not. for branch_name in filtered_branches | unfiltered_branches: self.repo[branch_name] = self.repo[b"refs/heads/master"] # Generate the expected snapshot from SNAPSHOT1 (which is the original # state of the git repo)... branches = {} for branch_name, branch_dict in SNAPSHOT1["branches"].items(): target_type_name = branch_dict["target_type"] target_obj = branch_dict["target"] if target_type_name != "alias": target = bytes.fromhex(target_obj) else: target = target_obj.encode() branch = SnapshotBranch( target=target, target_type=TargetType(target_type_name) ) branches[branch_name.encode()] = branch # ... and the unfiltered_branches, which are all pointing to the same # commit as "refs/heads/master". 
for branch_name in unfiltered_branches: branches[branch_name] = branches[b"refs/heads/master"] expected_snapshot = Snapshot(branches=branches) # Load the modified repository - res = self.load() - assert res["status"] == "eventful" - - assert self.loader.load_status() == {"status": "eventful"} - assert self.loader.visit_status() == "full" + res = self.loader.load() + assert res == {"status": "eventful"} assert_last_visit_matches( - self.storage, + self.loader.storage, self.repo_url, status="full", type="git", snapshot=expected_snapshot.id, ) def test_load_dangling_symref(self): with open(os.path.join(self.destination_path, ".git/HEAD"), "wb") as f: f.write(b"ref: refs/heads/dangling-branch\n") - res = self.load() - self.assertEqual(res["status"], "eventful", res) - - self.assertContentsContain(CONTENT1) - self.assertCountDirectories(7) - self.assertCountReleases(0) # FIXME: should be 2 after T2059 - self.assertCountRevisions(7) - self.assertCountSnapshots(1) + res = self.loader.load() + assert res == {"status": "eventful"} visit_status = assert_last_visit_matches( - self.storage, self.repo_url, status="full", type="git" + self.loader.storage, self.repo_url, status="full", type="git" ) snapshot_id = visit_status.snapshot assert snapshot_id is not None - snapshot = self.storage.snapshot_get(snapshot_id) + snapshot = self.loader.storage.snapshot_get(snapshot_id) branches = snapshot["branches"] assert branches[b"HEAD"] == { "target": b"refs/heads/dangling-branch", "target_type": "alias", } assert branches[b"refs/heads/dangling-branch"] is None + stats = get_stats(self.loader.storage) + assert stats == { + "content": 4, + "directory": 7, + "origin": 1, + "origin_visit": 1, + "person": 1, + "release": 0, + "revision": 7, + "skipped_content": 0, + "snapshot": 1, + } + -class GitLoaderFromArchiveTest(BaseGitLoaderFromArchiveTest, GitLoaderFromDiskTests): - """Tests for GitLoaderFromArchive. Imports the common ones - from GitLoaderTests.""" +class GitLoaderFromDiskTest(TestCase, FullGitLoaderTests): + """Prepare a git directory repository to be loaded through a GitLoaderFromDisk. + This tests all git loader scenario. - pass + """ + + @pytest.fixture(autouse=True) + def init(self, swh_config, datadir, tmp_path): + archive_name = "testrepo" + archive_path = os.path.join(datadir, f"{archive_name}.tgz") + tmp_path = str(tmp_path) + self.repo_url = prepare_repository_from_archive( + archive_path, archive_name, tmp_path=tmp_path + ) + self.destination_path = os.path.join(tmp_path, archive_name) + self.loader = GitLoaderFromDisk( + url=self.repo_url, + visit_date=datetime.datetime( + 2016, 5, 3, 15, 16, 32, tzinfo=datetime.timezone.utc + ), + directory=self.destination_path, + ) + self.repo = dulwich.repo.Repo(self.destination_path) + + +class GitLoaderFromArchiveTest(TestCase, CommonGitLoaderTests): + """Tests for GitLoaderFromArchive. 
Only tests common scenario.""" + + @pytest.fixture(autouse=True) + def init(self, swh_config, datadir, tmp_path): + archive_name = "testrepo" + archive_path = os.path.join(datadir, f"{archive_name}.tgz") + self.repo_url = archive_path + self.loader = GitLoaderFromArchive( + url=self.repo_url, + archive_path=archive_path, + visit_date=datetime.datetime( + 2016, 5, 3, 15, 16, 32, tzinfo=datetime.timezone.utc + ), + ) diff --git a/swh/loader/git/tests/test_loader.py b/swh/loader/git/tests/test_loader.py index 2baafeb..06c379a 100644 --- a/swh/loader/git/tests/test_loader.py +++ b/swh/loader/git/tests/test_loader.py @@ -1,26 +1,36 @@ # Copyright (C) 2018-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from swh.loader.git.loader import GitLoader -from swh.loader.git.tests.test_from_disk import DirGitLoaderTest +import os -from . import TEST_LOADER_CONFIG +import pytest +import dulwich.repo +from unittest import TestCase -class GitLoaderTest(GitLoader): - def parse_config_file(self, *args, **kwargs): - return {**super().parse_config_file(*args, **kwargs), **TEST_LOADER_CONFIG} +from swh.loader.git.loader import GitLoader +from swh.loader.git.tests.test_from_disk import FullGitLoaderTests +from swh.loader.tests import prepare_repository_from_archive -class TestGitLoader(DirGitLoaderTest): - """Same tests as for the GitLoaderFromDisk, but running on GitLoader.""" - def setUp(self): - super().setUp() - self.loader = GitLoaderTest(self.repo_url) - self.storage = self.loader.storage +class GitLoaderTest(TestCase, FullGitLoaderTests): + """Prepare a git directory repository to be loaded through a GitLoader. + This tests all git loader scenario. - def load(self): - return self.loader.load() + """ + + @pytest.fixture(autouse=True) + def init(self, swh_config, datadir, tmp_path): + super().setUp() + archive_name = "testrepo" + archive_path = os.path.join(datadir, f"{archive_name}.tgz") + tmp_path = str(tmp_path) + self.repo_url = prepare_repository_from_archive( + archive_path, archive_name, tmp_path=tmp_path + ) + self.destination_path = os.path.join(tmp_path, archive_name) + self.loader = GitLoader(self.repo_url) + self.repo = dulwich.repo.Repo(self.destination_path) diff --git a/version.txt b/version.txt index 30ba5a3..b6853eb 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -v0.2.0-0-g55ff84b \ No newline at end of file +v0.3.0-0-g0394f0f \ No newline at end of file
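Note on the converter change carried by this release: `dulwich_commit_to_revision` no longer wraps git headers in `metadata={"extra_headers": ...}`; it now sets `metadata=None` and exposes them as `Revision.extra_headers`, a tuple of `(bytes, bytes)` pairs. A minimal sketch of what a caller sees, assuming a local clone at the hypothetical path `/tmp/example-repo` and the dependencies listed in requirements-swh.txt:

```
# Sketch only: "/tmp/example-repo" is a placeholder for any local git clone.
import dulwich.repo

from swh.loader.git.converters import dulwich_commit_to_revision

repo = dulwich.repo.Repo("/tmp/example-repo")
commit = repo[repo.head()]  # dulwich Commit object behind HEAD
revision = dulwich_commit_to_revision(commit)

# metadata is always None after this change; encoding, mergetag and gpgsig
# headers, when present, travel as (bytes, bytes) pairs instead.
assert revision.metadata is None
for key, value in revision.extra_headers:
    print(key.decode(), len(value))
```

This is the shape the new `test_commit_to_revision_with_extra_headers*` tests assert against the `GPGSIG` and `MERGETAG` constants above.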