diff --git a/PKG-INFO b/PKG-INFO index 0ac5fc5..343d708 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,101 +1,101 @@ Metadata-Version: 2.1 Name: swh.loader.git -Version: 0.3.2 +Version: 0.3.3 Summary: Software Heritage git loader Home-page: https://forge.softwareheritage.org/diffusion/DLDG/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-loader-git Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-loader-git/ Description: swh-loader-git ============== The Software Heritage Git Loader is a tool and a library to walk a local Git repository and inject into the SWH dataset all contained files that weren't known before. License ------- This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. See top-level LICENSE file for the full text of the GNU General Public License along with this program. Dependencies ------------ ### Runtime - python3 - python3-dulwich - python3-retrying - python3-swh.core - python3-swh.model - python3-swh.storage - python3-swh.scheduler ### Test - python3-nose Requirements ------------ - implementation language, Python3 - coding guidelines: conform to PEP8 - Git access: via dulwich Configuration ------------- You can run the loader from a remote origin (*loader*) or from an origin on disk (*from_disk*) directly by calling: ``` python3 -m swh.loader.git.{loader,from_disk} ``` ### Location Both tools expect a configuration file. Either one of the following location: - /etc/softwareheritage/ - ~/.config/swh/ - ~/.swh/ Note: Will call that location $SWH_CONFIG_PATH ### Configuration sample Respectively the loader from a remote (`git.yml`) and the loader from a disk (`git-disk.yml`), $SWH_CONFIG_PATH/loader/git{-disk}.yml: ``` storage: cls: remote args: url: http://localhost:5002/ ``` Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing diff --git a/requirements-swh.txt b/requirements-swh.txt index 000f2a0..ff2fd93 100644 --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,5 +1,5 @@ swh.core >= 0.0.7 -swh.loader.core >= 0.5.5 +swh.loader.core >= 0.5.9 swh.model >= 0.4.0 swh.scheduler >= 0.0.39 swh.storage >= 0.10.0 diff --git a/swh.loader.git.egg-info/PKG-INFO b/swh.loader.git.egg-info/PKG-INFO index 0ac5fc5..343d708 100644 --- a/swh.loader.git.egg-info/PKG-INFO +++ b/swh.loader.git.egg-info/PKG-INFO @@ -1,101 +1,101 @@ Metadata-Version: 2.1 Name: swh.loader.git -Version: 0.3.2 +Version: 0.3.3 Summary: Software Heritage git loader Home-page: https://forge.softwareheritage.org/diffusion/DLDG/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-loader-git Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-loader-git/ Description: swh-loader-git ============== The Software Heritage Git Loader is a tool and a library to walk a local Git repository and inject into the SWH dataset all contained files that weren't known before. License ------- This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. See top-level LICENSE file for the full text of the GNU General Public License along with this program. Dependencies ------------ ### Runtime - python3 - python3-dulwich - python3-retrying - python3-swh.core - python3-swh.model - python3-swh.storage - python3-swh.scheduler ### Test - python3-nose Requirements ------------ - implementation language, Python3 - coding guidelines: conform to PEP8 - Git access: via dulwich Configuration ------------- You can run the loader from a remote origin (*loader*) or from an origin on disk (*from_disk*) directly by calling: ``` python3 -m swh.loader.git.{loader,from_disk} ``` ### Location Both tools expect a configuration file. Either one of the following location: - /etc/softwareheritage/ - ~/.config/swh/ - ~/.swh/ Note: Will call that location $SWH_CONFIG_PATH ### Configuration sample Respectively the loader from a remote (`git.yml`) and the loader from a disk (`git-disk.yml`), $SWH_CONFIG_PATH/loader/git{-disk}.yml: ``` storage: cls: remote args: url: http://localhost:5002/ ``` Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing diff --git a/swh.loader.git.egg-info/requires.txt b/swh.loader.git.egg-info/requires.txt index 4270307..105cfc1 100644 --- a/swh.loader.git.egg-info/requires.txt +++ b/swh.loader.git.egg-info/requires.txt @@ -1,15 +1,15 @@ dulwich>=0.18.7 retrying vcversioner click swh.core>=0.0.7 -swh.loader.core>=0.5.5 +swh.loader.core>=0.5.9 swh.model>=0.4.0 swh.scheduler>=0.0.39 swh.storage>=0.10.0 [testing] pytest pytest-mock swh.scheduler[testing]>=0.5.0 swh.storage[testing] diff --git a/swh/loader/git/tests/conftest.py b/swh/loader/git/tests/conftest.py index a290fb0..c6528aa 100644 --- a/swh/loader/git/tests/conftest.py +++ b/swh/loader/git/tests/conftest.py @@ -1,47 +1,35 @@ # Copyright (C) 2018-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import os -import yaml - import pytest from typing import Any, Dict @pytest.fixture def swh_loader_config(swh_storage_backend_config) -> Dict[str, Any]: swh_storage_backend_config["journal_writer"] = {} return { "storage": { "cls": "pipeline", "steps": [ {"cls": "filter"}, { "cls": "buffer", "min_batch_size": { "content": 10, "content_bytes": 100 * 1024 * 1024, "directory": 10, "revision": 10, "release": 10, }, }, swh_storage_backend_config, ], }, "max_content_size": 100 * 1024 * 1024, "pack_size_bytes": 4 * 1024 * 1024 * 1024, "save_data": False, } - - -@pytest.fixture -def swh_config(swh_loader_config, monkeypatch, tmp_path): - conffile = os.path.join(str(tmp_path), "loader.yml") - with open(conffile, "w") as f: - f.write(yaml.dump(swh_loader_config)) - monkeypatch.setenv("SWH_CONFIG_FILENAME", conffile) - return conffile diff --git a/swh/loader/git/tests/test_from_disk.py b/swh/loader/git/tests/test_from_disk.py index e5d7659..66eff14 100644 --- a/swh/loader/git/tests/test_from_disk.py +++ b/swh/loader/git/tests/test_from_disk.py @@ -1,410 +1,413 @@ # Copyright (C) 2018-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import copy import datetime import os.path import dulwich.repo import pytest from unittest import TestCase -from swh.model.model import Snapshot +from swh.model.model import Snapshot, SnapshotBranch, TargetType from swh.model.hashutil import hash_to_bytes from swh.loader.git.from_disk import GitLoaderFromDisk, GitLoaderFromArchive from swh.loader.tests import ( assert_last_visit_matches, check_snapshot, get_stats, prepare_repository_from_archive, ) -SNAPSHOT1 = { - "id": hash_to_bytes("a23699280a82a043f8c0994cf1631b568f716f95"), - "branches": { - b"HEAD": {"target": b"refs/heads/master", "target_type": "alias",}, - b"refs/heads/master": { - "target": hash_to_bytes("2f01f5ca7e391a2f08905990277faf81e709a649"), - "target_type": "revision", - }, - b"refs/heads/branch1": { - "target": hash_to_bytes("b0a77609903f767a2fd3d769904ef9ef68468b87"), - "target_type": "revision", - }, - b"refs/heads/branch2": { - "target": hash_to_bytes("bd746cd1913721b269b395a56a97baf6755151c2"), - "target_type": "revision", - }, - b"refs/tags/branch2-after-delete": { - "target": hash_to_bytes("bd746cd1913721b269b395a56a97baf6755151c2"), - "target_type": "revision", - }, - b"refs/tags/branch2-before-delete": { - "target": hash_to_bytes("1135e94ccf73b5f9bd6ef07b3fa2c5cc60bba69b"), - "target_type": "revision", - }, +SNAPSHOT1 = Snapshot( + id=hash_to_bytes("a23699280a82a043f8c0994cf1631b568f716f95"), + branches={ + b"HEAD": SnapshotBranch( + target=b"refs/heads/master", target_type=TargetType.ALIAS, + ), + b"refs/heads/master": SnapshotBranch( + target=hash_to_bytes("2f01f5ca7e391a2f08905990277faf81e709a649"), + target_type=TargetType.REVISION, + ), + b"refs/heads/branch1": SnapshotBranch( + target=hash_to_bytes("b0a77609903f767a2fd3d769904ef9ef68468b87"), + target_type=TargetType.REVISION, + ), + b"refs/heads/branch2": SnapshotBranch( + target=hash_to_bytes("bd746cd1913721b269b395a56a97baf6755151c2"), + target_type=TargetType.REVISION, + ), + b"refs/tags/branch2-after-delete": SnapshotBranch( + target=hash_to_bytes("bd746cd1913721b269b395a56a97baf6755151c2"), + target_type=TargetType.REVISION, + ), + b"refs/tags/branch2-before-delete": SnapshotBranch( + target=hash_to_bytes("1135e94ccf73b5f9bd6ef07b3fa2c5cc60bba69b"), + target_type=TargetType.REVISION, + ), }, -} +) # directory hashes obtained with: # gco b6f40292c4e94a8f7e7b4aff50e6c7429ab98e2a # swh-hashtree --ignore '.git' --path . # gco 2f01f5ca7e391a2f08905990277faf81e709a649 # swh-hashtree --ignore '.git' --path . # gco bcdc5ebfde1a3cd6c96e0c2ea4eed19c13208777 # swh-hashtree --ignore '.git' --path . # gco 1135e94ccf73b5f9bd6ef07b3fa2c5cc60bba69b # swh-hashtree --ignore '.git' --path . # gco 79f65ac75f79dda6ff03d66e1242702ab67fb51c # swh-hashtree --ignore '.git' --path . # gco b0a77609903f767a2fd3d769904ef9ef68468b87 # swh-hashtree --ignore '.git' --path . # gco bd746cd1913721b269b395a56a97baf6755151c2 # swh-hashtree --ignore '.git' --path . REVISIONS1 = { "b6f40292c4e94a8f7e7b4aff50e6c7429ab98e2a": ( "40dbdf55dfd4065422462cc74a949254aefa972e" ), "2f01f5ca7e391a2f08905990277faf81e709a649": ( "e1d0d894835f91a0f887a4bc8b16f81feefdfbd5" ), "bcdc5ebfde1a3cd6c96e0c2ea4eed19c13208777": ( "b43724545b4759244bb54be053c690649161411c" ), "1135e94ccf73b5f9bd6ef07b3fa2c5cc60bba69b": ( "fbf70528223d263661b5ad4b80f26caf3860eb8e" ), "79f65ac75f79dda6ff03d66e1242702ab67fb51c": ( "5df34ec74d6f69072d9a0a6677d8efbed9b12e60" ), "b0a77609903f767a2fd3d769904ef9ef68468b87": ( "9ca0c7d6ffa3f9f0de59fd7912e08f11308a1338" ), "bd746cd1913721b269b395a56a97baf6755151c2": ( "e1d0d894835f91a0f887a4bc8b16f81feefdfbd5" ), } class CommonGitLoaderTests: """Common tests for all git loaders.""" def test_load(self): """Loads a simple repository (made available by `setUp()`), and checks everything was added in the storage.""" res = self.loader.load() assert res == {"status": "eventful"} assert_last_visit_matches( self.loader.storage, self.repo_url, status="full", type="git", - snapshot=hash_to_bytes(SNAPSHOT1["id"]), + snapshot=SNAPSHOT1.id, ) stats = get_stats(self.loader.storage) assert stats == { "content": 4, "directory": 7, "origin": 1, "origin_visit": 1, "person": 1, "release": 0, "revision": 7, "skipped_content": 0, "snapshot": 1, } check_snapshot(SNAPSHOT1, self.loader.storage) def test_load_unchanged(self): """Checks loading a repository a second time does not add any extra data.""" res = self.loader.load() assert res == {"status": "eventful"} assert_last_visit_matches( self.loader.storage, self.repo_url, status="full", type="git", - snapshot=hash_to_bytes(SNAPSHOT1["id"]), + snapshot=SNAPSHOT1.id, ) stats0 = get_stats(self.loader.storage) assert stats0 == { "content": 4, "directory": 7, "origin": 1, "origin_visit": 1, "person": 1, "release": 0, "revision": 7, "skipped_content": 0, "snapshot": 1, } res = self.loader.load() assert res == {"status": "uneventful"} stats1 = get_stats(self.loader.storage) expected_stats = copy.deepcopy(stats0) expected_stats["origin_visit"] += 1 assert stats1 == expected_stats check_snapshot(SNAPSHOT1, self.loader.storage) assert_last_visit_matches( self.loader.storage, self.repo_url, status="full", type="git", - snapshot=hash_to_bytes(SNAPSHOT1["id"]), + snapshot=SNAPSHOT1.id, ) class FullGitLoaderTests(CommonGitLoaderTests): """Tests for GitLoader (from disk or not). Includes the common ones, and add others that only work with a local dir. """ def test_load_changed(self): """Loads a repository, makes some changes by adding files, commits, and merges, load it again, and check the storage contains everything it should.""" # Initial load res = self.loader.load() assert res == {"status": "eventful"} stats0 = get_stats(self.loader.storage) assert stats0 == { "content": 4, "directory": 7, "origin": 1, "origin_visit": 1, "person": 1, "release": 0, "revision": 7, "skipped_content": 0, "snapshot": 1, } # Load with a new file + revision with open(os.path.join(self.destination_path, "hello.py"), "a") as fd: fd.write("print('Hello world')\n") self.repo.stage([b"hello.py"]) new_revision = self.repo.do_commit(b"Hello world\n").decode() new_dir = "85dae072a5aa9923ffa7a7568f819ff21bf49858" assert self.repo[new_revision.encode()].tree == new_dir.encode() revisions = REVISIONS1.copy() assert new_revision not in revisions revisions[new_revision] = new_dir res = self.loader.load() assert res == {"status": "eventful"} stats1 = get_stats(self.loader.storage) expected_stats = copy.deepcopy(stats0) # did one new visit expected_stats["origin_visit"] += 1 # with one more of the following objects expected_stats["person"] += 1 expected_stats["content"] += 1 expected_stats["directory"] += 1 expected_stats["revision"] += 1 # concluding into 1 new snapshot expected_stats["snapshot"] += 1 assert stats1 == expected_stats visit_status = assert_last_visit_matches( self.loader.storage, self.repo_url, status="full", type="git" ) assert visit_status.snapshot is not None snapshot_id = visit_status.snapshot snapshot = self.loader.storage.snapshot_get(snapshot_id) branches = snapshot["branches"] assert branches[b"HEAD"] == { "target": b"refs/heads/master", "target_type": "alias", } assert branches[b"refs/heads/master"] == { "target": hash_to_bytes(new_revision), "target_type": "revision", } # Merge branch1 into HEAD. current = self.repo[b"HEAD"] branch1 = self.repo[b"refs/heads/branch1"] merged_tree = dulwich.objects.Tree() for item in self.repo[current.tree].items(): merged_tree.add(*item) for item in self.repo[branch1.tree].items(): merged_tree.add(*item) merged_dir_id = "dab8a37df8db8666d4e277bef9a546f585b5bedd" assert merged_tree.id.decode() == merged_dir_id self.repo.object_store.add_object(merged_tree) merge_commit = self.repo.do_commit( b"merge.\n", tree=merged_tree.id, merge_heads=[branch1.id] ) assert merge_commit.decode() not in revisions revisions[merge_commit.decode()] = merged_tree.id.decode() res = self.loader.load() assert res == {"status": "eventful"} stats2 = get_stats(self.loader.storage) expected_stats = copy.deepcopy(stats1) # one more visit expected_stats["origin_visit"] += 1 # with 1 new directory and revision expected_stats["directory"] += 1 expected_stats["revision"] += 1 # concluding into 1 new snapshot expected_stats["snapshot"] += 1 assert stats2 == expected_stats visit_status = assert_last_visit_matches( self.loader.storage, self.repo_url, status="full", type="git" ) assert visit_status.snapshot is not None merge_snapshot_id = visit_status.snapshot assert merge_snapshot_id != snapshot_id merge_snapshot = self.loader.storage.snapshot_get(merge_snapshot_id) merge_branches = merge_snapshot["branches"] assert merge_branches[b"HEAD"] == { "target": b"refs/heads/master", "target_type": "alias", } assert merge_branches[b"refs/heads/master"] == { "target": hash_to_bytes(merge_commit.decode()), "target_type": "revision", } def test_load_filter_branches(self): filtered_branches = {b"refs/pull/42/merge"} unfiltered_branches = {b"refs/pull/42/head"} # Add branches to the repository on disk; some should be filtered by # the loader, some should not. for branch_name in filtered_branches | unfiltered_branches: self.repo[branch_name] = self.repo[b"refs/heads/master"] # Generate the expected snapshot from SNAPSHOT1 (which is the original # state of the git repo)... - branches = SNAPSHOT1["branches"].copy() + branches = dict(SNAPSHOT1.branches) # ... and the unfiltered_branches, which are all pointing to the same # commit as "refs/heads/master". for branch_name in unfiltered_branches: branches[branch_name] = branches[b"refs/heads/master"] expected_snapshot = Snapshot(branches=branches) # Load the modified repository res = self.loader.load() assert res == {"status": "eventful"} + check_snapshot(expected_snapshot, self.loader.storage) assert_last_visit_matches( self.loader.storage, self.repo_url, status="full", type="git", snapshot=expected_snapshot.id, ) def test_load_dangling_symref(self): with open(os.path.join(self.destination_path, ".git/HEAD"), "wb") as f: f.write(b"ref: refs/heads/dangling-branch\n") res = self.loader.load() assert res == {"status": "eventful"} visit_status = assert_last_visit_matches( self.loader.storage, self.repo_url, status="full", type="git" ) snapshot_id = visit_status.snapshot assert snapshot_id is not None snapshot = self.loader.storage.snapshot_get(snapshot_id) branches = snapshot["branches"] assert branches[b"HEAD"] == { "target": b"refs/heads/dangling-branch", "target_type": "alias", } assert branches[b"refs/heads/dangling-branch"] is None stats = get_stats(self.loader.storage) assert stats == { "content": 4, "directory": 7, "origin": 1, "origin_visit": 1, "person": 1, "release": 0, "revision": 7, "skipped_content": 0, "snapshot": 1, } class GitLoaderFromDiskTest(TestCase, FullGitLoaderTests): """Prepare a git directory repository to be loaded through a GitLoaderFromDisk. This tests all git loader scenario. """ @pytest.fixture(autouse=True) def init(self, swh_config, datadir, tmp_path): archive_name = "testrepo" archive_path = os.path.join(datadir, f"{archive_name}.tgz") tmp_path = str(tmp_path) self.repo_url = prepare_repository_from_archive( archive_path, archive_name, tmp_path=tmp_path ) self.destination_path = os.path.join(tmp_path, archive_name) self.loader = GitLoaderFromDisk( url=self.repo_url, visit_date=datetime.datetime( 2016, 5, 3, 15, 16, 32, tzinfo=datetime.timezone.utc ), directory=self.destination_path, ) self.repo = dulwich.repo.Repo(self.destination_path) class GitLoaderFromArchiveTest(TestCase, CommonGitLoaderTests): """Tests for GitLoaderFromArchive. Only tests common scenario.""" @pytest.fixture(autouse=True) def init(self, swh_config, datadir, tmp_path): archive_name = "testrepo" archive_path = os.path.join(datadir, f"{archive_name}.tgz") self.repo_url = archive_path self.loader = GitLoaderFromArchive( url=self.repo_url, archive_path=archive_path, visit_date=datetime.datetime( 2016, 5, 3, 15, 16, 32, tzinfo=datetime.timezone.utc ), ) diff --git a/version.txt b/version.txt index e3b7ee9..48c2f63 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -v0.3.2-0-ga6dd635 \ No newline at end of file +v0.3.3-0-gf96ec77 \ No newline at end of file