diff --git a/PKG-INFO b/PKG-INFO
index d03d9a6..b6d644a 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,106 +1,106 @@
Metadata-Version: 2.1
Name: swh.loader.git
-Version: 0.10.1
+Version: 1.0.0
Summary: Software Heritage git loader
Home-page: https://forge.softwareheritage.org/diffusion/DLDG/
Author: Software Heritage developers
Author-email: swh-devel@inria.fr
License: UNKNOWN
Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest
Project-URL: Funding, https://www.softwareheritage.org/donate
Project-URL: Source, https://forge.softwareheritage.org/source/swh-loader-git
Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-loader-git/
Platform: UNKNOWN
Classifier: Programming Language :: Python :: 3
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
Classifier: Operating System :: OS Independent
Classifier: Development Status :: 5 - Production/Stable
Requires-Python: >=3.7
Description-Content-Type: text/markdown
Provides-Extra: testing
License-File: LICENSE
License-File: AUTHORS

swh-loader-git
==============

The Software Heritage Git Loader is a tool and a library to walk a local
Git repository and inject into the SWH dataset all contained files that
weren't known before.

The main entry points are:

- :class:`swh.loader.git.loader.GitLoader`, the main loader, which ingests a
  remote git repository's contents.

- :class:`swh.loader.git.from_disk.GitLoaderFromDisk`, which ingests a local
  git clone.

- :class:`swh.loader.git.from_disk.GitLoaderFromArchive`, which ingests a git
  repository wrapped in an archive.

License
-------

This program is free software: you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation, either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

See top-level LICENSE file for the full text of the GNU General Public License
along with this program.

Dependencies
------------

### Runtime

- python3
- python3-dulwich
- python3-retrying
- python3-swh.core
- python3-swh.model
- python3-swh.storage
- python3-swh.scheduler

### Test

- python3-nose

Requirements
------------

- implementation language, Python3
- coding guidelines: conform to PEP8
- Git access: via dulwich

CLI Run
----------

You can run the loader from a remote origin (*loader*) or from an origin on
disk (*from_disk*) directly by calling:

```
swh loader -C <config-file> run git <git-repository-url>
```

or "git_disk".
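A minimal programmatic equivalent of the CLI invocation above, for readers who
drive the loader from Python. This is a sketch only: it assumes a swh-storage
RPC server on localhost:5002 (matching the configuration sample below), and
the repository URL is a placeholder.

```
# Sketch: run the remote-origin loader directly, without the `swh` CLI.
# Assumes swh-storage is reachable on localhost:5002; the URL is hypothetical.
from swh.loader.git.loader import GitLoader
from swh.storage import get_storage

storage = get_storage(cls="remote", url="http://localhost:5002/")
loader = GitLoader(storage, "https://example.org/user/repo.git")
result = loader.load()  # {"status": "eventful"} when new objects were ingested
```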
## Configuration sample

/tmp/git.yml:
```
storage:
  cls: remote
  args:
    url: http://localhost:5002/
```
diff --git a/pytest.ini b/pytest.ini
index 5172db2..0b40b0a 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -1,5 +1,7 @@
[pytest]
# Drop this when these fixtures aren't imported automatically
addopts = -p no:pytest_swh_scheduler -p no:pytest_swh_storage
+markers =
+    fs: depends on writing to the filesystem
norecursedirs = docs .*
diff --git a/requirements-swh.txt b/requirements-swh.txt
index 523fb7f..8621493 100644
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -1,5 +1,5 @@
swh.core >= 0.0.7
swh.loader.core >= 0.18.0
-swh.model >= 0.4.0
+swh.model >= 2.9.0
swh.scheduler >= 0.0.39
swh.storage >= 0.22.0
diff --git a/swh.loader.git.egg-info/PKG-INFO b/swh.loader.git.egg-info/PKG-INFO
index d03d9a6..b6d644a 100644
--- a/swh.loader.git.egg-info/PKG-INFO
+++ b/swh.loader.git.egg-info/PKG-INFO
@@ -1,106 +1,106 @@
Metadata-Version: 2.1
Name: swh.loader.git
-Version: 0.10.1
+Version: 1.0.0
Summary: Software Heritage git loader
Home-page: https://forge.softwareheritage.org/diffusion/DLDG/
Author: Software Heritage developers
Author-email: swh-devel@inria.fr
License: UNKNOWN
Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest
Project-URL: Funding, https://www.softwareheritage.org/donate
Project-URL: Source, https://forge.softwareheritage.org/source/swh-loader-git
Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-loader-git/
Platform: UNKNOWN
Classifier: Programming Language :: Python :: 3
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
Classifier: Operating System :: OS Independent
Classifier: Development Status :: 5 - Production/Stable
Requires-Python: >=3.7
Description-Content-Type: text/markdown
Provides-Extra: testing
License-File: LICENSE
License-File: AUTHORS

swh-loader-git
==============

The Software Heritage Git Loader is a tool and a library to walk a local
Git repository and inject into the SWH dataset all contained files that
weren't known before.

The main entry points are:

- :class:`swh.loader.git.loader.GitLoader`, the main loader, which ingests a
  remote git repository's contents.

- :class:`swh.loader.git.from_disk.GitLoaderFromDisk`, which ingests a local
  git clone.

- :class:`swh.loader.git.from_disk.GitLoaderFromArchive`, which ingests a git
  repository wrapped in an archive.

License
-------

This program is free software: you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation, either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

See top-level LICENSE file for the full text of the GNU General Public License
along with this program.

Dependencies
------------

### Runtime

- python3
- python3-dulwich
- python3-retrying
- python3-swh.core
- python3-swh.model
- python3-swh.storage
- python3-swh.scheduler

### Test

- python3-nose

Requirements
------------

- implementation language, Python3
- coding guidelines: conform to PEP8
- Git access: via dulwich

CLI Run
----------

You can run the loader from a remote origin (*loader*) or from an origin on
disk (*from_disk*) directly by calling:

```
swh loader -C <config-file> run git <git-repository-url>
```

or "git_disk".
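The on-disk entry point follows the same pattern; another hedged sketch,
matching the `GitLoaderFromDisk` constructor shown later in
`swh/loader/git/from_disk.py` (all values are placeholders):

```
# Sketch: `url` names the origin recorded in the archive, while `directory`
# points at an existing local clone to walk. Placeholder values throughout.
from swh.loader.git.from_disk import GitLoaderFromDisk
from swh.storage import get_storage

storage = get_storage(cls="remote", url="http://localhost:5002/")
loader = GitLoaderFromDisk(
    storage,
    url="https://example.org/user/repo.git",
    directory="/srv/clones/repo",
)
result = loader.load()
```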
## Configuration sample /tmp/git.yml: ``` storage: cls: remote args: url: http://localhost:5002/ ``` diff --git a/swh.loader.git.egg-info/requires.txt b/swh.loader.git.egg-info/requires.txt index aae685c..b5c769b 100644 --- a/swh.loader.git.egg-info/requires.txt +++ b/swh.loader.git.egg-info/requires.txt @@ -1,16 +1,16 @@ dulwich>=0.18.7 retrying click swh.core>=0.0.7 swh.loader.core>=0.18.0 -swh.model>=0.4.0 +swh.model>=2.9.0 swh.scheduler>=0.0.39 swh.storage>=0.22.0 [testing] pytest pytest-mock swh.scheduler[testing]>=0.5.0 swh.storage[testing] types-click types-python-dateutil diff --git a/swh/loader/git/converters.py b/swh/loader/git/converters.py index 5d33787..1ea3b1a 100644 --- a/swh/loader/git/converters.py +++ b/swh/loader/git/converters.py @@ -1,186 +1,216 @@ # Copyright (C) 2015-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """Convert dulwich objects to dictionaries suitable for swh.storage""" -from typing import Any, Dict, Optional +from typing import Any, Dict, Optional, cast + +from dulwich.objects import Blob, Commit, ShaFile, Tag, Tree from swh.model.hashutil import DEFAULT_ALGORITHMS, MultiHash, hash_to_bytes from swh.model.model import ( BaseContent, Content, Directory, DirectoryEntry, + HashableObject, ObjectType, Person, Release, Revision, RevisionType, SkippedContent, TargetType, Timestamp, TimestampWithTimezone, ) -HASH_ALGORITHMS = DEFAULT_ALGORITHMS - {"sha1_git"} +class HashMismatch(Exception): + pass + + +def check_id(obj: HashableObject) -> None: + real_id = obj.compute_hash() + if obj.id != real_id: + raise HashMismatch( + f"Expected {type(obj).__name__} hash to be {obj.id.hex()}, " + f"got {real_id.hex()}" + ) -def dulwich_blob_to_content_id(blob) -> Dict[str, Any]: + +def dulwich_blob_to_content_id(obj: ShaFile) -> Dict[str, Any]: """Convert a dulwich blob to a Software Heritage content id""" - if blob.type_name != b"blob": + if obj.type_name != b"blob": raise ValueError("Argument is not a blob.") + blob = cast(Blob, obj) size = blob.raw_length() data = blob.as_raw_string() - hashes = MultiHash.from_data(data, HASH_ALGORITHMS).digest() - hashes["sha1_git"] = blob.sha().digest() + hashes = MultiHash.from_data(data, DEFAULT_ALGORITHMS).digest() + if hashes["sha1_git"] != blob.sha().digest(): + raise HashMismatch( + f"Expected Content hash to be {blob.sha().digest().hex()}, " + f"got {hashes['sha1_git'].hex()}" + ) hashes["length"] = size return hashes -def dulwich_blob_to_content(blob, max_content_size=None) -> BaseContent: +def dulwich_blob_to_content(obj: ShaFile, max_content_size=None) -> BaseContent: """Convert a dulwich blob to a Software Heritage content """ - if blob.type_name != b"blob": + if obj.type_name != b"blob": raise ValueError("Argument is not a blob.") + blob = cast(Blob, obj) + hashes = dulwich_blob_to_content_id(blob) if max_content_size is not None and hashes["length"] >= max_content_size: return SkippedContent(status="absent", reason="Content too large", **hashes,) else: return Content(data=blob.as_raw_string(), status="visible", **hashes,) -def dulwich_tree_to_directory(tree, log=None) -> Directory: +def dulwich_tree_to_directory(obj: ShaFile, log=None) -> Directory: """Format a tree as a directory""" - if tree.type_name != b"tree": + if obj.type_name != b"tree": raise ValueError("Argument is not a tree.") + tree = cast(Tree, obj) entries = [] entry_mode_map = 
{ 0o040000: "dir", 0o160000: "rev", 0o100644: "file", 0o100755: "file", 0o120000: "file", } for entry in tree.iteritems(): entries.append( DirectoryEntry( type=entry_mode_map.get(entry.mode, "file"), perms=entry.mode, name=entry.path, target=hash_to_bytes(entry.sha.decode("ascii")), ) ) - return Directory(id=tree.sha().digest(), entries=tuple(entries),) + dir_ = Directory(id=tree.sha().digest(), entries=tuple(entries),) + check_id(dir_) + return dir_ def parse_author(name_email: bytes) -> Person: """Parse an author line""" return Person.from_fullname(name_email) def dulwich_tsinfo_to_timestamp( timestamp, timezone, timezone_neg_utc ) -> TimestampWithTimezone: """Convert the dulwich timestamp information to a structure compatible with Software Heritage""" return TimestampWithTimezone( timestamp=Timestamp(seconds=int(timestamp), microseconds=0,), offset=timezone // 60, negative_utc=timezone_neg_utc if timezone == 0 else False, ) -def dulwich_commit_to_revision(commit, log=None) -> Revision: - if commit.type_name != b"commit": +def dulwich_commit_to_revision(obj: ShaFile, log=None) -> Revision: + if obj.type_name != b"commit": raise ValueError("Argument is not a commit.") + commit = cast(Commit, obj) extra_headers = [] if commit.encoding is not None: extra_headers.append((b"encoding", commit.encoding)) if commit.mergetag: for mergetag in commit.mergetag: raw_string = mergetag.as_raw_string() assert raw_string.endswith(b"\n") extra_headers.append((b"mergetag", raw_string[:-1])) if commit.extra: extra_headers.extend((k, v) for k, v in commit.extra) if commit.gpgsig: extra_headers.append((b"gpgsig", commit.gpgsig)) - return Revision( + rev = Revision( id=commit.sha().digest(), author=parse_author(commit.author), date=dulwich_tsinfo_to_timestamp( commit.author_time, commit.author_timezone, commit._author_timezone_neg_utc, ), committer=parse_author(commit.committer), committer_date=dulwich_tsinfo_to_timestamp( commit.commit_time, commit.commit_timezone, commit._commit_timezone_neg_utc, ), type=RevisionType.GIT, directory=bytes.fromhex(commit.tree.decode()), message=commit.message, metadata=None, extra_headers=tuple(extra_headers), synthetic=False, parents=tuple(bytes.fromhex(p.decode()) for p in commit.parents), ) + check_id(rev) + return rev DULWICH_TARGET_TYPES = { b"blob": TargetType.CONTENT, b"tree": TargetType.DIRECTORY, b"commit": TargetType.REVISION, b"tag": TargetType.RELEASE, } DULWICH_OBJECT_TYPES = { b"blob": ObjectType.CONTENT, b"tree": ObjectType.DIRECTORY, b"commit": ObjectType.REVISION, b"tag": ObjectType.RELEASE, } -def dulwich_tag_to_release(tag, log=None) -> Release: - if tag.type_name != b"tag": +def dulwich_tag_to_release(obj: ShaFile, log=None) -> Release: + if obj.type_name != b"tag": raise ValueError("Argument is not a tag.") + tag = cast(Tag, obj) target_type, target = tag.object if tag.tagger: author: Optional[Person] = parse_author(tag.tagger) if not tag.tag_time: date = None else: date = dulwich_tsinfo_to_timestamp( tag.tag_time, tag.tag_timezone, tag._tag_timezone_neg_utc, ) else: author = date = None message = tag.message if tag.signature: message += tag.signature - return Release( + rel = Release( id=tag.sha().digest(), author=author, date=date, name=tag.name, target=bytes.fromhex(target.decode()), target_type=DULWICH_OBJECT_TYPES[target_type.type_name], message=message, metadata=None, synthetic=False, ) + check_id(rel) + return rel diff --git a/swh/loader/git/from_disk.py b/swh/loader/git/from_disk.py index 7dd1d10..70995f8 100644 --- a/swh/loader/git/from_disk.py 
+++ b/swh/loader/git/from_disk.py @@ -1,390 +1,448 @@ # Copyright (C) 2015-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from collections import defaultdict from datetime import datetime import os import shutil from typing import Dict, Optional from dulwich.errors import ObjectFormatException try: from dulwich.errors import EmptyFileException # type: ignore except ImportError: # dulwich >= 0.20 from dulwich.objects import EmptyFileException +import dulwich.objects import dulwich.repo from swh.loader.core.loader import DVCSLoader from swh.model import hashutil from swh.model.model import Origin, Snapshot, SnapshotBranch, TargetType from swh.storage.algos.origin import origin_get_latest_visit_status from swh.storage.interface import StorageInterface from . import converters, utils +def _check_tag(tag): + """Copy-paste of dulwich.objects.Tag, minus the tagger and time checks, + which are too strict and error on old tags.""" + # Copyright (C) 2007 James Westby + # Copyright (C) 2008-2013 Jelmer Vernooij + # + # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU + # General Public License as public by the Free Software Foundation; version 2.0 + # or (at your option) any later version. You can redistribute it and/or + # modify it under the terms of either of these two licenses. + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. + # + # You should have received a copy of the licenses; if not, see + # <http://www.gnu.org/licenses/> for a copy of the GNU General Public License + # and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache + # License, Version 2.0. + dulwich.objects.ShaFile.check(tag) + tag._check_has_member("_object_sha", "missing object sha") + tag._check_has_member("_object_class", "missing object type") + tag._check_has_member("_name", "missing tag name") + + if not tag._name: + raise ObjectFormatException("empty tag name") + + dulwich.objects.check_hexsha(tag._object_sha, "invalid object sha") + + if tag._tag_time is not None: + dulwich.objects.check_time(tag._tag_time) + + from dulwich.objects import ( + _OBJECT_HEADER, + _TAG_HEADER, + _TAGGER_HEADER, + _TYPE_HEADER, + ) + + last = None + for field, _ in dulwich.objects._parse_message(tag._chunked_text): + if field == _OBJECT_HEADER and last is not None: + raise ObjectFormatException("unexpected object") + elif field == _TYPE_HEADER and last != _OBJECT_HEADER: + raise ObjectFormatException("unexpected type") + elif field == _TAG_HEADER and last != _TYPE_HEADER: + raise ObjectFormatException("unexpected tag name") + elif field == _TAGGER_HEADER and last != _TAG_HEADER: + raise ObjectFormatException("unexpected tagger") + last = field + + class GitLoaderFromDisk(DVCSLoader): """Load a git repository from a directory.
""" visit_type = "git" def __init__( self, storage: StorageInterface, url: str, visit_date: Optional[datetime] = None, directory: Optional[str] = None, save_data_path: Optional[str] = None, max_content_size: Optional[int] = None, ): super().__init__( storage=storage, save_data_path=save_data_path, max_content_size=max_content_size, ) self.origin_url = url self.visit_date = visit_date self.directory = directory def prepare_origin_visit(self): self.origin = Origin(url=self.origin_url) def prepare(self): self.repo = dulwich.repo.Repo(self.directory) def iter_objects(self): object_store = self.repo.object_store for pack in object_store.packs: objs = list(pack.index.iterentries()) objs.sort(key=lambda x: x[1]) for sha, offset, crc32 in objs: yield hashutil.hash_to_bytehex(sha) yield from object_store._iter_loose_objects() yield from object_store._iter_alternate_objects() def _check(self, obj): """Check the object's repository representation. If any errors in check exists, an ObjectFormatException is raised. Args: obj (object): Dulwich object read from the repository. """ - obj.check() - from dulwich.objects import Commit, Tag + if isinstance(obj, dulwich.objects.Tag): + _check_tag(obj) + else: + obj.check() try: # For additional checks on dulwich objects with date # for now, only checks on *time - if isinstance(obj, Commit): + if isinstance(obj, dulwich.objects.Commit): commit_time = obj._commit_time utils.check_date_time(commit_time) author_time = obj._author_time utils.check_date_time(author_time) - elif isinstance(obj, Tag): + elif isinstance(obj, dulwich.objects.Tag): tag_time = obj._tag_time - utils.check_date_time(tag_time) + if tag_time: + utils.check_date_time(tag_time) except Exception as e: raise ObjectFormatException(e) def get_object(self, oid): """Given an object id, return the object if it is found and not malformed in some way. 
Args: oid (bytes): the object's identifier Returns: The object if found without malformation """ try: # some errors are raised when reading the object obj = self.repo[oid] # some we need to check ourselves self._check(obj) except KeyError: _id = oid.decode("utf-8") self.log.warn( "object %s not found, skipping" % _id, extra={ "swh_type": "swh_loader_git_missing_object", "swh_object_id": _id, "origin_url": self.origin.url, }, ) return None except ObjectFormatException as e: id_ = oid.decode("utf-8") self.log.warn( "object %s malformed (%s), skipping", id_, e.args[0], extra={ "swh_type": "swh_loader_git_missing_object", "swh_object_id": id_, "origin_url": self.origin.url, }, ) return None except EmptyFileException: id_ = oid.decode("utf-8") self.log.warn( "object %s corrupted (empty file), skipping", id_, extra={ "swh_type": "swh_loader_git_missing_object", "swh_object_id": id_, "origin_url": self.origin.url, }, ) else: return obj def fetch_data(self): """Fetch the data from the data source""" visit_status = origin_get_latest_visit_status( self.storage, self.origin_url, require_snapshot=True ) self.previous_snapshot_id = ( None if visit_status is None else visit_status.snapshot ) type_to_ids = defaultdict(list) for oid in self.iter_objects(): obj = self.get_object(oid) if obj is None: continue type_name = obj.type_name type_to_ids[type_name].append(oid) self.type_to_ids = type_to_ids def has_contents(self): """Checks whether we need to load contents""" return bool(self.type_to_ids[b"blob"]) def get_content_ids(self): """Get the content identifiers from the git repository""" for oid in self.type_to_ids[b"blob"]: yield converters.dulwich_blob_to_content_id(self.repo[oid]) def get_contents(self): """Get the contents that need to be loaded""" missing_contents = set( self.storage.content_missing(self.get_content_ids(), "sha1_git") ) for oid in missing_contents: yield converters.dulwich_blob_to_content( self.repo[hashutil.hash_to_bytehex(oid)] ) def has_directories(self): """Checks whether we need to load directories""" return bool(self.type_to_ids[b"tree"]) def get_directory_ids(self): """Get the directory identifiers from the git repository""" return (hashutil.hash_to_bytes(id.decode()) for id in self.type_to_ids[b"tree"]) def get_directories(self): """Get the directories that need to be loaded""" missing_dirs = set( self.storage.directory_missing(sorted(self.get_directory_ids())) ) for oid in missing_dirs: yield converters.dulwich_tree_to_directory( self.repo[hashutil.hash_to_bytehex(oid)], log=self.log ) def has_revisions(self): """Checks whether we need to load revisions""" return bool(self.type_to_ids[b"commit"]) def get_revision_ids(self): """Get the revision identifiers from the git repository""" return ( hashutil.hash_to_bytes(id.decode()) for id in self.type_to_ids[b"commit"] ) def get_revisions(self): """Get the revisions that need to be loaded""" missing_revs = set( self.storage.revision_missing(sorted(self.get_revision_ids())) ) for oid in missing_revs: yield converters.dulwich_commit_to_revision( self.repo[hashutil.hash_to_bytehex(oid)], log=self.log ) def has_releases(self): """Checks whether we need to load releases""" return bool(self.type_to_ids[b"tag"]) def get_release_ids(self): """Get the release identifiers from the git repository""" return (hashutil.hash_to_bytes(id.decode()) for id in self.type_to_ids[b"tag"]) def get_releases(self): """Get the releases that need to be loaded""" missing_rels = set(self.storage.release_missing(sorted(self.get_release_ids()))) for oid in 
missing_rels: yield converters.dulwich_tag_to_release( self.repo[hashutil.hash_to_bytehex(oid)], log=self.log ) def get_snapshot(self): """Turn the list of branches into a snapshot to load""" branches: Dict[bytes, Optional[SnapshotBranch]] = {} for ref, target in self.repo.refs.as_dict().items(): if utils.ignore_branch_name(ref): continue obj = self.get_object(target) if obj: target_type = converters.DULWICH_TARGET_TYPES[obj.type_name] branches[ref] = SnapshotBranch( target=hashutil.bytehex_to_hash(target), target_type=target_type, ) else: branches[ref] = None dangling_branches = {} for ref, target in self.repo.refs.get_symrefs().items(): if utils.ignore_branch_name(ref): continue branches[ref] = SnapshotBranch(target=target, target_type=TargetType.ALIAS) if target not in branches: # This handles the case where the pointer is "dangling". # There's a chance that a further symbolic reference will # override this default value, which is totally fine. dangling_branches[target] = ref branches[target] = None utils.warn_dangling_branches( branches, dangling_branches, self.log, self.origin_url ) self.snapshot = Snapshot(branches=branches) return self.snapshot def save_data(self): """We already have the data locally, no need to save it""" pass def load_status(self): """The load was eventful if the current occurrences are different to the ones we retrieved at the beginning of the run""" eventful = False if self.previous_snapshot_id: eventful = self.snapshot.id != self.previous_snapshot_id else: eventful = bool(self.snapshot.branches) return {"status": ("eventful" if eventful else "uneventful")} class GitLoaderFromArchive(GitLoaderFromDisk): """Load a git repository from an archive. This loader ingests a git repository compressed into an archive. The supported archive formats are ``.zip`` and ``.tar.gz``. From an input tarball named ``my-git-repo.zip``, the following layout is expected in it:: my-git-repo/ ├── .git │ ├── branches │ ├── COMMIT_EDITMSG │ ├── config │ ├── description │ ├── HEAD ... Nevertheless, the loader is able to ingest tarballs with the following layouts too:: . ├── .git │ ├── branches │ ├── COMMIT_EDITMSG │ ├── config │ ├── description │ ├── HEAD ... or:: other-repo-name/ ├── .git │ ├── branches │ ├── COMMIT_EDITMSG │ ├── config │ ├── description │ ├── HEAD ... """ def __init__(self, *args, archive_path, **kwargs): super().__init__(*args, **kwargs) self.temp_dir = self.repo_path = None self.archive_path = archive_path def project_name_from_archive(self, archive_path): """Compute the project name from the archive's path. """ archive_name = os.path.basename(archive_path) for ext in (".zip", ".tar.gz", ".tgz"): if archive_name.lower().endswith(ext): archive_name = archive_name[: -len(ext)] break return archive_name def prepare(self): """1. Uncompress the archive in temporary location. 2. Prepare as the GitLoaderFromDisk does 3. Load as GitLoaderFromDisk does """ project_name = self.project_name_from_archive(self.archive_path) self.temp_dir, self.repo_path = utils.init_git_repo_from_archive( project_name, self.archive_path ) self.log.info( "Project %s - Uncompressing archive %s at %s", self.origin_url, os.path.basename(self.archive_path), self.repo_path, ) self.directory = self.repo_path super().prepare() def cleanup(self): """Cleanup the temporary location (if it exists). 
""" if self.temp_dir and os.path.exists(self.temp_dir): shutil.rmtree(self.temp_dir) self.log.info( "Project %s - Done injecting %s" % (self.origin_url, self.repo_path) ) diff --git a/swh/loader/git/tests/test_converters.py b/swh/loader/git/tests/test_converters.py index 21c8f89..71967d8 100644 --- a/swh/loader/git/tests/test_converters.py +++ b/swh/loader/git/tests/test_converters.py @@ -1,473 +1,562 @@ # Copyright (C) 2015-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import copy import os import shutil import subprocess import tempfile -import unittest import dulwich.repo import pytest import swh.loader.git.converters as converters -from swh.model.hashutil import bytehex_to_hash, hash_to_bytes +from swh.model.hashutil import bytehex_to_hash, hash_to_bytehex, hash_to_bytes from swh.model.model import ( Content, ObjectType, Person, Release, Revision, RevisionType, Timestamp, TimestampWithTimezone, ) TEST_DATA = os.path.join(os.path.dirname(__file__), "data") GPGSIG = ( b"-----BEGIN PGP SIGNATURE-----\n" b"\n" b"iQJLBAABCAA1FiEEAOWDevQbOk/9ITMF6ImSleOlnUcFAl8EnS4XHGRhdmlkLmRv\n" b"dWFyZEBzZGZhMy5vcmcACgkQ6ImSleOlnUdrqQ/8C5RO4NZ5Qr/dwAy2cPA7ktkY\n" b"1oUjKtspQoPbC1X3MXVa1aWo9B3KuOMR2URw44RhMNFwjccLOhfss06E8p7CZr2H\n" b"uR3CzdDw7i52jHLCL2M2ZMaPAEbQuHjXWiUWIUXz9So8YwpTyd2XQneyOC2RDDEI\n" b"I2NVbmiMeDz33jJYPrQO0QayW+ErW+xgBF7N/qS9jFWsdV1ZNfn9NxkTH8UdGuAX\n" b"583P+0tVC2DjXc6vORVhyFzyfn1A9wHosbtWI2Mpa+zezPjoPSkcyQAJu2GyOkMC\n" b"YzSjJdQVqyovo+INkIf6PuUNdp41886BG/06xwT8fl4sVsyO51lNIfgH0DMwfTTB\n" b"ZgThYnvvO7SrXDm3QzBTXkvAiHiFFl3iNyGkCyxvgVmaTntuFT+cP+HD/pCiGaC+\n" b"jHzRwfUrmuLd/lLPyq3JXBibyjnfd3SVS+7q1NZHJ4WUmCboZ0+pfrEl65mEQ/Hz\n" b"J1qCwQ/3SsTB77ANf6lLzGSowjjrtHcBTkTbFxR4ACUhiBbosyDKpHTM7fzGFGjo\n" b"EIjohzrEnqR3bbyxJkK+nxoOByhIRdowgyeJ02I4neMyLJqcaup8NMWCddxqjaPt\n" b"YobghnjaDqEd+suL/v83hbZUAZHNO3i1OZYGMqzp1WHikDPoTwGP76baqBoXi56T\n" b"4WSpxCAJRDODHLk1HgU=\n" b"=73wF" b"\n" b"-----END PGP SIGNATURE-----" ) MERGETAG = ( b"object 9768d0b576dbaaecd80abedad6dfd0d72f1476da\n" b"type commit\n" b"tag v0.0.1\n" b"tagger David Douard 1594138133 +0200\n" b"\n" b"v0.0.1\n" b"-----BEGIN PGP SIGNATURE-----\n" b"\n" b"iQJLBAABCAA1FiEEAOWDevQbOk/9ITMF6ImSleOlnUcFAl8EnhkXHGRhdmlkLmRv\n" b"dWFyZEBzZGZhMy5vcmcACgkQ6ImSleOlnUcdzg//ZW9y2xU5JFQuUsBe/LfKrs+m\n" b"0ohVInPKXwAfpB3+gn/XtTSLe+Nnr8+QEZyVRCUz2gpGZ2tNqRjhYLIX4x5KKlaV\n" b"rfl/6Cy7zibsxxuzA1h7HylCs3IPsueQpznVHUwD9jQ5baGJSc2Lt1LufXTueHZJ\n" b"Oc0oLiP5xCZcPqeX8R/4zUUImJZ1QrPeKmQ/3F+Iq62iWp7nWDp8PtwpykSiYlNf\n" b"KrJM8omGvrlrWLtfPNUaQFClXwnwK1/HyNY2kYan6K5NtsIl2UX0LZ42GkRjJIrb\n" b"q4TFIZWZ6xndtEhHEX6B8Q5TZV6sqPgNnfGpbhj8BDoZgjD0Y43fzfDiZ0Bl2tph\n" b"tXaLg3SX/UUjFVzC1zkoQ2MR7+j8NVKauAsBINpKF4pMGsrsVRk8764pgO49iQ+S\n" b"8JVCVV76dNNm1gd7BbhFAdIAiegBtsEF69niJBoHKYLlrT8E8hDkF/gk4IkimPqf\n" b"UHtw/fPhVW3B4G2skd013NJGcnRj5oKtaM99d2Roxc3vhSRiTsoaM8BM9NDvLmJg\n" b"35rWEOnet39iJIMCHk3AYaJl8QmUhllDdr6vygaBVeVEf27m2c3NzONmIKpWqa2J\n" b"kTpF4cmzHYro34G7WuJ1bYvmLb6qWNQt9wd8RW+J1kVm5I8dkjPzLUougBpOd0YL\n" b"Bl5UTQILbV4Tv8ZlmJM=\n" b"=s1lv\n" b"-----END PGP SIGNATURE-----" ) class SWHObjectType: """Dulwich lookalike ObjectType class """ def __init__(self, type_name): self.type_name = type_name class SWHTag: """Dulwich lookalike tag class """ def __init__( self, + sha, name, type_name, target, target_type, tagger, tag_time, tag_timezone, message, signature, ): + self._sha = sha self.name = 
name self.type_name = type_name self.object = SWHObjectType(target_type), target self.tagger = tagger self.message = message self.signature = signature self.tag_time = tag_time self.tag_timezone = tag_timezone self._tag_timezone_neg_utc = False def sha(self): - from hashlib import sha1 + class hasher: + def digest(): + return self._sha - return sha1() + return hasher @pytest.mark.fs -class TestConverters(unittest.TestCase): +class TestConverters: @classmethod - def setUpClass(cls): - super().setUpClass() + def setup_class(cls): cls.repo_path = tempfile.mkdtemp() bundle = os.path.join(TEST_DATA, "git-repos", "example-submodule.bundle") git = subprocess.Popen( ["git", "clone", "--quiet", "--bare", "--mirror", bundle, cls.repo_path], cwd=TEST_DATA, ) # flush stdout of xz git.communicate() cls.repo = dulwich.repo.Repo(cls.repo_path) @classmethod def tearDownClass(cls): super().tearDownClass() shutil.rmtree(cls.repo_path) def test_blob_to_content(self): content_id = b"28c6f4023d65f74e3b59a2dea3c4277ed9ee07b0" content = converters.dulwich_blob_to_content(self.repo[content_id]) expected_content = Content( sha1_git=bytehex_to_hash(content_id), sha1=hash_to_bytes("4850a3420a2262ff061cb296fb915430fa92301c"), sha256=hash_to_bytes( "fee7c8a485a10321ad94b64135073cb5" "5f22cb9f57fa2417d2adfb09d310adef" ), blake2s256=hash_to_bytes( "5d71873f42a137f6d89286e43677721e574" "1fa05ce4cd5e3c7ea7c44d4c2d10b" ), data=( b'[submodule "example-dependency"]\n' b"\tpath = example-dependency\n" b"\turl = https://github.com/githubtraining/" b"example-dependency.git\n" ), length=124, status="visible", ) - self.assertEqual(content, expected_content) + assert content == expected_content + + def test_corrupt_blob(self, mocker): + # has a signature + sha1 = hash_to_bytes("28c6f4023d65f74e3b59a2dea3c4277ed9ee07b0") + + blob = copy.deepcopy(self.repo[hash_to_bytehex(sha1)]) + + class hasher: + def digest(): + return sha1 + + blob._sha = hasher + + converters.dulwich_blob_to_content(blob) + converters.dulwich_blob_to_content_id(blob) + + sha1 = hash_to_bytes("1234" * 10) + + with pytest.raises(converters.HashMismatch): + converters.dulwich_blob_to_content(blob) + with pytest.raises(converters.HashMismatch): + converters.dulwich_blob_to_content_id(blob) def test_convertion_wrong_input(self): class Something: type_name = b"something-not-the-right-type" m = { "blob": converters.dulwich_blob_to_content, - "blob2": converters.dulwich_blob_to_content_id, "tree": converters.dulwich_tree_to_directory, "commit": converters.dulwich_tree_to_directory, "tag": converters.dulwich_tag_to_release, } for _callable in m.values(): - with self.assertRaises(ValueError): + with pytest.raises(ValueError): _callable(Something()) + def test_corrupt_tree(self): + # has a signature + sha1 = b"f0695c2e2fa7ce9d574023c3413761a473e500ca" + tree = copy.deepcopy(self.repo[sha1]) + converters.dulwich_tree_to_directory(tree) + + del tree._entries[next(iter(tree._entries))] + + with pytest.raises(converters.HashMismatch): + converters.dulwich_tree_to_directory(tree) + def test_commit_to_revision(self): sha1 = b"9768d0b576dbaaecd80abedad6dfd0d72f1476da" revision = converters.dulwich_commit_to_revision(self.repo[sha1]) expected_revision = Revision( id=hash_to_bytes("9768d0b576dbaaecd80abedad6dfd0d72f1476da"), directory=b"\xf0i\\./\xa7\xce\x9dW@#\xc3A7a\xa4s\xe5\x00\xca", type=RevisionType.GIT, committer=Person( name=b"Stefano Zacchiroli", fullname=b"Stefano Zacchiroli ", email=b"zack@upsilon.cc", ), author=Person( name=b"Stefano Zacchiroli", fullname=b"Stefano 
Zacchiroli ", email=b"zack@upsilon.cc", ), committer_date=TimestampWithTimezone( timestamp=Timestamp(seconds=1443083765, microseconds=0,), negative_utc=False, offset=120, ), message=b"add submodule dependency\n", metadata=None, extra_headers=(), date=TimestampWithTimezone( timestamp=Timestamp(seconds=1443083765, microseconds=0,), negative_utc=False, offset=120, ), parents=(b"\xc3\xc5\x88q23`\x9f[\xbb\xb2\xd9\xe7\xf3\xfbJf\x0f?r",), synthetic=False, ) - self.assertEqual(revision, expected_revision) + assert revision == expected_revision def test_commit_to_revision_with_extra_headers(self): sha1 = b"322f5bc915e50fc25e85226b5a182bded0e98e4b" revision = converters.dulwich_commit_to_revision(self.repo[sha1]) expected_revision = Revision( id=hash_to_bytes(sha1.decode()), directory=bytes.fromhex("f8ec06e4ed7b9fff4918a0241a48023143f30000"), type=RevisionType.GIT, committer=Person( name=b"David Douard", fullname=b"David Douard ", email=b"david.douard@sdfa3.org", ), author=Person( name=b"David Douard", fullname=b"David Douard ", email=b"david.douard@sdfa3.org", ), committer_date=TimestampWithTimezone( timestamp=Timestamp(seconds=1594137902, microseconds=0,), negative_utc=False, offset=120, ), message=b"Am\xe9lioration du fichier READM\xa4\n", metadata=None, extra_headers=((b"encoding", b"ISO-8859-15"), (b"gpgsig", GPGSIG)), date=TimestampWithTimezone( timestamp=Timestamp(seconds=1594136900, microseconds=0,), negative_utc=False, offset=120, ), parents=(bytes.fromhex("c730509025c6e81947102b2d77bc4dc1cade9489"),), synthetic=False, ) assert revision == expected_revision + @pytest.mark.parametrize( + "attribute", ["_message", "_encoding", "_author", "_gpgsig"] + ) + def test_corrupt_commit(self, attribute): + # has a signature + sha1 = b"322f5bc915e50fc25e85226b5a182bded0e98e4b" + commit = copy.deepcopy(self.repo[sha1]) + converters.dulwich_commit_to_revision(commit) + setattr(commit, attribute, b"abcde") + with pytest.raises(converters.HashMismatch): + converters.dulwich_commit_to_revision(commit) + + if attribute == "_gpgsig": + setattr(commit, attribute, None) + with pytest.raises(converters.HashMismatch): + converters.dulwich_commit_to_revision(commit) + def test_commit_to_revision_with_extra_headers_mergetag(self): sha1 = b"3ab3da4bf0f81407be16969df09cd1c8af9ac703" revision = converters.dulwich_commit_to_revision(self.repo[sha1]) expected_revision = Revision( id=hash_to_bytes(sha1.decode()), directory=bytes.fromhex("faa4b64a841ca3e3f07d6501caebda2e3e8e544e"), type=RevisionType.GIT, committer=Person( name=b"David Douard", fullname=b"David Douard ", email=b"david.douard@sdfa3.org", ), author=Person( name=b"David Douard", fullname=b"David Douard ", email=b"david.douard@sdfa3.org", ), committer_date=TimestampWithTimezone( timestamp=Timestamp(seconds=1594138183, microseconds=0,), negative_utc=False, offset=120, ), message=b"Merge tag 'v0.0.1' into readme\n\nv0.0.1\n", metadata=None, extra_headers=((b"encoding", b"ISO-8859-15"), (b"mergetag", MERGETAG)), date=TimestampWithTimezone( timestamp=Timestamp(seconds=1594138183, microseconds=0,), negative_utc=False, offset=120, ), parents=( bytes.fromhex("322f5bc915e50fc25e85226b5a182bded0e98e4b"), bytes.fromhex("9768d0b576dbaaecd80abedad6dfd0d72f1476da"), ), synthetic=False, ) assert revision == expected_revision def test_author_line_to_author(self): # edge case out of the way - with self.assertRaises(TypeError): + with pytest.raises(TypeError): converters.parse_author(None) tests = { b"a ": Person( name=b"a", email=b"b@c.com", fullname=b"a ", ), b"": Person( 
name=None, email=b"foo@bar.com", fullname=b"", ), b"malformed ": Person( name=b"trailing", email=b"sp@c.e", fullname=b"trailing ", ), b"no": Person(name=b"no", email=b"sp@c.e", fullname=b"no",), b" <>": Person(name=None, email=None, fullname=b" <>",), b"something": Person(name=b"something", email=None, fullname=b"something"), } for author in sorted(tests): parsed_author = tests[author] - self.assertEqual(parsed_author, converters.parse_author(author)) + assert parsed_author == converters.parse_author(author) def test_dulwich_tag_to_release_no_author_no_date(self): - target = b"641fb6e08ddb2e4fd096dcf18e80b894bf" + sha = hash_to_bytes("f6e367357b446bd1315276de5e88ba3d0d99e136") + target = b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce" message = b"some release message" tag = SWHTag( + sha=sha, name=b"blah", type_name=b"tag", target=target, target_type=b"commit", message=message, signature=None, tagger=None, tag_time=None, tag_timezone=None, ) # when actual_release = converters.dulwich_tag_to_release(tag) # then expected_release = Release( author=None, date=None, - id=b"\xda9\xa3\xee^kK\r2U\xbf\xef\x95`\x18\x90\xaf\xd8\x07\t", + id=sha, message=message, metadata=None, name=b"blah", synthetic=False, target=hash_to_bytes(target.decode()), target_type=ObjectType.REVISION, ) - self.assertEqual(actual_release, expected_release) + assert actual_release == expected_release def test_dulwich_tag_to_release_author_and_date(self): + sha = hash_to_bytes("fc1e6a4f1e37e93e28e78560e73efd0b12f616ef") tagger = b"hey dude " - target = b"641fb6e08ddb2e4fd096dcf18e80b894bf" + target = b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce" message = b"some release message" import datetime date = datetime.datetime(2007, 12, 5, tzinfo=datetime.timezone.utc).timestamp() tag = SWHTag( + sha=sha, name=b"blah", type_name=b"tag", target=target, target_type=b"commit", message=message, signature=None, tagger=tagger, tag_time=date, tag_timezone=0, ) # when actual_release = converters.dulwich_tag_to_release(tag) # then expected_release = Release( author=Person( email=b"hello@mail.org", fullname=b"hey dude ", name=b"hey dude", ), date=TimestampWithTimezone( negative_utc=False, offset=0, timestamp=Timestamp(seconds=1196812800, microseconds=0,), ), - id=b"\xda9\xa3\xee^kK\r2U\xbf\xef\x95`\x18\x90\xaf\xd8\x07\t", + id=sha, message=message, metadata=None, name=b"blah", synthetic=False, target=hash_to_bytes(target.decode()), target_type=ObjectType.REVISION, ) - self.assertEqual(actual_release, expected_release) + assert actual_release == expected_release def test_dulwich_tag_to_release_author_no_date(self): # to reproduce bug T815 (fixed) + sha = hash_to_bytes("41076e970975122dc6b2a878aa9797960bc4781d") tagger = b"hey dude " - target = b"641fb6e08ddb2e4fd096dcf18e80b894bf" + target = b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce" message = b"some release message" tag = SWHTag( + sha=sha, name=b"blah", type_name=b"tag", target=target, target_type=b"commit", message=message, signature=None, tagger=tagger, tag_time=None, tag_timezone=None, ) # when actual_release = converters.dulwich_tag_to_release(tag) # then expected_release = Release( author=Person( email=b"hello@mail.org", fullname=b"hey dude ", name=b"hey dude", ), date=None, - id=b"\xda9\xa3\xee^kK\r2U\xbf\xef\x95`\x18\x90\xaf\xd8\x07\t", + id=sha, message=message, metadata=None, name=b"blah", synthetic=False, target=hash_to_bytes(target.decode()), target_type=ObjectType.REVISION, ) - self.assertEqual(actual_release, expected_release) + assert actual_release == expected_release def 
test_dulwich_tag_to_release_signature(self): - target = b"641fb6e08ddb2e4fd096dcf18e80b894bf" + target = b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce" message = b"some release message" + sha = hash_to_bytes("46fff489610ed733d2cc904e363070dadee05c71") tag = SWHTag( + sha=sha, name=b"blah", type_name=b"tag", target=target, target_type=b"commit", message=message, signature=GPGSIG, tagger=None, tag_time=None, tag_timezone=None, ) # when actual_release = converters.dulwich_tag_to_release(tag) # then expected_release = Release( author=None, date=None, - id=b"\xda9\xa3\xee^kK\r2U\xbf\xef\x95`\x18\x90\xaf\xd8\x07\t", + id=sha, message=message + GPGSIG, metadata=None, name=b"blah", synthetic=False, target=hash_to_bytes(target.decode()), target_type=ObjectType.REVISION, ) - self.assertEqual(actual_release, expected_release) + assert actual_release == expected_release + + @pytest.mark.parametrize("attribute", ["name", "message", "signature"]) + def test_corrupt_tag(self, attribute): + # has a signature + sha = hash_to_bytes("46fff489610ed733d2cc904e363070dadee05c71") + target = b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce" + message = b"some release message" + tag = SWHTag( + sha=sha, + name=b"blah", + type_name=b"tag", + target=target, + target_type=b"commit", + message=message, + signature=GPGSIG, + tagger=None, + tag_time=None, + tag_timezone=None, + ) + converters.dulwich_tag_to_release(tag) + + setattr(tag, attribute, b"abcde") + with pytest.raises(converters.HashMismatch): + converters.dulwich_tag_to_release(tag) + + if attribute == "signature": + setattr(tag, attribute, None) + with pytest.raises(converters.HashMismatch): + converters.dulwich_tag_to_release(tag) diff --git a/swh/loader/git/tests/test_from_disk.py b/swh/loader/git/tests/test_from_disk.py index a0e9b28..725259a 100644 --- a/swh/loader/git/tests/test_from_disk.py +++ b/swh/loader/git/tests/test_from_disk.py @@ -1,471 +1,543 @@ # Copyright (C) 2018-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import copy import datetime import os.path -from unittest import TestCase +import dulwich.objects +import dulwich.porcelain import dulwich.repo import pytest from swh.loader.git.from_disk import GitLoaderFromArchive, GitLoaderFromDisk from swh.loader.tests import ( assert_last_visit_matches, check_snapshot, get_stats, prepare_repository_from_archive, ) -from swh.model.hashutil import hash_to_bytes -from swh.model.model import Snapshot, SnapshotBranch, TargetType +from swh.model.hashutil import bytehex_to_hash, hash_to_bytes +from swh.model.model import ObjectType, Release, Snapshot, SnapshotBranch, TargetType from swh.storage.algos.snapshot import snapshot_get_all_branches SNAPSHOT1 = Snapshot( id=hash_to_bytes("a23699280a82a043f8c0994cf1631b568f716f95"), branches={ b"HEAD": SnapshotBranch( target=b"refs/heads/master", target_type=TargetType.ALIAS, ), b"refs/heads/master": SnapshotBranch( target=hash_to_bytes("2f01f5ca7e391a2f08905990277faf81e709a649"), target_type=TargetType.REVISION, ), b"refs/heads/branch1": SnapshotBranch( target=hash_to_bytes("b0a77609903f767a2fd3d769904ef9ef68468b87"), target_type=TargetType.REVISION, ), b"refs/heads/branch2": SnapshotBranch( target=hash_to_bytes("bd746cd1913721b269b395a56a97baf6755151c2"), target_type=TargetType.REVISION, ), b"refs/tags/branch2-after-delete": SnapshotBranch( 
target=hash_to_bytes("bd746cd1913721b269b395a56a97baf6755151c2"), target_type=TargetType.REVISION, ), b"refs/tags/branch2-before-delete": SnapshotBranch( target=hash_to_bytes("1135e94ccf73b5f9bd6ef07b3fa2c5cc60bba69b"), target_type=TargetType.REVISION, ), }, ) # directory hashes obtained with: # gco b6f40292c4e94a8f7e7b4aff50e6c7429ab98e2a # swh-hashtree --ignore '.git' --path . # gco 2f01f5ca7e391a2f08905990277faf81e709a649 # swh-hashtree --ignore '.git' --path . # gco bcdc5ebfde1a3cd6c96e0c2ea4eed19c13208777 # swh-hashtree --ignore '.git' --path . # gco 1135e94ccf73b5f9bd6ef07b3fa2c5cc60bba69b # swh-hashtree --ignore '.git' --path . # gco 79f65ac75f79dda6ff03d66e1242702ab67fb51c # swh-hashtree --ignore '.git' --path . # gco b0a77609903f767a2fd3d769904ef9ef68468b87 # swh-hashtree --ignore '.git' --path . # gco bd746cd1913721b269b395a56a97baf6755151c2 # swh-hashtree --ignore '.git' --path . REVISIONS1 = { "b6f40292c4e94a8f7e7b4aff50e6c7429ab98e2a": ( "40dbdf55dfd4065422462cc74a949254aefa972e" ), "2f01f5ca7e391a2f08905990277faf81e709a649": ( "e1d0d894835f91a0f887a4bc8b16f81feefdfbd5" ), "bcdc5ebfde1a3cd6c96e0c2ea4eed19c13208777": ( "b43724545b4759244bb54be053c690649161411c" ), "1135e94ccf73b5f9bd6ef07b3fa2c5cc60bba69b": ( "fbf70528223d263661b5ad4b80f26caf3860eb8e" ), "79f65ac75f79dda6ff03d66e1242702ab67fb51c": ( "5df34ec74d6f69072d9a0a6677d8efbed9b12e60" ), "b0a77609903f767a2fd3d769904ef9ef68468b87": ( "9ca0c7d6ffa3f9f0de59fd7912e08f11308a1338" ), "bd746cd1913721b269b395a56a97baf6755151c2": ( "e1d0d894835f91a0f887a4bc8b16f81feefdfbd5" ), } class CommonGitLoaderTests: """Common tests for all git loaders.""" def test_load(self): """Loads a simple repository (made available by `setUp()`), and checks everything was added in the storage.""" res = self.loader.load() assert res == {"status": "eventful"} assert_last_visit_matches( self.loader.storage, self.repo_url, status="full", type="git", snapshot=SNAPSHOT1.id, ) stats = get_stats(self.loader.storage) assert stats == { "content": 4, "directory": 7, "origin": 1, "origin_visit": 1, "release": 0, "revision": 7, "skipped_content": 0, "snapshot": 1, } check_snapshot(SNAPSHOT1, self.loader.storage) def test_load_unchanged(self): """Checks loading a repository a second time does not add any extra data.""" res = self.loader.load() assert res == {"status": "eventful"} assert_last_visit_matches( self.loader.storage, self.repo_url, status="full", type="git", snapshot=SNAPSHOT1.id, ) stats0 = get_stats(self.loader.storage) assert stats0 == { "content": 4, "directory": 7, "origin": 1, "origin_visit": 1, "release": 0, "revision": 7, "skipped_content": 0, "snapshot": 1, } res = self.loader.load() assert res == {"status": "uneventful"} stats1 = get_stats(self.loader.storage) expected_stats = copy.deepcopy(stats0) expected_stats["origin_visit"] += 1 assert stats1 == expected_stats check_snapshot(SNAPSHOT1, self.loader.storage) assert_last_visit_matches( self.loader.storage, self.repo_url, status="full", type="git", snapshot=SNAPSHOT1.id, ) def test_load_visit_without_snapshot_so_status_failed(self): # unfortunately, monkey-patch the hard way, self.loader is already instantiated # (patching won't work self.loader is already instantiated) # Make get_contents fail for some reason self.loader.get_contents = None res = self.loader.load() assert res == {"status": "failed"} assert_last_visit_matches( self.loader.storage, self.repo_url, status="failed", type="git", snapshot=None, ) def test_load_visit_with_snapshot_so_status_partial(self): # unfortunately, monkey-patch 
the hard way, self.loader is already instantiated # (patching won't work self.loader is already instantiated) # fake store_metadata raising for some reason, so we could have a snapshot id # at this point in time self.loader.store_metadata = None # fake having a snapshot so the visit status is partial self.loader.loaded_snapshot_id = hash_to_bytes( "a23699280a82a043f8c0994cf1631b568f716f95" ) res = self.loader.load() assert res == {"status": "failed"} assert_last_visit_matches( self.loader.storage, self.repo_url, status="partial", type="git", snapshot=None, ) class FullGitLoaderTests(CommonGitLoaderTests): """Tests for GitLoader (from disk or not). Includes the common ones, and add others that only work with a local dir. """ def test_load_changed(self): """Loads a repository, makes some changes by adding files, commits, and merges, load it again, and check the storage contains everything it should.""" # Initial load res = self.loader.load() assert res == {"status": "eventful"} stats0 = get_stats(self.loader.storage) assert stats0 == { "content": 4, "directory": 7, "origin": 1, "origin_visit": 1, "release": 0, "revision": 7, "skipped_content": 0, "snapshot": 1, } # Load with a new file + revision with open(os.path.join(self.destination_path, "hello.py"), "a") as fd: fd.write("print('Hello world')\n") self.repo.stage([b"hello.py"]) new_revision = self.repo.do_commit(b"Hello world\n").decode() new_dir = "85dae072a5aa9923ffa7a7568f819ff21bf49858" assert self.repo[new_revision.encode()].tree == new_dir.encode() revisions = REVISIONS1.copy() assert new_revision not in revisions revisions[new_revision] = new_dir res = self.loader.load() assert res == {"status": "eventful"} stats1 = get_stats(self.loader.storage) expected_stats = copy.deepcopy(stats0) # did one new visit expected_stats["origin_visit"] += 1 # with one more of the following objects expected_stats["content"] += 1 expected_stats["directory"] += 1 expected_stats["revision"] += 1 # concluding into 1 new snapshot expected_stats["snapshot"] += 1 assert stats1 == expected_stats visit_status = assert_last_visit_matches( self.loader.storage, self.repo_url, status="full", type="git" ) assert visit_status.snapshot is not None snapshot_id = visit_status.snapshot snapshot = snapshot_get_all_branches(self.loader.storage, snapshot_id) branches = snapshot.branches assert branches[b"HEAD"] == SnapshotBranch( target=b"refs/heads/master", target_type=TargetType.ALIAS, ) assert branches[b"refs/heads/master"] == SnapshotBranch( target=hash_to_bytes(new_revision), target_type=TargetType.REVISION, ) # Merge branch1 into HEAD. 
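# The merge below is built at the object level instead of shelling out to
# `git merge`: a new dulwich Tree combines the entries of both parents' root
# trees, then do_commit(..., merge_heads=[branch1.id]) records a two-parent
# commit, so the next load ingests exactly one new directory and one new
# revision.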
current = self.repo[b"HEAD"] branch1 = self.repo[b"refs/heads/branch1"] merged_tree = dulwich.objects.Tree() for item in self.repo[current.tree].items(): merged_tree.add(*item) for item in self.repo[branch1.tree].items(): merged_tree.add(*item) merged_dir_id = "dab8a37df8db8666d4e277bef9a546f585b5bedd" assert merged_tree.id.decode() == merged_dir_id self.repo.object_store.add_object(merged_tree) merge_commit = self.repo.do_commit( b"merge.\n", tree=merged_tree.id, merge_heads=[branch1.id] ) assert merge_commit.decode() not in revisions revisions[merge_commit.decode()] = merged_tree.id.decode() res = self.loader.load() assert res == {"status": "eventful"} stats2 = get_stats(self.loader.storage) expected_stats = copy.deepcopy(stats1) # one more visit expected_stats["origin_visit"] += 1 # with 1 new directory and revision expected_stats["directory"] += 1 expected_stats["revision"] += 1 # concluding into 1 new snapshot expected_stats["snapshot"] += 1 assert stats2 == expected_stats visit_status = assert_last_visit_matches( self.loader.storage, self.repo_url, status="full", type="git" ) assert visit_status.snapshot is not None merge_snapshot_id = visit_status.snapshot assert merge_snapshot_id != snapshot_id merge_snapshot = snapshot_get_all_branches( self.loader.storage, merge_snapshot_id ) merge_branches = merge_snapshot.branches assert merge_branches[b"HEAD"] == SnapshotBranch( target=b"refs/heads/master", target_type=TargetType.ALIAS, ) assert merge_branches[b"refs/heads/master"] == SnapshotBranch( target=hash_to_bytes(merge_commit.decode()), target_type=TargetType.REVISION, ) def test_load_filter_branches(self): filtered_branches = {b"refs/pull/42/merge"} unfiltered_branches = {b"refs/pull/42/head"} # Add branches to the repository on disk; some should be filtered by # the loader, some should not. for branch_name in filtered_branches | unfiltered_branches: self.repo[branch_name] = self.repo[b"refs/heads/master"] # Generate the expected snapshot from SNAPSHOT1 (which is the original # state of the git repo)... branches = dict(SNAPSHOT1.branches) # ... and the unfiltered_branches, which are all pointing to the same # commit as "refs/heads/master". 
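# (The split relies on utils.ignore_branch_name(), used by get_snapshot(),
# which drops GitHub-style auto-merge refs such as refs/pull/42/merge while
# keeping refs/pull/42/head as a regular branch.)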
for branch_name in unfiltered_branches: branches[branch_name] = branches[b"refs/heads/master"] expected_snapshot = Snapshot(branches=branches) # Load the modified repository res = self.loader.load() assert res == {"status": "eventful"} check_snapshot(expected_snapshot, self.loader.storage) assert_last_visit_matches( self.loader.storage, self.repo_url, status="full", type="git", snapshot=expected_snapshot.id, ) def test_load_dangling_symref(self): with open(os.path.join(self.destination_path, ".git/HEAD"), "wb") as f: f.write(b"ref: refs/heads/dangling-branch\n") res = self.loader.load() assert res == {"status": "eventful"} visit_status = assert_last_visit_matches( self.loader.storage, self.repo_url, status="full", type="git" ) snapshot_id = visit_status.snapshot assert snapshot_id is not None snapshot = snapshot_get_all_branches(self.loader.storage, snapshot_id) branches = snapshot.branches assert branches[b"HEAD"] == SnapshotBranch( target=b"refs/heads/dangling-branch", target_type=TargetType.ALIAS, ) assert branches[b"refs/heads/dangling-branch"] is None stats = get_stats(self.loader.storage) assert stats == { "content": 4, "directory": 7, "origin": 1, "origin_visit": 1, "release": 0, "revision": 7, "skipped_content": 0, "snapshot": 1, } def test_load_empty_tree(self): empty_dir_id = "4b825dc642cb6eb9a060e54bf8d69288fbee4904" # Check the empty tree does not already exist for some reason # (it would make this test pointless) assert list( self.loader.storage.directory_missing([hash_to_bytes(empty_dir_id)]) ) == [hash_to_bytes(empty_dir_id)] empty_tree = dulwich.objects.Tree() assert empty_tree.id.decode() == empty_dir_id self.repo.object_store.add_object(empty_tree) self.repo.do_commit(b"remove all bugs\n", tree=empty_tree.id) res = self.loader.load() assert res == {"status": "eventful"} assert ( list(self.loader.storage.directory_missing([hash_to_bytes(empty_dir_id)])) == [] ) results = self.loader.storage.directory_get_entries(hash_to_bytes(empty_dir_id)) assert results.next_page_token is None assert results.results == [] + def test_load_tag(self): + with open(os.path.join(self.destination_path, "hello.py"), "a") as fd: + fd.write("print('Hello world')\n") + + self.repo.stage([b"hello.py"]) + new_revision = self.repo.do_commit(b"Hello world\n") + + dulwich.porcelain.tag_create( + self.repo, + b"v1.0.0", + message=b"First release!", + annotated=True, + objectish=new_revision, + ) + + res = self.loader.load() + assert res == {"status": "eventful"} + + branches = self.loader.storage.snapshot_get_branches(self.loader.snapshot.id) + + branch = branches["branches"][b"refs/tags/v1.0.0"] + assert branch.target_type == TargetType.RELEASE + + release = self.loader.storage.release_get([branch.target])[0] + assert release.date is not None + assert release.author is not None + assert release == Release( + name=b"v1.0.0", + message=b"First release!\n", + target_type=ObjectType.REVISION, + target=bytehex_to_hash(new_revision), + author=release.author, + date=release.date, + synthetic=False, + ) + + def test_load_tag_minimal(self): + with open(os.path.join(self.destination_path, "hello.py"), "a") as fd: + fd.write("print('Hello world')\n") + + self.repo.stage([b"hello.py"]) + new_revision = self.repo.do_commit(b"Hello world\n") + + # dulwich.porcelain.tag_create doesn't allow creating tags without + # a tagger or a date, so we have to create it "manually" + tag = dulwich.objects.Tag() + tag.message = b"First release!\n" + tag.name = b"v1.0.0" + tag.object = (dulwich.objects.Commit, new_revision) + 
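# Annotated tags without a tagger or a date are accepted by git itself (they
# occur in old repositories); the relaxed _check_tag() in from_disk.py exists
# so such tags pass validation, ending up as a Release with author=None and
# date=None.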
self.repo.object_store.add_object(tag) + self.repo[b"refs/tags/v1.0.0"] = tag.id + + res = self.loader.load() + assert res == {"status": "eventful"} + + branches = self.loader.storage.snapshot_get_branches(self.loader.snapshot.id) + + print(list(branches["branches"])) + branch = branches["branches"][b"refs/tags/v1.0.0"] + assert branch.target_type == TargetType.RELEASE + + release = self.loader.storage.release_get([branch.target])[0] + assert release == Release( + id=bytehex_to_hash(tag.id), + name=b"v1.0.0", + message=b"First release!\n", + target_type=ObjectType.REVISION, + target=bytehex_to_hash(new_revision), + synthetic=False, + ) + -class GitLoaderFromDiskTest(TestCase, FullGitLoaderTests): +class TestGitLoaderFromDisk(FullGitLoaderTests): """Prepare a git directory repository to be loaded through a GitLoaderFromDisk. This tests all git loader scenario. """ @pytest.fixture(autouse=True) def init(self, swh_storage, datadir, tmp_path): archive_name = "testrepo" archive_path = os.path.join(datadir, f"{archive_name}.tgz") tmp_path = str(tmp_path) self.repo_url = prepare_repository_from_archive( archive_path, archive_name, tmp_path=tmp_path ) self.destination_path = os.path.join(tmp_path, archive_name) self.loader = GitLoaderFromDisk( swh_storage, url=self.repo_url, visit_date=datetime.datetime( 2016, 5, 3, 15, 16, 32, tzinfo=datetime.timezone.utc ), directory=self.destination_path, ) self.repo = dulwich.repo.Repo(self.destination_path) -class GitLoaderFromArchiveTest(TestCase, CommonGitLoaderTests): +class TestGitLoaderFromArchive(CommonGitLoaderTests): """Tests for GitLoaderFromArchive. Only tests common scenario.""" @pytest.fixture(autouse=True) def init(self, swh_storage, datadir, tmp_path): archive_name = "testrepo" archive_path = os.path.join(datadir, f"{archive_name}.tgz") self.repo_url = archive_path self.loader = GitLoaderFromArchive( swh_storage, url=self.repo_url, archive_path=archive_path, visit_date=datetime.datetime( 2016, 5, 3, 15, 16, 32, tzinfo=datetime.timezone.utc ), ) diff --git a/swh/loader/git/tests/test_loader.py b/swh/loader/git/tests/test_loader.py index b0b0881..f18d9c2 100644 --- a/swh/loader/git/tests/test_loader.py +++ b/swh/loader/git/tests/test_loader.py @@ -1,124 +1,119 @@ # Copyright (C) 2018-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os -from unittest import TestCase from dulwich.errors import GitProtocolError, NotGitRepository, ObjectFormatException import dulwich.repo import pytest from swh.loader.git.loader import GitLoader from swh.loader.git.tests.test_from_disk import FullGitLoaderTests from swh.loader.tests import assert_last_visit_matches, prepare_repository_from_archive class CommonGitLoaderNotFound: @pytest.fixture(autouse=True) def __inject_fixtures(self, mocker): """Inject required fixtures in unittest.TestCase class """ self.mocker = mocker - def test_load_visit_not_found(self): - """Ingesting an unknown url result in a visit with not_found status - - """ - for failure_exception in [ + @pytest.mark.parametrize( + "failure_exception", + [ GitProtocolError("Repository unavailable"), # e.g DMCA takedown GitProtocolError("Repository not found"), GitProtocolError("unexpected http resp 401"), NotGitRepository("not a git repo"), - ]: - with self.subTest(failure_exception=failure_exception): - # simulate an initial communication error (e.g no repository found, 
...) - mock = self.mocker.patch( - "swh.loader.git.loader.GitLoader.fetch_pack_from_origin" - ) - mock.side_effect = failure_exception - - res = self.loader.load() - assert res == {"status": "uneventful"} - - assert_last_visit_matches( - self.loader.storage, - self.repo_url, - status="not_found", - type="git", - snapshot=None, - ) - - def test_load_visit_failure(self): + ], + ) + def test_load_visit_not_found(self, failure_exception): + """Ingesting an unknown url result in a visit with not_found status + + """ + # simulate an initial communication error (e.g no repository found, ...) + mock = self.mocker.patch( + "swh.loader.git.loader.GitLoader.fetch_pack_from_origin" + ) + mock.side_effect = failure_exception + + res = self.loader.load() + assert res == {"status": "uneventful"} + + assert_last_visit_matches( + self.loader.storage, + self.repo_url, + status="not_found", + type="git", + snapshot=None, + ) + + @pytest.mark.parametrize( + "failure_exception", + [IOError, ObjectFormatException, OSError, ValueError, GitProtocolError,], + ) + def test_load_visit_failure(self, failure_exception): """Failing during the fetch pack step result in failing visit """ - for failure_exception in [ - IOError, - ObjectFormatException, - OSError, - ValueError, - GitProtocolError, - ]: - with self.subTest(failure_exception=failure_exception): - # simulate a fetch communication error after the initial connection - # server error (e.g IOError, ObjectFormatException, ...) - mock = self.mocker.patch( - "swh.loader.git.loader.GitLoader.fetch_pack_from_origin" - ) - - mock.side_effect = failure_exception("failure") - - res = self.loader.load() - assert res == {"status": "failed"} - - assert_last_visit_matches( - self.loader.storage, - self.repo_url, - status="failed", - type="git", - snapshot=None, - ) - - -class GitLoaderTest(TestCase, FullGitLoaderTests, CommonGitLoaderNotFound): + # simulate a fetch communication error after the initial connection + # server error (e.g IOError, ObjectFormatException, ...) + mock = self.mocker.patch( + "swh.loader.git.loader.GitLoader.fetch_pack_from_origin" + ) + + mock.side_effect = failure_exception("failure") + + res = self.loader.load() + assert res == {"status": "failed"} + + assert_last_visit_matches( + self.loader.storage, + self.repo_url, + status="failed", + type="git", + snapshot=None, + ) + + +class TestGitLoader(FullGitLoaderTests, CommonGitLoaderNotFound): """Prepare a git directory repository to be loaded through a GitLoader. This tests all git loader scenario. """ @pytest.fixture(autouse=True) def init(self, swh_storage, datadir, tmp_path): - super().setUp() archive_name = "testrepo" archive_path = os.path.join(datadir, f"{archive_name}.tgz") tmp_path = str(tmp_path) self.repo_url = prepare_repository_from_archive( archive_path, archive_name, tmp_path=tmp_path ) self.destination_path = os.path.join(tmp_path, archive_name) self.loader = GitLoader(swh_storage, self.repo_url) self.repo = dulwich.repo.Repo(self.destination_path) -class GitLoader2Test(TestCase, FullGitLoaderTests, CommonGitLoaderNotFound): +class TestGitLoader2(FullGitLoaderTests, CommonGitLoaderNotFound): """Mostly the same loading scenario but with a base-url different than the repo-url. To walk slightly different paths, the end result should stay the same. 
""" @pytest.fixture(autouse=True) def init(self, swh_storage, datadir, tmp_path): - super().setUp() archive_name = "testrepo" archive_path = os.path.join(datadir, f"{archive_name}.tgz") tmp_path = str(tmp_path) self.repo_url = prepare_repository_from_archive( archive_path, archive_name, tmp_path=tmp_path ) self.destination_path = os.path.join(tmp_path, archive_name) base_url = f"base://{self.repo_url}" self.loader = GitLoader(swh_storage, self.repo_url, base_url=base_url) self.repo = dulwich.repo.Repo(self.destination_path) diff --git a/swh/loader/git/tests/test_utils.py b/swh/loader/git/tests/test_utils.py index a994eb6..7b1acb1 100644 --- a/swh/loader/git/tests/test_utils.py +++ b/swh/loader/git/tests/test_utils.py @@ -1,30 +1,30 @@ # Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import unittest +import pytest from swh.loader.git import utils -class TestUtils(unittest.TestCase): +class TestUtils: def test_check_date_time(self): """A long as datetime is fine, date time check does not raise """ for e in range(32, 37): ts = 2 ** e utils.check_date_time(ts) def test_check_date_time_empty_value(self): - self.assertIsNone(utils.check_date_time(None)) + assert utils.check_date_time(None) is None def test_check_date_time_raises(self): """From a give threshold, check will no longer works. """ exp = 38 timestamp = 2 ** exp - with self.assertRaisesRegex(ValueError, "is out of range"): + with pytest.raises(ValueError, match=".*is out of range.*"): utils.check_date_time(timestamp)