diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py --- a/swh/indexer/metadata.py +++ b/swh/indexer/metadata.py @@ -24,7 +24,7 @@ from swh.indexer.indexer import ContentIndexer, OriginIndexer, RevisionIndexer from swh.indexer.metadata_detector import detect_metadata from swh.indexer.metadata_dictionary import MAPPINGS -from swh.indexer.origin_head import OriginHeadIndexer +from swh.indexer.origin_head import get_head_swhid from swh.indexer.storage import INDEXER_CFG_KEY, Sha1 from swh.indexer.storage.model import ( ContentMetadataRow, @@ -33,6 +33,7 @@ ) from swh.model import hashutil from swh.model.model import Origin, Revision, Sha1Git +from swh.model.swhids import ObjectType REVISION_GET_BATCH_SIZE = 10 ORIGIN_GET_BATCH_SIZE = 10 @@ -321,7 +322,6 @@ def __init__(self, config=None, **kwargs) -> None: super().__init__(config=config, **kwargs) - self.origin_head_indexer = OriginHeadIndexer(config=config) self.revision_metadata_indexer = RevisionMetadataIndexer(config=config) def index_list( @@ -345,11 +345,12 @@ for origin in known_origins: if origin is None: continue - head_results = self.origin_head_indexer.index(origin.url) - if head_results: - (head_result,) = head_results + head_swhid = get_head_swhid(self.storage, origin.url) + if head_swhid: + # TODO: add support for releases + assert head_swhid.object_type == ObjectType.REVISION, head_swhid origins_with_head.append(origin) - head_rev_ids.append(head_result["revision_id"]) + head_rev_ids.append(head_swhid.object_id) head_revs = list( call_with_batches( diff --git a/swh/indexer/origin_head.py b/swh/indexer/origin_head.py --- a/swh/indexer/origin_head.py +++ b/swh/indexer/origin_head.py @@ -1,159 +1,120 @@ -# Copyright (C) 2018-2020 The Software Heritage developers +# Copyright (C) 2018-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more 
information -import logging import re -from typing import Any, Dict, List, Tuple, Union +from typing import Dict, List, Optional, Tuple, Union -import click - -from swh.indexer.indexer import OriginIndexer from swh.model.model import SnapshotBranch, TargetType +from swh.model.swhids import CoreSWHID, ObjectType from swh.storage.algos.origin import origin_get_latest_visit_status from swh.storage.algos.snapshot import snapshot_get_all_branches -class OriginHeadIndexer(OriginIndexer[Dict]): - """Origin-level indexer. - - This indexer is in charge of looking up the revision that acts as the - "head" of an origin. - - In git, this is usually the commit pointed to by the 'master' branch.""" - - USE_TOOLS = False - - def persist_index_computations(self, results: Any) -> Dict[str, int]: - """Do nothing. The indexer's results are not persistent, they - should only be piped to another indexer.""" - return {} - - # Dispatch - - def index(self, id: str, data: None = None, **kwargs) -> List[Dict]: - origin_url = id - visit_status = origin_get_latest_visit_status( - self.storage, origin_url, allowed_statuses=["full"], require_snapshot=True - ) - if not visit_status: - return [] - assert visit_status.snapshot is not None - snapshot = snapshot_get_all_branches(self.storage, visit_status.snapshot) - if snapshot is None: - return [] - method = getattr( - self, "_try_get_%s_head" % visit_status.type, self._try_get_head_generic - ) - - rev_id = method(snapshot.branches) # type: ignore - if rev_id is not None: - return [ - { - "origin_url": origin_url, - "revision_id": rev_id, - } - ] - - # could not find a head revision - return [] - - # Tarballs - - _archive_filename_re = re.compile( - rb"^" - rb"(?P.*)[-_]" - rb"(?P[0-9]+(\.[0-9])*)" - rb"(?P[-+][a-zA-Z0-9.~]+?)?" 
def get_head_swhid(storage, origin_url: str) -> Optional["CoreSWHID"]:
    """Return the SWHID of the head of an origin, or None if it has none.

    Only the latest visit whose status is ``full`` and which has a snapshot
    is considered; the head is then looked up among that snapshot's branches.

    Args:
        storage: storage object passed to ``origin_get_latest_visit_status``
            and ``snapshot_get_all_branches``.
        origin_url: URL of the origin to look up.

    Returns:
        The SWHID found by the visit-type specific lookup, or None when there
        is no suitable visit, when the snapshot cannot be fetched, or when no
        head branch can be resolved.
    """
    visit_status = origin_get_latest_visit_status(
        storage, origin_url, allowed_statuses=["full"], require_snapshot=True
    )
    if not visit_status:
        return None
    assert visit_status.snapshot is not None
    snapshot = snapshot_get_all_branches(storage, visit_status.snapshot)
    if snapshot is None:
        return None

    branches = dict(snapshot.branches)
    if visit_status.type == "ftp":
        # "ftp" snapshots get their head from the branch whose name parses
        # to the highest version, not from a HEAD/master branch.
        return _try_get_ftp_head(branches)
    return _try_get_head_generic(branches)


# Matches archive file names such as ``gnu-hello-0.0.1-beta2.tar.gz``.
# Only the ``version`` and ``preversion`` groups are read by
# ``_parse_version``; the other two are named for readability.
_archive_filename_re = re.compile(
    rb"^"
    rb"(?P<pkgname>.*)[-_]"
    # Every dot-separated component may have several digits ("1.10.2").
    rb"(?P<version>[0-9]+(\.[0-9]+)*)"
    rb"(?P<preversion>[-+][a-zA-Z0-9.~]+?)?"
    rb"(?P<extension>(\.[a-zA-Z0-9]+)+)"
    rb"$"
)


def _parse_version(filename: bytes) -> Tuple[Union[float, int, str], ...]:
    """Extract the release version from an archive filename.

    The result is meant to be used as a sort key whose maximum is likely to
    be the last version of the software.

    Args:
        filename: name of the archive file, as bytes.

    Returns:
        ``(-inf,)`` when the name does not look like a versioned archive.
        Otherwise the numeric version components followed by ``0`` (no
        suffix), ``-1, suffix`` (``-suffix``, sorts before the plain
        release) or ``1, suffix`` (``+suffix``, sorts after it).

    NOTE(review): tuples carrying a suffix end with a ``str``; comparing e.g.
    ``(1, 1, 'foo')`` with ``(1, 1, 0)`` raises ``TypeError``.

    >>> _parse_version(b'foo')
    (-inf,)
    >>> _parse_version(b'foo.tar.gz')
    (-inf,)
    >>> _parse_version(b'gnu-hello-0.0.1.tar.gz')
    (0, 0, 1, 0)
    >>> _parse_version(b'gnu-hello-0.0.1-beta2.tar.gz')
    (0, 0, 1, -1, 'beta2')
    >>> _parse_version(b'gnu-hello-0.0.1+foobar.tar.gz')
    (0, 0, 1, 1, 'foobar')
    >>> _parse_version(b'gnu-hello-1.10.2.tar.gz')
    (1, 10, 2, 0)
    """
    res = _archive_filename_re.match(filename)
    if res is None:
        return (float("-infinity"),)
    version: List[Union[float, int, str]] = [
        int(n) for n in res.group("version").decode().split(".")
    ]
    preversion = res.group("preversion")
    if preversion is None:
        version.append(0)
    else:
        suffix = preversion.decode()
        # The regex only lets the suffix start with "-" or "+".
        version.append(-1 if suffix.startswith("-") else 1)
        version.append(suffix[1:])
    return tuple(version)
def _try_get_ftp_head(
    branches: Dict[bytes, Optional["SnapshotBranch"]]
) -> Optional["CoreSWHID"]:
    """Resolve the branch whose name parses to the highest version.

    Args:
        branches: snapshot branches, keyed by branch name; names are ranked
            with ``_parse_version``.

    Returns:
        The SWHID the selected branch resolves to, or None when there are no
        branches or the selected branch cannot be resolved.
    """
    if not branches:
        # max() raises ValueError on an empty iterable.
        return None
    max_archive_name = max(branches, key=_parse_version)
    return _try_resolve_target(branches, max_archive_name)


def _try_get_head_generic(
    branches: Dict[bytes, Optional["SnapshotBranch"]]
) -> Optional["CoreSWHID"]:
    """Resolve the ``HEAD`` branch, falling back to ``master``.

    Returns:
        The resolved SWHID, or None when neither branch can be resolved.
    """
    # Works on 'deposit', 'pypi', and VCSs.
    return _try_resolve_target(branches, b"HEAD") or _try_resolve_target(
        branches, b"master"
    )


def _try_resolve_target(
    branches: Dict[bytes, Optional["SnapshotBranch"]], branch_name: bytes
) -> Optional["CoreSWHID"]:
    """Follow aliases from ``branch_name`` and return the target's SWHID.

    Args:
        branches: snapshot branches, keyed by branch name; a value may be
            None, in which case the branch is treated as unresolvable.
        branch_name: name of the branch to start from.

    Returns:
        The SWHID of the revision the branch points to. None when the branch
        (or any alias on the way) is missing or None, when the aliases form a
        loop, or when the final target is a content, directory or release
        (not supported yet).

    Raises:
        AssertionError: if the final target has any other target type.
    """
    visited = set()
    name = branch_name
    while True:
        branch = branches.get(name)
        if branch is None:
            # Unknown name, or branch without a target.
            return None
        if branch.target_type != TargetType.ALIAS:
            break
        visited.add(name)
        name = branch.target
        if name in visited:
            # Alias loop: give up instead of spinning forever.
            return None

    if branch.target_type == TargetType.REVISION:
        return CoreSWHID(object_type=ObjectType.REVISION, object_id=branch.target)
    elif branch.target_type in (
        TargetType.CONTENT,
        TargetType.DIRECTORY,
        TargetType.RELEASE,
    ):
        return None  # TODO
    else:
        assert False, branch
+1,13 @@ -# Copyright (C) 2017-2020 The Software Heritage developers +# Copyright (C) 2017-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import copy from datetime import datetime, timezone import pytest -from swh.indexer.origin_head import OriginHeadIndexer +from swh.indexer.origin_head import get_head_swhid from swh.indexer.tests.utils import fill_storage from swh.model.model import ( Origin, @@ -18,37 +17,9 @@ SnapshotBranch, TargetType, ) +from swh.model.swhids import CoreSWHID from swh.storage.utils import now - -@pytest.fixture -def swh_indexer_config(swh_indexer_config): - config = copy.deepcopy(swh_indexer_config) - config.update( - { - "tools": { - "name": "origin-metadata", - "version": "0.0.1", - "configuration": {}, - }, - "tasks": { - "revision_intrinsic_metadata": None, - "origin_intrinsic_metadata": None, - }, - } - ) - return config - - -class OriginHeadTestIndexer(OriginHeadIndexer): - """Specific indexer whose configuration is enough to satisfy the - indexing tests. 
- """ - - def persist_index_computations(self, results): - self.results = results - - SAMPLE_SNAPSHOT = Snapshot( branches={ b"foo": None, @@ -61,32 +32,23 @@ @pytest.fixture -def indexer(swh_config): - indexer = OriginHeadTestIndexer() - indexer.catch_exceptions = False - fill_storage(indexer.storage) - return indexer +def storage(swh_storage): + fill_storage(swh_storage) + return swh_storage -def test_git(indexer): +def test_git(storage): origin_url = "https://github.com/SoftwareHeritage/swh-storage" - indexer.run([origin_url]) - rev_id = b"8K\x12\x00d\x03\xcc\xe4]bS\xe3\x8f{\xd7}\xac\xefrm" - assert indexer.results == ( - [ - { - "revision_id": rev_id, - "origin_url": origin_url, - } - ] + assert get_head_swhid(storage, origin_url) == CoreSWHID.from_string( + "swh:1:rev:384b12006403cce45d6253e38f7bd77dacef726d" ) -def test_git_partial_snapshot(indexer): +def test_git_partial_snapshot(storage): """Checks partial snapshots are ignored.""" origin_url = "https://github.com/SoftwareHeritage/swh-core" - indexer.storage.origin_add([Origin(url=origin_url)]) - visit = indexer.storage.origin_visit_add( + storage.origin_add([Origin(url=origin_url)]) + visit = storage.origin_visit_add( [ OriginVisit( origin=origin_url, @@ -95,7 +57,7 @@ ) ] )[0] - indexer.storage.snapshot_add([SAMPLE_SNAPSHOT]) + storage.snapshot_add([SAMPLE_SNAPSHOT]) visit_status = OriginVisitStatus( origin=origin_url, visit=visit.visit, @@ -103,28 +65,26 @@ status="partial", snapshot=SAMPLE_SNAPSHOT.id, ) - indexer.storage.origin_visit_status_add([visit_status]) - indexer.run([origin_url]) - assert indexer.results == [] + storage.origin_visit_status_add([visit_status]) + assert get_head_swhid(storage, origin_url) is None -def test_vcs_missing_snapshot(indexer): +def test_vcs_missing_snapshot(storage): origin_url = "https://github.com/SoftwareHeritage/swh-indexer" - indexer.storage.origin_add([Origin(url=origin_url)]) - indexer.run([origin_url]) - assert indexer.results == [] + 
storage.origin_add([Origin(url=origin_url)]) + assert get_head_swhid(storage, origin_url) is None -def test_pypi_missing_branch(indexer): +def test_pypi_missing_branch(storage): origin_url = "https://pypi.org/project/abcdef/" - indexer.storage.origin_add( + storage.origin_add( [ Origin( url=origin_url, ) ] ) - visit = indexer.storage.origin_visit_add( + visit = storage.origin_visit_add( [ OriginVisit( origin=origin_url, @@ -133,7 +93,7 @@ ) ] )[0] - indexer.storage.snapshot_add([SAMPLE_SNAPSHOT]) + storage.snapshot_add([SAMPLE_SNAPSHOT]) visit_status = OriginVisitStatus( origin=origin_url, visit=visit.visit, @@ -141,71 +101,52 @@ status="full", snapshot=SAMPLE_SNAPSHOT.id, ) - indexer.storage.origin_visit_status_add([visit_status]) - indexer.run(["https://pypi.org/project/abcdef/"]) - assert indexer.results == [] + storage.origin_visit_status_add([visit_status]) + assert get_head_swhid(storage, origin_url) is None -def test_ftp(indexer): +def test_ftp(storage): origin_url = "rsync://ftp.gnu.org/gnu/3dldf" - indexer.run([origin_url]) - rev_id = b"\x8e\xa9\x8e/\xea}\x9feF\xf4\x9f\xfd\xee\xcc\x1a\xb4`\x8c\x8by" - assert indexer.results == [ - { - "revision_id": rev_id, - "origin_url": origin_url, - } - ] + assert get_head_swhid(storage, origin_url) == CoreSWHID.from_string( + "swh:1:rev:8ea98e2fea7d9f6546f49ffdeecc1ab4608c8b79" + ) -def test_ftp_missing_snapshot(indexer): +def test_ftp_missing_snapshot(storage): origin_url = "rsync://ftp.gnu.org/gnu/foobar" - indexer.storage.origin_add([Origin(url=origin_url)]) - indexer.run([origin_url]) - assert indexer.results == [] + storage.origin_add([Origin(url=origin_url)]) + assert get_head_swhid(storage, origin_url) is None -def test_deposit(indexer): +def test_deposit(storage): origin_url = "https://forge.softwareheritage.org/source/jesuisgpl/" - indexer.storage.origin_add([Origin(url=origin_url)]) - indexer.run([origin_url]) - rev_id = b"\xe7n\xa4\x9c\x9f\xfb\xb7\xf76\x11\x08{\xa6\xe9\x99\xb1\x9e]q\xeb" - assert 
indexer.results == [ - { - "revision_id": rev_id, - "origin_url": origin_url, - } - ] - - -def test_deposit_missing_snapshot(indexer): + storage.origin_add([Origin(url=origin_url)]) + assert get_head_swhid(storage, origin_url) == CoreSWHID.from_string( + "swh:1:rev:e76ea49c9ffbb7f73611087ba6e999b19e5d71eb" + ) + + +def test_deposit_missing_snapshot(storage): origin_url = "https://forge.softwareheritage.org/source/foobar" - indexer.storage.origin_add( + storage.origin_add( [ Origin( url=origin_url, ) ] ) - indexer.run([origin_url]) - assert indexer.results == [] + assert get_head_swhid(storage, origin_url) is None -def test_pypi(indexer): +def test_pypi(storage): origin_url = "https://pypi.org/project/limnoria/" - indexer.run([origin_url]) - - rev_id = b"\x83\xb9\xb6\xc7\x05\xb1%\xd0\xfem\xd8kA\x10\x9d\xc5\xfa2\xf8t" - assert indexer.results == [{"revision_id": rev_id, "origin_url": origin_url}] + assert get_head_swhid(storage, origin_url) == CoreSWHID.from_string( + "swh:1:rev:83b9b6c705b125d0fe6dd86b41109dc5fa32f874" + ) -def test_svn(indexer): +def test_svn(storage): origin_url = "http://0-512-md.googlecode.com/svn/" - indexer.run([origin_url]) - rev_id = b"\xe4?r\xe1,\x88\xab\xec\xe7\x9a\x87\xb8\xc9\xad#.\x1bw=\x18" - assert indexer.results == [ - { - "revision_id": rev_id, - "origin_url": origin_url, - } - ] + assert get_head_swhid(storage, origin_url) == CoreSWHID.from_string( + "swh:1:rev:e43f72e12c88abece79a87b8c9ad232e1b773d18" + )