diff --git a/PKG-INFO b/PKG-INFO index ff967a3..a1441e2 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,71 +1,71 @@ Metadata-Version: 2.1 Name: swh.indexer -Version: 0.6.4 +Version: 0.7.0 Summary: Software Heritage Content Indexer Home-page: https://forge.softwareheritage.org/diffusion/78/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-indexer Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-indexer/ Description: swh-indexer ============ Tools to compute multiple indexes on SWH's raw contents: - content: - mimetype - ctags - language - fossology-license - metadata - revision: - metadata An indexer is in charge of: - looking up objects - extracting information from those objects - store those information in the swh-indexer db There are multiple indexers working on different object types: - content indexer: works with content sha1 hashes - revision indexer: works with revision sha1 hashes - origin indexer: works with origin identifiers Indexation procedure: - receive batch of ids - retrieve the associated data depending on object type - compute for that object some index - store the result to swh's storage Current content indexers: - mimetype (queue swh_indexer_content_mimetype): detect the encoding and mimetype - language (queue swh_indexer_content_language): detect the programming language - ctags (queue swh_indexer_content_ctags): compute tags information - fossology-license (queue swh_indexer_fossology_license): compute the license - metadata: translate file into translated_metadata dict Current revision indexers: - metadata: detects files containing metadata and retrieves translated_metadata in content_metadata table in storage or run content indexer to translate files. Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing diff --git a/requirements-swh.txt b/requirements-swh.txt index 854fbce..4f72e9b 100644 --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,6 +1,6 @@ swh.core[db,http] >= 0.9.1 swh.model >= 0.0.15 swh.objstorage >= 0.2.2 swh.scheduler >= 0.5.2 -swh.storage >= 0.12.0 +swh.storage >= 0.22.0 swh.journal >= 0.1.0 diff --git a/swh.indexer.egg-info/PKG-INFO b/swh.indexer.egg-info/PKG-INFO index ff967a3..a1441e2 100644 --- a/swh.indexer.egg-info/PKG-INFO +++ b/swh.indexer.egg-info/PKG-INFO @@ -1,71 +1,71 @@ Metadata-Version: 2.1 Name: swh.indexer -Version: 0.6.4 +Version: 0.7.0 Summary: Software Heritage Content Indexer Home-page: https://forge.softwareheritage.org/diffusion/78/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-indexer Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-indexer/ Description: swh-indexer ============ Tools to compute multiple indexes on SWH's raw contents: - content: - mimetype - ctags - language - fossology-license - metadata - revision: - metadata An indexer is in charge of: - looking up objects - extracting information from those objects - store those information in the swh-indexer db There are multiple indexers working on different object types: - content indexer: works with content sha1 hashes - revision indexer: works with revision sha1 hashes - origin indexer: works with origin identifiers Indexation procedure: - receive batch of ids - retrieve the associated data depending on object type - compute for that object some index - store the result to swh's storage Current content indexers: - mimetype (queue swh_indexer_content_mimetype): detect the encoding and mimetype - language (queue swh_indexer_content_language): detect the programming language - ctags (queue swh_indexer_content_ctags): compute tags information - fossology-license (queue swh_indexer_fossology_license): compute the license - metadata: translate file into translated_metadata dict Current revision indexers: - metadata: detects files containing metadata and retrieves translated_metadata in content_metadata table in storage or run content indexer to translate files. Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing diff --git a/swh.indexer.egg-info/requires.txt b/swh.indexer.egg-info/requires.txt index c834180..2f9c7ae 100644 --- a/swh.indexer.egg-info/requires.txt +++ b/swh.indexer.egg-info/requires.txt @@ -1,19 +1,19 @@ click python-magic>=0.4.13 pyld xmltodict typing-extensions swh.core[db,http]>=0.9.1 swh.model>=0.0.15 swh.objstorage>=0.2.2 swh.scheduler>=0.5.2 -swh.storage>=0.12.0 +swh.storage>=0.22.0 swh.journal>=0.1.0 [testing] confluent-kafka pytest pytest-mock hypothesis>=3.11.0 swh.scheduler[testing]>=0.5.0 swh.storage[testing]>=0.10.0 diff --git a/swh/indexer/origin_head.py b/swh/indexer/origin_head.py index d1c42b2..61c495a 100644 --- a/swh/indexer/origin_head.py +++ b/swh/indexer/origin_head.py @@ -1,155 +1,154 @@ # Copyright (C) 2018-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging import re from typing import Any, Dict, List, Tuple, Union import click from swh.indexer.indexer import OriginIndexer from swh.model.model import SnapshotBranch, TargetType from swh.storage.algos.origin import origin_get_latest_visit_status from swh.storage.algos.snapshot import snapshot_get_all_branches class OriginHeadIndexer(OriginIndexer[Dict]): """Origin-level indexer. This indexer is in charge of looking up the revision that acts as the "head" of an origin. In git, this is usually the commit pointed to by the 'master' branch.""" USE_TOOLS = False def persist_index_computations(self, results: Any) -> Dict[str, int]: """Do nothing. The indexer's results are not persistent, they should only be piped to another indexer.""" return {} # Dispatch def index(self, id: str, data: None = None, **kwargs) -> List[Dict]: origin_url = id - visit_and_status = origin_get_latest_visit_status( + visit_status = origin_get_latest_visit_status( self.storage, origin_url, allowed_statuses=["full"], require_snapshot=True ) - if not visit_and_status: + if not visit_status: return [] - visit, visit_status = visit_and_status assert visit_status.snapshot is not None snapshot = snapshot_get_all_branches(self.storage, visit_status.snapshot) if snapshot is None: return [] method = getattr( - self, "_try_get_%s_head" % visit.type, self._try_get_head_generic + self, "_try_get_%s_head" % visit_status.type, self._try_get_head_generic ) rev_id = method(snapshot.branches) if rev_id is not None: return [{"origin_url": origin_url, "revision_id": rev_id,}] # could not find a head revision return [] # Tarballs _archive_filename_re = re.compile( rb"^" rb"(?P.*)[-_]" rb"(?P[0-9]+(\.[0-9])*)" rb"(?P[-+][a-zA-Z0-9.~]+?)?" rb"(?P(\.[a-zA-Z0-9]+)+)" rb"$" ) @classmethod def _parse_version(cls: Any, filename: bytes) -> Tuple[Union[float, int], ...]: """Extracts the release version from an archive filename, to get an ordering whose maximum is likely to be the last version of the software >>> OriginHeadIndexer._parse_version(b'foo') (-inf,) >>> OriginHeadIndexer._parse_version(b'foo.tar.gz') (-inf,) >>> OriginHeadIndexer._parse_version(b'gnu-hello-0.0.1.tar.gz') (0, 0, 1, 0) >>> OriginHeadIndexer._parse_version(b'gnu-hello-0.0.1-beta2.tar.gz') (0, 0, 1, -1, 'beta2') >>> OriginHeadIndexer._parse_version(b'gnu-hello-0.0.1+foobar.tar.gz') (0, 0, 1, 1, 'foobar') """ res = cls._archive_filename_re.match(filename) if res is None: return (float("-infinity"),) version = [int(n) for n in res.group("version").decode().split(".")] if res.group("preversion") is None: version.append(0) else: preversion = res.group("preversion").decode() if preversion.startswith("-"): version.append(-1) version.append(preversion[1:]) elif preversion.startswith("+"): version.append(1) version.append(preversion[1:]) else: assert False, res.group("preversion") return tuple(version) def _try_get_ftp_head(self, branches: Dict[bytes, SnapshotBranch]) -> Any: archive_names = list(branches) max_archive_name = max(archive_names, key=self._parse_version) r = self._try_resolve_target(branches, max_archive_name) return r # Generic def _try_get_head_generic(self, branches: Dict[bytes, SnapshotBranch]) -> Any: # Works on 'deposit', 'pypi', and VCSs. return self._try_resolve_target(branches, b"HEAD") or self._try_resolve_target( branches, b"master" ) def _try_resolve_target( self, branches: Dict[bytes, SnapshotBranch], branch_name: bytes ) -> Any: try: branch = branches[branch_name] if branch is None: return None while branch.target_type == TargetType.ALIAS: branch = branches[branch.target] if branch is None: return None if branch.target_type == TargetType.REVISION: return branch.target elif branch.target_type == TargetType.CONTENT: return None # TODO elif branch.target_type == TargetType.DIRECTORY: return None # TODO elif branch.target_type == TargetType.RELEASE: return None # TODO else: assert False, branch except KeyError: return None @click.command() @click.option( "--origins", "-i", help='Origins to lookup, in the "type+url" format', multiple=True ) def main(origins: List[str]) -> None: rev_metadata_indexer = OriginHeadIndexer() rev_metadata_indexer.run(origins) if __name__ == "__main__": logging.basicConfig(level=logging.INFO) main()