diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py --- a/swh/indexer/metadata.py +++ b/swh/indexer/metadata.py @@ -32,10 +32,13 @@ OriginIntrinsicMetadataRow, ) from swh.model import hashutil -from swh.model.model import Directory, Origin, Sha1Git -from swh.model.swhids import ObjectType +from swh.model.model import Directory +from swh.model.model import ObjectType as ModelObjectType +from swh.model.model import Origin, Sha1Git +from swh.model.swhids import CoreSWHID, ObjectType REVISION_GET_BATCH_SIZE = 10 +RELEASE_GET_BATCH_SIZE = 10 ORIGIN_GET_BATCH_SIZE = 10 @@ -329,7 +332,8 @@ self, origins: List[Origin], check_origin_known: bool = True, **kwargs ) -> List[Tuple[OriginIntrinsicMetadataRow, DirectoryIntrinsicMetadataRow]]: head_rev_ids = [] - origins_with_head = [] + head_rel_ids = [] + origin_heads: Dict[Origin, CoreSWHID] = {} # Filter out origins not in the storage if check_origin_known: @@ -348,25 +352,63 @@ continue head_swhid = get_head_swhid(self.storage, origin.url) if head_swhid: - # TODO: add support for releases - assert head_swhid.object_type == ObjectType.REVISION, head_swhid - origins_with_head.append(origin) - head_rev_ids.append(head_swhid.object_id) - - head_revs = list( - call_with_batches( - self.storage.revision_get, head_rev_ids, REVISION_GET_BATCH_SIZE + origin_heads[origin] = head_swhid + if head_swhid.object_type == ObjectType.REVISION: + head_rev_ids.append(head_swhid.object_id) + elif head_swhid.object_type == ObjectType.RELEASE: + head_rel_ids.append(head_swhid.object_id) + else: + assert False, head_swhid + + head_revs = dict( + zip( + head_rev_ids, + call_with_batches( + self.storage.revision_get, head_rev_ids, REVISION_GET_BATCH_SIZE + ), + ) + ) + head_rels = dict( + zip( + head_rel_ids, + call_with_batches( + self.storage.release_get, head_rel_ids, RELEASE_GET_BATCH_SIZE + ), ) ) - assert len(head_revs) == len(head_rev_ids) results = [] - for (origin, rev) in zip(origins_with_head, head_revs): - if not rev: - 
self.log.warning("Missing head revision of origin %r", origin.url) - continue + for (origin, head_swhid) in origin_heads.items(): + if head_swhid.object_type == ObjectType.REVISION: + rev = head_revs[head_swhid.object_id] + if not rev: + self.log.warning( + "Missing head object %s of origin %r", head_swhid, origin.url + ) + continue + directory_id = rev.directory + elif head_swhid.object_type == ObjectType.RELEASE: + rel = head_rels[head_swhid.object_id] + if not rel: + self.log.warning( + "Missing head object %s of origin %r", head_swhid, origin.url + ) + continue + if rel.target_type != ModelObjectType.DIRECTORY: + # TODO: support releases whose target is not a directory + self.log.warning( + "Head release %s of %r has unexpected target type %s", + head_swhid, + origin.url, + rel.target_type, + ) + continue + assert rel.target, rel + directory_id = rel.target + else: + assert False, head_swhid - for dir_metadata in self.directory_metadata_indexer.index(rev.directory): + for dir_metadata in self.directory_metadata_indexer.index(directory_id): # There is at most one dir_metadata orig_metadata = OriginIntrinsicMetadataRow( from_directory=dir_metadata.id, diff --git a/swh/indexer/origin_head.py b/swh/indexer/origin_head.py --- a/swh/indexer/origin_head.py +++ b/swh/indexer/origin_head.py @@ -113,7 +113,7 @@ elif branch.target_type == TargetType.DIRECTORY: return None # TODO elif branch.target_type == TargetType.RELEASE: - return None # TODO + return CoreSWHID(object_type=ObjectType.RELEASE, object_id=branch.target) else: assert False, branch except KeyError: diff --git a/swh/indexer/tests/test_origin_head.py b/swh/indexer/tests/test_origin_head.py --- a/swh/indexer/tests/test_origin_head.py +++ b/swh/indexer/tests/test_origin_head.py @@ -139,11 +139,16 @@ def test_pypi(storage): - origin_url = "https://pypi.org/project/limnoria/" + origin_url = "https://old-pypi.example.org/project/limnoria/" assert get_head_swhid(storage, origin_url) == CoreSWHID.from_string( "swh:1:rev:83b9b6c705b125d0fe6dd86b41109dc5fa32f874" ) + 
origin_url = "https://pypi.org/project/limnoria/" + assert get_head_swhid(storage, origin_url) == CoreSWHID.from_string( + "swh:1:rel:83b9b6c705b125d0fe6dd86b41109dc5fa32f874" + ) + def test_svn(storage): origin_url = "http://0-512-md.googlecode.com/svn/" diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py --- a/swh/indexer/tests/test_origin_metadata.py +++ b/swh/indexer/tests/test_origin_metadata.py @@ -29,7 +29,47 @@ return cfg -def test_origin_metadata_indexer( +def test_origin_metadata_indexer_release( + swh_indexer_config, + idx_storage: IndexerStorageInterface, + storage: StorageInterface, + obj_storage, +) -> None: + indexer = OriginMetadataIndexer(config=swh_indexer_config) + origin = "https://npm.example.org/yarn-parser" + indexer.run([origin]) + + tool = swh_indexer_config["tools"] + + dir_id = DIRECTORY2.id + dir_metadata = DirectoryIntrinsicMetadataRow( + id=dir_id, + tool=tool, + metadata=YARN_PARSER_METADATA, + mappings=["npm"], + ) + origin_metadata = OriginIntrinsicMetadataRow( + id=origin, + tool=tool, + from_directory=dir_id, + metadata=YARN_PARSER_METADATA, + mappings=["npm"], + ) + + dir_results = list(idx_storage.directory_intrinsic_metadata_get([dir_id])) + for dir_result in dir_results: + assert dir_result.tool + del dir_result.tool["id"] + assert dir_results == [dir_metadata] + + orig_results = list(idx_storage.origin_intrinsic_metadata_get([origin])) + for orig_result in orig_results: + assert orig_result.tool + del orig_result.tool["id"] + assert orig_results == [origin_metadata] + + +def test_origin_metadata_indexer_revision( swh_indexer_config, idx_storage: IndexerStorageInterface, storage: StorageInterface, diff --git a/swh/indexer/tests/utils.py b/swh/indexer/tests/utils.py --- a/swh/indexer/tests/utils.py +++ b/swh/indexer/tests/utils.py @@ -19,10 +19,12 @@ Content, Directory, DirectoryEntry, + ObjectType, Origin, OriginVisit, OriginVisitStatus, Person, + Release, Revision, RevisionType, 
Snapshot, @@ -46,10 +48,15 @@ "type": "deposit", "origin": "https://forge.softwareheritage.org/source/jesuisgpl/", }, - {"type": "pypi", "origin": "https://pypi.org/project/limnoria/"}, + { + "type": "pypi", + "origin": "https://old-pypi.example.org/project/limnoria/", + }, # with rev head + {"type": "pypi", "origin": "https://pypi.org/project/limnoria/"}, # with rel head {"type": "svn", "origin": "http://0-512-md.googlecode.com/svn/"}, {"type": "git", "origin": "https://github.com/librariesio/yarn-parser"}, {"type": "git", "origin": "https://github.com/librariesio/yarn-parser.git"}, + {"type": "git", "origin": "https://npm.example.org/yarn-parser"}, ] ORIGINS = [Origin(url=visit["origin"]) for visit in ORIGIN_VISITS] @@ -120,7 +127,26 @@ REVISIONS = [REVISION] +RELEASE = Release( + name=b"v0.0.0", + message=None, + author=Person( + name=b"Andrew Nesbitt", + fullname=b"Andrew Nesbitt ", + email=b"andrewnez@gmail.com", + ), + synthetic=False, + date=TimestampWithTimezone.from_datetime( + datetime.datetime(2017, 2, 20, 16, 14, 16, tzinfo=_utc_plus_2) + ), + target_type=ObjectType.DIRECTORY, + target=DIRECTORY2.id, +) + +RELEASES = [RELEASE] + SNAPSHOTS = [ + # https://github.com/SoftwareHeritage/swh-storage Snapshot( id=hash_to_bytes("a50fde72265343b7d28cecf6db20d98a81d21965"), branches={ @@ -141,6 +167,7 @@ ), }, ), + # rsync://ftp.gnu.org/gnu/3dldf Snapshot( id=hash_to_bytes("2c67f69a416bca4e1f3fcd848c588fab88ad0642"), branches={ @@ -166,6 +193,7 @@ ), }, ), + # https://forge.softwareheritage.org/source/jesuisgpl/ Snapshot( id=hash_to_bytes("68c0d26104d47e278dd6be07ed61fafb561d0d20"), branches={ @@ -175,6 +203,7 @@ ) }, ), + # https://old-pypi.example.org/project/limnoria/ Snapshot( id=hash_to_bytes("f255245269e15fc99d284affd79f766668de0b67"), branches={ @@ -191,6 +220,23 @@ ), }, ), + # https://pypi.org/project/limnoria/ + Snapshot( + branches={ + b"HEAD": SnapshotBranch( + target=b"releases/2018.09.09", target_type=TargetType.ALIAS + ), + 
b"releases/2018.09.01": SnapshotBranch( + target=b"<\xee1(\xe8\x8d_\xc1\xc9\xa6rT\xf1\x1d\xbb\xdfF\xfdw\xcf", + target_type=TargetType.RELEASE, + ), + b"releases/2018.09.09": SnapshotBranch( + target=b"\x83\xb9\xb6\xc7\x05\xb1%\xd0\xfem\xd8kA\x10\x9d\xc5\xfa2\xf8t", # noqa + target_type=TargetType.RELEASE, + ), + }, + ), + # http://0-512-md.googlecode.com/svn/ Snapshot( id=hash_to_bytes("a1a28c0ab387a8f9e0618cb705eab81fc448f473"), branches={ @@ -200,6 +246,7 @@ ) }, ), + # https://github.com/librariesio/yarn-parser Snapshot( id=hash_to_bytes("bb4fd3a836930ce629d912864319637040ff3040"), branches={ @@ -209,6 +256,7 @@ ) }, ), + # https://github.com/librariesio/yarn-parser.git Snapshot( id=hash_to_bytes("bb4fd3a836930ce629d912864319637040ff3040"), branches={ @@ -218,8 +266,19 @@ ) }, ), + # https://npm.example.org/yarn-parser + Snapshot( + branches={ + b"HEAD": SnapshotBranch( + target=RELEASE.id, + target_type=TargetType.RELEASE, + ) + }, + ), ] +assert len(SNAPSHOTS) == len(ORIGIN_VISITS) + SHA1_TO_LICENSES = { "01c9379dfc33803963d07c1ccc748d3fe4c96bb5": ["GPL"], @@ -562,6 +621,7 @@ storage.origin_add(ORIGINS) storage.directory_add([DIRECTORY, DIRECTORY2]) storage.revision_add(REVISIONS) + storage.release_add(RELEASES) storage.snapshot_add(SNAPSHOTS) for visit, snapshot in zip(ORIGIN_VISITS, SNAPSHOTS):