diff --git a/swh/loader/package/deposit/loader.py b/swh/loader/package/deposit/loader.py --- a/swh/loader/package/deposit/loader.py +++ b/swh/loader/package/deposit/loader.py @@ -89,7 +89,7 @@ committer=parse_author(depo["committer"]), revision_parents=tuple(hash_to_bytes(p) for p in depo["revision_parents"]), raw_info=raw_info, - revision_extrinsic_metadata=[ + directory_extrinsic_metadata=[ RawExtrinsicMetadataCore( discovery_date=now(), metadata=raw_metadata.encode(), diff --git a/swh/loader/package/deposit/tests/test_deposit.py b/swh/loader/package/deposit/tests/test_deposit.py --- a/swh/loader/package/deposit/tests/test_deposit.py +++ b/swh/loader/package/deposit/tests/test_deposit.py @@ -209,17 +209,18 @@ assert orig_meta0.authority == authority assert orig_meta0.fetcher == fetcher - # Check revision metadata - revision_swhid = SWHID(object_type="revision", object_id=revision_id) - actual_rev_meta = loader.storage.raw_extrinsic_metadata_get( - MetadataTargetType.REVISION, revision_swhid, authority + # Check directory metadata + directory_id = hash_to_hex(revision.directory) + directory_swhid = SWHID(object_type="directory", object_id=directory_id) + actual_dir_meta = loader.storage.raw_extrinsic_metadata_get( + MetadataTargetType.DIRECTORY, directory_swhid, authority ) - assert actual_rev_meta.next_page_token is None - assert len(actual_rev_meta.results) == len(all_metadata_raw) - for rev_meta in actual_rev_meta.results: - assert rev_meta.authority == authority - assert rev_meta.fetcher == fetcher - assert rev_meta.metadata.decode() in all_metadata_raw + assert actual_dir_meta.next_page_token is None + assert len(actual_dir_meta.results) == len(all_metadata_raw) + for dir_meta in actual_dir_meta.results: + assert dir_meta.authority == authority + assert dir_meta.fetcher == fetcher + assert dir_meta.metadata.decode() in all_metadata_raw # Retrieve the information for deposit status update query to the deposit urls = [ @@ -374,38 +375,41 @@ assert orig_meta in expected_metadata # Check the revision metadata swh side - revision_swhid = SWHID(object_type="revision", object_id=revision_id) - actual_revision_metadata = loader.storage.raw_extrinsic_metadata_get( - MetadataTargetType.REVISION, revision_swhid, authority + directory_id = hash_to_hex(revision.directory) + directory_swhid = SWHID(object_type="directory", object_id=directory_id) + actual_directory_metadata = loader.storage.raw_extrinsic_metadata_get( + MetadataTargetType.DIRECTORY, directory_swhid, authority ) - assert actual_revision_metadata.next_page_token is None - assert len(actual_revision_metadata.results) == len(all_metadata_raw) + assert actual_directory_metadata.next_page_token is None + assert len(actual_directory_metadata.results) == len(all_metadata_raw) - rev_metadata_template = RawExtrinsicMetadata( - type=MetadataTargetType.REVISION, - id=revision_swhid, + revision_swhid = SWHID(object_type="revision", object_id=revision_id) + dir_metadata_template = RawExtrinsicMetadata( + type=MetadataTargetType.DIRECTORY, + id=directory_swhid, format="sword-v2-atom-codemeta-v2", authority=authority, fetcher=fetcher, origin=url, + revision=revision_swhid, # to satisfy the constructor discovery_date=now(), metadata=b"", ) - expected_revision_metadata = [] + expected_directory_metadata = [] for idx, raw_meta in enumerate(all_metadata_raw): - rev_metadata = actual_revision_metadata.results[idx] - expected_revision_metadata.append( + dir_metadata = actual_directory_metadata.results[idx] + expected_directory_metadata.append( attr.evolve( - rev_metadata_template, - discovery_date=rev_metadata.discovery_date, + dir_metadata_template, + discovery_date=dir_metadata.discovery_date, metadata=raw_meta.encode(), ) ) - assert actual_revision_metadata.results == expected_revision_metadata + assert actual_directory_metadata.results == expected_directory_metadata # Retrieve the information for deposit status update query to the deposit urls = [ diff --git a/swh/loader/package/loader.py b/swh/loader/package/loader.py --- a/swh/loader/package/loader.py +++ b/swh/loader/package/loader.py @@ -100,7 +100,7 @@ # go after attributes with default values. # See - revision_extrinsic_metadata = attr.ib( + directory_extrinsic_metadata = attr.ib( type=List[RawExtrinsicMetadataCore], default=[], kw_only=True, ) @@ -421,9 +421,14 @@ revision_id = self.resolve_revision_from(known_artifacts, p_info) if revision_id is None: try: - revision_id = self._load_revision(p_info, origin) - if revision_id: - self._load_extrinsic_revision_metadata(p_info, revision_id) + res = self._load_revision(p_info, origin) + if res: + (revision_id, directory_id) = res + assert revision_id + assert directory_id + self._load_extrinsic_directory_metadata( + p_info, revision_id, directory_id + ) self.storage.flush() status_load = "eventful" except Exception as e: @@ -516,13 +521,15 @@ return (uncompressed_path, directory) - def _load_revision(self, p_info: TPackageInfo, origin) -> Optional[Sha1Git]: + def _load_revision( + self, p_info: TPackageInfo, origin + ) -> Optional[Tuple[Sha1Git, Sha1Git]]: """Does all the loading of a revision itself: * downloads a package and uncompresses it * loads it from disk * adds contents, directories, and revision to self.storage - * returns (revision_id, loaded) + * returns (revision_id, directory_id) Raises exception when unable to download or uncompress artifacts @@ -571,7 +578,8 @@ logger.debug("Revision: %s", revision) self.storage.revision_add([revision]) - return revision.id + assert directory.hash + return (revision.id, directory.hash) def _load_snapshot( self, @@ -725,10 +733,10 @@ return metadata_objects - def build_extrinsic_revision_metadata( - self, p_info: TPackageInfo, revision_id: Sha1Git + def build_extrinsic_directory_metadata( + self, p_info: TPackageInfo, revision_id: Sha1Git, directory_id: Sha1Git, ) -> List[RawExtrinsicMetadata]: - if not p_info.revision_extrinsic_metadata: + if not p_info.directory_extrinsic_metadata: # If this package loader doesn't write metadata, no need to require # an implementation for get_metadata_authority. return [] @@ -738,26 +746,31 @@ metadata_objects = [] - for item in p_info.revision_extrinsic_metadata: + for item in p_info.directory_extrinsic_metadata: metadata_objects.append( RawExtrinsicMetadata( - type=MetadataTargetType.REVISION, - id=SWHID(object_type="revision", object_id=revision_id), + type=MetadataTargetType.DIRECTORY, + id=SWHID(object_type="directory", object_id=directory_id), discovery_date=item.discovery_date or self.visit_date, authority=authority, fetcher=fetcher, format=item.format, metadata=item.metadata, origin=self.url, + revision=SWHID( + object_type="revision", object_id=hash_to_hex(revision_id) + ), ) ) return metadata_objects - def _load_extrinsic_revision_metadata( - self, p_info: TPackageInfo, revision_id: Sha1Git + def _load_extrinsic_directory_metadata( + self, p_info: TPackageInfo, revision_id: Sha1Git, directory_id: Sha1Git, ) -> None: - metadata_objects = self.build_extrinsic_revision_metadata(p_info, revision_id) + metadata_objects = self.build_extrinsic_directory_metadata( + p_info, revision_id, directory_id + ) self._load_metadata_objects(metadata_objects) def _load_metadata_objects( diff --git a/swh/loader/package/npm/loader.py b/swh/loader/package/npm/loader.py --- a/swh/loader/package/npm/loader.py +++ b/swh/loader/package/npm/loader.py @@ -70,7 +70,7 @@ shasum=package_metadata["dist"]["shasum"], version=extrinsic_version, raw_info=package_metadata, - revision_extrinsic_metadata=[ + directory_extrinsic_metadata=[ RawExtrinsicMetadataCore( format="replicate-npm-package-json", metadata=json.dumps(package_metadata).encode(), diff --git a/swh/loader/package/npm/tests/test_npm.py b/swh/loader/package/npm/tests/test_npm.py --- a/swh/loader/package/npm/tests/test_npm.py +++ b/swh/loader/package/npm/tests/test_npm.py @@ -405,12 +405,15 @@ type=MetadataAuthorityType.FORGE, url="https://npmjs.com/", ) - for (version_name, version_id) in versions: - revision_swhid = SWHID(object_type="revision", object_id=version_id,) + for (version_name, revision_id) in versions: + revision = loader.storage.revision_get([hash_to_bytes(revision_id)])[0] + directory_id = revision.directory + directory_swhid = SWHID(object_type="directory", object_id=directory_id,) + revision_swhid = SWHID(object_type="revision", object_id=revision_id,) expected_metadata = [ RawExtrinsicMetadata( - type=MetadataTargetType.REVISION, - id=revision_swhid, + type=MetadataTargetType.DIRECTORY, + id=directory_swhid, authority=metadata_authority, fetcher=MetadataFetcher( name="swh.loader.package.npm.loader.NpmLoader", version=__version__, @@ -421,11 +424,12 @@ json.loads(org_api_info)["versions"][version_name] ).encode(), origin="https://www.npmjs.com/package/org", + revision=revision_swhid, ) ] assert loader.storage.raw_extrinsic_metadata_get( - type=MetadataTargetType.REVISION, - id=revision_swhid, + type=MetadataTargetType.DIRECTORY, + id=directory_swhid, authority=metadata_authority, ) == PagedResult(next_page_token=None, results=expected_metadata,) diff --git a/swh/loader/package/pypi/loader.py b/swh/loader/package/pypi/loader.py --- a/swh/loader/package/pypi/loader.py +++ b/swh/loader/package/pypi/loader.py @@ -48,7 +48,7 @@ comment_text=metadata.get("comment_text"), sha256=metadata["digests"]["sha256"], upload_time=metadata["upload_time"], - revision_extrinsic_metadata=[ + directory_extrinsic_metadata=[ RawExtrinsicMetadataCore( format="pypi-project-json", metadata=json.dumps(metadata).encode(), ) diff --git a/swh/loader/package/pypi/tests/test_pypi.py b/swh/loader/package/pypi/tests/test_pypi.py --- a/swh/loader/package/pypi/tests/test_pypi.py +++ b/swh/loader/package/pypi/tests/test_pypi.py @@ -338,13 +338,16 @@ revision_swhid = SWHID( object_type="revision", object_id=hash_to_hex(expected_revision_id) ) + directory_swhid = SWHID( + object_type="directory", object_id=hash_to_hex(revision.directory) + ) metadata_authority = MetadataAuthority( type=MetadataAuthorityType.FORGE, url="https://pypi.org/", ) expected_metadata = [ RawExtrinsicMetadata( - type=MetadataTargetType.REVISION, - id=revision_swhid, + type=MetadataTargetType.DIRECTORY, + id=directory_swhid, authority=metadata_authority, fetcher=MetadataFetcher( name="swh.loader.package.pypi.loader.PyPILoader", version=__version__, @@ -355,11 +358,12 @@ json.loads(_0805nexter_api_info)["releases"]["1.2.0"][0] ).encode(), origin=url, + revision=revision_swhid, ) ] assert loader.storage.raw_extrinsic_metadata_get( - type=MetadataTargetType.REVISION, - id=revision_swhid, + type=MetadataTargetType.DIRECTORY, + id=directory_swhid, authority=metadata_authority, ) == PagedResult(next_page_token=None, results=expected_metadata,) diff --git a/swh/loader/package/tests/test_loader_metadata.py b/swh/loader/package/tests/test_loader_metadata.py --- a/swh/loader/package/tests/test_loader_metadata.py +++ b/swh/loader/package/tests/test_loader_metadata.py @@ -39,6 +39,8 @@ REVISION_ID = hash_to_bytes("8ff44f081d43176474b267de5451f2c2e88089d0") REVISION_SWHID = SWHID(object_type="revision", object_id=REVISION_ID) +DIRECTORY_ID = hash_to_bytes("aa" * 20) +DIRECTORY_SWHID = SWHID(object_type="directory", object_id=DIRECTORY_ID) FETCHER = MetadataFetcher( @@ -48,26 +50,28 @@ DISCOVERY_DATE = datetime.datetime.now(tz=datetime.timezone.utc) -REVISION_METADATA = [ +DIRECTORY_METADATA = [ RawExtrinsicMetadata( - type=MetadataTargetType.REVISION, - id=REVISION_SWHID, + type=MetadataTargetType.DIRECTORY, + id=DIRECTORY_SWHID, discovery_date=DISCOVERY_DATE, authority=AUTHORITY, fetcher=FETCHER, format="test-format1", metadata=b"foo bar", origin=ORIGIN_URL, + revision=REVISION_SWHID, ), RawExtrinsicMetadata( - type=MetadataTargetType.REVISION, - id=REVISION_SWHID, + type=MetadataTargetType.DIRECTORY, + id=DIRECTORY_SWHID, discovery_date=DISCOVERY_DATE + datetime.timedelta(seconds=1), authority=AUTHORITY, fetcher=FETCHER, format="test-format2", metadata=b"bar baz", origin=ORIGIN_URL, + revision=REVISION_SWHID, ), ] @@ -90,7 +94,7 @@ def _load_directory(self, dl_artifacts, tmpdir): class directory: - hash = None + hash = DIRECTORY_ID return (None, directory) # just enough for _load_revision to work @@ -116,12 +120,12 @@ return attr.evolve(AUTHORITY, metadata={}) def get_package_info(self, version: str) -> Iterator[Tuple[str, BasePackageInfo]]: - m0 = REVISION_METADATA[0] - m1 = REVISION_METADATA[1] + m0 = DIRECTORY_METADATA[0] + m1 = DIRECTORY_METADATA[1] p_info = BasePackageInfo( url=ORIGIN_URL, filename="archive.tgz", - revision_extrinsic_metadata=[ + directory_extrinsic_metadata=[ RawExtrinsicMetadataCore(m0.format, m0.metadata, m0.discovery_date), RawExtrinsicMetadataCore(m1.format, m1.metadata, m1.discovery_date), ], @@ -180,10 +184,10 @@ } result = storage.raw_extrinsic_metadata_get( - MetadataTargetType.REVISION, REVISION_SWHID, AUTHORITY, + MetadataTargetType.DIRECTORY, DIRECTORY_SWHID, AUTHORITY, ) assert result.next_page_token is None - assert result.results == REVISION_METADATA + assert result.results == DIRECTORY_METADATA result = storage.raw_extrinsic_metadata_get( MetadataTargetType.ORIGIN, ORIGIN_URL, AUTHORITY, @@ -210,10 +214,10 @@ } result = storage.raw_extrinsic_metadata_get( - MetadataTargetType.REVISION, REVISION_SWHID, AUTHORITY, + MetadataTargetType.DIRECTORY, DIRECTORY_SWHID, AUTHORITY, ) assert result.next_page_token is None - assert result.results == REVISION_METADATA + assert result.results == DIRECTORY_METADATA assert caplog.text == "" @@ -234,9 +238,9 @@ } result = storage.raw_extrinsic_metadata_get( - MetadataTargetType.REVISION, REVISION_SWHID, AUTHORITY, + MetadataTargetType.DIRECTORY, DIRECTORY_SWHID, AUTHORITY, ) assert result.next_page_token is None - assert result.results == REVISION_METADATA + assert result.results == DIRECTORY_METADATA assert caplog.text == ""