Changeset View
Changeset View
Standalone View
Standalone View
swh/storage/migrate_extrinsic_metadata.py
Property | Old Value | New Value |
---|---|---|
File Mode | 100644 | 100755 |
Show All 32 Lines | |||||
from urllib.parse import unquote, urlparse | from urllib.parse import unquote, urlparse | ||||
from urllib.request import urlopen | from urllib.request import urlopen | ||||
import iso8601 | import iso8601 | ||||
import psycopg2 | import psycopg2 | ||||
from swh.core.db import BaseDb | from swh.core.db import BaseDb | ||||
from swh.model.hashutil import hash_to_hex | from swh.model.hashutil import hash_to_hex | ||||
from swh.model.identifiers import SWHID, parse_swhid | |||||
from swh.model.model import ( | from swh.model.model import ( | ||||
MetadataAuthority, | MetadataAuthority, | ||||
MetadataAuthorityType, | MetadataAuthorityType, | ||||
MetadataFetcher, | MetadataFetcher, | ||||
MetadataTargetType, | |||||
RawExtrinsicMetadata, | RawExtrinsicMetadata, | ||||
Sha1Git, | Sha1Git, | ||||
) | ) | ||||
from swh.model.swhid import SWHID, SWHIDObjectType, parse_swhid | |||||
from swh.storage import get_storage | from swh.storage import get_storage | ||||
from swh.storage.algos.origin import iter_origin_visit_statuses, iter_origin_visits | from swh.storage.algos.origin import iter_origin_visit_statuses, iter_origin_visits | ||||
from swh.storage.algos.snapshot import snapshot_get_all_branches | from swh.storage.algos.snapshot import snapshot_get_all_branches | ||||
# XML namespaces and fields for metadata coming from the deposit: | # XML namespaces and fields for metadata coming from the deposit: | ||||
CODEMETA_NS = "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0" | CODEMETA_NS = "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0" | ||||
ATOM_NS = "http://www.w3.org/2005/Atom" | ATOM_NS = "http://www.w3.org/2005/Atom" | ||||
▲ Show 20 Lines • Show All 349 Lines • ▼ Show 20 Lines | def load_metadata( | ||||
discovery_date: datetime.datetime, | discovery_date: datetime.datetime, | ||||
metadata: Dict[str, Any], | metadata: Dict[str, Any], | ||||
format: str, | format: str, | ||||
authority: MetadataAuthority, | authority: MetadataAuthority, | ||||
origin: Optional[str], | origin: Optional[str], | ||||
dry_run: bool, | dry_run: bool, | ||||
): | ): | ||||
"""Does the actual loading to swh-storage.""" | """Does the actual loading to swh-storage.""" | ||||
origin_swhid: Optional[SWHID] | |||||
if origin is not None: | |||||
origin_swhid = SWHID( | |||||
object_type=SWHIDObjectType.ORIGIN, object_id=origin.encode() | |||||
) | |||||
else: | |||||
origin_swhid = None | |||||
directory_swhid = SWHID( | directory_swhid = SWHID( | ||||
object_type="directory", object_id=hash_to_hex(directory_id) | object_type=SWHIDObjectType.DIRECTORY, object_id=hash_to_hex(directory_id) | ||||
) | |||||
revision_swhid = SWHID( | |||||
object_type=SWHIDObjectType.REVISION, object_id=hash_to_hex(revision_id) | |||||
) | ) | ||||
revision_swhid = SWHID(object_type="revision", object_id=hash_to_hex(revision_id)) | |||||
obj = RawExtrinsicMetadata( | obj = RawExtrinsicMetadata( | ||||
type=MetadataTargetType.DIRECTORY, | |||||
target=directory_swhid, | target=directory_swhid, | ||||
discovery_date=discovery_date, | discovery_date=discovery_date, | ||||
authority=authority, | authority=authority, | ||||
fetcher=FETCHER, | fetcher=FETCHER, | ||||
format=format, | format=format, | ||||
metadata=json.dumps(metadata).encode(), | metadata=json.dumps(metadata).encode(), | ||||
origin=origin, | origin=origin_swhid, | ||||
revision=revision_swhid, | revision=revision_swhid, | ||||
) | ) | ||||
if not dry_run: | if not dry_run: | ||||
storage.raw_extrinsic_metadata_add([obj]) | storage.raw_extrinsic_metadata_add([obj]) | ||||
def handle_deposit_row( | def handle_deposit_row( | ||||
row, | row, | ||||
▲ Show 20 Lines • Show All 75 Lines • ▼ Show 20 Lines | for deposit_request_row in deposit_cur: | ||||
# this field | # this field | ||||
assert "id" in metadata or "title" in metadata | assert "id" in metadata or "title" in metadata | ||||
assert "codemeta:author" in metadata | assert "codemeta:author" in metadata | ||||
format = NEW_DEPOSIT_FORMAT | format = NEW_DEPOSIT_FORMAT | ||||
metadata_entries.append((date, format, metadata)) | metadata_entries.append((date, format, metadata)) | ||||
if discovery_date is None: | if discovery_date is None: | ||||
discovery_date = max(dates) | discovery_date = max(dates) | ||||
# Sanity checks to make sure deposit requests are consistent with each other | # Sanity checks to make sure deposit requests are consistent with each other | ||||
assert len(metadata_entries) >= 1, deposit_id | assert len(metadata_entries) >= 1, deposit_id | ||||
assert len(provider_urls) == 1, f"expected 1 provider url, got {provider_urls}" | assert len(provider_urls) == 1, f"expected 1 provider url, got {provider_urls}" | ||||
(provider_url,) = provider_urls | (provider_url,) = provider_urls | ||||
assert len(swhids) == 1 | assert len(swhids) == 1 | ||||
(swhid,) = swhids | (swhid,) = swhids | ||||
assert ( | assert ( | ||||
len(external_identifiers) == 1 | len(external_identifiers) == 1 | ||||
▲ Show 20 Lines • Show All 673 Lines • Show Last 20 Lines |