Page MenuHomeSoftware Heritage

D2145.id7259.diff
No OneTemporary

D2145.id7259.diff

diff --git a/swh/loader/package/tar.py b/swh/loader/package/tar.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/package/tar.py
@@ -0,0 +1,101 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import iso8601
+import logging
+
+from os import path
+from typing import Dict, Generator, Optional, Mapping, Sequence, Tuple
+
+from swh.loader.package.loader import PackageLoader
+from swh.model.identifiers import normalize_timestamp
+
+
+# Module-level logger, named after this module.
+logger = logging.getLogger(__name__)
+
+
+# Person recorded as both author and committer of the revisions built by
+# TarLoader.build_revision.
+SWH_PERSON = {
+ 'name': b'Software Heritage',
+ 'fullname': b'Software Heritage',
+ 'email': b'robot@softwareheritage.org'
+}
+
+
+# Message set on every revision built by TarLoader.build_revision.
+REVISION_MESSAGE = b'swh-loader-package: synthetic revision message'
+
+
+class TarLoader(PackageLoader):
+    """Package loader for an origin described by a list of tarballs.
+
+    The tarballs are given explicitly to the constructor, one description
+    per versioned archive.
+
+    """
+    visit_type = 'tar'
+
+    # Keys of a package description which, taken together, identify one
+    # artifact. They must be keys the descriptions actually carry: with
+    # absent keys every identity would be [None, None, None] and any
+    # artifact would wrongly match the first known revision.
+    _ARTIFACT_ID_KEYS = ('url', 'date', 'version')
+
+    def __init__(self, url: str, packages: Sequence[Mapping[str, str]]):
+        """Loader constructor.
+
+        Args:
+            url: Origin url
+
+            packages: List of dict with keys:
+               - url: the url to retrieve one versioned archive
+               - date: isoformat date string of the archive
+               - version: version of the archive
+               - sha256 (optional): integrity hash, not read by this class
+
+        """
+        super().__init__(url=url)
+        # Oldest first, so that the last package is the most recent one.
+        # NOTE(review): dates are compared as strings, which assumes they
+        # all share the same isoformat layout and UTC offset -- confirm.
+        self.packages = sorted(packages, key=lambda v: v['date'])
+
+    def get_versions(self) -> Sequence[str]:
+        """Return the known versions, ordered like self.packages.
+
+        Packages without a (non-empty) 'version' are skipped.
+
+        """
+        return [p['version'] for p in self.packages if p.get('version')]
+
+    def get_default_release(self) -> str:
+        """Return the version of the most recent package.
+
+        Raises:
+            IndexError: no package was provided
+            KeyError: the most recent package has no 'version'
+
+        """
+        # It's the most recent, so for this loader, it's the last one
+        return self.packages[-1]['version']
+
+    def get_artifacts(self, version: str) -> Generator[
+            Tuple[str, str, Dict], None, None]:
+        """Yield the artifacts of the packages matching `version`.
+
+        Args:
+            version: version to look for, as returned by get_versions
+
+        Yields:
+            (filename, url, package description) tuples, the filename being
+            the last path component of the url
+
+        """
+        for a_metadata in self.packages:
+            # Packages without a version are ignored (as in get_versions)
+            # instead of raising a KeyError for every requested version.
+            artifact_version = a_metadata.get('version')
+            if not artifact_version or artifact_version != version:
+                continue
+            url = a_metadata['url']
+            yield path.split(url)[-1], url, a_metadata
+
+    def resolve_revision_from(
+            self, known_artifacts: Dict, artifact_metadata: Dict) \
+            -> Optional[bytes]:
+        """Look for an already known revision matching an artifact.
+
+        Args:
+            known_artifacts: dict of revision id -> revision metadata, the
+                original package description being read from
+                ['extrinsic']['raw'] (where build_revision stores it)
+            artifact_metadata: package description of the artifact to load
+
+        Returns:
+            the id of the first known revision whose description has the
+            same url, date and version as the artifact, None otherwise
+
+        """
+        def pk(d):
+            return [d.get(k) for k in self._ARTIFACT_ID_KEYS]
+
+        artifact_pk = pk(artifact_metadata)
+        for rev_id, known_artifact in known_artifacts.items():
+            # Use the module logger rather than the root one
+            logger.debug('known_artifact: %s', known_artifact)
+            known_pk = pk(known_artifact['extrinsic']['raw'])
+            if artifact_pk == known_pk:
+                return rev_id
+        return None
+
+    def build_revision(
+            self, a_metadata: Dict, a_uncompressed_path: str) -> Dict:
+        """Build the synthetic revision of one artifact.
+
+        Args:
+            a_metadata: package description of the artifact; its 'date' is
+                used as both the author and committer date
+            a_uncompressed_path: not used by this loader
+
+        Returns:
+            the revision dict, authored and committed by SWH_PERSON, without
+            parents, and keeping the package description in its extrinsic
+            metadata
+
+        """
+        normalized_date = normalize_timestamp(
+            iso8601.parse_date(a_metadata['date']))
+        return {
+            'message': REVISION_MESSAGE,
+            'date': normalized_date,
+            'author': SWH_PERSON,
+            'committer': SWH_PERSON,
+            'committer_date': normalized_date,
+            'parents': [],
+            'metadata': {
+                'intrinsic': {},
+                'extrinsic': {
+                    'provider': self.url,
+                    'when': self.visit_date.isoformat(),
+                    'raw': a_metadata,
+                },
+            },
+        }
diff --git a/swh/loader/package/tests/test_gnu.py b/swh/loader/package/tests/test_gnu.py
--- a/swh/loader/package/tests/test_gnu.py
+++ b/swh/loader/package/tests/test_gnu.py
@@ -4,7 +4,6 @@
# See top-level LICENSE file for more information
import os
-import re
from swh.model.hashutil import hash_to_bytes
@@ -134,16 +133,15 @@
_expected_new_snapshot_first_visit_id = 'c419397fd912039825ebdbea378bc6283f006bf5' # noqa
-def test_visit_with_no_artifact_found(swh_config, requests_mock):
+def test_visit_with_no_artifact_found(swh_config, requests_mock_datadir):
package_url = 'https://ftp.gnu.org/gnu/8sync/'
tarballs = [{
'time': '944729610',
- 'archive': 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz',
+ 'archive': 'https://ftp.gnu.org/gnu/8sync/8sync-unknown-0.1.0.tar.gz',
'length': 221837,
}]
loader = GNULoader(package_url, tarballs)
- requests_mock.get(re.compile('https://'), status_code=404)
actual_load_status = loader.load()
assert actual_load_status['status'] == 'uneventful'
diff --git a/swh/loader/package/tests/test_tar.py b/swh/loader/package/tests/test_tar.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/package/tests/test_tar.py
@@ -0,0 +1,109 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from typing import Tuple
+
+from swh.core.pytest_plugin import requests_mock_datadir_factory
+from swh.loader.package.tar import TarLoader
+
+from swh.loader.package.tests.common import check_snapshot
+
+
+# Url used by the tests both as the origin url given to TarLoader and as
+# the url of the single archive to load.
+URL = 'https://deposit.softwareheritage.org/hello/2.10.orig.tar.gz'
+
+
+# Package descriptions given to TarLoader: a single archive, version 2.10.
+PACKAGES = [
+ {
+ 'version': '2.10',
+ 'url': URL,
+ 'date': '2014-10-19T16:52:35+02:00',
+ }
+]
+
+
+def integrity_to_hash(integrity_value: str) -> Tuple[str, str]:
+    """Split an integrity string into its hash name and hex digest.
+
+    Args:
+        integrity_value: '<hash name>-<base64 encoded digest>' string
+
+    Returns:
+        (hash name, hexadecimal digest) tuple
+
+    """
+    from base64 import b64decode
+
+    algo, encoded_digest = integrity_value.split('-')
+    return algo, b64decode(encoded_digest).hex()
+
+
+def test_integrity_to_hash():
+    """A sha256 integrity string is split into ('sha256', hex digest)."""
+    actual = integrity_to_hash(
+        'sha256-MeBmE3qWJnbon2nRtlOC3pWn732RS4y5VvQepy4PUWs=')
+
+    assert actual[0] == 'sha256'
+    assert actual[1] == '31e066137a962676e89f69d1b65382de95a7ef7d914b8cb956f41ea72e0f516b'  # noqa
+    assert len(actual) == 2
+
+
+# Fixture built by requests_mock_datadir_factory with URL listed in its
+# ignore_urls (presumably so that fetching the archive fails -- confirm
+# against swh.core.pytest_plugin).
+requests_mock_datadir_missing = requests_mock_datadir_factory(ignore_urls=[
+ URL
+])
+
+
+def test_tar_visit_with_no_artifact_found(
+        swh_config, requests_mock_datadir_missing):
+    """A visit for which no artifact is found is uneventful and partial.
+
+    Only the origin, its visit and one snapshot are expected in the storage
+    counters.
+
+    """
+    tar_loader = TarLoader(url=URL, packages=PACKAGES)
+
+    load_status = tar_loader.load()
+    assert load_status['status'] == 'uneventful'
+
+    counters = tar_loader.storage.stat_counters()
+    expected_counters = {
+        'content': 0,
+        'directory': 0,
+        'origin': 1,
+        'origin_visit': 1,
+        'person': 0,
+        'release': 0,
+        'revision': 0,
+        'skipped_content': 0,
+        'snapshot': 1,
+    }
+    assert expected_counters == counters
+
+    visit = next(tar_loader.storage.origin_visit_get(URL))
+    assert visit['status'] == 'partial'
+
+
+def test_tar_visit_with_artifact_found(swh_config, requests_mock_datadir):
+    """A visit whose artifact is found is eventful and full.
+
+    One revision is expected, targeted by the 'releases/2.10' branch of the
+    snapshot, 'HEAD' being an alias of that branch.
+
+    """
+    tar_loader = TarLoader(url=URL, packages=PACKAGES)
+
+    load_status = tar_loader.load()
+    assert load_status['status'] == 'eventful'
+
+    counters = tar_loader.storage.stat_counters()
+    expected_counters = {
+        'content': 303,
+        'directory': 12,
+        'origin': 1,
+        'origin_visit': 1,
+        'person': 1,
+        'release': 0,
+        'revision': 1,
+        'skipped_content': 0,
+        'snapshot': 1,
+    }
+    assert expected_counters == counters
+
+    visit = next(tar_loader.storage.origin_visit_get(URL))
+    assert visit['status'] == 'full'
+
+    head_branch = {
+        'target_type': 'alias',
+        'target': 'releases/2.10'
+    }
+    release_branch = {
+        'target_type': 'revision',
+        'target': '326260b671e595403f03b9e673af519e23011c52',
+    }
+    expected_snapshot = {
+        'id': 'e759f554b660f03ebaf2e5bf62c34e0fe0ee5748',
+        'branches': {
+            'HEAD': head_branch,
+            'releases/2.10': release_branch,
+        },
+    }
+
+    check_snapshot(expected_snapshot, tar_loader.storage)

File Metadata

Mime Type
text/plain
Expires
Nov 5 2024, 3:06 PM (12 w, 4 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3226657

Event Timeline