Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7066559
D2145.id7259.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
7 KB
Subscribers
None
D2145.id7259.diff
View Options
diff --git a/swh/loader/package/tar.py b/swh/loader/package/tar.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/package/tar.py
@@ -0,0 +1,101 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import iso8601
+import logging
+
+from os import path
+from typing import Dict, Generator, Optional, Mapping, Sequence, Tuple
+
+from swh.loader.package.loader import PackageLoader
+from swh.model.identifiers import normalize_timestamp
+
+
+# Module-level logger, named after this module.
+logger = logging.getLogger(__name__)
+
+
+# Message carried by the revisions this module builds.
+REVISION_MESSAGE = b'swh-loader-package: synthetic revision message'
+
+
+# Identity used as both author and committer of those revisions.
+SWH_PERSON = {
+    'name': b'Software Heritage',
+    'fullname': b'Software Heritage',
+    'email': b'robot@softwareheritage.org',
+}
+
+
+class TarLoader(PackageLoader):
+    """Loader for an origin described by a list of dated, versioned archives.
+
+    Package descriptions are kept sorted on their ``date`` value, and
+    ``build_revision`` describes the revision created for one archive.
+
+    """
+    visit_type = 'tar'
+
+    # Keys of a package description that identify one artifact. They are
+    # the keys this class reads from a description; comparing on keys the
+    # descriptions do not carry would compare only None values and match
+    # any known artifact.
+    _artifact_identity_keys = ('url', 'version', 'date')
+
+    def __init__(self, url: str, packages: Sequence[Mapping[str, str]]):
+        """Loader constructor.
+
+        Args:
+            url: Origin url
+
+            packages: List of dict with keys:
+                - url: the url to retrieve one versioned archive
+                - version: the version of that archive
+                - date (isoformat date string)
+                - sha256: integrity hash (not read by this class)
+
+        """
+        super().__init__(url=url)
+        # Sorted on date, so the last entry is the default release.
+        # NOTE(review): this compares the raw date strings, which is only
+        # chronological if all dates use the same UTC offset -- confirm.
+        self.packages = sorted(packages, key=lambda v: v['date'])
+
+    def get_versions(self) -> Sequence[str]:
+        """Return the non-empty versions of the packages, in date order."""
+        return [p['version'] for p in self.packages if p.get('version')]
+
+    def get_default_release(self) -> str:
+        """Return the version of the last package in date order.
+
+        Raises:
+            IndexError: when the loader was built with no package
+            KeyError: when that last package has no ``version`` key
+
+        """
+        # It's the most recent, so for this loader, it's the last one
+        return self.packages[-1]['version']
+
+    def get_artifacts(self, version: str) -> Generator[
+            Tuple[str, str, Dict], None, None]:
+        """Yield ``(filename, url, metadata)`` for each package of a version.
+
+        Packages without a ``version`` key are skipped, as they are in
+        ``get_versions``.
+
+        """
+        for a_metadata in self.packages:
+            if a_metadata.get('version') != version:
+                continue
+            url = a_metadata['url']
+            # The filename is the last path component of the archive url.
+            yield path.basename(url), url, a_metadata
+
+    def resolve_revision_from(
+            self, known_artifacts: Dict, artifact_metadata: Dict) \
+            -> Optional[bytes]:
+        """Return the id of a known revision built from the same artifact.
+
+        Args:
+            known_artifacts: maps a revision id to that revision's
+                metadata; the package description is read from its
+                ``['extrinsic']['raw']`` entry
+            artifact_metadata: package description of the artifact
+
+        Returns:
+            The id of the first known revision whose description has the
+            same url, version and date, or None when there is none.
+
+        """
+        def pk(d):
+            return [d.get(k) for k in self._artifact_identity_keys]
+
+        artifact_pk = pk(artifact_metadata)
+        if all(value is None for value in artifact_pk):
+            # Nothing to compare on: refuse to match rather than equate
+            # two descriptions that merely lack the same keys.
+            return None
+
+        for rev_id, known_artifact in known_artifacts.items():
+            # Use the module logger rather than the root logger.
+            logger.debug('known_artifact: %s', known_artifact)
+            if artifact_pk == pk(known_artifact['extrinsic']['raw']):
+                return rev_id
+        return None
+
+    def build_revision(
+            self, a_metadata: Dict, a_uncompressed_path: str) -> Dict:
+        """Build the revision dict for one archive.
+
+        Args:
+            a_metadata: package description; its ``date`` is parsed and
+                used as both the revision date and the committer date
+            a_uncompressed_path: not read by this implementation
+
+        Returns:
+            A revision dict with no parents, whose extrinsic metadata
+            holds the origin url, the visit date and ``a_metadata``.
+
+        """
+        normalized_date = normalize_timestamp(
+            iso8601.parse_date(a_metadata['date']))
+        return {
+            'message': REVISION_MESSAGE,
+            'date': normalized_date,
+            'author': SWH_PERSON,
+            'committer': SWH_PERSON,
+            'committer_date': normalized_date,
+            'parents': [],
+            'metadata': {
+                'intrinsic': {},
+                'extrinsic': {
+                    'provider': self.url,
+                    'when': self.visit_date.isoformat(),
+                    'raw': a_metadata,
+                },
+            },
+        }
diff --git a/swh/loader/package/tests/test_gnu.py b/swh/loader/package/tests/test_gnu.py
--- a/swh/loader/package/tests/test_gnu.py
+++ b/swh/loader/package/tests/test_gnu.py
@@ -4,7 +4,6 @@
# See top-level LICENSE file for more information
import os
-import re
from swh.model.hashutil import hash_to_bytes
@@ -134,16 +133,15 @@
_expected_new_snapshot_first_visit_id = 'c419397fd912039825ebdbea378bc6283f006bf5' # noqa
-def test_visit_with_no_artifact_found(swh_config, requests_mock):
+def test_visit_with_no_artifact_found(swh_config, requests_mock_datadir):
package_url = 'https://ftp.gnu.org/gnu/8sync/'
tarballs = [{
'time': '944729610',
- 'archive': 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz',
+ 'archive': 'https://ftp.gnu.org/gnu/8sync/8sync-unknown-0.1.0.tar.gz',
'length': 221837,
}]
loader = GNULoader(package_url, tarballs)
- requests_mock.get(re.compile('https://'), status_code=404)
actual_load_status = loader.load()
assert actual_load_status['status'] == 'uneventful'
diff --git a/swh/loader/package/tests/test_tar.py b/swh/loader/package/tests/test_tar.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/package/tests/test_tar.py
@@ -0,0 +1,109 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from typing import Tuple
+
+from swh.core.pytest_plugin import requests_mock_datadir_factory
+from swh.loader.package.tar import TarLoader
+
+from swh.loader.package.tests.common import check_snapshot
+
+
+# Url of the archive; the tests also pass it as the origin url.
+URL = 'https://deposit.softwareheritage.org/hello/2.10.orig.tar.gz'
+
+
+# The single package description handed to TarLoader by the tests.
+PACKAGES = [
+    dict(version='2.10', url=URL, date='2014-10-19T16:52:35+02:00'),
+]
+
+
+def integrity_to_hash(integrity_value: str) -> Tuple[str, str]:
+    """Split a ``<name>-<base64 digest>`` value into (name, hex digest).
+
+    Raises:
+        ValueError: when the value does not hold exactly one ``-``
+
+    """
+    from base64 import b64decode
+
+    algorithm, encoded_digest = integrity_value.split('-')
+    return algorithm, b64decode(encoded_digest).hex()
+
+
+def test_integrity_to_hash():
+    """A sha256 integrity value is split into its name and hex digest."""
+    expected_hex = '31e066137a962676e89f69d1b65382de95a7ef7d914b8cb956f41ea72e0f516b'  # noqa
+
+    name, digest = integrity_to_hash(
+        'sha256-MeBmE3qWJnbon2nRtlOC3pWn732RS4y5VvQepy4PUWs=')
+
+    assert (name, digest) == ('sha256', expected_hex)
+
+
+# Request-mock fixture built with the archive url in its ignore list.
+_IGNORED_URLS = [URL]
+requests_mock_datadir_missing = requests_mock_datadir_factory(
+    ignore_urls=_IGNORED_URLS)
+
+
+def test_tar_visit_with_no_artifact_found(
+        swh_config, requests_mock_datadir_missing):
+    """An uneventful load stores no content or revision; visit is partial."""
+    expected_stats = {
+        'content': 0,
+        'directory': 0,
+        'origin': 1,
+        'origin_visit': 1,
+        'person': 0,
+        'release': 0,
+        'revision': 0,
+        'skipped_content': 0,
+        'snapshot': 1,
+    }
+
+    loader = TarLoader(url=URL, packages=PACKAGES)
+    load_status = loader.load()
+    assert load_status['status'] == 'uneventful'
+
+    stats = loader.storage.stat_counters()
+    assert expected_stats == stats
+
+    visit = next(loader.storage.origin_visit_get(URL))
+    assert visit['status'] == 'partial'
+
+
+def test_tar_visit_with_artifact_found(swh_config, requests_mock_datadir):
+    """An eventful load stores one revision and the expected snapshot."""
+    expected_stats = {
+        'content': 303,
+        'directory': 12,
+        'origin': 1,
+        'origin_visit': 1,
+        'person': 1,
+        'release': 0,
+        'revision': 1,
+        'skipped_content': 0,
+        'snapshot': 1,
+    }
+    # HEAD is an alias of the single release branch.
+    release_branch = 'releases/2.10'
+    expected_snapshot = {
+        'id': 'e759f554b660f03ebaf2e5bf62c34e0fe0ee5748',
+        'branches': {
+            'HEAD': {
+                'target_type': 'alias',
+                'target': release_branch,
+            },
+            release_branch: {
+                'target_type': 'revision',
+                'target': '326260b671e595403f03b9e673af519e23011c52',
+            },
+        },
+    }
+
+    loader = TarLoader(url=URL, packages=PACKAGES)
+    load_status = loader.load()
+    assert load_status['status'] == 'eventful'
+
+    stats = loader.storage.stat_counters()
+    assert expected_stats == stats
+
+    visit = next(loader.storage.origin_visit_get(URL))
+    assert visit['status'] == 'full'
+
+    check_snapshot(expected_snapshot, loader.storage)
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Nov 5 2024, 3:06 PM (12 w, 4 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3226657
Attached To
D2145: package.tar: Add a generic archive loader implementation (merge with gnu's)
Event Timeline
Log In to Comment