Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7163806
D2145.id7278.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
21 KB
Subscribers
None
D2145.id7278.diff
View Options
diff --git a/swh/loader/package/gnu.py b/swh/loader/package/gnu.py
deleted file mode 100644
--- a/swh/loader/package/gnu.py
+++ /dev/null
@@ -1,196 +0,0 @@
-# Copyright (C) 2019 The Software Heritage developers
-# See the AUTHORS file at the top-level directory of this distribution
-# License: GNU General Public License version 3, or any later version
-# See top-level LICENSE file for more information
-
-import logging
-import re
-
-from os import path
-
-from typing import Any, Dict, Generator, Mapping, Optional, Sequence, Tuple
-
-from swh.loader.package.loader import PackageLoader
-from swh.loader.package.utils import release_name
-
-from swh.model.identifiers import normalize_timestamp
-
-
-logger = logging.getLogger(__name__)
-
-
-# to recognize existing naming pattern
-extensions = [
- 'zip',
- 'tar',
- 'gz', 'tgz',
- 'bz2', 'bzip2',
- 'lzma', 'lz',
- 'xz',
- 'Z',
-]
-
-version_keywords = [
- 'cygwin_me',
- 'w32', 'win32', 'nt', 'cygwin', 'mingw',
- 'latest', 'alpha', 'beta',
- 'release', 'stable',
- 'hppa',
- 'solaris', 'sunos', 'sun4u', 'sparc', 'sun',
- 'aix', 'ibm', 'rs6000',
- 'i386', 'i686',
- 'linux', 'redhat', 'linuxlibc',
- 'mips',
- 'powerpc', 'macos', 'apple', 'darwin', 'macosx', 'powermacintosh',
- 'unknown',
- 'netbsd', 'freebsd',
- 'sgi', 'irix',
-]
-
-# Match a filename into components.
-#
-# We use Debian's release number heuristic: A release number starts
-# with a digit, and is followed by alphanumeric characters or any of
-# ., +, :, ~ and -
-#
-# We hardcode a list of possible extensions, as this release number
-# scheme would match them too... We match on any combination of those.
-#
-# Greedy matching is done right to left (we only match the extension
-# greedily with +, software_name and release_number are matched lazily
-# with +? and *?).
-
-pattern = r'''
-^
-(?:
- # We have a software name and a release number, separated with a
- # -, _ or dot.
- (?P<software_name1>.+?[-_.])
- (?P<release_number>(%(vkeywords)s|[0-9][0-9a-zA-Z_.+:~-]*?)+)
-|
- # We couldn't match a release number, put everything in the
- # software name.
- (?P<software_name2>.+?)
-)
-(?P<extension>(?:\.(?:%(extensions)s))+)
-$
-''' % {
- 'extensions': '|'.join(extensions),
- 'vkeywords': '|'.join('%s[-]?' % k for k in version_keywords),
-}
-
-
-def get_version(url: str) -> str:
- """Extract branch name from tarball url
-
- Args:
- url (str): Tarball URL
-
- Returns:
- byte: Branch name
-
- Example:
- For url = https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz
-
- >>> get_version(url)
- '0.2.0'
-
- """
- filename = path.split(url)[-1]
- m = re.match(pattern, filename,
- flags=re.VERBOSE | re.IGNORECASE)
- if m:
- d = m.groupdict()
- if d['software_name1'] and d['release_number']:
- return d['release_number']
- if d['software_name2']:
- return d['software_name2']
-
- return ''
-
-
-class GNULoader(PackageLoader):
- visit_type = 'gnu'
- SWH_PERSON = {
- 'name': b'Software Heritage',
- 'fullname': b'Software Heritage',
- 'email': b'robot@softwareheritage.org'
- }
- REVISION_MESSAGE = b'swh-loader-package: synthetic revision message'
-
- def __init__(self, package_url: str, tarballs: Sequence):
- """Loader constructor.
-
- For now, this is the lister's task output.
-
- Args:
- package_url: Origin url
-
- tarballs: List of dict with keys `date` (date) and `archive` (str)
- the url to retrieve one versioned archive
-
- """
- super().__init__(url=package_url)
- self.tarballs = list(sorted(tarballs, key=lambda v: v['time']))
-
- def get_versions(self) -> Sequence[str]:
- versions = []
- for archive in self.tarballs:
- v = get_version(archive['archive'])
- if v:
- versions.append(v)
- return versions
-
- def get_default_version(self) -> str:
- # It's the most recent, so for this loader, it's the last one
- return get_version(self.tarballs[-1]['archive'])
-
- def get_package_info(self, version: str) -> Generator[
- Tuple[str, Mapping[str, Any]], None, None]:
- for a_metadata in self.tarballs:
- url = a_metadata['archive']
- package_version = get_version(url)
- if version == package_version:
- p_info = {
- 'url': url,
- 'filename': path.split(url)[-1],
- 'raw': a_metadata,
- }
- # FIXME: this code assumes we have only 1 artifact per
- # versioned package
- yield release_name(version), p_info
-
- def resolve_revision_from(
- self, known_artifacts: Dict, artifact_metadata: Dict) \
- -> Optional[bytes]:
- def pk(d):
- return [d.get(k) for k in ['time', 'archive', 'length']]
-
- artifact_pk = pk(artifact_metadata)
- for rev_id, known_artifact in known_artifacts.items():
- logging.debug('known_artifact: %s', known_artifact)
- known_pk = pk(known_artifact['extrinsic']['raw'])
- if artifact_pk == known_pk:
- return rev_id
-
- def build_revision(
- self, a_metadata: Mapping[str, Any],
- uncompressed_path: str) -> Dict:
- normalized_date = normalize_timestamp(int(a_metadata['time']))
- return {
- 'type': 'tar',
- 'message': self.REVISION_MESSAGE,
- 'date': normalized_date,
- 'author': self.SWH_PERSON,
- 'committer': self.SWH_PERSON,
- 'committer_date': normalized_date,
- 'parents': [],
- 'metadata': {
- 'intrinsic': {},
- 'extrinsic': {
- 'provider': self.url,
- 'when': self.visit_date.isoformat(),
- 'raw': a_metadata,
- },
- },
- }
diff --git a/swh/loader/package/tar.py b/swh/loader/package/tar.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/package/tar.py
@@ -0,0 +1,132 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import iso8601
+import logging
+
+from os import path
+from typing import Any, Dict, Generator, Mapping, Optional, Sequence, Tuple
+
+from swh.loader.package.loader import PackageLoader
+from swh.loader.package.utils import release_name
+from swh.model.identifiers import normalize_timestamp
+
+
+logger = logging.getLogger(__name__)
+SWH_PERSON = {
+ 'name': b'Software Heritage',
+ 'fullname': b'Software Heritage',
+ 'email': b'robot@softwareheritage.org'
+}
+REVISION_MESSAGE = b'swh-loader-package: synthetic revision message'
+
+
+class ArchiveLoader(PackageLoader):
+ visit_type = 'tar'
+
+ def __init__(self, url: str, artifacts: Sequence[Mapping[str, Any]],
+ pk_artifact_keys: Optional[Sequence[str]] = None):
+ """Loader constructor.
+
+ For now, this is the lister's task output.
+
+ Args:
+ url: Origin url
+ artifacts: List of artifact information with keys:
+
+ **time**: last modification time (isoformat string or timestamp)
+ **url**: the artifact url to retrieve
+ **filename**: artifact's filename
+ **version**: artifact's version
+ **length**: artifact's size
+
+ pk_artifact_keys: Optional List of keys forming a composite primary
+ key for an artifact
+
+ """
+ super().__init__(url=url)
+ self.artifacts = artifacts # assume order is enforced in the lister
+ if not pk_artifact_keys:
+ # default keys for gnu
+ pk_artifact_keys = ['time', 'url', 'length', 'version']
+ self.pk_artifact_keys = pk_artifact_keys
+
+ def get_versions(self) -> Sequence[str]:
+ versions = []
+ for archive in self.artifacts:
+ v = archive.get('version')
+ if v:
+ versions.append(v)
+ return versions
+
+ def get_default_version(self) -> str:
+ # It's the most recent, so for this loader, it's the last one
+ return self.artifacts[-1]['version']
+
+ def get_package_info(self, version: str) -> Generator[
+ Tuple[str, Mapping[str, Any]], None, None]:
+ for a_metadata in self.artifacts:
+ url = a_metadata['url']
+ package_version = a_metadata['version']
+ if version == package_version:
+ filename = a_metadata.get('filename')
+ p_info = {
+ 'url': url,
+ 'filename': filename if filename else path.split(url)[-1],
+ 'raw': a_metadata,
+ }
+ # FIXME: this code assumes we have only 1 artifact per
+ # versioned package
+ yield release_name(version), p_info
+
+ def resolve_revision_from(
+ self, known_artifacts: Dict, artifact_metadata: Dict) \
+ -> Optional[bytes]:
+ artifact_pk = pk(artifact_metadata, pk_keys=self.pk_artifact_keys)
+ for rev_id, known_artifact in known_artifacts.items():
+ logging.debug('known_artifact: %s', known_artifact)
+ reference_artifact = known_artifact['extrinsic']['raw']
+ known_pk = pk(reference_artifact, pk_keys=self.pk_artifact_keys)
+ if artifact_pk == known_pk:
+ return rev_id
+
+ def build_revision(self, a_metadata: Mapping[str, Any],
+ uncompressed_path: str) -> Dict:
+ time = a_metadata['time'] # assume it's a timestamp
+ if isinstance(time, str): # otherwise, assume it's a parsable date
+ time = iso8601.parse_date(time)
+ normalized_time = normalize_timestamp(time)
+ return {
+ 'type': 'tar',
+ 'message': REVISION_MESSAGE,
+ 'date': normalized_time,
+ 'author': SWH_PERSON,
+ 'committer': SWH_PERSON,
+ 'committer_date': normalized_time,
+ 'parents': [],
+ 'metadata': {
+ 'intrinsic': {},
+ 'extrinsic': {
+ 'provider': self.url,
+ 'when': self.visit_date.isoformat(),
+ 'raw': a_metadata,
+ },
+ },
+ }
+
+
+def pk(d: Mapping[str, Any], pk_keys: Sequence[str]) -> Sequence[Any]:
+ """Compute the primary key for a dict using the pk_keys as primary key
+ composite.
+
+ Args:
+ d: A dict entry to compute the primary key on
+ pk_keys: Sequence of keys to use as primary key
+
+ Returns:
+ The primary key for that dict entry
+
+ """
+ return [d.get(k) for k in pk_keys]
diff --git a/swh/loader/package/tasks.py b/swh/loader/package/tasks.py
--- a/swh/loader/package/tasks.py
+++ b/swh/loader/package/tasks.py
@@ -4,9 +4,9 @@
# See top-level LICENSE file for more information
from celery import current_app as app
-from swh.loader.package.gnu import GNULoader
+from swh.loader.package.tar import ArchiveLoader
-@app.task(name=__name__ + '.LoadGNU')
-def load_gnu(name, origin_url=None, tarballs=None):
- return GNULoader(origin_url, tarballs).load()
+@app.task(name=__name__ + '.LoadTar')
+def load_tar(url=None, artifacts=None, pk_keys=None):
+ return ArchiveLoader(url, artifacts, pk_keys=pk_keys).load()
diff --git a/swh/loader/package/tests/test_gnu.py b/swh/loader/package/tests/test_tar.py
rename from swh/loader/package/tests/test_gnu.py
rename to swh/loader/package/tests/test_tar.py
--- a/swh/loader/package/tests/test_gnu.py
+++ b/swh/loader/package/tests/test_tar.py
@@ -3,74 +3,24 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-import os
-import re
-
from swh.model.hashutil import hash_to_bytes
-from swh.loader.package.gnu import GNULoader, get_version
+from swh.loader.package.tar import ArchiveLoader, pk
from swh.loader.package.tests.common import (
check_snapshot, check_metadata_paths
)
-def test_get_version():
- """From url to branch name should yield something relevant
-
- """
- for url, expected_branchname in [
- ('https://gnu.org/sthg/info-2.1.0.tar.gz', '2.1.0'),
- ('https://gnu.org/sthg/info-2.1.2.zip', '2.1.2'),
- ('https://sthg.org/gnu/sthg.tar.gz', 'sthg'),
- ('https://sthg.org/gnu/DLDF-1.1.4.tar.gz', '1.1.4'),
- ('https://sthg.org/gnu/anubis-latest.tar.bz2', 'latest'),
- ('https://ftp.org/gnu/aris-w32.zip', 'w32'),
- ('https://ftp.org/gnu/aris-w32-2.2.zip', 'w32-2.2'),
- ('https://ftp.org/gnu/autogen.info.tar.gz', 'autogen.info'),
- ('https://ftp.org/gnu/crypto-build-demo.tar.gz',
- 'crypto-build-demo'),
- ('https://ftp.org/gnu/clue+clio+xit.clisp.tar.gz',
- 'clue+clio+xit.clisp'),
- ('https://ftp.org/gnu/clue+clio.for-pcl.tar.gz',
- 'clue+clio.for-pcl'),
- ('https://ftp.org/gnu/clisp-hppa2.0-hp-hpux10.20.tar.gz',
- 'hppa2.0-hp-hpux10.20'),
- ('clisp-i386-solaris2.6.tar.gz', 'i386-solaris2.6'),
- ('clisp-mips-sgi-irix6.5.tar.gz', 'mips-sgi-irix6.5'),
- ('clisp-powerpc-apple-macos.tar.gz', 'powerpc-apple-macos'),
- ('clisp-powerpc-unknown-linuxlibc6.tar.gz',
- 'powerpc-unknown-linuxlibc6'),
-
- ('clisp-rs6000-ibm-aix3.2.5.tar.gz', 'rs6000-ibm-aix3.2.5'),
- ('clisp-sparc-redhat51-linux.tar.gz', 'sparc-redhat51-linux'),
- ('clisp-sparc-sun-solaris2.4.tar.gz', 'sparc-sun-solaris2.4'),
- ('clisp-sparc-sun-sunos4.1.3_U1.tar.gz',
- 'sparc-sun-sunos4.1.3_U1'),
- ('clisp-2.25.1-powerpc-apple-MacOSX.tar.gz',
- '2.25.1-powerpc-apple-MacOSX'),
- ('clisp-2.27-PowerMacintosh-powerpc-Darwin-1.3.7.tar.gz',
- '2.27-PowerMacintosh-powerpc-Darwin-1.3.7'),
- ('clisp-2.27-i686-unknown-Linux-2.2.19.tar.gz',
- '2.27-i686-unknown-Linux-2.2.19'),
- ('clisp-2.28-i386-i386-freebsd-4.3-RELEASE.tar.gz',
- '2.28-i386-i386-freebsd-4.3-RELEASE'),
- ('clisp-2.28-i686-unknown-cygwin_me-4.90-1.3.10.tar.gz',
- '2.28-i686-unknown-cygwin_me-4.90-1.3.10'),
- ('clisp-2.29-i386-i386-freebsd-4.6-STABLE.tar.gz',
- '2.29-i386-i386-freebsd-4.6-STABLE'),
- ('clisp-2.29-i686-unknown-cygwin_nt-5.0-1.3.12.tar.gz',
- '2.29-i686-unknown-cygwin_nt-5.0-1.3.12'),
- ('gcl-2.5.3-ansi-japi-xdr.20030701_mingw32.zip',
- '2.5.3-ansi-japi-xdr.20030701_mingw32'),
- ('gettext-runtime-0.13.1.bin.woe32.zip', '0.13.1.bin.woe32'),
- ('sather-logo_images.tar.gz', 'sather-logo_images'),
- ('sather-specification-000328.html.tar.gz', '000328.html')
-
- ]:
- actual_branchname = get_version(url)
-
- assert actual_branchname == expected_branchname
-
+URL = 'https://ftp.gnu.org/gnu/8sync/'
+GNU_ARTIFACTS = [
+ {
+ 'time': 944729610,
+ 'url': 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz',
+ 'length': 221837,
+ 'filename': '8sync-0.1.0.tar.gz',
+ 'version': '0.1.0',
+ }
+]
_expected_new_contents_first_visit = [
'e9258d81faf5881a2f96a77ba609396f82cb97ad',
@@ -134,16 +84,18 @@
_expected_new_snapshot_first_visit_id = 'c419397fd912039825ebdbea378bc6283f006bf5' # noqa
-def test_visit_with_no_artifact_found(swh_config, requests_mock):
- package_url = 'https://ftp.gnu.org/gnu/8sync/'
- tarballs = [{
- 'time': '944729610',
- 'archive': 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz',
- 'length': 221837,
- }]
-
- loader = GNULoader(package_url, tarballs)
- requests_mock.get(re.compile('https://'), status_code=404)
+def visit_with_no_artifact_found(swh_config, requests_mock_datadir):
+ url = URL
+ unknown_artifact_url = 'https://ftp.g.o/unknown/8sync-0.1.0.tar.gz'
+ loader = ArchiveLoader(url, artifacts=[
+ {
+ 'time': 944729610,
+ 'url': unknown_artifact_url, # unknown artifact
+ 'length': 221837,
+ 'filename': '8sync-0.1.0.tar.gz',
+ 'version': '0.1.0',
+ }
+ ])
actual_load_status = loader.load()
assert actual_load_status['status'] == 'uneventful'
@@ -161,19 +113,12 @@
'snapshot': 1,
} == stats
- origin_visit = next(loader.storage.origin_visit_get(package_url))
+ origin_visit = next(loader.storage.origin_visit_get(url))
assert origin_visit['status'] == 'partial'
def test_check_revision_metadata_structure(swh_config, requests_mock_datadir):
- package_url = 'https://ftp.gnu.org/gnu/8sync/'
- tarballs = [{
- 'time': '944729610',
- 'archive': 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz',
- 'length': 221837,
- }]
-
- loader = GNULoader(package_url, tarballs)
+ loader = ArchiveLoader(url=URL, artifacts=GNU_ARTIFACTS)
actual_load_status = loader.load()
assert actual_load_status['status'] == 'eventful'
@@ -205,15 +150,7 @@
"""With no prior visit, load a gnu project ends up with 1 snapshot
"""
- assert 'SWH_CONFIG_FILENAME' in os.environ # cf. tox.ini
- package_url = 'https://ftp.gnu.org/gnu/8sync/'
- tarballs = [{
- 'time': 944729610,
- 'archive': 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz',
- 'length': 221837,
- }]
-
- loader = GNULoader(package_url, tarballs)
+ loader = ArchiveLoader(url=URL, artifacts=GNU_ARTIFACTS)
actual_load_status = loader.load()
assert actual_load_status['status'] == 'eventful'
@@ -253,15 +190,9 @@
"""With no prior visit, load a gnu project ends up with 1 snapshot
"""
- assert 'SWH_CONFIG_FILENAME' in os.environ # cf. tox.ini
- url = 'https://ftp.gnu.org/gnu/8sync/'
- tarballs = [{
- 'time': 944729610,
- 'archive': 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz',
- 'length': 221837,
- }]
+ url = URL
+ loader = ArchiveLoader(url, artifacts=GNU_ARTIFACTS)
- loader = GNULoader(url, tarballs)
actual_load_status = loader.load()
assert actual_load_status['status'] == 'eventful'
origin_visit = list(loader.storage.origin_visit_get(url))[-1]
@@ -283,15 +214,10 @@
"""With no prior visit, load a gnu project ends up with 1 snapshot
"""
- assert 'SWH_CONFIG_FILENAME' in os.environ # cf. tox.ini
- url = 'https://ftp.gnu.org/gnu/8sync/'
- tarball1 = {
- 'time': 944729610,
- 'archive': 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz',
- 'length': 221837,
- }
+ url = URL
+ artifact1 = GNU_ARTIFACTS[0]
+ loader = ArchiveLoader(url, [artifact1])
- loader = GNULoader(url, [tarball1])
actual_load_status = loader.load()
assert actual_load_status['status'] == 'eventful'
origin_visit = list(loader.storage.origin_visit_get(url))[-1]
@@ -316,12 +242,15 @@
]
assert len(urls) == 1
- tarball2 = {
+ artifact2 = {
'time': 1480991830,
- 'archive': 'https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz',
+ 'url': 'https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz',
'length': 238466,
+ 'filename': '8sync-0.2.0.tar.gz',
+ 'version': '0.2.0',
}
- loader2 = GNULoader(url, [tarball1, tarball2])
+
+ loader2 = ArchiveLoader(url, [artifact1, artifact2])
# implementation detail: share the storage in between visits
loader2.storage = loader.storage
stats2 = loader2.storage.stat_counters()
@@ -352,3 +281,63 @@
]
# 1 artifact (2nd time no modification) + 1 new artifact
assert len(urls) == 2
+
+
+def test_pk():
+ """Compute primary key should return the right pk
+
+ """
+ data = {
+ 'a': 1,
+ 'b': 2,
+ 'length': 221837,
+ 'filename': '8sync-0.1.0.tar.gz',
+ 'version': '0.1.0',
+ }
+
+ for pk_keys, expected_pk in [
+ (['a', 'b'], [1, 2]),
+ ([], []),
+ (['a', 'key-that-does-not-exist'], [1, None])
+ ]:
+ actual_pk = pk(data, pk_keys=pk_keys)
+ assert actual_pk == expected_pk
+
+
+def test_2_visits_without_change_not_gnu(swh_config, requests_mock_datadir):
+ """Load a project archive (not gnu) ends up with 1 snapshot
+
+ """
+ url = 'https://something.else.org/8sync/'
+ artifacts = [ # this is not a gnu artifact
+ {
+ 'time': '1999-12-09T09:53:30+00:00', # it's also not a timestamp
+ 'sha256': 'd5d1051e59b2be6f065a9fc6aedd3a391e44d0274b78b9bb4e2b57a09134dbe4', # noqa
+ # keep a gnu artifact reference to avoid adding other test files
+ 'url': 'https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz',
+ 'length': 238466,
+ 'filename': '8sync-0.2.0.tar.gz',
+ 'version': '0.2.0',
+ }
+ ]
+
+ # Here the loader defines the pk_keys to use for existence in the snapshot
+ # (not the archive loader's default keys, which are the ones for gnu)
+ loader = ArchiveLoader(
+ url, artifacts=artifacts, pk_artifact_keys=['sha256', 'length', 'url'])
+
+ actual_load_status = loader.load()
+ assert actual_load_status['status'] == 'eventful'
+ origin_visit = list(loader.storage.origin_visit_get(url))[-1]
+ assert origin_visit['status'] == 'full'
+
+ actual_load_status2 = loader.load()
+ assert actual_load_status2['status'] == 'uneventful'
+ origin_visit2 = list(loader.storage.origin_visit_get(url))[-1]
+ assert origin_visit2['status'] == 'full'
+
+ urls = [
+ m.url for m in requests_mock_datadir.request_history
+ if m.url.startswith('https://ftp.gnu.org')
+ ]
+ assert len(urls) == 1
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Thu, Jan 30, 3:53 PM (1 h, 46 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3226558
Attached To
D2145: package.tar: Add a generic archive loader implementation (merge with gnu's)
Event Timeline
Log In to Comment