Page MenuHomeSoftware Heritage

D2145.id7278.diff
No OneTemporary

D2145.id7278.diff

diff --git a/swh/loader/package/gnu.py b/swh/loader/package/gnu.py
deleted file mode 100644
--- a/swh/loader/package/gnu.py
+++ /dev/null
@@ -1,196 +0,0 @@
-# Copyright (C) 2019 The Software Heritage developers
-# See the AUTHORS file at the top-level directory of this distribution
-# License: GNU General Public License version 3, or any later version
-# See top-level LICENSE file for more information
-
-import logging
-import re
-
-from os import path
-
-from typing import Any, Dict, Generator, Mapping, Optional, Sequence, Tuple
-
-from swh.loader.package.loader import PackageLoader
-from swh.loader.package.utils import release_name
-
-from swh.model.identifiers import normalize_timestamp
-
-
-logger = logging.getLogger(__name__)
-
-
-# to recognize existing naming pattern
-extensions = [
- 'zip',
- 'tar',
- 'gz', 'tgz',
- 'bz2', 'bzip2',
- 'lzma', 'lz',
- 'xz',
- 'Z',
-]
-
-version_keywords = [
- 'cygwin_me',
- 'w32', 'win32', 'nt', 'cygwin', 'mingw',
- 'latest', 'alpha', 'beta',
- 'release', 'stable',
- 'hppa',
- 'solaris', 'sunos', 'sun4u', 'sparc', 'sun',
- 'aix', 'ibm', 'rs6000',
- 'i386', 'i686',
- 'linux', 'redhat', 'linuxlibc',
- 'mips',
- 'powerpc', 'macos', 'apple', 'darwin', 'macosx', 'powermacintosh',
- 'unknown',
- 'netbsd', 'freebsd',
- 'sgi', 'irix',
-]
-
-# Match a filename into components.
-#
-# We use Debian's release number heuristic: A release number starts
-# with a digit, and is followed by alphanumeric characters or any of
-# ., +, :, ~ and -
-#
-# We hardcode a list of possible extensions, as this release number
-# scheme would match them too... We match on any combination of those.
-#
-# Greedy matching is done right to left (we only match the extension
-# greedily with +, software_name and release_number are matched lazily
-# with +? and *?).
-
-pattern = r'''
-^
-(?:
- # We have a software name and a release number, separated with a
- # -, _ or dot.
- (?P<software_name1>.+?[-_.])
- (?P<release_number>(%(vkeywords)s|[0-9][0-9a-zA-Z_.+:~-]*?)+)
-|
- # We couldn't match a release number, put everything in the
- # software name.
- (?P<software_name2>.+?)
-)
-(?P<extension>(?:\.(?:%(extensions)s))+)
-$
-''' % {
- 'extensions': '|'.join(extensions),
- 'vkeywords': '|'.join('%s[-]?' % k for k in version_keywords),
-}
-
-
-def get_version(url: str) -> str:
- """Extract branch name from tarball url
-
- Args:
- url (str): Tarball URL
-
- Returns:
- byte: Branch name
-
- Example:
- For url = https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz
-
- >>> get_version(url)
- '0.2.0'
-
- """
- filename = path.split(url)[-1]
- m = re.match(pattern, filename,
- flags=re.VERBOSE | re.IGNORECASE)
- if m:
- d = m.groupdict()
- if d['software_name1'] and d['release_number']:
- return d['release_number']
- if d['software_name2']:
- return d['software_name2']
-
- return ''
-
-
-class GNULoader(PackageLoader):
- visit_type = 'gnu'
- SWH_PERSON = {
- 'name': b'Software Heritage',
- 'fullname': b'Software Heritage',
- 'email': b'robot@softwareheritage.org'
- }
- REVISION_MESSAGE = b'swh-loader-package: synthetic revision message'
-
- def __init__(self, package_url: str, tarballs: Sequence):
- """Loader constructor.
-
- For now, this is the lister's task output.
-
- Args:
- package_url: Origin url
-
- tarballs: List of dict with keys `date` (date) and `archive` (str)
- the url to retrieve one versioned archive
-
- """
- super().__init__(url=package_url)
- self.tarballs = list(sorted(tarballs, key=lambda v: v['time']))
-
- def get_versions(self) -> Sequence[str]:
- versions = []
- for archive in self.tarballs:
- v = get_version(archive['archive'])
- if v:
- versions.append(v)
- return versions
-
- def get_default_version(self) -> str:
- # It's the most recent, so for this loader, it's the last one
- return get_version(self.tarballs[-1]['archive'])
-
- def get_package_info(self, version: str) -> Generator[
- Tuple[str, Mapping[str, Any]], None, None]:
- for a_metadata in self.tarballs:
- url = a_metadata['archive']
- package_version = get_version(url)
- if version == package_version:
- p_info = {
- 'url': url,
- 'filename': path.split(url)[-1],
- 'raw': a_metadata,
- }
- # FIXME: this code assumes we have only 1 artifact per
- # versioned package
- yield release_name(version), p_info
-
- def resolve_revision_from(
- self, known_artifacts: Dict, artifact_metadata: Dict) \
- -> Optional[bytes]:
- def pk(d):
- return [d.get(k) for k in ['time', 'archive', 'length']]
-
- artifact_pk = pk(artifact_metadata)
- for rev_id, known_artifact in known_artifacts.items():
- logging.debug('known_artifact: %s', known_artifact)
- known_pk = pk(known_artifact['extrinsic']['raw'])
- if artifact_pk == known_pk:
- return rev_id
-
- def build_revision(
- self, a_metadata: Mapping[str, Any],
- uncompressed_path: str) -> Dict:
- normalized_date = normalize_timestamp(int(a_metadata['time']))
- return {
- 'type': 'tar',
- 'message': self.REVISION_MESSAGE,
- 'date': normalized_date,
- 'author': self.SWH_PERSON,
- 'committer': self.SWH_PERSON,
- 'committer_date': normalized_date,
- 'parents': [],
- 'metadata': {
- 'intrinsic': {},
- 'extrinsic': {
- 'provider': self.url,
- 'when': self.visit_date.isoformat(),
- 'raw': a_metadata,
- },
- },
- }
diff --git a/swh/loader/package/tar.py b/swh/loader/package/tar.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/package/tar.py
@@ -0,0 +1,132 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import iso8601
+import logging
+
+from os import path
+from typing import Any, Dict, Generator, Mapping, Optional, Sequence, Tuple
+
+from swh.loader.package.loader import PackageLoader
+from swh.loader.package.utils import release_name
+from swh.model.identifiers import normalize_timestamp
+
+
+logger = logging.getLogger(__name__)
+SWH_PERSON = {
+ 'name': b'Software Heritage',
+ 'fullname': b'Software Heritage',
+ 'email': b'robot@softwareheritage.org'
+}
+REVISION_MESSAGE = b'swh-loader-package: synthetic revision message'
+
+
+class ArchiveLoader(PackageLoader):
+ visit_type = 'tar'
+
+ def __init__(self, url: str, artifacts: Sequence[Mapping[str, Any]],
+ pk_artifact_keys: Optional[Sequence[str]] = None):
+ """Loader constructor.
+
+ For now, this is the lister's task output.
+
+ Args:
+ url: Origin url
+ artifacts: List of artifact information with keys:
+
+ **time**: last modification time as either isoformat date string
+ or timestamp
+ **url**: the artifact url to retrieve
+ **filename**: artifact's filename, **version**: artifact's version
+ **length**: artifact's size
+
+ pk_artifact_keys: Optional List of keys forming a composite primary
+ key for an artifact
+
+ """
+ super().__init__(url=url)
+ self.artifacts = artifacts # assume order is enforced in the lister
+ if not pk_artifact_keys:
+ # default keys for gnu
+ pk_artifact_keys = ['time', 'url', 'length', 'version']
+ self.pk_artifact_keys = pk_artifact_keys
+
+ def get_versions(self) -> Sequence[str]:
+ versions = []
+ for archive in self.artifacts:
+ v = archive.get('version')
+ if v:
+ versions.append(v)
+ return versions
+
+ def get_default_version(self) -> str:
+ # It's the most recent, so for this loader, it's the last one
+ return self.artifacts[-1]['version']
+
+ def get_package_info(self, version: str) -> Generator[
+ Tuple[str, Mapping[str, Any]], None, None]:
+ for a_metadata in self.artifacts:
+ url = a_metadata['url']
+ package_version = a_metadata['version']
+ if version == package_version:
+ filename = a_metadata.get('filename')
+ p_info = {
+ 'url': url,
+ 'filename': filename if filename else path.split(url)[-1],
+ 'raw': a_metadata,
+ }
+ # FIXME: this code assumes we have only 1 artifact per
+ # versioned package
+ yield release_name(version), p_info
+
+ def resolve_revision_from(
+ self, known_artifacts: Dict, artifact_metadata: Dict) \
+ -> Optional[bytes]:
+ artifact_pk = pk(artifact_metadata, pk_keys=self.pk_artifact_keys)
+ for rev_id, known_artifact in known_artifacts.items():
+ logging.debug('known_artifact: %s', known_artifact)
+ reference_artifact = known_artifact['extrinsic']['raw']
+ known_pk = pk(reference_artifact, pk_keys=self.pk_artifact_keys)
+ if artifact_pk == known_pk:
+ return rev_id
+
+ def build_revision(self, a_metadata: Mapping[str, Any],
+ uncompressed_path: str) -> Dict:
+ time = a_metadata['time'] # assume it's a timestamp
+ if isinstance(time, str): # otherwise, assume it's a parsable date
+ time = iso8601.parse_date(time)
+ normalized_time = normalize_timestamp(time)
+ return {
+ 'type': 'tar',
+ 'message': REVISION_MESSAGE,
+ 'date': normalized_time,
+ 'author': SWH_PERSON,
+ 'committer': SWH_PERSON,
+ 'committer_date': normalized_time,
+ 'parents': [],
+ 'metadata': {
+ 'intrinsic': {},
+ 'extrinsic': {
+ 'provider': self.url,
+ 'when': self.visit_date.isoformat(),
+ 'raw': a_metadata,
+ },
+ },
+ }
+
+
+def pk(d: Mapping[str, Any], pk_keys: Sequence[str]) -> Sequence[Any]:
+ """Compute the primary key for a dict using the pk_keys as composite
+ primary key.
+
+ Args:
+ d: A dict entry to compute the primary key on
+ pk_keys: Sequence of keys to use as primary key
+
+ Returns:
+ The primary key for that dict entry
+
+ """
+ return [d.get(k) for k in pk_keys]
diff --git a/swh/loader/package/tasks.py b/swh/loader/package/tasks.py
--- a/swh/loader/package/tasks.py
+++ b/swh/loader/package/tasks.py
@@ -4,9 +4,9 @@
# See top-level LICENSE file for more information
from celery import current_app as app
-from swh.loader.package.gnu import GNULoader
+from swh.loader.package.tar import ArchiveLoader
-@app.task(name=__name__ + '.LoadGNU')
-def load_gnu(name, origin_url=None, tarballs=None):
- return GNULoader(origin_url, tarballs).load()
+@app.task(name=__name__ + '.LoadTar')
+def load_tar(url=None, artifacts=None, pk_keys=None):
+ return ArchiveLoader(url, artifacts, pk_keys=pk_keys).load()
diff --git a/swh/loader/package/tests/test_gnu.py b/swh/loader/package/tests/test_tar.py
rename from swh/loader/package/tests/test_gnu.py
rename to swh/loader/package/tests/test_tar.py
--- a/swh/loader/package/tests/test_gnu.py
+++ b/swh/loader/package/tests/test_tar.py
@@ -3,74 +3,24 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-import os
-import re
-
from swh.model.hashutil import hash_to_bytes
-from swh.loader.package.gnu import GNULoader, get_version
+from swh.loader.package.tar import ArchiveLoader, pk
from swh.loader.package.tests.common import (
check_snapshot, check_metadata_paths
)
-def test_get_version():
- """From url to branch name should yield something relevant
-
- """
- for url, expected_branchname in [
- ('https://gnu.org/sthg/info-2.1.0.tar.gz', '2.1.0'),
- ('https://gnu.org/sthg/info-2.1.2.zip', '2.1.2'),
- ('https://sthg.org/gnu/sthg.tar.gz', 'sthg'),
- ('https://sthg.org/gnu/DLDF-1.1.4.tar.gz', '1.1.4'),
- ('https://sthg.org/gnu/anubis-latest.tar.bz2', 'latest'),
- ('https://ftp.org/gnu/aris-w32.zip', 'w32'),
- ('https://ftp.org/gnu/aris-w32-2.2.zip', 'w32-2.2'),
- ('https://ftp.org/gnu/autogen.info.tar.gz', 'autogen.info'),
- ('https://ftp.org/gnu/crypto-build-demo.tar.gz',
- 'crypto-build-demo'),
- ('https://ftp.org/gnu/clue+clio+xit.clisp.tar.gz',
- 'clue+clio+xit.clisp'),
- ('https://ftp.org/gnu/clue+clio.for-pcl.tar.gz',
- 'clue+clio.for-pcl'),
- ('https://ftp.org/gnu/clisp-hppa2.0-hp-hpux10.20.tar.gz',
- 'hppa2.0-hp-hpux10.20'),
- ('clisp-i386-solaris2.6.tar.gz', 'i386-solaris2.6'),
- ('clisp-mips-sgi-irix6.5.tar.gz', 'mips-sgi-irix6.5'),
- ('clisp-powerpc-apple-macos.tar.gz', 'powerpc-apple-macos'),
- ('clisp-powerpc-unknown-linuxlibc6.tar.gz',
- 'powerpc-unknown-linuxlibc6'),
-
- ('clisp-rs6000-ibm-aix3.2.5.tar.gz', 'rs6000-ibm-aix3.2.5'),
- ('clisp-sparc-redhat51-linux.tar.gz', 'sparc-redhat51-linux'),
- ('clisp-sparc-sun-solaris2.4.tar.gz', 'sparc-sun-solaris2.4'),
- ('clisp-sparc-sun-sunos4.1.3_U1.tar.gz',
- 'sparc-sun-sunos4.1.3_U1'),
- ('clisp-2.25.1-powerpc-apple-MacOSX.tar.gz',
- '2.25.1-powerpc-apple-MacOSX'),
- ('clisp-2.27-PowerMacintosh-powerpc-Darwin-1.3.7.tar.gz',
- '2.27-PowerMacintosh-powerpc-Darwin-1.3.7'),
- ('clisp-2.27-i686-unknown-Linux-2.2.19.tar.gz',
- '2.27-i686-unknown-Linux-2.2.19'),
- ('clisp-2.28-i386-i386-freebsd-4.3-RELEASE.tar.gz',
- '2.28-i386-i386-freebsd-4.3-RELEASE'),
- ('clisp-2.28-i686-unknown-cygwin_me-4.90-1.3.10.tar.gz',
- '2.28-i686-unknown-cygwin_me-4.90-1.3.10'),
- ('clisp-2.29-i386-i386-freebsd-4.6-STABLE.tar.gz',
- '2.29-i386-i386-freebsd-4.6-STABLE'),
- ('clisp-2.29-i686-unknown-cygwin_nt-5.0-1.3.12.tar.gz',
- '2.29-i686-unknown-cygwin_nt-5.0-1.3.12'),
- ('gcl-2.5.3-ansi-japi-xdr.20030701_mingw32.zip',
- '2.5.3-ansi-japi-xdr.20030701_mingw32'),
- ('gettext-runtime-0.13.1.bin.woe32.zip', '0.13.1.bin.woe32'),
- ('sather-logo_images.tar.gz', 'sather-logo_images'),
- ('sather-specification-000328.html.tar.gz', '000328.html')
-
- ]:
- actual_branchname = get_version(url)
-
- assert actual_branchname == expected_branchname
-
+URL = 'https://ftp.gnu.org/gnu/8sync/'
+GNU_ARTIFACTS = [
+ {
+ 'time': 944729610,
+ 'url': 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz',
+ 'length': 221837,
+ 'filename': '8sync-0.1.0.tar.gz',
+ 'version': '0.1.0',
+ }
+]
_expected_new_contents_first_visit = [
'e9258d81faf5881a2f96a77ba609396f82cb97ad',
@@ -134,16 +84,18 @@
_expected_new_snapshot_first_visit_id = 'c419397fd912039825ebdbea378bc6283f006bf5' # noqa
-def test_visit_with_no_artifact_found(swh_config, requests_mock):
- package_url = 'https://ftp.gnu.org/gnu/8sync/'
- tarballs = [{
- 'time': '944729610',
- 'archive': 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz',
- 'length': 221837,
- }]
-
- loader = GNULoader(package_url, tarballs)
- requests_mock.get(re.compile('https://'), status_code=404)
+def visit_with_no_artifact_found(swh_config, requests_mock_datadir):
+ url = URL
+ unknown_artifact_url = 'https://ftp.g.o/unknown/8sync-0.1.0.tar.gz'
+ loader = ArchiveLoader(url, artifacts=[
+ {
+ 'time': 944729610,
+ 'url': unknown_artifact_url, # unknown artifact
+ 'length': 221837,
+ 'filename': '8sync-0.1.0.tar.gz',
+ 'version': '0.1.0',
+ }
+ ])
actual_load_status = loader.load()
assert actual_load_status['status'] == 'uneventful'
@@ -161,19 +113,12 @@
'snapshot': 1,
} == stats
- origin_visit = next(loader.storage.origin_visit_get(package_url))
+ origin_visit = next(loader.storage.origin_visit_get(url))
assert origin_visit['status'] == 'partial'
def test_check_revision_metadata_structure(swh_config, requests_mock_datadir):
- package_url = 'https://ftp.gnu.org/gnu/8sync/'
- tarballs = [{
- 'time': '944729610',
- 'archive': 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz',
- 'length': 221837,
- }]
-
- loader = GNULoader(package_url, tarballs)
+ loader = ArchiveLoader(url=URL, artifacts=GNU_ARTIFACTS)
actual_load_status = loader.load()
assert actual_load_status['status'] == 'eventful'
@@ -205,15 +150,7 @@
"""With no prior visit, load a gnu project ends up with 1 snapshot
"""
- assert 'SWH_CONFIG_FILENAME' in os.environ # cf. tox.ini
- package_url = 'https://ftp.gnu.org/gnu/8sync/'
- tarballs = [{
- 'time': 944729610,
- 'archive': 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz',
- 'length': 221837,
- }]
-
- loader = GNULoader(package_url, tarballs)
+ loader = ArchiveLoader(url=URL, artifacts=GNU_ARTIFACTS)
actual_load_status = loader.load()
assert actual_load_status['status'] == 'eventful'
@@ -253,15 +190,9 @@
"""With no prior visit, load a gnu project ends up with 1 snapshot
"""
- assert 'SWH_CONFIG_FILENAME' in os.environ # cf. tox.ini
- url = 'https://ftp.gnu.org/gnu/8sync/'
- tarballs = [{
- 'time': 944729610,
- 'archive': 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz',
- 'length': 221837,
- }]
+ url = URL
+ loader = ArchiveLoader(url, artifacts=GNU_ARTIFACTS)
- loader = GNULoader(url, tarballs)
actual_load_status = loader.load()
assert actual_load_status['status'] == 'eventful'
origin_visit = list(loader.storage.origin_visit_get(url))[-1]
@@ -283,15 +214,10 @@
"""With no prior visit, load a gnu project ends up with 1 snapshot
"""
- assert 'SWH_CONFIG_FILENAME' in os.environ # cf. tox.ini
- url = 'https://ftp.gnu.org/gnu/8sync/'
- tarball1 = {
- 'time': 944729610,
- 'archive': 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz',
- 'length': 221837,
- }
+ url = URL
+ artifact1 = GNU_ARTIFACTS[0]
+ loader = ArchiveLoader(url, [artifact1])
- loader = GNULoader(url, [tarball1])
actual_load_status = loader.load()
assert actual_load_status['status'] == 'eventful'
origin_visit = list(loader.storage.origin_visit_get(url))[-1]
@@ -316,12 +242,15 @@
]
assert len(urls) == 1
- tarball2 = {
+ artifact2 = {
'time': 1480991830,
- 'archive': 'https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz',
+ 'url': 'https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz',
'length': 238466,
+ 'filename': '8sync-0.2.0.tar.gz',
+ 'version': '0.2.0',
}
- loader2 = GNULoader(url, [tarball1, tarball2])
+
+ loader2 = ArchiveLoader(url, [artifact1, artifact2])
# implementation detail: share the storage in between visits
loader2.storage = loader.storage
stats2 = loader2.storage.stat_counters()
@@ -352,3 +281,63 @@
]
# 1 artifact (2nd time no modification) + 1 new artifact
assert len(urls) == 2
+
+
+def test_pk():
+ """Computing the primary key should return the right pk
+
+ """
+ data = {
+ 'a': 1,
+ 'b': 2,
+ 'length': 221837,
+ 'filename': '8sync-0.1.0.tar.gz',
+ 'version': '0.1.0',
+ }
+
+ for pk_keys, expected_pk in [
+ (['a', 'b'], [1, 2]),
+ ([], []),
+ (['a', 'key-that-does-not-exist'], [1, None])
+ ]:
+ actual_pk = pk(data, pk_keys=pk_keys)
+ assert actual_pk == expected_pk
+
+
+def test_2_visits_without_change_not_gnu(swh_config, requests_mock_datadir):
+ """Loading a project archive (not gnu) ends up with 1 snapshot
+
+ """
+ url = 'https://something.else.org/8sync/'
+ artifacts = [ # this is not a gnu artifact
+ {
+ 'time': '1999-12-09T09:53:30+00:00', # it's also not a timestamp
+ 'sha256': 'd5d1051e59b2be6f065a9fc6aedd3a391e44d0274b78b9bb4e2b57a09134dbe4', # noqa
+ # keep a gnu artifact reference to avoid adding other test files
+ 'url': 'https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz',
+ 'length': 238466,
+ 'filename': '8sync-0.2.0.tar.gz',
+ 'version': '0.2.0',
+ }
+ ]
+
+ # Here the loader defines the pk_keys to use for existence in the snapshot
+ # It's not the default archive loader key set (time, url, length, version)
+ loader = ArchiveLoader(
+ url, artifacts=artifacts, pk_artifact_keys=['sha256', 'length', 'url'])
+
+ actual_load_status = loader.load()
+ assert actual_load_status['status'] == 'eventful'
+ origin_visit = list(loader.storage.origin_visit_get(url))[-1]
+ assert origin_visit['status'] == 'full'
+
+ actual_load_status2 = loader.load()
+ assert actual_load_status2['status'] == 'uneventful'
+ origin_visit2 = list(loader.storage.origin_visit_get(url))[-1]
+ assert origin_visit2['status'] == 'full'
+
+ urls = [
+ m.url for m in requests_mock_datadir.request_history
+ if m.url.startswith('https://ftp.gnu.org')
+ ]
+ assert len(urls) == 1

File Metadata

Mime Type
text/plain
Expires
Thu, Jan 30, 3:53 PM (6 h, 39 m ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3226558

Event Timeline