Page MenuHomeSoftware Heritage

D2147.diff
No OneTemporary

D2147.diff

diff --git a/swh/lister/gnu/lister.py b/swh/lister/gnu/lister.py
--- a/swh/lister/gnu/lister.py
+++ b/swh/lister/gnu/lister.py
@@ -33,22 +33,23 @@
.. code-block:: python
- args: ['https://ftp.gnu.org/gnu/3dldf/']
+ args: []
kwargs: {
- 'tarballs': [{
- 'archive': 'https://...',
+ 'url': 'https://ftp.gnu.org/gnu/3dldf/',
+ 'artifacts': [{
+ 'url': 'https://...',
'time': 1071002600,
'length': 128},
...
]}
"""
- tarballs = self.gnu_tree.artifacts[origin_url]
+ artifacts = self.gnu_tree.artifacts[origin_url]
return utils.create_task_dict(
'load-%s' % origin_type,
kwargs.get('policy', 'oneshot'),
- origin_url,
- tarballs=tarballs)
+ url=origin_url,
+ artifacts=artifacts)
def safely_issue_request(self, identifier):
"""Bypass the implementation. It's now the GNUTree which deals with
diff --git a/swh/lister/gnu/tests/test_lister.py b/swh/lister/gnu/tests/test_lister.py
--- a/swh/lister/gnu/tests/test_lister.py
+++ b/swh/lister/gnu/tests/test_lister.py
@@ -9,7 +9,7 @@
logger = logging.getLogger(__name__)
-def test_lister_no_page_check_results(swh_listers, requests_mock_datadir):
+def test_gnu_lister(swh_listers, requests_mock_datadir):
lister = swh_listers['gnu']
lister.run()
@@ -21,21 +21,23 @@
assert row['type'] == 'load-tar'
# arguments check
args = row['arguments']['args']
- assert len(args) == 1
+ assert len(args) == 0
- url = args[0]
+ # kwargs
+ kwargs = row['arguments']['kwargs']
+ assert set(kwargs.keys()) == {'url', 'artifacts'}
+
+ url = kwargs['url']
assert url.startswith('https://ftp.gnu.org')
url_suffix = url.split('https://ftp.gnu.org')[1]
assert 'gnu' in url_suffix or 'old-gnu' in url_suffix
- # kwargs
- kwargs = row['arguments']['kwargs']
- assert list(kwargs.keys()) == ['tarballs']
-
- tarballs = kwargs['tarballs']
- # check the tarball's structure
- tarball = tarballs[0]
- assert set(tarball.keys()) == set(['archive', 'length', 'time'])
+ artifacts = kwargs['artifacts']
+ # check the artifact's structure
+ artifact = artifacts[0]
+ assert set(artifact.keys()) == {
+ 'url', 'length', 'time', 'filename', 'version'
+ }
assert row['policy'] == 'oneshot'
diff --git a/swh/lister/gnu/tests/test_tree.py b/swh/lister/gnu/tests/test_tree.py
--- a/swh/lister/gnu/tests/test_tree.py
+++ b/swh/lister/gnu/tests/test_tree.py
@@ -9,7 +9,8 @@
from os import path
from swh.lister.gnu.tree import (
- GNUTree, find_artifacts, check_filename_is_archive, load_raw_data
+ GNUTree, find_artifacts, check_filename_is_archive, load_raw_data,
+ get_version
)
@@ -69,14 +70,18 @@
assert tree_json.artifacts['https://ftp.gnu.org/old-gnu/zlibc/'] == [
{
- 'archive': 'https://ftp.gnu.org/old-gnu/zlibc/zlibc-0.9b.tar.gz', # noqa
+ 'url': 'https://ftp.gnu.org/old-gnu/zlibc/zlibc-0.9b.tar.gz', # noqa
'length': 90106,
- 'time': 857980800
+ 'time': 857980800,
+ 'filename': 'zlibc-0.9b.tar.gz',
+ 'version': '0.9b',
},
{
- 'archive': 'https://ftp.gnu.org/old-gnu/zlibc/zlibc-0.9e.tar.gz', # noqa
+ 'url': 'https://ftp.gnu.org/old-gnu/zlibc/zlibc-0.9e.tar.gz', # noqa
'length': 89625,
- 'time': 860396400
+ 'time': 860396400,
+ 'filename': 'zlibc-0.9e.tar.gz',
+ 'version': '0.9e',
}
]
@@ -93,38 +98,46 @@
def test_find_artifacts_small_sample(datadir):
- expected_tarballs = [
+ expected_artifacts = [
{
- 'archive': '/root/artanis/artanis-0.2.1.tar.bz2',
+ 'url': '/root/artanis/artanis-0.2.1.tar.bz2',
'time': 1495205979,
'length': 424081,
+ 'version': '0.2.1',
+ 'filename': 'artanis-0.2.1.tar.bz2',
},
{
- 'archive': '/root/xboard/winboard/winboard-4_0_0-src.zip', # noqa
+ 'url': '/root/xboard/winboard/winboard-4_0_0-src.zip', # noqa
'time': 898422900,
- 'length': 1514448
+ 'length': 1514448,
+ 'version': '4_0_0-src',
+ 'filename': 'winboard-4_0_0-src.zip',
},
{
- 'archive': '/root/xboard/xboard-3.6.2.tar.gz', # noqa
+ 'url': '/root/xboard/xboard-3.6.2.tar.gz', # noqa
'time': 869814000,
'length': 450164,
+ 'version': '3.6.2',
+ 'filename': 'xboard-3.6.2.tar.gz',
},
{
- 'archive': '/root/xboard/xboard-4.0.0.tar.gz', # noqa
+ 'url': '/root/xboard/xboard-4.0.0.tar.gz', # noqa
'time': 898422900,
'length': 514951,
+ 'version': '4.0.0',
+ 'filename': 'xboard-4.0.0.tar.gz',
},
]
file_structure = json.load(open(path.join(datadir, 'tree.min.json')))
- actual_tarballs = find_artifacts(file_structure, '/root/')
- assert actual_tarballs == expected_tarballs
+ actual_artifacts = find_artifacts(file_structure, '/root/')
+ assert actual_artifacts == expected_artifacts
def test_find_artifacts(datadir):
file_structure = json.load(open(path.join(datadir, 'tree.json')))
- actual_tarballs = find_artifacts(file_structure, '/root/')
- assert len(actual_tarballs) == 42 + 3 # tar + zip
+ actual_artifacts = find_artifacts(file_structure, '/root/')
+ assert len(actual_artifacts) == 42 + 3 # tar + zip
def test_check_filename_is_archive():
@@ -133,3 +146,61 @@
for ext in ['abc.tar.gz.sig', 'abc', 'something.zip2', 'foo.tar.']:
assert check_filename_is_archive(ext) is False
+
+
+def test_get_version():
+ """Extracting the version from a url or filename should yield something relevant
+
+ """
+ for url, expected_branchname in [
+ ('https://gnu.org/sthg/info-2.1.0.tar.gz', '2.1.0'),
+ ('https://gnu.org/sthg/info-2.1.2.zip', '2.1.2'),
+ ('https://sthg.org/gnu/sthg.tar.gz', 'sthg'),
+ ('https://sthg.org/gnu/DLDF-1.1.4.tar.gz', '1.1.4'),
+ ('https://sthg.org/gnu/anubis-latest.tar.bz2', 'latest'),
+ ('https://ftp.org/gnu/aris-w32.zip', 'w32'),
+ ('https://ftp.org/gnu/aris-w32-2.2.zip', 'w32-2.2'),
+ ('https://ftp.org/gnu/autogen.info.tar.gz', 'autogen.info'),
+ ('https://ftp.org/gnu/crypto-build-demo.tar.gz',
+ 'crypto-build-demo'),
+ ('https://ftp.org/gnu/clue+clio+xit.clisp.tar.gz',
+ 'clue+clio+xit.clisp'),
+ ('https://ftp.org/gnu/clue+clio.for-pcl.tar.gz',
+ 'clue+clio.for-pcl'),
+ ('https://ftp.org/gnu/clisp-hppa2.0-hp-hpux10.20.tar.gz',
+ 'hppa2.0-hp-hpux10.20'),
+ ('clisp-i386-solaris2.6.tar.gz', 'i386-solaris2.6'),
+ ('clisp-mips-sgi-irix6.5.tar.gz', 'mips-sgi-irix6.5'),
+ ('clisp-powerpc-apple-macos.tar.gz', 'powerpc-apple-macos'),
+ ('clisp-powerpc-unknown-linuxlibc6.tar.gz',
+ 'powerpc-unknown-linuxlibc6'),
+
+ ('clisp-rs6000-ibm-aix3.2.5.tar.gz', 'rs6000-ibm-aix3.2.5'),
+ ('clisp-sparc-redhat51-linux.tar.gz', 'sparc-redhat51-linux'),
+ ('clisp-sparc-sun-solaris2.4.tar.gz', 'sparc-sun-solaris2.4'),
+ ('clisp-sparc-sun-sunos4.1.3_U1.tar.gz',
+ 'sparc-sun-sunos4.1.3_U1'),
+ ('clisp-2.25.1-powerpc-apple-MacOSX.tar.gz',
+ '2.25.1-powerpc-apple-MacOSX'),
+ ('clisp-2.27-PowerMacintosh-powerpc-Darwin-1.3.7.tar.gz',
+ '2.27-PowerMacintosh-powerpc-Darwin-1.3.7'),
+ ('clisp-2.27-i686-unknown-Linux-2.2.19.tar.gz',
+ '2.27-i686-unknown-Linux-2.2.19'),
+ ('clisp-2.28-i386-i386-freebsd-4.3-RELEASE.tar.gz',
+ '2.28-i386-i386-freebsd-4.3-RELEASE'),
+ ('clisp-2.28-i686-unknown-cygwin_me-4.90-1.3.10.tar.gz',
+ '2.28-i686-unknown-cygwin_me-4.90-1.3.10'),
+ ('clisp-2.29-i386-i386-freebsd-4.6-STABLE.tar.gz',
+ '2.29-i386-i386-freebsd-4.6-STABLE'),
+ ('clisp-2.29-i686-unknown-cygwin_nt-5.0-1.3.12.tar.gz',
+ '2.29-i686-unknown-cygwin_nt-5.0-1.3.12'),
+ ('gcl-2.5.3-ansi-japi-xdr.20030701_mingw32.zip',
+ '2.5.3-ansi-japi-xdr.20030701_mingw32'),
+ ('gettext-runtime-0.13.1.bin.woe32.zip', '0.13.1.bin.woe32'),
+ ('sather-logo_images.tar.gz', 'sather-logo_images'),
+ ('sather-specification-000328.html.tar.gz', '000328.html')
+
+ ]:
+ actual_branchname = get_version(url)
+
+ assert actual_branchname == expected_branchname
diff --git a/swh/lister/gnu/tree.py b/swh/lister/gnu/tree.py
--- a/swh/lister/gnu/tree.py
+++ b/swh/lister/gnu/tree.py
@@ -7,15 +7,112 @@
import json
import logging
import requests
+import re
+from os import path
from pathlib import Path
-from typing import Dict, Tuple, List
+from typing import Any, Dict, List, Mapping, Tuple
from urllib.parse import urlparse
logger = logging.getLogger(__name__)
+# to recognize existing naming pattern
+extensions = [
+ 'zip',
+ 'tar',
+ 'gz', 'tgz',
+ 'bz2', 'bzip2',
+ 'lzma', 'lz',
+ 'xz',
+ 'Z',
+]
+
+version_keywords = [
+ 'cygwin_me',
+ 'w32', 'win32', 'nt', 'cygwin', 'mingw',
+ 'latest', 'alpha', 'beta',
+ 'release', 'stable',
+ 'hppa',
+ 'solaris', 'sunos', 'sun4u', 'sparc', 'sun',
+ 'aix', 'ibm', 'rs6000',
+ 'i386', 'i686',
+ 'linux', 'redhat', 'linuxlibc',
+ 'mips',
+ 'powerpc', 'macos', 'apple', 'darwin', 'macosx', 'powermacintosh',
+ 'unknown',
+ 'netbsd', 'freebsd',
+ 'sgi', 'irix',
+]
+
+# Match a filename into components.
+#
+# We use Debian's release number heuristic: A release number starts
+# with a digit, and is followed by alphanumeric characters or any of
+# _, ., +, :, ~ and -
+#
+# We hardcode a list of possible extensions, as this release number
+# scheme would match them too... We match on any combination of those.
+#
+# Greedy matching is done right to left (we only match the extension
+# greedily with +, software_name and release_number are matched lazily
+# with +? and *?).
+
+pattern = r'''
+^
+(?:
+ # We have a software name and a release number, separated with a
+ # -, _ or dot.
+ (?P<software_name1>.+?[-_.])
+ (?P<release_number>(%(vkeywords)s|[0-9][0-9a-zA-Z_.+:~-]*?)+)
+|
+ # We couldn't match a release number, put everything in the
+ # software name.
+ (?P<software_name2>.+?)
+)
+(?P<extension>(?:\.(?:%(extensions)s))+)
+$
+''' % {
+ 'extensions': '|'.join(extensions),
+ 'vkeywords': '|'.join('%s[-]?' % k for k in version_keywords),
+}
+
+
+def get_version(uri: str) -> str:
+ """Extract version from tarball uri
+
+ Args:
+ uri (str): Tarball URI
+
+ Returns:
+ Version detected
+
+ Example:
+ For uri = https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz
+
+ >>> get_version(uri)
+ '0.2.0'
+
+ For uri = 8sync-0.3.0.tar.gz
+
+ >>> get_version(uri)
+ '0.3.0'
+
+ """
+ filename = path.split(uri)[-1]
+ m = re.match(pattern, filename,
+ flags=re.VERBOSE | re.IGNORECASE)
+ if m:
+ d = m.groupdict()
+ if d['software_name1'] and d['release_number']:
+ return d['release_number']
+ if d['software_name2']:
+ return d['software_name2']
+
+ return ''
+
+
def load_raw_data(url: str) -> List[Dict]:
"""Load the raw json from the tree.json.gz
@@ -99,7 +196,8 @@
return projects, artifacts
-def find_artifacts(filesystem: List[Dict], url: str) -> List[Dict]:
+def find_artifacts(
+ filesystem: List[Dict], url: str) -> List[Mapping[str, Any]]:
"""Recursively list artifacts present in the folder and subfolders for a
particular package url.
@@ -111,21 +209,33 @@
url: URL of the corresponding package
Returns
- List of tarball urls and their associated metadata (time, length).
- For example:
+ List of tarball urls and their associated metadata (time, length,
+ filename, version). For example:
.. code-block:: python
[
- {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.3.tar.gz',
- 'time': 1071002600,
- 'length': 543},
- {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.4.tar.gz',
- 'time': 1071078759,
- 'length': 456},
- {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.5.tar.gz',
- 'time': 1074278633,
- 'length': 251},
+ {
+ 'url': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.3.tar.gz',
+ 'time': 1071002600,
+ 'filename': '3DLDF-1.1.3.tar.gz',
+ 'version': '1.1.3',
+ 'length': 543
+ },
+ {
+ 'url': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.4.tar.gz',
+ 'time': 1071078759,
+ 'filename': '3DLDF-1.1.4.tar.gz',
+ 'version': '1.1.4',
+ 'length': 456
+ },
+ {
+ 'url': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.5.tar.gz',
+ 'time': 1074278633,
+ 'filename': '3DLDF-1.1.5.tar.gz',
+ 'version': '1.1.5',
+ 'length': 251
+ },
...
]
@@ -136,10 +246,13 @@
filename = info_file['name']
if filetype == 'file':
if check_filename_is_archive(filename):
+ uri = url + filename
artifacts.append({
- 'archive': url + filename,
+ 'url': uri,
+ 'filename': filename,
'time': int(info_file['time']),
'length': int(info_file['size']),
+ 'version': get_version(filename),
})
# It will recursively check for artifacts in all sub-folders
elif filetype == 'directory':

File Metadata

Mime Type
text/plain
Expires
Wed, Dec 18, 1:15 AM (2 d, 10 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3225952

Event Timeline