D2147.diff
D2147: lister.gnu: Standardize arguments to pass to the loader tar
diff --git a/swh/lister/gnu/lister.py b/swh/lister/gnu/lister.py
--- a/swh/lister/gnu/lister.py
+++ b/swh/lister/gnu/lister.py
@@ -33,22 +33,23 @@
.. code-block:: python
- args: ['https://ftp.gnu.org/gnu/3dldf/']
+ args:
kwargs: {
- 'tarballs': [{
- 'archive': 'https://...',
+ 'url': 'https://ftp.gnu.org/gnu/3dldf/',
+ 'artifacts': [{
+ 'url': 'https://...',
'time': 1071002600,
'length': 128},
...
]}
"""
- tarballs = self.gnu_tree.artifacts[origin_url]
+ artifacts = self.gnu_tree.artifacts[origin_url]
return utils.create_task_dict(
'load-%s' % origin_type,
kwargs.get('policy', 'oneshot'),
- origin_url,
- tarballs=tarballs)
+ url=origin_url,
+ artifacts=artifacts)
def safely_issue_request(self, identifier):
"""Bypass the implementation. It's now the GNUTree which deals with
diff --git a/swh/lister/gnu/tests/test_lister.py b/swh/lister/gnu/tests/test_lister.py
--- a/swh/lister/gnu/tests/test_lister.py
+++ b/swh/lister/gnu/tests/test_lister.py
@@ -9,7 +9,7 @@
logger = logging.getLogger(__name__)
-def test_lister_no_page_check_results(swh_listers, requests_mock_datadir):
+def test_gnu_lister(swh_listers, requests_mock_datadir):
lister = swh_listers['gnu']
lister.run()
@@ -21,21 +21,23 @@
assert row['type'] == 'load-tar'
# arguments check
args = row['arguments']['args']
- assert len(args) == 1
+ assert len(args) == 0
- url = args[0]
+ # kwargs
+ kwargs = row['arguments']['kwargs']
+ assert set(kwargs.keys()) == {'url', 'artifacts'}
+
+ url = kwargs['url']
assert url.startswith('https://ftp.gnu.org')
url_suffix = url.split('https://ftp.gnu.org')[1]
assert 'gnu' in url_suffix or 'old-gnu' in url_suffix
- # kwargs
- kwargs = row['arguments']['kwargs']
- assert list(kwargs.keys()) == ['tarballs']
-
- tarballs = kwargs['tarballs']
- # check the tarball's structure
- tarball = tarballs[0]
- assert set(tarball.keys()) == set(['archive', 'length', 'time'])
+ artifacts = kwargs['artifacts']
+ # check the artifact's structure
+ artifact = artifacts[0]
+ assert set(artifact.keys()) == {
+ 'url', 'length', 'time', 'filename', 'version'
+ }
assert row['policy'] == 'oneshot'
diff --git a/swh/lister/gnu/tests/test_tree.py b/swh/lister/gnu/tests/test_tree.py
--- a/swh/lister/gnu/tests/test_tree.py
+++ b/swh/lister/gnu/tests/test_tree.py
@@ -9,7 +9,8 @@
from os import path
from swh.lister.gnu.tree import (
- GNUTree, find_artifacts, check_filename_is_archive, load_raw_data
+ GNUTree, find_artifacts, check_filename_is_archive, load_raw_data,
+ get_version
)
@@ -69,14 +70,18 @@
assert tree_json.artifacts['https://ftp.gnu.org/old-gnu/zlibc/'] == [
{
- 'archive': 'https://ftp.gnu.org/old-gnu/zlibc/zlibc-0.9b.tar.gz', # noqa
+ 'url': 'https://ftp.gnu.org/old-gnu/zlibc/zlibc-0.9b.tar.gz', # noqa
'length': 90106,
- 'time': 857980800
+ 'time': 857980800,
+ 'filename': 'zlibc-0.9b.tar.gz',
+ 'version': '0.9b',
},
{
- 'archive': 'https://ftp.gnu.org/old-gnu/zlibc/zlibc-0.9e.tar.gz', # noqa
+ 'url': 'https://ftp.gnu.org/old-gnu/zlibc/zlibc-0.9e.tar.gz', # noqa
'length': 89625,
- 'time': 860396400
+ 'time': 860396400,
+ 'filename': 'zlibc-0.9e.tar.gz',
+ 'version': '0.9e',
}
]
@@ -93,38 +98,46 @@
def test_find_artifacts_small_sample(datadir):
- expected_tarballs = [
+ expected_artifacts = [
{
- 'archive': '/root/artanis/artanis-0.2.1.tar.bz2',
+ 'url': '/root/artanis/artanis-0.2.1.tar.bz2',
'time': 1495205979,
'length': 424081,
+ 'version': '0.2.1',
+ 'filename': 'artanis-0.2.1.tar.bz2',
},
{
- 'archive': '/root/xboard/winboard/winboard-4_0_0-src.zip', # noqa
+ 'url': '/root/xboard/winboard/winboard-4_0_0-src.zip', # noqa
'time': 898422900,
- 'length': 1514448
+ 'length': 1514448,
+ 'version': '4_0_0-src',
+ 'filename': 'winboard-4_0_0-src.zip',
},
{
- 'archive': '/root/xboard/xboard-3.6.2.tar.gz', # noqa
+ 'url': '/root/xboard/xboard-3.6.2.tar.gz', # noqa
'time': 869814000,
'length': 450164,
+ 'version': '3.6.2',
+ 'filename': 'xboard-3.6.2.tar.gz',
},
{
- 'archive': '/root/xboard/xboard-4.0.0.tar.gz', # noqa
+ 'url': '/root/xboard/xboard-4.0.0.tar.gz', # noqa
'time': 898422900,
'length': 514951,
+ 'version': '4.0.0',
+ 'filename': 'xboard-4.0.0.tar.gz',
},
]
file_structure = json.load(open(path.join(datadir, 'tree.min.json')))
- actual_tarballs = find_artifacts(file_structure, '/root/')
- assert actual_tarballs == expected_tarballs
+ actual_artifacts = find_artifacts(file_structure, '/root/')
+ assert actual_artifacts == expected_artifacts
def test_find_artifacts(datadir):
file_structure = json.load(open(path.join(datadir, 'tree.json')))
- actual_tarballs = find_artifacts(file_structure, '/root/')
- assert len(actual_tarballs) == 42 + 3 # tar + zip
+ actual_artifacts = find_artifacts(file_structure, '/root/')
+ assert len(actual_artifacts) == 42 + 3 # tar + zip
def test_check_filename_is_archive():
@@ -133,3 +146,61 @@
for ext in ['abc.tar.gz.sig', 'abc', 'something.zip2', 'foo.tar.']:
assert check_filename_is_archive(ext) is False
+
+
+def test_get_version():
+ """From url to branch name should yield something relevant
+
+ """
+ for url, expected_branchname in [
+ ('https://gnu.org/sthg/info-2.1.0.tar.gz', '2.1.0'),
+ ('https://gnu.org/sthg/info-2.1.2.zip', '2.1.2'),
+ ('https://sthg.org/gnu/sthg.tar.gz', 'sthg'),
+ ('https://sthg.org/gnu/DLDF-1.1.4.tar.gz', '1.1.4'),
+ ('https://sthg.org/gnu/anubis-latest.tar.bz2', 'latest'),
+ ('https://ftp.org/gnu/aris-w32.zip', 'w32'),
+ ('https://ftp.org/gnu/aris-w32-2.2.zip', 'w32-2.2'),
+ ('https://ftp.org/gnu/autogen.info.tar.gz', 'autogen.info'),
+ ('https://ftp.org/gnu/crypto-build-demo.tar.gz',
+ 'crypto-build-demo'),
+ ('https://ftp.org/gnu/clue+clio+xit.clisp.tar.gz',
+ 'clue+clio+xit.clisp'),
+ ('https://ftp.org/gnu/clue+clio.for-pcl.tar.gz',
+ 'clue+clio.for-pcl'),
+ ('https://ftp.org/gnu/clisp-hppa2.0-hp-hpux10.20.tar.gz',
+ 'hppa2.0-hp-hpux10.20'),
+ ('clisp-i386-solaris2.6.tar.gz', 'i386-solaris2.6'),
+ ('clisp-mips-sgi-irix6.5.tar.gz', 'mips-sgi-irix6.5'),
+ ('clisp-powerpc-apple-macos.tar.gz', 'powerpc-apple-macos'),
+ ('clisp-powerpc-unknown-linuxlibc6.tar.gz',
+ 'powerpc-unknown-linuxlibc6'),
+
+ ('clisp-rs6000-ibm-aix3.2.5.tar.gz', 'rs6000-ibm-aix3.2.5'),
+ ('clisp-sparc-redhat51-linux.tar.gz', 'sparc-redhat51-linux'),
+ ('clisp-sparc-sun-solaris2.4.tar.gz', 'sparc-sun-solaris2.4'),
+ ('clisp-sparc-sun-sunos4.1.3_U1.tar.gz',
+ 'sparc-sun-sunos4.1.3_U1'),
+ ('clisp-2.25.1-powerpc-apple-MacOSX.tar.gz',
+ '2.25.1-powerpc-apple-MacOSX'),
+ ('clisp-2.27-PowerMacintosh-powerpc-Darwin-1.3.7.tar.gz',
+ '2.27-PowerMacintosh-powerpc-Darwin-1.3.7'),
+ ('clisp-2.27-i686-unknown-Linux-2.2.19.tar.gz',
+ '2.27-i686-unknown-Linux-2.2.19'),
+ ('clisp-2.28-i386-i386-freebsd-4.3-RELEASE.tar.gz',
+ '2.28-i386-i386-freebsd-4.3-RELEASE'),
+ ('clisp-2.28-i686-unknown-cygwin_me-4.90-1.3.10.tar.gz',
+ '2.28-i686-unknown-cygwin_me-4.90-1.3.10'),
+ ('clisp-2.29-i386-i386-freebsd-4.6-STABLE.tar.gz',
+ '2.29-i386-i386-freebsd-4.6-STABLE'),
+ ('clisp-2.29-i686-unknown-cygwin_nt-5.0-1.3.12.tar.gz',
+ '2.29-i686-unknown-cygwin_nt-5.0-1.3.12'),
+ ('gcl-2.5.3-ansi-japi-xdr.20030701_mingw32.zip',
+ '2.5.3-ansi-japi-xdr.20030701_mingw32'),
+ ('gettext-runtime-0.13.1.bin.woe32.zip', '0.13.1.bin.woe32'),
+ ('sather-logo_images.tar.gz', 'sather-logo_images'),
+ ('sather-specification-000328.html.tar.gz', '000328.html')
+
+ ]:
+ actual_branchname = get_version(url)
+
+ assert actual_branchname == expected_branchname
diff --git a/swh/lister/gnu/tree.py b/swh/lister/gnu/tree.py
--- a/swh/lister/gnu/tree.py
+++ b/swh/lister/gnu/tree.py
@@ -7,15 +7,112 @@
import json
import logging
import requests
+import re
+from os import path
from pathlib import Path
-from typing import Dict, Tuple, List
+from typing import Any, Dict, List, Mapping, Tuple
from urllib.parse import urlparse
logger = logging.getLogger(__name__)
+# to recognize existing naming pattern
+extensions = [
+ 'zip',
+ 'tar',
+ 'gz', 'tgz',
+ 'bz2', 'bzip2',
+ 'lzma', 'lz',
+ 'xz',
+ 'Z',
+]
+
+version_keywords = [
+ 'cygwin_me',
+ 'w32', 'win32', 'nt', 'cygwin', 'mingw',
+ 'latest', 'alpha', 'beta',
+ 'release', 'stable',
+ 'hppa',
+ 'solaris', 'sunos', 'sun4u', 'sparc', 'sun',
+ 'aix', 'ibm', 'rs6000',
+ 'i386', 'i686',
+ 'linux', 'redhat', 'linuxlibc',
+ 'mips',
+ 'powerpc', 'macos', 'apple', 'darwin', 'macosx', 'powermacintosh',
+ 'unknown',
+ 'netbsd', 'freebsd',
+ 'sgi', 'irix',
+]
+
+# Match a filename into components.
+#
+# We use Debian's release number heuristic: A release number starts
+# with a digit, and is followed by alphanumeric characters or any of
+# ., +, :, ~ and -
+#
+# We hardcode a list of possible extensions, as this release number
+# scheme would match them too... We match on any combination of those.
+#
+# Greedy matching is done right to left (we only match the extension
+# greedily with +, software_name and release_number are matched lazily
+# with +? and *?).
+
+pattern = r'''
+^
+(?:
+ # We have a software name and a release number, separated with a
+ # -, _ or dot.
+ (?P<software_name1>.+?[-_.])
+ (?P<release_number>(%(vkeywords)s|[0-9][0-9a-zA-Z_.+:~-]*?)+)
+|
+ # We couldn't match a release number, put everything in the
+ # software name.
+ (?P<software_name2>.+?)
+)
+(?P<extension>(?:\.(?:%(extensions)s))+)
+$
+''' % {
+ 'extensions': '|'.join(extensions),
+ 'vkeywords': '|'.join('%s[-]?' % k for k in version_keywords),
+}
+
+
+def get_version(uri: str) -> str:
+ """Extract branch name from tarball uri
+
+ Args:
+ uri (str): Tarball URI
+
+ Returns:
+ Version detected
+
+ Example:
+ >>> get_version('https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz')
+ '0.2.0'
+
+ >>> get_version('8sync-0.3.0.tar.gz')
+ '0.3.0'
+
+ """
+ filename = path.split(uri)[-1]
+ m = re.match(pattern, filename,
+ flags=re.VERBOSE | re.IGNORECASE)
+ if m:
+ d = m.groupdict()
+ if d['software_name1'] and d['release_number']:
+ return d['release_number']
+ if d['software_name2']:
+ return d['software_name2']
+
+ return ''
+
+
def load_raw_data(url: str) -> List[Dict]:
"""Load the raw json from the tree.json.gz
@@ -99,7 +196,8 @@
return projects, artifacts
-def find_artifacts(filesystem: List[Dict], url: str) -> List[Dict]:
+def find_artifacts(
+ filesystem: List[Dict], url: str) -> List[Mapping[str, Any]]:
"""Recursively list artifacts present in the folder and subfolders for a
particular package url.
@@ -111,21 +209,33 @@
url: URL of the corresponding package
Returns
- List of tarball urls and their associated metadata (time, length).
- For example:
+ List of tarball urls and their associated metadata (time, length,
+ etc.). For example:
.. code-block:: python
[
- {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.3.tar.gz',
- 'time': 1071002600,
- 'length': 543},
- {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.4.tar.gz',
- 'time': 1071078759,
- 'length': 456},
- {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.5.tar.gz',
- 'time': 1074278633,
- 'length': 251},
+ {
+ 'url': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.3.tar.gz',
+ 'time': 1071002600,
+ 'filename': '3DLDF-1.1.3.tar.gz',
+ 'version': '1.1.3',
+ 'length': 543
+ },
+ {
+ 'url': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.4.tar.gz',
+ 'time': 1071078759,
+ 'filename': '3DLDF-1.1.4.tar.gz',
+ 'version': '1.1.4',
+ 'length': 456
+ },
+ {
+ 'url': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.5.tar.gz',
+ 'time': 1074278633,
+ 'filename': '3DLDF-1.1.5.tar.gz',
+ 'version': '1.1.5',
+ 'length': 251
+ },
...
]
@@ -136,10 +246,13 @@
filename = info_file['name']
if filetype == 'file':
if check_filename_is_archive(filename):
+ uri = url + filename
artifacts.append({
- 'archive': url + filename,
+ 'url': uri,
+ 'filename': filename,
'time': int(info_file['time']),
'length': int(info_file['size']),
+ 'version': get_version(filename),
})
# It will recursively check for artifacts in all sub-folders
elif filetype == 'directory':
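
To illustrate the filename heuristic introduced in tree.py above, here is a
small usage sketch of get_version (assuming swh.lister.gnu.tree is importable
as added in this diff; the expected values are taken verbatim from the
doctest and the test_get_version cases):

    from swh.lister.gnu.tree import get_version

    # A release number following the package name is extracted
    assert get_version('https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz') == '0.2.0'
    # Platform keywords count as (part of) the version
    assert get_version('https://ftp.org/gnu/aris-w32-2.2.zip') == 'w32-2.2'
    # With no recognizable release number, the whole stem is returned
    assert get_version('https://sthg.org/gnu/sthg.tar.gz') == 'sthg'

The same function supplies the 'version' field that find_artifacts now
attaches to every artifact entry.
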