Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F11023465
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
23 KB
Subscribers
None
View Options
diff --git a/requirements.txt b/requirements.txt
index 51e86f4..bf722f9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,8 +1,9 @@
SQLAlchemy
arrow
python_debian
requests
setuptools
xmltodict
iso8601
beautifulsoup4
+pytz
diff --git a/swh/lister/gnu/lister.py b/swh/lister/gnu/lister.py
index 381c02c..5179462 100644
--- a/swh/lister/gnu/lister.py
+++ b/swh/lister/gnu/lister.py
@@ -1,102 +1,102 @@
# Copyright (C) 2019 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import logging
from swh.scheduler import utils
from swh.lister.core.simple_lister import SimpleLister
from swh.lister.gnu.models import GNUModel
from swh.lister.gnu.tree import GNUTree
logger = logging.getLogger(__name__)
class GNULister(SimpleLister):
MODEL = GNUModel
LISTER_NAME = 'gnu'
instance = 'gnu'
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.gnu_tree = GNUTree('https://ftp.gnu.org/tree.json.gz')
def task_dict(self, origin_type, origin_url, **kwargs):
"""Return task format dict
This is overridden from the lister_base as more information is
needed for the ingestion task creation.
This creates tasks with args and kwargs set, for example:
.. code-block:: python
args:
kwargs: {
'url': 'https://ftp.gnu.org/gnu/3dldf/',
'artifacts': [{
'url': 'https://...',
'time': 1071002600,
'length': 128},
...
]}
"""
artifacts = self.gnu_tree.artifacts[origin_url]
return utils.create_task_dict(
'load-%s' % origin_type,
kwargs.get('policy', 'oneshot'),
url=origin_url,
artifacts=artifacts)
def safely_issue_request(self, identifier):
"""Bypass the implementation. It's now the GNUTree which deals with
querying the gnu mirror.
As an implementation detail, we cannot change simply the base
SimpleLister as other implementation still uses it. This shall be part
of another refactoring pass.
"""
return None
def list_packages(self, response):
"""List the actual gnu origins (package name) with their name, url and
associated tarballs.
Args:
response: Unused
Returns:
List of packages name, url, last modification time
.. code-block:: python
[
{'name': '3dldf',
'url': 'https://ftp.gnu.org/gnu/3dldf/',
'time_modified': 1071002600},
{'name': '8sync',
'url': 'https://ftp.gnu.org/gnu/8sync/',
'time_modified': 1480991830},
...
]
"""
return list(self.gnu_tree.projects.values())
def get_model_from_repo(self, repo):
"""Transform from repository representation to model
"""
return {
'uid': repo['name'],
'name': repo['name'],
'full_name': repo['name'],
'html_url': repo['url'],
'origin_url': repo['url'],
- 'time_last_updated': int(repo['time_modified']),
+ 'time_last_updated': repo['time_modified'],
'origin_type': 'tar',
}
diff --git a/swh/lister/gnu/models.py b/swh/lister/gnu/models.py
index 04b4a5e..bb78f85 100644
--- a/swh/lister/gnu/models.py
+++ b/swh/lister/gnu/models.py
@@ -1,17 +1,17 @@
# Copyright (C) 2019 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-from sqlalchemy import Column, String, Integer
+from sqlalchemy import Column, DateTime, Integer, String
from ..core.models import ModelBase
class GNUModel(ModelBase):
"""a GNU repository representation
"""
__tablename__ = 'gnu_repo'
uid = Column(String, primary_key=True)
- time_last_updated = Column(Integer)
+ time_last_updated = Column(DateTime)
diff --git a/swh/lister/gnu/tests/test_tree.py b/swh/lister/gnu/tests/test_tree.py
index 4137a74..0e6193b 100644
--- a/swh/lister/gnu/tests/test_tree.py
+++ b/swh/lister/gnu/tests/test_tree.py
@@ -1,223 +1,223 @@
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import json
import pytest
from os import path
from swh.lister.gnu.tree import (
GNUTree, find_artifacts, check_filename_is_archive, load_raw_data,
get_version, format_date
)
def test_load_raw_data_from_query(requests_mock_datadir):
    """Fetching the tree listing by URL returns a two-element list."""
    listing = load_raw_data('https://ftp.gnu.org/tree.json.gz')

    assert listing is not None
    assert isinstance(listing, list)
    assert len(listing) == 2
def test_load_raw_data_from_query_failure(requests_mock_datadir):
    """Querying a url that cannot be served raises a ValueError."""
    unreachable_url = 'https://ftp2.gnu.org/tree.unknown.gz'

    with pytest.raises(ValueError, match='Error during query'):
        load_raw_data(unreachable_url)
def test_load_raw_data_from_file(datadir):
    """Reading the tree listing from a local path returns a two-element list.
    """
    local_archive = path.join(datadir, 'https_ftp.gnu.org', 'tree.json.gz')
    listing = load_raw_data(local_archive)

    assert listing is not None
    assert isinstance(listing, list)
    assert len(listing) == 2
def test_load_raw_data_from_file_failure(datadir):
    """Reading from a path that does not exist raises FileNotFoundError."""
    missing_archive = path.join(datadir, 'ftp.gnu.org2', 'tree.json.gz')

    with pytest.raises(FileNotFoundError):
        load_raw_data(missing_archive)
def test_tree_json(requests_mock_datadir):
tree_json = GNUTree('https://ftp.gnu.org/tree.json.gz')
assert tree_json.projects['https://ftp.gnu.org/gnu/8sync/'] == {
'name': '8sync',
- 'time_modified': '1489817408',
+ 'time_modified': '2017-03-18T06:10:08+00:00',
'url': 'https://ftp.gnu.org/gnu/8sync/'
}
assert tree_json.projects['https://ftp.gnu.org/gnu/3dldf/'] == {
'name': '3dldf',
- 'time_modified': '1386961236',
+ 'time_modified': '2013-12-13T19:00:36+00:00',
'url': 'https://ftp.gnu.org/gnu/3dldf/'
}
assert tree_json.projects['https://ftp.gnu.org/gnu/a2ps/'] == {
'name': 'a2ps',
- 'time_modified': '1198900505',
+ 'time_modified': '2007-12-29T03:55:05+00:00',
'url': 'https://ftp.gnu.org/gnu/a2ps/'
}
assert tree_json.projects['https://ftp.gnu.org/old-gnu/xshogi/'] == {
'name': 'xshogi',
- 'time_modified': '1059822922',
+ 'time_modified': '2003-08-02T11:15:22+00:00',
'url': 'https://ftp.gnu.org/old-gnu/xshogi/'
}
assert tree_json.artifacts['https://ftp.gnu.org/old-gnu/zlibc/'] == [
{
'url': 'https://ftp.gnu.org/old-gnu/zlibc/zlibc-0.9b.tar.gz', # noqa
'length': 90106,
- 'time': '1997-03-10T09:00:00',
+ 'time': '1997-03-10T08:00:00+00:00',
'filename': 'zlibc-0.9b.tar.gz',
'version': '0.9b',
},
{
'url': 'https://ftp.gnu.org/old-gnu/zlibc/zlibc-0.9e.tar.gz', # noqa
'length': 89625,
- 'time': '1997-04-07T09:00:00',
+ 'time': '1997-04-07T07:00:00+00:00',
'filename': 'zlibc-0.9e.tar.gz',
'version': '0.9e',
}
]
def test_tree_json_failures(requests_mock_datadir):
    """Accessing artifacts or projects of a tree whose url cannot be queried
    raises a ValueError mentioning that url.
    """
    url = 'https://unknown/tree.json.gz'
    expected_error = 'Error during query to %s' % url
    broken_tree = GNUTree(url)

    # Nothing is asserted at construction time: the error is expected when
    # the properties are read.
    with pytest.raises(ValueError, match=expected_error):
        broken_tree.artifacts['https://ftp.gnu.org/gnu/3dldf/']

    with pytest.raises(ValueError, match=expected_error):
        broken_tree.projects['https://ftp.gnu.org/old-gnu/xshogi/']
def test_find_artifacts_small_sample(datadir):
expected_artifacts = [
{
'url': '/root/artanis/artanis-0.2.1.tar.bz2',
- 'time': '2017-05-19T16:59:39',
+ 'time': '2017-05-19T14:59:39+00:00',
'length': 424081,
'version': '0.2.1',
'filename': 'artanis-0.2.1.tar.bz2',
},
{
'url': '/root/xboard/winboard/winboard-4_0_0-src.zip', # noqa
- 'time': '1998-06-21T11:55:00',
+ 'time': '1998-06-21T09:55:00+00:00',
'length': 1514448,
'version': '4_0_0-src',
'filename': 'winboard-4_0_0-src.zip',
},
{
'url': '/root/xboard/xboard-3.6.2.tar.gz', # noqa
- 'time': '1997-07-25T09:00:00',
+ 'time': '1997-07-25T07:00:00+00:00',
'length': 450164,
'version': '3.6.2',
'filename': 'xboard-3.6.2.tar.gz',
},
{
'url': '/root/xboard/xboard-4.0.0.tar.gz', # noqa
- 'time': '1998-06-21T11:55:00',
+ 'time': '1998-06-21T09:55:00+00:00',
'length': 514951,
'version': '4.0.0',
'filename': 'xboard-4.0.0.tar.gz',
},
]
file_structure = json.load(open(path.join(datadir, 'tree.min.json')))
actual_artifacts = find_artifacts(file_structure, '/root/')
assert actual_artifacts == expected_artifacts
def test_find_artifacts(datadir):
    """The full sample tree yields the expected number of archives."""
    # Use a context manager so the fixture file is closed deterministically
    # instead of leaking the handle until garbage collection.
    with open(path.join(datadir, 'tree.json')) as f:
        file_structure = json.load(f)

    actual_artifacts = find_artifacts(file_structure, '/root/')
    assert len(actual_artifacts) == 42 + 3  # tar + zip
def test_check_filename_is_archive():
    """Archive detection accepts zip/tar names and rejects everything else.
    """
    archive_names = ['abc.xy.zip', 'cvb.zip', 'abc.tar.bz2', 'something.tar']
    other_names = ['abc.tar.gz.sig', 'abc', 'something.zip2', 'foo.tar.']

    for filename in archive_names:
        assert check_filename_is_archive(filename) is True

    for filename in other_names:
        assert check_filename_is_archive(filename) is False
def test_get_version():
    """Each tarball url (or bare filename) maps to the expected version.

    """
    # (input uri or filename, version expected from get_version)
    cases = [
        ('https://gnu.org/sthg/info-2.1.0.tar.gz', '2.1.0'),
        ('https://gnu.org/sthg/info-2.1.2.zip', '2.1.2'),
        ('https://sthg.org/gnu/sthg.tar.gz', 'sthg'),
        ('https://sthg.org/gnu/DLDF-1.1.4.tar.gz', '1.1.4'),
        ('https://sthg.org/gnu/anubis-latest.tar.bz2', 'latest'),
        ('https://ftp.org/gnu/aris-w32.zip', 'w32'),
        ('https://ftp.org/gnu/aris-w32-2.2.zip', 'w32-2.2'),
        ('https://ftp.org/gnu/autogen.info.tar.gz', 'autogen.info'),
        ('https://ftp.org/gnu/crypto-build-demo.tar.gz',
         'crypto-build-demo'),
        ('https://ftp.org/gnu/clue+clio+xit.clisp.tar.gz',
         'clue+clio+xit.clisp'),
        ('https://ftp.org/gnu/clue+clio.for-pcl.tar.gz',
         'clue+clio.for-pcl'),
        ('https://ftp.org/gnu/clisp-hppa2.0-hp-hpux10.20.tar.gz',
         'hppa2.0-hp-hpux10.20'),
        ('clisp-i386-solaris2.6.tar.gz', 'i386-solaris2.6'),
        ('clisp-mips-sgi-irix6.5.tar.gz', 'mips-sgi-irix6.5'),
        ('clisp-powerpc-apple-macos.tar.gz', 'powerpc-apple-macos'),
        ('clisp-powerpc-unknown-linuxlibc6.tar.gz',
         'powerpc-unknown-linuxlibc6'),
        ('clisp-rs6000-ibm-aix3.2.5.tar.gz', 'rs6000-ibm-aix3.2.5'),
        ('clisp-sparc-redhat51-linux.tar.gz', 'sparc-redhat51-linux'),
        ('clisp-sparc-sun-solaris2.4.tar.gz', 'sparc-sun-solaris2.4'),
        ('clisp-sparc-sun-sunos4.1.3_U1.tar.gz',
         'sparc-sun-sunos4.1.3_U1'),
        ('clisp-2.25.1-powerpc-apple-MacOSX.tar.gz',
         '2.25.1-powerpc-apple-MacOSX'),
        ('clisp-2.27-PowerMacintosh-powerpc-Darwin-1.3.7.tar.gz',
         '2.27-PowerMacintosh-powerpc-Darwin-1.3.7'),
        ('clisp-2.27-i686-unknown-Linux-2.2.19.tar.gz',
         '2.27-i686-unknown-Linux-2.2.19'),
        ('clisp-2.28-i386-i386-freebsd-4.3-RELEASE.tar.gz',
         '2.28-i386-i386-freebsd-4.3-RELEASE'),
        ('clisp-2.28-i686-unknown-cygwin_me-4.90-1.3.10.tar.gz',
         '2.28-i686-unknown-cygwin_me-4.90-1.3.10'),
        ('clisp-2.29-i386-i386-freebsd-4.6-STABLE.tar.gz',
         '2.29-i386-i386-freebsd-4.6-STABLE'),
        ('clisp-2.29-i686-unknown-cygwin_nt-5.0-1.3.12.tar.gz',
         '2.29-i686-unknown-cygwin_nt-5.0-1.3.12'),
        ('gcl-2.5.3-ansi-japi-xdr.20030701_mingw32.zip',
         '2.5.3-ansi-japi-xdr.20030701_mingw32'),
        ('gettext-runtime-0.13.1.bin.woe32.zip', '0.13.1.bin.woe32'),
        ('sather-logo_images.tar.gz', 'sather-logo_images'),
        ('sather-specification-000328.html.tar.gz', '000328.html'),
    ]

    for uri, expected_version in cases:
        assert get_version(uri) == expected_version
def test_format_date():
- for t_str, expected_isoformat_date in [
- (1489817408, '2017-03-18T07:10:08'),
- (1386961236, '2013-12-13T20:00:36'),
- ('1198900505', '2007-12-29T04:55:05'),
- (1059822922, '2003-08-02T13:15:22'),
- ('1489817408', '2017-03-18T07:10:08'),
+ for timestamp, expected_isoformat_date in [
+ (1489817408, '2017-03-18T06:10:08+00:00'),
+ (1386961236, '2013-12-13T19:00:36+00:00'),
+ ('1198900505', '2007-12-29T03:55:05+00:00'),
+ (1059822922, '2003-08-02T11:15:22+00:00'),
+ ('1489817408', '2017-03-18T06:10:08+00:00'),
]:
- actual_date = format_date(t_str)
+ actual_date = format_date(timestamp)
assert actual_date == expected_isoformat_date
with pytest.raises(ValueError):
format_date('')
with pytest.raises(TypeError):
format_date(None)
diff --git a/swh/lister/gnu/tree.py b/swh/lister/gnu/tree.py
index b47e1c5..5a4991f 100644
--- a/swh/lister/gnu/tree.py
+++ b/swh/lister/gnu/tree.py
@@ -1,308 +1,309 @@
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-import datetime
import gzip
import json
import logging
import requests
import re
+from datetime import datetime
from os import path
from pathlib import Path
+from pytz import utc
from typing import Any, Dict, List, Mapping, Tuple
from urllib.parse import urlparse
logger = logging.getLogger(__name__)
class GNUTree:
"""Gnu Tree's representation
"""
def __init__(self, url: str):
self.url = url # filepath or uri
u = urlparse(url)
self.base_url = '%s://%s' % (u.scheme, u.netloc)
# Interesting top level directories
self.top_level_directories = ['gnu', 'old-gnu']
# internal state
self._artifacts = {} # type: Dict
self._projects = {} # type: Dict
@property
def projects(self) -> Dict:
if not self._projects:
self._projects, self._artifacts = self._load()
return self._projects
@property
def artifacts(self) -> Dict:
if not self._artifacts:
self._projects, self._artifacts = self._load()
return self._artifacts
def _load(self) -> Tuple[Dict, Dict]:
"""Compute projects and artifacts per project
Returns:
Tuple of dict projects (key project url, value the associated
information) and a dict artifacts (key project url, value the
info_file list)
"""
projects = {}
artifacts = {}
raw_data = load_raw_data(self.url)[0]
for directory in raw_data['contents']:
if directory['name'] not in self.top_level_directories:
continue
infos = directory['contents']
for info in infos:
if info['type'] == 'directory':
package_url = '%s/%s/%s/' % (
self.base_url, directory['name'], info['name'])
package_artifacts = find_artifacts(
info['contents'], package_url)
if package_artifacts != []:
repo_details = {
'name': info['name'],
'url': package_url,
- 'time_modified': info['time'],
+ 'time_modified': format_date(info['time'])
}
artifacts[package_url] = package_artifacts
projects[package_url] = repo_details
return projects, artifacts
def find_artifacts(
        filesystem: List[Dict], url: str) -> List[Mapping[str, Any]]:
    """Walk a package directory listing and collect every archive found in
    it, descending into sub-directories.

    Args:
        filesystem: Content of the package root directory, as a list of
            dicts each describing a file or a directory. The keys read here
            are ``type`` and ``name``, plus ``time`` and ``size`` for files
            and ``contents`` for directories.
        url: URL of the package; it is used as a prefix to build each
            artifact url, so it is expected to end with a ``/``.

    Returns:
        List of archive descriptions, in traversal order. ``time`` is the
        value returned by ``format_date`` and ``version`` the value returned
        by ``get_version``. For example:

        .. code-block:: python

            [
                {
                    'url': '/root/artanis/artanis-0.2.1.tar.bz2',
                    'filename': 'artanis-0.2.1.tar.bz2',
                    'time': '2017-05-19T14:59:39+00:00',
                    'length': 424081,
                    'version': '0.2.1',
                },
                ...
            ]

    """
    found = []  # type: List[Mapping[str, Any]]
    for entry in filesystem:
        kind = entry['type']
        name = entry['name']

        if kind == 'directory':
            # Recurse with the directory name appended to the url prefix.
            found.extend(find_artifacts(entry['contents'], url + name + '/'))
            continue

        # Anything that is neither a directory nor an archive file is ignored.
        if kind != 'file' or not check_filename_is_archive(name):
            continue

        found.append({
            'url': url + name,
            'filename': name,
            'time': format_date(entry['time']),
            'length': int(entry['size']),
            'version': get_version(name),
        })

    return found
def check_filename_is_archive(filename: str) -> bool:
    """Tell whether a filename looks like a zip or tar archive.

    A name is considered an archive when its last suffix is ``.zip`` or
    ``.tar``, or when its second-to-last suffix is ``.tar`` (``.tar.x``
    where x could be anything, e.g. ``.tar.gz``). Other dots in the name
    (typically a dotted version number) do not affect the result.

    Args:
        filename: name of the file whose extension needs to be checked.

    Returns:
        Whether filename is an archive or not

    Example:

    >>> check_filename_is_archive('abc.zip')
    True
    >>> check_filename_is_archive('abc.tar.gz')
    True
    >>> check_filename_is_archive('bac.tar')
    True
    >>> check_filename_is_archive('foo-1.2.tar')
    True
    >>> check_filename_is_archive('abc.tar.gz.sig')
    False
    >>> check_filename_is_archive('foobar.tar.')
    False

    """
    file_path = Path(filename)
    file_suffixes = file_path.suffixes
    # Lazy logger arguments: the message is only built if DEBUG is enabled.
    logger.debug('Path(%s).suffixed: %s', filename, file_suffixes)

    # A name ending with a dot has no usable extension. Check it explicitly
    # rather than relying on how pathlib reports suffixes for such names.
    if file_path.name.endswith('.') or not file_suffixes:
        return False

    # Plain '.zip' / '.tar', whatever the number of dots earlier in the name
    # (e.g. 'foo-1.2.tar').
    if file_suffixes[-1] in ('.zip', '.tar'):
        return True

    # Compressed tarball: '.tar' followed by exactly one more suffix.
    return len(file_suffixes) > 1 and file_suffixes[-2] == '.tar'
# Archive extensions found in existing naming patterns.
extensions = [
    'zip', 'tar', 'gz', 'tgz', 'bz2', 'bzip2', 'lzma', 'lz', 'xz', 'Z',
]

# Words accepted as (part of) a release number even though they do not start
# with a digit. They are tried in this order by the pattern below.
version_keywords = [
    'cygwin_me',
    'w32',
    'win32',
    'nt',
    'cygwin',
    'mingw',
    'latest',
    'alpha',
    'beta',
    'release',
    'stable',
    'hppa',
    'solaris',
    'sunos',
    'sun4u',
    'sparc',
    'sun',
    'aix',
    'ibm',
    'rs6000',
    'i386',
    'i686',
    'linux',
    'redhat',
    'linuxlibc',
    'mips',
    'powerpc',
    'macos',
    'apple',
    'darwin',
    'macosx',
    'powermacintosh',
    'unknown',
    'netbsd',
    'freebsd',
    'sgi',
    'irix',
]

# Split a filename into its components (meant for re.VERBOSE matching).
#
# The release number follows Debian's heuristic: it starts with a digit and
# continues with alphanumeric characters or any of ., +, :, ~ and -
#
# Since that scheme would swallow the extensions as well, the possible
# extensions are hardcoded and any combination of them is accepted.
#
# Matching is greedy from right to left: only the extension uses a greedy +,
# while software_name and release_number are matched lazily (+? and *?).
pattern = r'''
^
(?:
# We have a software name and a release number, separated with a
# -, _ or dot.
(?P<software_name1>.+?[-_.])
(?P<release_number>(%(vkeywords)s|[0-9][0-9a-zA-Z_.+:~-]*?)+)
|
# We couldn't match a release number, put everything in the
# software name.
(?P<software_name2>.+?)
)
(?P<extension>(?:\.(?:%(extensions)s))+)
$
''' % dict(
    extensions='|'.join(extensions),
    # Every keyword may be followed by an optional dash.
    vkeywords='|'.join(keyword + '[-]?' for keyword in version_keywords),
)
def get_version(uri: str) -> str:
    """Extract the version from a tarball uri or filename.

    Only the last path component of ``uri`` is matched against the
    module-level ``pattern``.

    Args:
        uri (str): Tarball URI

    Returns:
        The release number when both a software name and a release number
        are recognized; otherwise the name without its archive extension;
        an empty string when nothing matches.

    Example:
        For uri = https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz

        >>> get_version(uri)
        '0.2.0'

        For uri = 8sync-0.3.0.tar.gz

        >>> get_version(uri)
        '0.3.0'

    """
    basename = path.split(uri)[-1]
    match = re.match(pattern, basename, flags=re.VERBOSE | re.IGNORECASE)
    if match is None:
        return ''

    groups = match.groupdict()
    release = groups['release_number']
    if groups['software_name1'] and release:
        return release

    # No release number recognized: fall back to the bare software name.
    return groups['software_name2'] or ''
def load_raw_data(url: str) -> List[Dict]:
    """Load and decode the gzipped json tree listing.

    Args:
        url: Either an ``http://`` / ``https://`` url, fetched with
            ``requests``, or a local path to the gzipped file.

    Returns:
        The raw json list

    Raises:
        ValueError: when the http response is not ok.

    """
    is_remote = url.startswith(('http://', 'https://'))
    if not is_remote:
        # Anything that is not an http(s) url is treated as a local path.
        with gzip.open(url, 'r') as archive:
            payload = archive.read()
    else:
        response = requests.get(url, allow_redirects=True)
        if not response.ok:
            raise ValueError('Error during query to %s' % url)
        payload = gzip.decompress(response.content)

    return json.loads(payload.decode('utf-8'))
def format_date(timestamp: str) -> str:
"""Format a string timestamp to an isoformat string
"""
- return datetime.datetime.fromtimestamp(int(timestamp)).isoformat()
+ return datetime.fromtimestamp(int(timestamp), tz=utc).isoformat()
File Metadata
Details
Attached
Mime Type
text/x-diff
Expires
Thu, Sep 18, 4:37 PM (1 d, 2 h)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3248104
Attached To
rDLS Listers
Event Timeline
Log In to Comment