Changeset View
Changeset View
Standalone View
Standalone View
swh/lister/gnu/tree.py
# Copyright (C) 2019 The Software Heritage developers | # Copyright (C) 2019 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import gzip | import gzip | ||||
import json | import json | ||||
import logging | import logging | ||||
import requests | import requests | ||||
import re | |||||
from os import path | |||||
from pathlib import Path | from pathlib import Path | ||||
from typing import Dict, Tuple, List | from typing import Any, Dict, List, Mapping, Tuple | ||||
from urllib.parse import urlparse | from urllib.parse import urlparse | ||||
logger = logging.getLogger(__name__) | logger = logging.getLogger(__name__) | ||||
# Archive extensions recognized at the end of a tarball filename. Any
# combination of them may be chained (e.g. ``.tar.gz``).
extensions = [
    'zip',
    'tar',
    'gz', 'tgz',
    'bz2', 'bzip2',
    'lzma', 'lz',
    'xz',
    'Z',
    # Some GNU projects publish 7-zip archives (e.g. under octave/windows/).
    '7z',
]

# Platform / release-stage keywords which are accepted as part of a release
# number even though they do not start with a digit.
version_keywords = [
    'cygwin_me',
    'w32', 'win32', 'nt', 'cygwin', 'mingw',
    'latest', 'alpha', 'beta',
    'release', 'stable',
    'hppa',
    'solaris', 'sunos', 'sun4u', 'sparc', 'sun',
    'aix', 'ibm', 'rs6000',
    'i386', 'i686',
    'linux', 'redhat', 'linuxlibc',
    'mips',
    'powerpc', 'macos', 'apple', 'darwin', 'macosx', 'powermacintosh',
    'unknown',
    'netbsd', 'freebsd',
    'sgi', 'irix',
]

# Match a filename into components.
#
# We use Debian's release number heuristic: A release number starts
# with a digit, and is followed by alphanumeric characters or any of
# ., +, :, ~ and -
#
# We hardcode a list of possible extensions, as this release number
# scheme would match them too... We match on any combination of those.
#
# Greedy matching is done right to left (we only match the extension
# greedily with +, software_name and release_number are matched lazily
# with +? and *?).
pattern = r'''
^
(?:
    # We have a software name and a release number, separated with a
    # -, _ or dot.
    (?P<software_name1>.+?[-_.])
    (?P<release_number>(%(vkeywords)s|[0-9][0-9a-zA-Z_.+:~-]*?)+)
|
    # We couldn't match a release number, put everything in the
    # software name.
    (?P<software_name2>.+?)
)
(?P<extension>(?:\.(?:%(extensions)s))+)
$
''' % {
    'extensions': '|'.join(extensions),
    'vkeywords': '|'.join('%s[-]?' % k for k in version_keywords),
}

# Compiled once at import time; ``pattern`` is kept as a plain string for
# any external user of that name.
_VERSION_RE = re.compile(pattern, flags=re.VERBOSE | re.IGNORECASE)


def get_version(uri: str) -> str:
    """Extract the version from a tarball uri (or bare filename).

    Only the last path component of ``uri`` is considered. It is matched
    against ``pattern``:

    - when a software name and a release number are both recognized, the
      release number is returned;
    - when only a name and a known archive extension are recognized, the
      name (filename without its archive extensions) is returned;
    - otherwise (e.g. unknown extension) an empty string is returned.

    Args:
        uri (str): Tarball URI or filename

    Returns:
        Version detected, or '' when the filename is not recognized

    Example:
        For uri = https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz

        >>> get_version(uri)
        '0.2.0'

        For uri = 8sync-0.3.0.tar.gz

        >>> get_version(uri)
        '0.3.0'

    """
    filename = path.split(uri)[-1]
    m = _VERSION_RE.match(filename)
    if m is None:
        return ''

    d = m.groupdict()
    if d['software_name1'] and d['release_number']:
        return d['release_number']
    if d['software_name2']:
        # No release number could be isolated: fall back on the name.
        return d['software_name2']
    return ''
def load_raw_data(url: str) -> List[Dict]: | def load_raw_data(url: str) -> List[Dict]: | ||||
"""Load the raw json from the tree.json.gz | """Load the raw json from the tree.json.gz | ||||
Args: | Args: | ||||
url: Tree.json.gz url or path | url: Tree.json.gz url or path | ||||
Returns: | Returns: | ||||
The raw json list | The raw json list | ||||
▲ Show 20 Lines • Show All 67 Lines • ▼ Show 20 Lines | def _load(self) -> Tuple[Dict, Dict]: | ||||
'time_modified': info['time'], | 'time_modified': info['time'], | ||||
} | } | ||||
artifacts[package_url] = package_artifacts | artifacts[package_url] = package_artifacts | ||||
projects[package_url] = repo_details | projects[package_url] = repo_details | ||||
return projects, artifacts | return projects, artifacts | ||||
def find_artifacts(filesystem: List[Dict], url: str) -> List[Dict]: | def find_artifacts( | ||||
filesystem: List[Dict], url: str) -> List[Mapping[str, Any]]: | |||||
"""Recursively list artifacts present in the folder and subfolders for a | """Recursively list artifacts present in the folder and subfolders for a | ||||
particular package url. | particular package url. | ||||
Args: | Args: | ||||
filesystem: File structure of the package root directory. This is a | filesystem: File structure of the package root directory. This is a | ||||
list of Dict representing either file or directory information as | list of Dict representing either file or directory information as | ||||
dict (keys: name, size, time, type). | dict (keys: name, size, time, type). | ||||
url: URL of the corresponding package | url: URL of the corresponding package | ||||
Returns | Returns | ||||
List of tarball urls and their associated metadata (time, length). | List of tarball urls and their associated metadata (time, length, | ||||
For example: | etc...). For example: | ||||
.. code-block:: python | .. code-block:: python | ||||
[ | [ | ||||
{'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.3.tar.gz', | { | ||||
'url': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.3.tar.gz', | |||||
'time': 1071002600, | 'time': 1071002600, | ||||
'length': 543}, | 'filename': '3DLDF-1.1.3.tar.gz', | ||||
{'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.4.tar.gz', | 'version': '1.1.3', | ||||
'length': 543 | |||||
}, | |||||
{ | |||||
'url': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.4.tar.gz', | |||||
'time': 1071078759, | 'time': 1071078759, | ||||
'length': 456}, | 'filename': '3DLDF-1.1.4.tar.gz', | ||||
{'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.5.tar.gz', | 'version': '1.1.4', | ||||
'length': 456 | |||||
}, | |||||
{ | |||||
'url': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.5.tar.gz', | |||||
'time': 1074278633, | 'time': 1074278633, | ||||
'length': 251}, | 'filename': '3DLDF-1.1.5.tar.gz', | ||||
'version': '1.1.5', | |||||
'length': 251 | |||||
}, | |||||
... | ... | ||||
] | ] | ||||
""" | """ | ||||
artifacts = [] | artifacts = [] | ||||
for info_file in filesystem: | for info_file in filesystem: | ||||
filetype = info_file['type'] | filetype = info_file['type'] | ||||
filename = info_file['name'] | filename = info_file['name'] | ||||
if filetype == 'file': | if filetype == 'file': | ||||
if check_filename_is_archive(filename): | if check_filename_is_archive(filename): | ||||
uri = url + filename | |||||
artifacts.append({ | artifacts.append({ | ||||
'archive': url + filename, | 'url': uri, | ||||
'filename': filename, | |||||
'time': int(info_file['time']), | 'time': int(info_file['time']), | ||||
'length': int(info_file['size']), | 'length': int(info_file['size']), | ||||
'version': get_version(filename), | |||||
}) | }) | ||||
# It will recursively check for artifacts in all sub-folders | # It will recursively check for artifacts in all sub-folders | ||||
elif filetype == 'directory': | elif filetype == 'directory': | ||||
tarballs_in_dir = find_artifacts( | tarballs_in_dir = find_artifacts( | ||||
info_file['contents'], | info_file['contents'], | ||||
url + filename + '/') | url + filename + '/') | ||||
artifacts.extend(tarballs_in_dir) | artifacts.extend(tarballs_in_dir) | ||||
Show All 28 Lines | def check_filename_is_archive(filename: str) -> bool: | ||||
""" | """ | ||||
file_suffixes = Path(filename).suffixes | file_suffixes = Path(filename).suffixes | ||||
logger.debug('Path(%s).suffixed: %s' % (filename, file_suffixes)) | logger.debug('Path(%s).suffixed: %s' % (filename, file_suffixes)) | ||||
if len(file_suffixes) == 1 and file_suffixes[-1] in ('.zip', '.tar'): | if len(file_suffixes) == 1 and file_suffixes[-1] in ('.zip', '.tar'): | ||||
return True | return True | ||||
elif len(file_suffixes) > 1: | elif len(file_suffixes) > 1: | ||||
if file_suffixes[-1] == '.zip' or file_suffixes[-2] == '.tar': | if file_suffixes[-1] == '.zip' or file_suffixes[-2] == '.tar': | ||||
return True | return True | ||||
return False | return False | ||||
vlorentzUnsubmitted Not Done Inline Actionsvlorentz: ```
.format(
extensions=...
vkeywords=...
``` | |||||
Done Inline ActionsWell, i tried and it does not work. ardumont: Well, i tried and it does not work.
That fails the tests somehow... | |||||
Done Inline Actionspattern1 is the current implem' (i renamed the global variable to use upper case in their name): collecting ... ('pattern1: \n' '^\n' '(?:\n' ' # We have a software name and a release number, separated with a\n' ' # -, _ or dot.\n' ' (?P<software_name1>.+?[-_.])\n' ' ' '(?P<release_number>(cygwin_me[-]?|w32[-]?|win32[-]?|nt[-]?|cygwin[-]?|mingw[-]?|latest[-]?|alpha[-]?|beta[-]?|release[-]?|stable[-]?|hppa[-]?|solaris[-]?|sunos[-]?|sun4u[-]?|sparc[-]?|sun[-]?|aix[-]?|ibm[-]?|rs6000[-]?|i386[-]?|i686[-]?|linux[-]?|redhat[-]?|linuxlibc[-]?|mips[-]?|powerpc[-]?|macos[-]?|apple[-]?|darwin[-]?|macosx[-]?|powermacintosh[-]?|unknown[-]?|netbsd[-]?|freebsd[-]?|sgi[-]?|irix[-]?|[0-9][0-9a-zA-Z_.+:~-]*?)+)\n' '|\n' " # We couldn't match a release number, put everything in the\n" ' # software name.\n' ' (?P<software_name2>.+?)\n' ')\n' pattern2 is the format call: PATTERN2 = r''' ^ (?: # We have a software name and a release number, separated with a # -, _ or dot. (?P<software_name1>.+?[-_.]) (?P<release_number>(%(vkeywords)s|[0-9][0-9a-zA-Z_.+:~-]*?)+) | # We couldn't match a release number, put everything in the # software name. (?P<software_name2>.+?) ) (?P<extension>(?:\.(?:%(extensions)s))+) $ '''.format( extensions='|'.join(EXTENSIONS), vkeywords='|'.join('%s[-]?' % k for k in VERSION_KEYWORDS) ) pprint(PATTERN2) gives: '(?P<extension>(?:\\.(?:zip|tar|gz|tgz|bz2|bzip2|lzma|lz|xz|Z|7z))+)\n' '$\n') ('pattern2: \n' '^\n' '(?:\n' ' # We have a software name and a release number, separated with a\n' ' # -, _ or dot.\n' ' (?P<software_name1>.+?[-_.])\n' ' (?P<release_number>(%(vkeywords)s|[0-9][0-9a-zA-Z_.+:~-]*?)+)\n' '|\n' " # We couldn't match a release number, put everything in the\n" ' # software name.\n' ' (?P<software_name2>.+?)\n' ')\n' '(?P<extension>(?:\\.(?:%(extensions)s))+)\n' '$\n') it seems it does not work as expected... ardumont: pattern1 is the current implem' (i renamed the global variable to use upper case in their name)… | |||||
Not Done Inline Actionsbecause you need to use "{extensions}" in the string to be formatted douardda: because you need to use "{extensions}" in the string to be formatted | |||||
Done Inline Actionsright, thx! ardumont: right, thx! | |||||
Not Done Inline Actionsshouldn't the final + be non-greedy as well? (Don't forget to add a test for this, if relevant) I'm guessing you'll need it for .7z. vlorentz: shouldn't the final `+` be non-greedy as well? (Don't forget to add a test for this, if… | |||||
Done Inline ActionsI added a test with a .7z and in current form, that's enough. ardumont: I added a test with a .7z and in current form, that's enough. | |||||
Not Done Inline ActionsYou forgot .7z, eg. https://ftp.gnu.org/gnu/octave/windows/ vlorentz: You forgot `.7z`, eg. https://ftp.gnu.org/gnu/octave/windows/ | |||||
Done Inline Actionsthx! ardumont: thx! |