Changeset View
Standalone View
swh/lister/gnu/tree.py
- This file was added.
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import gzip | |||||
import json | |||||
import logging | |||||
import requests | |||||
from pathlib import Path | |||||
from typing import Dict, Tuple, List | |||||
from urllib.parse import urlparse | |||||
logger = logging.getLogger(__name__) | |||||
douardda: I'd rather call it GnuTree (since the json is only a data serialization detail here). What this… | |||||
def load_raw_data(url: str) -> List[Dict]:
    """Load the raw json list out of a tree.json.gz file.

    Args:
        url: tree.json.gz URL (http/https) or local filesystem path

    Returns:
        The raw json list

    Raises:
        ValueError: if the remote server does not answer the query
            successfully

    """
    # A tuple argument lets startswith check both schemes in a single call
    if url.startswith(('http://', 'https://')):
        response = requests.get(url, allow_redirects=True)
        if not response.ok:
            raise ValueError('Error during query to %s' % url)
        raw = gzip.decompress(response.content)
    else:
        with gzip.open(url, 'r') as f:
            raw = f.read()
    raw_data = json.loads(raw.decode('utf-8'))
    return raw_data
class GNUTree:
    """In-memory representation of the GNU ftp tree listing (tree.json.gz).

    Lazily computes, on first property access, the projects hosted under
    the interesting top-level directories together with their artifacts.
    """
    def __init__(self, url: str):
        # Location of the tree.json.gz, either a local path or an http(s) uri
        self.url = url
        parsed = urlparse(url)
        self.base_url = '%s://%s' % (parsed.scheme, parsed.netloc)
        # Interesting top level directories
        self.top_level_directories = ['gnu', 'old-gnu']
        # Internal caches, populated by _load() on first property access
        self._artifacts = {}  # type: Dict
        self._projects = {}  # type: Dict

    @property
    def projects(self) -> Dict:
        """Dict mapping each project url to its description."""
        if not self._projects:
            self._projects, self._artifacts = self._load()
        return self._projects

    @property
    def artifacts(self) -> Dict:
        """Dict mapping each project url to its list of artifacts."""
        if not self._artifacts:
            self._projects, self._artifacts = self._load()
        return self._artifacts

    def _load(self) -> Tuple[Dict, Dict]:
        """Compute projects and artifacts per project.

        Returns:
            Tuple of dict projects (key project url, value the associated
            information) and a dict artifacts (key project url, value the
            info_file list)

        """
        projects = {}
        artifacts = {}

        tree = load_raw_data(self.url)[0]
        for top_dir in tree['contents']:
            # Skip anything outside the interesting top-level directories
            if top_dir['name'] not in self.top_level_directories:
                continue
            for entry in top_dir['contents']:
                if entry['type'] != 'directory':
                    continue
                project_url = '%s/%s/%s/' % (
                    self.base_url, top_dir['name'], entry['name'])
                project_artifacts = find_artifacts(
                    entry['contents'], project_url)
                # Only keep projects that actually ship artifacts
                if not project_artifacts:
                    continue
                artifacts[project_url] = project_artifacts
                projects[project_url] = {
                    'name': entry['name'],
                    'url': project_url,
                    'time_modified': entry['time'],
                }

        return projects, artifacts
Not Done Inline Actionsthe filesystem argument should be better document (at least show an example) vlorentz: the `filesystem` argument should be better document (at least show an example) | |||||
def find_artifacts(filesystem: List[Dict], url: str) -> List[Dict]:
    """Recursively list artifacts present in the folder and subfolders for a
    particular package url.

    Args:
        filesystem: File structure of the package root directory. This is
            a list of Dict representing either file or directory
            information as dict (keys: name, size, time, type).
        url: URL of the corresponding package

    Returns:
        List of tarball urls and their associated metadata (time, length),
        e.g.:

        .. code-block:: python

            [
                {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.3.tar.gz',
                 'time': 1071002600,
                 'length': 543},
                ...
            ]

    """
    found = []
    for entry in filesystem:
        name = entry['name']
        if entry['type'] == 'directory':
            # Recurse into sub-folders, extending the base url accordingly
            found.extend(find_artifacts(entry['contents'], url + name + '/'))
        elif entry['type'] == 'file' and check_filename_is_archive(name):
            found.append({
                'archive': url + name,
                'time': int(entry['time']),
                'length': int(entry['size']),
            })
    return found
Not Done Inline Actionsaka bool vlorentz: aka `bool` | |||||
def check_filename_is_archive(filename: str) -> bool:
    """
    Check for the extension of the file, if the file is of zip format of
    .tar.x format, where x could be anything, then returns true.

    Args:
        filename: name of the file for which the extensions is needs to
            be checked.

    Returns:
        Whether filename is an archive or not

    Example:

    >>> check_filename_is_archive('abc.zip')
    True
    >>> check_filename_is_archive('abc.tar.gz')
    True
    >>> check_filename_is_archive('bac.tar')
    True
    >>> check_filename_is_archive('abc.tar.gz.sig')
    False
    >>> check_filename_is_archive('foobar.tar.')
    False

    """
    file_suffixes = Path(filename).suffixes
    # Pass the arguments lazily so the message is only formatted when the
    # debug level is actually enabled
    logger.debug('Path(%s).suffixes: %s', filename, file_suffixes)
    if len(file_suffixes) == 1 and file_suffixes[-1] in ('.zip', '.tar'):
        return True
    elif len(file_suffixes) > 1:
        if file_suffixes[-1] == '.zip' or file_suffixes[-2] == '.tar':
            return True
    return False
I'd rather call it GnuTree (since the json is only a data serialization detail here). What this class models is the Gnu tree structure.