Changeset View
Standalone View
swh/lister/gnu/tree.py
- This file was added.
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import gzip | |||||
import json | |||||
import logging | |||||
import requests | |||||
from pathlib import Path | |||||
from typing import Dict, Tuple, List | |||||
from urllib.parse import urlparse | |||||
logger = logging.getLogger(__name__) | |||||
douardda: I'd rather call it GnuTree (since the json is only a data serialization detail here). What this… | |||||
def load_raw_data(url: str) -> List[Dict]:
    """Load the raw json list out of a tree.json.gz file.

    Args:
        url: tree.json.gz URL (http/https) or local filesystem path

    Returns:
        The raw json list

    Raises:
        ValueError: if the remote server does not answer the query
            successfully

    """
    # A tuple argument lets startswith check both schemes in a single call
    if url.startswith(('http://', 'https://')):
        response = requests.get(url, allow_redirects=True)
        if not response.ok:
            raise ValueError('Error during query to %s' % url)
        raw = gzip.decompress(response.content)
    else:
        with gzip.open(url, 'r') as f:
            raw = f.read()
    raw_data = json.loads(raw.decode('utf-8'))
    return raw_data
class GNUTree:
    """In-memory representation of the GNU ftp tree listing (tree.json.gz).

    Lazily computes, on first property access, the projects hosted under
    the interesting top-level directories together with their artifacts.
    """
    def __init__(self, url: str):
        # Location of the tree.json.gz, either a local path or an http(s) uri
        self.url = url
        parsed = urlparse(url)
        self.base_url = '%s://%s' % (parsed.scheme, parsed.netloc)
        # Interesting top level directories
        self.top_level_directories = ['gnu', 'old-gnu']
        # Internal caches, populated by _load() on first property access
        self._artifacts = {}  # type: Dict
        self._projects = {}  # type: Dict

    @property
    def projects(self) -> Dict:
        """Dict mapping each project url to its description."""
        if not self._projects:
            self._projects, self._artifacts = self._load()
        return self._projects

    @property
    def artifacts(self) -> Dict:
        """Dict mapping each project url to its list of artifacts."""
        if not self._artifacts:
            self._projects, self._artifacts = self._load()
        return self._artifacts

    def _load(self) -> Tuple[Dict, Dict]:
        """Compute projects and artifacts per project.

        Returns:
            Tuple of dict projects (key project url, value the associated
            information) and a dict artifacts (key project url, value the
            info_file list)

        """
        projects = {}
        artifacts = {}

        tree = load_raw_data(self.url)[0]
        for top_dir in tree['contents']:
            # Skip anything outside the interesting top-level directories
            if top_dir['name'] not in self.top_level_directories:
                continue
            for entry in top_dir['contents']:
                if entry['type'] != 'directory':
                    continue
                project_url = '%s/%s/%s/' % (
                    self.base_url, top_dir['name'], entry['name'])
                project_artifacts = find_artifacts(
                    entry['contents'], project_url)
                # Only keep projects that actually ship artifacts
                if not project_artifacts:
                    continue
                artifacts[project_url] = project_artifacts
                projects[project_url] = {
                    'name': entry['name'],
                    'url': project_url,
                    'time_modified': entry['time'],
                }

        return projects, artifacts
Not Done Inline Actionsthe filesystem argument should be better document (at least show an example) vlorentz: the `filesystem` argument should be better document (at least show an example) | |||||
def find_artifacts(filesystem: List[Dict], url: str) -> List[Dict]:
    """Recursively list artifacts present in the folder and subfolders for a
    particular package url.

    Args:
        filesystem: File structure of the package root directory. This is
            a list of Dict representing either file or directory
            information as dict (keys: name, size, time, type).
        url: URL of the corresponding package

    Returns:
        List of tarball urls and their associated metadata (time, length),
        e.g.:

        .. code-block:: python

            [
                {'archive': 'https://ftp.gnu.org/gnu/3dldf/3DLDF-1.1.3.tar.gz',
                 'time': 1071002600,
                 'length': 543},
                ...
            ]

    """
    found = []
    for entry in filesystem:
        name = entry['name']
        if entry['type'] == 'directory':
            # Recurse into sub-folders, extending the base url accordingly
            found.extend(find_artifacts(entry['contents'], url + name + '/'))
        elif entry['type'] == 'file' and check_filename_is_archive(name):
            found.append({
                'archive': url + name,
                'time': int(entry['time']),
                'length': int(entry['size']),
            })
    return found
Not Done Inline Actionsaka bool vlorentz: aka `bool` | |||||
def check_filename_is_archive(filename: str) -> bool:
    """
    Check for the extension of the file, if the file is of zip format of
    .tar.x format, where x could be anything, then returns true.

    Args:
        filename: name of the file for which the extensions is needs to
            be checked.

    Returns:
        Whether filename is an archive or not

    Example:

    >>> check_filename_is_archive('abc.zip')
    True
    >>> check_filename_is_archive('abc.tar.gz')
    True
    >>> check_filename_is_archive('bac.tar')
    True
    >>> check_filename_is_archive('abc.tar.gz.sig')
    False
    >>> check_filename_is_archive('foobar.tar.')
    False

    """
    file_suffixes = Path(filename).suffixes
    # Pass the arguments lazily so the message is only formatted when the
    # debug level is actually enabled
    logger.debug('Path(%s).suffixes: %s', filename, file_suffixes)
    if len(file_suffixes) == 1 and file_suffixes[-1] in ('.zip', '.tar'):
        return True
    elif len(file_suffixes) > 1:
        if file_suffixes[-1] == '.zip' or file_suffixes[-2] == '.tar':
            return True
    return False
I'd rather call it GnuTree (since the json is only a data serialization detail here). What this class models is the Gnu tree structure.