Changeset View
Changeset View
Standalone View
Standalone View
swh/loader/package/utils.py
# Copyright (C) 2019 The Software Heritage developers | # Copyright (C) 2019 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import copy | import copy | ||||
import logging | import logging | ||||
import os | import os | ||||
import requests | import requests | ||||
import re | |||||
from typing import Dict, Optional, Tuple | from typing import Any, Dict, List, Mapping, Optional, Sequence, Tuple | ||||
from swh.model.hashutil import MultiHash, HASH_BLOCK_SIZE | from swh.model.hashutil import MultiHash, HASH_BLOCK_SIZE | ||||
from swh.loader.package import DEFAULT_PARAMS | from swh.loader.package import DEFAULT_PARAMS | ||||
logger = logging.getLogger(__name__) | logger = logging.getLogger(__name__) | ||||
# Pattern for npm-style author strings, taken from
# https://github.com/jonschlinkert/author-regex
# It exposes three optional capture groups, consumed by parse_author as:
#   group 1: the name (text before any '<' or '('),
#   group 2: the email (text enclosed in '<...>'),
#   group 3: the url (text enclosed in '(...)').
_author_regexp = r'([^<(]+?)?[ \t]*(?:<([^>(]+?)>)?[ \t]*(?:\(([^)]+?)\)|$)'


# Author record used by swh_author when the input carries no usable name.
_EMPTY_AUTHOR = {'fullname': b'', 'name': None, 'email': None}
def api_info(url: str) -> Dict: | def api_info(url: str) -> Dict: | ||||
"""Basic api client to retrieve information on project. This deals with | """Basic api client to retrieve information on project. This deals with | ||||
fetching json metadata about pypi projects. | fetching json metadata about pypi projects. | ||||
Args: | Args: | ||||
url (str): The api url (e.g PyPI, npm, etc...) | url (str): The api url (e.g PyPI, npm, etc...) | ||||
Raises: | Raises: | ||||
▲ Show 20 Lines • Show All 84 Lines • ▼ Show 20 Lines | def download(url: str, dest: str, hashes: Dict = {}, | ||||
return filepath, extrinsic_metadata | return filepath, extrinsic_metadata | ||||
def release_name(version: str, filename: Optional[str] = None) -> str:
    """Build the release reference for a package version.

    Args:
        version: version identifier of the release
        filename: optional artifact filename; when given (and non-empty) it
            is appended as an extra path component

    Returns:
        ``releases/<version>`` or ``releases/<version>/<filename>``

    """
    if not filename:
        return 'releases/%s' % version
    return 'releases/%s/%s' % (version, filename)
def parse_author(author_str: str) -> Dict[str, str]:
    """Split an npm package author string into its components.

    Empty ``<>`` and ``()`` placeholders are dropped before matching. The
    following layouts are recognized::

        name
        name <email> (url)
        name <email>(url)
        name<email> (url)
        name<email>(url)
        name (url) <email>
        name (url)<email>
        name(url) <email>
        name(url)<email>
        name (url)
        name(url)
        name <email>
        name<email>
        <email> (url)
        <email>(url)
        (url) <email>
        (url)<email>
        <email>
        (url)

    Args:
        author_str: input author string

    Returns:
        A dict holding only the components that were found (non-blank,
        surrounding whitespace stripped), among the keys:

        * name
        * email
        * url

    """
    cleaned = author_str.replace('<>', '').replace('()', '')
    fields = ('name', 'email', 'url')

    parsed = {}
    # The pattern can match several times on one string (e.g. the url first,
    # then the email); a later non-blank value overrides an earlier one.
    for groups in re.findall(_author_regexp, cleaned, re.M):
        for field, raw in zip(fields, groups):
            value = raw.strip()
            if value:
                parsed[field] = value
    return parsed
def swh_author(author: Dict[str, str]) -> Dict[str, Optional[bytes]]:
    """Transform an author-like dict into the swh author dict (values are
    utf-8 encoded bytes).

    Args:
        author: dict that may hold the 'name' and 'email' keys

    Returns:
        A dict with the keys 'fullname', 'name' and 'email'. When the input
        has no (non-empty) name, a copy of the empty author record is
        returned, whatever the email. Otherwise 'fullname' is
        ``name <email>`` when an email is present, else just the name, and
        'email' is None when absent or empty.

    """
    name = author.get('name')
    email = author.get('email')

    if not name:
        # Return a fresh dict: handing out the module-level constant itself
        # would let any caller mutating the result corrupt every later
        # "empty author" value.
        return dict(_EMPTY_AUTHOR)

    fullname = '%s <%s>' % (name, email) if email else name
    return {
        'fullname': fullname.encode('utf-8'),
        'name': name.encode('utf-8'),
        'email': email.encode('utf-8') if email else None,
    }
def artifact_identity(d: Mapping[str, Any],
                      id_keys: Sequence[str]) -> List[Any]:
    """Build the composite primary key of a dict entry.

    Args:
        d: the entry whose identity is computed
        id_keys: keys whose values, taken in order, make up the identity

    Returns:
        The list of values of ``d`` for each key of ``id_keys``; a key
        missing from ``d`` contributes None.

    """
    return list(map(d.get, id_keys))