Changeset View
Changeset View
Standalone View
Standalone View
swh/loader/package/npm/loader.py
# Copyright (C) 2019 The Software Heritage developers | # Copyright (C) 2019 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import json | import json | ||||
import logging | import logging | ||||
import os | import os | ||||
import re | |||||
from codecs import BOM_UTF8 | from codecs import BOM_UTF8 | ||||
from typing import Any, Dict, Generator, Mapping, Sequence, Tuple, Optional | from typing import Any, Dict, Generator, Mapping, Sequence, Tuple, Optional | ||||
import chardet | import chardet | ||||
import iso8601 | import iso8601 | ||||
from urllib.parse import quote | from urllib.parse import quote | ||||
from swh.model.identifiers import normalize_timestamp | from swh.model.identifiers import normalize_timestamp | ||||
from swh.loader.package.loader import PackageLoader | from swh.loader.package.loader import PackageLoader | ||||
from swh.loader.package.utils import api_info, release_name | from swh.loader.package.utils import ( | ||||
api_info, release_name, parse_author, swh_author | |||||
) | |||||
logger = logging.getLogger(__name__)

# Author returned in swh format when neither a name nor an email could be
# extracted from a package.json file.
_EMPTY_AUTHOR = {'fullname': b'', 'name': None, 'email': None}

# https://github.com/jonschlinkert/author-regex
# Three capture groups, read by parse_npm_package_author as, in order:
# the name (text before any '<' or '('), the email (text between '<' and '>')
# and the url (text between '(' and ')').
_author_regexp = r'([^<(]+?)?[ \t]*(?:<([^>(]+?)>)?[ \t]*(?:\(([^)]+?)\)|$)'
class NpmLoader(PackageLoader): | class NpmLoader(PackageLoader): | ||||
"""Load npm origin's artifact releases into swh archive. | """Load npm origin's artifact releases into swh archive. | ||||
""" | """ | ||||
visit_type = 'npm' | visit_type = 'npm' | ||||
def __init__(self, url: str): | def __init__(self, url: str): | ||||
"""Constructor | """Constructor | ||||
▲ Show 20 Lines • Show All 117 Lines • ▼ Show 20 Lines | for rev_id, known_artifact in known_artifacts.items(): | ||||
else: | else: | ||||
assert isinstance(known_original_artifact, list) | assert isinstance(known_original_artifact, list) | ||||
original_hash = known_original_artifact[0]['checksums']['sha1'] | original_hash = known_original_artifact[0]['checksums']['sha1'] | ||||
if shasum == original_hash: | if shasum == original_hash: | ||||
return rev_id | return rev_id | ||||
return None | return None | ||||
def parse_npm_package_author(author_str):
    """
    Split an npm package author string into its name, email and url parts.

    The following layouts are recognized::

        name
        name <email> (url)
        name <email>(url)
        name<email> (url)
        name<email>(url)
        name (url) <email>
        name (url)<email>
        name(url) <email>
        name(url)<email>
        name (url)
        name(url)
        name <email>
        name<email>
        <email> (url)
        <email>(url)
        (url) <email>
        (url)<email>
        <email>
        (url)

    Args:
        author_str (str): input author string

    Returns:
        dict: A dict holding only the parts that were found, among the
        following keys:

            * name
            * email
            * url

    """
    # Empty '<>' and '()' placeholders are removed before matching.
    cleaned = author_str.replace('<>', '').replace('()', '')

    parsed = {}
    for groups in re.findall(_author_regexp, cleaned, re.M):
        # Each match is a (name, email, url) tuple of possibly empty strings;
        # a non-blank part found in a later match overrides an earlier one.
        for key, raw in zip(('name', 'email', 'url'), groups):
            value = raw.strip()
            if value:
                parsed[key] = value
    return parsed
def extract_npm_package_author(package_json): | def extract_npm_package_author(package_json): | ||||
""" | """ | ||||
Extract package author from a ``package.json`` file content and | Extract package author from a ``package.json`` file content and | ||||
return it in swh format. | return it in swh format. | ||||
Args: | Args: | ||||
package_json (dict): Dict holding the content of parsed | package_json (dict): Dict holding the content of parsed | ||||
``package.json`` file | ``package.json`` file | ||||
Show All 18 Lines | def _author_str(author_data): | ||||
return _author_str(author_data[0]) if len(author_data) > 0 else '' | return _author_str(author_data[0]) if len(author_data) > 0 else '' | ||||
else: | else: | ||||
return author_data | return author_data | ||||
author_data = {} | author_data = {} | ||||
for author_key in ('author', 'authors'): | for author_key in ('author', 'authors'): | ||||
if author_key in package_json: | if author_key in package_json: | ||||
author_str = _author_str(package_json[author_key]) | author_str = _author_str(package_json[author_key]) | ||||
author_data = parse_npm_package_author(author_str) | author_data = parse_author(author_str) | ||||
name = author_data.get('name') | |||||
email = author_data.get('email') | |||||
fullname = None | |||||
if name and email: | |||||
fullname = '%s <%s>' % (name, email) | |||||
elif name: | |||||
fullname = name | |||||
if not fullname: | |||||
return _EMPTY_AUTHOR | |||||
if fullname: | |||||
fullname = fullname.encode('utf-8') | |||||
if name: | |||||
name = name.encode('utf-8') | |||||
if email: | |||||
email = email.encode('utf-8') | |||||
return {'fullname': fullname, 'name': name, 'email': email} | return swh_author(author_data) | ||||
def _lstrip_bom(s, bom=BOM_UTF8):
    """Return ``s`` without its leading ``bom``, or ``s`` as-is if absent.

    Only a single occurrence of ``bom`` at the very start is removed.

    Args:
        s: value to strip; must support ``startswith`` and slicing
        bom: prefix to remove, defaults to the UTF-8 byte order mark

    Returns:
        ``s`` with the leading ``bom`` dropped when present, ``s`` otherwise

    """
    return s[len(bom):] if s.startswith(bom) else s
▲ Show 20 Lines • Show All 63 Lines • Show Last 20 Lines |