diff --git a/swh/loader/package/npm.py b/swh/loader/package/npm.py new file mode 100644 index 0000000..252f9a3 --- /dev/null +++ b/swh/loader/package/npm.py @@ -0,0 +1,362 @@ +# Copyright (C) 2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import json +import logging +import os +import re + +from codecs import BOM_UTF8 +from typing import Generator, Dict, Tuple, Sequence, List + +import chardet +import iso8601 +import requests +import tempfile + +from swh.model.identifiers import normalize_timestamp +from swh.loader.package.loader import PackageLoader +from swh.loader.package.utils import download + + +logger = logging.getLogger(__name__) + + +class NpmClient: + """ + Helper class internally used by the npm loader to fetch + metadata for a specific package hosted on the npm registry. + + Args: + temp_dir (str): Path to the temporary disk location used + to uncompress the package tarballs + + """ + def __init__(self, log=None): + self.root_temp_dir = tempfile.mkdtemp() + self.session = requests.session() + self.params = { + 'headers': { + 'User-Agent': 'Software Heritage npm loader' + } + } + self.log = log or logging + + def fetch_package_metadata(self, package_metadata_url) -> None: + """ + Fetch metadata for a given package and make it the focused one. + This must be called prior any other operations performed + by the other methods below. 
+ + Args: + package_metadata_url: the package metadata url provided + by the npm loader + """ + self.package_metadata_url = package_metadata_url + self.package_metadata = self.session.get( + self.package_metadata_url).json() + self.package = self.package_metadata['name'] + self.temp_dir = os.path.join(self.root_temp_dir, self.package) + return self.package_metadata + + def package_versions(self, known_versions=None) -> List[Dict]: + """ + Return the available versions for the focused package. + + Args: + known_versions (dict): may be provided by the loader, it enables + to filter out versions already ingested in the archive. + + Returns: + dict: A dict whose keys are Tuple[version, tarball_sha1] and + values dicts with the following entries: + + * **name**: the package name + * **version**: the package version + * **filename**: the package source tarball filename + * **sha1**: the package source tarball sha1 checksum + * **date**: the package release date + * **url**: the package source tarball download url + """ + versions = {} + if 'versions' in self.package_metadata: + for version, data in self.package_metadata['versions'].items(): + sha1 = data['dist']['shasum'] + key = (version, sha1) + if known_versions and key in known_versions: + continue + tarball_url = data['dist']['tarball'] + filename = os.path.basename(tarball_url) + date = self.package_metadata['time'][version] + versions[key] = { + 'name': self.package, + 'version': version, + 'filename': filename, + 'sha1': sha1, + 'date': date, + 'url': tarball_url + } + return versions + + +_EMPTY_AUTHOR = {'fullname': b'', 'name': None, 'email': None} + +# https://github.com/jonschlinkert/author-regex +_author_regexp = r'([^<(]+?)?[ \t]*(?:<([^>(]+?)>)?[ \t]*(?:\(([^)]+?)\)|$)' + + +def parse_npm_package_author(author_str): + """ + Parse npm package author string. 
+ + It works with a flexible range of formats, as detailed below:: + + name + name (url) + name (url) + name (url) + name(url) + name (url) + name (url) + name(url) + name(url) + name (url) + name(url) + name + name + (url) + (url) + (url) + (url) + + (url) + + Args: + author_str (str): input author string + + Returns: + dict: A dict that may contain the following keys: + * name + * email + * url + + """ + author = {} + matches = re.findall(_author_regexp, + author_str.replace('<>', '').replace('()', ''), + re.M) + for match in matches: + if match[0].strip(): + author['name'] = match[0].strip() + if match[1].strip(): + author['email'] = match[1].strip() + if match[2].strip(): + author['url'] = match[2].strip() + return author + + +def extract_npm_package_author(package_json): + """ + Extract package author from a ``package.json`` file content and + return it in swh format. + + Args: + package_json (dict): Dict holding the content of parsed + ``package.json`` file + + Returns: + dict: A dict with the following keys: + * fullname + * name + * email + + """ + + def _author_str(author_data): + if type(author_data) is dict: + author_str = '' + if 'name' in author_data: + author_str += author_data['name'] + if 'email' in author_data: + author_str += ' <%s>' % author_data['email'] + return author_str + elif type(author_data) is list: + return _author_str(author_data[0]) if len(author_data) > 0 else '' + else: + return author_data + + author_data = {} + for author_key in ('author', 'authors'): + if author_key in package_json: + author_str = _author_str(package_json[author_key]) + author_data = parse_npm_package_author(author_str) + + name = author_data.get('name') + email = author_data.get('email') + + fullname = None + + if name and email: + fullname = '%s <%s>' % (name, email) + elif name: + fullname = name + + if not fullname: + return _EMPTY_AUTHOR + + if fullname: + fullname = fullname.encode('utf-8') + + if name: + name = name.encode('utf-8') + + if email: + email = 
email.encode('utf-8') + + return {'fullname': fullname, 'name': name, 'email': email} + + +def _lstrip_bom(s, bom=BOM_UTF8): + if s.startswith(bom): + return s[len(bom):] + else: + return s + + +def load_json(json_bytes): + """ + Try to load JSON from bytes and return a dictionary. + + First try to decode from utf-8. If the decoding failed, + try to detect the encoding and decode again with replace + error handling. + + If JSON is malformed, an empty dictionary will be returned. + + Args: + json_bytes (bytes): binary content of a JSON file + + Returns: + dict: JSON data loaded in a dictionary + """ + json_data = {} + try: + json_str = _lstrip_bom(json_bytes).decode('utf-8') + except UnicodeDecodeError: + encoding = chardet.detect(json_bytes)['encoding'] + if encoding: + json_str = json_bytes.decode(encoding, 'replace') + try: + json_data = json.loads(json_str) + except json.decoder.JSONDecodeError: + pass + return json_data + + +def extract_intrinsic_metadata(dir_path: str) -> Dict: + """Given an uncompressed path holding the pkginfo file, returns a + pkginfo parsed structure as a dict. + + The release artifact contains at their root one folder. For example: + $ tar tvf zprint-0.0.6.tar.gz + drwxr-xr-x root/root 0 2018-08-22 11:01 zprint-0.0.6/ + ... + + Args: + + dir_path (str): Path to the uncompressed directory + representing a release artifact from pypi. + + Returns: + the pkginfo parsed structure as a dict if any or None if + none was present. 
+ + """ + # Retrieve the root folder of the archive + if not os.path.exists(dir_path): + return {} + lst = os.listdir(dir_path) + if len(lst) == 0: + return {} + project_dirname = lst[0] + package_json_path = os.path.join(dir_path, project_dirname, 'package.json') + if not os.path.exists(package_json_path): + return {} + with open(package_json_path, 'rb') as package_json_file: + package_json_bytes = package_json_file.read() + return load_json(package_json_bytes) + + +class NpmLoader(PackageLoader): + visit_type = 'npm' + + def __init__(self, package_name, package_url, package_metadata_url): + super().__init__(url=package_url) + self.package_metadata_url = package_metadata_url + + self._info = None + self._versions = None + self.client = NpmClient() + + # if package_url is None: + # package_url = 'https://www.npmjs.com/package/%s' % package_name + # if package_metadata_url is None: + # package_metadata_url = 'https://replicate.npmjs.com/%s/' %\ + # quote(package_name, safe='') + + @property + def info(self) -> Dict: + """Return the project metadata information (fetched from pypi registry) + + """ + if not self._info: + # This initializes the metadata retrieval on npm api + self._info = self.client.fetch_package_metadata( + self.package_metadata_url) + return self._info + + # def get_versions(self) -> Sequence[Tuple(str, str)]: + def get_versions(self) -> Sequence[str]: + return sorted(self.info['versions'].keys()) + + def get_default_release(self) -> str: + return self.info['dist-tags'].get('latest', '') + + def get_artifacts(self, version: str) -> Generator[ + Tuple[str, str, Dict], None, None]: + meta = self.info['versions'][version] + url = meta['dist']['tarball'] + filename = os.path.basename(url) + yield filename, url, meta + + def fetch_artifact_archive( + self, artifact_uri: str, dest: str) -> Tuple[str, Dict]: + return download(artifact_uri, dest=dest) + + def build_revision( + self, a_metadata: Dict, a_uncompressed_path: str) -> Dict: + # Parse metadata 
(project, artifact metadata) + i_metadata = extract_intrinsic_metadata(a_uncompressed_path) + + # from intrinsic metadata + author = extract_npm_package_author(i_metadata) + # extrinsic metadata + version = i_metadata['version'] + date = self.info['time'][version] + date = iso8601.parse_date(date) + date = normalize_timestamp(int(date.timestamp())) + message = version.encode('ascii') + + return { + 'author': author, + 'date': date, + 'committer': author, + 'committer_date': date, + 'message': message, + 'metadata': { + 'intrinsic_metadata': i_metadata, + }, + 'parents': [], + } diff --git a/swh/loader/package/tests/common.py b/swh/loader/package/tests/common.py index ee334dd..e8e2292 100644 --- a/swh/loader/package/tests/common.py +++ b/swh/loader/package/tests/common.py @@ -1,80 +1,83 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from os import path from urllib.parse import urlparse from swh.model.hashutil import hash_to_bytes, hash_to_hex DATADIR = path.join(path.abspath(path.dirname(__file__)), 'resources') def get_response_cb(request, context): """Mount point callback to fetch on disk the content of a request Args: request (requests.Request): Object requests context (requests.Context): Object holding requests metadata information (headers, etc...) 
Returns: File descriptor on the on disk file to read from the test context """ url = urlparse(request.url) dirname = url.hostname # pypi.org | files.pythonhosted.org # url.path: pypi//json -> local file: pypi__json - filename = url.path[1:].replace('/', '_') + filename = url.path[1:] + if filename.endswith('/'): + filename = filename[:-1] + filename = filename.replace('/', '_') filepath = path.join(DATADIR, dirname, filename) fd = open(filepath, 'rb') context.headers['content-length'] = str(path.getsize(filepath)) return fd def decode_target(target): if not target: return target target_type = target['target_type'] if target_type == 'alias': decoded_target = target['target'].decode('utf-8') else: decoded_target = hash_to_hex(target['target']) return { 'target': decoded_target, 'target_type': target_type } def check_snapshot(expected_snapshot, expected_branches, storage): """Check for snapshot match. Provide the hashes as hexadecimal, the conversion is done within the method. Args: expected_snapshot (Union[str, dict]): Either the snapshot identifier or the full snapshot expected_branches ([dict]): expected branches or nothing is the full snapshot is provided """ if isinstance(expected_snapshot, dict) and not expected_branches: expected_snapshot_id = expected_snapshot['id'] expected_branches = expected_snapshot['branches'] else: expected_snapshot_id = expected_snapshot snap = storage.snapshot_get(hash_to_bytes(expected_snapshot_id)) assert snap is not None branches = { branch.decode('utf-8'): decode_target(target) for branch, target in snap['branches'].items() } assert expected_branches == branches diff --git a/swh/loader/package/tests/resources/registry.npmjs.org/org_-_org-0.0.2.tgz b/swh/loader/package/tests/resources/registry.npmjs.org/org_-_org-0.0.2.tgz new file mode 100644 index 0000000..b726261 Binary files /dev/null and b/swh/loader/package/tests/resources/registry.npmjs.org/org_-_org-0.0.2.tgz differ diff --git 
a/swh/loader/package/tests/resources/registry.npmjs.org/org_-_org-0.0.3.tgz b/swh/loader/package/tests/resources/registry.npmjs.org/org_-_org-0.0.3.tgz new file mode 100644 index 0000000..bc20daa Binary files /dev/null and b/swh/loader/package/tests/resources/registry.npmjs.org/org_-_org-0.0.3.tgz differ diff --git a/swh/loader/package/tests/resources/registry.npmjs.org/org_-_org-0.0.4.tgz b/swh/loader/package/tests/resources/registry.npmjs.org/org_-_org-0.0.4.tgz new file mode 100644 index 0000000..a431eeb Binary files /dev/null and b/swh/loader/package/tests/resources/registry.npmjs.org/org_-_org-0.0.4.tgz differ diff --git a/swh/loader/package/tests/resources/replicate.npmjs.com/org b/swh/loader/package/tests/resources/replicate.npmjs.com/org new file mode 100644 index 0000000..3aba6b1 --- /dev/null +++ b/swh/loader/package/tests/resources/replicate.npmjs.com/org @@ -0,0 +1,191 @@ +{ + "_id": "org", + "_rev": "4-22484cc537f12d3023241211ee34e39d", + "name": "org", + "description": "A parser and converter for org-mode notation", + "dist-tags": { + "latest": "0.0.4" + }, + "versions": { + "0.0.2": { + "name": "org", + "description": "A parser and converter for org-mode notation", + "homepage": "http://mooz.github.com/org-js", + "keywords": [ + "org-mode", + "emacs", + "parser" + ], + "author": { + "name": "mooz", + "email": "stillpedant@gmail.com" + }, + "main": "./lib/org.js", + "version": "0.0.2", + "directories": { + "test": "./tests" + }, + "repository": { + "type": "git", + "url": "git://github.com/mooz/org-js.git" + }, + "bugs": { + "url": "https://github.com/mooz/org-js/issues" + }, + "_id": "org@0.0.2", + "dist": { + "shasum": "12c58092e7de94456a43ef7823eef45e4d1d12fe", + "tarball": "https://registry.npmjs.org/org/-/org-0.0.2.tgz" + }, + "_from": ".", + "_npmVersion": "1.2.25", + "_npmUser": { + "name": "mooz", + "email": "stillpedant@gmail.com" + }, + "maintainers": [ + { + "name": "mooz", + "email": "stillpedant@gmail.com" + } + ] + }, + "0.0.3": { + 
"name": "org", + "description": "A parser and converter for org-mode notation", + "homepage": "http://mooz.github.com/org-js", + "bugs": { + "url": "http://github.com/mooz/org-s/issues" + }, + "keywords": [ + "org-mode", + "emacs", + "parser" + ], + "author": { + "name": "Masafumi Oyamada", + "email": "stillpedant@gmail.com", + "url": "http://mooz.github.io/" + }, + "licenses": [ + { + "type": "MIT" + } + ], + "main": "./lib/org.js", + "version": "0.0.3", + "directories": { + "test": "./tests" + }, + "repository": { + "type": "git", + "url": "git://github.com/mooz/org-js.git" + }, + "_id": "org@0.0.3", + "dist": { + "shasum": "6a44220f88903a6dfc3b47d010238058f9faf3a0", + "tarball": "https://registry.npmjs.org/org/-/org-0.0.3.tgz" + }, + "_from": ".", + "_npmVersion": "1.2.25", + "_npmUser": { + "name": "mooz", + "email": "stillpedant@gmail.com" + }, + "maintainers": [ + { + "name": "mooz", + "email": "stillpedant@gmail.com" + } + ] + }, + "0.0.4": { + "name": "org", + "description": "A parser and converter for org-mode notation", + "homepage": "http://mooz.github.com/org-js", + "bugs": { + "url": "http://github.com/mooz/org-s/issues" + }, + "keywords": [ + "org-mode", + "emacs", + "parser" + ], + "author": { + "name": "Masafumi Oyamada", + "email": "stillpedant@gmail.com", + "url": "http://mooz.github.io/" + }, + "licenses": [ + { + "type": "MIT" + } + ], + "main": "./lib/org.js", + "version": "0.0.4", + "directories": { + "test": "./tests" + }, + "repository": { + "type": "git", + "url": "git://github.com/mooz/org-js.git" + }, + "_id": "org@0.0.4", + "dist": { + "shasum": "788b3be1a50f7c94c1500ae4d922ec76c04e06ea", + "tarball": "https://registry.npmjs.org/org/-/org-0.0.4.tgz" + }, + "_from": ".", + "_npmVersion": "1.2.25", + "_npmUser": { + "name": "mooz", + "email": "stillpedant@gmail.com" + }, + "maintainers": [ + { + "name": "mooz", + "email": "stillpedant@gmail.com" + } + ] + } + }, + "readme": "org-js\n======\n\nParser and converter for org-mode () notation 
written in JavaScript.\n\nInteractive Editor\n------------------\n\nFor working example, see http://mooz.github.com/org-js/editor/.\n\nInstallation\n------------\n\n npm install org\n\nSimple example of org -> HTML conversion\n----------------------------------------\n\n```javascript\nvar org = require(\"org\");\n\nvar parser = new org.Parser();\nvar orgDocument = parser.parse(orgCode);\nvar orgHTMLDocument = orgDocument.convert(org.ConverterHTML, {\n headerOffset: 1,\n exportFromLineNumber: false,\n suppressSubScriptHandling: false,\n suppressAutoLink: false\n});\n\nconsole.dir(orgHTMLDocument); // => { title, contentHTML, tocHTML, toc }\nconsole.log(orgHTMLDocument.toString()) // => Rendered HTML\n```\n\nWriting yet another converter\n-----------------------------\n\nSee `lib/org/converter/html.js`.\n", + "maintainers": [ + { + "name": "mooz", + "email": "stillpedant@gmail.com" + } + ], + "time": { + "modified": "2019-01-05T01:37:44.220Z", + "created": "2014-01-01T15:40:31.231Z", + "0.0.2": "2014-01-01T15:40:33.020Z", + "0.0.3": "2014-01-01T15:55:45.497Z", + "0.0.4": "2014-01-02T06:10:26.485Z" + }, + "author": { + "name": "Masafumi Oyamada", + "email": "stillpedant@gmail.com", + "url": "http://mooz.github.io/" + }, + "repository": { + "type": "git", + "url": "git://github.com/mooz/org-js.git" + }, + "users": { + "nak2k": true, + "bgschaid": true, + "422665vijay": true, + "nontau": true + }, + "homepage": "http://mooz.github.com/org-js", + "keywords": [ + "org-mode", + "emacs", + "parser" + ], + "bugs": { + "url": "http://github.com/mooz/org-s/issues" + }, + "readmeFilename": "README.md" +} diff --git a/swh/loader/package/tests/resources/replicate.npmjs.com/org_metadata_visit1.json b/swh/loader/package/tests/resources/replicate.npmjs.com/org_metadata_visit1.json new file mode 100644 index 0000000..3aba6b1 --- /dev/null +++ b/swh/loader/package/tests/resources/replicate.npmjs.com/org_metadata_visit1.json @@ -0,0 +1,191 @@ +{ + "_id": "org", + "_rev": 
"4-22484cc537f12d3023241211ee34e39d", + "name": "org", + "description": "A parser and converter for org-mode notation", + "dist-tags": { + "latest": "0.0.4" + }, + "versions": { + "0.0.2": { + "name": "org", + "description": "A parser and converter for org-mode notation", + "homepage": "http://mooz.github.com/org-js", + "keywords": [ + "org-mode", + "emacs", + "parser" + ], + "author": { + "name": "mooz", + "email": "stillpedant@gmail.com" + }, + "main": "./lib/org.js", + "version": "0.0.2", + "directories": { + "test": "./tests" + }, + "repository": { + "type": "git", + "url": "git://github.com/mooz/org-js.git" + }, + "bugs": { + "url": "https://github.com/mooz/org-js/issues" + }, + "_id": "org@0.0.2", + "dist": { + "shasum": "12c58092e7de94456a43ef7823eef45e4d1d12fe", + "tarball": "https://registry.npmjs.org/org/-/org-0.0.2.tgz" + }, + "_from": ".", + "_npmVersion": "1.2.25", + "_npmUser": { + "name": "mooz", + "email": "stillpedant@gmail.com" + }, + "maintainers": [ + { + "name": "mooz", + "email": "stillpedant@gmail.com" + } + ] + }, + "0.0.3": { + "name": "org", + "description": "A parser and converter for org-mode notation", + "homepage": "http://mooz.github.com/org-js", + "bugs": { + "url": "http://github.com/mooz/org-s/issues" + }, + "keywords": [ + "org-mode", + "emacs", + "parser" + ], + "author": { + "name": "Masafumi Oyamada", + "email": "stillpedant@gmail.com", + "url": "http://mooz.github.io/" + }, + "licenses": [ + { + "type": "MIT" + } + ], + "main": "./lib/org.js", + "version": "0.0.3", + "directories": { + "test": "./tests" + }, + "repository": { + "type": "git", + "url": "git://github.com/mooz/org-js.git" + }, + "_id": "org@0.0.3", + "dist": { + "shasum": "6a44220f88903a6dfc3b47d010238058f9faf3a0", + "tarball": "https://registry.npmjs.org/org/-/org-0.0.3.tgz" + }, + "_from": ".", + "_npmVersion": "1.2.25", + "_npmUser": { + "name": "mooz", + "email": "stillpedant@gmail.com" + }, + "maintainers": [ + { + "name": "mooz", + "email": 
"stillpedant@gmail.com" + } + ] + }, + "0.0.4": { + "name": "org", + "description": "A parser and converter for org-mode notation", + "homepage": "http://mooz.github.com/org-js", + "bugs": { + "url": "http://github.com/mooz/org-s/issues" + }, + "keywords": [ + "org-mode", + "emacs", + "parser" + ], + "author": { + "name": "Masafumi Oyamada", + "email": "stillpedant@gmail.com", + "url": "http://mooz.github.io/" + }, + "licenses": [ + { + "type": "MIT" + } + ], + "main": "./lib/org.js", + "version": "0.0.4", + "directories": { + "test": "./tests" + }, + "repository": { + "type": "git", + "url": "git://github.com/mooz/org-js.git" + }, + "_id": "org@0.0.4", + "dist": { + "shasum": "788b3be1a50f7c94c1500ae4d922ec76c04e06ea", + "tarball": "https://registry.npmjs.org/org/-/org-0.0.4.tgz" + }, + "_from": ".", + "_npmVersion": "1.2.25", + "_npmUser": { + "name": "mooz", + "email": "stillpedant@gmail.com" + }, + "maintainers": [ + { + "name": "mooz", + "email": "stillpedant@gmail.com" + } + ] + } + }, + "readme": "org-js\n======\n\nParser and converter for org-mode () notation written in JavaScript.\n\nInteractive Editor\n------------------\n\nFor working example, see http://mooz.github.com/org-js/editor/.\n\nInstallation\n------------\n\n npm install org\n\nSimple example of org -> HTML conversion\n----------------------------------------\n\n```javascript\nvar org = require(\"org\");\n\nvar parser = new org.Parser();\nvar orgDocument = parser.parse(orgCode);\nvar orgHTMLDocument = orgDocument.convert(org.ConverterHTML, {\n headerOffset: 1,\n exportFromLineNumber: false,\n suppressSubScriptHandling: false,\n suppressAutoLink: false\n});\n\nconsole.dir(orgHTMLDocument); // => { title, contentHTML, tocHTML, toc }\nconsole.log(orgHTMLDocument.toString()) // => Rendered HTML\n```\n\nWriting yet another converter\n-----------------------------\n\nSee `lib/org/converter/html.js`.\n", + "maintainers": [ + { + "name": "mooz", + "email": "stillpedant@gmail.com" + } + ], + "time": { + 
"modified": "2019-01-05T01:37:44.220Z", + "created": "2014-01-01T15:40:31.231Z", + "0.0.2": "2014-01-01T15:40:33.020Z", + "0.0.3": "2014-01-01T15:55:45.497Z", + "0.0.4": "2014-01-02T06:10:26.485Z" + }, + "author": { + "name": "Masafumi Oyamada", + "email": "stillpedant@gmail.com", + "url": "http://mooz.github.io/" + }, + "repository": { + "type": "git", + "url": "git://github.com/mooz/org-js.git" + }, + "users": { + "nak2k": true, + "bgschaid": true, + "422665vijay": true, + "nontau": true + }, + "homepage": "http://mooz.github.com/org-js", + "keywords": [ + "org-mode", + "emacs", + "parser" + ], + "bugs": { + "url": "http://github.com/mooz/org-s/issues" + }, + "readmeFilename": "README.md" +} diff --git a/swh/loader/package/tests/resources/replicate.npmjs.com/org_metadata_visit2.json b/swh/loader/package/tests/resources/replicate.npmjs.com/org_metadata_visit2.json new file mode 100644 index 0000000..d0e56b7 --- /dev/null +++ b/swh/loader/package/tests/resources/replicate.npmjs.com/org_metadata_visit2.json @@ -0,0 +1,347 @@ +{ + "_id": "org", + "_rev": "4-22484cc537f12d3023241211ee34e39d", + "name": "org", + "description": "A parser and converter for org-mode notation", + "dist-tags": { + "latest": "0.2.0" + }, + "versions": { + "0.0.2": { + "name": "org", + "description": "A parser and converter for org-mode notation", + "homepage": "http://mooz.github.com/org-js", + "keywords": [ + "org-mode", + "emacs", + "parser" + ], + "author": { + "name": "mooz", + "email": "stillpedant@gmail.com" + }, + "main": "./lib/org.js", + "version": "0.0.2", + "directories": { + "test": "./tests" + }, + "repository": { + "type": "git", + "url": "git://github.com/mooz/org-js.git" + }, + "bugs": { + "url": "https://github.com/mooz/org-js/issues" + }, + "_id": "org@0.0.2", + "dist": { + "shasum": "12c58092e7de94456a43ef7823eef45e4d1d12fe", + "tarball": "https://registry.npmjs.org/org/-/org-0.0.2.tgz" + }, + "_from": ".", + "_npmVersion": "1.2.25", + "_npmUser": { + "name": "mooz", + 
"email": "stillpedant@gmail.com" + }, + "maintainers": [ + { + "name": "mooz", + "email": "stillpedant@gmail.com" + } + ] + }, + "0.0.3": { + "name": "org", + "description": "A parser and converter for org-mode notation", + "homepage": "http://mooz.github.com/org-js", + "bugs": { + "url": "http://github.com/mooz/org-s/issues" + }, + "keywords": [ + "org-mode", + "emacs", + "parser" + ], + "author": { + "name": "Masafumi Oyamada", + "email": "stillpedant@gmail.com", + "url": "http://mooz.github.io/" + }, + "licenses": [ + { + "type": "MIT" + } + ], + "main": "./lib/org.js", + "version": "0.0.3", + "directories": { + "test": "./tests" + }, + "repository": { + "type": "git", + "url": "git://github.com/mooz/org-js.git" + }, + "_id": "org@0.0.3", + "dist": { + "shasum": "6a44220f88903a6dfc3b47d010238058f9faf3a0", + "tarball": "https://registry.npmjs.org/org/-/org-0.0.3.tgz" + }, + "_from": ".", + "_npmVersion": "1.2.25", + "_npmUser": { + "name": "mooz", + "email": "stillpedant@gmail.com" + }, + "maintainers": [ + { + "name": "mooz", + "email": "stillpedant@gmail.com" + } + ] + }, + "0.0.4": { + "name": "org", + "description": "A parser and converter for org-mode notation", + "homepage": "http://mooz.github.com/org-js", + "bugs": { + "url": "http://github.com/mooz/org-s/issues" + }, + "keywords": [ + "org-mode", + "emacs", + "parser" + ], + "author": { + "name": "Masafumi Oyamada", + "email": "stillpedant@gmail.com", + "url": "http://mooz.github.io/" + }, + "licenses": [ + { + "type": "MIT" + } + ], + "main": "./lib/org.js", + "version": "0.0.4", + "directories": { + "test": "./tests" + }, + "repository": { + "type": "git", + "url": "git://github.com/mooz/org-js.git" + }, + "_id": "org@0.0.4", + "dist": { + "shasum": "788b3be1a50f7c94c1500ae4d922ec76c04e06ea", + "tarball": "https://registry.npmjs.org/org/-/org-0.0.4.tgz" + }, + "_from": ".", + "_npmVersion": "1.2.25", + "_npmUser": { + "name": "mooz", + "email": "stillpedant@gmail.com" + }, + "maintainers": [ + { + 
"name": "mooz", + "email": "stillpedant@gmail.com" + } + ] + }, + "0.0.5": { + "name": "org", + "description": "A parser and converter for org-mode notation", + "homepage": "http://mooz.github.com/org-js", + "bugs": { + "url": "http://github.com/mooz/org-s/issues" + }, + "keywords": [ + "org-mode", + "emacs", + "parser" + ], + "author": { + "name": "Masafumi Oyamada", + "email": "stillpedant@gmail.com", + "url": "http://mooz.github.io/" + }, + "licenses": [ + { + "type": "MIT" + } + ], + "main": "./lib/org.js", + "version": "0.0.5", + "directories": { + "test": "./tests" + }, + "repository": { + "type": "git", + "url": "git://github.com/mooz/org-js.git" + }, + "_id": "org@0.0.5", + "dist": { + "shasum": "66e8c316cb37e1c176f604aa53fcb07b6f51b908", + "tarball": "https://registry.npmjs.org/org/-/org-0.0.5.tgz" + }, + "_from": ".", + "_npmVersion": "1.2.25", + "_npmUser": { + "name": "mooz", + "email": "stillpedant@gmail.com" + }, + "maintainers": [ + { + "name": "mooz", + "email": "stillpedant@gmail.com" + } + ] + }, + "0.1.0": { + "name": "org", + "description": "A parser and converter for org-mode notation", + "homepage": "http://mooz.github.com/org-js", + "bugs": { + "url": "http://github.com/mooz/org-s/issues" + }, + "keywords": [ + "org-mode", + "emacs", + "parser" + ], + "author": { + "name": "Masafumi Oyamada", + "email": "stillpedant@gmail.com", + "url": "http://mooz.github.io/" + }, + "licenses": [ + { + "type": "MIT" + } + ], + "main": "./lib/org.js", + "version": "0.1.0", + "directories": { + "test": "./tests" + }, + "repository": { + "type": "git", + "url": "git://github.com/mooz/org-js.git" + }, + "gitHead": "82ed089de208e82a3bf3463f52fa77006382674c", + "_id": "org@0.1.0", + "scripts": {}, + "_shasum": "bfaab735973c1a88fd62a21faf527ce360a412e9", + "_from": ".", + "_npmVersion": "1.4.28", + "_npmUser": { + "name": "mooz", + "email": "stillpedant@gmail.com" + }, + "maintainers": [ + { + "name": "mooz", + "email": "stillpedant@gmail.com" + } + ], + "dist": { 
+ "shasum": "bfaab735973c1a88fd62a21faf527ce360a412e9", + "tarball": "https://registry.npmjs.org/org/-/org-0.1.0.tgz" + } + }, + "0.2.0": { + "name": "org", + "description": "A parser and converter for org-mode notation", + "homepage": "http://mooz.github.com/org-js", + "bugs": { + "url": "http://github.com/mooz/org-s/issues" + }, + "keywords": [ + "org-mode", + "emacs", + "parser" + ], + "author": { + "name": "Masafumi Oyamada", + "email": "stillpedant@gmail.com", + "url": "http://mooz.github.io/" + }, + "licenses": [ + { + "type": "MIT" + } + ], + "main": "./lib/org.js", + "version": "0.2.0", + "directories": { + "test": "./tests" + }, + "repository": { + "type": "git", + "url": "git://github.com/mooz/org-js.git" + }, + "gitHead": "57b4480747e6e0c26baa43f267218a34b59224a5", + "_id": "org@0.2.0", + "scripts": {}, + "_shasum": "d76378387dc506fb8e3ccff73a0ad04e3afc6391", + "_from": ".", + "_npmVersion": "1.4.28", + "_npmUser": { + "name": "mooz", + "email": "stillpedant@gmail.com" + }, + "maintainers": [ + { + "name": "mooz", + "email": "stillpedant@gmail.com" + } + ], + "dist": { + "shasum": "d76378387dc506fb8e3ccff73a0ad04e3afc6391", + "tarball": "https://registry.npmjs.org/org/-/org-0.2.0.tgz" + } + } + }, + "readme": "org-js\n======\n\nParser and converter for org-mode () notation written in JavaScript.\n\nInteractive Editor\n------------------\n\nFor working example, see http://mooz.github.com/org-js/editor/.\n\nInstallation\n------------\n\n npm install org\n\nSimple example of org -> HTML conversion\n----------------------------------------\n\n```javascript\nvar org = require(\"org\");\n\nvar parser = new org.Parser();\nvar orgDocument = parser.parse(orgCode);\nvar orgHTMLDocument = orgDocument.convert(org.ConverterHTML, {\n headerOffset: 1,\n exportFromLineNumber: false,\n suppressSubScriptHandling: false,\n suppressAutoLink: false\n});\n\nconsole.dir(orgHTMLDocument); // => { title, contentHTML, tocHTML, toc }\nconsole.log(orgHTMLDocument.toString()) // => 
Rendered HTML\n```\n\nWriting yet another converter\n-----------------------------\n\nSee `lib/org/converter/html.js`.\n", + "maintainers": [ + { + "name": "mooz", + "email": "stillpedant@gmail.com" + } + ], + "time": { + "modified": "2019-01-05T01:37:44.220Z", + "created": "2014-01-01T15:40:31.231Z", + "0.0.2": "2014-01-01T15:40:33.020Z", + "0.0.3": "2014-01-01T15:55:45.497Z", + "0.0.4": "2014-01-02T06:10:26.485Z", + "0.0.5": "2014-01-03T13:58:20.540Z", + "0.1.0": "2014-11-23T03:47:12.464Z", + "0.2.0": "2015-02-21T07:14:47.785Z" + }, + "author": { + "name": "Masafumi Oyamada", + "email": "stillpedant@gmail.com", + "url": "http://mooz.github.io/" + }, + "repository": { + "type": "git", + "url": "git://github.com/mooz/org-js.git" + }, + "users": { + "nak2k": true, + "bgschaid": true, + "422665vijay": true, + "nontau": true + }, + "homepage": "http://mooz.github.com/org-js", + "keywords": [ + "org-mode", + "emacs", + "parser" + ], + "bugs": { + "url": "http://github.com/mooz/org-s/issues" + }, + "readmeFilename": "README.md" +} diff --git a/swh/loader/package/tests/test_npm.py b/swh/loader/package/tests/test_npm.py new file mode 100644 index 0000000..81cdfa1 --- /dev/null +++ b/swh/loader/package/tests/test_npm.py @@ -0,0 +1,474 @@ +# Copyright (C) 2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +import json +import os +import re +import unittest + +from swh.model.hashutil import hash_to_bytes + +from swh.loader.package.npm import ( + parse_npm_package_author, extract_npm_package_author +) +from swh.loader.package.tests.common import ( + DATADIR, get_response_cb, check_snapshot +) +from swh.loader.package.npm import NpmLoader + + +def package_metadata_file(package, visit=''): + json_filename = '%s_metadata' % package + if visit: + json_filename += '_visit%s' % visit + json_filename 
+= '.json' + return json_filename + + +class TestNpmClient(unittest.TestCase): + + def _parse_author_string_test(self, author_str, expected_result): + self.assertEqual( + parse_npm_package_author(author_str), + expected_result + ) + self.assertEqual( + parse_npm_package_author(' %s' % author_str), + expected_result + ) + self.assertEqual( + parse_npm_package_author('%s ' % author_str), + expected_result + ) + + def test_parse_npm_package_author(self): + + self._parse_author_string_test( + 'John Doe', + { + 'name': 'John Doe' + } + ) + + self._parse_author_string_test( + '<john.doe@foo.bar>', + { + 'email': 'john.doe@foo.bar' + } + ) + + self._parse_author_string_test( + '(https://john.doe)', + { + 'url': 'https://john.doe' + } + ) + + self._parse_author_string_test( + 'John Doe <john.doe@foo.bar>', + { + 'name': 'John Doe', + 'email': 'john.doe@foo.bar' + } + ) + + self._parse_author_string_test( + 'John Doe<john.doe@foo.bar>', + { + 'name': 'John Doe', + 'email': 'john.doe@foo.bar' + } + ) + + self._parse_author_string_test( + 'John Doe (https://john.doe)', + { + 'name': 'John Doe', + 'url': 'https://john.doe' + } + ) + + self._parse_author_string_test( + 'John Doe(https://john.doe)', + { + 'name': 'John Doe', + 'url': 'https://john.doe' + } + ) + + self._parse_author_string_test( + '<john.doe@foo.bar> (https://john.doe)', + { + 'email': 'john.doe@foo.bar', + 'url': 'https://john.doe' + } + ) + + self._parse_author_string_test( + '(https://john.doe) <john.doe@foo.bar>', + { + 'email': 'john.doe@foo.bar', + 'url': 'https://john.doe' + } + ) + + self._parse_author_string_test( + 'John Doe <john.doe@foo.bar> (https://john.doe)', + { + 'name': 'John Doe', + 'email': 'john.doe@foo.bar', + 'url': 'https://john.doe' + } + ) + + self._parse_author_string_test( + 'John Doe (https://john.doe) <john.doe@foo.bar>', + { + 'name': 'John Doe', + 'email': 'john.doe@foo.bar', + 'url': 'https://john.doe' + } + ) + + self._parse_author_string_test( + 'John Doe<john.doe@foo.bar> (https://john.doe)', + { + 'name': 'John Doe', + 'email': 'john.doe@foo.bar', + 'url': 'https://john.doe' + } + ) + + self._parse_author_string_test(
+ 'John Doe<john.doe@foo.bar>(https://john.doe)', + { + 'name': 'John Doe', + 'email': 'john.doe@foo.bar', + 'url': 'https://john.doe' + } + ) + + self._parse_author_string_test('', {}) + self._parse_author_string_test('<>', {}) + self._parse_author_string_test(' <>', {}) + self._parse_author_string_test('<>()', {}) + self._parse_author_string_test('<> ()', {}) + self._parse_author_string_test('()', {}) + self._parse_author_string_test(' ()', {}) + + self._parse_author_string_test( + 'John Doe <> ()', + { + 'name': 'John Doe' + } + ) + + self._parse_author_string_test( + 'John Doe <>', + { + 'name': 'John Doe' + } + ) + + self._parse_author_string_test( + 'John Doe ()', + { + 'name': 'John Doe' + } + ) + + def test_extract_npm_package_author(self): + package_metadata_filepath = os.path.join( + DATADIR, 'replicate.npmjs.com', 'org_metadata_visit2.json') + + with open(package_metadata_filepath) as json_file: + package_metadata = json.load(json_file) + + self.assertEqual( + extract_npm_package_author(package_metadata['versions']['0.0.2']), + { + 'fullname': b'mooz <stillpedant@gmail.com>', + 'name': b'mooz', + 'email': b'stillpedant@gmail.com' + } + ) + + self.assertEqual( + extract_npm_package_author(package_metadata['versions']['0.0.3']), + { + 'fullname': b'Masafumi Oyamada <stillpedant@gmail.com>', + 'name': b'Masafumi Oyamada', + 'email': b'stillpedant@gmail.com' + } + ) + + package_json = json.loads(''' + { + "name": "highlightjs-line-numbers.js", + "version": "2.7.0", + "description": "Highlight.js line numbers plugin.", + "main": "src/highlightjs-line-numbers.js", + "dependencies": {}, + "devDependencies": { + "gulp": "^4.0.0", + "gulp-rename": "^1.4.0", + "gulp-replace": "^0.6.1", + "gulp-uglify": "^1.2.0" + }, + "repository": { + "type": "git", + "url": "https://github.com/wcoder/highlightjs-line-numbers.js.git" + }, + "author": "Yauheni Pakala <evgeniy.pakalo@gmail.com>", + "license": "MIT", + "bugs": { + "url": "https://github.com/wcoder/highlightjs-line-numbers.js/issues" + }, + "homepage":
"http://wcoder.github.io/highlightjs-line-numbers.js/" + }''') # noqa + + self.assertEqual( + extract_npm_package_author(package_json), + { + 'fullname': b'Yauheni Pakala <evgeniy.pakalo@gmail.com>', + 'name': b'Yauheni Pakala', + 'email': b'evgeniy.pakalo@gmail.com' + } + ) + + package_json = json.loads(''' + { + "name": "3-way-diff", + "version": "0.0.1", + "description": "3-way diffing of JavaScript objects", + "main": "index.js", + "authors": [ + { + "name": "Shawn Walsh", + "url": "https://github.com/shawnpwalsh" + }, + { + "name": "Markham F Rollins IV", + "url": "https://github.com/mrollinsiv" + } + ], + "keywords": [ + "3-way diff", + "3 way diff", + "three-way diff", + "three way diff" + ], + "devDependencies": { + "babel-core": "^6.20.0", + "babel-preset-es2015": "^6.18.0", + "mocha": "^3.0.2" + }, + "dependencies": { + "lodash": "^4.15.0" + } + }''') + + self.assertEqual( + extract_npm_package_author(package_json), + { + 'fullname': b'Shawn Walsh', + 'name': b'Shawn Walsh', + 'email': None + } + ) + + package_json = json.loads(''' + { + "name": "yfe-ynpm", + "version": "1.0.0", + "homepage": "http://gitlab.ywwl.com/yfe/yfe-ynpm", + "repository": { + "type": "git", + "url": "git@gitlab.ywwl.com:yfe/yfe-ynpm.git" + }, + "author": [ + "fengmk2 <fengmk2@gmail.com> (https://fengmk2.com)", + "xufuzi (https://7993.org)" + ], + "license": "MIT" + }''') + + self.assertEqual( + extract_npm_package_author(package_json), + { + 'fullname': b'fengmk2 <fengmk2@gmail.com>', + 'name': b'fengmk2', + 'email': b'fengmk2@gmail.com' + } + ) + + package_json = json.loads(''' + { + "name": "umi-plugin-whale", + "version": "0.0.8", + "description": "Internal contract component", + "authors": { + "name": "xiaohuoni", + "email": "448627663@qq.com" + }, + "repository": "alitajs/whale", + "devDependencies": { + "np": "^3.0.4", + "umi-tools": "*" + }, + "license": "MIT" + }''') + + self.assertEqual( + extract_npm_package_author(package_json), + { + 'fullname': b'xiaohuoni <448627663@qq.com>', + 'name': b'xiaohuoni', + 'email': b'448627663@qq.com'
+ } + ) + + +def normalize_hashes(hashes): + if isinstance(hashes, str): + return hash_to_bytes(hashes) + if isinstance(hashes, list): + return [hash_to_bytes(x) for x in hashes] + return {hash_to_bytes(k): hash_to_bytes(v) for k, v in hashes.items()} + + +_expected_new_contents_first_visit = normalize_hashes([ + '4ce3058e16ab3d7e077f65aabf855c34895bf17c', + '858c3ceee84c8311adc808f8cdb30d233ddc9d18', + '0fa33b4f5a4e0496da6843a38ff1af8b61541996', + '85a410f8ef8eb8920f2c384a9555566ad4a2e21b', + '9163ac8025923d5a45aaac482262893955c9b37b', + '692cf623b8dd2c5df2c2998fd95ae4ec99882fb4', + '18c03aac6d3e910efb20039c15d70ab5e0297101', + '41265c42446aac17ca769e67d1704f99e5a1394d', + '783ff33f5882813dca9239452c4a7cadd4dba778', + 'b029cfb85107aee4590c2434a3329bfcf36f8fa1', + '112d1900b4c2e3e9351050d1b542c9744f9793f3', + '5439bbc4bd9a996f1a38244e6892b71850bc98fd', + 'd83097a2f994b503185adf4e719d154123150159', + 'd0939b4898e83090ee55fd9d8a60e312cfadfbaf', + 'b3523a26f7147e4af40d9d462adaae6d49eda13e', + 'cd065fb435d6fb204a8871bcd623d0d0e673088c', + '2854a40855ad839a54f4b08f5cff0cf52fca4399', + 'b8a53bbaac34ebb8c6169d11a4b9f13b05c583fe', + '0f73d56e1cf480bded8a1ecf20ec6fc53c574713', + '0d9882b2dfafdce31f4e77fe307d41a44a74cefe', + '585fc5caab9ead178a327d3660d35851db713df1', + 'e8cd41a48d79101977e3036a87aeb1aac730686f', + '5414efaef33cceb9f3c9eb5c4cc1682cd62d14f7', + '9c3cc2763bf9e9e37067d3607302c4776502df98', + '3649a68410e354c83cd4a38b66bd314de4c8f5c9', + 'e96ed0c091de1ebdf587104eaf63400d1974a1fe', + '078ca03d2f99e4e6eab16f7b75fbb7afb699c86c', + '38de737da99514de6559ff163c988198bc91367a', +]) + +_expected_new_directories_first_visit = normalize_hashes([ + '3370d20d6f96dc1c9e50f083e2134881db110f4f', + '42753c0c2ab00c4501b552ac4671c68f3cf5aece', + 'd7895533ef5edbcffdea3f057d9fef3a1ef845ce', + '80579be563e2ef3e385226fe7a3f079b377f142c', + '3b0ddc6a9e58b4b53c222da4e27b280b6cda591c', + 'bcad03ce58ac136f26f000990fc9064e559fe1c0', + '5fc7e82a1bc72e074665c6078c6d3fad2f13d7ca', + 
'e3cd26beba9b1e02f6762ef54bd9ac80cc5f25fd', + '584b5b4b6cf7f038095e820b99386a9c232de931', + '184c8d6d0d242f2b1792ef9d3bf396a5434b7f7a', + 'bb5f4ee143c970367eb409f2e4c1104898048b9d', + '1b95491047add1103db0dfdfa84a9735dcb11e88', + 'a00c6de13471a2d66e64aca140ddb21ef5521e62', + '5ce6c1cd5cda2d546db513aaad8c72a44c7771e2', + 'c337091e349b6ac10d38a49cdf8c2401ef9bb0f2', + '202fafcd7c0f8230e89d5496ad7f44ab12b807bf', + '775cc516543be86c15c1dc172f49c0d4e6e78235', + 'ff3d1ead85a14f891e8b3fa3a89de39db1b8de2e', +]) + +_expected_new_revisions_first_visit = normalize_hashes({ + 'd8a1c7474d2956ac598a19f0f27d52f7015f117e': + '42753c0c2ab00c4501b552ac4671c68f3cf5aece', + '5f9eb78af37ffd12949f235e86fac04898f9f72a': + '3370d20d6f96dc1c9e50f083e2134881db110f4f', + 'ba019b192bdb94bd0b5bd68b3a5f92b5acc2239a': + 'd7895533ef5edbcffdea3f057d9fef3a1ef845ce'} +) + +_expected_new_snapshot_first_visit_id = normalize_hashes( + 'd0587e1195aed5a8800411a008f2f2d627f18e2d') + +_expected_branches_first_visit = { + 'HEAD': { + 'target': 'releases/0.0.4', + 'target_type': 'alias' + }, + 'releases/0.0.2': { + 'target': 'd8a1c7474d2956ac598a19f0f27d52f7015f117e', + 'target_type': 'revision' + }, + 'releases/0.0.3': { + 'target': '5f9eb78af37ffd12949f235e86fac04898f9f72a', + 'target_type': 'revision' + }, + 'releases/0.0.4': { + 'target': 'ba019b192bdb94bd0b5bd68b3a5f92b5acc2239a', + 'target_type': 'revision' + } +} + + +def package_url(package): + return 'https://www.npmjs.com/package/%s' % package + + +def package_metadata_url(package): + return 'https://replicate.npmjs.com/%s/' % package + + +def test_npm_loader_2_first_visit(requests_mock): + + package = 'org' + requests_mock.get(re.compile('https://'), + body=get_response_cb) + loader = NpmLoader(package, + package_url(package), + package_metadata_url(package)) + + actual_load_status = loader.load() + assert actual_load_status == {'status': 'eventful'} + + stats = loader.storage.stat_counters() + + assert { + 'content': 
len(_expected_new_contents_first_visit), + 'directory': len(_expected_new_directories_first_visit), + 'origin': 1, + 'origin_visit': 1, + 'person': 2, + 'release': 0, + 'revision': len(_expected_new_revisions_first_visit), + 'skipped_content': 0, + 'snapshot': 1, + } == stats + + assert len(list(loader.storage.content_get( + _expected_new_contents_first_visit))) == len( + _expected_new_contents_first_visit) + + assert list(loader.storage.directory_missing( + _expected_new_directories_first_visit)) == [] + + assert list(loader.storage.revision_missing( + _expected_new_revisions_first_visit)) == [] + + check_snapshot( + _expected_new_snapshot_first_visit_id, + _expected_branches_first_visit, + storage=loader.storage)