# Copyright (C) 2019  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import json
import logging
import os
import re

from codecs import BOM_UTF8
from typing import Any, Dict, Generator, Mapping, Sequence, Tuple, Optional

import chardet
import iso8601

from swh.model.identifiers import normalize_timestamp
from swh.loader.package.loader import PackageLoader
from swh.loader.package.utils import api_info, release_name


logger = logging.getLogger(__name__)


_EMPTY_AUTHOR = {'fullname': b'', 'name': None, 'email': None}

# https://github.com/jonschlinkert/author-regex
_author_regexp = r'([^<(]+?)?[ \t]*(?:<([^>(]+?)>)?[ \t]*(?:\(([^)]+?)\)|$)'


class NpmLoader(PackageLoader):
    """Package loader for npm, driven by the registry metadata document."""
    visit_type = 'npm'

    def __init__(self, package_name, package_url, package_metadata_url):
        """Set up the loader for one npm package.

        Args:
            package_name: name of the package (not used by this class
                itself; both URLs must be passed explicitly)
            package_url: origin url, forwarded to the parent loader
            package_metadata_url: url of the registry metadata document,
                kept as ``provider_url``

        """
        super().__init__(url=package_url)
        self.provider_url = package_metadata_url
        self._info = None  # lazily filled by the ``info`` property
        self._versions = None

    @property
    def info(self) -> Dict:
        """Return the project metadata information (fetched from npm
        registry).

        The document is fetched through ``api_info`` on first access and
        then cached in ``self._info``.

        """
        if not self._info:
            self._info = api_info(self.provider_url)
        return self._info

    def get_versions(self) -> Sequence[str]:
        """Return the sorted version strings listed in the metadata."""
        return sorted(self.info['versions'])

    def get_default_version(self) -> str:
        """Return the version tagged ``latest`` ('' when there is none)."""
        return self.info['dist-tags'].get('latest', '')

    def get_package_info(self, version: str) -> Generator[
            Tuple[str, Mapping[str, Any]], None, None]:
        """Yield the branch name and artifact description of ``version``.

        The description holds the tarball ``url``, its ``filename`` and the
        ``raw`` registry metadata of that version.

        """
        meta = self.info['versions'][version]
        url = meta['dist']['tarball']
        p_info = {
            'url': url,
            'filename': os.path.basename(url),
            'raw': meta,
        }
        yield release_name(version), p_info

    def resolve_revision_from(
            self, known_artifacts: Dict, artifact_metadata: Dict) \
            -> Optional[bytes]:
        """Return the id of a known revision built from the same tarball.

        A known artifact matches when the sha1 of its first original
        artifact equals the ``dist.shasum`` of ``artifact_metadata``.
        Returns None when nothing matches.

        """
        shasum = artifact_metadata['dist']['shasum']
        for rev_id, known_artifact in known_artifacts.items():
            original_artifact = known_artifact['original_artifact'][0]
            if shasum == original_artifact['checksums']['sha1']:
                return rev_id
        return None

    def build_revision(
            self, a_metadata: Dict, uncompressed_path: str) -> Dict:
        """Build the revision dict of one uncompressed release artifact.

        Args:
            a_metadata: registry (extrinsic) metadata of the version
            uncompressed_path: directory the tarball was extracted to

        Returns:
            The revision as a dict; author and message come from the
            package.json found in the archive, the date from the registry.

        Raises:
            KeyError: if the package.json has no ``version`` entry, or the
                registry metadata has no date for the version.

        """
        i_metadata = extract_intrinsic_metadata(uncompressed_path)

        # from intrinsic metadata
        author = extract_npm_package_author(i_metadata)
        # package.json content is arbitrary: encode as UTF-8 (identical
        # bytes for ASCII versions) instead of failing on non-ASCII input
        message = i_metadata['version'].encode('utf-8')

        # from extrinsic metadata

        # No date available in intrinsic metadata: retrieve it from the API
        # metadata, using the version number that the API claims this package
        # has.
        extrinsic_version = a_metadata['version']
        date = self.info['time'][extrinsic_version]
        date = iso8601.parse_date(date)
        date = normalize_timestamp(int(date.timestamp()))

        return {
            'type': 'tar',
            'message': message,
            'author': author,
            'date': date,
            'committer': author,
            'committer_date': date,
            'parents': [],
            'metadata': {
                'intrinsic': {
                    'tool': 'package.json',
                    'raw': i_metadata,
                },
                'extrinsic': {
                    'provider': self.provider_url,
                    'when': self.visit_date.isoformat(),
                    'raw': a_metadata,
                },
            },
        }
def parse_npm_package_author(author_str):
    """
    Parse npm package author string.

    It works with a flexible range of formats, as detailed below::

        name
        name <email> (url)
        name <email>(url)
        name<email> (url)
        name<email>(url)
        name (url) <email>
        name (url)<email>
        name(url) <email>
        name(url)<email>
        name (url)
        name(url)
        name <email>
        name<email>
        <email> (url)
        <email>(url)
        (url) <email>
        (url)<email>
        <email>
        (url)

    Args:
        author_str (str): input author string

    Returns:
        dict: A dict that may contain the following keys:
            * name
            * email
            * url

    """
    author = {}
    # empty '<>' and '()' placeholders carry no information: drop them first
    cleaned = author_str.replace('<>', '').replace('()', '')
    for name, email, url in re.findall(_author_regexp, cleaned, re.M):
        if name.strip():
            author['name'] = name.strip()
        if email.strip():
            author['email'] = email.strip()
        if url.strip():
            author['url'] = url.strip()
    return author


def extract_npm_package_author(package_json):
    """
    Extract package author from a ``package.json`` file content and
    return it in swh format.

    Both the ``author`` and ``authors`` entries are looked at. A later entry
    replaces an earlier one only when it provides a name, so an empty or
    unusable ``authors`` no longer erases a valid ``author``. Values that
    are neither a string, a dict nor a list are treated as absent.

    Args:
        package_json (dict): Dict holding the content of parsed
            ``package.json`` file

    Returns:
        dict: A dict with the following keys:
            * fullname
            * name
            * email

        Values are utf-8 encoded bytes (``email`` may be None). When no
        name can be found, a fresh copy of the empty author is returned.

    """

    def _author_str(author_data):
        # Normalise the supported shapes to a single 'name <email>' string
        if isinstance(author_data, dict):
            author_str = ''
            name = author_data.get('name')
            if isinstance(name, str):
                author_str += name
            email = author_data.get('email')
            if isinstance(email, str):
                author_str += ' <%s>' % email
            return author_str
        if isinstance(author_data, list):
            # only the first listed author is considered
            return _author_str(author_data[0]) if author_data else ''
        # null, numbers, ... cannot be parsed as an author string
        return author_data if isinstance(author_data, str) else ''

    author_data = {}
    for author_key in ('author', 'authors'):
        if author_key not in package_json:
            continue
        parsed = parse_npm_package_author(
            _author_str(package_json[author_key]))
        if parsed.get('name') or not author_data:
            author_data = parsed

    name = author_data.get('name')
    email = author_data.get('email')

    if not name:
        # never hand out the shared module-level dict itself
        return dict(_EMPTY_AUTHOR)

    fullname = '%s <%s>' % (name, email) if email else name
    return {
        'fullname': fullname.encode('utf-8'),
        'name': name.encode('utf-8'),
        'email': email.encode('utf-8') if email else None,
    }


def _lstrip_bom(s, bom=BOM_UTF8):
    """Return ``s`` without its leading ``bom``, if it starts with one."""
    return s[len(bom):] if s.startswith(bom) else s


def load_json(json_bytes):
    """
    Try to load JSON from bytes and return a dictionary.

    First try to decode from utf-8. If the decoding failed,
    try to detect the encoding and decode again with replace
    error handling.

    If the encoding cannot be determined, if JSON is malformed, or if the
    document is not a JSON object, an empty dictionary will be returned.

    Args:
        json_bytes (bytes): binary content of a JSON file

    Returns:
        dict: JSON data loaded in a dictionary
    """
    try:
        json_str = _lstrip_bom(json_bytes).decode('utf-8')
    except UnicodeDecodeError:
        encoding = chardet.detect(json_bytes)['encoding']
        if not encoding:
            # detection failed: nothing can be decoded
            return {}
        try:
            json_str = json_bytes.decode(encoding, 'replace')
        except LookupError:
            # detected encoding name has no codec in this Python
            return {}

    try:
        json_data = json.loads(json_str)
    except json.decoder.JSONDecodeError:
        return {}

    # callers index the result by key: only a JSON object is usable
    return json_data if isinstance(json_data, dict) else {}


def extract_intrinsic_metadata(dir_path: str) -> Dict:
    """Given an uncompressed path holding the pkginfo file, returns a
       pkginfo parsed structure as a dict.

       The release artifact contains at their root one folder. For example:
       $ tar tvf zprint-0.0.6.tar.gz
       drwxr-xr-x root/root         0 2018-08-22 11:01 zprint-0.0.6/
       ...

       Root entries are visited in sorted order and the first one holding a
       ``package.json`` file is used, so stray root entries do not make the
       result depend on directory listing order.

    Args:
        dir_path (str): Path to the uncompressed directory
                        representing a release artifact from npm.

    Returns:
        the pkginfo parsed structure as a dict if any or an empty dict if
        none was present.

    """
    if not os.path.isdir(dir_path):
        return {}

    for project_dirname in sorted(os.listdir(dir_path)):
        package_json_path = os.path.join(
            dir_path, project_dirname, 'package.json')
        if not os.path.isfile(package_json_path):
            continue
        with open(package_json_path, 'rb') as package_json_file:
            return load_json(package_json_file.read())

    return {}
lint:eslint:fix", + "lint:eslint": "eslint --max-warnings=0 .", + "lint:eslint:fix": "yarn lint:eslint --fix", + "lint:eslint:junit": "yarn lint:eslint --format junit -o reports/junit/js-lint-results.xml", + "storybook": "start-storybook -p 9001 -c .storybook", + "test": "yarn lint:eslint && yarn test:jest", + "test:ci": "yarn lint:eslint:junit && yarn test:jest:junit", + "test:jest": "NODE_ENV=test jest", + "test:jest:junit": "yarn test:jest --ci --runInBand --reporters=default --reporters=jest-junit", + "gql:generate": "graphql-codegen", + "gql:watch": "graphql-codegen --watch", + "gql:init": "graphql-codegen init" + }, + "publishConfig": { + "access": "public" + }, + "devDependencies": { + "@graphql-codegen/cli": "1.7.0", + "@graphql-codegen/fragment-matcher": "^1.7.0", + "@graphql-codegen/typescript": "1.7.0", + "@graphql-codegen/typescript-operations": "1.7.0", + "@graphql-codegen/typescript-react-apollo": "1.7.0", + "@storybook/addon-a11y": "5.2.1", + "@storybook/addon-actions": "5.2.1", + "@storybook/addon-contexts": "^5.1.11", + "@storybook/addon-links": "5.2.1", + "@storybook/addon-viewport": "5.2.1", + "@storybook/addons": "5.2.1", + "@storybook/react": "5.2.1", + "@storybook/theming": "5.2.1", + "fs-extra": "^8.1.0", + "glob": "^7.1.3", + "ts-jest": "^24.1.0" + }, + "main": "./src/index", + "module": "./src/index", + "sideEffects": false, + "engines": { + "node": ">=10.0.0" + }, + "dependencies": { + "@aller/blink-labrador": "8.2.2", + "@aller/shiny": "^0.47.0", + "@apollo/react-hooks": "3.1.1", + "@babel/core": "7.6.2", + "@firebase/app-types": "0.x", + "@firebase/util": "0.x", + "@godaddy/terminus": "4.2.0", + "@material-ui/core": "4.4.3", + "@material-ui/lab": "3.0.0-alpha.30", + "@zeit/next-bundle-analyzer": "0.1.2", + "acorn": "7.1.0", + "adblock-detect": "1.0.5", + "apollo-cache-inmemory": "^1.6.3", + "apollo-client": "^2.6.4", + "apollo-link": "^1.2.13", + "apollo-link-error": "^1.1.12", + "apollo-link-http": "^1.5.16", + "arcads": "1.4.5", + 
"aurora-deep-slice-merge": "github:soldotno/aurora-deep-slice-merge", + "axios": "0.19.0", + "babel-plugin-styled-components": "1.10.6", + "circular-json": "0.5.9", + "classnames": "2.2.6", + "compression": "1.7.4", + "copy-webpack-plugin": "5.0.4", + "date-fns": "2.3.0", + "debug": "3.2.6", + "dotenv": "8.1.0", + "express": "4.17.1", + "express-prom-bundle": "5.1.5", + "firebase": "7.0.0", + "graphql": "14.5.8", + "graphql-tag": "^2.10.1", + "hoist-non-react-statics": "3.3.0", + "intersection-observer": "0.7.0", + "is-client": "0.0.2", + "isomorphic-unfetch": "3.0.0", + "jaeger-client": "3.17.0", + "js-cookie": "2.2.1", + "lazysizes": "5.1.1", + "lodash": "4.17.15", + "lodash.debounce": "4.0.8", + "lodash.isinteger": "4.0.4", + "lodash.throttle": "4.1.1", + "next": "9.0.5", + "next-transpile-modules": "2.3.1", + "nodesi": "1.11.0", + "on-idle": "3.1.4", + "opentracing": "0.14.4", + "ot-got": "0.3.1", + "polished": "3.4.1", + "prebid.js": "^2.27.0", + "prom-client": "11.5.3", + "prop-types": "15.7.2", + "query-string": "5.1.1", + "react": "16.9.0", + "react-checkbox-group": "4.0.1", + "react-content-loader": "^4.2.2", + "react-dom": "16.9.0", + "react-gpt": "2.0.1", + "react-instagram-embed": "^1.5.0", + "react-intersection-observer": "8.24.2", + "react-prebid": "2.1.1", + "react-redux": "7.1.1", + "redux": "4.0.4", + "redux-thunk": "2.3.0", + "reselect": "4.0.0", + "serve-static": "1.14.1", + "sol-menu-content": "1.0.10", + "storybook-chromatic": "2.2.2", + "striptags": "3.1.1", + "styled-components": "5.0.0-beta.8", + "ucfirst": "1.0.0", + "ulog": "beta", + "victory-pie": "33.1.0", + "victory-tooltip": "33.1.0" + }, + "_id": "@aller/shared@0.1.0", + "dist": { + "shasum": "8231e9ad6d175843e1022cd7d0eb0ad6b6087603", + "integrity": "sha512-I2QHU6gHft3QN50wTZhNjzAesXbTEmdX/G3F5ID0+s7VXC7Z3EsUP9Abx2/G78JGINfe4mCqHRXPgHn9z9JKRA==", + "tarball": "https://registry.npmjs.org/@aller/shared/-/shared-0.1.0.tgz", + "fileCount": 686, + "unpackedSize": 6182869 + }, + 
"maintainers": [], + "directories": null, + "_npmOperationalInternal": { + "host": "s3://npm-registry-packages", + "tmp": "tmp/shared_0.1.0_1570023680880_0.29907609962091875" + }, + "_hasShrinkwrap": false + }, + "0.1.1-alpha.14": { + "name": "@aller/shared", + "version": "0.1.1-alpha.14", + "description": "Library of shared code", + "license": "UNLICENSED", + "scripts": { + "build": "yarn build-storybook", + "build-storybook": "build-storybook -c .storybook -o storybook-static", + "lint": "yarn lint:eslint", + "lint:fix": "yarn lint:eslint:fix", + "lint:eslint": "eslint --max-warnings=0 .", + "lint:eslint:fix": "yarn lint:eslint --fix", + "lint:eslint:junit": "yarn lint:eslint --format junit -o reports/junit/js-lint-results.xml", + "storybook": "start-storybook -p 9001 -c .storybook", + "test": "yarn lint:eslint && yarn test:jest", + "test:ci": "yarn lint:eslint:junit && yarn test:jest:junit", + "test:jest": "NODE_ENV=test jest", + "test:jest:junit": "yarn test:jest --ci --runInBand --reporters=default --reporters=jest-junit", + "gql:generate": "graphql-codegen", + "gql:watch": "graphql-codegen --watch", + "gql:init": "graphql-codegen init" + }, + "publishConfig": { + "access": "public" + }, + "devDependencies": { + "@graphql-codegen/cli": "1.7.0", + "@graphql-codegen/fragment-matcher": "^1.7.0", + "@graphql-codegen/typescript": "1.7.0", + "@graphql-codegen/typescript-operations": "1.7.0", + "@graphql-codegen/typescript-react-apollo": "1.7.0", + "@storybook/addon-a11y": "5.2.1", + "@storybook/addon-actions": "5.2.1", + "@storybook/addon-contexts": "^5.1.11", + "@storybook/addon-links": "5.2.1", + "@storybook/addon-viewport": "5.2.1", + "@storybook/addons": "5.2.1", + "@storybook/react": "5.2.1", + "@storybook/theming": "5.2.1", + "fs-extra": "^8.1.0", + "glob": "^7.1.3", + "ts-jest": "^24.1.0" + }, + "main": "./src/index", + "module": "./src/index", + "sideEffects": false, + "engines": { + "node": ">=10.0.0" + }, + "dependencies": { + "@aller/blink-labrador": 
"8.2.2", + "@aller/shiny": "^0.47.0", + "@apollo/react-hooks": "3.1.1", + "@babel/core": "7.6.2", + "@firebase/app-types": "0.x", + "@firebase/util": "0.x", + "@godaddy/terminus": "4.2.0", + "@material-ui/core": "4.4.3", + "@material-ui/lab": "3.0.0-alpha.30", + "@zeit/next-bundle-analyzer": "0.1.2", + "acorn": "7.1.0", + "adblock-detect": "1.0.5", + "apollo-cache-inmemory": "^1.6.3", + "apollo-client": "^2.6.4", + "apollo-link": "^1.2.13", + "apollo-link-error": "^1.1.12", + "apollo-link-http": "^1.5.16", + "arcads": "1.4.5", + "aurora-deep-slice-merge": "github:soldotno/aurora-deep-slice-merge", + "axios": "0.19.0", + "babel-plugin-styled-components": "1.10.6", + "circular-json": "0.5.9", + "classnames": "2.2.6", + "compression": "1.7.4", + "copy-webpack-plugin": "5.0.4", + "date-fns": "2.3.0", + "debug": "3.2.6", + "dotenv": "8.1.0", + "express": "4.17.1", + "express-prom-bundle": "5.1.5", + "firebase": "7.0.0", + "graphql": "14.5.8", + "graphql-tag": "^2.10.1", + "hoist-non-react-statics": "3.3.0", + "intersection-observer": "0.7.0", + "is-client": "0.0.2", + "isomorphic-unfetch": "3.0.0", + "jaeger-client": "3.17.0", + "js-cookie": "2.2.1", + "lazysizes": "5.1.1", + "lodash": "4.17.15", + "lodash.debounce": "4.0.8", + "lodash.isinteger": "4.0.4", + "lodash.throttle": "4.1.1", + "next": "9.0.5", + "next-transpile-modules": "2.3.1", + "nodesi": "1.11.0", + "on-idle": "3.1.4", + "opentracing": "0.14.4", + "ot-got": "0.3.1", + "polished": "3.4.1", + "prebid.js": "^2.27.0", + "prom-client": "11.5.3", + "prop-types": "15.7.2", + "query-string": "5.1.1", + "react": "16.9.0", + "react-checkbox-group": "4.0.1", + "react-content-loader": "^4.2.2", + "react-dom": "16.9.0", + "react-gpt": "2.0.1", + "react-instagram-embed": "^1.5.0", + "react-intersection-observer": "8.24.2", + "react-prebid": "2.1.1", + "react-redux": "7.1.1", + "redux": "4.0.4", + "redux-thunk": "2.3.0", + "reselect": "4.0.0", + "serve-static": "1.14.1", + "sol-menu-content": "1.0.10", + 
"storybook-chromatic": "2.2.2", + "striptags": "3.1.1", + "styled-components": "5.0.0-beta.8", + "ucfirst": "1.0.0", + "ulog": "beta", + "victory-pie": "33.1.0", + "victory-tooltip": "33.1.0" + }, + "gitHead": "ac08282d37c72abf3f7c7b1238edb6a653195646", + "readme": "ERROR: No README data found!", + "_id": "@aller/shared@0.1.1-alpha.14", + "_nodeVersion": "10.16.3", + "_npmVersion": "lerna/3.16.4/node@v10.16.3+x64 (linux)", + "dist": { + "integrity": "sha512-WDgwdL76W6aV44dw+asf+QA1uWHVCt0074WJSVA9VxVWIjL5S4Dg8+b5GnU9047KalLW8fy53PlOQqClJ3qJ1A==", + "shasum": "b36f8ea2d17305ce1110fb096d23d8abf948ff6d", + "tarball": "https://registry.npmjs.org/@aller/shared/-/shared-0.1.1-alpha.14.tgz", + "fileCount": 504, + "unpackedSize": 850607 + }, + "maintainers": [], + "directories": null, + "_npmOperationalInternal": { + "host": "s3://npm-registry-packages", + "tmp": "tmp/shared_0.1.1-alpha.14_1570024938845_0.37086021016657233" + }, + "_hasShrinkwrap": false + } + }, + "time": { + "created": "2019-10-01T18:57:21.566Z", + "modified": "2019-11-07T14:11:06.245Z", + "0.1.0": "2019-10-02T13:41:21.150Z", + "0.1.1-alpha.14": "2019-10-02T14:02:19.056Z" + }, + "maintainers": [], + "description": "Library of shared code", + "license": "UNLICENSED", + "readme": "", + "readmeFilename": "" +} diff --git a/swh/loader/package/tests/test_npm.py b/swh/loader/package/tests/test_npm.py index 2175d87..0b38cca 100644 --- a/swh/loader/package/tests/test_npm.py +++ b/swh/loader/package/tests/test_npm.py @@ -1,536 +1,584 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json import os +import pytest from swh.model.hashutil import hash_to_bytes from swh.loader.package.npm import ( parse_npm_package_author, extract_npm_package_author ) from swh.loader.package.tests.common import ( check_snapshot, 
from swh.loader.package.npm import NpmLoader


def _parse_author_string_test(author_str, expected_result):
    """Check the parsing of ``author_str``, bare and space-padded."""
    assert parse_npm_package_author(author_str) == expected_result
    assert parse_npm_package_author(' %s' % author_str) == expected_result
    assert parse_npm_package_author('%s ' % author_str) == expected_result


def test_parse_npm_package_author():
    """Every documented author string format is parsed as expected."""
    _parse_author_string_test(
        'John Doe',
        {'name': 'John Doe'}
    )

    _parse_author_string_test(
        '<john.doe@foo.bar>',
        {'email': 'john.doe@foo.bar'}
    )

    _parse_author_string_test(
        '(https://john.doe)',
        {'url': 'https://john.doe'}
    )

    _parse_author_string_test(
        'John Doe <john.doe@foo.bar>',
        {'name': 'John Doe', 'email': 'john.doe@foo.bar'}
    )

    _parse_author_string_test(
        'John Doe<john.doe@foo.bar>',
        {'name': 'John Doe', 'email': 'john.doe@foo.bar'}
    )

    _parse_author_string_test(
        'John Doe (https://john.doe)',
        {'name': 'John Doe', 'url': 'https://john.doe'}
    )

    _parse_author_string_test(
        'John Doe(https://john.doe)',
        {'name': 'John Doe', 'url': 'https://john.doe'}
    )

    _parse_author_string_test(
        '<john.doe@foo.bar> (https://john.doe)',
        {'email': 'john.doe@foo.bar', 'url': 'https://john.doe'}
    )

    _parse_author_string_test(
        '(https://john.doe) <john.doe@foo.bar>',
        {'email': 'john.doe@foo.bar', 'url': 'https://john.doe'}
    )

    _parse_author_string_test(
        'John Doe <john.doe@foo.bar> (https://john.doe)',
        {
            'name': 'John Doe',
            'email': 'john.doe@foo.bar',
            'url': 'https://john.doe'
        }
    )

    _parse_author_string_test(
        'John Doe (https://john.doe) <john.doe@foo.bar>',
        {
            'name': 'John Doe',
            'email': 'john.doe@foo.bar',
            'url': 'https://john.doe'
        }
    )

    _parse_author_string_test(
        'John Doe<john.doe@foo.bar> (https://john.doe)',
        {
            'name': 'John Doe',
            'email': 'john.doe@foo.bar',
            'url': 'https://john.doe'
        }
    )

    _parse_author_string_test(
        'John Doe<john.doe@foo.bar>(https://john.doe)',
        {
            'name': 'John Doe',
            'email': 'john.doe@foo.bar',
            'url': 'https://john.doe'
        }
    )

    # empty placeholders are ignored
    _parse_author_string_test('', {})
    _parse_author_string_test('<>', {})
    _parse_author_string_test(' <>', {})
    _parse_author_string_test('<>()', {})
    _parse_author_string_test('<> ()', {})
    _parse_author_string_test('()', {})
    _parse_author_string_test(' ()', {})

    _parse_author_string_test(
        'John Doe <> ()',
        {'name': 'John Doe'}
    )

    _parse_author_string_test(
        'John Doe <>',
        {'name': 'John Doe'}
    )

    _parse_author_string_test(
        'John Doe ()',
        {'name': 'John Doe'}
    )


def test_extract_npm_package_author(datadir):
    """Authors are extracted from the various package.json layouts."""
    package_metadata_filepath = os.path.join(
        datadir, 'https_replicate.npmjs.com', 'org_visit1')

    with open(package_metadata_filepath) as json_file:
        package_metadata = json.load(json_file)

    # this comparison was previously evaluated without being asserted
    assert (
        extract_npm_package_author(package_metadata['versions']['0.0.2']) ==
        {
            'fullname': b'mooz <stillpedant@gmail.com>',
            'name': b'mooz',
            'email': b'stillpedant@gmail.com'
        }
    )

    assert (
        extract_npm_package_author(package_metadata['versions']['0.0.3']) ==
        {
            'fullname': b'Masafumi Oyamada <stillpedant@gmail.com>',
            'name': b'Masafumi Oyamada',
            'email': b'stillpedant@gmail.com'
        }
    )

    package_json = json.loads('''
    {
        "name": "highlightjs-line-numbers.js",
        "version": "2.7.0",
        "description": "Highlight.js line numbers plugin.",
        "main": "src/highlightjs-line-numbers.js",
        "dependencies": {},
        "devDependencies": {
            "gulp": "^4.0.0",
            "gulp-rename": "^1.4.0",
            "gulp-replace": "^0.6.1",
            "gulp-uglify": "^1.2.0"
        },
        "repository": {
            "type": "git",
            "url": "https://github.com/wcoder/highlightjs-line-numbers.js.git"
        },
        "author": "Yauheni Pakala <evgeniy.pakalo@gmail.com>",
        "license": "MIT",
        "bugs": {
            "url": "https://github.com/wcoder/highlightjs-line-numbers.js/issues"
        },
        "homepage": "http://wcoder.github.io/highlightjs-line-numbers.js/"
    }''')  # noqa

    assert extract_npm_package_author(package_json) == \
        {
            'fullname': b'Yauheni Pakala <evgeniy.pakalo@gmail.com>',
            'name': b'Yauheni Pakala',
            'email': b'evgeniy.pakalo@gmail.com'
        }

    package_json = json.loads('''
    {
        "name": "3-way-diff",
        "version": "0.0.1",
        "description": "3-way diffing of JavaScript objects",
        "main": "index.js",
        "authors": [
            {
                "name": "Shawn Walsh",
                "url": "https://github.com/shawnpwalsh"
            },
            {
                "name": "Markham F Rollins IV",
                "url": "https://github.com/mrollinsiv"
            }
        ],
        "keywords": [
            "3-way diff",
            "3 way diff",
            "three-way diff",
            "three way diff"
        ],
        "devDependencies": {
            "babel-core": "^6.20.0",
            "babel-preset-es2015": "^6.18.0",
            "mocha": "^3.0.2"
        },
        "dependencies": {
            "lodash": "^4.15.0"
        }
    }''')

    assert extract_npm_package_author(package_json) == \
        {
            'fullname': b'Shawn Walsh',
            'name': b'Shawn Walsh',
            'email': None
        }

    package_json = json.loads('''
    {
        "name": "yfe-ynpm",
        "version": "1.0.0",
        "homepage": "http://gitlab.ywwl.com/yfe/yfe-ynpm",
        "repository": {
            "type": "git",
            "url": "git@gitlab.ywwl.com:yfe/yfe-ynpm.git"
        },
        "author": [
            "fengmk2 <fengmk2@gmail.com> (https://fengmk2.com)",
            "xufuzi (https://7993.org)"
        ],
        "license": "MIT"
    }''')

    assert extract_npm_package_author(package_json) == \
        {
            'fullname': b'fengmk2 <fengmk2@gmail.com>',
            'name': b'fengmk2',
            'email': b'fengmk2@gmail.com'
        }

    package_json = json.loads('''
    {
        "name": "umi-plugin-whale",
        "version": "0.0.8",
        "description": "Internal contract component",
        "authors": {
            "name": "xiaohuoni",
            "email": "448627663@qq.com"
        },
        "repository": "alitajs/whale",
        "devDependencies": {
            "np": "^3.0.4",
            "umi-tools": "*"
        },
        "license": "MIT"
    }''')

    assert extract_npm_package_author(package_json) == \
        {
            'fullname': b'xiaohuoni <448627663@qq.com>',
            'name': b'xiaohuoni',
            'email': b'448627663@qq.com'
        }


def normalize_hashes(hashes):
    """Convert hex hashes (a str, a list or a dict of them) to bytes."""
    if isinstance(hashes, str):
        return hash_to_bytes(hashes)
    if isinstance(hashes, list):
        return [hash_to_bytes(x) for x in hashes]
    return {hash_to_bytes(k): hash_to_bytes(v) for k, v in hashes.items()}


_expected_new_contents_first_visit = normalize_hashes([
    '4ce3058e16ab3d7e077f65aabf855c34895bf17c',
    '858c3ceee84c8311adc808f8cdb30d233ddc9d18',
    '0fa33b4f5a4e0496da6843a38ff1af8b61541996',
    '85a410f8ef8eb8920f2c384a9555566ad4a2e21b',
    '9163ac8025923d5a45aaac482262893955c9b37b',
    '692cf623b8dd2c5df2c2998fd95ae4ec99882fb4',
    '18c03aac6d3e910efb20039c15d70ab5e0297101',
    '41265c42446aac17ca769e67d1704f99e5a1394d',
    '783ff33f5882813dca9239452c4a7cadd4dba778',
    'b029cfb85107aee4590c2434a3329bfcf36f8fa1',
    '112d1900b4c2e3e9351050d1b542c9744f9793f3',
    '5439bbc4bd9a996f1a38244e6892b71850bc98fd',
    'd83097a2f994b503185adf4e719d154123150159',
    'd0939b4898e83090ee55fd9d8a60e312cfadfbaf',
    'b3523a26f7147e4af40d9d462adaae6d49eda13e',
    'cd065fb435d6fb204a8871bcd623d0d0e673088c',
    '2854a40855ad839a54f4b08f5cff0cf52fca4399',
    'b8a53bbaac34ebb8c6169d11a4b9f13b05c583fe',
    '0f73d56e1cf480bded8a1ecf20ec6fc53c574713',
    '0d9882b2dfafdce31f4e77fe307d41a44a74cefe',
    '585fc5caab9ead178a327d3660d35851db713df1',
    'e8cd41a48d79101977e3036a87aeb1aac730686f',
    '5414efaef33cceb9f3c9eb5c4cc1682cd62d14f7',
    '9c3cc2763bf9e9e37067d3607302c4776502df98',
    '3649a68410e354c83cd4a38b66bd314de4c8f5c9',
    'e96ed0c091de1ebdf587104eaf63400d1974a1fe',
    '078ca03d2f99e4e6eab16f7b75fbb7afb699c86c',
    '38de737da99514de6559ff163c988198bc91367a',
])

_expected_new_directories_first_visit = normalize_hashes([
    '3370d20d6f96dc1c9e50f083e2134881db110f4f',
    '42753c0c2ab00c4501b552ac4671c68f3cf5aece',
    'd7895533ef5edbcffdea3f057d9fef3a1ef845ce',
    '80579be563e2ef3e385226fe7a3f079b377f142c',
    '3b0ddc6a9e58b4b53c222da4e27b280b6cda591c',
    'bcad03ce58ac136f26f000990fc9064e559fe1c0',
    '5fc7e82a1bc72e074665c6078c6d3fad2f13d7ca',
    'e3cd26beba9b1e02f6762ef54bd9ac80cc5f25fd',
    '584b5b4b6cf7f038095e820b99386a9c232de931',
    '184c8d6d0d242f2b1792ef9d3bf396a5434b7f7a',
    'bb5f4ee143c970367eb409f2e4c1104898048b9d',
    '1b95491047add1103db0dfdfa84a9735dcb11e88',
    'a00c6de13471a2d66e64aca140ddb21ef5521e62',
    '5ce6c1cd5cda2d546db513aaad8c72a44c7771e2',
    'c337091e349b6ac10d38a49cdf8c2401ef9bb0f2',
    '202fafcd7c0f8230e89d5496ad7f44ab12b807bf',
    '775cc516543be86c15c1dc172f49c0d4e6e78235',
    'ff3d1ead85a14f891e8b3fa3a89de39db1b8de2e',
])

_expected_new_revisions_first_visit = normalize_hashes({
    'd8a1c7474d2956ac598a19f0f27d52f7015f117e':
    '42753c0c2ab00c4501b552ac4671c68f3cf5aece',
    '5f9eb78af37ffd12949f235e86fac04898f9f72a':
    '3370d20d6f96dc1c9e50f083e2134881db110f4f',
    'ba019b192bdb94bd0b5bd68b3a5f92b5acc2239a':
    'd7895533ef5edbcffdea3f057d9fef3a1ef845ce'}
)


def package_url(package):
    """Return the origin url used for ``package`` in these tests."""
    return 'https://www.npmjs.com/package/%s' % package


def package_metadata_url(package):
    """Return the registry metadata url used for ``package``."""
    return 'https://replicate.npmjs.com/%s/' % package


def test_revision_metadata_structure(swh_config, requests_mock_datadir):
    """A loaded revision carries the expected metadata entries."""
    package = 'org'
    loader = NpmLoader(package,
                       package_url(package),
                       package_metadata_url(package))

    actual_load_status = loader.load()
    assert actual_load_status['status'] == 'eventful'
    assert actual_load_status['snapshot_id'] is not None

    expected_revision_id = hash_to_bytes(
        'd8a1c7474d2956ac598a19f0f27d52f7015f117e')
    revision = list(loader.storage.revision_get([expected_revision_id]))[0]
    assert revision is not None

    check_metadata_paths(revision['metadata'], paths=[
        ('intrinsic.tool', str),
        ('intrinsic.raw', dict),
        ('extrinsic.provider', str),
        ('extrinsic.when', str),
        ('extrinsic.raw', dict),
        ('original_artifact', list),
    ])

    for original_artifact in revision['metadata']['original_artifact']:
        check_metadata_paths(original_artifact, paths=[
            ('filename', str),
            ('length', int),
            ('checksums', dict),
        ])


def test_npm_loader_first_visit(swh_config, requests_mock_datadir):
    """A first visit loads every expected object and snapshot branch."""
    package = 'org'
    loader = NpmLoader(package,
                       package_url(package),
                       package_metadata_url(package))

    actual_load_status = loader.load()
    expected_snapshot_id = 'd0587e1195aed5a8800411a008f2f2d627f18e2d'
    assert actual_load_status == {
        'status': 'eventful',
        'snapshot_id': expected_snapshot_id
    }

    stats = get_stats(loader.storage)

    assert {
        'content': len(_expected_new_contents_first_visit),
        'directory': len(_expected_new_directories_first_visit),
        'origin': 1,
        'origin_visit': 1,
        'person': 2,
        'release': 0,
        'revision': len(_expected_new_revisions_first_visit),
        'skipped_content': 0,
        'snapshot': 1,
    } == stats

    assert len(list(loader.storage.content_get(
        _expected_new_contents_first_visit))) == len(
            _expected_new_contents_first_visit)

    assert list(loader.storage.directory_missing(
        _expected_new_directories_first_visit)) == []

    assert list(loader.storage.revision_missing(
        _expected_new_revisions_first_visit)) == []

    expected_snapshot = {
        'id': expected_snapshot_id,
        'branches': {
            'HEAD': {
                'target': 'releases/0.0.4',
                'target_type': 'alias'
            },
            'releases/0.0.2': {
                'target': 'd8a1c7474d2956ac598a19f0f27d52f7015f117e',
                'target_type': 'revision'
            },
            'releases/0.0.3': {
                'target': '5f9eb78af37ffd12949f235e86fac04898f9f72a',
                'target_type': 'revision'
            },
            'releases/0.0.4': {
                'target': 'ba019b192bdb94bd0b5bd68b3a5f92b5acc2239a',
                'target_type': 'revision'
            }
        }
    }
    check_snapshot(expected_snapshot, loader.storage)


def test_npm_loader_incremental_visit(
        swh_config, requests_mock_datadir_visits):
    """A second visit only adds the objects of the new releases."""
    package = 'org'
    url = package_url(package)
    metadata_url = package_metadata_url(package)
    loader = NpmLoader(package, url, metadata_url)

    actual_load_status = loader.load()
    assert actual_load_status['status'] == 'eventful'
    # was a redundant second check of 'status'
    assert actual_load_status['snapshot_id'] is not None
    origin_visit = list(loader.storage.origin_visit_get(url))[-1]
    assert origin_visit['status'] == 'full'
    assert origin_visit['type'] == 'npm'

    stats = get_stats(loader.storage)

    assert {
        'content': len(_expected_new_contents_first_visit),
        'directory': len(_expected_new_directories_first_visit),
        'origin': 1,
        'origin_visit': 1,
        'person': 2,
        'release': 0,
        'revision': len(_expected_new_revisions_first_visit),
        'skipped_content': 0,
        'snapshot': 1,
    } == stats

    loader._info = None  # reset loader internal state
    actual_load_status2 = loader.load()
    assert actual_load_status2['status'] == 'eventful'
    snap_id2 = actual_load_status2['snapshot_id']
    assert snap_id2 is not None
    assert snap_id2 != actual_load_status['snapshot_id']

    origin_visit2 = list(loader.storage.origin_visit_get(url))[-1]
    assert origin_visit2['status'] == 'full'
    assert origin_visit2['type'] == 'npm'

    stats = get_stats(loader.storage)

    assert {  # 3 new release artifacts
        'content': len(_expected_new_contents_first_visit) + 14,
        'directory': len(_expected_new_directories_first_visit) + 15,
        'origin': 1,
        'origin_visit': 2,
        'person': 2,
        'release': 0,
        'revision': len(_expected_new_revisions_first_visit) + 3,
        'skipped_content': 0,
        'snapshot': 2,
    } == stats

    urls = [
        m.url for m in requests_mock_datadir_visits.request_history
        if m.url.startswith('https://registry.npmjs.org')
    ]
    # no artifact url was requested twice across the two visits
    assert len(urls) == len(set(urls))


@pytest.mark.usefixtures('requests_mock_datadir')
def test_npm_loader_version_divergence(swh_config):
    """Loading succeeds for the ``@aller_shared`` fixture package."""
    package = '@aller_shared'
    url = package_url(package)
    loader = NpmLoader(package, url, package_metadata_url(package))

    actual_load_status = loader.load()
    assert actual_load_status['status'] == 'eventful'
    # was a redundant second check of 'status'
    assert actual_load_status['snapshot_id'] is not None
    origin_visit = list(loader.storage.origin_visit_get(url))[-1]
    assert origin_visit['status'] == 'full'
    assert origin_visit['type'] == 'npm'

    stats = get_stats(loader.storage)

    assert {  # 2 release artifacts, one revision each
        'content': 534,
        'directory': 153,
        'origin': 1,
        'origin_visit': 1,
        'person': 1,
        'release': 0,
        'revision': 2,
        'skipped_content': 0,
        'snapshot': 1,
    } == stats

    expected_snapshot = {
        'id': 'b11ebac8c9d0c9e5063a2df693a18e3aba4b2f92',
        'branches': {
            'HEAD': {
                'target_type': 'alias',
                'target': 'releases/0.1.0'
            },
            'releases/0.1.0': {
                'target_type': 'revision',
                'target': '845673bfe8cbd31b1eaf757745a964137e6f9116',
            },
            'releases/0.1.1-alpha.14': {
                'target_type': 'revision',
                'target': '05181c12cd8c22035dd31155656826b85745da37',
            },
        },
    }
    check_snapshot(expected_snapshot, loader.storage)