diff --git a/requirements-swh.txt b/requirements-swh.txt
index 9fc5250..185434a 100644
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -1,2 +1,3 @@
 swh.model >= 0.0.18
 swh.storage >= 0.0.133
+swh.deposit
diff --git a/swh/loader/package/deposit.py b/swh/loader/package/deposit.py
new file mode 100644
index 0000000..3e93d11
--- /dev/null
+++ b/swh/loader/package/deposit.py
@@ -0,0 +1,80 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from typing import Generator, Dict, Tuple, Sequence
+
+from swh.loader.package.loader import PackageLoader
+from swh.deposit.client import PrivateApiDepositClient as ApiClient
+
+
+class DepositLoader(PackageLoader):
+    """Load a deposited artifact into the swh archive.
+
+    """
+    visit_type = 'deposit'
+
+    def __init__(self, url: str, deposit_id: str):
+        """Constructor
+
+        Args:
+            url: Origin url to associate the artifacts/metadata to
+            deposit_id: Deposit identifier
+
+        """
+        super().__init__(url=url)
+
+        # For now, build back the existing private api urls
+        # archive_url: Private api url to retrieve the archive artifact
+        self.archive_url = '/%s/raw/' % deposit_id
+        # metadata_url: Private api url to retrieve the deposit metadata
+        self.metadata_url = '/%s/meta/' % deposit_id
+        # deposit_update_url: Private api url to push pids and status updates
+        # on the deposit id
+        self.deposit_update_url = '/%s/update/' % deposit_id
+        self.client = ApiClient()
+
+    def get_versions(self) -> Sequence[str]:
+        # only 1 branch 'HEAD' with no alias since we only have 1 snapshot
+        # branch
+        return ['HEAD']
+
+    def get_artifacts(self, version: str) -> Generator[
+            Tuple[str, str, Dict], None, None]:
+        meta = self.client.metadata_get(self.metadata_url)
+        filename = 'archive.zip'  # do not care about it here
+        url = self.client.base_url + self.archive_url
+        yield filename, url, meta
+
+    def build_revision(
+            self, a_metadata: Dict, a_uncompressed_path: str,
+            visit_date: str) -> Dict:
+        revision = a_metadata.pop('revision')
+        metadata = {
+            'extrinsic': {
+                'provider': '%s/%s' % (
+                    self.client.base_url, self.metadata_url),
+                'when': visit_date,
+                'raw': a_metadata,
+            },
+        }
+
+        # FIXME: the deposit no longer needs to build the revision
+        revision['metadata'].update(metadata)
+        revision['author'] = parse_author(revision['author'])
+        revision['committer'] = parse_author(revision['committer'])
+        revision['message'] = revision['message'].encode('utf-8')
+
+        return revision
+
+
+def parse_author(author):
+    """Convert an author dict (str values) to bytes (cf. FIXME above)
+
+    """
+    return {
+        'fullname': author['fullname'].encode('utf-8'),
+        'name': author['name'].encode('utf-8'),
+        'email': author['email'].encode('utf-8'),
+    }
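
For illustration, a minimal sketch of driving this loader by hand (assuming SWH_CONFIG_FILENAME points at a loader configuration with a storage entry and that the deposit private api credentials are configured for PrivateApiDepositClient; the config path, origin url and deposit id below are made up):

    import os

    # The PackageLoader constructor reads its configuration from the file
    # named by this environment variable (see loader.py below).
    os.environ['SWH_CONFIG_FILENAME'] = '/etc/softwareheritage/loader.yml'  # assumed path

    from swh.loader.package.deposit import DepositLoader

    loader = DepositLoader(
        url='https://hal-test.archives-ouvertes.fr/some-external-id',
        deposit_id='666')
    result = loader.load()
    # {'status': 'eventful'} when new objects were archived,
    # {'status': 'uneventful'} otherwise; errors leave a 'partial' visit.
    print(result)
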
diff --git a/swh/loader/package/loader.py b/swh/loader/package/loader.py
index 7686ab6..7cc8c87 100644
--- a/swh/loader/package/loader.py
+++ b/swh/loader/package/loader.py
@@ -1,353 +1,356 @@
 # Copyright (C) 2019 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import datetime
 import logging
 import tempfile
 import os

 from typing import Generator, Dict, Tuple, Sequence, List, Optional

 from swh.core.tarball import uncompress
 from swh.core.config import SWHConfig
 from swh.model.from_disk import Directory
 from swh.model.identifiers import (
     revision_identifier, snapshot_identifier, identifier_to_bytes
 )
 from swh.storage import get_storage
 from swh.storage.algos.snapshot import snapshot_get_all_branches
 from swh.loader.core.converters import content_for_storage
 from swh.loader.package.utils import download


 logger = logging.getLogger(__name__)


 # Not implemented yet:
 # - clean up disk routines from previous killed workers (when OOMkilled)
 #   -> separation of concern would like this to be abstracted from the code
 #   -> experience tells us it's complicated to do as such (T903, T964, T982,
 #      etc...)
 #
 # - model: swh.model.merkle.from_disk should output swh.model.model.* objects
 #   to avoid this layer's conversion routine call
 #   -> Take this up within swh.model's current implementation


 class PackageLoader:
     # Origin visit type (str) set by the loader
     visit_type = ''

     def __init__(self, url):
         """Loader's constructor. This raises exception if the minimal required
            configuration is missing (cf. fn:`check` method).

         Args:
             url (str): Origin url to load data from

         """
         # This expects to use the environment variable SWH_CONFIG_FILENAME
         self.config = SWHConfig.parse_config_file()
         self._check_configuration()
         self.storage = get_storage(**self.config['storage'])
         self.url = url

     def _check_configuration(self):
         """Checks the minimal configuration required is set for the loader.

         If some required configuration is missing, exception detailing the
         issue is raised.

         """
         if 'storage' not in self.config:
             raise ValueError(
                 'Misconfiguration, at least the storage key should be set')

     def get_versions(self) -> Sequence[str]:
         """Return the list of all published package versions.

         Returns:
             Sequence of published versions

         """
         return []

     def get_artifacts(self, version: str) -> Generator[
             Tuple[str, str, Dict], None, None]:
         """Given a release version of a package, retrieve the associated
            artifact information for such version.

         Args:
             version: Package version

         Returns:
             (artifact filename, artifact uri, raw artifact metadata)

         """
         yield from {}

     def build_revision(
             self, a_metadata: Dict, a_uncompressed_path: str) -> Dict:
         """Build the revision dict

         Returns:
             SWH data dict

         """
         return {}

     def get_default_release(self) -> str:
         """Retrieve the latest release version

         Returns:
             Latest version

         """
         return ''

     def last_snapshot(self) -> Optional[Dict]:
         """Retrieve the last snapshot

         """
         visit = self.storage.origin_visit_get_latest(
             self.url, require_snapshot=True)
         if visit:
             return snapshot_get_all_branches(
                 self.storage, visit['snapshot']['id'])

     def known_artifacts(self, snapshot: Dict) -> [Dict]:
         """Retrieve the known releases/artifact for the origin.

         Args
             snapshot: snapshot for the visit

         Returns:
             Dict of keys revision id (bytes), values a metadata Dict.

         """
         if not snapshot or 'branches' not in snapshot:
             return {}

         # retrieve only revisions (e.g the alias we do not want here)
         revs = [rev['target']
                 for rev in snapshot['branches'].values()
                 if rev and rev['target_type'] == 'revision']
         known_revisions = self.storage.revision_get(revs)

         ret = {}
         for revision in known_revisions:
             if not revision:  # revision_get can return None
                 continue
             original_artifact = revision['metadata'].get('original_artifact')
             if original_artifact:
                 ret[revision['id']] = original_artifact
         return ret

     def resolve_revision_from(
             self, known_artifacts: Dict, artifact_metadata: Dict) \
             -> Optional[bytes]:
         """Resolve the revision from a snapshot and an artifact metadata dict.

         If the artifact has already been downloaded, this will return the
         existing revision targeting that uncompressed artifact directory.
         Otherwise, this returns None.
         Args:
             snapshot: Snapshot
             artifact_metadata: Information dict

         Returns:
             None or revision identifier

         """
         return None

     def load(self) -> Dict:
         """Load for a specific origin the associated contents.

         for each package version of the origin

         1. Fetch the files for one package version By default, this can be
            implemented as a simple HTTP request. Loaders with more specific
            requirements can override this, e.g.: the PyPI loader checks the
            integrity of the downloaded files; the Debian loader has to
            download and check several files for one package version.

         2. Extract the downloaded files By default, this would be a
            universal archive/tarball extraction.

            Loaders for specific formats can override this method (for
            instance, the Debian loader uses dpkg-source -x).

         3. Convert the extracted directory to a set of Software Heritage
            objects Using swh.model.from_disk.

         4. Extract the metadata from the unpacked directories This would
            only be applicable for "smart" loaders like npm (parsing the
            package.json), PyPI (parsing the PKG-INFO file) or Debian
            (parsing debian/changelog and debian/control).

            On "minimal-metadata" sources such as the GNU archive, the lister
            should provide the minimal set of metadata needed to populate the
            revision/release objects (authors, dates) as an argument to the
            task.

         5. Generate the revision/release objects for the given version. From
            the data generated at steps 3 and 4.

         end for each

         6. Generate and load the snapshot for the visit

         Using the revisions/releases collected at step 5., and the branch
         information from step 0., generate a snapshot and load it into the
         Software Heritage archive

         """
         status_load = 'uneventful'  # either: eventful, uneventful, failed
         status_visit = 'full'       # either: partial, full
         tmp_revisions: Dict[str, List] = {}
         snapshot = None

         try:
             # Prepare origin and origin_visit
             origin = {'url': self.url}
             self.storage.origin_add([origin])
             visit_date = datetime.datetime.now(tz=datetime.timezone.utc)
             visit_id = self.storage.origin_visit_add(
                 origin=self.url,
                 date=visit_date,
                 type=self.visit_type)['visit']
             last_snapshot = self.last_snapshot()
             logger.debug('last snapshot: %s', last_snapshot)
             known_artifacts = self.known_artifacts(last_snapshot)
             logger.debug('known artifacts: %s', known_artifacts)

             # Retrieve the default release (the "latest" one)
             default_release = self.get_default_release()
             logger.debug('default release: %s', default_release)

             for version in self.get_versions():  # for each
                 logger.debug('version: %s', version)
                 tmp_revisions[version] = []
                 # `a_` stands for `artifact_`
                 for a_filename, a_uri, a_metadata in self.get_artifacts(
                         version):
                     logger.debug('a_metadata: %s', a_metadata)
                     revision_id = self.resolve_revision_from(
                         known_artifacts, a_metadata)
                     if revision_id is None:
                         with tempfile.TemporaryDirectory() as tmpdir:
                             try:
                                 # a_c_: archive_computed_
                                 a_path, a_c_metadata = download(
-                                    a_uri, dest=tmpdir)
-                            except Exception as e:
-                                logger.warning(
-                                    'Unable to retrieve %s. Reason: %s',
-                                    a_uri, e)
+                                    a_uri, dest=tmpdir, filename=a_filename)
+                            except Exception:
+                                logger.exception('Unable to retrieve %s',
+                                                 a_uri)
                                 status_visit = 'partial'
                                 continue

                             logger.debug('archive_path: %s', a_path)
                             logger.debug('archive_computed_metadata: %s',
                                          a_c_metadata)

                             uncompressed_path = os.path.join(tmpdir, 'src')
                             uncompress(a_path, dest=uncompressed_path)
                             logger.debug('uncompressed_path: %s',
                                          uncompressed_path)

                             directory = Directory.from_disk(
                                 path=uncompressed_path.encode('utf-8'),
                                 data=True)  # noqa
                             # FIXME: Try not to load the full raw content in
                             # memory
                             objects = directory.collect()

                             contents = objects['content'].values()
                             logger.debug('Number of contents: %s',
                                          len(contents))

                             self.storage.content_add(
                                 map(content_for_storage, contents))

                             status_load = 'eventful'
                             directories = objects['directory'].values()
                             logger.debug('Number of directories: %s',
                                          len(directories))

                             self.storage.directory_add(directories)

                             # FIXME: This should be release. cf. D409
                             revision = self.build_revision(
                                 a_metadata, uncompressed_path,
                                 visit_date.isoformat())
                             revision.update({
                                 'type': 'tar',
                                 'synthetic': True,
                                 'directory': directory.hash,
                             })
                             revision['metadata'].update({
                                 'original_artifact': a_c_metadata,
                             })

                             revision['id'] = revision_id = identifier_to_bytes(
                                 revision_identifier(revision))
                             logger.debug('Revision: %s', revision)
                             self.storage.revision_add([revision])

                     tmp_revisions[version].append({
                         'filename': a_filename,
                         'target': revision_id,
                     })

             # Build and load the snapshot
             branches = {}
             for version, v_branches in tmp_revisions.items():
                 if len(v_branches) == 1:
-                    branch_name = ('releases/%s' % version).encode('utf-8')
+                    branch_name = (
+                        version if version == 'HEAD'
+                        else 'releases/%s' % version).encode('utf-8')
                     if version == default_release:
                         branches[b'HEAD'] = {
                             'target_type': 'alias',
                             'target': branch_name,
                         }

                     branches[branch_name] = {
                         'target_type': 'revision',
                         'target': v_branches[0]['target'],
                     }
                 else:
                     for x in v_branches:
                         branch_name = ('releases/%s/%s' % (
                             version, v_branches['filename'])).encode('utf-8')
                         branches[branch_name] = {
                             'target_type': 'revision',
                             'target': x['target'],
                         }
             snapshot = {
                 'branches': branches
             }
+            logger.debug('snapshot: %s', snapshot)
+
             snapshot['id'] = identifier_to_bytes(
                 snapshot_identifier(snapshot))

             logger.debug('snapshot: %s', snapshot)
             self.storage.snapshot_add([snapshot])
             if hasattr(self.storage, 'flush'):
                 self.storage.flush()
-        except Exception as e:
-            logger.warning('Fail to load %s. Reason: %s' % (self.url, e))
+        except Exception:
+            logger.exception('Fail to load %s' % self.url)
             status_visit = 'partial'
         finally:
             self.storage.origin_visit_update(
                 origin=self.url, visit_id=visit_id, status=status_visit,
                 snapshot=snapshot)

         return {'status': status_load}
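
The branch naming tweak above means the single deposit version ('HEAD') yields a branch named HEAD directly, with no 'releases/' prefix and no alias branch, since get_default_release() stays empty for deposits. A sketch of the resulting snapshot shape, with a placeholder revision id:

    snapshot = {
        'branches': {
            b'HEAD': {
                'target_type': 'revision',
                'target': b'\x00' * 20,  # placeholder: the deposit revision id
            },
        },
    }

Other package loaders keep the 'releases/<version>' naming plus a b'HEAD' alias pointing at the default release.
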
diff --git a/swh/loader/package/tests/resources/deposit.softwareheritage.org/1_private_666_meta b/swh/loader/package/tests/resources/deposit.softwareheritage.org/1_private_666_meta
new file mode 120000
index 0000000..1c75198
--- /dev/null
+++ b/swh/loader/package/tests/resources/deposit.softwareheritage.org/1_private_666_meta
@@ -0,0 +1 @@
+hello_2.10.json
\ No newline at end of file
diff --git a/swh/loader/package/tests/resources/deposit.softwareheritage.org/1_private_666_raw b/swh/loader/package/tests/resources/deposit.softwareheritage.org/1_private_666_raw
new file mode 120000
index 0000000..51b4a7a
--- /dev/null
+++ b/swh/loader/package/tests/resources/deposit.softwareheritage.org/1_private_666_raw
@@ -0,0 +1 @@
+hello_2.10.orig.tar.gz
\ No newline at end of file
diff --git a/swh/loader/package/tests/resources/deposit.softwareheritage.org/hello_2.10.json b/swh/loader/package/tests/resources/deposit.softwareheritage.org/hello_2.10.json
new file mode 100644
index 0000000..cab9c3b
--- /dev/null
+++ b/swh/loader/package/tests/resources/deposit.softwareheritage.org/hello_2.10.json
@@ -0,0 +1,80 @@
+{
+    "origin": {
+        "url": "https://hal-test.archives-ouvertes.fr/some-external-id",
+        "type": "deposit"
+    },
+    "origin_metadata": {
+        "metadata": {
+            "@xmlns": [
+                "http://www.w3.org/2005/Atom"
+            ],
+            "author": [
+                "some awesome author",
+                "another one",
+                "no one"
+            ],
+            "codemeta:dateCreated": "2017-10-07T15:17:08Z",
+            "external_identifier": "some-external-id",
+            "url": "https://hal-test.archives-ouvertes.fr/some-external-id"
+        },
+        "provider": {
+            "provider_name": "hal",
+            "provider_type": "deposit_client",
+            "provider_url": "https://hal-test.archives-ouvertes.fr/",
+            "metadata": null
+        },
+        "tool": {
+            "name": "swh-deposit",
+            "version": "0.0.1",
+            "configuration": {
+                "sword_version": "2"
+            }
+        }
+    },
+    "revision": {
+        "synthetic": true,
+        "committer_date": {
+            "timestamp": {
+                "seconds": 1507389428,
+                "microseconds": 0
+            },
+            "offset": 0,
+            "negative_utc": false
+        },
+        "message": "hal: Deposit 666 in collection hal",
+        "author": {
+            "name": "Software Heritage",
+            "fullname": "Software Heritage",
+            "email": "robot@softwareheritage.org"
+        },
+        "committer": {
+            "name": "Software Heritage",
+            "fullname": "Software Heritage",
+            "email": "robot@softwareheritage.org"
+        },
+        "date": {
+            "timestamp": {
+                "seconds": 1507389428,
+                "microseconds": 0
+            },
+            "offset": 0,
+            "negative_utc": false
+        },
+        "metadata": {
+            "@xmlns": [
+                "http://www.w3.org/2005/Atom"
+            ],
+            "author": [
+                "some awesome author",
+                "another one",
+                "no one"
+            ],
+            "external_identifier": "some-external-id",
+            "codemeta:dateCreated": "2017-10-07T15:17:08Z",
+            "url": "https://hal-test.archives-ouvertes.fr/some-external-id"
+        },
+        "type": "tar",
+        "parents": []
+    },
+    "branch_name": "master"
+}
diff --git a/swh/loader/package/tests/resources/deposit.softwareheritage.org/hello_2.10.orig.tar.gz b/swh/loader/package/tests/resources/deposit.softwareheritage.org/hello_2.10.orig.tar.gz
new file mode 100644
index 0000000..cae6b33
Binary files /dev/null and b/swh/loader/package/tests/resources/deposit.softwareheritage.org/hello_2.10.orig.tar.gz differ
diff --git a/swh/loader/package/tests/resources/loader.yml b/swh/loader/package/tests/resources/loader.yml
index a2fafc3..177c338 100644
--- a/swh/loader/package/tests/resources/loader.yml
+++ b/swh/loader/package/tests/resources/loader.yml
@@ -1,14 +1,16 @@
 storage:
   cls: filter
   args:
     storage:
       cls: buffer
       args:
         storage:
           cls: memory
           args: {}
         thresholds:
           content: 5
           content_bytes: 100
           directory: 5
           revision: 5
+
+url: https://deposit.softwareheritage.org/1/private
diff --git a/swh/loader/package/tests/test_deposit.py b/swh/loader/package/tests/test_deposit.py
new file mode 100644
index 0000000..826677e
--- /dev/null
+++ b/swh/loader/package/tests/test_deposit.py
@@ -0,0 +1,77 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.loader.package.deposit import DepositLoader
+
+
+def test_deposit_init_ok(swh_config):
+    url = 'some-url'
+    deposit_id = 999
+    loader = DepositLoader(url, deposit_id)  # Something that does not exist
+
+    assert loader.url == url
+    assert loader.archive_url == '/%s/raw/' % deposit_id
+    assert loader.metadata_url == '/%s/meta/' % deposit_id
+    assert loader.deposit_update_url == '/%s/update/' % deposit_id
+    assert loader.client is not None
+
+
+def test_deposit_loading_failure_to_retrieve_artifact(swh_config):
+    """An error when fetching the artifact ends the visit as partial
+
+    """
+    # private api url form: 'https://deposit.s.o/1/private/hal/666/raw/'
+    url = 'some-url'
+    unknown_deposit_id = 666
+    loader = DepositLoader(url, unknown_deposit_id)  # does not exist
+
+    assert loader.archive_url
+    actual_load_status = loader.load()
+
+    assert actual_load_status == {'status': 'uneventful'}
+
+    stats = loader.storage.stat_counters()
+
+    assert {
+        'content': 0,
+        'directory': 0,
+        'origin': 1,
+        'origin_visit': 1,
+        'person': 0,
+        'release': 0,
+        'revision': 0,
+        'skipped_content': 0,
+        'snapshot': 0,
+    } == stats
+
+    origin_visit = next(loader.storage.origin_visit_get(url))
+    assert origin_visit['status'] == 'partial'
+
+
+def test_deposit_loading_ok(swh_config, local_get):
+    url = 'https://hal-test.archives-ouvertes.fr/some-external-id'
+    deposit_id = 666
+    loader = DepositLoader(url, deposit_id)
+
+    assert loader.archive_url
+    actual_load_status = loader.load()
+
+    assert actual_load_status == {'status': 'eventful'}
+
+    stats = loader.storage.stat_counters()
+    assert {
+        'content': 303,
+        'directory': 12,
+        'origin': 1,
+        'origin_visit': 1,
+        'person': 1,
+        'release': 0,
+        'revision': 1,
+        'skipped_content': 0,
+        'snapshot': 1,
+    } == stats
+
+    origin_visit = next(loader.storage.origin_visit_get(url))
+    assert origin_visit['status'] == 'full'
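
The swh_config fixture used by these tests lives in the test suite's conftest; a hypothetical stand-in, just to show what the tests rely on (pointing SWH_CONFIG_FILENAME at the resources/loader.yml above):

    import os

    import pytest

    RESOURCES = os.path.join(os.path.dirname(__file__), 'resources')


    @pytest.fixture
    def swh_config(monkeypatch):
        # Hypothetical sketch: loaders read their configuration (in-memory
        # buffered/filtered storage, deposit url) from this file.
        conffile = os.path.join(RESOURCES, 'loader.yml')
        monkeypatch.setenv('SWH_CONFIG_FILENAME', conffile)
        return conffile
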
diff --git a/swh/loader/package/utils.py b/swh/loader/package/utils.py
index fa10566..920964d 100644
--- a/swh/loader/package/utils.py
+++ b/swh/loader/package/utils.py
@@ -1,101 +1,105 @@
 # Copyright (C) 2019 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import logging
 import os
 import requests

-from typing import Dict, Tuple
+from typing import Dict, Optional, Tuple

 from swh.model.hashutil import MultiHash, HASH_BLOCK_SIZE
 from swh.loader.package import DEFAULT_PARAMS


 logger = logging.getLogger(__name__)


 def api_info(url: str) -> Dict:
     """Basic api client to retrieve information on project. This deals with
        fetching json metadata about pypi projects.

     Args:
         url (str): The api url (e.g PyPI, npm, etc...)

     Raises:
         ValueError in case of query failures (for some reasons: 404, ...)

     Returns:
         The associated response's information dict

     """
     response = requests.get(url, **DEFAULT_PARAMS)
     if response.status_code != 200:
         raise ValueError("Fail to query '%s'. Reason: %s" % (
             url, response.status_code))
     return response.json()


-def download(url: str, dest: str, hashes: Dict = {}) -> Tuple[str, Dict]:
+def download(url: str, dest: str, hashes: Dict = {},
+             filename: Optional[str] = None) -> Tuple[str, Dict]:
     """Download a remote tarball from url, uncompresses and computes swh hashes
        on it.

     Args:
         url: Artifact uri to fetch, uncompress and hash
         dest: Directory to write the archive to
         hashes: Dict of expected hashes (key is the hash algo) for the
             artifact to download (those hashes are expected to be hex string)

     Raises:
         ValueError in case of any error when fetching/computing (length,
         checksums mismatched...)

     Returns:
         Tuple of local (filepath, hashes of filepath)

     """
     response = requests.get(url, **DEFAULT_PARAMS, stream=True)
+    logger.debug('headers: %s', response.headers)
     if response.status_code != 200:
         raise ValueError("Fail to query '%s'. Reason: %s" % (
             url, response.status_code))
     length = int(response.headers['content-length'])

-    filename = os.path.basename(url)
+    filename = filename if filename else os.path.basename(url)
+    logger.debug('filename: %s', filename)
     filepath = os.path.join(dest, filename)
+    logger.debug('filepath: %s', filepath)

     h = MultiHash(length=length)
     with open(filepath, 'wb') as f:
         for chunk in response.iter_content(chunk_size=HASH_BLOCK_SIZE):
             h.update(chunk)
             f.write(chunk)

     actual_length = os.path.getsize(filepath)
     if length != actual_length:
         raise ValueError('Error when checking size: %s != %s' % (
             length, actual_length))

     # Also check the expected hashes if provided
     if hashes:
         actual_hashes = h.hexdigest()
         for algo_hash in hashes.keys():
             actual_digest = actual_hashes[algo_hash]
             expected_digest = hashes[algo_hash]
             if actual_digest != expected_digest:
                 raise ValueError(
                     'Failure when fetching %s. '
                     'Checksum mismatched: %s != %s' % (
                         url, expected_digest, actual_digest))

     extrinsic_metadata = {
         'length': length,
         'filename': filename,
         'checksums': {
             **h.hexdigest()
         },
     }
     logger.debug('extrinsic_metadata', extrinsic_metadata)

     return filepath, extrinsic_metadata
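
A rough usage sketch of the extended download() helper: the deposit raw url ends with a trailing slash, so os.path.basename(url) would yield an empty filename; passing filename explicitly is what the new argument is for (the url below is illustrative):

    import tempfile

    from swh.loader.package.utils import download

    with tempfile.TemporaryDirectory() as tmpdir:
        filepath, extrinsic = download(
            'https://deposit.softwareheritage.org/1/private/666/raw/',
            dest=tmpdir,
            filename='archive.zip')
        # extrinsic holds the artifact's 'length', 'filename' and the hex
        # 'checksums' computed by MultiHash while streaming to disk.
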