diff --git a/README.md b/README.md
index 737c01d..38457ab 100644
--- a/README.md
+++ b/README.md
@@ -1,102 +1,109 @@
swh-loader-pypi
====================

SWH PyPI loader's source code repository

# What does the loader do?

The PyPI loader visits and loads a PyPI project [1].

Each visit will result in:
- 1 snapshot (which targets n revisions; 1 per release artifact)
- 1 revision (which targets 1 directory; the release artifact uncompressed)

[1] https://pypi.org/help/#packages

## First visit

Given a PyPI project (origin), the loader, for the first visit:

- retrieves information for the given project (including releases)
- then, for each associated release:
  - for each associated source distribution (type 'sdist') release
    artifact (possibly many per release):
    - retrieves the associated artifact archive (with checks)
    - uncompresses the archive locally
    - computes the hashes of the uncompressed directory
    - then creates a revision (using the PKG-INFO metadata file)
      targeting that directory
- finally, creates a snapshot targeting all seen revisions
  (uncompressed PyPI artifact and metadata).

## Next visit

The loader starts by checking if something changed since the last visit.

If nothing changed, the visit's snapshot is left unchanged. The new visit
targets the same snapshot.

If something changed, the already seen release artifacts are skipped.
Only the new ones are loaded. In the end, the loader creates a new
snapshot based on the previous one. Thus, the new snapshot targets both
the old and new PyPI release artifacts.

## Terminology

- 1 project: a PyPI project (used as swh origin). This is a collection
  of releases.
- 1 release: a specific version of the (PyPI) project. It's a collection
  of information and associated source release artifacts (type 'sdist').
- 1 release artifact: a source release artifact (distributed by a PyPI
  maintainer). In swh, we are specifically interested in the 'sdist'
  type (source code).

## Edge cases

- If no release provides release artifacts, those are skipped.
- If a release artifact holds no PKG-INFO file (at the root of the
  archive), the release artifact is skipped.
- If a problem occurs during a fetch action (e.g. release artifact
  download), the load fails and the visit is marked as 'partial'.

# Development

## Configuration file

### Location

Either:
- /etc/softwareheritage/loader/pypi.yml
- ~/.config/swh/loader/pypi.yml
- ~/.swh/loader/pypi.yml

### Configuration sample

```
storage:
  cls: remote
  args:
    url: http://localhost:5002/
```

## Local run

-PyPI loader expects as input:
-- project: a pypi project name (ex: arrow)
-- project_url: uri to the pypi project (html page)
-- project_metadata_url: uri to the pypi metadata information (json page)
-
+The built-in command-line tool will run the loader for a project in the main PyPI archive.
+For instance, to load arrow:
``` sh
-$ python3
-Python 3.6.6 (default, Jun 27 2018, 14:44:17)
-[GCC 8.1.0] on linux
-Type "help", "copyright", "credits" or "license" for more information.
->>> import logging; logging.basicConfig(level=logging.DEBUG
->>> project='arrow; from swh.loader.pypi.tasks import LoadPyPI;
->>> LoadPyPI().run(project, 'https://pypi.org/pypi/%s/' % project, 'https://pypi.org/pypi/%s/json' % project)
+python3 -m swh.loader.pypi arrow
+```
+
+If you need more control, you can use the loader directly.
It expects three arguments:
+- project: a PyPI project name (e.g. arrow)
+- project_url: URL of the PyPI project (human-readable HTML page)
+- project_metadata_url: URL of the PyPI metadata information (machine-parsable JSON document)
+
+``` python
+import logging
+logging.basicConfig(level=logging.DEBUG)
+
+from swh.loader.pypi.tasks import LoadPyPI
+
+project = 'arrow'
+
+LoadPyPI().run(project, 'https://pypi.org/pypi/%s/' % project, 'https://pypi.org/pypi/%s/json' % project)
```
diff --git a/swh/loader/pypi/loader.py b/swh/loader/pypi/loader.py
index 59664a1..b1c0e9f 100644
--- a/swh/loader/pypi/loader.py
+++ b/swh/loader/pypi/loader.py
@@ -1,244 +1,260 @@
# Copyright (C) 2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import arrow
import os
import shutil

from tempfile import mkdtemp

from swh.loader.core.utils import clean_dangling_folders
from swh.loader.core.loader import SWHLoader
from swh.model.from_disk import Directory
from swh.model.identifiers import (
    revision_identifier, snapshot_identifier,
    identifier_to_bytes, normalize_timestamp
)

from .client import PyPIClient, PyPIProject


TEMPORARY_DIR_PREFIX_PATTERN = 'swh.loader.pypi.'
DEBUG_MODE = '** DEBUG MODE **'


class PyPILoader(SWHLoader):
    CONFIG_BASE_FILENAME = 'loader/pypi'
    ADDITIONAL_CONFIG = {
        'temp_directory': ('str', '/tmp/swh.loader.pypi/'),
        'cache': ('bool', False),
        'cache_dir': ('str', ''),
        'debug': ('bool', False),  # NOT FOR PRODUCTION
    }

    def __init__(self, client=None):
        super().__init__(logging_class='swh.loader.pypi.PyPILoader')
        self.origin_id = None
        if not client:
            temp_directory = self.config['temp_directory']
            os.makedirs(temp_directory, exist_ok=True)
            self.temp_directory = mkdtemp(
                suffix='-%s' % os.getpid(),
                prefix=TEMPORARY_DIR_PREFIX_PATTERN,
                dir=temp_directory)
            self.pypi_client = PyPIClient(
                temp_directory=self.temp_directory,
                cache=self.config['cache'],
                cache_dir=self.config['cache_dir'])
        else:
            self.temp_directory = client.temp_directory
            self.pypi_client = client
        self.debug = self.config['debug']
        self.done = False

    def pre_cleanup(self):
        """To prevent disk explosion if some other workers exploded
        in mid-air (OOM killed), we try and clean up dangling files.

        """
        if self.debug:
            self.log.warn('%s Will not pre-clean up temp dir %s' % (
                DEBUG_MODE, self.temp_directory
            ))
            return
        clean_dangling_folders(self.config['temp_directory'],
                               pattern_check=TEMPORARY_DIR_PREFIX_PATTERN,
                               log=self.log)

    def cleanup(self):
        """Clean up temporary disk use

        """
        if self.debug:
            self.log.warn('%s Will not clean up temp dir %s' % (
                DEBUG_MODE, self.temp_directory
            ))
            return
        if os.path.exists(self.temp_directory):
            self.log.debug('Clean up %s' % self.temp_directory)
            shutil.rmtree(self.temp_directory)

    def prepare_origin_visit(self, project_name, origin_url,
                             origin_metadata_url=None):
        """Prepare the origin visit information

        Args:
            project_name (str): Project's simple name
            origin_url (str): Project's main url
            origin_metadata_url (str): Project's metadata url

        """
        self.origin = {
            'url': origin_url,
            'type': 'pypi',
        }
        self.visit_date = None  # loader core will populate it

    def _known_artifacts(self, last_snapshot):
        """Retrieve the known release artifacts for the origin_id.

        Args:
            last_snapshot (dict): Last snapshot for the visit

        Yields:
            tuple: artifact's filename, artifact's sha256

        """
        revs = [rev['target'] for rev in last_snapshot['branches'].values()]
        known_revisions = self.storage.revision_get(revs)
        for revision in known_revisions:
            artifact = revision['metadata']['original_artifact']
            yield artifact['filename'], artifact['sha256']

    def _last_snapshot(self):
        """Retrieve the last snapshot

        """
        return self.storage.snapshot_get_latest(self.origin_id)

    def prepare(self, project_name, origin_url,
                origin_metadata_url=None):
        """Keep reference to the origin url (project) and the
           project metadata url

        Args:
            project_name (str): Project's simple name
            origin_url (str): Project's main url
            origin_metadata_url (str): Project's metadata url

        """
        self.project_name = project_name
        self.origin_url = origin_url
        self.origin_metadata_url = origin_metadata_url
        self.project = PyPIProject(self.pypi_client, self.project_name,
                                   self.origin_metadata_url)
        self._prepare_state()

    def _prepare_state(self):
        """Initialize internal state (snapshot, contents, directories, etc.)

        This is called from the `prepare` method.

        """
        last_snapshot = self._last_snapshot()
        if last_snapshot:
            self._snapshot = last_snapshot.copy()
            known_artifacts = self._known_artifacts(self._snapshot)
        else:
            self._snapshot = {
                'branches': {}
            }
            known_artifacts = []
        # and the artifacts that will be the source of data to retrieve
        self.release_artifacts = self.project.releases(known_artifacts)
        # temporary state
        self._contents = []
        self._directories = []
        self._revisions = []

    def fetch_data(self):
        """Called once per release artifact version (can be many for
           one release).

        For each call, this will:
        - retrieve a release artifact (associated to a release version)
        - uncompress it and compute the necessary information
        - compute the swh objects

        Returns:
            True as long as there is data to fetch

        """
        data = None
        if self.done:
            return False

        try:
            data = next(self.release_artifacts)
        except StopIteration:
            self.done = True
            return False

        project_info, author, release, artifact, dir_path = data
        dir_path = dir_path.encode('utf-8')
        directory = Directory.from_disk(path=dir_path, data=True)
        _objects = directory.collect()

        self._contents = _objects['content'].values()
        self._directories = _objects['directory'].values()

        date = normalize_timestamp(
            int(arrow.get(artifact['date']).timestamp))

        name = release['name'].encode('utf-8')
        message = release['message'].encode('utf-8')
        if message:
            message = b'%s: %s' % (name, message)
        else:
            message = name

        _revision = {
            'synthetic': True,
            'metadata': {
                'original_artifact': artifact,
                'project': project_info,
            },
            'author': author,
            'date': date,
            'committer': author,
            'committer_date': date,
            'message': message,
            'directory': directory.hash,
            'parents': [],
            'type': 'tar',
        }
        _revision['id'] = identifier_to_bytes(
            revision_identifier(_revision))
        self._revisions.append(_revision)

        branch_name = artifact['filename'].encode('utf-8')
        self._snapshot['branches'][branch_name] = {
            'target': _revision['id'],
            'target_type': 'revision',
        }

        return not self.done

    def generate_and_load_snapshot(self):
        self._snapshot['id'] = identifier_to_bytes(
            snapshot_identifier(self._snapshot))
        self.maybe_load_snapshot(self._snapshot)

    def store_data(self):
        """(override) This sends collected objects to storage.

        """
        self.maybe_load_contents(self._contents)
        self.maybe_load_directories(self._directories)
        self.maybe_load_revisions(self._revisions)
        if self.done:
            self.generate_and_load_snapshot()
        self.flush()
+
+
+if __name__ == '__main__':
+    import logging
+    import sys
+    logging.basicConfig(level=logging.DEBUG)
+    if len(sys.argv) != 2:
+        logging.error('Usage: %s <module-name>' % sys.argv[0])
+        sys.exit(1)
+    module_name = sys.argv[1]
+    loader = PyPILoader()
+    loader.load(
+        module_name,
+        'https://pypi.org/projects/%s/' % module_name,
+        'https://pypi.org/pypi/%s/json' % module_name,
+    )