diff --git a/PKG-INFO b/PKG-INFO index 0c65a43..1d57505 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,10 +1,10 @@ Metadata-Version: 1.0 Name: swh.loader.dir -Version: 0.0.20 +Version: 0.0.21 Summary: Software Heritage Directory Loader Home-page: https://forge.softwareheritage.org/diffusion/DLDDIR Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Description: UNKNOWN Platform: UNKNOWN diff --git a/debian/control b/debian/control index b907ae0..de40404 100644 --- a/debian/control +++ b/debian/control @@ -1,23 +1,23 @@ Source: swh-loader-dir Maintainer: Software Heritage developers Section: python Priority: optional Build-Depends: debhelper (>= 9), dh-python, python3-all, python3-nose, python3-setuptools, python3-swh.core (>= 0.0.14~), - python3-swh.model (>= 0.0.10~), + python3-swh.model (>= 0.0.11~), python3-swh.scheduler, python3-swh.storage (>= 0.0.31~), - python3-swh.loader.core (>= 0.0.9~), + python3-swh.loader.core (>= 0.0.10~), python3-vcversioner Standards-Version: 3.9.6 Homepage: https://forge.softwareheritage.org/diffusion/DLDDIR/ Package: python3-swh.loader.dir Architecture: all Depends: ${misc:Depends}, ${python3:Depends} Description: Software Heritage Directory Loader diff --git a/requirements.txt b/requirements.txt index cd4e9c4..401f717 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,10 +1,10 @@ # Add here external Python modules dependencies, one per line. Module names # should match https://pypi.python.org/pypi names. For the full spec or # dependency lines, see https://pip.readthedocs.org/en/1.1/requirements.html vcversioner swh.core >= 0.0.14 -swh.model >= 0.0.10 +swh.model >= 0.0.11 swh.scheduler swh.storage >= 0.0.31 -swh.loader.core >= 0.0.9 +swh.loader.core >= 0.0.10 retrying diff --git a/swh.loader.dir.egg-info/PKG-INFO b/swh.loader.dir.egg-info/PKG-INFO index 0c65a43..1d57505 100644 --- a/swh.loader.dir.egg-info/PKG-INFO +++ b/swh.loader.dir.egg-info/PKG-INFO @@ -1,10 +1,10 @@ Metadata-Version: 1.0 Name: swh.loader.dir -Version: 0.0.20 +Version: 0.0.21 Summary: Software Heritage Directory Loader Home-page: https://forge.softwareheritage.org/diffusion/DLDDIR Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Description: UNKNOWN Platform: UNKNOWN diff --git a/swh.loader.dir.egg-info/requires.txt b/swh.loader.dir.egg-info/requires.txt index 73f4465..efcf120 100644 --- a/swh.loader.dir.egg-info/requires.txt +++ b/swh.loader.dir.egg-info/requires.txt @@ -1,7 +1,7 @@ retrying swh.core>=0.0.14 -swh.loader.core>=0.0.9 -swh.model>=0.0.10 +swh.loader.core>=0.0.10 +swh.model>=0.0.11 swh.scheduler swh.storage>=0.0.31 vcversioner diff --git a/swh/loader/dir/loader.py b/swh/loader/dir/loader.py index 8c7cc3e..2a667a8 100644 --- a/swh/loader/dir/loader.py +++ b/swh/loader/dir/loader.py @@ -1,188 +1,188 @@ # Copyright (C) 2015-2016 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import sys import uuid from swh.loader.core import loader from swh.model import git from swh.model.git import GitType from . import converters class DirLoader(loader.SWHLoader): """A bulk loader for a directory. This will load the content of the directory. """ CONFIG_BASE_FILENAME = 'loader/dir.ini' def __init__(self, origin_id, logging_class='swh.loader.dir.DirLoader', config=None): super().__init__(origin_id, logging_class, config=config) def list_repo_objs(self, dir_path, revision, release): """List all objects from dir_path. Args: - dir_path (path): the directory to list - revision: revision dictionary representation - release: release dictionary representation Returns: a dict containing lists of `Oid`s with keys for each object type: - CONTENT - DIRECTORY """ def _revision_from(tree_hash, revision): full_rev = dict(revision) full_rev['directory'] = tree_hash full_rev = converters.commit_to_revision(full_rev) full_rev['id'] = git.compute_revision_sha1_git(full_rev) return full_rev def _release_from(revision_hash, release): full_rel = dict(release) full_rel['target'] = revision_hash full_rel['target_type'] = 'revision' full_rel = converters.annotated_tag_to_release(full_rel) full_rel['id'] = git.compute_release_sha1_git(full_rel) return full_rel log_id = str(uuid.uuid4()) sdir_path = dir_path.decode('utf-8') self.log.info("Started listing %s" % dir_path, extra={ 'swh_type': 'dir_list_objs_start', 'swh_repo': sdir_path, 'swh_id': log_id, }) - objects_per_path = git.walk_and_compute_sha1_from_directory_2(dir_path) + objects_per_path = git.compute_hashes_from_directory(dir_path) tree_hash = objects_per_path[dir_path]['checksums']['sha1_git'] full_rev = _revision_from(tree_hash, revision) objects = { - GitType.BLOB: list( # FIXME: bad, only to satisfy log below! + GitType.BLOB: list( git.objects_per_type(GitType.BLOB, objects_per_path)), - GitType.TREE: list( # FIXME: bad, only to satisfy log! + GitType.TREE: list( git.objects_per_type(GitType.TREE, objects_per_path)), GitType.COMM: [full_rev], GitType.RELE: [] } if release and 'name' in release: full_rel = _release_from(full_rev['id'], release) objects[GitType.RELE] = [full_rel] self.log.info("Done listing the objects in %s: %d contents, " "%d directories, %d revisions, %d releases" % ( sdir_path, len(objects[GitType.BLOB]), len(objects[GitType.TREE]), len(objects[GitType.COMM]), len(objects[GitType.RELE]) ), extra={ 'swh_type': 'dir_list_objs_end', 'swh_repo': sdir_path, 'swh_num_blobs': len(objects[GitType.BLOB]), 'swh_num_trees': len(objects[GitType.TREE]), 'swh_num_commits': len(objects[GitType.COMM]), 'swh_num_releases': len(objects[GitType.RELE]), 'swh_id': log_id, }) return objects def process(self, dir_path, origin, revision, release, occurrences): """Load a directory in backend. Args: - dir_path: source of the directory to import - origin: Dictionary origin - id: origin's id - url: url origin we fetched - type: type of the origin - revision: Dictionary of information needed, keys are: - author_name: revision's author name - author_email: revision's author email - author_date: timestamp (e.g. 1444054085) - author_offset: date offset e.g. -0220, +0100 - committer_name: revision's committer name - committer_email: revision's committer email - committer_date: timestamp - committer_offset: date offset e.g. -0220, +0100 - type: type of revision dir, tar - message: synthetic message for the revision - release: Dictionary of information needed, keys are: - name: release name - date: release timestamp (e.g. 1444054085) - offset: release date offset e.g. -0220, +0100 - author_name: release author's name - author_email: release author's email - comment: release's comment message - occurrences: List of occurrences as dictionary. Information needed, keys are: - branch: occurrence's branch name - date: validity date (e.g. 2015-01-01 00:00:00+00) Returns: Dictionary with the following keys: - status: mandatory, the status result as a boolean - stderr: optional when status is True, mandatory otherwise - objects: the actual objects sent to swh storage """ def _occurrence_from(origin_id, revision_hash, occurrence): occ = dict(occurrence) occ.update({ 'target': revision_hash, 'target_type': 'revision', 'origin': origin_id, }) return occ def _occurrences_from(origin_id, revision_hash, occurrences): occs = [] for occurrence in occurrences: occs.append(_occurrence_from(origin_id, revision_hash, occurrence)) return occs if not os.path.exists(dir_path): warn_msg = 'Skipping inexistant directory %s' % dir_path self.log.warn(warn_msg, extra={ 'swh_type': 'dir_repo_list_refs', 'swh_repo': dir_path, 'swh_num_refs': 0, }) return {'status': False, 'stderr': warn_msg} if isinstance(dir_path, str): dir_path = dir_path.encode(sys.getfilesystemencoding()) # to load the repository, walk all objects, compute their hash objects = self.list_repo_objs(dir_path, revision, release) full_rev = objects[GitType.COMM][0] # only 1 revision # Update objects with release and occurrences objects[GitType.REFS] = _occurrences_from(origin['id'], full_rev['id'], occurrences) self.load(objects) self.flush() return {'status': True, 'objects': objects} diff --git a/version.txt b/version.txt index 79652d6..3ca6d35 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -v0.0.20-0-g9177236 \ No newline at end of file +v0.0.21-0-g13f94a8 \ No newline at end of file