diff --git a/PKG-INFO b/PKG-INFO index f3afd9e..313d7e1 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,10 +1,10 @@ Metadata-Version: 1.0 Name: swh.loader.dir -Version: 0.0.18 +Version: 0.0.19 Summary: Software Heritage Directory Loader Home-page: https://forge.softwareheritage.org/diffusion/DLDDIR Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Description: UNKNOWN Platform: UNKNOWN diff --git a/debian/control b/debian/control index 5d68b13..c107adb 100644 --- a/debian/control +++ b/debian/control @@ -1,23 +1,23 @@ Source: swh-loader-dir Maintainer: Software Heritage developers Section: python Priority: optional Build-Depends: debhelper (>= 9), dh-python, python3-all, python3-nose, python3-setuptools, python3-swh.core (>= 0.0.14~), - python3-swh.model (>= 0.0.4~), + python3-swh.model (>= 0.0.10~), python3-swh.scheduler, python3-swh.storage (>= 0.0.31~), - python3-swh.loader.core (>= 0.0.5~), + python3-swh.loader.core (>= 0.0.8~), python3-vcversioner Standards-Version: 3.9.6 Homepage: https://forge.softwareheritage.org/diffusion/DLDDIR/ Package: python3-swh.loader.dir Architecture: all Depends: ${misc:Depends}, ${python3:Depends} Description: Software Heritage Directory Loader diff --git a/debian/rules b/debian/rules index 1404aae..7921a1c 100755 --- a/debian/rules +++ b/debian/rules @@ -1,9 +1,12 @@ #!/usr/bin/make -f export PYBUILD_NAME=swh-loader-dir %: dh $@ --with python3 --buildsystem=pybuild override_dh_auto_test: + PYBUILD_SYSTEM=custom \ + PYBUILD_TEST_ARGS="python{version} -m nose swh -sva '!db,!fs'" \ + dh_auto_test diff --git a/requirements.txt b/requirements.txt index 1feb592..c3ef19f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,10 +1,10 @@ # Add here external Python modules dependencies, one per line. Module names # should match https://pypi.python.org/pypi names. For the full spec or # dependency lines, see https://pip.readthedocs.org/en/1.1/requirements.html vcversioner swh.core >= 0.0.14 -swh.model >= 0.0.4 +swh.model >= 0.0.10 swh.scheduler swh.storage >= 0.0.31 -swh.loader.core >= 0.0.5 +swh.loader.core >= 0.0.8 retrying diff --git a/swh.loader.dir.egg-info/PKG-INFO b/swh.loader.dir.egg-info/PKG-INFO index f3afd9e..313d7e1 100644 --- a/swh.loader.dir.egg-info/PKG-INFO +++ b/swh.loader.dir.egg-info/PKG-INFO @@ -1,10 +1,10 @@ Metadata-Version: 1.0 Name: swh.loader.dir -Version: 0.0.18 +Version: 0.0.19 Summary: Software Heritage Directory Loader Home-page: https://forge.softwareheritage.org/diffusion/DLDDIR Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Description: UNKNOWN Platform: UNKNOWN diff --git a/swh.loader.dir.egg-info/SOURCES.txt b/swh.loader.dir.egg-info/SOURCES.txt index 2b9fb74..be5ed18 100644 --- a/swh.loader.dir.egg-info/SOURCES.txt +++ b/swh.loader.dir.egg-info/SOURCES.txt @@ -1,30 +1,32 @@ .gitignore AUTHORS LICENSE MANIFEST.in Makefile Makefile.local README requirements.txt setup.py version.txt bin/swh-check-missing-objects.py bin/swh-loader-dir debian/changelog debian/compat debian/control debian/copyright debian/rules debian/source/format resources/dir.ini resources/loader/dir.ini scratch/walking.py swh.loader.dir.egg-info/PKG-INFO swh.loader.dir.egg-info/SOURCES.txt swh.loader.dir.egg-info/dependency_links.txt swh.loader.dir.egg-info/requires.txt swh.loader.dir.egg-info/top_level.txt swh/loader/dir/__init__.py +swh/loader/dir/converters.py swh/loader/dir/loader.py swh/loader/dir/tasks.py +swh/loader/dir/tests/test_converters.py swh/loader/dir/tests/test_loader.py \ No newline at end of file diff --git a/swh.loader.dir.egg-info/requires.txt b/swh.loader.dir.egg-info/requires.txt index 4e31f97..011b7b2 100644 --- a/swh.loader.dir.egg-info/requires.txt +++ b/swh.loader.dir.egg-info/requires.txt @@ -1,7 +1,7 @@ retrying swh.core>=0.0.14 -swh.loader.core>=0.0.5 -swh.model>=0.0.4 +swh.loader.core>=0.0.8 +swh.model>=0.0.10 swh.scheduler swh.storage>=0.0.31 vcversioner diff --git a/swh/loader/dir/converters.py b/swh/loader/dir/converters.py new file mode 100644 index 0000000..654c818 --- /dev/null +++ b/swh/loader/dir/converters.py @@ -0,0 +1,71 @@ +# Copyright (C) 2015-2016 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import datetime + + +def to_datetime(ts): + """Convert a timestamp to utc datetime. + + """ + return datetime.datetime.utcfromtimestamp(ts).replace( + tzinfo=datetime.timezone.utc) + + +def format_to_minutes(offset_str): + """Convert a git string timezone format string (e.g +0200, -0310) to minutes. + + Args: + offset_str: a string representing an offset. + + Returns: + A positive or negative number of minutes of such input + + """ + sign = offset_str[0] + hours = int(offset_str[1:3]) + minutes = int(offset_str[3:]) + (hours * 60) + return minutes if sign == '+' else -1 * minutes + + +def commit_to_revision(commit, log=None): + """Format a commit as a revision. + + """ + new_commit = commit.copy() + new_commit.update({ + 'author': { + 'name': commit['author']['name'].encode('utf-8'), + 'fullname': commit['author']['fullname'].encode('utf-8'), + 'email': commit['author']['email'].encode('utf-8'), + }, + 'committer': { + 'name': commit['committer']['name'].encode('utf-8'), + 'fullname': commit['committer']['fullname'].encode('utf-8'), + 'email': commit['committer']['email'].encode('utf-8'), + }, + 'message': commit['message'].encode('utf-8'), + 'synthetic': True, + 'parents': [] + }) + return new_commit + + +def annotated_tag_to_release(release, log=None): + """Format a swh release. + + """ + new_release = release.copy() + new_release.update({ + 'name': release['name'].encode('utf-8'), + 'author': { + 'name': release['author']['name'].encode('utf-8'), + 'fullname': release['author']['fullname'].encode('utf-8'), + 'email': release['author']['email'].encode('utf-8'), + }, + 'message': release['message'].encode('utf-8'), + 'synthetic': True + }) + return new_release diff --git a/swh/loader/dir/loader.py b/swh/loader/dir/loader.py index 57e7933..8c7cc3e 100644 --- a/swh/loader/dir/loader.py +++ b/swh/loader/dir/loader.py @@ -1,198 +1,188 @@ # Copyright (C) 2015-2016 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import sys import uuid -from swh.loader.core import loader, converters +from swh.loader.core import loader from swh.model import git from swh.model.git import GitType +from . import converters + class DirLoader(loader.SWHLoader): """A bulk loader for a directory. This will load the content of the directory. """ CONFIG_BASE_FILENAME = 'loader/dir.ini' def __init__(self, origin_id, logging_class='swh.loader.dir.DirLoader', config=None): super().__init__(origin_id, logging_class, config=config) def list_repo_objs(self, dir_path, revision, release): """List all objects from dir_path. Args: - dir_path (path): the directory to list - revision: revision dictionary representation - release: release dictionary representation Returns: a dict containing lists of `Oid`s with keys for each object type: - CONTENT - DIRECTORY """ - def get_objects_per_object_type(objects_per_path): - m = { - GitType.BLOB: [], - GitType.TREE: [], - GitType.COMM: [], - GitType.RELE: [] - } - for tree_path in objects_per_path: - objs = objects_per_path[tree_path] - for obj in objs: - m[obj['type']].append(obj) - - return m - - def _revision_from(tree_hash, revision, objects): + def _revision_from(tree_hash, revision): full_rev = dict(revision) full_rev['directory'] = tree_hash - full_rev = converters.commit_to_revision(full_rev, objects) + full_rev = converters.commit_to_revision(full_rev) full_rev['id'] = git.compute_revision_sha1_git(full_rev) return full_rev def _release_from(revision_hash, release): full_rel = dict(release) full_rel['target'] = revision_hash full_rel['target_type'] = 'revision' full_rel = converters.annotated_tag_to_release(full_rel) full_rel['id'] = git.compute_release_sha1_git(full_rel) return full_rel log_id = str(uuid.uuid4()) sdir_path = dir_path.decode('utf-8') self.log.info("Started listing %s" % dir_path, extra={ 'swh_type': 'dir_list_objs_start', 'swh_repo': sdir_path, 'swh_id': log_id, }) - objects_per_path = git.walk_and_compute_sha1_from_directory(dir_path) - - objects = get_objects_per_object_type(objects_per_path) - - tree_hash = objects_per_path[git.ROOT_TREE_KEY][0]['sha1_git'] + objects_per_path = git.walk_and_compute_sha1_from_directory_2(dir_path) - full_rev = _revision_from(tree_hash, revision, objects_per_path) + tree_hash = objects_per_path[dir_path]['checksums']['sha1_git'] + full_rev = _revision_from(tree_hash, revision) - objects[GitType.COMM] = [full_rev] + objects = { + GitType.BLOB: list( # FIXME: bad, only to satisfy log below! + git.objects_per_type(GitType.BLOB, objects_per_path)), + GitType.TREE: list( # FIXME: bad, only to satisfy log! + git.objects_per_type(GitType.TREE, objects_per_path)), + GitType.COMM: [full_rev], + GitType.RELE: [] + } if release and 'name' in release: full_rel = _release_from(full_rev['id'], release) objects[GitType.RELE] = [full_rel] self.log.info("Done listing the objects in %s: %d contents, " "%d directories, %d revisions, %d releases" % ( sdir_path, len(objects[GitType.BLOB]), len(objects[GitType.TREE]), len(objects[GitType.COMM]), len(objects[GitType.RELE]) ), extra={ 'swh_type': 'dir_list_objs_end', 'swh_repo': sdir_path, 'swh_num_blobs': len(objects[GitType.BLOB]), 'swh_num_trees': len(objects[GitType.TREE]), 'swh_num_commits': len(objects[GitType.COMM]), 'swh_num_releases': len(objects[GitType.RELE]), 'swh_id': log_id, }) - return objects, objects_per_path + return objects def process(self, dir_path, origin, revision, release, occurrences): """Load a directory in backend. Args: - dir_path: source of the directory to import - origin: Dictionary origin - id: origin's id - url: url origin we fetched - type: type of the origin - revision: Dictionary of information needed, keys are: - author_name: revision's author name - author_email: revision's author email - author_date: timestamp (e.g. 1444054085) - author_offset: date offset e.g. -0220, +0100 - committer_name: revision's committer name - committer_email: revision's committer email - committer_date: timestamp - committer_offset: date offset e.g. -0220, +0100 - type: type of revision dir, tar - message: synthetic message for the revision - release: Dictionary of information needed, keys are: - name: release name - date: release timestamp (e.g. 1444054085) - offset: release date offset e.g. -0220, +0100 - author_name: release author's name - author_email: release author's email - comment: release's comment message - occurrences: List of occurrences as dictionary. Information needed, keys are: - branch: occurrence's branch name - date: validity date (e.g. 2015-01-01 00:00:00+00) Returns: Dictionary with the following keys: - status: mandatory, the status result as a boolean - stderr: optional when status is True, mandatory otherwise - objects: the actual objects sent to swh storage """ def _occurrence_from(origin_id, revision_hash, occurrence): occ = dict(occurrence) occ.update({ 'target': revision_hash, 'target_type': 'revision', 'origin': origin_id, }) return occ def _occurrences_from(origin_id, revision_hash, occurrences): occs = [] for occurrence in occurrences: occs.append(_occurrence_from(origin_id, revision_hash, occurrence)) return occs if not os.path.exists(dir_path): warn_msg = 'Skipping inexistant directory %s' % dir_path self.log.warn(warn_msg, extra={ 'swh_type': 'dir_repo_list_refs', 'swh_repo': dir_path, 'swh_num_refs': 0, }) return {'status': False, 'stderr': warn_msg} if isinstance(dir_path, str): dir_path = dir_path.encode(sys.getfilesystemencoding()) # to load the repository, walk all objects, compute their hash - objects, objects_per_path = self.list_repo_objs(dir_path, revision, - release) + objects = self.list_repo_objs(dir_path, revision, release) full_rev = objects[GitType.COMM][0] # only 1 revision # Update objects with release and occurrences - objects[GitType.RELE] = [full_rev] objects[GitType.REFS] = _occurrences_from(origin['id'], full_rev['id'], occurrences) - self.load(objects, objects_per_path) + self.load(objects) self.flush() return {'status': True, 'objects': objects} diff --git a/swh/loader/dir/tasks.py b/swh/loader/dir/tasks.py index a4dd591..c4b7c25 100644 --- a/swh/loader/dir/tasks.py +++ b/swh/loader/dir/tasks.py @@ -1,35 +1,35 @@ # Copyright (C) 2015-2016 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from swh.loader.dir.loader import DirLoader from swh.loader.core import tasks class LoadDirRepository(tasks.LoaderCoreTask): """Import a directory to Software Heritage """ task_queue = 'swh_loader_dir' def run(self, dir_path, origin, revision, release, occurrences): """Import a directory. Args: cf. swh.loader.dir.loader.run docstring """ - storage = DirLoader().storage + storage = DirLoader(origin_id=None).storage origin['id'] = storage.origin_add_one(origin) fetch_history_id = self.open_fetch_history(storage, origin['id']) result = DirLoader(origin['id']).process(dir_path, origin, revision, release, occurrences) self.close_fetch_history(storage, fetch_history_id, result) diff --git a/swh/loader/dir/tests/test_converters.py b/swh/loader/dir/tests/test_converters.py new file mode 100644 index 0000000..0a9f658 --- /dev/null +++ b/swh/loader/dir/tests/test_converters.py @@ -0,0 +1,117 @@ +# Copyright (C) 2015-2016 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import shutil +import tempfile +import unittest + +from nose.tools import istest + +from swh.loader.dir import converters + + +def tmpfile_with_content(fromdir, contentfile): + """Create a temporary file with content contentfile in directory fromdir. + + """ + tmpfilepath = tempfile.mktemp( + suffix='.swh', + prefix='tmp-file-for-test', + dir=fromdir) + + with open(tmpfilepath, 'wb') as f: + f.write(contentfile) + + return tmpfilepath + + +class TestConverters(unittest.TestCase): + + @classmethod + def setUpClass(cls): + super().setUpClass() + cls.tmpdir = tempfile.mkdtemp(prefix='test-swh-loader-dir.') + + @classmethod + def tearDownClass(cls): + shutil.rmtree(cls.tmpdir) + super().tearDownClass() + + @istest + def format_to_minutes(self): + self.assertEquals(converters.format_to_minutes('+0100'), 60) + self.assertEquals(converters.format_to_minutes('-0200'), -120) + self.assertEquals(converters.format_to_minutes('+1250'), 12*60+50) + self.assertEquals(converters.format_to_minutes('+0000'), 0) + self.assertEquals(converters.format_to_minutes('-0000'), 0) + + @istest + def annotated_tag_to_release(self): + # given + release = { + 'name': 'v0.0.1', + 'message': 'synthetic-message-input', + 'author': {'name': 'author-name', + 'email': 'author-email', + 'fullname': 'fullname'}, + } + + expected_release = { + 'name': b'v0.0.1', + 'message': b'synthetic-message-input', + 'author': {'name': b'author-name', + 'email': b'author-email', + 'fullname': b'fullname'}, + 'synthetic': True, + } + + # when + actual_release = converters.annotated_tag_to_release(release) + + # then + self.assertDictEqual(actual_release, expected_release) + + @istest + def commit_to_revision(self): + # given + commit = { + 'sha1_git': 'commit-git-sha1', + 'directory': 'targeted-tree-sha1', + 'date': {'timestamp': 1444054085, 'offset': '+0000'}, + 'committer_date': {'timestamp': 1444054085, 'offset': '+0000'}, + 'type': 'tar', + 'message': 'synthetic-message-input', + 'author': {'name': 'author-name', + 'email': 'author-email', + 'fullname': 'fullname'}, + 'committer': {'name': 'author-name', + 'email': 'author-email', + 'fullname': 'fullname'}, + 'directory': 'targeted-tree-sha1', + } + + expected_revision = { + 'sha1_git': 'commit-git-sha1', + 'directory': 'targeted-tree-sha1', + 'date': {'timestamp': 1444054085, 'offset': '+0000'}, + 'committer_date': {'timestamp': 1444054085, 'offset': '+0000'}, + 'type': 'tar', + 'message': b'synthetic-message-input', + 'author': {'name': b'author-name', + 'email': b'author-email', + 'fullname': b'fullname'}, + 'committer': {'name': b'author-name', + 'email': b'author-email', + 'fullname': b'fullname'}, + 'directory': 'targeted-tree-sha1', + 'synthetic': True, + 'parents': [] + } + + # when + actual_revision = converters.commit_to_revision(commit) + + # then + self.assertEquals(actual_revision, expected_revision) diff --git a/swh/loader/dir/tests/test_loader.py b/swh/loader/dir/tests/test_loader.py index 7a0b59a..e0068ea 100644 --- a/swh/loader/dir/tests/test_loader.py +++ b/swh/loader/dir/tests/test_loader.py @@ -1,125 +1,138 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import shutil import subprocess import tempfile import unittest from nose.tools import istest from swh.loader.dir.loader import DirLoader from swh.model.git import GitType class TestLoader(unittest.TestCase): @classmethod def setUpClass(cls): super().setUpClass() cls.tmp_root_path = tempfile.mkdtemp().encode('utf-8') start_path = os.path.dirname(__file__).encode('utf-8') sample_folder_archive = os.path.join(start_path, b'../../../../..', b'swh-storage-testdata', b'dir-folders', b'sample-folder.tgz') cls.root_path = os.path.join(cls.tmp_root_path, b'sample-folder') # uncompress the sample folder subprocess.check_output( ['tar', 'xvf', sample_folder_archive, '-C', cls.tmp_root_path], ) @classmethod def tearDownClass(cls): super().tearDownClass() shutil.rmtree(cls.tmp_root_path) def setUp(self): super().setUp() self.info = { 'storage_class': 'remote_storage', 'storage_args': ['http://localhost:5000/'], 'content_size_limit': 104857600, 'log_db': 'dbname=softwareheritage-log', 'directory_packet_size': 25000, 'content_packet_size': 10000, 'send_contents': True, 'send_directories': True, 'content_packet_size_bytes': 1073741824, 'occurrence_packet_size': 100000, 'send_revisions': True, 'revision_packet_size': 100000, 'content_packet_block_size_bytes': 104857600, 'send_occurrences': True, 'release_packet_size': 100000, 'send_releases': True } self.origin = { 'url': 'file:///dev/null', 'type': 'dir', } self.occurrence = { 'branch': 'master', 'authority_id': 1, 'validity': '2015-01-01 00:00:00+00', } self.revision = { - 'author_name': 'swh author', - 'author_email': 'swh@inria.fr', - 'author_date': '1444054085', - 'author_offset': '+0200', - 'committer_name': 'swh committer', - 'committer_email': 'swh@inria.fr', - 'committer_date': '1444054085', - 'committer_offset': '+0200', + 'author': { + 'name': 'swh author', + 'email': 'swh@inria.fr', + 'fullname': 'swh' + }, + 'date': { + 'timestamp': 1444054085, + 'offset': 0 + }, + 'committer': { + 'name': 'swh committer', + 'email': 'swh@inria.fr', + 'fullname': 'swh' + }, + 'committer_date': { + 'timestamp': '1444054085', + 'offset': 0, + }, 'type': 'tar', 'message': 'synthetic revision', 'metadata': {'foo': 'bar'}, } self.release = { 'name': 'v0.0.1', - 'date': '1444054085', - 'offset': '+0200', - 'author_name': 'swh author', - 'author_email': 'swh@inria.fr', - 'comment': 'synthetic release', + 'date': { + 'timestamp': 1444054085, + 'offset': 0, + }, + 'author': { + 'name': 'swh author', + 'fullname': 'swh', + 'email': 'swh@inria.fr', + }, + 'message': 'synthetic release', } self.dirloader = DirLoader(origin_id=1, config=self.info) @istest def load_without_storage(self): # when - objects, objects_per_path = self.dirloader.list_repo_objs( + objects = self.dirloader.list_repo_objs( self.root_path, self.revision, self.release) # then self.assertEquals(len(objects), 4, "4 objects types, blob, tree, revision, release") self.assertEquals(len(objects[GitType.BLOB]), 8, "8 contents: 3 files + 5 links") self.assertEquals(len(objects[GitType.TREE]), 5, "5 directories: 4 subdirs + 1 empty + 1 main dir") self.assertEquals(len(objects[GitType.COMM]), 1, "synthetic revision") self.assertEquals(len(objects[GitType.RELE]), 1, "synthetic release") - self.assertEquals(len(objects_per_path), 6, "5 folders + ") - # print('objects: %s\n objects-per-path: %s\n' % # (objects.keys(), # objects_per_path.keys())) diff --git a/version.txt b/version.txt index 102b507..fc331dc 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -v0.0.18-0-g704efe1 \ No newline at end of file +v0.0.19-0-g2e00512 \ No newline at end of file