diff --git a/MANIFEST.in b/MANIFEST.in index 08ebc95..e7c46fc 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,3 +1,4 @@ include Makefile include requirements.txt +include requirements-swh.txt include version.txt diff --git a/PKG-INFO b/PKG-INFO index 962ac38..886af48 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,10 +1,10 @@ Metadata-Version: 1.0 Name: swh.loader.dir -Version: 0.0.23 +Version: 0.0.24 Summary: Software Heritage Directory Loader Home-page: https://forge.softwareheritage.org/diffusion/DLDDIR Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Description: UNKNOWN Platform: UNKNOWN diff --git a/README b/README index 2287724..90b283b 100644 --- a/README +++ b/README @@ -1,95 +1,96 @@ SWH-loader-dir ============== The Software Heritage Directory Loader is a tool and a library to walk a local directory and inject into the SWH dataset all unknown contained files. Directory loader ================ ### Configuration This is the loader's (or task's) configuration file. 
-loader/dir.ini: - - [main] - - # access to swh's storage - storage_class = remote_storage - storage_args = http://localhost:5000/ - - # parameters to condition loading into swh storage - send_contents = True - send_directories = True - send_revisions = True - send_releases = True - send_occurrences = True - content_packet_size = 10000 - content_packet_size_bytes = 1073741824 - directory_packet_size = 25000 - revision_packet_size = 100000 - release_packet_size = 100000 - occurrence_packet_size = 100000 +loader/dir.yml: + + storage: + cls: remote + args: + url: http://localhost:5002/ + + send_contents: True + send_directories: True + send_revisions: True + send_releases: True + send_occurrences: True + # nb of max contents to send for storage + content_packet_size: 100 + # 100 Mib of content data + content_packet_block_size_bytes: 104857600 + # limit for swh content storage for one blob (beyond that limit, the + # content's data is not sent for storage) + content_packet_size_bytes: 1073741824 + directory_packet_size: 250 + revision_packet_size: 100 + release_packet_size: 100 + occurrence_packet_size: 100 Present in possible locations: - ~/.config/swh/loader/dir.ini - ~/.swh/loader/dir.ini - /etc/softwareheritage/loader/dir.ini #### Toplevel Load directory directly from code or toplevel: - from swh.loader.dir.tasks import LoadDirRepository + from swh.loader.dir.loader import DirLoader dir_path = '/path/to/directory # Fill in those - origin = {} + origin = {'url': 'some-origin', 'type': 'dir'} + visit_date = 'Tue, 3 May 2017 17:16:32 +0200' release = None revision = {} occurrence = {} - LoadDirRepository().run(dir_path, origin, revision, release, [occurrence]) + DirLoader().load(dir_path, origin, visit_date, revision, release, [occurrence]) #### Celery Load directory using celery. Providing you have a properly configured celery up and running worker.ini needs to be updated with the following keys: task_modules = swh.loader.dir.tasks task_queues = swh_loader_dir cf. 
https://forge.softwareheritage.org/diffusion/DCORE/browse/master/README.md for more details You can send the following message to the task queue: from swh.loader.dir.tasks import LoadDirRepository # Fill in those - origin = {} + origin = {'url': 'some-origin', 'type': 'dir'} + visit_date = 'Tue, 3 May 2017 17:16:32 +0200' release = None revision = {} occurrence = {} # Send message to the task queue - LoadDirRepository().apply_async(('/path/to/dir, - origin, - revision, - release, - [occurrence])) + LoadDirRepository().run('/path/to/dir', origin, visit_date, revision, release, [occurrence]) Directory producer ================== None diff --git a/bin/swh-check-missing-objects.py b/bin/swh-check-missing-objects.py index 49b5e4a..1958e37 100755 --- a/bin/swh-check-missing-objects.py +++ b/bin/swh-check-missing-objects.py @@ -1,189 +1,189 @@ #!/usr/bin/env python3 import os import subprocess from swh.model.hashutil import hash_path, hash_to_bytes BATCH_SIZE = 10000 config = { # with git data for listing trees 'dir_path_git': '/home/tony/work/inria/repo/linux-tryouts-git', # without anything git related 'dir_path': '/home/tony/work/inria/repo/linux-tryouts', 'storage_class': 'remote_storage', - 'storage_args': ['http://localhost:5000/'], + 'storage_args': ['http://localhost:5002/'], } if config['storage_class'] == 'remote_storage': from swh.storage.api.client import RemoteStorage as Storage else: from swh.storage import Storage storage = Storage(*config['storage_args']) def list_files_from(rootpath): """Git ls tree from rootpath's latest revision's tree. Yields: Tuple of (perms, type, hex sha1, name) """ with subprocess.Popen( ['find', '.', '-type', 'f'], stdout=subprocess.PIPE, cwd=rootpath) as proc: for filepath in proc.stdout: yield os.path.join(rootpath, filepath.strip().decode('utf-8')) def hashfile(filepath): """Hash a file according to what expects storage's api.
""" hashes = hash_path(filepath) hashes.update({'length': os.path.getsize(filepath)}) return hashes def check_missing_contents(rootpath): print('Folder to check: %s' % rootpath) # List of contents to check in storage contents_batch = [] # map of content index by sha1, value is their actual path contents_map = {} # full contents missing is a list of files not in storage content_missings = [] # batch of contents to check count_batch_contents = 0 # total number of checked contents count_checked_contents = 0 # nb files read nb_files = 0 for filepath in list_files_from(rootpath): nb_files += 1 content_hashes = hashfile(filepath) contents_map.update({content_hashes['sha1']: filepath}) contents_batch.append(content_hashes) count_batch_contents += 1 if count_batch_contents < BATCH_SIZE: # accumulate content to check continue print('Checks %s contents' % len(contents_batch)) for content_missing in storage.content_missing(contents_batch): content_missings.append(contents_map[content_missing['sha1']]) count_checked_contents += count_batch_contents # reinitialize list contents_batch = [] count_batch_contents = 0 if contents_batch is not []: contents_batch_len = len(contents_batch) print('Checks %s contents' % contents_batch_len) for content_missing in storage.content_missing(contents_batch): content_missings.append(contents_map[content_missing['sha1']]) count_checked_contents += contents_batch_len print('Number of contents checked: %s' % count_checked_contents) print('Number of files: %s' % nb_files) print('Stats on missing contents -') if len(content_missings) > 0: print('Missing files: ') for file_missing in content_missings: print('- %s', file_missing) else: print('Nothing missing!') print() def git_ls_tree(rootpath): """Git ls tree from rootpath's latest revision's tree. 
Yields: Tuple of (perms, type, hex sha1, name) """ with subprocess.Popen( ['git', 'ls-tree', '-r', '-t', 'master^{tree}'], stdout=subprocess.PIPE, cwd=rootpath) as proc: for line in proc.stdout: yield line.strip().decode('utf-8').replace('\t', ' ').split(' ') def trees(rootpath): """Filter tree from rootpath in swh's api compliant with search. Yields: SWH compliant directory structure. """ for _, type, hex_sha1, name in git_ls_tree(rootpath): if type == 'tree': yield{'id': hash_to_bytes(hex_sha1), 'name': name} def check_missing_trees(rootpath): print('Folder to check: %s' % rootpath) # List of dirs to check in storage dirs_batch = [] # map of dir index by sha1, value is their actual path dirs_map = {} # full dirs missing is a list of files not in storage dir_missings = [] # batch of dirs to check count_batch_dirs = 0 # total number of checked dirs count_checked_dirs = 0 # nb trees read nb_dirs = 0 for tree in trees(rootpath): nb_dirs += 1 tree_id = tree['id'] dirs_map.update({tree_id: tree['name']}) dirs_batch.append(tree_id) count_batch_dirs += 1 if count_batch_dirs < BATCH_SIZE: # accumulate dir to check on storage continue print('Checks %s dirs' % len(dirs_batch)) for dir_missing in storage.directory_missing(dirs_batch): dir_missings.append(dirs_map[dir_missing['id']]) count_checked_dirs += count_batch_dirs # reinitialize list dirs_batch = [] count_batch_dirs = 0 if dirs_batch is not []: dirs_batch_len = len(dirs_batch) print('Checks %s dirs' % dirs_batch_len) for dir_missing in storage.directory_missing(dirs_batch): dir_missings.append(dirs_map[dir_missing['sha1']]) count_checked_dirs += dirs_batch_len print('Number of dirs checked: %s' % count_checked_dirs) print('Number of dirs: %s' % nb_dirs) print('Stats on missing dirs -') if len(dir_missings) > 0: print('Missing files: ') for file_missing in dir_missings: print('- %s', file_missing) else: print('Nothing missing!') print() check_missing_contents(config['dir_path']) 
check_missing_trees(config['dir_path_git']) diff --git a/debian/control b/debian/control index 37b9bf2..9603f11 100644 --- a/debian/control +++ b/debian/control @@ -1,23 +1,23 @@ Source: swh-loader-dir Maintainer: Software Heritage developers Section: python Priority: optional Build-Depends: debhelper (>= 9), dh-python, python3-all, python3-nose, python3-setuptools, python3-swh.core (>= 0.0.14~), python3-swh.model (>= 0.0.11~), python3-swh.scheduler, python3-swh.storage (>= 0.0.76~), - python3-swh.loader.core (>= 0.0.12~), + python3-swh.loader.core (>= 0.0.13~), python3-vcversioner Standards-Version: 3.9.6 Homepage: https://forge.softwareheritage.org/diffusion/DLDDIR/ Package: python3-swh.loader.dir Architecture: all Depends: python3-swh.core (>= 0.0.14~), python3-swh.model (>= 0.0.11~), python3-swh.scheduler, - python3-swh.storage (>= 0.0.76~), python3-swh.loader.core (>= 0.0.12~), ${misc:Depends}, ${python3:Depends} + python3-swh.storage (>= 0.0.76~), python3-swh.loader.core (>= 0.0.13~), ${misc:Depends}, ${python3:Depends} Description: Software Heritage Directory Loader diff --git a/requirements-swh.txt b/requirements-swh.txt new file mode 100644 index 0000000..aefbfbd --- /dev/null +++ b/requirements-swh.txt @@ -0,0 +1,5 @@ +swh.core >= 0.0.14 +swh.model >= 0.0.11 +swh.scheduler +swh.storage >= 0.0.76 +swh.loader.core >= 0.0.13 diff --git a/requirements.txt b/requirements.txt index 809dcd7..1e62953 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,10 +1,6 @@ # Add here external Python modules dependencies, one per line. Module names # should match https://pypi.python.org/pypi names. 
For the full spec or # dependency lines, see https://pip.readthedocs.org/en/1.1/requirements.html vcversioner -swh.core >= 0.0.14 -swh.model >= 0.0.11 -swh.scheduler -swh.storage >= 0.0.76 -swh.loader.core >= 0.0.12 retrying +click diff --git a/resources/dir.ini b/resources/dir.ini index 3d41520..4e7b491 100644 --- a/resources/dir.ini +++ b/resources/dir.ini @@ -1,15 +1,15 @@ [main] storage_class = remote_storage -storage_args = http://localhost:5000/ +storage_args = http://localhost:5002/ send_contents = True send_directories = True send_revisions = True send_releases = True send_occurrences = True content_packet_size = 10000 content_packet_size_bytes = 1073741824 content_packet_block_size_bytes = 104857600 directory_packet_size = 25000 revision_packet_size = 100000 release_packet_size = 100000 occurrence_packet_size = 100000 diff --git a/setup.py b/setup.py index 67fb3d0..8919cf3 100644 --- a/setup.py +++ b/setup.py @@ -1,29 +1,29 @@ from setuptools import setup def parse_requirements(): requirements = [] - with open('requirements.txt') as f: - for line in f.readlines(): - line = line.strip() - if not line or line.startswith('#'): - continue - requirements.append(line) - + for reqf in ('requirements.txt', 'requirements-swh.txt'): + with open(reqf) as f: + for line in f.readlines(): + line = line.strip() + if not line or line.startswith('#'): + continue + requirements.append(line) return requirements setup( name='swh.loader.dir', description='Software Heritage Directory Loader', author='Software Heritage developers', author_email='swh-devel@inria.fr', url='https://forge.softwareheritage.org/diffusion/DLDDIR', packages=['swh.loader.dir', 'swh.loader.dir.tests'], scripts=[], install_requires=parse_requirements(), setup_requires=['vcversioner'], vcversioner={}, include_package_data=True, ) diff --git a/swh.loader.dir.egg-info/PKG-INFO b/swh.loader.dir.egg-info/PKG-INFO index 962ac38..886af48 100644 --- a/swh.loader.dir.egg-info/PKG-INFO +++ 
b/swh.loader.dir.egg-info/PKG-INFO @@ -1,10 +1,10 @@ Metadata-Version: 1.0 Name: swh.loader.dir -Version: 0.0.23 +Version: 0.0.24 Summary: Software Heritage Directory Loader Home-page: https://forge.softwareheritage.org/diffusion/DLDDIR Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Description: UNKNOWN Platform: UNKNOWN diff --git a/swh.loader.dir.egg-info/SOURCES.txt b/swh.loader.dir.egg-info/SOURCES.txt index be5ed18..9d9d6eb 100644 --- a/swh.loader.dir.egg-info/SOURCES.txt +++ b/swh.loader.dir.egg-info/SOURCES.txt @@ -1,32 +1,33 @@ .gitignore AUTHORS LICENSE MANIFEST.in Makefile Makefile.local README +requirements-swh.txt requirements.txt setup.py version.txt bin/swh-check-missing-objects.py bin/swh-loader-dir debian/changelog debian/compat debian/control debian/copyright debian/rules debian/source/format resources/dir.ini resources/loader/dir.ini scratch/walking.py swh.loader.dir.egg-info/PKG-INFO swh.loader.dir.egg-info/SOURCES.txt swh.loader.dir.egg-info/dependency_links.txt swh.loader.dir.egg-info/requires.txt swh.loader.dir.egg-info/top_level.txt swh/loader/dir/__init__.py swh/loader/dir/converters.py swh/loader/dir/loader.py swh/loader/dir/tasks.py swh/loader/dir/tests/test_converters.py swh/loader/dir/tests/test_loader.py \ No newline at end of file diff --git a/swh.loader.dir.egg-info/requires.txt b/swh.loader.dir.egg-info/requires.txt index 9c935b8..8d8eb2a 100644 --- a/swh.loader.dir.egg-info/requires.txt +++ b/swh.loader.dir.egg-info/requires.txt @@ -1,7 +1,8 @@ +click retrying swh.core>=0.0.14 -swh.loader.core>=0.0.12 +swh.loader.core>=0.0.13 swh.model>=0.0.11 swh.scheduler swh.storage>=0.0.76 vcversioner diff --git a/swh/loader/dir/loader.py b/swh/loader/dir/loader.py index 19d759b..2952978 100644 --- a/swh/loader/dir/loader.py +++ b/swh/loader/dir/loader.py @@ -1,220 +1,242 @@ -# Copyright (C) 2015-2016 The Software Heritage developers +# Copyright (C) 2015-2017 The Software Heritage developers # See the 
AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import datetime +import click import os import sys import uuid from swh.loader.core import loader from swh.model import git from swh.model.git import GitType from . import converters class DirLoader(loader.SWHLoader): """A bulk loader for a directory. This will load the content of the directory. + Args: + - dir_path: source of the directory to import + - origin: Dictionary origin + - id: origin's id + - url: url origin we fetched + - type: type of the origin + - revision: Dictionary of information needed, keys are: + - author_name: revision's author name + - author_email: revision's author email + - author_date: timestamp (e.g. 1444054085) + - author_offset: date offset e.g. -0220, +0100 + - committer_name: revision's committer name + - committer_email: revision's committer email + - committer_date: timestamp + - committer_offset: date offset e.g. -0220, +0100 + - type: type of revision dir, tar + - message: synthetic message for the revision + - release: Dictionary of information needed, keys are: + - name: release name + - date: release timestamp (e.g. 1444054085) + - offset: release date offset e.g. -0220, +0100 + - author_name: release author's name + - author_email: release author's email + - comment: release's comment message + - occurrences: List of occurrences as dictionary. + Information needed, keys are: + - branch: occurrence's branch name + - date: validity date (e.g. 2015-01-01 00:00:00+00) + """ - CONFIG_BASE_FILENAME = 'loader/dir.ini' + CONFIG_BASE_FILENAME = 'loader/dir' def __init__(self, logging_class='swh.loader.dir.DirLoader', config=None): super().__init__(logging_class=logging_class, config=config) def list_repo_objs(self, dir_path, revision, release): """List all objects from dir_path. 
Args: - dir_path (path): the directory to list - revision: revision dictionary representation - release: release dictionary representation Returns: a dict containing lists of `Oid`s with keys for each object type: - CONTENT - DIRECTORY """ def _revision_from(tree_hash, revision): full_rev = dict(revision) full_rev['directory'] = tree_hash full_rev = converters.commit_to_revision(full_rev) full_rev['id'] = git.compute_revision_sha1_git(full_rev) return full_rev def _release_from(revision_hash, release): full_rel = dict(release) full_rel['target'] = revision_hash full_rel['target_type'] = 'revision' full_rel = converters.annotated_tag_to_release(full_rel) full_rel['id'] = git.compute_release_sha1_git(full_rel) return full_rel log_id = str(uuid.uuid4()) sdir_path = dir_path.decode('utf-8') self.log.info("Started listing %s" % dir_path, extra={ 'swh_type': 'dir_list_objs_start', 'swh_repo': sdir_path, 'swh_id': log_id, }) objects_per_path = git.compute_hashes_from_directory(dir_path) tree_hash = objects_per_path[dir_path]['checksums']['sha1_git'] full_rev = _revision_from(tree_hash, revision) objects = { GitType.BLOB: list( git.objects_per_type(GitType.BLOB, objects_per_path)), GitType.TREE: list( git.objects_per_type(GitType.TREE, objects_per_path)), GitType.COMM: [full_rev], GitType.RELE: [] } if release and 'name' in release: full_rel = _release_from(full_rev['id'], release) objects[GitType.RELE] = [full_rel] self.log.info("Done listing the objects in %s: %d contents, " "%d directories, %d revisions, %d releases" % ( sdir_path, len(objects[GitType.BLOB]), len(objects[GitType.TREE]), len(objects[GitType.COMM]), len(objects[GitType.RELE]) ), extra={ 'swh_type': 'dir_list_objs_end', 'swh_repo': sdir_path, 'swh_num_blobs': len(objects[GitType.BLOB]), 'swh_num_trees': len(objects[GitType.TREE]), 'swh_num_commits': len(objects[GitType.COMM]), 'swh_num_releases': len(objects[GitType.RELE]), 'swh_id': log_id, }) return objects - def load(self, dir_path, origin, visit, 
revision, release, occurrences): - """Load a directory in backend. + def prepare(self, *args, **kwargs): + self.dir_path, self.origin, self.visit_date, self.revision, self.release, self.occs = args # noqa - Args: - - dir_path: source of the directory to import - - origin: Dictionary origin - - id: origin's id - - url: url origin we fetched - - type: type of the origin - - revision: Dictionary of information needed, keys are: - - author_name: revision's author name - - author_email: revision's author email - - author_date: timestamp (e.g. 1444054085) - - author_offset: date offset e.g. -0220, +0100 - - committer_name: revision's committer name - - committer_email: revision's committer email - - committer_date: timestamp - - committer_offset: date offset e.g. -0220, +0100 - - type: type of revision dir, tar - - message: synthetic message for the revision - - release: Dictionary of information needed, keys are: - - name: release name - - date: release timestamp (e.g. 1444054085) - - offset: release date offset e.g. -0220, +0100 - - author_name: release author's name - - author_email: release author's email - - comment: release's comment message - - occurrences: List of occurrences as dictionary. - Information needed, keys are: - - branch: occurrence's branch name - - date: validity date (e.g. 
2015-01-01 00:00:00+00) + if not os.path.exists(self.dir_path): + warn_msg = 'Skipping inexistant directory %s' % self.dir_path + self.log.error(warn_msg, + extra={ + 'swh_type': 'dir_repo_list_refs', + 'swh_repo': self.dir_path, + 'swh_num_refs': 0, + }) + raise ValueError(warn_msg) - Returns: - Dictionary with the following keys: - - status: mandatory, the status result as a boolean - - stderr: optional when status is True, mandatory otherwise - - objects: the actual objects sent to swh storage + if isinstance(self.dir_path, str): + self.dir_path = self.dir_path.encode(sys.getfilesystemencoding()) + + def get_origin(self): + return self.origin # set in prepare method + + def cleanup(self): + """Nothing to clean up. """ + pass + + def fetch_data(self): def _occurrence_from(origin_id, visit, revision_hash, occurrence): occ = dict(occurrence) occ.update({ 'target': revision_hash, 'target_type': 'revision', 'origin': origin_id, 'visit': visit }) return occ def _occurrences_from(origin_id, visit, revision_hash, occurrences): occs = [] for occurrence in occurrences: occs.append(_occurrence_from(origin_id, visit, revision_hash, occurrence)) return occs - if not os.path.exists(dir_path): - warn_msg = 'Skipping inexistant directory %s' % dir_path - self.log.warn(warn_msg, - extra={ - 'swh_type': 'dir_repo_list_refs', - 'swh_repo': dir_path, - 'swh_num_refs': 0, - }) - return {'status': False, 'stderr': warn_msg} + # to load the repository, walk all objects, compute their hashes + self.objects = self.list_repo_objs( + self.dir_path, self.revision, self.release) - if isinstance(dir_path, str): - dir_path = dir_path.encode(sys.getfilesystemencoding()) - - # to load the repository, walk all objects, compute their hash - objects = self.list_repo_objs(dir_path, revision, release) - - full_rev = objects[GitType.COMM][0] # only 1 revision + full_rev = self.objects[GitType.COMM][0] # only 1 revision # Update objects with release and occurrences - objects[GitType.REFS] = 
_occurrences_from( - origin['id'], visit, full_rev['id'], occurrences) + self.objects[GitType.REFS] = _occurrences_from( + self.origin_id, self.visit, full_rev['id'], self.occs) - # load contents + def store_data(self): + objects = self.objects self.maybe_load_contents(objects[GitType.BLOB]) self.maybe_load_directories(objects[GitType.TREE]) self.maybe_load_revisions(objects[GitType.COMM]) self.maybe_load_releases(objects[GitType.RELE]) self.maybe_load_occurrences(objects[GitType.REFS]) - self.flush() - - return {'status': True, 'objects': objects} - def prepare_and_load(self, dir_path, origin, revision, release, - occurrences): - """First prepare the origin, origin_visit. - Then load the data in storage. - At last, close the origin_visit. - - """ - self.origin_id = self.storage.origin_add_one(origin) - origin['id'] = self.origin_id - - fetch_history_id = self.open_fetch_history() - date_visit = datetime.datetime.now(tz=datetime.timezone.utc) - origin_visit = self.storage.origin_visit_add(origin['id'], date_visit) - visit = origin_visit['visit'] - - try: - self.load(dir_path, origin, visit, revision, release, occurrences) - self.close_fetch_history_success(fetch_history_id) - self.storage.origin_visit_update( - self.origin_id, visit, status='full') - except: - self.close_fetch_history_failure(fetch_history_id) - self.storage.origin_visit_update( - self.origin_id, visit, status='partial') - raise +@click.command() +@click.option('--dir-path', required=1, help='Directory path to load') +@click.option('--origin-url', required=1, help='Origin url for that directory') +@click.option('--visit-date', default=None, help='Visit date time override') +def main(dir_path, origin_url, visit_date): + """Debugging purpose.""" + d = DirLoader() + + origin = { + 'url': origin_url, + 'type': 'dir' + } + + import datetime + commit_time = int(datetime.datetime.now(tz=datetime.timezone.utc).timestamp()) # noqa + + SWH_PERSON = { + 'name': 'Software Heritage', + 'fullname': 'Software 
Heritage', + 'email': 'robot@softwareheritage.org' + } + REVISION_MESSAGE = 'synthetic revision message' + REVISION_TYPE = 'tar' + revision = { + 'date': { + 'timestamp': commit_time, + 'offset': 0, + }, + 'committer_date': { + 'timestamp': commit_time, + 'offset': 0, + }, + 'author': SWH_PERSON, + 'committer': SWH_PERSON, + 'type': REVISION_TYPE, + 'message': REVISION_MESSAGE, + 'metadata': {}, + } + release = None + occurrences = [] + d.load(dir_path, origin, visit_date, revision, release, occurrences) + + +if __name__ == '__main__': + main() diff --git a/swh/loader/dir/tasks.py b/swh/loader/dir/tasks.py index 05a4059..6333b92 100644 --- a/swh/loader/dir/tasks.py +++ b/swh/loader/dir/tasks.py @@ -1,24 +1,25 @@ # Copyright (C) 2015-2016 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from swh.loader.dir.loader import DirLoader from swh.scheduler.task import Task class LoadDirRepository(Task): """Import a directory to Software Heritage """ task_queue = 'swh_loader_dir' - def run(self, dir_path, origin, revision, release, occurrences): - """Import a directory. - - Args: - cf. swh.loader.dir.loader.run docstring + def run(self, dir_path, origin, visit_date, revision, release, + occurrences): + """Import a directory dir_path with origin at visit_date time. + Providing the revision, release, and occurrences.
""" - DirLoader().prepare_and_load( - dir_path, origin, revision, release, occurrences) + loader = DirLoader() + loader.log = self.log + loader.load(dir_path, origin, visit_date, revision, release, + occurrences) diff --git a/swh/loader/dir/tests/test_loader.py b/swh/loader/dir/tests/test_loader.py index 52544de..2c8829a 100644 --- a/swh/loader/dir/tests/test_loader.py +++ b/swh/loader/dir/tests/test_loader.py @@ -1,142 +1,142 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import shutil import subprocess import tempfile import unittest from nose.tools import istest from swh.loader.dir.loader import DirLoader from swh.model.git import GitType class TestLoader(unittest.TestCase): @classmethod def setUpClass(cls): super().setUpClass() cls.tmp_root_path = tempfile.mkdtemp().encode('utf-8') start_path = os.path.dirname(__file__).encode('utf-8') sample_folder_archive = os.path.join(start_path, b'../../../../..', b'swh-storage-testdata', b'dir-folders', b'sample-folder.tgz') cls.root_path = os.path.join(cls.tmp_root_path, b'sample-folder') # uncompress the sample folder subprocess.check_output( ['tar', 'xvf', sample_folder_archive, '-C', cls.tmp_root_path], ) @classmethod def tearDownClass(cls): super().tearDownClass() shutil.rmtree(cls.tmp_root_path) def setUp(self): super().setUp() self.info = { 'storage': { 'cls': 'remote', 'args': { - 'url': 'http://localhost:5000/', + 'url': 'http://localhost:5002/', } }, 'content_size_limit': 104857600, 'log_db': 'dbname=softwareheritage-log', 'directory_packet_size': 25000, 'content_packet_size': 10000, 'send_contents': True, 'send_directories': True, 'content_packet_size_bytes': 1073741824, 'occurrence_packet_size': 100000, 'send_revisions': True, 'revision_packet_size': 100000, 'content_packet_block_size_bytes': 104857600, 
'send_occurrences': True, 'release_packet_size': 100000, 'send_releases': True } self.origin = { 'url': 'file:///dev/null', 'type': 'dir', } self.occurrence = { 'branch': 'master', 'authority_id': 1, 'validity': '2015-01-01 00:00:00+00', } self.revision = { 'author': { 'name': 'swh author', 'email': 'swh@inria.fr', 'fullname': 'swh' }, 'date': { 'timestamp': 1444054085, 'offset': 0 }, 'committer': { 'name': 'swh committer', 'email': 'swh@inria.fr', 'fullname': 'swh' }, 'committer_date': { - 'timestamp': '1444054085', + 'timestamp': 1444054085, 'offset': 0, }, 'type': 'tar', 'message': 'synthetic revision', 'metadata': {'foo': 'bar'}, } self.release = { 'name': 'v0.0.1', 'date': { 'timestamp': 1444054085, 'offset': 0, }, 'author': { 'name': 'swh author', 'fullname': 'swh', 'email': 'swh@inria.fr', }, 'message': 'synthetic release', } self.dirloader = DirLoader(config=self.info) @istest def load_without_storage(self): # when objects = self.dirloader.list_repo_objs( self.root_path, self.revision, self.release) # then self.assertEquals(len(objects), 4, "4 objects types, blob, tree, revision, release") self.assertEquals(len(objects[GitType.BLOB]), 8, "8 contents: 3 files + 5 links") self.assertEquals(len(objects[GitType.TREE]), 5, "5 directories: 4 subdirs + 1 empty + 1 main dir") self.assertEquals(len(objects[GitType.COMM]), 1, "synthetic revision") self.assertEquals(len(objects[GitType.RELE]), 1, "synthetic release") # print('objects: %s\n objects-per-path: %s\n' % # (objects.keys(), # objects_per_path.keys())) diff --git a/version.txt b/version.txt index e185572..a9e941a 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -v0.0.23-0-gd45330a \ No newline at end of file +v0.0.24-0-g06d0cff \ No newline at end of file