diff --git a/README b/README
index 2287724..7f740f9 100644
--- a/README
+++ b/README
@@ -1,95 +1,95 @@
SWH-loader-dir
==============

The Software Heritage Directory Loader is a tool and a library to
walk a local directory and inject into the SWH dataset all the files
it contains that are not already known.

Directory loader
================

### Configuration

This is the loader's (or task's) configuration file.

loader/dir.ini:

    [main]

    # access to swh's storage
    storage_class = remote_storage
-    storage_args = http://localhost:5000/
+    storage_args = http://localhost:5002/

    # parameters to condition loading into swh storage
    send_contents = True
    send_directories = True
    send_revisions = True
    send_releases = True
    send_occurrences = True
    content_packet_size = 10000
    content_packet_size_bytes = 1073741824
    directory_packet_size = 25000
    revision_packet_size = 100000
    release_packet_size = 100000
    occurrence_packet_size = 100000

Possible locations for this file:

- ~/.config/swh/loader/dir.ini
- ~/.swh/loader/dir.ini
- /etc/softwareheritage/loader/dir.ini

#### Toplevel

Load a directory directly from code or a Python toplevel:

    from swh.loader.dir.tasks import LoadDirRepository

    dir_path = '/path/to/directory'
    # Fill these in (a filled-in sketch appears at the end of this patch)
    origin = {}
    release = None
    revision = {}
    occurrence = {}

    LoadDirRepository().run(dir_path, origin, revision, release,
                            [occurrence])

#### Celery

Load a directory using Celery. Provided you have a properly configured
Celery worker up and running, worker.ini needs to be updated with the
following keys:

    task_modules = swh.loader.dir.tasks
    task_queues = swh_loader_dir

cf. https://forge.softwareheritage.org/diffusion/DCORE/browse/master/README.md
for more details

You can send the following message to the task queue:

    from swh.loader.dir.tasks import LoadDirRepository

    # Fill these in
    origin = {}
    release = None
    revision = {}
    occurrence = {}

    # Send message to the task queue
    LoadDirRepository().apply_async(('/path/to/dir', origin, revision,
                                     release, [occurrence]))

Directory producer
==================

None
diff --git a/bin/swh-check-missing-objects.py b/bin/swh-check-missing-objects.py
index 49b5e4a..1958e37 100755
--- a/bin/swh-check-missing-objects.py
+++ b/bin/swh-check-missing-objects.py
@@ -1,189 +1,189 @@
#!/usr/bin/env python3

import os
import subprocess

from swh.model.hashutil import hash_path, hash_to_bytes

BATCH_SIZE = 10000

config = {
    # with git data for listing trees
    'dir_path_git': '/home/tony/work/inria/repo/linux-tryouts-git',
    # without anything git related
    'dir_path': '/home/tony/work/inria/repo/linux-tryouts',
    'storage_class': 'remote_storage',
-    'storage_args': ['http://localhost:5000/'],
+    'storage_args': ['http://localhost:5002/'],
}

if config['storage_class'] == 'remote_storage':
    from swh.storage.api.client import RemoteStorage as Storage
else:
    from swh.storage import Storage

storage = Storage(*config['storage_args'])


def list_files_from(rootpath):
    """List recursively all the files under rootpath.

    Yields:
        Absolute path to each file found.

    """
    with subprocess.Popen(
            ['find', '.', '-type', 'f'],
            stdout=subprocess.PIPE, cwd=rootpath) as proc:
        for filepath in proc.stdout:
            yield os.path.join(rootpath, filepath.strip().decode('utf-8'))


def hashfile(filepath):
    """Hash a file according to what the storage api expects.

    """
    hashes = hash_path(filepath)
    hashes.update({'length': os.path.getsize(filepath)})
    return hashes


def check_missing_contents(rootpath):
    print('Folder to check: %s' % rootpath)
    # List of contents to check in storage
    contents_batch = []
    # map of contents indexed by sha1, value is their actual path
    contents_map = {}
    # list of files missing from storage
    content_missings = []
    # batch of contents to check
    count_batch_contents = 0
    # total number of checked contents
    count_checked_contents = 0
    # nb files read
    nb_files = 0

    for filepath in list_files_from(rootpath):
        nb_files += 1
        content_hashes = hashfile(filepath)
        contents_map.update({content_hashes['sha1']: filepath})
        contents_batch.append(content_hashes)
        count_batch_contents += 1
        if count_batch_contents < BATCH_SIZE:  # accumulate content to check
            continue

        print('Checks %s contents' % len(contents_batch))
        for content_missing in storage.content_missing(contents_batch):
            content_missings.append(contents_map[content_missing['sha1']])

        count_checked_contents += count_batch_contents

        # reinitialize batch
        contents_batch = []
        count_batch_contents = 0

    if contents_batch:  # check the remaining partial batch
        contents_batch_len = len(contents_batch)
        print('Checks %s contents' % contents_batch_len)
        for content_missing in storage.content_missing(contents_batch):
            content_missings.append(contents_map[content_missing['sha1']])

        count_checked_contents += contents_batch_len

    print('Number of contents checked: %s' % count_checked_contents)
    print('Number of files: %s' % nb_files)
    print('Stats on missing contents -')
    if len(content_missings) > 0:
        print('Missing files: ')
        for file_missing in content_missings:
            print('- %s' % file_missing)
    else:
        print('Nothing missing!')
    print()


def git_ls_tree(rootpath):
    """Git ls tree from rootpath's latest revision's tree.

    Yields:
        Tuple of (perms, type, hex sha1, name)

    """
    with subprocess.Popen(
            ['git', 'ls-tree', '-r', '-t', 'master^{tree}'],
            stdout=subprocess.PIPE, cwd=rootpath) as proc:
        for line in proc.stdout:
            yield line.strip().decode('utf-8').replace('\t', ' ').split(' ')


def trees(rootpath):
    """Filter the tree entries from rootpath into a structure compliant
    with storage's search api.

    Yields:
        SWH compliant directory structure.

    """
    for _, obj_type, hex_sha1, name in git_ls_tree(rootpath):
        if obj_type == 'tree':
            yield {'id': hash_to_bytes(hex_sha1), 'name': name}


def check_missing_trees(rootpath):
    print('Folder to check: %s' % rootpath)
    # List of dirs to check in storage
    dirs_batch = []
    # map of dirs indexed by sha1, value is their name
    dirs_map = {}
    # list of dirs missing from storage
    dir_missings = []
    # batch of dirs to check
    count_batch_dirs = 0
    # total number of checked dirs
    count_checked_dirs = 0
    # nb trees read
    nb_dirs = 0

    for tree in trees(rootpath):
        nb_dirs += 1
        tree_id = tree['id']
        dirs_map.update({tree_id: tree['name']})
        dirs_batch.append(tree_id)
        count_batch_dirs += 1
        if count_batch_dirs < BATCH_SIZE:  # accumulate dir to check on storage
            continue

        print('Checks %s dirs' % len(dirs_batch))
        for dir_missing in storage.directory_missing(dirs_batch):
            dir_missings.append(dirs_map[dir_missing['id']])

        count_checked_dirs += count_batch_dirs

        # reinitialize batch
        dirs_batch = []
        count_batch_dirs = 0

    if dirs_batch:  # check the remaining partial batch
        dirs_batch_len = len(dirs_batch)
        print('Checks %s dirs' % dirs_batch_len)
        for dir_missing in storage.directory_missing(dirs_batch):
            dir_missings.append(dirs_map[dir_missing['id']])

        count_checked_dirs += dirs_batch_len

    print('Number of dirs checked: %s' % count_checked_dirs)
    print('Number of dirs: %s' % nb_dirs)
    print('Stats on missing dirs -')
    if len(dir_missings) > 0:
        print('Missing dirs: ')
        for missing_name in dir_missings:
            print('- %s' % missing_name)
    else:
        print('Nothing missing!')
    print()


check_missing_contents(config['dir_path'])
check_missing_trees(config['dir_path_git'])
diff --git a/resources/dir.ini b/resources/dir.ini
index 3d41520..4e7b491 100644
--- a/resources/dir.ini
+++ b/resources/dir.ini
@@ -1,15 +1,15 @@
[main]
storage_class = remote_storage
-storage_args = http://localhost:5000/
+storage_args = http://localhost:5002/
send_contents = True
send_directories = True
send_revisions = True
send_releases = True
send_occurrences = True
content_packet_size = 10000
content_packet_size_bytes = 1073741824
content_packet_block_size_bytes = 104857600
directory_packet_size = 25000
revision_packet_size = 100000
release_packet_size = 100000
occurrence_packet_size = 100000
diff --git a/swh/loader/dir/tests/test_loader.py b/swh/loader/dir/tests/test_loader.py
index b204eaf..2c8829a 100644
--- a/swh/loader/dir/tests/test_loader.py
+++ b/swh/loader/dir/tests/test_loader.py
@@ -1,142 +1,142 @@
# Copyright (C) 2015 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import os
import shutil
import subprocess
import tempfile
import unittest

from nose.tools import istest

from swh.loader.dir.loader import DirLoader
from swh.model.git import GitType


class TestLoader(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        super().setUpClass()
        cls.tmp_root_path = tempfile.mkdtemp().encode('utf-8')

        start_path = os.path.dirname(__file__).encode('utf-8')
        sample_folder_archive = os.path.join(start_path,
                                             b'../../../../..',
                                             b'swh-storage-testdata',
                                             b'dir-folders',
                                             b'sample-folder.tgz')

        cls.root_path = os.path.join(cls.tmp_root_path, b'sample-folder')

        # uncompress the sample folder
        subprocess.check_output(
            ['tar', 'xvf', sample_folder_archive, '-C', cls.tmp_root_path],
        )

    @classmethod
    def tearDownClass(cls):
        super().tearDownClass()
        shutil.rmtree(cls.tmp_root_path)

    def setUp(self):
        super().setUp()

        self.info = {
            'storage': {
                'cls': 'remote',
                'args': {
-                    'url': 'http://localhost:5000/',
+                    'url': 'http://localhost:5002/',
                }
            },
            'content_size_limit': 104857600,
            'log_db': 'dbname=softwareheritage-log',
            'directory_packet_size': 25000,
            'content_packet_size': 10000,
            'send_contents': True,
            'send_directories': True,
            'content_packet_size_bytes': 1073741824,
            'occurrence_packet_size': 100000,
            'send_revisions': True,
            'revision_packet_size': 100000,
            'content_packet_block_size_bytes': 104857600,
            'send_occurrences': True,
            'release_packet_size': 100000,
            'send_releases': True
        }

        self.origin = {
            'url': 'file:///dev/null',
            'type': 'dir',
        }

        self.occurrence = {
            'branch': 'master',
            'authority_id': 1,
            'validity': '2015-01-01 00:00:00+00',
        }

        self.revision = {
            'author': {
                'name': 'swh author',
                'email': 'swh@inria.fr',
                'fullname': 'swh'
            },
            'date': {
                'timestamp': 1444054085,
                'offset': 0
            },
            'committer': {
                'name': 'swh committer',
                'email': 'swh@inria.fr',
                'fullname': 'swh'
            },
            'committer_date': {
                'timestamp': 1444054085,
                'offset': 0,
            },
            'type': 'tar',
            'message': 'synthetic revision',
            'metadata': {'foo': 'bar'},
        }

        self.release = {
            'name': 'v0.0.1',
            'date': {
                'timestamp': 1444054085,
                'offset': 0,
            },
            'author': {
                'name': 'swh author',
                'fullname': 'swh',
                'email': 'swh@inria.fr',
            },
            'message': 'synthetic release',
        }

        self.dirloader = DirLoader(config=self.info)

    @istest
    def load_without_storage(self):
        # when
        objects = self.dirloader.list_repo_objs(
            self.root_path, self.revision, self.release)

        # then
        self.assertEqual(len(objects), 4,
                         "4 object types: blob, tree, revision, release")
        self.assertEqual(len(objects[GitType.BLOB]), 8,
                         "8 contents: 3 files + 5 links")
        self.assertEqual(len(objects[GitType.TREE]), 5,
                         "5 directories: 4 subdirs (one empty) + 1 root dir")
        self.assertEqual(len(objects[GitType.COMM]), 1,
                         "synthetic revision")
        self.assertEqual(len(objects[GitType.RELE]), 1,
                         "synthetic release")

        # print('objects: %s\n objects-per-path: %s\n' %
        #       (objects.keys(),
        #        objects_per_path.keys()))
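As promised in the README's toplevel example, here is a filled-in sketch of
that invocation. It reuses the synthetic origin, revision, release and
occurrence values from the test above; the directory path is hypothetical,
and a storage must be reachable at the configured URL for the load to
succeed:

    from swh.loader.dir.tasks import LoadDirRepository

    dir_path = '/path/to/directory'  # hypothetical local directory to load

    origin = {'url': 'file:///dev/null', 'type': 'dir'}

    release = None  # or a dict like the test's self.release

    revision = {
        'author': {'name': 'swh author', 'email': 'swh@inria.fr',
                   'fullname': 'swh'},
        'date': {'timestamp': 1444054085, 'offset': 0},
        'committer': {'name': 'swh committer', 'email': 'swh@inria.fr',
                      'fullname': 'swh'},
        'committer_date': {'timestamp': 1444054085, 'offset': 0},
        'type': 'tar',
        'message': 'synthetic revision',
        'metadata': {'foo': 'bar'},
    }

    occurrence = {
        'branch': 'master',
        'authority_id': 1,
        'validity': '2015-01-01 00:00:00+00',
    }

    LoadDirRepository().run(dir_path, origin, revision, release,
                            [occurrence])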