diff --git a/org/checks.org b/org/checks.org
new file mode 100644
index 0000000..dba287b
--- /dev/null
+++ b/org/checks.org
@@ -0,0 +1,81 @@
+#+title: 'integrity' Checks on vcs
+#+author: ardumont
+
+* hg check
+
+#+BEGIN_SRC sh
+ardumont@uffizi:/tmp/longkeyy-luceneutil$ hg verify
+checking changesets
+checking manifests
+crosschecking files in changesets and manifests
+checking files
+115 files, 552 changesets, 1054 total revisions
+ardumont@uffizi:/tmp/longkeyy-luceneutil$ echo $?
+0
+#+END_SRC
+
+* git check
+
+#+BEGIN_SRC sh
+ardumont@uffizi:/tmp/cassandra-ruby$ git fsck
+Checking object directories: 100% (256/256), done.
+Checking objects: 100% (957/957), done.
+ardumont@uffizi:/tmp/cassandra-ruby$ echo $?
+0
+#+END_SRC
+
+* svn check
+
+#+BEGIN_SRC sh
+$ svnadmin create jdochelper-repo
+$ svnadmin load jdochelper-repo < jdochelper-repo.svndump
+<<< Started new transaction, based on original revision 1
+* editing path : branches ... done.
+* editing path : tags ... done.
+* editing path : trunk ... done.
+* editing path : wiki ... done.
+
+------- Committed revision 1 >>>
+
+<<< Started new transaction, based on original revision 2
+* editing path : wiki/Index.wiki ... done.
+
+------- Committed revision 2 >>>
+
+<<< Started new transaction, based on original revision 3
+* editing path : trunk/JDocHelper ... done.
+
+------- Committed revision 3 >>>
+
+<<< Started new transaction, based on original revision 4
+* editing path : trunk/JDocHelper ... done.
+* editing path : trunk/JDocHelper/.classpath ... done.
+* editing path : trunk/JDocHelper/.project ... done.
+* editing path : trunk/JDocHelper/.settings ... done.
+* editing path : trunk/JDocHelper/.settings/org.eclipse.core.resources.prefs ... done.
+* editing path : trunk/JDocHelper/.settings/org.eclipse.jdt.core.prefs ... done.
+* editing path : trunk/JDocHelper/build.xml ... done.
+* editing path : trunk/JDocHelper/doc ... done.
+* editing path : trunk/JDocHelper/doc/logo.jpg ... done.
+* editing path : trunk/JDocHelper/doc/todos.txt ... done.
+* editing path : trunk/JDocHelper/lib ... done.
+* editing path : trunk/JDocHelper/readme.txt ... done.
+* editing path : trunk/JDocHelper/run.bat ... done.
+* editing path : trunk/JDocHelper/src ... done.
+* editing path : trunk/JDocHelper/src/org ... done.
+* editing path : trunk/JDocHelper/src/org/tacedu ... done.
+* editing path : trunk/JDocHelper/src/org/tacedu/javadochelper ... done.
+* editing path : trunk/JDocHelper/src/org/tacedu/javadochelper/Main.java ... done.
+* editing path : trunk/JDocHelper/src/org/tacedu/javadochelper/MainFrame.java ... done.
+
+------- Committed revision 4 >>>
+
+<<< Started new transaction, based on original revision 5
+* editing path : trunk/JDocHelper/info.txt ... done.
+
+------- Committed revision 5 >>>
+
+$ echo $?
+0
+#+END_SRC
+Note: `cat jdochelper-repo.svndump | svnadmin load jdochelper-repo`
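The common thread in the three transcripts above, and the property the checker below relies on: each VCS's native verification command exits 0 when the repository is consistent. A minimal Python sketch of that principle, not part of the diff; the repository and dump paths are the illustrative ones from the transcripts above.

#+BEGIN_SRC python
# Minimal sketch: a repository is considered ok when its native check
# command exits 0. Paths are the illustrative ones from the org notes.
import subprocess


def repo_ok(repo_dir, cmd):
    """Return True when cmd (e.g. ['git', 'fsck'] or ['hg', 'verify'])
    exits 0 inside repo_dir."""
    return subprocess.call(cmd, cwd=repo_dir) == 0


print(repo_ok('/tmp/cassandra-ruby', ['git', 'fsck']))
print(repo_ok('/tmp/longkeyy-luceneutil', ['hg', 'verify']))

# svn: load the dump into a freshly created repository; a clean exit
# means the dump is consistent.
subprocess.check_call(['svnadmin', 'create', '/tmp/jdochelper-repo'])
with open('/tmp/jdochelper-repo.svndump', 'rb') as dump:
    print(subprocess.call(['svnadmin', 'load', '-q', '/tmp/jdochelper-repo'],
                          stdin=dump) == 0)
#+END_SRC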
diff --git a/swh/fetcher/googlecode/checker.py b/swh/fetcher/googlecode/checker.py
new file mode 100644
index 0000000..4bb0859
--- /dev/null
+++ b/swh/fetcher/googlecode/checker.py
@@ -0,0 +1,149 @@
+# Copyright (C) 2015-2016 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+"""Namespace to deal with checks on git, svn and hg repositories from
+googlecode archives.
+
+System requisites: svn, git, hg, unzip, pigz
+
+"""
+
+import logging
+import os
+import shutil
+import tempfile
+
+from subprocess import PIPE, Popen, check_call
+
+from swh.core import config
+
+from . import utils
+
+
+REPO_TYPE_FILENAME = 'project.json'
+REPO_TYPE_KEY = 'repoType'
+
+
+def basic_check(archive_path, temp_dir, cmd):
+    """Execute a basic integrity check.
+
+    Args:
+        archive_path: the full pathname to the archive to check
+        temp_dir: the temporary directory to load and check the repository
+        cmd: the actual command checking that the repository is ok.
+
+    Returns:
+        True if the check is ok, False otherwise.
+
+    """
+    # all git and hg archives contain one folder with the project name
+    unzip_cmd = ['unzip', '-q', '-o', archive_path, '-d', temp_dir]
+    check_call(unzip_cmd)
+    # the zip contains a folder named after the project name (and this is
+    # the repository path)
+    project_name = os.path.basename(os.path.dirname(archive_path))
+    repo_path = os.path.join(temp_dir, project_name)
+
+    with utils.cwd(repo_path):
+        try:
+            r = check_call(cmd)
+            return r == 0
+        except Exception:
+            return False
+
+
+def check_svn_integrity(archive_path, temp_dir):
+    """Check the repository's svn integrity.
+
+    Args:
+        archive_path: the full pathname to the archive to check
+        temp_dir: the temporary directory to load and check the repository
+
+    Returns:
+        True if the check is ok, False otherwise.
+
+    """
+    project_name = os.path.basename(os.path.dirname(archive_path))
+    repo_path = os.path.join(temp_dir, project_name)
+
+    # create the repository that will be loaded with the dump
+    cmd = ['svnadmin', 'create', repo_path]
+    check_call(cmd)
+
+    try:
+        with Popen(['pigz', '-dc', archive_path], stdout=PIPE) as dump:
+            cmd = ['svnadmin', 'load', '-q', repo_path]
+            r = check_call(cmd, stdin=dump.stdout)
+            return r == 0
+    except Exception:
+        return False
+
+
+def check_integrity(repo_type, archive_path, temp_dir):
+    """Given an archive of type repo_type to uncompress in temp_dir,
+    check its integrity.
+
+    """
+    if repo_type == 'git':
+        return basic_check(archive_path, temp_dir, cmd=['git', 'fsck'])
+
+    if repo_type == 'hg':
+        return basic_check(archive_path, temp_dir, cmd=['hg', 'verify'])
+
+    if repo_type == 'svn':
+        return check_svn_integrity(archive_path, temp_dir)
+
+    raise NotImplementedError("Repository type %s not implemented." % repo_type)
+
+
+class SWHGoogleArchiveChecker(config.SWHConfig):
+    """A google archive 'integrity' checker.
+
+    This checker will:
+
+    - determine the archive's nature (hg, git, svn) by checking the
+      associated project.json file
+    - uncompress the archive in a temporary folder
+    - depending on its nature, check that the archive's integrity is ok:
+      - git: `git fsck`
+      - svn: `pigz -dc foo-repo.svndump.gz | svnadmin load repos/foo-repo`
+      - hg: `hg verify`
+
+    """
+
+    def __init__(self):
+        self.log = logging.getLogger(
+            'swh.fetcher.google.SWHGoogleArchiveChecker')
+
+    def process(self, archive_path, temp_root_dir):
+        """Check that the archive at archive_path is actually ok.
+
+        """
+        self.log.info('Check %s\'s metadata' % archive_path)
+
+        parent_dir = os.path.dirname(archive_path)
+        # contains the repoType field
+        project_json = os.path.join(parent_dir, REPO_TYPE_FILENAME)
+
+        meta = utils.load_meta(project_json)
+        repo_type = meta[REPO_TYPE_KEY]
+
+        # temporary directory where the repository is uncompressed and checked
+        temp_dir = tempfile.mkdtemp(suffix='.swh.fetcher.googlecode',
+                                    prefix='tmp.',
+                                    dir=temp_root_dir)
+
+        self.log.debug('type: %s, archive: %s' % (repo_type, archive_path))
+
+        try:
+            if check_integrity(repo_type, archive_path, temp_dir):
+                self.log.info('%s SUCCESS' % archive_path)
+            else:
+                self.log.error('%s FAILURE' % archive_path)
+
+        finally:
+            # cleanup the temporary representation
+            if os.path.exists(temp_dir):
+                shutil.rmtree(temp_dir)
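An end-to-end sanity sketch for the module above (not part of the diff): build a tiny git repository, zip it in the `<project>/<project>` layout that `basic_check` expects (the archive sits in a directory named after the project, and the zip contains a folder of the same name), then run `check_integrity` on it. This assumes git, zip and unzip are installed and the checker module is importable; the project and archive names are hypothetical.

#+BEGIN_SRC python
# Hedged sanity sketch, assuming the checker module above is importable.
import os
import subprocess
import tempfile

from swh.fetcher.googlecode.checker import check_integrity

with tempfile.TemporaryDirectory() as work:
    project = 'demo-project'               # hypothetical project name
    parent = os.path.join(work, project)   # archive's parent dir == project
    repo = os.path.join(parent, project)   # folder that goes into the zip
    os.makedirs(repo)
    subprocess.check_call(['git', 'init', '-q', repo])

    archive = os.path.join(parent, 'source-archive.zip')  # hypothetical name
    # zip the inner <project> folder so unzip recreates it under temp_dir
    subprocess.check_call(['zip', '-q', '-r', archive, project], cwd=parent)

    with tempfile.TemporaryDirectory() as temp_dir:
        print(check_integrity('git', archive, temp_dir))   # expected: True
#+END_SRC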
diff --git a/swh/fetcher/googlecode/checker_producer.py b/swh/fetcher/googlecode/checker_producer.py
new file mode 100644
index 0000000..c8491f4
--- /dev/null
+++ b/swh/fetcher/googlecode/checker_producer.py
@@ -0,0 +1,33 @@
+# Copyright (C) 2015-2016 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import click
+import sys
+
+
+task_name = 'swh.fetcher.googlecode.tasks.SWHGoogleArchiveCheckerTask'
+
+
+@click.command()
+@click.option('--archive-path',
+              help="Archive path to check")
+@click.option('--temp-dir',
+              help="Temporary folder in which to run the checks on the archive.")
+def produce(archive_path, temp_dir):
+    from swh.scheduler.celery_backend.config import app
+    from swh.fetcher.googlecode import tasks  # noqa
+
+    task = app.tasks[task_name]
+    if archive_path:  # for debugging purposes, a single archive
+        task.delay(archive_path, temp_dir)
+    else:  # otherwise, read archive paths from stdin and enqueue them in batch
+        for archive_path in sys.stdin:
+            archive_path = archive_path.rstrip()
+            print(archive_path)
+            task.delay(archive_path, temp_dir)
+
+
+if __name__ == '__main__':
+    produce()
diff --git a/swh/fetcher/googlecode/tasks.py b/swh/fetcher/googlecode/tasks.py
index 1dbe06a..826a0b5 100644
--- a/swh/fetcher/googlecode/tasks.py
+++ b/swh/fetcher/googlecode/tasks.py
@@ -1,17 +1,39 @@
 # Copyright (C) 2015 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 from swh.scheduler.task import Task
 from .fetcher import SWHGoogleArchiveFetcher
+from .checker import SWHGoogleArchiveChecker
 
 
 class SWHGoogleArchiveFetcherTask(Task):
-    """Main task to fetch files from google code archive server.
+    """Main task to fetch and check source archives from the google code
+    archive server.
+
+    The checks cover:
+    - size
+    - md5
+    as read from the '.json' file associated with the fetched archive.
 
     """
     task_queue = 'swh_fetcher_googlecode_fetch_archive'
 
     def run(self, archive_gs, destination_rootpath):
         SWHGoogleArchiveFetcher().process(archive_gs, destination_rootpath)
+
+
+class SWHGoogleArchiveCheckerTask(Task):
+    """Main task to check fetched archive files from the google code archive
+    server.
+
+    These checks are more thorough, namely:
+    - uncompress the archive in a temporary folder
+    - run an integrity check according to the repository's nature (git, hg, svn)
+
+    """
+    task_queue = 'swh_fetcher_googlecode_check_archive'
+
+    def run(self, path, root_temp_dir):
+        SWHGoogleArchiveChecker().process(path, root_temp_dir)
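For reference, a hedged sketch of what one producer run amounts to, assuming the swh.scheduler Celery app is configured and a worker consumes the 'swh_fetcher_googlecode_check_archive' queue; it mirrors what checker_producer.py does per input line, and the archive paths are hypothetical.

#+BEGIN_SRC python
# Hedged sketch, equivalent to piping the paths below into checker_producer.
from swh.scheduler.celery_backend.config import app
from swh.fetcher.googlecode import tasks  # noqa: registers the task classes

task = app.tasks['swh.fetcher.googlecode.tasks.SWHGoogleArchiveCheckerTask']

for archive_path in ['/srv/googlecode/f/foo/source-archive.zip',   # hypothetical
                     '/srv/googlecode/b/bar/source-archive.zip']:  # hypothetical
    # the worker will uncompress and check each archive under /tmp
    task.delay(archive_path, '/tmp')
#+END_SRC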
diff --git a/swh/fetcher/googlecode/utils.py b/swh/fetcher/googlecode/utils.py
index 9eacf23..6ae3906 100644
--- a/swh/fetcher/googlecode/utils.py
+++ b/swh/fetcher/googlecode/utils.py
@@ -1,64 +1,81 @@
 # Copyright (C) 2015-2016 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import os
 import json
+from contextlib import contextmanager
 
 
 def compute_destination_folder(path):
     """Given a path, compute a destination folder to which downloads
     the remote files.
 
     """
     parent_dir = os.path.dirname(path)
     project_name = os.path.basename(parent_dir)
     parent_ddir = os.path.dirname(parent_dir)
     return os.path.join(parent_ddir, project_name[0], project_name)
 
 
 prefix_source_url_api = 'https://www.googleapis.com/storage/v1/b/google-code-archive-source/o'  # noqa
 prefix_project_meta = 'https://storage.googleapis.com/google-code-archive'
 
 
 def transform(url_gs):
     """Transform input gs:// url into a dictionary with the following
     information.
 
     Returns:
         Dict of the following form:
         - destination folder
         - filename
         - metadata archive url to fetch
         - project metadata url to fetch
 
     """
     url_gs = url_gs.replace('gs://google-code-archive-source/', '')
     filename = os.path.basename(url_gs)
     project_name = os.path.dirname(url_gs)
     url_meta = '%s/%s' % (prefix_source_url_api, url_gs.replace('/', '%2F'))
     url_project_meta = '%s/%s/project.json' % (prefix_project_meta,
                                                project_name)
 
     return {
         'parent_dir': compute_destination_folder(url_gs),
         'filename': filename,
         'url_project_archive_meta': url_meta,
         'url_project_meta': url_project_meta
     }
+
+
+@contextmanager
+def cwd(path):
+    """Contextually change the working directory, then restore the
+    original working directory on exit.
+
+    """
+    prev_cwd = os.getcwd()
+    os.chdir(path)
+    try:
+        yield
+    finally:
+        os.chdir(prev_cwd)
+
+
 def load_meta(filepath):
     """Load the metadata from the given filepath (json file).
 
     It is assumed that the code is called after checking the file
     exists.
 
     Returns:
         Dictionary of data or None if any problem is encountered.
 
     """
     try:
         with open(filepath, 'r') as f:
             return json.loads(f.read())
     except:
         return None
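Finally, a small usage sketch for the cwd() helper added above: run a check from inside a repository directory and restore the caller's working directory afterwards, even if the check raises. The repository path is the illustrative one from the org notes at the top.

#+BEGIN_SRC python
# Usage sketch for the cwd() context manager (illustrative path).
import os
import subprocess

from swh.fetcher.googlecode.utils import cwd

before = os.getcwd()
with cwd('/tmp/cassandra-ruby'):
    subprocess.check_call(['git', 'fsck'])
assert os.getcwd() == before   # the original working directory is restored
#+END_SRC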