diff --git a/swh/loader/git/loader.py b/swh/loader/git/loader.py --- a/swh/loader/git/loader.py +++ b/swh/loader/git/loader.py @@ -257,6 +257,44 @@ class GitLoaderFromArchive(GitLoader): """Load a git repository from an archive. + This loader ingests a git repository compressed into an archive. + The supported archive formats are ``.zip``, ``.tar.gz`` and ``.tgz``. + + From an input archive named ``my-git-repo.zip``, the following layout is + expected in it:: + + my-git-repo/ + ├── .git + │ ├── branches + │ ├── COMMIT_EDITMSG + │ ├── config + │ ├── description + │ ├── HEAD + ... + + Nevertheless, the loader is able to ingest archives with the following + layouts too:: + + . + ├── .git + │ ├── branches + │ ├── COMMIT_EDITMSG + │ ├── config + │ ├── description + │ ├── HEAD + ... + + or:: + + other-repo-name/ + ├── .git + │ ├── branches + │ ├── COMMIT_EDITMSG + │ ├── config + │ ├── description + │ ├── HEAD + ... + + """ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -266,7 +304,12 @@ """Compute the project name from the archive's path. """ - return os.path.basename(os.path.dirname(archive_path)) + archive_name = os.path.basename(archive_path) + for ext in ('.zip', '.tar.gz', '.tgz'): + if archive_name.lower().endswith(ext): + archive_name = archive_name[:-len(ext)] + break + return archive_name def prepare_origin_visit(self, origin_url, archive_path, visit_date): self._prepare_origin_visit(origin_url, visit_date) diff --git a/swh/loader/git/tests/test_loader.py b/swh/loader/git/tests/test_loader.py --- a/swh/loader/git/tests/test_loader.py +++ b/swh/loader/git/tests/test_loader.py @@ -4,8 +4,6 @@ # See top-level LICENSE file for more information import os.path -import zipfile -import tempfile import subprocess from swh.loader.git.loader import GitLoader, GitLoaderFromArchive @@ -19,13 +17,16 @@ # We don't want the project name to be 'resources'. 
return 'testrepo' + def parse_config_file(self, *args, **kwargs): + return TEST_LOADER_CONFIG + CONTENT1 = { '33ab5639bfd8e7b95eb1d8d0b87781d4ffea4d5d', # README v1 '349c4ff7d21f1ec0eda26f3d9284c293e3425417', # README v2 '799c11e348d39f1704022b8354502e2f81f3c037', # file1.txt '4bdb40dfd6ec75cb730e678b5d7786e30170c5fb', # file2.txt - } +} SNAPSHOT_ID = 'bdf3b06d6017e0d9ad6447a73da6ff1ae9efb8f0' @@ -89,7 +90,7 @@ '9ca0c7d6ffa3f9f0de59fd7912e08f11308a1338', 'bd746cd1913721b269b395a56a97baf6755151c2': 'e1d0d894835f91a0f887a4bc8b16f81feefdfbd5', - } +} class BaseGitLoaderTest(BaseLoaderTest): @@ -113,7 +114,7 @@ """ def setUp(self): - super().setUp('testrepo.tgz', True) + super().setUp('testrepo.tgz', uncompress_archive=True) self.loader = GitLoaderTest() self.storage = self.loader.storage @@ -124,12 +125,7 @@ directory=self.destination_path) -class GitLoaderFromArchiveTest(GitLoaderFromArchive): - def parse_config_file(self, *args, **kwargs): - return TEST_LOADER_CONFIG - - -class BaseZipGitLoaderTest(BaseGitLoaderTest): +class BaseGitLoaderFromArchiveTest(BaseGitLoaderTest): """Mixin base loader test to prepare the git repository to uncompress, load and test the results. 
@@ -137,32 +133,10 @@ """ def setUp(self): - super().setUp('testrepo.tgz', True) - self._setup_zip() - self.loader = GitLoaderFromArchiveTest() + super().setUp('testrepo.tgz', uncompress_archive=False) + self.loader = GitLoaderFromArchive() self.storage = self.loader.storage - def _setup_zip(self): - self._zip_file = tempfile.NamedTemporaryFile('ab', suffix='.zip') - dest_dir = os.path.normpath(self.destination_path) + '/' - with zipfile.ZipFile(self._zip_file, 'a') as zip_writer: - for root, dirs, files in os.walk(dest_dir): - assert root.startswith(dest_dir) - relative_root = os.path.join( - 'testrepo', - root[len(dest_dir):]) - for file_ in files: - zip_writer.write( - filename=os.path.join(root, file_), - arcname=os.path.join(relative_root, file_)) - self.destination_path = self._zip_file.name - self.tmp_root_path = None - self.repo_url = 'file://' + self.destination_path - - def tearDown(self): - self._zip_file.close() - super().tearDown() - def load(self): return self.loader.load( origin_url=self.repo_url, @@ -281,7 +255,7 @@ self.assertEqual(self.loader.visit_status(), 'full') -class ZipGitLoaderTest(BaseZipGitLoaderTest, GitLoaderTests): +class GitLoaderFromArchiveTest(BaseGitLoaderFromArchiveTest, GitLoaderTests): """Tests for GitLoaderFromArchive. Imports the common ones from GitLoaderTests.""" pass diff --git a/swh/loader/git/utils.py b/swh/loader/git/utils.py --- a/swh/loader/git/utils.py +++ b/swh/loader/git/utils.py @@ -10,7 +10,7 @@ import shutil import tempfile -from subprocess import call +from swh.core import tarball def init_git_repo_from_archive(project_name, archive_path, @@ -41,11 +41,18 @@ try: # create the repository that will be loaded with the dump - r = call(['unzip', '-q', '-o', archive_path, '-d', temp_dir]) - if r != 0: - raise ValueError('Failed to uncompress archive %s' % archive_path) - + tarball.uncompress(archive_path, temp_dir) repo_path = os.path.join(temp_dir, project_name) + # tarball content may not be as expected (e.g. 
no top level directory + # or a top level directory with a name different from project_name), + # so try to make it loadable anyway + if not os.path.exists(repo_path): + os.mkdir(repo_path) + for root, dirs, files in os.walk(temp_dir): + if '.git' in dirs: + shutil.copytree(os.path.join(root, '.git'), + os.path.join(repo_path, '.git')) + break return temp_dir, repo_path except Exception as e: shutil.rmtree(temp_dir)