diff --git a/requirements-swh.txt b/requirements-swh.txt --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,5 +1,5 @@ swh.core >= 0.0.7 -swh.loader.core >= 0.17.0 +swh.loader.core >= 0.18.0 swh.model >= 0.4.0 swh.scheduler >= 0.0.39 swh.storage >= 0.22.0 diff --git a/swh/loader/git/from_disk.py b/swh/loader/git/from_disk.py --- a/swh/loader/git/from_disk.py +++ b/swh/loader/git/from_disk.py @@ -1,13 +1,13 @@ -# Copyright (C) 2015-2020 The Software Heritage developers +# Copyright (C) 2015-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information - from collections import defaultdict +from datetime import datetime import os import shutil -from typing import Any, Dict, Optional +from typing import Dict, Optional from dulwich.errors import ObjectFormatException @@ -23,6 +23,7 @@ from swh.model import hashutil from swh.model.model import Origin, Snapshot, SnapshotBranch, TargetType from swh.storage.algos.origin import origin_get_latest_visit_status +from swh.storage.interface import StorageInterface from . 
import converters, utils @@ -36,20 +37,27 @@ def __init__( self, - url, - visit_date=None, - directory=None, - config: Optional[Dict[str, Any]] = None, + storage: StorageInterface, + url: str, + visit_date: Optional[datetime] = None, + directory: Optional[str] = None, + save_data_path: Optional[str] = None, + max_content_size: Optional[int] = None, ): - super().__init__(logging_class="swh.loader.git.Loader", config=config) + super().__init__( + storage=storage, + logging_class="swh.loader.git.Loader", + save_data_path=save_data_path, + max_content_size=max_content_size, + ) self.origin_url = url self.visit_date = visit_date self.directory = directory - def prepare_origin_visit(self, *args, **kwargs): + def prepare_origin_visit(self): self.origin = Origin(url=self.origin_url) - def prepare(self, *args, **kwargs): + def prepare(self): self.repo = dulwich.repo.Repo(self.directory) def iter_objects(self): @@ -358,7 +366,7 @@ break return archive_name - def prepare(self, *args, **kwargs): + def prepare(self): """1. Uncompress the archive in temporary location. 2. Prepare as the GitLoaderFromDisk does 3. Load as GitLoaderFromDisk does @@ -376,7 +384,7 @@ self.repo_path, ) self.directory = self.repo_path - super().prepare(*args, **kwargs) + super().prepare() def cleanup(self): """Cleanup the temporary location (if it exists). diff --git a/swh/loader/git/loader.py b/swh/loader/git/loader.py --- a/swh/loader/git/loader.py +++ b/swh/loader/git/loader.py @@ -18,7 +18,6 @@ from dulwich.object_store import ObjectStoreGraphWalker from dulwich.pack import PackData, PackInflater -from swh.core.config import merge_configs from swh.loader.core.loader import DVCSLoader from swh.loader.exception import NotFound from swh.model import hashutil @@ -34,6 +33,7 @@ TargetType, ) from swh.storage.algos.snapshot import snapshot_get_latest +from swh.storage.interface import StorageInterface from . 
import converters, utils @@ -98,11 +98,6 @@ pack_size: int -DEFAULT_CONFIG: Dict[str, Any] = { - "pack_size_bytes": 4 * 1024 * 1024 * 1024, -} - - class GitLoader(DVCSLoader): """A bulk loader for a git repository""" @@ -110,11 +105,14 @@ def __init__( self, + storage: StorageInterface, url: str, base_url: Optional[str] = None, ignore_history: bool = False, repo_representation: Type[RepoRepresentation] = RepoRepresentation, - config: Optional[Dict[str, Any]] = None, + pack_size_bytes: Optional[int] = 4 * 1024 * 1024 * 1024, + save_data_path: Optional[str] = None, + max_content_size: Optional[int] = None, ): """Initialize the bulk updater. @@ -124,13 +122,17 @@ data. """ - super().__init__(logging_class="swh.loader.git.BulkLoader", config=config) - self.config = merge_configs(DEFAULT_CONFIG, self.config) + super().__init__( + storage=storage, + logging_class="swh.loader.git.BulkLoader", + save_data_path=save_data_path, + max_content_size=max_content_size, + ) self.origin_url = url self.base_url = base_url self.ignore_history = ignore_history self.repo_representation = repo_representation - + self.pack_size_bytes = pack_size_bytes # state initialized in fetch_data self.remote_refs: Dict[bytes, bytes] = {} self.symbolic_refs: Dict[bytes, bytes] = {} @@ -154,16 +156,16 @@ origin_url, thin_packs=False ) - size_limit = self.config["pack_size_bytes"] + size_limit = self.pack_size_bytes def do_pack(data: bytes) -> None: cur_size = pack_buffer.tell() would_write = len(data) - if cur_size + would_write > size_limit: + if cur_size + would_write > size_limit: # type: ignore raise IOError( - "Pack file too big for repository %s, " - "limit is %d bytes, current size is %d, " - "would write %d" % (origin_url, size_limit, cur_size, would_write) + f"Pack file too big for repository {origin_url}, " + f"limit is {size_limit} bytes, current size is {cur_size}, " + f"would write {would_write}" ) pack_buffer.write(data) @@ -205,14 +207,14 @@ return id_to_type, type_to_ids - def 
prepare_origin_visit(self, *args, **kwargs) -> None: + def prepare_origin_visit(self) -> None: self.visit_date = datetime.datetime.now(tz=datetime.timezone.utc) self.origin = Origin(url=self.origin_url) def get_full_snapshot(self, origin_url) -> Optional[Snapshot]: return snapshot_get_latest(self.storage, origin_url) - def prepare(self, *args, **kwargs) -> None: + def prepare(self) -> None: assert self.origin is not None prev_snapshot: Optional[Snapshot] = None @@ -528,8 +530,11 @@ default=False, ) def main(origin_url: str, base_url: str, ignore_history: bool) -> Dict[str, Any]: + from swh.storage import get_storage + + storage = get_storage(cls="memory") loader = GitLoader( - origin_url, base_url=base_url, ignore_history=ignore_history, + storage, origin_url, base_url=base_url, ignore_history=ignore_history, ) return loader.load() diff --git a/swh/loader/git/tasks.py b/swh/loader/git/tasks.py --- a/swh/loader/git/tasks.py +++ b/swh/loader/git/tasks.py @@ -1,4 +1,4 @@ -# Copyright (C) 2015-2019 The Software Heritage developers +# Copyright (C) 2015-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -17,7 +17,7 @@ """Import a git repository from a remote location """ - loader = GitLoader(url, base_url=base_url) + loader = GitLoader.from_configfile(url=url, base_url=base_url) return loader.load() @@ -30,7 +30,9 @@ """ visit_date = dateutil.parser.parse(date) - loader = GitLoaderFromDisk(url, directory=directory, visit_date=visit_date) + loader = GitLoaderFromDisk.from_configfile( + url=url, directory=directory, visit_date=visit_date + ) return loader.load() @@ -44,5 +46,7 @@ """ visit_date = dateutil.parser.parse(date) - loader = GitLoaderFromArchive(url, archive_path=archive_path, visit_date=visit_date) + loader = GitLoaderFromArchive.from_configfile( + url=url, archive_path=archive_path, 
visit_date=visit_date + ) return loader.load() diff --git a/swh/loader/git/tests/conftest.py b/swh/loader/git/tests/conftest.py --- a/swh/loader/git/tests/conftest.py +++ b/swh/loader/git/tests/conftest.py @@ -1,4 +1,4 @@ -# Copyright (C) 2018-2020 The Software Heritage developers +# Copyright (C) 2018-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -9,27 +9,31 @@ @pytest.fixture -def swh_loader_config(swh_storage_backend_config) -> Dict[str, Any]: - swh_storage_backend_config["journal_writer"] = {} +def swh_storage_backend_config(swh_storage_backend_config): + """Basic pg storage configuration with no journal collaborator + (to avoid pulling optional dependency on clients of this fixture) + + """ return { + "cls": "filter", "storage": { - "cls": "pipeline", - "steps": [ - {"cls": "filter"}, - { - "cls": "buffer", - "min_batch_size": { - "content": 10, - "content_bytes": 100 * 1024 * 1024, - "directory": 10, - "revision": 10, - "release": 10, - }, - }, - swh_storage_backend_config, - ], + "cls": "buffer", + "min_batch_size": { + "content": 10, + "content_bytes": 100 * 1024 * 1024, + "directory": 10, + "revision": 10, + "release": 10, + }, + "storage": swh_storage_backend_config, }, + } + + +@pytest.fixture +def swh_loader_config(swh_storage_backend_config) -> Dict[str, Any]: + return { + "storage": swh_storage_backend_config, "max_content_size": 100 * 1024 * 1024, - "pack_size_bytes": 4 * 1024 * 1024 * 1024, - "save_data": False, + "save_data_path": None, } diff --git a/swh/loader/git/tests/test_from_disk.py b/swh/loader/git/tests/test_from_disk.py --- a/swh/loader/git/tests/test_from_disk.py +++ b/swh/loader/git/tests/test_from_disk.py @@ -408,7 +408,7 @@ """ @pytest.fixture(autouse=True) - def init(self, swh_config, datadir, tmp_path): + def init(self, swh_storage, datadir, 
tmp_path): archive_name = "testrepo" archive_path = os.path.join(datadir, f"{archive_name}.tgz") tmp_path = str(tmp_path) @@ -417,6 +417,7 @@ ) self.destination_path = os.path.join(tmp_path, archive_name) self.loader = GitLoaderFromDisk( + swh_storage, url=self.repo_url, visit_date=datetime.datetime( 2016, 5, 3, 15, 16, 32, tzinfo=datetime.timezone.utc @@ -430,11 +431,12 @@ """Tests for GitLoaderFromArchive. Only tests common scenario.""" @pytest.fixture(autouse=True) - def init(self, swh_config, datadir, tmp_path): + def init(self, swh_storage, datadir, tmp_path): archive_name = "testrepo" archive_path = os.path.join(datadir, f"{archive_name}.tgz") self.repo_url = archive_path self.loader = GitLoaderFromArchive( + swh_storage, url=self.repo_url, archive_path=archive_path, visit_date=datetime.datetime( diff --git a/swh/loader/git/tests/test_loader.py b/swh/loader/git/tests/test_loader.py --- a/swh/loader/git/tests/test_loader.py +++ b/swh/loader/git/tests/test_loader.py @@ -90,7 +90,7 @@ """ @pytest.fixture(autouse=True) - def init(self, swh_config, datadir, tmp_path): + def init(self, swh_storage, datadir, tmp_path): super().setUp() archive_name = "testrepo" archive_path = os.path.join(datadir, f"{archive_name}.tgz") @@ -99,7 +99,7 @@ archive_path, archive_name, tmp_path=tmp_path ) self.destination_path = os.path.join(tmp_path, archive_name) - self.loader = GitLoader(self.repo_url) + self.loader = GitLoader(swh_storage, self.repo_url) self.repo = dulwich.repo.Repo(self.destination_path) @@ -110,7 +110,7 @@ """ @pytest.fixture(autouse=True) - def init(self, swh_loader_config, datadir, tmp_path): + def init(self, swh_storage, datadir, tmp_path): super().setUp() archive_name = "testrepo" archive_path = os.path.join(datadir, f"{archive_name}.tgz") @@ -120,7 +120,5 @@ ) self.destination_path = os.path.join(tmp_path, archive_name) base_url = f"base://{self.repo_url}" - self.loader = GitLoader( - self.repo_url, base_url=base_url, config=swh_loader_config - ) + 
self.loader = GitLoader(swh_storage, self.repo_url, base_url=base_url) self.repo = dulwich.repo.Repo(self.destination_path)