diff --git a/AUTHORS b/AUTHORS
--- a/AUTHORS
+++ b/AUTHORS
@@ -1,3 +1,3 @@
-Copyright (C) 2019 The Software Heritage developers
+Copyright (C) 2021 The Software Heritage developers
 
 See http://www.softwareheritage.org/ for more information.
diff --git a/README.md b/README.md
new file mode 100644
--- /dev/null
+++ b/README.md
@@ -0,0 +1,29 @@
+swh-loader-cvs
+==============
+
+The Software Heritage CVS Loader is a tool and a library to walk a local CVS repository
+and inject into the SWH dataset all contained files that weren't known before.
+
+The main entry points are
+
+- :class:`swh.loader.cvs.loader.CvsLoader` for the main cvs loader which ingests content out of
+  a local cvs repository
+
+# CLI run
+
+With the configuration:
+
+/tmp/loader_cvs.yml:
+```
+storage:
+  cls: remote
+  args:
+    url: http://localhost:5002/
+```
+
+Run:
+
+```
+swh loader --config-file /tmp/loader_cvs.yml \
+    run cvs
+```
diff --git a/conftest.py b/conftest.py
new file mode 100644
--- /dev/null
+++ b/conftest.py
@@ -0,0 +1,19 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import pytest
+
+pytest_plugins = [
+    "swh.scheduler.pytest_plugin",
+    "swh.storage.pytest_plugin",
+    "swh.loader.pytest_plugin",
+]
+
+
+@pytest.fixture(scope="session")
+def swh_scheduler_celery_includes(swh_scheduler_celery_includes):
+    """Append the CVS loader task module to the inherited celery includes."""
+    cvs_task_modules = ["swh.loader.cvs.tasks"]
+    return swh_scheduler_celery_includes + cvs_task_modules
diff --git a/mypy.ini b/mypy.ini
--- a/mypy.ini
+++ b/mypy.ini
@@ -8,8 +8,14 @@
 [mypy-pkg_resources.*]
 ignore_missing_imports = True
 
+[mypy-celery.*]
+ignore_missing_imports = True
+
+[mypy-iso8601.*]
+ignore_missing_imports = True
+
 [mypy-pytest.*]
 ignore_missing_imports = True
 
-# [mypy-add_your_lib_here.*]
-# ignore_missing_imports = True
+[mypy-swh.loader.*]
+ignore_missing_imports = True
diff --git 
a/pytest.ini b/pytest.ini
--- a/pytest.ini
+++ b/pytest.ini
@@ -1,2 +1,5 @@
 [pytest]
+
 norecursedirs = docs .*
+markers =
+    fs: execute tests that write to the filesystem
diff --git a/requirements-swh.txt b/requirements-swh.txt
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -1,2 +1,6 @@
 # Add here internal Software Heritage dependencies, one per line.
 swh.core[http] >= 0.3 # [http] is required by swh.core.pytest_plugin
+swh.storage >= 0.11.3
+swh.model >= 0.4.0
+swh.scheduler >= 0.0.39
+swh.loader.core >= 0.18
diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -52,11 +52,10 @@
     use_scm_version=True,
     extras_require={"testing": parse_requirements("test")},
     include_package_data=True,
-    # uncomment when ready
-    # entry_points="""
-    #     [swh.workers]
-    #     loader.cvs=swh.loader.cvs
-    # """,
+    entry_points="""
+        [swh.workers]
+        loader.cvs=swh.loader.cvs:register
+    """,
     classifiers=[
         "Programming Language :: Python :: 3",
         "Intended Audience :: Developers",
diff --git a/swh/loader/__init__.py b/swh/loader/__init__.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/__init__.py
@@ -0,0 +1,4 @@
+from pkgutil import extend_path
+from typing import Iterable
+
+__path__ = extend_path(__path__, __name__)  # type: Iterable[str]
diff --git a/swh/loader/cvs/__init__.py b/swh/loader/cvs/__init__.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/cvs/__init__.py
@@ -0,0 +1,14 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from typing import Any, Dict
+
+
+def register() -> Dict[str, Any]:
+    """Return the task modules and loader class for the swh.workers entry point."""
+    from swh.loader.cvs.loader import CvsLoader
+
+    task_modules = ["%s.tasks" % __name__]
+    return {"task_modules": task_modules, "loader": CvsLoader}
diff --git a/swh/loader/cvs/loader.py b/swh/loader/cvs/loader.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/cvs/loader.py
@@ -0,0 
+1,121 @@
+# Copyright (C) 2015-2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+"""Loader in charge of injecting either new or existing cvs repositories to
+swh-storage.
+
+"""
+from datetime import datetime
+from mmap import ACCESS_WRITE, mmap
+import os
+import pty
+import re
+import shutil
+from subprocess import Popen
+import tempfile
+from typing import Dict, Iterator, List, Optional, Tuple
+
+from swh.loader.core.loader import BaseLoader
+from swh.loader.core.utils import clean_dangling_folders
+from swh.loader.exception import NotFound
+from swh.loader.cvs.cvs import CvsRepo
+from swh.model import from_disk, hashutil
+from swh.model.model import (
+    Content,
+    Directory,
+    Origin,
+    Revision,
+    SkippedContent,
+    Snapshot,
+    SnapshotBranch,
+    TargetType,
+)
+from swh.storage.algos.snapshot import snapshot_get_latest
+from swh.storage.interface import StorageInterface
+
+DEFAULT_BRANCH = b"HEAD"
+
+TEMPORARY_DIR_PREFIX_PATTERN = "swh.loader.cvs."
+
+
+class CvsLoader(BaseLoader):
+    """Swh cvs loader.
+
+    The repository is local. The loader deals with
+    update on an already previously loaded repository.
+
+    Args:
+        storage: swh storage, passed through to ``BaseLoader``
+        url: CVS root url of the repository to load
+        origin_url: unique identifier of the origin in the swh archive;
+            defaults to ``url`` when not provided
+        visit_date: optional visit date, stored on the instance
+        swh_revision: accepted but currently not used by this loader
+        max_content_size: passed through to ``BaseLoader``
+
+    The remaining arguments are stored unchanged as instance attributes.
+
+    """
+
+    visit_type = "cvs"
+
+    def __init__(
+        self,
+        storage: StorageInterface,
+        url: str,
+        origin_url: Optional[str] = None,
+        visit_date: Optional[datetime] = None,
+        destination_path: Optional[str] = None,
+        swh_revision: Optional[str] = None,
+        start_from_scratch: bool = False,
+        temp_directory: str = "/tmp",
+        debug: bool = False,
+        check_revision: int = 0,
+        max_content_size: Optional[int] = None,
+    ):
+        super().__init__(
+            storage=storage,
+            logging_class="swh.loader.cvs.CvsLoader",
+            max_content_size=max_content_size,
+        )
+        self.cvsroot_url = url
+        # origin url as unique identifier for origin in swh archive
+        self.origin_url = origin_url if origin_url else self.cvsroot_url
+        self.debug = debug
+        self.temp_directory = temp_directory
+        self.done = False
+        self.cvsrepo = None
+        # Revision check is configurable
+        self.check_revision = check_revision
+        # internal state used to store swh objects
+        self._contents: List[Content] = []
+        self._skipped_contents: List[SkippedContent] = []
+        self._directories: List[Directory] = []
+        self._revisions: List[Revision] = []
+        self._snapshot: Optional[Snapshot] = None
+        # internal state, current visit
+        self._last_revision = None
+        self._visit_status = "full"
+        # load_status() reads this attribute, so it must always be set.
+        # NOTE(review): initial value assumed to be "uneventful" - confirm
+        self._load_status = "uneventful"
+        self.visit_date = visit_date
+        self.destination_path = destination_path
+        self.start_from_scratch = start_from_scratch
+        self.snapshot = None
+        # state from previous visit
+        self.latest_snapshot = None
+        self.latest_revision = None
+
+    def load_status(self) -> Dict[str, str]:
+        """Return the load status of this visit under the ``status`` key."""
+        return {
+            "status": self._load_status,
+        }
+
+    def visit_status(self) -> str:
+        """Return the status of this visit (initialised to ``"full"``)."""
+        return self._visit_status
+
diff --git a/swh/loader/cvs/py.typed b/swh/loader/cvs/py.typed
new file mode 100644
--- /dev/null
+++ b/swh/loader/cvs/py.typed
@@ -0,0 +1 @@
+# Marker file for PEP 561.
diff --git a/swh/loader/cvs/tasks.py b/swh/loader/cvs/tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/cvs/tasks.py
@@ -0,0 +1,74 @@
+# Copyright (C) 2015-2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from datetime import datetime
+from typing import Optional
+
+from celery import shared_task
+import iso8601
+
+from .loader import CvsLoader
+
+
+def convert_to_datetime(date: Optional[str]) -> Optional[datetime]:
+    """Parse an ISO 8601 date string, returning None when it cannot be parsed.
+
+    Args:
+        date: ISO 8601 formatted date, or None
+
+    Returns:
+        The parsed datetime, or None when ``date`` is missing or invalid.
+
+    """
+    if not date:
+        # nothing to parse; skip the parser instead of relying on it to fail
+        return None
+    try:
+        return iso8601.parse_date(date)
+    except Exception:
+        # deliberate best-effort: an unparsable date means "no override"
+        return None
+
+
+@shared_task(name=__name__ + ".LoadCvsRepository")
+def load_cvs(
+    *,
+    url: Optional[str] = None,
+    origin_url: Optional[str] = None,
+    destination_path: Optional[str] = None,
+    swh_revision: Optional[str] = None,
+    visit_date: Optional[str] = None,
+    start_from_scratch: Optional[bool] = False,
+):
+    """Import a CVS repository
+
+    Args:
+        - url: (mandatory) CVS's repository url to ingest data from
+        - origin_url: Optional original url override to use as origin reference
+          in the archive. If not provided, "url" is used as origin.
+        - destination_path: (optional) root directory to
+          locally retrieve cvs's data
+        - swh_revision: (optional) extra revision hex to
+          start from. Forwarded as-is to CvsLoader
+        - visit_date: Optional ISO 8601 date to override the visit date;
+          ignored when it cannot be parsed
+        - start_from_scratch: Flag to allow starting back the cvs repository from the
+          start
+
+    Returns:
+        the result of ``CvsLoader.load()``
+
+    """
+    loader = CvsLoader.from_configfile(
+        url=url,
+        origin_url=origin_url,
+        destination_path=destination_path,
+        swh_revision=swh_revision,
+        visit_date=convert_to_datetime(visit_date),
+        start_from_scratch=start_from_scratch,
+    )
+    return loader.load()
+
+