Changeset View
Changeset View
Standalone View
Standalone View
swh/loader/git/from_disk.py
# Copyright (C) 2015-2020 The Software Heritage developers | # Copyright (C) 2015-2021 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
from collections import defaultdict | from collections import defaultdict | ||||
from datetime import datetime | |||||
import os | import os | ||||
import shutil | import shutil | ||||
from typing import Any, Dict, Optional | from typing import Dict, Optional | ||||
from dulwich.errors import ObjectFormatException | from dulwich.errors import ObjectFormatException | ||||
try: | try: | ||||
from dulwich.errors import EmptyFileException # type: ignore | from dulwich.errors import EmptyFileException # type: ignore | ||||
except ImportError: | except ImportError: | ||||
# dulwich >= 0.20 | # dulwich >= 0.20 | ||||
from dulwich.objects import EmptyFileException | from dulwich.objects import EmptyFileException | ||||
import dulwich.repo | import dulwich.repo | ||||
from swh.loader.core.loader import DVCSLoader | from swh.loader.core.loader import DVCSLoader | ||||
from swh.model import hashutil | from swh.model import hashutil | ||||
from swh.model.model import Origin, Snapshot, SnapshotBranch, TargetType | from swh.model.model import Origin, Snapshot, SnapshotBranch, TargetType | ||||
from swh.storage.algos.origin import origin_get_latest_visit_status | from swh.storage.algos.origin import origin_get_latest_visit_status | ||||
from swh.storage.interface import StorageInterface | |||||
from . import converters, utils | from . import converters, utils | ||||
class GitLoaderFromDisk(DVCSLoader): | class GitLoaderFromDisk(DVCSLoader): | ||||
"""Load a git repository from a directory. | """Load a git repository from a directory. | ||||
""" | """ | ||||
visit_type = "git" | visit_type = "git" | ||||
def __init__( | def __init__( | ||||
self, | self, | ||||
url, | storage: StorageInterface, | ||||
visit_date=None, | url: str, | ||||
directory=None, | visit_date: Optional[datetime] = None, | ||||
config: Optional[Dict[str, Any]] = None, | directory: Optional[str] = None, | ||||
save_data_path: Optional[str] = None, | |||||
max_content_size: Optional[int] = None, | |||||
): | ): | ||||
super().__init__(logging_class="swh.loader.git.Loader", config=config) | super().__init__( | ||||
storage=storage, | |||||
anlambert: Maybe `logging_class` could be set to the real loader class name? | |||||
Done Inline Actions: Yes, I could even drop it now, as the default is to fall back to what you suggest (in loader-core). ardumont: Yes, I could even drop it now, as the default is to fall back to what you suggest (in loader… | |||||
save_data_path=save_data_path, | |||||
max_content_size=max_content_size, | |||||
) | |||||
self.origin_url = url | self.origin_url = url | ||||
self.visit_date = visit_date | self.visit_date = visit_date | ||||
self.directory = directory | self.directory = directory | ||||
def prepare_origin_visit(self, *args, **kwargs): | def prepare_origin_visit(self): | ||||
self.origin = Origin(url=self.origin_url) | self.origin = Origin(url=self.origin_url) | ||||
def prepare(self, *args, **kwargs): | def prepare(self): | ||||
self.repo = dulwich.repo.Repo(self.directory) | self.repo = dulwich.repo.Repo(self.directory) | ||||
def iter_objects(self): | def iter_objects(self): | ||||
object_store = self.repo.object_store | object_store = self.repo.object_store | ||||
for pack in object_store.packs: | for pack in object_store.packs: | ||||
objs = list(pack.index.iterentries()) | objs = list(pack.index.iterentries()) | ||||
objs.sort(key=lambda x: x[1]) | objs.sort(key=lambda x: x[1]) | ||||
▲ Show 20 Lines • Show All 292 Lines • ▼ Show 20 Lines | def project_name_from_archive(self, archive_path): | ||||
""" | """ | ||||
archive_name = os.path.basename(archive_path) | archive_name = os.path.basename(archive_path) | ||||
for ext in (".zip", ".tar.gz", ".tgz"): | for ext in (".zip", ".tar.gz", ".tgz"): | ||||
if archive_name.lower().endswith(ext): | if archive_name.lower().endswith(ext): | ||||
archive_name = archive_name[: -len(ext)] | archive_name = archive_name[: -len(ext)] | ||||
break | break | ||||
return archive_name | return archive_name | ||||
def prepare(self, *args, **kwargs): | def prepare(self): | ||||
"""1. Uncompress the archive in temporary location. | """1. Uncompress the archive in temporary location. | ||||
2. Prepare as the GitLoaderFromDisk does | 2. Prepare as the GitLoaderFromDisk does | ||||
3. Load as GitLoaderFromDisk does | 3. Load as GitLoaderFromDisk does | ||||
""" | """ | ||||
project_name = self.project_name_from_archive(self.archive_path) | project_name = self.project_name_from_archive(self.archive_path) | ||||
self.temp_dir, self.repo_path = utils.init_git_repo_from_archive( | self.temp_dir, self.repo_path = utils.init_git_repo_from_archive( | ||||
project_name, self.archive_path | project_name, self.archive_path | ||||
) | ) | ||||
self.log.info( | self.log.info( | ||||
"Project %s - Uncompressing archive %s at %s", | "Project %s - Uncompressing archive %s at %s", | ||||
self.origin_url, | self.origin_url, | ||||
os.path.basename(self.archive_path), | os.path.basename(self.archive_path), | ||||
self.repo_path, | self.repo_path, | ||||
) | ) | ||||
self.directory = self.repo_path | self.directory = self.repo_path | ||||
super().prepare(*args, **kwargs) | super().prepare() | ||||
def cleanup(self): | def cleanup(self): | ||||
"""Cleanup the temporary location (if it exists). | """Cleanup the temporary location (if it exists). | ||||
""" | """ | ||||
if self.temp_dir and os.path.exists(self.temp_dir): | if self.temp_dir and os.path.exists(self.temp_dir): | ||||
shutil.rmtree(self.temp_dir) | shutil.rmtree(self.temp_dir) | ||||
self.log.info( | self.log.info( | ||||
"Project %s - Done injecting %s" % (self.origin_url, self.repo_path) | "Project %s - Done injecting %s" % (self.origin_url, self.repo_path) | ||||
) | ) |
Maybe `logging_class` could be set to the real loader class name?