Changeset View
Changeset View
Standalone View
Standalone View
swh/loader/git/from_disk.py
# Copyright (C) 2015-2021 The Software Heritage developers | # Copyright (C) 2015-2021 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
from collections import defaultdict | from collections import defaultdict | ||||
from datetime import datetime | from datetime import datetime | ||||
import logging | |||||
import os | import os | ||||
import shutil | import shutil | ||||
from typing import Dict, Optional | from typing import Dict, Optional | ||||
from dulwich.errors import ObjectFormatException | from dulwich.errors import ObjectFormatException | ||||
try: | try: | ||||
from dulwich.errors import EmptyFileException # type: ignore | from dulwich.errors import EmptyFileException # type: ignore | ||||
except ImportError: | except ImportError: | ||||
# dulwich >= 0.20 | # dulwich >= 0.20 | ||||
from dulwich.objects import EmptyFileException | from dulwich.objects import EmptyFileException | ||||
import dulwich.objects | import dulwich.objects | ||||
import dulwich.repo | import dulwich.repo | ||||
from swh.loader.core.loader import DVCSLoader | from swh.loader.core.loader import DVCSLoader | ||||
from swh.model import hashutil | from swh.model import hashutil | ||||
from swh.model.model import Origin, Snapshot, SnapshotBranch, TargetType | from swh.model.model import Origin, Snapshot, SnapshotBranch, TargetType | ||||
from swh.storage.algos.origin import origin_get_latest_visit_status | from swh.storage.algos.origin import origin_get_latest_visit_status | ||||
from swh.storage.interface import StorageInterface | from swh.storage.interface import StorageInterface | ||||
from . import converters, utils | from . import converters, utils | ||||
logger = logging.getLogger(__name__) | |||||
def _check_tag(tag): | def _check_tag(tag): | ||||
"""Copy-paste of dulwich.objects.Tag, minus the tagger and time checks, | """Copy-paste of dulwich.objects.Tag, minus the tagger and time checks, | ||||
which are too strict and error on old tags.""" | which are too strict and error on old tags.""" | ||||
# Copyright (C) 2007 James Westby <jw+debian@jameswestby.net> | # Copyright (C) 2007 James Westby <jw+debian@jameswestby.net> | ||||
# Copyright (C) 2008-2013 Jelmer Vernooij <jelmer@jelmer.uk> | # Copyright (C) 2008-2013 Jelmer Vernooij <jelmer@jelmer.uk> | ||||
# | # | ||||
# Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU | # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU | ||||
▲ Show 20 Lines • Show All 130 Lines • ▼ Show 20 Lines | def get_object(self, oid): | ||||
""" | """ | ||||
try: | try: | ||||
# some errors are raised when reading the object | # some errors are raised when reading the object | ||||
obj = self.repo[oid] | obj = self.repo[oid] | ||||
# some we need to check ourselves | # some we need to check ourselves | ||||
self._check(obj) | self._check(obj) | ||||
except KeyError: | except KeyError: | ||||
_id = oid.decode("utf-8") | _id = oid.decode("utf-8") | ||||
self.log.warn( | logger.warn( | ||||
"object %s not found, skipping" % _id, | "object %s not found, skipping" % _id, | ||||
extra={ | extra={ | ||||
"swh_type": "swh_loader_git_missing_object", | "swh_type": "swh_loader_git_missing_object", | ||||
"swh_object_id": _id, | "swh_object_id": _id, | ||||
"origin_url": self.origin.url, | "origin_url": self.origin.url, | ||||
}, | }, | ||||
) | ) | ||||
return None | return None | ||||
except ObjectFormatException as e: | except ObjectFormatException as e: | ||||
id_ = oid.decode("utf-8") | id_ = oid.decode("utf-8") | ||||
self.log.warn( | logger.warn( | ||||
"object %s malformed (%s), skipping", | "object %s malformed (%s), skipping", | ||||
id_, | id_, | ||||
e.args[0], | e.args[0], | ||||
extra={ | extra={ | ||||
"swh_type": "swh_loader_git_missing_object", | "swh_type": "swh_loader_git_missing_object", | ||||
"swh_object_id": id_, | "swh_object_id": id_, | ||||
"origin_url": self.origin.url, | "origin_url": self.origin.url, | ||||
}, | }, | ||||
) | ) | ||||
return None | return None | ||||
except EmptyFileException: | except EmptyFileException: | ||||
id_ = oid.decode("utf-8") | id_ = oid.decode("utf-8") | ||||
self.log.warn( | logger.warn( | ||||
"object %s corrupted (empty file), skipping", | "object %s corrupted (empty file), skipping", | ||||
id_, | id_, | ||||
extra={ | extra={ | ||||
"swh_type": "swh_loader_git_missing_object", | "swh_type": "swh_loader_git_missing_object", | ||||
"swh_object_id": id_, | "swh_object_id": id_, | ||||
"origin_url": self.origin.url, | "origin_url": self.origin.url, | ||||
}, | }, | ||||
) | ) | ||||
▲ Show 20 Lines • Show All 50 Lines • ▼ Show 20 Lines | class GitLoaderFromDisk(DVCSLoader): | ||||
def get_directories(self):
    """Yield the directory objects that are not yet present in storage."""
    directory_ids = sorted(self.get_directory_ids())
    for missing_id in set(self.storage.directory_missing(directory_ids)):
        # fetch the raw dulwich tree and convert it to a SWH directory
        tree = self.repo[hashutil.hash_to_bytehex(missing_id)]
        yield converters.dulwich_tree_to_directory(tree)
def has_revisions(self):
    """Return True when the repository contains commits to load."""
    commit_ids = self.type_to_ids[b"commit"]
    return len(commit_ids) > 0
def get_revision_ids(self):
    """Get the revision identifiers from the git repository.

    Returns:
        a generator of sha1_git identifiers (bytes), one per commit object
        collected in ``self.type_to_ids``.
    """
    # `oid` rather than `id`: avoid shadowing the builtin `id`
    return (
        hashutil.hash_to_bytes(oid.decode()) for oid in self.type_to_ids[b"commit"]
    )
def get_revisions(self):
    """Yield the revision objects that are not yet present in storage."""
    revision_ids = sorted(self.get_revision_ids())
    for missing_id in set(self.storage.revision_missing(revision_ids)):
        # fetch the raw dulwich commit and convert it to a SWH revision
        commit = self.repo[hashutil.hash_to_bytehex(missing_id)]
        yield converters.dulwich_commit_to_revision(commit)
def has_releases(self):
    """Return True when the repository contains tags to load."""
    tag_ids = self.type_to_ids[b"tag"]
    return len(tag_ids) > 0
def get_release_ids(self):
    """Get the release identifiers from the git repository.

    Returns:
        a generator of sha1_git identifiers (bytes), one per tag object
        collected in ``self.type_to_ids``.
    """
    # `oid` rather than `id`: avoid shadowing the builtin `id`
    return (hashutil.hash_to_bytes(oid.decode()) for oid in self.type_to_ids[b"tag"])
def get_releases(self):
    """Yield the release objects that are not yet present in storage."""
    release_ids = sorted(self.get_release_ids())
    for missing_id in set(self.storage.release_missing(release_ids)):
        # fetch the raw dulwich tag and convert it to a SWH release
        tag = self.repo[hashutil.hash_to_bytehex(missing_id)]
        yield converters.dulwich_tag_to_release(tag)
def get_snapshot(self): | def get_snapshot(self): | ||||
"""Turn the list of branches into a snapshot to load""" | """Turn the list of branches into a snapshot to load""" | ||||
branches: Dict[bytes, Optional[SnapshotBranch]] = {} | branches: Dict[bytes, Optional[SnapshotBranch]] = {} | ||||
for ref, target in self.repo.refs.as_dict().items(): | for ref, target in self.repo.refs.as_dict().items(): | ||||
if utils.ignore_branch_name(ref): | if utils.ignore_branch_name(ref): | ||||
Show All 15 Lines | def get_snapshot(self): | ||||
if target not in branches: | if target not in branches: | ||||
# This handles the case where the pointer is "dangling". | # This handles the case where the pointer is "dangling". | ||||
# There's a chance that a further symbolic reference will | # There's a chance that a further symbolic reference will | ||||
# override this default value, which is totally fine. | # override this default value, which is totally fine. | ||||
dangling_branches[target] = ref | dangling_branches[target] = ref | ||||
branches[target] = None | branches[target] = None | ||||
utils.warn_dangling_branches( | utils.warn_dangling_branches( | ||||
branches, dangling_branches, self.log, self.origin_url | branches, dangling_branches, logger, self.origin_url | ||||
) | ) | ||||
self.snapshot = Snapshot(branches=branches) | self.snapshot = Snapshot(branches=branches) | ||||
return self.snapshot | return self.snapshot | ||||
def save_data(self):
    """No-op: the data is already available on the local disk."""
▲ Show 20 Lines • Show All 76 Lines • ▼ Show 20 Lines | def prepare(self): | ||||
3. Load as GitLoaderFromDisk does | 3. Load as GitLoaderFromDisk does | ||||
""" | """ | ||||
project_name = self.project_name_from_archive(self.archive_path) | project_name = self.project_name_from_archive(self.archive_path) | ||||
self.temp_dir, self.repo_path = utils.init_git_repo_from_archive( | self.temp_dir, self.repo_path = utils.init_git_repo_from_archive( | ||||
project_name, self.archive_path | project_name, self.archive_path | ||||
) | ) | ||||
self.log.info( | logger.info( | ||||
"Project %s - Uncompressing archive %s at %s", | "Project %s - Uncompressing archive %s at %s", | ||||
self.origin_url, | self.origin_url, | ||||
os.path.basename(self.archive_path), | os.path.basename(self.archive_path), | ||||
self.repo_path, | self.repo_path, | ||||
) | ) | ||||
self.directory = self.repo_path | self.directory = self.repo_path | ||||
super().prepare() | super().prepare() | ||||
def cleanup(self):
    """Cleanup the temporary location (if it exists)."""
    if self.temp_dir and os.path.exists(self.temp_dir):
        shutil.rmtree(self.temp_dir)
    # Pass arguments lazily so the message is only formatted when the
    # INFO level is actually enabled, instead of eagerly with `%`.
    logger.info(
        "Project %s - Done injecting %s", self.origin_url, self.repo_path
    )