Page MenuHomeSoftware Heritage

from_disk.py
No OneTemporary

from_disk.py

# Copyright (C) 2015-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from collections import defaultdict
from datetime import datetime
import os
import shutil
from typing import Dict, Optional
from dulwich.errors import ObjectFormatException
try:
from dulwich.errors import EmptyFileException # type: ignore
except ImportError:
# dulwich >= 0.20
from dulwich.objects import EmptyFileException
import dulwich.objects
import dulwich.repo
from swh.loader.core.loader import DVCSLoader
from swh.model import hashutil
from swh.model.model import Origin, Snapshot, SnapshotBranch, TargetType
from swh.storage.algos.origin import origin_get_latest_visit_status
from swh.storage.interface import StorageInterface
from . import converters, utils
def _check_tag(tag):
    """Validate a dulwich tag object with relaxed tagger/time requirements.

    Adapted from ``dulwich.objects.Tag.check``: there is no tagger check
    here, and the tag time is only validated when it is set. Per the original
    note, the upstream checks are too strict and error on old tags.

    Args:
        tag: the dulwich tag object to validate.

    Raises:
        ObjectFormatException: if the tag name is empty or the tag headers
            appear in an unexpected order. The dulwich helpers called here
            perform the remaining checks.
    """
    # Copyright (C) 2007 James Westby <jw+debian@jameswestby.net>
    # Copyright (C) 2008-2013 Jelmer Vernooij <jelmer@jelmer.uk>
    #
    # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU
    # General Public License as public by the Free Software Foundation; version 2.0
    # or (at your option) any later version. You can redistribute it and/or
    # modify it under the terms of either of these two licenses.
    #
    # Unless required by applicable law or agreed to in writing, software
    # distributed under the License is distributed on an "AS IS" BASIS,
    # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    # See the License for the specific language governing permissions and
    # limitations under the License.
    #
    # You should have received a copy of the licenses; if not, see
    # <http://www.gnu.org/licenses/> for a copy of the GNU General Public License
    # and <http://www.apache.org/licenses/LICENSE-2.0> for a copy of the Apache
    # License, Version 2.0.
    dulwich.objects.ShaFile.check(tag)

    # Mandatory members, checked in the same order as upstream.
    for member, problem in (
        ("_object_sha", "missing object sha"),
        ("_object_class", "missing object type"),
        ("_name", "missing tag name"),
    ):
        tag._check_has_member(member, problem)

    if not tag._name:
        raise ObjectFormatException("empty tag name")

    dulwich.objects.check_hexsha(tag._object_sha, "invalid object sha")

    # The time is only validated when one was actually recorded.
    if tag._tag_time is not None:
        dulwich.objects.check_time(tag._tag_time)

    from dulwich.objects import (
        _OBJECT_HEADER,
        _TAG_HEADER,
        _TAGGER_HEADER,
        _TYPE_HEADER,
    )

    # header -> (header that must come immediately before it, error message)
    expected_predecessor = {
        _OBJECT_HEADER: (None, "unexpected object"),
        _TYPE_HEADER: (_OBJECT_HEADER, "unexpected type"),
        _TAG_HEADER: (_TYPE_HEADER, "unexpected tag name"),
        _TAGGER_HEADER: (_TAG_HEADER, "unexpected tagger"),
    }

    previous = None
    for header, _ in dulwich.objects._parse_message(tag._chunked_text):
        rule = expected_predecessor.get(header)
        if rule is not None and previous != rule[0]:
            raise ObjectFormatException(rule[1])
        previous = header
class GitLoaderFromDisk(DVCSLoader):
    """Load a git repository from a directory.

    The repository is opened with dulwich in :meth:`prepare`; its objects are
    then enumerated, validated and grouped by type in :meth:`fetch_data`, and
    the ``get_*`` methods convert the ones reported missing by the storage.
    """

    visit_type = "git"

    def __init__(
        self,
        storage: StorageInterface,
        url: str,
        visit_date: Optional[datetime] = None,
        directory: Optional[str] = None,
        save_data_path: Optional[str] = None,
        max_content_size: Optional[int] = None,
    ):
        """Initialize the loader.

        Args:
            storage: storage forwarded to the parent loader and queried for
                missing objects.
            url: origin url of the repository being loaded.
            visit_date: date of the visit, kept on the instance.
            directory: path of the on-disk git repository to open.
            save_data_path: forwarded to the parent loader.
            max_content_size: forwarded to the parent loader.
        """
        super().__init__(
            storage=storage,
            save_data_path=save_data_path,
            max_content_size=max_content_size,
        )
        self.origin_url = url
        self.visit_date = visit_date
        self.directory = directory

    def prepare_origin_visit(self):
        """Build the origin model object from the origin url."""
        self.origin = Origin(url=self.origin_url)

    def prepare(self):
        """Open the git repository located at ``self.directory``."""
        self.repo = dulwich.repo.Repo(self.directory)

    def iter_objects(self):
        """Yield the identifier of every object known to the repository.

        Packed objects come first (each pack's entries sorted by their
        offset in the pack), followed by loose objects and objects from
        alternates.
        """
        object_store = self.repo.object_store

        for pack in object_store.packs:
            # index entries are (sha, offset, crc32) tuples; order by offset
            entries = sorted(pack.index.iterentries(), key=lambda entry: entry[1])
            for sha, _offset, _crc32 in entries:
                yield hashutil.hash_to_bytehex(sha)

        yield from object_store._iter_loose_objects()
        yield from object_store._iter_alternate_objects()

    def _check(self, obj):
        """Check the object's repository representation.

        If any errors in check exists, an ObjectFormatException is
        raised.

        Args:
            obj (object): Dulwich object read from the repository.

        Raises:
            ObjectFormatException: if the object fails the dulwich checks or
                the additional date checks.

        """
        if isinstance(obj, dulwich.objects.Tag):
            # tags go through the relaxed module-level check
            _check_tag(obj)
        else:
            obj.check()

        try:
            # For additional checks on dulwich objects with date
            # for now, only checks on *time
            if isinstance(obj, dulwich.objects.Commit):
                utils.check_date_time(obj._commit_time)
                utils.check_date_time(obj._author_time)
            elif isinstance(obj, dulwich.objects.Tag):
                tag_time = obj._tag_time
                if tag_time:
                    utils.check_date_time(tag_time)
        except Exception as e:
            # Normalize any failure to the exception type get_object handles,
            # keeping the original error as the explicit cause.
            raise ObjectFormatException(e) from e

    def get_object(self, oid):
        """Given an object id, return the object if it is found and not
        malformed in some way.

        Args:
            oid (bytes): the object's identifier

        Returns:
            The object if found without malformation, None otherwise (a
            warning is logged for missing, malformed and empty objects).

        """
        try:
            # some errors are raised when reading the object
            obj = self.repo[oid]
            # some we need to check ourselves
            self._check(obj)
        except KeyError:
            id_ = oid.decode("utf-8")
            self.log.warning(
                "object %s not found, skipping",
                id_,
                extra={
                    "swh_type": "swh_loader_git_missing_object",
                    "swh_object_id": id_,
                    "origin_url": self.origin.url,
                },
            )
            return None
        except ObjectFormatException as e:
            id_ = oid.decode("utf-8")
            self.log.warning(
                "object %s malformed (%s), skipping",
                id_,
                # guard against exceptions raised without any argument
                e.args[0] if e.args else e,
                extra={
                    "swh_type": "swh_loader_git_missing_object",
                    "swh_object_id": id_,
                    "origin_url": self.origin.url,
                },
            )
            return None
        except EmptyFileException:
            id_ = oid.decode("utf-8")
            self.log.warning(
                "object %s corrupted (empty file), skipping",
                id_,
                extra={
                    "swh_type": "swh_loader_git_missing_object",
                    "swh_object_id": id_,
                    "origin_url": self.origin.url,
                },
            )
            return None
        else:
            return obj

    def fetch_data(self):
        """Fetch the data from the data source

        Records the snapshot id of the latest visit status that has a
        snapshot (None when there is none) in ``self.previous_snapshot_id``,
        then groups the ids of all readable objects by dulwich type name in
        ``self.type_to_ids``.
        """
        visit_status = origin_get_latest_visit_status(
            self.storage, self.origin_url, require_snapshot=True
        )
        self.previous_snapshot_id = (
            None if visit_status is None else visit_status.snapshot
        )

        type_to_ids = defaultdict(list)
        for oid in self.iter_objects():
            obj = self.get_object(oid)
            if obj is None:
                # missing or malformed object, already logged by get_object
                continue
            type_to_ids[obj.type_name].append(oid)

        self.type_to_ids = type_to_ids

    def has_contents(self):
        """Checks whether we need to load contents"""
        return bool(self.type_to_ids[b"blob"])

    def get_content_ids(self):
        """Get the content identifiers from the git repository"""
        for oid in self.type_to_ids[b"blob"]:
            yield converters.dulwich_blob_to_content_id(self.repo[oid])

    def get_contents(self):
        """Get the contents that need to be loaded"""
        missing_contents = set(
            self.storage.content_missing(self.get_content_ids(), "sha1_git")
        )

        for oid in missing_contents:
            yield converters.dulwich_blob_to_content(
                self.repo[hashutil.hash_to_bytehex(oid)]
            )

    def has_directories(self):
        """Checks whether we need to load directories"""
        return bool(self.type_to_ids[b"tree"])

    def get_directory_ids(self):
        """Get the directory identifiers from the git repository"""
        return (
            hashutil.hash_to_bytes(oid.decode()) for oid in self.type_to_ids[b"tree"]
        )

    def get_directories(self):
        """Get the directories that need to be loaded"""
        missing_dirs = set(
            self.storage.directory_missing(sorted(self.get_directory_ids()))
        )

        for oid in missing_dirs:
            yield converters.dulwich_tree_to_directory(
                self.repo[hashutil.hash_to_bytehex(oid)], log=self.log
            )

    def has_revisions(self):
        """Checks whether we need to load revisions"""
        return bool(self.type_to_ids[b"commit"])

    def get_revision_ids(self):
        """Get the revision identifiers from the git repository"""
        return (
            hashutil.hash_to_bytes(oid.decode())
            for oid in self.type_to_ids[b"commit"]
        )

    def get_revisions(self):
        """Get the revisions that need to be loaded"""
        missing_revs = set(
            self.storage.revision_missing(sorted(self.get_revision_ids()))
        )

        for oid in missing_revs:
            yield converters.dulwich_commit_to_revision(
                self.repo[hashutil.hash_to_bytehex(oid)], log=self.log
            )

    def has_releases(self):
        """Checks whether we need to load releases"""
        return bool(self.type_to_ids[b"tag"])

    def get_release_ids(self):
        """Get the release identifiers from the git repository"""
        return (
            hashutil.hash_to_bytes(oid.decode()) for oid in self.type_to_ids[b"tag"]
        )

    def get_releases(self):
        """Get the releases that need to be loaded"""
        missing_rels = set(self.storage.release_missing(sorted(self.get_release_ids())))

        for oid in missing_rels:
            yield converters.dulwich_tag_to_release(
                self.repo[hashutil.hash_to_bytehex(oid)], log=self.log
            )

    def get_snapshot(self):
        """Turn the list of branches into a snapshot to load

        Returns:
            The snapshot, which is also stored in ``self.snapshot``. Branches
            whose target cannot be read, and dangling symbolic-reference
            targets, are mapped to None.
        """
        branches: Dict[bytes, Optional[SnapshotBranch]] = {}

        for ref, target in self.repo.refs.as_dict().items():
            if utils.ignore_branch_name(ref):
                continue

            obj = self.get_object(target)
            # Explicit None test: dulwich trees define __len__, so an empty
            # tree is falsy and would wrongly be treated as a missing target.
            if obj is not None:
                target_type = converters.DULWICH_TARGET_TYPES[obj.type_name]
                branches[ref] = SnapshotBranch(
                    target=hashutil.bytehex_to_hash(target), target_type=target_type,
                )
            else:
                branches[ref] = None

        dangling_branches = {}
        for ref, target in self.repo.refs.get_symrefs().items():
            if utils.ignore_branch_name(ref):
                continue
            branches[ref] = SnapshotBranch(target=target, target_type=TargetType.ALIAS)
            if target not in branches:
                # This handles the case where the pointer is "dangling".
                # There's a chance that a further symbolic reference will
                # override this default value, which is totally fine.
                dangling_branches[target] = ref
                branches[target] = None

        utils.warn_dangling_branches(
            branches, dangling_branches, self.log, self.origin_url
        )

        self.snapshot = Snapshot(branches=branches)
        return self.snapshot

    def save_data(self):
        """We already have the data locally, no need to save it"""
        pass

    def load_status(self):
        """The load was eventful if the current occurrences are different to
        the ones we retrieved at the beginning of the run

        Returns:
            A dict with a single ``status`` key set to ``eventful`` or
            ``uneventful``.
        """
        if self.previous_snapshot_id:
            eventful = self.snapshot.id != self.previous_snapshot_id
        else:
            # no previous snapshot: eventful as soon as there is any branch
            eventful = bool(self.snapshot.branches)

        return {"status": ("eventful" if eventful else "uneventful")}
class GitLoaderFromArchive(GitLoaderFromDisk):
    """Load a git repository from an archive.

    This loader ingests a git repository compressed into an archive.
    The supported archive formats are ``.zip`` and ``.tar.gz``.

    From an input tarball named ``my-git-repo.zip``, the following layout is
    expected in it::

        my-git-repo/
        ├── .git
        │   ├── branches
        │   ├── COMMIT_EDITMSG
        │   ├── config
        │   ├── description
        │   ├── HEAD
        ...

    Nevertheless, the loader is able to ingest tarballs with the following
    layouts too::

        .
        ├── .git
        │   ├── branches
        │   ├── COMMIT_EDITMSG
        │   ├── config
        │   ├── description
        │   ├── HEAD
        ...

    or::

        other-repo-name/
        ├── .git
        │   ├── branches
        │   ├── COMMIT_EDITMSG
        │   ├── config
        │   ├── description
        │   ├── HEAD
        ...

    """

    def __init__(self, *args, archive_path, **kwargs):
        """Initialize the loader.

        Args:
            *args: positional arguments forwarded to GitLoaderFromDisk.
            archive_path: path of the archive holding the git repository
                (keyword-only).
            **kwargs: keyword arguments forwarded to GitLoaderFromDisk.
        """
        super().__init__(*args, **kwargs)
        # Both are set by prepare() once the archive has been uncompressed.
        self.temp_dir = self.repo_path = None
        self.archive_path = archive_path

    def project_name_from_archive(self, archive_path):
        """Compute the project name from the archive's path.

        The name is the archive's base name with the first matching suffix
        among ``.zip``, ``.tar.gz`` and ``.tgz`` removed (matched
        case-insensitively); it is returned unchanged when none matches.

        """
        archive_name = os.path.basename(archive_path)
        for ext in (".zip", ".tar.gz", ".tgz"):
            if archive_name.lower().endswith(ext):
                archive_name = archive_name[: -len(ext)]
                break
        return archive_name

    def prepare(self):
        """1. Uncompress the archive in temporary location.
           2. Prepare as the GitLoaderFromDisk does
           3. Load as GitLoaderFromDisk does

        """
        project_name = self.project_name_from_archive(self.archive_path)
        self.temp_dir, self.repo_path = utils.init_git_repo_from_archive(
            project_name, self.archive_path
        )

        self.log.info(
            "Project %s - Uncompressing archive %s at %s",
            self.origin_url,
            os.path.basename(self.archive_path),
            self.repo_path,
        )
        # Point the parent loader at the uncompressed repository.
        self.directory = self.repo_path
        super().prepare()

    def cleanup(self):
        """Cleanup the temporary location (if it exists).

        """
        if self.temp_dir and os.path.exists(self.temp_dir):
            shutil.rmtree(self.temp_dir)
        # Lazy logging arguments, consistent with prepare().
        self.log.info(
            "Project %s - Done injecting %s", self.origin_url, self.repo_path
        )

File Metadata

Mime Type
text/x-python
Expires
Sat, Jun 21, 5:44 PM (1 w, 5 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3304990

Event Timeline