D8080.diff

diff --git a/swh/model/model.py b/swh/model/model.py
--- a/swh/model/model.py
+++ b/swh/model/model.py
@@ -16,10 +16,11 @@
"""
from abc import ABCMeta, abstractmethod
+import collections
import datetime
from enum import Enum
import hashlib
-from typing import Any, Dict, Iterable, Optional, Tuple, Type, TypeVar, Union
+from typing import Any, Dict, Iterable, List, Optional, Tuple, Type, TypeVar, Union
import attr
from attrs_strict import AttributeTypeError
@@ -29,7 +30,7 @@
from . import git_objects
from .collections import ImmutableDict
-from .hashutil import DEFAULT_ALGORITHMS, MultiHash, hash_to_hex
+from .hashutil import DEFAULT_ALGORITHMS, MultiHash, hash_to_bytehex, hash_to_hex
from .swhids import CoreSWHID
from .swhids import ExtendedObjectType as SwhidExtendedObjectType
from .swhids import ExtendedSWHID
@@ -266,7 +267,7 @@
attribute is set to an empty value.
"""
if self.raw_manifest is None:
- return super().compute_hash()
+ return super().compute_hash() # calls self._compute_hash_from_attributes()
else:
return _compute_hash_from_manifest(self.raw_manifest)
@@ -943,12 +944,15 @@
)
+_DIR_ENTRY_TYPES = ["file", "dir", "rev"]
+
+
@attr.s(frozen=True, slots=True)
class DirectoryEntry(BaseModel):
object_type: Final = "directory_entry"
name = attr.ib(type=bytes, validator=type_validator())
- type = attr.ib(type=str, validator=attr.validators.in_(["file", "dir", "rev"]))
+ type = attr.ib(type=str, validator=attr.validators.in_(_DIR_ENTRY_TYPES))
target = attr.ib(type=Sha1Git, validator=type_validator(), repr=hash_repr)
perms = attr.ib(type=int, validator=type_validator(), converter=int, repr=oct)
"""Usually one of the values of `swh.model.from_disk.DentryPerms`."""
@@ -996,6 +1000,87 @@
"""Returns a SWHID representing this object."""
return CoreSWHID(object_type=SwhidObjectType.DIRECTORY, object_id=self.id)
+ @classmethod
+ def from_possibly_duplicated_entries(
+ cls,
+ *,
+ entries: Tuple[DirectoryEntry, ...],
+ id: Sha1Git = b"",
+ raw_manifest: Optional[bytes] = None,
+ ) -> Tuple[bool, "Directory"]:
+ """Constructs a ``Directory`` object from a list of entries that may contain
+ duplicated names.
+
+ This is required to represent legacy objects that were ingested in the
+ storage database before this check was added.
+
+ As it is impossible for a ``Directory`` instance to have more than one entry
+ with a given name, this function computes a ``raw_manifest`` and renames all but
+ one of the duplicated entries before constructing the ``Directory``.
+
+ Returns:
+ ``(is_corrupt, directory)`` where ``is_corrupt`` is True iff some
+ entry names were indeed duplicated
+ """
+ # First, try building a Directory object normally without any extra computation,
+ # which works the overwhelming majority of the time:
+ try:
+ return (False, Directory(entries=entries, id=id, raw_manifest=raw_manifest))
+ except ValueError:
+ pass
+
+ # If it fails:
+ # 1. compute a raw_manifest if there isn't already one:
+ if raw_manifest is None:
+ # invalid_directory behaves like a Directory object, but without the
+ # duplicated-entry check, which allows computing its raw_manifest
+ invalid_directory = type("", (), {})()
+ invalid_directory.entries = entries
+ raw_manifest = git_objects.directory_git_object(invalid_directory)
+
+ # 2. look for duplicated entries:
+ entries_by_name: Dict[
+ bytes, Dict[str, List[DirectoryEntry]]
+ ] = collections.defaultdict(lambda: collections.defaultdict(list))
+ for entry in entries:
+ entries_by_name[entry.name][entry.type].append(entry)
+
+ # 3. strip duplicates
+ deduplicated_entries = []
+ for entry_lists in entries_by_name.values():
+ # We could pick one entry at random to keep its original name, but we try
+ # to minimize the impact by preserving entries of type "rev" first
+ # (because renaming them would likely break git submodules entirely
+ # when this directory is written to disk),
+ # then entries of type "dir" (because renaming them affects the path
+ # of every file in the dir, instead of just one "cnt").
+ dir_entry_types = ("rev", "dir", "file")
+ assert set(dir_entry_types) == set(_DIR_ENTRY_TYPES)
+ picked_winner = False # when True, all future entries must be renamed
+ for type_ in dir_entry_types:
+ for entry in entry_lists[type_]:
+ if not picked_winner:
+ # this is the "most important" entry according to this
+ # heuristic; it gets to keep its name.
+ deduplicated_entries.append(entry)
+ picked_winner = True
+ else:
+ # the heuristic already found an entry more important than
+ # this one, so this one must be renamed.
+ # We append the beginning of its hash, which should be enough
+ # to avoid any conflict.
+ new_name = (
+ entry.name + b"_" + hash_to_bytehex(entry.target)[0:10]
+ )
+ renamed_entry = attr.evolve(entry, name=new_name)
+ deduplicated_entries.append(renamed_entry)
+
+ # Finally, return the "fixed" directory
+ dir_ = Directory(
+ entries=tuple(deduplicated_entries), id=id, raw_manifest=raw_manifest
+ )
+ return (True, dir_)
+
@attr.s(frozen=True, slots=True)
class BaseContent(BaseModel):
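
For illustration, here is a minimal usage sketch of the new classmethod (not part
of the diff), mirroring the tests below; the 20-byte targets and the 10-hex-digit
rename suffix follow directly from them:

    from swh.model.model import Directory, DirectoryEntry

    # Two entries share the name b"foo"; plain Directory(...) would reject this.
    entries = (
        DirectoryEntry(name=b"foo", type="dir", target=b"\x01" * 20, perms=1),
        DirectoryEntry(name=b"foo", type="rev", target=b"\x00" * 20, perms=0),
    )

    (is_corrupt, dir_) = Directory.from_possibly_duplicated_entries(entries=entries)

    assert is_corrupt
    # The "rev" entry keeps its name (renaming it could break git submodules);
    # the "dir" entry gets a suffix derived from its target hash.
    assert [e.name for e in dir_.entries] == [b"foo", b"foo_0101010101"]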
diff --git a/swh/model/tests/test_model.py b/swh/model/tests/test_model.py
--- a/swh/model/tests/test_model.py
+++ b/swh/model/tests/test_model.py
@@ -943,6 +943,143 @@
Directory(entries=entries)
+@given(strategies.directories())
+def test_directory_from_possibly_duplicated_entries__no_duplicates(directory):
+ """
+ Directory.from_possibly_duplicated_entries should return the directory
+ unchanged if it has no duplicated entry name.
+ """
+ assert (False, directory) == Directory.from_possibly_duplicated_entries(
+ id=directory.id, entries=directory.entries, raw_manifest=directory.raw_manifest
+ )
+ assert (False, directory) == Directory.from_possibly_duplicated_entries(
+ entries=directory.entries, raw_manifest=directory.raw_manifest
+ )
+
+
+@pytest.mark.parametrize("rev_first", [True, False])
+def test_directory_from_possibly_duplicated_entries__rev_and_dir(rev_first):
+ entries = (
+ DirectoryEntry(name=b"foo", type="dir", target=b"\x01" * 20, perms=1),
+ DirectoryEntry(name=b"foo", type="rev", target=b"\x00" * 20, perms=0),
+ )
+ if rev_first:
+ entries = tuple(reversed(entries))
+ (is_corrupt, dir_) = Directory.from_possibly_duplicated_entries(entries=entries)
+ assert is_corrupt
+ assert dir_.entries == (
+ DirectoryEntry(name=b"foo", type="rev", target=b"\x00" * 20, perms=0),
+ DirectoryEntry(
+ name=b"foo_0101010101", type="dir", target=b"\x01" * 20, perms=1
+ ),
+ )
+
+ # order is independent of 'rev_first' because it is always sorted in git order
+ assert dir_.raw_manifest == (
+ # fmt: off
+ b"tree 52\x00"
+ + b"0 foo\x00" + b"\x00" * 20
+ + b"1 foo\x00" + b"\x01" * 20
+ # fmt: on
+ )
+
+
+@pytest.mark.parametrize("file_first", [True, False])
+def test_directory_from_possibly_duplicated_entries__file_and_dir(file_first):
+ entries = (
+ DirectoryEntry(name=b"foo", type="dir", target=b"\x01" * 20, perms=1),
+ DirectoryEntry(name=b"foo", type="file", target=b"\x00" * 20, perms=0),
+ )
+ if file_first:
+ entries = tuple(reversed(entries))
+ (is_corrupt, dir_) = Directory.from_possibly_duplicated_entries(entries=entries)
+ assert is_corrupt
+ assert dir_.entries == (
+ DirectoryEntry(name=b"foo", type="dir", target=b"\x01" * 20, perms=1),
+ DirectoryEntry(
+ name=b"foo_0000000000", type="file", target=b"\x00" * 20, perms=0
+ ),
+ )
+
+ # order is independent of 'file_first' because it is always sorted in git order
+ assert dir_.raw_manifest == (
+ # fmt: off
+ b"tree 52\x00"
+ + b"0 foo\x00" + b"\x00" * 20
+ + b"1 foo\x00" + b"\x01" * 20
+ # fmt: on
+ )
+
+
+def test_directory_from_possibly_duplicated_entries__two_files1():
+ entries = (
+ DirectoryEntry(name=b"foo", type="file", target=b"\x01" * 20, perms=1),
+ DirectoryEntry(name=b"foo", type="file", target=b"\x00" * 20, perms=0),
+ )
+ (is_corrupt, dir_) = Directory.from_possibly_duplicated_entries(entries=entries)
+ assert is_corrupt
+
+ assert dir_.entries == (
+ DirectoryEntry(name=b"foo", type="file", target=b"\x01" * 20, perms=1),
+ DirectoryEntry(
+ name=b"foo_0000000000", type="file", target=b"\x00" * 20, perms=0
+ ),
+ )
+ assert dir_.raw_manifest == (
+ # fmt: off
+ b"tree 52\x00"
+ + b"1 foo\x00" + b"\x01" * 20
+ + b"0 foo\x00" + b"\x00" * 20
+ # fmt: on
+ )
+
+
+def test_directory_from_possibly_duplicated_entries__two_files2():
+ """
+ Same as above, but entries are in a different order (and order matters
+ to break the tie)
+ """
+ entries = (
+ DirectoryEntry(name=b"foo", type="file", target=b"\x00" * 20, perms=0),
+ DirectoryEntry(name=b"foo", type="file", target=b"\x01" * 20, perms=1),
+ )
+ (is_corrupt, dir_) = Directory.from_possibly_duplicated_entries(entries=entries)
+ assert is_corrupt
+
+ assert dir_.entries == (
+ DirectoryEntry(name=b"foo", type="file", target=b"\x00" * 20, perms=0),
+ DirectoryEntry(
+ name=b"foo_0101010101", type="file", target=b"\x01" * 20, perms=1
+ ),
+ )
+ assert dir_.raw_manifest == (
+ # fmt: off
+ b"tree 52\x00"
+ + b"0 foo\x00" + b"\x00" * 20
+ + b"1 foo\x00" + b"\x01" * 20
+ # fmt: on
+ )
+
+
+def test_directory_from_possibly_duplicated_entries__preserve_manifest():
+ entries = (
+ DirectoryEntry(name=b"foo", type="dir", target=b"\x01" * 20, perms=1),
+ DirectoryEntry(name=b"foo", type="rev", target=b"\x00" * 20, perms=0),
+ )
+ (is_corrupt, dir_) = Directory.from_possibly_duplicated_entries(
+ entries=entries, raw_manifest=b"blah"
+ )
+ assert is_corrupt
+ assert dir_.entries == (
+ DirectoryEntry(name=b"foo", type="rev", target=b"\x00" * 20, perms=0),
+ DirectoryEntry(
+ name=b"foo_0101010101", type="dir", target=b"\x01" * 20, perms=1
+ ),
+ )
+
+ assert dir_.raw_manifest == b"blah"
+
+
# Release

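A side note on the asserted manifests above (a sketch, assuming the standard git
tree encoding that ``git_objects.directory_git_object`` produces): each entry
serializes as "<perms in octal> <name>\x00<raw 20-byte target>" and the whole
payload is prefixed with a "tree <payload length>\x00" header. The payload keeps
the original duplicated names, which is why "foo" appears twice:

    entry_rev = b"0 foo\x00" + b"\x00" * 20  # perms 0, original name, raw target
    entry_dir = b"1 foo\x00" + b"\x01" * 20  # the duplicate also keeps its name here
    payload = entry_rev + entry_dir
    assert len(payload) == 52                # hence the b"tree 52\x00" header
    manifest = b"tree %d\x00" % len(payload) + payload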