Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F9346292
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
15 KB
Subscribers
None
View Options
diff --git a/swh/storage/algos/diff.py b/swh/storage/algos/diff.py
index d4204ace..1d75ffe2 100644
--- a/swh/storage/algos/diff.py
+++ b/swh/storage/algos/diff.py
@@ -1,402 +1,402 @@
# Copyright (C) 2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
# Utility module to efficiently compute the list of changed files
# between two directory trees.
# The implementation is inspired from the work of Alberto Cortés
# for the go-git project. For more details, you can refer to:
# - this blog post: https://blog.sourced.tech/post/difftree/
# - the reference implementation in go:
# https://github.com/src-d/go-git/tree/master/utils/merkletrie
import collections
from swh.model.identifiers import directory_identifier
from .dir_iterators import (
DirectoryIterator, DoubleDirectoryIterator, Remaining
)
# get the hash identifier for an empty directory
_empty_dir_hash = directory_identifier({'entries': []})
def _get_rev(storage, rev_id):
"""
Return revision data from swh storage.
"""
return list(storage.revision_get([rev_id]))[0]
class _RevisionChangesList(object):
"""
Helper class to track the changes between two
revision directories.
"""
def __init__(self, storage, track_renaming):
"""
Args:
storage: instance of swh storage
track_renaming (bool): whether to track or not files renaming
"""
self.storage = storage
self.track_renaming = track_renaming
self.result = []
# dicts used to track file renaming based on hash value
# we use a list instead of a single entry to handle the corner
# case when a repository contains multiple instance of
# the same file in different directories and a commit
# renames all of them
self.inserted_hash_idx = collections.defaultdict(list)
self.deleted_hash_idx = collections.defaultdict(list)
def add_insert(self, it_to):
"""
Add a file insertion in the to directory.
Args:
it_to (swh.storage.algos.dir_iterators.DirectoryIterator):
iterator on the to directory
"""
to_hash = it_to.current_hash()
# if the current file hash has been previously marked as deleted,
# the file has been renamed
if self.track_renaming and self.deleted_hash_idx[to_hash]:
# pop the delete change index in the same order it was inserted
change = self.result[self.deleted_hash_idx[to_hash].pop(0)]
# change the delete change as a rename one
change['type'] = 'rename'
change['to'] = it_to.current()
change['to_path'] = it_to.current_path()
else:
# add the insert change in the list
self.result.append({'type': 'insert',
'from': None,
'from_path': None,
'to': it_to.current(),
'to_path': it_to.current_path()})
# if rename tracking is activated, add the change index in
# the inserted_hash_idx dict
if self.track_renaming:
self.inserted_hash_idx[to_hash].append(len(self.result) - 1)
def add_delete(self, it_from):
"""
Add a file deletion in the from directory.
Args:
it_from (swh.storage.algos.dir_iterators.DirectoryIterator):
iterator on the from directory
"""
from_hash = it_from.current_hash()
# if the current file has been previously marked as inserted,
# the file has been renamed
if self.track_renaming and self.inserted_hash_idx[from_hash]:
# pop the insert change index in the same order it was inserted
change = self.result[self.inserted_hash_idx[from_hash].pop(0)]
# change the insert change as a rename one
change['type'] = 'rename'
change['from'] = it_from.current()
change['from_path'] = it_from.current_path()
else:
# add the delete change in the list
self.result.append({'type': 'delete',
'from': it_from.current(),
'from_path': it_from.current_path(),
'to': None,
'to_path': None})
# if rename tracking is activated, add the change index in
# the deleted_hash_idx dict
if self.track_renaming:
self.deleted_hash_idx[from_hash].append(len(self.result) - 1)
def add_modify(self, it_from, it_to):
"""
Add a file modification in the to directory.
Args:
it_from (swh.storage.algos.dir_iterators.DirectoryIterator):
iterator on the from directory
it_to (swh.storage.algos.dir_iterators.DirectoryIterator):
iterator on the to directory
"""
self.result.append({'type': 'modify',
'from': it_from.current(),
'from_path': it_from.current_path(),
'to': it_to.current(),
'to_path': it_to.current_path()})
def add_recursive(self, it, insert):
"""
Recursively add changes from a directory.
Args:
it (swh.storage.algos.dir_iterators.DirectoryIterator):
iterator on a directory
insert (bool): the type of changes to add (insertion
or deletion)
"""
# current iterated element is a regular file,
# simply add adequate change in the list
if not it.current_is_dir():
if insert:
self.add_insert(it)
else:
self.add_delete(it)
return
# current iterated element is a directory,
dir_id = it.current_hash()
# handle empty dir insertion/deletion as the swh model allow
# to have such object compared to git
if dir_id == _empty_dir_hash:
if insert:
self.add_insert(it)
else:
self.add_delete(it)
# iterate on files reachable from it and add
# adequate changes in the list
else:
sub_it = DirectoryIterator(self.storage, dir_id,
it.current_path() + b'/')
sub_it_current = sub_it.step()
while sub_it_current:
if not sub_it.current_is_dir():
if insert:
self.add_insert(sub_it)
else:
self.add_delete(sub_it)
sub_it_current = sub_it.step()
def add_recursive_insert(self, it_to):
"""
Recursively add files insertion from a to directory.
Args:
it_to (swh.storage.algos.dir_iterators.DirectoryIterator):
iterator on a to directory
"""
self.add_recursive(it_to, True)
def add_recursive_delete(self, it_from):
"""
Recursively add files deletion from a from directory.
Args:
it_from (swh.storage.algos.dir_iterators.DirectoryIterator):
iterator on a from directory
"""
self.add_recursive(it_from, False)
def _diff_elts_same_name(changes, it):
""""
Compare two directory entries with the same name and add adequate
changes if any.
Args:
changes (_RevisionChangesList): the list of changes between
two revisions
it (swh.storage.algos.dir_iterators.DoubleDirectoryIterator):
the iterator traversing two revision directories at the same time
"""
# compare the two current directory elements of the iterator
status = it.compare()
# elements have same hash and same permissions:
# no changes to add and call next on the two iterators
if status['same_hash'] and status['same_perms']:
it.next_both()
# elements are regular files and have been modified:
# insert the modification change in the list and
# call next on the two iterators
elif status['both_are_files']:
changes.add_modify(it.it_from, it.it_to)
it.next_both()
# one element is a regular file, the other a directory:
# recursively add delete/insert changes and call next
# on the two iterators
elif status['file_and_dir']:
changes.add_recursive_delete(it.it_from)
changes.add_recursive_insert(it.it_to)
it.next_both()
# both elements are directories:
elif status['both_are_dirs']:
# from directory is empty:
# recursively add insert changes in the to directory
# and call next on the two iterators
if status['from_is_empty_dir']:
changes.add_recursive_insert(it.it_to)
it.next_both()
# to directory is empty:
# recursively add delete changes in the from directory
# and call next on the two iterators
elif status['to_is_empty_dir']:
changes.add_recursive_delete(it.it_from)
it.next_both()
# both directories are not empty:
# call step on the two iterators to descend further in
# the directory trees.
elif not status['from_is_empty_dir'] and not status['to_is_empty_dir']:
it.step_both()
def _compare_paths(path1, path2):
"""
Compare paths in lexicographic depth-first order.
For instance, it returns:
- "a" < "b"
- "b/c/d" < "b"
- "c/foo.txt" < "c.txt"
"""
path1_parts = path1.split(b'/')
path2_parts = path2.split(b'/')
i = 0
while True:
if len(path1_parts) == len(path2_parts) and i == len(path1_parts):
return 0
elif len(path2_parts) == i:
return 1
elif len(path1_parts) == i:
return -1
else:
if path2_parts[i] > path1_parts[i]:
return -1
elif path2_parts[i] < path1_parts[i]:
return 1
i = i + 1
def _diff_elts(changes, it):
"""
Compare two directory entries.
Args:
changes (_RevisionChangesList): the list of changes between
two revisions
it (swh.storage.algos.dir_iterators.DoubleDirectoryIterator):
the iterator traversing two revision directories at the same time
"""
# compare current to and from path in depth-first lexicographic order
c = _compare_paths(it.it_from.current_path(), it.it_to.current_path())
# current from path is lower than the current to path:
# the from path has been deleted
if c < 0:
changes.add_recursive_delete(it.it_from)
it.next_from()
- # current from path is greather than the current to path:
+ # current from path is greater than the current to path:
# the to path has been inserted
elif c > 0:
changes.add_recursive_insert(it.it_to)
it.next_to()
# paths are the same and need more processing
else:
_diff_elts_same_name(changes, it)
def diff_directories(storage, from_dir, to_dir, track_renaming=False):
"""
Compute the differential between two directories, i.e. the list of
file changes (insertion / deletion / modification / renaming)
between them.
Args:
storage (swh.storage.storage.Storage): instance of a swh
storage (either local or remote, for optimal performance
the use of a local storage is recommended)
from_dir (bytes): the swh identifier of the directory to compare from
to_dir (bytes): the swh identifier of the directory to compare to
track_renaming (bool): whether or not to track files renaming
Returns:
list: A list of dict representing the changes between the two
revisions. Each dict contains the following entries:
- *type*: a string describing the type of change
('insert' / 'delete' / 'modify' / 'rename')
- *from*: a dict containing the directory entry metadata in the
from revision (None in case of an insertion)
- *from_path*: bytes string corresponding to the absolute path
of the from revision entry (None in case of an insertion)
- *to*: a dict containing the directory entry metadata in the
to revision (None in case of a deletion)
- *to_path*: bytes string corresponding to the absolute path
of the to revision entry (None in case of a deletion)
The returned list is sorted in lexicographic depth-first order
according to the value of the *to_path* field.
"""
changes = _RevisionChangesList(storage, track_renaming)
it = DoubleDirectoryIterator(storage, from_dir, to_dir)
while True:
r = it.remaining()
if r == Remaining.NoMoreFiles:
break
elif r == Remaining.OnlyFromFilesRemain:
changes.add_recursive_delete(it.it_from)
it.next_from()
elif r == Remaining.OnlyToFilesRemain:
changes.add_recursive_insert(it.it_to)
it.next_to()
else:
_diff_elts(changes, it)
return changes.result
def diff_revisions(storage, from_rev, to_rev, track_renaming=False):
"""
Compute the differential between two revisions,
i.e. the list of file changes between the two associated directories.
Args:
storage (swh.storage.storage.Storage): instance of a swh
storage (either local or remote, for optimal performance
the use of a local storage is recommended)
from_rev (bytes): the identifier of the revision to compare from
to_rev (bytes): the identifier of the revision to compare to
track_renaming (bool): whether or not to track files renaming
Returns:
list: A list of dict describing the introduced file changes
(see :func:`swh.storage.algos.diff.diff_directories`).
"""
from_dir = None
if from_rev:
from_dir = _get_rev(storage, from_rev)['directory']
to_dir = _get_rev(storage, to_rev)['directory']
return diff_directories(storage, from_dir, to_dir, track_renaming)
def diff_revision(storage, revision, track_renaming=False):
"""
Computes the differential between a revision and its first parent.
If the revision has no parents, the directory to compare from
is considered as empty.
In other words, it computes the file changes introduced in a
specific revision.
Args:
storage (swh.storage.storage.Storage): instance of a swh
storage (either local or remote, for optimal performance
the use of a local storage is recommended)
revision (bytes): the identifier of the revision from which to
compute the introduced changes.
track_renaming (bool): whether or not to track files renaming
Returns:
list: A list of dict describing the introduced file changes
(see :func:`swh.storage.algos.diff.diff_directories`).
"""
rev_data = _get_rev(storage, revision)
parent = None
if rev_data['parents']:
parent = rev_data['parents'][0]
return diff_revisions(storage, parent, revision, track_renaming)
File Metadata
Details
Attached
Mime Type
text/x-diff
Expires
Fri, Jul 4, 3:52 PM (2 w, 8 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3253798
Attached To
rDSTO Storage manager
Event Timeline
Log In to Comment