diff --git a/swh/loader/mercurial/bundle_loader.py b/swh/loader/mercurial/bundle_loader.py
index d627867..f5c9ba2 100644
--- a/swh/loader/mercurial/bundle_loader.py
+++ b/swh/loader/mercurial/bundle_loader.py
@@ -1,353 +1,353 @@
# Copyright (C) 2017 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
# https://www.mercurial-scm.org/wiki/BundleFormat says:
# "The new bundle format design is described on the BundleFormat2 page."
#
# https://www.mercurial-scm.org/wiki/BundleFormat2#Format_of_the_Bundle2_Container says: # noqa
# "The latest description of the binary format can be found as comment in the
# Mercurial source code."
#
# https://www.mercurial-scm.org/repo/hg/file/default/mercurial/help/internals/bundles.txt says: # noqa
# "The 'HG20' format is not yet documented here. See the inline comments in
# 'mercurial/exchange.py' for now."
#
# Avi says: --------------------
# All of the official statements shown above are wrong.
# The bundle20 format seems to be accurately documented nowhere.
# https://www.mercurial-scm.org/wiki/BundleFormatHG19 is predictably invalid
# and absent in a number of details besides.
# https://www.mercurial-scm.org/wiki/BundleFormat2#New_header is wrong and
# bizarre, and there isn't any other information on the page.
# We get something reasonably close in
# https://www.mercurial-scm.org/repo/hg/file/tip/mercurial/help/internals/changegroups.txt # noqa
# which describes much of the internal structure, but it doesn't describe any
# of the file-level structure, including the file header and that the entire
# bundle is broken into overlayed 4KB chunks starting from byte 9, nor does it
# describe what any of the component elements are used for.
# Also it says "The [delta data] format is described more fully in
# 'hg help internals.bdiff'", which is also wrong. As far as I can tell,
# that file has never existed.
# ------------------------------
#
import struct
import types
from binascii import hexlify
from functools import update_wrapper
from io import BytesIO
-from datetime import datetime, timedelta, timezone
+from datetime import datetime


"""
mkdir WORKSPACE
cd WORKSPACE
hg init
hg pull <SOURCE_URL>
hg bundle -f -a bundle_file -t none-v2
"""


def makeFIFO(io_cls):
    """Factory method for convenient io stream FIFOs.

    Pass in either StringIO or BytesIO, and get back a modified instance of
    that type suitable for automatic use as a FIFO buffer without needing to
    manually position the cursor.
    """
    # don't re-wrap a known entity
    global_cls = globals().get(io_cls.__name__+'_FIFO')
    if global_cls:
        return global_cls()

    def len(self):
        return getattr(self, '_write_pos', 0) - getattr(self, '_read_pos', 0)

    def peek(self, howmany):
        num_left = self.len()
        ret = self.read(howmany)
        self.seek(-min(num_left, howmany), 1)
        setattr(self, '_read_pos', self.tell())
        return ret

    def _wrap_func(func):
        if func.__name__.startswith('read'):
            pos_marker = '_read_pos'
        elif func.__name__.startswith('write'):
            pos_marker = '_write_pos'
        else:
            return None

        def seek_first(self, *args, **kwargs):
            self.seek(getattr(self, pos_marker, 0))
            try:
                return func(self, *args, **kwargs)
            finally:
                setattr(self, pos_marker, self.tell())
        update_wrapper(seek_first, func)
        return seek_first

    # brand new subclass of io_cls
    FIFO = types.new_class(io_cls.__name__+'_FIFO', (io_cls,))

    for attribute in dir(FIFO):
        # wrap some methods
        if callable(getattr(FIFO, attribute)):
            new_func = _wrap_func(getattr(FIFO, attribute))
            if new_func:
                setattr(FIFO, attribute, new_func)

    # add a few new methods
    setattr(FIFO, 'len', len)
    setattr(FIFO, 'peek', peek)

    # put in global namespace
    globals()[FIFO.__name__] = FIFO
    return FIFO()


class ChunkedFileReader(object):
    """A BytesIO FIFO (see the makeFIFO factory function) that gives seamless
    read access to files such as the Mercurial bundle2 HG20 format which are
    partitioned into chunks of [4Bytes:<length>, <length-4>Bytes:<data>].

    init args:
        file: either a filename string or pre-opened binary-read file handle.
    """
    def __init__(self, file):
        if isinstance(file, str):
            self._file = open(file, "rb")
        else:
            self._file = file
        self._fifo = makeFIFO(BytesIO)

    def _buffer_chunk(self):
        """Load the next chunk of data into the buffer.
        """
        chunk_size = struct.unpack('>I', self._file.read(4))[0]
        return self._fifo.write(self._file.read(chunk_size))

    def read(self, bytes_to_read):
        """Return N bytes from the file as a single block.
        """
        return b''.join(self.read_iterator(bytes_to_read))

    def read_iterator(self, bytes_to_read):
        """Return a generator that eventually yields N bytes from the file
        one file chunk at a time.
        """
        has_data = True
        while bytes_to_read > 0 and has_data:
            if bytes_to_read > self._fifo.len():
                has_data = self._buffer_chunk()
            bytes_read = min(bytes_to_read, self._fifo.len())
            yield self._fifo.read(bytes_read)
            bytes_to_read -= bytes_read

    def __getattr__(self, name):
        """Forward all other method calls to the FIFO class.
        """
        return getattr(self._fifo, name)


def DEBUG(*args, **kwargs):
    print(*args, **kwargs)


def unpack(fmt_str, source):
    ret = struct.unpack(fmt_str, source.read(struct.calcsize(fmt_str)))
    if len(ret) == 1:
        return ret[0]
    return ret


class Bundle20Reader(object):
    """Purpose-built bundle20 file parser for SWH loading.
    """
    def __init__(self, bundlefile):
        """
        args:
            bundlefile: (string) name of the binary repository bundle file
        """
        bfile = open(bundlefile, 'rb')
        btype = bfile.read(4)  # 'HG20'
        if btype != b'HG20':
            raise Exception(bundlefile,
                            b'Not an HG20 bundle. First 4 bytes:'+btype)
        bfile.read(4)  # '\x00\x00\x00\x00'
        self.chunkreader = ChunkedFileReader(bfile)
        self.commits = []
        self.manifests = []
        self.files = {}

    def read_deltaheader(self):
        """Yield the complete header section of a delta chunk.
        """
        chunkreader = self.chunkreader
        node = hexlify(chunkreader.read(20))
        p1 = hexlify(chunkreader.read(20))
        p2 = hexlify(chunkreader.read(20))
        basenode = hexlify(chunkreader.read(20))
        linknode = hexlify(chunkreader.read(20))
        return {'node': node, 'p1': p1, 'p2': p2,
                'basenode': basenode, 'linknode': linknode}

    def read_deltadata(self, size):
        """Yield the complete data section of a delta chunk.
        """
        read_bytes = 104
        while read_bytes < size:
            start_offset = unpack(">I", self.chunkreader)
            end_offset = unpack(">I", self.chunkreader)
            sub_size = unpack(">I", self.chunkreader)
            print("DATA SIZE:", sub_size)
            if sub_size > 0:
                data_it = self.chunkreader.read_iterator(sub_size)
            else:
                data_it = None
            read_bytes += sub_size+12
            yield (start_offset, end_offset, data_it)

    def commit_handler(self, header, data_block_it):
        """Handler method for changeset delta components.
        """
        data_block = next(data_block_it)[2]
        data = b''.join(data_block)
        firstpart, message = data.split(b'\n\n', 1)
        firstpart = firstpart.split(b'\n')
        commit = header
        commit['message'] = message
        commit['manifest'] = firstpart[0]
        commit['user'] = firstpart[1]
        tstamp, tz, *extra = firstpart[2].split(b' ')
-        tz = timezone(timedelta(hours=int(tz)/3600))
-        commit['time'] = datetime.fromtimestamp(float(tstamp), tz=tz)
+        commit['time'] = datetime.fromtimestamp(float(tstamp))
+        commit['time_offset_seconds'] = int(tz)
        if extra:
            commit['extra'] = extra
        commit['changed_files'] = firstpart[3:]
        self.commits.append((header['node'], commit))

    def manifest_handler(self, header, data_block_it):
        """Handler method for manifest delta components.
        """
        commit = header
        commit['manifest'] = []
        for data_block in data_block_it:
            data_it = data_block[2]
            if data_it is not None:
                data = b''.join(data_it)[:-1]
                commit['manifest'] += [tuple(file.split(b'\x00'))
                                       for file in data.split(b'\n')]
        self.manifests.append((header['node'], commit))

    def filedelta_handler(self, header, data_block_it):
        """Handler method for filelog delta components.
        """
        commit = header
        for data_block in data_block_it:
            data_it = data_block[2]
            if data_it is not None:
                data = b''.join(data_it)
                if data.startswith(b'\x01\n'):  # has a meta-message
                    empty, metainfo, data = data.split(b'\x01\n', 2)
                    if metainfo.startswith(b'copy:'):  # direct file copy
                        copyinfo = metainfo.split(b'\n')
                        commit['copied_file'] = copyinfo[0][6:]
                        commit['copied_filerev'] = copyinfo[1][9:]
                    elif metainfo.startswith(b'censored:'):
                        # censored revision deltas must be full-replacements
                        commit['censored'] = metainfo
                    else:
                        commit['meta'] = metainfo
                commit.setdefault('new_data', []).append(
                    (data_block[0], data_block[1], data)
                )
        self.files.setdefault(
            self.cur_file, []
        ).append((header['node'], commit))

    def loop_deltagroups(self, section_handler):
        """Bundle sections are composed of one or more groups of deltas.
        Iterate over them and hand each one to the current section-specific
        handler method.
        """
        size = unpack(">I", self.chunkreader)
        while size > 0:
            print("SIZE ", size)
            section_handler(
                self.read_deltaheader(),
                (size > 104) and self.read_deltadata(size) or []
            )
            size = unpack(">I", self.chunkreader)

    def process_changesets(self):
        """Parsing stage for the changeset section, containing metadata about
        each commit.
        """
        DEBUG("\nREADING COMMITS\n")
        self.loop_deltagroups(self.commit_handler)

    def process_manifest(self):
        """Parsing stage for the manifest section, containing manifest deltas
        for each changeset.
        """
        DEBUG("\nREADING MANIFEST\n")
        self.loop_deltagroups(self.manifest_handler)

    def process_filelog(self):
        """Parsing stage for the filelog section, containing data deltas for
        each change to each file.
        """
        DEBUG("\nREADING DELTAS\n")
        name_size = unpack(">I", self.chunkreader)
        while name_size > 0:
            name = b''.join(self.chunkreader.read_iterator(name_size-4))
            DEBUG("\n\nFILE", name, "\n")
            self.cur_file = name
            self.loop_deltagroups(self.filedelta_handler)
            name_size = unpack(">I", self.chunkreader)

    def process_bundle_header(self):
        """Parsing stage for the file header which describes format and
        parameters.
        """
        chunkreader = self.chunkreader
        DEBUG("\nREADING BUNDLE HEADER\n")
        chg_len = unpack('>B', chunkreader)  # len('CHANGEGROUP') == 11
        chunkreader.read(chg_len)  # 'CHANGEGROUP'
        unpack('>I', chunkreader)  # probably \x00\x00\x00\x00
        n_mandatory, n_advisory = unpack('>BB', chunkreader)  # parameters
        mandatory_params = [
            (key_len, val_len)
            for key_len, val_len
            in [unpack('>BB', chunkreader) for i in range(n_mandatory)]
        ]
        advisory_params = [
            (key_len, val_len)
            for key_len, val_len
            in [unpack('>BB', chunkreader) for i in range(n_advisory)]
        ]
        params = {}
        for key_len, val_len in mandatory_params+advisory_params:
            key = unpack('>%ds' % key_len, chunkreader)
            val = int(unpack('>%ds' % val_len, chunkreader))
            params[key] = val
        DEBUG(params)

    def read_bundle(self):
        """Initiate loading of the bundle.
        """
        self.process_bundle_header()
        self.process_changesets()
        self.process_manifest()
        self.process_filelog()
        import code
        code.interact(local=dict(globals(), **locals()))


if __name__ == "__main__":
    import sys
    if len(sys.argv) > 1:
        br = Bundle20Reader(sys.argv[1])
        br.read_bundle()
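
Note: the snippet below is a minimal usage sketch of the parser in this diff, assembled from the module docstring and the __main__ block above; it is not part of the patch. The bundle path 'repo.bundle' is a placeholder, and the import path simply follows the file path in the diff header.

# Create a bundle to parse, as described in the module docstring:
#   mkdir WORKSPACE && cd WORKSPACE
#   hg init
#   hg pull <SOURCE_URL>
#   hg bundle -f -a repo.bundle -t none-v2

from swh.loader.mercurial.bundle_loader import Bundle20Reader

reader = Bundle20Reader('repo.bundle')  # raises if the file is not an HG20 bundle

# read_bundle() runs the same stages but ends by dropping into an interactive
# console (code.interact), so the individual stages are called directly here.
reader.process_bundle_header()
reader.process_changesets()
reader.process_manifest()
reader.process_filelog()

# After parsing, the three bundle sections are held on the reader:
#   reader.commits    -- list of (node, commit dict) pairs
#   reader.manifests  -- list of (node, manifest dict) pairs
#   reader.files      -- dict mapping file name to its list of file deltas
for node, commit in reader.commits:
    print(node, commit['user'], commit['message'])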