diff --git a/.gitignore b/.gitignore
index f5fc2ae..18373e8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,8 +1,10 @@
*.pyc
*.sw?
*~
.coverage
.eggs/
__pycache__
*.egg-info/
-version.txt
\ No newline at end of file
+version.txt
+build/
+dist/
diff --git a/MANIFEST.in b/MANIFEST.in
index e7c46fc..f0674d6 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,4 +1,5 @@
include Makefile
include requirements.txt
include requirements-swh.txt
include version.txt
+recursive-include swh/loader/mercurial/tests/resources *
diff --git a/PKG-INFO b/PKG-INFO
index d91d5fd..6b4300d 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,10 +1,130 @@
-Metadata-Version: 1.0
+Metadata-Version: 2.1
Name: swh.loader.mercurial
-Version: 0.0.12
+Version: 0.0.13
Summary: Software Heritage Mercurial Loader
Home-page: https://forge.softwareheritage.org/diffusion/DLDHG/
Author: Software Heritage developers
Author-email: swh-devel@inria.fr
License: UNKNOWN
-Description: UNKNOWN
+Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest
+Project-URL: Funding, https://www.softwareheritage.org/donate
+Project-URL: Source, https://forge.softwareheritage.org/source/swh-loader-mercurial
+Description: swh-loader-mercurial
+ =========================
+
+ # Configuration file
+
+ In the usual location for a loader, *{/etc/softwareheritage/ | ~/.swh/ |
+ ~/.config/swh/}loader/hg.yml*:
+
+ ``` YAML
+ storage:
+ cls: remote
+ args:
+ url: http://localhost:5002/
+ ```
+
+ # Basic use
+
+ The main entry point to import a Mercurial repository is the `main` function
+ defined in the `swh.loader.mercurial.cli` module:
+
+ ``` bash
+ python3 -m swh.loader.mercurial.cli
+ ```
+
+
+ If the Python package has been installed via `pip`, you should be able
+ to type:
+
+ ``` bash
+ user@host:~$ swh-loader-hg --help
+
+ Usage: swh-loader-hg [OPTIONS] ORIGIN_URL
+
+ Options:
+ -d, --hg-directory TEXT Path to the hg (local) directory to load
+ from. If unset, the hg repo will be cloned
+ from the given (origin) url
+ -a, --hg-archive TEXT Path to the hg (local) archive file to load
+ from.
+ -D, --visit-date TEXT Visit date (defaults to now)
+ -l, --log-level [NOTSET|DEBUG|INFO|WARNING|ERROR|CRITICAL]
+ Log level
+ --help Show this message and exit.
+
+ ```
+
+ For example:
+
+ ``` bash
+ user@host:~$ swh-loader-hg https://www.mercurial-scm.org/repo/hello
+ [...]
+ ```
+
+
+ # From Python
+ From the Python 3 toplevel:
+
+ ## Remote
+
+ ``` Python
+ project = 'hello'
+ # remote repository
+ origin_url = 'https://www.mercurial-scm.org/repo/%s' % project
+ # local clone
+ directory = '/home/storage/hg/repo/%s' % project
+
+ import logging
+ logging.basicConfig(level=logging.DEBUG)
+
+ from swh.loader.mercurial.tasks import LoadMercurial
+
+ t = LoadMercurial()
+ t.run(origin_url=origin_url, directory=directory, visit_date='2016-05-03T15:16:32+00:00')
+ ```
+
+ ## Local directory
+
+ Only origin, contents, and directories are filled so far.
+
+ Remaining objects are empty (revision, release, occurrence).
+
+ ``` Python
+ project = '756015-ipv6'
+ directory = '/home/storage/hg/repo/%s' % project
+ origin_url = 'https://%s.googlecode.com' % project
+
+ import logging
+ logging.basicConfig(level=logging.DEBUG)
+
+ from swh.loader.mercurial.tasks import LoadMercurial
+
+ t = LoadMercurial()
+ t.run(origin_url=origin_url, directory=directory, visit_date='2016-05-03T15:16:32+00:00')
+ ```
+
+ ## Local archive
+
+ ``` Python
+ project = '756015-ipv6-source-archive.zip'
+ archive_path = '/home/storage/hg/repo/%s' % project
+ origin_url = 'https://%s-archive.googlecode.com' % project
+
+ import logging
+ logging.basicConfig(level=logging.DEBUG)
+
+ from swh.loader.mercurial.tasks import LoadArchiveMercurial
+
+ t = LoadArchiveMercurial()
+ t.run(origin_url=origin_url, archive_path=archive_path, visit_date='2016-05-03T15:16:32+00:00')
+ ```
+
Platform: UNKNOWN
+Classifier: Programming Language :: Python :: 3
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
+Classifier: Operating System :: OS Independent
+Classifier: Development Status :: 4 - Beta
+Description-Content-Type: text/markdown
+Provides-Extra: testing
diff --git a/README.md b/README.md
index 887b797..d4fff57 100644
--- a/README.md
+++ b/README.md
@@ -1,83 +1,110 @@
swh-loader-mercurial
=========================
# Configuration file
In the usual location for a loader, *{/etc/softwareheritage/ | ~/.swh/ |
~/.config/swh/}loader/hg.yml*:
``` YAML
storage:
cls: remote
args:
url: http://localhost:5002/
-
-send_contents: True
-send_directories: True
-send_revisions: True
-send_releases: True
-send_occurrences: True
-content_packet_size: 1000
-content_packet_size_bytes: 1073741824
-directory_packet_size: 2500
-revision_packet_size: 1000
-release_packet_size: 1000
-occurrence_packet_size: 1000
```
# Basic use
+The main entry point to import a Mercurial repository is the `main` function
+defined in the `swh.loader.mercurial.cli` module:
+
+``` bash
+python3 -m swh.loader.mercurial.cli
+```
+
+
+If the Python package has been installed via `pip`, you should be able
+to type:
+
+``` bash
+user@host:~$ swh-loader-hg --help
+
+Usage: swh-loader-hg [OPTIONS] ORIGIN_URL
+
+Options:
+ -d, --hg-directory TEXT Path to the hg (local) directory to load
+ from. If unset, the hg repo will be cloned
+ from the given (origin) url
+ -a, --hg-archive TEXT Path to the hg (local) archive file to load
+ from.
+ -D, --visit-date TEXT Visit date (defaults to now)
+ -l, --log-level [NOTSET|DEBUG|INFO|WARNING|ERROR|CRITICAL]
+ Log level
+ --help Show this message and exit.
+
+```
+
+For example:
+
+``` bash
+user@host:~$ swh-loader-hg https://www.mercurial-scm.org/repo/hello
+[...]
+```
+
+
+# From Python
From the Python 3 toplevel:
-## Remote (failure)
+## Remote
``` Python
+project = 'hello'
# remote repository
-origin_url = 'https://www.mercurial-scm.org/repo/hello'
+origin_url = 'https://www.mercurial-scm.org/repo/%s' % project
# local clone
-directory = '/home/storage/hg/repo/hello'
+directory = '/home/storage/hg/repo/%s' % project
import logging
logging.basicConfig(level=logging.DEBUG)
-from swh.loader.mercurial.tasks import LoadMercurialTsk
+from swh.loader.mercurial.tasks import LoadMercurial
-t = LoadMercurialTsk()
+t = LoadMercurial()
t.run(origin_url=origin_url, directory=directory, visit_date='2016-05-03T15:16:32+00:00')
```
-## local directory (failure)
+## Local directory
Only origin, contents, and directories are filled so far.
Remaining objects are empty (revision, release, occurrence).
``` Python
project = '756015-ipv6'
directory = '/home/storage/hg/repo/%s' % project
origin_url = 'https://%s.googlecode.com' % project
import logging
logging.basicConfig(level=logging.DEBUG)
-from swh.loader.mercurial.tasks import SlowLoadMercurialTsk
+from swh.loader.mercurial.tasks import LoadMercurial
-t = SlowLoadMercurialTsk()
+t = LoadMercurial()
t.run(origin_url=origin_url, directory=directory, visit_date='2016-05-03T15:16:32+00:00')
```
-## local archive (failure)
+## Local archive
``` Python
project = '756015-ipv6-source-archive.zip'
archive_path = '/home/storage/hg/repo/%s' % project
origin_url = 'https://%s-archive.googlecode.com' % project
import logging
logging.basicConfig(level=logging.DEBUG)
-from swh.loader.mercurial.tasks import SlowLoadMercurialArchiveTsk
+from swh.loader.mercurial.tasks import LoadArchiveMercurial
-t = SlowLoadMercurialArchiveTsk()
+t = LoadArchiveMercurial()
t.run(origin_url=origin_url, archive_path=archive_path, visit_date='2016-05-03T15:16:32+00:00')
```
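Both the new `swh-loader-hg` console script and `python3 -m swh.loader.mercurial.cli` end up calling the loader's `load()` method. Below is a minimal sketch of that call, using only names introduced in this diff and assuming the storage configuration from *loader/hg.yml* is in place:

``` Python
import datetime

from swh.loader.mercurial.loader import HgBundle20Loader

loader = HgBundle20Loader()
# directory=None makes the loader clone origin_url into a temporary
# working directory, exactly as the CLI does when -d is not given.
loader.load(origin_url='https://www.mercurial-scm.org/repo/hello',
            visit_date=datetime.datetime.now(tz=datetime.timezone.utc),
            directory=None)
```

Passing an archive path instead (the `-a` option) selects `HgArchiveBundle20Loader`, as `cli.py` later in this diff shows.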
diff --git a/docs/index.rst b/docs/index.rst
index f23e50a..6901dfc 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -1,17 +1,19 @@
.. _swh-loader-mercurial:
-Software Heritage - Development Documentation
-=============================================
+Software Heritage - Mercurial loader
+====================================
+
+Loader for `Mercurial <https://www.mercurial-scm.org/>`_ repositories.
+
.. toctree::
:maxdepth: 2
:caption: Contents:
-
Indices and tables
==================
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`
diff --git a/requirements-swh.txt b/requirements-swh.txt
index bf7d5cf..0c28121 100644
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -1,5 +1,5 @@
swh.core >= 0.0.36
-swh.model >= 0.0.20
+swh.model >= 0.0.27
swh.storage >= 0.0.95
swh.scheduler >= 0.0.19
-swh.loader.core >= 0.0.33
+swh.loader.core >= 0.0.34
diff --git a/requirements-test.txt b/requirements-test.txt
new file mode 100644
index 0000000..f3c7e8e
--- /dev/null
+++ b/requirements-test.txt
@@ -0,0 +1 @@
+nose
diff --git a/requirements.txt b/requirements.txt
index d1a5bda..3e8c649 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,8 +1,10 @@
# Add here external Python modules dependencies, one per line. Module names
# should match https://pypi.python.org/pypi names. For the full spec or
# dependency lines, see https://pip.readthedocs.org/en/1.1/requirements.html
-vcversioner
-hglib
-retrying
+click
+patool
python-dateutil
+python-hglib
+retrying
sqlitedict
+vcversioner
diff --git a/setup.py b/setup.py
old mode 100644
new mode 100755
index db89a38..f5e4caa
--- a/setup.py
+++ b/setup.py
@@ -1,28 +1,69 @@
+#!/usr/bin/env python3
+# Copyright (C) 2015-2018 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
from setuptools import setup, find_packages
+from os import path
+from io import open
+
+here = path.abspath(path.dirname(__file__))
+
+# Get the long description from the README file
+with open(path.join(here, 'README.md'), encoding='utf-8') as f:
+ long_description = f.read()
+
+
+def parse_requirements(name=None):
+ if name:
+ reqf = 'requirements-%s.txt' % name
+ else:
+ reqf = 'requirements.txt'
-def parse_requirements():
requirements = []
- for reqf in ('requirements.txt', 'requirements-swh.txt'):
- with open(reqf) as f:
- for line in f.readlines():
- line = line.strip()
- if not line or line.startswith('#'):
- continue
- requirements.append(line)
+ if not path.exists(reqf):
+ return requirements
+
+ with open(reqf) as f:
+ for line in f.readlines():
+ line = line.strip()
+ if not line or line.startswith('#'):
+ continue
+ requirements.append(line)
return requirements
setup(
name='swh.loader.mercurial',
description='Software Heritage Mercurial Loader',
+ long_description=long_description,
+ long_description_content_type='text/markdown',
author='Software Heritage developers',
author_email='swh-devel@inria.fr',
url='https://forge.softwareheritage.org/diffusion/DLDHG/',
- packages=find_packages(), # packages's modules
- scripts=[], # scripts to package
- install_requires=parse_requirements(),
+ packages=find_packages(),
+ scripts=[],
+ install_requires=parse_requirements() + parse_requirements('swh'),
setup_requires=['vcversioner'],
+ extras_require={'testing': parse_requirements('test')},
vcversioner={},
include_package_data=True,
+ entry_points={
+ 'console_scripts': ['swh-loader-hg=swh.loader.mercurial.cli:main'],
+ },
+ classifiers=[
+ "Programming Language :: Python :: 3",
+ "Intended Audience :: Developers",
+ "License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
+ "Operating System :: OS Independent",
+ "Development Status :: 4 - Beta",
+ ],
+ project_urls={
+ 'Bug Reports': 'https://forge.softwareheritage.org/maniphest',
+ 'Funding': 'https://www.softwareheritage.org/donate',
+ 'Source': (
+ 'https://forge.softwareheritage.org/source/swh-loader-mercurial'),
+ },
)
diff --git a/swh.loader.mercurial.egg-info/PKG-INFO b/swh.loader.mercurial.egg-info/PKG-INFO
index d91d5fd..6b4300d 100644
--- a/swh.loader.mercurial.egg-info/PKG-INFO
+++ b/swh.loader.mercurial.egg-info/PKG-INFO
@@ -1,10 +1,130 @@
-Metadata-Version: 1.0
+Metadata-Version: 2.1
Name: swh.loader.mercurial
-Version: 0.0.12
+Version: 0.0.13
Summary: Software Heritage Mercurial Loader
Home-page: https://forge.softwareheritage.org/diffusion/DLDHG/
Author: Software Heritage developers
Author-email: swh-devel@inria.fr
License: UNKNOWN
-Description: UNKNOWN
+Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest
+Project-URL: Funding, https://www.softwareheritage.org/donate
+Project-URL: Source, https://forge.softwareheritage.org/source/swh-loader-mercurial
+Description: swh-loader-mercurial
+ =========================
+
+ # Configuration file
+
+ In the usual location for a loader, *{/etc/softwareheritage/ | ~/.swh/ |
+ ~/.config/swh/}loader/hg.yml*:
+
+ ``` YAML
+ storage:
+ cls: remote
+ args:
+ url: http://localhost:5002/
+ ```
+
+ # Basic use
+
+ The main entry point to import a Mercurial repository is the `main` function
+ defined in the `swh.loader.mercurial.cli` module:
+
+ ``` bash
+ python3 -m swh.loader.mercurial.cli
+ ```
+
+
+ If the Python package has been installed via `pip`, you should be able
+ to type:
+
+ ``` bash
+ user@host:~$ swh-loader-hg --help
+
+ Usage: swh-loader-hg [OPTIONS] ORIGIN_URL
+
+ Options:
+ -d, --hg-directory TEXT Path to the hg (local) directory to load
+ from. If unset, the hg repo will be cloned
+ from the given (origin) url
+ -a, --hg-archive TEXT Path to the hg (local) archive file to load
+ from.
+ -D, --visit-date TEXT Visit date (defaults to now)
+ -l, --log-level [NOTSET|DEBUG|INFO|WARNING|ERROR|CRITICAL]
+ Log level
+ --help Show this message and exit.
+
+ ```
+
+ For example:
+
+ ``` bash
+ user@host:~$ swh-loader-hg https://www.mercurial-scm.org/repo/hello
+ [...]
+ ```
+
+
+ # From Python
+ From the Python 3 toplevel:
+
+ ## Remote
+
+ ``` Python
+ project = 'hello'
+ # remote repository
+ origin_url = 'https://www.mercurial-scm.org/repo/%s' % project
+ # local clone
+ directory = '/home/storage/hg/repo/%s' % project
+
+ import logging
+ logging.basicConfig(level=logging.DEBUG)
+
+ from swh.loader.mercurial.tasks import LoadMercurial
+
+ t = LoadMercurial()
+ t.run(origin_url=origin_url, directory=directory, visit_date='2016-05-03T15:16:32+00:00')
+ ```
+
+ ## Local directory
+
+ Only origin, contents, and directories are filled so far.
+
+ Remaining objects are empty (revision, release, occurrence).
+
+ ``` Python
+ project = '756015-ipv6'
+ directory = '/home/storage/hg/repo/%s' % project
+ origin_url = 'https://%s.googlecode.com' % project
+
+ import logging
+ logging.basicConfig(level=logging.DEBUG)
+
+ from swh.loader.mercurial.tasks import LoadMercurial
+
+ t = LoadMercurial()
+ t.run(origin_url=origin_url, directory=directory, visit_date='2016-05-03T15:16:32+00:00')
+ ```
+
+ ## Local archive
+
+ ``` Python
+ project = '756015-ipv6-source-archive.zip'
+ archive_path = '/home/storage/hg/repo/%s' % project
+ origin_url = 'https://%s-archive.googlecode.com' % project
+
+ import logging
+ logging.basicConfig(level=logging.DEBUG)
+
+ from swh.loader.mercurial.tasks import LoadArchiveMercurial
+
+ t = LoadArchiveMercurial()
+ t.run(origin_url=origin_url, archive_path=archive_path, visit_date='2016-05-03T15:16:32+00:00')
+ ```
+
Platform: UNKNOWN
+Classifier: Programming Language :: Python :: 3
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
+Classifier: Operating System :: OS Independent
+Classifier: Development Status :: 4 - Beta
+Description-Content-Type: text/markdown
+Provides-Extra: testing
diff --git a/swh.loader.mercurial.egg-info/SOURCES.txt b/swh.loader.mercurial.egg-info/SOURCES.txt
index a796f44..363e866 100644
--- a/swh.loader.mercurial.egg-info/SOURCES.txt
+++ b/swh.loader.mercurial.egg-info/SOURCES.txt
@@ -1,39 +1,47 @@
.gitignore
AUTHORS
LICENSE
MANIFEST.in
Makefile
README.md
requirements-swh.txt
+requirements-test.txt
requirements.txt
setup.py
version.txt
debian/changelog
debian/compat
debian/control
debian/copyright
debian/rules
debian/source/format
docs/.gitignore
docs/Makefile
docs/conf.py
docs/index.rst
docs/_static/.placeholder
docs/_templates/.placeholder
swh/__init__.py
swh.loader.mercurial.egg-info/PKG-INFO
swh.loader.mercurial.egg-info/SOURCES.txt
swh.loader.mercurial.egg-info/dependency_links.txt
+swh.loader.mercurial.egg-info/entry_points.txt
swh.loader.mercurial.egg-info/requires.txt
swh.loader.mercurial.egg-info/top_level.txt
swh/loader/__init__.py
swh/loader/mercurial/__init__.py
swh/loader/mercurial/archive_extract.py
-swh/loader/mercurial/bundle20_loader.py
swh/loader/mercurial/bundle20_loader_verifier.py
swh/loader/mercurial/bundle20_reader.py
swh/loader/mercurial/chunked_reader.py
+swh/loader/mercurial/cli.py
swh/loader/mercurial/converters.py
+swh/loader/mercurial/loader.py
swh/loader/mercurial/objects.py
swh/loader/mercurial/slow_loader.py
-swh/loader/mercurial/tasks.py
\ No newline at end of file
+swh/loader/mercurial/tasks.py
+swh/loader/mercurial/tests/__init__.py
+swh/loader/mercurial/tests/test_loader.org
+swh/loader/mercurial/tests/test_loader.py
+swh/loader/mercurial/tests/resources/hello.tgz
+swh/loader/mercurial/tests/resources/the-sandbox.tgz
\ No newline at end of file
diff --git a/swh.loader.mercurial.egg-info/entry_points.txt b/swh.loader.mercurial.egg-info/entry_points.txt
new file mode 100644
index 0000000..0242428
--- /dev/null
+++ b/swh.loader.mercurial.egg-info/entry_points.txt
@@ -0,0 +1,3 @@
+[console_scripts]
+swh-loader-hg = swh.loader.mercurial.cli:main
+
diff --git a/swh.loader.mercurial.egg-info/requires.txt b/swh.loader.mercurial.egg-info/requires.txt
index 80eda68..4ec0aae 100644
--- a/swh.loader.mercurial.egg-info/requires.txt
+++ b/swh.loader.mercurial.egg-info/requires.txt
@@ -1,10 +1,15 @@
-hglib
+click
+patool
python-dateutil
+python-hglib
retrying
sqlitedict
swh.core>=0.0.36
-swh.loader.core>=0.0.33
-swh.model>=0.0.20
+swh.loader.core>=0.0.34
+swh.model>=0.0.27
swh.scheduler>=0.0.19
swh.storage>=0.0.95
vcversioner
+
+[testing]
+nose
diff --git a/swh/loader/mercurial/bundle20_loader_verifier.py b/swh/loader/mercurial/bundle20_loader_verifier.py
index ca8f308..6321b15 100644
--- a/swh/loader/mercurial/bundle20_loader_verifier.py
+++ b/swh/loader/mercurial/bundle20_loader_verifier.py
@@ -1,254 +1,255 @@
-# Copyright (C) 2017 The Software Heritage developers
+# Copyright (C) 2017-2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import code
import datetime
import hglib
import os
import random
import sys
import time
from binascii import hexlify, unhexlify
-from swh.model import hashutil
+from swh.model.hashutil import MultiHash
-from .bundle20_loader import HgBundle20Loader
+from .loader import HgBundle20Loader
from .converters import PRIMARY_ALGO as ALGO
from .objects import SimpleTree
class HgLoaderValidater(HgBundle20Loader):
def generate_all_blobs(self, validate=True, frequency=1):
print('GENERATING BLOBS')
i = 0
start = time.time()
u = set()
for blob, node_info in self.br.yield_all_blobs():
filename = node_info[0]
header = node_info[2]
i += 1
- bhash = hashutil.hash_data(blob, algorithms=set([ALGO]))[ALGO]
+ hashes = MultiHash.from_data(blob, hash_names=set([ALGO])).digest()
+ bhash = hashes[ALGO]
self.file_node_to_hash[header['node']] = bhash
u.update([bhash])
if validate:
if random.random() < frequency:
self.validate_blob(filename, header, blob)
if i % 10000 == 0:
print(i)
print('')
print('FOUND', i, 'BLOBS')
print('FOUND', len(u), 'UNIQUE BLOBS')
print('ELAPSED', time.time()-start)
def validate_blob(self, filename, header, blob):
if not self.hg:
self.hg = hglib.open(self.hgdir)
data = bytes(blob)
filepath = os.path.join(self.hg.root(), bytes(filename))
linknode = hexlify(header['linknode'])
cat_contents = self.hg.cat([filepath], rev=linknode)
if cat_contents != data:
print('INTERNAL ERROR ERROR ERROR ERROR')
print(filename)
print(header)
print('-----')
print(cat_contents)
print('---- vs ----')
print(data)
code.interact(local=dict(globals(), **locals()))
quit()
else:
print('v', end='')
def generate_all_trees(self, validate=True, frequency=1):
print('GENERATING MANIFEST TREES')
c = 0
n = 0
u = set()
start = time.time()
validated = 0
for header, tree, new_dirs in self.load_directories():
if validate and (c >= validated) and (random.random() < frequency):
self.validate_tree(tree, header, c)
for d in new_dirs:
u.add(d['id'])
c += 1
n += len(new_dirs)
print('.', end='')
if c % 20 == 0:
sys.stdout.flush()
if c % 10000 == 0:
print(c)
print('')
print('FOUND', c, 'COMMIT MANIFESTS')
print('FOUND', n, 'NEW DIRS')
print('FOUND', len(u), 'UNIQUE DIRS')
print('ELAPSED', time.time()-start)
def validate_tree(self, tree, header, i):
if not self.hg:
self.hg = hglib.open(self.hgdir)
commit_id = header['linknode']
if len(commit_id) == 20:
commit_id = hexlify(commit_id)
base_tree = SimpleTree()
base_files = list(self.hg.manifest(rev=commit_id))
bfiles = sorted([f[4] for f in base_files])
for p in base_files:
base_tree.add_blob(
p[4], self.file_node_to_hash[unhexlify(p[0])], p[3], p[1]
)
base_tree.hash_changed()
files = sorted(list(tree.flatten().keys()))
if tree != base_tree:
print('validating rev:', i, 'commit:', commit_id)
print('validating files:', len(files), len(base_files))
print(' INVALID TREE')
def so1(a):
keys = [k['name'] for k in a['entries']]
return b''.join(sorted(keys))
tree_dirs = [d for d in tree.yield_swh_directories()]
base_dirs = [d for d in base_tree.yield_swh_directories()]
tree_dirs.sort(key=so1)
base_dirs.sort(key=so1)
# for i in range(len(tree_dirs)):
# if tree_dirs[i] != base_dirs[i]:
# print(i)
# code.interact(local=dict(globals(), **locals()))
print('Program will quit after your next Ctrl-D')
code.interact(local=dict(globals(), **locals()))
quit()
else:
print('v', end='')
def generate_all_commits(self, validate=True, frequency=1):
i = 0
start = time.time()
for rev in self.get_revisions():
print('.', end='')
i += 1
if i % 20 == 0:
sys.stdout.flush()
print('')
print('FOUND', i, 'COMMITS')
print('ELAPSED', time.time()-start)
def runtest(self, hgdir, validate_blobs=False, validate_trees=False,
frequency=1.0, test_iterative=False):
"""
HgLoaderValidater().runtest('/home/avi/SWH/mozilla-unified')
"""
self.origin_id = 'test'
dt = datetime.datetime.now(tz=datetime.timezone.utc)
if test_iterative:
dt = dt - datetime.timedelta(10)
hgrepo = None
if (hgdir.lower().startswith('http:')
or hgdir.lower().startswith('https:')):
hgrepo, hgdir = hgdir, hgrepo
self.hgdir = hgdir
try:
print('preparing')
self.prepare(hgrepo, dt, hgdir)
self.file_node_to_hash = {}
# self.generate_all_blobs(validate=validate_blobs,
# frequency=frequency)
# self.generate_all_trees(validate=validate_trees, frequency=frequency)
# self.generate_all_commits()
print('getting contents')
cs = 0
for c in self.get_contents():
cs += 1
pass
print('getting directories')
ds = 0
for d in self.get_directories():
ds += 1
pass
revs = 0
print('getting revisions')
for rev in self.get_revisions():
revs += 1
pass
print('getting releases')
rels = 0
for rel in self.get_releases():
rels += 1
print(rel)
self.visit = 'foo'
print('getting snapshot')
o = self.get_snapshot()
print(o['branches'].keys())
finally:
self.cleanup()
print('final count: ',
'cs', cs, 'ds', ds, 'revs', revs, 'rels', rels)
def main():
if len(sys.argv) > 1:
test_repo = sys.argv[1]
else:
print('Please pass in the path to an HG repository.')
quit()
while test_repo[-1] == '/':
test_repo = test_repo[:-1]
if len(sys.argv) > 2:
validate_frequency = float(sys.argv[2])
else:
validate_frequency = 0.001
if len(sys.argv) > 3:
test_iterative = True
else:
test_iterative = False
HgLoaderValidater().runtest(test_repo, True, True, validate_frequency,
test_iterative)
if __name__ == '__main__':
main()
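The verifier above (and the loader later in this diff) replace `hashutil.hash_data` with the `MultiHash` API, matching the bump to swh.model >= 0.0.27 in requirements-swh.txt. A small standalone sketch of the new pattern as used in this patch; note that the caller now records the content length itself:

``` Python
from swh.model.hashutil import MultiHash, DEFAULT_ALGORITHMS

blob = b'example content'
# Previously: hashutil.hash_data(blob, with_length=True)
hashes = MultiHash.from_data(blob, hash_names=DEFAULT_ALGORITHMS).digest()
hashes['length'] = len(blob)  # length is now added explicitly by the caller
```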
diff --git a/swh/loader/mercurial/bundle20_reader.py b/swh/loader/mercurial/bundle20_reader.py
index f8f8b0d..ec24383 100644
--- a/swh/loader/mercurial/bundle20_reader.py
+++ b/swh/loader/mercurial/bundle20_reader.py
@@ -1,619 +1,619 @@
# Copyright (C) 2017 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
"""This document contains code for extracting all of the data from
a Mercurial version 2 bundle file. It is referenced by
bundle20_loader.py
"""
# =============================================================================
# =============================================================================
# BACKGROUND
# =============================================================================
# =============================================================================
#
# https://www.mercurial-scm.org/wiki/BundleFormat says:
# "The new bundle format design is described on the BundleFormat2 page."
#
# https://www.mercurial-scm.org/wiki/BundleFormat2#Format_of_the_Bundle2_Container says: # noqa
# "The latest description of the binary format can be found as comment in the
# Mercurial source code."
#
# https://www.mercurial-scm.org/repo/hg/file/default/mercurial/help/internals/bundles.txt says: # noqa
# "The 'HG20' format is not yet documented here. See the inline comments in
# 'mercurial/exchange.py' for now."
#
# -----------------------------------------------------------------------------
# Avi says:
# -----------------------------------------------------------------------------
#
# All of the above official(?) statements seem to be quite wrong.
#
# The mercurial-scm wiki is a cluster#@*& of missing pages, bad links, wrong
# information, obsolete information, undecipherable names, and half-started
# leavings that only sort of look like content. I don't understand who or what
# it's there for. I think that means it's not there for me?
#
# https://www.mercurial-scm.org/wiki/BundleFormat2#New_header is wrong and
# bizarre, and there isn't any other information on the page.
#
#
# https://www.mercurial-scm.org/repo/hg/file/de86a6872d06/mercurial/help/internals/changegroups.txt # noqa
# (`hg help internals.changegroups`) is very close to what we need.
# It is accurate, current, and thorough.
# It describes much of the internal structure, which is super helpful if you
# know in advance which info can be trusted, but it doesn't describe any of the
# file-level details, including the file headers and that the entire bundle
-# is broken into overlayed 4KB chunks starting from just after the bundle
+# is broken into overlaid 4KB chunks starting from just after the bundle
# header, nor does it describe what any of the component elements are used for,
# nor does it explain the meta-message segment in the blob deltas, nor does it
# explain the file flags occasionally appended to manifest file hashes. Also it
# says: "The [delta data] format is described more fully in 'hg help
# internals.bdiff'", which is also wrong. As far as I can tell, that
# file has never existed.
#
# It does however have one potentially extremely useful note buried in the
# middle that, in hindsight, could have significant implications for complexity
# and performance in future Mercurial loading work.
#
# It says: "In version 1, the delta is always applied against the previous node
# from the changegroup or the first parent if this is the first entry in the
# changegroup."
#
# If the next version of HG support for SWH can reliably get version 1 data,
# then it could be implemented entirely without worrying about ballooning
# memory utilization, which would shrink the code significantly and probably be
# faster too. So maybe HG10 bundles instead of HG20 bundles are superior for
# this task? But then I read that servers can optionally disable serving
# version 1 content, and I like to think that this loader could eventually
# be applied directly to a network stream without an intermediate phase for
# cloning and local bundling, so...It seemed like a good idea at the time?
#
# -----------------------------------------------------------------------------
# Other notes and thoughts:
# -----------------------------------------------------------------------------
# 1)
# This is a relatively minor detail, but
# Mercurial nodes are not content-addressable like Git's are.
#
# https://www.mercurial-scm.org/wiki/Nodeid explains: "If you modify a file,
# commit the change, and then modify it to restore the original contents, the
# contents are the same but the history is different, so the file will get a
# new nodeid. This history-sensitivity is obtained by calculating the nodeid
# from the concatenation of the parent nodeids with the file's contents..."
#
# The result is that we always have to collect and hash everything at least
# once in order to know if we've seen something like it before, because nothing
# tells us that the node we're looking at is unique. We can use node ids for
# linking disparate elements together (e.g. commit to manifest) but not for
# determining whether two elements in the same group are identical in all but
# descendency. So there's no way to save time on duplicate hashing. Well...
# there is the copied_file blob metadata, but, lol.
#
# 2)
# Most of the code complexity is due to dealing with 'version 2' changegroups,
# for which we need to keep track of the entire history of all updates made
# to a given file or working directory tree structure, because a revision
# delta could be applied over any of the prior revisions all the way back to
# rev 0, according to whenever files were branched/merged/uncommitted/etc. For
# very large repositories with a lot of churn, this can quickly expand to
# require multiple gigabytes of space, possibly exceeding RAM availability if
# one desires to keep as much data resident in memory as possible to boost
# performance. mozilla-unified, for instance, produces some 2 million+ blobs
# (1.6 million+ unique). Nested umpteen subdirectory levels deep, those blobs
# balloon into a quantity of directory subtrees that rapidly exceeds an 8GB RAM
# laptop's ability to keep them all active without a good amount of care and
# pruning. The code here tries to strike a balance between memory utilization
# and performance.
#
# This problem is also referenced in the last paragraph of the previous
# section, where potentially this problem wouldn't exist for 'version 1' data
# if we can reliably get it. Can we? Either that or not use bundles at all,
# which has other costs.
#
# 3)
# If the list of changed files stored by the changesets had indicated which
# of those changed files were added or modified and which ones were removed,
# this code could be much faster. Right now we have to perform a huge number of
# substring replacements (see the apply_revdata method) to produce a complete
# file manifest for each commit (as a string!!!) in order to know how to get
# the set of removed files from the next delta. We can intuit from every
# manifest delta which files were modified or added, but I believe there's no
# way to intuit which files were removed without actually having the complete
# prior state and without the list of removals being explicitly given. If you
# have an explicit list of all the files that were removed for any given commit
# changegroup, and you combine that with the delta updates in the manifest
# changegroups which detail the set of files that have been added or modified,
# then you wouldn't even have to apply any of the string deltas to get a
# complete understanding of the set of differences between one manifest and the
# next. Not having this effective speed boost is rather unfortunate; it would
# require only one extra stored byte per commit to differentiate removals and
# would make extracting bundles lightning fast.
# ============================================================================
##
import itertools
import struct
from binascii import unhexlify
from collections import OrderedDict
from datetime import datetime
from .chunked_reader import ChunkedFileReader
from .objects import SelectiveCache
def unpack(fmt_str, source):
"""Utility function for fetching the right number of bytes from a stream to
satisfy a struct.unpack pattern.
args:
fmt_str: a struct.unpack string pattern
(e.g. '>I' for 4 bytes big-endian)
source: any IO object that has a read() method which
returns an appropriate sequence of bytes
"""
ret = struct.unpack(fmt_str, source.read(struct.calcsize(fmt_str)))
if len(ret) == 1:
return ret[0]
return ret
class Bundle20Reader(object):
"""Parser for extracting data from Mercurial Bundle20 files.
NOTE: Currently only works on uncompressed HG20 bundles, but checking for
COMPRESSION=<2chars> and loading the appropriate stream decompressor
at that point would be trivial to add if necessary.
args:
bundlefile (str): name of the binary repository bundle file
cache_filename (str): path to the disk cache used (transited
to the SelectiveCache instance)
cache_size (int): tuning parameter for the upper RAM limit used by
historical data caches. The default is defined in the
SelectiveCache class.
"""
NAUGHT_NODE = b'\x00' * 20
def __init__(self, bundlefile, cache_filename, cache_size=None):
self.bundlefile = bundlefile
self.cache_filename = cache_filename
bfile = open(bundlefile, 'rb', buffering=200*1024*1024)
btype = bfile.read(4) # 'HG20'
if btype != b'HG20':
raise Exception(bundlefile,
b'Not an HG20 bundle. First 4 bytes:' + btype)
bfile.read(4) # '\x00\x00\x00\x00'
self.params = self.read_bundle_header(bfile)
# print('PARAMETERS', self.params)
self.num_commits = self.params[b'nbchanges']
self.filereader = ChunkedFileReader(bfile)
self.cache_size = cache_size
self.blobs_offset = None
self.changes_offset = self.filereader.tell()
self.changes_next_offset = None
self.manifests_offset = None
self.manifests_next_offset = None
self.id_to_info = {}
def read_bundle_header(self, bfile):
"""Parse the file header which describes the format and parameters.
See the structure diagram at the top of the file for more insight.
args:
bfile: bundle file handle with the cursor at the start offset of
the content header (the 9th byte in the file)
returns:
dict of decoded bundle parameters
"""
unpack('>I', bfile) # header length
chg_len = unpack('>B', bfile) # len('CHANGEGROUP') == 11
bfile.read(chg_len) # should say 'CHANGEGROUP'
unpack('>I', bfile) # probably \x00\x00\x00\x00
n_mandatory, n_advisory = unpack('>BB', bfile) # parameter counts
mandatory_params = [
(key_len, val_len)
for key_len, val_len
in [unpack('>BB', bfile) for i in range(n_mandatory)]
]
advisory_params = [
(key_len, val_len)
for key_len, val_len
in [unpack('>BB', bfile) for i in range(n_advisory)]
]
params = {}
for key_len, val_len in mandatory_params+advisory_params:
key = unpack('>%ds' % key_len, bfile)
val = int(unpack('>%ds' % val_len, bfile))
params[key] = val
return params
def revdata_iterator(self, bytes_to_read):
"""A chunk's revdata section is a series of start/end/length/data_delta
content updates called RevDiffs that indicate components of a text diff
applied to the node's basenode. The sum length of all diffs is the
length indicated at the beginning of the chunk at the start of the
header.
See the structure diagram at the top of the file for more insight.
args:
bytes_to_read: int total number of bytes in the chunk's revdata
yields:
(int, int, read iterator) representing a single text diff component
"""
while bytes_to_read > 0:
start_offset = unpack('>I', self.filereader)
end_offset = unpack('>I', self.filereader)
blocklen = unpack('>I', self.filereader)
delta_it = self.filereader.read_iterator(blocklen)
bytes_to_read -= (12 + blocklen)
yield (start_offset, end_offset, delta_it) # RevDiff
def read_chunk_header(self):
"""The header of a RevChunk describes the id ('node') for the current
change, the commit id ('linknode') associated with this change,
the parental heritage ('p1' and 'p2'), and the node to which the
revdata updates will apply ('basenode'). 'linknode' is the same as
'node' when reading the commit log because any commit is already
itself. 'basenode' for a changeset will be NAUGHT_NODE, because
changeset chunks include complete information and not diffs.
See the structure diagram at the top of the file for more insight.
returns:
dict of the next delta header
"""
header = self.filereader.read(100)
header = {
'node': header[0:20],
'p1': header[20:40],
'p2': header[40:60],
'basenode': header[60:80],
'linknode': header[80:100]
}
return header
def read_revchunk(self):
"""Fetch a complete RevChunk.
A RevChunk contains the collection of line changes made in a particular
update. header['node'] identifies which update. Commits, manifests, and
files all have these. Each chunk contains an indicator of the whole
chunk size, an update header, and then the body of the update as a
series of text diff components.
See the structure diagram at the top of the file for more insight.
returns:
tuple(dict, iterator) of (header, chunk data) if there is another
chunk in the group, else None
"""
size = unpack('>I', self.filereader) - 104
if size >= 0:
header = self.read_chunk_header()
return (header, self.revdata_iterator(size))
else:
return None # NullChunk
def extract_commit_metadata(self, data):
"""Converts the binary commit metadata format into a dict.
args:
data: bytestring of encoded commit information
returns:
dict of decoded commit information
"""
parts, message = data.split(b'\n\n', 1)
parts = parts.split(b'\n')
commit = {}
commit['message'] = message
commit['manifest'] = unhexlify(parts[0])
commit['user'] = parts[1]
tstamp, tz, *extra = parts[2].split(b' ')
commit['time'] = datetime.fromtimestamp(float(tstamp))
commit['time_offset_seconds'] = int(tz)
if extra:
commit['extra'] = extra[0]
commit['changed_files'] = parts[3:]
return commit
def skip_sections(self, num_sections=1):
"""Skip past sections quickly.
args:
num_sections: int number of sections to skip
"""
for i in range(num_sections):
size = unpack('>I', self.filereader)
while size >= 104:
self.filereader.seek(size - 4, from_current=True)
size = unpack('>I', self.filereader)
def apply_revdata(self, revdata_it, prev_state):
"""Compose the complete text body for a change from component deltas.
args:
revdata_it: output from the revdata_iterator method
prev_state: bytestring the base complete text on which the new
deltas will be applied
returns:
(bytestring, list, list) the new complete string and lists of added
and removed components (used in manifest processing)
"""
state = []
added = []
removed = []
next_start = 0
for delta_start, delta_end, rev_diff_it in revdata_it:
removed.append(prev_state[delta_start:delta_end])
added.append(b''.join(rev_diff_it))
state.append(prev_state[next_start:delta_start])
state.append(added[-1])
next_start = delta_end
state.append(prev_state[next_start:])
state = b''.join(state)
return (state, added, removed)
def skim_headers(self):
"""Get all header data from a change group but bypass processing of the
contained delta components.
yields:
output of read_chunk_header method for all chunks in the group
"""
size = unpack('>I', self.filereader) - 104
while size >= 0:
header = self.read_chunk_header()
self.filereader.seek(size, from_current=True)
yield header
size = unpack('>I', self.filereader) - 104
def group_iterator(self):
"""Bundle sections are called groups. These are composed of one or more
revision chunks of delta components. Iterate over all the chunks in a
group and hand each one back.
yields:
see output of read_revchunk method
"""
revchunk = self.read_revchunk()
while revchunk: # A group is terminated by a NullChunk
yield revchunk # (header, revdata_iterator)
revchunk = self.read_revchunk()
def yield_group_objects(self, cache_hints=None, group_offset=None):
"""Bundles are sectioned into groups: the log of all commits, the log
of all manifest changes, and a series of logs of blob changes (one for
each file). All groups are structured the same way, as a series of
revisions each with a series of delta components. Iterate over the
current group and return the completed object data for the current
update by applying all of the internal delta components to each prior
revision.
args:
cache_hints: see build_cache_hints (this will be built
automatically if not pre-built and passed in)
group_offset: int file position of the start of the desired group
yields:
(dict, bytestring, list, list) the output from read_chunk_header
followed by the output from apply_revdata
"""
if group_offset is not None:
self.filereader.seek(group_offset)
if cache_hints is None:
cache_hints = self.build_cache_hints()
data_cache = SelectiveCache(max_size=self.cache_size,
cache_hints=cache_hints,
filename=self.cache_filename)
# Loop over all revisions in the group
data = b''
for header, revdata_it in self.group_iterator():
node = header['node']
basenode = header['basenode']
data = data_cache.fetch(basenode) or b''
data, added, removed = self.apply_revdata(revdata_it, data)
data_cache.store(node, data)
yield (header, data, added, removed) # each RevChunk
def extract_meta_from_blob(self, data):
"""File revision data sometimes begins with a metadata section of
dubious value. Strip it off and maybe decode it. It seems to be mostly
useless. Why indicate that a file node is a copy of another node? You
can already get that information from the delta header.
args:
data: bytestring of one revision of a file, possibly with metadata
embedded at the start
returns:
(bytestring, dict) of (the blob data, the meta information)
"""
meta = {}
if data.startswith(b'\x01\n'):
empty, metainfo, data = data.split(b'\x01\n', 2)
metainfo = b'\x01\n' + metainfo + b'\x01\n'
if metainfo.startswith(b'copy:'):
# direct file copy (?)
copyinfo = metainfo.split(b'\n')
meta['copied_file'] = copyinfo[0][6:]
meta['copied_rev'] = copyinfo[1][9:]
elif metainfo.startswith(b'censored:'):
# censored revision deltas must be full-replacements (?)
meta['censored'] = metainfo
else:
# no idea
meta['text'] = metainfo
return data, meta
def seek_changelog(self):
"""Seek to the beginning of the change logs section.
"""
self.filereader.seek(self.changes_offset)
def seek_manifests(self):
"""Seek to the beginning of the manifests section.
"""
if self.manifests_offset is None:
self.seek_changelog()
self.skip_sections(1) # skip past commits
self.manifests_offset = self.filereader.tell()
else:
self.filereader.seek(self.manifests_offset)
def seek_filelist(self):
"""Seek to the beginning of the file changes section.
"""
if self.blobs_offset is None:
self.seek_manifests()
self.skip_sections(1) # skip past manifests
self.blobs_offset = self.filereader.tell()
else:
self.filereader.seek(self.blobs_offset)
def yield_all_blobs(self):
"""Gets blob data from the bundle.
yields:
(bytestring, (bytestring, int, dict)) of
(blob data, (file name, start offset of the file within the
bundle, node header))
"""
self.seek_filelist()
# Loop through all files that have commits
size = unpack('>I', self.filereader)
while size > 0:
file_name = self.filereader.read(size-4)
file_start_offset = self.filereader.tell()
# get all of the blobs for each file
for header, data, *_ in self.yield_group_objects():
blob, meta = self.extract_meta_from_blob(data)
yield blob, (file_name, file_start_offset, header)
size = unpack('>I', self.filereader)
def yield_all_changesets(self):
"""Gets commit data from the bundle.
yields:
(dict, dict) of (read_chunk_header output,
extract_commit_metadata output)
"""
self.seek_changelog()
for header, data, *_ in self.yield_group_objects():
changeset = self.extract_commit_metadata(data)
yield (header, changeset)
def yield_all_manifest_deltas(self, cache_hints=None):
"""Gets manifest data from the bundle.
In order to process the manifests in a reasonable amount of time, we
want to use only the deltas and not the entire manifest at each change,
because if we're processing them in sequential order (we are) then we
already have the previous state so we only need the changes.
args:
cache_hints: see build_cache_hints method
yields:
(dict, dict, dict) of (read_chunk_header output,
extract_manifest_elements output on added/modified files,
extract_manifest_elements on removed files)
"""
self.seek_manifests()
for header, data, added, removed in self.yield_group_objects(
cache_hints=cache_hints
):
added = self.extract_manifest_elements(added)
removed = self.extract_manifest_elements(removed)
yield (header, added, removed)
def build_manifest_hints(self):
"""Just a minor abstraction shortcut for the build_cache_hints method.
returns:
see build_cache_hints method
"""
self.seek_manifests()
return self.build_cache_hints()
def build_cache_hints(self):
"""The SelectiveCache class that we use in building nodes can accept a
set of key counters that makes its memory usage much more efficient.
returns:
dict of key=a node id, value=the number of times we
will need data from that node when building subsequent nodes
"""
cur_pos = self.filereader.tell()
hints = OrderedDict()
prev_node = None
for header in self.skim_headers():
basenode = header['basenode']
if (basenode != self.NAUGHT_NODE) and (basenode != prev_node):
# If the base isn't immediately prior, then cache it once more.
hints[basenode] = hints.get(basenode, 0) + 1
prev_node = header['node']
self.filereader.seek(cur_pos)
return hints
def extract_manifest_elements(self, data):
"""Parses data that looks like a manifest. In practice we only pass in
the bits extracted from the application of a manifest delta describing
which files were added/modified or which ones were removed.
args:
data: either a string or a list of strings that, when joined,
embodies the composition of a manifest. This takes the form
of repetitions of (without the brackets):
b'\x00[flag]\n' ...repeat...
where [flag] may or may not be there depending on whether the
file is specially flagged as executable or something
returns:
dict of key=file_path, value=(file_node, permissions) where
permissions is given according to the flag that optionally exists
in the data
"""
elements = {}
if isinstance(data, str):
data = data.split(b'\n')
else:
data = itertools.chain.from_iterable(
[chunk.split(b'\n') for chunk in data]
)
for line in data:
if line != b'':
f = line.split(b'\x00')
node = f[1]
flag_bytes = node[40:]
elements[f[0]] = (
unhexlify(node[:40]),
b'l' in flag_bytes,
b'755' if (b'x' in flag_bytes) else b'644'
)
return elements
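To make the chunk layout that `unpack` and `read_chunk_header` rely on a bit more concrete, here is a small self-contained toy (not part of the patch) that reads one empty chunk the same way the reader does:

``` Python
import struct
from io import BytesIO

def unpack(fmt_str, source):
    # Same helper as in bundle20_reader.py: read exactly the bytes
    # the struct pattern needs, then unpack them.
    ret = struct.unpack(fmt_str, source.read(struct.calcsize(fmt_str)))
    return ret[0] if len(ret) == 1 else ret

# A fake chunk: 4-byte big-endian size (104 = the size field plus the
# 100-byte header, i.e. an empty revdata section), then five 20-byte node ids.
stream = BytesIO(struct.pack('>I', 104) + bytes(100))
size = unpack('>I', stream)      # 104
header = stream.read(100)
node, p1, p2, basenode, linknode = (header[i:i+20] for i in range(0, 100, 20))
```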
diff --git a/swh/loader/mercurial/cli.py b/swh/loader/mercurial/cli.py
new file mode 100644
index 0000000..0d32e19
--- /dev/null
+++ b/swh/loader/mercurial/cli.py
@@ -0,0 +1,51 @@
+# Copyright (C) 2018 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import click
+import datetime
+import logging
+
+from itertools import chain
+
+LOGLEVELS = list(chain.from_iterable((logging._levelToName[lvl],
+ logging._levelToName[lvl].lower())
+ for lvl in sorted(logging._levelToName.keys())))
+
+
+@click.command()
+@click.argument('origin-url')
+@click.option('--hg-directory', '-d',
+ help=('Path to the hg (local) directory to load from. '
+ 'If unset, the hg repo will be cloned from the '
+ 'given (origin) url.'))
+@click.option('--hg-archive', '-a',
+ help=('Path to the hg archive file to load from.'))
+@click.option('--visit-date', '-D', help='Visit date (defaults to now).')
+@click.option('--log-level', '-l',
+ type=click.Choice(LOGLEVELS),
+ help='Log level.')
+def main(origin_url, hg_directory=None,
+ hg_archive=None, visit_date=None, log_level=None):
+
+ logging.basicConfig(
+ level=(log_level or 'DEBUG').upper(),
+ format='%(asctime)s %(process)d %(message)s')
+
+ if not visit_date:
+ visit_date = datetime.datetime.now(tz=datetime.timezone.utc)
+ kwargs = {'visit_date': visit_date,
+ 'origin_url': origin_url}
+ if hg_archive:
+ from .loader import HgArchiveBundle20Loader as HgLoader
+ kwargs['archive_path'] = hg_archive
+ else:
+ from .loader import HgBundle20Loader as HgLoader
+ kwargs['directory'] = hg_directory
+
+ return HgLoader().load(**kwargs)
+
+
+if __name__ == '__main__':
+ main()
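Because `main` is a regular click command, it can also be invoked programmatically, for instance from a test; a hypothetical example, not part of the patch:

``` Python
from swh.loader.mercurial.cli import main

# standalone_mode=False makes click return instead of calling sys.exit(),
# so the loader's result (or exceptions) propagate to the caller.
result = main(['--log-level', 'info',
               'https://www.mercurial-scm.org/repo/hello'],
              standalone_mode=False)
```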
diff --git a/swh/loader/mercurial/bundle20_loader.py b/swh/loader/mercurial/loader.py
similarity index 95%
rename from swh/loader/mercurial/bundle20_loader.py
rename to swh/loader/mercurial/loader.py
index fde2ff5..7a85340 100644
--- a/swh/loader/mercurial/bundle20_loader.py
+++ b/swh/loader/mercurial/loader.py
@@ -1,517 +1,526 @@
# Copyright (C) 2017-2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
"""This document contains a SWH loader for ingesting repository data
from Mercurial version 2 bundle files.
"""
# NOTE: The code here does expensive work twice in places because of the
# intermediate need to check for what is missing before sending to the database
# and the desire to not juggle very large amounts of data.
# TODO: Decide whether to also serialize to disk and read back more quickly
# from there. Maybe only for very large repos and fast drives.
# - Avi
import datetime
import hglib
import os
import random
import re
from dateutil import parser
from shutil import rmtree
from tempfile import mkdtemp
-from swh.model import hashutil, identifiers
+from swh.model import identifiers
+from swh.model.hashutil import (
+ MultiHash, hash_to_hex, hash_to_bytes,
+ DEFAULT_ALGORITHMS
+)
from swh.loader.core.loader import SWHStatelessLoader
from swh.loader.core.converters import content_for_storage
from swh.loader.core.utils import clean_dangling_folders
from . import converters
from .archive_extract import tmp_extract
from .bundle20_reader import Bundle20Reader
from .converters import PRIMARY_ALGO as ALGO
from .objects import SelectiveCache, SimpleTree
TAG_PATTERN = re.compile('[0-9A-Fa-f]{40}')
TEMPORARY_DIR_PREFIX_PATTERN = 'swh.loader.mercurial.'
class HgBundle20Loader(SWHStatelessLoader):
"""Mercurial loader able to deal with remote or local repository.
"""
CONFIG_BASE_FILENAME = 'loader/hg'
ADDITIONAL_CONFIG = {
'bundle_filename': ('str', 'HG20_none_bundle'),
'reduce_effort': ('bool', True), # default: Try to be smart about time
'temp_directory': ('str', '/tmp'),
'cache1_size': ('int', 800*1024*1024),
'cache2_size': ('int', 800*1024*1024),
}
def __init__(self, logging_class='swh.loader.mercurial.Bundle20Loader'):
super().__init__(logging_class=logging_class)
self.content_max_size_limit = self.config['content_size_limit']
self.bundle_filename = self.config['bundle_filename']
self.reduce_effort_flag = self.config['reduce_effort']
self.empty_repository = None
self.temp_directory = self.config['temp_directory']
self.cache1_size = self.config['cache1_size']
self.cache2_size = self.config['cache2_size']
self.working_directory = None
self.bundle_path = None
def pre_cleanup(self):
"""Cleanup potential dangling files from prior runs (e.g. OOM killed
tasks)
"""
clean_dangling_folders(self.temp_directory,
pattern_check=TEMPORARY_DIR_PREFIX_PATTERN,
log=self.log)
def cleanup(self):
"""Clean temporary working directory
"""
if self.bundle_path and os.path.exists(self.bundle_path):
self.log.debug('Cleaning up working bundle %s' % self.bundle_path)
os.unlink(self.bundle_path)
if self.working_directory and os.path.exists(self.working_directory):
self.log.debug('Cleaning up working directory %s' % (
self.working_directory, ))
rmtree(self.working_directory)
def get_heads(self, repo):
"""Read the closed branches heads (branch, bookmarks) and returns a
dict with branch_name (bytes) and mercurial's node id
(bytes). Those needs conversion to swh-ids. This is taken
care of in get_revisions.
"""
b = {}
for _, node_hash_id, _, branch_name, *_ in repo.heads():
- b[branch_name] = hashutil.hash_to_bytes(
+ b[branch_name] = hash_to_bytes(
node_hash_id.decode())
bookmarks = repo.bookmarks()
if bookmarks and bookmarks[0]:
for bookmark_name, _, target_short in bookmarks[0]:
target = repo[target_short].node()
- b[bookmark_name] = hashutil.hash_to_bytes(
- target.decode())
+ b[bookmark_name] = hash_to_bytes(target.decode())
return b
def prepare_origin_visit(self, *, origin_url, visit_date, **kwargs):
self.origin_url = origin_url
self.origin = {'url': self.origin_url, 'type': 'hg'}
if isinstance(visit_date, str): # visit_date can be string or datetime
visit_date = parser.parse(visit_date)
self.visit_date = visit_date
def prepare(self, *, origin_url, visit_date, directory=None):
"""Prepare the necessary steps to load an actual remote or local
repository.
To load a local repository, pass the optional directory
parameter as filled with a path to a real local folder.
To load a remote repository, pass the optional directory
parameter as None.
Args:
origin_url (str): Origin url to load
visit_date (str/datetime): Date of the visit
directory (str/None): The local directory to load
"""
self.branches = {}
self.tags = []
self.releases = {}
self.node_2_rev = {}
if not directory: # remote repository
self.working_directory = mkdtemp(
prefix=TEMPORARY_DIR_PREFIX_PATTERN,
suffix='-%s' % os.getpid(),
dir=self.temp_directory)
os.makedirs(self.working_directory, exist_ok=True)
self.hgdir = self.working_directory
self.log.debug('Cloning %s to %s' % (
self.origin['url'], self.hgdir))
hglib.clone(source=self.origin['url'], dest=self.hgdir)
else: # local repository
self.working_directory = None
self.hgdir = directory
self.bundle_path = os.path.join(self.hgdir, self.bundle_filename)
self.log.debug('Bundling at %s' % self.bundle_path)
with hglib.open(self.hgdir) as repo:
self.heads = self.get_heads(repo)
repo.bundle(bytes(self.bundle_path, 'utf-8'),
all=True,
type=b'none-v2')
self.cache_filename1 = os.path.join(
self.hgdir, 'swh-cache-1-%s' % (
hex(random.randint(0, 0xffffff))[2:], ))
self.cache_filename2 = os.path.join(
self.hgdir, 'swh-cache-2-%s' % (
hex(random.randint(0, 0xffffff))[2:], ))
try:
self.br = Bundle20Reader(bundlefile=self.bundle_path,
cache_filename=self.cache_filename1,
cache_size=self.cache1_size)
except FileNotFoundError as e:
# Empty repository! Still a successful visit targeting an
# empty snapshot
self.log.warn('%s is an empty repository!' % self.hgdir)
self.empty_repository = True
else:
self.reduce_effort = set()
if self.reduce_effort_flag:
now = datetime.datetime.now(tz=datetime.timezone.utc)
if (now - self.visit_date).days > 1:
# Assuming that self.visit_date would be today for
# a new visit, treat older visit dates as
# indication of wanting to skip some processing
# effort.
for header, commit in self.br.yield_all_changesets():
ts = commit['time'].timestamp()
if ts < self.visit_date.timestamp():
self.reduce_effort.add(header['node'])
def has_contents(self):
return not self.empty_repository
def has_directories(self):
return not self.empty_repository
def has_revisions(self):
return not self.empty_repository
def has_releases(self):
return not self.empty_repository
def fetch_data(self):
"""Fetch the data from the data source."""
pass
def get_contents(self):
"""Get the contents that need to be loaded."""
# NOTE: This method generates blobs twice to reduce memory usage
# without generating disk writes.
self.file_node_to_hash = {}
hash_to_info = {}
self.num_contents = 0
contents = {}
missing_contents = set()
for blob, node_info in self.br.yield_all_blobs():
self.num_contents += 1
file_name = node_info[0]
header = node_info[2]
+ length = len(blob)
if header['linknode'] in self.reduce_effort:
- content = hashutil.hash_data(blob, algorithms=[ALGO],
- with_length=True)
+ algorithms = [ALGO]
else:
- content = hashutil.hash_data(blob, with_length=True)
-
+ algorithms = DEFAULT_ALGORITHMS
+ h = MultiHash.from_data(blob, hash_names=algorithms)
+ content = h.digest()
+ content['length'] = length
blob_hash = content[ALGO]
self.file_node_to_hash[header['node']] = blob_hash
if header['linknode'] in self.reduce_effort:
continue
hash_to_info[blob_hash] = node_info
contents[blob_hash] = content
missing_contents.add(blob_hash)
if file_name == b'.hgtags':
# https://www.mercurial-scm.org/wiki/GitConcepts#Tag_model
# overwrite until the last one
self.tags = (t for t in blob.split(b'\n') if t != b'')
if contents:
missing_contents = set(
self.storage.content_missing(
list(contents.values()),
key_hash=ALGO
)
)
# Clusters needed blobs by file offset and then only fetches the
# groups at the needed offsets.
focs = {} # "file/offset/contents"
for blob_hash in missing_contents:
_, file_offset, header = hash_to_info[blob_hash]
focs.setdefault(file_offset, {})
focs[file_offset][header['node']] = blob_hash
hash_to_info = None
for offset, node_hashes in sorted(focs.items()):
for header, data, *_ in self.br.yield_group_objects(
group_offset=offset
):
node = header['node']
if node in node_hashes:
blob, meta = self.br.extract_meta_from_blob(data)
content = contents.pop(node_hashes[node], None)
if content:
content['data'] = blob
- content['length'] = len(blob)
yield content_for_storage(
content,
log=self.log,
max_content_size=self.content_max_size_limit,
origin_id=self.origin_id
)
def load_directories(self):
"""This is where the work is done to convert manifest deltas from the
repository bundle into SWH directories.
+
"""
self.mnode_to_tree_id = {}
cache_hints = self.br.build_manifest_hints()
def tree_size(t):
return t.size()
self.trees = SelectiveCache(cache_hints=cache_hints,
size_function=tree_size,
filename=self.cache_filename2,
max_size=self.cache2_size)
tree = SimpleTree()
for header, added, removed in self.br.yield_all_manifest_deltas(
cache_hints
):
node = header['node']
basenode = header['basenode']
tree = self.trees.fetch(basenode) or tree # working tree
for path in removed.keys():
tree = tree.remove_tree_node_for_path(path)
for path, info in added.items():
file_node, is_symlink, perms_code = info
tree = tree.add_blob(
path,
self.file_node_to_hash[file_node],
is_symlink,
perms_code
)
if header['linknode'] in self.reduce_effort:
self.trees.store(node, tree)
else:
new_dirs = []
self.mnode_to_tree_id[node] = tree.hash_changed(new_dirs)
self.trees.store(node, tree)
yield header, tree, new_dirs
def get_directories(self):
- """Get the directories that need to be loaded."""
+ """Compute directories to load
+
+ """
dirs = {}
self.num_directories = 0
for _, _, new_dirs in self.load_directories():
for d in new_dirs:
self.num_directories += 1
dirs[d['id']] = d
missing_dirs = list(dirs.keys())
if missing_dirs:
missing_dirs = self.storage.directory_missing(missing_dirs)
for _id in missing_dirs:
yield dirs[_id]
dirs = {}
def get_revisions(self):
- """Get the revisions that need to be loaded."""
+ """Compute revisions to load
+
+ """
revisions = {}
self.num_revisions = 0
for header, commit in self.br.yield_all_changesets():
if header['node'] in self.reduce_effort:
continue
self.num_revisions += 1
date_dict = identifiers.normalize_timestamp(
int(commit['time'].timestamp())
)
author_dict = converters.parse_author(commit['user'])
if commit['manifest'] == Bundle20Reader.NAUGHT_NODE:
directory_id = SimpleTree().hash_changed()
else:
directory_id = self.mnode_to_tree_id[commit['manifest']]
extra_meta = []
extra = commit.get('extra')
if extra:
for e in extra.split(b'\x00'):
k, v = e.split(b':', 1)
k = k.decode('utf-8')
extra_meta.append([k, v])
revision = {
'author': author_dict,
'date': date_dict,
'committer': author_dict,
'committer_date': date_dict,
'type': 'hg',
'directory': directory_id,
'message': commit['message'],
'metadata': {
- 'node': hashutil.hash_to_hex(header['node']),
+ 'node': hash_to_hex(header['node']),
'extra_headers': [
['time_offset_seconds',
str(commit['time_offset_seconds']).encode('utf-8')],
] + extra_meta
},
'synthetic': False,
'parents': []
}
p1 = self.node_2_rev.get(header['p1'])
p2 = self.node_2_rev.get(header['p2'])
if p1:
revision['parents'].append(p1)
if p2:
revision['parents'].append(p2)
- revision['id'] = hashutil.hash_to_bytes(
+ revision['id'] = hash_to_bytes(
identifiers.revision_identifier(revision)
)
self.node_2_rev[header['node']] = revision['id']
revisions[revision['id']] = revision
# Converts heads to use swh ids
self.heads = {
branch_name: self.node_2_rev[node_id]
for branch_name, node_id in self.heads.items()
}
missing_revs = revisions.keys()
if missing_revs:
missing_revs = set(
self.storage.revision_missing(list(missing_revs))
)
for r in missing_revs:
yield revisions[r]
self.mnode_to_tree_id = None
def _read_tag(self, tag, split_byte=b' '):
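"""Split a .hgtags line into (changeset hex node, tag name); the
name may itself contain the split byte, hence the re-join."""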
node, *name = tag.split(split_byte)
name = split_byte.join(name)
return node, name
def get_releases(self):
"""Get the releases that need to be loaded."""
self.num_releases = 0
releases = {}
missing_releases = []
for t in self.tags:
self.num_releases += 1
node, name = self._read_tag(t)
node = node.decode()
- node_bytes = hashutil.hash_to_bytes(node)
if not TAG_PATTERN.match(node):
self.log.warn('Wrong pattern (%s) found in tags. Skipping' % (
node, ))
continue
+ node_bytes = hash_to_bytes(node)
if node_bytes not in self.node_2_rev:
self.log.warn('No matching revision for tag %s '
'(hg changeset: %s). Skipping' %
(name.decode(), node))
continue
tgt_rev = self.node_2_rev[node_bytes]
release = {
'name': name,
'target': tgt_rev,
'target_type': 'revision',
'message': None,
'metadata': None,
'synthetic': False,
'author': {'name': None, 'email': None, 'fullname': b''},
'date': None
}
- id_hash = hashutil.hash_to_bytes(
+ id_hash = hash_to_bytes(
identifiers.release_identifier(release))
release['id'] = id_hash
missing_releases.append(id_hash)
releases[id_hash] = release
self.releases[name] = id_hash
if missing_releases:
missing_releases = set(
self.storage.release_missing(missing_releases))
for _id in missing_releases:
yield releases[_id]
def get_snapshot(self):
"""Get the snapshot that need to be loaded."""
branches = {}
for name, target in self.heads.items():
branches[name] = {'target': target, 'target_type': 'revision'}
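# Note: a release sharing its name with a branch overwrites that
# branch entry in the snapshot.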
for name, target in self.releases.items():
branches[name] = {'target': target, 'target_type': 'release'}
snap = {
'id': None,
'branches': branches,
}
snap['id'] = identifiers.identifier_to_bytes(
identifiers.snapshot_identifier(snap))
return snap
def get_fetch_history_result(self):
"""Return the data to store in fetch_history."""
return {
'contents': self.num_contents,
'directories': self.num_directories,
'revisions': self.num_revisions,
'releases': self.num_releases,
}
class HgArchiveBundle20Loader(HgBundle20Loader):
"""Mercurial loader for repository wrapped within archives.
"""
def __init__(self):
super().__init__(
logging_class='swh.loader.mercurial.HgArchiveBundle20Loader')
self.temp_dir = None
def prepare(self, *, origin_url, archive_path, visit_date):
self.temp_dir = tmp_extract(archive=archive_path,
dir=self.temp_directory,
prefix=TEMPORARY_DIR_PREFIX_PATTERN,
suffix='.dump-%s' % os.getpid(),
log=self.log,
source=origin_url)
repo_name = os.listdir(self.temp_dir)[0]
directory = os.path.join(self.temp_dir, repo_name)
super().prepare(origin_url=origin_url,
visit_date=visit_date, directory=directory)
def cleanup(self):
if self.temp_dir and os.path.exists(self.temp_dir):
rmtree(self.temp_dir)
super().cleanup()
diff --git a/swh/loader/mercurial/slow_loader.py b/swh/loader/mercurial/slow_loader.py
index a2d53e4..90740fe 100644
--- a/swh/loader/mercurial/slow_loader.py
+++ b/swh/loader/mercurial/slow_loader.py
@@ -1,469 +1,471 @@
# Copyright (C) 2017-2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
# WARNING WARNING WARNING WARNING
# hglib is too slow to be super useful. Unfortunately it's also the only
# python3 library for mercurial as of this writing. - Avi
import datetime
import hglib
import os
-from swh.model import identifiers, hashutil
+from swh.model import identifiers
+from swh.model.hashutil import MultiHash, DEFAULT_ALGORITHMS, hash_to_hex
from swh.loader.core.loader import SWHStatelessLoader
from .converters import parse_author, PRIMARY_ALGO as ALGO
OS_PATH_SEP = os.path.sep.encode('utf-8')
def data_to_content_id(data):
size = len(data)
ret = {
'length': size,
}
ret.update(identifiers.content_identifier({'data': data}))
return ret
def blob_to_content_dict(data, existing_hashes=None, max_size=None,
logger=None):
"""Convert blob data to a SWH Content. If the blob already
has hashes computed, don't recompute them.
TODO: This should be unified with similar functions in other places.
args:
data: raw blob bytes
existing_hashes: dict of hash algorithm:value pairs
max_size: size over which blobs should be rejected
logger: logging class instance
returns:
A Software Heritage "content".
"""
existing_hashes = existing_hashes or {}
size = len(data)
content = {
'length': size,
}
content.update(existing_hashes)
hash_types = list(existing_hashes.keys())
- hashes_to_do = hashutil.DEFAULT_ALGORITHMS.difference(hash_types)
- content.update(hashutil.hash_data(data, algorithms=hashes_to_do))
+ hashes_to_do = DEFAULT_ALGORITHMS.difference(hash_types)
+ hashes = MultiHash.from_data(data, hash_names=hashes_to_do).digest()
+ content.update(hashes)
if max_size and (size > max_size):
content.update({
'status': 'absent',
'reason': 'Content too large',
})
if logger:
- id_hash = hashutil.hash_to_hex(content[ALGO])
+ id_hash = hash_to_hex(content[ALGO])
logger.info(
'Skipping content %s, too large (%s > %s)'
% (id_hash, size, max_size),
extra={
'swh_type': 'loader_content_skip',
'swh_id': id_hash,
'swh_size': size
}
)
else:
content.update({'data': data, 'status': 'visible'})
return content
class SimpleBlob:
""" Stores basic metadata for a blob object.
"""
kind = 'file'
def __init__(self, file_hash, file_mode):
self.hash = file_hash
if not isinstance(file_mode, int):
self.mode = 0o100000 + int(file_mode, 8)
else:
self.mode = file_mode
class SimpleTree(dict):
""" Stores metadata for a nested 'tree'-like object.
"""
kind = 'dir'
mode = 0o040000
def add_tree_node_for_path(self, path):
"""Deeply nests SimpleTrees according to a directory path and returns
a cursor to the deepest one"""
node = self
for d in path.split(OS_PATH_SEP):
node = node.setdefault(d, SimpleTree())
return node
def remove_tree_node_for_path(self, path):
"""Deletes a SimpleBlob from inside nested SimpleTrees according to
the given file path"""
first, sep, rest = path.partition(OS_PATH_SEP)
if rest:
self[first].remove_tree_node_for_path(rest)
if not self.get(first):
del self[first]
else:
del self[first]
def add_blob(self, file_path, file_hash, file_mode):
"""Deeply nests a SimpleBlob inside nested SimpleTrees according to
the given file path"""
fdir = os.path.dirname(file_path)
fbase = os.path.basename(file_path)
if fdir:
node = self.add_tree_node_for_path(fdir)
else:
node = self
node[fbase] = SimpleBlob(file_hash, file_mode)
class HgLoader(SWHStatelessLoader):
"""Load a mercurial repository from a directory.
"""
CONFIG_BASE_FILENAME = 'loader/hg'
def __init__(self, logging_class='swh.loader.mercurial.HgLoader'):
super().__init__(logging_class=logging_class)
def prepare_origin_visit(self, origin_url, directory, visit_date):
self.origin = {
'type': 'hg',
'url': origin_url
}
self.visit_date = visit_date
def prepare(self, origin_url, directory, visit_date):
"""see base.BaseLoader.prepare"""
self.repo = hglib.open(directory)
self.node_to_blob_hash = {}
self.blob_hash_to_file_rev = {}
self.commit_trees = {}
self.unique_trees = {}
self.revisions = {}
def fetch_data(self):
"""Fetch the data from the data source"""
pass
def has_contents(self):
"""Checks whether we need to load contents"""
# if we have any revisions, then obviously we have contents.
return self.has_revisions()
def iter_changelog(self):
"""Iterate over the repository log"""
yield from self.repo.log('0:tip', removed=True)
def get_node_file_if_new(self, f, rev, node_hash):
"""Load a blob from disk"""
# Fast if the node hash is already cached. Somehow this shortcuts a
# meaningful but not huge percentage of the loads for a repository.
if node_hash not in self.node_to_blob_hash:
file_path = os.path.join(self.repo.root(), f)
data = self.repo.cat([file_path], rev)
blob_hash = identifiers.content_identifier(
{'data': data}
)[ALGO]
self.node_to_blob_hash[node_hash] = blob_hash
if blob_hash not in self.blob_hash_to_file_rev:
# new blob
self.blob_hash_to_file_rev[blob_hash] = (file_path, rev)
return blob_hash, data
return self.node_to_blob_hash[node_hash], None
def get_content_ids(self):
"""Get all the contents, but trim away the actual data"""
self.node_to_blob_hash = {}
self.blob_hash_to_file_rev = {}
self.num_contents = 0
for li in self.iter_changelog():
c = self.repo[li]
rev = c.rev()
manifest = c.manifest()
for f in c.added() + c.modified():
node_hash = manifest[f]
blob_hash, data = self.get_node_file_if_new(f, rev, node_hash)
if data is not None: # new blob
self.num_contents += 1
yield data_to_content_id(data)
def get_contents(self):
"""Get the contents that need to be loaded"""
# This method unfortunately loads and hashes the blobs twice.
max_content_size = self.config['content_size_limit']
missing_contents = set(
self.storage.content_missing(
self.get_content_ids(),
ALGO
)
)
for oid in missing_contents:
file_path, rev = self.blob_hash_to_file_rev[oid]
data = self.repo.cat([file_path], rev)
yield blob_to_content_dict(
data, max_size=max_content_size, logger=self.log
)
def has_directories(self):
"""Checks whether we need to load directories"""
# if we have any revs, we must also have dirs
return self.has_revisions()
def get_directories(self):
"""Get the directories that need to be loaded"""
missing_dirs = set(self.storage.directory_missing(
sorted(self.unique_trees.keys())
))
for dir_hash in missing_dirs:
yield self.unique_trees[dir_hash]
def has_revisions(self):
"""Checks whether we need to load revisions"""
self.num_revisions = int(self.repo.tip()[0]) + 1
return self.num_revisions > 0
def update_tree_from_rev(self, tree, rev, only_these_files=None):
"""Iterates over changes in a revision and adds corresponding
SimpleBlobs to a SimpleTree"""
if rev >= 0:
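# Index manifest entries by file path (field 4); fields 0 and 1 of
# each entry (node hash and file mode) are used below.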
manifest = {k[4]: k for k in self.repo.manifest(rev=rev)}
loop_keys = only_these_files or manifest.keys()
for f in loop_keys:
node_hash = manifest[f][0]
file_mode = manifest[f][1]
file_hash, _ = self.get_node_file_if_new(f, rev, node_hash)
tree.add_blob(f, file_hash, file_mode)
return tree
def reconstruct_tree(self, directory):
"""Converts a flat directory into nested SimpleTrees."""
# This method exists because the code was already written to use
# SimpleTree before then reducing memory use and converting to the
# canonical format. A refactor using lookups instead of nesting could
# obviate the need.
new_tree = SimpleTree()
for entry in directory['entries']:
tgt = entry['target']
perms = entry['perms']
name = entry['name']
if tgt in self.unique_trees: # subtree
new_tree[name] = self.reconstruct_tree(self.unique_trees[tgt])
else: # blob
new_tree[name] = SimpleBlob(tgt, perms)
new_tree.hash = directory['id']
return new_tree
def collapse_tree(self, tree):
"""Converts nested SimpleTrees into multiple flat directories."""
# This method exists because the code was already written to use
# SimpleTree before then reducing memory use and converting to the
# canonical format. A refactor using lookups instead of nesting could
# obviate the need.
directory = {
'entries': [
{
'name': k,
'perms': v.mode,
'type': v.kind,
'target': (isinstance(v, SimpleBlob)
and v.hash
or self.collapse_tree(v))
}
for k, v in tree.items()
]
}
tree.hash = identifiers.directory_identifier(directory)
directory['id'] = tree.hash
self.unique_trees[tree.hash] = directory
return tree.hash
def get_revision_ids(self):
"""Get the revisions that need to be loaded"""
self.unique_trees = {}
commit_tree = None
for li in self.iter_changelog():
c = self.repo[li[1]]
rev = c.rev()
# start from the parent state
p1 = c.p1().rev()
if p1 in self.commit_trees:
if p1 != rev-1:
# Most of the time, a revision will inherit from the
# previous one. In those cases we can reuse commit_tree,
# otherwise build a new one here.
parent_tree = self.unique_trees[self.commit_trees[p1]]
commit_tree = self.reconstruct_tree(parent_tree)
else:
commit_tree = self.update_tree_from_rev(SimpleTree(), p1)
# remove whatever is removed
for f in c.removed():
commit_tree.remove_tree_node_for_path(f)
# update whatever is updated
self.update_tree_from_rev(commit_tree, rev, c.added()+c.modified())
self.commit_trees[rev] = self.collapse_tree(commit_tree)
date_dict = identifiers.normalize_timestamp(
int(c.date().timestamp())
)
author_dict = parse_author(c.author())
parents = []
for p in c.parents():
if p.rev() >= 0:
parents.append(self.revisions[p.node()]['id'])
phase = c.phase() # bytes
rev = str(rev).encode('utf-8')
hidden = str(c.hidden()).encode('utf-8')
hg_headers = [['phase', phase], ['rev', rev], ['hidden', hidden]]
revision = {
'author': author_dict,
'date': date_dict,
'committer': author_dict,
'committer_date': date_dict,
'type': 'hg',
'directory': identifiers.identifier_to_bytes(commit_tree.hash),
'message': c.description(),
'metadata': {
'extra_headers': hg_headers
},
'synthetic': False,
'parents': parents,
}
revision['id'] = identifiers.identifier_to_bytes(
identifiers.revision_identifier(revision))
self.revisions[c.node()] = revision
for n, r in self.revisions.items():
yield {'node': n, 'id': r['id']}
def get_revisions(self):
"""Get the revision identifiers from the repository"""
revs = {
r['id']: r['node']
for r in self.get_revision_ids()
}
missing_revs = set(self.storage.revision_missing(revs.keys()))
for r in missing_revs:
yield self.revisions[revs[r]]
def has_releases(self):
"""Checks whether we need to load releases"""
self.num_releases = len([t for t in self.repo.tags() if not t[3]])
return self.num_releases > 0
def get_releases(self):
"""Get the releases that need to be loaded"""
releases = {}
for t in self.repo.tags():
islocal = t[3]
name = t[0]
if (name != b'tip' and not islocal):
short_hash = t[2]
node_id = self.repo[short_hash].node()
target = self.revisions[node_id]['id']
release = {
'name': name,
'target': target,
'target_type': 'revision',
'message': None,
'metadata': None,
'synthetic': False,
'author': {'name': None, 'email': None, 'fullname': b''},
'date': None
}
id_bytes = identifiers.identifier_to_bytes(
identifiers.release_identifier(release))
release['id'] = id_bytes
releases[id_bytes] = release
missing_rels = set(self.storage.release_missing(
sorted(releases.keys())
))
yield from (releases[r] for r in missing_rels)
def get_snapshot(self):
"""Get the snapshot that need to be loaded"""
self.num_snapshot = 1
def _get_branches(repo=self.repo):
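# repo.bookmarks() returns (bookmark list, active index); only the
# list itself is needed here.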
for t in (
repo.tags() + repo.branches() + repo.bookmarks()[0]
):
name = t[0]
short_hash = t[2]
node = self.repo[short_hash].node()
yield name, {
'target': self.revisions[node]['id'],
'target_type': 'revision'
}
snap = {
'branches': {
name: branch
for name, branch in _get_branches()
}
}
snap['id'] = identifiers.identifier_to_bytes(
identifiers.snapshot_identifier(snap))
return snap
def get_fetch_history_result(self):
"""Return the data to store in fetch_history for the current loader"""
return {
'contents': self.num_contents,
'directories': len(self.unique_trees),
'revisions': self.num_revisions,
'releases': self.num_releases,
'snapshot': self.num_snapshot,
}
def save_data(self):
"""We already have the data locally, no need to save it"""
pass
def eventful(self):
"""Whether the load was eventful"""
return True
if __name__ == '__main__':
import logging
import sys
logging.basicConfig(
level=logging.DEBUG,
format='%(asctime)s %(process)d %(message)s'
)
loader = HgLoader()
origin_url = sys.argv[1]
directory = sys.argv[2]
visit_date = datetime.datetime.now(tz=datetime.timezone.utc)
print(loader.load(origin_url, directory, visit_date))
diff --git a/swh/loader/mercurial/tasks.py b/swh/loader/mercurial/tasks.py
index 8162a3c..6e2703d 100644
--- a/swh/loader/mercurial/tasks.py
+++ b/swh/loader/mercurial/tasks.py
@@ -1,43 +1,43 @@
# Copyright (C) 2017-2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from swh.scheduler.task import Task
-from .bundle20_loader import HgBundle20Loader, HgArchiveBundle20Loader
+from .loader import HgBundle20Loader, HgArchiveBundle20Loader
-class LoadMercurialTsk(Task):
+class LoadMercurial(Task):
"""Mercurial repository loading
"""
task_queue = 'swh_loader_mercurial'
- def run_task(self, *, origin_url, visit_date, directory):
+ def run_task(self, *, origin_url, visit_date=None, directory=None):
"""Import a mercurial tarball into swh.
Args: see :func:`DepositLoader.load`.
"""
loader = HgBundle20Loader()
loader.log = self.log
return loader.load(origin_url=origin_url,
directory=directory,
visit_date=visit_date)
-class LoadArchiveMercurialTsk(Task):
+class LoadArchiveMercurial(Task):
task_queue = 'swh_loader_mercurial_archive'
def run_task(self, *, origin_url, archive_path, visit_date):
"""Import a mercurial tarball into swh.
Args: see :func:`HgArchiveBundle20Loader.load`.
"""
loader = HgArchiveBundle20Loader()
loader.log = self.log
return loader.load(origin_url=origin_url,
archive_path=archive_path,
visit_date=visit_date)
diff --git a/swh/loader/mercurial/tests/__init__.py b/swh/loader/mercurial/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/swh/loader/mercurial/tests/resources/hello.tgz b/swh/loader/mercurial/tests/resources/hello.tgz
new file mode 100644
index 0000000..4b9789a
Binary files /dev/null and b/swh/loader/mercurial/tests/resources/hello.tgz differ
diff --git a/swh/loader/mercurial/tests/resources/the-sandbox.tgz b/swh/loader/mercurial/tests/resources/the-sandbox.tgz
new file mode 100644
index 0000000..c5a0f38
Binary files /dev/null and b/swh/loader/mercurial/tests/resources/the-sandbox.tgz differ
diff --git a/swh/loader/mercurial/tests/test_loader.org b/swh/loader/mercurial/tests/test_loader.org
new file mode 100644
index 0000000..cfd7c8f
--- /dev/null
+++ b/swh/loader/mercurial/tests/test_loader.org
@@ -0,0 +1,121 @@
+#+title: Where the loader test data comes from
+
+Mercurial repositories are archived within the folder
+swh/loader/mercurial/tests/resources. Each archive contains one
+mercurial repository.
+
+The following sections show the commands executed from within each
+repository to retrieve the information the tests rely on.
+
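+For instance, to reproduce the listings below, extract an archive and
+run the commands from within the resulting repository (here /tmp is
+only an example location; the archive is expected to unpack into a
+folder named after the repository, as the tests assume):
+
+#+BEGIN_SRC sh
+$ tar xzf swh/loader/mercurial/tests/resources/the-sandbox.tgz -C /tmp
+$ cd /tmp/the-sandbox
+$ hg branches
+#+END_SRC
+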
+* the-sandbox
+
+Archive: the-sandbox.tgz
+
+** branches
+
+Listing of branches and their tip:
+#+BEGIN_SRC sh
+$ hg branches
+develop 57:76cc0882284d
+default 2:2f13849f14f5 (inactive)
+#+END_SRC
+
+** Changesets
+
+#+BEGIN_SRC sh
+$ for i in {0..57}; do hg checkout $i > /dev/null; echo $i $(swh-hashtree --ignore '.hg' --path .); done
+0 e2e117569b086ceabeeedee4acd95f35298d4553
+1 9cd8160c67ac4b0bc97e2e2cd918a580425167d3
+2 180bd57623a7c2c47a8c43514a5f4d903503d0aa
+3 180bd57623a7c2c47a8c43514a5f4d903503d0aa
+4 180bd57623a7c2c47a8c43514a5f4d903503d0aa
+5 180bd57623a7c2c47a8c43514a5f4d903503d0aa
+6 180bd57623a7c2c47a8c43514a5f4d903503d0aa
+7 180bd57623a7c2c47a8c43514a5f4d903503d0aa
+8 180bd57623a7c2c47a8c43514a5f4d903503d0aa
+9 180bd57623a7c2c47a8c43514a5f4d903503d0aa
+10 180bd57623a7c2c47a8c43514a5f4d903503d0aa
+11 180bd57623a7c2c47a8c43514a5f4d903503d0aa
+12 180bd57623a7c2c47a8c43514a5f4d903503d0aa
+13 180bd57623a7c2c47a8c43514a5f4d903503d0aa
+14 180bd57623a7c2c47a8c43514a5f4d903503d0aa
+15 180bd57623a7c2c47a8c43514a5f4d903503d0aa
+16 180bd57623a7c2c47a8c43514a5f4d903503d0aa
+17 180bd57623a7c2c47a8c43514a5f4d903503d0aa
+18 180bd57623a7c2c47a8c43514a5f4d903503d0aa
+19 180bd57623a7c2c47a8c43514a5f4d903503d0aa
+20 180bd57623a7c2c47a8c43514a5f4d903503d0aa
+21 180bd57623a7c2c47a8c43514a5f4d903503d0aa
+22 180bd57623a7c2c47a8c43514a5f4d903503d0aa
+23 180bd57623a7c2c47a8c43514a5f4d903503d0aa
+24 180bd57623a7c2c47a8c43514a5f4d903503d0aa
+25 180bd57623a7c2c47a8c43514a5f4d903503d0aa
+26 180bd57623a7c2c47a8c43514a5f4d903503d0aa
+27 180bd57623a7c2c47a8c43514a5f4d903503d0aa
+28 180bd57623a7c2c47a8c43514a5f4d903503d0aa
+29 180bd57623a7c2c47a8c43514a5f4d903503d0aa
+30 180bd57623a7c2c47a8c43514a5f4d903503d0aa
+31 180bd57623a7c2c47a8c43514a5f4d903503d0aa
+32 180bd57623a7c2c47a8c43514a5f4d903503d0aa
+33 180bd57623a7c2c47a8c43514a5f4d903503d0aa
+34 180bd57623a7c2c47a8c43514a5f4d903503d0aa
+35 180bd57623a7c2c47a8c43514a5f4d903503d0aa
+36 180bd57623a7c2c47a8c43514a5f4d903503d0aa
+37 180bd57623a7c2c47a8c43514a5f4d903503d0aa
+38 180bd57623a7c2c47a8c43514a5f4d903503d0aa
+39 180bd57623a7c2c47a8c43514a5f4d903503d0aa
+40 180bd57623a7c2c47a8c43514a5f4d903503d0aa
+41 180bd57623a7c2c47a8c43514a5f4d903503d0aa
+42 180bd57623a7c2c47a8c43514a5f4d903503d0aa
+43 180bd57623a7c2c47a8c43514a5f4d903503d0aa
+44 180bd57623a7c2c47a8c43514a5f4d903503d0aa
+45 180bd57623a7c2c47a8c43514a5f4d903503d0aa
+46 180bd57623a7c2c47a8c43514a5f4d903503d0aa
+47 180bd57623a7c2c47a8c43514a5f4d903503d0aa
+48 180bd57623a7c2c47a8c43514a5f4d903503d0aa
+49 180bd57623a7c2c47a8c43514a5f4d903503d0aa
+50 180bd57623a7c2c47a8c43514a5f4d903503d0aa
+51 180bd57623a7c2c47a8c43514a5f4d903503d0aa
+52 180bd57623a7c2c47a8c43514a5f4d903503d0aa
+53 180bd57623a7c2c47a8c43514a5f4d903503d0aa
+54 180bd57623a7c2c47a8c43514a5f4d903503d0aa
+55 180bd57623a7c2c47a8c43514a5f4d903503d0aa
+56 180bd57623a7c2c47a8c43514a5f4d903503d0aa
+57 180bd57623a7c2c47a8c43514a5f4d903503d0aa
+#+END_SRC
+
+Note: swh-hashtree is a CLI tool defined in swh-model/bin/swh-hashtree
+
+* hello
+
+Archive: hello.tgz
+
+** branches
+
+#+BEGIN_SRC sh
+$ hg branches
+default 1:82e55d328c8c
+#+END_SRC
+
+** tags
+
+I added a tag to have some more data to load (the first repository has no tags):
+#+BEGIN_SRC sh
+$ hg tags
+tip 2:b985ae4a07e1
+0.1 1:82e55d328c8c
+#+END_SRC
+
+#+BEGIN_SRC sh
+$ cat .hgtags
+82e55d328c8ca4ee16520036c0aaace03a5beb65 0.1
+#+END_SRC
+
+** Changesets
+
+#+BEGIN_SRC sh
+$ for i in {0..2}; do hg checkout $i > /dev/null; echo $i $(swh-hashtree --ignore '.hg' --path .); done
+0 43d727f2f3f2f7cb3b098ddad1d7038464a4cee2
+1 b3f85f210ff86d334575f64cb01c5bf49895b63e
+2 8f2be433c945384c85920a8e60f2a68d2c0f20fb
+#+END_SRC
diff --git a/swh/loader/mercurial/tests/test_loader.py b/swh/loader/mercurial/tests/test_loader.py
new file mode 100644
index 0000000..8db2e2e
--- /dev/null
+++ b/swh/loader/mercurial/tests/test_loader.py
@@ -0,0 +1,251 @@
+# Copyright (C) 2018 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import os
+
+from nose.tools import istest
+
+from swh.loader.core.tests import BaseLoaderTest, LoaderNoStorage
+from swh.loader.mercurial.loader import HgBundle20Loader
+
+
+class MockStorage:
+ """A mixin inhibited storage overriding *_missing methods. Those are
+ called from within the mercurial loader.
+
+ Rationale: Need to take control of the current behavior prior
+ to refactor it. The end game is to remove this when we will
+ have tests ok.
+
+ """
+ def content_missing(self, contents, key_hash='sha1'):
+ return [c[key_hash] for c in contents]
+
+ def directory_missing(self, directories):
+ return directories
+
+ def release_missing(self, releases):
+ return releases
+
+ def revision_missing(self, revisions):
+ return revisions
+
+
+class BaseHgLoaderTest(BaseLoaderTest):
+ """Mixin base loader test to prepare the mercurial
+ repository to uncompress, load and test the results.
+
+ This sets up
+
+ """
+ def setUp(self, archive_name='the-sandbox.tgz', filename='the-sandbox'):
+ super().setUp(archive_name=archive_name, filename=filename,
+ prefix_tmp_folder_name='swh.loader.mercurial.',
+ start_path=os.path.dirname(__file__))
+
+
+class HgLoaderNoStorage(LoaderNoStorage, HgBundle20Loader):
+ """The mercurial loader to test.
+
+ Its behavior has been changed to:
+ - not use any persistence (no real storage; for now a passthrough
+ storage with no filtering)
+ - not use the default configuration loading
+
+ At the end of the tests, you can check that the right objects
+ have been loaded.
+
+ """
+ ADDITIONAL_CONFIG = {
+ 'reduce_effort': ('bool', False), # FIXME: This needs to be
+ # checked (for now, this
+ # is not deployed in
+ # production.)
+ 'temp_directory': ('str', '/tmp/swh.loader.mercurial'),
+ 'cache1_size': ('int', 800*1024*1024),
+ 'cache2_size': ('int', 800*1024*1024),
+ 'bundle_filename': ('str', 'HG20_none_bundle'),
+ }
+
+ def __init__(self):
+ super().__init__()
+ self.origin_id = 1
+ self.visit = 1
+ self.storage = MockStorage()
+
+
+class LoaderITest1(BaseHgLoaderTest):
+ """Load a mercurial repository without release
+
+ """
+ def setUp(self):
+ super().setUp()
+ self.loader = HgLoaderNoStorage()
+
+ @istest
+ def load(self):
+ """Load a repository with multiple branches results in 1 snapshot
+
+ """
+ # when
+ self.loader.load(
+ origin_url=self.repo_url,
+ visit_date='2016-05-03 15:16:32+00',
+ directory=self.destination_path)
+
+ # then
+ self.assertCountContents(2)
+ self.assertCountDirectories(3)
+ self.assertCountReleases(0)
+ self.assertCountRevisions(58)
+
+ tip_revision_develop = 'a9c4534552df370f43f0ef97146f393ef2f2a08c'
+ tip_revision_default = '70e750bb046101fdced06f428e73fee471509c56'
+ # same from rev 2 onward
+ directory_hash = '180bd57623a7c2c47a8c43514a5f4d903503d0aa'
+ # cf. test_loader.org for an explanation of where those hashes
+ # come from
+ expected_revisions = {
+ # revision hash | directory hash # noqa
+ 'aafb69fd7496ca617f741d38c40808ff2382aabe': 'e2e117569b086ceabeeedee4acd95f35298d4553', # noqa
+ 'b6932cb7f59e746899e4804f3d496126d1343615': '9cd8160c67ac4b0bc97e2e2cd918a580425167d3', # noqa
+ tip_revision_default: directory_hash,
+ '18012a93d5aadc331c468dac84b524430f4abc19': directory_hash,
+ 'bec4c0a31b0b2502f44f34aeb9827cd090cca621': directory_hash,
+ '5f4eba626c3f826820c4475d2d81410759ec911b': directory_hash,
+ 'dcba06661c607fe55ec67b1712d153b69f65e38c': directory_hash,
+ 'c77e776d22548d47a8d96463a3556172776cd59b': directory_hash,
+ '61d762d65afb3150e2653d6735068241779c1fcf': directory_hash,
+ '40def747398c76ceec1bd248e3a6cb2a52e22dc5': directory_hash,
+ '6910964416438ca8d1698f6295871d727c4d4851': directory_hash,
+ 'be44d5e6cc66580f59c108f8bff5911ee91a22e4': directory_hash,
+ 'c4a95d5097519dedac437fddf0ef775136081241': directory_hash,
+ '32eb0354a660128e205bf7c3a84b46040ef70d92': directory_hash,
+ 'dafa445964230e808148db043c126063ea1dc9b6': directory_hash,
+ 'a41e2a548ba51ee47f22baad8e88994853d3e2f5': directory_hash,
+ 'dc3e3ab7fe257d04769528e5e17ad9f1acb44659': directory_hash,
+ 'd2164061453ecb03d4347a05a77db83f706b8e15': directory_hash,
+ '34192ceef239b8b72141efcc58b1d7f1676a18c9': directory_hash,
+ '2652147529269778757d96e09aaf081695548218': directory_hash,
+ '4d640e8064fe69b4c851dfd43915c431e80c7497': directory_hash,
+ 'c313df50bfcaa773dcbe038d00f8bd770ba997f8': directory_hash,
+ '769db00b34b9e085dc699c8f1550c95793d0e904': directory_hash,
+ '2973e5dc9568ac491b198f6b7f10c44ddc04e0a3': directory_hash,
+ 'be34b8c7857a6c04e41cc06b26338d8e59cb2601': directory_hash,
+ '24f45e41637240b7f9e16d2791b5eacb4a406d0f': directory_hash,
+ '62ff4741eac1821190f6c2cdab7c8a9d7db64ad0': directory_hash,
+ 'c346f6ff7f42f2a8ff867f92ab83a6721057d86c': directory_hash,
+ 'f2afbb94b319ef5d60823859875284afb95dcc18': directory_hash,
+ '4e2dc6d6073f0b6d348f84ded52f9143b10344b9': directory_hash,
+ '31cd7c5f669868651c57e3a2ba25ac45f76fa5cf': directory_hash,
+ '25f5b27dfa5ed15d336188ef46bef743d88327d4': directory_hash,
+ '88b80615ed8561be74a700b92883ec0374ddacb0': directory_hash,
+ '5ee9ea92ed8cc1737b7670e39dab6081c64f2598': directory_hash,
+ 'dcddcc32740d2de0e1403e21a5c4ed837b352992': directory_hash,
+ '74335db9f45a5d1c8133ff7a7db5ed7a8d4a197b': directory_hash,
+ 'cb36b894129ca7910bb81c457c72d69d5ff111bc': directory_hash,
+ 'caef0cb155eb6c55215aa59aabe04a9c702bbe6a': directory_hash,
+ '5017ce0b285351da09a2029ea2cf544f79b593c7': directory_hash,
+ '17a62618eb6e91a1d5d8e1246ccedae020d3b222': directory_hash,
+ 'a1f000fb8216838aa2a120738cc6c7fef2d1b4d8': directory_hash,
+ '9f82d95bd3edfb7f18b1a21d6171170395ea44ce': directory_hash,
+ 'a701d39a17a9f48c61a06eee08bd9ac0b8e3838b': directory_hash,
+ '4ef794980f820d44be94b2f0d53eb34d4241638c': directory_hash,
+ 'ddecbc16f4c916c39eacfcb2302e15a9e70a231e': directory_hash,
+ '3565e7d385af0745ec208d719e469c2f58be8e94': directory_hash,
+ 'c875bad563a73a25c5f3379828b161b1441a7c5d': directory_hash,
+ '94be9abcf9558213ff301af0ecd8223451ce991d': directory_hash,
+ '1ee770fd10ea2d8c4f6e68a1dbe79378a86611e0': directory_hash,
+ '553b09724bd30d9691b290e157b27a73e2d3e537': directory_hash,
+ '9e912851eb64e3a1e08fbb587de7a4c897ce5a0a': directory_hash,
+ '9c9e0ff08f215a5a5845ce3dbfc5b48c8050bdaf': directory_hash,
+ 'db9e625ba90056304897a94c92e5d27bc60f112d': directory_hash,
+ '2d4a801c9a9645fcd3a9f4c06418d8393206b1f3': directory_hash,
+ 'e874cd5967efb1f45282e9f5ce87cc68a898a6d0': directory_hash,
+ 'e326a7bbb5bc00f1d8cacd6108869dedef15569c': directory_hash,
+ '3ed4b85d30401fe32ae3b1d650f215a588293a9e': directory_hash,
+ tip_revision_develop: directory_hash,
+ }
+
+ self.assertRevisionsOk(expected_revisions)
+ self.assertCountSnapshots(1)
+
+ expected_snapshot = {
+ 'id': '05cad59e8980069d9fe2324d406cf226c0021e1c',
+ 'branches': {
+ 'develop': {
+ 'target': tip_revision_develop,
+ 'target_type': 'revision'
+ },
+ 'default': {
+ 'target': tip_revision_default,
+ 'target_type': 'revision'
+ },
+ }
+ }
+
+ self.assertSnapshotOk(expected_snapshot)
+ self.assertEqual(self.loader.load_status(), {'status': 'eventful'})
+ self.assertEqual(self.loader.visit_status(), 'full')
+
+
+class LoaderITest2(BaseHgLoaderTest):
+ """Load a mercurial repository with release
+
+ """
+ def setUp(self):
+ super().setUp(archive_name='hello.tgz', filename='hello')
+ self.loader = HgLoaderNoStorage()
+
+ @istest
+ def load(self):
+ """Load a repository with tags results in 1 snapshot
+
+ """
+ # when
+ self.loader.load(
+ origin_url=self.repo_url,
+ visit_date='2016-05-03 15:16:32+00',
+ directory=self.destination_path)
+
+ # then
+ self.assertCountContents(3)
+ self.assertCountDirectories(3)
+ self.assertCountReleases(1)
+ self.assertCountRevisions(3)
+
+ tip_release = '515c4d72e089404356d0f4b39d60f948b8999140'
+ self.assertReleasesOk([tip_release])
+
+ tip_revision_default = 'c3dbe4fbeaaa98dd961834e4007edb3efb0e2a27'
+ # cf. test_loader.org for an explanation of where those hashes
+ # come from
+ expected_revisions = {
+ # revision hash | directory hash # noqa
+ '93b48d515580522a05f389bec93227fc8e43d940': '43d727f2f3f2f7cb3b098ddad1d7038464a4cee2', # noqa
+ '8dd3db5d5519e4947f035d141581d304565372d2': 'b3f85f210ff86d334575f64cb01c5bf49895b63e', # noqa
+ tip_revision_default: '8f2be433c945384c85920a8e60f2a68d2c0f20fb',
+ }
+
+ self.assertRevisionsOk(expected_revisions)
+ self.assertCountSnapshots(1)
+
+ expected_snapshot = {
+ 'id': 'fa537f8e0cbdb8a54e29533302ed6fcbee28cb7b',
+ 'branches': {
+ 'default': {
+ 'target': tip_revision_default,
+ 'target_type': 'revision'
+ },
+ '0.1': {
+ 'target': tip_release,
+ 'target_type': 'release'
+ }
+ }
+ }
+
+ self.assertSnapshotOk(expected_snapshot)
+ self.assertEqual(self.loader.load_status(), {'status': 'eventful'})
+ self.assertEqual(self.loader.visit_status(), 'full')
diff --git a/version.txt b/version.txt
index 2184018..3d755c9 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-v0.0.12-0-gdb28032
\ No newline at end of file
+v0.0.13-0-g7e8386d
\ No newline at end of file