diff --git a/requirements-swh.txt b/requirements-swh.txt
index 8a770f6..a602fab 100644
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -1,4 +1,4 @@
swh.model >= 0.0.27
swh.storage >= 0.0.114
swh.scheduler >= 0.0.39
-swh.loader.core >= 0.0.37
+swh.loader.core >= 0.0.43
diff --git a/swh/loader/mercurial/loader.py b/swh/loader/mercurial/loader.py
index 835c3ef..9522b71 100644
--- a/swh/loader/mercurial/loader.py
+++ b/swh/loader/mercurial/loader.py
@@ -1,535 +1,535 @@
# Copyright (C) 2017-2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
"""This document contains a SWH loader for ingesting repository data
from Mercurial version 2 bundle files.
"""
# NOTE: The code here does expensive work twice in places because of the
# intermediate need to check for what is missing before sending to the database
# and the desire to not juggle very large amounts of data.
# TODO: Decide whether to also serialize to disk and read back more quickly
# from there. Maybe only for very large repos and fast drives.
# - Avi
import datetime
import hglib
import os
import random
import re
from dateutil import parser
from shutil import rmtree
from tempfile import mkdtemp
from swh.model import identifiers
from swh.model.hashutil import (
MultiHash, hash_to_hex, hash_to_bytes,
DEFAULT_ALGORITHMS
)
from swh.loader.core.loader import UnbufferedLoader
from swh.loader.core.converters import content_for_storage
from swh.loader.core.utils import clean_dangling_folders
from . import converters
from .archive_extract import tmp_extract
from .bundle20_reader import Bundle20Reader
from .converters import PRIMARY_ALGO as ALGO
from .objects import SelectiveCache, SimpleTree
TAG_PATTERN = re.compile('[0-9A-Fa-f]{40}')
TEMPORARY_DIR_PREFIX_PATTERN = 'swh.loader.mercurial.'
HEAD_POINTER_NAME = b'tip'
class HgBundle20Loader(UnbufferedLoader):
"""Mercurial loader able to deal with remote or local repository.
"""
CONFIG_BASE_FILENAME = 'loader/mercurial'
ADDITIONAL_CONFIG = {
'bundle_filename': ('str', 'HG20_none_bundle'),
'reduce_effort': ('bool', False),
'temp_directory': ('str', '/tmp'),
'cache1_size': ('int', 800*1024*1024),
'cache2_size': ('int', 800*1024*1024),
}
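# Illustrative shape of the tunable settings above (values here are
# hypothetical; where and how they are loaded is handled by the loader's
# config machinery, not shown here):
#   bundle_filename: HG20_none_bundle
#   reduce_effort: false
#   temp_directory: /srv/tmp        # hypothetical path
#   cache1_size: 838860800          # 800 MiB
#   cache2_size: 838860800          # 800 MiB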
def __init__(self, logging_class='swh.loader.mercurial.Bundle20Loader'):
super().__init__(logging_class=logging_class)
self.content_max_size_limit = self.config['content_size_limit']
self.bundle_filename = self.config['bundle_filename']
self.reduce_effort_flag = self.config['reduce_effort']
self.empty_repository = None
self.temp_directory = self.config['temp_directory']
self.cache1_size = self.config['cache1_size']
self.cache2_size = self.config['cache2_size']
self.working_directory = None
self.bundle_path = None
def pre_cleanup(self):
"""Cleanup potential dangling files from prior runs (e.g. OOM killed
tasks)
"""
clean_dangling_folders(self.temp_directory,
pattern_check=TEMPORARY_DIR_PREFIX_PATTERN,
log=self.log)
def cleanup(self):
"""Clean temporary working directory
"""
if self.bundle_path and os.path.exists(self.bundle_path):
self.log.debug('Cleaning up working bundle %s' % self.bundle_path)
os.unlink(self.bundle_path)
if self.working_directory and os.path.exists(self.working_directory):
self.log.debug('Cleaning up working directory %s' % (
self.working_directory, ))
rmtree(self.working_directory)
def get_heads(self, repo):
"""Read the closed branches heads (branch, bookmarks) and returns a
dict with key the branch_name (bytes) and values the tuple
(pointer nature (bytes), mercurial's node id
(bytes)). Those needs conversion to swh-ids. This is taken
care of in get_revisions.
"""
b = {}
for _, node_hash_id, pointer_nature, branch_name, *_ in repo.heads():
b[branch_name] = (
pointer_nature, hash_to_bytes(node_hash_id.decode()))
bookmarks = repo.bookmarks()
if bookmarks and bookmarks[0]:
for bookmark_name, _, target_short in bookmarks[0]:
target = repo[target_short].node()
b[bookmark_name] = (None, hash_to_bytes(target.decode()))
return b
def prepare_origin_visit(self, *, origin_url, visit_date, **kwargs):
self.origin_url = origin_url
self.origin = {'url': self.origin_url, 'type': 'hg'}
if isinstance(visit_date, str): # visit_date can be string or datetime
visit_date = parser.parse(visit_date)
self.visit_date = visit_date
def prepare(self, *, origin_url, visit_date, directory=None):
"""Prepare the necessary steps to load an actual remote or local
repository.
To load a local repository, pass the optional directory
parameter as filled with a path to a real local folder.
To load a remote repository, pass the optional directory
parameter as None.
Args:
origin_url (str): Origin url to load
visit_date (str/datetime): Date of the visit
directory (str/None): The local directory to load
"""
self.branches = {}
self.tags = []
self.releases = {}
self.node_2_rev = {}
if not directory: # remote repository
self.working_directory = mkdtemp(
prefix=TEMPORARY_DIR_PREFIX_PATTERN,
suffix='-%s' % os.getpid(),
dir=self.temp_directory)
os.makedirs(self.working_directory, exist_ok=True)
self.hgdir = self.working_directory
self.log.debug('Cloning %s to %s' % (
self.origin['url'], self.hgdir))
hglib.clone(source=self.origin['url'], dest=self.hgdir)
else: # local repository
self.working_directory = None
self.hgdir = directory
self.bundle_path = os.path.join(self.hgdir, self.bundle_filename)
self.log.debug('Bundling at %s' % self.bundle_path)
with hglib.open(self.hgdir) as repo:
self.heads = self.get_heads(repo)
repo.bundle(bytes(self.bundle_path, 'utf-8'),
all=True,
type=b'none-v2')
self.cache_filename1 = os.path.join(
self.hgdir, 'swh-cache-1-%s' % (
hex(random.randint(0, 0xffffff))[2:], ))
self.cache_filename2 = os.path.join(
self.hgdir, 'swh-cache-2-%s' % (
hex(random.randint(0, 0xffffff))[2:], ))
try:
self.br = Bundle20Reader(bundlefile=self.bundle_path,
cache_filename=self.cache_filename1,
cache_size=self.cache1_size)
except FileNotFoundError:
# Empty repository! Still a successful visit targeting an
# empty snapshot
self.log.warn('%s is an empty repository!' % self.hgdir)
self.empty_repository = True
else:
self.reduce_effort = set()
if self.reduce_effort_flag:
now = datetime.datetime.now(tz=datetime.timezone.utc)
if (now - self.visit_date).days > 1:
# Assuming that self.visit_date would be today for
# a new visit, treat older visit dates as
# indication of wanting to skip some processing
# effort.
for header, commit in self.br.yield_all_changesets():
ts = commit['time'].timestamp()
if ts < self.visit_date.timestamp():
self.reduce_effort.add(header['node'])
def has_contents(self):
return not self.empty_repository
def has_directories(self):
return not self.empty_repository
def has_revisions(self):
return not self.empty_repository
def has_releases(self):
return not self.empty_repository
def fetch_data(self):
"""Fetch the data from the data source."""
pass
def get_contents(self):
"""Get the contents that need to be loaded."""
# NOTE: This method generates blobs twice to reduce memory usage
# without generating disk writes.
self.file_node_to_hash = {}
hash_to_info = {}
self.num_contents = 0
contents = {}
missing_contents = set()
for blob, node_info in self.br.yield_all_blobs():
self.num_contents += 1
file_name = node_info[0]
header = node_info[2]
length = len(blob)
if header['linknode'] in self.reduce_effort:
algorithms = [ALGO]
else:
algorithms = DEFAULT_ALGORITHMS
h = MultiHash.from_data(blob, hash_names=algorithms)
content = h.digest()
content['length'] = length
blob_hash = content[ALGO]
self.file_node_to_hash[header['node']] = blob_hash
if header['linknode'] in self.reduce_effort:
continue
hash_to_info[blob_hash] = node_info
contents[blob_hash] = content
missing_contents.add(blob_hash)
if file_name == b'.hgtags':
# https://www.mercurial-scm.org/wiki/GitConcepts#Tag_model
# overwrite until the last one
self.tags = (t for t in blob.split(b'\n') if t != b'')
if contents:
missing_contents = set(
self.storage.content_missing(
list(contents.values()),
key_hash=ALGO
)
)
# Cluster the needed blobs by file offset, then fetch only the
# groups at the needed offsets.
focs = {} # "file/offset/contents"
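# Illustrative focs layout once filled in (offsets and ids made up):
#   {4096:  {b'<file node id>': b'<ALGO content hash>'},
#    16384: {b'<file node id>': b'<ALGO content hash>', ...}}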
for blob_hash in missing_contents:
_, file_offset, header = hash_to_info[blob_hash]
focs.setdefault(file_offset, {})
focs[file_offset][header['node']] = blob_hash
hash_to_info = None
for offset, node_hashes in sorted(focs.items()):
for header, data, *_ in self.br.yield_group_objects(
group_offset=offset
):
node = header['node']
if node in node_hashes:
blob, meta = self.br.extract_meta_from_blob(data)
content = contents.pop(node_hashes[node], None)
if content:
content['data'] = blob
yield content_for_storage(
content,
log=self.log,
max_content_size=self.content_max_size_limit,
- origin_id=self.origin_id
+ origin_url=self.origin['url']
)
def load_directories(self):
"""This is where the work is done to convert manifest deltas from the
repository bundle into SWH directories.
"""
self.mnode_to_tree_id = {}
cache_hints = self.br.build_manifest_hints()
def tree_size(t):
return t.size()
self.trees = SelectiveCache(cache_hints=cache_hints,
size_function=tree_size,
filename=self.cache_filename2,
max_size=self.cache2_size)
tree = SimpleTree()
for header, added, removed in self.br.yield_all_manifest_deltas(
cache_hints
):
node = header['node']
basenode = header['basenode']
tree = self.trees.fetch(basenode) or tree # working tree
for path in removed.keys():
tree = tree.remove_tree_node_for_path(path)
for path, info in added.items():
file_node, is_symlink, perms_code = info
tree = tree.add_blob(
path,
self.file_node_to_hash[file_node],
is_symlink,
perms_code
)
if header['linknode'] in self.reduce_effort:
self.trees.store(node, tree)
else:
new_dirs = []
self.mnode_to_tree_id[node] = tree.hash_changed(new_dirs)
self.trees.store(node, tree)
yield header, tree, new_dirs
def get_directories(self):
"""Compute directories to load
"""
dirs = {}
self.num_directories = 0
for _, _, new_dirs in self.load_directories():
for d in new_dirs:
self.num_directories += 1
dirs[d['id']] = d
missing_dirs = list(dirs.keys())
if missing_dirs:
missing_dirs = self.storage.directory_missing(missing_dirs)
for _id in missing_dirs:
yield dirs[_id]
dirs = {}
def get_revisions(self):
"""Compute revisions to load
"""
revisions = {}
self.num_revisions = 0
for header, commit in self.br.yield_all_changesets():
if header['node'] in self.reduce_effort:
continue
self.num_revisions += 1
date_dict = identifiers.normalize_timestamp(
int(commit['time'].timestamp())
)
author_dict = converters.parse_author(commit['user'])
if commit['manifest'] == Bundle20Reader.NAUGHT_NODE:
directory_id = SimpleTree().hash_changed()
else:
directory_id = self.mnode_to_tree_id[commit['manifest']]
extra_meta = []
extra = commit.get('extra')
if extra:
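# The 'extra' field is a NUL-separated list of key:value pairs,
# e.g. (illustrative) b'branch:default\x00transplant_source:<raw bytes>'.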
for e in extra.split(b'\x00'):
k, v = e.split(b':', 1)
k = k.decode('utf-8')
# transplant_source stores a binary reference to a changeset;
# prefer to store its hexadecimal form in the revision metadata
if k == 'transplant_source':
v = hash_to_hex(v)
extra_meta.append([k, v])
revision = {
'author': author_dict,
'date': date_dict,
'committer': author_dict,
'committer_date': date_dict,
'type': 'hg',
'directory': directory_id,
'message': commit['message'],
'metadata': {
'node': hash_to_hex(header['node']),
'extra_headers': [
['time_offset_seconds',
str(commit['time_offset_seconds']).encode('utf-8')],
] + extra_meta
},
'synthetic': False,
'parents': []
}
p1 = self.node_2_rev.get(header['p1'])
p2 = self.node_2_rev.get(header['p2'])
if p1:
revision['parents'].append(p1)
if p2:
revision['parents'].append(p2)
revision['id'] = hash_to_bytes(
identifiers.revision_identifier(revision)
)
self.node_2_rev[header['node']] = revision['id']
revisions[revision['id']] = revision
# Converts heads to use swh ids
self.heads = {
branch_name: (pointer_nature, self.node_2_rev[node_id])
for branch_name, (pointer_nature, node_id) in self.heads.items()
}
missing_revs = revisions.keys()
if missing_revs:
missing_revs = set(
self.storage.revision_missing(list(missing_revs))
)
for r in missing_revs:
yield revisions[r]
self.mnode_to_tree_id = None
def _read_tag(self, tag, split_byte=b' '):
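# Each .hgtags line is expected to look like (illustrative value):
#   b'0a04b987be5ae354b710cefeba0e2d9de7ad41a9 v1.0'
# i.e. a 40-character hex changeset id, a separator, then the tag name.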
node, *name = tag.split(split_byte)
name = split_byte.join(name)
return node, name
def get_releases(self):
"""Get the releases that need to be loaded."""
self.num_releases = 0
releases = {}
missing_releases = []
for t in self.tags:
self.num_releases += 1
node, name = self._read_tag(t)
node = node.decode()
node_bytes = hash_to_bytes(node)
if not TAG_PATTERN.match(node):
self.log.warn('Wrong pattern (%s) found in tags. Skipping' % (
node, ))
continue
if node_bytes not in self.node_2_rev:
self.log.warn('No matching revision for tag %s '
'(hg changeset: %s). Skipping' %
(name.decode(), node))
continue
tgt_rev = self.node_2_rev[node_bytes]
release = {
'name': name,
'target': tgt_rev,
'target_type': 'revision',
'message': None,
'metadata': None,
'synthetic': False,
'author': {'name': None, 'email': None, 'fullname': b''},
'date': None
}
id_hash = hash_to_bytes(
identifiers.release_identifier(release))
release['id'] = id_hash
missing_releases.append(id_hash)
releases[id_hash] = release
self.releases[name] = id_hash
if missing_releases:
missing_releases = set(
self.storage.release_missing(missing_releases))
for _id in missing_releases:
yield releases[_id]
def get_snapshot(self):
"""Get the snapshot that need to be loaded."""
branches = {}
for name, (pointer_nature, target) in self.heads.items():
branches[name] = {'target': target, 'target_type': 'revision'}
if pointer_nature == HEAD_POINTER_NAME:
branches[b'HEAD'] = {'target': name, 'target_type': 'alias'}
for name, target in self.releases.items():
branches[name] = {'target': target, 'target_type': 'release'}
snap = {
'id': None,
'branches': branches,
}
snap['id'] = identifiers.identifier_to_bytes(
identifiers.snapshot_identifier(snap))
return snap
def get_fetch_history_result(self):
"""Return the data to store in fetch_history."""
return {
'contents': self.num_contents,
'directories': self.num_directories,
'revisions': self.num_revisions,
'releases': self.num_releases,
}
class HgArchiveBundle20Loader(HgBundle20Loader):
"""Mercurial loader for repository wrapped within archives.
"""
def __init__(self):
super().__init__(
logging_class='swh.loader.mercurial.HgArchiveBundle20Loader')
self.temp_dir = None
def prepare(self, *, origin_url, archive_path, visit_date):
self.temp_dir = tmp_extract(archive=archive_path,
dir=self.temp_directory,
prefix=TEMPORARY_DIR_PREFIX_PATTERN,
suffix='.dump-%s' % os.getpid(),
log=self.log,
source=origin_url)
repo_name = os.listdir(self.temp_dir)[0]
directory = os.path.join(self.temp_dir, repo_name)
super().prepare(origin_url=origin_url,
visit_date=visit_date, directory=directory)
def cleanup(self):
if self.temp_dir and os.path.exists(self.temp_dir):
rmtree(self.temp_dir)
super().cleanup()
diff --git a/swh/loader/mercurial/tests/test_loader.py b/swh/loader/mercurial/tests/test_loader.py
index c153fc8..b0952be 100644
--- a/swh/loader/mercurial/tests/test_loader.py
+++ b/swh/loader/mercurial/tests/test_loader.py
@@ -1,289 +1,293 @@
# Copyright (C) 2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import os
from unittest.mock import patch
from swh.loader.core.tests import BaseLoaderTest
+from swh.storage.algos.snapshot import snapshot_get_all_branches
+
from .common import HgLoaderMemoryStorage, HgArchiveLoaderMemoryStorage
class BaseHgLoaderTest(BaseLoaderTest):
"""Mixin base loader test to prepare the mercurial
repository to uncompress, load and test the results.
This sets up
"""
def setUp(self, loader=HgLoaderMemoryStorage,
archive_name='the-sandbox.tgz', filename='the-sandbox',
uncompress_archive=True):
super().setUp(archive_name=archive_name, filename=filename,
prefix_tmp_folder_name='swh.loader.mercurial.',
start_path=os.path.dirname(__file__),
uncompress_archive=uncompress_archive)
self.loader = loader()
self.storage = self.loader.storage
class WithoutReleaseLoaderTest(BaseHgLoaderTest):
"""Load a mercurial repository without release
"""
def test_load(self):
"""Load a repository with multiple branches results in 1 snapshot
"""
# when
self.loader.load(
origin_url=self.repo_url,
visit_date='2016-05-03 15:16:32+00',
directory=self.destination_path)
# then
self.assertCountContents(2)
self.assertCountDirectories(3)
self.assertCountReleases(0)
self.assertCountRevisions(58)
tip_revision_develop = 'a9c4534552df370f43f0ef97146f393ef2f2a08c'
tip_revision_default = '70e750bb046101fdced06f428e73fee471509c56'
# same from rev 3 onward
directory_hash = '180bd57623a7c2c47a8c43514a5f4d903503d0aa'
# cf. test_loader.org for an explanation of where those hashes
# come from
expected_revisions = {
# revision hash | directory hash # noqa
'aafb69fd7496ca617f741d38c40808ff2382aabe': 'e2e117569b086ceabeeedee4acd95f35298d4553', # noqa
'b6932cb7f59e746899e4804f3d496126d1343615': '9cd8160c67ac4b0bc97e2e2cd918a580425167d3', # noqa
tip_revision_default: directory_hash,
'18012a93d5aadc331c468dac84b524430f4abc19': directory_hash,
'bec4c0a31b0b2502f44f34aeb9827cd090cca621': directory_hash,
'5f4eba626c3f826820c4475d2d81410759ec911b': directory_hash,
'dcba06661c607fe55ec67b1712d153b69f65e38c': directory_hash,
'c77e776d22548d47a8d96463a3556172776cd59b': directory_hash,
'61d762d65afb3150e2653d6735068241779c1fcf': directory_hash,
'40def747398c76ceec1bd248e3a6cb2a52e22dc5': directory_hash,
'6910964416438ca8d1698f6295871d727c4d4851': directory_hash,
'be44d5e6cc66580f59c108f8bff5911ee91a22e4': directory_hash,
'c4a95d5097519dedac437fddf0ef775136081241': directory_hash,
'32eb0354a660128e205bf7c3a84b46040ef70d92': directory_hash,
'dafa445964230e808148db043c126063ea1dc9b6': directory_hash,
'a41e2a548ba51ee47f22baad8e88994853d3e2f5': directory_hash,
'dc3e3ab7fe257d04769528e5e17ad9f1acb44659': directory_hash,
'd2164061453ecb03d4347a05a77db83f706b8e15': directory_hash,
'34192ceef239b8b72141efcc58b1d7f1676a18c9': directory_hash,
'2652147529269778757d96e09aaf081695548218': directory_hash,
'4d640e8064fe69b4c851dfd43915c431e80c7497': directory_hash,
'c313df50bfcaa773dcbe038d00f8bd770ba997f8': directory_hash,
'769db00b34b9e085dc699c8f1550c95793d0e904': directory_hash,
'2973e5dc9568ac491b198f6b7f10c44ddc04e0a3': directory_hash,
'be34b8c7857a6c04e41cc06b26338d8e59cb2601': directory_hash,
'24f45e41637240b7f9e16d2791b5eacb4a406d0f': directory_hash,
'62ff4741eac1821190f6c2cdab7c8a9d7db64ad0': directory_hash,
'c346f6ff7f42f2a8ff867f92ab83a6721057d86c': directory_hash,
'f2afbb94b319ef5d60823859875284afb95dcc18': directory_hash,
'4e2dc6d6073f0b6d348f84ded52f9143b10344b9': directory_hash,
'31cd7c5f669868651c57e3a2ba25ac45f76fa5cf': directory_hash,
'25f5b27dfa5ed15d336188ef46bef743d88327d4': directory_hash,
'88b80615ed8561be74a700b92883ec0374ddacb0': directory_hash,
'5ee9ea92ed8cc1737b7670e39dab6081c64f2598': directory_hash,
'dcddcc32740d2de0e1403e21a5c4ed837b352992': directory_hash,
'74335db9f45a5d1c8133ff7a7db5ed7a8d4a197b': directory_hash,
'cb36b894129ca7910bb81c457c72d69d5ff111bc': directory_hash,
'caef0cb155eb6c55215aa59aabe04a9c702bbe6a': directory_hash,
'5017ce0b285351da09a2029ea2cf544f79b593c7': directory_hash,
'17a62618eb6e91a1d5d8e1246ccedae020d3b222': directory_hash,
'a1f000fb8216838aa2a120738cc6c7fef2d1b4d8': directory_hash,
'9f82d95bd3edfb7f18b1a21d6171170395ea44ce': directory_hash,
'a701d39a17a9f48c61a06eee08bd9ac0b8e3838b': directory_hash,
'4ef794980f820d44be94b2f0d53eb34d4241638c': directory_hash,
'ddecbc16f4c916c39eacfcb2302e15a9e70a231e': directory_hash,
'3565e7d385af0745ec208d719e469c2f58be8e94': directory_hash,
'c875bad563a73a25c5f3379828b161b1441a7c5d': directory_hash,
'94be9abcf9558213ff301af0ecd8223451ce991d': directory_hash,
'1ee770fd10ea2d8c4f6e68a1dbe79378a86611e0': directory_hash,
'553b09724bd30d9691b290e157b27a73e2d3e537': directory_hash,
'9e912851eb64e3a1e08fbb587de7a4c897ce5a0a': directory_hash,
'9c9e0ff08f215a5a5845ce3dbfc5b48c8050bdaf': directory_hash,
'db9e625ba90056304897a94c92e5d27bc60f112d': directory_hash,
'2d4a801c9a9645fcd3a9f4c06418d8393206b1f3': directory_hash,
'e874cd5967efb1f45282e9f5ce87cc68a898a6d0': directory_hash,
'e326a7bbb5bc00f1d8cacd6108869dedef15569c': directory_hash,
'3ed4b85d30401fe32ae3b1d650f215a588293a9e': directory_hash,
tip_revision_develop: directory_hash,
}
self.assertRevisionsContain(expected_revisions)
self.assertCountSnapshots(1)
expected_snapshot = {
'id': '3b8fe58e467deb7597b12a5fd3b2c096b8c02028',
'branches': {
'develop': {
'target': tip_revision_develop,
'target_type': 'revision'
},
'default': {
'target': tip_revision_default,
'target_type': 'revision'
},
'HEAD': {
'target': 'develop',
'target_type': 'alias',
}
}
}
self.assertSnapshotEqual(expected_snapshot)
self.assertEqual(self.loader.load_status(), {'status': 'eventful'})
self.assertEqual(self.loader.visit_status(), 'full')
class CommonHgLoaderData:
def assert_data_ok(self):
# then
self.assertCountContents(3)
self.assertCountDirectories(3)
self.assertCountReleases(1)
self.assertCountRevisions(3)
tip_release = '515c4d72e089404356d0f4b39d60f948b8999140'
self.assertReleasesContain([tip_release])
tip_revision_default = 'c3dbe4fbeaaa98dd961834e4007edb3efb0e2a27'
# cf. test_loader.org for an explanation of where those hashes
# come from
expected_revisions = {
# revision hash | directory hash # noqa
'93b48d515580522a05f389bec93227fc8e43d940': '43d727f2f3f2f7cb3b098ddad1d7038464a4cee2', # noqa
'8dd3db5d5519e4947f035d141581d304565372d2': 'b3f85f210ff86d334575f64cb01c5bf49895b63e', # noqa
tip_revision_default: '8f2be433c945384c85920a8e60f2a68d2c0f20fb',
}
self.assertRevisionsContain(expected_revisions)
self.assertCountSnapshots(1)
expected_snapshot = {
'id': 'd35668e02e2ba4321dc951cd308cf883786f918a',
'branches': {
'default': {
'target': tip_revision_default,
'target_type': 'revision'
},
'0.1': {
'target': tip_release,
'target_type': 'release'
},
'HEAD': {
'target': 'default',
'target_type': 'alias',
}
}
}
self.assertSnapshotEqual(expected_snapshot)
self.assertEqual(self.loader.load_status(), {'status': 'eventful'})
self.assertEqual(self.loader.visit_status(), 'full')
class WithReleaseLoaderTest(BaseHgLoaderTest, CommonHgLoaderData):
"""Load a mercurial repository with release
"""
def setUp(self):
super().setUp(archive_name='hello.tgz', filename='hello')
def test_load(self):
"""Load a repository with tags results in 1 snapshot
"""
# when
self.loader.load(
origin_url=self.repo_url,
visit_date='2016-05-03 15:16:32+00',
directory=self.destination_path)
self.assert_data_ok()
class ArchiveLoaderTest(BaseHgLoaderTest, CommonHgLoaderData):
"""Load a mercurial repository archive with release
"""
def setUp(self):
super().setUp(loader=HgArchiveLoaderMemoryStorage,
archive_name='hello.tgz', filename='hello',
uncompress_archive=False)
def test_load(self):
"""Load a mercurial repository archive with tags results in 1 snapshot
"""
# when
self.loader.load(
origin_url=self.repo_url,
visit_date='2016-05-03 15:16:32+00',
archive_path=self.destination_path)
self.assert_data_ok()
@patch('swh.loader.mercurial.archive_extract.patoolib')
def test_load_with_failure(self, mock_patoo):
mock_patoo.side_effect = ValueError
# when
r = self.loader.load(
origin_url=self.repo_url,
visit_date='2016-05-03 15:16:32+00',
archive_path=self.destination_path)
self.assertEqual(r, {'status': 'failed'})
self.assertCountContents(0)
self.assertCountDirectories(0)
self.assertCountRevisions(0)
self.assertCountReleases(0)
self.assertCountSnapshots(0)
class WithTransplantLoaderTest(BaseHgLoaderTest):
"""Load a mercurial repository where transplant operations
have been used.
"""
def setUp(self):
super().setUp(archive_name='transplant.tgz', filename='transplant')
def test_load(self):
# load hg repository
self.loader.load(
origin_url=self.repo_url,
visit_date='2019-05-23 12:06:00+00',
directory=self.destination_path)
# collect swh revisions
- origin_id = self.storage.origin_get([
- {'type': 'hg', 'url': self.repo_url}])[0]['id']
- snapshot = self.storage.snapshot_get_latest(origin_id)
+ origin_url = self.storage.origin_get([
+ {'type': 'hg', 'url': self.repo_url}])[0]['url']
+ visit = self.storage.origin_visit_get_latest(
+ origin_url, require_snapshot=True)
revisions = []
+ snapshot = snapshot_get_all_branches(self.storage, visit['snapshot'])
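+ # snapshot_get_all_branches is expected to turn the snapshot id from the
+ # latest visit into a full snapshot dict whose 'branches' mapping is
+ # iterated below.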
for branch in snapshot['branches'].values():
if branch['target_type'] != 'revision':
continue
revisions.append(branch['target'])
# extract original changesets info and the transplant sources
hg_changesets = set()
transplant_sources = set()
for rev in self.storage.revision_log(revisions):
hg_changesets.add(rev['metadata']['node'])
for k, v in rev['metadata']['extra_headers']:
if k == 'transplant_source':
transplant_sources.add(v)
# check extracted data are valid
self.assertTrue(len(hg_changesets) > 0)
self.assertTrue(len(transplant_sources) > 0)
self.assertTrue(transplant_sources.issubset(hg_changesets))
