Changeset View
Changeset View
Standalone View
Standalone View
swh/loader/mercurial/loader.py
# Copyright (C) 2017-2020 The Software Heritage developers | # Copyright (C) 2017-2021 The Software Heritage developers | ||||||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||||||
"""This document contains a SWH loader for ingesting repository data | """This document contains a SWH loader for ingesting repository data | ||||||||
from Mercurial version 2 bundle files. | from Mercurial version 2 bundle files. | ||||||||
""" | """ | ||||||||
Show All 15 Lines | |||||||||
from shutil import rmtree | from shutil import rmtree | ||||||||
from tempfile import mkdtemp | from tempfile import mkdtemp | ||||||||
import time | import time | ||||||||
from typing import Any, Dict, Iterable, List, Optional | from typing import Any, Dict, Iterable, List, Optional | ||||||||
import billiard | import billiard | ||||||||
from dateutil import parser | from dateutil import parser | ||||||||
import hglib | import hglib | ||||||||
from hglib.error import CommandError | |||||||||
from swh.core.config import merge_configs | from swh.core.config import merge_configs | ||||||||
from swh.loader.core.loader import DVCSLoader | from swh.loader.core.loader import DVCSLoader | ||||||||
from swh.loader.core.utils import clean_dangling_folders | from swh.loader.core.utils import clean_dangling_folders | ||||||||
from swh.loader.exception import NotFound | |||||||||
from swh.model import identifiers | from swh.model import identifiers | ||||||||
from swh.model.hashutil import ( | from swh.model.hashutil import ( | ||||||||
DEFAULT_ALGORITHMS, | DEFAULT_ALGORITHMS, | ||||||||
MultiHash, | MultiHash, | ||||||||
hash_to_bytehex, | hash_to_bytehex, | ||||||||
hash_to_bytes, | hash_to_bytes, | ||||||||
hash_to_hex, | hash_to_hex, | ||||||||
) | ) | ||||||||
Show All 24 Lines | |||||||||
TAG_PATTERN = re.compile("[0-9A-Fa-f]{40}") | TAG_PATTERN = re.compile("[0-9A-Fa-f]{40}") | ||||||||
TEMPORARY_DIR_PREFIX_PATTERN = "swh.loader.mercurial." | TEMPORARY_DIR_PREFIX_PATTERN = "swh.loader.mercurial." | ||||||||
HEAD_POINTER_NAME = b"tip" | HEAD_POINTER_NAME = b"tip" | ||||||||
class CommandErrorWrapper(Exception): | |||||||||
"""This exception is raised in place of a 'CommandError' | |||||||||
exception (raised by the underlying hglib library) | |||||||||
vlorentz: Could you briefly explain what this exception is for? (eg. `error raised when calling `hg`… | |||||||||
This is needed because billiard.Queue serializes the | |||||||||
queued object and, since CommandError doesn't have a constructor without | |||||||||
Not Done Inline Actions: Could you make it `bytes`? We already use `bytes` everywhere and the only difference afaict is mutability. vlorentz: Could you make it `bytes`? We already use `bytes` everywhere and the only difference afaict is… | |||||||||
parameters, the deserialization is failing | |||||||||
""" | |||||||||
def __init__(self, err: Optional[bytes]): | |||||||||
self.err = err | |||||||||
class CloneTimeoutError(Exception): | class CloneTimeoutError(Exception): | ||||||||
pass | pass | ||||||||
DEFAULT_CONFIG: Dict[str, Any] = { | DEFAULT_CONFIG: Dict[str, Any] = { | ||||||||
"bundle_filename": "HG20_none_bundle", | "bundle_filename": "HG20_none_bundle", | ||||||||
"reduce_effort": False, | "reduce_effort": False, | ||||||||
"temp_directory": "/tmp", | "temp_directory": "/tmp", | ||||||||
▲ Show 20 Lines • Show All 93 Lines • ▼ Show 20 Lines | class HgBundle20Loader(DVCSLoader): | ||||||||
@staticmethod | @staticmethod | ||||||||
def clone_with_timeout(log, origin, destination, timeout): | def clone_with_timeout(log, origin, destination, timeout): | ||||||||
queue = billiard.Queue() | queue = billiard.Queue() | ||||||||
start = time.monotonic() | start = time.monotonic() | ||||||||
def do_clone(queue, origin, destination): | def do_clone(queue, origin, destination): | ||||||||
try: | try: | ||||||||
result = hglib.clone(source=origin, dest=destination) | result = hglib.clone(source=origin, dest=destination) | ||||||||
except CommandError as e: | |||||||||
# the queued object needs an empty constructor to be deserialized later | |||||||||
queue.put(CommandErrorWrapper(e.err)) | |||||||||
except BaseException as e: | except BaseException as e: | ||||||||
queue.put(e) | queue.put(e) | ||||||||
else: | else: | ||||||||
queue.put(result) | queue.put(result) | ||||||||
process = billiard.Process(target=do_clone, args=(queue, origin, destination)) | process = billiard.Process(target=do_clone, args=(queue, origin, destination)) | ||||||||
process.start() | process.start() | ||||||||
while True: | while True: | ||||||||
try: | try: | ||||||||
result = queue.get(timeout=0.1) | result = queue.get(timeout=0.1) | ||||||||
break | break | ||||||||
except Empty: | except Empty: | ||||||||
duration = time.monotonic() - start | duration = time.monotonic() - start | ||||||||
if timeout and duration > timeout: | if timeout and duration > timeout: | ||||||||
log.warning( | log.warning( | ||||||||
"Timeout cloning `%s` within %s seconds", origin, timeout | "Timeout cloning `%s` within %s seconds", origin, timeout | ||||||||
) | ) | ||||||||
process.terminate() | process.terminate() | ||||||||
process.join() | process.join() | ||||||||
raise CloneTimeoutError(origin, timeout) | raise CloneTimeoutError(origin, timeout) | ||||||||
continue | continue | ||||||||
process.join() | process.join() | ||||||||
if isinstance(result, Exception): | if isinstance(result, Exception): | ||||||||
raise result from None | raise result from None | ||||||||
return result | return result | ||||||||
def prepare(self, *args, **kwargs): | def prepare(self, *args, **kwargs): | ||||||||
"""Prepare the necessary steps to load an actual remote or local | """Prepare the necessary steps to load an actual remote or local | ||||||||
repository. | repository. | ||||||||
Show All 29 Lines | def prepare(self, *args, **kwargs): | ||||||||
self.log.debug( | self.log.debug( | ||||||||
"Cloning %s to %s with timeout %s seconds", | "Cloning %s to %s with timeout %s seconds", | ||||||||
self.origin_url, | self.origin_url, | ||||||||
self.hgdir, | self.hgdir, | ||||||||
self.clone_timeout, | self.clone_timeout, | ||||||||
) | ) | ||||||||
try: | |||||||||
self.clone_with_timeout( | self.clone_with_timeout( | ||||||||
self.log, self.origin_url, self.hgdir, self.clone_timeout | self.log, self.origin_url, self.hgdir, self.clone_timeout | ||||||||
) | ) | ||||||||
except CommandErrorWrapper as e: | |||||||||
for msg in [ | |||||||||
b"does not appear to be an hg repository", | |||||||||
b"404: Not Found", | |||||||||
b"Name or service not known", | |||||||||
Not Done Inline ActionsDid you check these messages don't change depending on $LANG? If yes, we may need to change how we call hglib vlorentz: Did you check these messages don't change depending on `$LANG`? If yes, we may need to change… | |||||||||
Done Inline ActionsIt seems it's not locale sensible : In [7]: import locale In [8]: locale.setlocale(locale.LC_ALL, 'fr_FR') Out[8]: 'fr_FR' In [9]: import hglib In [10]: hglib.clone("/fake") --------------------------------------------------------------------------- CommandError Traceback (most recent call last) <ipython-input-10-97847294dd82> in <module> ----> 1 hglib.clone("/fake") ~/src/swh/swh-environment/.venv/lib/python3.7/site-packages/hglib/__init__.py in clone(source, dest, noupdate, updaterev, rev, branch, pull, uncompressed, ssh, remotecmd, insecure, encoding, configs) 36 out, err = proc.communicate() 37 if proc.returncode: ---> 38 raise error.CommandError(args, proc.returncode, out, err) 39 40 return client.hgclient(dest, encoding, configs, connect=False) CommandError: (255, b'', b'abandon\xc2\xa0: repository /fake not found') ----> 1 hglib.clone("http://github.com/softwareheritage/swh-core") ~/src/swh/swh-environment/.venv/lib/python3.7/site-packages/hglib/__init__.py in clone(source, dest, noupdate, updaterev, rev, branch, pull, uncompressed, ssh, remotecmd, insecure, encoding, configs) 36 out, err = proc.communicate() 37 if proc.returncode: ---> 38 raise error.CommandError(args, proc.returncode, out, err) 39 40 return client.hgclient(dest, encoding, configs, connect=False) CommandError: (255, b'', b'real URL is https://github.com/softwareheritage/swh-core\nabandon\xc2\xa0: \'http://github.com/softwareheritage/swh-core\' does not appear to be an hg repository:\n---%<--- (text/html; charset=utf-8)\n\n\n\n\n\n\n<!DOCTYPE html>\n<html lang="en">\n <head>\n <meta charset="utf-8">\n <link rel="dns-prefetch" href="https://github.githubassets.com">\n <link rel="dns-prefetch" href="https://avatars.githubusercontent.com">\n <link rel="dns-prefetch" href="https://github-cloud.s3.amazonaws.com">\n <link rel="dns-prefetch" href="https://user-images.githubusercontent.com/">\n\n\n\n <link crossorigin="anonymous" media="all" 
integrity="sha512-rF3cnLJE5IkKUWFkw54emxUMV82DhbZ9aJun83zhvBgJ7J7ZXC20bEFVuLY9RRRC60Ig+pHQO57DuYBrYO+cAA==" rel="stylesheet" href="https://github.githubassets.com/assets/frameworks-ac5ddc9cb244e4890a516164c39e1e9b.css" />\n <link crossorigin="anonymous" media="all" integrity="sha512-tO1butB3aXG+Ab9M+171Fjde3B2uzMU0DEAKzjbXJ0GLJWfiaIVEhM9QS3/G9Ck32IEZLmaSTscoyA9Z66IglQ==" rel="stylesheet" href="https://github.githubassets.com/assets/site-b4ed5bbad0776971be01bf4cfb5ef516.css" />\n <link crossorigin="anonymous" media="all" integrity="sha512-QbKgFXj+JoU12QsMYLRWqW9sWAzGHCCMC7FlsHunxzfLL4jGwfsmyAtbn4F/deHyB\n---%<---') vsellier: It seems it's not locale sensible :
```
In [7]: import locale
In [8]: locale.setlocale(locale. | |||||||||
]: | |||||||||
if msg in e.err: | |||||||||
Not Done Inline Actions
maybe? (from None excludes the stack trace of the CommandErrorWrapper exception from the new one, and I don't think we need it here) vlorentz: maybe?
(`from None` excludes the stack trace of the `CommandErrorWrapper` exception from the… | |||||||||
raise NotFound(e.args[0]) from None | |||||||||
raise e | |||||||||
else: # local repository | else: # local repository | ||||||||
self.working_directory = None | self.working_directory = None | ||||||||
self.hgdir = directory | self.hgdir = directory | ||||||||
self.bundle_path = os.path.join(self.hgdir, self.bundle_filename) | self.bundle_path = os.path.join(self.hgdir, self.bundle_filename) | ||||||||
self.log.debug("Bundling at %s" % self.bundle_path) | self.log.debug("Bundling at %s" % self.bundle_path) | ||||||||
with hglib.open(self.hgdir) as repo: | with hglib.open(self.hgdir) as repo: | ||||||||
self.heads = self.get_heads(repo) | self.heads = self.get_heads(repo) | ||||||||
repo.bundle(bytes(self.bundle_path, "utf-8"), all=True, type=b"none-v2") | repo.bundle(bytes(self.bundle_path, "utf-8"), all=True, type=b"none-v2") | ||||||||
self.cache_filename1 = os.path.join( | self.cache_filename1 = os.path.join( | ||||||||
self.hgdir, "swh-cache-1-%s" % (hex(random.randint(0, 0xFFFFFF))[2:],) | self.hgdir, "swh-cache-1-%s" % (hex(random.randint(0, 0xFFFFFF))[2:],) | ||||||||
) | ) | ||||||||
self.cache_filename2 = os.path.join( | self.cache_filename2 = os.path.join( | ||||||||
▲ Show 20 Lines • Show All 365 Lines • Show Last 20 Lines |
Could you briefly explain what this exception is for? (e.g. error raised when calling `hg`, with the returned message as the `err` attribute)