diff --git a/mypy.ini b/mypy.ini --- a/mypy.ini +++ b/mypy.ini @@ -13,3 +13,6 @@ [mypy-pytest.*] ignore_missing_imports = True + +[mypy-swh.loader.package.*] +ignore_missing_imports = True diff --git a/requirements-swh.txt b/requirements-swh.txt --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,4 +1,4 @@ swh.storage >= 0.0.163 swh.model >= 0.0.59 swh.scheduler >= 0.0.39 -swh.loader.core >= 0.0.78 +swh.loader.core >= 0.0.80 diff --git a/swh/loader/svn/converters.py b/swh/loader/svn/converters.py --- a/swh/loader/svn/converters.py +++ b/swh/loader/svn/converters.py @@ -3,12 +3,20 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from typing import Any, Dict, List, Optional, Union + from email import utils +from swh.model.model import ( + Person, Revision, RevisionType, TimestampWithTimezone +) +from swh.loader.package.utils import EMPTY_AUTHOR + from .utils import strdate_to_timestamp -def svn_date_to_swh_date(strdate): +def svn_date_to_swh_date( + strdate: Optional[str]) -> TimestampWithTimezone: """Convert a string date to an swh one. Args: @@ -19,46 +27,45 @@ An swh date format """ - return { - 'timestamp': strdate_to_timestamp(strdate), - 'offset': 0 - } + return TimestampWithTimezone( + timestamp=strdate_to_timestamp(strdate), + offset=0, + negative_utc=False, + ) -def svn_author_to_swh_person(author): +def svn_author_to_swh_person(author: Union[str, bytes]) -> Person: """Convert an svn author to an swh person. Default policy: No information is added. Args: author (string): the svn author (in bytes) - Returns: a dictionary with keys: - fullname: the author's associated fullname - name: the author's associated name - email: None (no email in svn) + Returns: + a Person """ + # TODO: Align this function and move it up as library helper function if not author: - return {'fullname': b'', 'name': None, 'email': None} + return EMPTY_AUTHOR if isinstance(author, str): author = author.encode('utf-8') if b'<' in author and b'>' in author: name, email = utils.parseaddr(author.decode('utf-8')) - return { - 'fullname': author, - 'name': name.encode('utf-8'), - 'email': email.encode('utf-8') - } + return Person( + fullname=author, + name=name.encode('utf-8'), + email=email.encode('utf-8') + ) - return {'fullname': author, 'email': None, 'name': author} + return Person(fullname=author, name=author, email=None) - } - - -def build_swh_revision(rev, commit, repo_uuid, dir_id, parents): +def build_swh_revision( + rev: int, commit: Dict, repo_uuid: str, dir_id: bytes, + parents: List[bytes]) -> Revision: """Given a svn revision, build a swh revision. This adds an ['metadata']['extra-headers'] entry with the @@ -79,22 +86,22 @@ msg = commit['message'] date = commit['author_date'] - metadata = { + metadata: Dict[str, Any] = { 'extra_headers': [ ['svn_repo_uuid', repo_uuid], ['svn_revision', str(rev).encode('utf-8')] ] } - return { - 'date': date, - 'committer_date': date, - 'type': 'svn', - 'directory': dir_id, - 'message': msg, - 'author': author, - 'committer': author, - 'synthetic': True, - 'metadata': metadata, - 'parents': parents, - } + return Revision( + type=RevisionType.SUBVERSION, + date=date, + committer_date=date, + directory=dir_id, + message=msg, + author=author, + committer=author, + synthetic=True, + metadata=metadata, + parents=parents, + ) diff --git a/swh/loader/svn/loader.py b/swh/loader/svn/loader.py --- a/swh/loader/svn/loader.py +++ b/swh/loader/svn/loader.py @@ -15,12 +15,14 @@ from mmap import mmap, ACCESS_WRITE from subprocess import Popen +from typing import Iterator, List, Tuple from swh.model import hashutil -from swh.model.from_disk import Directory -from swh.model.identifiers import identifier_to_bytes, revision_identifier -from swh.model.identifiers import snapshot_identifier -from swh.loader.core.converters import prepare_contents +from swh.model.model import ( + Content, Directory, Origin, SkippedContent, Revision, Snapshot, + SnapshotBranch, TargetType +) +from swh.model import from_disk from swh.loader.core.loader import BaseLoader from swh.loader.core.utils import clean_dangling_folders from swh.storage.algos.snapshot import snapshot_get_all_branches @@ -37,23 +39,14 @@ DEFAULT_BRANCH = b'HEAD' -def _revision_id(revision): - return identifier_to_bytes(revision_identifier(revision)) - - def build_swh_snapshot(revision_id, branch=DEFAULT_BRANCH): """Build a swh snapshot from the revision id, origin url, and visit. """ - return { - 'id': None, - 'branches': { - branch: { - 'target': revision_id, - 'target_type': 'revision', - } - } - } + return Snapshot(branches={ + branch: SnapshotBranch( + target=revision_id, target_type=TargetType.REVISION) + }) TEMPORARY_DIR_PREFIX_PATTERN = 'swh.loader.svn.' @@ -100,6 +93,7 @@ self.check_revision = None # internal state used to store swh objects self._contents = [] + self._skipped_contents = [] self._directories = [] self._revisions = [] self._snapshot = None @@ -145,7 +139,7 @@ """ local_dirname, local_url = self.svnrepo.export_temporary(revision) - h = Directory.from_disk(path=local_url).hash + h = from_disk.Directory.from_disk(path=local_url).hash self.svnrepo.clean_fs(local_dirname) return h @@ -237,12 +231,9 @@ rev, _, commit, _, root_dir = list(hash_data_per_revs)[0] dir_id = root_dir.hash - swh_revision = self.build_swh_revision(rev, - commit, - dir_id, - parents) - swh_revision_id = _revision_id(swh_revision) - + swh_revision = self.build_swh_revision( + rev, commit, dir_id, parents) + swh_revision_id = swh_revision.id return swh_revision_id == revision_id def _init_from(self, partial_swh_revision, previous_swh_revision): @@ -388,8 +379,14 @@ hashutil.hash_to_hex(checked_dir_id)) raise ValueError(err) - def process_svn_revisions(self, svnrepo, revision_start, revision_end, - revision_parents): + def process_svn_revisions( + self, svnrepo, revision_start, revision_end, + revision_parents) -> Iterator[ + Tuple[ + List[Content], List[SkippedContent], List[Directory], + Revision + ] + ]: """Process svn revisions from revision_start to revision_end. At each svn revision, apply new diffs and simultaneously @@ -416,33 +413,30 @@ for rev, nextrev, commit, new_objects, root_directory in gen_revs: count += 1 # Send the associated contents/directories - _contents = new_objects.get('content', {}).values() - _directories = new_objects.get('directory', {}).values() + _contents, _skipped_contents, _directories = new_objects # compute the fs tree's checksums dir_id = root_directory.hash swh_revision = self.build_swh_revision( rev, commit, dir_id, revision_parents[rev]) - swh_revision['id'] = _revision_id(swh_revision) - self.log.debug('rev: %s, swhrev: %s, dir: %s' % ( rev, - hashutil.hash_to_hex(swh_revision['id']), + hashutil.hash_to_hex(swh_revision.id), hashutil.hash_to_hex(dir_id))) if self.check_revision: self._check_revision_divergence(count, rev, dir_id) if nextrev: - revision_parents[nextrev] = [swh_revision['id']] + revision_parents[nextrev] = [swh_revision.id] - yield _contents, _directories, swh_revision + yield _contents, _skipped_contents, _directories, swh_revision def prepare_origin_visit(self, *args, **kwargs): - self.origin = { - 'url': self.origin_url if self.origin_url else self.svn_url, - } + self.origin = Origin( + url=self.origin_url if self.origin_url else self.svn_url + ) def prepare(self, *args, **kwargs): if self.swh_revision: @@ -514,10 +508,11 @@ self.done = True self._visit_status = 'partial' return False # Stopping iteration - self._contents, self._directories, revision = data - if revision: + self._contents, self._skipped_contents, self._directories, rev = data + if rev: + revision = rev self._last_revision = revision - self._revisions.append(revision) + self._revisions.append(revision) return True # next svn revision def store_data(self): @@ -528,11 +523,8 @@ This also resets the internal instance variable state. """ - contents, skipped_contents = prepare_contents( - self._contents, max_content_size=self.max_content_size, - origin_url=self.origin['url']) - self.storage.skipped_content_add(skipped_contents) - self.storage.content_add(contents) + self.storage.skipped_content_add(self._skipped_contents) + self.storage.content_add(self._contents) self.storage.directory_add(self._directories) self.storage.revision_add(self._revisions) @@ -542,8 +534,8 @@ snapshot=self._snapshot ) self.flush() - self.storage.origin_visit_update(self.origin['url'], self.visit, - snapshot=snapshot['id']) + self.storage.origin_visit_update( + self.origin.url, self.visit, snapshot=snapshot.id) self._contents = [] self._directories = [] @@ -564,8 +556,7 @@ """ if revision: # Priority to the revision - snap = build_swh_snapshot(revision['id']) - snap['id'] = identifier_to_bytes(snapshot_identifier(snap)) + snap = build_swh_snapshot(revision.id) elif snapshot: # Fallback to prior snapshot snap = snapshot else: diff --git a/swh/loader/svn/ra.py b/swh/loader/svn/ra.py --- a/swh/loader/svn/ra.py +++ b/swh/loader/svn/ra.py @@ -13,11 +13,14 @@ import shutil import tempfile +from typing import List, Tuple + from subvertpy import delta, properties from subvertpy.ra import RemoteAccess, Auth, get_username_provider from swh.model import hashutil -from swh.model.from_disk import Content, Directory +from swh.model import from_disk +from swh.model.model import Content, Directory, SkippedContent _eol_style = { @@ -245,10 +248,11 @@ data = f.read() data = _normalize_line_endings(data, eol_style) mode = os.lstat(self.fullpath).st_mode - self.directory[self.path] = Content.from_bytes(mode=mode, - data=data) + self.directory[self.path] = from_disk.Content.from_bytes( + mode=mode, data=data) else: - self.directory[self.path] = Content.from_file(path=self.fullpath) + self.directory[self.path] = from_disk.Content.from_file( + path=self.fullpath) class BaseDirEditor: @@ -318,7 +322,7 @@ """ path = os.fsencode(args[0]) - self.directory[path] = Content() + self.directory[path] = from_disk.Content() return FileEditor(self.directory, rootpath=self.rootpath, path=path) def add_file(self, path, copyfrom_path=None, copyfrom_rev=-1): @@ -326,7 +330,7 @@ """ path = os.fsencode(path) - self.directory[path] = Content() + self.directory[path] = from_disk.Content() return FileEditor(self.directory, self.rootpath, path) def change_prop(self, key, value): @@ -378,7 +382,7 @@ """ path = os.fsencode(path) os.makedirs(os.path.join(self.rootpath, path), exist_ok=True) - self.directory[path] = Directory() + self.directory[path] = from_disk.Directory() return self @@ -414,7 +418,7 @@ self.conn = conn self.rootpath = rootpath if directory is None: - directory = Directory() + directory = from_disk.Directory() self.directory = directory self.editor = Editor(rootpath=rootpath, directory=directory) @@ -433,8 +437,9 @@ codecs.register_error("strict", codecs.strict_errors) return self.editor.directory - def compute_hashes(self, rev): - """Compute hashes at revisions rev. + def compute_objects(self, rev: int) -> Tuple[ + List[Content], List[SkippedContent], List[Directory]]: + """Compute objects at revisions rev. Expects the state to be at previous revision's objects. Args: @@ -446,7 +451,24 @@ """ self.replay(rev) - return self.directory.collect() + # TODO: Move this listing up in swh.model + contents: List[Content] = [] + skipped_contents: List[SkippedContent] = [] + directories: List[Directory] = [] + for obj in self.directory.iter_tree(): + obj = obj.to_model() + if isinstance(obj, Content): + obj = obj.with_data() + contents.append(obj) + elif isinstance(obj, SkippedContent): + skipped_contents.append(obj) + elif isinstance(obj, Directory): + directories.append(obj) + else: + raise TypeError( + f'Unexpected content type from disk: {obj}') + + return contents, skipped_contents, directories @click.command() @@ -487,12 +509,13 @@ replay = Replay(conn, rootpath) for rev in range(revision_start, revision_end+1): - objects = replay.compute_hashes(rev) + contents, skipped_contents, directories = replay.compute_objects( + rev) print("r%s %s (%s new contents, %s new directories)" % ( rev, hashutil.hash_to_hex(replay.directory.hash), - len(objects.get('content', {})), - len(objects.get('directory', {})), + len(contents) + len(skipped_contents), + len(directories), )) if debug: diff --git a/swh/loader/svn/svn.py b/swh/loader/svn/svn.py --- a/swh/loader/svn/svn.py +++ b/swh/loader/svn/svn.py @@ -221,7 +221,7 @@ """ for commit in self.logs(start_revision, end_revision): rev = commit['rev'] - objects = self.swhreplay.compute_hashes(rev) + objects = self.swhreplay.compute_objects(rev) if rev == end_revision: nextrev = None diff --git a/swh/loader/svn/tests/test_converters.py b/swh/loader/svn/tests/test_converters.py --- a/swh/loader/svn/tests/test_converters.py +++ b/swh/loader/svn/tests/test_converters.py @@ -3,148 +3,135 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import unittest - +from swh.model.hashutil import hash_to_bytes +from swh.model.model import ( + Person, Revision, Timestamp, TimestampWithTimezone +) from swh.loader.svn import converters -class TestAuthorConverters(unittest.TestCase): - def test_svn_author_to_swh_person(self): - """The author should have name, email and fullname filled. - - """ - actual_person = converters.svn_author_to_swh_person( - 'tony ') - self.assertEqual(actual_person, { - 'fullname': b'tony ', - 'name': b'tony', - 'email': b'ynot@dagobah', - }) - - def test_svn_author_to_swh_person_no_email(self): - """The author and fullname should be the same as the input (author). - - """ - actual_person = converters.svn_author_to_swh_person('tony') - self.assertEqual(actual_person, { - 'fullname': b'tony', - 'name': b'tony', - 'email': None, - }) - - def test_svn_author_to_swh_person_empty_person(self): - """Empty person has only its fullname filled with the empty - byte-string. - - """ - actual_person = converters.svn_author_to_swh_person('') - self.assertEqual(actual_person, { - 'fullname': b'', - 'name': None, - 'email': None, - }) - - -class TestRevisionConverters(unittest.TestCase): - def test_build_swh_revision_default(self): - """This should build the swh revision with the swh revision's extra - headers about the repository. - - """ - actual_swh_revision = converters.build_swh_revision( - repo_uuid=b'uuid', - dir_id='dir-id', - commit={ - 'author_name': { - 'name': b'theo', - 'email': b'theo@uuid', - 'fullname': b'theo ' - }, - 'message': b'commit message', - 'author_date': { - 'timestamp': { - 'seconds': 1088108379, - 'microseconds': 0, - }, - 'offset': 0 - } - }, - rev=10, - parents=['123']) - - date = { - 'timestamp': { - 'seconds': 1088108379, - 'microseconds': 0, - }, - 'offset': 0, - } - - self.assertEqual(actual_swh_revision, { - 'date': date, - 'committer_date': date, - 'type': 'svn', - 'directory': 'dir-id', +def test_svn_author_to_swh_person(): + """The author should have name, email and fullname filled. + + """ + actual_person = converters.svn_author_to_swh_person( + 'tony ') + + assert actual_person == Person.from_dict({ + 'fullname': b'tony ', + 'name': b'tony', + 'email': b'ynot@dagobah', + }) + + +def test_svn_author_to_swh_person_no_email(): + """The author and fullname should be the same as the input (author). + + """ + actual_person = converters.svn_author_to_swh_person('tony') + assert actual_person == Person.from_dict({ + 'fullname': b'tony', + 'name': b'tony', + 'email': None, + }) + + +def test_svn_author_to_swh_person_empty_person(): + """Empty person has only its fullname filled with the empty + byte-string. + + """ + actual_person = converters.svn_author_to_swh_person('') + assert actual_person == Person.from_dict({ + 'fullname': b'', + 'name': None, + 'email': None, + }) + + +def test_build_swh_revision_default(): + """This should build the swh revision with the swh revision's extra + headers about the repository. + + """ + dir_id = hash_to_bytes('d6e08e19159f77983242877c373c75222d5ae9dd') + date = TimestampWithTimezone( + timestamp=Timestamp(seconds=1088108379, microseconds=0), + offset=0, + negative_utc=False, + ) + actual_rev = converters.build_swh_revision( + repo_uuid=b'uuid', + dir_id=dir_id, + commit={ + 'author_name': Person( + name=b'theo', + email=b'theo@uuid', + fullname=b'theo ' + ), 'message': b'commit message', - 'author': { - 'name': b'theo', - 'email': b'theo@uuid', - 'fullname': b'theo ' - }, - 'committer': { - 'name': b'theo', - 'email': b'theo@uuid', - 'fullname': b'theo ' - }, - 'synthetic': True, - 'metadata': { - 'extra_headers': [ - ['svn_repo_uuid', b'uuid'], - ['svn_revision', b'10'], - ] - }, - 'parents': ['123'], - }) - - -class ConvertDate(unittest.TestCase): - def test_svn_date_to_swh_date(self): - """The timestamp should not be tampered with and include the - decimals. - - """ - self.assertEqual( - converters.svn_date_to_swh_date('2011-05-31T06:04:39.500900Z'), { - 'timestamp': { - 'seconds': 1306821879, - 'microseconds': 500900, - }, - 'offset': 0 - }) - - self.assertEqual( - converters.svn_date_to_swh_date('2011-05-31T06:04:39.800722Z'), - { - 'timestamp': { - 'seconds': 1306821879, - 'microseconds': 800722, - }, - 'offset': 0 - }) - - def test_svn_date_to_swh_date_epoch(self): - """Empty date should be EPOCH (timestamp and offset at 0).""" - # It should return 0, epoch - self.assertEqual({ - 'timestamp': { - 'seconds': 0, - 'microseconds': 0, - }, - 'offset': 0, - }, converters.svn_date_to_swh_date('')) - self.assertEqual({ - 'timestamp': { - 'seconds': 0, - 'microseconds': 0, - }, 'offset': 0, - }, converters.svn_date_to_swh_date(None)) + 'author_date': date, + }, + rev=10, + parents=[]) + + expected_rev = Revision.from_dict({ + 'date': date.to_dict(), + 'committer_date': date.to_dict(), + 'type': 'svn', + 'directory': dir_id, + 'message': b'commit message', + 'author': { + 'name': b'theo', + 'email': b'theo@uuid', + 'fullname': b'theo ' + }, + 'committer': { + 'name': b'theo', + 'email': b'theo@uuid', + 'fullname': b'theo ' + }, + 'synthetic': True, + 'metadata': { + 'extra_headers': [ + ['svn_repo_uuid', b'uuid'], + ['svn_revision', b'10'], + ] + }, + 'parents': [], + }) + + assert actual_rev == expected_rev + + +def test_svn_date_to_swh_date(): + """The timestamp should not be tampered with and include the + decimals. + + """ + assert converters.svn_date_to_swh_date('2011-05-31T06:04:39.500900Z') == \ + TimestampWithTimezone( + timestamp=Timestamp(seconds=1306821879, microseconds=500900), + offset=0, + negative_utc=False, + ) + + assert converters.svn_date_to_swh_date('2011-05-31T06:04:39.800722Z') == \ + TimestampWithTimezone( + timestamp=Timestamp(seconds=1306821879, microseconds=800722), + offset=0, + negative_utc=False, + ) + + +def test_svn_date_to_swh_date_epoch(): + """Empty date should be EPOCH (timestamp and offset at 0).""" + # It should return 0, epoch + default_tstz = TimestampWithTimezone( + timestamp=Timestamp(seconds=0, microseconds=0), + offset=0, + negative_utc=False, + ) + + assert converters.svn_date_to_swh_date('') == default_tstz + assert converters.svn_date_to_swh_date(None) == default_tstz diff --git a/swh/loader/svn/tests/test_loader.py b/swh/loader/svn/tests/test_loader.py --- a/swh/loader/svn/tests/test_loader.py +++ b/swh/loader/svn/tests/test_loader.py @@ -10,18 +10,27 @@ from swh.loader.svn.loader import (DEFAULT_BRANCH, SvnLoader, SvnLoaderFromRemoteDump, build_swh_snapshot) from swh.model import hashutil +from swh.model.model import ( + Origin, Snapshot +) def test_build_swh_snapshot(): - assert build_swh_snapshot('revision-id') == { - 'id': None, + rev_id = hashutil.hash_to_bytes( + '3f51abf3b3d466571be0855dfa67e094f9ceff1b') + snap = build_swh_snapshot(rev_id) + + assert isinstance(snap, Snapshot) + + expected_snapshot = Snapshot.from_dict({ 'branches': { DEFAULT_BRANCH: { - 'target': 'revision-id', + 'target': rev_id, 'target_type': 'revision', } } - } + }) + assert snap == expected_snapshot _LOADER_TEST_CONFIG = { @@ -35,9 +44,6 @@ 'storage': { 'cls': 'pipeline', 'steps': [ - { - 'cls': 'validate' - }, { 'cls': 'retry', }, @@ -106,14 +112,7 @@ super().__init__(url, destination_path=destination_path, start_from_scratch=start_from_scratch, swh_revision=swh_revision) - self.origin = { - 'id': 1, - 'url': url, - } - self.visit = { - 'origin': 1, - 'visit': 1, - } + self.origin = Origin(url=url) self.last_snp_rev = last_snp_rev def parse_config_file(self, *args, **kwargs): @@ -207,10 +206,10 @@ _LAST_SNP_REV = { - 'snapshot': { + 'snapshot': Snapshot.from_dict({ 'id': GOURMET_FLAG_SNAPSHOT, 'branches': {} - }, + }), 'revision': { 'id': hashutil.hash_to_bytes( '4876cb10aec6f708f7466dddf547567b65f6c39c'), diff --git a/swh/loader/svn/tests/test_utils.py b/swh/loader/svn/tests/test_utils.py --- a/swh/loader/svn/tests/test_utils.py +++ b/swh/loader/svn/tests/test_utils.py @@ -1,46 +1,46 @@ -# Copyright (C) 2016 The Software Heritage developers +# Copyright (C) 2016-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import pty -import unittest from subprocess import Popen +from swh.model.model import Timestamp from swh.loader.svn import utils -class TestUtils(unittest.TestCase): - def test_strdate_to_timestamp(self): - """Formatted string date should be converted in timestamp.""" - actual_ts = utils.strdate_to_timestamp('2011-05-31T06:04:39.800722Z') - self.assertEqual(actual_ts, {'seconds': 1306821879, - 'microseconds': 800722}) - - actual_ts = utils.strdate_to_timestamp('2011-05-31T06:03:39.123450Z') - self.assertEqual(actual_ts, {'seconds': 1306821819, - 'microseconds': 123450}) - - def test_strdate_to_timestamp_empty_does_not_break(self): - """Empty or None date should be timestamp 0.""" - self.assertEqual({'seconds': 0, 'microseconds': 0}, - utils.strdate_to_timestamp('')) - self.assertEqual({'seconds': 0, 'microseconds': 0}, - utils.strdate_to_timestamp(None)) - - def test_outputstream(self): - stdout_r, stdout_w = pty.openpty() - echo = Popen(['echo', '-e', 'foo\nbar\nbaz'], stdout=stdout_w) - os.close(stdout_w) - stdout_stream = utils.OutputStream(stdout_r) - lines = [] - while True: - current_lines, readable = stdout_stream.read_lines() - lines += current_lines - if not readable: - break - echo.wait() - os.close(stdout_r) - self.assertEqual(lines, ['foo', 'bar', 'baz']) +def test_outputstream(): + stdout_r, stdout_w = pty.openpty() + echo = Popen(['echo', '-e', 'foo\nbar\nbaz'], stdout=stdout_w) + os.close(stdout_w) + stdout_stream = utils.OutputStream(stdout_r) + lines = [] + while True: + current_lines, readable = stdout_stream.read_lines() + lines += current_lines + if not readable: + break + echo.wait() + os.close(stdout_r) + assert lines == ['foo', 'bar', 'baz'] + + +def test_strdate_to_timestamp(): + """Formatted string date should be converted in timestamp.""" + actual_ts = utils.strdate_to_timestamp('2011-05-31T06:04:39.800722Z') + assert actual_ts == Timestamp(seconds=1306821879, + microseconds=800722) + + actual_ts = utils.strdate_to_timestamp('2011-05-31T06:03:39.123450Z') + assert actual_ts == Timestamp(seconds=1306821819, + microseconds=123450) + + +def test_strdate_to_timestamp_empty_does_not_break(): + """Empty or None date should be timestamp 0.""" + default_ts = Timestamp(seconds=0, microseconds=0) + assert default_ts == utils.strdate_to_timestamp('') + assert default_ts == utils.strdate_to_timestamp(None) diff --git a/swh/loader/svn/utils.py b/swh/loader/svn/utils.py --- a/swh/loader/svn/utils.py +++ b/swh/loader/svn/utils.py @@ -1,4 +1,4 @@ -# Copyright (C) 2016 The Software Heritage developers +# Copyright (C) 2016-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -11,8 +11,10 @@ from dateutil import parser from subprocess import PIPE, Popen, call +from swh.model.model import Optional, Timestamp -def strdate_to_timestamp(strdate): + +def strdate_to_timestamp(strdate: Optional[str]) -> Timestamp: """Convert a string date to an int timestamp. Args: @@ -24,6 +26,7 @@ """ if strdate: + # TODO: Migrate to iso8601 if possible dt = parser.parse(strdate) ts = { 'seconds': int(dt.timestamp()), @@ -31,7 +34,7 @@ } else: # epoch ts = {'seconds': 0, 'microseconds': 0} - return ts + return Timestamp.from_dict(ts) class OutputStream: