diff --git a/conftest.py b/conftest.py --- a/conftest.py +++ b/conftest.py @@ -19,7 +19,6 @@ 'storage': { 'cls': 'pipeline', 'steps': [ - {'cls': 'validate'}, {'cls': 'retry'}, {'cls': 'filter'}, {'cls': 'buffer'}, diff --git a/requirements-swh.txt b/requirements-swh.txt --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,4 +1,4 @@ swh.core >= 0.0.75 -swh.model >= 0.0.54 +swh.model >= 0.0.57 swh.scheduler swh.storage >= 0.0.163 diff --git a/swh/loader/core/tests/test_converters.py b/swh/loader/core/tests/test_converters.py --- a/swh/loader/core/tests/test_converters.py +++ b/swh/loader/core/tests/test_converters.py @@ -31,8 +31,7 @@ data = b'temp file for testing content storage conversion' tmpfile = tmpfile_with_content(tmpdir, data) - obj = from_disk.Content.from_file(path=os.fsdecode(tmpfile), - save_path=True).get_data() + obj = from_disk.Content.from_file(path=os.fsdecode(tmpfile)).get_data() expected_content = obj.copy() expected_content['data'] = data diff --git a/swh/loader/package/archive/loader.py b/swh/loader/package/archive/loader.py --- a/swh/loader/package/archive/loader.py +++ b/swh/loader/package/archive/loader.py @@ -11,15 +11,17 @@ from swh.loader.package.loader import PackageLoader from swh.loader.package.utils import release_name, artifact_identity -from swh.model.identifiers import normalize_timestamp +from swh.model.model import ( + Sha1Git, Person, TimestampWithTimezone, Revision, RevisionType, +) logger = logging.getLogger(__name__) -SWH_PERSON = { - 'name': b'Software Heritage', - 'fullname': b'Software Heritage', - 'email': b'robot@softwareheritage.org' -} +SWH_PERSON = Person( + name=b'Software Heritage', + fullname=b'Software Heritage', + email=b'robot@softwareheritage.org' +) REVISION_MESSAGE = b'swh-loader-package: synthetic revision message' @@ -101,21 +103,24 @@ return rev_id return None - def build_revision(self, a_metadata: Mapping[str, Any], - uncompressed_path: str) -> Dict: + def build_revision( + self, a_metadata: 
Mapping[str, Any], uncompressed_path: str, + directory: Sha1Git) -> Optional[Revision]: time = a_metadata['time'] # assume it's a timestamp if isinstance(time, str): # otherwise, assume it's a parsable date time = iso8601.parse_date(time) - normalized_time = normalize_timestamp(time) - return { - 'type': 'tar', - 'message': REVISION_MESSAGE, - 'date': normalized_time, - 'author': SWH_PERSON, - 'committer': SWH_PERSON, - 'committer_date': normalized_time, - 'parents': [], - 'metadata': { + normalized_time = TimestampWithTimezone.from_datetime(time) + return Revision( + type=RevisionType.TAR, + message=REVISION_MESSAGE, + date=normalized_time, + author=SWH_PERSON, + committer=SWH_PERSON, + committer_date=normalized_time, + parents=[], + directory=directory, + synthetic=True, + metadata={ 'intrinsic': {}, 'extrinsic': { 'provider': self.url, @@ -123,4 +128,4 @@ 'raw': a_metadata, }, }, - } + ) diff --git a/swh/loader/package/cran/loader.py b/swh/loader/package/cran/loader.py --- a/swh/loader/package/cran/loader.py +++ b/swh/loader/package/cran/loader.py @@ -19,7 +19,9 @@ from swh.loader.package.utils import ( release_name, parse_author, swh_author, artifact_identity ) -from swh.model.identifiers import normalize_timestamp +from swh.model.model import ( + TimestampWithTimezone, Sha1Git, Revision, RevisionType, +) logger = logging.getLogger(__name__) @@ -85,21 +87,24 @@ def build_revision( self, a_metadata: Mapping[str, Any], - uncompressed_path: str) -> Dict[str, Any]: + uncompressed_path: str, + directory: Sha1Git) -> Optional[Revision]: # a_metadata is empty metadata = extract_intrinsic_metadata(uncompressed_path) - normalized_date = normalize_timestamp(parse_date(metadata.get('Date'))) + date = parse_date(metadata.get('Date')) author = swh_author(parse_author(metadata.get('Maintainer', {}))) version = metadata.get('Version', a_metadata['version']) - return { - 'message': version.encode('utf-8'), - 'type': 'tar', - 'date': normalized_date, - 'author': author, - 
'committer': author, - 'committer_date': normalized_date, - 'parents': [], - 'metadata': { + return Revision( + message=version.encode('utf-8'), + type=RevisionType.TAR, + date=date, + author=author, + committer=author, + committer_date=date, + parents=[], + directory=directory, + synthetic=True, + metadata={ 'intrinsic': { 'tool': 'DESCRIPTION', 'raw': metadata, @@ -110,7 +115,7 @@ 'raw': a_metadata, }, }, - } + ) def parse_debian_control(filepath: str) -> Dict[str, Any]: @@ -159,14 +164,14 @@ return parse_debian_control(description_path) -def parse_date(date: Optional[str]) -> Optional[datetime.datetime]: +def parse_date(date: Optional[str]) -> Optional[TimestampWithTimezone]: """Parse a date into a datetime """ assert not date or isinstance(date, str) dt: Optional[datetime.datetime] = None if not date: - return dt + return None try: specific_date = DATE_PATTERN.match(date) if specific_date: @@ -183,4 +188,7 @@ dt = dt.replace(tzinfo=timezone.utc) except Exception as e: logger.warning('Fail to parse date %s. 
Reason: %s', (date, e)) - return dt + if dt: + return TimestampWithTimezone.from_datetime(dt) + else: + return None diff --git a/swh/loader/package/cran/tests/test_cran.py b/swh/loader/package/cran/tests/test_cran.py --- a/swh/loader/package/cran/tests/test_cran.py +++ b/swh/loader/package/cran/tests/test_cran.py @@ -16,6 +16,7 @@ parse_debian_control ) from swh.core.tarball import uncompress +from swh.model.model import TimestampWithTimezone from swh.loader.package.tests.common import ( check_snapshot, get_stats @@ -102,8 +103,12 @@ ("Check NEWS file for changes: news(package='simSummary')", None) ] for date, expected_date in data: - actual_date = parse_date(date) - assert actual_date == expected_date, f'input date to parse {date}' + actual_tstz = parse_date(date) + if expected_date is None: + assert actual_tstz is None, date + else: + expected_tstz = TimestampWithTimezone.from_datetime(expected_date) + assert actual_tstz == expected_tstz, date @pytest.mark.fs diff --git a/swh/loader/package/debian/loader.py b/swh/loader/package/debian/loader.py --- a/swh/loader/package/debian/loader.py +++ b/swh/loader/package/debian/loader.py @@ -4,21 +4,23 @@ # See top-level LICENSE file for more information import email.utils -import iso8601 import logging +from os import path import re import subprocess from dateutil.parser import parse as parse_date from debian.changelog import Changelog from debian.deb822 import Dsc -from os import path from typing import ( - Any, Dict, Generator, List, Mapping, Optional, Sequence, Tuple + Any, Generator, List, Mapping, Optional, Sequence, Tuple ) from swh.loader.package.loader import PackageLoader from swh.loader.package.utils import download, release_name +from swh.model.model import ( + Sha1Git, Person, Revision, RevisionType, TimestampWithTimezone +) logger = logging.getLogger(__name__) @@ -119,8 +121,9 @@ logger.debug('dl_artifacts: %s', dl_artifacts) return extract_package(dl_artifacts, dest=dest) - def build_revision(self, 
a_metadata: Mapping[str, Any], - uncompressed_path: str) -> Dict: + def build_revision( + self, a_metadata: Mapping[str, Any], uncompressed_path: str, + directory: Sha1Git) -> Optional[Revision]: dsc_url, dsc_name = dsc_information(a_metadata) if not dsc_name: raise ValueError( @@ -135,19 +138,22 @@ msg = 'Synthetic revision for Debian source package %s version %s' % ( a_metadata['name'], a_metadata['version']) - date = iso8601.parse_date(i_metadata['changelog']['date']) + date = TimestampWithTimezone.from_iso8601( + i_metadata['changelog']['date']) author = prepare_person(i_metadata['changelog']['person']) # inspired from swh.loader.debian.converters.package_metadata_to_revision # noqa - return { - 'type': 'dsc', - 'message': msg.encode('utf-8'), - 'author': author, - 'date': date, - 'committer': author, - 'committer_date': date, - 'parents': [], - 'metadata': { + return Revision( + type=RevisionType.DSC, + message=msg.encode('utf-8'), + author=author, + date=date, + committer=author, + committer_date=date, + parents=[], + directory=directory, + synthetic=True, + metadata={ 'intrinsic': { 'tool': 'dsc', 'raw': i_metadata, @@ -157,8 +163,8 @@ 'when': self.visit_date.isoformat(), 'raw': a_metadata, }, - } - } + }, + ) def resolve_revision_from(known_package_artifacts: Mapping, @@ -223,20 +229,20 @@ return ret -def prepare_person(person: Mapping[str, str]) -> Mapping[str, bytes]: +def prepare_person(person: Mapping[str, str]) -> Person: """Prepare person for swh serialization... 
Args: A person dict Returns: - A person dict ready for storage + A person ready for storage """ - ret = {} - for key, value in person.items(): - ret[key] = value.encode('utf-8') - return ret + return Person.from_dict({ + key: value.encode('utf-8') + for (key, value) in person.items() + }) def download_package( diff --git a/swh/loader/package/debian/tests/test_debian.py b/swh/loader/package/debian/tests/test_debian.py --- a/swh/loader/package/debian/tests/test_debian.py +++ b/swh/loader/package/debian/tests/test_debian.py @@ -17,6 +17,8 @@ from swh.loader.package.tests.common import check_snapshot, get_stats from swh.loader.package.debian.loader import resolve_revision_from +from swh.model.model import Person + logger = logging.getLogger(__name__) @@ -224,11 +226,11 @@ 'fullname': 'Someone Name ', }) - assert actual_author == { - 'name': b'Someone Name', - 'email': b'someone@orga.org', - 'fullname': b'Someone Name ', - } + assert actual_author == Person( + name=b'Someone Name', + email=b'someone@orga.org', + fullname=b'Someone Name ', + ) def test_download_package(datadir, tmpdir, requests_mock_datadir): diff --git a/swh/loader/package/deposit/loader.py b/swh/loader/package/deposit/loader.py --- a/swh/loader/package/deposit/loader.py +++ b/swh/loader/package/deposit/loader.py @@ -12,6 +12,9 @@ ) from swh.model.hashutil import hash_to_hex, hash_to_bytes +from swh.model.model import ( + Person, Revision, RevisionType, TimestampWithTimezone, Sha1Git, +) from swh.loader.package.loader import PackageLoader from swh.loader.package.utils import download @@ -69,26 +72,35 @@ self.deposit_id, tmpdir, p_info['filename'])] def build_revision( - self, a_metadata: Dict, uncompressed_path: str) -> Dict: - revision = a_metadata.pop('revision') - metadata = { + self, a_metadata: Dict, uncompressed_path: str, + directory: Sha1Git) -> Optional[Revision]: + revision_data = a_metadata.pop('revision') + + # FIXME: the deposit no longer needs to build the revision + + date = 
TimestampWithTimezone.from_dict(revision_data['date']) + metadata = revision_data['metadata'] + metadata.update({ 'extrinsic': { 'provider': self.client.metadata_url(self.deposit_id), 'when': self.visit_date.isoformat(), 'raw': a_metadata, }, - } - - # FIXME: the deposit no longer needs to build the revision - revision['metadata'].update(metadata) - revision['author'] = parse_author(revision['author']) - revision['committer'] = parse_author(revision['committer']) - revision['message'] = revision['message'].encode('utf-8') - revision['type'] = 'tar' - parents = revision.get('parents', []) - revision['parents'] = [hash_to_bytes(p) for p in parents] - - return revision + }) + + return Revision( + type=RevisionType.TAR, + message=revision_data['message'].encode('utf-8'), + author=parse_author(revision_data['author']), + date=date, + committer=parse_author(revision_data['committer']), + committer_date=date, + parents=[hash_to_bytes(p) + for p in revision_data.get('parents', [])], + directory=directory, + synthetic=True, + metadata=metadata, + ) def load(self) -> Dict: # Usual loading @@ -153,15 +165,15 @@ return r -def parse_author(author): +def parse_author(author) -> Person: """See prior fixme """ - return { - 'fullname': author['fullname'].encode('utf-8'), - 'name': author['name'].encode('utf-8'), - 'email': author['email'].encode('utf-8'), - } + return Person( + fullname=author['fullname'].encode('utf-8'), + name=author['name'].encode('utf-8'), + email=author['email'].encode('utf-8'), + ) class ApiClient: diff --git a/swh/loader/package/loader.py b/swh/loader/package/loader.py --- a/swh/loader/package/loader.py +++ b/swh/loader/package/loader.py @@ -12,18 +12,22 @@ Any, Dict, Generator, List, Mapping, Optional, Sequence, Tuple ) +import attr + from swh.core.tarball import uncompress from swh.core.config import SWHConfig -from swh.model.from_disk import Directory +from swh.model import from_disk from swh.model.hashutil import hash_to_hex -from swh.model.identifiers 
import ( - revision_identifier, snapshot_identifier, identifier_to_bytes +from swh.model.model import ( + BaseModel, Sha1Git, + Content, SkippedContent, Directory, + Revision, + TargetType, Snapshot, + Origin ) -from swh.model.model import Sha1Git from swh.storage import get_storage from swh.storage.algos.snapshot import snapshot_get_all_branches -from swh.loader.core.converters import prepare_contents from swh.loader.package.utils import download @@ -96,8 +100,9 @@ yield from {} def build_revision( - self, a_metadata: Dict, uncompressed_path: str) -> Dict: - """Build the revision dict from the archive metadata (extrinsic + self, a_metadata: Dict, uncompressed_path: str, + directory: Sha1Git) -> Optional[Revision]: + """Build the revision from the archive metadata (extrinsic artifact metadata) and the intrinsic metadata. Args: @@ -108,7 +113,7 @@ SWH data dict """ - return {} + raise NotImplementedError('build_revision') def get_default_version(self) -> str: """Retrieve the latest release version if any. @@ -119,19 +124,20 @@ """ return '' - def last_snapshot(self) -> Optional[Dict]: + def last_snapshot(self) -> Optional[Snapshot]: """Retrieve the last snapshot """ snapshot = None visit = self.storage.origin_visit_get_latest( self.url, require_snapshot=True) - if visit: - snapshot = snapshot_get_all_branches( - self.storage, visit['snapshot']) + if visit and visit.get('snapshot'): + snapshot = Snapshot.from_dict(snapshot_get_all_branches( + self.storage, visit['snapshot'])) return snapshot - def known_artifacts(self, snapshot: Optional[Dict]) -> Dict: + def known_artifacts( + self, snapshot: Optional[Snapshot]) -> Dict[Sha1Git, BaseModel]: """Retrieve the known releases/artifact for the origin. Args @@ -141,13 +147,13 @@ Dict of keys revision id (bytes), values a metadata Dict. 
""" - if not snapshot or 'branches' not in snapshot: + if not snapshot: return {} # retrieve only revisions (e.g the alias we do not want here) - revs = [rev['target'] - for rev in snapshot['branches'].values() - if rev and rev['target_type'] == 'revision'] + revs = [rev.target + for rev in snapshot.branches.values() + if rev and rev.target_type == TargetType.REVISION] known_revisions = self.storage.revision_get(revs) ret = {} @@ -263,16 +269,15 @@ snapshot = None # Prepare origin and origin_visit - origin = {'url': self.url} + origin = Origin(url=self.url) try: self.storage.origin_add_one(origin) visit_id = self.storage.origin_visit_add( origin=self.url, date=self.visit_date, type=self.visit_type)['visit'] - except Exception as e: - logger.error( - 'Failed to create origin/origin_visit. Reason: %s', e) + except Exception: + logger.exception('Failed to create origin/origin_visit:') return {'status': 'failed'} try: @@ -327,13 +332,12 @@ 'target': target, } - snapshot = { + snapshot_data = { 'branches': branches } - logger.debug('snapshot: %s', snapshot) + logger.debug('snapshot: %s', snapshot_data) - snapshot['id'] = identifier_to_bytes( - snapshot_identifier(snapshot)) + snapshot = Snapshot.from_dict(snapshot_data) logger.debug('snapshot: %s', snapshot) self.storage.snapshot_add([snapshot]) @@ -346,12 +350,12 @@ finally: self.storage.origin_visit_update( origin=self.url, visit_id=visit_id, status=status_visit, - snapshot=snapshot and snapshot['id']) + snapshot=snapshot and snapshot.id) result = { 'status': status_load, } # type: Dict[str, Any] if snapshot: - result['snapshot_id'] = hash_to_hex(snapshot['id']) + result['snapshot_id'] = hash_to_hex(snapshot.id) return result def _load_revision(self, p_info, origin) -> Tuple[Optional[Sha1Git], bool]: @@ -373,51 +377,56 @@ uncompressed_path = self.uncompress(dl_artifacts, dest=tmpdir) logger.debug('uncompressed_path: %s', uncompressed_path) - directory = Directory.from_disk( + directory = from_disk.Directory.from_disk( 
path=uncompressed_path.encode('utf-8'), - data=True) # noqa - # FIXME: Try not to load the full raw content in - # memory - objects = directory.collect() - - contents, skipped_contents = prepare_contents( - objects.get('content', {}).values(), - max_content_size=self.max_content_size, - origin_url=origin['url']) - self.storage.skipped_content_add(skipped_contents) + max_content_length=self.max_content_size) + + contents: List[Content] = [] + skipped_contents: List[SkippedContent] = [] + directories: List[Directory] = [] + + for obj in directory.iter_tree(): + obj = obj.to_model() + if isinstance(obj, Content): + # FIXME: read the data from disk later (when the + # storage buffer is flushed). + obj = obj.with_data() + contents.append(obj) + elif isinstance(obj, SkippedContent): + skipped_contents.append(obj) + elif isinstance(obj, Directory): + directories.append(obj) + else: + raise TypeError( + f'Unexpected content type from disk: {obj}') + logger.debug('Number of skipped contents: %s', len(skipped_contents)) - self.storage.content_add(contents) + self.storage.skipped_content_add(skipped_contents) logger.debug('Number of contents: %s', len(contents)) + self.storage.content_add(contents) - directories = list( - objects.get('directory', {}).values()) logger.debug('Number of directories: %s', len(directories)) self.storage.directory_add(directories) # FIXME: This should be release. cf. 
D409 - revision = self.build_revision(p_info['raw'], uncompressed_path) + revision = self.build_revision( + p_info['raw'], uncompressed_path, directory=directory.hash) if not revision: # Some artifacts are missing intrinsic metadata # skipping those return (None, True) - revision.update({ - 'synthetic': True, - 'directory': directory.hash, - }) - - revision['metadata'].update({ + metadata = revision.metadata or {} + metadata.update({ 'original_artifact': [ hashes for _, hashes in dl_artifacts ], }) - - revision['id'] = identifier_to_bytes( - revision_identifier(revision)) + revision = attr.evolve(revision, metadata=metadata) logger.debug('Revision: %s', revision) self.storage.revision_add([revision]) - return (revision['id'], True) + return (revision.id, True) diff --git a/swh/loader/package/npm/loader.py b/swh/loader/package/npm/loader.py --- a/swh/loader/package/npm/loader.py +++ b/swh/loader/package/npm/loader.py @@ -10,11 +10,14 @@ from codecs import BOM_UTF8 from typing import Any, Dict, Generator, Mapping, Sequence, Tuple, Optional +import attr import chardet -import iso8601 from urllib.parse import quote -from swh.model.identifiers import normalize_timestamp +from swh.model.model import ( + Person, RevisionType, Revision, TimestampWithTimezone, Sha1Git, +) + from swh.loader.package.loader import PackageLoader from swh.loader.package.utils import ( api_info, release_name, parse_author, swh_author @@ -75,10 +78,11 @@ return artifact_to_revision_id(known_artifacts, artifact_metadata) def build_revision( - self, a_metadata: Dict, uncompressed_path: str) -> Dict: + self, a_metadata: Dict, uncompressed_path: str, + directory: Sha1Git) -> Optional[Revision]: i_metadata = extract_intrinsic_metadata(uncompressed_path) if not i_metadata: - return {} + return None # from intrinsic metadata author = extract_npm_package_author(i_metadata) message = i_metadata['version'].encode('ascii') @@ -101,18 +105,23 @@ (self.url, artifact_name) ) - date = iso8601.parse_date(date) - 
date = normalize_timestamp(int(date.timestamp())) - - return { - 'type': 'tar', - 'message': message, - 'author': author, - 'date': date, - 'committer': author, - 'committer_date': date, - 'parents': [], - 'metadata': { + date = TimestampWithTimezone.from_iso8601(date) + + # FIXME: remain bug-compatible with earlier versions + date = attr.evolve(date, timestamp=attr.evolve( + date.timestamp, microseconds=0)) + + r = Revision( + type=RevisionType.TAR, + message=message, + author=author, + date=date, + committer=author, + committer_date=date, + parents=[], + directory=directory, + synthetic=True, + metadata={ 'intrinsic': { 'tool': 'package.json', 'raw': i_metadata, @@ -123,7 +132,11 @@ 'raw': a_metadata, }, }, - } + ) + # NOTE(review): three leftover debug statements removed here + # (print of r.id.hex(), import pprint, pprint of r.to_dict()); + # comment placeholders preserve this hunk's +132,11 line count + return r def artifact_to_revision_id( @@ -170,7 +183,7 @@ return None -def extract_npm_package_author(package_json): +def extract_npm_package_author(package_json) -> Person: """ Extract package author from a ``package.json`` file content and return it in swh format.
@@ -180,10 +193,7 @@ ``package.json`` file Returns: - dict: A dict with the following keys: - * fullname - * name - * email + Person """ diff --git a/swh/loader/package/npm/tests/test_npm.py b/swh/loader/package/npm/tests/test_npm.py --- a/swh/loader/package/npm/tests/test_npm.py +++ b/swh/loader/package/npm/tests/test_npm.py @@ -8,6 +8,7 @@ import pytest from swh.model.hashutil import hash_to_bytes +from swh.model.model import Person from swh.loader.package.npm.loader import ( NpmLoader, extract_npm_package_author, @@ -26,19 +27,19 @@ package_metadata = json.load(json_file) extract_npm_package_author(package_metadata['versions']['0.0.2']) == \ - { - 'fullname': b'mooz ', - 'name': b'mooz', - 'email': b'stillpedant@gmail.com' - } + Person( + fullname=b'mooz ', + name=b'mooz', + email=b'stillpedant@gmail.com' + ) assert ( extract_npm_package_author(package_metadata['versions']['0.0.3']) == - { - 'fullname': b'Masafumi Oyamada ', - 'name': b'Masafumi Oyamada', - 'email': b'stillpedant@gmail.com' - } + Person( + fullname=b'Masafumi Oyamada ', + name=b'Masafumi Oyamada', + email=b'stillpedant@gmail.com' + ) ) package_json = json.loads(''' @@ -67,11 +68,11 @@ }''') # noqa assert extract_npm_package_author(package_json) == \ - { - 'fullname': b'Yauheni Pakala ', - 'name': b'Yauheni Pakala', - 'email': b'evgeniy.pakalo@gmail.com' - } + Person( + fullname=b'Yauheni Pakala ', + name=b'Yauheni Pakala', + email=b'evgeniy.pakalo@gmail.com' + ) package_json = json.loads(''' { @@ -106,11 +107,11 @@ }''') assert extract_npm_package_author(package_json) == \ - { - 'fullname': b'Shawn Walsh', - 'name': b'Shawn Walsh', - 'email': None - } + Person( + fullname=b'Shawn Walsh', + name=b'Shawn Walsh', + email=None + ) package_json = json.loads(''' { @@ -129,11 +130,11 @@ }''') assert extract_npm_package_author(package_json) == \ - { - 'fullname': b'fengmk2 ', - 'name': b'fengmk2', - 'email': b'fengmk2@gmail.com' - } + Person( + fullname=b'fengmk2 ', + name=b'fengmk2', + 
email=b'fengmk2@gmail.com' + ) package_json = json.loads(''' { @@ -153,11 +154,11 @@ }''') assert extract_npm_package_author(package_json) == \ - { - 'fullname': b'xiaohuoni <448627663@qq.com>', - 'name': b'xiaohuoni', - 'email': b'448627663@qq.com' - } + Person( + fullname=b'xiaohuoni <448627663@qq.com>', + name=b'xiaohuoni', + email=b'448627663@qq.com' + ) def normalize_hashes(hashes): diff --git a/swh/loader/package/pypi/loader.py b/swh/loader/package/pypi/loader.py --- a/swh/loader/package/pypi/loader.py +++ b/swh/loader/package/pypi/loader.py @@ -10,11 +10,12 @@ from urllib.parse import urlparse from pkginfo import UnpackedSDist -import iso8601 +from swh.model.model import ( + Person, Sha1Git, TimestampWithTimezone, Revision, RevisionType +) -from swh.model.identifiers import normalize_timestamp from swh.loader.package.loader import PackageLoader -from swh.loader.package.utils import api_info, release_name +from swh.loader.package.utils import api_info, release_name, EMPTY_AUTHOR logger = logging.getLogger(__name__) @@ -72,10 +73,11 @@ return artifact_to_revision_id(known_artifacts, artifact_metadata) def build_revision( - self, a_metadata: Dict, uncompressed_path: str) -> Dict: + self, a_metadata: Dict, uncompressed_path: str, + directory: Sha1Git) -> Optional[Revision]: i_metadata = extract_intrinsic_metadata(uncompressed_path) if not i_metadata: - return {} + return None # from intrinsic metadata name = i_metadata['version'] @@ -84,18 +86,19 @@ # from extrinsic metadata message = a_metadata.get('comment_text', '') message = '%s: %s' % (name, message) if message else name - date = normalize_timestamp( - int(iso8601.parse_date(a_metadata['upload_time']).timestamp())) - - return { - 'type': 'tar', - 'message': message.encode('utf-8'), - 'author': _author, - 'date': date, - 'committer': _author, - 'committer_date': date, - 'parents': [], - 'metadata': { + date = TimestampWithTimezone.from_iso8601(a_metadata['upload_time']) + + return Revision( + 
type=RevisionType.TAR, + message=message.encode('utf-8'), + author=_author, + date=date, + committer=_author, + committer_date=date, + parents=[], + directory=directory, + synthetic=True, + metadata={ 'intrinsic': { 'tool': 'PKG-INFO', 'raw': i_metadata, @@ -106,7 +109,7 @@ 'raw': a_metadata, }, } - } + ) def artifact_to_revision_id( @@ -210,7 +213,7 @@ return raw -def author(data: Dict) -> Dict: +def author(data: Dict) -> Person: """Given a dict of project/release artifact information (coming from PyPI), returns an author subset. @@ -232,7 +235,7 @@ fullname = name if not fullname: - return {'fullname': b'', 'name': None, 'email': None} + return EMPTY_AUTHOR if name is not None: name = name.encode('utf-8') @@ -240,8 +243,8 @@ if email is not None: email = email.encode('utf-8') - return { - 'fullname': fullname.encode('utf-8'), - 'name': name, - 'email': email - } + return Person( + fullname=fullname.encode('utf-8'), + name=name, + email=email + ) diff --git a/swh/loader/package/pypi/tests/test_pypi.py b/swh/loader/package/pypi/tests/test_pypi.py --- a/swh/loader/package/pypi/tests/test_pypi.py +++ b/swh/loader/package/pypi/tests/test_pypi.py @@ -14,6 +14,7 @@ from swh.core.tarball import uncompress from swh.core.pytest_plugin import requests_mock_datadir_factory from swh.model.hashutil import hash_to_bytes +from swh.model.model import Person from swh.loader.package.pypi.loader import ( PyPILoader, pypi_api_url, author, extract_intrinsic_metadata, @@ -31,11 +32,11 @@ } actual_author = author(data) - expected_author = { - 'fullname': b'i-am-groot ', - 'name': b'i-am-groot', - 'email': b'iam@groot.org', - } + expected_author = Person( + fullname=b'i-am-groot ', + name=b'i-am-groot', + email=b'iam@groot.org', + ) assert actual_author == expected_author @@ -47,11 +48,11 @@ } actual_author = author(data) - expected_author = { - 'fullname': b'i-am-groot', - 'name': b'i-am-groot', - 'email': b'', - } + expected_author = Person( + fullname=b'i-am-groot', + 
name=b'i-am-groot', + email=b'', + ) assert actual_author == expected_author @@ -63,11 +64,11 @@ } actual_author = author(data) - expected_author = { - 'fullname': b' ', - 'name': b'', - 'email': b'iam@groot.org', - } + expected_author = Person( + fullname=b' ', + name=b'', + email=b'iam@groot.org', + ) assert actual_author == expected_author @@ -80,11 +81,11 @@ actual_author = author(data) - expected_author = { - 'fullname': b"['pierre', 'paul', 'jacques']", - 'name': b"['pierre', 'paul', 'jacques']", - 'email': None, - } + expected_author = Person( + fullname=b"['pierre', 'paul', 'jacques']", + name=b"['pierre', 'paul', 'jacques']", + email=None, + ) assert actual_author == expected_author @@ -97,11 +98,11 @@ actual_author = author(data) - expected_author = { - 'fullname': b'[marie, jeanne] <[marie@some, jeanne@thing]>', - 'name': b'[marie, jeanne]', - 'email': b'[marie@some, jeanne@thing]', - } + expected_author = Person( + fullname=b'[marie, jeanne] <[marie@some, jeanne@thing]>', + name=b'[marie, jeanne]', + email=b'[marie@some, jeanne@thing]', + ) assert actual_author == expected_author @@ -114,11 +115,14 @@ actual_author = author(data) - expected_author = { - 'fullname': b'[marie, jeanne, pierre] <[marie@somewhere.org, jeanne@somewhere.org]>', # noqa - 'name': b'[marie, jeanne, pierre]', - 'email': b'[marie@somewhere.org, jeanne@somewhere.org]', - } + expected_author = Person( + fullname=( + b'[marie, jeanne, pierre] ' + b'<[marie@somewhere.org, jeanne@somewhere.org]>' + ), + name=b'[marie, jeanne, pierre]', + email=b'[marie@somewhere.org, jeanne@somewhere.org]', + ) actual_author == expected_author diff --git a/swh/loader/package/tests/test_common.py b/swh/loader/package/tests/test_common.py --- a/swh/loader/package/tests/test_common.py +++ b/swh/loader/package/tests/test_common.py @@ -6,6 +6,7 @@ import pytest from swh.model.hashutil import hash_to_bytes +from swh.model.model import Snapshot, SnapshotBranch, TargetType from swh.loader.package.tests.common 
import ( decode_target, check_snapshot, check_metadata, check_metadata_paths ) @@ -17,9 +18,6 @@ storage_config = { 'cls': 'pipeline', 'steps': [ - { - 'cls': 'validate', - }, { 'cls': 'memory', } @@ -57,15 +55,15 @@ storage = get_storage(**storage_config) snap_id = '2498dbf535f882bc7f9a18fb16c9ad27fda7bab7' - snapshot = { - 'id': hash_to_bytes(snap_id), - 'branches': { - b'master': { - 'target': hash_to_bytes(hash_hex), - 'target_type': 'revision', - }, + snapshot = Snapshot( + id=hash_to_bytes(snap_id), + branches={ + b'master': SnapshotBranch( + target=hash_to_bytes(hash_hex), + target_type=TargetType.REVISION, + ), }, - } + ) s = storage.snapshot_add([snapshot]) assert s == { @@ -87,15 +85,15 @@ def test_check_snapshot_failure(): storage = get_storage(**storage_config) - snapshot = { - 'id': hash_to_bytes('2498dbf535f882bc7f9a18fb16c9ad27fda7bab7'), - 'branches': { - b'master': { - 'target': hash_to_bytes(hash_hex), - 'target_type': 'revision', - }, + snapshot = Snapshot( + id=hash_to_bytes('2498dbf535f882bc7f9a18fb16c9ad27fda7bab7'), + branches={ + b'master': SnapshotBranch( + target=hash_to_bytes(hash_hex), + target_type=TargetType.REVISION, + ), }, - } + ) s = storage.snapshot_add([snapshot]) assert s == { diff --git a/swh/loader/package/utils.py b/swh/loader/package/utils.py --- a/swh/loader/package/utils.py +++ b/swh/loader/package/utils.py @@ -12,6 +12,8 @@ from typing import Any, Dict, List, Mapping, Optional, Sequence, Tuple from swh.model.hashutil import MultiHash, HASH_BLOCK_SIZE +from swh.model.model import Person + from swh.loader.package import DEFAULT_PARAMS @@ -25,7 +27,11 @@ _author_regexp = r'([^<(]+?)?[ \t]*(?:<([^>(]+?)>)?[ \t]*(?:\(([^)]+?)\)|$)' -_EMPTY_AUTHOR = {'fullname': b'', 'name': None, 'email': None} +EMPTY_AUTHOR = Person( + fullname=b'', + name=None, + email=None, +) def api_info(url: str) -> Dict: @@ -171,7 +177,7 @@ return author -def swh_author(author: Dict[str, str]) -> Dict[str, Optional[bytes]]: +def swh_author(author: 
Dict[str, str]) -> Person: """Transform an author like dict to an expected swh like dict (values are bytes) @@ -187,13 +193,13 @@ fullname = name if not fullname: - r = _EMPTY_AUTHOR + r = EMPTY_AUTHOR else: - r = { - 'fullname': fullname.encode('utf-8') if fullname else None, - 'name': name.encode('utf-8') if name else None, - 'email': email.encode('utf-8') if email else None - } + r = Person( + fullname=fullname.encode('utf-8') if fullname else b'', + name=name.encode('utf-8') if name else None, + email=email.encode('utf-8') if email else None + ) return r