Page Menu | Home | Software Heritage

D2714.id9714.diff
No One | Temporary

D2714.id9714.diff

diff --git a/conftest.py b/conftest.py
--- a/conftest.py
+++ b/conftest.py
@@ -19,7 +19,6 @@
'storage': {
'cls': 'pipeline',
'steps': [
- {'cls': 'validate'},
{'cls': 'retry'},
{'cls': 'filter'},
{'cls': 'buffer'},
diff --git a/requirements-swh.txt b/requirements-swh.txt
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -1,4 +1,4 @@
swh.core >= 0.0.75
-swh.model >= 0.0.54
+swh.model >= 0.0.57
swh.scheduler
swh.storage >= 0.0.163
diff --git a/swh/loader/core/tests/test_converters.py b/swh/loader/core/tests/test_converters.py
--- a/swh/loader/core/tests/test_converters.py
+++ b/swh/loader/core/tests/test_converters.py
@@ -31,8 +31,7 @@
data = b'temp file for testing content storage conversion'
tmpfile = tmpfile_with_content(tmpdir, data)
- obj = from_disk.Content.from_file(path=os.fsdecode(tmpfile),
- save_path=True).get_data()
+ obj = from_disk.Content.from_file(path=os.fsdecode(tmpfile)).get_data()
expected_content = obj.copy()
expected_content['data'] = data
diff --git a/swh/loader/package/archive/loader.py b/swh/loader/package/archive/loader.py
--- a/swh/loader/package/archive/loader.py
+++ b/swh/loader/package/archive/loader.py
@@ -11,15 +11,17 @@
from swh.loader.package.loader import PackageLoader
from swh.loader.package.utils import release_name, artifact_identity
-from swh.model.identifiers import normalize_timestamp
+from swh.model.model import (
+ Sha1Git, Person, TimestampWithTimezone, Revision, RevisionType,
+)
logger = logging.getLogger(__name__)
-SWH_PERSON = {
- 'name': b'Software Heritage',
- 'fullname': b'Software Heritage',
- 'email': b'robot@softwareheritage.org'
-}
+SWH_PERSON = Person(
+ name=b'Software Heritage',
+ fullname=b'Software Heritage',
+ email=b'robot@softwareheritage.org'
+)
REVISION_MESSAGE = b'swh-loader-package: synthetic revision message'
@@ -101,21 +103,24 @@
return rev_id
return None
- def build_revision(self, a_metadata: Mapping[str, Any],
- uncompressed_path: str) -> Dict:
+ def build_revision(
+ self, a_metadata: Mapping[str, Any], uncompressed_path: str,
+ directory: Sha1Git) -> Optional[Revision]:
time = a_metadata['time'] # assume it's a timestamp
if isinstance(time, str): # otherwise, assume it's a parsable date
time = iso8601.parse_date(time)
- normalized_time = normalize_timestamp(time)
- return {
- 'type': 'tar',
- 'message': REVISION_MESSAGE,
- 'date': normalized_time,
- 'author': SWH_PERSON,
- 'committer': SWH_PERSON,
- 'committer_date': normalized_time,
- 'parents': [],
- 'metadata': {
+ normalized_time = TimestampWithTimezone.from_datetime(time)
+ return Revision(
+ type=RevisionType.TAR,
+ message=REVISION_MESSAGE,
+ date=normalized_time,
+ author=SWH_PERSON,
+ committer=SWH_PERSON,
+ committer_date=normalized_time,
+ parents=[],
+ directory=directory,
+ synthetic=True,
+ metadata={
'intrinsic': {},
'extrinsic': {
'provider': self.url,
@@ -123,4 +128,4 @@
'raw': a_metadata,
},
},
- }
+ )
diff --git a/swh/loader/package/cran/loader.py b/swh/loader/package/cran/loader.py
--- a/swh/loader/package/cran/loader.py
+++ b/swh/loader/package/cran/loader.py
@@ -19,7 +19,9 @@
from swh.loader.package.utils import (
release_name, parse_author, swh_author, artifact_identity
)
-from swh.model.identifiers import normalize_timestamp
+from swh.model.model import (
+ TimestampWithTimezone, Sha1Git, Revision, RevisionType,
+)
logger = logging.getLogger(__name__)
@@ -85,21 +87,24 @@
def build_revision(
self, a_metadata: Mapping[str, Any],
- uncompressed_path: str) -> Dict[str, Any]:
+ uncompressed_path: str,
+ directory: Sha1Git) -> Optional[Revision]:
# a_metadata is empty
metadata = extract_intrinsic_metadata(uncompressed_path)
- normalized_date = normalize_timestamp(parse_date(metadata.get('Date')))
+ date = parse_date(metadata.get('Date'))
author = swh_author(parse_author(metadata.get('Maintainer', {})))
version = metadata.get('Version', a_metadata['version'])
- return {
- 'message': version.encode('utf-8'),
- 'type': 'tar',
- 'date': normalized_date,
- 'author': author,
- 'committer': author,
- 'committer_date': normalized_date,
- 'parents': [],
- 'metadata': {
+ return Revision(
+ message=version.encode('utf-8'),
+ type=RevisionType.TAR,
+ date=date,
+ author=author,
+ committer=author,
+ committer_date=date,
+ parents=[],
+ directory=directory,
+ synthetic=True,
+ metadata={
'intrinsic': {
'tool': 'DESCRIPTION',
'raw': metadata,
@@ -110,7 +115,7 @@
'raw': a_metadata,
},
},
- }
+ )
def parse_debian_control(filepath: str) -> Dict[str, Any]:
@@ -159,14 +164,14 @@
return parse_debian_control(description_path)
-def parse_date(date: Optional[str]) -> Optional[datetime.datetime]:
+def parse_date(date: Optional[str]) -> Optional[TimestampWithTimezone]:
"""Parse a date into a datetime
"""
assert not date or isinstance(date, str)
dt: Optional[datetime.datetime] = None
if not date:
- return dt
+ return None
try:
specific_date = DATE_PATTERN.match(date)
if specific_date:
@@ -183,4 +188,7 @@
dt = dt.replace(tzinfo=timezone.utc)
except Exception as e:
logger.warning('Fail to parse date %s. Reason: %s', (date, e))
- return dt
+ if dt:
+ return TimestampWithTimezone.from_datetime(dt)
+ else:
+ return None
diff --git a/swh/loader/package/cran/tests/test_cran.py b/swh/loader/package/cran/tests/test_cran.py
--- a/swh/loader/package/cran/tests/test_cran.py
+++ b/swh/loader/package/cran/tests/test_cran.py
@@ -16,6 +16,7 @@
parse_debian_control
)
from swh.core.tarball import uncompress
+from swh.model.model import TimestampWithTimezone
from swh.loader.package.tests.common import (
check_snapshot, get_stats
@@ -102,8 +103,12 @@
("Check NEWS file for changes: news(package='simSummary')", None)
]
for date, expected_date in data:
- actual_date = parse_date(date)
- assert actual_date == expected_date, f'input date to parse {date}'
+ actual_tstz = parse_date(date)
+ if expected_date is None:
+ assert actual_tstz is None, date
+ else:
+ expected_tstz = TimestampWithTimezone.from_datetime(expected_date)
+ assert actual_tstz == expected_tstz, date
@pytest.mark.fs
diff --git a/swh/loader/package/debian/loader.py b/swh/loader/package/debian/loader.py
--- a/swh/loader/package/debian/loader.py
+++ b/swh/loader/package/debian/loader.py
@@ -4,21 +4,23 @@
# See top-level LICENSE file for more information
import email.utils
-import iso8601
import logging
+from os import path
import re
import subprocess
from dateutil.parser import parse as parse_date
from debian.changelog import Changelog
from debian.deb822 import Dsc
-from os import path
from typing import (
- Any, Dict, Generator, List, Mapping, Optional, Sequence, Tuple
+ Any, Generator, List, Mapping, Optional, Sequence, Tuple
)
from swh.loader.package.loader import PackageLoader
from swh.loader.package.utils import download, release_name
+from swh.model.model import (
+ Sha1Git, Person, Revision, RevisionType, TimestampWithTimezone
+)
logger = logging.getLogger(__name__)
@@ -119,8 +121,9 @@
logger.debug('dl_artifacts: %s', dl_artifacts)
return extract_package(dl_artifacts, dest=dest)
- def build_revision(self, a_metadata: Mapping[str, Any],
- uncompressed_path: str) -> Dict:
+ def build_revision(
+ self, a_metadata: Mapping[str, Any], uncompressed_path: str,
+ directory: Sha1Git) -> Optional[Revision]:
dsc_url, dsc_name = dsc_information(a_metadata)
if not dsc_name:
raise ValueError(
@@ -135,19 +138,22 @@
msg = 'Synthetic revision for Debian source package %s version %s' % (
a_metadata['name'], a_metadata['version'])
- date = iso8601.parse_date(i_metadata['changelog']['date'])
+ date = TimestampWithTimezone.from_iso8601(
+ i_metadata['changelog']['date'])
author = prepare_person(i_metadata['changelog']['person'])
# inspired from swh.loader.debian.converters.package_metadata_to_revision # noqa
- return {
- 'type': 'dsc',
- 'message': msg.encode('utf-8'),
- 'author': author,
- 'date': date,
- 'committer': author,
- 'committer_date': date,
- 'parents': [],
- 'metadata': {
+ return Revision(
+ type=RevisionType.DSC,
+ message=msg.encode('utf-8'),
+ author=author,
+ date=date,
+ committer=author,
+ committer_date=date,
+ parents=[],
+ directory=directory,
+ synthetic=True,
+ metadata={
'intrinsic': {
'tool': 'dsc',
'raw': i_metadata,
@@ -157,8 +163,8 @@
'when': self.visit_date.isoformat(),
'raw': a_metadata,
},
- }
- }
+ },
+ )
def resolve_revision_from(known_package_artifacts: Mapping,
@@ -223,20 +229,20 @@
return ret
-def prepare_person(person: Mapping[str, str]) -> Mapping[str, bytes]:
+def prepare_person(person: Mapping[str, str]) -> Person:
"""Prepare person for swh serialization...
Args:
A person dict
Returns:
- A person dict ready for storage
+ A person ready for storage
"""
- ret = {}
- for key, value in person.items():
- ret[key] = value.encode('utf-8')
- return ret
+ return Person.from_dict({
+ key: value.encode('utf-8')
+ for (key, value) in person.items()
+ })
def download_package(
diff --git a/swh/loader/package/debian/tests/test_debian.py b/swh/loader/package/debian/tests/test_debian.py
--- a/swh/loader/package/debian/tests/test_debian.py
+++ b/swh/loader/package/debian/tests/test_debian.py
@@ -17,6 +17,8 @@
from swh.loader.package.tests.common import check_snapshot, get_stats
from swh.loader.package.debian.loader import resolve_revision_from
+from swh.model.model import Person
+
logger = logging.getLogger(__name__)
@@ -224,11 +226,11 @@
'fullname': 'Someone Name <someone@orga.org>',
})
- assert actual_author == {
- 'name': b'Someone Name',
- 'email': b'someone@orga.org',
- 'fullname': b'Someone Name <someone@orga.org>',
- }
+ assert actual_author == Person(
+ name=b'Someone Name',
+ email=b'someone@orga.org',
+ fullname=b'Someone Name <someone@orga.org>',
+ )
def test_download_package(datadir, tmpdir, requests_mock_datadir):
diff --git a/swh/loader/package/deposit/loader.py b/swh/loader/package/deposit/loader.py
--- a/swh/loader/package/deposit/loader.py
+++ b/swh/loader/package/deposit/loader.py
@@ -12,6 +12,9 @@
)
from swh.model.hashutil import hash_to_hex, hash_to_bytes
+from swh.model.model import (
+ Person, Revision, RevisionType, TimestampWithTimezone, Sha1Git,
+)
from swh.loader.package.loader import PackageLoader
from swh.loader.package.utils import download
@@ -69,26 +72,35 @@
self.deposit_id, tmpdir, p_info['filename'])]
def build_revision(
- self, a_metadata: Dict, uncompressed_path: str) -> Dict:
- revision = a_metadata.pop('revision')
- metadata = {
+ self, a_metadata: Dict, uncompressed_path: str,
+ directory: Sha1Git) -> Optional[Revision]:
+ revision_data = a_metadata.pop('revision')
+
+ # FIXME: the deposit no longer needs to build the revision
+
+ date = TimestampWithTimezone.from_dict(revision_data['date'])
+ metadata = revision_data['metadata']
+ metadata.update({
'extrinsic': {
'provider': self.client.metadata_url(self.deposit_id),
'when': self.visit_date.isoformat(),
'raw': a_metadata,
},
- }
-
- # FIXME: the deposit no longer needs to build the revision
- revision['metadata'].update(metadata)
- revision['author'] = parse_author(revision['author'])
- revision['committer'] = parse_author(revision['committer'])
- revision['message'] = revision['message'].encode('utf-8')
- revision['type'] = 'tar'
- parents = revision.get('parents', [])
- revision['parents'] = [hash_to_bytes(p) for p in parents]
-
- return revision
+ })
+
+ return Revision(
+ type=RevisionType.TAR,
+ message=revision_data['message'].encode('utf-8'),
+ author=parse_author(revision_data['author']),
+ date=date,
+ committer=parse_author(revision_data['committer']),
+ committer_date=date,
+ parents=[hash_to_bytes(p)
+ for p in revision_data.get('parents', [])],
+ directory=directory,
+ synthetic=True,
+ metadata=metadata,
+ )
def load(self) -> Dict:
# Usual loading
@@ -153,15 +165,15 @@
return r
-def parse_author(author):
+def parse_author(author) -> Person:
"""See prior fixme
"""
- return {
- 'fullname': author['fullname'].encode('utf-8'),
- 'name': author['name'].encode('utf-8'),
- 'email': author['email'].encode('utf-8'),
- }
+ return Person(
+ fullname=author['fullname'].encode('utf-8'),
+ name=author['name'].encode('utf-8'),
+ email=author['email'].encode('utf-8'),
+ )
class ApiClient:
diff --git a/swh/loader/package/loader.py b/swh/loader/package/loader.py
--- a/swh/loader/package/loader.py
+++ b/swh/loader/package/loader.py
@@ -12,18 +12,22 @@
Any, Dict, Generator, List, Mapping, Optional, Sequence, Tuple
)
+import attr
+
from swh.core.tarball import uncompress
from swh.core.config import SWHConfig
-from swh.model.from_disk import Directory
+from swh.model import from_disk
from swh.model.hashutil import hash_to_hex
-from swh.model.identifiers import (
- revision_identifier, snapshot_identifier, identifier_to_bytes
+from swh.model.model import (
+ BaseModel, Sha1Git,
+ Content, SkippedContent, Directory,
+ Revision,
+ TargetType, Snapshot,
+ Origin
)
-from swh.model.model import Sha1Git
from swh.storage import get_storage
from swh.storage.algos.snapshot import snapshot_get_all_branches
-from swh.loader.core.converters import prepare_contents
from swh.loader.package.utils import download
@@ -96,8 +100,9 @@
yield from {}
def build_revision(
- self, a_metadata: Dict, uncompressed_path: str) -> Dict:
- """Build the revision dict from the archive metadata (extrinsic
+ self, a_metadata: Dict, uncompressed_path: str,
+ directory: Sha1Git) -> Optional[Revision]:
+ """Build the revision from the archive metadata (extrinsic
artifact metadata) and the intrinsic metadata.
Args:
@@ -108,7 +113,7 @@
SWH data dict
"""
- return {}
+ raise NotImplementedError('build_revision')
def get_default_version(self) -> str:
"""Retrieve the latest release version if any.
@@ -119,19 +124,20 @@
"""
return ''
- def last_snapshot(self) -> Optional[Dict]:
+ def last_snapshot(self) -> Optional[Snapshot]:
"""Retrieve the last snapshot
"""
snapshot = None
visit = self.storage.origin_visit_get_latest(
self.url, require_snapshot=True)
- if visit:
- snapshot = snapshot_get_all_branches(
- self.storage, visit['snapshot'])
+ if visit and visit.get('snapshot'):
+ snapshot = Snapshot.from_dict(snapshot_get_all_branches(
+ self.storage, visit['snapshot']))
return snapshot
- def known_artifacts(self, snapshot: Optional[Dict]) -> Dict:
+ def known_artifacts(
+ self, snapshot: Optional[Snapshot]) -> Dict[Sha1Git, BaseModel]:
"""Retrieve the known releases/artifact for the origin.
Args
@@ -141,13 +147,13 @@
Dict of keys revision id (bytes), values a metadata Dict.
"""
- if not snapshot or 'branches' not in snapshot:
+ if not snapshot:
return {}
# retrieve only revisions (e.g the alias we do not want here)
- revs = [rev['target']
- for rev in snapshot['branches'].values()
- if rev and rev['target_type'] == 'revision']
+ revs = [rev.target
+ for rev in snapshot.branches.values()
+ if rev and rev.target_type == TargetType.REVISION]
known_revisions = self.storage.revision_get(revs)
ret = {}
@@ -263,16 +269,15 @@
snapshot = None
# Prepare origin and origin_visit
- origin = {'url': self.url}
+ origin = Origin(url=self.url)
try:
self.storage.origin_add_one(origin)
visit_id = self.storage.origin_visit_add(
origin=self.url,
date=self.visit_date,
type=self.visit_type)['visit']
- except Exception as e:
- logger.error(
- 'Failed to create origin/origin_visit. Reason: %s', e)
+ except Exception:
+ logger.exception('Failed to create origin/origin_visit:')
return {'status': 'failed'}
try:
@@ -327,13 +332,12 @@
'target': target,
}
- snapshot = {
+ snapshot_data = {
'branches': branches
}
- logger.debug('snapshot: %s', snapshot)
+ logger.debug('snapshot: %s', snapshot_data)
- snapshot['id'] = identifier_to_bytes(
- snapshot_identifier(snapshot))
+ snapshot = Snapshot.from_dict(snapshot_data)
logger.debug('snapshot: %s', snapshot)
self.storage.snapshot_add([snapshot])
@@ -346,12 +350,12 @@
finally:
self.storage.origin_visit_update(
origin=self.url, visit_id=visit_id, status=status_visit,
- snapshot=snapshot and snapshot['id'])
+ snapshot=snapshot and snapshot.id)
result = {
'status': status_load,
} # type: Dict[str, Any]
if snapshot:
- result['snapshot_id'] = hash_to_hex(snapshot['id'])
+ result['snapshot_id'] = hash_to_hex(snapshot.id)
return result
def _load_revision(self, p_info, origin) -> Tuple[Optional[Sha1Git], bool]:
@@ -373,51 +377,56 @@
uncompressed_path = self.uncompress(dl_artifacts, dest=tmpdir)
logger.debug('uncompressed_path: %s', uncompressed_path)
- directory = Directory.from_disk(
+ directory = from_disk.Directory.from_disk(
path=uncompressed_path.encode('utf-8'),
- data=True) # noqa
- # FIXME: Try not to load the full raw content in
- # memory
- objects = directory.collect()
-
- contents, skipped_contents = prepare_contents(
- objects.get('content', {}).values(),
- max_content_size=self.max_content_size,
- origin_url=origin['url'])
- self.storage.skipped_content_add(skipped_contents)
+ max_content_length=self.max_content_size)
+
+ contents: List[Content] = []
+ skipped_contents: List[SkippedContent] = []
+ directories: List[Directory] = []
+
+ for obj in directory.iter_tree():
+ obj = obj.to_model()
+ if isinstance(obj, Content):
+ # FIXME: read the data from disk later (when the
+ # storage buffer is flushed).
+ obj = obj.with_data()
+ contents.append(obj)
+ elif isinstance(obj, SkippedContent):
+ skipped_contents.append(obj)
+ elif isinstance(obj, Directory):
+ directories.append(obj)
+ else:
+ raise TypeError(
+ f'Unexpected content type from disk: {obj}')
+
logger.debug('Number of skipped contents: %s',
len(skipped_contents))
- self.storage.content_add(contents)
+ self.storage.skipped_content_add(skipped_contents)
logger.debug('Number of contents: %s', len(contents))
+ self.storage.content_add(contents)
- directories = list(
- objects.get('directory', {}).values())
logger.debug('Number of directories: %s', len(directories))
self.storage.directory_add(directories)
# FIXME: This should be release. cf. D409
- revision = self.build_revision(p_info['raw'], uncompressed_path)
+ revision = self.build_revision(
+ p_info['raw'], uncompressed_path, directory=directory.hash)
if not revision:
# Some artifacts are missing intrinsic metadata
# skipping those
return (None, True)
- revision.update({
- 'synthetic': True,
- 'directory': directory.hash,
- })
-
- revision['metadata'].update({
+ metadata = revision.metadata or {}
+ metadata.update({
'original_artifact': [
hashes for _, hashes in dl_artifacts
],
})
-
- revision['id'] = identifier_to_bytes(
- revision_identifier(revision))
+ revision = attr.evolve(revision, metadata=metadata)
logger.debug('Revision: %s', revision)
self.storage.revision_add([revision])
- return (revision['id'], True)
+ return (revision.id, True)
diff --git a/swh/loader/package/npm/loader.py b/swh/loader/package/npm/loader.py
--- a/swh/loader/package/npm/loader.py
+++ b/swh/loader/package/npm/loader.py
@@ -10,11 +10,14 @@
from codecs import BOM_UTF8
from typing import Any, Dict, Generator, Mapping, Sequence, Tuple, Optional
+import attr
import chardet
-import iso8601
from urllib.parse import quote
-from swh.model.identifiers import normalize_timestamp
+from swh.model.model import (
+ Person, RevisionType, Revision, TimestampWithTimezone, Sha1Git,
+)
+
from swh.loader.package.loader import PackageLoader
from swh.loader.package.utils import (
api_info, release_name, parse_author, swh_author
@@ -75,10 +78,11 @@
return artifact_to_revision_id(known_artifacts, artifact_metadata)
def build_revision(
- self, a_metadata: Dict, uncompressed_path: str) -> Dict:
+ self, a_metadata: Dict, uncompressed_path: str,
+ directory: Sha1Git) -> Optional[Revision]:
i_metadata = extract_intrinsic_metadata(uncompressed_path)
if not i_metadata:
- return {}
+ return None
# from intrinsic metadata
author = extract_npm_package_author(i_metadata)
message = i_metadata['version'].encode('ascii')
@@ -101,18 +105,23 @@
(self.url, artifact_name)
)
- date = iso8601.parse_date(date)
- date = normalize_timestamp(int(date.timestamp()))
-
- return {
- 'type': 'tar',
- 'message': message,
- 'author': author,
- 'date': date,
- 'committer': author,
- 'committer_date': date,
- 'parents': [],
- 'metadata': {
+ date = TimestampWithTimezone.from_iso8601(date)
+
+ # FIXME: this is to remain bug-compatible with earlier versions:
+ date = attr.evolve(date, timestamp=attr.evolve(
+ date.timestamp, microseconds=0))
+
+ r = Revision(
+ type=RevisionType.TAR,
+ message=message,
+ author=author,
+ date=date,
+ committer=author,
+ committer_date=date,
+ parents=[],
+ directory=directory,
+ synthetic=True,
+ metadata={
'intrinsic': {
'tool': 'package.json',
'raw': i_metadata,
@@ -123,7 +132,8 @@
'raw': a_metadata,
},
},
- }
+ )
+ return r
def artifact_to_revision_id(
@@ -170,7 +180,7 @@
return None
-def extract_npm_package_author(package_json):
+def extract_npm_package_author(package_json) -> Person:
"""
Extract package author from a ``package.json`` file content and
return it in swh format.
@@ -180,10 +190,7 @@
``package.json`` file
Returns:
- dict: A dict with the following keys:
- * fullname
- * name
- * email
+ Person
"""
diff --git a/swh/loader/package/npm/tests/test_npm.py b/swh/loader/package/npm/tests/test_npm.py
--- a/swh/loader/package/npm/tests/test_npm.py
+++ b/swh/loader/package/npm/tests/test_npm.py
@@ -8,6 +8,7 @@
import pytest
from swh.model.hashutil import hash_to_bytes
+from swh.model.model import Person
from swh.loader.package.npm.loader import (
NpmLoader, extract_npm_package_author,
@@ -26,19 +27,19 @@
package_metadata = json.load(json_file)
extract_npm_package_author(package_metadata['versions']['0.0.2']) == \
- {
- 'fullname': b'mooz <stillpedant@gmail.com>',
- 'name': b'mooz',
- 'email': b'stillpedant@gmail.com'
- }
+ Person(
+ fullname=b'mooz <stillpedant@gmail.com>',
+ name=b'mooz',
+ email=b'stillpedant@gmail.com'
+ )
assert (
extract_npm_package_author(package_metadata['versions']['0.0.3']) ==
- {
- 'fullname': b'Masafumi Oyamada <stillpedant@gmail.com>',
- 'name': b'Masafumi Oyamada',
- 'email': b'stillpedant@gmail.com'
- }
+ Person(
+ fullname=b'Masafumi Oyamada <stillpedant@gmail.com>',
+ name=b'Masafumi Oyamada',
+ email=b'stillpedant@gmail.com'
+ )
)
package_json = json.loads('''
@@ -67,11 +68,11 @@
}''') # noqa
assert extract_npm_package_author(package_json) == \
- {
- 'fullname': b'Yauheni Pakala <evgeniy.pakalo@gmail.com>',
- 'name': b'Yauheni Pakala',
- 'email': b'evgeniy.pakalo@gmail.com'
- }
+ Person(
+ fullname=b'Yauheni Pakala <evgeniy.pakalo@gmail.com>',
+ name=b'Yauheni Pakala',
+ email=b'evgeniy.pakalo@gmail.com'
+ )
package_json = json.loads('''
{
@@ -106,11 +107,11 @@
}''')
assert extract_npm_package_author(package_json) == \
- {
- 'fullname': b'Shawn Walsh',
- 'name': b'Shawn Walsh',
- 'email': None
- }
+ Person(
+ fullname=b'Shawn Walsh',
+ name=b'Shawn Walsh',
+ email=None
+ )
package_json = json.loads('''
{
@@ -129,11 +130,11 @@
}''')
assert extract_npm_package_author(package_json) == \
- {
- 'fullname': b'fengmk2 <fengmk2@gmail.com>',
- 'name': b'fengmk2',
- 'email': b'fengmk2@gmail.com'
- }
+ Person(
+ fullname=b'fengmk2 <fengmk2@gmail.com>',
+ name=b'fengmk2',
+ email=b'fengmk2@gmail.com'
+ )
package_json = json.loads('''
{
@@ -153,11 +154,11 @@
}''')
assert extract_npm_package_author(package_json) == \
- {
- 'fullname': b'xiaohuoni <448627663@qq.com>',
- 'name': b'xiaohuoni',
- 'email': b'448627663@qq.com'
- }
+ Person(
+ fullname=b'xiaohuoni <448627663@qq.com>',
+ name=b'xiaohuoni',
+ email=b'448627663@qq.com'
+ )
def normalize_hashes(hashes):
diff --git a/swh/loader/package/pypi/loader.py b/swh/loader/package/pypi/loader.py
--- a/swh/loader/package/pypi/loader.py
+++ b/swh/loader/package/pypi/loader.py
@@ -10,11 +10,12 @@
from urllib.parse import urlparse
from pkginfo import UnpackedSDist
-import iso8601
+from swh.model.model import (
+ Person, Sha1Git, TimestampWithTimezone, Revision, RevisionType
+)
-from swh.model.identifiers import normalize_timestamp
from swh.loader.package.loader import PackageLoader
-from swh.loader.package.utils import api_info, release_name
+from swh.loader.package.utils import api_info, release_name, EMPTY_AUTHOR
logger = logging.getLogger(__name__)
@@ -72,10 +73,11 @@
return artifact_to_revision_id(known_artifacts, artifact_metadata)
def build_revision(
- self, a_metadata: Dict, uncompressed_path: str) -> Dict:
+ self, a_metadata: Dict, uncompressed_path: str,
+ directory: Sha1Git) -> Optional[Revision]:
i_metadata = extract_intrinsic_metadata(uncompressed_path)
if not i_metadata:
- return {}
+ return None
# from intrinsic metadata
name = i_metadata['version']
@@ -84,18 +86,19 @@
# from extrinsic metadata
message = a_metadata.get('comment_text', '')
message = '%s: %s' % (name, message) if message else name
- date = normalize_timestamp(
- int(iso8601.parse_date(a_metadata['upload_time']).timestamp()))
-
- return {
- 'type': 'tar',
- 'message': message.encode('utf-8'),
- 'author': _author,
- 'date': date,
- 'committer': _author,
- 'committer_date': date,
- 'parents': [],
- 'metadata': {
+ date = TimestampWithTimezone.from_iso8601(a_metadata['upload_time'])
+
+ return Revision(
+ type=RevisionType.TAR,
+ message=message.encode('utf-8'),
+ author=_author,
+ date=date,
+ committer=_author,
+ committer_date=date,
+ parents=[],
+ directory=directory,
+ synthetic=True,
+ metadata={
'intrinsic': {
'tool': 'PKG-INFO',
'raw': i_metadata,
@@ -106,7 +109,7 @@
'raw': a_metadata,
},
}
- }
+ )
def artifact_to_revision_id(
@@ -210,7 +213,7 @@
return raw
-def author(data: Dict) -> Dict:
+def author(data: Dict) -> Person:
"""Given a dict of project/release artifact information (coming from
PyPI), returns an author subset.
@@ -232,7 +235,7 @@
fullname = name
if not fullname:
- return {'fullname': b'', 'name': None, 'email': None}
+ return EMPTY_AUTHOR
if name is not None:
name = name.encode('utf-8')
@@ -240,8 +243,8 @@
if email is not None:
email = email.encode('utf-8')
- return {
- 'fullname': fullname.encode('utf-8'),
- 'name': name,
- 'email': email
- }
+ return Person(
+ fullname=fullname.encode('utf-8'),
+ name=name,
+ email=email
+ )
diff --git a/swh/loader/package/pypi/tests/test_pypi.py b/swh/loader/package/pypi/tests/test_pypi.py
--- a/swh/loader/package/pypi/tests/test_pypi.py
+++ b/swh/loader/package/pypi/tests/test_pypi.py
@@ -14,6 +14,7 @@
from swh.core.tarball import uncompress
from swh.core.pytest_plugin import requests_mock_datadir_factory
from swh.model.hashutil import hash_to_bytes
+from swh.model.model import Person
from swh.loader.package.pypi.loader import (
PyPILoader, pypi_api_url, author, extract_intrinsic_metadata,
@@ -31,11 +32,11 @@
}
actual_author = author(data)
- expected_author = {
- 'fullname': b'i-am-groot <iam@groot.org>',
- 'name': b'i-am-groot',
- 'email': b'iam@groot.org',
- }
+ expected_author = Person(
+ fullname=b'i-am-groot <iam@groot.org>',
+ name=b'i-am-groot',
+ email=b'iam@groot.org',
+ )
assert actual_author == expected_author
@@ -47,11 +48,11 @@
}
actual_author = author(data)
- expected_author = {
- 'fullname': b'i-am-groot',
- 'name': b'i-am-groot',
- 'email': b'',
- }
+ expected_author = Person(
+ fullname=b'i-am-groot',
+ name=b'i-am-groot',
+ email=b'',
+ )
assert actual_author == expected_author
@@ -63,11 +64,11 @@
}
actual_author = author(data)
- expected_author = {
- 'fullname': b' <iam@groot.org>',
- 'name': b'',
- 'email': b'iam@groot.org',
- }
+ expected_author = Person(
+ fullname=b' <iam@groot.org>',
+ name=b'',
+ email=b'iam@groot.org',
+ )
assert actual_author == expected_author
@@ -80,11 +81,11 @@
actual_author = author(data)
- expected_author = {
- 'fullname': b"['pierre', 'paul', 'jacques']",
- 'name': b"['pierre', 'paul', 'jacques']",
- 'email': None,
- }
+ expected_author = Person(
+ fullname=b"['pierre', 'paul', 'jacques']",
+ name=b"['pierre', 'paul', 'jacques']",
+ email=None,
+ )
assert actual_author == expected_author
@@ -97,11 +98,11 @@
actual_author = author(data)
- expected_author = {
- 'fullname': b'[marie, jeanne] <[marie@some, jeanne@thing]>',
- 'name': b'[marie, jeanne]',
- 'email': b'[marie@some, jeanne@thing]',
- }
+ expected_author = Person(
+ fullname=b'[marie, jeanne] <[marie@some, jeanne@thing]>',
+ name=b'[marie, jeanne]',
+ email=b'[marie@some, jeanne@thing]',
+ )
assert actual_author == expected_author
@@ -114,11 +115,14 @@
actual_author = author(data)
- expected_author = {
- 'fullname': b'[marie, jeanne, pierre] <[marie@somewhere.org, jeanne@somewhere.org]>', # noqa
- 'name': b'[marie, jeanne, pierre]',
- 'email': b'[marie@somewhere.org, jeanne@somewhere.org]',
- }
+ expected_author = Person(
+ fullname=(
+ b'[marie, jeanne, pierre] '
+ b'<[marie@somewhere.org, jeanne@somewhere.org]>'
+ ),
+ name=b'[marie, jeanne, pierre]',
+ email=b'[marie@somewhere.org, jeanne@somewhere.org]',
+ )
actual_author == expected_author
diff --git a/swh/loader/package/tests/test_common.py b/swh/loader/package/tests/test_common.py
--- a/swh/loader/package/tests/test_common.py
+++ b/swh/loader/package/tests/test_common.py
@@ -6,6 +6,7 @@
import pytest
from swh.model.hashutil import hash_to_bytes
+from swh.model.model import Snapshot, SnapshotBranch, TargetType
from swh.loader.package.tests.common import (
decode_target, check_snapshot, check_metadata, check_metadata_paths
)
@@ -17,9 +18,6 @@
storage_config = {
'cls': 'pipeline',
'steps': [
- {
- 'cls': 'validate',
- },
{
'cls': 'memory',
}
@@ -57,15 +55,15 @@
storage = get_storage(**storage_config)
snap_id = '2498dbf535f882bc7f9a18fb16c9ad27fda7bab7'
- snapshot = {
- 'id': hash_to_bytes(snap_id),
- 'branches': {
- b'master': {
- 'target': hash_to_bytes(hash_hex),
- 'target_type': 'revision',
- },
+ snapshot = Snapshot(
+ id=hash_to_bytes(snap_id),
+ branches={
+ b'master': SnapshotBranch(
+ target=hash_to_bytes(hash_hex),
+ target_type=TargetType.REVISION,
+ ),
},
- }
+ )
s = storage.snapshot_add([snapshot])
assert s == {
@@ -87,15 +85,15 @@
def test_check_snapshot_failure():
storage = get_storage(**storage_config)
- snapshot = {
- 'id': hash_to_bytes('2498dbf535f882bc7f9a18fb16c9ad27fda7bab7'),
- 'branches': {
- b'master': {
- 'target': hash_to_bytes(hash_hex),
- 'target_type': 'revision',
- },
+ snapshot = Snapshot(
+ id=hash_to_bytes('2498dbf535f882bc7f9a18fb16c9ad27fda7bab7'),
+ branches={
+ b'master': SnapshotBranch(
+ target=hash_to_bytes(hash_hex),
+ target_type=TargetType.REVISION,
+ ),
},
- }
+ )
s = storage.snapshot_add([snapshot])
assert s == {
diff --git a/swh/loader/package/utils.py b/swh/loader/package/utils.py
--- a/swh/loader/package/utils.py
+++ b/swh/loader/package/utils.py
@@ -12,6 +12,8 @@
from typing import Any, Dict, List, Mapping, Optional, Sequence, Tuple
from swh.model.hashutil import MultiHash, HASH_BLOCK_SIZE
+from swh.model.model import Person
+
from swh.loader.package import DEFAULT_PARAMS
@@ -25,7 +27,11 @@
_author_regexp = r'([^<(]+?)?[ \t]*(?:<([^>(]+?)>)?[ \t]*(?:\(([^)]+?)\)|$)'
-_EMPTY_AUTHOR = {'fullname': b'', 'name': None, 'email': None}
+EMPTY_AUTHOR = Person(
+ fullname=b'',
+ name=None,
+ email=None,
+)
def api_info(url: str) -> Dict:
@@ -171,7 +177,7 @@
return author
-def swh_author(author: Dict[str, str]) -> Dict[str, Optional[bytes]]:
+def swh_author(author: Dict[str, str]) -> Person:
"""Transform an author like dict to an expected swh like dict (values are
bytes)
@@ -187,13 +193,13 @@
fullname = name
if not fullname:
- r = _EMPTY_AUTHOR
+ r = EMPTY_AUTHOR
else:
- r = {
- 'fullname': fullname.encode('utf-8') if fullname else None,
- 'name': name.encode('utf-8') if name else None,
- 'email': email.encode('utf-8') if email else None
- }
+ r = Person(
+ fullname=fullname.encode('utf-8') if fullname else b'',
+ name=name.encode('utf-8') if name else None,
+ email=email.encode('utf-8') if email else None
+ )
return r

File Metadata

Mime Type
text/plain
Expires
Jan 30 2025, 11:26 AM (6 w, 2 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3217438

Event Timeline