Page MenuHomeSoftware Heritage

converters.py
No OneTemporary

converters.py

# Copyright (C) 2015-2017 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
"""Convert dulwich objects to dictionaries suitable for swh.storage"""
from swh.model import hashutil
# Hash algorithms computed from the raw blob data by
# dulwich_blob_to_content_id. 'sha1_git' is left out because that function
# takes it directly from the dulwich object (blob.sha()) instead.
HASH_ALGORITHMS = hashutil.DEFAULT_ALGORITHMS - {'sha1_git'}
def origin_url_to_origin(origin_url):
    """Wrap a repository URL as a git origin suitable for swh.storage.

    Args:
        origin_url: URL of the git repository.

    Returns:
        A dict whose 'type' is 'git' and whose 'url' is origin_url.
    """
    origin = dict(type='git', url=origin_url)
    return origin
def dulwich_blob_to_content_id(blob):
    """Compute the Software Heritage content identifiers of a dulwich blob.

    Args:
        blob: a dulwich object; anything whose type_name is not b'blob'
            is ignored.

    Returns:
        None when the object is not a blob. Otherwise a dict holding
        'sha1_git' (the digest of the blob's own sha), 'length' (the raw
        length of the blob) and the result of hashing the raw data with
        HASH_ALGORITHMS.
    """
    if blob.type_name != b'blob':
        return None

    length = blob.raw_length()
    content_id = {
        'sha1_git': blob.sha().digest(),
        'length': length,
    }

    # The remaining hashes are computed over the raw blob contents.
    raw = blob.as_raw_string()
    content_id.update(hashutil.hash_data(raw, HASH_ALGORITHMS))

    return content_id
def dulwich_blob_to_content(blob, log=None, max_content_size=None,
                            origin_id=None):
    """Convert a dulwich blob to a Software Heritage content.

    Args:
        blob: a dulwich object; anything whose type_name is not b'blob'
            is ignored.
        log: optional logger, used to report skipped contents.
        max_content_size: optional size limit; a falsy value disables it.
        origin_id: value recorded under 'origin' for skipped contents.

    Returns:
        None when the object is not a blob. Otherwise the identifiers
        from dulwich_blob_to_content_id, completed either with the raw
        'data' and a 'visible' status, or -- when the blob is larger than
        max_content_size -- with an 'absent' status, a 'reason' and the
        'origin', and no data.
    """
    if blob.type_name != b'blob':
        return None

    content = dulwich_blob_to_content_id(blob)
    length = content['length']

    # Common case: no limit configured, or the blob fits within it.
    if not (max_content_size and length > max_content_size):
        content['data'] = blob.as_raw_string()
        content['status'] = 'visible'
        return content

    # Oversized blob: keep only its identifiers and flag it as absent.
    hex_id = hashutil.hash_to_hex(content['sha1_git'])
    if log:
        log.info(
            'Skipping content %s, too large (%s > %s)' %
            (hex_id, length, max_content_size),
            extra={
                'swh_type': 'loader_git_content_skip',
                'swh_id': hex_id,
                'swh_size': length,
            },
        )
    content['status'] = 'absent'
    content['reason'] = 'Content too large'
    content['origin'] = origin_id
    return content
def dulwich_tree_to_directory(tree, log=None):
    """Convert a dulwich tree to a Software Heritage directory.

    Args:
        tree: a dulwich object; anything whose type_name is not b'tree'
            is ignored.
        log: unused.

    Returns:
        None when the object is not a tree. Otherwise a dict with the
        tree 'id' and its 'entries', one dict per tree item with the keys
        'type', 'perms', 'name' and 'target'.
    """
    if tree.type_name != b'tree':
        return None

    # Entry type by mode; any mode not listed here is treated as a 'file'.
    mode_to_type = {
        0o040000: 'dir',
        0o160000: 'rev',
        0o100644: 'file',
        0o100755: 'file',
        0o120000: 'file',
    }

    directory_id = tree.sha().digest()
    entries = [
        {
            'type': mode_to_type.get(item.mode, 'file'),
            'perms': item.mode,
            'name': item.path,
            # item.sha is the hexadecimal id as ASCII bytes.
            'target': hashutil.hash_to_bytes(item.sha.decode('ascii')),
        }
        for item in tree.iteritems()
    ]

    return {
        'id': directory_id,
        'entries': entries,
    }
def parse_author(name_email):
    """Split a raw ``name <email>`` author line into its components.

    Args:
        name_email: the author line as bytes, or None.

    Returns:
        None when name_email is None. Otherwise a dict with:

        - 'name': the bytes before the first ``<``, minus one trailing
          space if present; None when that part is empty or when the
          line has no ``<`` at all.
        - 'email': the bytes between the first ``<`` and the next ``>``;
          None when either bracket is missing.
        - 'fullname': the input, untouched.
    """
    if name_email is None:
        return None

    name = None
    email = None

    lt_pos = name_email.find(b'<')
    if lt_pos != -1:
        before = name_email[:lt_pos]
        after = name_email[lt_pos + 1:]

        if before:
            # Drop the single space separating the name from the bracket.
            name = before[:-1] if before.endswith(b' ') else before

        gt_pos = after.find(b'>')
        if gt_pos != -1:
            email = after[:gt_pos]

    return {
        'name': name,
        'email': email,
        'fullname': name_email,
    }
def dulwich_tsinfo_to_timestamp(timestamp, timezone, timezone_neg_utc):
    """Build a Software Heritage timestamp dict from dulwich time fields.

    Args:
        timestamp: stored unchanged under 'timestamp'.
        timezone: timezone offset; floor-divided by 60 to give 'offset'
            (presumably seconds converted to minutes).
        timezone_neg_utc: flag that is only meaningful for a zero offset.

    Returns:
        A dict with the keys 'timestamp', 'offset' and 'negative_utc'.
        'negative_utc' is timezone_neg_utc when timezone is 0, else None.
    """
    if timezone == 0:
        negative_utc = timezone_neg_utc
    else:
        negative_utc = None

    return {
        'timestamp': timestamp,
        'offset': timezone // 60,
        'negative_utc': negative_utc,
    }
def dulwich_commit_to_revision(commit, log=None):
    """Convert a dulwich commit to a Software Heritage revision.

    Args:
        commit: a dulwich object; anything whose type_name is not
            b'commit' is ignored.
        log: unused.

    Returns:
        None when the object is not a commit. Otherwise a dict describing
        the revision. Its 'metadata' is None unless the commit carries an
        encoding, mergetags, extra headers or a gpg signature, in which
        case it is a dict with a single 'extra_headers' list of
        [key, value] pairs.
    """
    if commit.type_name != b'commit':
        return None

    revision = {
        'id': commit.sha().digest(),
        'author': parse_author(commit.author),
        'date': dulwich_tsinfo_to_timestamp(
            commit.author_time,
            commit.author_timezone,
            commit._author_timezone_neg_utc,
        ),
        'committer': parse_author(commit.committer),
        'committer_date': dulwich_tsinfo_to_timestamp(
            commit.commit_time,
            commit.commit_timezone,
            commit._commit_timezone_neg_utc,
        ),
        'type': 'git',
        # The tree and parent ids are hexadecimal bytes strings.
        'directory': bytes.fromhex(commit.tree.decode()),
        'message': commit.message,
        'metadata': None,
        'synthetic': False,
        'parents': [
            bytes.fromhex(parent.decode()) for parent in commit.parents
        ],
    }

    # Collect the headers that have no dedicated field in the revision.
    extra_headers = []

    if commit.encoding is not None:
        extra_headers.append(['encoding', commit.encoding])

    for tag in commit.mergetag or ():
        raw_tag = tag.as_raw_string()
        # The raw mergetag must end with a newline, which is stripped.
        assert raw_tag.endswith(b'\n')
        extra_headers.append(['mergetag', raw_tag[:-1]])

    for key, value in commit.extra or ():
        extra_headers.append([key.decode('utf-8'), value])

    if commit.gpgsig:
        extra_headers.append(['gpgsig', commit.gpgsig])

    if extra_headers:
        revision['metadata'] = {
            'extra_headers': extra_headers,
        }

    return revision
# Software Heritage object type for each dulwich object type name.
DULWICH_TYPES = {
    b'blob': 'content',
    b'tree': 'directory',
    b'commit': 'revision',
    b'tag': 'release',
}
def dulwich_tag_to_release(tag, log=None):
    """Convert a dulwich tag to a Software Heritage release.

    Args:
        tag: a dulwich object; anything whose type_name is not b'tag'
            is ignored.
        log: unused.

    Returns:
        None when the object is not a tag. Otherwise a dict describing
        the release. 'author' and 'date' are both None when the tag has
        no tagger.
    """
    if tag.type_name != b'tag':
        return None

    # tag.object is a pair: the target's type and its hexadecimal id.
    target_type, target = tag.object

    release = {
        'id': tag.sha().digest(),
        'name': tag.name,
        'target': bytes.fromhex(target.decode()),
        'target_type': DULWICH_TYPES[target_type.type_name],
        'message': tag._message,
        'metadata': None,
        'synthetic': False,
    }

    author = None
    date = None
    if tag.tagger:
        author = parse_author(tag.tagger)
        date = dulwich_tsinfo_to_timestamp(
            tag.tag_time,
            tag.tag_timezone,
            tag._tag_timezone_neg_utc,
        )

    release['author'] = author
    release['date'] = date

    return release

File Metadata

Mime Type
text/x-python
Expires
Jul 4 2025, 8:51 AM (6 w, 5 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3278705

Event Timeline