Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F9344695
utils.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
6 KB
Subscribers
None
utils.py
View Options
# Copyright (C) 2018-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import
logging
from
typing
import
Any
,
Dict
,
Optional
,
Union
from
xml.etree
import
ElementTree
import
iso8601
import
xmltodict
from
swh.model.exceptions
import
ValidationError
from
swh.model.model
import
TimestampWithTimezone
from
swh.model.swhids
import
ExtendedSWHID
,
ObjectType
,
QualifiedSWHID
# Module-level logger, named after this module per the logging convention.
logger = logging.getLogger(__name__)
# XML namespace map used by the parsing helpers below: short prefix (as used
# in XPath expressions and in keys produced by parse_xml) -> namespace URI.
NAMESPACES = {
    "atom": "http://www.w3.org/2005/Atom",
    "app": "http://www.w3.org/2007/app",
    "dc": "http://purl.org/dc/terms/",
    "codemeta": "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0",
    "sword": "http://purl.org/net/sword/terms/",
    "swh": "https://www.softwareheritage.org/schema/2018/deposit",
    "schema": "http://schema.org/",
}
def parse_xml(stream, encoding="utf-8"):
    """Parse an XML stream into a plain dict, normalizing namespace prefixes.

    Namespace URIs are collapsed back to the short prefixes declared in
    :data:`NAMESPACES`, so keys look like ``"atom:entry"`` or ``"swh:deposit"``.
    If the document root is an ``atom:entry``, its contents are returned
    directly instead of the wrapping dict.

    Args:
        stream: XML document (bytes, string, or file-like) to parse
        encoding: character encoding of the document

    Returns:
        dict: the parsed document
    """
    # xmltodict expects the inverse mapping (uri -> prefix) of NAMESPACES
    uri_to_prefix = {}
    for prefix, uri in NAMESPACES.items():
        uri_to_prefix[uri] = prefix

    parsed = xmltodict.parse(
        stream,
        encoding=encoding,
        namespaces=uri_to_prefix,
        process_namespaces=True,
        dict_constructor=dict,
    )
    # unwrap a top-level atom:entry, if present
    return parsed.get("atom:entry", parsed)
def normalize_date(date):
    """Normalize date fields as expected by swh workers.

    If date is a list, elect arbitrarily the first element of that
    list.

    If date is (then) a string, parse it through
    :func:`iso8601.parse_date` to extract a datetime.

    Then normalize it through
    :class:`swh.model.model.TimestampWithTimezone`.

    Returns:
        The swh date object, a dict with ``timestamp`` and ``offset`` keys
    """
    if isinstance(date, list):
        # multiple values provided: keep the first one, arbitrarily
        date = date[0]
    if isinstance(date, str):
        date = iso8601.parse_date(date)

    tstz = TimestampWithTimezone.from_dict(date)

    return {
        "timestamp": tstz.timestamp.to_dict(),
        "offset": tstz.offset_minutes(),
    }
def compute_metadata_context(swhid_reference: QualifiedSWHID) -> Dict[str, Any]:
    """Given a SWHID object, determine the context as a dict.

    Without qualifiers, the context is just an empty ``origin``. Otherwise
    the origin and path qualifiers are copied over, the ``visit`` qualifier
    (if any) is exposed as ``snapshot``, and the ``anchor`` qualifier (if
    any) is keyed by its own object type name (e.g. ``revision``).
    """
    if not swhid_reference.qualifiers():
        # bare SWHID: no context beyond an empty origin
        return {"origin": None}

    context: Dict[str, Any] = {
        "origin": swhid_reference.origin,
        "path": swhid_reference.path,
    }

    visit_swhid = swhid_reference.visit
    if visit_swhid:
        context["snapshot"] = visit_swhid

    anchor_swhid = swhid_reference.anchor
    if anchor_swhid:
        # key the anchor under its object type name ("revision", "release", ...)
        context[anchor_swhid.object_type.name.lower()] = anchor_swhid

    return context
# Object types accepted for the "anchor" qualifier of a SWHID reference;
# enforced by parse_swh_reference.
ALLOWED_QUALIFIERS_NODE_TYPE = (
    ObjectType.SNAPSHOT,
    ObjectType.REVISION,
    ObjectType.RELEASE,
    ObjectType.DIRECTORY,
)
def parse_swh_metadata_provenance(
    metadata: ElementTree.Element,
) -> Optional[str]:
    """Parse swh metadata-provenance within the metadata document if found, None
    otherwise.

    .. code-block:: xml

        <swh:deposit>
          <swh:metadata-provenance>
            <schema:url>https://url.org/metadata/url</schema:url>
          </swh:metadata-provenance>
        </swh:deposit>

    Args:
        metadata: result of parsing an Atom document with :func:`parse_xml`

    Returns:
        Either the metadata provenance url if any or None otherwise

    """
    url_element = metadata.find(
        "swh:deposit/swh:metadata-provenance/schema:url", namespaces=NAMESPACES
    )
    if url_element is not None:
        # text of the schema:url element (may itself be None if empty)
        return url_element.text
    return None
def parse_swh_reference(
    metadata: ElementTree.Element,
) -> Optional[Union[QualifiedSWHID, str]]:
    """Parse swh reference within the metadata dict (or origin) reference if found,
    None otherwise.

    .. code-block:: xml

        <swh:deposit>
          <swh:reference>
            <swh:origin url='https://github.com/user/repo'/>
          </swh:reference>
        </swh:deposit>

    or:

    .. code-block:: xml

        <swh:deposit>
          <swh:reference>
            <swh:object swhid="swh:1:dir:31b5c8cc985d190b5a7ef4878128ebfdc2358f49;origin=https://hal.archives-ouvertes.fr/hal-01243573;visit=swh:1:snp:4fc1e36fca86b2070204bedd51106014a614f321;anchor=swh:1:rev:9c5de20cfb54682370a398fcc733e829903c8cba;path=/moranegg-AffectationRO-df7f68b/" />
          </swh:reference>
        </swh:deposit>

    Args:
        metadata: result of parsing an Atom document

    Raises:
        ValidationError in case the swhid referenced (if any) is invalid

    Returns:
        Either swhid or origin reference if any. None otherwise.

    """  # noqa
    # an origin reference takes precedence over an object reference
    ref_origin = metadata.find(
        "swh:deposit/swh:reference/swh:origin[@url]", namespaces=NAMESPACES
    )
    if ref_origin is not None:
        return ref_origin.attrib["url"]

    ref_object = metadata.find(
        "swh:deposit/swh:reference/swh:object[@swhid]", namespaces=NAMESPACES
    )
    if ref_object is None:
        return None
    swhid = ref_object.attrib["swhid"]
    if not swhid:
        return None

    # may raise ValidationError on malformed SWHIDs
    swhid_reference = QualifiedSWHID.from_string(swhid)

    if swhid_reference.qualifiers():
        anchor = swhid_reference.anchor
        if anchor:
            if anchor.object_type not in ALLOWED_QUALIFIERS_NODE_TYPE:
                error_msg = (
                    "anchor qualifier should be a core SWHID with type one of "
                    f"{', '.join(t.name.lower() for t in ALLOWED_QUALIFIERS_NODE_TYPE)}"
                )
                raise ValidationError(error_msg)

        visit = swhid_reference.visit
        if visit:
            if visit.object_type != ObjectType.SNAPSHOT:
                raise ValidationError(
                    f"visit qualifier should be a core SWHID with type snp, "
                    f"not {visit.object_type.value}"
                )

        # an anchor pointing at a snapshot is redundant/ambiguous when a
        # visit (which is always a snapshot) is also provided: reject it
        if (
            visit
            and anchor
            and visit.object_type == ObjectType.SNAPSHOT
            and anchor.object_type == ObjectType.SNAPSHOT
        ):
            # logger.warn is a deprecated alias; use logger.warning
            logger.warning(
                "SWHID use of both anchor and visit targeting "
                f"a snapshot: {swhid_reference}"
            )
            raise ValidationError(
                "'anchor=swh:1:snp:' is not supported when 'visit' is also provided."
            )

    return swhid_reference
def extended_swhid_from_qualified(swhid: QualifiedSWHID) -> ExtendedSWHID:
    """Used to get the target of a metadata object from a <swh:reference>,
    as the latter uses a QualifiedSWHID."""
    # qualifiers are serialized as trailing ";key=value" pairs; the core
    # SWHID is everything before the first ";"
    core_swhid, _, _ = str(swhid).partition(";")
    return ExtendedSWHID.from_string(core_swhid)
def to_header_link(link: str, link_name: str) -> str:
    """Build a single header link.

    >>> link_next = to_header_link("next-url", "next")
    >>> link_next
    '<next-url>; rel="next"'
    >>> ','.join([link_next, to_header_link("prev-url", "prev")])
    '<next-url>; rel="next",<prev-url>; rel="prev"'

    """
    # <url>; rel="name", as used in HTTP Link headers
    header_value = f'<{link}>; rel="{link_name}"'
    return header_value
File Metadata
Details
Attached
Mime Type
text/x-python
Expires
Fri, Jul 4, 2:41 PM (3 d, 23 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3434761
Attached To
rDDEP Push deposit
Event Timeline
Log In to Comment