Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F9125772
npm.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
10 KB
Subscribers
None
npm.py
View Options
# Copyright (C) 2018-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import
re
import
urllib.parse
from
rdflib
import
RDF
,
BNode
,
Graph
,
Literal
,
URIRef
from
swh.indexer.codemeta
import
CROSSWALK_TABLE
from
swh.indexer.namespaces
import
SCHEMA
from
.base
import
JsonMapping
,
SingleFileIntrinsicMapping
from
.utils
import
add_list
,
prettyprint_graph
# noqa
# Base IRI of the SPDX license list; NpmMapping.normalize_license appends the
# license identifier found in package.json to it.
SPDX = URIRef("https://spdx.org/licenses/")
class NpmMapping(JsonMapping, SingleFileIntrinsicMapping):
    """
    dedicated class for NPM (package.json) mapping and translation
    """

    name = "npm"
    # Crosswalk column used to translate package.json keys.
    mapping = CROSSWALK_TABLE["NodeJS"]
    filename = b"package.json"
    # NOTE(review): presumably consumed by the JsonMapping base class to
    # translate these keys as plain strings / URIs -- confirm in .base.
    string_fields = ["name", "version", "description", "email"]
    uri_fields = ["homepage"]

    # Templates expanding the "<host>:<user>/<repo>" repository shortcuts;
    # "%s" is replaced with the part following the colon.
    _schema_shortcuts = {
        "github": "git+https://github.com/%s.git",
        "gist": "git+https://gist.github.com/%s.git",
        "gitlab": "git+https://gitlab.com/%s.git",
        # Bitbucket supports both hg and git, and the shortcut does not
        # tell which one to use.
        # 'bitbucket': 'https://bitbucket.org/',
    }

    def normalize_repository(self, d):
        """https://docs.npmjs.com/files/package.json#repository

        Accepts either a ``{"type": ..., "url": ...}`` dict, a full URL, a
        ``<shortcut>:<path>`` string, or a bare ``<user>/<repo>`` string
        (assumed to live on GitHub).  Returns a ``URIRef``, or ``None`` when
        the value has an unsupported type or an unknown shortcut.

        >>> NpmMapping().normalize_repository({
        ...     'type': 'git',
        ...     'url': 'https://example.org/foo.git'
        ... })
        rdflib.term.URIRef('git+https://example.org/foo.git')
        >>> NpmMapping().normalize_repository(
        ...     'gitlab:foo/bar')
        rdflib.term.URIRef('git+https://gitlab.com/foo/bar.git')
        >>> NpmMapping().normalize_repository(
        ...     'foo/bar')
        rdflib.term.URIRef('git+https://github.com/foo/bar.git')
        """
        if (
            isinstance(d, dict)
            and isinstance(d.get("type"), str)
            and isinstance(d.get("url"), str)
        ):
            # Only the two validated keys are read; the rest of the
            # (untrusted) dict is never expanded as keyword arguments.
            url = "{}+{}".format(d["type"], d["url"])
        elif isinstance(d, str):
            if "://" in d:
                # Already a complete URL.
                url = d
            elif ":" in d:
                (schema, rest) = d.split(":", 1)
                if schema in self._schema_shortcuts:
                    url = self._schema_shortcuts[schema] % rest
                else:
                    return None
            else:
                # Bare "user/repo" is shorthand for a GitHub repository.
                url = self._schema_shortcuts["github"] % d
        else:
            return None

        return URIRef(url)

    def normalize_bugs(self, d):
        """https://docs.npmjs.com/files/package.json#bugs

        Returns the bug tracker URL as a ``URIRef``; the ``email`` key of the
        dict form is ignored.  Returns ``None`` for any other input.

        >>> NpmMapping().normalize_bugs({
        ...     'url': 'https://example.org/bugs/',
        ...     'email': 'bugs@example.org'
        ... })
        rdflib.term.URIRef('https://example.org/bugs/')
        >>> NpmMapping().normalize_bugs(
        ...     'https://example.org/bugs/')
        rdflib.term.URIRef('https://example.org/bugs/')
        """
        if isinstance(d, dict) and isinstance(d.get("url"), str):
            return URIRef(d["url"])
        elif isinstance(d, str):
            return URIRef(d)
        else:
            return None

    # Parses the "Name <email> (url)" people shorthand; the email and url
    # parts are both optional.
    _parse_author = re.compile(
        r"^ *" r"(?P<name>.*?)" r"( +<(?P<email>.*)>)?" r"( +\((?P<url>.*)\))?" r" *$"
    )

    def translate_author(self, graph: Graph, root, d):
        r"""https://docs.npmjs.com/files/package.json#people-fields-author-contributors

        Adds a ``schema:Person`` node describing ``d`` to ``graph`` and
        attaches it to ``root`` as a one-element ``schema:author`` list.
        ``d`` is either a dict with optional ``name``/``email``/``url`` keys
        or a ``"Name <email> (url)"`` string.  Nothing is added to the graph
        when ``d`` has another type or cannot be parsed.  URLs without a
        network location, or that ``urllib`` refuses to parse, are dropped.

        >>> root = URIRef("http://example.org/test-software")
        >>> graph = Graph()
        >>> NpmMapping().translate_author(graph, root, {
        ...     'name': 'John Doe',
        ...     'email': 'john.doe@example.org',
        ...     'url': 'https://example.org/~john.doe',
        ... })
        >>> prettyprint_graph(graph, root)
        {
            "@id": ...,
            "http://schema.org/author": {
                "@list": [
                    {
                        "@type": "http://schema.org/Person",
                        "http://schema.org/email": "john.doe@example.org",
                        "http://schema.org/name": "John Doe",
                        "http://schema.org/url": {
                            "@id": "https://example.org/~john.doe"
                        }
                    }
                ]
            }
        }
        >>> graph = Graph()
        >>> NpmMapping().translate_author(graph, root,
        ...     'John Doe <john.doe@example.org> (https://example.org/~john.doe)'
        ... )
        >>> prettyprint_graph(graph, root)
        {
            "@id": ...,
            "http://schema.org/author": {
                "@list": [
                    {
                        "@type": "http://schema.org/Person",
                        "http://schema.org/email": "john.doe@example.org",
                        "http://schema.org/name": "John Doe",
                        "http://schema.org/url": {
                            "@id": "https://example.org/~john.doe"
                        }
                    }
                ]
            }
        }
        >>> graph = Graph()
        >>> NpmMapping().translate_author(graph, root, {
        ...     'name': 'John Doe',
        ...     'email': 'john.doe@example.org',
        ...     'url': 'https:\\\\example.invalid/~john.doe',
        ... })
        >>> prettyprint_graph(graph, root)
        {
            "@id": ...,
            "http://schema.org/author": {
                "@list": [
                    {
                        "@type": "http://schema.org/Person",
                        "http://schema.org/email": "john.doe@example.org",
                        "http://schema.org/name": "John Doe"
                    }
                ]
            }
        }
        >>> graph = Graph()
        >>> NpmMapping().translate_author(graph, root, {
        ...     'name': 'John Doe',
        ...     'email': 'john.doe@example.org',
        ...     'url': 'https://[invalid/~john.doe',
        ... })
        >>> prettyprint_graph(graph, root)
        {
            "@id": ...,
            "http://schema.org/author": {
                "@list": [
                    {
                        "@type": "http://schema.org/Person",
                        "http://schema.org/email": "john.doe@example.org",
                        "http://schema.org/name": "John Doe"
                    }
                ]
            }
        }
        """  # noqa
        if isinstance(d, dict):
            name = d.get("name", None)
            email = d.get("email", None)
            url = d.get("url", None)
        elif isinstance(d, str):
            match = self._parse_author.match(d)
            if not match:
                return None
            name = match.group("name")
            email = match.group("email")
            url = match.group("url")
        else:
            return None

        # The node is created only once the input has been accepted, so the
        # early returns above never leave an orphan Person in the graph.
        author = BNode()
        graph.add((author, RDF.type, SCHEMA.Person))

        if name and isinstance(name, str):
            graph.add((author, SCHEMA.name, Literal(name)))
        if email and isinstance(email, str):
            graph.add((author, SCHEMA.email, Literal(email)))
        if url and isinstance(url, str):
            # Workaround for https://github.com/digitalbazaar/pyld/issues/91 : drop
            # URLs that are blatantly invalid early, so PyLD does not crash.
            try:
                parsed_url = urllib.parse.urlparse(url)
            except ValueError:
                # urlparse itself rejects some inputs (e.g. unbalanced
                # IPv6 brackets); treat them like any other invalid URL.
                parsed_url = None
            if parsed_url is not None and parsed_url.netloc:
                graph.add((author, SCHEMA.url, URIRef(url)))

        add_list(graph, root, SCHEMA.author, [author])

    def normalize_description(self, description):
        r"""Try to re-decode ``description`` as UTF-16, as this is a somewhat common
        mistake that causes issues in the database because of null bytes in JSON.

        Returns a ``Literal``, or ``None`` when ``description`` is not a string
        or when a mis-decoded description cannot be recovered.

        >>> NpmMapping().normalize_description("foo bar")
        rdflib.term.Literal('foo bar')
        >>> NpmMapping().normalize_description(
        ...     "\ufffd\ufffd#\x00 \x00f\x00o\x00o\x00 \x00b\x00a\x00r\x00\r\x00 \x00"
        ... )
        rdflib.term.Literal('foo bar')
        >>> NpmMapping().normalize_description(
        ...     "\ufffd\ufffd\x00#\x00 \x00f\x00o\x00o\x00 \x00b\x00a\x00r\x00\r\x00 "
        ... )
        rdflib.term.Literal('foo bar')
        >>> NpmMapping().normalize_description(
        ...     # invalid UTF-16 and meaningless UTF-8:
        ...     "\ufffd\ufffd\x00#\x00\x00\x00 \x00\x00\x00\x00f\x00\x00\x00\x00"
        ... ) is None
        True
        >>> NpmMapping().normalize_description(
        ...     # ditto (but it looks like little-endian at first)
        ...     "\ufffd\ufffd#\x00\x00\x00 \x00\x00\x00\x00f\x00\x00\x00\x00\x00"
        ... ) is None
        True
        >>> NpmMapping().normalize_description(None) is None
        True
        """
        if not isinstance(description, str):
            return None
        # XXX: if this function ever need to support more cases, consider
        # switching to https://pypi.org/project/ftfy/ instead of adding more hacks
        if description.startswith("\ufffd\ufffd") and "\x00" in description:
            # 2 unicode replacement characters followed by '# ' encoded as UTF-16
            # is a common mistake, which indicates a README.md was saved as UTF-16,
            # and some NPM tool opened it as UTF-8 and used the first line as
            # description.

            description_bytes = description.encode()

            # Strip the two unicode replacement characters (3 bytes each
            # once encoded as UTF-8).
            replacement_prefix = b"\xef\xbf\xbd\xef\xbf\xbd"
            assert description_bytes.startswith(replacement_prefix)
            description_bytes = description_bytes[len(replacement_prefix) :]

            # If the following attempts fail to recover the description, discard it
            # entirely because the current indexer storage backend (postgresql) cannot
            # store zero bytes in JSON columns.
            description = None

            if not description_bytes.startswith(b"\x00"):
                # try UTF-16 little-endian (the most common) first
                try:
                    description = description_bytes.decode("utf-16le")
                except UnicodeDecodeError:
                    pass
            if description is None:
                # if it fails, try UTF-16 big-endian
                try:
                    description = description_bytes.decode("utf-16be")
                except UnicodeDecodeError:
                    pass

            if description:
                # Drop the Markdown heading marker of the README's first line.
                if description.startswith("# "):
                    description = description[2:]
                return Literal(description.rstrip())
            else:
                return None
        return Literal(description)

    def normalize_license(self, s):
        """https://docs.npmjs.com/files/package.json#license

        Returns the SPDX IRI for a single license identifier, or ``None`` when
        ``s`` is not a string or contains a space (file reference, SPDX
        expression, or unusable data).

        >>> NpmMapping().normalize_license('MIT')
        rdflib.term.URIRef('https://spdx.org/licenses/MIT')
        """
        if not isinstance(s, str):
            return None
        if s.startswith("SEE LICENSE IN "):
            # Very common pattern, because it is an example in the specification.
            # It is followed by the filename; and the indexer architecture currently
            # does not allow accessing that from metadata mappings.
            # (Plus, an hypothetical license mapping would eventually pick it up)
            return None
        if " " in s:
            # Either an SPDX expression, or unusable data
            # TODO: handle it
            return None
        return SPDX + s

    def normalize_keywords(self, lst):
        """https://docs.npmjs.com/files/package.json#keywords

        Returns one ``Literal`` per string item of ``lst`` (non-string items
        are skipped), or ``None`` when ``lst`` is not a list.

        >>> NpmMapping().normalize_keywords(['foo', 'bar'])
        [rdflib.term.Literal('foo'), rdflib.term.Literal('bar')]
        """
        if isinstance(lst, list):
            return [Literal(x) for x in lst if isinstance(x, str)]
        return None
File Metadata
Details
Attached
Mime Type
text/x-python
Expires
Sat, Jun 21, 9:19 PM (4 w, 1 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3320681
Attached To
rDCIDX Metadata indexer
Event Timeline
Log In to Comment