Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Paste
P1074
Tarballs archiving script
Active
Public
Actions
Authored by
anlambert
on Jun 21 2021, 2:02 PM.
Edit Paste
Archive Paste
View Raw File
Subscribe
Mute Notifications
Award Token
Flag For Later
Tags
None
Subscribers
None
import os
from urllib.parse import urljoin

from bs4 import BeautifulSoup
import requests
# Page listing the tarballs to archive; it is also used as the origin URL
# submitted to the Software Heritage archives-save endpoint.
url = "https://www.kermitproject.org/archive.html"
origin_url = url

# File-name suffixes treated as archives. Stored as a tuple so it can be
# passed directly to ``str.endswith``.
supported_extensions = (
    ".tar",
    ".tar.gz",
    ".tgz",
    ".zip",
    ".tar.Z",
    ".tar.x",  # NOTE(review): kept from the original list; looks like a typo for ".tar.xz"
    ".tar.xz",
    ".tar.lz",
)

# Seconds to wait on each HTTP request before giving up.
REQUEST_TIMEOUT = 30


def select_archive_links(hrefs, page_url=url):
    """Return absolute URLs for the hrefs that point to a supported archive.

    Args:
        hrefs: iterable of ``href`` attribute values found on the page.
        page_url: URL of the page the hrefs were read from; relative hrefs
            are resolved against it.

    Returns:
        A list of absolute URLs, in the order the hrefs were given.
    """
    # urljoin resolves relative references against the page URL and leaves
    # already-absolute URLs (any scheme) untouched; plain string
    # concatenation would append the href to ".../archive.html".
    return [
        urljoin(page_url, href)
        for href in hrefs
        if href.endswith(supported_extensions)
    ]


def artifact_version_from_link(archive_link):
    """Return the archive file name up to its first dot, used as version."""
    return archive_link.split("/")[-1].split(".")[0]


def is_reachable(archive_link):
    """Return True when a HEAD request on *archive_link* succeeds.

    Any ``requests`` error (connection failure, timeout, unsupported URL
    scheme, ...) is treated as "not reachable" so one bad link does not
    abort the whole run.
    """
    try:
        return requests.head(archive_link, timeout=REQUEST_TIMEOUT).ok
    except requests.RequestException:
        return False


def build_archives_data(archive_links, is_reachable=is_reachable):
    """Build the ``archives_data`` payload from candidate archive URLs.

    Only the first reachable link of each artifact version is kept.

    Args:
        archive_links: iterable of absolute archive URLs.
        is_reachable: callable taking a URL and returning a truthy value
            when the archive can be fetched.

    Returns:
        A list of ``{"artifact_url": ..., "artifact_version": ...}`` dicts.
    """
    artifacts = {}
    archives_data = []
    for archive_link in archive_links:
        artifact_version = artifact_version_from_link(archive_link)
        # Check for duplicates first: no need to probe a link whose version
        # has already been recorded.
        if artifact_version in artifacts:
            continue
        if not is_reachable(archive_link):
            continue
        archives_data.append(
            {"artifact_url": archive_link, "artifact_version": artifact_version}
        )
        artifacts[artifact_version] = archive_link
    return archives_data


def main():
    """Scrape the archive page and submit its tarballs to Software Heritage.

    Reads the API token from the ``SWH_TOKEN`` environment variable and
    prints the JSON body of the API response.

    Raises:
        requests.HTTPError: if the archive page cannot be fetched.
        KeyError: if ``SWH_TOKEN`` is not set.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    # Fail loudly rather than parsing an error page and posting nothing.
    response.raise_for_status()
    page = BeautifulSoup(response.content, features="html.parser")

    # href=True skips <a> tags without an href (e.g. named anchors), which
    # would otherwise raise KeyError on subscript access.
    hrefs = [anchor["href"] for anchor in page.find_all("a", href=True)]
    archives_data = build_archives_data(select_archive_links(hrefs, url))

    save_code_now_url = (
        f"https://archive.softwareheritage.org/api/1/origin/save/archives/url/{origin_url}/"
    )
    headers = {"Authorization": f"Bearer {os.environ['SWH_TOKEN']}"}
    print(
        requests.post(
            save_code_now_url,
            json={"archives_data": archives_data},
            headers=headers,
            timeout=REQUEST_TIMEOUT,
        ).json()
    )


if __name__ == "__main__":
    main()
Event Timeline
anlambert
created this paste.
Jun 21 2021, 2:02 PM
2021-06-21 14:02:05 (UTC+2)
anlambert
updated the paste's language from
autodetect
to
python
.
Log In to Comment