diff --git a/swh/web/misc/coverage.py b/swh/web/misc/coverage.py
index d1649a4f..742e8e6d 100644
--- a/swh/web/misc/coverage.py
+++ b/swh/web/misc/coverage.py
@@ -1,393 +1,397 @@
-# Copyright (C) 2018-2021 The Software Heritage developers
+# Copyright (C) 2018-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU Affero General Public License version 3, or any later version
# See top-level LICENSE file for more information
from collections import Counter, defaultdict
from functools import lru_cache
from typing import Dict, List, Tuple
from urllib.parse import urlparse
import sentry_sdk
from django.conf.urls import url
from django.shortcuts import render
from django.views.decorators.cache import never_cache
from django.views.decorators.clickjacking import xframe_options_exempt
from swh.scheduler.model import SchedulerMetrics
from swh.web.common import archive
+from swh.web.common.origin_save import get_savable_visit_types
from swh.web.common.utils import get_deposits_list, reverse
from swh.web.config import scheduler
_swh_arch_overview_doc = (
"https://docs.softwareheritage.org/devel/architecture/overview.html"
)
# Current coverage list of the archive, presented as a high-level overview and
# categorized as follows:
# - listed origins: origins discovered using a swh lister
# - legacy: origins whose public hosting service has closed
# - deposited: origins coming from swh-deposit
#
# TODO: Store that list in a database table somewhere (swh-scheduler, swh-storage ?)
# and retrieve it dynamically
listed_origins = {
"info": (
"These software origins get continuously discovered and archived using "
f'the listers implemented by Software Heritage.'
),
"origins": [
{
"type": "bitbucket",
"info_url": "https://bitbucket.org",
"info": "public repositories from Bitbucket",
"search_pattern": "https://bitbucket.org/",
},
{
"type": "cgit",
"info_url": "https://git.zx2c4.com/cgit/about",
"info": "public repositories from cgit instances",
"search_pattern": "cgit",
},
{
"type": "CRAN",
"info_url": "https://cran.r-project.org",
"info": "source packages from The Comprehensive R Archive Network",
"search_pattern": "https://cran.r-project.org/",
},
{
"type": "debian",
"info_url": "https://www.debian.org",
"info": "source packages from Debian and Debian-based distributions",
"search_pattern": "deb://",
},
{
"type": "gitea",
"info_url": "https://gitea.io",
"info": "public repositories from Gitea instances",
"search_pattern": "gitea",
},
{
"type": "github",
"info_url": "https://github.com",
"info": "public repositories from GitHub",
"search_pattern": "https://github.com/",
},
{
"type": "gitlab",
"info_url": "https://gitlab.com",
"info": "public repositories from multiple GitLab instances",
"search_pattern": "gitlab",
},
{
"type": "guix",
"info_url": "https://guix.gnu.org",
"info": "source code tarballs used to build the Guix package collection",
"visit_types": ["nixguix"],
"search_pattern": "https://guix.gnu.org/sources.json",
},
{
"type": "GNU",
"info_url": "https://www.gnu.org",
"info": "releases from the GNU project (as of August 2015)",
"search_pattern": "gnu",
},
{
"type": "heptapod",
"info_url": "https://heptapod.net/",
"info": "public repositories from multiple Heptapod instances",
"search_pattern": "heptapod",
},
{
"type": "launchpad",
"info_url": "https://launchpad.net",
"logo": "img/logos/launchpad.png",
"info": "public repositories from Launchpad",
"search_pattern": "https://git.launchpad.net/",
},
{
"type": "nixos",
"info_url": "https://nixos.org",
"info": "source code tarballs used to build the Nix package collection",
"visit_types": ["nixguix"],
"search_pattern": (
"https://nix-community.github.io/nixpkgs-swh/sources-unstable.json"
),
},
{
"type": "npm",
"info_url": "https://www.npmjs.com",
"info": "public packages from the package registry for javascript",
"search_pattern": "https://www.npmjs.com",
},
{
"type": "opam",
"info_url": "https://opam.ocaml.org/",
"info": "public packages from the source-based package manager for OCaml",
"search_pattern": "opam+https://opam.ocaml.org/",
},
# apart from our forge, most phabricator origins have been listed but
# not archived, so do not display that type of origin until new listing
# processes have been executed and the origins loaded
#
# {
# "type": "phabricator",
# "info_url": "https://www.phacility.com/phabricator",
# "info": "public repositories from multiple Phabricator instances",
# "search_pattern": "phabricator",
# },
{
"type": "pypi",
"info_url": "https://pypi.org",
"info": "source packages from the Python Package Index",
"search_pattern": "https://pypi.org",
},
{
"type": "sourceforge",
"info_url": "https://sourceforge.net",
"info": "public repositories from SourceForge",
"search_pattern": "code.sf.net",
},
],
}
legacy_origins = {
"info": (
"Discontinued hosting services. Those origins have been archived "
"by Software Heritage."
),
"origins": [
{
"type": "gitorious",
"info_url": "https://en.wikipedia.org/wiki/Gitorious",
"info": (
"public repositories from the former Gitorious code hosting service"
),
"visit_types": ["git"],
"search_pattern": "https://gitorious.org",
"count": "122,014",
},
{
"type": "googlecode",
"info_url": "https://code.google.com/archive",
"info": (
"public repositories from the former Google Code project "
"hosting service"
),
"visit_types": ["git", "hg", "svn"],
"search_pattern": "googlecode.com",
"count": "790,026",
},
{
"type": "bitbucket",
"info_url": "https://bitbucket.org",
"info": "public repositories from Bitbucket",
"search_pattern": "https://bitbucket.org/",
"visit_types": ["hg"],
"count": "336,795",
},
],
}
deposited_origins = {
"info": (
"These origins are directly pushed into the archive by trusted partners "
f'using the deposit service of Software Heritage.'
),
"origins": [
{
"type": "elife",
"info_url": "https://elifesciences.org",
"info": (
"research software source code associated to the articles "
"eLife publishes"
),
"search_pattern": "elife.stencila.io",
"visit_types": ["deposit"],
},
{
"type": "hal",
"info_url": "https://hal.archives-ouvertes.fr",
"info": "scientific software source code deposited in the open archive HAL",
"visit_types": ["deposit"],
"search_pattern": "hal.archives-ouvertes.fr",
},
{
"type": "ipol",
"info_url": "https://www.ipol.im",
"info": "software artifacts associated to the articles IPOL publishes",
"visit_types": ["deposit"],
"search_pattern": "doi.org/10.5201",
},
],
}
@lru_cache()
def _get_listers_metrics() -> Dict[str, List[Tuple[str, SchedulerMetrics]]]:
"""Returns scheduler metrics in the following mapping:
Dict[lister_name, List[Tuple[instance_name, SchedulerMetrics]]]
as a lister instance has one SchedulerMetrics object per visit type.
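For example (hypothetical instance names and metrics):
{"gitlab": [("gitlab", SchedulerMetrics(...)),
("framagit", SchedulerMetrics(...))]}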
"""
listers_metrics = defaultdict(list)
try:
listers = scheduler().get_listers()
scheduler_metrics = scheduler().get_metrics()
for lister in listers:
for metrics in filter(
lambda m: m.lister_id == lister.id, scheduler_metrics
):
listers_metrics[lister.name].append((lister.instance_name, metrics))
except Exception as e:
sentry_sdk.capture_exception(e)
return listers_metrics
@lru_cache()
def _get_deposits_netloc_counts() -> Counter:
"""Return deposit counts per origin url network location.
"""
def _process_origin_url(origin_url):
parsed_url = urlparse(origin_url)
netloc = parsed_url.netloc
# special treatment for the doi.org netloc as it is not specific
# enough to map origins unambiguously
if parsed_url.netloc == "doi.org":
netloc += "/" + parsed_url.path.split("/")[1]
return netloc
netlocs = []
try:
deposits = get_deposits_list()
netlocs = [
_process_origin_url(d["origin_url"])
for d in deposits
if d["status"] == "done"
]
except Exception as e:
sentry_sdk.capture_exception(e)
return Counter(netlocs)
@lru_cache()
def _get_nixguix_origins_count(origin_url: str) -> int:
"""Returns number of archived tarballs for NixOS, aka the number
of branches in a dedicated origin in the archive.
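For example, lookup_snapshot_sizes may return branch counts per target
type such as {"release": 30095, "revision": 1} (hypothetical values),
in which case 30095 is reported.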
"""
snapshot = archive.lookup_latest_origin_snapshot(origin_url)
if snapshot:
snapshot_sizes = archive.lookup_snapshot_sizes(snapshot["id"])
return snapshot_sizes["release"]
else:
return 0
def _search_url(query: str, visit_type: str) -> str:
return reverse(
"browse-search",
query_params={
"q": query,
"visit_type": visit_type,
"with_visit": "true",
"with_content": "true",
},
)
@xframe_options_exempt
@never_cache
def _swh_coverage(request):
listers_metrics = _get_listers_metrics()
for origins in listed_origins["origins"]:
origins["instances"] = {}
origins_type = origins["type"]
# special processing for nixos/guix origins as there are no
# scheduler metrics for those
if origins_type in ("nixos", "guix"):
count = _get_nixguix_origins_count(origins["search_pattern"])
origins["count"] = f"{count:,}" if count else ""
origins["instances"][origins_type] = {"nixguix": {"count": count}}
if origins_type not in listers_metrics:
continue
count_total = sum(
[metrics.origins_known for _, metrics in listers_metrics[origins_type]]
)
count_never_visited = sum(
[
metrics.origins_never_visited
for _, metrics in listers_metrics[origins_type]
]
)
count = count_total - count_never_visited
origins["count"] = f"{count:,}"
origins["instances"] = defaultdict(dict)
for instance, metrics in listers_metrics[origins_type]:
- # not yet in production
- if metrics.visit_type in ("bzr", "cvs"):
+ # these types are available in staging/docker but not yet in production
+ if (
+ metrics.visit_type in ("bzr", "cvs")
+ and metrics.visit_type not in get_savable_visit_types()
+ ):
continue
instance_count = metrics.origins_known - metrics.origins_never_visited
origins["instances"][instance].update(
{metrics.visit_type: {"count": f"{instance_count:,}"}}
)
origins["visit_types"] = list(
set(origins["instances"][instance].keys())
| set(origins.get("visit_types", []))
)
if origins_type == "CRAN":
origins["instances"]["cran"]["cran"] = {"count": origins["count"]}
# a defaultdict cannot be iterated over in a Django template
origins["instances"] = dict(origins["instances"])
for origins in listed_origins["origins"]:
instances = origins["instances"]
nb_instances = len(instances)
for instance_name, visit_types in instances.items():
for visit_type in visit_types:
if nb_instances > 1:
search_pattern = instance_name
else:
search_pattern = origins["search_pattern"]
search_url = _search_url(search_pattern, visit_type)
visit_types[visit_type]["search_url"] = search_url
for origins in legacy_origins["origins"]:
origins["search_urls"] = {}
for visit_type in origins["visit_types"]:
origins["search_urls"][visit_type] = _search_url(
origins["search_pattern"], visit_type
)
deposits_counts = _get_deposits_netloc_counts()
for origins in deposited_origins["origins"]:
if origins["search_pattern"] in deposits_counts:
origins["count"] = f"{deposits_counts[origins['search_pattern']]:,}"
origins["search_urls"] = {
"deposit": _search_url(origins["search_pattern"], "deposit")
}
return render(
request,
"misc/coverage.html",
{
"origins": {
"Regular crawling": listed_origins,
"Discontinued hosting": legacy_origins,
"On demand archival": deposited_origins,
}
},
)
urlpatterns = [
url(r"^coverage/$", _swh_coverage, name="swh-coverage"),
]
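Note: the per-lister counts rendered by _swh_coverage boil down to summing
origins_known minus origins_never_visited over a lister's instances. A minimal
sketch of that aggregation, assuming a hypothetical metrics shape mirroring
what _get_listers_metrics() returns:

from collections import namedtuple

# hypothetical stand-in for swh.scheduler.model.SchedulerMetrics
Metrics = namedtuple("Metrics", ["origins_known", "origins_never_visited"])

# one (instance_name, metrics) pair per visit type, as in _get_listers_metrics()
gitlab_metrics = [("gitlab", Metrics(100, 20)), ("framagit", Metrics(50, 5))]

count_total = sum(m.origins_known for _, m in gitlab_metrics)
count_never_visited = sum(m.origins_never_visited for _, m in gitlab_metrics)
print(f"{count_total - count_never_visited:,}")  # prints "125"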
diff --git a/swh/web/tests/misc/test_coverage.py b/swh/web/tests/misc/test_coverage.py
index 1de2b2c7..fe345f74 100644
--- a/swh/web/tests/misc/test_coverage.py
+++ b/swh/web/tests/misc/test_coverage.py
@@ -1,133 +1,136 @@
-# Copyright (C) 2021 The Software Heritage developers
+# Copyright (C) 2021-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU Affero General Public License version 3, or any later version
# See top-level LICENSE file for more information
from datetime import datetime, timezone
from itertools import chain
import os
-from random import choice, randint
+from random import randint
import uuid
import pytest
from django.conf import settings
from django.utils.html import escape
from swh.scheduler.model import LastVisitStatus, ListedOrigin, OriginVisitStats
from swh.web.common.utils import reverse
from swh.web.misc.coverage import (
_get_deposits_netloc_counts,
_get_listers_metrics,
deposited_origins,
legacy_origins,
listed_origins,
)
from swh.web.tests.django_asserts import assert_contains
from swh.web.tests.utils import check_html_get_response
@pytest.fixture(autouse=True)
def clear_lru_caches():
_get_listers_metrics.cache_clear()
_get_deposits_netloc_counts.cache_clear()
-def test_coverage_view_no_metrics(client):
+def test_coverage_view_no_metrics(client, swh_scheduler):
"""
Check coverage view can be rendered when scheduler metrics and deposits
data are not available.
"""
url = reverse("swh-coverage")
check_html_get_response(
client, url, status_code=200, template_used="misc/coverage.html"
)
def test_coverage_view_with_metrics(client, swh_scheduler, mocker):
"""
Generate some sample scheduler metrics and some sample deposits
that will be consumed by the archive coverage view, then check
the HTML page gets rendered without errors.
"""
mocker.patch(
"swh.web.misc.coverage._get_nixguix_origins_count"
).return_value = 30095
listers = []
+ visit_types = ["git", "hg", "svn", "bzr", "svn"]
for origins in listed_origins["origins"]:
# create some instances for each lister
for instance in range(randint(1, 5)):
lister = swh_scheduler.get_or_create_lister(
origins["type"], f"instance-{instance}"
)
listers.append(lister)
# record some sample listed origins
_origins = []
origin_visit_stats = []
- for i in range(randint(3, 10)):
+ for i, visit_type in enumerate(visit_types):
url = str(uuid.uuid4())
- visit_type = choice(["git", "hg", "svn"])
_origins.append(
ListedOrigin(
lister_id=lister.id,
url=url,
visit_type=visit_type,
extra_loader_arguments={},
)
)
# set origin visit stats to some origins
if i % 2 == 0:
now = datetime.now(tz=timezone.utc)
origin_visit_stats.append(
OriginVisitStats(
url=url,
visit_type=visit_type,
last_successful=now,
last_visit=now,
last_visit_status=LastVisitStatus.successful,
last_snapshot=os.urandom(20),
)
)
# send origins data to scheduler
swh_scheduler.record_listed_origins(_origins)
swh_scheduler.origin_visit_stats_upsert(origin_visit_stats)
# compute scheduler metrics
swh_scheduler.update_metrics()
# add some sample deposits
deposits = []
for origins in deposited_origins["origins"]:
for _ in range(randint(2, 10)):
deposits.append(
{
"origin_url": f"https://{origins['search_pattern']}/{uuid.uuid4()}",
"status": "done",
}
)
get_deposits_list = mocker.patch("swh.web.misc.coverage.get_deposits_list")
get_deposits_list.return_value = deposits
# check view gets rendered without errors
url = reverse("swh-coverage")
resp = check_html_get_response(
client, url, status_code=200, template_used="misc/coverage.html"
)
# check logos and origins search links are present in the rendered page
for origins in chain(
listed_origins["origins"],
legacy_origins["origins"],
deposited_origins["origins"],
):
logo_url = f'{settings.STATIC_URL}img/logos/{origins["type"].lower()}.png'
assert_contains(resp, f'src="{logo_url}"')
if "instances" in origins:
- for visit_types in origins["instances"].values():
- for data in visit_types.values():
+ for visit_types_ in origins["instances"].values():
+ for data in visit_types_.values():
if data["count"]:
assert_contains(resp, f'{visit_type}")
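Note: the autouse clear_lru_caches fixture above is needed because the
coverage helpers are memoized with @lru_cache(); without clearing, metrics
cached by one test would leak into the next. A minimal sketch of the
pattern, using a hypothetical memoized helper:

from functools import lru_cache

@lru_cache()
def _expensive_lookup():  # hypothetical helper standing in for _get_listers_metrics
    return {"count": 42}

def test_uses_fresh_cache():
    assert _expensive_lookup()["count"] == 42
    # mirror the autouse fixture: reset memoized state between tests
    _expensive_lookup.cache_clear()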