diff --git a/swh/web/misc/coverage.py b/swh/web/misc/coverage.py
index 742e8e6d..603a266e 100644
--- a/swh/web/misc/coverage.py
+++ b/swh/web/misc/coverage.py
@@ -1,397 +1,422 @@
 # Copyright (C) 2018-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU Affero General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 from collections import Counter, defaultdict
-from functools import lru_cache
-from typing import Dict, List, Tuple
+from typing import Any, Dict, List, Tuple
 from urllib.parse import urlparse

 import sentry_sdk

 from django.conf.urls import url
+from django.core.cache import cache
+from django.http.request import HttpRequest
+from django.http.response import HttpResponse
 from django.shortcuts import render
 from django.views.decorators.cache import never_cache
 from django.views.decorators.clickjacking import xframe_options_exempt

 from swh.scheduler.model import SchedulerMetrics
 from swh.web.common import archive
 from swh.web.common.origin_save import get_savable_visit_types
-from swh.web.common.utils import get_deposits_list, reverse
+from swh.web.common.utils import get_deposits_list, is_swh_web_production, reverse
 from swh.web.config import scheduler

 _swh_arch_overview_doc = (
     "https://docs.softwareheritage.org/devel/architecture/overview.html"
 )

 # Current coverage list of the archive in a high level overview fashion,
 # categorized as follow:
 #   - listed origins: origins discovered using a swh lister
 #   - legacy: origins where public hosting service has closed
 #   - deposited: origins coming from swh-deposit
 #
 # TODO: Store that list in a database table somewhere (swh-scheduler, swh-storage ?)
 # and retrieve it dynamically
-listed_origins = {
+listed_origins: Dict[str, Any] = {
     "info": (
         "These software origins get continuously discovered and archived using "
         f'the <a href="{_swh_arch_overview_doc}#listers">listers</a> implemented by Software Heritage.'
), "origins": [ { "type": "bitbucket", "info_url": "https://bitbucket.org", "info": "public repositories from Bitbucket", "search_pattern": "https://bitbucket.org/", }, { "type": "cgit", "info_url": "https://git.zx2c4.com/cgit/about", "info": "public repositories from cgit instances", "search_pattern": "cgit", }, { "type": "CRAN", "info_url": "https://cran.r-project.org", "info": "source packages from The Comprehensive R Archive Network", "search_pattern": "https://cran.r-project.org/", }, { "type": "debian", "info_url": "https://www.debian.org", "info": "source packages from Debian and Debian-based distributions", "search_pattern": "deb://", }, { "type": "gitea", "info_url": "https://gitea.io", "info": "public repositories from Gitea instances", "search_pattern": "gitea", }, { "type": "github", "info_url": "https://github.com", "info": "public repositories from GitHub", "search_pattern": "https://github.com/", }, { "type": "gitlab", "info_url": "https://gitlab.com", "info": "public repositories from multiple GitLab instances", "search_pattern": "gitlab", }, { "type": "guix", "info_url": "https://guix.gnu.org", "info": "source code tarballs used to build the Guix package collection", "visit_types": ["nixguix"], "search_pattern": "https://guix.gnu.org/sources.json", }, { "type": "GNU", "info_url": "https://www.gnu.org", "info": "releases from the GNU project (as of August 2015)", "search_pattern": "gnu", }, { "type": "heptapod", "info_url": "https://heptapod.net/", "info": "public repositories from multiple Heptapod instances", "search_pattern": "heptapod", }, { "type": "launchpad", "info_url": "https://launchpad.net", "logo": "img/logos/launchpad.png", "info": "public repositories from Launchpad", "search_pattern": "https://git.launchpad.net/", }, { "type": "nixos", "info_url": "https://nixos.org", "info": "source code tarballs used to build the Nix package collection", "visit_types": ["nixguix"], "search_pattern": ( "https://nix-community.github.io/nixpkgs-swh/sources-unstable.json" ), }, { "type": "npm", "info_url": "https://www.npmjs.com", "info": "public packages from the package registry for javascript", "search_pattern": "https://www.npmjs.com", }, { "type": "opam", "info_url": "https://opam.ocaml.org/", "info": "public packages from the source-based package manager for OCaml", "search_pattern": "opam+https://opam.ocaml.org/", }, # apart our forge, most phabricator origins have not been archived # while they have been listed so do not display those type of origins # until new listing processes have been executed and origins loaded # # { # "type": "phabricator", # "info_url": "https://www.phacility.com/phabricator", # "info": "public repositories from multiple Phabricator instances", # "search_pattern": "phabricator", # }, { "type": "pypi", "info_url": "https://pypi.org", "info": "source packages from the Python Package Index", "search_pattern": "https://pypi.org", }, { "type": "sourceforge", "info_url": "https://sourceforge.net", "info": "public repositories from SourceForge", "search_pattern": "code.sf.net", }, ], } -legacy_origins = { +legacy_origins: Dict[str, Any] = { "info": ( "Discontinued hosting services. Those origins have been archived " "by Software Heritage." 
), "origins": [ { "type": "gitorious", "info_url": "https://en.wikipedia.org/wiki/Gitorious", "info": ( "public repositories from the former Gitorious code hosting service" ), "visit_types": ["git"], "search_pattern": "https://gitorious.org", "count": "122,014", }, { "type": "googlecode", "info_url": "https://code.google.com/archive", "info": ( "public repositories from the former Google Code project " "hosting service" ), "visit_types": ["git", "hg", "svn"], "search_pattern": "googlecode.com", "count": "790,026", }, { "type": "bitbucket", "info_url": "https://bitbucket.org", "info": "public repositories from Bitbucket", "search_pattern": "https://bitbucket.org/", "visit_types": ["hg"], "count": "336,795", }, ], } -deposited_origins = { +deposited_origins: Dict[str, Any] = { "info": ( "These origins are directly pushed into the archive by trusted partners " f'using the deposit service of Software Heritage.' ), "origins": [ { "type": "elife", "info_url": "https://elifesciences.org", "info": ( "research software source code associated to the articles " "eLife publishes" ), "search_pattern": "elife.stencila.io", "visit_types": ["deposit"], }, { "type": "hal", "info_url": "https://hal.archives-ouvertes.fr", "info": "scientific software source code deposited in the open archive HAL", "visit_types": ["deposit"], "search_pattern": "hal.archives-ouvertes.fr", }, { "type": "ipol", "info_url": "https://www.ipol.im", "info": "software artifacts associated to the articles IPOL publishes", "visit_types": ["deposit"], "search_pattern": "doi.org/10.5201", }, ], } +_cache_timeout = 5 * 60 -@lru_cache() -def _get_listers_metrics() -> Dict[str, List[Tuple[str, SchedulerMetrics]]]: + +def _get_listers_metrics( + cache_metrics: bool = False, +) -> Dict[str, List[Tuple[str, SchedulerMetrics]]]: """Returns scheduler metrics in the following mapping: Dict[lister_name, List[Tuple[instance_name, SchedulerMetrics]]] as a lister instance has one SchedulerMetrics object per visit type. """ - listers_metrics = defaultdict(list) - try: - listers = scheduler().get_listers() - scheduler_metrics = scheduler().get_metrics() - for lister in listers: - for metrics in filter( - lambda m: m.lister_id == lister.id, scheduler_metrics - ): - listers_metrics[lister.name].append((lister.instance_name, metrics)) - except Exception as e: - sentry_sdk.capture_exception(e) + cache_key = "lister_metrics" + listers_metrics = cache.get(cache_key, {}) + if not listers_metrics: + listers_metrics = defaultdict(list) + try: + listers = scheduler().get_listers() + scheduler_metrics = scheduler().get_metrics() + for lister in listers: + for metrics in filter( + lambda m: m.lister_id == lister.id, scheduler_metrics + ): + listers_metrics[lister.name].append((lister.instance_name, metrics)) + if cache_metrics: + cache.set(cache_key, listers_metrics, timeout=_cache_timeout) + except Exception as e: + sentry_sdk.capture_exception(e) + return listers_metrics -@lru_cache() -def _get_deposits_netloc_counts() -> Counter: +def _get_deposits_netloc_counts(cache_counts: bool = False) -> Counter: """Return deposit counts per origin url network location. 
""" def _process_origin_url(origin_url): parsed_url = urlparse(origin_url) netloc = parsed_url.netloc # special treatment for doi.org netloc as it is not specific enough # for origins mapping if parsed_url.netloc == "doi.org": netloc += "/" + parsed_url.path.split("/")[1] return netloc - netlocs = [] - try: - deposits = get_deposits_list() - netlocs = [ - _process_origin_url(d["origin_url"]) - for d in deposits - if d["status"] == "done" - ] - except Exception as e: - sentry_sdk.capture_exception(e) - return Counter(netlocs) - - -@lru_cache() -def _get_nixguix_origins_count(origin_url: str) -> int: + cache_key = "deposits_netloc_counts" + deposits_netloc_counts = cache.get(cache_key, Counter()) + if not deposits_netloc_counts: + netlocs = [] + try: + deposits = get_deposits_list() + netlocs = [ + _process_origin_url(d["origin_url"]) + for d in deposits + if d["status"] == "done" + ] + deposits_netloc_counts = Counter(netlocs) + if cache_counts: + cache.set(cache_key, deposits_netloc_counts, timeout=_cache_timeout) + except Exception as e: + sentry_sdk.capture_exception(e) + + return deposits_netloc_counts + + +def _get_nixguix_origins_count(origin_url: str, cache_count: bool = False) -> int: """Returns number of archived tarballs for NixOS, aka the number of branches in a dedicated origin in the archive. """ - snapshot = archive.lookup_latest_origin_snapshot(origin_url) - if snapshot: - snapshot_sizes = archive.lookup_snapshot_sizes(snapshot["id"]) - return snapshot_sizes["release"] - else: - return 0 + cache_key = f"nixguix_origins_count_{origin_url}" + nixguix_origins_count = cache.get(cache_key, 0) + if not nixguix_origins_count: + snapshot = archive.lookup_latest_origin_snapshot(origin_url) + if snapshot: + snapshot_sizes = archive.lookup_snapshot_sizes(snapshot["id"]) + nixguix_origins_count = snapshot_sizes["release"] + else: + nixguix_origins_count = 0 + if cache_count: + cache.set(cache_key, nixguix_origins_count, timeout=_cache_timeout) + return nixguix_origins_count def _search_url(query: str, visit_type: str) -> str: return reverse( "browse-search", query_params={ "q": query, "visit_type": visit_type, "with_visit": "true", "with_content": "true", }, ) @xframe_options_exempt @never_cache -def _swh_coverage(request): - listers_metrics = _get_listers_metrics() +def _swh_coverage(request: HttpRequest) -> HttpResponse: + use_cache = is_swh_web_production(request) + listers_metrics = _get_listers_metrics(use_cache) for origins in listed_origins["origins"]: origins["instances"] = {} origins_type = origins["type"] # special processing for nixos/guix origins as there is no # scheduler metrics for those if origins_type in ("nixos", "guix"): - count = _get_nixguix_origins_count(origins["search_pattern"]) + count = _get_nixguix_origins_count(origins["search_pattern"], use_cache) + origins["count"] = f"{count:,}" if count else "" origins["instances"][origins_type] = {"nixguix": {"count": count}} if origins_type not in listers_metrics: continue count_total = sum( [metrics.origins_known for _, metrics in listers_metrics[origins_type]] ) count_never_visited = sum( [ metrics.origins_never_visited for _, metrics in listers_metrics[origins_type] ] ) count = count_total - count_never_visited origins["count"] = f"{count:,}" origins["instances"] = defaultdict(dict) for instance, metrics in listers_metrics[origins_type]: # these types are available in staging/docker but not yet in production if ( metrics.visit_type in ("bzr", "cvs") and metrics.visit_type not in get_savable_visit_types() ): continue 
             instance_count = metrics.origins_known - metrics.origins_never_visited
             origins["instances"][instance].update(
                 {metrics.visit_type: {"count": f"{instance_count:,}"}}
             )
             origins["visit_types"] = list(
                 set(origins["instances"][instance].keys())
                 | set(origins.get("visit_types", []))
             )

         if origins_type == "CRAN":
             origins["instances"]["cran"]["cran"] = {"count": origins["count"]}

         # defaultdict cannot be iterated in django template
         origins["instances"] = dict(origins["instances"])

     for origins in listed_origins["origins"]:
         instances = origins["instances"]
         nb_instances = len(instances)
         for instance_name, visit_types in instances.items():
             for visit_type in visit_types:
                 if nb_instances > 1:
                     search_pattern = instance_name
                 else:
                     search_pattern = origins["search_pattern"]
                 search_url = _search_url(search_pattern, visit_type)
                 visit_types[visit_type]["search_url"] = search_url

     for origins in legacy_origins["origins"]:
         origins["search_urls"] = {}
         for visit_type in origins["visit_types"]:
             origins["search_urls"][visit_type] = _search_url(
                 origins["search_pattern"], visit_type
             )

-    deposits_counts = _get_deposits_netloc_counts()
+    deposits_counts = _get_deposits_netloc_counts(use_cache)
+
     for origins in deposited_origins["origins"]:
         if origins["search_pattern"] in deposits_counts:
             origins["count"] = f"{deposits_counts[origins['search_pattern']]:,}"
         origins["search_urls"] = {
             "deposit": _search_url(origins["search_pattern"], "deposit")
         }

     return render(
         request,
         "misc/coverage.html",
         {
             "origins": {
                 "Regular crawling": listed_origins,
                 "Discontinued hosting": legacy_origins,
                 "On demand archival": deposited_origins,
             }
         },
     )


 urlpatterns = [
     url(r"^coverage/$", _swh_coverage, name="swh-coverage"),
 ]
diff --git a/swh/web/tests/misc/test_coverage.py b/swh/web/tests/misc/test_coverage.py
index fe345f74..f35501ff 100644
--- a/swh/web/tests/misc/test_coverage.py
+++ b/swh/web/tests/misc/test_coverage.py
@@ -1,136 +1,131 @@
 # Copyright (C) 2021-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU Affero General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 from datetime import datetime, timezone
 from itertools import chain
 import os
 from random import randint
 import uuid

-import pytest
-
 from django.conf import settings
 from django.utils.html import escape

 from swh.scheduler.model import LastVisitStatus, ListedOrigin, OriginVisitStats
 from swh.web.common.utils import reverse
-from swh.web.misc.coverage import (
-    _get_deposits_netloc_counts,
-    _get_listers_metrics,
-    deposited_origins,
-    legacy_origins,
-    listed_origins,
-)
+from swh.web.config import SWH_WEB_SERVER_NAME
+from swh.web.misc.coverage import deposited_origins, legacy_origins, listed_origins
 from swh.web.tests.django_asserts import assert_contains
-from swh.web.tests.utils import check_html_get_response
-
-
-@pytest.fixture(autouse=True)
-def clear_lru_caches():
-    _get_listers_metrics.cache_clear()
-    _get_deposits_netloc_counts.cache_clear()
+from swh.web.tests.utils import check_html_get_response, check_http_get_response


 def test_coverage_view_no_metrics(client, swh_scheduler):
     """
     Check coverage view can be rendered when scheduler metrics and
     deposits data are not available.
""" url = reverse("swh-coverage") check_html_get_response( client, url, status_code=200, template_used="misc/coverage.html" ) def test_coverage_view_with_metrics(client, swh_scheduler, mocker): """ Generate some sample scheduler metrics and some sample deposits that will be consumed by the archive coverage view, then check the HTML page gets rendered without errors. """ - mocker.patch( - "swh.web.misc.coverage._get_nixguix_origins_count" - ).return_value = 30095 + + # mock calls to get nixguix origin counts + mock_archive = mocker.patch("swh.web.misc.coverage.archive") + mock_archive.lookup_latest_origin_snapshot.return_value = {"id": "some-snapshot"} + mock_archive.lookup_snapshot_sizes.return_value = {"release": 30095} + listers = [] visit_types = ["git", "hg", "svn", "bzr", "svn"] for origins in listed_origins["origins"]: # create some instances for each lister for instance in range(randint(1, 5)): lister = swh_scheduler.get_or_create_lister( origins["type"], f"instance-{instance}" ) listers.append(lister) # record some sample listed origins _origins = [] origin_visit_stats = [] for i, visit_type in enumerate(visit_types): url = str(uuid.uuid4()) _origins.append( ListedOrigin( lister_id=lister.id, url=url, visit_type=visit_type, extra_loader_arguments={}, ) ) # set origin visit stats to some origins if i % 2 == 0: now = datetime.now(tz=timezone.utc) origin_visit_stats.append( OriginVisitStats( url=url, visit_type=visit_type, last_successful=now, last_visit=now, last_visit_status=LastVisitStatus.successful, last_snapshot=os.urandom(20), ) ) # send origins data to scheduler swh_scheduler.record_listed_origins(_origins) swh_scheduler.origin_visit_stats_upsert(origin_visit_stats) # compute scheduler metrics swh_scheduler.update_metrics() # add some sample deposits deposits = [] for origins in deposited_origins["origins"]: for _ in range(randint(2, 10)): deposits.append( { "origin_url": f"https://{origins['search_pattern']}/{uuid.uuid4()}", "status": "done", } ) get_deposits_list = mocker.patch("swh.web.misc.coverage.get_deposits_list") get_deposits_list.return_value = deposits # check view gets rendered without errors url = reverse("swh-coverage") resp = check_html_get_response( client, url, status_code=200, template_used="misc/coverage.html" ) # check logos and origins search links are present in the rendered page for origins in chain( listed_origins["origins"], legacy_origins["origins"], deposited_origins["origins"], ): logo_url = f'{settings.STATIC_URL}img/logos/{origins["type"].lower()}.png' assert_contains(resp, f'src="{logo_url}"') if "instances" in origins: for visit_types_ in origins["instances"].values(): for data in visit_types_.values(): if data["count"]: assert_contains(resp, f'{visit_type}") + + # check request as in production with cache enabled + check_http_get_response( + client, url, status_code=200, server_name=SWH_WEB_SERVER_NAME + )