diff --git a/swh/web/archive_coverage/__init__.py b/swh/web/archive_coverage/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/swh/web/templates/misc/coverage.html b/swh/web/archive_coverage/templates/archive-coverage.html similarity index 100% rename from swh/web/templates/misc/coverage.html rename to swh/web/archive_coverage/templates/archive-coverage.html diff --git a/swh/web/archive_coverage/urls.py b/swh/web/archive_coverage/urls.py new file mode 100644 index 00000000..c0bba692 --- /dev/null +++ b/swh/web/archive_coverage/urls.py @@ -0,0 +1,12 @@ +# Copyright (C) 2018-2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU Affero General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from django.urls import re_path as url + +from swh.web.archive_coverage.views import swh_coverage + +urlpatterns = [ + url(r"^coverage/$", swh_coverage, name="swh-coverage"), +] diff --git a/swh/web/misc/coverage.py b/swh/web/archive_coverage/views.py similarity index 98% rename from swh/web/misc/coverage.py rename to swh/web/archive_coverage/views.py index 9c28d0a7..101ee951 100644 --- a/swh/web/misc/coverage.py +++ b/swh/web/archive_coverage/views.py @@ -1,501 +1,495 @@ # Copyright (C) 2018-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from collections import Counter, defaultdict from typing import Any, Dict, List, Tuple from urllib.parse import urlparse from django.http.request import HttpRequest from django.http.response import HttpResponse from django.shortcuts import render -from django.urls import re_path as url from django.views.decorators.cache import never_cache from django.views.decorators.clickjacking import xframe_options_exempt from 
swh.scheduler.model import SchedulerMetrics from swh.web.config import scheduler from swh.web.utils import ( archive, django_cache, get_deposits_list, is_swh_web_development, is_swh_web_production, reverse, ) _swh_arch_overview_doc = ( "https://docs.softwareheritage.org/devel/architecture/overview.html" ) # Current coverage list of the archive in a high level overview fashion, # categorized as follow: # - listed origins: origins discovered using a swh lister # - legacy: origins where public hosting service has closed # - deposited: origins coming from swh-deposit # # TODO: Store that list in a database table somewhere (swh-scheduler, swh-storage ?) # and retrieve it dynamically listed_origins: Dict[str, Any] = { "info": ( "These software origins get continuously discovered and archived using " f'the listers implemented by Software Heritage.' ), "origins": [ { "type": "bitbucket", "info_url": "https://bitbucket.org", "info": "public repositories from Bitbucket", "search_pattern": { "default": "https://bitbucket.org/", }, }, { "type": "cgit", "info_url": "https://git.zx2c4.com/cgit/about", "info": "public repositories from cgit instances", "search_pattern": { "default": "cgit", }, }, { "type": "CRAN", "info_url": "https://cran.r-project.org", "info": "source packages from The Comprehensive R Archive Network", "search_pattern": { "default": "https://cran.r-project.org/", }, }, { "type": "debian", "info_url": "https://www.debian.org", "info": "source packages from Debian and Debian-based distributions", "search_pattern": { "default": "deb://", }, }, { "type": "gitea", "info_url": "https://gitea.io", "info": "public repositories from Gitea instances", "search_pattern": { "default": "gitea", }, }, { "type": "github", "info_url": "https://github.com", "info": "public repositories from GitHub", "search_pattern": { "default": "https://github.com/", }, }, { "type": "gitlab", "info_url": "https://gitlab.com", "info": "public repositories from multiple GitLab instances", 
"search_pattern": { "default": "gitlab", }, }, { "type": "guix", "info_url": "https://guix.gnu.org", "info": "source code tarballs used to build the Guix package collection", "visit_types": ["nixguix"], "search_pattern": { "default": "https://guix.gnu.org/sources.json", }, }, { "type": "GNU", "info_url": "https://www.gnu.org", "info": "releases from the GNU project (as of August 2015)", "search_pattern": { "default": "gnu", }, }, { "type": "heptapod", "info_url": "https://heptapod.net/", "info": "public repositories from multiple Heptapod instances", "search_pattern": { "default": "heptapod", }, }, { "type": "launchpad", "info_url": "https://launchpad.net", "logo": "img/logos/launchpad.png", "info": "public repositories from Launchpad", "search_pattern": { "default": "launchpad.net/", }, }, { "type": "maven", "info_url": "https://maven.apache.org/", "info": "java source packages from maven repositories", "search_pattern": { "default": "maven", "cvs": "", "git": "", "hg": "", "svn": "", }, }, { "type": "nixos", "info_url": "https://nixos.org", "info": "source code tarballs used to build the Nix package collection", "visit_types": ["nixguix"], "search_pattern": { "default": ( "https://nix-community.github.io/nixpkgs-swh/sources-unstable.json" ) }, }, { "type": "npm", "info_url": "https://www.npmjs.com", "info": "public packages from the package registry for javascript", "search_pattern": { "default": "https://www.npmjs.com", }, }, { "type": "opam", "info_url": "https://opam.ocaml.org/", "info": "public packages from the source-based package manager for OCaml", "search_pattern": { "default": "opam+https://", }, }, { "type": "Packagist", "info_url": "https://packagist.org/", "info": "source code repositories referenced by The PHP Package Repository", "search_pattern": { "default": "", }, }, { "type": "phabricator", "info_url": "https://www.phacility.com/phabricator", "info": "public repositories from multiple Phabricator instances", "search_pattern": { "default": 
"phabricator", }, }, { "type": "pypi", "info_url": "https://pypi.org", "info": "source packages from the Python Package Index", "search_pattern": { "default": "https://pypi.org", }, }, { "type": "sourceforge", "info_url": "https://sourceforge.net", "info": "public repositories from SourceForge", "search_pattern": { "default": "code.sf.net", "bzr": "bzr.sourceforge.net", "cvs": "cvs.sourceforge.net", }, }, ], } legacy_origins: Dict[str, Any] = { "info": ( "Discontinued hosting services. Those origins have been archived " "by Software Heritage." ), "origins": [ { "type": "gitorious", "info_url": "https://en.wikipedia.org/wiki/Gitorious", "info": ( "public repositories from the former Gitorious code hosting service" ), "visit_types": ["git"], "search_pattern": "https://gitorious.org", "count": "122,014", }, { "type": "googlecode", "info_url": "https://code.google.com/archive", "info": ( "public repositories from the former Google Code project " "hosting service" ), "visit_types": ["git", "hg", "svn"], "search_pattern": "googlecode.com", "count": "790,026", }, { "type": "bitbucket", "info_url": "https://bitbucket.org", "info": "public repositories from Bitbucket", "search_pattern": "https://bitbucket.org/", "visit_types": ["hg"], "count": "336,795", }, ], } deposited_origins: Dict[str, Any] = { "info": ( "These origins are directly pushed into the archive by trusted partners " f'using the deposit service of Software Heritage.' 
), "origins": [ { "type": "elife", "info_url": "https://elifesciences.org", "info": ( "research software source code associated to the articles " "eLife publishes" ), "search_pattern": "elife.stencila.io", "visit_types": ["deposit"], }, { "type": "hal", "info_url": "https://hal.archives-ouvertes.fr", "info": "scientific software source code deposited in the open archive HAL", "visit_types": ["deposit"], "search_pattern": "hal.archives-ouvertes.fr", }, { "type": "ipol", "info_url": "https://www.ipol.im", "info": "software artifacts associated to the articles IPOL publishes", "visit_types": ["deposit"], "search_pattern": "doi.org/10.5201", }, ], } _cache_timeout = 60 * 60 # one hour def _get_listers_metrics( cache_metrics: bool = False, ) -> Dict[str, List[Tuple[str, SchedulerMetrics]]]: """Returns scheduler metrics in the following mapping: Dict[lister_name, List[Tuple[instance_name, SchedulerMetrics]]] as a lister instance has one SchedulerMetrics object per visit type. """ @django_cache( timeout=_cache_timeout, catch_exception=True, exception_return_value={}, invalidate_cache_pred=lambda m: not cache_metrics, ) def _get_listers_metrics_internal(): listers_metrics = defaultdict(list) listers = scheduler().get_listers() scheduler_metrics = scheduler().get_metrics() for lister in listers: for metrics in filter( lambda m: m.lister_id == lister.id, scheduler_metrics ): listers_metrics[lister.name].append((lister.instance_name, metrics)) return listers_metrics return _get_listers_metrics_internal() def _get_deposits_netloc_counts(cache_counts: bool = False) -> Counter: """Return deposit counts per origin url network location.""" def _process_origin_url(origin_url): parsed_url = urlparse(origin_url) netloc = parsed_url.netloc # special treatment for doi.org netloc as it is not specific enough # for origins mapping if parsed_url.netloc == "doi.org": netloc += "/" + parsed_url.path.split("/")[1] return netloc @django_cache( timeout=_cache_timeout, catch_exception=True, 
exception_return_value=Counter(), invalidate_cache_pred=lambda m: not cache_counts, ) def _get_deposits_netloc_counts_internal(): netlocs = [] deposits = get_deposits_list() netlocs = [ _process_origin_url(d["origin_url"]) for d in deposits if d["status"] == "done" ] deposits_netloc_counts = Counter(netlocs) return deposits_netloc_counts return _get_deposits_netloc_counts_internal() def _get_nixguix_origins_count(origin_url: str, cache_count: bool = False) -> int: """Returns number of archived tarballs for NixOS, aka the number of branches in a dedicated origin in the archive. """ @django_cache( timeout=_cache_timeout, catch_exception=True, exception_return_value=0, invalidate_cache_pred=lambda m: not cache_count, ) def _get_nixguix_origins_count_internal(): snapshot = archive.lookup_latest_origin_snapshot(origin_url) if snapshot: snapshot_sizes = archive.lookup_snapshot_sizes(snapshot["id"]) nixguix_origins_count = snapshot_sizes["release"] else: nixguix_origins_count = 0 return nixguix_origins_count return _get_nixguix_origins_count_internal() def _search_url(query: str, visit_type: str) -> str: return reverse( "browse-search", query_params={ "q": query, "visit_type": visit_type, "with_visit": "true", "with_content": "true", }, ) @xframe_options_exempt @never_cache -def _swh_coverage(request: HttpRequest) -> HttpResponse: +def swh_coverage(request: HttpRequest) -> HttpResponse: use_cache = is_swh_web_production(request) listers_metrics = _get_listers_metrics(use_cache) for origins in listed_origins["origins"]: origins["count"] = "0" origins["instances"] = {} origins_type = origins["type"] # special processing for nixos/guix origins as there is no # scheduler metrics for those if origins_type in ("nixos", "guix"): count = _get_nixguix_origins_count( origins["search_pattern"]["default"], use_cache ) origins["count"] = f"{count:,}" origins["instances"][origins_type] = {"nixguix": {"count": count}} if origins_type not in listers_metrics: continue count_total = sum( 
[metrics.origins_enabled for _, metrics in listers_metrics[origins_type]] ) count_never_visited = sum( [ metrics.origins_never_visited for _, metrics in listers_metrics[origins_type] ] ) count = count_total - count_never_visited origins["count"] = f"{count:,}" origins["instances"] = defaultdict(dict) for instance, metrics in listers_metrics[origins_type]: instance_count = metrics.origins_enabled - metrics.origins_never_visited # no archived origins for that visit type, skip it if instance_count == 0: continue origins["instances"][instance].update( {metrics.visit_type: {"count": f"{instance_count:,}"}} ) origins["visit_types"] = list( set(origins["instances"][instance].keys()) | set(origins.get("visit_types", [])) ) if origins_type == "CRAN": origins["instances"]["cran"]["cran"] = {"count": origins["count"]} # defaultdict cannot be iterated in django template origins["instances"] = dict(origins["instances"]) for origins in listed_origins["origins"]: instances = origins["instances"] nb_instances = len(instances) for instance_name, visit_types in instances.items(): for visit_type in visit_types: search_url = "" if visit_type in origins["search_pattern"]: search_pattern = origins["search_pattern"][visit_type] elif nb_instances > 1: search_pattern = instance_name else: search_pattern = origins["search_pattern"]["default"] if search_pattern: search_url = _search_url(search_pattern, visit_type) visit_types[visit_type]["search_url"] = search_url # filter out origin types without archived origins on production and staging if not is_swh_web_development(request): listed_origins["origins"] = list( filter(lambda o: o["count"] != "0", listed_origins["origins"]) ) for origins in legacy_origins["origins"]: origins["search_urls"] = {} for visit_type in origins["visit_types"]: origins["search_urls"][visit_type] = _search_url( origins["search_pattern"], visit_type ) deposits_counts = _get_deposits_netloc_counts(use_cache) for origins in deposited_origins["origins"]: origins["count"] 
= "0" if origins["search_pattern"] in deposits_counts: origins["count"] = f"{deposits_counts[origins['search_pattern']]:,}" origins["search_urls"] = { "deposit": _search_url(origins["search_pattern"], "deposit") } focus = [] focus_param = request.GET.get("focus") if focus_param: focus = focus_param.split(",") return render( request, - "misc/coverage.html", + "archive-coverage.html", { "origins": { "Regular crawling": listed_origins, "Discontinued hosting": legacy_origins, "On demand archival": deposited_origins, }, "focus": focus, }, ) - - -urlpatterns = [ - url(r"^coverage/$", _swh_coverage, name="swh-coverage"), -] diff --git a/swh/web/config.py b/swh/web/config.py index 2fa30995..4f510091 100644 --- a/swh/web/config.py +++ b/swh/web/config.py @@ -1,236 +1,237 @@ # Copyright (C) 2017-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information import os from typing import Any, Dict from swh.core import config from swh.counters import get_counters from swh.indexer.storage import get_indexer_storage from swh.scheduler import get_scheduler from swh.search import get_search from swh.storage import get_storage from swh.vault import get_vault from swh.web import settings SWH_WEB_SERVER_NAME = "archive.softwareheritage.org" SWH_WEB_INTERNAL_SERVER_NAME = "archive.internal.softwareheritage.org" SWH_WEB_STAGING_SERVER_NAMES = [ "webapp.staging.swh.network", "webapp.internal.staging.swh.network", ] SETTINGS_DIR = os.path.dirname(settings.__file__) DEFAULT_CONFIG = { "allowed_hosts": ("list", []), "storage": ( "dict", { "cls": "remote", "url": "http://127.0.0.1:5002/", "timeout": 10, }, ), "indexer_storage": ( "dict", { "cls": "remote", "url": "http://127.0.0.1:5007/", "timeout": 1, }, ), "counters": ( "dict", { "cls": "remote", "url": "http://127.0.0.1:5011/", "timeout": 1, }, ), "search": ( "dict", { 
"cls": "remote", "url": "http://127.0.0.1:5010/", "timeout": 10, }, ), "search_config": ( "dict", { "metadata_backend": "swh-indexer-storage", }, # or "swh-search" ), "log_dir": ("string", "/tmp/swh/log"), "debug": ("bool", False), "serve_assets": ("bool", False), "host": ("string", "127.0.0.1"), "port": ("int", 5004), "secret_key": ("string", "development key"), # do not display code highlighting for content > 1MB "content_display_max_size": ("int", 5 * 1024 * 1024), "snapshot_content_max_size": ("int", 1000), "throttling": ( "dict", { "cache_uri": None, # production: memcached as cache (127.0.0.1:11211) # development: in-memory cache so None "scopes": { "swh_api": { "limiter_rate": {"default": "120/h"}, "exempted_networks": ["127.0.0.0/8"], }, "swh_api_origin_search": { "limiter_rate": {"default": "10/m"}, "exempted_networks": ["127.0.0.0/8"], }, "swh_vault_cooking": { "limiter_rate": {"default": "120/h", "GET": "60/m"}, "exempted_networks": ["127.0.0.0/8"], }, "swh_save_origin": { "limiter_rate": {"default": "120/h", "POST": "10/h"}, "exempted_networks": ["127.0.0.0/8"], }, "swh_api_origin_visit_latest": { "limiter_rate": {"default": "700/m"}, "exempted_networks": ["127.0.0.0/8"], }, }, }, ), "vault": ( "dict", { "cls": "remote", "args": { "url": "http://127.0.0.1:5005/", }, }, ), "scheduler": ("dict", {"cls": "remote", "url": "http://127.0.0.1:5008/"}), "development_db": ("string", os.path.join(SETTINGS_DIR, "db.sqlite3")), "test_db": ("dict", {"name": "swh-web-test"}), "production_db": ("dict", {"name": "swh-web"}), "deposit": ( "dict", { "private_api_url": "https://deposit.softwareheritage.org/1/private/", "private_api_user": "swhworker", "private_api_password": "some-password", }, ), "e2e_tests_mode": ("bool", False), "es_workers_index_url": ("string", ""), "history_counters_url": ( "string", ( "http://counters1.internal.softwareheritage.org:5011" "/counters_history/history.json" ), ), "client_config": ("dict", {}), "keycloak": ("dict", {"server_url": "", 
"realm_name": ""}), "graph": ( "dict", { "server_url": "http://graph.internal.softwareheritage.org:5009/graph/", "max_edges": {"staff": 0, "user": 100000, "anonymous": 1000}, }, ), "status": ( "dict", { "server_url": "https://status.softwareheritage.org/", "json_path": "1.0/status/578e5eddcdc0cc7951000520", }, ), "counters_backend": ("string", "swh-storage"), # or "swh-counters" "staging_server_names": ("list", SWH_WEB_STAGING_SERVER_NAMES), "instance_name": ("str", "archive-test.softwareheritage.org"), "give": ("dict", {"public_key": "", "token": ""}), "features": ("dict", {"add_forge_now": True}), "add_forge_now": ("dict", {"email_address": "add-forge-now@example.com"}), "swh_extra_django_apps": ( "list", [ "swh.web.inbound_email", "swh.web.add_forge_now", "swh.web.mailmap", "swh.web.save_code_now", "swh.web.deposit", "swh.web.badges", + "swh.web.archive_coverage", ], ), } swhweb_config: Dict[str, Any] = {} def get_config(config_file="web/web"): """Read the configuration file `config_file`. If an environment variable SWH_CONFIG_FILENAME is defined, this takes precedence over the config_file parameter. In any case, update the app with parameters (secret_key, conf) and return the parsed configuration as a dict. If no configuration file is provided, return a default configuration. 
""" if not swhweb_config: config_filename = os.environ.get("SWH_CONFIG_FILENAME") if config_filename: config_file = config_filename cfg = config.load_named_config(config_file, DEFAULT_CONFIG) swhweb_config.update(cfg) config.prepare_folders(swhweb_config, "log_dir") if swhweb_config.get("search"): swhweb_config["search"] = get_search(**swhweb_config["search"]) else: swhweb_config["search"] = None swhweb_config["storage"] = get_storage(**swhweb_config["storage"]) swhweb_config["vault"] = get_vault(**swhweb_config["vault"]) swhweb_config["indexer_storage"] = get_indexer_storage( **swhweb_config["indexer_storage"] ) swhweb_config["scheduler"] = get_scheduler(**swhweb_config["scheduler"]) swhweb_config["counters"] = get_counters(**swhweb_config["counters"]) return swhweb_config def search(): """Return the current application's search.""" return get_config()["search"] def storage(): """Return the current application's storage.""" return get_config()["storage"] def vault(): """Return the current application's vault.""" return get_config()["vault"] def indexer_storage(): """Return the current application's indexer storage.""" return get_config()["indexer_storage"] def scheduler(): """Return the current application's scheduler.""" return get_config()["scheduler"] def counters(): """Return the current application's counters.""" return get_config()["counters"] diff --git a/swh/web/misc/urls.py b/swh/web/misc/urls.py index 4856af6b..7c12bd6c 100644 --- a/swh/web/misc/urls.py +++ b/swh/web/misc/urls.py @@ -1,118 +1,117 @@ # Copyright (C) 2019-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information import json import requests from django.conf.urls import include from django.contrib.staticfiles import finders from django.http import JsonResponse from django.shortcuts import render from django.urls 
import re_path as url from django.views.decorators.clickjacking import xframe_options_exempt from swh.web.config import get_config from swh.web.misc.metrics import prometheus_metrics from swh.web.utils import archive from swh.web.utils.exc import sentry_capture_exception def _jslicenses(request): jslicenses_file = finders.find("jssources/jslicenses.json") jslicenses_data = json.load(open(jslicenses_file)) jslicenses_data = sorted( jslicenses_data.items(), key=lambda item: item[0].split("/")[-1] ) return render(request, "misc/jslicenses.html", {"jslicenses_data": jslicenses_data}) def _stat_counters(request): stat_counters = archive.stat_counters() url = get_config()["history_counters_url"] stat_counters_history = {} try: response = requests.get(url, timeout=5) stat_counters_history = json.loads(response.text) except Exception as exc: sentry_capture_exception(exc) counters = { "stat_counters": stat_counters, "stat_counters_history": stat_counters_history, } return JsonResponse(counters) @xframe_options_exempt def hiring_banner(request): lang = request.GET.get("lang") return render( request, "misc/hiring-banner-iframe.html", { "lang": lang if lang else "en", }, ) urlpatterns = [ - url(r"^", include("swh.web.misc.coverage")), url(r"^jslicenses/$", _jslicenses, name="jslicenses"), url(r"^stat_counters/$", _stat_counters, name="stat-counters"), url(r"^metrics/prometheus/$", prometheus_metrics, name="metrics-prometheus"), url(r"^", include("swh.web.misc.fundraising")), url(r"^hiring/banner/$", hiring_banner, name="swh-hiring-banner"), ] # when running end to end tests through cypress, declare some extra # endpoints to provide input data for some of those tests if get_config()["e2e_tests_mode"]: from swh.web.tests.views import ( get_content_code_data_all_exts, get_content_code_data_all_filenames, get_content_code_data_by_ext, get_content_code_data_by_filename, get_content_other_data_by_ext, ) urlpatterns.append( url( r"^tests/data/content/code/extension/(?P.+)/$", 
get_content_code_data_by_ext, name="tests-content-code-extension", ) ) urlpatterns.append( url( r"^tests/data/content/other/extension/(?P.+)/$", get_content_other_data_by_ext, name="tests-content-other-extension", ) ) urlpatterns.append( url( r"^tests/data/content/code/extensions/$", get_content_code_data_all_exts, name="tests-content-code-extensions", ) ) urlpatterns.append( url( r"^tests/data/content/code/filename/(?P.+)/$", get_content_code_data_by_filename, name="tests-content-code-filename", ) ) urlpatterns.append( url( r"^tests/data/content/code/filenames/$", get_content_code_data_all_filenames, name="tests-content-code-filenames", ) ) diff --git a/swh/web/templates/homepage.html b/swh/web/templates/homepage.html index d49f8411..c6ad2def 100644 --- a/swh/web/templates/homepage.html +++ b/swh/web/templates/homepage.html @@ -1,113 +1,117 @@ {% extends "layout.html" %} {% comment %} Copyright (C) 2017-2020 The Software Heritage developers See the AUTHORS file at the top-level directory of this distribution License: GNU Affero General Public License version 3, or any later version See top-level LICENSE file for more information {% endcomment %} {% load static %} {% load render_bundle from webpack_loader %} {% block header %} {% render_bundle 'browse' %} {% endblock %} {% block title %}Welcome to the Software Heritage archive{% endblock %} {% block navbar-content %}

Welcome to the Software Heritage archive

{% endblock %} {% block content %}

... or check our Web API

Overview

The long-term goal of the Software Heritage initiative is to collect all publicly available software in source code form together with its development history, replicate it massively to ensure its preservation, and share it with everyone who needs it. The Software Heritage archive is growing over time as we crawl new source code from software projects and development forges.

-
-

Content

+{% if "swh.web.archive_coverage" in SWH_DJANGO_APPS %} - -
+
+

Content

+ + +
+ +{% endif %}

Size

As of today the archive already contains and keeps safe for you the following number of objects:

Source files
0
Commits
0
Projects
0
Directories
0
Authors
0
Releases
0

Note: the counters and graphs above are based on heuristics that might not reflect the exact size of the archive. While the long-term trends shown and ballpark figures are reliable, individual point-in-time values might not be.

{% endblock %} diff --git a/swh/web/tests/archive_coverage/__init__.py b/swh/web/tests/archive_coverage/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/swh/web/tests/archive_coverage/test_app.py b/swh/web/tests/archive_coverage/test_app.py new file mode 100644 index 00000000..543d2ab9 --- /dev/null +++ b/swh/web/tests/archive_coverage/test_app.py @@ -0,0 +1,33 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU Affero General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import pytest + +from django.urls import get_resolver + +from swh.web.archive_coverage.urls import urlpatterns +from swh.web.tests.django_asserts import assert_not_contains +from swh.web.tests.helpers import check_html_get_response +from swh.web.utils import reverse + + +@pytest.mark.django_db +def test_archive_coverage_deactivate(client, django_settings): + """Check archive coverage feature is deactivated when the swh.web.archive_coverage + django application is not in installed apps.""" + + django_settings.SWH_DJANGO_APPS = [ + app + for app in django_settings.SWH_DJANGO_APPS + if app != "swh.web.archive_coverage" + ] + + url = reverse("swh-web-homepage") + resp = check_html_get_response(client, url, status_code=200) + assert_not_contains(resp, "swh-coverage-iframe") + + archive_coverage_view_names = set(urlpattern.name for urlpattern in urlpatterns) + all_view_names = set(get_resolver().reverse_dict.keys()) + assert archive_coverage_view_names & all_view_names == set() diff --git a/swh/web/tests/misc/test_coverage.py b/swh/web/tests/archive_coverage/test_coverage.py similarity index 92% rename from swh/web/tests/misc/test_coverage.py rename to swh/web/tests/archive_coverage/test_coverage.py index 9090a43d..f601b96f 100644 --- a/swh/web/tests/misc/test_coverage.py +++ b/swh/web/tests/archive_coverage/test_coverage.py @@ -1,261 
+1,265 @@ # Copyright (C) 2021-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information import copy from datetime import datetime, timezone from itertools import chain import os from random import choices, randint import uuid import pytest from django.conf import settings from django.utils.html import escape from swh.scheduler.model import LastVisitStatus, ListedOrigin, OriginVisitStats +from swh.web.archive_coverage.views import ( + deposited_origins, + legacy_origins, + listed_origins, +) from swh.web.config import SWH_WEB_SERVER_NAME -from swh.web.misc.coverage import deposited_origins, legacy_origins, listed_origins from swh.web.tests.django_asserts import assert_contains, assert_not_contains from swh.web.tests.helpers import check_html_get_response, check_http_get_response from swh.web.utils import reverse def test_coverage_view_no_metrics(client, swh_scheduler): """ Check coverage view can be rendered when scheduler metrics and deposits data are not available. """ url = reverse("swh-coverage") check_html_get_response( - client, url, status_code=200, template_used="misc/coverage.html" + client, url, status_code=200, template_used="archive-coverage.html" ) visit_types = ["git", "hg", "svn", "bzr", "cvs"] @pytest.fixture def archive_coverage_data(mocker, swh_scheduler): """Generate some sample scheduler metrics and some sample deposits that will be consumed by the archive coverage view. 
""" # mock calls to get nixguix origin counts - mock_archive = mocker.patch("swh.web.misc.coverage.archive") + mock_archive = mocker.patch("swh.web.archive_coverage.views.archive") mock_archive.lookup_latest_origin_snapshot.return_value = {"id": "some-snapshot"} mock_archive.lookup_snapshot_sizes.return_value = {"release": 30095} listers = [] for origins in listed_origins["origins"]: # create some instances for each lister for instance in range(randint(1, 5)): lister = swh_scheduler.get_or_create_lister( origins["type"], f"instance-{instance}" ) listers.append(lister) # record some sample listed origins _origins = [] origin_visit_stats = [] for i, visit_type in enumerate(visit_types): url = str(uuid.uuid4()) _origins.append( ListedOrigin( lister_id=lister.id, url=url, visit_type=visit_type, extra_loader_arguments={}, ) ) # set origin visit stats to some origins if i % 2 == 0: now = datetime.now(tz=timezone.utc) origin_visit_stats.append( OriginVisitStats( url=url, visit_type=visit_type, last_successful=now, last_visit=now, last_visit_status=LastVisitStatus.successful, last_snapshot=os.urandom(20), ) ) # send origins data to scheduler swh_scheduler.record_listed_origins(_origins) swh_scheduler.origin_visit_stats_upsert(origin_visit_stats) # compute scheduler metrics swh_scheduler.update_metrics() # add some sample deposits deposits = [] for origins in deposited_origins["origins"]: for _ in range(randint(2, 10)): deposits.append( { "origin_url": f"https://{origins['search_pattern']}/{uuid.uuid4()}", "status": "done", } ) - get_deposits_list = mocker.patch("swh.web.misc.coverage.get_deposits_list") + get_deposits_list = mocker.patch("swh.web.archive_coverage.views.get_deposits_list") get_deposits_list.return_value = deposits def test_coverage_view_with_metrics(client, archive_coverage_data): # check view gets rendered without errors url = reverse("swh-coverage") resp = check_html_get_response( - client, url, status_code=200, template_used="misc/coverage.html" + 
client, url, status_code=200, template_used="archive-coverage.html" ) # check logos and origins search links are present in the rendered page for origins in chain( listed_origins["origins"], legacy_origins["origins"], deposited_origins["origins"], ): logo_url = f'{settings.STATIC_URL}img/logos/{origins["type"].lower()}.png' assert_contains(resp, f'src="{logo_url}"') origin_visit_types = set() if "instances" in origins: for visit_types_ in origins["instances"].values(): origin_visit_types.update(visit_types_.keys()) for data in visit_types_.values(): if data["count"] and data["search_url"]: assert_contains(resp, f'{visit_type}") # check request as in production with cache enabled check_http_get_response( client, url, status_code=200, server_name=SWH_WEB_SERVER_NAME ) def test_coverage_view_with_focus(client, archive_coverage_data): origins = ( listed_origins["origins"] + legacy_origins["origins"] + deposited_origins["origins"] ) focus = choices([o["type"] for o in origins], k=randint(1, 3)) # check view gets rendered without errors url = reverse("swh-coverage", query_params={"focus": ",".join(focus)}) resp = check_html_get_response( - client, url, status_code=200, template_used="misc/coverage.html" + client, url, status_code=200, template_used="archive-coverage.html" ) # check focused elements assert_contains( resp, "swh-coverage-focus", count=len([o for o in origins if o["type"] in focus]), ) # check bootstrap cards are expanded assert_contains( resp, 'class="collapse show"', count=len(origins), ) @pytest.fixture def archive_coverage_data_with_non_visited_origins(mocker, swh_scheduler): # mock calls to get nixguix origin counts - mock_archive = mocker.patch("swh.web.misc.coverage.archive") + mock_archive = mocker.patch("swh.web.archive_coverage.views.archive") mock_archive.lookup_latest_origin_snapshot.return_value = {"id": "some-snapshot"} mock_archive.lookup_snapshot_sizes.return_value = {"release": 30095} listers = [] for i, origins in 
enumerate(listed_origins["origins"]): # create one instances for each lister lister = swh_scheduler.get_or_create_lister( origins["type"], f"instance-{origins['type']}" ) listers.append(lister) if i % 2 == 1 or origins["type"] in ("guix", "nixos"): # do not declare origins for lister with odd index continue _origins = [] origin_visit_stats = [] for j, visit_type in enumerate(visit_types): url = str(uuid.uuid4()) _origins.append( ListedOrigin( lister_id=lister.id, url=url, visit_type=visit_type, extra_loader_arguments={}, ) ) # do not declare visit for visit type with even index if j % 2 != 0: now = datetime.now(tz=timezone.utc) origin_visit_stats.append( OriginVisitStats( url=url, visit_type=visit_type, last_successful=now, last_visit=now, last_visit_status=LastVisitStatus.successful, last_snapshot=os.urandom(20), ) ) # send origins data to scheduler swh_scheduler.record_listed_origins(_origins) swh_scheduler.origin_visit_stats_upsert(origin_visit_stats) # compute scheduler metrics swh_scheduler.update_metrics() # set deposit origins as empty - get_deposits_list = mocker.patch("swh.web.misc.coverage.get_deposits_list") + get_deposits_list = mocker.patch("swh.web.archive_coverage.views.get_deposits_list") get_deposits_list.return_value = [] def test_coverage_view_filter_out_non_visited_origins( client, archive_coverage_data_with_non_visited_origins ): origins = copy.copy(listed_origins) # check view gets rendered without errors url = reverse("swh-coverage") resp = check_html_get_response( client, url, status_code=200, - template_used="misc/coverage.html", + template_used="archive-coverage.html", server_name=SWH_WEB_SERVER_NAME, ) for i, origins in enumerate(origins["origins"]): if origins["type"] in ("guix", "nixos"): continue if i % 2 == 1: # counters for lister with odd index should not be displayed assert_not_contains(resp, f'id="{origins["type"]}"') else: # counters for lister with even index should be displayed assert_contains(resp, f'id="{origins["type"]}"') 
for j, visit_type in enumerate(visit_types): if j % 2 == 0: # counter for visit type with even index should be displayed assert_not_contains(resp, f'id="{origins["type"]}-{visit_type}"') else: # counter for visit type with odd index should not be displayed assert_contains(resp, f'id="{origins["type"]}-{visit_type}"')