@swh_cli_group.group(name="web", context_settings=CONTEXT_SETTINGS)
@click.pass_context
def web(ctx: Context):
    """Software Heritage web client"""
    # The import is done inside the function (not at module level) to keep
    # the CLI startup time under control.
    from swh.web.client.client import WebAPIClient

    # Make sure the context carries a dict before storing anything in it.
    ctx.ensure_object(dict)

    # TODO (T2872): add configuration file for the web client
    api_client = WebAPIClient()

    # Subcommands of this group read the client back from ctx.obj["client"].
    ctx.obj["client"] = api_client
def origin_search(
    self,
    query: str,
    limit: Optional[int] = None,
    with_visit: bool = False,
    **req_args,
) -> Iterator[Dict[str, Any]]:
    """List origin search results

    Results are fetched page by page and yielded lazily; following pages are
    only requested while the caller keeps iterating.

    Args:
        query: search keywords
        limit: the maximum number of found origins to return; ``None`` (the
            default) means no client-side cap. A value of zero or less
            yields nothing and performs no request.
        with_visit: if true, only return origins with at least one visit
        req_args: extra keyword arguments forwarded to ``self._call``

    Returns:
        an iterator over search results

    Raises:
        requests.HTTPError: if HTTP request fails

    """
    # A non-positive cap can never be satisfied by any result: stop right
    # away instead of treating 0 as "unlimited" (which a bare truthiness
    # test on ``limit`` would do).
    if limit is not None and limit <= 0:
        return

    params = []
    if limit is not None:
        params.append(("limit", limit))
    if with_visit:
        params.append(("with_visit", True))

    nb_returned = 0
    q = f"origin/search/{query}/"
    while True:
        r = self._call(q, params=params, **req_args)
        page = r.json()

        # Trim the page so that the total never exceeds the requested cap.
        if limit is not None and nb_returned + len(page) > limit:
            page = page[: limit - nb_returned]

        nb_returned += len(page)
        yield from page

        if limit is not None and nb_returned >= limit:
            return

        if "next" in r.links and "url" in r.links["next"]:
            # Follow the pagination link as-is; the initial parameters are
            # dropped (presumably the link already embeds them -- confirm
            # against the API).
            params = []
            q = r.links["next"]["url"]
        else:
            return
"snapshot_url": "https://archive.softwareheritage.org/api/1/snapshot/100de51846f317e6ab48da79d985cefa6fdefe42/" } +] + """, # NoQA: E501 # NoQA: E501 + "origin/search/foo%20bar%20baz%20qux/?with_visit=true": r""" +[ + { + "url": "https://github.com/foo-bar-baz-qux/mygithubpage", + "origin_visits_url": "https://archive.softwareheritage.org/api/1/origin/https://github.com/foo-bar-baz-qux/mygithubpage/visits/" + }, + { + "url": "https://github.com/foo-bar-baz-qux/foo-bar-baz-qux.github.io", + "origin_visits_url": "https://archive.softwareheritage.org/api/1/origin/https://github.com/foo-bar-baz-qux/foo-bar-baz-qux.github.io/visits/" + }, + { + "url": "https://github.com/tunnckoCore/foo-bar-baz-qux", + "origin_visits_url": "https://archive.softwareheritage.org/api/1/origin/https://github.com/tunnckoCore/foo-bar-baz-qux/visits/" + }, + { + "url": "https://github.com/foo-bar-baz-qux/aml-project", + "origin_visits_url": "https://archive.softwareheritage.org/api/1/origin/https://github.com/foo-bar-baz-qux/aml-project/visits/" + }, + { + "url": "https://github.com/foo-bar-baz-qux/ci_test", + "origin_visits_url": "https://archive.softwareheritage.org/api/1/origin/https://github.com/foo-bar-baz-qux/ci_test/visits/" + }, + { + "url": "https://github.com/foo-bar-baz-qux/extreme-computing", + "origin_visits_url": "https://archive.softwareheritage.org/api/1/origin/https://github.com/foo-bar-baz-qux/extreme-computing/visits/" + }, + { + "url": "https://github.com/foo-bar-baz-qux/cs205-homework", + "origin_visits_url": "https://archive.softwareheritage.org/api/1/origin/https://github.com/foo-bar-baz-qux/cs205-homework/visits/" + }, + { + "url": "https://github.com/foo-bar-baz-qux/rstan", + "origin_visits_url": "https://archive.softwareheritage.org/api/1/origin/https://github.com/foo-bar-baz-qux/rstan/visits/" + }, + { + "url": "https://github.com/foo-bar-baz-qux/cs207", + "origin_visits_url": 
"https://archive.softwareheritage.org/api/1/origin/https://github.com/foo-bar-baz-qux/cs207/visits/" + }, + { + "url": "https://github.com/foo-bar-baz-qux/2015lab1", + "origin_visits_url": "https://archive.softwareheritage.org/api/1/origin/https://github.com/foo-bar-baz-qux/2015lab1/visits/" + }, + { + "url": "https://github.com/foo-bar-baz-qux/stan", + "origin_visits_url": "https://archive.softwareheritage.org/api/1/origin/https://github.com/foo-bar-baz-qux/stan/visits/" + }, + { + "url": "https://github.com/foo-bar-baz-qux/atom-script", + "origin_visits_url": "https://archive.softwareheritage.org/api/1/origin/https://github.com/foo-bar-baz-qux/atom-script/visits/" + }, + { + "url": "https://github.com/foobarbazquxquux/gordon-test", + "origin_visits_url": "https://archive.softwareheritage.org/api/1/origin/https://github.com/foobarbazquxquux/gordon-test/visits/" + }, + { + "url": "https://github.com/foobarbazquxquux/helios", + "origin_visits_url": "https://archive.softwareheritage.org/api/1/origin/https://github.com/foobarbazquxquux/helios/visits/" + }, + { + "url": "https://github.com/foo-bar-baz-qux/2016", + "origin_visits_url": "https://archive.softwareheritage.org/api/1/origin/https://github.com/foo-bar-baz-qux/2016/visits/" + }, + { + "url": "https://github.com/foo-bar-baz-qux/devtools", + "origin_visits_url": "https://archive.softwareheritage.org/api/1/origin/https://github.com/foo-bar-baz-qux/devtools/visits/" + }, + { + "url": "https://github.com/foo-bar-baz-qux/dplyr", + "origin_visits_url": "https://archive.softwareheritage.org/api/1/origin/https://github.com/foo-bar-baz-qux/dplyr/visits/" + }, + { + "url": "https://github.com/foo-bar-baz-qux/paletter", + "origin_visits_url": "https://archive.softwareheritage.org/api/1/origin/https://github.com/foo-bar-baz-qux/paletter/visits/" + }, + { + "url": "https://github.com/foo-bar-baz-qux/ggplot2", + "origin_visits_url": 
"https://archive.softwareheritage.org/api/1/origin/https://github.com/foo-bar-baz-qux/ggplot2/visits/" + }, + { + "url": "https://github.com/foo-bar-baz-qux/stm", + "origin_visits_url": "https://archive.softwareheritage.org/api/1/origin/https://github.com/foo-bar-baz-qux/stm/visits/" + }, + { + "url": "https://github.com/foo-bar-baz-qux/concept-to-clinic", + "origin_visits_url": "https://archive.softwareheritage.org/api/1/origin/https://github.com/foo-bar-baz-qux/concept-to-clinic/visits/" + }, + { + "url": "https://github.com/foo-bar-baz-qux/scales", + "origin_visits_url": "https://archive.softwareheritage.org/api/1/origin/https://github.com/foo-bar-baz-qux/scales/visits/" + }, + { + "url": "https://www.npmjs.com/package/foo-bar-baz-qux", + "origin_visits_url": "https://archive.softwareheritage.org/api/1/origin/https://www.npmjs.com/package/foo-bar-baz-qux/visits/" + }, + { + "url": "https://bitbucket.org/foobarbazqux/rp.git", + "origin_visits_url": "https://archive.softwareheritage.org/api/1/origin/https://bitbucket.org/foobarbazqux/rp.git/visits/" + }, + { + "url": "https://github.com/foo-bar-baz-qux/knowledge-repo", + "origin_visits_url": "https://archive.softwareheritage.org/api/1/origin/https://github.com/foo-bar-baz-qux/knowledge-repo/visits/" + }, + { + "url": "https://github.com/foo-bar-baz-qux/EconML", + "origin_visits_url": "https://archive.softwareheritage.org/api/1/origin/https://github.com/foo-bar-baz-qux/EconML/visits/" + } +] + """, # NoQA: E501 # NoQA: E501 + "origin/search/python/?limit=5": r""" +[ + { + "url": "https://github.com/neon670/python.dev", + "origin_visits_url": "https://archive.softwareheritage.org/api/1/origin/https://github.com/neon670/python.dev/visits/" + }, + { + "url": "https://github.com/aur-archive/python-werkzeug", + "origin_visits_url": "https://archive.softwareheritage.org/api/1/origin/https://github.com/aur-archive/python-werkzeug/visits/" + }, + { + "url": "https://github.com/jsagon/jtradutor-web-python", + 
"origin_visits_url": "https://archive.softwareheritage.org/api/1/origin/https://github.com/jsagon/jtradutor-web-python/visits/" + }, + { + "url": "https://github.com/zjmwqx/ipythonCode", + "origin_visits_url": "https://archive.softwareheritage.org/api/1/origin/https://github.com/zjmwqx/ipythonCode/visits/" + }, + { + "url": "https://github.com/knutab/Python-BSM", + "origin_visits_url": "https://archive.softwareheritage.org/api/1/origin/https://github.com/knutab/Python-BSM/visits/" + } ] """, # NoQA: E501 # NoQA: E501 } diff --git a/swh/web/client/tests/gen-api-data.sh b/swh/web/client/tests/gen-api-data.sh --- a/swh/web/client/tests/gen-api-data.sh +++ b/swh/web/client/tests/gen-api-data.sh @@ -20,6 +20,8 @@ urls="${urls} snapshot/cabcc7d7bf639bbe1cc3b41989e1806618dd5764/?branches_count=1000&branches_from=refs/tags/v3.0-rc7" urls="${urls} origin/https://github.com/NixOS/nixpkgs/visits/?last_visit=50&per_page=10" urls="${urls} origin/https://github.com/NixOS/nixpkgs/visits/?last_visit=40&per_page=10" +urls="${urls} origin/search/foo%20bar%20baz%20qux/?with_visit=true" +urls="${urls} origin/search/python/?limit=5" echo "# GENERATED FILE, DO NOT EDIT." echo "# Run './gen-api-data.sh > api_data.py' instead." 
def test_origin_search(web_api_client, web_api_mock):
    """Check the result cap and a sample of the results of origin_search."""
    capped = list(web_api_client.origin_search("python", limit=5))
    assert len(capped) == 5

    found = list(web_api_client.origin_search("foo bar baz qux", with_visit=True))
    found_urls = [entry["url"] for entry in found]
    found_visit_urls = [entry["origin_visits_url"] for entry in found]

    # Check *some* of the URLS since the search could return more results in the future
    # Maps an expected origin URL to its expected visits URL.
    sample = {
        "https://github.com/foo-bar-baz-qux/mygithubpage": "https://archive.softwareheritage.org/api/1/origin/https://github.com/foo-bar-baz-qux/mygithubpage/visits/",  # NoQA: E501
        "https://www.npmjs.com/package/foo-bar-baz-qux": "https://archive.softwareheritage.org/api/1/origin/https://www.npmjs.com/package/foo-bar-baz-qux/visits/",  # NoQA: E501
        "https://bitbucket.org/foobarbazqux/rp.git": "https://archive.softwareheritage.org/api/1/origin/https://bitbucket.org/foobarbazqux/rp.git/visits/",  # NoQA: E501
    }
    for origin_url, visits_url in sample.items():
        assert origin_url in found_urls
        assert visits_url in found_visit_urls