From 965be8f9ee16d5cd9edbaea58f0e2006553bf87d Mon Sep 17 00:00:00 2001
From: Antoine Lambert <anlambert@softwareheritage.org>
Date: Fri, 24 Jan 2025 11:50:24 +0100
Subject: [PATCH] cli: Add 'sentry extract-origin-urls' command

Usage: swh sentry extract-origin-urls [OPTIONS]

  Extract origin URLs from events.

  This command allows to extract origin URLs from Sentry events related to a
  Software Heritage loader and dumps them to stdout.

Options:
  -u, --sentry-url TEXT           Sentry URL  [default:
                                  https://sentry.softwareheritage.org]
  -t, --sentry-token TEXT         Bearer token required to communicate with
                                  Sentry API (can also be provided in
                                  SENTRY_TOKEN environment variable)
                                  [required]
  -i, --sentry-issue-number TEXT  Sentry issue number to extract origin URLs
                                  from its events  [required]
  -e, --environment TEXT          Filter on environment: production or
                                  staging, both are selected by default
  -h, --help                      Show this message and exit.
---
 pyproject.toml                                |   1 +
 swh/core/cli/sentry.py                        | 110 ++++++++++++++++++
 swh/core/pytest_plugin.py                     |  31 +++--
 .../api_0_issues_112726_events                |  56 +++++++++
 .../api_0_issues_112726_events,cursor=0:100:0 |  56 +++++++++
 .../api_0_issues_112726_events,cursor=0:200:0 |   1 +
 swh/core/tests/test_cli_sentry.py             |  48 ++++++++
 7 files changed, 295 insertions(+), 8 deletions(-)
 create mode 100644 swh/core/cli/sentry.py
 create mode 100644 swh/core/tests/data/https_sentry.softwareheritage.org/api_0_issues_112726_events
 create mode 100644 swh/core/tests/data/https_sentry.softwareheritage.org/api_0_issues_112726_events,cursor=0:100:0
 create mode 100644 swh/core/tests/data/https_sentry.softwareheritage.org/api_0_issues_112726_events,cursor=0:200:0
 create mode 100644 swh/core/tests/test_cli_sentry.py

diff --git a/pyproject.toml b/pyproject.toml
index cb13689..f81566c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -40,6 +40,7 @@ testing = {file = [
 [project.entry-points."swh.cli.subcommands"]
 "swh.core.db" = "swh.core.cli.db"
 "swh.core.backend" = "swh.core.cli.backend"
+"swh.core.sentry" = "swh.core.cli.sentry"
 
 [project.entry-points.pytest11]
 "pytest_swh_core" = "swh.core.pytest_plugin"
diff --git a/swh/core/cli/sentry.py b/swh/core/cli/sentry.py
new file mode 100644
index 0000000..41b15d7
--- /dev/null
+++ b/swh/core/cli/sentry.py
@@ -0,0 +1,110 @@
+#!/usr/bin/env python3
+# Copyright (C) 2025  The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from typing import Any, Callable, Dict, List
+
+import click
+
+from swh.core.cli import CONTEXT_SETTINGS
+from swh.core.cli import swh as swh_cli_group
+
+
+def common_options(func):  # decorator attaching the Sentry CLI options to a command
+    import functools
+
+    @click.option(
+        "--sentry-url",
+        "-u",
+        default="https://sentry.softwareheritage.org",
+        show_default=True,
+        help="Sentry URL",
+    )
+    @click.option(
+        "--sentry-token",
+        "-t",
+        default=None,
+        envvar="SENTRY_TOKEN",  # read from the environment when -t is not given
+        help=(
+            "Bearer token required to communicate with Sentry API (can also be provided "
+            "in SENTRY_TOKEN environment variable)"
+        ),
+        required=True,
+    )
+    @click.option(
+        "--sentry-issue-number",
+        "-i",
+        help="Sentry issue number to extract origin URLs from its events",
+        required=True,
+    )
+    @click.option(
+        "--environment",
+        "-e",
+        default="",  # empty filter: all environments are selected
+        help="Filter on environment: production or staging, both are selected by default",
+    )
+    @functools.wraps(func)  # keep func's name and docstring on the wrapper
+    def wrapper(*args, **kwargs):
+        return func(*args, **kwargs)
+
+    return wrapper
+
+
+@swh_cli_group.group(name="sentry", context_settings=CONTEXT_SETTINGS)
+def sentry():
+    """Software Heritage tools for extracting data from the events associated to
+    a Sentry issue using Sentry REST API."""
+    pass  # nothing to set up: subcommands are attached with @sentry.command
+
+
+def _process_sentry_events_pages(
+    sentry_url,
+    sentry_token,
+    sentry_issue_number,
+    events_page_process_callback: Callable[[List[Dict[str, Any]]], None],
+):
+    """Feed every non-empty page of events of a Sentry issue to the callback.
+
+    Raises requests.HTTPError when Sentry answers with an error status."""
+    import requests
+
+    headers = {"Authorization": f"Bearer {sentry_token}"}
+    url = f"{sentry_url.rstrip('/')}/api/0/issues/{sentry_issue_number}/events/"
+    while url:  # a missing "next" link means there is no further page
+        response = requests.get(url, headers=headers)
+        response.raise_for_status()  # do not parse an error payload as events
+        events = response.json()
+        if not events:
+            break
+        events_page_process_callback(events)
+        url = response.links.get("next", {}).get("url")
+
+
+@sentry.command(name="extract-origin-urls", context_settings=CONTEXT_SETTINGS)
+@common_options
+def extract_origin_urls(sentry_url, sentry_token, sentry_issue_number, environment):
+    """Extract origin URLs from events.
+
+    This command allows to extract origin URLs from Sentry events related to
+    a Software Heritage loader and dumps them to stdout."""
+    url_tag = "swh.loader.origin_url"
+    collected = set()  # set: each origin URL is reported once
+
+    def _collect_page(events: List[Dict[str, Any]]):
+        # keep the origin URL tag of each event of the page matching the filter
+        for event in events:
+            tags = {tag["key"]: tag["value"] for tag in event.get("tags", [])}
+            # substring test: the default empty filter matches any environment
+            selected = environment in tags.get("environment", "")
+            if selected and url_tag in tags:
+                collected.add(tags[url_tag])
+
+    _process_sentry_events_pages(
+        sentry_url, sentry_token, sentry_issue_number, _collect_page
+    )
+
+    # one URL per line, in sorted order
+    for origin_url in sorted(collected):
+        click.echo(origin_url)
diff --git a/swh/core/pytest_plugin.py b/swh/core/pytest_plugin.py
index a3d8332..0f431c1 100644
--- a/swh/core/pytest_plugin.py
+++ b/swh/core/pytest_plugin.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2019-2024  The Software Heritage developers
+# Copyright (C) 2019-2025  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -10,7 +10,7 @@ import logging
 from os import path
 from pathlib import Path
 import re
-from typing import Dict, List, Optional
+from typing import Callable, Dict, List, Optional
 from urllib.parse import unquote, urlparse
 
 import pytest
@@ -29,11 +29,12 @@ MAX_VISIT_FILES = 10
 
 
 def get_response_cb(
-    request: requests.Request,
+    request,
     context,
     datadir,
     ignore_urls: List[str] = [],
     visits: Optional[Dict] = None,
+    response_context_callback: Optional[Callable[[object, object], None]] = None,
 ):
     """Mount point callback to fetch on disk the request's content. The request
     urls provided are url decoded first to resolve the associated file on disk.
@@ -72,14 +73,16 @@ def get_response_cb(
         datadir/http_nowhere.com/path_to_resource,a=b,c=d
 
     Args:
-        request: Object requests
-        context (requests.Context): Object holding response metadata
-            information (status_code, headers, etc...)
+        request: input HTTP request
+        context: Object holding response metadata information (status_code, headers, etc...)
         datadir: Data files path
         ignore_urls: urls whose status response should be 404 even if the local
             file exists
         visits: Dict of url, number of visits. If None, disable multi visit
             support (default)
+        response_context_callback: Optional callback function taking request and
+            response context object (having headers, status_code, reason and cookies
+            as attributes) as parameters for extra processing
 
     Returns:
         Optional[FileDescriptor] on disk file to read from the test context
@@ -117,6 +120,8 @@ def get_response_cb(
         return None
     fd = open(filepath, "rb")
     context.headers["content-length"] = str(path.getsize(filepath))
+    if response_context_callback:
+        response_context_callback(request, context)
     return fd
 
 
@@ -143,7 +148,9 @@ def datadir(request: pytest.FixtureRequest) -> str:
 
 
 def requests_mock_datadir_factory(
-    ignore_urls: List[str] = [], has_multi_visit: bool = False
+    ignore_urls: List[str] = [],
+    has_multi_visit: bool = False,
+    response_context_callback: Optional[Callable[[object, object], None]] = None,
 ):
     """This factory generates fixtures which allow to look for files on the
     local filesystem based on the requested URL, using the following rules:
@@ -176,13 +183,21 @@ def requests_mock_datadir_factory(
         ignore_urls: List of urls to always returns 404 (whether file
             exists or not)
         has_multi_visit: Activate or not the multiple visits behavior
+        response_context_callback: Optional callback function taking request and
+            response context object (having headers, status_code, reason and cookies
+            as attributes) as parameters for extra processing
 
     """
 
     @pytest.fixture
     def requests_mock_datadir(requests_mock, datadir):
         if not has_multi_visit:
-            cb = partial(get_response_cb, ignore_urls=ignore_urls, datadir=datadir)
+            cb = partial(
+                get_response_cb,
+                ignore_urls=ignore_urls,
+                datadir=datadir,
+                response_context_callback=response_context_callback,
+            )
             requests_mock.get(re.compile("https?://"), body=cb)
         else:
             visits = {}
diff --git a/swh/core/tests/data/https_sentry.softwareheritage.org/api_0_issues_112726_events b/swh/core/tests/data/https_sentry.softwareheritage.org/api_0_issues_112726_events
new file mode 100644
index 0000000..c160a87
--- /dev/null
+++ b/swh/core/tests/data/https_sentry.softwareheritage.org/api_0_issues_112726_events
@@ -0,0 +1,56 @@
+[
+    {
+        "tags": [
+            {
+                "key": "environment",
+                "value": "production"
+            },
+            {
+                "key": "swh.loader.origin_url",
+                "value": "opam+https://opam.ocaml.org/packages/cgi/"
+            }
+        ],
+        "context": {
+            "celery-job": {
+                "args": [],
+                "kwargs": {
+                    "lister_instance_name": "opam.ocaml.org",
+                    "lister_name": "opam",
+                    "opam_instance": "opam.ocaml.org",
+                    "opam_package": "cgi",
+                    "opam_root": "/tmp/opam/",
+                    "opam_url": "https://opam.ocaml.org",
+                    "url": "opam+https://opam.ocaml.org/packages/cgi/"
+                },
+                "task_name": "swh.loader.package.opam.tasks.LoadOpam"
+            }
+        }
+    },
+    {
+        "tags": [
+            {
+                "key": "environment",
+                "value": "production"
+            },
+            {
+                "key": "swh.loader.origin_url",
+                "value": "opam+https://opam.ocaml.org/packages/combine/"
+            }
+        ],
+        "context": {
+            "celery-job": {
+                "args": [],
+                "kwargs": {
+                    "lister_instance_name": "opam.ocaml.org",
+                    "lister_name": "opam",
+                    "opam_instance": "opam.ocaml.org",
+                    "opam_package": "combine",
+                    "opam_root": "/tmp/opam/",
+                    "opam_url": "https://opam.ocaml.org",
+                    "url": "opam+https://opam.ocaml.org/packages/combine/"
+                },
+                "task_name": "swh.loader.package.opam.tasks.LoadOpam"
+            }
+        }
+    }
+]
\ No newline at end of file
diff --git a/swh/core/tests/data/https_sentry.softwareheritage.org/api_0_issues_112726_events,cursor=0:100:0 b/swh/core/tests/data/https_sentry.softwareheritage.org/api_0_issues_112726_events,cursor=0:100:0
new file mode 100644
index 0000000..a24042e
--- /dev/null
+++ b/swh/core/tests/data/https_sentry.softwareheritage.org/api_0_issues_112726_events,cursor=0:100:0
@@ -0,0 +1,56 @@
+[
+    {
+        "tags": [
+            {
+                "key": "environment",
+                "value": "production"
+            },
+            {
+                "key": "swh.loader.origin_url",
+                "value": "opam+https://opam.ocaml.org/packages/bitv/"
+            }
+        ],
+        "context": {
+            "celery-job": {
+                "args": [],
+                "kwargs": {
+                    "lister_instance_name": "opam.ocaml.org",
+                    "lister_name": "opam",
+                    "opam_instance": "opam.ocaml.org",
+                    "opam_package": "bitv",
+                    "opam_root": "/tmp/opam/",
+                    "opam_url": "https://opam.ocaml.org",
+                    "url": "opam+https://opam.ocaml.org/packages/bitv/"
+                },
+                "task_name": "swh.loader.package.opam.tasks.LoadOpam"
+            }
+        }
+    },
+    {
+        "tags": [
+            {
+                "key": "environment",
+                "value": "production"
+            },
+            {
+                "key": "swh.loader.origin_url",
+                "value": "opam+https://opam.ocaml.org/packages/bdd/"
+            }
+        ],
+        "context": {
+            "celery-job": {
+                "args": [],
+                "kwargs": {
+                    "lister_instance_name": "opam.ocaml.org",
+                    "lister_name": "opam",
+                    "opam_instance": "opam.ocaml.org",
+                    "opam_package": "bdd",
+                    "opam_root": "/tmp/opam/",
+                    "opam_url": "https://opam.ocaml.org",
+                    "url": "opam+https://opam.ocaml.org/packages/bdd/"
+                },
+                "task_name": "swh.loader.package.opam.tasks.LoadOpam"
+            }
+        }
+    }
+]
\ No newline at end of file
diff --git a/swh/core/tests/data/https_sentry.softwareheritage.org/api_0_issues_112726_events,cursor=0:200:0 b/swh/core/tests/data/https_sentry.softwareheritage.org/api_0_issues_112726_events,cursor=0:200:0
new file mode 100644
index 0000000..0637a08
--- /dev/null
+++ b/swh/core/tests/data/https_sentry.softwareheritage.org/api_0_issues_112726_events,cursor=0:200:0
@@ -0,0 +1 @@
+[]
\ No newline at end of file
diff --git a/swh/core/tests/test_cli_sentry.py b/swh/core/tests/test_cli_sentry.py
new file mode 100644
index 0000000..c19953e
--- /dev/null
+++ b/swh/core/tests/test_cli_sentry.py
@@ -0,0 +1,48 @@
+# Copyright (C) 2025  The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+from click.testing import CliRunner
+import pytest
+
+from swh.core.pytest_plugin import requests_mock_datadir_factory
+from swh.core.tests.test_cli import assert_result
+
+
+def response_context_callback(request, context):
+    """Add a "next" Link header to mocked Sentry REST API responses.
+
+    The first page (no query string) links to cursor 0:100:0, others to 0:200:0."""
+    cursor = "0:200:0" if request.query else "0:100:0"
+    next_url = f"{request.scheme}://{request.netloc}{request.path}?cursor={cursor}"
+    context.headers["Link"] = f'<{next_url}>; rel="next"'
+
+
+requests_mock_sentry = requests_mock_datadir_factory(
+    response_context_callback=response_context_callback  # adds "next" Link headers
+)
+
+
+@pytest.fixture
+def swhmain(swhmain):  # extends the swhmain fixture of the same name
+    from swh.core.cli.sentry import sentry as swhsentry
+
+    swhmain.add_command(swhsentry)  # register the "sentry" command group
+    return swhmain
+
+
+def test_sentry_extract_origin_urls(swhmain, requests_mock_sentry):
+    """The origin URLs found in all mocked event pages are printed in sorted order."""
+    cli_args = ["sentry", "extract-origin-urls", "-t", "sentry-token", "-i", "112726"]
+    # two pages of two events each, then an empty page ends the pagination
+    result = CliRunner().invoke(swhmain, cli_args)
+    assert_result(result)
+    expected_urls = [
+        "opam+https://opam.ocaml.org/packages/bdd/",
+        "opam+https://opam.ocaml.org/packages/bitv/",
+        "opam+https://opam.ocaml.org/packages/cgi/",
+        "opam+https://opam.ocaml.org/packages/combine/",
+    ]
+    assert result.output.strip() == "\n".join(expected_urls)
-- 
GitLab