From c16bcd571d382d9fbcce9e8a53d7ac419eeb0041 Mon Sep 17 00:00:00 2001
From: Antoine Lambert <anlambert@softwareheritage.org>
Date: Fri, 24 Jan 2025 15:57:14 +0100
Subject: [PATCH] cli: Add 'sentry extract-scheduler-tasks' command

Usage: swh sentry extract-scheduler-tasks [OPTIONS]

  Extract scheduler task parameters from events.

  This command allows to extract scheduler task parameters from Sentry events
  related to a Software Heritage scheduler task and dumps a CSV file to stdout
  that can be consumed by the CLI command:

  $ swh scheduler task schedule --columns type --columns kwargs <csv_file>.

Options:
  -u, --sentry-url TEXT           Sentry URL  [default:
                                  https://sentry.softwareheritage.org]
  -t, --sentry-token TEXT         Bearer token required to communicate with
                                  Sentry API (can also be provided in
                                  SENTRY_TOKEN environment variable)
                                  [required]
  -i, --sentry-issue-number TEXT  Sentry issue number to extract origin URLs
                                  from its events  [required]
  -e, --environment TEXT          Filter on environment: production or
                                  staging, both are selected by default
  -h, --help                      Show this message and exit.
---
 swh/core/cli/sentry.py                        | 44 ++++++++++++++
 .../api_0_issues_112726_events                | 57 +------------------
 .../api_0_issues_112726_events,full=true      | 56 ++++++++++++++++++
 .../data/sentry_expected_scheduler_tasks.csv  |  4 ++
 swh/core/tests/test_cli_sentry.py             | 27 ++++++++-
 5 files changed, 130 insertions(+), 58 deletions(-)
 mode change 100644 => 120000 swh/core/tests/data/https_sentry.softwareheritage.org/api_0_issues_112726_events
 create mode 100644 swh/core/tests/data/https_sentry.softwareheritage.org/api_0_issues_112726_events,full=true
 create mode 100644 swh/core/tests/data/sentry_expected_scheduler_tasks.csv

diff --git a/swh/core/cli/sentry.py b/swh/core/cli/sentry.py
index 41b15d7..290853a 100644
--- a/swh/core/cli/sentry.py
+++ b/swh/core/cli/sentry.py
@@ -64,6 +64,7 @@ def _process_sentry_events_pages(
     sentry_token,
     sentry_issue_number,
     events_page_process_callback: Callable[[List[Dict[str, Any]]], None],
+    full_sentry_responses: bool = False,
 ):
     import requests
 
@@ -71,6 +72,8 @@ def _process_sentry_events_pages(
     sentry_issue_events_url = (
         f"{sentry_api_base_url}/issues/{sentry_issue_number}/events/"
     )
+    if full_sentry_responses:
+        sentry_issue_events_url += "?full=true"
     while True:
         response = requests.get(
             sentry_issue_events_url, headers={"Authorization": f"Bearer {sentry_token}"}
@@ -108,3 +111,44 @@ def extract_origin_urls(sentry_url, sentry_token, sentry_issue_number, environme
 
     for origin_url in sorted(origin_urls):
         click.echo(origin_url)
+
+
+@sentry.command(name="extract-scheduler-tasks", context_settings=CONTEXT_SETTINGS)
+@common_options
+def extract_scheduler_tasks(sentry_url, sentry_token, sentry_issue_number, environment):
+    """Extract scheduler task parameters from events.
+
+    This command allows to extract scheduler task parameters from Sentry events related to
+    a Software Heritage scheduler task and dumps a CSV file to stdout that can be consumed
+    by the CLI command:
+
+    $ swh scheduler task schedule --columns type --columns kwargs <csv_file>.
+    """
+    import csv
+    import json
+    import sys
+
+    task_params = {}
+
+    def _extract_scheduler_tasks(events):
+        for event in events:
+            tags = {tag["key"]: tag["value"] for tag in event.get("tags", [])}
+            env_match = not environment or tags.get("environment") == environment
+            celery_job = event.get("context", {}).get("celery-job", {})
+            task_name = celery_job.get("task_name")
+            task_param = celery_job.get("kwargs")
+            if task_param and env_match:
+                # JSON-based key: kwargs values may be unhashable lists or dicts
+                key = (task_name, json.dumps(task_param, sort_keys=True))
+                task_params[key] = (task_name, task_param)
+
+    _process_sentry_events_pages(
+        sentry_url,
+        sentry_token,
+        sentry_issue_number,
+        _extract_scheduler_tasks,
+        full_sentry_responses=True,
+    )
+
+    tasks = sorted(task_params.values(), key=lambda p: p[1].get("url", ""))
+    csv.writer(sys.stdout).writerows((t, json.dumps(p)) for t, p in tasks)
diff --git a/swh/core/tests/data/https_sentry.softwareheritage.org/api_0_issues_112726_events b/swh/core/tests/data/https_sentry.softwareheritage.org/api_0_issues_112726_events
deleted file mode 100644
index c160a87..0000000
--- a/swh/core/tests/data/https_sentry.softwareheritage.org/api_0_issues_112726_events
+++ /dev/null
@@ -1,56 +0,0 @@
-[
-    {
-        "tags": [
-            {
-                "key": "environment",
-                "value": "production"
-            },
-            {
-                "key": "swh.loader.origin_url",
-                "value": "opam+https://opam.ocaml.org/packages/cgi/"
-            }
-        ],
-        "context": {
-            "celery-job": {
-                "args": [],
-                "kwargs": {
-                    "lister_instance_name": "opam.ocaml.org",
-                    "lister_name": "opam",
-                    "opam_instance": "opam.ocaml.org",
-                    "opam_package": "cgi",
-                    "opam_root": "/tmp/opam/",
-                    "opam_url": "https://opam.ocaml.org",
-                    "url": "opam+https://opam.ocaml.org/packages/cgi/"
-                },
-                "task_name": "swh.loader.package.opam.tasks.LoadOpam"
-            }
-        }
-    },
-    {
-        "tags": [
-            {
-                "key": "environment",
-                "value": "production"
-            },
-            {
-                "key": "swh.loader.origin_url",
-                "value": "opam+https://opam.ocaml.org/packages/combine/"
-            }
-        ],
-        "context": {
-            "celery-job": {
-                "args": [],
-                "kwargs": {
-                    "lister_instance_name": "opam.ocaml.org",
-                    "lister_name": "opam",
-                    "opam_instance": "opam.ocaml.org",
-                    "opam_package": "combine",
-                    "opam_root": "/tmp/opam/",
-                    "opam_url": "https://opam.ocaml.org",
-                    "url": "opam+https://opam.ocaml.org/packages/combine/"
-                },
-                "task_name": "swh.loader.package.opam.tasks.LoadOpam"
-            }
-        }
-    }
-]
\ No newline at end of file
diff --git a/swh/core/tests/data/https_sentry.softwareheritage.org/api_0_issues_112726_events b/swh/core/tests/data/https_sentry.softwareheritage.org/api_0_issues_112726_events
new file mode 120000
index 0000000..abf5175
--- /dev/null
+++ b/swh/core/tests/data/https_sentry.softwareheritage.org/api_0_issues_112726_events
@@ -0,0 +1 @@
+api_0_issues_112726_events,full=true
\ No newline at end of file
diff --git a/swh/core/tests/data/https_sentry.softwareheritage.org/api_0_issues_112726_events,full=true b/swh/core/tests/data/https_sentry.softwareheritage.org/api_0_issues_112726_events,full=true
new file mode 100644
index 0000000..c160a87
--- /dev/null
+++ b/swh/core/tests/data/https_sentry.softwareheritage.org/api_0_issues_112726_events,full=true
@@ -0,0 +1,56 @@
+[
+    {
+        "tags": [
+            {
+                "key": "environment",
+                "value": "production"
+            },
+            {
+                "key": "swh.loader.origin_url",
+                "value": "opam+https://opam.ocaml.org/packages/cgi/"
+            }
+        ],
+        "context": {
+            "celery-job": {
+                "args": [],
+                "kwargs": {
+                    "lister_instance_name": "opam.ocaml.org",
+                    "lister_name": "opam",
+                    "opam_instance": "opam.ocaml.org",
+                    "opam_package": "cgi",
+                    "opam_root": "/tmp/opam/",
+                    "opam_url": "https://opam.ocaml.org",
+                    "url": "opam+https://opam.ocaml.org/packages/cgi/"
+                },
+                "task_name": "swh.loader.package.opam.tasks.LoadOpam"
+            }
+        }
+    },
+    {
+        "tags": [
+            {
+                "key": "environment",
+                "value": "production"
+            },
+            {
+                "key": "swh.loader.origin_url",
+                "value": "opam+https://opam.ocaml.org/packages/combine/"
+            }
+        ],
+        "context": {
+            "celery-job": {
+                "args": [],
+                "kwargs": {
+                    "lister_instance_name": "opam.ocaml.org",
+                    "lister_name": "opam",
+                    "opam_instance": "opam.ocaml.org",
+                    "opam_package": "combine",
+                    "opam_root": "/tmp/opam/",
+                    "opam_url": "https://opam.ocaml.org",
+                    "url": "opam+https://opam.ocaml.org/packages/combine/"
+                },
+                "task_name": "swh.loader.package.opam.tasks.LoadOpam"
+            }
+        }
+    }
+]
\ No newline at end of file
diff --git a/swh/core/tests/data/sentry_expected_scheduler_tasks.csv b/swh/core/tests/data/sentry_expected_scheduler_tasks.csv
new file mode 100644
index 0000000..442ff82
--- /dev/null
+++ b/swh/core/tests/data/sentry_expected_scheduler_tasks.csv
@@ -0,0 +1,4 @@
+swh.loader.package.opam.tasks.LoadOpam,"{""lister_instance_name"": ""opam.ocaml.org"", ""lister_name"": ""opam"", ""opam_instance"": ""opam.ocaml.org"", ""opam_package"": ""bdd"", ""opam_root"": ""/tmp/opam/"", ""opam_url"": ""https://opam.ocaml.org"", ""url"": ""opam+https://opam.ocaml.org/packages/bdd/""}"
+swh.loader.package.opam.tasks.LoadOpam,"{""lister_instance_name"": ""opam.ocaml.org"", ""lister_name"": ""opam"", ""opam_instance"": ""opam.ocaml.org"", ""opam_package"": ""bitv"", ""opam_root"": ""/tmp/opam/"", ""opam_url"": ""https://opam.ocaml.org"", ""url"": ""opam+https://opam.ocaml.org/packages/bitv/""}"
+swh.loader.package.opam.tasks.LoadOpam,"{""lister_instance_name"": ""opam.ocaml.org"", ""lister_name"": ""opam"", ""opam_instance"": ""opam.ocaml.org"", ""opam_package"": ""cgi"", ""opam_root"": ""/tmp/opam/"", ""opam_url"": ""https://opam.ocaml.org"", ""url"": ""opam+https://opam.ocaml.org/packages/cgi/""}"
+swh.loader.package.opam.tasks.LoadOpam,"{""lister_instance_name"": ""opam.ocaml.org"", ""lister_name"": ""opam"", ""opam_instance"": ""opam.ocaml.org"", ""opam_package"": ""combine"", ""opam_root"": ""/tmp/opam/"", ""opam_url"": ""https://opam.ocaml.org"", ""url"": ""opam+https://opam.ocaml.org/packages/combine/""}"
\ No newline at end of file
diff --git a/swh/core/tests/test_cli_sentry.py b/swh/core/tests/test_cli_sentry.py
index c19953e..b837b24 100644
--- a/swh/core/tests/test_cli_sentry.py
+++ b/swh/core/tests/test_cli_sentry.py
@@ -3,6 +3,7 @@
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
+import os
 
 from click.testing import CliRunner
 import pytest
@@ -14,7 +15,7 @@ from swh.core.tests.test_cli import assert_result
 def response_context_callback(request, context):
     """Add link headers to mocked Sentry REST API responses"""
     base_url = f"{request.scheme}://{request.netloc}{request.path}"
-    if not request.query:
+    if "cursor" not in request.query:
         context.headers["Link"] = f'<{base_url}?cursor=0:100:0>; rel="next"'
     else:
         context.headers["Link"] = f'<{base_url}?cursor=0:200:0>; rel="next"'
@@ -24,6 +25,8 @@ requests_mock_sentry = requests_mock_datadir_factory(
     response_context_callback=response_context_callback
 )
 
+SENTRY_ISSUE_ID = "112726"
+
 
 @pytest.fixture
 def swhmain(swhmain):
@@ -36,7 +39,8 @@ def swhmain(swhmain):
 def test_sentry_extract_origin_urls(swhmain, requests_mock_sentry):
     runner = CliRunner()
     result = runner.invoke(
-        swhmain, ["sentry", "extract-origin-urls", "-t", "sentry-token", "-i", "112726"]
+        swhmain,
+        ["sentry", "extract-origin-urls", "-t", "sentry-token", "-i", SENTRY_ISSUE_ID],
     )
     assert_result(result)
     expected_output = """
@@ -46,3 +50,22 @@ opam+https://opam.ocaml.org/packages/cgi/
 opam+https://opam.ocaml.org/packages/combine/
 """
     assert result.output.strip() == expected_output.strip("\n")
+
+
+def test_sentry_extract_scheduler_tasks(swhmain, requests_mock_sentry, datadir):
+    """Check the CSV dumped to stdout against the expected tasks fixture."""
+    result = CliRunner().invoke(
+        swhmain,
+        [
+            "sentry",
+            "extract-scheduler-tasks",
+            "-t",
+            "sentry-token",
+            "-i",
+            SENTRY_ISSUE_ID,
+        ],
+    )
+    assert_result(result)
+    csv_tasks_file = os.path.join(datadir, "sentry_expected_scheduler_tasks.csv")
+    with open(csv_tasks_file, "r") as tasks_csv:
+        assert result.output.strip() == tasks_csv.read().strip()
-- 
GitLab