From b8db76314aee375af353763e6466541d7d1627a3 Mon Sep 17 00:00:00 2001
From: "Antoine R. Dumont (@ardumont)" <ardumont@softwareheritage.org>
Date: Tue, 18 Mar 2025 17:36:48 +0100
Subject: [PATCH 1/4] provenance/requirements: Add awscli

Refs. swh/infra/sysadm-environment#5608
---
 apps/swh-provenance/requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/apps/swh-provenance/requirements.txt b/apps/swh-provenance/requirements.txt
index 1a035b4e8..8c6278249 100644
--- a/apps/swh-provenance/requirements.txt
+++ b/apps/swh-provenance/requirements.txt
@@ -1,3 +1,4 @@
 swh.provenance
 python-json-logger
 gunicorn
+awscli
-- 
GitLab


From c03f9c6db62b19493e8465db94bca25729a15f0f Mon Sep 17 00:00:00 2001
From: "Antoine R. Dumont (@ardumont)" <ardumont@softwareheritage.org>
Date: Tue, 18 Mar 2025 18:13:20 +0100
Subject: [PATCH 2/4] provenance/Dockerfile: Evolve to compile the rust
 provenance crate

```
root@d79ab1e18df7:/opt/swh# swh-provenance-
swh-provenance-gen-test-database  swh-provenance-grpc-serve         swh-provenance-index
root@d79ab1e18df7:/opt/swh# swh-provenance-index --help
Builds .ef indexes for extra quick querying of the Software Heritage Provenance Index

Usage: swh-provenance-index [OPTIONS] --database <DATABASE>

Options:
      --database <DATABASE>        Path to the provenance database
      --indexes <INDEXES>          Path to the directory where to write paths to. Defaults to `--database` (when it is a file:// URL)
      --statsd-host <STATSD_HOST>  Defaults to `localhost:8125` (or whatever is configured by the `STATSD_HOST` and `STATSD_PORT` environment variables)
  -h, --help                       Print help
root@d79ab1e18df7:/opt/swh# swh-provenance-grpc-serve --help
gRPC server for the Software Heritage Provenance Index

Usage: swh-provenance-grpc-serve [OPTIONS] --graph <GRAPH> --database <DATABASE>

Options:
      --cache-parquet                Keep Parquet metadata in RAM between queries, instead of re-parsing them every time
      --graph-format <GRAPH_FORMAT>  [default: webgraph] [possible values: webgraph, json]
      --graph <GRAPH>                Path to the graph prefix
      --database <DATABASE>          Path to the provenance database
      --indexes <INDEXES>            Path to Elias-Fano indexes, default to `--database` (when it is a file:// URL)
      --bind <BIND>                  [default: [::]:50141]
      --statsd-host <STATSD_HOST>    Defaults to `localhost:8125` (or whatever is configured by the `STATSD_HOST` and `STATSD_PORT` environment variables)
  -h, --help                         Print help
```

Refs. swh/infra/sysadm-environment#5608
---
 apps/swh-provenance/Dockerfile | 29 +++++++++++++++++++++++++++--
 1 file changed, 27 insertions(+), 2 deletions(-)

diff --git a/apps/swh-provenance/Dockerfile b/apps/swh-provenance/Dockerfile
index 53685d6be..af27a6997 100644
--- a/apps/swh-provenance/Dockerfile
+++ b/apps/swh-provenance/Dockerfile
@@ -2,17 +2,42 @@ ARG REGISTRY=container-registry.softwareheritage.org/swh/infra/swh-apps/
 ARG base_image=${REGISTRY}base
 ARG base_image_version=latest
 
-FROM ${base_image}:${base_image_version}
-
+ARG userid=1000
+ARG groupid=1000
 ARG user=swh
 ARG workdir=/opt/${user}
 ARG configdir=/etc/${user}
 
+FROM rust:1.85-bookworm AS rust_build
+
+# ... build swh-graph rust deps
+RUN apt-get update && \
+    apt-get upgrade -y && \
+    apt-get install -y build-essential libclang-dev \
+      zstd protobuf-compiler default-jre && \
+    apt-get clean
+
+# Install the swh-provenance crate binaries (swh-provenance-*)
+RUN --mount=type=cache,target=.cache,uid=1000,gid=1000 \
+  RUSTFLAGS="-C target-cpu=native" \
+  cargo install swh-provenance --locked
+
+FROM ${base_image}:${base_image_version}
+# Pre-FROM ARGs are only visible to FROM lines: re-declare those used below.
+# root is only needed for apt; assumes the base image runs as ${user} - confirm.
+ARG user
+ARG workdir
+ARG configdir
+USER root
+RUN apt-get update && apt-get install -y zstd && apt-get clean
+USER ${user}
+
 COPY --chmod=0644 requirements-frozen.txt ${workdir}
 RUN --mount=type=cache,target=.cache,uid=1000,gid=1000 \
     uv pip sync requirements-frozen.txt
 
 COPY --chmod=0755 entrypoint.sh ${workdir}
+COPY --from=rust_build /usr/local/cargo/bin/swh-provenance* /usr/local/bin/
 
 USER ${user}
 ENV SWH_CONFIG_FILENAME=${configdir}/config.yml
-- 
GitLab


From 96720fd3ff6a14abccbcdaf429456db26d642587 Mon Sep 17 00:00:00 2001
From: "Antoine R. Dumont (@ardumont)" <ardumont@softwareheritage.org>
Date: Wed, 19 Mar 2025 10:28:01 +0100
Subject: [PATCH 3/4] utils: Add awscli tool

This image will be used to aws cp the necessary provenance dataset files.

Refs. swh/infra/sysadm-environment#5608
---
 apps/swh-utils/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/apps/swh-utils/Dockerfile b/apps/swh-utils/Dockerfile
index 5f69b37d0..e1f4dca28 100644
--- a/apps/swh-utils/Dockerfile
+++ b/apps/swh-utils/Dockerfile
@@ -2,5 +2,5 @@ FROM debian:bookworm-slim
 
 RUN apt-get update && \
     apt-get -y upgrade && \
-    apt-get install -y gettext-base curl && \
+    apt-get install -y gettext-base curl awscli && \
     apt-get clean
-- 
GitLab


From 9d71df1e3b7a8e25f242fc68f6feda68039b89f7 Mon Sep 17 00:00:00 2001
From: "Antoine R. Dumont (@ardumont)" <ardumont@softwareheritage.org>
Date: Wed, 19 Mar 2025 12:41:43 +0100
Subject: [PATCH 4/4] provenance/entrypoint.sh: Adapt to run either a grpc or
 rpc

Refs. swh/infra/sysadm-environment#5608
---
 apps/swh-provenance/Dockerfile    |  6 ++++
 apps/swh-provenance/entrypoint.sh | 49 ++++++++++++++++++++-----------
 2 files changed, 38 insertions(+), 17 deletions(-)

diff --git a/apps/swh-provenance/Dockerfile b/apps/swh-provenance/Dockerfile
index af27a6997..411d2c997 100644
--- a/apps/swh-provenance/Dockerfile
+++ b/apps/swh-provenance/Dockerfile
@@ -41,8 +41,14 @@ COPY --chmod=0755 entrypoint.sh ${workdir}
 
 USER ${user}
 ENV SWH_CONFIG_FILENAME=${configdir}/config.yml
+# Default to running the rpc service (PROVENANCE_TYPE is read by entrypoint.sh)
 ENV PORT 5014
+ENV PROVENANCE_TYPE="rpc"
+# To run the grpc service instead, override both values, e.g.:
+# ENV PORT=50141
+# ENV PROVENANCE_TYPE="grpc"
 EXPOSE $PORT
+# For the rpc type only: entrypoint.sh passes these to gunicorn (--workers, --threads, --timeout)
 ENV WORKERS 8
 ENV THREADS 2
 ENV TIMEOUT 3600
diff --git a/apps/swh-provenance/entrypoint.sh b/apps/swh-provenance/entrypoint.sh
index 657cafa04..e5a5fcd13 100755
--- a/apps/swh-provenance/entrypoint.sh
+++ b/apps/swh-provenance/entrypoint.sh
@@ -18,25 +18,40 @@ case "$1" in
         ;;
     *)
         EXTRA_CLI_FLAGS=()
-        if [ -n "${SWH_LOG_CONFIG_JSON}" ]; then
-            EXTRA_CLI_FLAGS+=('--log-config-json' "${SWH_LOG_CONFIG_JSON}")
-        fi
         if [ -n "${STATSD_HOST}" -a -n "${STATSD_PORT}" ]; then
             EXTRA_CLI_FLAGS+=('--statsd-host' "${STATSD_HOST}:${STATSD_PORT}")
         fi
-        if [ -n "${STATSD_SERVICE_TYPE}" ]; then
-            EXTRA_CLI_FLAGS+=('--statsd-prefix' "${STATSD_SERVICE_TYPE}")
-        fi
+        case "${PROVENANCE_TYPE}" in
+            "rpc")
+                if [ -n "${SWH_LOG_CONFIG_JSON}" ]; then
+                    EXTRA_CLI_FLAGS+=('--log-config-json' "${SWH_LOG_CONFIG_JSON}")
+                fi
+                if [ -n "${STATSD_SERVICE_TYPE}" ]; then
+                    EXTRA_CLI_FLAGS+=('--statsd-prefix' "${STATSD_SERVICE_TYPE}")
+                fi
+
+                echo 'Starting the swh-provenance RPC server'
+                exec gunicorn --bind "0.0.0.0:${PORT}" \
+                     --log-level "${SWH_LOG_LEVEL:-INFO}" \
+                     "${EXTRA_CLI_FLAGS[@]}" \
+                     --threads "${THREADS}" \
+                     --workers "${WORKERS}" \
+                     --timeout "${TIMEOUT}" \
+                     --reload \
+                     --config 'python:swh.core.api.gunicorn_config' \
+                     'swh.provenance.api.server:make_app_from_configfile()'
+              ;;
+            "grpc")
+                echo 'Starting the swh-provenance GRPC server'  # EXTRA_CLI_FLAGS holds at most --statsd-host here
+                exec swh-provenance-grpc-serve \
+                    --graph "${GRAPH_PATH}" \
+                    --database "${PROVENANCE_PATH}" \
+                    --bind ":${PORT}" \
+                    "${EXTRA_CLI_FLAGS[@]}"
+                ;;
 
-        echo 'Starting the swh-provenance RPC server'
-        exec gunicorn --bind "0.0.0.0:${PORT}" \
-             --log-level "${SWH_LOG_LEVEL:-INFO}" \
-             "${EXTRA_CLI_FLAGS[@]}" \
-             --threads "${THREADS}" \
-             --workers "${WORKERS}" \
-             --timeout "${TIMEOUT}" \
-             --reload \
-             --config 'python:swh.core.api.gunicorn_config' \
-             'swh.provenance.api.server:make_app_from_configfile()'
-      ;;
+            *)  # PROVENANCE_TYPE is neither "rpc" nor "grpc": refuse to start
+                echo "Unknown provenance type <${PROVENANCE_TYPE}> (either rpc or grpc)"
+                exit 1
+        esac
 esac
-- 
GitLab