From b8db76314aee375af353763e6466541d7d1627a3 Mon Sep 17 00:00:00 2001
From: "Antoine R. Dumont (@ardumont)" <ardumont@softwareheritage.org>
Date: Tue, 18 Mar 2025 17:36:48 +0100
Subject: [PATCH 1/4] provenance/requirements: Add awscli

Refs. swh/infra/sysadm-environment#5608
---
 apps/swh-provenance/requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/apps/swh-provenance/requirements.txt b/apps/swh-provenance/requirements.txt
index 1a035b4e8..8c6278249 100644
--- a/apps/swh-provenance/requirements.txt
+++ b/apps/swh-provenance/requirements.txt
@@ -1,3 +1,4 @@
 swh.provenance
 python-json-logger
 gunicorn
+awscli
-- 
GitLab


From c03f9c6db62b19493e8465db94bca25729a15f0f Mon Sep 17 00:00:00 2001
From: "Antoine R. Dumont (@ardumont)" <ardumont@softwareheritage.org>
Date: Tue, 18 Mar 2025 18:13:20 +0100
Subject: [PATCH 2/4] provenance/Dockerfile: Evolve to compile the rust provenance crate

```
root@d79ab1e18df7:/opt/swh# swh-provenance-
swh-provenance-gen-test-database  swh-provenance-grpc-serve  swh-provenance-index
root@d79ab1e18df7:/opt/swh# swh-provenance-index --help
Builds .ef indexes for extra quick querying of the Software Heritage Provenance Index

Usage: swh-provenance-index [OPTIONS] --database <DATABASE>

Options:
      --database <DATABASE>
          Path to the provenance database
      --indexes <INDEXES>
          Path to the directory where to write paths to. Defaults to `--database` (when it is a file:// URL)
      --statsd-host <STATSD_HOST>
          Defaults to `localhost:8125` (or whatever is configured by the `STATSD_HOST` and `STATSD_PORT` environment variables)
  -h, --help
          Print help
root@d79ab1e18df7:/opt/swh# swh-provenance-grpc-serve --help
gRPC server for the Software Heritage Provenance Index

Usage: swh-provenance-grpc-serve [OPTIONS] --graph <GRAPH> --database <DATABASE>

Options:
      --cache-parquet
          Keep Parquet metadata in RAM between queries, instead of re-parsing them every time
      --graph-format <GRAPH_FORMAT>
          [default: webgraph] [possible values: webgraph, json]
      --graph <GRAPH>
          Path to the graph prefix
      --database <DATABASE>
          Path to the provenance database
      --indexes <INDEXES>
          Path to Elias-Fano indexes, default to `--database` (when it is a file:// URL)
      --bind <BIND>
          [default: [::]:50141]
      --statsd-host <STATSD_HOST>
          Defaults to `localhost:8125` (or whatever is configured by the `STATSD_HOST` and `STATSD_PORT` environment variables)
  -h, --help
          Print help
```

Refs. swh/infra/sysadm-environment#5608
---
 apps/swh-provenance/Dockerfile | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/apps/swh-provenance/Dockerfile b/apps/swh-provenance/Dockerfile
index 53685d6be..af27a6997 100644
--- a/apps/swh-provenance/Dockerfile
+++ b/apps/swh-provenance/Dockerfile
@@ -2,17 +2,46 @@ ARG REGISTRY=container-registry.softwareheritage.org/swh/infra/swh-apps/
 ARG base_image=${REGISTRY}base
 ARG base_image_version=latest
 
+FROM rust:1.85-bookworm AS rust_build
+
+# Install the system dependencies needed to build the swh-provenance crate
+RUN apt-get update && \
+    apt-get upgrade -y && \
+    apt-get install -y build-essential libclang-dev \
+    zstd protobuf-compiler default-jre && \
+    apt-get clean
+
+# Build the swh-provenance binaries, installed in /usr/local/cargo/bin.
+# The cargo registry lives in a build cache so rebuilds do not download the
+# crates again.
+# Beware: target-cpu=native ties the binaries to the CPU features of the build
+# host, so the image must run on hosts providing the same features.
+RUN --mount=type=cache,target=/usr/local/cargo/registry \
+    RUSTFLAGS="-C target-cpu=native" \
+    cargo install swh-provenance --locked
+
 FROM ${base_image}:${base_image_version}
 
+ARG userid=1000
+ARG groupid=1000
 ARG user=swh
 ARG workdir=/opt/${user}
 ARG configdir=/etc/${user}
 
+# Install zstd as root, then switch back to the unprivileged user (assumed to
+# be the base image's default one) so the python environment is not synced as root
+USER root
+RUN apt-get update && \
+    apt-get install -y zstd && \
+    apt-get clean
+USER ${user}
+
 COPY --chmod=0644 requirements-frozen.txt ${workdir}
 RUN --mount=type=cache,target=.cache,uid=1000,gid=1000 \
     uv pip sync requirements-frozen.txt
 
 COPY --chmod=0755 entrypoint.sh ${workdir}
+COPY --from=rust_build /usr/local/cargo/bin/swh-provenance* /usr/local/bin/
 
 USER ${user}
 ENV SWH_CONFIG_FILENAME=${configdir}/config.yml
-- 
GitLab


From 96720fd3ff6a14abccbcdaf429456db26d642587 Mon Sep 17 00:00:00 2001
From: "Antoine R. Dumont (@ardumont)" <ardumont@softwareheritage.org>
Date: Wed, 19 Mar 2025 10:28:01 +0100
Subject: [PATCH 3/4] utils: Add awscli tool

This image will be used to aws cp the necessary provenance dataset files.

Refs. swh/infra/sysadm-environment#5608
---
 apps/swh-utils/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/apps/swh-utils/Dockerfile b/apps/swh-utils/Dockerfile
index 5f69b37d0..e1f4dca28 100644
--- a/apps/swh-utils/Dockerfile
+++ b/apps/swh-utils/Dockerfile
@@ -2,5 +2,5 @@ FROM debian:bookworm-slim
 
 RUN apt-get update && \
     apt-get -y upgrade && \
-    apt-get install -y gettext-base curl && \
+    apt-get install -y gettext-base curl awscli && \
     apt-get clean
-- 
GitLab


From 9d71df1e3b7a8e25f242fc68f6feda68039b89f7 Mon Sep 17 00:00:00 2001
From: "Antoine R. Dumont (@ardumont)" <ardumont@softwareheritage.org>
Date: Wed, 19 Mar 2025 12:41:43 +0100
Subject: [PATCH 4/4] provenance/entrypoint.sh: Adapt to run either a grpc or rpc

Refs. swh/infra/sysadm-environment#5608
---
 apps/swh-provenance/Dockerfile    |  6 ++++
 apps/swh-provenance/entrypoint.sh | 52 +++++++++++++++++++++----------
 2 files changed, 41 insertions(+), 17 deletions(-)

diff --git a/apps/swh-provenance/Dockerfile b/apps/swh-provenance/Dockerfile
index af27a6997..411d2c997 100644
--- a/apps/swh-provenance/Dockerfile
+++ b/apps/swh-provenance/Dockerfile
@@ -45,8 +45,14 @@ COPY --from=rust_build /usr/local/cargo/bin/swh-provenance* /usr/local/bin/
 
 USER ${user}
 ENV SWH_CONFIG_FILENAME=${configdir}/config.yml
+# Default to be a rpc service
 ENV PORT 5014
+ENV PROVENANCE_TYPE="rpc"
+# Possible other value for grpc kind
+# ENV PORT=50141
+# ENV PROVENANCE_TYPE="grpc"
 EXPOSE $PORT
+# For rpc type, this maps directly to gunicorn env variables
 ENV WORKERS 8
 ENV THREADS 2
 ENV TIMEOUT 3600
diff --git a/apps/swh-provenance/entrypoint.sh b/apps/swh-provenance/entrypoint.sh
index 657cafa04..e5a5fcd13 100755
--- a/apps/swh-provenance/entrypoint.sh
+++ b/apps/swh-provenance/entrypoint.sh
@@ -18,25 +18,43 @@ case "$1" in
         ;;
     *)
         EXTRA_CLI_FLAGS=()
-        if [ -n "${SWH_LOG_CONFIG_JSON}" ]; then
-            EXTRA_CLI_FLAGS+=('--log-config-json' "${SWH_LOG_CONFIG_JSON}")
-        fi
         if [ -n "${STATSD_HOST}" -a -n "${STATSD_PORT}" ]; then
             EXTRA_CLI_FLAGS+=('--statsd-host' "${STATSD_HOST}:${STATSD_PORT}")
         fi
-        if [ -n "${STATSD_SERVICE_TYPE}" ]; then
-            EXTRA_CLI_FLAGS+=('--statsd-prefix' "${STATSD_SERVICE_TYPE}")
-        fi
+        # PROVENANCE_TYPE selects which server this container runs
+        case "${PROVENANCE_TYPE}" in
+            "rpc")
+                # Flags only understood by gunicorn
+                if [ -n "${SWH_LOG_CONFIG_JSON}" ]; then
+                    EXTRA_CLI_FLAGS+=('--log-config-json' "${SWH_LOG_CONFIG_JSON}")
+                fi
+                if [ -n "${STATSD_SERVICE_TYPE}" ]; then
+                    EXTRA_CLI_FLAGS+=('--statsd-prefix' "${STATSD_SERVICE_TYPE}")
+                fi
+
+                echo 'Starting the swh-provenance RPC server'
+                exec gunicorn --bind "0.0.0.0:${PORT}" \
+                     --log-level "${SWH_LOG_LEVEL:-INFO}" \
+                     "${EXTRA_CLI_FLAGS[@]}" \
+                     --threads "${THREADS}" \
+                     --workers "${WORKERS}" \
+                     --timeout "${TIMEOUT}" \
+                     --reload \
+                     --config 'python:swh.core.api.gunicorn_config' \
+                     'swh.provenance.api.server:make_app_from_configfile()'
+                ;;
+            "grpc")
+                echo 'Starting the swh-provenance GRPC server'
+                # GRAPH_PATH and PROVENANCE_PATH are mandatory, fail early when unset
+                exec swh-provenance-grpc-serve \
+                     --graph "${GRAPH_PATH:?GRAPH_PATH must be set}" \
+                     --database "${PROVENANCE_PATH:?PROVENANCE_PATH must be set}" \
+                     --bind "0.0.0.0:${PORT}" \
+                     "${EXTRA_CLI_FLAGS[@]}"
+                ;;
 
-        echo 'Starting the swh-provenance RPC server'
-        exec gunicorn --bind "0.0.0.0:${PORT}" \
-             --log-level "${SWH_LOG_LEVEL:-INFO}" \
-             "${EXTRA_CLI_FLAGS[@]}" \
-             --threads "${THREADS}" \
-             --workers "${WORKERS}" \
-             --timeout "${TIMEOUT}" \
-             --reload \
-             --config 'python:swh.core.api.gunicorn_config' \
-             'swh.provenance.api.server:make_app_from_configfile()'
-        ;;
+            *)
+                echo "Unknown provenance type <${PROVENANCE_TYPE}> (either rpc or grpc)" >&2
+                exit 1
+        esac
 esac
-- 
GitLab