Skip to content
Snippets Groups Projects
Verified commit cc92b697, authored by Vincent Sellier
Browse files

Add an image for swh-scrubber service

Related to sysadm-environment#4707
parent 61e2e7bc
Branches master
No related tags found
No related merge requests found
# Image for the swh-scrubber service.
# Deeply inspired from the Dockerfile of the swh-graph project
FROM python:3.10-bullseye

# Install the native development libraries and create the unprivileged `swh`
# user (uid/gid 1000, home /opt/swh) plus the /etc/swh configuration directory.
# NOTE(review): libcmph-dev / librdkafka-dev are presumably needed to build
# swh.perfecthash and confluent-kafka from the frozen requirements -- confirm.
# The apt caches and package lists are removed in the same layer so they do not
# end up in the image.
RUN apt-get -y update && \
    apt-get -y upgrade && \
    apt-get install -y libcmph-dev librdkafka-dev && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/* && \
    addgroup --gid 1000 swh && \
    useradd --gid 1000 --uid 1000 -m -d /opt/swh swh && \
    mkdir /etc/swh

# Everything below runs as the unprivileged user, from its home directory.
USER swh
WORKDIR /opt/swh

# Copy only the dependency manifest first so the (slow) pip layer stays cached
# until the pinned requirements change.
COPY --chown=swh:swh requirements-frozen.txt /opt/swh/

ENV PYTHONPATH=/opt/swh
# pip runs as `swh`, so console scripts land in the user's ~/.local/bin.
ENV PATH=/opt/swh/.local/bin:$PATH

RUN /usr/local/bin/python -m pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r requirements-frozen.txt

COPY --chown=swh:swh entrypoint.sh /opt/swh/
RUN chmod u+x /opt/swh/entrypoint.sh

# Runtime configuration read by the entrypoint script.
ENV SWH_CONFIG_FILENAME=/etc/swh/config.yml
ENV LOGLEVEL=INFO
ENV STATSD_PORT=9125
ENV STATSD_HOST=prometheus-statsd-exporter
# The variables below are intentionally left empty: they are expected to be
# provided when the container is started.
# STATSD_TAGS: scrubber_instance:<database>-<objecttype>-<id>
ENV STATSD_TAGS=
# OBJECT_TYPE: The type of object to run on (origin/origin-visit/...)
ENV OBJECT_TYPE=
# PARTITION_COUNT: number of ranges to split the objects into
# (the original note says "^2", presumably a power of two -- confirm)
ENV PARTITION_COUNT=
# FIRST_PARTITION: The first partition id to check (inclusive)
ENV FIRST_PARTITION=
# LAST_PARTITION: The last partition id to check (exclusive)
ENV LAST_PARTITION=

# Exec (JSON) form: the script is started directly instead of through
# `/bin/sh -c`, so it is PID 1 and receives the container stop signal.
ENTRYPOINT ["/opt/swh/entrypoint.sh"]
#!/bin/bash
# Entrypoint of the swh-scrubber container.
#
# Checks that the configuration file and the required environment variables
# are present, then replaces this shell with `swh scrubber check storage`
# for the requested object type and partition range.
#
# Environment:
#   SWH_CONFIG_FILENAME  path of the configuration file; it must exist
#   LOGLEVEL             value passed to `swh --log-level`
#   STATSD_TAGS          must be non-empty (not used directly by this script)
#   OBJECT_TYPE          value passed to --object-type
#   PARTITION_COUNT      value passed to --nb-partitions
#   FIRST_PARTITION      value passed to --start-partition-id
#   LAST_PARTITION       value passed to --end-partition-id
#
# Exits with status 1 when the configuration file is missing or when any of
# the required variables is empty.

# Abort when the configuration file is missing (the test must be negated:
# the error is about the file NOT existing).
if [ ! -e "${SWH_CONFIG_FILENAME}" ]; then
    echo "The config file ${SWH_CONFIG_FILENAME} does not exist."
    exit 1
fi

ENV_VARS="LOGLEVEL STATSD_TAGS OBJECT_TYPE PARTITION_COUNT FIRST_PARTITION LAST_PARTITION"
ERROR=0

# Report every missing variable before exiting, not only the first one.
for VAR in ${ENV_VARS}; do
    # ${!VAR} is bash indirect expansion: the value of the variable named by VAR.
    if [ -z "${!VAR}" ]; then
        echo "The ${VAR} environment variable must be set"
        ERROR=1
    fi
done

if [ $ERROR -ne 0 ]; then
    exit 1
fi

echo "Starting scrubber for OBJECT_TYPE=${OBJECT_TYPE} from FIRST_PARTITION=${FIRST_PARTITION} to LAST_PARTITION=${LAST_PARTITION}"

# `exec` replaces the shell so the scrubber receives signals directly.
# The options use the same variables that were validated above.
exec swh \
    --log-level "${LOGLEVEL}" \
    scrubber check storage \
    --object-type "${OBJECT_TYPE}" \
    --nb-partitions "${PARTITION_COUNT}" \
    --start-partition-id "${FIRST_PARTITION}" \
    --end-partition-id "${LAST_PARTITION}"
aiohttp==3.8.4
aiohttp-utils==3.2.1
aiosignal==1.3.1
amqp==5.1.1
async-timeout==4.0.2
attrs==22.2.0
attrs-strict==1.0.0
billiard==3.6.4.0
blinker==1.5
cassandra-driver==3.25.0
celery==5.2.7
certifi==2022.12.7
cffi==1.15.1
chardet==5.1.0
charset-normalizer==3.1.0
click==8.1.3
click-didyoumean==0.3.0
click-plugins==1.1.1
click-repl==0.2.0
confluent-kafka==2.0.2
Deprecated==1.2.13
dulwich==0.21.3
exceptiongroup==1.1.1
Flask==2.2.3
frozenlist==1.3.3
geomet==0.2.1.post1
gunicorn==20.1.0
humanize==4.6.0
hypothesis==6.70.0
idna==3.4
importlib-metadata==4.13.0
iniconfig==2.0.0
iso8601==1.1.0
itsdangerous==2.1.2
Jinja2==3.1.2
kombu==5.2.4
MarkupSafe==2.1.2
mirakuru==2.5.1
msgpack==1.0.5
multidict==6.0.4
mypy-extensions==1.0.0
packaging==23.0
pika==1.3.1
pkginfo==1.9.6
pluggy==1.0.0
port-for==0.6.3
prompt-toolkit==3.0.38
psutil==5.9.4
psycopg2==2.9.5
pycparser==2.21
pytest==7.2.2
pytest-postgresql==3.1.3
python-dateutil==2.8.2
python-debian==0.1.49
python-magic==0.4.27
python-mimeparse==1.6.0
pytz==2022.7.1
PyYAML==6.0
redis==4.5.3
requests==2.28.2
retrying==1.3.4
sentry-sdk==1.17.0
six==1.16.0
sortedcontainers==2.4.0
swh.core==2.21.2
swh.counters==0.9.2
swh.journal==1.3.1
swh.loader.core==5.2.0
swh.loader.git==2.2.0
swh.model==6.6.3
swh.objstorage==2.1.0
swh.perfecthash==0.1.2
swh.scheduler==1.7.0
swh.scrubber==1.0.1
swh.storage==1.11.0
tenacity==8.2.2
toml==0.10.2
tomli==2.0.1
typing_extensions==4.5.0
urllib3==1.26.15
vine==5.0.0
wcwidth==0.2.6
Werkzeug==2.2.3
wrapt==1.15.0
yarl==1.8.2
zipp==3.15.0
swh-scrubber
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment