Skip to content
Snippets Groups Projects
Unverified Commit db356083 authored by Antoine R. Dumont's avatar Antoine R. Dumont
Browse files

wip: provenance/deployment: Fetch and prepare volumes with dataset

Refs. swh/infra/sysadm-environment#5608
parent 29307d0d
No related branches found
No related tags found
No related merge requests found
......@@ -88,7 +88,7 @@
image: {{ .Values.swh_utils_image }}:{{ .Values.swh_utils_image_version }}
imagePullPolicy: IfNotPresent
command:
- /entrypoints/graph-wait-for-dataset.sh
- /entrypoints/wait-for-dataset.sh
env:
{{- include "swh.graph.volume.witnessfile" (dict "graphPath" .graphPath) | nindent 4 }}
- name: DATASET_LOCATION
......
{{/*
Render the env entry exposing the path of a provenance witness file.
Expects a dict with:
- provenancePath: directory prefix of the witness file
- witness_file_env_variable_name: env variable name (defaults to WITNESS_FILE)
- filename: witness file name (defaults to .provenance-is-initialized)
*/}}
{{- define "swh.provenance.volume.witnessfile" -}}
- name: {{ .witness_file_env_variable_name | default "WITNESS_FILE" }}
  value: "{{ .provenancePath }}/{{ .filename | default ".provenance-is-initialized" }}"
{{- end -}}
{{/*
init-container running provenance-fetch-datasets.sh.
Expects a dict with: Values, imagePrefixName, imageVersion (optional, falls
back to the "<imagePrefixName>_version" entry of Values), containerName
(optional), provenancePath, graphPath, datasetName and extraVolumes (optional).
*/}}
{{- define "swh.provenance.fetchDataset" -}}
{{- $image_version := get . "imageVersion" | default ( get .Values (print .imagePrefixName "_version") ) |
        required (print .imagePrefixName "_version is mandatory in values.yaml ") -}}
- name: {{ .containerName | default "fetch-provenance-dataset" }}
  image: {{ get .Values .imagePrefixName }}:{{ $image_version }}
  imagePullPolicy: IfNotPresent
  command:
    - /entrypoints/provenance-fetch-datasets.sh
  env:
    {{- /* The entrypoint reads WITNESS_FETCH_FILE, not the helper's default WITNESS_FILE */}}
    {{- include "swh.provenance.volume.witnessfile" (dict "witness_file_env_variable_name" "WITNESS_FETCH_FILE"
                                                          "provenancePath" .provenancePath) | nindent 4 }}
    - name: SWH_CONFIG_FILENAME
      value: /etc/swh/config.yml
    - name: PROVENANCE_PATH
      value: {{ .provenancePath }}
    - name: GRAPH_PATH
      value: {{ .graphPath }}
    - name: DATASET_VERSION
      value: {{ .datasetName | default "" | quote }}
  volumeMounts:
    - name: configuration
      mountPath: /etc/swh
    - name: backend-utils
      mountPath: /entrypoints
      readOnly: true
    {{- range $volumeName, $volumeConfig := .extraVolumes }}
    - name: {{ $volumeName }}
      mountPath: {{ $volumeConfig.mountPath }}
      readOnly: {{ $volumeConfig.readOnly | default "false" }}
    {{- end }}
{{- end -}}
{{/*
init-container running wait-for-dataset.sh against the provenance witness file.
Expects a dict with: Values, provenancePath, period (optional, defaults to "3")
and extraVolumes (optional).
*/}}
{{- define "swh.provenance.waitForDataset" -}}
{{- $witnessArgs := dict "provenancePath" .provenancePath -}}
- name: wait-for-dataset
  image: {{ .Values.swh_utils_image }}:{{ .Values.swh_utils_image_version }}
  imagePullPolicy: IfNotPresent
  command:
    - /entrypoints/wait-for-dataset.sh
  env:
    {{- include "swh.provenance.volume.witnessfile" $witnessArgs | nindent 4 }}
    - name: PERIOD
      value: {{ default "3" .period | quote }}
  volumeMounts:
    - name: backend-utils
      mountPath: /entrypoints
      readOnly: true
    {{- range $name, $mount := .extraVolumes }}
    - name: {{ $name }}
      mountPath: {{ $mount.mountPath }}
      readOnly: {{ default "false" $mount.readOnly }}
    {{ end }}
{{- end -}}
{{/*
init-container running provenance-index-dataset.sh.
Expects a dict with: Values, imagePrefixName, imageVersion (optional, falls
back to the "<imagePrefixName>_version" entry of Values), containerName
(optional), provenancePath, period (optional, defaults to "3") and
extraVolumes (optional).
*/}}
{{- define "swh.provenance.indexDataset" -}}
{{- $image_version := get . "imageVersion" | default ( get .Values (print .imagePrefixName "_version") ) |
        required (print .imagePrefixName "_version is mandatory in values.yaml ") -}}
- name: {{ .containerName | default "reindex-provenance-dataset" }}
  image: {{ get .Values .imagePrefixName }}:{{ $image_version }}
  imagePullPolicy: IfNotPresent
  command:
    - /entrypoints/provenance-index-dataset.sh
  env:
    {{- /* The entrypoint exits unless WITNESS_SOURCE_FILE is set; it waits for that file before indexing */}}
    {{- include "swh.provenance.volume.witnessfile" (dict "witness_file_env_variable_name" "WITNESS_SOURCE_FILE"
                                                          "provenancePath" .provenancePath) | nindent 4 }}
    {{- include "swh.provenance.volume.witnessfile" (dict "witness_file_env_variable_name" "WITNESS_INDEX_FILE"
                                                          "provenancePath" .provenancePath
                                                          "filename" ".provenance-is-reindexed") | nindent 4 }}
    - name: PROVENANCE_PATH
      value: {{ .provenancePath }}
    - name: PERIOD
      value: {{ .period | default "3" | quote }}
  volumeMounts:
    - name: backend-utils
      mountPath: /entrypoints
      readOnly: true
    {{- range $volumeName, $volumeConfig := .extraVolumes }}
    - name: {{ $volumeName }}
      mountPath: {{ $volumeConfig.mountPath }}
      readOnly: {{ $volumeConfig.readOnly | default "false" }}
    {{- end }}
{{- end -}}
......@@ -20,6 +20,8 @@
{{- $graphPath := $provenanceConfig.graphPath | default "" -}}
{{- $provenancePath := $provenanceConfig.provenancePath | default "" -}}
{{- $gunicornConfig := $provenanceConfig.gunicornConfig | default dict -}}
{{- /* Toggles for the optional fetch and index init-containers */ -}}
{{- $provenanceFetchDataset := $provenanceConfig.provenanceFetchDataset -}}
{{- $provenanceIndexDataset := $provenanceConfig.provenanceIndexDataset -}}
---
apiVersion: apps/v1
kind: Deployment
......@@ -83,6 +85,25 @@ spec:
- name: config-utils
mountPath: /entrypoints
readOnly: true
{{- if $provenanceFetchDataset }}
{{- /* NOTE(review): the helper needs imagePrefixName; the provenance image is assumed here, confirm it ships the aws and unzstd tools */}}
{{- include "swh.provenance.fetchDataset" (dict "Values" $.Values
                                                "imagePrefixName" "swh_provenance_image"
                                                "imageVersion" $provenanceImageVersion
                                                "datasetName" $datasetName
                                                "graphPath" $graphPath
                                                "provenancePath" $provenancePath
                                                "extraVolumes" $provenanceConfig.extraVolumes) | nindent 8 }}
{{- end }}
{{- if $provenanceIndexDataset }}
{{- include "swh.provenance.indexDataset" (dict "Values" $.Values
                                                "imagePrefixName" "swh_provenance_image"
                                                "imageVersion" $provenanceImageVersion
                                                "provenancePath" $provenancePath
                                                "extraVolumes" $provenanceConfig.extraVolumes) | nindent 8 }}
{{- end }}
{{- /* NOTE(review): $fetchGraphDataset is not declared in the visible part of this template, confirm it exists or switch to a provenance toggle */}}
{{- if $fetchGraphDataset }}
{{- include "swh.provenance.waitForDataset" (dict "Values" $.Values
                                                  "provenancePath" $provenancePath
                                                  "extraVolumes" $provenanceConfig.extraVolumes) | nindent 8 }}
{{- end }}
containers:
- name: {{ $serviceType }}
resources:
......
......@@ -62,7 +62,7 @@ data:
# Finally, we make explicit the graph is ready
touch ${WITNESS_FILE}
graph-wait-for-dataset.sh: |
wait-for-dataset.sh: |
#!/usr/bin/env bash
# Uses env variables WITNESS_FILE
[ -z "${WITNESS_FILE}" ] && \
......@@ -158,6 +158,77 @@ data:
swh graph reindex --ef ${DATASET_LOCATION}/${GRAPH_NAME} && \
touch $WITNESS_REINDEX_FILE
provenance-fetch-datasets.sh: |
  #!/usr/bin/env bash
  # Download the provenance dataset and the graph files listed below from S3.
  # Uses env variables WITNESS_FETCH_FILE, DATASET_VERSION, PROVENANCE_PATH
  # and GRAPH_PATH.

  [ -z "${WITNESS_FETCH_FILE}" ] && \
    echo "<WITNESS_FETCH_FILE> env variable must be set" && exit 1
  [ -z "${DATASET_VERSION}" ] && \
    echo "<DATASET_VERSION> env variable must be set" && exit 1
  [ -z "${PROVENANCE_PATH}" ] && \
    echo "<PROVENANCE_PATH> env variable must be set" && exit 1
  [ -z "${GRAPH_PATH}" ] && \
    echo "<GRAPH_PATH> env variable must be set" && exit 1

  [ -f "${WITNESS_FETCH_FILE}" ] && \
    echo "Datasets graph & provenance <${DATASET_VERSION}> already present. Skip." && \
    exit 0

  # Stop at the first failure so the witness file is only created once every
  # download and decompression step succeeded.
  set -eux

  URL_PROVENANCE="s3://softwareheritage/derived_datasets/${DATASET_VERSION}/provenance/all/"
  # Left unquoted on use so the command and its options are word-split
  CMD_GET="aws s3 cp --no-sign-request"

  # Retrieve the provenance dataset parquet files
  $CMD_GET --recursive "${URL_PROVENANCE}" "${PROVENANCE_PATH}"

  # Retrieve the required graph files
  URL_GRAPH="s3://softwareheritage/graph/${DATASET_VERSION}/compressed"
  for filename in graph.pthash graph.pthash.order graph.node2swhid.bin.zst graph.node2type.bin.zst; do
    $CMD_GET "${URL_GRAPH}/${filename}" "${GRAPH_PATH}"
  done

  # Uncompress the compressed graph files where they were downloaded
  pushd "${GRAPH_PATH}"
  for filename in graph.node2type.bin.zst graph.node2swhid.bin.zst; do
    # Uncompress and delete the .zst file
    if [ -f "${filename}" ]; then
      unzstd --rm "${filename}"
    fi
  done
  popd

  # Make explicit the provenance datasets are fetched
  touch "${WITNESS_FETCH_FILE}"
provenance-index-dataset.sh: |
  #!/usr/bin/env bash
  # Build the provenance indexes once the dataset witness file is present.
  # Uses env variables WITNESS_SOURCE_FILE, WITNESS_INDEX_FILE, PERIOD and
  # PROVENANCE_PATH.

  [ -z "${WITNESS_SOURCE_FILE}" ] && \
    echo "<WITNESS_SOURCE_FILE> env variable must be set" && exit 1
  [ -z "${WITNESS_INDEX_FILE}" ] && \
    echo "<WITNESS_INDEX_FILE> env variable must be set" && exit 1
  [ -z "${PERIOD}" ] && \
    echo "<PERIOD> env variable must be set" && exit 1
  [ -z "${PROVENANCE_PATH}" ] && \
    echo "<PROVENANCE_PATH> env variable must be set" && exit 1

  [ -f "${WITNESS_INDEX_FILE}" ] && echo "Provenance already indexed, do nothing." && \
    exit 0

  set -eux

  # Let's wait for the dataset installation
  while [ ! -f "${WITNESS_SOURCE_FILE}" ]; do
    echo "${WITNESS_SOURCE_FILE} missing, waiting provenance dataset installation..."
    sleep "${PERIOD}"
  done

  # To make the query faster, the provenance needs to build index out of the
  # current dataset files. We store the output indexes in the same path as
  # the dataset.
  swh-provenance-index \
    --database "${PROVENANCE_PATH}" \
    --indexes "${PROVENANCE_PATH}" && \
    touch "${WITNESS_INDEX_FILE}"
initialize-search-backend.sh: |
#!/usr/bin/env bash
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment