We should upgrade to the latest version: a couple of UI improvements were implemented in 3.3.0 and 3.3.2.
Currently, old running repairs are not always displayed in the repair list because of a filter.
If 100 repairs run before the end of a long repair (mostly full repairs), the long repair disappears from the list and there is no way to tell whether it is still running or not.
The reaper user is missing some permissions needed to apply the migrations:
cassandra/reaper-78785bf47f-t56mn[reaper]: org.cognitor.cassandra.migration.MigrationException: Error during migration of script 032_add_2i_status.cql while executing 'CREATE INDEX IF NOT EXISTS state2i ON repair_run_by_cluster_v2 (repair_run_state);'
...
cassandra/reaper-78785bf47f-t56mn[reaper]: Caused by: com.datastax.driver.core.exceptions.UnauthorizedException: User reaper has no ALTER permission on <table reaper_db.repair_run_by_cluster_v2> or any of its parents
For production, adding the permission is not necessary, as security is not activated there yet.
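For staging, where authentication is enabled, a grant along these lines should unblock the migration (a sketch: the reaper role and reaper_db keyspace come from the error above; the host and credentials are placeholders):

# Grant the reaper role ALTER on its keyspace so the migration can create the index.
cqlsh -u cassandra -p "$PASS" cassandra1.internal.staging.swh.network \
  -e 'GRANT ALTER ON KEYSPACE reaper_db TO reaper;'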
Upgrade:
Snapshot of the reaper_db taken:
root@pergamon:~# clush -b -w @cassandra "/opt/cassandra/bin/nodetool -u cassandra --password $PASS snapshot reaper_db"
---------------
cassandra01
---------------
Requested creating snapshot(s) for [reaper_db] with snapshot name [1693296484854] and options {skipFlush=false}
Snapshot directory: 1693296484854
---------------
cassandra02
---------------
Requested creating snapshot(s) for [reaper_db] with snapshot name [1693296484948] and options {skipFlush=false}
Snapshot directory: 1693296484948
---------------
cassandra03
---------------
Requested creating snapshot(s) for [reaper_db] with snapshot name [1693296484809] and options {skipFlush=false}
Snapshot directory: 1693296484809
---------------
cassandra04
---------------
Requested creating snapshot(s) for [reaper_db] with snapshot name [1693296484951] and options {skipFlush=false}
Snapshot directory: 1693296484951
---------------
cassandra05
---------------
Requested creating snapshot(s) for [reaper_db] with snapshot name [1693296484852] and options {skipFlush=false}
Snapshot directory: 1693296484852
---------------
cassandra06
---------------
Requested creating snapshot(s) for [reaper_db] with snapshot name [1693296485256] and options {skipFlush=false}
Snapshot directory: 1693296485256
---------------
cassandra07
---------------
Requested creating snapshot(s) for [reaper_db] with snapshot name [1693296484959] and options {skipFlush=false}
Snapshot directory: 1693296484959
---------------
cassandra08
---------------
Requested creating snapshot(s) for [reaper_db] with snapshot name [1693296484975] and options {skipFlush=false}
Snapshot directory: 1693296484975
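Before going further, one way to confirm the snapshot landed on every node (same clush/nodetool flags as above):

# Each node should print one line per reaper_db table in the new snapshot.
clush -b -w @cassandra "/opt/cassandra/bin/nodetool -u cassandra --password $PASS listsnapshots | grep reaper_db"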
Reaper upgraded by argo
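(For the record, re-triggering the deployment by hand would look something like the following; assuming Argo CD and an application named reaper, both hypothetical:)

# Hypothetical Argo CD application name.
argocd app sync reaper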
Still OK.
The old (and stuck) repairs are now present in the UI:
Looks like the upgrade also unblocked the scheduling: no segments had been repaired on the repair from August 14th, and a segment is now being repaired. Cool.
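As a quick cross-check outside the UI, running repairs can also be listed through Reaper's REST API. A sketch, with assumptions flagged: the URL is a placeholder, and the /repair_run endpoint, its state parameter, and the segments_repaired/total_segments fields should be verified against the deployed version:

# Hypothetical URL; endpoint and field names to be checked against the deployed Reaper.
curl -s "https://reaper.example.org/repair_run?state=RUNNING" \
  | jq '.[] | {id, cluster_name, segments_repaired, total_segments}'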
Let's remove the snapshots as everything looks OK:
staging:
root@pergamon:~# clush -b -w @staging-cassandra "/opt/cassandra/bin/nodetool -u cassandra --password $PASS clearsnapshot --all"
---------------
cassandra[1-3].internal.staging.swh.network (3)
---------------
Requested clearing snapshot(s) for [all keyspaces] with [all snapshots]
root@pergamon:~# clush -b -w @staging-cassandra "/opt/cassandra/bin/nodetool -u cassandra --password $PASS listsnapshots"
---------------
cassandra[1-3].internal.staging.swh.network (3)
---------------
Snapshot Details:
There are no snapshots
production:
root@pergamon:~# clush -b -w @cassandra "/opt/cassandra/bin/nodetool -u cassandra --password $PASS clearsnapshot --all"
---------------
cassandra[01-08] (8)
---------------
Requested clearing snapshot(s) for [all keyspaces] with [all snapshots]
root@pergamon:~# clush -b -w @cassandra "/opt/cassandra/bin/nodetool -u cassandra --password $PASS listsnapshots"
---------------
cassandra[01-08] (8)
---------------
Snapshot Details:
There are no snapshots
(Snapshot removal is asynchronous; it can take some time before listsnapshots comes back empty.)
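A small loop can wait for that instead of re-running listsnapshots by hand (a sketch reusing the flags above; the 30s interval is arbitrary):

# Poll until no node lists a reaper_db snapshot anymore.
while clush -b -w @cassandra "/opt/cassandra/bin/nodetool -u cassandra --password $PASS listsnapshots" | grep -q reaper_db; do
    sleep 30
done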