Skip to content
GitLab
Explore
Sign in
Register
Primary navigation
Search or go to…
Project
S
swh-scrubber
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Antoine Lambert
swh-scrubber
Commits
a408bb04
Commit
a408bb04
authored
2 years ago
by
Jenkins for Software Heritage
Browse files
Options
Downloads
Plain Diff
Merge tag 'debian/0.0.6-1_swh1' into debian/buster-swh
parents
02b865f9
f55afb4c
No related branches found
Branches containing commit
No related tags found
Tags containing commit
No related merge requests found
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
PKG-INFO
+1
-1
1 addition, 1 deletion
PKG-INFO
debian/changelog
+6
-3
6 additions, 3 deletions
debian/changelog
swh.scrubber.egg-info/PKG-INFO
+1
-1
1 addition, 1 deletion
swh.scrubber.egg-info/PKG-INFO
swh/scrubber/db.py
+96
-96
96 additions, 96 deletions
swh/scrubber/db.py
with
104 additions
and
101 deletions
PKG-INFO
+
1
−
1
View file @
a408bb04
Metadata-Version: 2.1
Name: swh.scrubber
Version: 0.0.
5
Version: 0.0.
6
Summary: Software Heritage Datastore Scrubber
Home-page: https://forge.softwareheritage.org/diffusion/swh-scrubber
Author: Software Heritage developers
...
...
This diff is collapsed.
Click to expand it.
debian/changelog
+
6
−
3
View file @
a408bb04
swh-scrubber (0.0.
5
-1~swh1
~bpo10+1) buster
-swh; urgency=medium
swh-scrubber (0.0.
6
-1~swh1
) unstable
-swh; urgency=medium
* Rebuild for buster-swh
* New upstream release 0.0.6 - (tagged by Antoine R. Dumont
(@ardumont) <antoine.romain.dumont@gmail.com> on 2022-05-31 11:15:58
+0200)
* Upstream changes: - v0.0.6 - Wrap queries in transaction
-- Software Heritage autobuilder (on jenkins-debian1) <jenkins@jenkins-debian1.internal.softwareheritage.org>
Mon
, 3
0
May 2022
16:05:09
+0000
-- Software Heritage autobuilder (on jenkins-debian1) <jenkins@jenkins-debian1.internal.softwareheritage.org>
Tue
, 3
1
May 2022
09:20:56
+0000
swh-scrubber (0.0.5-1~swh1) unstable-swh; urgency=medium
...
...
This diff is collapsed.
Click to expand it.
swh.scrubber.egg-info/PKG-INFO
+
1
−
1
View file @
a408bb04
Metadata-Version: 2.1
Name: swh.scrubber
Version: 0.0.
5
Version: 0.0.
6
Summary: Software Heritage Datastore Scrubber
Home-page: https://forge.softwareheritage.org/diffusion/swh-scrubber
Author: Software Heritage developers
...
...
This diff is collapsed.
Click to expand it.
swh/scrubber/db.py
+
96
−
96
View file @
a408bb04
...
...
@@ -50,32 +50,32 @@ class ScrubberDb(BaseDb):
@functools.lru_cache
(
1000
)
def
datastore_get_or_add
(
self
,
datastore
:
Datastore
)
->
int
:
"""
Creates a datastore if it does not exist, and returns its id.
"""
cur
=
self
.
cursor
()
cur
.
execute
(
"""
WITH inserted AS (
INSERT INTO datastore (package, class, instance)
VALUES (%(package)s, %(cls)s, %(instance)s)
ON CONFLICT DO NOTHING
RETURNING id
)
SELECT id
FROM inserted
UNION (
-- If the datastore already exists, we need to fetch its id
with
self
.
transaction
()
as
cur
:
cur
.
execute
(
"""
WITH inserted AS (
INSERT INTO datastore (package, class, instance)
VALUES (%(package)s, %(cls)s, %(instance)s)
ON CONFLICT DO NOTHING
RETURNING id
)
SELECT id
FROM datastore
WHERE
package=%(package)s
AND class=%(cls)s
AND instance=%(instance)s
FROM inserted
UNION (
-- If the datastore already exists, we need to fetch its id
SELECT id
FROM datastore
WHERE
package=%(package)s
AND class=%(cls)s
AND instance=%(instance)s
)
LIMIT 1
"""
,
(
dataclasses
.
asdict
(
datastore
)),
)
LIMIT 1
"""
,
(
dataclasses
.
asdict
(
datastore
)),
)
(
id_
,)
=
cur
.
fetchone
()
return
id_
(
id_
,)
=
cur
.
fetchone
()
return
id_
def
corrupt_object_add
(
self
,
...
...
@@ -84,40 +84,40 @@ class ScrubberDb(BaseDb):
serialized_object
:
bytes
,
)
->
None
:
datastore_id
=
self
.
datastore_get_or_add
(
datastore
)
cur
=
self
.
cursor
()
cur
.
execute
(
"""
INSERT INTO corrupt_object (id, datastore, object)
VALUES (%s, %s, %s)
ON CONFLICT DO NOTHING
"""
,
(
str
(
id
),
datastore_id
,
serialized_object
),
)
with
self
.
transaction
()
as
cur
:
cur
.
execute
(
"""
INSERT INTO corrupt_object (id, datastore, object)
VALUES (%s, %s, %s)
ON CONFLICT DO NOTHING
"""
,
(
str
(
id
),
datastore_id
,
serialized_object
),
)
def
corrupt_object_iter
(
self
)
->
Iterator
[
CorruptObject
]:
"""
Yields all records in the
'
corrupt_object
'
table.
"""
cur
=
self
.
cursor
()
cur
.
execute
(
"""
SELECT
co.id, co.first_occurrence, co.object,
ds.package, ds.class, ds.instance
FROM corrupt_object AS co
INNER JOIN datastore AS ds ON (ds.id=co.datastore)
"""
)
for
row
in
cur
:
(
id
,
first_occurrence
,
object_
,
ds_package
,
ds_class
,
ds_instance
)
=
row
yield
CorruptObject
(
id
=
CoreSWHID
.
from_string
(
id
),
first_occurrence
=
first_occurrence
,
object_
=
object_
,
datastore
=
Datastore
(
package
=
ds_package
,
cls
=
ds_class
,
instance
=
ds_instance
),
with
self
.
transaction
()
as
cur
:
cur
.
execute
(
"""
SELECT
co.id, co.first_occurrence, co.object,
ds.package, ds.class, ds.instance
FROM corrupt_object AS co
INNER JOIN datastore AS ds ON (ds.id=co.datastore)
"""
)
for
row
in
cur
:
(
id
,
first_occurrence
,
object_
,
ds_package
,
ds_class
,
ds_instance
)
=
row
yield
CorruptObject
(
id
=
CoreSWHID
.
from_string
(
id
),
first_occurrence
=
first_occurrence
,
object_
=
object_
,
datastore
=
Datastore
(
package
=
ds_package
,
cls
=
ds_class
,
instance
=
ds_instance
),
)
def
_corrupt_object_list_from_cursor
(
self
,
cur
:
psycopg2
.
extensions
.
cursor
)
->
List
[
CorruptObject
]:
...
...
@@ -151,23 +151,23 @@ class ScrubberDb(BaseDb):
in_origin: An origin URL. If provided, only returns objects that may be
found in the given origin
"""
cur
=
self
.
cursor
()
cur
.
execute
(
"""
SELECT
co.id, co.first_occurrence, co.object,
ds.package, ds.class, ds.instance
FROM corrupt_object AS co
INNER JOIN datastore AS ds ON (ds.id=co.datastore)
WHERE
co.id >= %s
AND co.id <= %s
ORDER BY co.id
LIMIT %s
"""
,
(
str
(
start_id
),
str
(
end_id
),
limit
),
)
return
self
.
_corrupt_object_list_from_cursor
(
cur
)
with
self
.
transaction
()
as
cur
:
cur
.
execute
(
"""
SELECT
co.id, co.first_occurrence, co.object,
ds.package, ds.class, ds.instance
FROM corrupt_object AS co
INNER JOIN datastore AS ds ON (ds.id=co.datastore)
WHERE
co.id >= %s
AND co.id <= %s
ORDER BY co.id
LIMIT %s
"""
,
(
str
(
start_id
),
str
(
end_id
),
limit
),
)
return
self
.
_corrupt_object_list_from_cursor
(
cur
)
def
corrupt_object_grab_by_id
(
self
,
...
...
@@ -273,24 +273,24 @@ class ScrubberDb(BaseDb):
Arguments:
after: if given, only returns origins with an URL after this value
"""
cur
=
self
.
cursor
()
cur
.
execute
(
"""
SELECT DISTINCT origin_url
FROM object_origin
WHERE
origin_url > %(after)s
AND object_id IN (
(SELECT id FROM corrupt_object)
EXCEPT (SELECT id FROM fixed_object)
)
ORDER BY origin_url
LIMIT %(limit)s
"""
,
dict
(
after
=
after
,
limit
=
limit
),
)
with
self
.
transaction
()
as
cur
:
cur
.
execute
(
"""
SELECT DISTINCT origin_url
FROM object_origin
WHERE
origin_url > %(after)s
AND object_id IN (
(SELECT id FROM corrupt_object)
EXCEPT (SELECT id FROM fixed_object)
)
ORDER BY origin_url
LIMIT %(limit)s
"""
,
dict
(
after
=
after
,
limit
=
limit
),
)
return
[
origin_url
for
(
origin_url
,)
in
cur
]
return
[
origin_url
for
(
origin_url
,)
in
cur
]
def
fixed_object_add
(
self
,
cur
:
psycopg2
.
extensions
.
cursor
,
fixed_objects
:
List
[
FixedObject
]
...
...
@@ -309,12 +309,12 @@ class ScrubberDb(BaseDb):
)
def
fixed_object_iter
(
self
)
->
Iterator
[
FixedObject
]:
cur
=
self
.
cursor
()
cur
.
execute
(
"
SELECT id, object, method, recovery_date FROM fixed_object
"
)
for
(
id
,
object_
,
method
,
recovery_date
)
in
cur
:
yield
FixedObject
(
id
=
CoreSWHID
.
from_string
(
id
),
object_
=
object_
,
method
=
method
,
recovery_date
=
recovery_date
,
)
with
self
.
transaction
()
as
cur
:
cur
.
execute
(
"
SELECT id, object, method, recovery_date FROM fixed_object
"
)
for
(
id
,
object_
,
method
,
recovery_date
)
in
cur
:
yield
FixedObject
(
id
=
CoreSWHID
.
from_string
(
id
),
object_
=
object_
,
method
=
method
,
recovery_date
=
recovery_date
,
)
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment