Skip to content
Snippets Groups Projects
Commit 38f66160 authored by vlorentz's avatar vlorentz
Browse files

Prevent timestamps in node properties from being shifted according to the...

Prevent timestamps in node properties from being shifted according to the timezone WriteNodeProperties is being run in.

Because our ORC exports use the `timestamp` type instead of `timestamp with timezone`, reader and writer need to agree out of band on the timezone used in the files they exchange.

However, we don't do this:

* `swh-dataset` uses pyorc, which uses the C++ ORC library, which assumes users (us) always write in GMT
* `swh-graph` uses the Java ORC library, which assumes the system timezone (or `$TZ` if set)

So when reading with a non-UTC timezone, the Java ORC library interprets timestamps in the dataset as being in the local timezone, and converts them to UNIX timestamps (number of seconds since epoch); then we use these converted timestamps and write them to `.property.author_timestamp.bin` and `.property.committer_timestamp.bin`.

This commit regenerates the example graph to have the correct timestamps. It also applies the 39ed0d17 change that removes useless padding at the end of all property files.

Resolves #4788
parent d7c3f831
No related branches found
No related tags found
1 merge request: !334 "Prevent timestamps in node properties from being shifted"
Pipeline #4816 passed
Showing
with 32 additions and 1 deletion
......@@ -53,6 +53,10 @@ public class ORCGraphDataset implements GraphDataset {
public final AllowedNodes allowedNodeTypes;
protected ORCGraphDataset() {
if (!TimeZone.getDefault().getID().equals("UTC")) {
throw new RuntimeException(
"ORCGraphDataset cannot be used in non-UTC timezones (try setting the $TZ environment variable to 'UTC')");
}
this.allowedNodeTypes = new AllowedNodes("*");
}
......@@ -78,6 +82,10 @@ public class ORCGraphDataset implements GraphDataset {
if (!datasetDir.exists()) {
throw new IllegalArgumentException("Dataset " + datasetDir.getName() + " does not exist");
}
if (!TimeZone.getDefault().getID().equals("UTC")) {
throw new RuntimeException(
"ORCGraphDataset cannot be used in non-UTC timezones (try setting the $TZ environment variable to 'UTC')");
}
this.datasetDir = datasetDir;
this.allowedNodeTypes = allowedNodeTypes;
}
......
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
......@@ -10,7 +10,7 @@ from click.testing import CliRunner
import pytest
from swh.graph.cli import graph_cli_group
from swh.graph.example_dataset import DATASET_DIR
from swh.graph.example_dataset import DATASET_DIR, RELEASES, REVISIONS
from ..test_cli import read_properties
......@@ -63,6 +63,26 @@ def test_compressgraph(tmpdir, workers):
assert compression_meta[0]["object_types"] == "cnt,dir,rev,rel,snp,ori"
with open(
"swh/graph/example_dataset/compressed/example.property.author_timestamp.bin",
"rb",
) as f:
timestamps = [
int.from_bytes(f.read(8), byteorder="big")
for _ in range(int(properties["nodes"]))
]
# remove non revision/releases
timestamps = [timestamp for timestamp in timestamps if timestamp != 2**63]
timestamps.sort()
expected_timestamps = [
rel.date.timestamp.seconds for rel in RELEASES if rel.date is not None
] + [rev.date.timestamp.seconds for rev in REVISIONS if rev.date is not None]
expected_timestamps.sort()
assert timestamps == expected_timestamps
@pytest.mark.parametrize(
"workers,object_types",
......
......@@ -308,6 +308,7 @@ def do_step(step, conf):
cmd_env = os.environ.copy()
cmd_env["JAVA_TOOL_OPTIONS"] = conf["java_tool_options"]
cmd_env["CLASSPATH"] = conf["classpath"]
cmd_env["TZ"] = "UTC"
process = subprocess.Popen(
["/bin/bash", "-c", cmd],
env=cmd_env,
......
......@@ -21,6 +21,8 @@ commands =
[testenv:java]
skip_install = true
setenv =
TZ = UTC
allowlist_externals =
mvn
commands =
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment