From 1831f873ea680f99bc2669f6f6927b9f1b892a9f Mon Sep 17 00:00:00 2001 From: Antoine Pietri <antoine.pietri1@gmail.com> Date: Mon, 27 Jan 2020 16:24:54 +0100 Subject: [PATCH] swh-graph: azure: first draft of azure docs --- docs/graph/azure_open_datasets/.gitignore | 1 + docs/graph/azure_open_datasets/Makefile | 31 +++++++++++++++++++ .../azure_open_datasets/config_template.json | 23 ++++++++++++++ .../graph/azure_open_datasets/dataset_stub.md | 4 +++ .../datasets/swh_graph/overview.md | 31 +++++++++++++++++++ docs/graph/schema.rst | 4 +-- 6 files changed, 92 insertions(+), 2 deletions(-) create mode 100644 docs/graph/azure_open_datasets/.gitignore create mode 100644 docs/graph/azure_open_datasets/Makefile create mode 100644 docs/graph/azure_open_datasets/config_template.json create mode 100644 docs/graph/azure_open_datasets/dataset_stub.md create mode 100644 docs/graph/azure_open_datasets/datasets/swh_graph/overview.md diff --git a/docs/graph/azure_open_datasets/.gitignore b/docs/graph/azure_open_datasets/.gitignore new file mode 100644 index 0000000..c38eeed --- /dev/null +++ b/docs/graph/azure_open_datasets/.gitignore @@ -0,0 +1 @@ +datasets/swh_graph_* diff --git a/docs/graph/azure_open_datasets/Makefile b/docs/graph/azure_open_datasets/Makefile new file mode 100644 index 0000000..8ff57cd --- /dev/null +++ b/docs/graph/azure_open_datasets/Makefile @@ -0,0 +1,31 @@ +TABLES := content skipped_content directory directory_entry_file \ + directory_entry_dir directory_entry_rev person revision revision_history \ + release snapshot snapshot_branches snapshot_branch origin origin_visit + +DATASETS := $(addprefix datasets/swh_graph_,$(TABLES)) +OVERVIEWS := $(addsuffix /overview.md,$(DATASETS)) +CONFIGS := $(addsuffix /config.json,$(DATASETS)) +TARGETS := $(OVERVIEWS) $(CONFIGS) datasets/swh_graph/config.json + +all: $(TARGETS) + +datasets/swh_graph_%/overview.md: + mkdir -p $$( dirname $@ ) + cat dataset_stub.md > $@ + sed -n '/^- \+\*\*$*\*\*/,/^-/p' ../schema.rst | 
head -n-1 >> $@ + +datasets/swh_graph/config.json: config_template.json + cat config_template.json |\ + jq '.Id = "software-heritage-graph-dataset"' |\ + jq '.Slug = "software-heritage-graph-dataset"' |\ + jq '.Name = "Software Heritage Graph Dataset"' |\ + jq '.DataAccess.AzureDatabricks.python."azureml-opendatasets" = "Notebooks/software-heritage-graph-dataset/swh-graph-example-notebook.ipynb"' \ + > $@ + +datasets/swh_graph_%/config.json: + cat config_template.json |\ + jq '.Id = "software-heritage-graph-dataset-$*"' |\ + jq '.Slug = "software-heritage-graph-dataset-$*"' |\ + jq '.Name = "Software Heritage Graph Dataset: $* table"' |\ + jq '.BlobLocation.Path = "swhgraph/2018-09-25/parquet/$*"' \ + > $@ diff --git a/docs/graph/azure_open_datasets/config_template.json b/docs/graph/azure_open_datasets/config_template.json new file mode 100644 index 0000000..0f65a4e --- /dev/null +++ b/docs/graph/azure_open_datasets/config_template.json @@ -0,0 +1,23 @@ +{ + "Version": 2, + "Id": "software-heritage-graph-dataset%%TABLE_SLUG%%", + "Slug": "software-heritage-graph-dataset%%TABLE_SLUG%%", + "Name": "Software Heritage Graph Dataset%%TABLE_TITLE%%", + "DataFormat": { + "Type": "Parquet" + }, + "IconUrl": "https://swhopendataset.blob.core.windows.net/swhgraph/swh-logo.svg", + "Tags": [ + "software heritage", + "graph dataset", + "development history", + "software repositories", + "source code", + "open source software", + "free software", + "development history graph" + ], + "ProfileIntervalInSeconds": "TODO", + "BootstrapTimeUtc": "TODO", + "Triaged": "TODO" +} diff --git a/docs/graph/azure_open_datasets/dataset_stub.md b/docs/graph/azure_open_datasets/dataset_stub.md new file mode 100644 index 0000000..b35c557 --- /dev/null +++ b/docs/graph/azure_open_datasets/dataset_stub.md @@ -0,0 +1,4 @@ +This dataset is part of the [Software Heritage Graph open +dataset](https://azure.microsoft.com/en-us/services/open-datasets/catalog/software-heritage-graph/). 
+Please refer to the main dataset page for documentation and examples. + diff --git a/docs/graph/azure_open_datasets/datasets/swh_graph/overview.md b/docs/graph/azure_open_datasets/datasets/swh_graph/overview.md new file mode 100644 index 0000000..eed2d7a --- /dev/null +++ b/docs/graph/azure_open_datasets/datasets/swh_graph/overview.md @@ -0,0 +1,31 @@ +This is the Software Heritage graph dataset: a fully-deduplicated Merkle DAG +representation of the Software Heritage archive. The dataset links together +file content identifiers, source code directories, Version Control System (VCS) +commits tracking evolution over time, up to the full states of VCS repositories +as observed by Software Heritage during periodic crawls. The dataset's contents +come from major development forges (including [GitHub](https://github.com/) and +[GitLab](https://gitlab.com)), FOSS distributions (e.g., [Debian](https://www.debian.org/)), +and language-specific package managers (e.g., [PyPI](https://pypi.org/)). +Crawling information is also included, providing timestamps about when and +where all archived source code artifacts have been observed in the wild. + +The Software Heritage graph dataset is also available for download in other +formats, including CSV dumps and Apache Parquet files for local use. + +By accessing the dataset, you agree with the Software Heritage [Ethical Charter +for using the archive +data](https://www.softwareheritage.org/legal/users-ethical-charter/), and the +[terms of use for bulk +access](https://www.softwareheritage.org/legal/bulk-access-terms-of-use/). + +If you use this dataset for research purposes, please cite the following paper: + +- Antoine Pietri, Diomidis Spinellis, Stefano Zacchiroli. + *The Software Heritage Graph Dataset: Public software development under one + roof.* + In proceedings of [MSR 2019](http://2019.msrconf.org/): The 16th + International Conference on Mining Software Repositories, May 2019, + Montreal, Canada. 
Co-located with [ICSE + 2019](https://2019.icse-conferences.org/). + [preprint](https://upsilon.cc/~zack/research/publications/msr-2019-swh.pdf), + [bibtex](https://upsilon.cc/~zack/research/publications/msr-2019-swh.bib) diff --git a/docs/graph/schema.rst b/docs/graph/schema.rst index e2518f6..66837cc 100644 --- a/docs/graph/schema.rst +++ b/docs/graph/schema.rst @@ -9,14 +9,14 @@ A simplified view of the corresponding database schema is shown here: This page documents the details of the schema. -- **content**: contains information on the contents stored in +- **content**: contains information on the contents stored in the archive. - ``sha1`` (bytes): the SHA-1 of the content - ``sha1_git`` (bytes): the Git SHA-1 of the content - ``length`` (integer): the length of the content -- **skipped_content**: contains information on the contents that were not archived for +- **skipped_content**: contains information on the contents that were not archived for various reasons. - ``sha1`` (bytes): the SHA-1 of the missing content -- GitLab