From 1831f873ea680f99bc2669f6f6927b9f1b892a9f Mon Sep 17 00:00:00 2001
From: Antoine Pietri <antoine.pietri1@gmail.com>
Date: Mon, 27 Jan 2020 16:24:54 +0100
Subject: [PATCH] swh-graph: azure: first draft of azure docs

---
 docs/graph/azure_open_datasets/.gitignore     |  1 +
 docs/graph/azure_open_datasets/Makefile       | 31 +++++++++++++++++++
 .../azure_open_datasets/config_template.json  | 23 ++++++++++++++
 .../graph/azure_open_datasets/dataset_stub.md |  4 +++
 .../datasets/swh_graph/overview.md            | 31 +++++++++++++++++++
 docs/graph/schema.rst                         |  4 +--
 6 files changed, 92 insertions(+), 2 deletions(-)
 create mode 100644 docs/graph/azure_open_datasets/.gitignore
 create mode 100644 docs/graph/azure_open_datasets/Makefile
 create mode 100644 docs/graph/azure_open_datasets/config_template.json
 create mode 100644 docs/graph/azure_open_datasets/dataset_stub.md
 create mode 100644 docs/graph/azure_open_datasets/datasets/swh_graph/overview.md

diff --git a/docs/graph/azure_open_datasets/.gitignore b/docs/graph/azure_open_datasets/.gitignore
new file mode 100644
index 0000000..c38eeed
--- /dev/null
+++ b/docs/graph/azure_open_datasets/.gitignore
@@ -0,0 +1 @@
+datasets/swh_graph_*
diff --git a/docs/graph/azure_open_datasets/Makefile b/docs/graph/azure_open_datasets/Makefile
new file mode 100644
index 0000000..8ff57cd
--- /dev/null
+++ b/docs/graph/azure_open_datasets/Makefile
@@ -0,0 +1,31 @@
+TABLES := content skipped_content directory directory_entry_file \
+	directory_entry_dir directory_entry_rev person revision revision_history \
+	release snapshot snapshot_branches snapshot_branch origin origin_visit
+# One output directory per table, each holding a generated overview and config.
+DATASETS := $(addprefix datasets/swh_graph_,$(TABLES))
+OVERVIEWS := $(addsuffix /overview.md,$(DATASETS))
+CONFIGS := $(addsuffix /config.json,$(DATASETS))
+TARGETS := $(OVERVIEWS) $(CONFIGS) datasets/swh_graph/config.json
+.PHONY: all  # not a file: keep a stray file named "all" from blocking the build
+all: $(TARGETS)
+
+# Per-table overview: the shared stub, then that table's entry cut out of the schema page.
+datasets/swh_graph_%/overview.md: dataset_stub.md ../schema.rst
+	mkdir -p $(@D)
+	{ cat dataset_stub.md && sed -n '/^- \+\*\*$*\*\*/,/^-/p' ../schema.rst | head -n-1; } > $@
+
+# Top-level dataset config: the template with the main Id, Slug, Name and notebook path set.
+datasets/swh_graph/config.json: config_template.json
+	jq '.Id = "software-heritage-graph-dataset"' $< |\
+		jq '.Slug = "software-heritage-graph-dataset"' |\
+		jq '.Name = "Software Heritage Graph Dataset"' |\
+		jq '.DataAccess.AzureDatabricks.python."azureml-opendatasets" = "Notebooks/software-heritage-graph-dataset/swh-graph-example-notebook.ipynb"' \
+		> $@
+
+# Per-table config: the template specialised with the table name (the stem) and blob path.
+datasets/swh_graph_%/config.json: config_template.json
+	mkdir -p $(@D)
+	jq '.Id = "software-heritage-graph-dataset-$*"' $< |\
+		jq '.Slug = "software-heritage-graph-dataset-$*"' |\
+		jq '.Name = "Software Heritage Graph Dataset: $* table"' |\
+		jq '.BlobLocation.Path = "swhgraph/2018-09-25/parquet/$*"' > $@
diff --git a/docs/graph/azure_open_datasets/config_template.json b/docs/graph/azure_open_datasets/config_template.json
new file mode 100644
index 0000000..0f65a4e
--- /dev/null
+++ b/docs/graph/azure_open_datasets/config_template.json
@@ -0,0 +1,23 @@
+{
+  "Version": 2,
+  "Id": "software-heritage-graph-dataset%%TABLE_SLUG%%",
+  "Slug": "software-heritage-graph-dataset%%TABLE_SLUG%%",
+  "Name": "Software Heritage Graph Dataset%%TABLE_TITLE%%",
+  "DataFormat": {
+    "Type": "Parquet"
+  },
+  "IconUrl": "https://swhopendataset.blob.core.windows.net/swhgraph/swh-logo.svg",
+  "Tags": [
+    "software heritage",
+    "graph dataset",
+    "development history",
+    "software repositories",
+    "source code",
+    "open source software",
+    "free software",
+    "development history graph"
+  ],
+  "ProfileIntervalInSeconds": "TODO",
+  "BootstrapTimeUtc": "TODO",
+  "Triaged": "TODO"
+}
diff --git a/docs/graph/azure_open_datasets/dataset_stub.md b/docs/graph/azure_open_datasets/dataset_stub.md
new file mode 100644
index 0000000..b35c557
--- /dev/null
+++ b/docs/graph/azure_open_datasets/dataset_stub.md
@@ -0,0 +1,4 @@
+This dataset is part of the [Software Heritage Graph open
+dataset](https://azure.microsoft.com/en-us/services/open-datasets/catalog/software-heritage-graph/).
+Please refer to the main dataset page for documentation and examples.
+
diff --git a/docs/graph/azure_open_datasets/datasets/swh_graph/overview.md b/docs/graph/azure_open_datasets/datasets/swh_graph/overview.md
new file mode 100644
index 0000000..eed2d7a
--- /dev/null
+++ b/docs/graph/azure_open_datasets/datasets/swh_graph/overview.md
@@ -0,0 +1,31 @@
+This is the Software Heritage graph dataset: a fully-deduplicated Merkle DAG
+representation of the Software Heritage archive. The dataset links together
+file content identifiers, source code directories, Version Control System (VCS)
+commits tracking evolution over time, up to the full states of VCS repositories
+as observed by Software Heritage during periodic crawls. The dataset's contents
+come from major development forges (including [GitHub](https://github.com/) and
+[GitLab](https://gitlab.com)), FOSS distributions (e.g., [Debian](https://www.debian.org/)),
+and language-specific package managers (e.g., [PyPI](https://pypi.org/)).
+Crawling information is also included, providing timestamps about when and
+where all archived source code artifacts have been observed in the wild.
+
+The Software Heritage graph dataset is also available for download in other
+formats, including CSV dumps and Apache Parquet files for local use.
+
+By accessing the dataset, you agree with the Software Heritage [Ethical Charter
+for using the archive
+data](https://www.softwareheritage.org/legal/users-ethical-charter/), and the
+[terms of use for bulk
+access](https://www.softwareheritage.org/legal/bulk-access-terms-of-use/).
+
+If you use this dataset for research purposes, please cite the following paper:
+
+-   Antoine Pietri, Diomidis Spinellis, Stefano Zacchiroli.  
+    *The Software Heritage Graph Dataset: Public software development under one
+    roof.*  
+    In proceedings of [MSR 2019](http://2019.msrconf.org/): The 16th
+    International Conference on Mining Software Repositories, May 2019,
+    Montreal, Canada.  Co-located with [ICSE
+    2019](https://2019.icse-conferences.org/).  
+    [preprint](https://upsilon.cc/~zack/research/publications/msr-2019-swh.pdf),
+    [bibtex](https://upsilon.cc/~zack/research/publications/msr-2019-swh.bib)  
diff --git a/docs/graph/schema.rst b/docs/graph/schema.rst
index e2518f6..66837cc 100644
--- a/docs/graph/schema.rst
+++ b/docs/graph/schema.rst
@@ -9,14 +9,14 @@ A simplified view of the corresponding database schema is shown here:
 
 This page documents the details of the schema.
 
--  **content**: contains information on the contents stored in
+- **content**: contains information on the contents stored in
-   the archive.
+  the archive.
 
   - ``sha1`` (bytes): the SHA-1 of the content
   - ``sha1_git`` (bytes): the Git SHA-1 of the content
   - ``length`` (integer): the length of the content
 
--  **skipped_content**: contains information on the contents that were not archived for
+- **skipped_content**: contains information on the contents that were not archived for
   various reasons.
 
   - ``sha1`` (bytes): the SHA-1 of the missing content
-- 
GitLab