Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • lunar/swh-fuse
  • anlambert/swh-fuse
  • swh/devel/swh-fuse
  • douardda/swh-fuse
  • martin/swh-fuse
5 results
Show changes
Commits on Source (44)
# Changes here will be overwritten by Copier
_commit: v0.3.3
_src_path: https://gitlab.softwareheritage.org/swh/devel/swh-py-template.git
description: Software Heritage virtual file system
distribution_name: swh-fuse
have_cli: true
have_workers: false
package_root: swh/fuse
project_name: swh.fuse
python_minimal_version: '3.7'
readme_format: rst
# python: Reformat code with black
82ad1d6e501637fce226bc2812aecdd12f20cbdc
02ae6b6dac247f9d4b7100422ffd9b11c13ebc2a
*.egg-info/
*.pyc
*.sw?
*~
.coverage
.eggs/
.hypothesis
.mypy_cache
.tox
__pycache__
build/
dist/
version.txt
.mypy_cache/
# these are symlinks created by a hook in swh-docs' main sphinx conf.py
docs/README.rst
docs/README.md
# this should be a symlink for people who want to build the sphinx doc
# without using tox, generally created by the swh-env/bin/update script
docs/Makefile.sphinx
.vscode/
\ No newline at end of file
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v2.4.0
hooks:
- id: trailing-whitespace
- id: check-json
- id: check-yaml
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v5.0.0
hooks:
- id: trailing-whitespace
- id: check-json
- id: check-yaml
- repo: https://gitlab.com/pycqa/flake8
rev: 3.8.3
hooks:
- id: flake8
- repo: https://github.com/python/black
rev: 25.1.0
hooks:
- id: black
- repo: https://github.com/codespell-project/codespell
rev: v1.16.0
hooks:
- id: codespell
exclude: tests/data/api_data.py
- repo: https://github.com/PyCQA/isort
rev: 6.0.0
hooks:
- id: isort
- repo: local
hooks:
- id: mypy
name: mypy
entry: mypy
args: [swh]
pass_filenames: false
language: system
types: [python]
- repo: https://github.com/pycqa/flake8
rev: 7.1.1
hooks:
- id: flake8
additional_dependencies: [flake8-bugbear==24.12.12, flake8-pyproject]
# unfortunately, we are far from being able to enable this...
# - repo: https://github.com/PyCQA/pydocstyle.git
# rev: 4.0.0
# hooks:
# - id: pydocstyle
# name: pydocstyle
# description: pydocstyle is a static analysis tool for checking compliance with Python docstring conventions.
# entry: pydocstyle --convention=google
# language: python
# types: [python]
- repo: https://github.com/codespell-project/codespell
rev: v2.4.1
hooks:
- id: codespell
name: Check source code spelling
stages: [pre-commit]
exclude:
swh/fuse/tests/data/api_data.py
- id: codespell
name: Check commit message spelling
stages: [commit-msg]
- repo: https://github.com/PyCQA/isort
rev: 5.5.2
hooks:
- id: isort
- repo: https://github.com/python/black
rev: 19.10b0
hooks:
- id: black
- repo: local
hooks:
- id: mypy
name: mypy
entry: mypy
args: [swh]
pass_filenames: false
language: system
types: [python]
- id: twine-check
name: twine check
description: call twine check when pushing an annotated release tag
entry: bash -c "ref=$(git describe) &&
[[ $ref =~ ^v[0-9]+\.[0-9]+\.[0-9]+$ ]] &&
(python3 -m build --sdist && twine check $(ls -t dist/* | head -1)) || true"
pass_filenames: false
stages: [pre-push]
language: python
additional_dependencies: [twine, build]
......@@ -6,7 +6,7 @@ In the interest of fostering an open and welcoming environment, we as Software
Heritage contributors and maintainers pledge to making participation in our
project and our community a harassment-free experience for everyone, regardless
of age, body size, disability, ethnicity, sex characteristics, gender identity
and expression, level of experience, education, socio-economic status,
and expression, level of experience, education, socioeconomic status,
nationality, personal appearance, race, religion, or sexual identity and
orientation.
......
include Makefile
include requirements*.txt
include version.txt
include README.md
recursive-include swh py.typed
docs/README.rst
\ No newline at end of file
Software Heritage Filesystem (SwhFS)
====================================
User-space POSIX filesystem to browse the `Software Heritage
<https://www.softwareheritage.org/>`_ `archive
<https://archive.softwareheritage.org/>`_, as if it were locally available.
Built using the `FUSE <https://github.com/libfuse/libfuse>`_ framework.
Demo
----
A live demo of SwhFS in action is available as a `screencast
<https://www.youtube.com/watch?v=2L4ANVlICaE>`_.
Bibliography
------------
In addition to accompanying technical documentation, SwhFS is also described in
the following scientific paper. If you use SwhFS for your scientific results,
please acknowledge it by citing the paper as follows:
.. note::
Thibault Allançon, Antoine Pietri, Stefano Zacchiroli. `The Software Heritage
Filesystem (SwhFS): Integrating Source Code Archival with Development
<https://arxiv.org/pdf/2102.06390.pdf>`_. In proceedings of `ICSE 2021
<https://conf.researchr.org/home/icse-2021>`_: The 43rd International
Conference on Software Engineering, May 2021, Madrid, Spain. IEEE 2021.
Links: `preprint <https://arxiv.org/pdf/2102.06390.pdf>`_, `bibtex
<https://upsilon.cc/~zack/research/publications/saner-2020-swh-graph.bib>`_.
include ../../swh-docs/Makefile.sphinx
-include Makefile.local
include Makefile.sphinx
include Makefile.local
Software Heritage Filesystem (SwhFS)
====================================
User-space POSIX filesystem to browse the `Software Heritage
<https://www.softwareheritage.org/>`_ `archive
<https://archive.softwareheritage.org/>`_, as if it were locally available.
Built using the `FUSE <https://github.com/libfuse/libfuse>`_ framework.
Demo
----
A live demo of SwhFS in action is available as a `screencast
<https://www.youtube.com/watch?v=2L4ANVlICaE>`_.
Bibliography
------------
In addition to accompanying technical documentation, SwhFS is also described in
the following scientific paper. If you use SwhFS for your scientific results,
please acknowledge it by citing the paper as follows:
.. note::
Thibault Allançon, Antoine Pietri, Stefano Zacchiroli. `The Software Heritage
Filesystem (SwhFS): Integrating Source Code Archival with Development
<https://arxiv.org/pdf/2102.06390.pdf>`_. In proceedings of `ICSE 2021
<https://conf.researchr.org/home/icse-2021>`_: The 43rd International
Conference on Software Engineering, May 2021, Madrid, Spain. IEEE 2021.
Links: `preprint <https://arxiv.org/pdf/2102.06390.pdf>`_, `bibtex
<https://upsilon.cc/~zack/research/publications/saner-2020-swh-graph.bib>`_.
../README.rst
\ No newline at end of file
......@@ -5,4 +5,4 @@ Command-line interface
.. click:: swh.fuse.cli:fuse
:prog: swh fs
:show-nested:
:nested: full
......@@ -14,8 +14,24 @@ The configuration file location is subject to the `XDG Base Directory
well as explicitly overridden on the :ref:`command line <swh-fuse-cli>` via the
``-C/--config-file`` flag.
The following sub-sections and fields can be used within the ``swh > fuse``
stanza:
You can choose how ``swh-fuse`` will fetch content from the archive.
The simplest (and default) way is to query the SWH public API.
This method can be configured with the following block:
- ``web-api``:
- ``url``: archive API URL
- ``auth-token``: (optional) authentication token used with the API URL
You can use a :ref:`compressed graph <swh-graph>` close to your server, via its gRPC API,
to traverse the folder hierarchy much faster.
This can be configured with the following block:
- ``graph``:
- ``grpc-url``: URL to the graph's :ref:`gRPC server <swh-graph-grpc-api>`.
``swh-fuse`` will also search for the following options:
- ``cache``:
......@@ -28,11 +44,6 @@ stanza:
specified using a ``maxram`` entry (either as a percentage of available RAM,
or with disk storage unit suffixes: ``B``, ``KB``, ``MB``, ``GB``).
- ``web-api``:
- ``url``: archive API URL
- ``auth-token``: authentication token used with the API URL
- ``json-indent``: number of spaces used to print JSON metadata files (setting
it to ``null`` disables indentation).
......@@ -41,17 +52,14 @@ If no configuration is given, default values are:
- ``cache``: all cache files are stored in ``$XDG_CACHE_HOME/swh/fuse/`` (or
``~/.cache/swh/fuse`` if ``XDG_CACHE_HOME`` is not set). The direntry cache
will use at most 10% of available RAM.
- ``web-api``: URL is https://archive.softwareheritage.org/api/1/, with no
authentication token
- ``web-api``: URL is :swh_web:`api/1/`, with no authentication token
- ``json-indent``: 2 spaces.
Example
-------
Examples
--------
Here is a full ``~/.config/swh/global.yml`` example, showcasing different cache
storage strategies (in-memory for metadata, on-disk for blob, 20% RAM for
direntry), using the default Web API service:
Here is a full ``~/.config/swh/global.yml`` equivalent to the default configuration:
.. code:: yaml
......@@ -59,15 +67,32 @@ direntry), using the default Web API service:
fuse:
cache:
metadata:
in-memory: true
path: "/path/to/cache/blob.sqlite"
blob:
path: "/path/to/cache/blob.sqlite"
direntry:
maxram: 20%
maxram: 10%
web-api:
url: "https://archive.softwareheritage.org/api/1/"
auth-token: eyJhbGciOiJIUzI1NiIsInR5cCIgOiAiSldUIiwia2lkIiA6ICJhMTMxYTQ1My1hM2IyLTQwMTUtO...
json-indent: 2
This example uses a local compressed graph, an in-memory cache for metadata,
and authenticates against the SWH public API to benefit from higher rate limits:
.. code:: yaml
swh:
fuse:
cache:
metadata:
in-memory: true
blob:
path: "/path/to/cache/blob.sqlite"
graph:
grpc-url: localhost:50091
web-api:
url: "https://archive.softwareheritage.org/api/1/"
auth-token: eyJhbGciOiJIUzI1NiIsInR5cCIgOiAiSldUIiwia2lkIiA6ICJhMTMxYTQ1My1hM2IyLTQwMTUtO...
Logging
-------
......
......@@ -3,7 +3,7 @@ PDFs = $(patsubst %.puml,%.pdf,$(PUMLs))
PNGs = $(patsubst %.puml,%.png,$(PUMLs))
SVGs = $(patsubst %.puml,%.svg,$(PUMLs))
all: $(PNGs) $(PDFs) $(SVGs)
all: $(PNGs) $(SVGs)
%.pdf: %.puml
plantuml -Tpdf $<
......@@ -14,7 +14,6 @@ all: $(PNGs) $(PDFs) $(SVGs)
%.svg: %.puml
plantuml -Tsvg $<
.PHONY: clean
clean:
......
......@@ -11,4 +11,12 @@
configuration
Design notes <design>
Tutorial <tutorial>
API reference </apidoc/swh.fuse>
.. only:: standalone_package_doc
Indices and tables
------------------
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`
# Software Heritage Filesystem (SwhFS) --- Tutorial
## Installation
The Software Heritage virtual filesystem (SwhFS) is available from PyPI
as [swh.fuse](https://pypi.org/project/swh.fuse/). It can be installed from
there using `pip`:
$ pip install swh.fuse
## Setup and teardown
SwhFS is controlled by the `swh fs` command-line interface (CLI).
Like all filesystems, SwhFS must be "mounted" before use and "unmounted"
afterwards. Users should first mount the archive as a whole and then browse
archived objects looking up their SWHIDs below the `archive/` entry-point. To
mount the Software Heritage archive, use the `swh fs mount` command:
$ mkdir swhfs
$ swh fs mount swhfs/ # mount the archive
$ ls -1F swhfs/ # list entry points
archive/ # <- start browsing from here
cache/
origin/
README
By default SwhFS daemonizes into background and logs to syslog; it can be kept
in foreground, logging to the console, by passing `-f/--foreground` to `mount`.
To unmount use `swh fs umount PATH`. Note that, since SwhFS is a *user-space*
filesystem, mounting and unmounting it are not privileged operations, any user
can do it.
The configuration file `~/.config/swh/global.yml` is read if present. Its main
use case is inserting a per-user authentication token for the SWH API, which
might be needed in case of heavy use to bypass the default API rate limit. See
the {ref}`configuration documentation <swh-fuse-config>` for details.
## Lazy loading
Once mounted, the archive can be navigated as if it were locally available
on-disk. Archived objects are referenced by
{ref}`Software Heritage identifiers <persistent-identifiers>` (SWHIDs).
They are loaded on-demand from the archive and populate lazily the `archive/`
directory below the SwhFS mount point.
SWHIDs for source code that is not locally available can be obtained in various
ways: searching on the [Software Heritage website][webui]; finding SWHID
references in [scientific papers][citeguide], [Wikidata][wikidataswhid], and
software bills of materials using the [SPDX standard][spdx]; deriving SWHIDs
from other version control system references (e.g., as SWHIDs version 1 are
compatible with Git, a Git commit identifier like
`9d76c0b163675505d1a901e5fe5249a2c55609bc` can be turned into a SWHID by simply
prefixing it with `swh:1:rev:` to obtain
`swh:1:rev:9d76c0b163675505d1a901e5fe5249a2c55609bc`).
[citeguide]: https://www.softwareheritage.org/save-and-reference-research-software
[spdx]: https://spdx.dev/
[swhid]: https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html
[webui]: https://archive.softwareheritage.org
[wikidataswhid]: https://www.wikidata.org/wiki/Property:P6138
## Source code files
Here is a SwhFS Hello World:
$ cd swhfs/
$ cat archive/swh:1:cnt:c839dea9e8e6f0528b468214348fee8669b305b2
#include <stdio.h>
int main(void) {
printf("Hello, World!\n");
}
Given the SWHID of a source code file, we can directly access it via the
filesystem.
Metadata about archived source code artifacts is also locally available. For
each entry `archive/<SWHID>` there is a matching JSON file
`archive/<SWHID>.json`, corresponding to what the [Software Heritage Web
API][webapi] will return. For example, here is what the Software Heritage
archive knows about the above Hello World implementation:
$ cat archive/swh:1:cnt:c839dea9e8e6f0528b468214348fee8669b305b2.json
{
"length": 67,
"status": "visible",
"checksums": {
"sha256": "06dfb5d936f50b3cb80152aa053724e4a18417c35f745b66ab9571c25afd0f79",
"sha1": "459ee8545e5ba6cb819ba41e6ea2f0011cedd728",
"blake2s256": "87e6ab9c92681e9a022a8f4679dcd9d9b841fe4146edcbc15329fc66d8c82b4f",
"sha1_git": "c839dea9e8e6f0528b468214348fee8669b305b2"
},
"data_url": "https://archive.softwareheritage.org/api/1/content/sha1_git:c839dea9e8e6f0528b468214348fee8669b305b2/raw/",
"filetype_url": "https://archive.softwareheritage.org/api/1/content/sha1_git:c839dea9e8e6f0528b468214348fee8669b305b2/filetype/",
"language_url": "https://archive.softwareheritage.org/api/1/content/sha1_git:c839dea9e8e6f0528b468214348fee8669b305b2/language/",
"license_url": "https://archive.softwareheritage.org/api/1/content/sha1_git:c839dea9e8e6f0528b468214348fee8669b305b2/license/"
}
Note: JSON metadata files are indented by default when read; this can be changed
in the configuration file (see {ref}`documentation <swh-fuse-config>`).
[webapi]: https://archive.softwareheritage.org/api/
## Source code trees
In addition to individual source code files, we can also browse entire source
code directories. Here is the historical Apollo 11 source code, where we can
find interesting comments about the antenna during landing:
$ cd archive/swh:1:dir:1fee702c7e6d14395bbf5ac3598e73bcbf97b030
$ ls | wc -l
127
$ grep -i antenna THE_LUNAR_LANDING.s | cut -f 5
# IS THE LR ANTENNA IN POSITION 1 YET
# BRANCH IF ANTENNA ALREADY IN POSITION 1
We can checkout the commit of a more modern code base, like jQuery, and count
its JavaScript lines of code (SLOC):
$ cd archive/swh:1:rev:9d76c0b163675505d1a901e5fe5249a2c55609bc
$ ls -1F
history/
meta.json@
parent@
parents/
root@
$ find root/src/ -type f -name '*.js' | xargs cat | wc -l
10136
## History browsing
`meta.json` files of revision objects contain complete commit metadata, e.g.:
$ jq '.author.name, .date, .message' meta.json
"Michal Golebiowski-Owczarek"
"2020-03-02T23:02:42+01:00"
"Data:Event:Manipulation: Prevent collisions with Object.prototype ..."
Commit history can be browsed commit-by-commit by digging into the
`parent(s)/` directories or, more efficiently, using the history summaries
located under `history/`:
$ ls -f history/by-page/000/ | wc -l
6469
$ ls -f history/by-page/000/ | head -n 5
swh:1:rev:358b769a00c3a09a8ec621b8dcb2d5e31b7da69a
swh:1:rev:4a7fc8544e2020c75047456d11979e4e3a517fdf
swh:1:rev:364476c3dc1231603ba61fc08068fa89fb095e1a
swh:1:rev:721744a9fab5b597febea64e466272eabfdb9463
swh:1:rev:4592595b478be979141ce35c693dbc6b65647173
The jQuery commit at hand is preceded by 6469 commits, which can be listed in
`git log` order via the `by-page` view. The `by-hash` and `by-date` views list
commits sharded by commit identifier and timestamp:
$ ls history/by-hash/00/ | head -n 5
swh:1:rev:00a9c2e5f4c855382435cec6b3908eb9bd5a53b7
swh:1:rev:005040379d8b64aacbe54941d878efa6e86df1cc
swh:1:rev:00cc67af23bf9cf2cdbaeaeee6ded76baf0292f0
swh:1:rev:00575d4d8c7421c5119f181009374ff2e7736127
swh:1:rev:0019a463bdcb81dc6ba3434505a45774ca27f363
$ ls -1F history/by-date/
2006/
2007/
2008/
...
2018/
2019/
2020/
$ ls -f history/by-date/2020/03/16/
swh:1:rev:90fed4b453a5becdb7f173d9e3c1492390a1441f
$ jq .date history/by-date/2020/03/16/*/meta.json
"2020-03-16T21:49:29+01:00"
Note that to populate the `by-date` view, metadata about all commits in the
history are needed. To avoid blocking on that, metadata are retrieved
asynchronously, populating the view incrementally. The hidden `by-date/.status`
file provides a progress report and is removed upon completion.
## Repository snapshots and branches
Snapshot objects keep track of where each branch and release (or "tag") pointed
at archival time. Here is an example using
the [Unix history repository](https://github.com/dspinellis/unix-history-repo),
which uses historical Unix releases as branch names:
$ cd archive/swh:1:snp:2ca5d6eff8f04a671c0d5b13646cede522c64b7d
$ ls -f refs/heads/ | wc -l
40
$ ls -f refs/heads/ | grep Bell
Bell-32V-Snapshot-Development
Bell-Release
$ cd refs/heads/Bell-Release
$ jq .message,.date meta.json
"Bell 32V release\nSnapshot of the completed development branch\n\nSynthesized-from: 32v\n"
"1979-05-02T23:26:55-05:00"
$ grep core root/usr/src/games/fortune.c
printf("Memory fault -- core dumped\n");
We can check that two of the available branches correspond to historical Bell
Labs UNIX releases. And we can dig into the `fortune` implementation of
[UNIX/32V](https://en.wikipedia.org/wiki/UNIX/32V) instantly, without having to
clone a 1.6 GiB repository first.
## Origin search
Origins can be accessed via the `origin/` top-level directory using their
**encoded** URL (the percent-encoding mechanism described in [RFC
3986](https://tools.ietf.org/html/rfc3986.html)).
$ cd origin/https%3A%2F%2Fgithub.com%2Ftorvalds%2Flinux
$ ls
2015-07-09/ 2016-09-14/ 2017-09-12/ 2018-03-08/ 2018-09-06/ ...
Each directory corresponds to a visit, containing metadata and a symlink to the
visit's snapshot:
$ ls -l origin/https%3A%2F%2Fgithub.com%2Ftorvalds%2Flinux/2020-09-21/
total 0
-r--r--r-- 1 haltode haltode 470 Dec 28 12:12 meta.json
lr--r--r-- 1 haltode haltode 67 Dec 28 12:12 snapshot -> ../../../archive/swh:1:snp:c7beb2432b7e93c4cf6ab09cd194c7c1998df2f9/
In order to find origin URLs, we can use the `web search` CLI:
$ swh web search python --limit 5
https://github.com/neon670/python.dev https://archive.softwareheritage.org/api/1/origin/https://github.com/neon670/python.dev/visits/
https://github.com/aur-archive/python-werkzeug https://archive.softwareheritage.org/api/1/origin/https://github.com/aur-archive/python-werkzeug/visits/
https://github.com/jsagon/jtradutor-web-python https://archive.softwareheritage.org/api/1/origin/https://github.com/jsagon/jtradutor-web-python/visits/
https://github.com/zjmwqx/ipythonCode https://archive.softwareheritage.org/api/1/origin/https://github.com/zjmwqx/ipythonCode/visits/
https://github.com/knutab/Python-BSM https://archive.softwareheritage.org/api/1/origin/https://github.com/knutab/Python-BSM/visits/
The `search` tool is also useful for escaping URLs:
$ swh web search "torvalds linux" --limit 1 --url-encode | cut -f1
https%3A%2F%2Fgithub.com%2Ftorvalds%2Flinux
Software Heritage Filesystem (SwhFS) — Tutorial
===============================================
Installation
------------
The Software Heritage virtual filesystem (SwhFS) is available from PyPI as `swh.fuse
<https://pypi.org/project/swh.fuse/>`_. It can be installed from there using ``pip``:
::
$ pip install swh.fuse
Setup and teardown
------------------
SwhFS is controlled by the ``swh fs`` command-line interface (CLI).
Like all filesystems, SwhFS must be “mounted” before use and “unmounted” afterwards.
Users should first mount the archive as a whole and then browse archived objects looking
up their SWHIDs below the ``archive/`` entry-point. To mount the Software Heritage
archive, use the ``swh fs mount`` command:
::
$ mkdir swhfs
$ swh fs mount swhfs/ # mount the archive
$ ls -1F swhfs/ # list entry points
archive/ # <- start browsing from here
cache/
origin/
README
By default SwhFS daemonizes into background and logs to syslog; it can be kept in
foreground, logging to the console, by passing ``-f/--foreground`` to ``mount``.
To unmount use ``swh fs umount PATH``. Note that, since SwhFS is a *user-space*
filesystem, mounting and unmounting it are not privileged operations, any user can do
it.
The configuration file ``~/.config/swh/global.yml`` is read if present. Its main use
case is inserting a per-user authentication token for the SWH API, which might be needed
in case of heavy use to bypass the default API rate limit. See the
:ref:`configuration documentation <swh-fuse-config>` for details.
Lazy loading
------------
Once mounted, the archive can be navigated as if it were locally available on-disk.
Archived objects are referenced by :ref:`Software Heritage identifiers
<persistent-identifiers>` (SWHIDs). They are loaded on-demand from the archive and
populate lazily the ``archive/`` directory below the SwhFS mount point.
SWHIDs for source code that is not locally available can be obtained in various ways:
searching on the :swh_web:`Software Heritage website </>`; finding SWHID references in
`scientific papers
<https://www.softwareheritage.org/save-and-reference-research-software>`_, `Wikidata
<https://www.wikidata.org/wiki/Property:P6138>`_, and software bills of materials using
the `SPDX standard <https://spdx.dev/>`_; deriving SWHIDs from other version control
system references (e.g., as SWHIDs version 1 are compatible with Git, a Git commit
identifier like ``9d76c0b163675505d1a901e5fe5249a2c55609bc`` can be turned into a SWHID
by simply prefixing it with ``swh:1:rev:`` to obtain
``swh:1:rev:9d76c0b163675505d1a901e5fe5249a2c55609bc``).
Source code files
-----------------
Here is a SwhFS Hello World:
::
$ cd swhfs/
$ cat archive/swh:1:cnt:c839dea9e8e6f0528b468214348fee8669b305b2
#include <stdio.h>
int main(void) {
printf("Hello, World!\n");
}
Given the SWHID of a source code file, we can directly access it via the filesystem.
Metadata about archived source code artifacts is also locally available. For each entry
``archive/<SWHID>`` there is a matching JSON file ``archive/<SWHID>.json``,
corresponding to what the :swh_web:`Software Heritage Web API <api/>` will return. For
example, here is what the Software Heritage archive knows about the above Hello World
implementation:
::
$ cat archive/swh:1:cnt:c839dea9e8e6f0528b468214348fee8669b305b2.json
{
"length": 67,
"status": "visible",
"checksums": {
"sha256": "06dfb5d936f50b3cb80152aa053724e4a18417c35f745b66ab9571c25afd0f79",
"sha1": "459ee8545e5ba6cb819ba41e6ea2f0011cedd728",
"blake2s256": "87e6ab9c92681e9a022a8f4679dcd9d9b841fe4146edcbc15329fc66d8c82b4f",
"sha1_git": "c839dea9e8e6f0528b468214348fee8669b305b2"
},
"data_url": "https://archive.softwareheritage.org/api/1/content/sha1_git:c839dea9e8e6f0528b468214348fee8669b305b2/raw/",
"filetype_url": "https://archive.softwareheritage.org/api/1/content/sha1_git:c839dea9e8e6f0528b468214348fee8669b305b2/filetype/",
"language_url": "https://archive.softwareheritage.org/api/1/content/sha1_git:c839dea9e8e6f0528b468214348fee8669b305b2/language/",
"license_url": "https://archive.softwareheritage.org/api/1/content/sha1_git:c839dea9e8e6f0528b468214348fee8669b305b2/license/"
}
Note: JSON metadata files are indented by default when read; this can be changed in the
configuration file (see :ref:`documentation <swh-fuse-config>`).
Source code trees
-----------------
In addition to individual source code files, we can also browse entire source code
directories. Here is the historical Apollo 11 source code, where we can find interesting
comments about the antenna during landing:
::
$ cd archive/swh:1:dir:1fee702c7e6d14395bbf5ac3598e73bcbf97b030
$ ls | wc -l
127
$ grep -i antenna THE_LUNAR_LANDING.s | cut -f 5
# IS THE LR ANTENNA IN POSITION 1 YET
# BRANCH IF ANTENNA ALREADY IN POSITION 1
We can checkout the commit of a more modern code base, like jQuery, and count its
JavaScript lines of code (SLOC):
::
$ cd archive/swh:1:rev:9d76c0b163675505d1a901e5fe5249a2c55609bc
$ ls -1F
history/
meta.json@
parent@
parents/
root@
$ find root/src/ -type f -name '*.js' | xargs cat | wc -l
10136
History browsing
----------------
``meta.json`` files of revision objects contain complete commit metadata, e.g.:
::
$ jq '.author.name, .date, .message' meta.json
"Michal Golebiowski-Owczarek"
"2020-03-02T23:02:42+01:00"
"Data:Event:Manipulation: Prevent collisions with Object.prototype ..."
Commit history can be browsed commit-by-commit by digging into the ``parent(s)/``
directories or, more efficiently, using the history summaries located under
``history/``:
::
$ ls -f history/by-page/000/ | wc -l
6469
$ ls -f history/by-page/000/ | head -n 5
swh:1:rev:358b769a00c3a09a8ec621b8dcb2d5e31b7da69a
swh:1:rev:4a7fc8544e2020c75047456d11979e4e3a517fdf
swh:1:rev:364476c3dc1231603ba61fc08068fa89fb095e1a
swh:1:rev:721744a9fab5b597febea64e466272eabfdb9463
swh:1:rev:4592595b478be979141ce35c693dbc6b65647173
The jQuery commit at hand is preceded by 6469 commits, which can be listed in ``git
log`` order via the ``by-page`` view. The ``by-hash`` and ``by-date`` views list commits
sharded by commit identifier and timestamp:
::
$ ls history/by-hash/00/ | head -n 5
swh:1:rev:00a9c2e5f4c855382435cec6b3908eb9bd5a53b7
swh:1:rev:005040379d8b64aacbe54941d878efa6e86df1cc
swh:1:rev:00cc67af23bf9cf2cdbaeaeee6ded76baf0292f0
swh:1:rev:00575d4d8c7421c5119f181009374ff2e7736127
swh:1:rev:0019a463bdcb81dc6ba3434505a45774ca27f363
$ ls -1F history/by-date/
2006/
2007/
2008/
...
2018/
2019/
2020/
$ ls -f history/by-date/2020/03/16/
swh:1:rev:90fed4b453a5becdb7f173d9e3c1492390a1441f
$ jq .date history/by-date/2020/03/16/*/meta.json
"2020-03-16T21:49:29+01:00"
Note that to populate the ``by-date`` view, metadata about all commits in the history
are needed. To avoid blocking on that, metadata are retrieved asynchronously, populating
the view incrementally. The hidden ``by-date/.status`` file provides a progress report
and is removed upon completion.
Repository snapshots and branches
---------------------------------
Snapshot objects keep track of where each branch and release (or “tag”) pointed at
archival time. Here is an example using the `Unix history repository
<https://github.com/dspinellis/unix-history-repo>`_, which uses historical Unix releases
as branch names:
::
$ cd archive/swh:1:snp:2ca5d6eff8f04a671c0d5b13646cede522c64b7d
$ ls -f refs/heads/ | wc -l
40
$ ls -f refs/heads/ | grep Bell
Bell-32V-Snapshot-Development
Bell-Release
$ cd refs/heads/Bell-Release
$ jq .message,.date meta.json
"Bell 32V release\nSnapshot of the completed development branch\n\nSynthesized-from: 32v\n"
"1979-05-02T23:26:55-05:00"
$ grep core root/usr/src/games/fortune.c
printf("Memory fault -- core dumped\n");
We can check that two of the available branches correspond to historical Bell Labs UNIX
releases. And we can dig into the ``fortune`` implementation of `UNIX/32V
<https://en.wikipedia.org/wiki/UNIX/32V>`_ instantly, without having to clone a 1.6 GiB
repository first.
Origin search
-------------
Origins can be accessed via the ``origin/`` top-level directory using their **encoded**
URL (the percent-encoding mechanism described in `RFC 3986
<https://tools.ietf.org/html/rfc3986.html>`_).
::
$ cd origin/https%3A%2F%2Fgithub.com%2Ftorvalds%2Flinux
$ ls
2015-07-09/ 2016-09-14/ 2017-09-12/ 2018-03-08/ 2018-09-06/ ...
Each directory corresponds to a visit, containing metadata and a symlink to the visit’s
snapshot:
::
$ ls -l origin/https%3A%2F%2Fgithub.com%2Ftorvalds%2Flinux/2020-09-21/
total 0
-r--r--r-- 1 haltode haltode 470 Dec 28 12:12 meta.json
lr--r--r-- 1 haltode haltode 67 Dec 28 12:12 snapshot -> ../../../archive/swh:1:snp:c7beb2432b7e93c4cf6ab09cd194c7c1998df2f9/
In order to find origin URLs, we can use the ``web search`` CLI:
::
$ swh web search python --limit 5
https://github.com/neon670/python.dev https://archive.softwareheritage.org/api/1/origin/https://github.com/neon670/python.dev/visits/
https://github.com/aur-archive/python-werkzeug https://archive.softwareheritage.org/api/1/origin/https://github.com/aur-archive/python-werkzeug/visits/
https://github.com/jsagon/jtradutor-web-python https://archive.softwareheritage.org/api/1/origin/https://github.com/jsagon/jtradutor-web-python/visits/
https://github.com/zjmwqx/ipythonCode https://archive.softwareheritage.org/api/1/origin/https://github.com/zjmwqx/ipythonCode/visits/
https://github.com/knutab/Python-BSM https://archive.softwareheritage.org/api/1/origin/https://github.com/knutab/Python-BSM/visits/
The ``search`` tool is also useful for escaping URLs:
::
$ swh web search "torvalds linux" --limit 1 --url-encode | cut -f1
https%3A%2F%2Fgithub.com%2Ftorvalds%2Flinux
[mypy]
namespace_packages = True
warn_unused_ignores = True
# 3rd party libraries without stubs (yet)
[mypy-aiosqlite.*]
ignore_missing_imports = True
[mypy-daemon.*]
ignore_missing_imports = True
[mypy-pkg_resources.*]
ignore_missing_imports = True
[mypy-psutil.*]
ignore_missing_imports = True
[mypy-pytest.*]
ignore_missing_imports = True
[mypy-pyfuse3.*]
ignore_missing_imports = True
[mypy-pyfuse3_asyncio.*]
ignore_missing_imports = True
[project]
name = "swh.fuse"
authors = [
{name="Software Heritage developers", email="swh-devel@inria.fr"},
]
description = "Software Heritage virtual file system"
readme = {file = "README.rst", content-type = "text/x-rst"}
requires-python = ">=3.7"
classifiers = [
"Programming Language :: Python :: 3",
"Intended Audience :: Developers",
"License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
"Operating System :: OS Independent",
"Development Status :: 3 - Alpha",
]
dynamic = ["version", "dependencies", "optional-dependencies"]
[tool.setuptools.packages.find]
include = ["swh.*"]
[tool.setuptools.dynamic]
dependencies = {file = ["requirements.txt", "requirements-swh.txt"]}
[tool.setuptools.dynamic.optional-dependencies]
testing = {file = ["requirements.txt", "requirements-swh.txt", "requirements-test.txt"]}
[project.entry-points."swh.cli.subcommands"]
"swh.fuse" = "swh.fuse.cli"
[project.urls]
"Homepage" = "https://gitlab.softwareheritage.org/swh/devel/swh-fuse"
"Bug Reports" = "https://gitlab.softwareheritage.org/swh/devel/swh-fuse/-/issues"
"Funding" = "https://www.softwareheritage.org/donate"
"Documentation" = "https://docs.softwareheritage.org/devel/swh-fuse/"
"Source" = "https://gitlab.softwareheritage.org/swh/devel/swh-fuse.git"
[build-system]
requires = ["setuptools", "setuptools-scm"]
build-backend = "setuptools.build_meta"
[tool.setuptools_scm]
fallback_version = "0.0.1"
[tool.black]
target-version = ['py37']
target-version = ['py39', 'py310', 'py311', 'py312']
[tool.isort]
multi_line_output = 3
......@@ -9,3 +53,37 @@ use_parentheses = true
ensure_newline_before_comments = true
line_length = 88
force_sort_within_sections = true
known_first_party = ['swh']
[tool.mypy]
namespace_packages = true
warn_unused_ignores = true
explicit_package_bases = true
# ^ Needed for mypy to detect py.typed from swh packages installed
# in editable mode
plugins = []
# 3rd party libraries without stubs (yet)
[[tool.mypy.overrides]]
module = [
"aiosqlite.*",
"daemon.*",
]
ignore_missing_imports = true
[tool.flake8]
select = ["C", "E", "F", "W", "B950"]
ignore = [
"E203", # whitespaces before ':' <https://github.com/psf/black/issues/315>
"E231", # missing whitespace after ','
"E501", # line too long, use B950 warning from flake8-bugbear instead
"W503" # line break before binary operator <https://github.com/psf/black/issues/52>
]
max-line-length = 88
[tool.pytest.ini_options]
norecursedirs = "build docs .*"
asyncio_mode = "strict"
consider_namespace_packages = true
asyncio_default_fixture_loop_scope = "function"
\ No newline at end of file
[pytest]
norecursedirs = docs .*