diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 59cb0de58a27d254e4998563682dec289aa72a7a..7fc590b5c95596df1a82e98df1b62e5cc7621f3c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -17,7 +17,7 @@ repos: hooks: - id: codespell name: Check source code spelling - args: [-L aks] + args: [-L aks, -L crate] stages: [commit] - repo: local diff --git a/docs/software-origins-support.yml b/docs/software-origins-support.yml index f2888719c1108b8f6c5a9605562c137c42e22bbc..c3f8a7664fad8a7efa4f36ee771de94e78929325 100644 --- a/docs/software-origins-support.yml +++ b/docs/software-origins-support.yml @@ -4,9 +4,13 @@ forges: lister: status: dev issue: https://gitlab.softwareheritage.org/swh/meta/-/issues/4233 + supports_last_update: true loader: status: dev issue: https://gitlab.softwareheritage.org/swh/meta/-/issues/4233 + metadata: + intrinsic: collected + extrinsic: none grant: sloan-hashbang-2022 developer: hashbang @@ -17,15 +21,22 @@ forges: loader: status: prod id_in_swh_web: tar + metadata: + intrinsic: none + extrinsic: none aur: name: AUR lister: status: dev issue: https://gitlab.softwareheritage.org/swh/meta/-/issues/4466 + supports_last_update: true loader: status: dev issue: https://gitlab.softwareheritage.org/swh/meta/-/issues/4466 + metadata: + intrinsic: collected + extrinsic: none grant: sloan-hashbang-2022 developer: hashbang @@ -37,6 +48,9 @@ forges: status: prod source: https://gitlab.softwareheritage.org/swh/devel/swh-loader-bzr/ package_name: swh.loader.bzr + metadata: + intrinsic: none + extrinsic: none grant: sloan-octobus-2021 developer: octobus @@ -44,15 +58,23 @@ forges: name: Bitbucket lister: status: prod + supports_last_update: true loader: status: N/A + metadata: + intrinsic: none + extrinsic: not collected bower: name: Bower lister: status: staging + supports_last_update: false loader: status: N/A + metadata: + intrinsic: none + extrinsic: not collected grant: nlnet-octobus-2022 developer: octobus @@ -60,17 
+82,25 @@ forges: name: CGit lister: status: prod + supports_last_update: true loader: status: N/A + metadata: + intrinsic: none + extrinsic: not collected conda: name: Conda lister: status: dev issue: https://gitlab.softwareheritage.org/swh/devel/swh-lister/-/issues/4547 + supports_last_update: true loader: status: dev issue: https://gitlab.softwareheritage.org/swh/devel/swh-loader-core/-/issues/4579 + metadata: + intrinsic: collected + extrinsic: not collected grant: nlnet-octobus-2022 developer: octobus @@ -78,9 +108,14 @@ forges: name: CPAN lister: status: dev - issue: https://gitlab.softwareheritage.org/swh/devel/swh-lister/-/issues/2833 + issue: https://gitlab.softwareheritage.org/swh/devel/swh-lister/-/issues/4520 + supports_last_update: true loader: - status: N/A + status: dev + issue: https://gitlab.softwareheritage.org/swh/devel/swh-lister/-/issues/2833 + metadata: + intrinsic: collected + extrinsic: not collected grant: nlnet-octobus-2022 developer: octobus @@ -89,17 +124,25 @@ forges: lister: status: prod id_in_swh_web: CRAN + supports_last_update: true loader: status: prod + metadata: + intrinsic: collected + extrinsic: none crates: name: Crates lister: status: dev issue: https://gitlab.softwareheritage.org/swh/devel/swh-lister/-/issues/1424 + supports_last_update: true loader: status: dev issue: https://gitlab.softwareheritage.org/swh/meta/-/issues/4104 + metadata: + intrinsic: collected + extrinsic: not collected grant: sloan-hashbang-2022 developer: hashbang @@ -111,6 +154,9 @@ forges: status: prod source: https://gitlab.softwareheritage.org/swh/devel/swh-loader-cvs/ package_name: swh.loader.cvs + metadata: + intrinsic: none + extrinsic: none grant: sloan-stsp-cvs developer: stsp @@ -118,9 +164,13 @@ forges: name: Debian lister: status: prod + supports_last_update: true loader: status: prod id_in_swh_web: deb + metadata: + intrinsic: collected + extrinsic: not collected deposit: name: Deposit @@ -128,14 +178,21 @@ forges: status: N/A loader: status: 
prod + metadata: + intrinsic: indexed + extrinsic: indexed gitea: name: Gitea notes: "Reuses the Gogs lister" lister: status: prod + supports_last_update: true loader: status: N/A + metadata: + intrinsic: none + extrinsic: indexed git: name: Git @@ -145,44 +202,67 @@ forges: status: prod source: https://gitlab.softwareheritage.org/swh/devel/swh-loader-git/ package_name: swh.loader.git + metadata: + intrinsic: none + extrinsic: none github: name: GitHub lister: status: prod + supports_last_update: true loader: status: N/A + metadata: + intrinsic: none + extrinsic: indexed gitlab: name: GitLab notes: "Also supports `Heptapod <https://heptapod.net/>`_" lister: status: prod + supports_last_update: true loader: status: N/A + metadata: + intrinsic: none + extrinsic: not collected gnu: name: GNU lister: status: prod id_in_swh_web: GNU + supports_last_update: true loader: status: N/A + metadata: + intrinsic: collected + extrinsic: none gogs: name: Gogs lister: - status: dev + status: prod issue: https://gitlab.softwareheritage.org/swh/devel/swh-lister/-/issues/1721 + supports_last_update: true loader: status: N/A + metadata: + intrinsic: none + extrinsic: not collected golang: name: Golang lister: - status: staging + status: prod + supports_last_update: true loader: - status: staging + status: prod + metadata: + intrinsic: none + extrinsic: none grant: nlnet-octobus-2022 developer: octobus @@ -191,9 +271,13 @@ forges: lister: status: dev issue: https://gitlab.softwareheritage.org/swh/meta/-/issues/4494 + supports_last_update: true loader: status: dev issue: https://gitlab.softwareheritage.org/swh/meta/-/issues/4494 + metadata: + intrinsic: collected + extrinsic: none grant: nlnet-octobus-2022 developer: octobus @@ -201,8 +285,12 @@ forges: name: Launchpad lister: status: prod + supports_last_update: true loader: status: N/A + metadata: + intrinsic: none + extrinsic: not collected maven: name: Maven @@ -210,6 +298,9 @@ forges: status: prod loader: status: prod + metadata: + 
intrinsic: collected + extrinsic: collected grant: sloan-castalia-maven developer: castalia @@ -222,6 +313,9 @@ forges: id_in_swh_web: hg source: https://gitlab.softwareheritage.org/swh/devel/swh-loader-mercurial/ package_name: swh.loader.mercurial + metadata: + intrinsic: none + extrinsic: none grant: sloan-2020 developer: octobus diff --git a/docs/user/software-origins/arch.rst b/docs/user/software-origins/arch.rst index 07d9a223555874ff1a2459a00017d80e887e2bf1..e158419360fde5dabde52afa4879f0a7ee733464 100644 --- a/docs/user/software-origins/arch.rst +++ b/docs/user/software-origins/arch.rst @@ -3,7 +3,26 @@ Archlinux ========= -.. todo:: - This page is a work in progress. - .. include:: dynamic/arch_status.inc + +This page documents how |swh| archives source packages from the +`Archlinux <https://archlinux.org/>`_ and `Archlinux ARM <https://archlinuxarm.org>`_ +distribution. +The `AUR (Archlinux User Repository) <https://aur.archlinux.org/>`_ is +:ref:`described in its own dedicated documentation <user-software-origins-aur>`, +as it uses a very different packaging architecture. + +|swh| currently has a lister and a loader for Archlinux packages, but they list and load +binary packages; and need to be modified to list and load source packages instead. + +Origin URLs match the one of the canonical web page displaying information about each +package. For example: https://archlinux.org/packages/core/x86_64/coreutils/ +and https://aur.archlinux.org/packages/hg-evolve. + +As all metadata about Archlinux packages is stored within the package (in +:file:`PKGBUILD` in the source, or :file:`.PKGINFO` in the binary package), |swh| does +not need to store them as :term:`extrinsic metadata`. 
+ +Resources: + +* `HTTP API documentation <https://wiki.archlinux.org/title/Official_repositories_web_interface>`_ diff --git a/docs/user/software-origins/archive.rst b/docs/user/software-origins/archive.rst index 1b75670c79283276f4d25f8eb34badbdb5295670..b0640ef0c77d4a56b954092f48f83e44d6a3a08c 100644 --- a/docs/user/software-origins/archive.rst +++ b/docs/user/software-origins/archive.rst @@ -7,3 +7,13 @@ Archive loader This page is a work in progress. .. include:: dynamic/archive_status.inc + +TODO: + +* description of the software origin +* summary of the lister's algorithm +* summary of the loader's algorithm +* URL pattern +* collect extrinsic metadata? +* index extrinsic metadata? +* index intrinsic metadata? diff --git a/docs/user/software-origins/aur.rst b/docs/user/software-origins/aur.rst index ecf226c6874065d080b9239f22d90598138e545b..b15ede16ad22376ff78368755b9c3a8a8fcf489e 100644 --- a/docs/user/software-origins/aur.rst +++ b/docs/user/software-origins/aur.rst @@ -3,7 +3,29 @@ AUR === -.. todo:: - This page is a work in progress. - .. include:: dynamic/aur_status.inc + +This page documents how |swh| archives source packages from the +`AUR (Archlinux User Repository) <https://aur.archlinux.org/>`_. +The `Archlinux <https://archlinux.org/>`_ and `Archlinux ARM <https://archlinuxarm.org>`_ +distributions are +:ref:`described in their own dedicated documentation <user-software-origins-arch>`, +as they use a very different packaging architecture. + +The AUR lister sends requests to https://aur.archlinux.org/packages-meta-v1.json.gz +to get a list of packages; then tells the AUR loader to create origins like +https://aur.archlinux.org/hg-evolve.git using tarballs from URLs like +https://aur.archlinux.org/cgit/aur.git/snapshot/hg-evolve.tar.gz + +.. 
note:: + + We should probably use https://aur.archlinux.org/packages/hg-evolve as origin URL + instead of https://aur.archlinux.org/hg-evolve.git + +As all metadata about AUR packages is stored within the :file:`PKGBUILD` file that +serves as source, |swh| does +not need to store them as :term:`extrinsic metadata`. + +Resources: + +* `HTTP API documentation <https://wiki.archlinux.org/title/Aurweb_RPC_interface>`_ diff --git a/docs/user/software-origins/bitbucket.rst b/docs/user/software-origins/bitbucket.rst index fc6dbef3bffe78fb0e4dd5a7e51a611a1f07c9a0..e43d57a09c6042f84cb5e8d11a7e96705ed5b3e7 100644 --- a/docs/user/software-origins/bitbucket.rst +++ b/docs/user/software-origins/bitbucket.rst @@ -3,7 +3,32 @@ Bitbucket ========= -.. todo:: - This page is a work in progress. - .. include:: dynamic/bitbucket_status.inc + +Bitbucket is a Git hosting platform, which used to support Mercurial. + +|swh|'s Bitbucket lister queries the https://api.bitbucket.org/2.0/repositories API +endpoint anonymously. + +It provides a ``updated_on`` field for each repository, matching the last time +the repository (TODO: or project? does it cover stuff like PRs and issues?) was updated; +which is passed as ``last_update`` to the scheduler. + +|swh| does not have a specific loader for Bitbucket; the :ref:`Git +<user-software-origins-git>` loader is used instead. +Therefore, origin URLs are Bitbucket's canonical URL for the corresponding Git +repository: :file:`https://bitbucket.org/{owner}/{name}.git`. + +Bitbucket does not support :ref:`Mercurial <user-software-origins-mercurial>` anymore; +but Mercurial repositories used to be loaded with the Mercurial loader and are +`available in the archive <https://archive.softwareheritage.org/browse/search/?q=bitbucket.org&with_visit=true&with_content=true&visit_type=hg>`__. +Additionally, |swh| provides a `dump of raw Mercurial repositories <https://bitbucket-archive.softwareheritage.org/>`_. 
+ +Bitbucket provides extrinsic metadata on repositories (owner, description, +``created_on``, size, language, fork policy, parent repository, ...) which are currently +not archived. Consequently, fork detection isn't used to speedup archival of git +repositories yet. + +Resources: + +* `HTTP API documentation <https://developer.atlassian.com/cloud/bitbucket/rest/api-group-repositories/>`__ diff --git a/docs/user/software-origins/bower.rst b/docs/user/software-origins/bower.rst index 19bcd8ac8974d3ef9f10f7d6ad1248556946aa7e..71c1e755d825837a7ced1437861bcb05d64c4ebf 100644 --- a/docs/user/software-origins/bower.rst +++ b/docs/user/software-origins/bower.rst @@ -3,7 +3,19 @@ Bower ===== -.. todo:: - This page is a work in progress. - .. include:: dynamic/bower_status.inc + +`Bower <https://bower.io/>`_ is a package manager for the Javascript ecosystem, +which doesn't host its own packages. +Instead, it points to Git repositories hosted externally (eg. on GitHub). + +|swh| archives Bower by querying ``https://registry.bower.io/packages``, which returns +the complete database of the registry: name and repository URL of every package +registered on it. +It then dispatches loading tasks to the :ref:`Git loader <user-software-origins-git>`. + +|swh| currently does not archive the mapping from package names to repository URLs. + +Resources: + +* `Source code of the Bower registry <https://github.com/bower/registry>`_ diff --git a/docs/user/software-origins/bzr.rst b/docs/user/software-origins/bzr.rst index 5a66361f65f8338f5f9c283314b748e08d08b2bc..a01f3db66c654466e30db4c4b0fd6f1c274b26bf 100644 --- a/docs/user/software-origins/bzr.rst +++ b/docs/user/software-origins/bzr.rst @@ -3,7 +3,10 @@ Bazaar ====== -.. todo:: - This page is a work in progress. - .. include:: dynamic/bzr_status.inc + +Bazaar/Breezy repositories are often discovered through listing +:ref:`user-software-origins-launchpad` or package managers. 
+ +Bazaar and Breezy repositories can also be loaded individually if they are not on any recognized +forge, through the `Save Code Now <https://archive.softwareheritage.org/save/>`__ interface. diff --git a/docs/user/software-origins/cgit.rst b/docs/user/software-origins/cgit.rst index b243d8f9ec80c70e59cada8456097cd970830b39..0024ecf872f6fdc740ccf1525391967eae4e4ab3 100644 --- a/docs/user/software-origins/cgit.rst +++ b/docs/user/software-origins/cgit.rst @@ -3,7 +3,26 @@ Cgit ==== -.. todo:: - This page is a work in progress. - .. include:: dynamic/cgit_status.inc + +`CGit <https://git.zx2c4.com/cgit/about/>`_ is a lightweight front-end for Git. + +|swh| archives CGit instances by scraping their HTML, starting from the index page, +then looking for Git URLs in each project's page, embedded as ``<link rel='vcs-git'`` +HTML tags. +Only the first HTTP(S) URL is kept; or the first URL at all, if there is no HTTP(S) URL. + +The CGit lister then dispatches these URLs to the :ref:`Git loader +<user-software-origins-git>`. +CGit projects may have their repositories hosted on arbitrary other domains (even GitHub); +which is supported by |swh|. + +The "summary" pages of CGit projects display the last update of each of their branches; +the lister uses this information to pass a ``last_update`` date to the scheduler. + +New CGit instances can be submitted to |swh| through the +`Add Forge Now <https://archive.softwareheritage.org/add-forge/request/create/>`_ +interface. + +Project description, owner information, and mapping between CGit projects and +repositories on third-party domains are currently not archived. diff --git a/docs/user/software-origins/conda.rst b/docs/user/software-origins/conda.rst index 5ac13ad6a54de75a9b7bf230842ce233a9412713..7e93bca44b86c9823a0c56be703d2e3bc6004a8b 100644 --- a/docs/user/software-origins/conda.rst +++ b/docs/user/software-origins/conda.rst @@ -3,7 +3,27 @@ Conda ===== -.. todo:: - This page is a work in progress. - .. 
include:: dynamic/conda_status.inc + +`Conda <https://conda.io/>`_ is an alternative package manager for Python, used +in particular by the `Anaconda <https://www.anaconda.com/>`_ and +`conda-forge <https://anaconda.org/conda-forge/>`_ distributions, +with support for other language ecosystems. + +|swh| currently has a lister and a loader for Conda packages, but they load +binary packages (``.tar.gz``); and need to be modified to load source packages instead +(``.conda``). + +For every configured channel (``main``, ``conda-forge``, ...) and every architecture +(``linux-64``, ``win-64``, ...), the Conda lister downloads +:file:`https://repo.anaconda.com/pkgs/{channel}/{arch}/repodata.json.bz2`, +from which it extracts a list of package names. Then, for each of these package names, +it triggers a load for the origin :file:`https://anaconda.org/{channel}/{package_name}` +with the list of tarballs of that package. + +.. note:: + + There is a ``_anaconda_depends`` package; what do we and should we do with it? + +Source code from Conda is currently only archived on |swh|'s staging infrastructure. +Metadata from Conda is currently not collected or indexed at all. diff --git a/docs/user/software-origins/cpan.rst b/docs/user/software-origins/cpan.rst index 6b3a172c9df16122dcbb9e31a7ce52d7ba28d301..b9a9ab092c5e615fb286133bff5a7a3c84f1798c 100644 --- a/docs/user/software-origins/cpan.rst +++ b/docs/user/software-origins/cpan.rst @@ -3,7 +3,20 @@ CPAN ==== -.. todo:: - This page is a work in progress. - .. include:: dynamic/cpan_status.inc + +The `Comprehensive Perl Archive Network <https://www.cpan.org/>`_ is Perl's main package +manager. + +CPAN packages archived by |swh| will be associated with the metacpan.org domain rather than +cpan.org in order to point to an original web page with information about the package. +The pattern of origin URLs is: :file:`https://metacpan.org/dist/{package_name}`, +which references all versions of the same package. 
+ +metacpan.org is also used by |swh| to list packages, thanks to its ElasticSearch API. + +CPAN does not seem to store any extrinsic metadata, beyond mapping between author +username and package. Author name and email is present in intrinsic metadata and in +release fields, anyway. + +Source code from CPAN is currently only archived on |swh|'s staging infrastructure. diff --git a/docs/user/software-origins/cran.rst b/docs/user/software-origins/cran.rst index 6bb728a739d88465aafe24635e191b8f3563cc8e..ceffb798f9bf8f511ba655628fa68a2fd8355d13 100644 --- a/docs/user/software-origins/cran.rst +++ b/docs/user/software-origins/cran.rst @@ -3,7 +3,19 @@ CRAN ==== -.. todo:: - This page is a work in progress. - .. include:: dynamic/cran_status.inc + +The `Comprehensive R Archive Network <https://cran.r-project.org/>`_ is the package +management system of the R language. + +CRAN does not expose a language-agnostic API with the information we need, so for +simplicity/efficiency, |swh|'s CRAN lister loads the weekly dump of the CRAN database +(in RDS format) and parses it with ``rpy2`` +Then for each package, it creates an origin with +:file:`https://cran.r-project.org/package={package_name}` as URL. + +R packages have intrinsic metadata, mostly the :file:`DESCRIPTION` file in their root +directory, in the `deb822 <https://manpages.debian.org/bookworm/dpkg-dev/deb822.5.en.html>`_ +format. +|swh|'s R loader parses it to extract authorship information, but this file is otherwise +not parsed yet. diff --git a/docs/user/software-origins/crates.rst b/docs/user/software-origins/crates.rst index e8388e1f69425789a3199a1903a3309e8a200784..2a1111c37abf387d439e3cf4a32926067f83f750 100644 --- a/docs/user/software-origins/crates.rst +++ b/docs/user/software-origins/crates.rst @@ -3,7 +3,12 @@ Crates ====== -.. todo:: - This page is a work in progress. - .. 
include:: dynamic/crates_status.inc + +`crates.io <https://crates.io/>`_ is the package manager of the `Rust programming language +<https://www.rust-lang.org/>`_. + +It relies on `an index hosted on GitHub <https://github.com/rust-lang/crates.io-index>`_, +and provides `database dumps <https://crates.io/data-access>`_, which |swh| uses to +list packages, and create origins using this pattern: +:file:`https://crates.io/crates/{crate}`. diff --git a/docs/user/software-origins/cvs.rst b/docs/user/software-origins/cvs.rst index 16f72c99c2ca3336ce5e5997141af9b784b5b79f..346ea446da9b75b143235f6a630368796691f826 100644 --- a/docs/user/software-origins/cvs.rst +++ b/docs/user/software-origins/cvs.rst @@ -3,7 +3,10 @@ CVS === -.. todo:: - This page is a work in progress. - .. include:: dynamic/cvs_status.inc + +See :ref:`swh-loader-cvs` for a description of how |swh| handles loading CVS +(aka. Concurrent Versions System) repositories. + +CVS repositories can be loaded individually if they are not on any recognized +forge, through the `Save Code Now <https://archive.softwareheritage.org/save/>`__ interface. diff --git a/docs/user/software-origins/debian.rst b/docs/user/software-origins/debian.rst index 014876d953a48204c3ace7cac7c1238c0341ca42..e28948af1ce6b9df3758d4990a9e57fb35cc8f32 100644 --- a/docs/user/software-origins/debian.rst +++ b/docs/user/software-origins/debian.rst @@ -7,3 +7,13 @@ Debian This page is a work in progress. .. include:: dynamic/debian_status.inc + +TODO: + +* description of the software origin +* summary of the lister's algorithm +* summary of the loader's algorithm +* URL pattern +* collect extrinsic metadata? +* index extrinsic metadata? +* index intrinsic metadata? 
diff --git a/docs/user/software-origins/deposit.rst b/docs/user/software-origins/deposit.rst index 870595bcafb7f031c0e117192636e91d6fe35d71..2f8c18dd85a95e8022c67ed25be3bd31d04db2a0 100644 --- a/docs/user/software-origins/deposit.rst +++ b/docs/user/software-origins/deposit.rst @@ -7,3 +7,13 @@ Deposit This page is a work in progress. .. include:: dynamic/deposit_status.inc + +TODO: + +* description of the software origin +* summary of the lister's algorithm +* summary of the loader's algorithm +* URL pattern +* collect extrinsic metadata? +* index extrinsic metadata? +* index intrinsic metadata? diff --git a/docs/user/software-origins/git.rst b/docs/user/software-origins/git.rst index 815770f3daeed6e9b695841bcedaf19f589f1fcb..6ed860830e2999d9d0f5879a367a185da3e17548 100644 --- a/docs/user/software-origins/git.rst +++ b/docs/user/software-origins/git.rst @@ -3,7 +3,12 @@ Git === -.. todo:: - This page is a work in progress. - .. include:: dynamic/git_status.inc + +The Git versioning system inspired the |swh| data model, and |swh| fully supports it +on all forges (including :ref:`user-software-origins-cgit`, :ref:`user-software-origins-gitea`, :ref:`user-software-origins-github`, :ref:`user-software-origins-gitlab`, +:ref:`user-software-origins-gogs`, and formerly `Gitorious +<https://www.softwareheritage.org/2016/07/21/gitorious-retrieved/>`__). + +Git repositories can also be loaded individually if they are not on any recognized +forge, through the `Save Code Now <https://archive.softwareheritage.org/save/>`__ interface. diff --git a/docs/user/software-origins/gitea.rst b/docs/user/software-origins/gitea.rst index 8f28f5794f81de368c548c6eb0f1ae122c527972..b98b6ff139358650db6540f858dbb431cadf5868 100644 --- a/docs/user/software-origins/gitea.rst +++ b/docs/user/software-origins/gitea.rst @@ -3,7 +3,24 @@ Gitea ===== -.. todo:: - This page is a work in progress. - .. include:: dynamic/gitea_status.inc + +Gitea is a Git hosting platform forked from Gogs. 
+ +|swh|'s Gitea lister queries the project API (eg. https://try.gitea.io/api/v1/repos/search +for try.gitea.io) anonymously. + +It provides an ``updated_at`` field for each repository, matching the last time +the repository (TODO: or project? does it cover stuff like PRs and issues?) was updated; +which is passed as ``last_update`` to the scheduler. + +|swh| does not have a specific loader for Gitea; the :ref:`Git +<user-software-origins-git>` loader is used instead. +Therefore, origin URLs are Gitea's canonical URLs for the corresponding Git +repository: :file:`https://{domain}/{owner}/{name}.git`. + +New Gitea instances can be submitted to |swh| through the +`Add Forge Now <https://archive.softwareheritage.org/add-forge/request/create/>`_ +interface. + +|swh| also archives extrinsic project metadata (eg. project description) from Gitea. diff --git a/docs/user/software-origins/github.rst b/docs/user/software-origins/github.rst index e5b57f38239230f3169c611135dccc42cbf60531..65908186181cbe4ed6938991c410cc4256922468 100644 --- a/docs/user/software-origins/github.rst +++ b/docs/user/software-origins/github.rst @@ -7,3 +7,13 @@ GitHub This page is a work in progress. .. include:: dynamic/github_status.inc + +TODO: + +* description of the software origin +* summary of the lister's algorithm +* summary of the loader's algorithm +* URL pattern +* collect extrinsic metadata? +* index extrinsic metadata? +* index intrinsic metadata? diff --git a/docs/user/software-origins/gitlab.rst b/docs/user/software-origins/gitlab.rst index 98a988ed1d20c00fb18584fc35c403ea7cad99db..9c3e567fcba8667f43620193912660c2632a9023 100644 --- a/docs/user/software-origins/gitlab.rst +++ b/docs/user/software-origins/gitlab.rst @@ -1,9 +1,30 @@ .. _user-software-origins-gitlab: +.. _user-software-origins-heptapod: GitLab ====== -.. todo:: - This page is a work in progress. - .. include:: dynamic/gitlab_status.inc + +GitLab is a Git hosting platform. Its fork Heptapod also supports Mercurial. 
+ +|swh|'s GitLab lister queries the project API (eg. https://gitlab.com/api/v4/projects +for gitlab.com) anonymously. + +It provides a ``last_activity_at`` field for each repository, matching the last time +the repository (TODO: or project? does it cover stuff like PRs and issues?) was updated; +which is passed as ``last_update`` to the scheduler. + +|swh| does not have a specific loader for GitLab/Heptapod; the :ref:`Git +<user-software-origins-git>` and :ref:`Mercurial <user-software-origins-mercurial>` +loaders are used instead. +Therefore, origin URLs are GitLab/Heptapod's canonical URLs for the corresponding Git +or Mercurial repository: :file:`https://{domain}/{owner}/{name}.git` and +:file:`https://{domain}/{owner}/{name}` respectively. + +New GitLab/Heptapod instances can be submitted to |swh| through the +`Add Forge Now <https://archive.softwareheritage.org/add-forge/request/create/>`_ +interface. + +|swh| currently does not archive extrinsic metadata from GitLab or Heptapod due to +`a limitation of the GitLab API <https://gitlab.com/gitlab-org/gitlab/-/issues/361952>`__. diff --git a/docs/user/software-origins/gnu.rst b/docs/user/software-origins/gnu.rst index 5aadba670ab8fae94bf0de30c9debf5458433302..b6d5fc0e5e4d675802e7d3e741acb1de1c942c5d 100644 --- a/docs/user/software-origins/gnu.rst +++ b/docs/user/software-origins/gnu.rst @@ -3,7 +3,11 @@ GNU projects ============ -.. todo:: - This page is a work in progress. - .. include:: dynamic/gnu_status.inc + +|swh| archives all software available on https://ftp.gnu.org. It does so by listing +projects from https://ftp.gnu.org/tree.json.gz and passing them to the +:ref:`Archive loader <user-software-origins-archive>`. + +This API provides a ``time`` field for each file, matching the time the file +was uploaded or updated; which is passed as ``last_update`` to the scheduler. 
diff --git a/docs/user/software-origins/gogs.rst b/docs/user/software-origins/gogs.rst index cadfb9891a010ec0dec64b324cf8847b7aa9d88a..e27139b28656eaade4a6de2aa14e93e13011d8d6 100644 --- a/docs/user/software-origins/gogs.rst +++ b/docs/user/software-origins/gogs.rst @@ -3,7 +3,25 @@ Gogs ==== -.. todo:: - This page is a work in progress. - .. include:: dynamic/gogs_status.inc + +Gogs (Go Git Service) is a Git hosting platform. + +|swh|'s Gogs lister queries the project API (eg. https://try.gogs.io/api/v1/repos/search +for try.gogs.io), usually with an authentication token as Gogs does not allow anonymous +access. + +It provides an ``updated_at`` field for each repository, matching the last time +the repository (TODO: or project? does it cover stuff like PRs and issues?) was updated; +which is passed as ``last_update`` to the scheduler. + +|swh| does not have a specific loader for Gogs; the :ref:`Git +<user-software-origins-git>` loader is used instead. +Therefore, origin URLs are Gogs's canonical URLs for the corresponding Git +repository: :file:`https://{domain}/{owner}/{name}.git`. + +New Gogs instances can be submitted to |swh| through the +`Add Forge Now <https://archive.softwareheritage.org/add-forge/request/create/>`_ +interface. + +|swh| does not yet archive extrinsic project metadata (eg. project description) from Gogs. diff --git a/docs/user/software-origins/golang.rst b/docs/user/software-origins/golang.rst index 65154b638784cb9c6bab2cb9da859b573f2902a8..aac69b3846f3886e2e6cc43de7014fd1555295e2 100644 --- a/docs/user/software-origins/golang.rst +++ b/docs/user/software-origins/golang.rst @@ -3,7 +3,29 @@ Golang ====== -.. todo:: - This page is a work in progress. - .. include:: dynamic/golang_status.inc + +The `Go programming language <https://go.dev/>`_ identifies modules using URL-like +strings, called the "module path". 
+Module paths start with a domain and path to a VCS repository (usually Git) and +optionally path of a directory within that repository. See the +`Go Modules Reference <https://go.dev/ref/mod>`_ for details. + +|swh| follows the convention of the Golang ecosystem of proxying through the +proxy.golang.org rather than accessing these repositories directly in order to be +as close as possible to the Go build system. + +Go origin URLs in |swh| are module paths prefixed with ``https://pkg.go.dev/``. +For example, the origin URL for module ``github.com/gofiber/fiber`` is +``https://pkg.go.dev/github.com/gofiber/fiber`` (`see it in the archive <https://archive.softwareheritage.org/browse/origin/directory/?origin_url=https://pkg.go.dev/github.com/gofiber/fiber>`__) + +In the Golang ecosystem, it is customary to handle breaking changes in a module by +publishing the new module version at a different path; for example +``github.com/gofiber/fiber/v2``. +See `Module version numbering <https://go.dev/doc/modules/version-numbers>`_ for details. +|swh| follows this convention, and uses different origin URLs for new major versions, +such as ``https://pkg.go.dev/github.com/gofiber/fiber/v2`` (`see it in the archive <https://archive.softwareheritage.org/browse/origin/directory/?origin_url=https://pkg.go.dev/github.com/gofiber/fiber/v2>`__) + +On the technical side, |swh| fetches the list of known Go modules from +https://index.golang.org/index, and relies on the given timestamps to detect updates +to packages archived in the past. diff --git a/docs/user/software-origins/hackage.rst b/docs/user/software-origins/hackage.rst index c59f46e52f9f73c5052f31f90e623ac0170a4ef2..002b18adb2ce01a79259362d20f58278e69980da 100644 --- a/docs/user/software-origins/hackage.rst +++ b/docs/user/software-origins/hackage.rst @@ -3,7 +3,22 @@ Hackage ======= -.. todo:: - This page is a work in progress. - .. 
include:: dynamic/hackage_status.inc + +`Hackage <https://hackage.haskell.org/>`_ is the main package manager for the +Haskell ecosystem. + +|swh| archives Hackage by querying ``https://hackage.haskell.org/packages/search``, which +returns the list of packages updated since a given date. +It then dispatches loading tasks to a dedicated loader, which downloads a list of revisions +from :file:`https://hackage.haskell.org/package/{pkgname}-{version}/revisions/` and the packages +themselves from +:file:`https://hackage.haskell.org/package/{pkgname}-{version}/{pkgname}-{version}.tar.gz`. + +Metadata from Hackage is archived as part of each package (in ``.cabal`` files). + +Resources: + +* `Source code of Hackage <https://github.com/haskell/hackage-server>`_ + +Source code from Hackage is currently only archived on |swh|'s staging infrastructure. diff --git a/docs/user/software-origins/launchpad.rst b/docs/user/software-origins/launchpad.rst index ee285506f5f9b17ed8f2e3a799503b4cdacb7424..9accda478ebb6979053da195deb1c6534697d0d4 100644 --- a/docs/user/software-origins/launchpad.rst +++ b/docs/user/software-origins/launchpad.rst @@ -3,7 +3,16 @@ Launchpad ========= -.. todo:: - This page is a work in progress. - .. include:: dynamic/launchpad_status.inc + +`Launchpad <https://launchpad.net/>`_ is a Bazaar and Git hosting platform. + +It provides a ``bzr_date_last_modified``/``git_date_last_modified`` field for each +repository, matching the last time the repository was updated; +which is passed as ``last_update`` to the scheduler. + +|swh| does not have a specific loader for Launchpad; the :ref:`Bazaar +<user-software-origins-bzr>` and :ref:`Git <user-software-origins-git>` loaders are used +instead. +Therefore, origin URLs are Launchpad's canonical URLs for the corresponding Bazaar or Git +repository. 
diff --git a/docs/user/software-origins/maven.rst b/docs/user/software-origins/maven.rst index bd163be4c4e573f87caf09d16456ebad81e11a8e..c283fab54e21dc752402e07b96ec2d1f75bfb384 100644 --- a/docs/user/software-origins/maven.rst +++ b/docs/user/software-origins/maven.rst @@ -3,7 +3,14 @@ Maven ===== -.. todo:: - This page is a work in progress. - .. include:: dynamic/maven_status.inc + +`Maven <https://maven.apache.org/>`_ is Java's main package manager. There are multiple +Maven repositories, each of which stores both binary packages (JAR files containing Java +classes) and source code (as source JARs). |swh| archives the latter. + +Additionally, |swh| archives each package's :file:`pom.xml` as :term:`extrinsic metadata` +and mines them for links to external version control systems to archive. + +See the `Maven lister's documentation <https://gitlab.softwareheritage.org/swh/devel/swh-lister/-/blob/master/swh/lister/maven/README.md>`_ +for details on its implementation. diff --git a/docs/user/software-origins/mercurial.rst b/docs/user/software-origins/mercurial.rst index e7563c07368ce03a299b5d90c2d87f42e484fb31..4c57c696a468697fe611221510850a0ecd075e7b 100644 --- a/docs/user/software-origins/mercurial.rst +++ b/docs/user/software-origins/mercurial.rst @@ -3,7 +3,14 @@ Mercurial ========= -.. todo:: - This page is a work in progress. - .. include:: dynamic/mercurial_status.inc + +Mercurial repositories are often discovered through listing package managers +or forges like :ref:`Heptapod <user-software-origins-gitlab>` or formerly +:ref:`Bitbucket <user-software-origins-bitbucket>`. + +Mercurial repositories can also be loaded individually if they are not on any recognized +forge, through the `Save Code Now <https://archive.softwareheritage.org/save/>`__ interface. + +|swh| supports loading Mercurial repositories, but is currently missing some advanced +history manipulation features of Mercurial. 
diff --git a/docs/user/software-origins/nixguix.rst b/docs/user/software-origins/nixguix.rst index dd042fa6865054743bb43422eeffb9ff9d2385db..6acfc6ffc05ab6c51385c1c1fe96f7c9d9596bcd 100644 --- a/docs/user/software-origins/nixguix.rst +++ b/docs/user/software-origins/nixguix.rst @@ -7,3 +7,13 @@ Nix and Guix This page is a work in progress. .. include:: dynamic/nixguix_status.inc + +TODO: + +* description of the software origin +* summary of the lister's algorithm +* summary of the loader's algorithm +* URL pattern +* collect extrinsic metadata? +* index extrinsic metadata? +* index intrinsic metadata? diff --git a/docs/user/software-origins/npm.rst b/docs/user/software-origins/npm.rst index 49f4a8f1f6a77e43e72e24ba95e214cbfef49316..3009aa1dee55f55bba26ba6a230234dc67e90502 100644 --- a/docs/user/software-origins/npm.rst +++ b/docs/user/software-origins/npm.rst @@ -7,3 +7,13 @@ NPM This page is a work in progress. .. include:: dynamic/npm_status.inc + +TODO: + +* description of the software origin +* summary of the lister's algorithm +* summary of the loader's algorithm +* URL pattern +* collect extrinsic metadata? +* index extrinsic metadata? +* index intrinsic metadata? diff --git a/docs/user/software-origins/opam.rst b/docs/user/software-origins/opam.rst index 88a5834b30a908e7d449eadece8a32a0f50cb346..3516300e0ac2537bb405b7fed815f9e673550878 100644 --- a/docs/user/software-origins/opam.rst +++ b/docs/user/software-origins/opam.rst @@ -7,3 +7,13 @@ Opam This page is a work in progress. .. include:: dynamic/opam_status.inc + +TODO: + +* description of the software origin +* summary of the lister's algorithm +* summary of the loader's algorithm +* URL pattern +* collect extrinsic metadata? +* index extrinsic metadata? +* index intrinsic metadata? 
diff --git a/docs/user/software-origins/packagist.rst b/docs/user/software-origins/packagist.rst index f51ece2587a92cc14a8dcb0b3b6939c0902f63e1..aa7d94960d55fab947b5ecef384e4d32f0edcd76 100644 --- a/docs/user/software-origins/packagist.rst +++ b/docs/user/software-origins/packagist.rst @@ -7,3 +7,13 @@ Packagist This page is a work in progress. .. include:: dynamic/packagist_status.inc + +TODO: + +* description of the software origin +* summary of the lister's algorithm +* summary of the loader's algorithm +* URL pattern +* collect extrinsic metadata? +* index extrinsic metadata? +* index intrinsic metadata? diff --git a/docs/user/software-origins/phabricator.rst b/docs/user/software-origins/phabricator.rst index f86258a62c20b0564650d7f0e87a00d96a450679..66f79a6b171e412a8e6cb44d425967996ec0f199 100644 --- a/docs/user/software-origins/phabricator.rst +++ b/docs/user/software-origins/phabricator.rst @@ -7,3 +7,13 @@ Phabricator This page is a work in progress. .. include:: dynamic/phabricator_status.inc + +TODO: + +* description of the software origin +* summary of the lister's algorithm +* summary of the loader's algorithm +* URL pattern +* collect extrinsic metadata? +* index extrinsic metadata? +* index intrinsic metadata? diff --git a/docs/user/software-origins/pubdev.rst b/docs/user/software-origins/pubdev.rst index 1fc7bf757fca24bb837e1b00f3a924529696f8e4..7288a0b4d5fa6d386841bc1bdc18a00d902e93f1 100644 --- a/docs/user/software-origins/pubdev.rst +++ b/docs/user/software-origins/pubdev.rst @@ -7,3 +7,13 @@ Pub.Dev This page is a work in progress. .. include:: dynamic/pubdev_status.inc + +TODO: + +* description of the software origin +* summary of the lister's algorithm +* summary of the loader's algorithm +* URL pattern +* collect extrinsic metadata? +* index extrinsic metadata? +* index intrinsic metadata? 
diff --git a/docs/user/software-origins/puppet.rst b/docs/user/software-origins/puppet.rst index e77efacfbdbfc262e44dfc58b1827c9492785d21..4f0481cdcfe5f3811aac875d5d0ff81b601d5c4b 100644 --- a/docs/user/software-origins/puppet.rst +++ b/docs/user/software-origins/puppet.rst @@ -7,3 +7,13 @@ Puppet This page is a work in progress. .. include:: dynamic/puppet_status.inc + +TODO: + +* description of the software origin +* summary of the lister's algorithm +* summary of the loader's algorithm +* URL pattern +* collect extrinsic metadata? +* index extrinsic metadata? +* index intrinsic metadata? diff --git a/docs/user/software-origins/pypi.rst b/docs/user/software-origins/pypi.rst index 8fe948b0132dccee9776f79f88232a4c7ef6a1e4..ecb22274880bf17ba994f56a49c4eff82a834674 100644 --- a/docs/user/software-origins/pypi.rst +++ b/docs/user/software-origins/pypi.rst @@ -7,3 +7,13 @@ PyPI This page is a work in progress. .. include:: dynamic/pypi_status.inc + +TODO: + +* description of the software origin +* summary of the lister's algorithm +* summary of the loader's algorithm +* URL pattern +* collect extrinsic metadata? +* index extrinsic metadata? +* index intrinsic metadata? diff --git a/docs/user/software-origins/rubygems.rst b/docs/user/software-origins/rubygems.rst index 37ff1dda049f21ffec06d667ca6ef9ffc8729a5f..61605053781f2b95fcf65e82d5f72b3da0468085 100644 --- a/docs/user/software-origins/rubygems.rst +++ b/docs/user/software-origins/rubygems.rst @@ -7,3 +7,13 @@ RubyGems This page is a work in progress. .. include:: dynamic/rubygems_status.inc + +TODO: + +* description of the software origin +* summary of the lister's algorithm +* summary of the loader's algorithm +* URL pattern +* collect extrinsic metadata? +* index extrinsic metadata? +* index intrinsic metadata? 
diff --git a/docs/user/software-origins/sourceforge.rst b/docs/user/software-origins/sourceforge.rst index 533e06f5a959ab4003f98cb3054b19e6e62af8e7..934afb42a2e2cd961a4baea81f5ece70d419fc0c 100644 --- a/docs/user/software-origins/sourceforge.rst +++ b/docs/user/software-origins/sourceforge.rst @@ -7,3 +7,13 @@ SourceForge This page is a work in progress. .. include:: dynamic/sourceforge_status.inc + +TODO: + +* description of the software origin +* summary of the lister's algorithm +* summary of the loader's algorithm +* URL pattern +* collect extrinsic metadata? +* index extrinsic metadata? +* index intrinsic metadata? diff --git a/docs/user/software-origins/svn.rst b/docs/user/software-origins/svn.rst index c36ebeac0a42036af615b4673f6e71d4938120c5..0013055655f1836c5cb53c1b72a1f13ce0bc2f92 100644 --- a/docs/user/software-origins/svn.rst +++ b/docs/user/software-origins/svn.rst @@ -7,3 +7,13 @@ Subversion This page is a work in progress. .. include:: dynamic/svn_status.inc + +TODO: + +* description of the software origin +* summary of the lister's algorithm +* summary of the loader's algorithm +* URL pattern +* collect extrinsic metadata? +* index extrinsic metadata? +* index intrinsic metadata? diff --git a/docs/user/software-origins/tuleap.rst b/docs/user/software-origins/tuleap.rst index d718a5a9be5f5a176ec3b0152e7f000d34197214..417a37a8251ca5c48ae8fa9fc51a548cfc03d6a9 100644 --- a/docs/user/software-origins/tuleap.rst +++ b/docs/user/software-origins/tuleap.rst @@ -7,3 +7,13 @@ Tuleap This page is a work in progress. .. include:: dynamic/tuleap_status.inc + +TODO: + +* description of the software origin +* summary of the lister's algorithm +* summary of the loader's algorithm +* URL pattern +* collect extrinsic metadata? +* index extrinsic metadata? +* index intrinsic metadata?