diff --git a/README.md b/README.md index 91d6e79781971d4e403a8a789a5a47574b62d7ce..4164a9ee7f7e2595d6baa1763bcd167926702757 100644 --- a/README.md +++ b/README.md @@ -28,6 +28,7 @@ following Python modules: - `swh.lister.pypi` - `swh.lister.rpm` - `swh.lister.tuleap` +- `swh.lister.bioconductor` Dependencies ------------ diff --git a/setup.py b/setup.py index 1b4455fff21ac1463e7de6eea1ab3c68783f7b8c..49bcfdfdca26d61b78aa9c44ce568a6e3593a532 100755 --- a/setup.py +++ b/setup.py @@ -93,6 +93,7 @@ setup( lister.stagit=swh.lister.stagit:register lister.tuleap=swh.lister.tuleap:register lister.maven=swh.lister.maven:register + lister.bioconductor=swh.lister.bioconductor:register """, classifiers=[ "Programming Language :: Python :: 3", diff --git a/swh/lister/bioconductor/__init__.py b/swh/lister/bioconductor/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..546ab50d6a2507f7d0023e116093a6e915a1da71 --- /dev/null +++ b/swh/lister/bioconductor/__init__.py @@ -0,0 +1,13 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +def register(): + from .lister import BioconductorLister + + return { + "lister": BioconductorLister, + "task_modules": [f"{__name__}.tasks"], + } diff --git a/swh/lister/bioconductor/lister.py b/swh/lister/bioconductor/lister.py new file mode 100644 index 0000000000000000000000000000000000000000..95895cdecea54d80eec6ae608cb03f3c0a1b3562 --- /dev/null +++ b/swh/lister/bioconductor/lister.py @@ -0,0 +1,314 @@ +# Copyright (C) 2023 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from dataclasses import dataclass, field +import json +import logging 
from typing import Any, Dict, Iterator, List, Optional, Set, Tuple
from urllib.parse import urljoin

from bs4 import BeautifulSoup
from debian.deb822 import Sources
import iso8601
from packaging import version
from requests import HTTPError

from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin

from ..pattern import CredentialsType, Lister

logger = logging.getLogger(__name__)

Release = str
Category = str
BioconductorListerPage = Optional[Tuple[Release, Category, Dict[str, Any]]]

# Releases strictly older than this one are listed with the category placed
# before the release in the URL, and only through the "bioc" category.
_CATEGORY_FIRST_LAYOUT_BEFORE = version.parse("1.8")
# Releases strictly older than this one are listed through PACKAGES files;
# from this release on, a packages.json index is fetched instead.
_JSON_INDEX_SINCE = version.parse("2.5")


@dataclass
class BioconductorListerState:
    """State of the Bioconductor lister"""

    package_versions: Dict[str, Set[str]] = field(default_factory=dict)
    """Dictionary mapping a package name to all the versions found during
    last listing"""


class BioconductorLister(Lister[BioconductorListerState, BioconductorListerPage]):
    """List origins from Bioconductor, a collection of open source software
    for bioinformatics based on the R statistical programming language.

    One page is fetched per (release, category) pair. Origins are accumulated
    across all pages (a package usually shows up in several releases) and are
    only emitted once the ``None`` sentinel page produced at the end of
    :meth:`get_pages` is processed.
    """

    LISTER_NAME = "bioconductor"
    VISIT_TYPE = "bioconductor"
    INSTANCE = "bioconductor"

    BIOCONDUCTOR_HOMEPAGE = "https://www.bioconductor.org"

    def __init__(
        self,
        scheduler: SchedulerInterface,
        url: str = BIOCONDUCTOR_HOMEPAGE,
        instance: str = INSTANCE,
        credentials: Optional[CredentialsType] = None,
        releases: Optional[List[Release]] = None,
        categories: Optional[List[Category]] = None,
        incremental: bool = False,
        max_origins_per_page: Optional[int] = None,
        max_pages: Optional[int] = None,
        enable_origins: bool = True,
        record_batch_size: int = 1000,
    ):
        """Set up the lister.

        Args:
            releases: releases to list; when ``None`` they are scraped from
                the release announcements page (this performs an HTTP
                request from the constructor).
            categories: categories to list for releases >= 2.5; an empty or
                missing value selects ``bioc``, ``workflows``,
                ``data/annotation`` and ``data/experiment``.
            incremental: when true, packages already recorded in the lister
                state are only sent again if a new version was found.

        The remaining arguments are forwarded unchanged to the base lister.
        """
        super().__init__(
            scheduler=scheduler,
            url=url,
            instance=instance,
            credentials=credentials,
            max_origins_per_page=max_origins_per_page,
            max_pages=max_pages,
            enable_origins=enable_origins,
            record_batch_size=record_batch_size,
        )

        if releases is None:
            self.releases = self.fetch_versions()
        else:
            self.releases = releases

        self.categories = categories or [
            "bioc",
            "workflows",
            "data/annotation",
            "data/experiment",
        ]

        self.incremental = incremental

        # Every origin built so far, keyed by origin URL.
        self.listed_origins: Dict[str, ListedOrigin] = {}
        # URLs of the origins that will actually be emitted by the sentinel page.
        self.origins_to_send: Set[str] = set()
        # "<release>/<category>/<version>" keys seen during this run, per package.
        self.package_versions: Dict[str, Set[str]] = {}

    def state_from_dict(self, d: Dict[str, Any]) -> BioconductorListerState:
        """Rebuild the lister state from its serialised form (lists -> sets)."""
        return BioconductorListerState(
            package_versions={k: set(v) for k, v in d.items()}
        )

    def state_to_dict(self, state: BioconductorListerState) -> Dict[str, Any]:
        """Serialise the lister state, turning each version set into a list.

        The lists are sorted so that an unchanged state always serialises to
        the same value, whatever the iteration order of the sets.
        """
        return {k: sorted(v) for k, v in state.package_versions.items()}

    def origin_url_for_package(self, package_name: str) -> str:
        """Return the origin URL used for the package named ``package_name``."""
        return f"{self.BIOCONDUCTOR_HOMEPAGE}/packages/{package_name}"

    def get_pages(self) -> Iterator[BioconductorListerPage]:
        """Return an iterator for each page. Every page is a (release, category) pair.

        Index files answering with an HTTP error are skipped. A final ``None``
        page is yielded once every release has been processed.
        """
        for release in self.releases:
            release_version = version.parse(release)
            categories: List[Category]
            if release_version < _CATEGORY_FIRST_LAYOUT_BEFORE:
                # only bioc category existed before 1.8
                url_template = urljoin(
                    self.url, "/packages/{category}/{release}/src/contrib/PACKAGES"
                )
                categories = ["bioc"]
            elif release_version < _JSON_INDEX_SINCE:
                # workflows category won't exist for these
                url_template = urljoin(
                    self.url, "/packages/{release}/{category}/src/contrib/PACKAGES"
                )
                categories = ["bioc", "data/annotation", "data/experiment"]
            else:
                url_template = urljoin(
                    self.url, "/packages/json/{release}/{category}/packages.json"
                )
                # Drop duplicates but keep the configured order, so pages are
                # requested in the same order on every run.
                categories = list(dict.fromkeys(self.categories))

            for category in categories:
                url = url_template.format(release=release, category=category)
                try:
                    response = self.http_request(url)
                except HTTPError as e:
                    logger.debug(
                        "Skipping page since got %s response for %s",
                        e.response.status_code,
                        url,
                    )
                    continue

                yield (release, category, self.parse_packages(response.text))

        # Yield extra none to signal get_origins_from_page()
        # to stop iterating and yield the extracted origins
        yield None

    def fetch_versions(self) -> List[str]:
        """Scrape the release names from the release announcements page.

        Table rows are read in reverse order, and a release is only kept when
        the third cell of its row contains a link. Rows with fewer than three
        cells are ignored.
        """
        html = self.http_request(
            f"{self.BIOCONDUCTOR_HOMEPAGE}/about/release-announcements"
        ).text
        bs = BeautifulSoup(html, "html.parser")

        releases: List[str] = []
        for row in reversed(bs.find("table").find("tbody").find_all("tr")):
            cells = row.find_all("td")
            if len(cells) > 2 and cells[2].find("a"):
                releases.append(cells[0].text)
        return releases

    def parse_packages(self, text: str) -> Dict[str, Any]:
        """Parses packages.json and PACKAGES files

        JSON is tried first; anything that is not valid JSON is parsed as a
        sequence of deb822 paragraphs keyed by their ``Package`` field.
        """
        try:
            return json.loads(text)
        except json.JSONDecodeError:
            pass

        sources = Sources.iter_paragraphs(text)
        return {s["Package"]: dict(s) for s in sources}

    def get_origins_from_page(
        self, page: BioconductorListerPage
    ) -> Iterator[ListedOrigin]:
        """Convert a page of BioconductorLister PACKAGES/packages.json
        metadata into a list of ListedOrigins

        Regular pages only update the internal bookkeeping and yield nothing;
        the accumulated origins are yielded when the ``None`` sentinel page is
        received.
        """
        assert self.lister_obj.id is not None

        if page is None:
            # Sorted so that origins are emitted in a reproducible order.
            for origin_url in sorted(self.origins_to_send):
                yield self.listed_origins[origin_url]

            return

        release, category, packages = page

        # The layout only depends on the release: decide once per page rather
        # than re-parsing the version strings for every package.
        release_version = version.parse(release)
        category_first_layout = release_version < _CATEGORY_FIRST_LAYOUT_BEFORE
        has_json_index = release_version >= _JSON_INDEX_SINCE

        origins_to_send: Set[str] = set()

        for pkg_name, pkg_metadata in packages.items():
            pkg_version = pkg_metadata["Version"]
            last_update_date = None
            last_update_str = ""

            if category_first_layout:
                tar_url = urljoin(
                    self.url,
                    f"/packages/{category}/{release}/src/contrib/Source/{pkg_name}_{pkg_version}.tar.gz",
                )
            elif not has_json_index:
                tar_url = urljoin(
                    self.url,
                    f"/packages/{release}/{category}/src/contrib/{pkg_name}_{pkg_version}.tar.gz",
                )
            else:
                # Some packages don't have a download URL (based on source.ver)
                # and hence can't be archived. For example see the package
                # maEndToEnd at the end of
                # https://bioconductor.org/packages/json/3.17/workflows/packages.json

                # Even guessing tar url path based on the expected url format doesn't work. i.e.
                # https://bioconductor.org/packages/3.17/workflows/src/contrib/maEndToEnd_2.20.0.tar.gz
                # doesn't respond with a tar file. Plus, the mirror clearly shows
                # that maEndToEnd tar is missing.
                # https://ftp.gwdg.de/pub/misc/bioconductor/packages/3.17/workflows/src/contrib/
                # So skipping such packages

                if "source.ver" not in pkg_metadata:
                    logger.info(
                        (
                            "Skipping package %s listed in release %s "
                            "category %s since it doesn't have a download URL"
                        ),
                        pkg_name,
                        release,
                        category,
                    )
                    continue

                if "git_url" in pkg_metadata:
                    # Along with the .tar.gz files grab the git repo as well
                    git_origin_url = pkg_metadata["git_url"]
                    git_last_update_str = pkg_metadata.get("git_last_commit_date")
                    self.listed_origins[git_origin_url] = ListedOrigin(
                        lister_id=self.lister_obj.id,
                        visit_type="git",
                        url=git_origin_url,
                        last_update=(
                            iso8601.parse_date(git_last_update_str)
                            if git_last_update_str
                            else None
                        ),
                    )
                    origins_to_send.add(git_origin_url)

                tar_url = urljoin(
                    self.url,
                    f"/packages/{release}/{category}/{pkg_metadata['source.ver']}",
                )

                last_update_str = pkg_metadata.get(
                    "Date/Publication", pkg_metadata.get("git_last_commit_date")
                )
                last_update_date = (
                    iso8601.parse_date(last_update_str) if last_update_str else None
                )
                # For some packages in releases >= 2.5, last_update can still
                # remain None. Example: See "adme16cod.db" entry in
                # https://bioconductor.org/packages/json/3.17/data/annotation/packages.json

            origin_url = self.origin_url_for_package(pkg_name)
            package_version_key = f"{release}/{category}/{pkg_version}"

            origin = self.listed_origins.get(origin_url)
            if origin is None:
                origin = ListedOrigin(
                    lister_id=self.lister_obj.id,
                    visit_type=self.VISIT_TYPE,
                    url=origin_url,
                    last_update=last_update_date,
                    extra_loader_arguments={"packages": {}},
                )
                self.listed_origins[origin_url] = origin
                self.package_versions[pkg_name] = set()

            origins_to_send.add(origin_url)

            package_info: Dict[str, Any] = {
                "release": release,
                "version": pkg_version,
                "category": category,
                "package": pkg_name,
                "tar_url": tar_url,
            }
            if "MD5sum" in pkg_metadata:
                package_info["checksums"] = {"md5": pkg_metadata["MD5sum"]}
            if last_update_str:
                package_info["last_update_date"] = last_update_str

            origin.extra_loader_arguments["packages"][
                package_version_key
            ] = package_info

            # Keep the most recent known date. An origin first seen in a
            # release without dates (releases are usually walked oldest
            # first) starts with last_update=None and must still pick up the
            # first date found in a later release.
            if last_update_date is not None and (
                origin.last_update is None or last_update_date > origin.last_update
            ):
                origin.last_update = last_update_date

            self.package_versions[pkg_name].add(package_version_key)

            # package has been listed during a previous listing
            if self.incremental and pkg_name in self.state.package_versions:
                new_versions = (
                    self.package_versions[pkg_name]
                    - self.state.package_versions[pkg_name]
                )
                # no new versions, no need to send the origin to the scheduler
                if not new_versions:
                    origins_to_send.discard(origin_url)

        self.origins_to_send.update(origins_to_send)

    def finalize(self) -> None:
        """Persist the versions seen (incremental mode only) and flag the
        listing as updated when at least one origin was built."""
        if self.incremental:
            self.state.package_versions = self.package_versions

        self.updated = bool(self.listed_origins)
a/swh/lister/bioconductor/tasks.py b/swh/lister/bioconductor/tasks.py new file mode 100644 index 0000000000000000000000000000000000000000..2486af2bf7b010b14314a8203f9c960af7a41cb5 --- /dev/null +++ b/swh/lister/bioconductor/tasks.py @@ -0,0 +1,28 @@ +# Copyright (C) 2022-2023 the Software Heritage developers +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from typing import Dict + +from celery import shared_task + +from .lister import BioconductorLister + + +@shared_task(name=__name__ + ".BioconductorListerTask") +def list_bioconductor_full(**lister_args) -> Dict[str, int]: + """Full listing of Bioconductor packages""" + lister = BioconductorLister.from_configfile(**lister_args) + return lister.run().dict() + + +@shared_task(name=__name__ + ".BioconductorIncrementalListerTask") +def list_bioconductor_incremental(**lister_args) -> Dict[str, int]: + """Incremental listing of Bioconductor packages""" + lister = BioconductorLister.from_configfile(**lister_args, incremental=True) + return lister.run().dict() + + +@shared_task(name=__name__ + ".ping") +def _ping() -> str: + return "OK" diff --git a/swh/lister/bioconductor/tests/__init__.py b/swh/lister/bioconductor/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/swh/lister/bioconductor/tests/data/https_bioconductor.org/1.17-PACKAGES b/swh/lister/bioconductor/tests/data/https_bioconductor.org/1.17-PACKAGES new file mode 100644 index 0000000000000000000000000000000000000000..b0369dd778bd5265295a75ab1ceebabc8efa71b5 --- /dev/null +++ b/swh/lister/bioconductor/tests/data/https_bioconductor.org/1.17-PACKAGES @@ -0,0 +1,13 @@ +Package: affylmGUI +Version: 1.4.0 +Depends: limma, tcltk, affy +Suggests: tkrplot, affyPLM, R2HTML, xtable + +Package: affypdnn +Version: 1.4.0 +Depends: R (>= 1.9.0), affy (>= 1.5), affydata, hgu95av2probe + +Package: affyPLM +Version: 
1.6.0 +Depends: R (>= 2.0.0), affy (>= 1.5.0), affydata, Biobase, methods, + gcrma diff --git a/swh/lister/bioconductor/tests/data/https_bioconductor.org/2.2-PACKAGES b/swh/lister/bioconductor/tests/data/https_bioconductor.org/2.2-PACKAGES new file mode 100644 index 0000000000000000000000000000000000000000..01d9c32151c3608f1ce1008c1ea02805e0bfb02a --- /dev/null +++ b/swh/lister/bioconductor/tests/data/https_bioconductor.org/2.2-PACKAGES @@ -0,0 +1,12 @@ +Package: ABarray +Version: 1.8.0 +Depends: Biobase, multtest, tcltk +Suggests: limma, LPE + +Package: AnnotationDbi +Version: 1.2.2 +Depends: R (>= 2.7.0), methods, utils, Biobase (>= 1.17.0), DBI (>= + 0.2-4), RSQLite (>= 0.6-4) +Imports: methods, utils, Biobase, DBI, RSQLite +Suggests: hgu95av2.db, hgu95av2, GO.db, GO, human.db0, mouse.db0, + rat.db0, fly.db0, yeast.db0, arabidopsis.db0 diff --git a/swh/lister/bioconductor/tests/data/https_bioconductor.org/3.17-bioc-packages.json b/swh/lister/bioconductor/tests/data/https_bioconductor.org/3.17-bioc-packages.json new file mode 100644 index 0000000000000000000000000000000000000000..3575b906a47e58b3de63e4364efcf8bcaeea4b97 --- /dev/null +++ b/swh/lister/bioconductor/tests/data/https_bioconductor.org/3.17-bioc-packages.json @@ -0,0 +1,166 @@ +{ + "annotation": { + "Package": "annotation", + "Version": "1.24.1", + "Depends": [ + "R (>= 3.3.0)", + "VariantAnnotation", + "AnnotationHub", + "Organism.dplyr", + "TxDb.Hsapiens.UCSC.hg19.knownGene", + "TxDb.Hsapiens.UCSC.hg38.knownGene", + "TxDb.Mmusculus.UCSC.mm10.ensGene", + "org.Hs.eg.db", + "org.Mm.eg.db", + "Homo.sapiens", + "BSgenome.Hsapiens.UCSC.hg19", + "biomaRt", + "BSgenome", + "TxDb.Athaliana.BioMart.plantsmart22" + ], + "Suggests": [ + "knitr", + "rmarkdown", + "BiocStyle" + ], + "License": "Artistic-2.0", + "MD5sum": "4cb4db8807acb2e164985636091faa93", + "NeedsCompilation": "no", + "Title": "Genomic Annotation Resources", + "Description": "Annotation resources make up a significant proportion of the 
Bioconductor project. And there are also a diverse set of online resources available which are accessed using specific packages. This walkthrough will describe the most popular of these resources and give some high level examples on how to use them.", + "biocViews": [ + "AnnotationWorkflow", + "Workflow" + ], + "Author": "Marc RJ Carlson [aut], Herve Pages [aut], Sonali Arora [aut], Valerie Obenchain [aut], Martin Morgan [aut], Bioconductor Package Maintainer [cre]", + "Maintainer": "Bioconductor Package Maintainer <maintainer@bioconductor.org>", + "URL": "http://bioconductor.org/help/workflows/annotation/Annotation_Resources/", + "VignetteBuilder": "knitr", + "git_url": "https://git.bioconductor.org/packages/annotation", + "git_branch": "RELEASE_3_17", + "git_last_commit": "4568557", + "git_last_commit_date": "2023-06-28", + "Date/Publication": "2023-06-30", + "source.ver": "src/contrib/annotation_1.24.1.tar.gz", + "vignettes": [ + "vignettes/annotation/inst/doc/Annotating_Genomic_Ranges.html", + "vignettes/annotation/inst/doc/Annotation_Resources.html" + ], + "vignetteTitles": [ + "Annotating Genomic Ranges", + "Genomic Annotation Resources" + ], + "hasREADME": false, + "hasNEWS": false, + "hasINSTALL": false, + "hasLICENSE": false, + "Rfiles": [ + "vignettes/annotation/inst/doc/Annotating_Genomic_Ranges.R", + "vignettes/annotation/inst/doc/Annotation_Resources.R" + ], + "dependencyCount": "143", + "Rank": 23 + }, + "variants": { + "Package": "variants", + "Version": "1.24.0", + "Depends": [ + "R (>= 3.3.0)", + "VariantAnnotation", + "org.Hs.eg.db", + "TxDb.Hsapiens.UCSC.hg19.knownGene", + "BSgenome.Hsapiens.UCSC.hg19", + "PolyPhen.Hsapiens.dbSNP131" + ], + "Suggests": [ + "knitr", + "rmarkdown", + "BiocStyle" + ], + "License": "Artistic-2.0", + "MD5sum": "38f2c00b73e1a695f5ef4c9b4a728923", + "NeedsCompilation": "no", + "Title": "Annotating Genomic Variants", + "Description": "Read and write VCF files. 
Identify structural location of variants and compute amino acid coding changes for non-synonymous variants. Use SIFT and PolyPhen database packages to predict consequence of amino acid coding changes.", + "biocViews": [ + "AnnotationWorkflow", + "ImmunoOncologyWorkflow", + "Workflow" + ], + "Author": "Valerie Obenchain [aut], Martin Morgan [ctb], Bioconductor Package Maintainer [cre]", + "Maintainer": "Bioconductor Package Maintainer <maintainer@bioconductor.org>", + "URL": "https://bioconductor.org/help/workflows/variants/", + "VignetteBuilder": "knitr", + "git_url": "https://git.bioconductor.org/packages/variants", + "git_branch": "RELEASE_3_17", + "git_last_commit": "d311e59", + "git_last_commit_date": "2023-04-25", + "Date/Publication": "2023-04-28", + "source.ver": "src/contrib/variants_1.24.0.tar.gz", + "vignettes": [ + "vignettes/variants/inst/doc/Annotating_Genomic_Variants.html" + ], + "vignetteTitles": [ + "Annotating Genomic Variants" + ], + "hasREADME": false, + "hasNEWS": false, + "hasINSTALL": false, + "hasLICENSE": false, + "Rfiles": [ + "vignettes/variants/inst/doc/Annotating_Genomic_Variants.R" + ], + "dependencyCount": "103", + "Rank": 16 + }, + "maEndToEnd": { + "Package": "maEndToEnd", + "Version": "2.20.0", + "Depends": [ + "R (>= 3.5.0)", + "Biobase", + "oligoClasses", + "ArrayExpress", + "pd.hugene.1.0.st.v1", + "hugene10sttranscriptcluster.db", + "oligo", + "arrayQualityMetrics", + "limma", + "topGO", + "ReactomePA", + "clusterProfiler", + "gplots", + "ggplot2", + "geneplotter", + "pheatmap", + "RColorBrewer", + "dplyr", + "tidyr", + "stringr", + "matrixStats", + "genefilter", + "openxlsx", + "Rgraphviz", + "enrichplot" + ], + "Suggests": [ + "BiocStyle", + "knitr", + "devtools", + "rmarkdown" + ], + "License": "MIT + file LICENSE", + "NeedsCompilation": "no", + "Title": "An end to end workflow for differential gene expression using Affymetrix microarrays", + "Description": "In this article, we walk through an end-to-end Affymetrix 
microarray differential expression workflow using Bioconductor packages. This workflow is directly applicable to current \"Gene\" type arrays, e.g. the HuGene or MoGene arrays, but can easily be adapted to similar platforms. The data analyzed here is a typical clinical microarray data set that compares inflamed and non-inflamed colon tissue in two disease subtypes. For each disease, the differential gene expression between inflamed- and non-inflamed colon tissue was analyzed. We will start from the raw data CEL files, show how to import them into a Bioconductor ExpressionSet, perform quality control and normalization and finally differential gene expression (DE) analysis, followed by some enrichment analysis.", + "biocViews": [ + "GeneExpressionWorkflow", + "Workflow" + ], + "Author": "Bernd Klaus [aut], Stefanie Reisenauer [aut, cre]", + "Maintainer": "Stefanie Reisenauer <steffi.reisenauer@tum.de>", + "URL": "https://www.bioconductor.org/help/workflows/", + "VignetteBuilder": "knitr", + "Rank": 21 + } +} diff --git a/swh/lister/bioconductor/tests/data/https_bioconductor.org/3.17-workflows-packages.json b/swh/lister/bioconductor/tests/data/https_bioconductor.org/3.17-workflows-packages.json new file mode 100644 index 0000000000000000000000000000000000000000..08a7e82dc96c9cbb95dec0ae8482c9c696840bf7 --- /dev/null +++ b/swh/lister/bioconductor/tests/data/https_bioconductor.org/3.17-workflows-packages.json @@ -0,0 +1,162 @@ +{ + "annotation": { + "Package": "annotation", + "Version": "1.24.1", + "Depends": [ + "R (>= 3.3.0)", + "VariantAnnotation", + "AnnotationHub", + "Organism.dplyr", + "TxDb.Hsapiens.UCSC.hg19.knownGene", + "TxDb.Hsapiens.UCSC.hg38.knownGene", + "TxDb.Mmusculus.UCSC.mm10.ensGene", + "org.Hs.eg.db", + "org.Mm.eg.db", + "Homo.sapiens", + "BSgenome.Hsapiens.UCSC.hg19", + "biomaRt", + "BSgenome", + "TxDb.Athaliana.BioMart.plantsmart22" + ], + "Suggests": [ + "knitr", + "rmarkdown", + "BiocStyle" + ], + "License": "Artistic-2.0", + "MD5sum": 
"4cb4db8807acb2e164985636091faa93", + "NeedsCompilation": "no", + "Title": "Genomic Annotation Resources", + "Description": "Annotation resources make up a significant proportion of the Bioconductor project. And there are also a diverse set of online resources available which are accessed using specific packages. This walkthrough will describe the most popular of these resources and give some high level examples on how to use them.", + "biocViews": [ + "AnnotationWorkflow", + "Workflow" + ], + "Author": "Marc RJ Carlson [aut], Herve Pages [aut], Sonali Arora [aut], Valerie Obenchain [aut], Martin Morgan [aut], Bioconductor Package Maintainer [cre]", + "Maintainer": "Bioconductor Package Maintainer <maintainer@bioconductor.org>", + "URL": "http://bioconductor.org/help/workflows/annotation/Annotation_Resources/", + "VignetteBuilder": "knitr", + "git_url": "https://git.bioconductor.org/packages/annotation", + "git_branch": "RELEASE_3_17", + "git_last_commit": "4568557", + "git_last_commit_date": "2023-06-28", + "Date/Publication": "2023-06-30", + "source.ver": "src/contrib/annotation_1.24.1.tar.gz", + "vignettes": [ + "vignettes/annotation/inst/doc/Annotating_Genomic_Ranges.html", + "vignettes/annotation/inst/doc/Annotation_Resources.html" + ], + "vignetteTitles": [ + "Annotating Genomic Ranges", + "Genomic Annotation Resources" + ], + "hasREADME": false, + "hasNEWS": false, + "hasINSTALL": false, + "hasLICENSE": false, + "Rfiles": [ + "vignettes/annotation/inst/doc/Annotating_Genomic_Ranges.R", + "vignettes/annotation/inst/doc/Annotation_Resources.R" + ], + "dependencyCount": "144", + "Rank": 8 + }, + "arrays": { + "Package": "arrays", + "Version": "1.26.0", + "Depends": [ + "R (>= 3.0.0)" + ], + "Suggests": [ + "affy", + "limma", + "hgfocuscdf", + "knitr", + "rmarkdown", + "BiocStyle" + ], + "License": "Artistic-2.0", + "MD5sum": "009ef917ebc047246b8c62c48e02a237", + "NeedsCompilation": "no", + "Title": "Using Bioconductor for Microarray Analysis", + "Description": 
"Using Bioconductor for Microarray Analysis workflow", + "biocViews": [ + "BasicWorkflow", + "Workflow" + ], + "Author": "Bioconductor Package Maintainer [aut, cre]", + "Maintainer": "Bioconductor Package Maintainer <maintainer@bioconductor.org>", + "VignetteBuilder": "knitr", + "git_url": "https://git.bioconductor.org/packages/arrays", + "git_branch": "RELEASE_3_17", + "git_last_commit": "9981a8c", + "git_last_commit_date": "2023-04-25", + "Date/Publication": "2023-04-28", + "source.ver": "src/contrib/arrays_1.26.0.tar.gz", + "vignettes": [ + "vignettes/arrays/inst/doc/arrays.html" + ], + "vignetteTitles": [ + "Using Bioconductor for Microarray Analysis" + ], + "hasREADME": false, + "hasNEWS": false, + "hasINSTALL": false, + "hasLICENSE": false, + "Rfiles": [ + "vignettes/arrays/inst/doc/arrays.R" + ], + "dependencyCount": "0", + "Rank": 13 + }, + "maEndToEnd": { + "Package": "maEndToEnd", + "Version": "2.20.0", + "Depends": [ + "R (>= 3.5.0)", + "Biobase", + "oligoClasses", + "ArrayExpress", + "pd.hugene.1.0.st.v1", + "hugene10sttranscriptcluster.db", + "oligo", + "arrayQualityMetrics", + "limma", + "topGO", + "ReactomePA", + "clusterProfiler", + "gplots", + "ggplot2", + "geneplotter", + "pheatmap", + "RColorBrewer", + "dplyr", + "tidyr", + "stringr", + "matrixStats", + "genefilter", + "openxlsx", + "Rgraphviz", + "enrichplot" + ], + "Suggests": [ + "BiocStyle", + "knitr", + "devtools", + "rmarkdown" + ], + "License": "MIT + file LICENSE", + "NeedsCompilation": "no", + "Title": "An end to end workflow for differential gene expression using Affymetrix microarrays", + "Description": "In this article, we walk through an end-to-end Affymetrix microarray differential expression workflow using Bioconductor packages. This workflow is directly applicable to current \"Gene\" type arrays, e.g. the HuGene or MoGene arrays, but can easily be adapted to similar platforms. 
The data analyzed here is a typical clinical microarray data set that compares inflamed and non-inflamed colon tissue in two disease subtypes. For each disease, the differential gene expression between inflamed- and non-inflamed colon tissue was analyzed. We will start from the raw data CEL files, show how to import them into a Bioconductor ExpressionSet, perform quality control and normalization and finally differential gene expression (DE) analysis, followed by some enrichment analysis.", + "biocViews": [ + "GeneExpressionWorkflow", + "Workflow" + ], + "Author": "Bernd Klaus [aut], Stefanie Reisenauer [aut, cre]", + "Maintainer": "Stefanie Reisenauer <steffi.reisenauer@tum.de>", + "URL": "https://www.bioconductor.org/help/workflows/", + "VignetteBuilder": "knitr", + "Rank": 10 + } +} diff --git a/swh/lister/bioconductor/tests/data/https_bioconductor.org/about/release-announcements b/swh/lister/bioconductor/tests/data/https_bioconductor.org/about/release-announcements new file mode 100644 index 0000000000000000000000000000000000000000..7c2e0d3bdea5a088d9dc30d0d3016dd13fee51a1 --- /dev/null +++ b/swh/lister/bioconductor/tests/data/https_bioconductor.org/about/release-announcements @@ -0,0 +1,553 @@ +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> +<html xmlns="http://www.w3.org/1999/xhtml" dir="ltr" lang="en-US" +prefix="og: http://ogp.me/ns#"> + <head> + <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> + <meta http-equiv="Content-Language" content="en-us" /> + <meta name="robots" content="all" /> + + + +<!-- Google tag (gtag.js) --> +<script async src="https://www.googletagmanager.com/gtag/js?id=G-WJMEEH1J58"></script> +<script> + window.dataLayer = window.dataLayer || []; + function gtag(){dataLayer.push(arguments);} + gtag('js', new Date()); + + gtag('config', 'G-WJMEEH1J58'); +</script> + + + + + <script type="text/javascript" src="/js/jquery.js"></script> + + + 
<script src="/js/jquery.tools.min.js"></script> + + + + <title>Bioconductor - Release Announcements</title> + <link rel="stylesheet" type="text/css" href="/style/bioconductor.css" media="screen" /> + <link rel="SHORTCUT ICON" type="image/x-icon" href="/favicon.ico" /> + <link rel="ICON" type="image/x-icon" href="/favicon.ico" /> + <script type="text/javascript" src="/js/bioconductor.js"></script> + + <script type="text/javascript" src="/js/jquery.corner.js"></script> + <script type="text/javascript" src="/js/jquery.timeago.js"></script> + <script type="text/javascript" src="/js/bioc-style.js"></script> + <script type="text/javascript" src="/js/versions.js"></script> + +</head> + + <body> + <a name="top"></a> + +<!-- a few hooks for screen readers --> +<a href="#site-navigation" title="Jump to site nav"></a> + +<a href="#section-navigation" title="Jump to section nav"></a> + +<a href="#site-map" title="Jump to site map"></a> + + + <div id="SiteContainer" class="SiteContainer"> + + <div id="PageContent" class="PageContent WithRightRail"> + + + + + <div id="PageBreadcrumbs" class="PageBreadcrumbs"> + <ul> + + + + + + <li><a href="/">Home</a></li> + + + + + + <li><a href="/about/">About</a></li> + + + + + <li>Release Announcements</li> + + + </ul> +</div> + + + + + + + + <h1 id="bioconductor-releases">Bioconductor releases</h1> + +<p>Each <em>Bioconductor</em> release is designed to work with a specific +version of <em>R</em>. The following table summarizes the relationship, and +links to packages designed to work with the corresponding <em>R</em> / +<em>Bioconductor</em> version.</p> + +<p><em>Bioconductor</em> versions are linked to their release announcement (when +available). 
Release announcements summarize new package additions, +updates to existing packages, and package removals.</p> + +<table> + <thead> + <tr> + <th style="text-align: left">Release</th> + <th style="text-align: left">Date</th> + <th style="text-align: right">Software packages</th> + <th style="text-align: left">R</th> + </tr> + </thead> + <tbody> + <tr> + <td style="text-align: left"><a href="/news/bioc_3_17_release">3.17</a></td> + <td style="text-align: left">April 26, 2023</td> + <td style="text-align: right"><a href="/packages/3.17/">2230</a></td> + <td style="text-align: left">4.3</td> + </tr> + <tr> + <td style="text-align: left"><a href="/news/bioc_3_16_release">3.16</a></td> + <td style="text-align: left">November 2, 2022</td> + <td style="text-align: right"><a href="/packages/3.16/">2183</a></td> + <td style="text-align: left">4.2</td> + </tr> + <tr> + <td style="text-align: left"><a href="/news/bioc_3_15_release">3.15</a></td> + <td style="text-align: left">April 27, 2022</td> + <td style="text-align: right"><a href="/packages/3.15/">2140</a></td> + <td style="text-align: left">4.2</td> + </tr> + <tr> + <td style="text-align: left"><a href="/news/bioc_3_14_release">3.14</a></td> + <td style="text-align: left">October 27, 2021</td> + <td style="text-align: right"><a href="/packages/3.14/">2083</a></td> + <td style="text-align: left">4.1</td> + </tr> + <tr> + <td style="text-align: left"><a href="/news/bioc_3_13_release">3.13</a></td> + <td style="text-align: left">May 20, 2021</td> + <td style="text-align: right"><a href="/packages/3.13/">2042</a></td> + <td style="text-align: left">4.1</td> + </tr> + <tr> + <td style="text-align: left"><a href="/news/bioc_3_12_release">3.12</a></td> + <td style="text-align: left">October 28, 2020</td> + <td style="text-align: right"><a href="/packages/3.12/">1974</a></td> + <td style="text-align: left">4.0</td> + </tr> + <tr> + <td style="text-align: left"><a href="/news/bioc_3_11_release">3.11</a></td> + <td 
style="text-align: left">April 28, 2020</td> + <td style="text-align: right"><a href="/packages/3.11/">1903</a></td> + <td style="text-align: left">4.0</td> + </tr> + <tr> + <td style="text-align: left"><a href="/news/bioc_3_10_release">3.10</a></td> + <td style="text-align: left">October 30, 2019</td> + <td style="text-align: right"><a href="/packages/3.10/">1823</a></td> + <td style="text-align: left">3.6</td> + </tr> + <tr> + <td style="text-align: left"><a href="/news/bioc_3_9_release">3.9</a></td> + <td style="text-align: left">May 3, 2019</td> + <td style="text-align: right"><a href="/packages/3.9/">1741</a></td> + <td style="text-align: left">3.6</td> + </tr> + <tr> + <td style="text-align: left"><a href="/news/bioc_3_8_release">3.8</a></td> + <td style="text-align: left">October 31, 2018</td> + <td style="text-align: right"><a href="/packages/3.8/">1649</a></td> + <td style="text-align: left">3.5</td> + </tr> + <tr> + <td style="text-align: left"><a href="/news/bioc_3_7_release">3.7</a></td> + <td style="text-align: left">May 1, 2018</td> + <td style="text-align: right"><a href="/packages/3.7/">1560</a></td> + <td style="text-align: left">3.5</td> + </tr> + <tr> + <td style="text-align: left"><a href="/news/bioc_3_6_release">3.6</a></td> + <td style="text-align: left">October 31, 2017</td> + <td style="text-align: right"><a href="/packages/3.6/">1473</a></td> + <td style="text-align: left">3.4</td> + </tr> + <tr> + <td style="text-align: left"><a href="/news/bioc_3_5_release">3.5</a></td> + <td style="text-align: left">April 25, 2017</td> + <td style="text-align: right"><a href="/packages/3.5/">1383</a></td> + <td style="text-align: left">3.4</td> + </tr> + <tr> + <td style="text-align: left"><a href="/news/bioc_3_4_release">3.4</a></td> + <td style="text-align: left">October 18, 2016</td> + <td style="text-align: right"><a href="/packages/3.4/">1296</a></td> + <td style="text-align: left">3.3</td> + </tr> + <tr> + <td style="text-align: left"><a 
href="/news/bioc_3_3_release">3.3</a></td> + <td style="text-align: left">May 4, 2016</td> + <td style="text-align: right"><a href="/packages/3.3/">1211</a></td> + <td style="text-align: left">3.3</td> + </tr> + <tr> + <td style="text-align: left"><a href="/news/bioc_3_2_release">3.2</a></td> + <td style="text-align: left">October 14, 2015</td> + <td style="text-align: right"><a href="/packages/3.2/">1104</a></td> + <td style="text-align: left">3.2</td> + </tr> + <tr> + <td style="text-align: left"><a href="/news/bioc_3_1_release">3.1</a></td> + <td style="text-align: left">April 17, 2015</td> + <td style="text-align: right"><a href="/packages/3.1/">1024</a></td> + <td style="text-align: left">3.2</td> + </tr> + <tr> + <td style="text-align: left"><a href="/news/bioc_3_0_release">3.0</a></td> + <td style="text-align: left">October 14, 2014</td> + <td style="text-align: right"><a href="/packages/3.0/">934</a></td> + <td style="text-align: left">3.1</td> + </tr> + <tr> + <td style="text-align: left"><a href="/news/bioc_2_14_release">2.14</a></td> + <td style="text-align: left">April 14, 2014</td> + <td style="text-align: right"><a href="/packages/2.14/">824</a></td> + <td style="text-align: left">3.1</td> + </tr> + <tr> + <td style="text-align: left"><a href="/news/bioc_2_13_release">2.13</a></td> + <td style="text-align: left">October 15, 2013</td> + <td style="text-align: right"><a href="/packages/2.13/">749</a></td> + <td style="text-align: left">3.0</td> + </tr> + <tr> + <td style="text-align: left"><a href="/news/bioc_2_12_release">2.12</a></td> + <td style="text-align: left">April 4, 2013</td> + <td style="text-align: right"><a href="/packages/2.12/">671</a></td> + <td style="text-align: left">3.0</td> + </tr> + <tr> + <td style="text-align: left"><a href="/news/bioc_2_11_release">2.11</a></td> + <td style="text-align: left">October 3, 2012</td> + <td style="text-align: right"><a href="/packages/2.11/">610</a></td> + <td style="text-align: left">2.15</td> + 
</tr> + <tr> + <td style="text-align: left"><a href="/news/bioc_2_10_release">2.10</a></td> + <td style="text-align: left">April 2, 2012</td> + <td style="text-align: right"><a href="/packages/2.10/">554</a></td> + <td style="text-align: left">2.15</td> + </tr> + <tr> + <td style="text-align: left"><a href="/news/bioc_2_9_release">2.9</a></td> + <td style="text-align: left">November 1, 2011</td> + <td style="text-align: right"><a href="/packages/2.9/">517</a></td> + <td style="text-align: left">2.14</td> + </tr> + <tr> + <td style="text-align: left"><a href="/news/bioc_2_8_release">2.8</a></td> + <td style="text-align: left">April 14, 2011</td> + <td style="text-align: right"><a href="/packages/2.8/">466</a></td> + <td style="text-align: left">2.13</td> + </tr> + <tr> + <td style="text-align: left"><a href="/news/bioc_2_7_release">2.7</a></td> + <td style="text-align: left">October 18, 2010</td> + <td style="text-align: right"><a href="/packages/2.7/">418</a></td> + <td style="text-align: left">2.12</td> + </tr> + <tr> + <td style="text-align: left"><a href="/news/bioc_2_6_release">2.6</a></td> + <td style="text-align: left">April 23, 2010</td> + <td style="text-align: right"><a href="/packages/2.6/">389</a></td> + <td style="text-align: left">2.11</td> + </tr> + <tr> + <td style="text-align: left"><a href="/news/bioc_2_5_release">2.5</a></td> + <td style="text-align: left">October 28, 2009</td> + <td style="text-align: right"><a href="/packages/2.5/">352</a></td> + <td style="text-align: left">2.10</td> + </tr> + <tr> + <td style="text-align: left">2.4</td> + <td style="text-align: left">April 21, 2009</td> + <td style="text-align: right"><a href="/packages/2.4/BiocViews.html">320</a></td> + <td style="text-align: left">2.9</td> + </tr> + <tr> + <td style="text-align: left">2.3</td> + <td style="text-align: left">October 22, 2008</td> + <td style="text-align: right"><a href="/packages/2.3/BiocViews.html">294</a></td> + <td style="text-align: left">2.8</td> + </tr> 
+ <tr> + <td style="text-align: left">2.2</td> + <td style="text-align: left">May 1, 2008</td> + <td style="text-align: right"><a href="/packages/2.2/BiocViews.html">260</a></td> + <td style="text-align: left">2.7</td> + </tr> + <tr> + <td style="text-align: left">2.1</td> + <td style="text-align: left">October 8, 2007</td> + <td style="text-align: right"><a href="/packages/2.1/BiocViews.html">233</a></td> + <td style="text-align: left">2.6</td> + </tr> + <tr> + <td style="text-align: left">2.0</td> + <td style="text-align: left">April 26, 2007</td> + <td style="text-align: right"><a href="/packages/2.0/BiocViews.html">214</a></td> + <td style="text-align: left">2.5</td> + </tr> + <tr> + <td style="text-align: left">1.9</td> + <td style="text-align: left">October 4, 2006</td> + <td style="text-align: right"><a href="/packages/1.9/BiocViews.html">188</a></td> + <td style="text-align: left">2.4</td> + </tr> + <tr> + <td style="text-align: left">1.8</td> + <td style="text-align: left">April 27, 2006</td> + <td style="text-align: right"><a href="/packages/1.8/BiocViews.html">172</a></td> + <td style="text-align: left">2.3</td> + </tr> + <tr> + <td style="text-align: left">1.7</td> + <td style="text-align: left">October 14, 2005</td> + <td style="text-align: right"><a href="/packages/bioc/1.7/src/contrib/html/">141</a></td> + <td style="text-align: left">2.2</td> + </tr> + <tr> + <td style="text-align: left">1.6</td> + <td style="text-align: left">May 18, 2005</td> + <td style="text-align: right"><a href="/packages/bioc/1.6/src/contrib/html/">123</a></td> + <td style="text-align: left">2.1</td> + </tr> + <tr> + <td style="text-align: left">1.5</td> + <td style="text-align: left">October 25, 2004</td> + <td style="text-align: right"><a href="/packages/bioc/1.5/src/contrib/html/">100</a></td> + <td style="text-align: left">2.0</td> + </tr> + <tr> + <td style="text-align: left">1.4</td> + <td style="text-align: left">May 17, 2004</td> + <td style="text-align: 
right">81</td> + <td style="text-align: left">1.9</td> + </tr> + <tr> + <td style="text-align: left">1.3</td> + <td style="text-align: left">October 30, 2003</td> + <td style="text-align: right">49</td> + <td style="text-align: left">1.8</td> + </tr> + <tr> + <td style="text-align: left">1.2</td> + <td style="text-align: left">May 29, 2003</td> + <td style="text-align: right">30</td> + <td style="text-align: left">1.7</td> + </tr> + <tr> + <td style="text-align: left">1.1</td> + <td style="text-align: left">November 19, 2002</td> + <td style="text-align: right">20</td> + <td style="text-align: left">1.6</td> + </tr> + <tr> + <td style="text-align: left">1.0</td> + <td style="text-align: left">May 1, 2002</td> + <td style="text-align: right">15</td> + <td style="text-align: left">1.5</td> + </tr> + </tbody> +</table> + + + + </div> + <div id="RightRail" class="RightRail"> + <a name="section-navigation"></a> <!-- accessibility anchor --> + + + <ul class="section_nav"> + <li><a href="/about/community-advisory-board/">Advisory Board -- Community</a></li> + </ul> + + <ul class="section_nav"> + <li><a href="/about/scientific-advisory-board/">Advisory Board -- Scientific</a></li> + </ul> + + <ul class="section_nav"> + <li><a href="/about/technical-advisory-board/">Advisory Board -- Technical</a></li> + </ul> + + <ul class="section_nav"> + <li><a href="/about/annual-reports/">Annual Reports</a></li> + </ul> + + <ul class="section_nav"> + <li><a href="/about/awards/">BiocAwards</a></li> + </ul> + + <ul class="section_nav"> + <li><a href="/about/code-of-conduct/">Code of Conduct Policy</a></li> + </ul> + + <ul class="section_nav"> + <li><a href="/about/core-team/">Core Team</a></li> + </ul> + + <ul class="section_nav"> + <li><a href="/about/european-bioconductor-society/">European Bioconductor Society</a></li> + </ul> + + <ul class="section_nav"> + <li><a href="/about/logo/">Logos</a></li> + </ul> + + <ul class="section_nav"> + <li><a href="/about/mirrors/">Mirrors</a></li> 
+ </ul> + + <ul class="section_nav"> + <li><a href="/about/package-reviewers/">Package Reviewers</a></li> + </ul> + + <ul class="section_nav"> + <li><a href="/about/related-projects/">Related Projects</a></li> + </ul> + + <ul class="section_nav"> + <li><a href="/about/release-announcements/">Release Announcements</a></li> + </ul> + + <ul class="section_nav"> + <li><a href="/about/removed-packages/">Removed Packages</a></li> + </ul> + + + + + + + </div> + + <div id="SiteGlobalFooter" class="SiteGlobalFooter"> + + <span class="global_footer_logo_bc noprint"><img src="/images/logo_global_footer_bioconductor.gif" height="51" width="154" alt=""/></span> + + <div class="attribution_copyright"> + <p>Contact us: <a href="https://support.bioconductor.org/">support.bioconductor.org</a></p> + <p>Copyright © 2003 - 2023, Bioconductor</p> + </div> + + <a name="site-map"></a> <!-- accessibility anchor --> + <ul class="footer_nav_list noprint"> + <li class="footer_nav_list_element footer_nav_list_element_0"> + <b><a href="/index.html">Home</a></b> + </li> + <li class="footer_nav_list_element footer_nav_list_element_1"> + <ul> + <li><b><a href="/install/index.html">Install</a></b></li> + <li><a href="/install/index.html#install-R">Install R</a></li> + <li><a href="/install/index.html#find-bioconductor-packages">Find Bioconductor Packages</a></li> + <li><a href="/install/index.html#install-bioconductor-packages">Install Bioconductor Packages</a></li> + <li><a href="/install/index.html#update-bioconductor-packages">Update Bioconductor Packages</a></li> + </ul> + </li> + <li class="footer_nav_list_element footer_nav_list_element_2"> + <ul> + <li><b><a href="/help/index.html">Help</a></b></li> + <li><a href="/packages/release/workflows/">Workflows</a></li> + <li><a href="/help/package-vignettes/">Package Vignettes</a></li> + <li><a href="/help/faq/">FAQ</a></li> + <li><a href="/help/support/">Support</a></li> + <li><a href="http://cran.r-project.org/">Using R</a></li> + <li><a 
href="/help/course-materials/">Courses</a></li> + <li><a href="/help/publications/">Publications</a></li> + <li><a href="/help/docker/">Docker Images</a></li> + <li><a href="https://anvil.bioconductor.org/">Bioc in AnVIL</a></li> + <li><a href="/help/community/">Community Resources</a></li> + + </ul> + </li> + <li class="footer_nav_list_element footer_nav_list_element_3"> + <ul> + <li><b><a href="/developers/index.html">Developers</a></b></li> + <li><a href="https://contributions.bioconductor.org/develop-overview.html">Package Guidelines</a></li> + <li><a href="https://contributions.bioconductor.org/submission-overview.html">Package Submission</a></li> + <li><a href="/developers/release-schedule/">Release Schedule</a></li> + <li><a href="https://contributions.bioconductor.org/git-version-control.html">Source Control</a></li> + </ul> + </li> + <li class="footer_nav_list_element footer_nav_list_element_4"> + <ul> + <li><b><a href="/about/index.html">About</a></b></li> + <li><a href="/about/annual-reports/">Annual Reports</a></li> + <li><a href="/about/core-team/">Core Team</a></li> + <li><a href="/about/mirrors/">Mirrors</a></li> + <li><a href="/about/related-projects/">Related Projects</a></li> + <li><a href="/about/code-of-conduct/">Code of Conduct</a></li> + </ul> + </li> + </ul> + <br style="clear:both"/> + + +</div> + + +<div id="SiteMasthead" class="SiteMasthead"> + + <a name="site-navigation"></a> <!-- accessibility anchor --> + + <span class="logo_vanity_bar noprint"></span> + + <a href="/"> + <img src="/images/logo_bioconductor.gif" border="0" class="masthead_logo" height="78" width="260" alt="Bioconductor - open source software for bioinformatics"/> + </a> + + <div id="SiteMastheadRight" class="SiteMastheadRight"> + <div id="SiteMastheadRightBackground" class="SiteMastheadRightBackground"> + + <a name="site-search"></a> <!-- accessibility anchor --> + <form class="site_search" id="search_form" method="GET" + action="/help/search/index.html"> + Search: 
<input id="q" name="q" /> + </form> + + <ul class="masthead_nav noprint"> + <li class="masthead_nav_element" id="masthead_nav_element_1"> + <a href="/">Home</a> + </li> + <li class="masthead_nav_element" id="masthead_nav_element_2"> + <a href="/install/">Install</a> + </li> + <li class="masthead_nav_element" id="masthead_nav_element_3"> + <a href="/help/">Help</a> + </li> + <li class="masthead_nav_element" id="masthead_nav_element_4"> + <a href="/developers/">Developers</a> + </li> + <li class="masthead_nav_element" id="masthead_nav_element_5"> + <a href="/about/">About</a> + </li> + </ul> + + </div> + </div> + +</div> + + </div> + </body> +</html> diff --git a/swh/lister/bioconductor/tests/test_lister.py b/swh/lister/bioconductor/tests/test_lister.py new file mode 100644 index 0000000000000000000000000000000000000000..6c0e570a5f3cc10bc2e0e1beb9d89b940fce79d9 --- /dev/null +++ b/swh/lister/bioconductor/tests/test_lister.py @@ -0,0 +1,501 @@ +# Copyright (C) 2022-2023 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from pathlib import Path +from unittest.mock import Mock + +import pytest +from requests_mock.mocker import Mocker as RequestsMocker + +from swh.lister.bioconductor.lister import BioconductorLister +from swh.scheduler.interface import SchedulerInterface + +BIOCONDUCTOR_URL = "https://www.bioconductor.org" + + +@pytest.fixture +def packages_json1(datadir): + text = Path( + datadir, "https_bioconductor.org", "3.17-bioc-packages.json" + ).read_text() + return text, {} + + +@pytest.fixture +def packages_json2(datadir): + text = Path( + datadir, "https_bioconductor.org", "3.17-workflows-packages.json" + ).read_text() + return text, {} + + +@pytest.fixture +def packages_txt1(datadir): + text = Path(datadir, "https_bioconductor.org", "1.17-PACKAGES").read_text() + return text, {} + + 
+@pytest.fixture +def packages_txt2(datadir): + text = Path(datadir, "https_bioconductor.org", "2.2-PACKAGES").read_text() + return text, {} + + +@pytest.fixture(autouse=True) +def mock_release_announcements(datadir, requests_mock): + text = Path( + datadir, "https_bioconductor.org", "about", "release-announcements" + ).read_text() + requests_mock.get( + "https://www.bioconductor.org/about/release-announcements", + text=text, + headers={}, + ) + + +def test_bioconductor_incremental_listing( + swh_scheduler, requests_mock, mocker, packages_json1, packages_json2 +): + kwargs = dict() + lister = BioconductorLister( + scheduler=swh_scheduler, + releases=["3.17"], + categories=["bioc", "workflows"], + incremental=True, + **kwargs, + ) + assert lister.url == BIOCONDUCTOR_URL + + lister.get_origins_from_page: Mock = mocker.spy(lister, "get_origins_from_page") + + for category, packages_json in [ + ("bioc", packages_json1), + ("workflows", packages_json2), + ]: + text, headers = packages_json + requests_mock.get( + ( + "https://www.bioconductor.org/packages/" + f"json/3.17/{category}/packages.json" + ), + text=text, + headers=headers, + ) + + status = lister.run() + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results + lister_state = lister.get_state_from_scheduler() + + assert status.pages == 3 # 2 categories for 3.17 + 1 None page + # annotation pkg origin is in 2 categories + # and we collect git origins as well + assert status.origins == 6 + + assert lister.get_origins_from_page.call_count == 3 + + assert [o.url for o in scheduler_origins] == [ + "https://git.bioconductor.org/packages/annotation", + "https://git.bioconductor.org/packages/arrays", + "https://git.bioconductor.org/packages/variants", + "https://www.bioconductor.org/packages/annotation", + "https://www.bioconductor.org/packages/arrays", + "https://www.bioconductor.org/packages/variants", + ] + + assert [ + o.extra_loader_arguments["packages"] + for o in scheduler_origins + if 
"packages" in o.extra_loader_arguments + ] == [ + { + "3.17/bioc/1.24.1": { + "package": "annotation", + "release": "3.17", + "tar_url": ( + "https://www.bioconductor.org/packages/3.17/" + "bioc/src/contrib/annotation_1.24.1.tar.gz" + ), + "version": "1.24.1", + "category": "bioc", + "checksums": {"md5": "4cb4db8807acb2e164985636091faa93"}, + "last_update_date": "2023-06-30", + }, + "3.17/workflows/1.24.1": { + "package": "annotation", + "release": "3.17", + "tar_url": ( + "https://www.bioconductor.org/packages/3.17/" + "workflows/src/contrib/annotation_1.24.1.tar.gz" + ), + "version": "1.24.1", + "category": "workflows", + "checksums": {"md5": "4cb4db8807acb2e164985636091faa93"}, + "last_update_date": "2023-06-30", + }, + }, + { + "3.17/workflows/1.26.0": { + "package": "arrays", + "release": "3.17", + "tar_url": ( + "https://www.bioconductor.org/packages/3.17/" + "workflows/src/contrib/arrays_1.26.0.tar.gz" + ), + "version": "1.26.0", + "category": "workflows", + "checksums": {"md5": "009ef917ebc047246b8c62c48e02a237"}, + "last_update_date": "2023-04-28", + } + }, + { + "3.17/bioc/1.24.0": { + "package": "variants", + "release": "3.17", + "tar_url": ( + "https://www.bioconductor.org/packages/3.17/" + "bioc/src/contrib/variants_1.24.0.tar.gz" + ), + "version": "1.24.0", + "category": "bioc", + "checksums": {"md5": "38f2c00b73e1a695f5ef4c9b4a728923"}, + "last_update_date": "2023-04-28", + } + }, + ] + + assert lister_state.package_versions == { + "annotation": {"3.17/workflows/1.24.1", "3.17/bioc/1.24.1"}, + "arrays": {"3.17/workflows/1.26.0"}, + "variants": {"3.17/bioc/1.24.0"}, + } + + +@pytest.mark.parametrize("status_code", [400, 500, 404]) +def test_bioconductor_lister_http_error( + swh_scheduler: SchedulerInterface, + requests_mock: RequestsMocker, + packages_json1, + status_code: int, +): + """ + Simulates handling of HTTP Errors while fetching of packages for bioconductor releases. 
+ """ + releases = ["3.8"] + categories = ["workflows", "bioc"] + + requests_mock.get( + "https://www.bioconductor.org/packages/json/3.8/workflows/packages.json", + status_code=status_code, + text="Something went wrong", + ) + + text, headers = packages_json1 + requests_mock.get( + "https://www.bioconductor.org/packages/json/3.8/bioc/packages.json", + text=text, + headers=headers, + ) + + lister = BioconductorLister( + scheduler=swh_scheduler, + releases=releases, + categories=categories, + incremental=True, + ) + + # On facing HTTP errors, it should continue + # to crawl other releases/categories + stats = lister.run() + # 1 None page + 3.8 bioc page + assert stats.pages == 2 + # Both packages have git and bioconductor urls. + assert stats.origins == 4 + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results + assert [o.url for o in scheduler_origins] == [ + "https://git.bioconductor.org/packages/annotation", + "https://git.bioconductor.org/packages/variants", + "https://www.bioconductor.org/packages/annotation", + "https://www.bioconductor.org/packages/variants", + ] + assert [ + o.extra_loader_arguments["packages"] + for o in scheduler_origins + if "packages" in o.extra_loader_arguments + ] == [ + { + "3.8/bioc/1.24.1": { + "package": "annotation", + "release": "3.8", + "tar_url": ( + "https://www.bioconductor.org/packages/" + "3.8/bioc/src/contrib/annotation_1.24.1.tar.gz" + ), + "version": "1.24.1", + "category": "bioc", + "checksums": {"md5": "4cb4db8807acb2e164985636091faa93"}, + "last_update_date": "2023-06-30", + } + }, + { + "3.8/bioc/1.24.0": { + "package": "variants", + "release": "3.8", + "tar_url": ( + "https://www.bioconductor.org/packages/" + "3.8/bioc/src/contrib/variants_1.24.0.tar.gz" + ), + "version": "1.24.0", + "category": "bioc", + "checksums": {"md5": "38f2c00b73e1a695f5ef4c9b4a728923"}, + "last_update_date": "2023-04-28", + } + }, + ] + + lister_state = lister.get_state_from_scheduler() + assert 
lister_state.package_versions == { + "annotation": {"3.8/bioc/1.24.1"}, + "variants": {"3.8/bioc/1.24.0"}, + } + + +def test_bioconductor_fetch_versions(swh_scheduler: SchedulerInterface): + lister = BioconductorLister(scheduler=swh_scheduler) + assert lister.releases == [ + "1.5", + "1.6", + "1.7", + "1.8", + "1.9", + "2.0", + "2.1", + "2.2", + "2.3", + "2.4", + "2.5", + "2.6", + "2.7", + "2.8", + "2.9", + "2.10", + "2.11", + "2.12", + "2.13", + "2.14", + "3.0", + "3.1", + "3.2", + "3.3", + "3.4", + "3.5", + "3.6", + "3.7", + "3.8", + "3.9", + "3.10", + "3.11", + "3.12", + "3.13", + "3.14", + "3.15", + "3.16", + "3.17", + ] + + +def test_bioconductor_lister_parse_packages_txt( + swh_scheduler: SchedulerInterface, packages_json1, packages_txt1 +): + lister = BioconductorLister( + scheduler=swh_scheduler, releases=["3.8"], categories=["bioc"] + ) + + text, _ = packages_json1 + res = lister.parse_packages(text) + assert { + pkg_name: pkg_metadata["Version"] for pkg_name, pkg_metadata in res.items() + } == {"annotation": "1.24.1", "maEndToEnd": "2.20.0", "variants": "1.24.0"} + + text, _ = packages_txt1 + + res = lister.parse_packages(text) + assert res == { + "affylmGUI": { + "Package": "affylmGUI", + "Version": "1.4.0", + "Depends": "limma, tcltk, affy", + "Suggests": "tkrplot, affyPLM, R2HTML, xtable", + }, + "affypdnn": { + "Package": "affypdnn", + "Version": "1.4.0", + "Depends": "R (>= 1.9.0), affy (>= 1.5), affydata, hgu95av2probe", + }, + "affyPLM": { + "Package": "affyPLM", + "Version": "1.6.0", + "Depends": ( + "R (>= 2.0.0), affy (>= 1.5.0), affydata, " + "Biobase, methods,\n gcrma" + ), + }, + } + + +def test_bioconductor_lister_old_releases( + swh_scheduler, mocker, requests_mock, packages_txt1, packages_txt2 +): + releases = ["1.7"] + categories = ["workflows", "bioc"] + + text, headers = packages_txt1 + requests_mock.get( + ("https://www.bioconductor.org/packages/" "bioc/1.7/src/contrib/PACKAGES"), + text=text, + headers=headers, + ) + + text, headers = 
packages_txt2 + requests_mock.get( + "/packages/2.2/bioc/src/contrib/PACKAGES", + text=text, + headers=headers, + ) + + requests_mock.get( + "/packages/2.2/data/experiment/src/contrib/PACKAGES", status_code=404 + ) + requests_mock.get( + "/packages/2.2/data/annotation/src/contrib/PACKAGES", status_code=404 + ) + + lister = BioconductorLister( + scheduler=swh_scheduler, + releases=releases, + categories=categories, + incremental=True, + ) + + lister.get_origins_from_page: Mock = mocker.spy(lister, "get_origins_from_page") + + stats = lister.run() + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results + lister_state = lister.get_state_from_scheduler() + + assert stats.pages == 2 # 1.7 'bioc' + None page + assert stats.origins == 3 + + assert lister.get_origins_from_page.call_count == 2 + + expected_origins = [ + "https://www.bioconductor.org/packages/affyPLM", + "https://www.bioconductor.org/packages/affylmGUI", + "https://www.bioconductor.org/packages/affypdnn", + ] + + assert [o.url for o in scheduler_origins] == expected_origins + + expected_loader_packages = [ + { + "1.7/bioc/1.6.0": { + "package": "affyPLM", + "release": "1.7", + "tar_url": ( + "https://www.bioconductor.org/packages/" + "bioc/1.7/src/contrib/Source/affyPLM_1.6.0.tar.gz" + ), + "version": "1.6.0", + "category": "bioc", + } + }, + { + "1.7/bioc/1.4.0": { + "package": "affylmGUI", + "release": "1.7", + "tar_url": ( + "https://www.bioconductor.org/packages/" + "bioc/1.7/src/contrib/Source/affylmGUI_1.4.0.tar.gz" + ), + "version": "1.4.0", + "category": "bioc", + } + }, + { + "1.7/bioc/1.4.0": { + "package": "affypdnn", + "release": "1.7", + "tar_url": ( + "https://www.bioconductor.org/packages/" + "bioc/1.7/src/contrib/Source/affypdnn_1.4.0.tar.gz" + ), + "version": "1.4.0", + "category": "bioc", + } + }, + ] + + assert [ + o.extra_loader_arguments["packages"] for o in scheduler_origins + ] == expected_loader_packages + + assert lister_state.package_versions == { + 
"affyPLM": {"1.7/bioc/1.6.0"}, + "affylmGUI": {"1.7/bioc/1.4.0"}, + "affypdnn": {"1.7/bioc/1.4.0"}, + } + + releases.append("2.2") + + lister = BioconductorLister( + scheduler=swh_scheduler, releases=releases, categories=categories + ) + + stats = lister.run() + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results + lister_state = lister.get_state_from_scheduler() + + expected_origins = [ + "https://www.bioconductor.org/packages/ABarray", + "https://www.bioconductor.org/packages/AnnotationDbi", + ] + expected_origins + + assert [o.url for o in scheduler_origins] == expected_origins + + expected_loader_packages = [ + { + "2.2/bioc/1.8.0": { + "package": "ABarray", + "release": "2.2", + "tar_url": ( + "https://www.bioconductor.org/packages/" + "2.2/bioc/src/contrib/ABarray_1.8.0.tar.gz" + ), + "version": "1.8.0", + "category": "bioc", + } + }, + { + "2.2/bioc/1.2.2": { + "package": "AnnotationDbi", + "release": "2.2", + "tar_url": ( + "https://www.bioconductor.org/packages/" + "2.2/bioc/src/contrib/AnnotationDbi_1.2.2.tar.gz" + ), + "version": "1.2.2", + "category": "bioc", + } + }, + ] + expected_loader_packages + + assert [ + o.extra_loader_arguments["packages"] for o in scheduler_origins + ] == expected_loader_packages + + assert lister_state.package_versions == { + "affyPLM": {"1.7/bioc/1.6.0"}, + "affypdnn": {"1.7/bioc/1.4.0"}, + "affylmGUI": {"1.7/bioc/1.4.0"}, + } diff --git a/swh/lister/bioconductor/tests/test_tasks.py b/swh/lister/bioconductor/tests/test_tasks.py new file mode 100644 index 0000000000000000000000000000000000000000..337a3a1a5b93764b4e2a32eb44ff0df3c6232869 --- /dev/null +++ b/swh/lister/bioconductor/tests/test_tasks.py @@ -0,0 +1,81 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from unittest.mock import patch 
+ +from swh.lister.pattern import ListerStats + + +def test_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker): + res = swh_scheduler_celery_app.send_task("swh.lister.bioconductor.tasks.ping") + assert res + res.wait() + assert res.successful() + assert res.result == "OK" + + +@patch("swh.lister.bioconductor.tasks.BioconductorLister") +def test_full_listing(lister, swh_scheduler_celery_app, swh_scheduler_celery_worker): + lister.from_configfile.return_value = lister + lister.run.return_value = ListerStats(pages=10, origins=500) + + kwargs = dict(url="https://www.bioconductor.org") + res = swh_scheduler_celery_app.send_task( + "swh.lister.bioconductor.tasks.BioconductorListerTask", + kwargs=kwargs, + ) + assert res + res.wait() + assert res.successful() + + lister.from_configfile.assert_called_once_with(**kwargs) + lister.run.assert_called_once_with() + + +@patch("swh.lister.bioconductor.tasks.BioconductorLister") +def test_incremental_listing( + lister, swh_scheduler_celery_app, swh_scheduler_celery_worker +): + lister.from_configfile.return_value = lister + lister.run.return_value = ListerStats(pages=10, origins=500) + + kwargs = dict(url="https://www.bioconductor.org") + res = swh_scheduler_celery_app.send_task( + "swh.lister.bioconductor.tasks.BioconductorIncrementalListerTask", + kwargs=kwargs, + ) + assert res + res.wait() + assert res.successful() + + kwargs["incremental"] = True + + lister.from_configfile.assert_called_once_with(**kwargs) + lister.run.assert_called_once_with() + + +@patch("swh.lister.bioconductor.tasks.BioconductorLister") +def test_full_listing_with_params( + lister, swh_scheduler_celery_app, swh_scheduler_celery_worker +): + lister.from_configfile.return_value = lister + lister.run.return_value = ListerStats(pages=10, origins=500) + + kwargs = dict( + url="https://www.bioconductor.org", + instance="bioconductor-test", + releases=["3.7"], + categories=["bioc", "workflows"], + ) + res = swh_scheduler_celery_app.send_task( + 
"swh.lister.bioconductor.tasks.BioconductorListerTask", + kwargs=kwargs, + ) + assert res + res.wait() + assert res.successful() + + lister.from_configfile.assert_called_once_with(**kwargs) + lister.run.assert_called_once_with()