def register():
    """Describe the Bioconductor lister plugin for the lister registry.

    The lister class is imported lazily, inside the function, so that merely
    importing this package does not pull in the lister implementation.

    Returns:
        A dict with the lister class under ``"lister"`` and the list of
        celery task modules of this package under ``"task_modules"``.
    """
    from .lister import BioconductorLister

    task_modules = [f"{__name__}.tasks"]
    return dict(lister=BioconductorLister, task_modules=task_modules)
from dataclasses import dataclass, field
import json
import logging
from typing import Any, Dict, Iterator, List, Optional, Set, Tuple
from urllib.parse import urljoin

from bs4 import BeautifulSoup
from debian.deb822 import Sources
import iso8601
from packaging import version
from requests import HTTPError

from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin

from ..pattern import CredentialsType, Lister

logger = logging.getLogger(__name__)

Release = str
Category = str
BioconductorListerPage = Optional[Tuple[Release, Category, Dict[str, Any]]]


@dataclass
class BioconductorListerState:
    """State of the Bioconductor lister"""

    package_versions: Dict[str, Set[str]] = field(default_factory=dict)
    """Dictionary mapping a package name to all the versions found during
    last listing"""


class BioconductorLister(Lister[BioconductorListerState, BioconductorListerPage]):
    """List origins from Bioconductor, a collection of open source software
    for bioinformatics based on the R statistical programming language.

    A page is a ``(release, category, packages)`` tuple. Origins are
    accumulated across all pages (a package usually appears in several
    releases) and only emitted once the final ``None`` page is reached.
    """

    LISTER_NAME = "bioconductor"
    VISIT_TYPE = "bioconductor"
    INSTANCE = "bioconductor"

    BIOCONDUCTOR_HOMEPAGE = "https://www.bioconductor.org"

    def __init__(
        self,
        scheduler: SchedulerInterface,
        url: str = BIOCONDUCTOR_HOMEPAGE,
        instance: str = INSTANCE,
        credentials: Optional[CredentialsType] = None,
        releases: Optional[List[Release]] = None,
        categories: Optional[List[Category]] = None,
        incremental: bool = False,
        max_origins_per_page: Optional[int] = None,
        max_pages: Optional[int] = None,
        enable_origins: bool = True,
        record_batch_size: int = 1000,
    ):
        """Set up the lister.

        Args:
            releases: Bioconductor releases to list; when ``None`` they are
                scraped from the release announcements page.
            categories: package categories to list for releases >= 2.5;
                defaults to ``bioc``, ``workflows``, ``data/annotation`` and
                ``data/experiment``.
            incremental: when true, origins whose set of listed versions did
                not change since the previous listing are not sent again.

        The remaining arguments are forwarded unchanged to the base lister.
        """
        super().__init__(
            scheduler=scheduler,
            url=url,
            instance=instance,
            credentials=credentials,
            max_origins_per_page=max_origins_per_page,
            max_pages=max_pages,
            enable_origins=enable_origins,
            record_batch_size=record_batch_size,
        )

        if releases is None:
            self.releases = self.fetch_versions()
        else:
            self.releases = releases

        self.categories = categories or [
            "bioc",
            "workflows",
            "data/annotation",
            "data/experiment",
        ]

        self.incremental = incremental

        # Origins accumulated over all pages, keyed by origin URL.
        self.listed_origins: Dict[str, ListedOrigin] = {}
        # URLs of the origins that must be sent to the scheduler.
        self.origins_to_send: Set[str] = set()
        # Versions found during this listing, keyed by package name.
        self.package_versions: Dict[str, Set[str]] = {}

    def state_from_dict(self, d: Dict[str, Any]) -> BioconductorListerState:
        """Rebuild the lister state from its serialized form."""
        return BioconductorListerState(
            package_versions={k: set(v) for k, v in d.items()}
        )

    def state_to_dict(self, state: BioconductorListerState) -> Dict[str, Any]:
        """Serialize the lister state; versions are sorted so that the
        stored representation does not depend on set iteration order."""
        return {k: sorted(v) for k, v in state.package_versions.items()}

    def origin_url_for_package(self, package_name: str) -> str:
        """Return the origin URL used for the package ``package_name``."""
        return f"{self.BIOCONDUCTOR_HOMEPAGE}/packages/{package_name}"

    def get_pages(self) -> Iterator[BioconductorListerPage]:
        """Return an iterator for each page. Every page is a (release, category) pair.

        Pages whose index cannot be fetched (HTTP error) are skipped. A final
        ``None`` page is yielded to tell :meth:`get_origins_from_page` that
        every release has been processed.
        """
        categories: List[Category]
        for release in self.releases:
            release_version = version.parse(release)
            if release_version < version.parse("1.8"):
                # only bioc category existed before 1.8
                url_template = urljoin(
                    self.url, "/packages/{category}/{release}/src/contrib/PACKAGES"
                )
                categories = ["bioc"]
            elif release_version < version.parse("2.5"):
                # workflows category won't exist for these
                url_template = urljoin(
                    self.url, "/packages/{release}/{category}/src/contrib/PACKAGES"
                )
                categories = ["bioc", "data/annotation", "data/experiment"]
            else:
                url_template = urljoin(
                    self.url, "/packages/json/{release}/{category}/packages.json"
                )
                # Drop duplicates but keep the configured order, so that pages
                # are requested in a reproducible order.
                categories = list(dict.fromkeys(self.categories))

            for category in categories:
                url = url_template.format(release=release, category=category)
                try:
                    packages_txt = self.http_request(url).text
                except HTTPError as e:
                    logger.debug(
                        "Skipping page since got %s response for %s",
                        e.response.status_code,
                        url,
                    )
                    continue

                yield (release, category, self.parse_packages(packages_txt))

        # Yield extra none to signal get_origins_from_page()
        # to stop iterating and yield the extracted origins
        yield None

    def fetch_versions(self) -> List[str]:
        """Scrape the release identifiers from the release announcements page.

        Only the rows whose third cell contains a link are kept. The table
        rows are read in reverse order.
        """
        html = self.http_request(
            f"{self.BIOCONDUCTOR_HOMEPAGE}/about/release-announcements"
        ).text
        bs = BeautifulSoup(html, "html.parser")
        rows = bs.find("table").find("tbody").find_all("tr")

        releases: List[str] = []
        for tr in reversed(rows):
            cells = tr.find_all("td")
            # Rows with fewer than three cells cannot carry the link we
            # look for; ignore them instead of failing on the index.
            if len(cells) > 2 and cells[2].find("a"):
                releases.append(cells[0].text)
        return releases

    def parse_packages(self, text: str) -> Dict[str, Any]:
        """Parses packages.json and PACKAGES files

        The text is first tried as JSON; when that fails it is parsed as a
        sequence of deb822-style paragraphs, keyed by their ``Package`` field.
        """
        try:
            return json.loads(text)
        except json.JSONDecodeError:
            pass

        sources = Sources.iter_paragraphs(text)
        return {s["Package"]: dict(s) for s in sources}

    def get_origins_from_page(
        self, page: BioconductorListerPage
    ) -> Iterator[ListedOrigin]:
        """Convert a page of BioconductorLister PACKAGES/packages.json
        metadata into a list of ListedOrigins

        Regular pages only accumulate data in ``self.listed_origins`` and
        ``self.origins_to_send``; nothing is yielded for them. The origins
        are yielded when the final ``None`` page is received.
        """
        assert self.lister_obj.id is not None

        if page is None:
            for origin_url in self.origins_to_send:
                yield self.listed_origins[origin_url]

            return

        release, category, packages = page
        # Parsed once: it is the same for every package of the page.
        release_version = version.parse(release)

        origins_to_send = set()

        for pkg_name, pkg_metadata in packages.items():
            pkg_version = pkg_metadata["Version"]
            last_update_date = None
            last_update_str = ""

            if release_version < version.parse("1.8"):
                tar_url = urljoin(
                    self.url,
                    f"/packages/{category}/{release}/src/contrib/Source/"
                    f"{pkg_name}_{pkg_version}.tar.gz",
                )
            elif release_version < version.parse("2.5"):
                tar_url = urljoin(
                    self.url,
                    f"/packages/{release}/{category}/src/contrib/"
                    f"{pkg_name}_{pkg_version}.tar.gz",
                )
            else:
                # Some packages don't have a download URL (based on source.ver)
                # and hence can't be archived. For example see the package
                # maEndToEnd at the end of
                # https://bioconductor.org/packages/json/3.17/workflows/packages.json

                # Even guessing tar url path based on the expected url format
                # doesn't work. i.e.
                # https://bioconductor.org/packages/3.17/workflows/src/contrib/maEndToEnd_2.20.0.tar.gz
                # doesn't respond with a tar file. Plus, the mirror clearly shows
                # that maEndToEnd tar is missing.
                # https://ftp.gwdg.de/pub/misc/bioconductor/packages/3.17/workflows/src/contrib/
                # So skipping such packages

                if "source.ver" not in pkg_metadata:
                    logger.info(
                        (
                            "Skipping package %s listed in release %s "
                            "category %s since it doesn't have a download URL"
                        ),
                        pkg_name,
                        release,
                        category,
                    )
                    continue

                if "git_url" in pkg_metadata:
                    # Along with the .tar.gz files grab the git repo as well
                    git_origin_url = pkg_metadata["git_url"]
                    git_last_update_str = pkg_metadata.get("git_last_commit_date")
                    self.listed_origins[git_origin_url] = ListedOrigin(
                        lister_id=self.lister_obj.id,
                        visit_type="git",
                        url=git_origin_url,
                        last_update=(
                            iso8601.parse_date(git_last_update_str)
                            if git_last_update_str
                            else None
                        ),
                    )
                    origins_to_send.add(git_origin_url)

                tar_url = urljoin(
                    self.url,
                    f"/packages/{release}/{category}/{pkg_metadata['source.ver']}",
                )

                last_update_str = pkg_metadata.get(
                    "Date/Publication", pkg_metadata.get("git_last_commit_date")
                )
                last_update_date = (
                    iso8601.parse_date(last_update_str) if last_update_str else None
                )
                # For some packages in releases >= 2.5, last_update can still
                # remain None. Example: See "adme16cod.db" entry in
                # https://bioconductor.org/packages/json/3.17/data/annotation/packages.json

            origin_url = self.origin_url_for_package(pkg_name)
            package_version_key = f"{release}/{category}/{pkg_version}"

            if origin_url not in self.listed_origins:
                self.listed_origins[origin_url] = ListedOrigin(
                    lister_id=self.lister_obj.id,
                    visit_type=self.VISIT_TYPE,
                    url=origin_url,
                    last_update=last_update_date,
                    extra_loader_arguments={"packages": {}},
                )

                self.package_versions[pkg_name] = set()

            origin = self.listed_origins[origin_url]
            origins_to_send.add(origin_url)

            package_info: Dict[str, Any] = {
                "release": release,
                "version": pkg_version,
                "category": category,
                "package": pkg_name,
                "tar_url": tar_url,
            }
            if "MD5sum" in pkg_metadata:
                package_info["checksums"] = {"md5": pkg_metadata["MD5sum"]}
            if last_update_str:
                package_info["last_update_date"] = last_update_str

            origin.extra_loader_arguments["packages"][
                package_version_key
            ] = package_info

            # Keep the most recent known date. An origin first seen in a
            # release that carries no date starts with last_update=None, and
            # must still pick up a date found in a later release.
            last_update = origin.last_update
            if last_update_date is not None and (
                last_update is None or last_update_date > last_update
            ):
                origin.last_update = last_update_date

            self.package_versions[pkg_name].add(package_version_key)

            # package has been listed during a previous listing
            if self.incremental and pkg_name in self.state.package_versions:
                new_versions = (
                    self.package_versions[pkg_name]
                    - self.state.package_versions[pkg_name]
                )
                # no new versions, no need to send the origin to the scheduler
                if not new_versions:
                    origins_to_send.discard(origin_url)

        self.origins_to_send.update(origins_to_send)

    def finalize(self) -> None:
        """Store the versions found in the lister state (incremental mode
        only) and flag the state as updated when any origin was listed."""
        if self.incremental:
            self.state.package_versions = self.package_versions

        self.updated = len(self.listed_origins) > 0
from typing import Dict

from celery import shared_task

from .lister import BioconductorLister


@shared_task(name=__name__ + ".BioconductorListerTask")
def list_bioconductor_full(**lister_args) -> Dict[str, int]:
    """Run a full listing of Bioconductor packages.

    The keyword arguments are handed over to
    ``BioconductorLister.from_configfile``; the statistics of the run are
    returned as a dict.
    """
    stats = BioconductorLister.from_configfile(**lister_args).run()
    return stats.dict()


@shared_task(name=__name__ + ".BioconductorIncrementalListerTask")
def list_bioconductor_incremental(**lister_args) -> Dict[str, int]:
    """Run an incremental listing of Bioconductor packages.

    Same as the full listing task, except that the lister is created with
    ``incremental=True``.
    """
    stats = BioconductorLister.from_configfile(**lister_args, incremental=True).run()
    return stats.dict()


@shared_task(name=__name__ + ".ping")
def _ping() -> str:
    """Liveness check task: always answers ``"OK"``."""
    answer = "OK"
    return answer
b/swh/lister/bioconductor/tests/data/https_bioconductor.org/2.2-PACKAGES new file mode 100644 index 0000000..01d9c32 --- /dev/null +++ b/swh/lister/bioconductor/tests/data/https_bioconductor.org/2.2-PACKAGES @@ -0,0 +1,12 @@ +Package: ABarray +Version: 1.8.0 +Depends: Biobase, multtest, tcltk +Suggests: limma, LPE + +Package: AnnotationDbi +Version: 1.2.2 +Depends: R (>= 2.7.0), methods, utils, Biobase (>= 1.17.0), DBI (>= + 0.2-4), RSQLite (>= 0.6-4) +Imports: methods, utils, Biobase, DBI, RSQLite +Suggests: hgu95av2.db, hgu95av2, GO.db, GO, human.db0, mouse.db0, + rat.db0, fly.db0, yeast.db0, arabidopsis.db0 diff --git a/swh/lister/bioconductor/tests/data/https_bioconductor.org/3.17-bioc-packages.json b/swh/lister/bioconductor/tests/data/https_bioconductor.org/3.17-bioc-packages.json new file mode 100644 index 0000000..3575b90 --- /dev/null +++ b/swh/lister/bioconductor/tests/data/https_bioconductor.org/3.17-bioc-packages.json @@ -0,0 +1,166 @@ +{ + "annotation": { + "Package": "annotation", + "Version": "1.24.1", + "Depends": [ + "R (>= 3.3.0)", + "VariantAnnotation", + "AnnotationHub", + "Organism.dplyr", + "TxDb.Hsapiens.UCSC.hg19.knownGene", + "TxDb.Hsapiens.UCSC.hg38.knownGene", + "TxDb.Mmusculus.UCSC.mm10.ensGene", + "org.Hs.eg.db", + "org.Mm.eg.db", + "Homo.sapiens", + "BSgenome.Hsapiens.UCSC.hg19", + "biomaRt", + "BSgenome", + "TxDb.Athaliana.BioMart.plantsmart22" + ], + "Suggests": [ + "knitr", + "rmarkdown", + "BiocStyle" + ], + "License": "Artistic-2.0", + "MD5sum": "4cb4db8807acb2e164985636091faa93", + "NeedsCompilation": "no", + "Title": "Genomic Annotation Resources", + "Description": "Annotation resources make up a significant proportion of the Bioconductor project. And there are also a diverse set of online resources available which are accessed using specific packages. 
This walkthrough will describe the most popular of these resources and give some high level examples on how to use them.", + "biocViews": [ + "AnnotationWorkflow", + "Workflow" + ], + "Author": "Marc RJ Carlson [aut], Herve Pages [aut], Sonali Arora [aut], Valerie Obenchain [aut], Martin Morgan [aut], Bioconductor Package Maintainer [cre]", + "Maintainer": "Bioconductor Package Maintainer ", + "URL": "http://bioconductor.org/help/workflows/annotation/Annotation_Resources/", + "VignetteBuilder": "knitr", + "git_url": "https://git.bioconductor.org/packages/annotation", + "git_branch": "RELEASE_3_17", + "git_last_commit": "4568557", + "git_last_commit_date": "2023-06-28", + "Date/Publication": "2023-06-30", + "source.ver": "src/contrib/annotation_1.24.1.tar.gz", + "vignettes": [ + "vignettes/annotation/inst/doc/Annotating_Genomic_Ranges.html", + "vignettes/annotation/inst/doc/Annotation_Resources.html" + ], + "vignetteTitles": [ + "Annotating Genomic Ranges", + "Genomic Annotation Resources" + ], + "hasREADME": false, + "hasNEWS": false, + "hasINSTALL": false, + "hasLICENSE": false, + "Rfiles": [ + "vignettes/annotation/inst/doc/Annotating_Genomic_Ranges.R", + "vignettes/annotation/inst/doc/Annotation_Resources.R" + ], + "dependencyCount": "143", + "Rank": 23 + }, + "variants": { + "Package": "variants", + "Version": "1.24.0", + "Depends": [ + "R (>= 3.3.0)", + "VariantAnnotation", + "org.Hs.eg.db", + "TxDb.Hsapiens.UCSC.hg19.knownGene", + "BSgenome.Hsapiens.UCSC.hg19", + "PolyPhen.Hsapiens.dbSNP131" + ], + "Suggests": [ + "knitr", + "rmarkdown", + "BiocStyle" + ], + "License": "Artistic-2.0", + "MD5sum": "38f2c00b73e1a695f5ef4c9b4a728923", + "NeedsCompilation": "no", + "Title": "Annotating Genomic Variants", + "Description": "Read and write VCF files. Identify structural location of variants and compute amino acid coding changes for non-synonymous variants. 
Use SIFT and PolyPhen database packages to predict consequence of amino acid coding changes.", + "biocViews": [ + "AnnotationWorkflow", + "ImmunoOncologyWorkflow", + "Workflow" + ], + "Author": "Valerie Obenchain [aut], Martin Morgan [ctb], Bioconductor Package Maintainer [cre]", + "Maintainer": "Bioconductor Package Maintainer ", + "URL": "https://bioconductor.org/help/workflows/variants/", + "VignetteBuilder": "knitr", + "git_url": "https://git.bioconductor.org/packages/variants", + "git_branch": "RELEASE_3_17", + "git_last_commit": "d311e59", + "git_last_commit_date": "2023-04-25", + "Date/Publication": "2023-04-28", + "source.ver": "src/contrib/variants_1.24.0.tar.gz", + "vignettes": [ + "vignettes/variants/inst/doc/Annotating_Genomic_Variants.html" + ], + "vignetteTitles": [ + "Annotating Genomic Variants" + ], + "hasREADME": false, + "hasNEWS": false, + "hasINSTALL": false, + "hasLICENSE": false, + "Rfiles": [ + "vignettes/variants/inst/doc/Annotating_Genomic_Variants.R" + ], + "dependencyCount": "103", + "Rank": 16 + }, + "maEndToEnd": { + "Package": "maEndToEnd", + "Version": "2.20.0", + "Depends": [ + "R (>= 3.5.0)", + "Biobase", + "oligoClasses", + "ArrayExpress", + "pd.hugene.1.0.st.v1", + "hugene10sttranscriptcluster.db", + "oligo", + "arrayQualityMetrics", + "limma", + "topGO", + "ReactomePA", + "clusterProfiler", + "gplots", + "ggplot2", + "geneplotter", + "pheatmap", + "RColorBrewer", + "dplyr", + "tidyr", + "stringr", + "matrixStats", + "genefilter", + "openxlsx", + "Rgraphviz", + "enrichplot" + ], + "Suggests": [ + "BiocStyle", + "knitr", + "devtools", + "rmarkdown" + ], + "License": "MIT + file LICENSE", + "NeedsCompilation": "no", + "Title": "An end to end workflow for differential gene expression using Affymetrix microarrays", + "Description": "In this article, we walk through an end-to-end Affymetrix microarray differential expression workflow using Bioconductor packages. 
This workflow is directly applicable to current \"Gene\" type arrays, e.g. the HuGene or MoGene arrays, but can easily be adapted to similar platforms. The data analyzed here is a typical clinical microarray data set that compares inflamed and non-inflamed colon tissue in two disease subtypes. For each disease, the differential gene expression between inflamed- and non-inflamed colon tissue was analyzed. We will start from the raw data CEL files, show how to import them into a Bioconductor ExpressionSet, perform quality control and normalization and finally differential gene expression (DE) analysis, followed by some enrichment analysis.", + "biocViews": [ + "GeneExpressionWorkflow", + "Workflow" + ], + "Author": "Bernd Klaus [aut], Stefanie Reisenauer [aut, cre]", + "Maintainer": "Stefanie Reisenauer ", + "URL": "https://www.bioconductor.org/help/workflows/", + "VignetteBuilder": "knitr", + "Rank": 21 + } +} diff --git a/swh/lister/bioconductor/tests/data/https_bioconductor.org/3.17-workflows-packages.json b/swh/lister/bioconductor/tests/data/https_bioconductor.org/3.17-workflows-packages.json new file mode 100644 index 0000000..08a7e82 --- /dev/null +++ b/swh/lister/bioconductor/tests/data/https_bioconductor.org/3.17-workflows-packages.json @@ -0,0 +1,162 @@ +{ + "annotation": { + "Package": "annotation", + "Version": "1.24.1", + "Depends": [ + "R (>= 3.3.0)", + "VariantAnnotation", + "AnnotationHub", + "Organism.dplyr", + "TxDb.Hsapiens.UCSC.hg19.knownGene", + "TxDb.Hsapiens.UCSC.hg38.knownGene", + "TxDb.Mmusculus.UCSC.mm10.ensGene", + "org.Hs.eg.db", + "org.Mm.eg.db", + "Homo.sapiens", + "BSgenome.Hsapiens.UCSC.hg19", + "biomaRt", + "BSgenome", + "TxDb.Athaliana.BioMart.plantsmart22" + ], + "Suggests": [ + "knitr", + "rmarkdown", + "BiocStyle" + ], + "License": "Artistic-2.0", + "MD5sum": "4cb4db8807acb2e164985636091faa93", + "NeedsCompilation": "no", + "Title": "Genomic Annotation Resources", + "Description": "Annotation resources make up a significant 
proportion of the Bioconductor project. And there are also a diverse set of online resources available which are accessed using specific packages. This walkthrough will describe the most popular of these resources and give some high level examples on how to use them.", + "biocViews": [ + "AnnotationWorkflow", + "Workflow" + ], + "Author": "Marc RJ Carlson [aut], Herve Pages [aut], Sonali Arora [aut], Valerie Obenchain [aut], Martin Morgan [aut], Bioconductor Package Maintainer [cre]", + "Maintainer": "Bioconductor Package Maintainer ", + "URL": "http://bioconductor.org/help/workflows/annotation/Annotation_Resources/", + "VignetteBuilder": "knitr", + "git_url": "https://git.bioconductor.org/packages/annotation", + "git_branch": "RELEASE_3_17", + "git_last_commit": "4568557", + "git_last_commit_date": "2023-06-28", + "Date/Publication": "2023-06-30", + "source.ver": "src/contrib/annotation_1.24.1.tar.gz", + "vignettes": [ + "vignettes/annotation/inst/doc/Annotating_Genomic_Ranges.html", + "vignettes/annotation/inst/doc/Annotation_Resources.html" + ], + "vignetteTitles": [ + "Annotating Genomic Ranges", + "Genomic Annotation Resources" + ], + "hasREADME": false, + "hasNEWS": false, + "hasINSTALL": false, + "hasLICENSE": false, + "Rfiles": [ + "vignettes/annotation/inst/doc/Annotating_Genomic_Ranges.R", + "vignettes/annotation/inst/doc/Annotation_Resources.R" + ], + "dependencyCount": "144", + "Rank": 8 + }, + "arrays": { + "Package": "arrays", + "Version": "1.26.0", + "Depends": [ + "R (>= 3.0.0)" + ], + "Suggests": [ + "affy", + "limma", + "hgfocuscdf", + "knitr", + "rmarkdown", + "BiocStyle" + ], + "License": "Artistic-2.0", + "MD5sum": "009ef917ebc047246b8c62c48e02a237", + "NeedsCompilation": "no", + "Title": "Using Bioconductor for Microarray Analysis", + "Description": "Using Bioconductor for Microarray Analysis workflow", + "biocViews": [ + "BasicWorkflow", + "Workflow" + ], + "Author": "Bioconductor Package Maintainer [aut, cre]", + "Maintainer": "Bioconductor 
Package Maintainer ", + "VignetteBuilder": "knitr", + "git_url": "https://git.bioconductor.org/packages/arrays", + "git_branch": "RELEASE_3_17", + "git_last_commit": "9981a8c", + "git_last_commit_date": "2023-04-25", + "Date/Publication": "2023-04-28", + "source.ver": "src/contrib/arrays_1.26.0.tar.gz", + "vignettes": [ + "vignettes/arrays/inst/doc/arrays.html" + ], + "vignetteTitles": [ + "Using Bioconductor for Microarray Analysis" + ], + "hasREADME": false, + "hasNEWS": false, + "hasINSTALL": false, + "hasLICENSE": false, + "Rfiles": [ + "vignettes/arrays/inst/doc/arrays.R" + ], + "dependencyCount": "0", + "Rank": 13 + }, + "maEndToEnd": { + "Package": "maEndToEnd", + "Version": "2.20.0", + "Depends": [ + "R (>= 3.5.0)", + "Biobase", + "oligoClasses", + "ArrayExpress", + "pd.hugene.1.0.st.v1", + "hugene10sttranscriptcluster.db", + "oligo", + "arrayQualityMetrics", + "limma", + "topGO", + "ReactomePA", + "clusterProfiler", + "gplots", + "ggplot2", + "geneplotter", + "pheatmap", + "RColorBrewer", + "dplyr", + "tidyr", + "stringr", + "matrixStats", + "genefilter", + "openxlsx", + "Rgraphviz", + "enrichplot" + ], + "Suggests": [ + "BiocStyle", + "knitr", + "devtools", + "rmarkdown" + ], + "License": "MIT + file LICENSE", + "NeedsCompilation": "no", + "Title": "An end to end workflow for differential gene expression using Affymetrix microarrays", + "Description": "In this article, we walk through an end-to-end Affymetrix microarray differential expression workflow using Bioconductor packages. This workflow is directly applicable to current \"Gene\" type arrays, e.g. the HuGene or MoGene arrays, but can easily be adapted to similar platforms. The data analyzed here is a typical clinical microarray data set that compares inflamed and non-inflamed colon tissue in two disease subtypes. For each disease, the differential gene expression between inflamed- and non-inflamed colon tissue was analyzed. 
We will start from the raw data CEL files, show how to import them into a Bioconductor ExpressionSet, perform quality control and normalization and finally differential gene expression (DE) analysis, followed by some enrichment analysis.", + "biocViews": [ + "GeneExpressionWorkflow", + "Workflow" + ], + "Author": "Bernd Klaus [aut], Stefanie Reisenauer [aut, cre]", + "Maintainer": "Stefanie Reisenauer ", + "URL": "https://www.bioconductor.org/help/workflows/", + "VignetteBuilder": "knitr", + "Rank": 10 + } +} diff --git a/swh/lister/bioconductor/tests/data/https_bioconductor.org/about/release-announcements b/swh/lister/bioconductor/tests/data/https_bioconductor.org/about/release-announcements new file mode 100644 index 0000000..7c2e0d3 --- /dev/null +++ b/swh/lister/bioconductor/tests/data/https_bioconductor.org/about/release-announcements @@ -0,0 +1,553 @@ + + + + + + + + + + + + + + + + + + + + + + + + Bioconductor - Release Announcements + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + + + + + + + +

Bioconductor releases

+ +

Each Bioconductor release is designed to work with a specific +version of R. The following table summarizes the relationship, and +links to packages designed to work with the corresponding R / +Bioconductor version.

+ +

Bioconductor versions are linked to their release announcement (when +available). Release announcements summarize new package additions, +updates to existing packages, and package removals.

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
ReleaseDateSoftware packagesR
3.17April 26, 202322304.3
3.16November 2, 202221834.2
3.15April 27, 202221404.2
3.14October 27, 202120834.1
3.13May 20, 202120424.1
3.12October 28, 202019744.0
3.11April 28, 202019034.0
3.10October 30, 201918233.6
3.9May 3, 201917413.6
3.8October 31, 201816493.5
3.7May 1, 201815603.5
3.6October 31, 201714733.4
3.5April 25, 201713833.4
3.4October 18, 201612963.3
3.3May 4, 201612113.3
3.2October 14, 201511043.2
3.1April 17, 201510243.2
3.0October 14, 20149343.1
2.14April 14, 20148243.1
2.13October 15, 20137493.0
2.12April 4, 20136713.0
2.11October 3, 20126102.15
2.10April 2, 20125542.15
2.9November 1, 20115172.14
2.8April 14, 20114662.13
2.7October 18, 20104182.12
2.6April 23, 20103892.11
2.5October 28, 20093522.10
2.4April 21, 20093202.9
2.3October 22, 20082942.8
2.2May 1, 20082602.7
2.1October 8, 20072332.6
2.0April 26, 20072142.5
1.9October 4, 20061882.4
1.8April 27, 20061722.3
1.7October 14, 20051412.2
1.6May 18, 20051232.1
1.5October 25, 20041002.0
1.4May 17, 2004811.9
1.3October 30, 2003491.8
1.2May 29, 2003301.7
1.1November 19, 2002201.6
1.0May 1, 2002151.5
+ + + +
+ + + + + +
+ + + + + + + + + +
+
+ + + + + + +
+
+ +
+ +
+ + diff --git a/swh/lister/bioconductor/tests/test_lister.py b/swh/lister/bioconductor/tests/test_lister.py new file mode 100644 index 0000000..6c0e570 --- /dev/null +++ b/swh/lister/bioconductor/tests/test_lister.py @@ -0,0 +1,501 @@ +# Copyright (C) 2022-2023 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from pathlib import Path +from unittest.mock import Mock + +import pytest +from requests_mock.mocker import Mocker as RequestsMocker + +from swh.lister.bioconductor.lister import BioconductorLister +from swh.scheduler.interface import SchedulerInterface + +BIOCONDUCTOR_URL = "https://www.bioconductor.org" + + +@pytest.fixture +def packages_json1(datadir): + text = Path( + datadir, "https_bioconductor.org", "3.17-bioc-packages.json" + ).read_text() + return text, {} + + +@pytest.fixture +def packages_json2(datadir): + text = Path( + datadir, "https_bioconductor.org", "3.17-workflows-packages.json" + ).read_text() + return text, {} + + +@pytest.fixture +def packages_txt1(datadir): + text = Path(datadir, "https_bioconductor.org", "1.17-PACKAGES").read_text() + return text, {} + + +@pytest.fixture +def packages_txt2(datadir): + text = Path(datadir, "https_bioconductor.org", "2.2-PACKAGES").read_text() + return text, {} + + +@pytest.fixture(autouse=True) +def mock_release_announcements(datadir, requests_mock): + text = Path( + datadir, "https_bioconductor.org", "about", "release-announcements" + ).read_text() + requests_mock.get( + "https://www.bioconductor.org/about/release-announcements", + text=text, + headers={}, + ) + + +def test_bioconductor_incremental_listing( + swh_scheduler, requests_mock, mocker, packages_json1, packages_json2 +): + kwargs = dict() + lister = BioconductorLister( + scheduler=swh_scheduler, + releases=["3.17"], + categories=["bioc", "workflows"], + 
incremental=True, + **kwargs, + ) + assert lister.url == BIOCONDUCTOR_URL + + lister.get_origins_from_page: Mock = mocker.spy(lister, "get_origins_from_page") + + for category, packages_json in [ + ("bioc", packages_json1), + ("workflows", packages_json2), + ]: + text, headers = packages_json + requests_mock.get( + ( + "https://www.bioconductor.org/packages/" + f"json/3.17/{category}/packages.json" + ), + text=text, + headers=headers, + ) + + status = lister.run() + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results + lister_state = lister.get_state_from_scheduler() + + assert status.pages == 3 # 2 categories for 3.17 + 1 None page + # annotation pkg origin is in 2 categories + # and we collect git origins as well + assert status.origins == 6 + + assert lister.get_origins_from_page.call_count == 3 + + assert [o.url for o in scheduler_origins] == [ + "https://git.bioconductor.org/packages/annotation", + "https://git.bioconductor.org/packages/arrays", + "https://git.bioconductor.org/packages/variants", + "https://www.bioconductor.org/packages/annotation", + "https://www.bioconductor.org/packages/arrays", + "https://www.bioconductor.org/packages/variants", + ] + + assert [ + o.extra_loader_arguments["packages"] + for o in scheduler_origins + if "packages" in o.extra_loader_arguments + ] == [ + { + "3.17/bioc/1.24.1": { + "package": "annotation", + "release": "3.17", + "tar_url": ( + "https://www.bioconductor.org/packages/3.17/" + "bioc/src/contrib/annotation_1.24.1.tar.gz" + ), + "version": "1.24.1", + "category": "bioc", + "checksums": {"md5": "4cb4db8807acb2e164985636091faa93"}, + "last_update_date": "2023-06-30", + }, + "3.17/workflows/1.24.1": { + "package": "annotation", + "release": "3.17", + "tar_url": ( + "https://www.bioconductor.org/packages/3.17/" + "workflows/src/contrib/annotation_1.24.1.tar.gz" + ), + "version": "1.24.1", + "category": "workflows", + "checksums": {"md5": "4cb4db8807acb2e164985636091faa93"}, + 
"last_update_date": "2023-06-30", + }, + }, + { + "3.17/workflows/1.26.0": { + "package": "arrays", + "release": "3.17", + "tar_url": ( + "https://www.bioconductor.org/packages/3.17/" + "workflows/src/contrib/arrays_1.26.0.tar.gz" + ), + "version": "1.26.0", + "category": "workflows", + "checksums": {"md5": "009ef917ebc047246b8c62c48e02a237"}, + "last_update_date": "2023-04-28", + } + }, + { + "3.17/bioc/1.24.0": { + "package": "variants", + "release": "3.17", + "tar_url": ( + "https://www.bioconductor.org/packages/3.17/" + "bioc/src/contrib/variants_1.24.0.tar.gz" + ), + "version": "1.24.0", + "category": "bioc", + "checksums": {"md5": "38f2c00b73e1a695f5ef4c9b4a728923"}, + "last_update_date": "2023-04-28", + } + }, + ] + + assert lister_state.package_versions == { + "annotation": {"3.17/workflows/1.24.1", "3.17/bioc/1.24.1"}, + "arrays": {"3.17/workflows/1.26.0"}, + "variants": {"3.17/bioc/1.24.0"}, + } + + +@pytest.mark.parametrize("status_code", [400, 500, 404]) +def test_bioconductor_lister_http_error( + swh_scheduler: SchedulerInterface, + requests_mock: RequestsMocker, + packages_json1, + status_code: int, +): + """ + Simulates handling of HTTP Errors while fetching of packages for bioconductor releases. + """ + releases = ["3.8"] + categories = ["workflows", "bioc"] + + requests_mock.get( + "https://www.bioconductor.org/packages/json/3.8/workflows/packages.json", + status_code=status_code, + text="Something went wrong", + ) + + text, headers = packages_json1 + requests_mock.get( + "https://www.bioconductor.org/packages/json/3.8/bioc/packages.json", + text=text, + headers=headers, + ) + + lister = BioconductorLister( + scheduler=swh_scheduler, + releases=releases, + categories=categories, + incremental=True, + ) + + # On facing HTTP errors, it should continue + # to crawl other releases/categories + stats = lister.run() + # 1 None page + 3.8 bioc page + assert stats.pages == 2 + # Both packages have git and bioconductor urls. 
+ assert stats.origins == 4 + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results + assert [o.url for o in scheduler_origins] == [ + "https://git.bioconductor.org/packages/annotation", + "https://git.bioconductor.org/packages/variants", + "https://www.bioconductor.org/packages/annotation", + "https://www.bioconductor.org/packages/variants", + ] + assert [ + o.extra_loader_arguments["packages"] + for o in scheduler_origins + if "packages" in o.extra_loader_arguments + ] == [ + { + "3.8/bioc/1.24.1": { + "package": "annotation", + "release": "3.8", + "tar_url": ( + "https://www.bioconductor.org/packages/" + "3.8/bioc/src/contrib/annotation_1.24.1.tar.gz" + ), + "version": "1.24.1", + "category": "bioc", + "checksums": {"md5": "4cb4db8807acb2e164985636091faa93"}, + "last_update_date": "2023-06-30", + } + }, + { + "3.8/bioc/1.24.0": { + "package": "variants", + "release": "3.8", + "tar_url": ( + "https://www.bioconductor.org/packages/" + "3.8/bioc/src/contrib/variants_1.24.0.tar.gz" + ), + "version": "1.24.0", + "category": "bioc", + "checksums": {"md5": "38f2c00b73e1a695f5ef4c9b4a728923"}, + "last_update_date": "2023-04-28", + } + }, + ] + + lister_state = lister.get_state_from_scheduler() + assert lister_state.package_versions == { + "annotation": {"3.8/bioc/1.24.1"}, + "variants": {"3.8/bioc/1.24.0"}, + } + + +def test_bioconductor_fetch_versions(swh_scheduler: SchedulerInterface): + lister = BioconductorLister(scheduler=swh_scheduler) + assert lister.releases == [ + "1.5", + "1.6", + "1.7", + "1.8", + "1.9", + "2.0", + "2.1", + "2.2", + "2.3", + "2.4", + "2.5", + "2.6", + "2.7", + "2.8", + "2.9", + "2.10", + "2.11", + "2.12", + "2.13", + "2.14", + "3.0", + "3.1", + "3.2", + "3.3", + "3.4", + "3.5", + "3.6", + "3.7", + "3.8", + "3.9", + "3.10", + "3.11", + "3.12", + "3.13", + "3.14", + "3.15", + "3.16", + "3.17", + ] + + +def test_bioconductor_lister_parse_packages_txt( + swh_scheduler: SchedulerInterface, packages_json1, packages_txt1 
+): + lister = BioconductorLister( + scheduler=swh_scheduler, releases=["3.8"], categories=["bioc"] + ) + + text, _ = packages_json1 + res = lister.parse_packages(text) + assert { + pkg_name: pkg_metadata["Version"] for pkg_name, pkg_metadata in res.items() + } == {"annotation": "1.24.1", "maEndToEnd": "2.20.0", "variants": "1.24.0"} + + text, _ = packages_txt1 + + res = lister.parse_packages(text) + assert res == { + "affylmGUI": { + "Package": "affylmGUI", + "Version": "1.4.0", + "Depends": "limma, tcltk, affy", + "Suggests": "tkrplot, affyPLM, R2HTML, xtable", + }, + "affypdnn": { + "Package": "affypdnn", + "Version": "1.4.0", + "Depends": "R (>= 1.9.0), affy (>= 1.5), affydata, hgu95av2probe", + }, + "affyPLM": { + "Package": "affyPLM", + "Version": "1.6.0", + "Depends": ( + "R (>= 2.0.0), affy (>= 1.5.0), affydata, " + "Biobase, methods,\n gcrma" + ), + }, + } + + +def test_bioconductor_lister_old_releases( + swh_scheduler, mocker, requests_mock, packages_txt1, packages_txt2 +): + releases = ["1.7"] + categories = ["workflows", "bioc"] + + text, headers = packages_txt1 + requests_mock.get( + ("https://www.bioconductor.org/packages/" "bioc/1.7/src/contrib/PACKAGES"), + text=text, + headers=headers, + ) + + text, headers = packages_txt2 + requests_mock.get( + "/packages/2.2/bioc/src/contrib/PACKAGES", + text=text, + headers=headers, + ) + + requests_mock.get( + "/packages/2.2/data/experiment/src/contrib/PACKAGES", status_code=404 + ) + requests_mock.get( + "/packages/2.2/data/annotation/src/contrib/PACKAGES", status_code=404 + ) + + lister = BioconductorLister( + scheduler=swh_scheduler, + releases=releases, + categories=categories, + incremental=True, + ) + + lister.get_origins_from_page: Mock = mocker.spy(lister, "get_origins_from_page") + + stats = lister.run() + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results + lister_state = lister.get_state_from_scheduler() + + assert stats.pages == 2 # 1.7 'bioc' + None page + assert 
stats.origins == 3 + + assert lister.get_origins_from_page.call_count == 2 + + expected_origins = [ + "https://www.bioconductor.org/packages/affyPLM", + "https://www.bioconductor.org/packages/affylmGUI", + "https://www.bioconductor.org/packages/affypdnn", + ] + + assert [o.url for o in scheduler_origins] == expected_origins + + expected_loader_packages = [ + { + "1.7/bioc/1.6.0": { + "package": "affyPLM", + "release": "1.7", + "tar_url": ( + "https://www.bioconductor.org/packages/" + "bioc/1.7/src/contrib/Source/affyPLM_1.6.0.tar.gz" + ), + "version": "1.6.0", + "category": "bioc", + } + }, + { + "1.7/bioc/1.4.0": { + "package": "affylmGUI", + "release": "1.7", + "tar_url": ( + "https://www.bioconductor.org/packages/" + "bioc/1.7/src/contrib/Source/affylmGUI_1.4.0.tar.gz" + ), + "version": "1.4.0", + "category": "bioc", + } + }, + { + "1.7/bioc/1.4.0": { + "package": "affypdnn", + "release": "1.7", + "tar_url": ( + "https://www.bioconductor.org/packages/" + "bioc/1.7/src/contrib/Source/affypdnn_1.4.0.tar.gz" + ), + "version": "1.4.0", + "category": "bioc", + } + }, + ] + + assert [ + o.extra_loader_arguments["packages"] for o in scheduler_origins + ] == expected_loader_packages + + assert lister_state.package_versions == { + "affyPLM": {"1.7/bioc/1.6.0"}, + "affylmGUI": {"1.7/bioc/1.4.0"}, + "affypdnn": {"1.7/bioc/1.4.0"}, + } + + releases.append("2.2") + + lister = BioconductorLister( + scheduler=swh_scheduler, releases=releases, categories=categories + ) + + stats = lister.run() + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results + lister_state = lister.get_state_from_scheduler() + + expected_origins = [ + "https://www.bioconductor.org/packages/ABarray", + "https://www.bioconductor.org/packages/AnnotationDbi", + ] + expected_origins + + assert [o.url for o in scheduler_origins] == expected_origins + + expected_loader_packages = [ + { + "2.2/bioc/1.8.0": { + "package": "ABarray", + "release": "2.2", + "tar_url": ( + 
"https://www.bioconductor.org/packages/" + "2.2/bioc/src/contrib/ABarray_1.8.0.tar.gz" + ), + "version": "1.8.0", + "category": "bioc", + } + }, + { + "2.2/bioc/1.2.2": { + "package": "AnnotationDbi", + "release": "2.2", + "tar_url": ( + "https://www.bioconductor.org/packages/" + "2.2/bioc/src/contrib/AnnotationDbi_1.2.2.tar.gz" + ), + "version": "1.2.2", + "category": "bioc", + } + }, + ] + expected_loader_packages + + assert [ + o.extra_loader_arguments["packages"] for o in scheduler_origins + ] == expected_loader_packages + + assert lister_state.package_versions == { + "affyPLM": {"1.7/bioc/1.6.0"}, + "affypdnn": {"1.7/bioc/1.4.0"}, + "affylmGUI": {"1.7/bioc/1.4.0"}, + } diff --git a/swh/lister/bioconductor/tests/test_tasks.py b/swh/lister/bioconductor/tests/test_tasks.py new file mode 100644 index 0000000..337a3a1 --- /dev/null +++ b/swh/lister/bioconductor/tests/test_tasks.py @@ -0,0 +1,81 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from unittest.mock import patch + +from swh.lister.pattern import ListerStats + + +def test_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker): + res = swh_scheduler_celery_app.send_task("swh.lister.bioconductor.tasks.ping") + assert res + res.wait() + assert res.successful() + assert res.result == "OK" + + +@patch("swh.lister.bioconductor.tasks.BioconductorLister") +def test_full_listing(lister, swh_scheduler_celery_app, swh_scheduler_celery_worker): + lister.from_configfile.return_value = lister + lister.run.return_value = ListerStats(pages=10, origins=500) + + kwargs = dict(url="https://www.bioconductor.org") + res = swh_scheduler_celery_app.send_task( + "swh.lister.bioconductor.tasks.BioconductorListerTask", + kwargs=kwargs, + ) + assert res + res.wait() + assert res.successful() + + 
lister.from_configfile.assert_called_once_with(**kwargs) + lister.run.assert_called_once_with() + + +@patch("swh.lister.bioconductor.tasks.BioconductorLister") +def test_incremental_listing( + lister, swh_scheduler_celery_app, swh_scheduler_celery_worker +): + lister.from_configfile.return_value = lister + lister.run.return_value = ListerStats(pages=10, origins=500) + + kwargs = dict(url="https://www.bioconductor.org") + res = swh_scheduler_celery_app.send_task( + "swh.lister.bioconductor.tasks.BioconductorIncrementalListerTask", + kwargs=kwargs, + ) + assert res + res.wait() + assert res.successful() + + kwargs["incremental"] = True + + lister.from_configfile.assert_called_once_with(**kwargs) + lister.run.assert_called_once_with() + + +@patch("swh.lister.bioconductor.tasks.BioconductorLister") +def test_full_listing_with_params( + lister, swh_scheduler_celery_app, swh_scheduler_celery_worker +): + lister.from_configfile.return_value = lister + lister.run.return_value = ListerStats(pages=10, origins=500) + + kwargs = dict( + url="https://www.bioconductor.org", + instance="bioconductor-test", + releases=["3.7"], + categories=["bioc", "workflows"], + ) + res = swh_scheduler_celery_app.send_task( + "swh.lister.bioconductor.tasks.BioconductorListerTask", + kwargs=kwargs, + ) + assert res + res.wait() + assert res.successful() + + lister.from_configfile.assert_called_once_with(**kwargs) + lister.run.assert_called_once_with()