From ff05191b7db7b217c8682e9888338b8813e2df6a Mon Sep 17 00:00:00 2001 From: Antoine Lambert Date: Mon, 1 Feb 2021 17:34:10 +0100 Subject: [PATCH] packagist: Reimplement lister using new Lister API The previous implementation was generating tasks for a non-implemented Packagist loader. The new implementation extracts source repository URL, VCS type and last update date for each package referenced by Packagist and sends that information to the scheduler. Packages metadata are retrieved using Packagist API endpoints whose responses are served from static files, which are guaranteed to be efficient on the Packagist side (no dynamic queries). Furthermore, subsequent listing will send the "If-Modified-Since" HTTP header to only retrieve packages metadata updated since the previous listing operation in order to save bandwidth and return only origins which might have new released versions. Closes T2991 --- swh/lister/packagist/__init__.py | 4 +- swh/lister/packagist/lister.py | 238 +++++++++++------ swh/lister/packagist/models.py | 17 -- swh/lister/packagist/tasks.py | 4 +- swh/lister/packagist/tests/conftest.py | 26 -- .../tests/data/den1n_contextmenu.json | 78 ++++++ .../https_packagist.org/packages_list.json | 9 - .../tests/data/ljjackson_linnworks.json | 83 ++++++ .../packagist/tests/data/lky_wx_article.json | 239 ++++++++++++++++++ .../tests/data/spryker-eco_computop-api.json | 141 +++++++++++ swh/lister/packagist/tests/test_lister.py | 205 +++++++++------ swh/lister/packagist/tests/test_tasks.py | 18 +- 12 files changed, 842 insertions(+), 220 deletions(-) delete mode 100644 swh/lister/packagist/models.py delete mode 100644 swh/lister/packagist/tests/conftest.py create mode 100644 swh/lister/packagist/tests/data/den1n_contextmenu.json delete mode 100644 swh/lister/packagist/tests/data/https_packagist.org/packages_list.json create mode 100644 swh/lister/packagist/tests/data/ljjackson_linnworks.json create mode 100644 swh/lister/packagist/tests/data/lky_wx_article.json 
create mode 100644 swh/lister/packagist/tests/data/spryker-eco_computop-api.json diff --git a/swh/lister/packagist/__init__.py b/swh/lister/packagist/__init__.py index 262008f..1f4d208 100644 --- a/swh/lister/packagist/__init__.py +++ b/swh/lister/packagist/__init__.py @@ -1,14 +1,12 @@ -# Copyright (C) 2019 the Software Heritage developers +# Copyright (C) 2019-2021 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information def register(): from .lister import PackagistLister - from .models import PackagistModel return { - "models": [PackagistModel], "lister": PackagistLister, "task_modules": ["%s.tasks" % __name__], } diff --git a/swh/lister/packagist/lister.py b/swh/lister/packagist/lister.py index e49a99c..9378691 100644 --- a/swh/lister/packagist/lister.py +++ b/swh/lister/packagist/lister.py @@ -1,102 +1,182 @@ -# Copyright (C) 2019 The Software Heritage developers +# Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import json +from dataclasses import dataclass +from datetime import datetime, timezone import logging -import random -from typing import Any, Dict, List, Mapping +from typing import Any, Dict, Iterator, List, Optional -from swh.lister.core.lister_transports import ListerOnePageApiTransport -from swh.lister.core.simple_lister import SimpleLister -from swh.scheduler import utils +import iso8601 +import requests -from .models import PackagistModel +from swh.scheduler.interface import SchedulerInterface +from swh.scheduler.model import ListedOrigin + +from .. 
import USER_AGENT +from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) +PackagistPageType = List[str] -def compute_package_url(repo_name: str) -> str: - """Compute packgist package url from repo name. +@dataclass +class PackagistListerState: + """State of Packagist lister""" + + last_listing_date: Optional[datetime] = None + """Last date when packagist lister was executed""" + + +class PackagistLister(Lister[PackagistListerState, PackagistPageType]): """ - return "https://repo.packagist.org/p/%s.json" % repo_name + List all Packagist projects and send associated origins to scheduler. + The lister queries the Packagist API, whose documentation can be found at + https://packagist.org/apidoc. -class PackagistLister(ListerOnePageApiTransport, SimpleLister): - """List packages available in the Packagist package manager. - - The lister sends the request to the url present in the class - variable `PAGE`, to receive a list of all the package names - present in the Packagist package manager. Iterates over all the - packages and constructs the metadata url of the package from - the name of the package and creates a loading task:: - - Task: - Type: load-packagist - Policy: recurring - Args: - - - - Example:: - - Task: - Type: load-packagist - Policy: recurring - Args: - 'hypejunction/hypegamemechanics' - 'https://repo.packagist.org/p/hypejunction/hypegamemechanics.json' - + For each package, its metadata are retrieved using Packagist API endpoints + whose responses are served from static files, which are guaranteed to be + efficient on the Packagist side (no dymamic queries). + Furthermore, subsequent listing will send the "If-Modified-Since" HTTP + header to only retrieve packages metadata updated since the previous listing + operation in order to save bandwidth and return only origins which might have + new released versions. 
""" - MODEL = PackagistModel - LISTER_NAME = "packagist" - PAGE = "https://packagist.org/packages/list.json" - instance = "packagist" + LISTER_NAME = "Packagist" + PACKAGIST_PACKAGES_LIST_URL = "https://packagist.org/packages/list.json" + PACKAGIST_REPO_BASE_URL = "https://repo.packagist.org/p" - def __init__(self, override_config=None): - ListerOnePageApiTransport.__init__(self) - SimpleLister.__init__(self, override_config=override_config) - - def task_dict( - self, origin_type: str, origin_url: str, **kwargs: Mapping[str, str] - ) -> Dict[str, Any]: - """Return task format dict - - This is overridden from the lister_base as more information is - needed for the ingestion task creation. - - """ - return utils.create_task_dict( - "load-%s" % origin_type, - kwargs.get("policy", "recurring"), - kwargs.get("name"), - origin_url, - retries_left=3, + def __init__( + self, scheduler: SchedulerInterface, credentials: CredentialsType = None, + ): + super().__init__( + scheduler=scheduler, + url=self.PACKAGIST_PACKAGES_LIST_URL, + instance="packagist", + credentials=credentials, ) - def list_packages(self, response: Any) -> List[str]: - """List the actual packagist origins from the response. 
+ self.session = requests.Session() + self.session.headers.update( + {"Accept": "application/json", "User-Agent": USER_AGENT} + ) + self.listing_date = datetime.now().astimezone(tz=timezone.utc) + def state_from_dict(self, d: Dict[str, Any]) -> PackagistListerState: + last_listing_date = d.get("last_listing_date") + if last_listing_date is not None: + d["last_listing_date"] = iso8601.parse_date(last_listing_date) + return PackagistListerState(**d) + + def state_to_dict(self, state: PackagistListerState) -> Dict[str, Any]: + d: Dict[str, Optional[str]] = {"last_listing_date": None} + last_listing_date = state.last_listing_date + if last_listing_date is not None: + d["last_listing_date"] = last_listing_date.isoformat() + return d + + def api_request(self, url: str) -> Any: + logger.debug("Fetching URL %s", url) + + response = self.session.get(url) + + if response.status_code not in (200, 304): + logger.warning( + "Unexpected HTTP status code %s on %s: %s", + response.status_code, + response.url, + response.content, + ) + + response.raise_for_status() + + # response is empty when status code is 304 + return response.json() if response.status_code == 200 else {} + + def get_pages(self) -> Iterator[PackagistPageType]: """ - response = json.loads(response.text) - packages = [name for name in response["packageNames"]] - logger.debug("Number of packages: %s", len(packages)) - random.shuffle(packages) - return packages - - def get_model_from_repo(self, repo_name: str) -> Mapping[str, str]: - """Transform from repository representation to model - + Yield a single page listing all Packagist projects. 
""" - url = compute_package_url(repo_name) - return { - "uid": repo_name, - "name": repo_name, - "full_name": repo_name, - "html_url": url, - "origin_url": url, - "origin_type": "packagist", - } + yield self.api_request(self.PACKAGIST_PACKAGES_LIST_URL)["packageNames"] + + def get_origins_from_page(self, page: PackagistPageType) -> Iterator[ListedOrigin]: + """ + Iterate on all Packagist projects and yield ListedOrigin instances. + """ + assert self.lister_obj.id is not None + + # save some bandwidth by only getting packages metadata updated since + # last listing + if self.state.last_listing_date is not None: + if_modified_since = self.state.last_listing_date.strftime( + "%a, %d %b %Y %H:%M:%S GMT" + ) + self.session.headers["If-Modified-Since"] = if_modified_since + + # to ensure origins will not be listed multiple times + origin_urls = set() + + for package_name in page: + try: + metadata = self.api_request( + f"{self.PACKAGIST_REPO_BASE_URL}/{package_name}.json" + ) + if not metadata.get("packages", {}): + # package metadata not updated since last listing + continue + if package_name not in metadata["packages"]: + # missing package metadata in response + continue + versions_info = metadata["packages"][package_name].values() + except requests.exceptions.HTTPError: + # error when getting package metadata (usually 404 when a + # package has been removed), skip it and process next package + continue + + origin_url = None + visit_type = None + last_update = None + + # extract origin url for package, vcs type and latest release date + for version_info in versions_info: + origin_url = version_info.get("source", {}).get("url", "") + if not origin_url: + continue + # can be git, hg or svn + visit_type = version_info.get("source", {}).get("type", "") + dist_time_str = version_info.get("time", "") + if not dist_time_str: + continue + dist_time = iso8601.parse_date(dist_time_str) + if last_update is None or dist_time > last_update: + last_update = dist_time + + # skip 
package with already seen origin url or with missing required info + if visit_type is None or origin_url is None or origin_url in origin_urls: + continue + + # bitbucket closed its mercurial hosting service, those origins can not be + # loaded into the archive anymore + if visit_type == "hg" and origin_url.startswith("https://bitbucket.org/"): + continue + + origin_urls.add(origin_url) + + logger.debug( + "Found package %s last updated on %s", package_name, last_update + ) + + yield ListedOrigin( + lister_id=self.lister_obj.id, + url=origin_url, + visit_type=visit_type, + last_update=last_update, + ) + + def finalize(self) -> None: + self.state.last_listing_date = self.listing_date + self.updated = True diff --git a/swh/lister/packagist/models.py b/swh/lister/packagist/models.py deleted file mode 100644 index 268f884..0000000 --- a/swh/lister/packagist/models.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (C) 2019 the Software Heritage developers -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -from sqlalchemy import Column, String - -from ..core.models import ModelBase - - -class PackagistModel(ModelBase): - """a Packagist repository representation - - """ - - __tablename__ = "packagist_repo" - - uid = Column(String, primary_key=True) diff --git a/swh/lister/packagist/tasks.py b/swh/lister/packagist/tasks.py index 6f6087b..9146e38 100644 --- a/swh/lister/packagist/tasks.py +++ b/swh/lister/packagist/tasks.py @@ -1,4 +1,4 @@ -# Copyright (C) 2019 the Software Heritage developers +# Copyright (C) 2019-2021 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -10,7 +10,7 @@ from .lister import PackagistLister @shared_task(name=__name__ + ".PackagistListerTask") def list_packagist(**lister_args): "List the packagist (php) registry" - PackagistLister(**lister_args).run() + return 
PackagistLister.from_configfile(**lister_args).run().dict() @shared_task(name=__name__ + ".ping") diff --git a/swh/lister/packagist/tests/conftest.py b/swh/lister/packagist/tests/conftest.py deleted file mode 100644 index 4482346..0000000 --- a/swh/lister/packagist/tests/conftest.py +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (C) 2019-2020 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import pytest - - -@pytest.fixture -def lister_under_test(): - return "packagist" - - -@pytest.fixture -def lister_packagist(swh_lister): - # Amend the scheduler with an unknown yet load-packagist task type - swh_lister.scheduler.create_task_type( - { - "type": "load-packagist", - "description": "Load packagist origin", - "backend_name": "swh.loader.package.tasks.LoaderPackagist", - "default_interval": "1 day", - } - ) - - return swh_lister diff --git a/swh/lister/packagist/tests/data/den1n_contextmenu.json b/swh/lister/packagist/tests/data/den1n_contextmenu.json new file mode 100644 index 0000000..d0f6d08 --- /dev/null +++ b/swh/lister/packagist/tests/data/den1n_contextmenu.json @@ -0,0 +1,78 @@ +{ + "packages": { + "den1n/contextmenu": { + "dev-default": { + "name": "den1n/contextmenu", + "description": "Context menu custom element.", + "keywords": [ + "javascript", + "JS", + "contextmenu", + "den1n" + ], + "homepage": "https://bitbucket.org/den1n/contextmenu", + "version": "dev-default", + "version_normalized": "9999999-dev", + "license": [ + "MIT" + ], + "authors": [{ + "name": "Dmitry Kadochnikov", + "email": "iqmass@gmail.com" + }], + "source": { + "type": "hg", + "url": "https://bitbucket.org/den1n/contextmenu", + "reference": "c207786b3dcf90fc7796a99dcb9e5fdb860ef2ba" + }, + "dist": { + "type": "zip", + "url": 
"https://bitbucket.org/den1n/contextmenu/get/c207786b3dcf90fc7796a99dcb9e5fdb860ef2ba.zip", + "reference": "c207786b3dcf90fc7796a99dcb9e5fdb860ef2ba", + "shasum": "" + }, + "type": "library", + "time": "2019-08-27T10:42:55+00:00", + "default-branch": true, + "require": { + "den1n/xelement": "^1.0" + }, + "uid": 4101245 + }, + "v1.0.0": { + "name": "den1n/contextmenu", + "description": "Simple DOM JS context menu.", + "keywords": [ + "javascript", + "JS", + "contextmenu", + "den1n" + ], + "homepage": "https://bitbucket.org/den1n/contextmenu", + "version": "v1.0.0", + "version_normalized": "1.0.0.0", + "license": [ + "MIT" + ], + "authors": [{ + "name": "Dmitry Kadochnikov", + "email": "iqmass@gmail.com" + }], + "source": { + "type": "hg", + "url": "https://bitbucket.org/den1n/contextmenu", + "reference": "278e30a199d1f0e1a8789a4b798814722bd11065" + }, + "dist": { + "type": "zip", + "url": "https://bitbucket.org/den1n/contextmenu/get/278e30a199d1f0e1a8789a4b798814722bd11065.zip", + "reference": "278e30a199d1f0e1a8789a4b798814722bd11065", + "shasum": "" + }, + "type": "library", + "time": "2018-03-07T10:08:41+00:00", + "uid": 1968017 + } + } + } +} \ No newline at end of file diff --git a/swh/lister/packagist/tests/data/https_packagist.org/packages_list.json b/swh/lister/packagist/tests/data/https_packagist.org/packages_list.json deleted file mode 100644 index 2e4843c..0000000 --- a/swh/lister/packagist/tests/data/https_packagist.org/packages_list.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "packageNames": [ - "0.0.0/composer-include-files", - "0.0.0/laravel-env-shim", - "0.0.1/try-make-package", - "0099ff/dialogflowphp", - "00f100/array_dot" - ] -} \ No newline at end of file diff --git a/swh/lister/packagist/tests/data/ljjackson_linnworks.json b/swh/lister/packagist/tests/data/ljjackson_linnworks.json new file mode 100644 index 0000000..ba57a81 --- /dev/null +++ b/swh/lister/packagist/tests/data/ljjackson_linnworks.json @@ -0,0 +1,83 @@ +{ + "packages": { + 
"ljjackson/linnworks": { + "0.1": { + "name": "ljjackson/linnworks", + "description": "A PHP API Integration of Linnworks.", + "keywords": [], + "homepage": "https://github.com/ljjackson", + "version": "0.1", + "version_normalized": "0.1.0.0", + "license": [], + "authors": [{ + "name": "Liam Jackson", + "homepage": "https://github.com/ljjackson", + "role": "Developer" + }], + "source": { + "type": "git", + "url": "https://github.com/ljjackson/linnworks.git", + "reference": "b2d16490823a8a9012a83b80cdcd6a129cfc5dea" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/ljjackson/linnworks/zipball/b2d16490823a8a9012a83b80cdcd6a129cfc5dea", + "reference": "b2d16490823a8a9012a83b80cdcd6a129cfc5dea", + "shasum": "" + }, + "type": "library", + "time": "2018-10-22T19:52:25+00:00", + "autoload": { + "psr-4": { + "LJJackson\\Linnworks\\": "src/" + } + }, + "require": { + "php": "^7.0", + "guzzlehttp/guzzle": "^6.3", + "ext-json": "*" + }, + "uid": 2535139 + }, + "dev-master": { + "name": "ljjackson/linnworks", + "description": "A PHP API Integration of Linnworks.", + "keywords": [], + "homepage": "https://github.com/ljjackson", + "version": "dev-master", + "version_normalized": "9999999-dev", + "license": [], + "authors": [{ + "name": "Liam Jackson", + "homepage": "https://github.com/ljjackson", + "role": "Developer" + }], + "source": { + "type": "git", + "url": "https://github.com/ljjackson/linnworks.git", + "reference": "7c6b1209dc3bafad4284b130bda8450f3478ea26" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/ljjackson/linnworks/zipball/7c6b1209dc3bafad4284b130bda8450f3478ea26", + "reference": "7c6b1209dc3bafad4284b130bda8450f3478ea26", + "shasum": "" + }, + "type": "library", + "time": "2018-11-01T21:45:50+00:00", + "autoload": { + "psr-4": { + "LJJackson\\Linnworks\\": "src/" + } + }, + "require": { + "guzzlehttp/guzzle": "^6.3", + "ext-json": "*", + "php": "^7.1.3", + "nesbot/carbon": "*" + }, + "uid": 2517334 + } + } + } +} 
\ No newline at end of file diff --git a/swh/lister/packagist/tests/data/lky_wx_article.json b/swh/lister/packagist/tests/data/lky_wx_article.json new file mode 100644 index 0000000..5bf3f4c --- /dev/null +++ b/swh/lister/packagist/tests/data/lky_wx_article.json @@ -0,0 +1,239 @@ +{ + "packages": { + "lky/wx_article": { + "1.0": { + "name": "lky/wx_article", + "description": "wx article editor", + "keywords": [ + "laravel", + "WxGzhArticle" + ], + "homepage": "https://github.com/lky/wxgzharticle", + "version": "1.0", + "version_normalized": "1.0.0.0", + "license": [ + "MIT" + ], + "authors": [{ + "name": "lky", + "email": "2747865797@qq.com", + "homepage": "http://lky.kim" + }], + "source": { + "type": "git", + "url": "https://github.com/gitlky/wx_article.git", + "reference": "bd1826f17a42a1d3da44c4562af3be370687466b" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/gitlky/wx_article/zipball/bd1826f17a42a1d3da44c4562af3be370687466b", + "reference": "bd1826f17a42a1d3da44c4562af3be370687466b", + "shasum": "" + }, + "type": "library", + "time": "2018-08-28T06:51:46+00:00", + "autoload": { + "psr-4": { + "lky\\WxGzhArticle\\": "src/" + } + }, + "extra": { + "laravel": { + "providers": [ + "lky\\WxGzhArticle\\WxGzhArticleServiceProvider" + ], + "aliases": { + "WxGzhArticle": "lky\\WxGzhArticle\\Facades\\WxGzhArticle" + } + } + }, + "require": { + "illuminate/support": "~5", + "ixudra/curl": "6.*", + "guzzlehttp/guzzle": "6.*", + "laravel/framework": "5.2.*", + "php": ">=5.6.4" + }, + "require-dev": { + "phpunit/phpunit": "~6.0", + "orchestra/testbench": "~3.0" + }, + "uid": 2493149 + }, + "dev-master": { + "name": "lky/wx_article", + "description": "wx article editor", + "keywords": [ + "laravel", + "WxGzhArticle" + ], + "homepage": "https://github.com/lky/wx_article", + "version": "dev-master", + "version_normalized": "9999999-dev", + "license": [ + "MIT" + ], + "authors": [{ + "name": "lky", + "email": "2747865797@qq.com", + "homepage": 
"http://lky.kim" + }], + "source": { + "type": "git", + "url": "https://github.com/gitlky/wx_article.git", + "reference": "9ef7cddfe1a9715cee52acc7a97d4f51d0f6e2be" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/gitlky/wx_article/zipball/9ef7cddfe1a9715cee52acc7a97d4f51d0f6e2be", + "reference": "9ef7cddfe1a9715cee52acc7a97d4f51d0f6e2be", + "shasum": "" + }, + "type": "library", + "time": "2018-08-30T07:37:09+00:00", + "autoload": { + "psr-4": { + "lky\\WxGzhArticle\\": "src/" + } + }, + "extra": { + "laravel": { + "providers": [ + "lky\\WxGzhArticle\\WxGzhArticleServiceProvider" + ], + "aliases": { + "WxGzhArticle": "lky\\WxGzhArticle\\Facades\\WxGzhArticle" + } + } + }, + "default-branch": true, + "require": { + "ixudra/curl": "6.*", + "guzzlehttp/guzzle": "6.*", + "laravel/framework": ">=5.2.0", + "php": ">=5.6.4" + }, + "require-dev": { + "phpunit/phpunit": "~6.0", + "orchestra/testbench": "~3.0" + }, + "uid": 4096807 + }, + "v1.2": { + "name": "lky/wx_article", + "description": "wx article editor", + "keywords": [ + "laravel", + "WxGzhArticle" + ], + "homepage": "https://github.com/lky/wx_article", + "version": "v1.2", + "version_normalized": "1.2.0.0", + "license": [ + "MIT" + ], + "authors": [{ + "name": "lky", + "email": "2747865797@qq.com", + "homepage": "http://lky.kim" + }], + "source": { + "type": "git", + "url": "https://github.com/gitlky/wx_article.git", + "reference": "d332d20b8d848018c7e6a43e7fe47a78cdb926b7" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/gitlky/wx_article/zipball/d332d20b8d848018c7e6a43e7fe47a78cdb926b7", + "reference": "d332d20b8d848018c7e6a43e7fe47a78cdb926b7", + "shasum": "" + }, + "type": "library", + "autoload": { + "psr-4": { + "lky\\WxGzhArticle\\": "src/" + } + }, + "extra": { + "laravel": { + "providers": [ + "lky\\WxGzhArticle\\WxGzhArticleServiceProvider" + ], + "aliases": { + "WxGzhArticle": "lky\\WxGzhArticle\\Facades\\WxGzhArticle" + } + } + }, + "require": { + 
"ixudra/curl": "6.*", + "guzzlehttp/guzzle": "6.*", + "laravel/framework": ">=5.2.0", + "php": ">=5.6.4" + }, + "require-dev": { + "phpunit/phpunit": "~6.0", + "orchestra/testbench": "~3.0" + }, + "uid": 2493150 + }, + "v1.6": { + "name": "lky/wx_article", + "description": "wx article editor", + "keywords": [ + "laravel", + "WxGzhArticle" + ], + "homepage": "https://github.com/lky/wx_article", + "version": "v1.6", + "version_normalized": "1.6.0.0", + "license": [ + "MIT" + ], + "authors": [{ + "name": "lky", + "email": "2747865797@qq.com", + "homepage": "http://lky.kim" + }], + "source": { + "type": "git", + "url": "https://github.com/gitlky/wx_article.git", + "reference": "d332d20b8d848018c7e6a43e7fe47a78cdb926b7" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/gitlky/wx_article/zipball/d332d20b8d848018c7e6a43e7fe47a78cdb926b7", + "reference": "d332d20b8d848018c7e6a43e7fe47a78cdb926b7", + "shasum": "" + }, + "type": "library", + "time": "2018-08-29T08:26:06+00:00", + "autoload": { + "psr-4": { + "lky\\WxGzhArticle\\": "src/" + } + }, + "extra": { + "laravel": { + "providers": [ + "lky\\WxGzhArticle\\WxGzhArticleServiceProvider" + ], + "aliases": { + "WxGzhArticle": "lky\\WxGzhArticle\\Facades\\WxGzhArticle" + } + } + }, + "require": { + "ixudra/curl": "6.*", + "guzzlehttp/guzzle": "6.*", + "laravel/framework": ">=5.2.0", + "php": ">=5.6.4" + }, + "require-dev": { + "phpunit/phpunit": "~6.0", + "orchestra/testbench": "~3.0" + }, + "uid": 2427550 + } + } + } +} \ No newline at end of file diff --git a/swh/lister/packagist/tests/data/spryker-eco_computop-api.json b/swh/lister/packagist/tests/data/spryker-eco_computop-api.json new file mode 100644 index 0000000..c7d2f16 --- /dev/null +++ b/swh/lister/packagist/tests/data/spryker-eco_computop-api.json @@ -0,0 +1,141 @@ +{ + "packages": { + "spryker-eco/computop-api": { + "1.0.0": { + "name": "spryker-eco/computop-api", + "description": "Computop API Module", + "keywords": [], + "homepage": "", 
+ "version": "1.0.0", + "version_normalized": "1.0.0.0", + "license": [ + "MIT" + ], + "authors": [], + "source": { + "type": "git", + "url": "https://github.com/spryker-eco/computop-api.git", + "reference": "d75dc7d2c80bd93e65081b26433ee559d2c92f0a" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/spryker-eco/computop-api/zipball/d75dc7d2c80bd93e65081b26433ee559d2c92f0a", + "reference": "d75dc7d2c80bd93e65081b26433ee559d2c92f0a", + "shasum": "" + }, + "type": "library", + "time": "2018-08-31T11:51:23+00:00", + "autoload": { + "psr-4": { + "SprykerEco\\": "src/SprykerEco/" + } + }, + "extra": { + "branch-alias": { + "dev-master": "1.0.x-dev" + } + }, + "require": { + "php": ">=7.1", + "spryker/kernel": "^3.0.0", + "spryker/transfer": "^3.0.0", + "spryker/util-text": "^1.0.0", + "spryker/guzzle": "^2.2.0" + }, + "require-dev": { + "spryker/code-sniffer": "dev-master" + }, + "uid": 2432548 + }, + "dev-dev": { + "name": "spryker-eco/computop-api", + "description": "Computop API Module", + "keywords": [], + "homepage": "", + "version": "dev-dev", + "version_normalized": "dev-dev", + "license": [ + "MIT" + ], + "authors": [], + "source": {}, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/spryker-eco/computop-api/zipball/7a695d1e412132296546d072364f410186572790", + "reference": "7a695d1e412132296546d072364f410186572790", + "shasum": "" + }, + "type": "library", + "time": "2018-08-31T11:38:22+00:00", + "autoload": { + "psr-4": { + "SprykerEco\\": "src/SprykerEco/" + } + }, + "extra": { + "branch-alias": { + "dev-master": "1.0.x-dev" + } + }, + "require": { + "php": ">=7.1", + "spryker/kernel": "^3.0.0", + "spryker/transfer": "^3.0.0", + "spryker/util-text": "^1.0.0", + "spryker/guzzle": "^2.2.0" + }, + "require-dev": { + "spryker/code-sniffer": "dev-master" + }, + "uid": 2209824 + }, + "dev-master": { + "name": "spryker-eco/computop-api", + "description": "ComputopApi module", + "keywords": [], + "homepage": "", + "version": 
"dev-master", + "version_normalized": "9999999-dev", + "license": [ + "MIT" + ], + "authors": [], + "source": { + "type": "git", + "url": "https://github.com/spryker-eco/computop-api.git", + "reference": "7ac81d5db52c0639bc06a61a35d7738a964fde88" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/spryker-eco/computop-api/zipball/7ac81d5db52c0639bc06a61a35d7738a964fde88", + "reference": "7ac81d5db52c0639bc06a61a35d7738a964fde88", + "shasum": "" + }, + "type": "library", + "time": "2020-06-22T15:50:29+00:00", + "autoload": { + "psr-4": { + "SprykerEco\\": "src/SprykerEco/" + } + }, + "extra": { + "branch-alias": { + "dev-master": "1.0.x-dev" + } + }, + "default-branch": true, + "require": { + "php": ">=7.1", + "spryker/kernel": "^3.0.0", + "spryker/transfer": "^3.0.0", + "spryker/util-text": "^1.0.0", + "spryker/guzzle": "^2.2.0" + }, + "require-dev": { + "spryker/code-sniffer": "dev-master" + }, + "uid": 4006827 + } + } + } +} \ No newline at end of file diff --git a/swh/lister/packagist/tests/test_lister.py b/swh/lister/packagist/tests/test_lister.py index 808910f..64b4439 100644 --- a/swh/lister/packagist/tests/test_lister.py +++ b/swh/lister/packagist/tests/test_lister.py @@ -1,104 +1,159 @@ -# Copyright (C) 2019 The Software Heritage developers +# Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import unittest -from unittest.mock import patch +import json +from pathlib import Path -import requests_mock +import iso8601 -from swh.lister.core.tests.test_lister import HttpSimpleListerTester -from swh.lister.packagist.lister import PackagistLister, compute_package_url +from swh.lister.packagist.lister import PackagistLister -expected_packages = [ - "0.0.0/composer-include-files", - "0.0.0/laravel-env-shim", - "0.0.1/try-make-package", - 
"0099ff/dialogflowphp", - "00f100/array_dot", -] - -expected_model = { - "uid": "0099ff/dialogflowphp", - "name": "0099ff/dialogflowphp", - "full_name": "0099ff/dialogflowphp", - "html_url": "https://repo.packagist.org/p/0099ff/dialogflowphp.json", - "origin_url": "https://repo.packagist.org/p/0099ff/dialogflowphp.json", - "origin_type": "packagist", +_packages_list = { + "packageNames": [ + "ljjackson/linnworks", + "lky/wx_article", + "spryker-eco/computop-api", + ] } -class PackagistListerTester(HttpSimpleListerTester, unittest.TestCase): - Lister = PackagistLister - PAGE = "https://packagist.org/packages/list.json" - lister_subdir = "packagist" - good_api_response_file = "data/https_packagist.org/packages_list.json" - entries = 5 +def _package_metadata(datadir, package_name): + return json.loads( + Path(datadir, f"{package_name.replace('/', '_')}.json").read_text() + ) - @requests_mock.Mocker() - def test_list_packages(self, http_mocker): - """List packages from simple api page should retrieve all packages within - """ - http_mocker.get(self.PAGE, text=self.mock_response) - fl = self.get_fl() - packages = fl.list_packages(self.get_api_response(0)) +def _package_origin_info(package_name, package_metadata): + origin_url = None + visit_type = None + last_update = None + for version_info in package_metadata["packages"][package_name].values(): + origin_url = version_info["source"].get("url") + visit_type = version_info["source"].get("type") + if "time" in version_info: + version_date = iso8601.parse_date(version_info["time"]) + if last_update is None or version_date > last_update: + last_update = version_date + return origin_url, visit_type, last_update - for package in expected_packages: - assert package in packages - def test_transport_response_simplified(self): - """Test model created by the lister +def _request_without_if_modified_since(request): + return request.headers.get("If-Modified-Since") is None - """ - fl = self.get_fl() - model = 
fl.transport_response_simplified(["0099ff/dialogflowphp"]) - assert len(model) == 1 - for key, values in model[0].items(): - assert values == expected_model[key] - @patch("swh.lister.packagist.lister.utils.create_task_dict") - def test_task_dict(self, mock_create_tasks): - """Test the task creation of lister +def _request_with_if_modified_since(request): + return request.headers.get("If-Modified-Since") is not None - """ - fl = self.get_fl() - fl.task_dict( - origin_type="packagist", origin_url="https://abc", name="test_pack" + +def test_packagist_lister(swh_scheduler, requests_mock, datadir): + # first listing, should return one origin per package + lister = PackagistLister(scheduler=swh_scheduler) + requests_mock.get(lister.PACKAGIST_PACKAGES_LIST_URL, json=_packages_list) + packages_metadata = {} + for package_name in _packages_list["packageNames"]: + metadata = _package_metadata(datadir, package_name) + packages_metadata[package_name] = metadata + requests_mock.get( + f"{lister.PACKAGIST_REPO_BASE_URL}/{package_name}.json", + json=metadata, + additional_matcher=_request_without_if_modified_since, ) - mock_create_tasks.assert_called_once_with( - "load-packagist", "recurring", "test_pack", "https://abc", retries_left=3 + stats = lister.run() + + assert stats.pages == 1 + assert stats.origins == len(_packages_list["packageNames"]) + assert lister.updated + + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results + + for package_name, package_metadata in packages_metadata.items(): + origin_url, visit_type, last_update = _package_origin_info( + package_name, package_metadata + ) + filtered_origins = [o for o in scheduler_origins if o.url == origin_url] + assert filtered_origins + assert filtered_origins[0].visit_type == visit_type + assert filtered_origins[0].last_update == last_update + + # second listing, should return 0 origins as no package metadata + # has been updated since first listing + lister = 
PackagistLister(scheduler=swh_scheduler) + for package_name in _packages_list["packageNames"]: + requests_mock.get( + f"{lister.PACKAGIST_REPO_BASE_URL}/{package_name}.json", + additional_matcher=_request_with_if_modified_since, + status_code=304, ) + assert lister.get_state_from_scheduler().last_listing_date is not None -def test_compute_package_url(): - expected_url = "https://repo.packagist.org/p/hello.json" - actual_url = compute_package_url("hello") - assert actual_url == expected_url + stats = lister.run() + + assert stats.pages == 1 + assert stats.origins == 0 + assert lister.updated -def test_packagist_lister(lister_packagist, requests_mock_datadir): - lister_packagist.run() +def test_packagist_lister_missing_metadata(swh_scheduler, requests_mock, datadir): + lister = PackagistLister(scheduler=swh_scheduler) + requests_mock.get(lister.PACKAGIST_PACKAGES_LIST_URL, json=_packages_list) + for package_name in _packages_list["packageNames"]: + requests_mock.get( + f"{lister.PACKAGIST_REPO_BASE_URL}/{package_name}.json", + additional_matcher=_request_without_if_modified_since, + status_code=404, + ) - r = lister_packagist.scheduler.search_tasks(task_type="load-packagist") - assert len(r) == 5 + stats = lister.run() - for row in r: - assert row["type"] == "load-packagist" - # arguments check - args = row["arguments"]["args"] - assert len(args) == 2 + assert stats.pages == 1 + assert stats.origins == 0 - package = args[0] - url = args[1] - expected_url = compute_package_url(package) - assert url == expected_url +def test_packagist_lister_empty_metadata(swh_scheduler, requests_mock, datadir): + lister = PackagistLister(scheduler=swh_scheduler) + requests_mock.get(lister.PACKAGIST_PACKAGES_LIST_URL, json=_packages_list) + for package_name in _packages_list["packageNames"]: + requests_mock.get( + f"{lister.PACKAGIST_REPO_BASE_URL}/{package_name}.json", + additional_matcher=_request_without_if_modified_since, + json={"packages": {}}, + ) - # kwargs - kwargs = 
row["arguments"]["kwargs"] - assert kwargs == {} + stats = lister.run() - assert row["policy"] == "recurring" - assert row["priority"] is None + assert stats.pages == 1 + assert stats.origins == 0 + + +def test_packagist_lister_package_with_bitbucket_hg_origin( + swh_scheduler, requests_mock, datadir +): + package_name = "den1n/contextmenu" + lister = PackagistLister(scheduler=swh_scheduler) + requests_mock.get( + lister.PACKAGIST_PACKAGES_LIST_URL, json={"packageNames": [package_name]} + ) + requests_mock.get( + f"{lister.PACKAGIST_REPO_BASE_URL}/{package_name}.json", + additional_matcher=_request_without_if_modified_since, + json=_package_metadata(datadir, package_name), + ) + + stats = lister.run() + + assert stats.pages == 1 + assert stats.origins == 0 + + +def test_lister_from_configfile(swh_scheduler_config, mocker): + load_from_envvar = mocker.patch("swh.lister.pattern.load_from_envvar") + load_from_envvar.return_value = { + "scheduler": {"cls": "local", **swh_scheduler_config}, + "credentials": {}, + } + lister = PackagistLister.from_configfile() + assert lister.scheduler is not None + assert lister.credentials is not None diff --git a/swh/lister/packagist/tests/test_tasks.py b/swh/lister/packagist/tests/test_tasks.py index 6c5d15d..9db88e3 100644 --- a/swh/lister/packagist/tests/test_tasks.py +++ b/swh/lister/packagist/tests/test_tasks.py @@ -1,8 +1,8 @@ -# Copyright (C) 2019-2020 the Software Heritage developers +# Copyright (C) 2019-2021 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from unittest.mock import patch +from swh.lister.pattern import ListerStats def test_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker): @@ -13,11 +13,11 @@ def test_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker): assert res.result == "OK" -@patch("swh.lister.packagist.tasks.PackagistLister") -def test_lister(lister, swh_scheduler_celery_app, 
swh_scheduler_celery_worker): - # setup the mocked PackagistLister - lister.return_value = lister - lister.run.return_value = None +def test_lister(swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker): + lister = mocker.patch("swh.lister.packagist.tasks.PackagistLister") + lister.from_configfile.return_value = lister + stats = ListerStats(pages=1, origins=286500) + lister.run.return_value = stats res = swh_scheduler_celery_app.send_task( "swh.lister.packagist.tasks.PackagistListerTask" @@ -25,7 +25,7 @@ def test_lister(lister, swh_scheduler_celery_app, swh_scheduler_celery_worker): assert res res.wait() assert res.successful() + assert res.result == stats.dict() - lister.assert_called_once_with() - lister.db_last_index.assert_not_called() + lister.from_configfile.assert_called_once_with() lister.run.assert_called_once_with()