packagist: Reimplement lister using new Lister API

The previous implementation was generating tasks for a Packagist loader
that was never implemented.

The new implementation extracts the source repository URL, VCS type and
last update date for each package referenced by Packagist and sends
this information to the scheduler.

Package metadata are retrieved using Packagist API endpoints whose
responses are served from static files, which are guaranteed to be
efficient on the Packagist side (no dynamic queries).
Furthermore, subsequent listings will send the "If-Modified-Since" HTTP
header to only retrieve package metadata updated since the previous
listing operation, in order to save bandwidth and return only origins
which might have newly released versions.

Closes T2991
This commit is contained in:
Antoine Lambert 2021-02-01 17:34:10 +01:00
parent 82ab96ad06
commit ff05191b7d
12 changed files with 842 additions and 220 deletions

View file

@ -1,14 +1,12 @@
# Copyright (C) 2019 the Software Heritage developers
# Copyright (C) 2019-2021 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
def register():
from .lister import PackagistLister
from .models import PackagistModel
return {
"models": [PackagistModel],
"lister": PackagistLister,
"task_modules": ["%s.tasks" % __name__],
}

View file

@ -1,102 +1,182 @@
# Copyright (C) 2019 The Software Heritage developers
# Copyright (C) 2019-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import json
from dataclasses import dataclass
from datetime import datetime, timezone
import logging
import random
from typing import Any, Dict, List, Mapping
from typing import Any, Dict, Iterator, List, Optional
from swh.lister.core.lister_transports import ListerOnePageApiTransport
from swh.lister.core.simple_lister import SimpleLister
from swh.scheduler import utils
import iso8601
import requests
from .models import PackagistModel
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
from .. import USER_AGENT
from ..pattern import CredentialsType, Lister
logger = logging.getLogger(__name__)
PackagistPageType = List[str]
def compute_package_url(repo_name: str) -> str:
"""Compute packgist package url from repo name.
@dataclass
class PackagistListerState:
    """State of Packagist lister

    Attributes:
        last_listing_date: last date when the Packagist lister was executed,
            ``None`` by default.
    """

    # Last date when packagist lister was executed
    last_listing_date: Optional[datetime] = None
class PackagistLister(Lister[PackagistListerState, PackagistPageType]):
"""
return "https://repo.packagist.org/p/%s.json" % repo_name
List all Packagist projects and send associated origins to scheduler.
The lister queries the Packagist API, whose documentation can be found at
https://packagist.org/apidoc.
class PackagistLister(ListerOnePageApiTransport, SimpleLister):
"""List packages available in the Packagist package manager.
The lister sends the request to the url present in the class
variable `PAGE`, to receive a list of all the package names
present in the Packagist package manager. Iterates over all the
packages and constructs the metadata url of the package from
the name of the package and creates a loading task::
Task:
Type: load-packagist
Policy: recurring
Args:
<package_name>
<package_metadata_url>
Example::
Task:
Type: load-packagist
Policy: recurring
Args:
'hypejunction/hypegamemechanics'
'https://repo.packagist.org/p/hypejunction/hypegamemechanics.json'
For each package, its metadata are retrieved using Packagist API endpoints
whose responses are served from static files, which are guaranteed to be
efficient on the Packagist side (no dynamic queries).
Furthermore, subsequent listing will send the "If-Modified-Since" HTTP
header to only retrieve packages metadata updated since the previous listing
operation in order to save bandwidth and return only origins which might have
new released versions.
"""
MODEL = PackagistModel
LISTER_NAME = "packagist"
PAGE = "https://packagist.org/packages/list.json"
instance = "packagist"
LISTER_NAME = "Packagist"
PACKAGIST_PACKAGES_LIST_URL = "https://packagist.org/packages/list.json"
PACKAGIST_REPO_BASE_URL = "https://repo.packagist.org/p"
def __init__(self, override_config=None):
ListerOnePageApiTransport.__init__(self)
SimpleLister.__init__(self, override_config=override_config)
def task_dict(
self, origin_type: str, origin_url: str, **kwargs: Mapping[str, str]
) -> Dict[str, Any]:
"""Return task format dict
This is overridden from the lister_base as more information is
needed for the ingestion task creation.
"""
return utils.create_task_dict(
"load-%s" % origin_type,
kwargs.get("policy", "recurring"),
kwargs.get("name"),
origin_url,
retries_left=3,
def __init__(
self, scheduler: SchedulerInterface, credentials: CredentialsType = None,
):
super().__init__(
scheduler=scheduler,
url=self.PACKAGIST_PACKAGES_LIST_URL,
instance="packagist",
credentials=credentials,
)
def list_packages(self, response: Any) -> List[str]:
"""List the actual packagist origins from the response.
self.session = requests.Session()
self.session.headers.update(
{"Accept": "application/json", "User-Agent": USER_AGENT}
)
self.listing_date = datetime.now().astimezone(tz=timezone.utc)
def state_from_dict(self, d: Dict[str, Any]) -> PackagistListerState:
    """Build the lister state from its dict representation.

    The ``last_listing_date`` entry, when present and not ``None``, is
    parsed with ``iso8601.parse_date`` into a ``datetime``.

    Args:
        d: dict representation of the state (see ``state_to_dict``); it is
            not modified by this method

    Returns:
        the corresponding ``PackagistListerState`` instance
    """
    # work on a shallow copy so the caller's dict is not mutated in place
    state_dict = dict(d)
    last_listing_date = state_dict.get("last_listing_date")
    if last_listing_date is not None:
        state_dict["last_listing_date"] = iso8601.parse_date(last_listing_date)
    return PackagistListerState(**state_dict)
def state_to_dict(self, state: PackagistListerState) -> Dict[str, Any]:
    """Serialize the lister state to a dict.

    The returned dict always holds a ``last_listing_date`` key, whose value
    is the ISO formatted listing date or ``None`` when no date is set.
    """
    listing_date = state.last_listing_date
    serialized_date: Optional[str] = (
        None if listing_date is None else listing_date.isoformat()
    )
    return {"last_listing_date": serialized_date}
def api_request(self, url: str) -> Any:
    """Send a GET request to ``url`` through the lister HTTP session.

    A warning is logged for any status code other than 200 and 304, then
    ``raise_for_status`` is called on the response.

    Returns:
        the decoded JSON body when the status code is 200, an empty dict
        otherwise
    """
    logger.debug("Fetching URL %s", url)
    resp = self.session.get(url)
    status = resp.status_code

    if status != 200 and status != 304:
        logger.warning(
            "Unexpected HTTP status code %s on %s: %s",
            status,
            resp.url,
            resp.content,
        )
    resp.raise_for_status()

    if status == 200:
        return resp.json()
    # response is empty when status code is 304
    return {}
def get_pages(self) -> Iterator[PackagistPageType]:
"""
response = json.loads(response.text)
packages = [name for name in response["packageNames"]]
logger.debug("Number of packages: %s", len(packages))
random.shuffle(packages)
return packages
def get_model_from_repo(self, repo_name: str) -> Mapping[str, str]:
"""Transform from repository representation to model
Yield a single page listing all Packagist projects.
"""
url = compute_package_url(repo_name)
return {
"uid": repo_name,
"name": repo_name,
"full_name": repo_name,
"html_url": url,
"origin_url": url,
"origin_type": "packagist",
}
yield self.api_request(self.PACKAGIST_PACKAGES_LIST_URL)["packageNames"]
def get_origins_from_page(self, page: PackagistPageType) -> Iterator[ListedOrigin]:
    """
    Iterate on all Packagist projects and yield ListedOrigin instances.

    For each package name in ``page``, the package metadata are fetched from
    ``PACKAGIST_REPO_BASE_URL`` and the source repository URL, the VCS type
    and the most recent release date are extracted from its versions.

    A package is skipped when:

    - fetching its metadata raises an HTTP error,
    - its metadata are empty or do not reference the package,
    - no version provides both a source URL and a VCS type,
    - its source URL was already yielded for this page,
    - it is a mercurial repository hosted on Bitbucket.

    Args:
        page: list of Packagist package names

    Yields:
        a ``ListedOrigin`` for each distinct source repository URL
    """
    assert self.lister_obj.id is not None

    # save some bandwidth by only getting packages metadata updated since
    # last listing
    if self.state.last_listing_date is not None:
        if_modified_since = self.state.last_listing_date.strftime(
            "%a, %d %b %Y %H:%M:%S GMT"
        )
        self.session.headers["If-Modified-Since"] = if_modified_since

    # to ensure origins will not be listed multiple times
    origin_urls = set()

    for package_name in page:
        try:
            metadata = self.api_request(
                f"{self.PACKAGIST_REPO_BASE_URL}/{package_name}.json"
            )
        except requests.exceptions.HTTPError:
            # error when getting package metadata (usually 404 when a
            # package has been removed), skip it and process next package
            continue

        # empty when package metadata were not updated since last listing
        packages = metadata.get("packages") or {}
        if package_name not in packages:
            # missing package metadata in response
            continue
        versions_info = packages[package_name].values()

        origin_url = None
        visit_type = None
        last_update = None

        # extract origin url for package, vcs type and latest release date
        for version_info in versions_info:
            # some versions have no usable source info (e.g. an empty
            # "source" object): ignore them instead of overwriting the
            # values gathered from the other versions
            source = version_info.get("source") or {}
            source_url = source.get("url")
            # can be git, hg or svn
            source_type = source.get("type")
            if not source_url or not source_type:
                continue
            origin_url = source_url
            visit_type = source_type
            dist_time_str = version_info.get("time")
            if not dist_time_str:
                continue
            dist_time = iso8601.parse_date(dist_time_str)
            if last_update is None or dist_time > last_update:
                last_update = dist_time

        # skip package with already seen origin url or with missing required info
        if not origin_url or not visit_type or origin_url in origin_urls:
            continue

        # bitbucket closed its mercurial hosting service, those origins can not be
        # loaded into the archive anymore
        if visit_type == "hg" and origin_url.startswith("https://bitbucket.org/"):
            continue

        origin_urls.add(origin_url)

        logger.debug(
            "Found package %s last updated on %s", package_name, last_update
        )

        yield ListedOrigin(
            lister_id=self.lister_obj.id,
            url=origin_url,
            visit_type=visit_type,
            last_update=last_update,
        )
def finalize(self) -> None:
    """Flag the lister as updated and store the date of this listing
    as the last listing date in the lister state."""
    self.updated = True
    self.state.last_listing_date = self.listing_date

View file

@ -1,17 +0,0 @@
# Copyright (C) 2019 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from sqlalchemy import Column, String
from ..core.models import ModelBase
class PackagistModel(ModelBase):
"""a Packagist repository representation
"""
__tablename__ = "packagist_repo"
uid = Column(String, primary_key=True)

View file

@ -1,4 +1,4 @@
# Copyright (C) 2019 the Software Heritage developers
# Copyright (C) 2019-2021 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@ -10,7 +10,7 @@ from .lister import PackagistLister
@shared_task(name=__name__ + ".PackagistListerTask")
def list_packagist(**lister_args):
"List the packagist (php) registry"
PackagistLister(**lister_args).run()
return PackagistLister.from_configfile(**lister_args).run().dict()
@shared_task(name=__name__ + ".ping")

View file

@ -1,26 +0,0 @@
# Copyright (C) 2019-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import pytest
@pytest.fixture
def lister_under_test():
return "packagist"
@pytest.fixture
def lister_packagist(swh_lister):
# Amend the scheduler with an unknown yet load-packagist task type
swh_lister.scheduler.create_task_type(
{
"type": "load-packagist",
"description": "Load packagist origin",
"backend_name": "swh.loader.package.tasks.LoaderPackagist",
"default_interval": "1 day",
}
)
return swh_lister

View file

@ -0,0 +1,78 @@
{
"packages": {
"den1n/contextmenu": {
"dev-default": {
"name": "den1n/contextmenu",
"description": "Context menu custom element.",
"keywords": [
"javascript",
"JS",
"contextmenu",
"den1n"
],
"homepage": "https://bitbucket.org/den1n/contextmenu",
"version": "dev-default",
"version_normalized": "9999999-dev",
"license": [
"MIT"
],
"authors": [{
"name": "Dmitry Kadochnikov",
"email": "iqmass@gmail.com"
}],
"source": {
"type": "hg",
"url": "https://bitbucket.org/den1n/contextmenu",
"reference": "c207786b3dcf90fc7796a99dcb9e5fdb860ef2ba"
},
"dist": {
"type": "zip",
"url": "https://bitbucket.org/den1n/contextmenu/get/c207786b3dcf90fc7796a99dcb9e5fdb860ef2ba.zip",
"reference": "c207786b3dcf90fc7796a99dcb9e5fdb860ef2ba",
"shasum": ""
},
"type": "library",
"time": "2019-08-27T10:42:55+00:00",
"default-branch": true,
"require": {
"den1n/xelement": "^1.0"
},
"uid": 4101245
},
"v1.0.0": {
"name": "den1n/contextmenu",
"description": "Simple DOM JS context menu.",
"keywords": [
"javascript",
"JS",
"contextmenu",
"den1n"
],
"homepage": "https://bitbucket.org/den1n/contextmenu",
"version": "v1.0.0",
"version_normalized": "1.0.0.0",
"license": [
"MIT"
],
"authors": [{
"name": "Dmitry Kadochnikov",
"email": "iqmass@gmail.com"
}],
"source": {
"type": "hg",
"url": "https://bitbucket.org/den1n/contextmenu",
"reference": "278e30a199d1f0e1a8789a4b798814722bd11065"
},
"dist": {
"type": "zip",
"url": "https://bitbucket.org/den1n/contextmenu/get/278e30a199d1f0e1a8789a4b798814722bd11065.zip",
"reference": "278e30a199d1f0e1a8789a4b798814722bd11065",
"shasum": ""
},
"type": "library",
"time": "2018-03-07T10:08:41+00:00",
"uid": 1968017
}
}
}
}

View file

@ -1,9 +0,0 @@
{
"packageNames": [
"0.0.0/composer-include-files",
"0.0.0/laravel-env-shim",
"0.0.1/try-make-package",
"0099ff/dialogflowphp",
"00f100/array_dot"
]
}

View file

@ -0,0 +1,83 @@
{
"packages": {
"ljjackson/linnworks": {
"0.1": {
"name": "ljjackson/linnworks",
"description": "A PHP API Integration of Linnworks.",
"keywords": [],
"homepage": "https://github.com/ljjackson",
"version": "0.1",
"version_normalized": "0.1.0.0",
"license": [],
"authors": [{
"name": "Liam Jackson",
"homepage": "https://github.com/ljjackson",
"role": "Developer"
}],
"source": {
"type": "git",
"url": "https://github.com/ljjackson/linnworks.git",
"reference": "b2d16490823a8a9012a83b80cdcd6a129cfc5dea"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/ljjackson/linnworks/zipball/b2d16490823a8a9012a83b80cdcd6a129cfc5dea",
"reference": "b2d16490823a8a9012a83b80cdcd6a129cfc5dea",
"shasum": ""
},
"type": "library",
"time": "2018-10-22T19:52:25+00:00",
"autoload": {
"psr-4": {
"LJJackson\\Linnworks\\": "src/"
}
},
"require": {
"php": "^7.0",
"guzzlehttp/guzzle": "^6.3",
"ext-json": "*"
},
"uid": 2535139
},
"dev-master": {
"name": "ljjackson/linnworks",
"description": "A PHP API Integration of Linnworks.",
"keywords": [],
"homepage": "https://github.com/ljjackson",
"version": "dev-master",
"version_normalized": "9999999-dev",
"license": [],
"authors": [{
"name": "Liam Jackson",
"homepage": "https://github.com/ljjackson",
"role": "Developer"
}],
"source": {
"type": "git",
"url": "https://github.com/ljjackson/linnworks.git",
"reference": "7c6b1209dc3bafad4284b130bda8450f3478ea26"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/ljjackson/linnworks/zipball/7c6b1209dc3bafad4284b130bda8450f3478ea26",
"reference": "7c6b1209dc3bafad4284b130bda8450f3478ea26",
"shasum": ""
},
"type": "library",
"time": "2018-11-01T21:45:50+00:00",
"autoload": {
"psr-4": {
"LJJackson\\Linnworks\\": "src/"
}
},
"require": {
"guzzlehttp/guzzle": "^6.3",
"ext-json": "*",
"php": "^7.1.3",
"nesbot/carbon": "*"
},
"uid": 2517334
}
}
}
}

View file

@ -0,0 +1,239 @@
{
"packages": {
"lky/wx_article": {
"1.0": {
"name": "lky/wx_article",
"description": "wx article editor",
"keywords": [
"laravel",
"WxGzhArticle"
],
"homepage": "https://github.com/lky/wxgzharticle",
"version": "1.0",
"version_normalized": "1.0.0.0",
"license": [
"MIT"
],
"authors": [{
"name": "lky",
"email": "2747865797@qq.com",
"homepage": "http://lky.kim"
}],
"source": {
"type": "git",
"url": "https://github.com/gitlky/wx_article.git",
"reference": "bd1826f17a42a1d3da44c4562af3be370687466b"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/gitlky/wx_article/zipball/bd1826f17a42a1d3da44c4562af3be370687466b",
"reference": "bd1826f17a42a1d3da44c4562af3be370687466b",
"shasum": ""
},
"type": "library",
"time": "2018-08-28T06:51:46+00:00",
"autoload": {
"psr-4": {
"lky\\WxGzhArticle\\": "src/"
}
},
"extra": {
"laravel": {
"providers": [
"lky\\WxGzhArticle\\WxGzhArticleServiceProvider"
],
"aliases": {
"WxGzhArticle": "lky\\WxGzhArticle\\Facades\\WxGzhArticle"
}
}
},
"require": {
"illuminate/support": "~5",
"ixudra/curl": "6.*",
"guzzlehttp/guzzle": "6.*",
"laravel/framework": "5.2.*",
"php": ">=5.6.4"
},
"require-dev": {
"phpunit/phpunit": "~6.0",
"orchestra/testbench": "~3.0"
},
"uid": 2493149
},
"dev-master": {
"name": "lky/wx_article",
"description": "wx article editor",
"keywords": [
"laravel",
"WxGzhArticle"
],
"homepage": "https://github.com/lky/wx_article",
"version": "dev-master",
"version_normalized": "9999999-dev",
"license": [
"MIT"
],
"authors": [{
"name": "lky",
"email": "2747865797@qq.com",
"homepage": "http://lky.kim"
}],
"source": {
"type": "git",
"url": "https://github.com/gitlky/wx_article.git",
"reference": "9ef7cddfe1a9715cee52acc7a97d4f51d0f6e2be"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/gitlky/wx_article/zipball/9ef7cddfe1a9715cee52acc7a97d4f51d0f6e2be",
"reference": "9ef7cddfe1a9715cee52acc7a97d4f51d0f6e2be",
"shasum": ""
},
"type": "library",
"time": "2018-08-30T07:37:09+00:00",
"autoload": {
"psr-4": {
"lky\\WxGzhArticle\\": "src/"
}
},
"extra": {
"laravel": {
"providers": [
"lky\\WxGzhArticle\\WxGzhArticleServiceProvider"
],
"aliases": {
"WxGzhArticle": "lky\\WxGzhArticle\\Facades\\WxGzhArticle"
}
}
},
"default-branch": true,
"require": {
"ixudra/curl": "6.*",
"guzzlehttp/guzzle": "6.*",
"laravel/framework": ">=5.2.0",
"php": ">=5.6.4"
},
"require-dev": {
"phpunit/phpunit": "~6.0",
"orchestra/testbench": "~3.0"
},
"uid": 4096807
},
"v1.2": {
"name": "lky/wx_article",
"description": "wx article editor",
"keywords": [
"laravel",
"WxGzhArticle"
],
"homepage": "https://github.com/lky/wx_article",
"version": "v1.2",
"version_normalized": "1.2.0.0",
"license": [
"MIT"
],
"authors": [{
"name": "lky",
"email": "2747865797@qq.com",
"homepage": "http://lky.kim"
}],
"source": {
"type": "git",
"url": "https://github.com/gitlky/wx_article.git",
"reference": "d332d20b8d848018c7e6a43e7fe47a78cdb926b7"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/gitlky/wx_article/zipball/d332d20b8d848018c7e6a43e7fe47a78cdb926b7",
"reference": "d332d20b8d848018c7e6a43e7fe47a78cdb926b7",
"shasum": ""
},
"type": "library",
"autoload": {
"psr-4": {
"lky\\WxGzhArticle\\": "src/"
}
},
"extra": {
"laravel": {
"providers": [
"lky\\WxGzhArticle\\WxGzhArticleServiceProvider"
],
"aliases": {
"WxGzhArticle": "lky\\WxGzhArticle\\Facades\\WxGzhArticle"
}
}
},
"require": {
"ixudra/curl": "6.*",
"guzzlehttp/guzzle": "6.*",
"laravel/framework": ">=5.2.0",
"php": ">=5.6.4"
},
"require-dev": {
"phpunit/phpunit": "~6.0",
"orchestra/testbench": "~3.0"
},
"uid": 2493150
},
"v1.6": {
"name": "lky/wx_article",
"description": "wx article editor",
"keywords": [
"laravel",
"WxGzhArticle"
],
"homepage": "https://github.com/lky/wx_article",
"version": "v1.6",
"version_normalized": "1.6.0.0",
"license": [
"MIT"
],
"authors": [{
"name": "lky",
"email": "2747865797@qq.com",
"homepage": "http://lky.kim"
}],
"source": {
"type": "git",
"url": "https://github.com/gitlky/wx_article.git",
"reference": "d332d20b8d848018c7e6a43e7fe47a78cdb926b7"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/gitlky/wx_article/zipball/d332d20b8d848018c7e6a43e7fe47a78cdb926b7",
"reference": "d332d20b8d848018c7e6a43e7fe47a78cdb926b7",
"shasum": ""
},
"type": "library",
"time": "2018-08-29T08:26:06+00:00",
"autoload": {
"psr-4": {
"lky\\WxGzhArticle\\": "src/"
}
},
"extra": {
"laravel": {
"providers": [
"lky\\WxGzhArticle\\WxGzhArticleServiceProvider"
],
"aliases": {
"WxGzhArticle": "lky\\WxGzhArticle\\Facades\\WxGzhArticle"
}
}
},
"require": {
"ixudra/curl": "6.*",
"guzzlehttp/guzzle": "6.*",
"laravel/framework": ">=5.2.0",
"php": ">=5.6.4"
},
"require-dev": {
"phpunit/phpunit": "~6.0",
"orchestra/testbench": "~3.0"
},
"uid": 2427550
}
}
}
}

View file

@ -0,0 +1,141 @@
{
"packages": {
"spryker-eco/computop-api": {
"1.0.0": {
"name": "spryker-eco/computop-api",
"description": "Computop API Module",
"keywords": [],
"homepage": "",
"version": "1.0.0",
"version_normalized": "1.0.0.0",
"license": [
"MIT"
],
"authors": [],
"source": {
"type": "git",
"url": "https://github.com/spryker-eco/computop-api.git",
"reference": "d75dc7d2c80bd93e65081b26433ee559d2c92f0a"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/spryker-eco/computop-api/zipball/d75dc7d2c80bd93e65081b26433ee559d2c92f0a",
"reference": "d75dc7d2c80bd93e65081b26433ee559d2c92f0a",
"shasum": ""
},
"type": "library",
"time": "2018-08-31T11:51:23+00:00",
"autoload": {
"psr-4": {
"SprykerEco\\": "src/SprykerEco/"
}
},
"extra": {
"branch-alias": {
"dev-master": "1.0.x-dev"
}
},
"require": {
"php": ">=7.1",
"spryker/kernel": "^3.0.0",
"spryker/transfer": "^3.0.0",
"spryker/util-text": "^1.0.0",
"spryker/guzzle": "^2.2.0"
},
"require-dev": {
"spryker/code-sniffer": "dev-master"
},
"uid": 2432548
},
"dev-dev": {
"name": "spryker-eco/computop-api",
"description": "Computop API Module",
"keywords": [],
"homepage": "",
"version": "dev-dev",
"version_normalized": "dev-dev",
"license": [
"MIT"
],
"authors": [],
"source": {},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/spryker-eco/computop-api/zipball/7a695d1e412132296546d072364f410186572790",
"reference": "7a695d1e412132296546d072364f410186572790",
"shasum": ""
},
"type": "library",
"time": "2018-08-31T11:38:22+00:00",
"autoload": {
"psr-4": {
"SprykerEco\\": "src/SprykerEco/"
}
},
"extra": {
"branch-alias": {
"dev-master": "1.0.x-dev"
}
},
"require": {
"php": ">=7.1",
"spryker/kernel": "^3.0.0",
"spryker/transfer": "^3.0.0",
"spryker/util-text": "^1.0.0",
"spryker/guzzle": "^2.2.0"
},
"require-dev": {
"spryker/code-sniffer": "dev-master"
},
"uid": 2209824
},
"dev-master": {
"name": "spryker-eco/computop-api",
"description": "ComputopApi module",
"keywords": [],
"homepage": "",
"version": "dev-master",
"version_normalized": "9999999-dev",
"license": [
"MIT"
],
"authors": [],
"source": {
"type": "git",
"url": "https://github.com/spryker-eco/computop-api.git",
"reference": "7ac81d5db52c0639bc06a61a35d7738a964fde88"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/spryker-eco/computop-api/zipball/7ac81d5db52c0639bc06a61a35d7738a964fde88",
"reference": "7ac81d5db52c0639bc06a61a35d7738a964fde88",
"shasum": ""
},
"type": "library",
"time": "2020-06-22T15:50:29+00:00",
"autoload": {
"psr-4": {
"SprykerEco\\": "src/SprykerEco/"
}
},
"extra": {
"branch-alias": {
"dev-master": "1.0.x-dev"
}
},
"default-branch": true,
"require": {
"php": ">=7.1",
"spryker/kernel": "^3.0.0",
"spryker/transfer": "^3.0.0",
"spryker/util-text": "^1.0.0",
"spryker/guzzle": "^2.2.0"
},
"require-dev": {
"spryker/code-sniffer": "dev-master"
},
"uid": 4006827
}
}
}
}

View file

@ -1,104 +1,159 @@
# Copyright (C) 2019 The Software Heritage developers
# Copyright (C) 2019-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import unittest
from unittest.mock import patch
import json
from pathlib import Path
import requests_mock
import iso8601
from swh.lister.core.tests.test_lister import HttpSimpleListerTester
from swh.lister.packagist.lister import PackagistLister, compute_package_url
from swh.lister.packagist.lister import PackagistLister
expected_packages = [
"0.0.0/composer-include-files",
"0.0.0/laravel-env-shim",
"0.0.1/try-make-package",
"0099ff/dialogflowphp",
"00f100/array_dot",
]
expected_model = {
"uid": "0099ff/dialogflowphp",
"name": "0099ff/dialogflowphp",
"full_name": "0099ff/dialogflowphp",
"html_url": "https://repo.packagist.org/p/0099ff/dialogflowphp.json",
"origin_url": "https://repo.packagist.org/p/0099ff/dialogflowphp.json",
"origin_type": "packagist",
_packages_list = {
"packageNames": [
"ljjackson/linnworks",
"lky/wx_article",
"spryker-eco/computop-api",
]
}
class PackagistListerTester(HttpSimpleListerTester, unittest.TestCase):
Lister = PackagistLister
PAGE = "https://packagist.org/packages/list.json"
lister_subdir = "packagist"
good_api_response_file = "data/https_packagist.org/packages_list.json"
entries = 5
def _package_metadata(datadir, package_name):
    """Load the JSON metadata of a package from the test data directory.

    The file is looked up in ``datadir`` under the package name with every
    ``/`` replaced by ``_`` and a ``.json`` suffix.
    """
    file_name = f"{package_name.replace('/', '_')}.json"
    metadata_path = Path(datadir, file_name)
    return json.loads(metadata_path.read_text())
@requests_mock.Mocker()
def test_list_packages(self, http_mocker):
"""List packages from simple api page should retrieve all packages within
"""
http_mocker.get(self.PAGE, text=self.mock_response)
fl = self.get_fl()
packages = fl.list_packages(self.get_api_response(0))
def _package_origin_info(package_name, package_metadata):
    """Compute the origin info of a package from its metadata.

    Returns a ``(origin_url, visit_type, last_update)`` tuple where the URL
    and visit type come from the last version found in the metadata and
    ``last_update`` is the most recent ``time`` value among all versions
    (``None`` when no version has one).
    """
    origin_url = visit_type = last_update = None
    versions = package_metadata["packages"][package_name]
    for version_info in versions.values():
        source = version_info["source"]
        origin_url, visit_type = source.get("url"), source.get("type")
        if "time" not in version_info:
            continue
        version_date = iso8601.parse_date(version_info["time"])
        if last_update is None or version_date > last_update:
            last_update = version_date
    return origin_url, visit_type, last_update
for package in expected_packages:
assert package in packages
def test_transport_response_simplified(self):
"""Test model created by the lister
def _request_without_if_modified_since(request):
    """Request matcher: true when no ``If-Modified-Since`` header value is set."""
    header_value = request.headers.get("If-Modified-Since")
    return header_value is None
"""
fl = self.get_fl()
model = fl.transport_response_simplified(["0099ff/dialogflowphp"])
assert len(model) == 1
for key, values in model[0].items():
assert values == expected_model[key]
@patch("swh.lister.packagist.lister.utils.create_task_dict")
def test_task_dict(self, mock_create_tasks):
"""Test the task creation of lister
def _request_with_if_modified_since(request):
    """Request matcher: true when an ``If-Modified-Since`` header value is set."""
    header_value = request.headers.get("If-Modified-Since")
    return header_value is not None
"""
fl = self.get_fl()
fl.task_dict(
origin_type="packagist", origin_url="https://abc", name="test_pack"
def test_packagist_lister(swh_scheduler, requests_mock, datadir):
# first listing, should return one origin per package
lister = PackagistLister(scheduler=swh_scheduler)
requests_mock.get(lister.PACKAGIST_PACKAGES_LIST_URL, json=_packages_list)
packages_metadata = {}
for package_name in _packages_list["packageNames"]:
metadata = _package_metadata(datadir, package_name)
packages_metadata[package_name] = metadata
requests_mock.get(
f"{lister.PACKAGIST_REPO_BASE_URL}/{package_name}.json",
json=metadata,
additional_matcher=_request_without_if_modified_since,
)
mock_create_tasks.assert_called_once_with(
"load-packagist", "recurring", "test_pack", "https://abc", retries_left=3
stats = lister.run()
assert stats.pages == 1
assert stats.origins == len(_packages_list["packageNames"])
assert lister.updated
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
for package_name, package_metadata in packages_metadata.items():
origin_url, visit_type, last_update = _package_origin_info(
package_name, package_metadata
)
filtered_origins = [o for o in scheduler_origins if o.url == origin_url]
assert filtered_origins
assert filtered_origins[0].visit_type == visit_type
assert filtered_origins[0].last_update == last_update
# second listing, should return 0 origins as no package metadata
# has been updated since first listing
lister = PackagistLister(scheduler=swh_scheduler)
for package_name in _packages_list["packageNames"]:
requests_mock.get(
f"{lister.PACKAGIST_REPO_BASE_URL}/{package_name}.json",
additional_matcher=_request_with_if_modified_since,
status_code=304,
)
assert lister.get_state_from_scheduler().last_listing_date is not None
def test_compute_package_url():
expected_url = "https://repo.packagist.org/p/hello.json"
actual_url = compute_package_url("hello")
assert actual_url == expected_url
stats = lister.run()
assert stats.pages == 1
assert stats.origins == 0
assert lister.updated
def test_packagist_lister(lister_packagist, requests_mock_datadir):
lister_packagist.run()
def test_packagist_lister_missing_metadata(swh_scheduler, requests_mock, datadir):
lister = PackagistLister(scheduler=swh_scheduler)
requests_mock.get(lister.PACKAGIST_PACKAGES_LIST_URL, json=_packages_list)
for package_name in _packages_list["packageNames"]:
requests_mock.get(
f"{lister.PACKAGIST_REPO_BASE_URL}/{package_name}.json",
additional_matcher=_request_without_if_modified_since,
status_code=404,
)
r = lister_packagist.scheduler.search_tasks(task_type="load-packagist")
assert len(r) == 5
stats = lister.run()
for row in r:
assert row["type"] == "load-packagist"
# arguments check
args = row["arguments"]["args"]
assert len(args) == 2
assert stats.pages == 1
assert stats.origins == 0
package = args[0]
url = args[1]
expected_url = compute_package_url(package)
assert url == expected_url
def test_packagist_lister_empty_metadata(swh_scheduler, requests_mock, datadir):
lister = PackagistLister(scheduler=swh_scheduler)
requests_mock.get(lister.PACKAGIST_PACKAGES_LIST_URL, json=_packages_list)
for package_name in _packages_list["packageNames"]:
requests_mock.get(
f"{lister.PACKAGIST_REPO_BASE_URL}/{package_name}.json",
additional_matcher=_request_without_if_modified_since,
json={"packages": {}},
)
# kwargs
kwargs = row["arguments"]["kwargs"]
assert kwargs == {}
stats = lister.run()
assert row["policy"] == "recurring"
assert row["priority"] is None
assert stats.pages == 1
assert stats.origins == 0
def test_packagist_lister_package_with_bitbucket_hg_origin(
    swh_scheduler, requests_mock, datadir
):
    """A package whose source is a mercurial repository hosted on Bitbucket
    must not produce any listed origin."""
    package_name = "den1n/contextmenu"
    lister = PackagistLister(scheduler=swh_scheduler)
    packages_list = {"packageNames": [package_name]}
    metadata_url = f"{lister.PACKAGIST_REPO_BASE_URL}/{package_name}.json"

    requests_mock.get(lister.PACKAGIST_PACKAGES_LIST_URL, json=packages_list)
    requests_mock.get(
        metadata_url,
        additional_matcher=_request_without_if_modified_since,
        json=_package_metadata(datadir, package_name),
    )

    stats = lister.run()

    assert stats.pages == 1
    assert stats.origins == 0
def test_lister_from_configfile(swh_scheduler_config, mocker):
    """The lister built by ``from_configfile`` has a scheduler and credentials
    when the configuration loading function is patched to return both."""
    config = {
        "scheduler": {"cls": "local", **swh_scheduler_config},
        "credentials": {},
    }
    patched_loader = mocker.patch("swh.lister.pattern.load_from_envvar")
    patched_loader.return_value = config

    lister = PackagistLister.from_configfile()

    assert lister.scheduler is not None
    assert lister.credentials is not None

View file

@ -1,8 +1,8 @@
# Copyright (C) 2019-2020 the Software Heritage developers
# Copyright (C) 2019-2021 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from unittest.mock import patch
from swh.lister.pattern import ListerStats
def test_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
@ -13,11 +13,11 @@ def test_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
assert res.result == "OK"
@patch("swh.lister.packagist.tasks.PackagistLister")
def test_lister(lister, swh_scheduler_celery_app, swh_scheduler_celery_worker):
# setup the mocked PackagistLister
lister.return_value = lister
lister.run.return_value = None
def test_lister(swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker):
lister = mocker.patch("swh.lister.packagist.tasks.PackagistLister")
lister.from_configfile.return_value = lister
stats = ListerStats(pages=1, origins=286500)
lister.run.return_value = stats
res = swh_scheduler_celery_app.send_task(
"swh.lister.packagist.tasks.PackagistListerTask"
@ -25,7 +25,7 @@ def test_lister(lister, swh_scheduler_celery_app, swh_scheduler_celery_worker):
assert res
res.wait()
assert res.successful()
assert res.result == stats.dict()
lister.assert_called_once_with()
lister.db_last_index.assert_not_called()
lister.from_configfile.assert_called_once_with()
lister.run.assert_called_once_with()