Reimplement PyPI lister using new Lister API

The new lister supports full listing only.
It scrapes the list of packages published on pypi.org.
Rate limiting was not encountered, but is handled generically.
This commit is contained in:
tenma 2021-01-20 15:43:07 +01:00
parent 565e7423e3
commit 62c825b8cb
6 changed files with 131 additions and 87 deletions

View file

@ -5,10 +5,9 @@
# Returns the registration dict for the PyPI lister: its models, its lister
# class and the module(s) holding its tasks.
# NOTE(review): this hunk is shown without +/- markers. The `.models` import
# and the `"models": [PyPIModel]` entry look like the removed lines and
# `"models": []` like the added one -- confirm against the original diff.
def register():
from .lister import PyPILister
from .models import PyPIModel
return {
"models": [PyPIModel],
"models": [],
"lister": PyPILister,
"task_modules": ["%s.tasks" % __name__],
}

View file

@ -3,65 +3,70 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import random
from typing import Any, Dict
import logging
from typing import Iterator, List
from requests import Response
import requests
import xmltodict
from swh.lister.core.lister_transports import ListerOnePageApiTransport
from swh.lister.core.simple_lister import SimpleLister
from swh.scheduler import utils
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
from .models import PyPIModel
from .. import USER_AGENT
from ..pattern import StatelessLister
logger = logging.getLogger(__name__)
PackageListPage = List[str]
# NOTE(review): this hunk interleaves two versions of PyPILister without +/-
# markers. Judging by the commit message, the ListerOnePageApiTransport /
# SimpleLister class (task_dict, list_packages, origin_url,
# get_model_from_repo) is the removed code and the StatelessLister class
# (get_pages, get_origins_from_page) is the added code -- confirm against the
# original diff.
#
# StatelessLister version, as far as the visible lines show:
# - __init__ passes PACKAGE_LIST_URL and INSTANCE to the base class and builds
#   a requests.Session whose headers carry the project USER_AGENT.
# - get_pages fetches PACKAGE_LIST_URL once, calls raise_for_status() on the
#   response, parses the body with xmltodict and yields the "#text" of every
#   entry under html/body/a as one single page.
# - get_origins_from_page yields one ListedOrigin per package name, with the
#   URL built from PACKAGE_URL and visit_type "pypi".
class PyPILister(ListerOnePageApiTransport, SimpleLister):
MODEL = PyPIModel
class PyPILister(StatelessLister[PackageListPage]):
"""List origins from PyPI.
"""
LISTER_NAME = "pypi"
PAGE = "https://pypi.org/simple/"
instance = "pypi" # As of today only the main pypi.org is used
INSTANCE = "pypi" # As of today only the main pypi.org is used
def __init__(self, override_config=None):
ListerOnePageApiTransport.__init__(self)
SimpleLister.__init__(self, override_config=override_config)
PACKAGE_LIST_URL = "https://pypi.org/simple/"
PACKAGE_URL = "https://pypi.org/project/{package_name}/"
def task_dict(self, origin_type: str, origin_url: str, **kwargs):
"""(Override) Return task format dict
def __init__(self, scheduler: SchedulerInterface):
super().__init__(
scheduler=scheduler,
credentials=None,
url=self.PACKAGE_LIST_URL,
instance=self.INSTANCE,
)
This is overridden from the lister_base as more information is
needed for the ingestion task creation.
self.session = requests.Session()
self.session.headers.update(
{"Accept": "application/html", "User-Agent": USER_AGENT}
)
"""
_type = "load-%s" % origin_type
_policy = kwargs.get("policy", "recurring")
return utils.create_task_dict(_type, _policy, url=origin_url)
def get_pages(self) -> Iterator[PackageListPage]:
def list_packages(self, response: Response) -> list:
"""(Override) List the actual pypi origins from the response.
response = self.session.get(self.PACKAGE_LIST_URL)
"""
result = xmltodict.parse(response.content)
_packages = [p["#text"] for p in result["html"]["body"]["a"]]
random.shuffle(_packages)
return _packages
response.raise_for_status()
def origin_url(self, repo_name: str) -> str:
"""Returns origin_url
page_xmldict = xmltodict.parse(response.content)
page_results = [p["#text"] for p in page_xmldict["html"]["body"]["a"]]
"""
return "https://pypi.org/project/%s/" % repo_name
yield page_results
def get_model_from_repo(self, repo_name: str) -> Dict[str, Any]:
"""(Override) Transform from repository representation to model
def get_origins_from_page(
self, packages_name: PackageListPage
) -> Iterator[ListedOrigin]:
"""Convert a page of PyPI repositories into a list of ListedOrigins."""
assert self.lister_obj.id is not None
"""
origin_url = self.origin_url(repo_name)
return {
"uid": origin_url,
"name": repo_name,
"full_name": repo_name,
"html_url": origin_url,
"origin_url": origin_url,
"origin_type": "pypi",
}
for package_name in packages_name:
package_url = self.PACKAGE_URL.format(package_name=package_name)
yield ListedOrigin(
lister_id=self.lister_obj.id,
url=package_url,
visit_type="pypi",
last_update=None, # available on PyPI JSON API
)

View file

@ -1,17 +0,0 @@
# Copyright (C) 2018 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from sqlalchemy import Column, String
from ..core.models import ModelBase
class PyPIModel(ModelBase):
    """Database model describing one PyPI repository."""

    # String primary key of the row.
    uid = Column(String, primary_key=True)

    # Name of the table backing this model.
    __tablename__ = "pypi_repo"

View file

@ -8,9 +8,10 @@ from .lister import PyPILister
# NOTE(review): two versions of list_pypi appear here without +/- markers. The
# first one (taking **lister_args) looks like the removed definition; the
# second one builds the lister with PyPILister.from_configfile() and returns
# run().dict() -- confirm against the original diff.
@shared_task(name=__name__ + ".PyPIListerTask")
def list_pypi(**lister_args):
"Full update of the PyPI (python) registry"
return PyPILister(**lister_args).run()
def list_pypi():
"Full listing of the PyPI registry"
lister = PyPILister.from_configfile()
return lister.run().dict()
@shared_task(name=__name__ + ".ping")

View file

@ -3,25 +3,80 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from pathlib import Path
from typing import List
def test_pypi_lister(lister_pypi, requests_mock_datadir):
lister_pypi.run()
import pytest
import requests
r = lister_pypi.scheduler.search_tasks(task_type="load-pypi")
assert len(r) == 4
from swh.lister.pypi.lister import PyPILister
from swh.scheduler.model import ListedOrigin
for row in r:
assert row["type"] == "load-pypi"
# arguments check
args = row["arguments"]["args"]
assert len(args) == 0
# kwargs
kwargs = row["arguments"]["kwargs"]
assert len(kwargs) == 1
@pytest.fixture
def pypi_packages_testdata(datadir):
    """Provide the canned index page with the package names and URLs it lists.

    Returns a ``(content, names, urls)`` tuple: the raw bytes of the stored
    ``https_pypi.org/simple`` file, the package names used by the tests, and
    the project URL built from ``PyPILister.PACKAGE_URL`` for each name.
    """
    index_file = Path(datadir) / "https_pypi.org" / "simple"
    index_bytes = index_file.read_bytes()

    package_names = ["0lever-so", "0lever-utils", "0-orchestrator", "0wned"]
    package_urls = []
    for package_name in package_names:
        package_urls.append(PyPILister.PACKAGE_URL.format(package_name=package_name))

    return index_bytes, package_names, package_urls
origin_url = kwargs["url"]
assert "https://pypi.org/project" in origin_url
assert row["policy"] == "recurring"
assert row["priority"] is None
def check_listed_origins(
    lister_urls: List[str], scheduler_origins: List[ListedOrigin]
):
    """Assert that both collections hold exactly the same origin URLs.

    Args:
        lister_urls: URLs the lister is expected to have produced.
        scheduler_origins: origins read back from the scheduler; only their
            ``url`` attribute is looked at.

    Raises:
        AssertionError: if the two sorted URL lists differ in length or content.
    """
    expected_urls = sorted(lister_urls)
    # Sort on the URL itself: sorting the origin objects would make the pairing
    # depend on how ListedOrigin instances compare with each other rather than
    # on the value actually being checked.
    actual_urls = sorted(origin.url for origin in scheduler_origins)

    assert len(expected_urls) == len(actual_urls)
    assert expected_urls == actual_urls
def test_pypi_list(swh_scheduler, requests_mock, mocker, pypi_packages_testdata):
    """Run a full listing against a mocked package index and check the outcome."""
    index_body, expected_names, expected_urls = pypi_packages_testdata

    # Serve the canned index page for the request the lister makes.
    index_responses = [{"content": index_body, "status_code": 200}]
    requests_mock.get(PyPILister.PACKAGE_LIST_URL, index_responses)

    lister = PyPILister(scheduler=swh_scheduler)

    # Spy on the page conversion and on the HTTP call so both can be counted.
    lister.get_origins_from_page = mocker.spy(lister, "get_origins_from_page")
    lister.session.get = mocker.spy(lister.session, "get")

    stats = lister.run()

    listed = swh_scheduler.get_listed_origins(lister.lister_obj.id)
    scheduler_origins = listed.origins

    # Exactly one request, and exactly one page holding every package name.
    lister.session.get.assert_called_once_with(lister.PACKAGE_LIST_URL)
    lister.get_origins_from_page.assert_called_once_with(expected_names)

    assert stats.pages == 1
    assert stats.origins == 4
    assert len(scheduler_origins) == 4

    check_listed_origins(expected_urls, scheduler_origins)

    # No lister state is expected to be found in the scheduler after the run.
    assert lister.get_state_from_scheduler() is None
@pytest.mark.parametrize("http_code", [400, 429, 500])
def test_pypi_list_http_error(swh_scheduler, requests_mock, mocker, http_code):
    """Check that an HTTP error on the index aborts the run and records nothing."""
    error_responses = [{"content": None, "status_code": http_code}]
    requests_mock.get(PyPILister.PACKAGE_LIST_URL, error_responses)

    lister = PyPILister(scheduler=swh_scheduler)
    lister.session.get = mocker.spy(lister.session, "get")

    # The failing status code must surface as requests.HTTPError.
    with pytest.raises(requests.HTTPError):
        lister.run()

    # The index was requested exactly once.
    lister.session.get.assert_called_once_with(lister.PACKAGE_LIST_URL)

    recorded_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).origins
    assert len(recorded_origins) == 0

View file

@ -5,6 +5,8 @@
from unittest.mock import patch
from swh.lister.pattern import ListerStats
def test_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
res = swh_scheduler_celery_app.send_task("swh.lister.pypi.tasks.ping")
@ -17,14 +19,13 @@ def test_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
# Sends the PyPIListerTask through the Celery app fixture with PyPILister
# patched, then checks that the task succeeded and that the mocked lister was
# used as expected.
# NOTE(review): this hunk is shown without +/- markers. `lister.return_value`,
# `lister.run.return_value = None`, `lister.assert_called_once_with()` and
# `lister.db_last_index.assert_not_called()` look like the removed lines; the
# `from_configfile` / `ListerStats` lines look like the added ones -- confirm
# against the original diff.
@patch("swh.lister.pypi.tasks.PyPILister")
def test_lister(lister, swh_scheduler_celery_app, swh_scheduler_celery_worker):
# setup the mocked PypiLister
lister.return_value = lister
lister.run.return_value = None
lister.from_configfile.return_value = lister
lister.run.return_value = ListerStats(pages=1, origins=0)
res = swh_scheduler_celery_app.send_task("swh.lister.pypi.tasks.PyPIListerTask")
assert res
res.wait()
assert res.successful()
lister.assert_called_once_with()
lister.db_last_index.assert_not_called()
lister.from_configfile.assert_called_once_with()
lister.run.assert_called_once_with()