Reimplement PyPI lister using new Lister API
The new lister supports full listing only. It scrapes the list of packages published on pypi.org. Rate limiting was not encountered, but it is handled generically.
This commit is contained in:
parent
565e7423e3
commit
62c825b8cb
6 changed files with 131 additions and 87 deletions
|
@ -5,10 +5,9 @@
|
|||
|
||||
def register():
|
||||
from .lister import PyPILister
|
||||
from .models import PyPIModel
|
||||
|
||||
return {
|
||||
"models": [PyPIModel],
|
||||
"models": [],
|
||||
"lister": PyPILister,
|
||||
"task_modules": ["%s.tasks" % __name__],
|
||||
}
|
||||
|
|
|
@ -3,65 +3,70 @@
|
|||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
import random
|
||||
from typing import Any, Dict
|
||||
import logging
|
||||
from typing import Iterator, List
|
||||
|
||||
from requests import Response
|
||||
import requests
|
||||
import xmltodict
|
||||
|
||||
from swh.lister.core.lister_transports import ListerOnePageApiTransport
|
||||
from swh.lister.core.simple_lister import SimpleLister
|
||||
from swh.scheduler import utils
|
||||
from swh.scheduler.interface import SchedulerInterface
|
||||
from swh.scheduler.model import ListedOrigin
|
||||
|
||||
from .models import PyPIModel
|
||||
from .. import USER_AGENT
|
||||
from ..pattern import StatelessLister
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
PackageListPage = List[str]
|
||||
|
||||
|
||||
class PyPILister(ListerOnePageApiTransport, SimpleLister):
|
||||
MODEL = PyPIModel
|
||||
class PyPILister(StatelessLister[PackageListPage]):
|
||||
"""List origins from PyPI.
|
||||
|
||||
"""
|
||||
|
||||
LISTER_NAME = "pypi"
|
||||
PAGE = "https://pypi.org/simple/"
|
||||
instance = "pypi" # As of today only the main pypi.org is used
|
||||
INSTANCE = "pypi" # As of today only the main pypi.org is used
|
||||
|
||||
def __init__(self, override_config=None):
|
||||
ListerOnePageApiTransport.__init__(self)
|
||||
SimpleLister.__init__(self, override_config=override_config)
|
||||
PACKAGE_LIST_URL = "https://pypi.org/simple/"
|
||||
PACKAGE_URL = "https://pypi.org/project/{package_name}/"
|
||||
|
||||
def task_dict(self, origin_type: str, origin_url: str, **kwargs):
|
||||
"""(Override) Return task format dict
|
||||
def __init__(self, scheduler: SchedulerInterface):
|
||||
super().__init__(
|
||||
scheduler=scheduler,
|
||||
credentials=None,
|
||||
url=self.PACKAGE_LIST_URL,
|
||||
instance=self.INSTANCE,
|
||||
)
|
||||
|
||||
This is overridden from the lister_base as more information is
|
||||
needed for the ingestion task creation.
|
||||
self.session = requests.Session()
|
||||
self.session.headers.update(
|
||||
{"Accept": "application/html", "User-Agent": USER_AGENT}
|
||||
)
|
||||
|
||||
"""
|
||||
_type = "load-%s" % origin_type
|
||||
_policy = kwargs.get("policy", "recurring")
|
||||
return utils.create_task_dict(_type, _policy, url=origin_url)
|
||||
def get_pages(self) -> Iterator[PackageListPage]:
|
||||
|
||||
def list_packages(self, response: Response) -> list:
|
||||
"""(Override) List the actual pypi origins from the response.
|
||||
response = self.session.get(self.PACKAGE_LIST_URL)
|
||||
|
||||
"""
|
||||
result = xmltodict.parse(response.content)
|
||||
_packages = [p["#text"] for p in result["html"]["body"]["a"]]
|
||||
random.shuffle(_packages)
|
||||
return _packages
|
||||
response.raise_for_status()
|
||||
|
||||
def origin_url(self, repo_name: str) -> str:
|
||||
"""Returns origin_url
|
||||
page_xmldict = xmltodict.parse(response.content)
|
||||
page_results = [p["#text"] for p in page_xmldict["html"]["body"]["a"]]
|
||||
|
||||
"""
|
||||
return "https://pypi.org/project/%s/" % repo_name
|
||||
yield page_results
|
||||
|
||||
def get_model_from_repo(self, repo_name: str) -> Dict[str, Any]:
|
||||
"""(Override) Transform from repository representation to model
|
||||
def get_origins_from_page(
|
||||
self, packages_name: PackageListPage
|
||||
) -> Iterator[ListedOrigin]:
|
||||
"""Convert a page of PyPI repositories into a list of ListedOrigins."""
|
||||
assert self.lister_obj.id is not None
|
||||
|
||||
"""
|
||||
origin_url = self.origin_url(repo_name)
|
||||
return {
|
||||
"uid": origin_url,
|
||||
"name": repo_name,
|
||||
"full_name": repo_name,
|
||||
"html_url": origin_url,
|
||||
"origin_url": origin_url,
|
||||
"origin_type": "pypi",
|
||||
}
|
||||
for package_name in packages_name:
|
||||
package_url = self.PACKAGE_URL.format(package_name=package_name)
|
||||
|
||||
yield ListedOrigin(
|
||||
lister_id=self.lister_obj.id,
|
||||
url=package_url,
|
||||
visit_type="pypi",
|
||||
last_update=None, # available on PyPI JSON API
|
||||
)
|
||||
|
|
|
@ -1,17 +0,0 @@
|
|||
# Copyright (C) 2018 the Software Heritage developers
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
from sqlalchemy import Column, String
|
||||
|
||||
from ..core.models import ModelBase
|
||||
|
||||
|
||||
class PyPIModel(ModelBase):
    """Database model representing a single PyPI repository."""

    # Table in which rows of this model are stored.
    __tablename__ = "pypi_repo"

    # String primary key of the row.
    uid = Column(String, primary_key=True)
|
|
@ -8,9 +8,10 @@ from .lister import PyPILister
|
|||
|
||||
|
||||
@shared_task(name=__name__ + ".PyPIListerTask")
|
||||
def list_pypi(**lister_args):
|
||||
"Full update of the PyPI (python) registry"
|
||||
return PyPILister(**lister_args).run()
|
||||
def list_pypi():
|
||||
"Full listing of the PyPI registry"
|
||||
lister = PyPILister.from_configfile()
|
||||
return lister.run().dict()
|
||||
|
||||
|
||||
@shared_task(name=__name__ + ".ping")
|
||||
|
|
|
@ -3,25 +3,80 @@
|
|||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
|
||||
def test_pypi_lister(lister_pypi, requests_mock_datadir):
|
||||
lister_pypi.run()
|
||||
import pytest
|
||||
import requests
|
||||
|
||||
r = lister_pypi.scheduler.search_tasks(task_type="load-pypi")
|
||||
assert len(r) == 4
|
||||
from swh.lister.pypi.lister import PyPILister
|
||||
from swh.scheduler.model import ListedOrigin
|
||||
|
||||
for row in r:
|
||||
assert row["type"] == "load-pypi"
|
||||
# arguments check
|
||||
args = row["arguments"]["args"]
|
||||
assert len(args) == 0
|
||||
|
||||
# kwargs
|
||||
kwargs = row["arguments"]["kwargs"]
|
||||
assert len(kwargs) == 1
|
||||
@pytest.fixture
def pypi_packages_testdata(datadir):
    """Provide the canned PyPI index page with its package names and URLs.

    Returns a ``(content, names, urls)`` tuple: the raw bytes of the
    ``https_pypi.org/simple`` file found under ``datadir``, the package names
    the test expects, and the URL built for each name from
    ``PyPILister.PACKAGE_URL``.
    """
    index_file = Path(datadir) / "https_pypi.org" / "simple"
    raw_index = index_file.read_bytes()

    package_names = ["0lever-so", "0lever-utils", "0-orchestrator", "0wned"]
    package_urls = [
        PyPILister.PACKAGE_URL.format(package_name=name) for name in package_names
    ]

    return raw_index, package_names, package_urls
|
||||
|
||||
origin_url = kwargs["url"]
|
||||
assert "https://pypi.org/project" in origin_url
|
||||
|
||||
assert row["policy"] == "recurring"
|
||||
assert row["priority"] is None
|
||||
def check_listed_origins(
    lister_urls: List[str], scheduler_origins: List["ListedOrigin"]
):
    """Assert that the two collections hold the same origin URLs.

    Args:
        lister_urls: URLs the lister is expected to have produced.
        scheduler_origins: origins read back from the scheduler; only their
            ``url`` attribute is inspected.

    Raises:
        AssertionError: if the sorted URL lists differ in length or content.
    """
    expected_urls = sorted(lister_urls)
    # Sort the URLs themselves rather than the origin objects: the check then
    # does not depend on the origin class defining an ordering.
    actual_urls = sorted(origin.url for origin in scheduler_origins)

    assert len(expected_urls) == len(actual_urls)
    # Comparing whole lists reports every mismatch in one failure message.
    assert expected_urls == actual_urls
|
||||
|
||||
|
||||
def test_pypi_list(swh_scheduler, requests_mock, mocker, pypi_packages_testdata):
    """Run a full listing against the canned index page and check the result."""
    index_bytes, package_names, package_urls = pypi_packages_testdata

    # Serve the canned index page for requests to the package list URL.
    mocked_responses = [{"content": index_bytes, "status_code": 200}]
    requests_mock.get(PyPILister.PACKAGE_LIST_URL, mocked_responses)

    lister = PyPILister(scheduler=swh_scheduler)

    # Spies record the calls while still running the real implementations.
    lister.get_origins_from_page = mocker.spy(lister, "get_origins_from_page")
    lister.session.get = mocker.spy(lister.session, "get")

    listing_stats = lister.run()

    listed = swh_scheduler.get_listed_origins(lister.lister_obj.id)
    recorded_origins = listed.origins

    # Exactly one HTTP request and one page conversion are expected.
    lister.session.get.assert_called_once_with(lister.PACKAGE_LIST_URL)
    lister.get_origins_from_page.assert_called_once_with(package_names)

    assert listing_stats.pages == 1
    assert listing_stats.origins == 4
    assert len(recorded_origins) == 4

    check_listed_origins(package_urls, recorded_origins)

    # No lister state is expected in the scheduler after the run.
    assert lister.get_state_from_scheduler() is None
|
||||
|
||||
|
||||
@pytest.mark.parametrize("http_code", [400, 429, 500])
def test_pypi_list_http_error(swh_scheduler, requests_mock, mocker, http_code):
    """Check that an HTTP error on the package list aborts the listing."""
    # Answer the package list request with the parametrized error status.
    error_responses = [{"content": None, "status_code": http_code}]
    requests_mock.get(PyPILister.PACKAGE_LIST_URL, error_responses)

    lister = PyPILister(scheduler=swh_scheduler)
    lister.session.get = mocker.spy(lister.session, "get")

    with pytest.raises(requests.HTTPError):
        lister.run()

    # The index must have been requested exactly once.
    lister.session.get.assert_called_once_with(lister.PACKAGE_LIST_URL)

    # Nothing may have been recorded in the scheduler for this lister.
    listed = swh_scheduler.get_listed_origins(lister.lister_obj.id)
    recorded_origins = listed.origins
    assert len(recorded_origins) == 0
|
||||
|
|
|
@ -5,6 +5,8 @@
|
|||
|
||||
from unittest.mock import patch
|
||||
|
||||
from swh.lister.pattern import ListerStats
|
||||
|
||||
|
||||
def test_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
|
||||
res = swh_scheduler_celery_app.send_task("swh.lister.pypi.tasks.ping")
|
||||
|
@ -17,14 +19,13 @@ def test_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
|
|||
@patch("swh.lister.pypi.tasks.PyPILister")
|
||||
def test_lister(lister, swh_scheduler_celery_app, swh_scheduler_celery_worker):
|
||||
# setup the mocked PypiLister
|
||||
lister.return_value = lister
|
||||
lister.run.return_value = None
|
||||
lister.from_configfile.return_value = lister
|
||||
lister.run.return_value = ListerStats(pages=1, origins=0)
|
||||
|
||||
res = swh_scheduler_celery_app.send_task("swh.lister.pypi.tasks.PyPIListerTask")
|
||||
assert res
|
||||
res.wait()
|
||||
assert res.successful()
|
||||
|
||||
lister.assert_called_once_with()
|
||||
lister.db_last_index.assert_not_called()
|
||||
lister.from_configfile.assert_called_once_with()
|
||||
lister.run.assert_called_once_with()
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue