pubdev: Retrieve last publication date for each listed package

In order to get a last_update for each ListedOrigin sent to the scheduler
database, send an extra HTTP request for each listed package to the
/api/packages/<package_name> endpoint of the pub.dev API.

A pub.dev developer informed us that this endpoint is heavily used and cached,
so there is no particular issue with querying it for each package
in a row periodically.
This commit is contained in:
Antoine Lambert 2022-09-02 16:18:12 +02:00
parent 49b79b0759
commit 44560c2383
4 changed files with 128 additions and 1 deletions

View file

@ -5,7 +5,9 @@
import logging
from typing import Any, Dict, Iterator, List, Optional
import iso8601
import requests
from requests.exceptions import HTTPError
from tenacity.before_sleep import before_sleep_log
from swh.lister.utils import throttling_retry
@ -90,6 +92,22 @@ class PubDevLister(StatelessLister[PubDevListerPage]):
assert self.lister_obj.id is not None
for pkgname in page:
package_info_url = self.PACKAGE_INFO_URL_PATTERN.format(
base_url=self.url, pkgname=pkgname
)
try:
response = self.page_request(url=package_info_url, params={})
except HTTPError:
logger.warning(
"Failed to fetch metadata for package %s, skipping it from listing.",
pkgname,
)
continue
package_metadata = response.json()
package_versions = package_metadata["versions"]
last_published = max(
package_version["published"] for package_version in package_versions
)
origin_url = self.ORIGIN_URL_PATTERN.format(
base_url=self.url, pkgname=pkgname
)
@ -97,5 +115,5 @@ class PubDevLister(StatelessLister[PubDevListerPage]):
lister_id=self.lister_obj.id,
visit_type=self.VISIT_TYPE,
url=origin_url,
last_update=None,
last_update=iso8601.parse_date(last_published),
)

View file

@ -0,0 +1,44 @@
{
"name": "Autolinker",
"latest": {
"version": "0.1.1",
"pubspec": {
"version": "0.1.1",
"homepage": "https://github.com/hackcave",
"description": "Port of Autolinker.js to dart",
"name": "Autolinker",
"author": "hackcave <hackers@hackcave.org>"
},
"archive_url": "https://pub.dartlang.org/packages/Autolinker/versions/0.1.1.tar.gz",
"archive_sha256": "0a5209a2d5a292a26fc65d7edb430163f209a7c7c24ba4f301676f1afd79fa3f",
"published": "2014-12-24T22:34:02.534090Z"
},
"versions": [
{
"version": "0.1.0",
"pubspec": {
"version": "0.1.0",
"homepage": "https://github.com/hackcave",
"description": "Port of Autolinker.js to dart",
"name": "Autolinker",
"author": "hackcave <hackers@hackcave.org>"
},
"archive_url": "https://pub.dartlang.org/packages/Autolinker/versions/0.1.0.tar.gz",
"archive_sha256": "717b30e27311c775293d4795ce33d15cedb5e5d21fa140f2cb46b30f3e969041",
"published": "2014-12-24T21:16:03.118270Z"
},
{
"version": "0.1.1",
"pubspec": {
"version": "0.1.1",
"homepage": "https://github.com/hackcave",
"description": "Port of Autolinker.js to dart",
"name": "Autolinker",
"author": "hackcave <hackers@hackcave.org>"
},
"archive_url": "https://pub.dartlang.org/packages/Autolinker/versions/0.1.1.tar.gz",
"archive_sha256": "0a5209a2d5a292a26fc65d7edb430163f209a7c7c24ba4f301676f1afd79fa3f",
"published": "2014-12-24T22:34:02.534090Z"
}
]
}

View file

@ -0,0 +1,51 @@
{
"name": "Babylon",
"latest": {
"version": "0.0.3",
"pubspec": {
"version": "0.0.3",
"name": "Babylon",
"dependencies": {
"js": ">=0.6.0",
"browser": ">=0.10.0+2"
},
"author": "Cedric Krause <cedric@cedware.com>",
"description": "A starting point for Dart libraries or applications.",
"homepage": "https://www.cedware.com",
"environment": {
"sdk": ">=1.0.0 <2.0.0"
},
"dev_dependencies": {
"test": ">=0.12.0 <0.13.0"
}
},
"archive_url": "https://pub.dartlang.org/packages/Babylon/versions/0.0.3.tar.gz",
"archive_sha256": "a18166c8082d795f22c38270b7fed0c306d5cb59fe390ce3a34c300770c4a8b3",
"published": "2016-06-01T19:15:38.052Z"
},
"versions": [
{
"version": "0.0.3",
"pubspec": {
"version": "0.0.3",
"name": "Babylon",
"dependencies": {
"js": ">=0.6.0",
"browser": ">=0.10.0+2"
},
"author": "Cedric Krause <cedric@cedware.com>",
"description": "A starting point for Dart libraries or applications.",
"homepage": "https://www.cedware.com",
"environment": {
"sdk": ">=1.0.0 <2.0.0"
},
"dev_dependencies": {
"test": ">=0.12.0 <0.13.0"
}
},
"archive_url": "https://pub.dartlang.org/packages/Babylon/versions/0.0.3.tar.gz",
"archive_sha256": "a18166c8082d795f22c38270b7fed0c306d5cb59fe390ce3a34c300770c4a8b3",
"published": "2016-06-01T19:15:38.052Z"
}
]
}

View file

@ -25,3 +25,17 @@ def test_pubdev_lister(datadir, requests_mock_datadir, swh_scheduler):
for origin in scheduler_origins:
assert origin.visit_type == "pubdev"
assert origin.url in expected_origins
assert origin.last_update is not None
def test_pubdev_lister_skip_package(
    datadir, requests_mock_datadir, swh_scheduler, requests_mock
):
    """Check the lister run stats when one package metadata request returns 404.

    The Autolinker metadata endpoint is mocked to answer with a 404 status;
    the run is then expected to report a single page and a single origin.
    """
    # Registered on top of whatever requests_mock_datadir serves
    # (presumably the JSON fixtures from datadir -- confirm against the fixture).
    requests_mock.get("https://pub.dev/api/packages/Autolinker", status_code=404)

    stats = PubDevLister(scheduler=swh_scheduler).run()

    assert (stats.pages, stats.origins) == (1, 1)