cran: Retrieve last update date for each listed package

The last update date of an R package can be found in the "Packaged" field of
the package info returned by tools::CRAN_package_db().

So retrieve that field and parse it as a datetime, then provide it as the
value of the last_update parameter of the ListedOrigin model.

Closes T2989
This commit is contained in:
Antoine Lambert 2021-01-25 18:26:22 +01:00
parent 6f40ab4c57
commit 22eeb0956e
4 changed files with 83 additions and 10 deletions

View file

@ -4,6 +4,6 @@
# all the packages of R and their description, then convert the API
# response to JSON string and print it
db <- tools::CRAN_package_db()[, c("Package", "Version")]
db <- tools::CRAN_package_db()[, c("Package", "Version", "Packaged")]
dbjson <- jsonlite::toJSON(db)
print(dbjson)

View file

@ -2,10 +2,11 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from datetime import datetime, timezone
import json
import logging
import subprocess
from typing import Dict, Iterator, List, Tuple
from typing import Dict, Iterator, List, Optional, Tuple
import pkg_resources
@ -47,6 +48,7 @@ class CRANLister(StatelessLister[PageType]):
lister_id=self.lister_obj.id,
url=origin_url,
visit_type="tar",
last_update=parse_packaged_date(package_info),
extra_loader_arguments={
"artifacts": [
{"url": artifact_url, "version": package_info["Version"]}
@ -96,3 +98,28 @@ def compute_origin_urls(package_info: Dict[str, str]) -> Tuple[str, str]:
origin_url = f"{CRAN_MIRROR}/package={package}"
artifact_url = f"{CRAN_MIRROR}/src/contrib/{package}_{version}.tar.gz"
return origin_url, artifact_url
def parse_packaged_date(package_info: Dict[str, str]) -> Optional[datetime]:
    """Parse the "Packaged" field of a CRAN package description.

    The date is the part of the field located before the first ";" (what
    follows is the packager name). Two date formats are recognized:

    - "%Y-%m-%d %H:%M:%S UTC", the common one;
    - "%a %b %d %H:%M:%S %Y", found in some old packages.

    Args:
        package_info: package description; only the "Packaged" and
            "Package" keys are read, and both may be missing.

    Returns:
        The packaging date as a datetime whose tzinfo is set to UTC, or
        None when the field is missing, empty or cannot be parsed.
    """
    packaged_at_str = package_info.get("Packaged", "")
    if not packaged_at_str:
        return None

    # A non-string value is handled like any other unparseable date: it is
    # logged below and None is returned.
    if isinstance(packaged_at_str, str):
        date_str = packaged_at_str.split(";")[0].strip()
        known_formats = (
            # Packaged field format: "%Y-%m-%d %H:%M:%S UTC; <packager>"
            "%Y-%m-%d %H:%M:%S UTC",
            # Some old packages have a different date format:
            # "%a %b %d %H:%M:%S %Y; <packager>"
            "%a %b %d %H:%M:%S %Y",
        )
        for date_format in known_formats:
            try:
                parsed = datetime.strptime(date_str, date_format)
            except ValueError:
                # Not this format, try the next one.
                continue
            return parsed.replace(tzinfo=timezone.utc)

    logger.debug(
        "Could not parse %s package release date: %s",
        # .get() so that a record without a name cannot raise on this
        # best-effort path.
        package_info.get("Package"),
        packaged_at_str,
    )
    return None

View file

@ -2,27 +2,39 @@
{
"Package": "SeleMix",
"Version": "1.0.1"
"Version": "1.0.2",
"Packaged": "2020-11-28 22:16:43 UTC; Teresa"
},
{
"Package": "plink",
"Version": "1.5-1"
"Version": "1.5-1",
"Packaged": "2017-04-26 11:36:15 UTC; Jonathan"
},
{
"Package": "justifier",
"Version": "0.1.0"
"Package": "jsonlite",
"Version": "1.7.2",
"Packaged": "2020-12-09 13:54:18 UTC; jeroen"
},
{
"Package": "Records",
"Version": "1.0"
"Version": "1.0",
"Packaged": "2012-10-29 08:57:37 UTC; ripley"
},
{
"Package": "scRNAtools",
"Version": "1.0"
"Version": "1.0",
"Packaged": "2018-07-04 00:49:45 UTC; dell"
},
{
"Package": "Deriv",
"Version": "3.9.0"
"Version": "4.1.2",
"Packaged": "2020-12-10 11:12:28 UTC; sokol"
},
{
"Package": "BayesValidate",
"Version": "0.0",
"Packaged": "Thu Mar 30 10:48:35 2006; hornik"
}
]

View file

@ -3,12 +3,18 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from datetime import datetime, timezone
import json
from os import path
import pytest
from swh.lister.cran.lister import CRAN_MIRROR, CRANLister, compute_origin_urls
from swh.lister.cran.lister import (
CRAN_MIRROR,
CRANLister,
compute_origin_urls,
parse_packaged_date,
)
def test_cran_compute_origin_urls():
@ -26,6 +32,32 @@ def test_cran_compute_origin_urls_failure():
compute_origin_urls(incomplete_repo)
def test_parse_packaged_date():
    """Check parse_packaged_date on both known date formats, on an invalid
    date and on a package description without any "Packaged" field."""
    parseable_cases = [
        # common date format
        (
            {"Package": "test", "Packaged": "2017-04-26 11:36:15 UTC; Jonathan"},
            datetime(2017, 4, 26, 11, 36, 15, tzinfo=timezone.utc),
        ),
        # old date format
        (
            {"Package": "test", "Packaged": "Thu Mar 30 10:48:35 2006; hornik"},
            datetime(2006, 3, 30, 10, 48, 35, tzinfo=timezone.utc),
        ),
    ]
    for info, expected_date in parseable_cases:
        assert parse_packaged_date(info) == expected_date

    unparseable_cases = [
        # invalid date format
        {"Package": "test", "Packaged": "foo"},
        # missing date
        {"Package": "test"},
    ]
    for info in unparseable_cases:
        assert parse_packaged_date(info) is None
def test_cran_lister_cran(datadir, swh_scheduler, mocker):
with open(path.join(datadir, "list-r-packages.json")) as f:
cran_data = json.loads(f.read())
@ -55,3 +87,5 @@ def test_cran_lister_cran(datadir, swh_scheduler, mocker):
assert filtered_origins[0].extra_loader_arguments == {
"artifacts": [{"url": artifact_url, "version": package_info["Version"]}]
}
# The comparison must be asserted, otherwise its result is discarded and
# last_update is never actually checked.
assert filtered_origins[0].last_update == parse_packaged_date(package_info)