Nuget: Implement incremental listing

The lister is incremental, based on the value of ``commitTimeStamp`` retrieved from the catalog index HTTP API endpoint.

Related T1718
This commit is contained in:
Franck Bret 2022-10-19 16:11:23 +02:00
parent e1f3f87c73
commit ea146ce297
7 changed files with 476 additions and 21 deletions

View file

@ -20,9 +20,13 @@ Origins retrieving strategy
Nuget.org provides an `http api`_ with several endpoints to discover and list packages
and versions.
The recommended way to retrieve all packages is to use the `catalog`_ api endpoint.
It provides a first endpoint that list all available pages. We then iterate to get
content of related pages.
The recommended way to `retrieve all packages`_ is to use the `catalog`_ api endpoint.
It provides a `catalog index endpoint`_ that lists all available pages. We then iterate to
get the content of the related pages.
The lister is incremental following a `cursor`_ principle, based on the value of
``commitTimeStamp`` from the catalog index endpoint. It retrieves only pages for which
``commitTimeStamp`` is greater than ``lister.state.last_listing_date``.
Page listing
------------
@ -65,9 +69,12 @@ You can follow lister execution by displaying logs of swh-lister service::
.. _nuget.org/packages: https://www.nuget.org/packages
.. _http api: https://api.nuget.org/v3/index.json
.. _catalog: https://learn.microsoft.com/en-us/nuget/api/catalog-resource
.. _catalog index endpoint: https://learn.microsoft.com/en-us/nuget/api/catalog-resource#catalog-page-object-in-the-index
.. _retrieve all packages: https://learn.microsoft.com/en-us/nuget/guides/api/query-for-all-published-packages#initialize-a-cursor
.. _cursor: https://learn.microsoft.com/en-us/nuget/api/catalog-resource#cursor
.. _package metadata: https://learn.microsoft.com/en-us/nuget/api/registration-base-url-resource
.. _package manifest: https://learn.microsoft.com/en-us/nuget/api/package-base-address-resource#download-package-manifest-nuspec # noqa: B950
"""
.. _package manifest: https://learn.microsoft.com/en-us/nuget/api/package-base-address-resource#download-package-manifest-nuspec
""" # noqa: B950
def register():

View file

@ -3,24 +3,36 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from dataclasses import dataclass
from datetime import datetime
import logging
from typing import Dict, Iterator, List, Optional
from typing import Any, Dict, Iterator, List, Optional
from bs4 import BeautifulSoup
import iso8601
from requests.exceptions import HTTPError
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
from ..pattern import CredentialsType, StatelessLister
from ..pattern import CredentialsType, Lister
logger = logging.getLogger(__name__)
# Aliasing the page results returned by `get_pages` method from the lister.
NugetListerPage = List[Dict[str, str]]
class NugetLister(StatelessLister[NugetListerPage]):
@dataclass
class NugetListerState:
    """Persistent state kept between runs so the lister can work incrementally."""

    #: Date of the main http api endpoint recorded when the lister was last
    #: executed; ``None`` when no previous listing has been recorded.
    last_listing_date: Optional[datetime] = None
class NugetLister(Lister[NugetListerState, NugetListerPage]):
"""List Nuget (Package manager for .NET) origins."""
LISTER_NAME = "nuget"
@ -39,6 +51,20 @@ class NugetLister(StatelessLister[NugetListerPage]):
instance=self.INSTANCE,
url=self.API_INDEX_URL,
)
self.listing_date: Optional[datetime] = None
def state_from_dict(self, d: Dict[str, Any]) -> NugetListerState:
    """Build a :class:`NugetListerState` from its serialized dict form.

    Args:
        d: mapping of state field names to values; ``last_listing_date``,
            when present and not ``None``, is parsed with
            ``iso8601.parse_date``.

    Returns:
        The state object built from the keys of ``d``.
    """
    # Work on a shallow copy: deserializing must not modify the caller's
    # dict as a side effect.
    fields = dict(d)
    serialized_date = fields.get("last_listing_date")
    if serialized_date is not None:
        fields["last_listing_date"] = iso8601.parse_date(serialized_date)
    return NugetListerState(**fields)
def state_to_dict(self, state: NugetListerState) -> Dict[str, Any]:
    """Serialize ``state`` to a plain dict.

    Args:
        state: the lister state to serialize.

    Returns:
        A dict with the single key ``last_listing_date``, holding the
        ``isoformat()`` string of the date, or ``None`` when no date is set.
    """
    listing_date = state.last_listing_date
    serialized: Optional[str] = (
        None if listing_date is None else listing_date.isoformat()
    )
    return {"last_listing_date": serialized}
def get_pages(self) -> Iterator[NugetListerPage]:
"""Yield an iterator which returns 'page'
@ -48,21 +74,33 @@ class NugetLister(StatelessLister[NugetListerPage]):
"""
index_response = self.http_request(url=self.url)
index = index_response.json()
assert "items" in index
assert "commitTimeStamp" in index
self.listing_date = iso8601.parse_date(index["commitTimeStamp"])
assert "items" in index
for page in index["items"]:
assert page["@id"]
try:
page_response = self.http_request(url=page["@id"])
page_data = page_response.json()
assert "items" in page_data
yield page_data["items"]
except HTTPError:
logger.warning(
"Failed to fetch page %s, skipping it from listing.",
page["@id"],
)
continue
assert page["commitTimeStamp"]
commit_timestamp = iso8601.parse_date(page["commitTimeStamp"])
if (
not self.state.last_listing_date
or commit_timestamp > self.state.last_listing_date
):
try:
page_response = self.http_request(url=page["@id"])
page_data = page_response.json()
assert "items" in page_data
yield page_data["items"]
except HTTPError:
logger.warning(
"Failed to fetch page %s, skipping it from listing.",
page["@id"],
)
continue
def get_origins_from_page(self, page: NugetListerPage) -> Iterator[ListedOrigin]:
"""Iterate on all pages and yield ListedOrigin instances.
@ -91,6 +129,7 @@ class NugetLister(StatelessLister[NugetListerPage]):
f"https://api.nuget.org/v3-flatcontainer/{pkgname.lower()}/"
f"{data['version'].lower()}/{pkgname.lower()}.nuspec"
)
try:
res_metadata = self.http_request(url=nuspec_url)
except HTTPError:
@ -104,11 +143,16 @@ class NugetLister(StatelessLister[NugetListerPage]):
if repo and "url" in repo.attrs and "type" in repo.attrs:
vcs_url = repo.attrs["url"]
vcs_type = repo.attrs["type"]
last_update = iso8601.parse_date(elt["commitTimeStamp"])
yield ListedOrigin(
lister_id=self.lister_obj.id,
visit_type=vcs_type,
url=vcs_url,
last_update=None,
last_update=last_update,
)
else:
continue
def finalize(self) -> None:
    """Record the catalog index timestamp of this run as the new cursor.

    ``self.listing_date`` is filled in by ``get_pages`` from the index
    ``commitTimeStamp``. When it was never set, the previously stored
    ``last_listing_date`` is left untouched: overwriting it with ``None``
    would silently turn the next run into a full listing.
    """
    if self.listing_date is None:
        # Nothing was read from the index, so there is no new cursor to save.
        return
    self.state.last_listing_date = self.listing_date
    self.updated = True

View file

@ -0,0 +1,25 @@
<?xml version="1.0" encoding="utf-8"?>
<package xmlns="http://schemas.microsoft.com/packaging/2013/05/nuspec.xsd">
<metadata>
<id>Moq.AutoMock</id>
<version>3.5.0-ci0287</version>
<authors>Tim Kellogg, Adam Hewitt, Kevin Bost</authors>
<license type="file">LICENSE</license>
<licenseUrl>https://aka.ms/deprecateLicenseUrl</licenseUrl>
<projectUrl>https://github.com/moq/Moq.AutoMocker</projectUrl>
<description>An auto-mocking container that generates mocks using Moq</description>
<copyright>Copyright Tim Kellogg 2022</copyright>
<repository type="git" url="https://github.com/moq/Moq.AutoMocker" commit="5a8b5ab20a68dd549428a602e4c7e81434f3a906" />
<dependencies>
<group targetFramework=".NETFramework4.6.1">
<dependency id="Moq" version="4.18.2" exclude="Build,Analyzers" />
<dependency id="NonBlocking" version="2.1.0" exclude="Build,Analyzers" />
<dependency id="System.ValueTuple" version="4.5.0" exclude="Build,Analyzers" />
</group>
<group targetFramework=".NETStandard2.0">
<dependency id="Moq" version="4.18.2" exclude="Build,Analyzers" />
<dependency id="NonBlocking" version="2.1.0" exclude="Build,Analyzers" />
</group>
</dependencies>
</metadata>
</package>

View file

@ -0,0 +1,187 @@
{
"@id": "https://api.nuget.org/v3/catalog0/data/2022.10.10.04.04.00/moq.automock.3.5.0-ci0287.json",
"@type": [
"PackageDetails",
"catalog:Permalink"
],
"authors": "Tim Kellogg, Adam Hewitt, Kevin Bost",
"catalog:commitId": "de4b22b8-397b-4fa1-a160-db3a7c5b17cd",
"catalog:commitTimeStamp": "2022-10-10T04:04:00.6654802Z",
"copyright": "Copyright Tim Kellogg 2022",
"created": "2022-10-10T04:01:52.21Z",
"description": "An auto-mocking container that generates mocks using Moq",
"id": "Moq.AutoMock",
"isPrerelease": true,
"lastEdited": "2022-10-10T04:03:52.51Z",
"licenseFile": "LICENSE",
"licenseUrl": "https://aka.ms/deprecateLicenseUrl",
"listed": true,
"packageHash": "jtvxZ9lJGiNWCvKx4oZByy/knRu86ze833hZa2XvAbzYcSR3gSesdWgbGw1yNGDY0TuHobTETq/lorrtE2/pPA==",
"packageHashAlgorithm": "SHA512",
"packageSize": 70853,
"projectUrl": "https://github.com/moq/Moq.AutoMocker",
"published": "2022-10-10T04:01:52.21Z",
"repository": "",
"verbatimVersion": "3.5.0-ci0287",
"version": "3.5.0-ci0287",
"dependencyGroups": [
{
"@id": "https://api.nuget.org/v3/catalog0/data/2022.10.10.04.04.00/moq.automock.3.5.0-ci0287.json#dependencygroup/.netframework4.6.1",
"@type": "PackageDependencyGroup",
"dependencies": [
{
"@id": "https://api.nuget.org/v3/catalog0/data/2022.10.10.04.04.00/moq.automock.3.5.0-ci0287.json#dependencygroup/.netframework4.6.1/moq",
"@type": "PackageDependency",
"id": "Moq",
"range": "[4.18.2, )"
},
{
"@id": "https://api.nuget.org/v3/catalog0/data/2022.10.10.04.04.00/moq.automock.3.5.0-ci0287.json#dependencygroup/.netframework4.6.1/nonblocking",
"@type": "PackageDependency",
"id": "NonBlocking",
"range": "[2.1.0, )"
},
{
"@id": "https://api.nuget.org/v3/catalog0/data/2022.10.10.04.04.00/moq.automock.3.5.0-ci0287.json#dependencygroup/.netframework4.6.1/system.valuetuple",
"@type": "PackageDependency",
"id": "System.ValueTuple",
"range": "[4.5.0, )"
}
],
"targetFramework": ".NETFramework4.6.1"
},
{
"@id": "https://api.nuget.org/v3/catalog0/data/2022.10.10.04.04.00/moq.automock.3.5.0-ci0287.json#dependencygroup/.netstandard2.0",
"@type": "PackageDependencyGroup",
"dependencies": [
{
"@id": "https://api.nuget.org/v3/catalog0/data/2022.10.10.04.04.00/moq.automock.3.5.0-ci0287.json#dependencygroup/.netstandard2.0/moq",
"@type": "PackageDependency",
"id": "Moq",
"range": "[4.18.2, )"
},
{
"@id": "https://api.nuget.org/v3/catalog0/data/2022.10.10.04.04.00/moq.automock.3.5.0-ci0287.json#dependencygroup/.netstandard2.0/nonblocking",
"@type": "PackageDependency",
"id": "NonBlocking",
"range": "[2.1.0, )"
}
],
"targetFramework": ".NETStandard2.0"
}
],
"packageEntries": [
{
"@id": "https://api.nuget.org/v3/catalog0/data/2022.10.10.04.04.00/moq.automock.3.5.0-ci0287.json#Moq.AutoMock.nuspec",
"@type": "PackageEntry",
"compressedLength": 567,
"fullName": "Moq.AutoMock.nuspec",
"length": 1287,
"name": "Moq.AutoMock.nuspec"
},
{
"@id": "https://api.nuget.org/v3/catalog0/data/2022.10.10.04.04.00/moq.automock.3.5.0-ci0287.json#lib/net461/Moq.AutoMock.dll",
"@type": "PackageEntry",
"compressedLength": 17993,
"fullName": "lib/net461/Moq.AutoMock.dll",
"length": 41984,
"name": "Moq.AutoMock.dll"
},
{
"@id": "https://api.nuget.org/v3/catalog0/data/2022.10.10.04.04.00/moq.automock.3.5.0-ci0287.json#lib/net461/Moq.AutoMock.xml",
"@type": "PackageEntry",
"compressedLength": 5031,
"fullName": "lib/net461/Moq.AutoMock.xml",
"length": 55041,
"name": "Moq.AutoMock.xml"
},
{
"@id": "https://api.nuget.org/v3/catalog0/data/2022.10.10.04.04.00/moq.automock.3.5.0-ci0287.json#lib/netstandard2.0/Moq.AutoMock.dll",
"@type": "PackageEntry",
"compressedLength": 17927,
"fullName": "lib/netstandard2.0/Moq.AutoMock.dll",
"length": 41984,
"name": "Moq.AutoMock.dll"
},
{
"@id": "https://api.nuget.org/v3/catalog0/data/2022.10.10.04.04.00/moq.automock.3.5.0-ci0287.json#lib/netstandard2.0/Moq.AutoMock.xml",
"@type": "PackageEntry",
"compressedLength": 5031,
"fullName": "lib/netstandard2.0/Moq.AutoMock.xml",
"length": 55041,
"name": "Moq.AutoMock.xml"
},
{
"@id": "https://api.nuget.org/v3/catalog0/data/2022.10.10.04.04.00/moq.automock.3.5.0-ci0287.json#LICENSE",
"@type": "PackageEntry",
"compressedLength": 628,
"fullName": "LICENSE",
"length": 1068,
"name": "LICENSE"
},
{
"@id": "https://api.nuget.org/v3/catalog0/data/2022.10.10.04.04.00/moq.automock.3.5.0-ci0287.json#analyzers/dotnet/cs/Moq.AutoMocker.TestGenerator.dll",
"@type": "PackageEntry",
"compressedLength": 9686,
"fullName": "analyzers/dotnet/cs/Moq.AutoMocker.TestGenerator.dll",
"length": 25088,
"name": "Moq.AutoMocker.TestGenerator.dll"
},
{
"@id": "https://api.nuget.org/v3/catalog0/data/2022.10.10.04.04.00/moq.automock.3.5.0-ci0287.json#.signature.p7s",
"@type": "PackageEntry",
"compressedLength": 11534,
"fullName": ".signature.p7s",
"length": 11534,
"name": ".signature.p7s"
}
],
"@context": {
"@vocab": "http://schema.nuget.org/schema#",
"catalog": "http://schema.nuget.org/catalog#",
"xsd": "http://www.w3.org/2001/XMLSchema#",
"dependencies": {
"@id": "dependency",
"@container": "@set"
},
"dependencyGroups": {
"@id": "dependencyGroup",
"@container": "@set"
},
"packageEntries": {
"@id": "packageEntry",
"@container": "@set"
},
"packageTypes": {
"@id": "packageType",
"@container": "@set"
},
"supportedFrameworks": {
"@id": "supportedFramework",
"@container": "@set"
},
"tags": {
"@id": "tag",
"@container": "@set"
},
"vulnerabilities": {
"@id": "vulnerability",
"@container": "@set"
},
"published": {
"@type": "xsd:dateTime"
},
"created": {
"@type": "xsd:dateTime"
},
"lastEdited": {
"@type": "xsd:dateTime"
},
"catalog:commitTimeStamp": {
"@type": "xsd:dateTime"
},
"reasons": {
"@container": "@set"
}
}
}

View file

@ -0,0 +1,46 @@
{
"@id": "https://api.nuget.org/v3/catalog0/index.json",
"@type": [
"CatalogRoot",
"AppendOnlyCatalog",
"Permalink"
],
"commitId": "b5e49ade-c7b8-482a-8a9b-3aee7bed9698",
"commitTimeStamp": "2022-10-10T04:20:52.8660454Z",
"count": 16959,
"nuget:lastCreated": "2022-10-10T04:20:52.8660454Z",
"nuget:lastDeleted": "2022-10-10T04:20:52.8660454Z",
"nuget:lastEdited": "2022-10-10T04:20:52.8660454Z",
"items": [
{
"@id": "https://api.nuget.org/v3/catalog0/page17100.json",
"@type": "CatalogPage",
"commitId": "b5e49ade-c7b8-482a-8a9b-3aee7bed9698",
"commitTimeStamp": "2022-10-10T04:20:52.8660454Z",
"count": 545
}
],
"@context": {
"@vocab": "http://schema.nuget.org/catalog#",
"nuget": "http://schema.nuget.org/schema#",
"items": {
"@id": "item",
"@container": "@set"
},
"parent": {
"@type": "@id"
},
"commitTimeStamp": {
"@type": "http://www.w3.org/2001/XMLSchema#dateTime"
},
"nuget:lastCreated": {
"@type": "http://www.w3.org/2001/XMLSchema#dateTime"
},
"nuget:lastEdited": {
"@type": "http://www.w3.org/2001/XMLSchema#dateTime"
},
"nuget:lastDeleted": {
"@type": "http://www.w3.org/2001/XMLSchema#dateTime"
}
}
}

View file

@ -0,0 +1,49 @@
{
"@id": "https://api.nuget.org/v3/catalog0/page17100.json",
"@type": "CatalogPage",
"commitId": "b5e49ade-c7b8-482a-8a9b-3aee7bed9698",
"commitTimeStamp": "2022-10-10T04:20:52.8660454Z",
"count": 545,
"parent": "https://api.nuget.org/v3/catalog0/index.json",
"items": [
{
"@id": "https://api.nuget.org/v3/catalog0/data/2022.10.10.04.04.00/moq.automock.3.5.0-ci0287.json",
"@type": "nuget:PackageDetails",
"commitId": "de4b22b8-397b-4fa1-a160-db3a7c5b17cd",
"commitTimeStamp": "2022-10-10T04:04:00.6654802Z",
"nuget:id": "Moq.AutoMock",
"nuget:version": "3.5.0-ci0287"
},
{
"@id": "https://api.nuget.org/v3/catalog0/data/2022.10.10.04.20.52/alzabox.api.sdk.0.0.13.json",
"@type": "nuget:PackageDetails",
"commitId": "b5e49ade-c7b8-482a-8a9b-3aee7bed9698",
"commitTimeStamp": "2022-10-10T04:20:52.8660454Z",
"nuget:id": "Alzabox.API.SDK",
"nuget:version": "0.0.13"
}
],
"@context": {
"@vocab": "http://schema.nuget.org/catalog#",
"nuget": "http://schema.nuget.org/schema#",
"items": {
"@id": "item",
"@container": "@set"
},
"parent": {
"@type": "@id"
},
"commitTimeStamp": {
"@type": "http://www.w3.org/2001/XMLSchema#dateTime"
},
"nuget:lastCreated": {
"@type": "http://www.w3.org/2001/XMLSchema#dateTime"
},
"nuget:lastEdited": {
"@type": "http://www.w3.org/2001/XMLSchema#dateTime"
},
"nuget:lastDeleted": {
"@type": "http://www.w3.org/2001/XMLSchema#dateTime"
}
}
}

View file

@ -6,6 +6,7 @@
from swh.lister.nuget.lister import NugetLister
expected_origins = ["https://github.com/sillsdev/libpalaso.git"]
expected_origins_incremental = ["https://github.com/moq/Moq.AutoMocker"]
def test_nuget_lister(datadir, requests_mock_datadir, swh_scheduler):
@ -32,3 +33,99 @@ def test_nuget_lister(datadir, requests_mock_datadir, swh_scheduler):
)
for url in expected_origins
]
def test_nuget_lister_incremental(datadir, requests_mock_datadir_visits, swh_scheduler):
    """Run the lister twice: the second run must only list what changed since."""

    def listed_origins(lister):
        # (visit_type, url) pairs recorded in the scheduler, ordered by url.
        recorded = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
        return [
            (origin.visit_type, origin.url)
            for origin in sorted(recorded, key=lambda origin: origin.url)
        ]

    # First run
    first_lister = NugetLister(scheduler=swh_scheduler)
    assert first_lister.state.last_listing_date is None

    first_stats = first_lister.run()
    assert first_stats.pages == 2
    assert first_stats.origins == 1
    assert first_lister.state.last_listing_date
    assert listed_origins(first_lister) == [("git", url) for url in expected_origins]

    previous_date = first_lister.state.last_listing_date

    # Second run
    second_lister = NugetLister(scheduler=swh_scheduler)
    assert second_lister.state.last_listing_date == previous_date

    second_stats = second_lister.run()
    # One page and one new origin
    assert second_lister.state.last_listing_date > previous_date
    assert second_stats.pages == 1
    assert second_stats.origins == 1
    assert listed_origins(second_lister) == [
        ("git", url) for url in sorted(expected_origins + expected_origins_incremental)
    ]
def test_nuget_lister_incremental_no_changes(
    datadir, requests_mock_datadir, swh_scheduler
):
    """Run the lister twice on unchanged data: the second run lists nothing."""
    # First run
    initial = NugetLister(scheduler=swh_scheduler)
    assert initial.state.last_listing_date is None

    initial_stats = initial.run()
    assert initial_stats.pages == 2
    assert initial_stats.origins == 1
    assert initial.state.last_listing_date

    recorded = swh_scheduler.get_listed_origins(initial.lister_obj.id).results
    assert len(recorded) == len(expected_origins)
    by_url = sorted(recorded, key=lambda origin: origin.url)
    assert [(origin.visit_type, origin.url) for origin in by_url] == [
        ("git", url) for url in expected_origins
    ]

    previous_date = initial.state.last_listing_date

    # Second run
    repeat = NugetLister(scheduler=swh_scheduler)
    assert repeat.state.last_listing_date == previous_date

    repeat_stats = repeat.run()
    # Nothing new
    assert repeat.state.last_listing_date == previous_date
    assert repeat_stats.pages == 0
    assert repeat_stats.origins == 0