Make the SourceForge lister incremental

SourceForge's sitemaps (one main sitemap plus many sharded subsitemaps) give
us a "last modified" date for every subsitemap and project, allowing us to
perform an incremental listing.

We store the subsitemaps' "last modified" dates in the lister state, along
with those of the empty projects (projects without any registered VCS); the
rest comes from the already listed origins in the scheduler database.
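
For instance (a minimal sketch; `scheduler` stands for any configured
SchedulerInterface instance), an incremental run only differs from a full
run by a constructor flag:

    lister = SourceForgeLister(scheduler=scheduler, incremental=True)
    stats = lister.run()  # only changed subsitemaps and projects are re-requested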

The tests try to cover the possible cases: a subsitemap that has changed,
one that hasn't, a project that has changed, one that hasn't, and the same
for an empty project.
Raphaël Gomès 2021-04-30 21:46:29 +02:00
parent 6f8dd5d3f2
commit 3baf1d0999
3 changed files with 316 additions and 22 deletions


@@ -2,24 +2,25 @@
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from dataclasses import dataclass
from dataclasses import dataclass, field
import datetime
from enum import Enum
import logging
import re
from typing import Iterator, List, Set
from typing import Any, Dict, Iterator, List, Optional, Set, Tuple
from xml.etree import ElementTree
import iso8601
import requests
from tenacity.before_sleep import before_sleep_log
from swh.core.api.classes import stream_results
from swh.lister.utils import throttling_retry
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
from .. import USER_AGENT
from ..pattern import StatelessLister
from ..pattern import Lister
logger = logging.getLogger(__name__)
@@ -45,6 +46,30 @@ class SourceForgeListerEntry:
last_modified: datetime.date
SubSitemapNameT = str
ProjectNameT = str
# SourceForge only offers day-level granularity, which is good enough for our purposes
LastModifiedT = datetime.date
@dataclass
class SourceForgeListerState:
"""Current state of the SourceForge lister in incremental runs
"""
"""If the subsitemap does not exist, we assume a full run of this subsitemap
is needed. If the date is the same, we skip the subsitemap, otherwise we
request the subsitemap and look up every project's "last modified" date
to compare against `ListedOrigins` from the database."""
subsitemap_last_modified: Dict[SubSitemapNameT, LastModifiedT] = field(
default_factory=dict
)
"""Some projects (not the majority, but still meaningful) have no VCS for us to
archive. We need to remember a mapping of their API URL to their "last modified"
date so we don't keep querying them needlessly every time."""
empty_projects: Dict[str, LastModifiedT] = field(default_factory=dict)
SourceForgeListerPage = List[SourceForgeListerEntry]
MAIN_SITEMAP_URL = "https://sourceforge.net/allura_sitemap/sitemap.xml"
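
A minimal sketch (not part of the diff) of how this state round-trips through
the scheduler backend, mirroring `state_to_dict`/`state_from_dict` below; the
URL and date are taken from the tests:

    import datetime

    state = SourceForgeListerState(
        subsitemap_last_modified={
            "https://sourceforge.net/allura_sitemap/sitemap-0.xml": datetime.date(
                2021, 3, 18
            )
        }
    )
    # Dates are serialized to ISO strings for storage and parsed back on load
    as_dict = {k: v.isoformat() for k, v in state.subsitemap_last_modified.items()}
    assert as_dict == {
        "https://sourceforge.net/allura_sitemap/sitemap-0.xml": "2021-03-18"
    }
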
@@ -71,8 +96,11 @@ PROJ_URL_RE = re.compile(
r"^https://sourceforge.net/(?P<namespace>[^/]+)/(?P<project>[^/]+)/(?P<rest>.*)?"
)
# Mapping of `(namespace, project name)` to `last modified` date.
ProjectsLastModifiedCache = Dict[Tuple[str, str], LastModifiedT]
class SourceForgeLister(StatelessLister[SourceForgeListerPage]):
class SourceForgeLister(Lister[SourceForgeListerState, SourceForgeListerPage]):
"""List origins from the "SourceForge" forge.
"""
@@ -80,16 +108,75 @@ class SourceForgeLister(StatelessLister[SourceForgeListerPage]):
# Part of the lister API, that identifies this lister
LISTER_NAME = "sourceforge"
def __init__(self, scheduler: SchedulerInterface):
def __init__(self, scheduler: SchedulerInterface, incremental: bool = False):
super().__init__(
scheduler=scheduler, url="https://sourceforge.net", instance="main"
)
# Will hold the currently saved "last modified" dates to compare against our
# requests.
self._project_last_modified: Optional[ProjectsLastModifiedCache] = None
self.session = requests.Session()
# Declaring the USER_AGENT is more sysadmin-friendly for the forge we list
self.session.headers.update(
{"Accept": "application/json", "User-Agent": USER_AGENT}
)
self.incremental = incremental
def state_from_dict(self, d: Dict[str, Dict[str, Any]]) -> SourceForgeListerState:
subsitemaps = {
k: datetime.date.fromisoformat(v)
for k, v in d.get("subsitemap_last_modified", {}).items()
}
empty_projects = {
k: datetime.date.fromisoformat(v)
for k, v in d.get("empty_projects", {}).items()
}
return SourceForgeListerState(
subsitemap_last_modified=subsitemaps, empty_projects=empty_projects
)
def state_to_dict(self, state: SourceForgeListerState) -> Dict[str, Any]:
return {
"subsitemap_last_modified": {
k: v.isoformat() for k, v in state.subsitemap_last_modified.items()
},
"empty_projects": {
k: v.isoformat() for k, v in state.empty_projects.items()
},
}
def projects_last_modified(self) -> ProjectsLastModifiedCache:
if not self.incremental:
# No point in loading the previous results if we're doing a full run
return {}
if self._project_last_modified is not None:
return self._project_last_modified
# We know there will be at least that many origins, so use a large page size
stream = stream_results(
self.scheduler.get_listed_origins, self.lister_obj.id, limit=300_000
)
listed_origins = dict()
# Projects can have slashes in them if they're subprojects, but the
# mountpoint (last component) cannot.
url_match = re.compile(
r".*\.code\.sf\.net/(?P<namespace>[^/]+)/(?P<project>.+)/.*"
)
for origin in stream:
url = origin.url
match = url_match.match(url)
assert match is not None
matches = match.groupdict()
namespace = matches["namespace"]
project = matches["project"]
# "Last modified" dates are the same across all VCS (tools, even)
# within a project or subproject. An assertion here would be overkill.
last_modified = origin.last_update
assert last_modified is not None
listed_origins[(namespace, project)] = last_modified.date()
self._project_last_modified = listed_origins
return listed_origins
@throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
def page_request(self, url, params) -> requests.Response:
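
To illustrate the cache rebuild above, the URL pattern in isolation (a
sketch; the origin URL is one of those used in the tests):

    import re

    url_match = re.compile(
        r".*\.code\.sf\.net/(?P<namespace>[^/]+)/(?P<project>.+)/.*"
    )
    match = url_match.match("git.code.sf.net/p/mramm/git")
    assert match is not None
    # Slashes from subprojects end up in the "project" group, never the namespace
    assert match.groupdict() == {"namespace": "p", "project": "mramm"}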
@@ -126,11 +213,21 @@ class SourceForgeLister(StatelessLister[SourceForgeListerPage]):
tree = ElementTree.fromstring(sitemap_contents)
for subsitemap in tree.iterfind(f"{SITEMAP_XML_NAMESPACE}sitemap"):
# TODO use when adding incremental support
# last_modified = sub_sitemap.find(f"{SITEMAP_XML_NAMESPACE}lastmod")
last_modified_el = subsitemap.find(f"{SITEMAP_XML_NAMESPACE}lastmod")
assert last_modified_el is not None and last_modified_el.text is not None
last_modified = datetime.date.fromisoformat(last_modified_el.text)
location = subsitemap.find(f"{SITEMAP_XML_NAMESPACE}loc")
assert location is not None
assert location is not None and location.text is not None
sub_url = location.text
if self.incremental:
recorded_last_mod = self.state.subsitemap_last_modified.get(sub_url)
if recorded_last_mod == last_modified:
# The entire subsitemap hasn't changed, so none of its projects
# have either, skip it.
continue
self.state.subsitemap_last_modified[sub_url] = last_modified
subsitemap_contents = self.page_request(sub_url, {}).text
subtree = ElementTree.fromstring(subsitemap_contents)
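
For reference, the skip logic above as a self-contained sketch. The value of
`SITEMAP_XML_NAMESPACE` is assumed here to be the ElementTree-style
sitemaps.org namespace, and the XML snippet mimics the test fixtures:

    import datetime
    from xml.etree import ElementTree

    NS = "{http://www.sitemaps.org/schemas/sitemap/0.9}"  # assumed namespace value
    index = (
        '<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'
        "<sitemap>"
        "<loc>https://sourceforge.net/allura_sitemap/sitemap-0.xml</loc>"
        "<lastmod>2021-03-18</lastmod>"
        "</sitemap>"
        "</sitemapindex>"
    )
    recorded = {
        "https://sourceforge.net/allura_sitemap/sitemap-0.xml": datetime.date(2021, 3, 18)
    }
    for sub in ElementTree.fromstring(index).iterfind(f"{NS}sitemap"):
        last_modified = datetime.date.fromisoformat(sub.find(f"{NS}lastmod").text)
        sub_url = sub.find(f"{NS}loc").text
        if recorded.get(sub_url) == last_modified:
            continue  # unchanged subsitemap: none of its projects need re-listing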
@@ -151,7 +248,7 @@ class SourceForgeLister(StatelessLister[SourceForgeListerPage]):
def _get_pages_from_subsitemap(
self, subtree: ElementTree.Element
) -> Iterator[SourceForgeListerPage]:
projects: Set[str] = set()
projects: Set[ProjectNameT] = set()
for project_block in subtree.iterfind(f"{SITEMAP_XML_NAMESPACE}url"):
last_modified_block = project_block.find(f"{SITEMAP_XML_NAMESPACE}lastmod")
assert last_modified_block is not None
@@ -197,6 +294,28 @@ class SourceForgeLister(StatelessLister[SourceForgeListerPage]):
self, namespace, project, last_modified
) -> SourceForgeListerPage:
endpoint = PROJECT_API_URL_FORMAT.format(namespace=namespace, project=project)
empty_project_last_modified = self.state.empty_projects.get(endpoint)
if empty_project_last_modified is not None:
if last_modified == empty_project_last_modified.isoformat():
# Project has not changed, so is still empty, meaning it has
# no VCS attached that we can archive.
logger.debug(f"Project {namespace}/{project} is still empty")
return []
if self.incremental:
expected = self.projects_last_modified().get((namespace, project))
if expected is not None:
if expected.isoformat() == last_modified:
# Project has not changed
logger.debug(f"Project {namespace}/{project} has not changed")
return []
else:
logger.debug(f"Project {namespace}/{project} was updated")
else:
msg = "New project during an incremental run: %s/%s"
logger.debug(msg, namespace, project)
res = self.page_request(endpoint, {}).json()
tools = res.get("tools")
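
A small illustration of the date comparisons above (values from the test
data): `lastmod` arrives from the sitemap as an ISO string, while the state
and scheduler keep `datetime.date` objects, hence the `.isoformat()` calls:

    import datetime

    last_modified = "2021-02-11"           # <lastmod> text from the subsitemap
    recorded = datetime.date(2021, 2, 11)  # date kept in the state/scheduler
    if recorded.isoformat() == last_modified:
        pass  # unchanged project: skip the REST API request entirely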
@@ -221,4 +340,10 @@ class SourceForgeLister(StatelessLister[SourceForgeListerPage]):
)
hits.append(entry)
if not hits:
date = datetime.date.fromisoformat(last_modified)
self.state.empty_projects[endpoint] = date
else:
self.state.empty_projects.pop(endpoint, None)
return hits


@@ -55,4 +55,15 @@
<lastmod>2017-10-17</lastmod>
<changefreq>daily</changefreq>
</url>
<!-- Copied from subsitemap-1 to test an update to an empty project -->
<url>
<loc>https://sourceforge.net/projects/backapps/files/</loc>
<lastmod>2021-02-11</lastmod>
<changefreq>daily</changefreq>
</url>
<url>
<loc>https://sourceforge.net/p/backapps/tickets/</loc>
<lastmod>2021-02-11</lastmod>
<changefreq>daily</changefreq>
</url>
</urlset>


@@ -2,11 +2,13 @@
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import datetime
import functools
import json
from pathlib import Path
import re
from iso8601 import iso8601
import pytest
from requests.exceptions import HTTPError
@@ -15,9 +17,12 @@ from swh.lister.sourceforge.lister import (
MAIN_SITEMAP_URL,
PROJECT_API_URL_FORMAT,
SourceForgeLister,
SourceForgeListerState,
)
from swh.scheduler.model import ListedOrigin
# Mapping of project name to namespace
TEST_PROJECTS = {
"adobexmp": "adobe",
"backapps": "p",
@@ -57,6 +62,22 @@ def _check_request_headers(request):
return request.headers.get("User-Agent") == USER_AGENT
def _check_listed_origins(lister, swh_scheduler):
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
res = {o.url: (o.visit_type, str(o.last_update.date())) for o in scheduler_origins}
assert res == {
"svn.code.sf.net/p/backapps/website/code": ("svn", "2021-02-11"),
"git.code.sf.net/p/os3dmodels/git": ("git", "2017-03-31"),
"svn.code.sf.net/p/os3dmodels/svn": ("svn", "2017-03-31"),
"git.code.sf.net/p/mramm/files": ("git", "2019-04-04"),
"git.code.sf.net/p/mramm/git": ("git", "2019-04-04"),
"svn.code.sf.net/p/mramm/svn": ("svn", "2019-04-04"),
"git.code.sf.net/p/mojunk/git": ("git", "2017-12-31"),
"git.code.sf.net/p/mojunk/git2": ("git", "2017-12-31"),
"svn.code.sf.net/p/mojunk/svn": ("svn", "2017-12-31"),
}
def test_sourceforge_lister_full(swh_scheduler, requests_mock, datadir):
"""
Simulate a full listing of an artificially restricted sourceforge.
@@ -96,20 +117,157 @@ def test_sourceforge_lister_full(swh_scheduler, requests_mock, datadir):
# adobe and backapps itself have no repos.
assert stats.pages == 4
assert stats.origins == 9
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
res = {o.url: (o.visit_type, str(o.last_update.date())) for o in scheduler_origins}
assert res == {
"svn.code.sf.net/p/backapps/website/code": ("svn", "2021-02-11"),
"git.code.sf.net/p/os3dmodels/git": ("git", "2017-03-31"),
"svn.code.sf.net/p/os3dmodels/svn": ("svn", "2017-03-31"),
"git.code.sf.net/p/mramm/files": ("git", "2019-04-04"),
"git.code.sf.net/p/mramm/git": ("git", "2019-04-04"),
"svn.code.sf.net/p/mramm/svn": ("svn", "2019-04-04"),
"git.code.sf.net/p/mojunk/git": ("git", "2017-12-31"),
"git.code.sf.net/p/mojunk/git2": ("git", "2017-12-31"),
"svn.code.sf.net/p/mojunk/svn": ("svn", "2017-12-31"),
expected_state = {
"subsitemap_last_modified": {
"https://sourceforge.net/allura_sitemap/sitemap-0.xml": "2021-03-18",
"https://sourceforge.net/allura_sitemap/sitemap-1.xml": "2021-03-18",
},
"empty_projects": {
"https://sourceforge.net/rest/p/backapps": "2021-02-11",
"https://sourceforge.net/rest/adobe/adobexmp": "2017-10-17",
},
}
assert lister.state_to_dict(lister.state) == expected_state
_check_listed_origins(lister, swh_scheduler)
def test_sourceforge_lister_incremental(swh_scheduler, requests_mock, datadir, mocker):
"""
Simulate an incremental listing of an artificially restricted sourceforge.
Same dataset as the full run, because it's enough to validate the different cases.
"""
lister = SourceForgeLister(scheduler=swh_scheduler, incremental=True)
requests_mock.get(
MAIN_SITEMAP_URL,
text=get_main_sitemap(datadir),
additional_matcher=_check_request_headers,
)
def not_called(request, *args, **kwargs):
raise AssertionError(f"Should not have been called: '{request.url}'")
requests_mock.get(
"https://sourceforge.net/allura_sitemap/sitemap-0.xml",
text=get_subsitemap_0(datadir),
additional_matcher=_check_request_headers,
)
requests_mock.get(
"https://sourceforge.net/allura_sitemap/sitemap-1.xml",
text=not_called,
additional_matcher=_check_request_headers,
)
def filtered_get_project_json(request, context):
# These projects should not be requested again
assert URLS_MATCHER[request.url] not in {"adobe", "mojunk"}
return get_project_json(datadir, request, context)
requests_mock.get(
re.compile("https://sourceforge.net/rest/.*"),
json=filtered_get_project_json,
additional_matcher=_check_request_headers,
)
faked_listed_origins = [
# mramm: changed
ListedOrigin(
lister_id=lister.lister_obj.id,
visit_type="git",
url="git.code.sf.net/p/mramm/files",
last_update=iso8601.parse_date("2019-01-01"),
),
ListedOrigin(
lister_id=lister.lister_obj.id,
visit_type="git",
url="git.code.sf.net/p/mramm/git",
last_update=iso8601.parse_date("2019-01-01"),
),
ListedOrigin(
lister_id=lister.lister_obj.id,
visit_type="svn",
url="svn.code.sf.net/p/mramm/svn",
last_update=iso8601.parse_date("2019-01-01"),
),
# stayed the same, even though its subsitemap has changed
ListedOrigin(
lister_id=lister.lister_obj.id,
visit_type="git",
url="git.code.sf.net/p/os3dmodels/git",
last_update=iso8601.parse_date("2017-03-31"),
),
ListedOrigin(
lister_id=lister.lister_obj.id,
visit_type="svn",
url="svn.code.sf.net/p/os3dmodels/svn",
last_update=iso8601.parse_date("2017-03-31"),
),
# others: stayed the same, should be skipped
ListedOrigin(
lister_id=lister.lister_obj.id,
visit_type="git",
url="git.code.sf.net/p/mojunk/git",
last_update=iso8601.parse_date("2017-12-31"),
),
ListedOrigin(
lister_id=lister.lister_obj.id,
visit_type="git",
url="git.code.sf.net/p/mojunk/git2",
last_update=iso8601.parse_date("2017-12-31"),
),
ListedOrigin(
lister_id=lister.lister_obj.id,
visit_type="svn",
url="svn.code.sf.net/p/mojunk/svn",
last_update=iso8601.parse_date("2017-12-31"),
),
ListedOrigin(
lister_id=lister.lister_obj.id,
visit_type="svn",
url="svn.code.sf.net/p/backapps/website/code",
last_update=iso8601.parse_date("2021-02-11"),
),
]
swh_scheduler.record_listed_origins(faked_listed_origins)
to_date = datetime.date.fromisoformat
faked_state = SourceForgeListerState(
subsitemap_last_modified={
# changed
"https://sourceforge.net/allura_sitemap/sitemap-0.xml": to_date(
"2021-02-18"
),
# stayed the same
"https://sourceforge.net/allura_sitemap/sitemap-1.xml": to_date(
"2021-03-18"
),
},
empty_projects={
"https://sourceforge.net/rest/p/backapps": to_date("2020-02-11"),
"https://sourceforge.net/rest/adobe/adobexmp": to_date("2017-10-17"),
},
)
lister.state = faked_state
stats = lister.run()
# Only mramm (3 repos) has changed
assert stats.pages == 1
assert stats.origins == 3
expected_state = {
"subsitemap_last_modified": {
"https://sourceforge.net/allura_sitemap/sitemap-0.xml": "2021-03-18",
"https://sourceforge.net/allura_sitemap/sitemap-1.xml": "2021-03-18",
},
"empty_projects": {
"https://sourceforge.net/rest/p/backapps": "2021-02-11", # changed
"https://sourceforge.net/rest/adobe/adobexmp": "2017-10-17",
},
}
assert lister.state_to_dict(lister.state) == expected_state
# origins have been updated
_check_listed_origins(lister, swh_scheduler)
def test_sourceforge_lister_retry(swh_scheduler, requests_mock, mocker, datadir):