sourceforge: fix support for listing bzr origins

Bazaar support was removed a long time ago and predates a lot of the new
mechanisms in place in the API. Unfortunately, it looks like a lot of
the URLs are offline now, but there are still a few projects that can be
listed, and this is pretty low-effort.
This commit is contained in:
Raphaël Gomès 2022-02-14 14:52:55 +01:00
parent b7524bbae0
commit 31b4429ced
4 changed files with 100 additions and 10 deletions

View file

@ -84,6 +84,9 @@ PROJECT_API_URL_FORMAT = "https://sourceforge.net/rest/{namespace}/{project}"
# Predictable URL for cloning (in the broad sense) a VCS registered for the project.
#
# Warning: does not apply to bzr repos, and Mercurial repos are http only; see use
# of this constant below.
#
# `vcs`: VCS type, one of `VCS_NAMES`
# `namespace`: Project namespace. Very often `p`, but can be something else like
# `adobe`.
@ -170,13 +173,24 @@ class SourceForgeLister(Lister[SourceForgeListerState, SourceForgeListerPage]):
url_match = re.compile(
r".*\.code\.sf\.net/(?P<namespace>[^/]+)/(?P<project>.+)/.*"
)
# Legacy Bazaar endpoint: http://<project>.bzr.sourceforge.net/bzrroot/<repo>.
# The dots of the host name are escaped so that they only match a literal "."
# and look-alike hosts are rejected instead of being parsed as bzr origins.
bzr_url_match = re.compile(
    r"http://(?P<project>[^/]+)\.bzr\.sourceforge\.net/bzrroot/([^/]+)"
)
for origin in stream:
url = origin.url
match = url_match.match(url)
assert match is not None
matches = match.groupdict()
namespace = matches["namespace"]
project = matches["project"]
if match is None:
# Should be a bzr special endpoint
match = bzr_url_match.match(url)
assert match is not None
matches = match.groupdict()
project = matches["project"]
namespace = "p" # no special namespacing for bzr projects
else:
matches = match.groupdict()
namespace = matches["namespace"]
project = matches["project"]
# "Last modified" dates are the same across all VCS (tools, even)
# within a project or subproject. An assertion here would be overkill.
last_modified = origin.last_update
@ -356,6 +370,11 @@ class SourceForgeLister(Lister[SourceForgeListerState, SourceForgeListerPage]):
# SourceForge does not yet support anonymous HTTPS cloning for Mercurial
# See https://sourceforge.net/p/forge/feature-requests/727/
url = url.replace("https://", "http://")
if tool_name == VcsNames.BAZAAR.value:
# SourceForge has removed support for bzr and only keeps legacy projects
# around at a separate (also not https) URL. Bzr projects are very rare
# and a lot of them are 404 now.
url = f"http://{project}.bzr.sourceforge.net/bzrroot/{project}"
entry = SourceForgeListerEntry(
vcs=VcsNames(tool_name), url=url, last_modified=last_modified
)

View file

@ -0,0 +1,53 @@
{
"shortname": "bzr-repo",
"name": "Bazaar repo",
"_id": "4bf3fc291be1ce2f10000052",
"url": "https://sourceforge.net/p/bzr-repo/",
"private": false,
"short_description": "This is an example bzr project",
"creation_date": "2009-10-10",
"summary": "",
"external_homepage": "",
"video_url": "",
"socialnetworks": [],
"status": "active",
"moved_to_url": "",
"preferred_support_tool": "",
"preferred_support_url": "",
"developers": [
{
"username": "Alphare",
"name": "Raphaël Gomès",
"url": "https://sourceforge.net/u/alphare/"
}
],
"tools": [
{
"name": "bzr",
"mount_point": "bzr",
"url": "/p/bzr-repo/bazaar/",
"icons": {
"24": "images/code_24.png",
"32": "images/code_32.png",
"48": "images/code_48.png"
},
"installable": true,
"tool_label": "Bazaar",
"mount_label": "Bazaar"
}
],
"labels": [],
"categories": {
"audience": [],
"developmentstatus": [],
"environment": [],
"language": [],
"license": [],
"translation": [],
"os": [],
"database": [],
"topic": []
},
"icon_url": null,
"screenshots": []
}

View file

@ -40,4 +40,9 @@
<lastmod>2019-05-02</lastmod>
<changefreq>daily</changefreq>
</url>
<url>
<loc>https://sourceforge.net/p/bzr-repo/</loc>
<lastmod>2021-01-27</lastmod>
<changefreq>daily</changefreq>
</url>
</urlset>

View file

@ -29,6 +29,7 @@ TEST_PROJECTS = {
"adobexmp": "adobe",
"backapps": "p",
"backapps/website": "p",
"bzr-repo": "p",
"mojunk": "p",
"mramm": "p",
"os3dmodels": "p",
@ -79,6 +80,7 @@ def _check_listed_origins(lister, swh_scheduler):
"https://git.code.sf.net/p/mojunk/git2": ("git", "2017-12-31"),
"https://svn.code.sf.net/p/mojunk/svn": ("svn", "2017-12-31"),
"http://hg.code.sf.net/p/random-mercurial/hg": ("hg", "2019-05-02"),
"http://bzr-repo.bzr.sourceforge.net/bzrroot/bzr-repo": ("bzr", "2021-01-27"),
}
@ -119,9 +121,10 @@ def test_sourceforge_lister_full(swh_scheduler, requests_mock, datadir):
# - mojunk (3 repos),
# - backapps/website (1 repo),
# - random-mercurial (1 repo),
# - bzr-repo (1 repo).
# adobe and backapps itself have no repos.
assert stats.pages == 5
assert stats.origins == 10
assert stats.pages == 6
assert stats.origins == 11
expected_state = {
"subsitemap_last_modified": {
"https://sourceforge.net/allura_sitemap/sitemap-0.xml": "2021-03-18",
@ -239,6 +242,12 @@ def test_sourceforge_lister_incremental(swh_scheduler, requests_mock, datadir, m
url="http://hg.code.sf.net/p/random-mercurial/hg",
last_update=iso8601.parse_date("2019-05-02"),
),
ListedOrigin(
lister_id=lister.lister_obj.id,
visit_type="bzr",
url="http://bzr-repo.bzr.sourceforge.net/bzrroot/bzr-repo",
last_update=iso8601.parse_date("2021-01-27"),
),
]
swh_scheduler.record_listed_origins(faked_listed_origins)
@ -319,9 +328,10 @@ def test_sourceforge_lister_retry(swh_scheduler, requests_mock, mocker, datadir)
# - mojunk (3 repos),
# - backapps/website (1 repo),
# - random-mercurial (1 repo),
# - bzr-repo (1 repo).
# adobe and backapps itself have no repos.
assert stats.pages == 5
assert stats.origins == 10
assert stats.pages == 6
assert stats.origins == 11
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
assert {o.url: o.visit_type for o in scheduler_origins} == {
@ -335,6 +345,7 @@ def test_sourceforge_lister_retry(swh_scheduler, requests_mock, mocker, datadir)
"https://git.code.sf.net/p/mojunk/git2": "git",
"https://svn.code.sf.net/p/mojunk/svn": "svn",
"http://hg.code.sf.net/p/random-mercurial/hg": "hg",
"http://bzr-repo.bzr.sourceforge.net/bzrroot/bzr-repo": "bzr",
}
# Test `time.sleep` is called with exponential retries
@ -402,10 +413,11 @@ def test_sourceforge_lister_project_error(
# - mojunk (3 repos),
# - backapps/website (1 repo),
# - random-mercurial (1 repo),
# - bzr-repo (1 repo).
# adobe and backapps itself have no repos.
# Did *not* list mramm
assert stats.pages == 4
assert stats.origins == 7
assert stats.pages == 5
assert stats.origins == 8
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
res = {o.url: (o.visit_type, str(o.last_update.date())) for o in scheduler_origins}
@ -418,4 +430,5 @@ def test_sourceforge_lister_project_error(
"https://git.code.sf.net/p/mojunk/git2": ("git", "2017-12-31"),
"https://svn.code.sf.net/p/mojunk/svn": ("svn", "2017-12-31"),
"http://hg.code.sf.net/p/random-mercurial/hg": ("hg", "2019-05-02"),
"http://bzr-repo.bzr.sourceforge.net/bzrroot/bzr-repo": ("bzr", "2021-01-27"),
}