sourceforge: Fix origin URLs for CVS projects

CVS projects differ from projects using other VCSs: they use the rsync
protocol, the list of modules needs to be fetched from an info page,
and multiple origin URLs can be produced for the same project.

Related to T3789
This commit is contained in:
Antoine Lambert 2022-02-15 22:16:45 +01:00
parent 4265e5dd77
commit 6a7479553e
5 changed files with 340 additions and 20 deletions

View file

@ -10,6 +10,7 @@ import re
from typing import Any, Dict, Iterator, List, Optional, Set, Tuple
from xml.etree import ElementTree
from bs4 import BeautifulSoup
import iso8601
import requests
from tenacity.before_sleep import before_sleep_log
@ -360,6 +361,35 @@ class SourceForgeLister(Lister[SourceForgeListerState, SourceForgeListerPage]):
tool_name = tool["name"]
if tool_name not in VCS_NAMES:
continue
if tool_name == VcsNames.CVS.value:
# CVS projects are different from other VCS ones, they use the rsync
# protocol, a list of modules needs to be fetched from an info page
# and multiple origin URLs can be produced for a same project.
cvs_info_url = f"http://{project}.cvs.sourceforge.net"
try:
response = self.page_request(cvs_info_url, params={})
except requests.HTTPError:
logger.warning(
"CVS info page could not be fetched, skipping project '%s'",
project,
)
continue
else:
bs = BeautifulSoup(response.text, features="html.parser")
cvs_base_url = "rsync://a.cvs.sourceforge.net/cvsroot"
for text in [b.text for b in bs.find_all("b")]:
match = re.search(fr".*/cvsroot/{project} co -P (.+)", text)
if match is not None:
module = match.group(1)
url = f"{cvs_base_url}/{project}/{module}"
hits.append(
SourceForgeListerEntry(
vcs=VcsNames(tool_name),
url=url,
last_modified=last_modified,
)
)
continue
url = CLONE_URL_FORMAT.format(
vcs=tool_name,
namespace=namespace,

View file

@ -0,0 +1,23 @@
<html><head>
<meta name="generator" content="cvs-info" />
<meta name="description" content="The world's largest development and download repository of Open Source code and applications" />
<meta name="keywords" content="Open Source, Development, Developers, Projects, Downloads, OSTG, VA Software, SF.net, SourceForge" />
<title>CVS Info for project aaron</title>
<link rel="shortcut icon" href="https://sourceforge.net/favicon.ico" />
</head>
<body>
<p> The aaron project's CVS data is in read-only mode, so the project may have switched over to another source-code-management system. To check, visit the <a href="https://sourceforge.net/projects/aaron">Project Summary Page for aaron</a> and see if the menubar lists a newer code repository, such as SVN or Git.
<p>The CVS data can be accessed as follows.
You can run a per-module CVS checkout via pserver protocol:
<li><b>cvs -z3 -d:pserver:anonymous@a.cvs.sourceforge.net:/cvsroot/aaron co -P aaron</b></li>
<li><b>cvs -z3 -d:pserver:anonymous@a.cvs.sourceforge.net:/cvsroot/aaron co -P www</b></li>
<p>You can view a list of files or copy all the CVS repository data via rsync (the 1st command lists the files, the 2nd copies):
<li><b>rsync -a a.cvs.sourceforge.net::cvsroot/aaron/</b></li>
<li><b>rsync -ai a.cvs.sourceforge.net::cvsroot/aaron/ /my/local/dest/dir/</b></li>
<p>If you are a project admin for aaron, you can request that this page redirect to another repo on your project by submitting a <a href="https://sourceforge.net/support">support request</a>.

View file

@ -0,0 +1,236 @@
{
"shortname": "aaron",
"name": "Aaron: the app, service, and net monitor",
"_id": "5139010d5fcbc97960fd66bb",
"url": "https://sourceforge.net/p/aaron/",
"private": false,
"short_description": "Aaron is an application, service, and network availability monitoring and alert daemon. Notification of unavailable services, networks, etc., levels is sent to the appropriate roles. Aaron is highly customizable enterprise class monitoring software.",
"creation_date": "2001-06-24",
"summary": "",
"external_homepage": "http://aaron.sourceforge.net",
"video_url": "",
"socialnetworks": [],
"status": "active",
"moved_to_url": "",
"preferred_support_tool": "",
"preferred_support_url": "",
"developers": [
{
"username": "kapelmeister",
"name": "Steve Nickels",
"url": "https://sourceforge.net/u/kapelmeister/"
},
{
"username": "thetitan",
"name": "Sean Chittenden",
"url": "https://sourceforge.net/u/thetitan/"
},
{
"username": "stwalker",
"name": "Scott Walker",
"url": "https://sourceforge.net/u/stwalker/"
}
],
"tools": [
{
"name": "support",
"mount_point": "support",
"url": "/p/aaron/support/",
"icons": {
"24": "images/sftheme/24x24/blog_24.png",
"32": "images/sftheme/32x32/blog_32.png",
"48": "images/sftheme/48x48/blog_48.png"
},
"installable": false,
"tool_label": "Support",
"mount_label": "Support"
},
{
"name": "mailman",
"mount_point": "mailman",
"url": "/p/aaron/mailman/",
"icons": {
"24": "images/forums_24.png",
"32": "images/forums_32.png",
"48": "images/forums_48.png"
},
"installable": false,
"tool_label": "Mailing Lists",
"mount_label": "Mailing Lists"
},
{
"name": "reviews",
"mount_point": "reviews",
"url": "/p/aaron/reviews/",
"icons": {
"24": "images/sftheme/24x24/blog_24.png",
"32": "images/sftheme/32x32/blog_32.png",
"48": "images/sftheme/48x48/blog_48.png"
},
"installable": false,
"tool_label": "Reviews",
"mount_label": "Reviews"
},
{
"name": "wiki",
"mount_point": "wiki",
"url": "/p/aaron/wiki/",
"icons": {
"24": "images/wiki_24.png",
"32": "images/wiki_32.png",
"48": "images/wiki_48.png"
},
"installable": true,
"tool_label": "Wiki",
"mount_label": "Wiki"
},
{
"name": "summary",
"mount_point": "summary",
"url": "/p/aaron/summary/",
"icons": {
"24": "images/sftheme/24x24/blog_24.png",
"32": "images/sftheme/32x32/blog_32.png",
"48": "images/sftheme/48x48/blog_48.png"
},
"installable": false,
"tool_label": "Summary",
"mount_label": "Summary",
"sourceforge_group_id": 29993
},
{
"name": "files-sf",
"mount_point": "files",
"url": "/p/aaron/files/",
"icons": {
"24": "images/downloads_24.png",
"32": "images/downloads_32.png",
"48": "images/downloads_48.png"
},
"installable": false,
"tool_label": "Files",
"mount_label": "Files"
},
{
"name": "cvs",
"mount_point": "code",
"url": "/p/aaron/code/",
"icons": {
"24": "images/code_24.png",
"32": "images/code_32.png",
"48": "images/code_48.png"
},
"installable": false,
"tool_label": "CVS",
"mount_label": "Code"
},
{
"name": "activity",
"mount_point": "activity",
"url": "/p/aaron/activity/",
"icons": {
"24": "images/admin_24.png",
"32": "images/admin_32.png",
"48": "images/admin_48.png"
},
"installable": false,
"tool_label": "Tool",
"mount_label": "Activity"
},
{
"name": "discussion",
"mount_point": "discussion",
"url": "/p/aaron/discussion/",
"icons": {
"24": "images/forums_24.png",
"32": "images/forums_32.png",
"48": "images/forums_48.png"
},
"installable": true,
"tool_label": "Discussion",
"mount_label": "Discussion"
}
],
"labels": [],
"categories": {
"audience": [
{
"id": 4,
"shortname": "sysadmins",
"fullname": "System Administrators",
"fullpath": "Intended Audience :: by End-User Class :: System Administrators"
}
],
"developmentstatus": [
{
"id": 8,
"shortname": "prealpha",
"fullname": "2 - Pre-Alpha",
"fullpath": "Development Status :: 2 - Pre-Alpha"
},
{
"id": 7,
"shortname": "planning",
"fullname": "1 - Planning",
"fullpath": "Development Status :: 1 - Planning"
}
],
"environment": [
{
"id": 238,
"shortname": "daemon",
"fullname": "Non-interactive (Daemon)",
"fullpath": "User Interface :: Non-interactive (Daemon)"
}
],
"language": [
{
"id": 164,
"shortname": "c",
"fullname": "C",
"fullpath": "Programming Language :: C"
},
{
"id": 293,
"shortname": "ruby",
"fullname": "Ruby",
"fullpath": "Programming Language :: Ruby"
}
],
"license": [
{
"id": 296,
"shortname": "apache",
"fullname": "Apache Software License",
"fullpath": "License :: OSI-Approved Open Source :: Apache Software License"
}
],
"translation": [
{
"id": 275,
"shortname": "english",
"fullname": "English",
"fullpath": "Translations :: English"
}
],
"os": [
{
"id": 235,
"shortname": "independent",
"fullname": "OS Independent (Written in an interpreted language)",
"fullpath": "Operating System :: Grouping and Descriptive Categories :: OS Independent (Written in an interpreted language)"
}
],
"database": [],
"topic": [
{
"id": 152,
"shortname": "monitoring",
"fullname": "Monitoring",
"fullpath": "Topic :: System :: Networking :: Monitoring"
}
]
},
"icon_url": null,
"screenshots": []
}

View file

@ -1,5 +1,20 @@
<?xml version="1.0" encoding="utf-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url>
<loc>https://sourceforge.net/projects/aaron/files/</loc>
<lastmod>2013-03-07</lastmod>
<changefreq>daily</changefreq>
</url>
<url>
<loc>https://sourceforge.net/p/aaron/home/</loc>
<lastmod>2013-03-07</lastmod>
<changefreq>daily</changefreq>
</url>
<url>
<loc>https://sourceforge.net/p/aaron/tickets/</loc>
<lastmod>2013-03-07</lastmod>
<changefreq>daily</changefreq>
</url>
<url>
<loc>https://sourceforge.net/projects/os3dmodels/files/</loc>
<lastmod>2017-03-31</lastmod>

View file

@ -26,6 +26,7 @@ from swh.lister.utils import WAIT_EXP_BASE
from swh.scheduler.model import ListedOrigin
TEST_PROJECTS = {
"aaron": "p",
"adobexmp": "adobe",
"backapps": "p",
"backapps/website": "p",
@ -62,6 +63,10 @@ def get_project_json(datadir, request, context):
return json.loads(Path(datadir, f"{project}.json").read_text())
def get_cvs_info_page(datadir):
    """Return the content of the CVS info page fixture stored in ``datadir``.

    The fixture is the ``aaron.html`` file; its text is what the tests
    register as the mocked response body for requests sent to
    ``http://aaron.cvs.sourceforge.net/``.

    Args:
        datadir: directory holding the test data files

    Returns:
        The whole HTML document as a string.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # locale of the machine running the tests.
    return Path(datadir, "aaron.html").read_text(encoding="utf-8")
def _check_request_headers(request):
    """Tell whether ``request`` carries the expected ``User-Agent`` header.

    Passed as ``additional_matcher`` when registering mocked URLs in the
    tests of this module.

    Args:
        request: object exposing a ``headers`` mapping

    Returns:
        True when the ``User-Agent`` header equals ``USER_AGENT``,
        False otherwise (including when the header is absent).
    """
    sent_user_agent = request.headers.get("User-Agent")
    return sent_user_agent == USER_AGENT
@ -81,6 +86,8 @@ def _check_listed_origins(lister, swh_scheduler):
"https://svn.code.sf.net/p/mojunk/svn": ("svn", "2017-12-31"),
"http://hg.code.sf.net/p/random-mercurial/hg": ("hg", "2019-05-02"),
"http://bzr-repo.bzr.sourceforge.net/bzrroot/bzr-repo": ("bzr", "2021-01-27"),
"rsync://a.cvs.sourceforge.net/cvsroot/aaron/aaron": ("cvs", "2013-03-07"),
"rsync://a.cvs.sourceforge.net/cvsroot/aaron/www": ("cvs", "2013-03-07"),
}
@ -114,6 +121,11 @@ def test_sourceforge_lister_full(swh_scheduler, requests_mock, datadir):
json=functools.partial(get_project_json, datadir),
additional_matcher=_check_request_headers,
)
requests_mock.get(
re.compile("http://aaron.cvs.sourceforge.net/"),
text=get_cvs_info_page(datadir),
additional_matcher=_check_request_headers,
)
stats = lister.run()
# - os3dmodels (2 repos),
@ -123,8 +135,8 @@ def test_sourceforge_lister_full(swh_scheduler, requests_mock, datadir):
# - random-mercurial (1 repo).
# - bzr-repo (1 repo).
# adobe and backapps itself have no repos.
assert stats.pages == 6
assert stats.origins == 11
assert stats.pages == 7
assert stats.origins == 13
expected_state = {
"subsitemap_last_modified": {
"https://sourceforge.net/allura_sitemap/sitemap-0.xml": "2021-03-18",
@ -178,6 +190,12 @@ def test_sourceforge_lister_incremental(swh_scheduler, requests_mock, datadir, m
additional_matcher=_check_request_headers,
)
requests_mock.get(
re.compile("http://aaron.cvs.sourceforge.net/"),
text=get_cvs_info_page(datadir),
additional_matcher=_check_request_headers,
)
faked_listed_origins = [
# mramm: changed
ListedOrigin(
@ -272,8 +290,8 @@ def test_sourceforge_lister_incremental(swh_scheduler, requests_mock, datadir, m
stats = lister.run()
# - mramm (3 repos), # changed
assert stats.pages == 1
assert stats.origins == 3
assert stats.pages == 2
assert stats.origins == 5
expected_state = {
"subsitemap_last_modified": {
"https://sourceforge.net/allura_sitemap/sitemap-0.xml": "2021-03-18",
@ -322,6 +340,12 @@ def test_sourceforge_lister_retry(swh_scheduler, requests_mock, mocker, datadir)
additional_matcher=_check_request_headers,
)
requests_mock.get(
re.compile("http://aaron.cvs.sourceforge.net/"),
text=get_cvs_info_page(datadir),
additional_matcher=_check_request_headers,
)
stats = lister.run()
# - os3dmodels (2 repos),
# - mramm (3 repos),
@ -330,23 +354,10 @@ def test_sourceforge_lister_retry(swh_scheduler, requests_mock, mocker, datadir)
# - random-mercurial (1 repo).
# - bzr-repo (1 repo).
# adobe and backapps itself have no repos.
assert stats.pages == 6
assert stats.origins == 11
assert stats.pages == 7
assert stats.origins == 13
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
assert {o.url: o.visit_type for o in scheduler_origins} == {
"https://svn.code.sf.net/p/backapps/website/code": "svn",
"https://git.code.sf.net/p/os3dmodels/git": "git",
"https://svn.code.sf.net/p/os3dmodels/svn": "svn",
"https://git.code.sf.net/p/mramm/files": "git",
"https://git.code.sf.net/p/mramm/git": "git",
"https://svn.code.sf.net/p/mramm/svn": "svn",
"https://git.code.sf.net/p/mojunk/git": "git",
"https://git.code.sf.net/p/mojunk/git2": "git",
"https://svn.code.sf.net/p/mojunk/svn": "svn",
"http://hg.code.sf.net/p/random-mercurial/hg": "hg",
"http://bzr-repo.bzr.sourceforge.net/bzrroot/bzr-repo": "bzr",
}
_check_listed_origins(lister, swh_scheduler)
# Test `time.sleep` is called with exponential retries
assert_sleep_calls(mocker, mocked_sleep, [1, WAIT_EXP_BASE, 1, 1])
@ -408,6 +419,11 @@ def test_sourceforge_lister_project_error(
re.compile("https://sourceforge.net/rest/p/mramm"), status_code=status_code
)
# Make request to CVS info page fail
requests_mock.get(
re.compile("http://aaron.cvs.sourceforge.net/"), status_code=status_code
)
stats = lister.run()
# - os3dmodels (2 repos),
# - mojunk (3 repos),