sourceforge: Fix origin URLs for CVS projects
CVS projects differ from other VCS ones: they use the rsync protocol, the list of modules needs to be fetched from an info page, and multiple origin URLs can be produced for the same project. Related to T3789
This commit is contained in:
parent
4265e5dd77
commit
6a7479553e
5 changed files with 340 additions and 20 deletions
|
@ -10,6 +10,7 @@ import re
|
|||
from typing import Any, Dict, Iterator, List, Optional, Set, Tuple
|
||||
from xml.etree import ElementTree
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
import iso8601
|
||||
import requests
|
||||
from tenacity.before_sleep import before_sleep_log
|
||||
|
@ -360,6 +361,35 @@ class SourceForgeLister(Lister[SourceForgeListerState, SourceForgeListerPage]):
|
|||
tool_name = tool["name"]
|
||||
if tool_name not in VCS_NAMES:
|
||||
continue
|
||||
if tool_name == VcsNames.CVS.value:
|
||||
# CVS projects are different from other VCS ones, they use the rsync
|
||||
# protocol, a list of modules needs to be fetched from an info page
|
||||
# and multiple origin URLs can be produced for the same project.
|
||||
cvs_info_url = f"http://{project}.cvs.sourceforge.net"
|
||||
try:
|
||||
response = self.page_request(cvs_info_url, params={})
|
||||
except requests.HTTPError:
|
||||
logger.warning(
|
||||
"CVS info page could not be fetched, skipping project '%s'",
|
||||
project,
|
||||
)
|
||||
continue
|
||||
else:
|
||||
bs = BeautifulSoup(response.text, features="html.parser")
|
||||
cvs_base_url = "rsync://a.cvs.sourceforge.net/cvsroot"
|
||||
for text in [b.text for b in bs.find_all("b")]:
|
||||
match = re.search(fr".*/cvsroot/{project} co -P (.+)", text)
|
||||
if match is not None:
|
||||
module = match.group(1)
|
||||
url = f"{cvs_base_url}/{project}/{module}"
|
||||
hits.append(
|
||||
SourceForgeListerEntry(
|
||||
vcs=VcsNames(tool_name),
|
||||
url=url,
|
||||
last_modified=last_modified,
|
||||
)
|
||||
)
|
||||
continue
|
||||
url = CLONE_URL_FORMAT.format(
|
||||
vcs=tool_name,
|
||||
namespace=namespace,
|
||||
|
|
23
swh/lister/sourceforge/tests/data/aaron.html
Normal file
23
swh/lister/sourceforge/tests/data/aaron.html
Normal file
|
@ -0,0 +1,23 @@
|
|||
<html><head>
|
||||
<meta name="generator" content="cvs-info" />
|
||||
<meta name="description" content="The world's largest development and download repository of Open Source code and applications" />
|
||||
<meta name="keywords" content="Open Source, Development, Developers, Projects, Downloads, OSTG, VA Software, SF.net, SourceForge" />
|
||||
|
||||
<title>CVS Info for project aaron</title>
|
||||
|
||||
<link rel="shortcut icon" href="https://sourceforge.net/favicon.ico" />
|
||||
|
||||
</head>
|
||||
<body>
|
||||
|
||||
<p> The aaron project's CVS data is in read-only mode, so the project may have switched over to another source-code-management system. To check, visit the <a href="https://sourceforge.net/projects/aaron">Project Summary Page for aaron</a> and see if the menubar lists a newer code repository, such as SVN or Git.
|
||||
|
||||
<p>The CVS data can be accessed as follows.
|
||||
You can run a per-module CVS checkout via pserver protocol:
|
||||
<li><b>cvs -z3 -d:pserver:anonymous@a.cvs.sourceforge.net:/cvsroot/aaron co -P aaron</b></li>
|
||||
<li><b>cvs -z3 -d:pserver:anonymous@a.cvs.sourceforge.net:/cvsroot/aaron co -P www</b></li>
|
||||
<p>You can view a list of files or copy all the CVS repository data via rsync (the 1st command lists the files, the 2nd copies):
|
||||
<li><b>rsync -a a.cvs.sourceforge.net::cvsroot/aaron/</b></li>
|
||||
<li><b>rsync -ai a.cvs.sourceforge.net::cvsroot/aaron/ /my/local/dest/dir/</b></li>
|
||||
|
||||
<p>If you are a project admin for aaron, you can request that this page redirect to another repo on your project by submitting a <a href="https://sourceforge.net/support">support request</a>.
|
236
swh/lister/sourceforge/tests/data/aaron.json
Normal file
236
swh/lister/sourceforge/tests/data/aaron.json
Normal file
|
@ -0,0 +1,236 @@
|
|||
{
|
||||
"shortname": "aaron",
|
||||
"name": "Aaron: the app, service, and net monitor",
|
||||
"_id": "5139010d5fcbc97960fd66bb",
|
||||
"url": "https://sourceforge.net/p/aaron/",
|
||||
"private": false,
|
||||
"short_description": "Aaron is an application, service, and network availability monitoring and alert daemon. Notification of unavailable services, networks, etc., levels is sent to the appropriate roles. Aaron is highly customizable enterprise class monitoring software.",
|
||||
"creation_date": "2001-06-24",
|
||||
"summary": "",
|
||||
"external_homepage": "http://aaron.sourceforge.net",
|
||||
"video_url": "",
|
||||
"socialnetworks": [],
|
||||
"status": "active",
|
||||
"moved_to_url": "",
|
||||
"preferred_support_tool": "",
|
||||
"preferred_support_url": "",
|
||||
"developers": [
|
||||
{
|
||||
"username": "kapelmeister",
|
||||
"name": "Steve Nickels",
|
||||
"url": "https://sourceforge.net/u/kapelmeister/"
|
||||
},
|
||||
{
|
||||
"username": "thetitan",
|
||||
"name": "Sean Chittenden",
|
||||
"url": "https://sourceforge.net/u/thetitan/"
|
||||
},
|
||||
{
|
||||
"username": "stwalker",
|
||||
"name": "Scott Walker",
|
||||
"url": "https://sourceforge.net/u/stwalker/"
|
||||
}
|
||||
],
|
||||
"tools": [
|
||||
{
|
||||
"name": "support",
|
||||
"mount_point": "support",
|
||||
"url": "/p/aaron/support/",
|
||||
"icons": {
|
||||
"24": "images/sftheme/24x24/blog_24.png",
|
||||
"32": "images/sftheme/32x32/blog_32.png",
|
||||
"48": "images/sftheme/48x48/blog_48.png"
|
||||
},
|
||||
"installable": false,
|
||||
"tool_label": "Support",
|
||||
"mount_label": "Support"
|
||||
},
|
||||
{
|
||||
"name": "mailman",
|
||||
"mount_point": "mailman",
|
||||
"url": "/p/aaron/mailman/",
|
||||
"icons": {
|
||||
"24": "images/forums_24.png",
|
||||
"32": "images/forums_32.png",
|
||||
"48": "images/forums_48.png"
|
||||
},
|
||||
"installable": false,
|
||||
"tool_label": "Mailing Lists",
|
||||
"mount_label": "Mailing Lists"
|
||||
},
|
||||
{
|
||||
"name": "reviews",
|
||||
"mount_point": "reviews",
|
||||
"url": "/p/aaron/reviews/",
|
||||
"icons": {
|
||||
"24": "images/sftheme/24x24/blog_24.png",
|
||||
"32": "images/sftheme/32x32/blog_32.png",
|
||||
"48": "images/sftheme/48x48/blog_48.png"
|
||||
},
|
||||
"installable": false,
|
||||
"tool_label": "Reviews",
|
||||
"mount_label": "Reviews"
|
||||
},
|
||||
{
|
||||
"name": "wiki",
|
||||
"mount_point": "wiki",
|
||||
"url": "/p/aaron/wiki/",
|
||||
"icons": {
|
||||
"24": "images/wiki_24.png",
|
||||
"32": "images/wiki_32.png",
|
||||
"48": "images/wiki_48.png"
|
||||
},
|
||||
"installable": true,
|
||||
"tool_label": "Wiki",
|
||||
"mount_label": "Wiki"
|
||||
},
|
||||
{
|
||||
"name": "summary",
|
||||
"mount_point": "summary",
|
||||
"url": "/p/aaron/summary/",
|
||||
"icons": {
|
||||
"24": "images/sftheme/24x24/blog_24.png",
|
||||
"32": "images/sftheme/32x32/blog_32.png",
|
||||
"48": "images/sftheme/48x48/blog_48.png"
|
||||
},
|
||||
"installable": false,
|
||||
"tool_label": "Summary",
|
||||
"mount_label": "Summary",
|
||||
"sourceforge_group_id": 29993
|
||||
},
|
||||
{
|
||||
"name": "files-sf",
|
||||
"mount_point": "files",
|
||||
"url": "/p/aaron/files/",
|
||||
"icons": {
|
||||
"24": "images/downloads_24.png",
|
||||
"32": "images/downloads_32.png",
|
||||
"48": "images/downloads_48.png"
|
||||
},
|
||||
"installable": false,
|
||||
"tool_label": "Files",
|
||||
"mount_label": "Files"
|
||||
},
|
||||
{
|
||||
"name": "cvs",
|
||||
"mount_point": "code",
|
||||
"url": "/p/aaron/code/",
|
||||
"icons": {
|
||||
"24": "images/code_24.png",
|
||||
"32": "images/code_32.png",
|
||||
"48": "images/code_48.png"
|
||||
},
|
||||
"installable": false,
|
||||
"tool_label": "CVS",
|
||||
"mount_label": "Code"
|
||||
},
|
||||
{
|
||||
"name": "activity",
|
||||
"mount_point": "activity",
|
||||
"url": "/p/aaron/activity/",
|
||||
"icons": {
|
||||
"24": "images/admin_24.png",
|
||||
"32": "images/admin_32.png",
|
||||
"48": "images/admin_48.png"
|
||||
},
|
||||
"installable": false,
|
||||
"tool_label": "Tool",
|
||||
"mount_label": "Activity"
|
||||
},
|
||||
{
|
||||
"name": "discussion",
|
||||
"mount_point": "discussion",
|
||||
"url": "/p/aaron/discussion/",
|
||||
"icons": {
|
||||
"24": "images/forums_24.png",
|
||||
"32": "images/forums_32.png",
|
||||
"48": "images/forums_48.png"
|
||||
},
|
||||
"installable": true,
|
||||
"tool_label": "Discussion",
|
||||
"mount_label": "Discussion"
|
||||
}
|
||||
],
|
||||
"labels": [],
|
||||
"categories": {
|
||||
"audience": [
|
||||
{
|
||||
"id": 4,
|
||||
"shortname": "sysadmins",
|
||||
"fullname": "System Administrators",
|
||||
"fullpath": "Intended Audience :: by End-User Class :: System Administrators"
|
||||
}
|
||||
],
|
||||
"developmentstatus": [
|
||||
{
|
||||
"id": 8,
|
||||
"shortname": "prealpha",
|
||||
"fullname": "2 - Pre-Alpha",
|
||||
"fullpath": "Development Status :: 2 - Pre-Alpha"
|
||||
},
|
||||
{
|
||||
"id": 7,
|
||||
"shortname": "planning",
|
||||
"fullname": "1 - Planning",
|
||||
"fullpath": "Development Status :: 1 - Planning"
|
||||
}
|
||||
],
|
||||
"environment": [
|
||||
{
|
||||
"id": 238,
|
||||
"shortname": "daemon",
|
||||
"fullname": "Non-interactive (Daemon)",
|
||||
"fullpath": "User Interface :: Non-interactive (Daemon)"
|
||||
}
|
||||
],
|
||||
"language": [
|
||||
{
|
||||
"id": 164,
|
||||
"shortname": "c",
|
||||
"fullname": "C",
|
||||
"fullpath": "Programming Language :: C"
|
||||
},
|
||||
{
|
||||
"id": 293,
|
||||
"shortname": "ruby",
|
||||
"fullname": "Ruby",
|
||||
"fullpath": "Programming Language :: Ruby"
|
||||
}
|
||||
],
|
||||
"license": [
|
||||
{
|
||||
"id": 296,
|
||||
"shortname": "apache",
|
||||
"fullname": "Apache Software License",
|
||||
"fullpath": "License :: OSI-Approved Open Source :: Apache Software License"
|
||||
}
|
||||
],
|
||||
"translation": [
|
||||
{
|
||||
"id": 275,
|
||||
"shortname": "english",
|
||||
"fullname": "English",
|
||||
"fullpath": "Translations :: English"
|
||||
}
|
||||
],
|
||||
"os": [
|
||||
{
|
||||
"id": 235,
|
||||
"shortname": "independent",
|
||||
"fullname": "OS Independent (Written in an interpreted language)",
|
||||
"fullpath": "Operating System :: Grouping and Descriptive Categories :: OS Independent (Written in an interpreted language)"
|
||||
}
|
||||
],
|
||||
"database": [],
|
||||
"topic": [
|
||||
{
|
||||
"id": 152,
|
||||
"shortname": "monitoring",
|
||||
"fullname": "Monitoring",
|
||||
"fullpath": "Topic :: System :: Networking :: Monitoring"
|
||||
}
|
||||
]
|
||||
},
|
||||
"icon_url": null,
|
||||
"screenshots": []
|
||||
}
|
|
@ -1,5 +1,20 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
|
||||
<url>
|
||||
<loc>https://sourceforge.net/projects/aaron/files/</loc>
|
||||
<lastmod>2013-03-07</lastmod>
|
||||
<changefreq>daily</changefreq>
|
||||
</url>
|
||||
<url>
|
||||
<loc>https://sourceforge.net/p/aaron/home/</loc>
|
||||
<lastmod>2013-03-07</lastmod>
|
||||
<changefreq>daily</changefreq>
|
||||
</url>
|
||||
<url>
|
||||
<loc>https://sourceforge.net/p/aaron/tickets/</loc>
|
||||
<lastmod>2013-03-07</lastmod>
|
||||
<changefreq>daily</changefreq>
|
||||
</url>
|
||||
<url>
|
||||
<loc>https://sourceforge.net/projects/os3dmodels/files/</loc>
|
||||
<lastmod>2017-03-31</lastmod>
|
||||
|
|
|
@ -26,6 +26,7 @@ from swh.lister.utils import WAIT_EXP_BASE
|
|||
from swh.scheduler.model import ListedOrigin
|
||||
|
||||
TEST_PROJECTS = {
|
||||
"aaron": "p",
|
||||
"adobexmp": "adobe",
|
||||
"backapps": "p",
|
||||
"backapps/website": "p",
|
||||
|
@ -62,6 +63,10 @@ def get_project_json(datadir, request, context):
|
|||
return json.loads(Path(datadir, f"{project}.json").read_text())
|
||||
|
||||
|
||||
def get_cvs_info_page(datadir, project="aaron"):
    """Return the text of a project's CVS info page fixture.

    Args:
        datadir: directory holding the test data files.
        project: name of the project whose ``<project>.html`` fixture is
            read; defaults to ``"aaron"``.

    Returns:
        The content of the HTML fixture, decoded as UTF-8.
    """
    # Decode explicitly: without ``encoding`` the result depends on the locale
    # of the machine running the tests.
    return Path(datadir, f"{project}.html").read_text(encoding="utf-8")
|
||||
|
||||
|
||||
def _check_request_headers(request):
    """Tell whether ``request`` carries the expected ``User-Agent`` header.

    Passed as ``additional_matcher`` when registering mocked responses, so a
    mocked URL only matches requests whose ``User-Agent`` equals ``USER_AGENT``.
    """
    sent_user_agent = request.headers.get("User-Agent")
    return sent_user_agent == USER_AGENT
|
||||
|
||||
|
@ -81,6 +86,8 @@ def _check_listed_origins(lister, swh_scheduler):
|
|||
"https://svn.code.sf.net/p/mojunk/svn": ("svn", "2017-12-31"),
|
||||
"http://hg.code.sf.net/p/random-mercurial/hg": ("hg", "2019-05-02"),
|
||||
"http://bzr-repo.bzr.sourceforge.net/bzrroot/bzr-repo": ("bzr", "2021-01-27"),
|
||||
"rsync://a.cvs.sourceforge.net/cvsroot/aaron/aaron": ("cvs", "2013-03-07"),
|
||||
"rsync://a.cvs.sourceforge.net/cvsroot/aaron/www": ("cvs", "2013-03-07"),
|
||||
}
|
||||
|
||||
|
||||
|
@ -114,6 +121,11 @@ def test_sourceforge_lister_full(swh_scheduler, requests_mock, datadir):
|
|||
json=functools.partial(get_project_json, datadir),
|
||||
additional_matcher=_check_request_headers,
|
||||
)
|
||||
requests_mock.get(
|
||||
re.compile("http://aaron.cvs.sourceforge.net/"),
|
||||
text=get_cvs_info_page(datadir),
|
||||
additional_matcher=_check_request_headers,
|
||||
)
|
||||
|
||||
stats = lister.run()
|
||||
# - os3dmodels (2 repos),
|
||||
|
@ -123,8 +135,8 @@ def test_sourceforge_lister_full(swh_scheduler, requests_mock, datadir):
|
|||
# - random-mercurial (1 repo).
|
||||
# - bzr-repo (1 repo).
|
||||
# adobe and backapps itself have no repos.
|
||||
assert stats.pages == 6
|
||||
assert stats.origins == 11
|
||||
assert stats.pages == 7
|
||||
assert stats.origins == 13
|
||||
expected_state = {
|
||||
"subsitemap_last_modified": {
|
||||
"https://sourceforge.net/allura_sitemap/sitemap-0.xml": "2021-03-18",
|
||||
|
@ -178,6 +190,12 @@ def test_sourceforge_lister_incremental(swh_scheduler, requests_mock, datadir, m
|
|||
additional_matcher=_check_request_headers,
|
||||
)
|
||||
|
||||
requests_mock.get(
|
||||
re.compile("http://aaron.cvs.sourceforge.net/"),
|
||||
text=get_cvs_info_page(datadir),
|
||||
additional_matcher=_check_request_headers,
|
||||
)
|
||||
|
||||
faked_listed_origins = [
|
||||
# mramm: changed
|
||||
ListedOrigin(
|
||||
|
@ -272,8 +290,8 @@ def test_sourceforge_lister_incremental(swh_scheduler, requests_mock, datadir, m
|
|||
|
||||
stats = lister.run()
|
||||
# - mramm (3 repos), # changed
|
||||
assert stats.pages == 1
|
||||
assert stats.origins == 3
|
||||
assert stats.pages == 2
|
||||
assert stats.origins == 5
|
||||
expected_state = {
|
||||
"subsitemap_last_modified": {
|
||||
"https://sourceforge.net/allura_sitemap/sitemap-0.xml": "2021-03-18",
|
||||
|
@ -322,6 +340,12 @@ def test_sourceforge_lister_retry(swh_scheduler, requests_mock, mocker, datadir)
|
|||
additional_matcher=_check_request_headers,
|
||||
)
|
||||
|
||||
requests_mock.get(
|
||||
re.compile("http://aaron.cvs.sourceforge.net/"),
|
||||
text=get_cvs_info_page(datadir),
|
||||
additional_matcher=_check_request_headers,
|
||||
)
|
||||
|
||||
stats = lister.run()
|
||||
# - os3dmodels (2 repos),
|
||||
# - mramm (3 repos),
|
||||
|
@ -330,23 +354,10 @@ def test_sourceforge_lister_retry(swh_scheduler, requests_mock, mocker, datadir)
|
|||
# - random-mercurial (1 repo).
|
||||
# - bzr-repo (1 repo).
|
||||
# adobe and backapps itself have no repos.
|
||||
assert stats.pages == 6
|
||||
assert stats.origins == 11
|
||||
assert stats.pages == 7
|
||||
assert stats.origins == 13
|
||||
|
||||
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
|
||||
assert {o.url: o.visit_type for o in scheduler_origins} == {
|
||||
"https://svn.code.sf.net/p/backapps/website/code": "svn",
|
||||
"https://git.code.sf.net/p/os3dmodels/git": "git",
|
||||
"https://svn.code.sf.net/p/os3dmodels/svn": "svn",
|
||||
"https://git.code.sf.net/p/mramm/files": "git",
|
||||
"https://git.code.sf.net/p/mramm/git": "git",
|
||||
"https://svn.code.sf.net/p/mramm/svn": "svn",
|
||||
"https://git.code.sf.net/p/mojunk/git": "git",
|
||||
"https://git.code.sf.net/p/mojunk/git2": "git",
|
||||
"https://svn.code.sf.net/p/mojunk/svn": "svn",
|
||||
"http://hg.code.sf.net/p/random-mercurial/hg": "hg",
|
||||
"http://bzr-repo.bzr.sourceforge.net/bzrroot/bzr-repo": "bzr",
|
||||
}
|
||||
_check_listed_origins(lister, swh_scheduler)
|
||||
|
||||
# Test `time.sleep` is called with exponential retries
|
||||
assert_sleep_calls(mocker, mocked_sleep, [1, WAIT_EXP_BASE, 1, 1])
|
||||
|
@ -408,6 +419,11 @@ def test_sourceforge_lister_project_error(
|
|||
re.compile("https://sourceforge.net/rest/p/mramm"), status_code=status_code
|
||||
)
|
||||
|
||||
# Make request to CVS info page fail
|
||||
requests_mock.get(
|
||||
re.compile("http://aaron.cvs.sourceforge.net/"), status_code=status_code
|
||||
)
|
||||
|
||||
stats = lister.run()
|
||||
# - os3dmodels (2 repos),
|
||||
# - mojunk (3 repos),
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue