Add a non-incremental sourceforge lister

Following zack's work on T735, this change introduces an actual SWH lister for
SourceForge.

SourceForge provides a main sitemap that lists sharded sitemaps, which
themselves list pages. Each page belongs to a project (or sub-project,
though those are rare), information about which can be found by querying
a REST API, which gives us the list of any and all VCS used for said
project. Both sitemaps and pages have a "last modified" timestamp that
will be used in a future patch to implement incremental listing.

More precise information can be found as inline comments or docstrings.
This commit is contained in:
Raphaël Gomès 2021-03-17 17:39:41 +01:00
parent 879170a57d
commit f7b27c6930
16 changed files with 583 additions and 0 deletions

View file

@ -0,0 +1,12 @@
# Copyright (C) 2021 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
def register():
    """Describe the SourceForge lister plugin.

    Returns:
        A dict holding the lister class under ``"lister"`` and, under
        ``"task_modules"``, a one-element list naming this package's ``tasks``
        module.
    """
    # Local import: the lister module is only loaded when `register` is called.
    from .lister import SourceForgeLister

    tasks_module = f"{__name__}.tasks"
    return dict(lister=SourceForgeLister, task_modules=[tasks_module])

View file

@ -0,0 +1,224 @@
# Copyright (C) 2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from dataclasses import dataclass
import datetime
from enum import Enum
import logging
import re
from typing import Iterator, List, Set
from xml.etree import ElementTree
import iso8601
import requests
from tenacity.before_sleep import before_sleep_log
from swh.lister.utils import throttling_retry
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
from .. import USER_AGENT
from ..pattern import StatelessLister
logger = logging.getLogger(__name__)
class VcsNames(Enum):
    """Version control systems recognised among SourceForge tool names.

    The value of each member is the tool name that is matched when filtering a
    project's tools for valid VCS types.
    """

    CVS = "cvs"  # CVS projects are read-only
    GIT = "git"
    SUBVERSION = "svn"
    MERCURIAL = "hg"
    BAZAAR = "bzr"
# Plain string values of every `VcsNames` member, for quick membership tests
VCS_NAMES = {member.value for member in VcsNames}
@dataclass
class SourceForgeListerEntry:
    """One VCS repository found for a SourceForge project."""

    # Kind of VCS hosting the repository
    vcs: VcsNames
    # URL of the repository, built from `CLONE_URL_FORMAT`
    url: str
    # "Last modified" information coming from the sitemap.
    # NOTE(review): the lister stores the raw `<lastmod>` text here and only
    # parses it (with `iso8601.parse_date`) when origins are built, so this
    # annotation looks out of date - confirm whether `str` is intended.
    last_modified: datetime.date


# A lister page: all the VCS entries found for a single project
SourceForgeListerPage = List[SourceForgeListerEntry]
# Entry point of the listing: the sitemap index referencing every sub-sitemap.
MAIN_SITEMAP_URL = "https://sourceforge.net/allura_sitemap/sitemap.xml"

# XML namespace, in ElementTree's `{uri}` notation, prefixed to every sitemap tag.
SITEMAP_XML_NAMESPACE = "{http://www.sitemaps.org/schemas/sitemap/0.9}"

# REST resource endpoint giving information about one project.
#
# `namespace`: Project namespace. Very often `p`, but can be something else
#              like `adobe`.
# `project`: Project name, e.g. `seedai`. Can be a subproject,
#            e.g. `backapps/website`.
PROJECT_REST_URL_FORMAT = "https://sourceforge.net/rest/{namespace}/{project}"

# Predictable URL for cloning (in the broad sense) a VCS registered for a project.
#
# `vcs`: VCS type, one of `VCS_NAMES`.
# `namespace`: Project namespace. Very often `p`, but can be something else
#              like `adobe`.
# `project`: Project name, e.g. `seedai`. Can be a subproject,
#            e.g. `backapps/website`.
# `mount_point`: URL path used by the repo. For example, the Code::Blocks
#                project uses `git` (https://git.code.sf.net/p/codeblocks/git).
#
# NOTE(review): unlike the example above, the format carries no scheme, so the
# generated URLs look like `git.code.sf.net/p/codeblocks/git` - confirm that
# consumers of these URLs accept that.
CLONE_URL_FORMAT = "{vcs}.code.sf.net/{namespace}/{project}/{mount_point}"

# Splits a project page URL into its namespace, its project name and whatever
# follows the project name (`rest`).
PROJ_URL_RE = re.compile(
    r"^https://sourceforge.net/(?P<namespace>[^/]+)/(?P<project>[^/]+)/(?P<rest>.*)?"
)
class SourceForgeLister(StatelessLister[SourceForgeListerPage]):
    """List origins from the "SourceForge" forge.

    Every page produced by this lister holds the VCS repositories found for a
    single SourceForge project (or subproject).
    """

    # Part of the lister API, that identifies this lister
    LISTER_NAME = "sourceforge"

    def __init__(self, scheduler: SchedulerInterface):
        """Set up the lister and the HTTP session it uses for every request.

        Args:
            scheduler: scheduler interface handed over to the base lister.
        """
        super().__init__(
            scheduler=scheduler, url="https://sourceforge.net", instance="main"
        )
        self.session = requests.Session()
        # Declare the USER_AGENT is more sysadm-friendly for the forge we list
        self.session.headers.update(
            {"Accept": "application/json", "User-Agent": USER_AGENT}
        )

    @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
    def page_request(self, url, params) -> requests.Response:
        """Issue a GET request on `url` through the lister's session.

        Args:
            url: URL to fetch.
            params: query parameters passed along with the request.

        Returns:
            The HTTP response. Any status other than 200 is logged as a
            warning, even when it does not raise.

        Raises:
            requests.HTTPError: when `raise_for_status` rejects the response
                status.
        """
        # Log listed URL to ease debugging
        logger.debug("Fetching URL %s with params %s", url, params)
        response = self.session.get(url, params=params)

        if response.status_code != 200:
            # Log response content to ease debugging
            logger.warning(
                "Unexpected HTTP status code %s on %s: %s",
                response.status_code,
                response.url,
                response.content,
            )
        # The lister must fail on blocking errors
        response.raise_for_status()

        return response

    def get_pages(self) -> Iterator[SourceForgeListerPage]:
        """
        SourceForge has a main XML sitemap that lists its sharded sitemaps for all
        projects.
        Each XML sub-sitemap lists project pages, which are not unique per project: a
        project can have a wiki, a home, a git, an svn, etc.
        For each unique project, we query a REST endpoint that lists (among
        other things) the tools associated with said project, some of which are
        the VCS used. Subprojects are considered separate projects.
        Lastly we use the information of which VCS are used to build the predictable
        clone URL for any given VCS.
        """
        sitemap_contents = self.page_request(MAIN_SITEMAP_URL, {}).text
        tree = ElementTree.fromstring(sitemap_contents)

        for subsitemap in tree.iterfind(f"{SITEMAP_XML_NAMESPACE}sitemap"):
            # TODO use when adding incremental support
            # last_modified = subsitemap.find(f"{SITEMAP_XML_NAMESPACE}lastmod")
            location = subsitemap.find(f"{SITEMAP_XML_NAMESPACE}loc")
            assert location is not None
            sub_url = location.text
            # Same sanity check as the one done on project URLs
            assert sub_url is not None
            subsitemap_contents = self.page_request(sub_url, {}).text
            subtree = ElementTree.fromstring(subsitemap_contents)

            yield from self._get_pages_from_subsitemap(subtree)

    def get_origins_from_page(
        self, page: SourceForgeListerPage
    ) -> Iterator[ListedOrigin]:
        """Turn every entry of `page` into a `ListedOrigin`.

        The visit type is the value of the entry's VCS, and the entry's
        `last_modified` text is parsed with `iso8601.parse_date` to fill in
        `last_update`.
        """
        assert self.lister_obj.id is not None
        for hit in page:
            yield ListedOrigin(
                lister_id=self.lister_obj.id,
                visit_type=hit.vcs.value,
                url=hit.url,
                last_update=iso8601.parse_date(hit.last_modified),
            )

    def _get_pages_from_subsitemap(
        self, subtree: ElementTree.Element
    ) -> Iterator[SourceForgeListerPage]:
        """Yield one page per project listed in a sub-sitemap that has a VCS.

        A sub-sitemap lists several URLs for the same project; only the first
        one met for a given namespace and project triggers a REST query.

        Args:
            subtree: root element of a parsed sub-sitemap.
        """
        # Keyed on "<namespace>/<project>": nothing here guarantees that a
        # project name is unique across namespaces, so the name alone could
        # make us skip a distinct project.
        seen_projects: Set[str] = set()

        for project_block in subtree.iterfind(f"{SITEMAP_XML_NAMESPACE}url"):
            last_modified_block = project_block.find(f"{SITEMAP_XML_NAMESPACE}lastmod")
            assert last_modified_block is not None
            last_modified = last_modified_block.text
            location = project_block.find(f"{SITEMAP_XML_NAMESPACE}loc")
            assert location is not None
            project_url = location.text
            assert project_url is not None

            match = PROJ_URL_RE.match(project_url)
            if not match:
                # Should always match, let's log it
                msg = "Project URL '%s' does not match expected pattern"
                logger.warning(msg, project_url)
                continue

            matches = match.groupdict()
            namespace = matches["namespace"]
            if namespace == "projects":
                # These have a `p`-namespaced counterpart, use that instead
                continue

            project = matches["project"]
            rest = matches["rest"]
            if rest.count("/") > 1:
                # This is a subproject. There exists no sub-subprojects.
                subproject_name = rest.rsplit("/", 2)[0]
                project = f"{project}/{subproject_name}"

            project_key = f"{namespace}/{project}"
            if project_key in seen_projects:
                # Already seen
                continue
            seen_projects.add(project_key)

            pages = self._get_pages_for_project(namespace, project, last_modified)
            if pages:
                yield pages
            else:
                logger.debug("Project '%s' does not have any VCS", project)

    def _get_pages_for_project(
        self, namespace, project, last_modified
    ) -> SourceForgeListerPage:
        """Query the REST endpoint of a project and list its VCS repositories.

        Args:
            namespace: namespace of the project, e.g. `p`.
            project: project name, including the subproject part if any.
            last_modified: "last modified" text read from the sitemap, stored
                as-is in every returned entry.

        Returns:
            One entry per tool of the project whose name is a known VCS; an
            empty list when the response has no `tools` key or no VCS tool.
        """
        endpoint = PROJECT_REST_URL_FORMAT.format(namespace=namespace, project=project)
        res = self.page_request(endpoint, {}).json()

        tools = res.get("tools")
        if tools is None:
            # This probably never happens
            logger.warning("Project '%s' does not have any tools", endpoint)
            return []

        hits = []
        for tool in tools:
            tool_name = tool["name"]
            if tool_name not in VCS_NAMES:
                # Wikis, trackers, etc. are of no interest here
                continue
            url = CLONE_URL_FORMAT.format(
                vcs=tool_name,
                namespace=namespace,
                project=project,
                mount_point=tool["mount_point"],
            )
            entry = SourceForgeListerEntry(
                vcs=VcsNames(tool_name), url=url, last_modified=last_modified
            )
            hits.append(entry)

        return hits

View file

@ -0,0 +1,20 @@
# Copyright (C) 2019-2021 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from typing import Dict
from celery import shared_task
from swh.lister.sourceforge.lister import SourceForgeLister
@shared_task(name=f"{__name__}.FullSourceForgeLister")
def list_sourceforge_full() -> Dict[str, int]:
    """Full update of a SourceForge instance"""
    lister = SourceForgeLister.from_configfile()
    stats = lister.run()
    return stats.dict()
@shared_task(name=f"{__name__}.ping")
def _ping():
    """Trivial task that always answers "OK"."""
    return "OK"

View file

View file

@ -0,0 +1 @@
{"shortname": "adobexmp", "name": "Extensible Metadata Platform (XMP)", "_id": "4bfa89ecb9363c60a200004f", "url": "https://sourceforge.net/adobe/adobexmp/", "private": false, "short_description": "Adobe's Extensible Metadata Platform (XMP) is a labeling technology that allows you to embed data about a file, known as metadata, into the file itself. ", "creation_date": "2010-05-24", "summary": "", "external_homepage": "", "video_url": "", "socialnetworks": [], "status": "active", "moved_to_url": "", "preferred_support_tool": "", "preferred_support_url": "", "developers": [{"username": "stefanmakswit", "name": "Stefan Makswit", "url": "https://sourceforge.net/u/stefanmakswit/"}, {"username": "samymakki", "name": "Samy Makki", "url": "https://sourceforge.net/u/samymakki/"}, {"username": "frankbiederich", "name": "Frank Biederich", "url": "https://sourceforge.net/u/frankbiederich/"}, {"username": "adobeadmin", "name": "Adobe Admin", "url": "https://sourceforge.net/u/adobeadmin/"}, {"username": "n_oostendorp", "name": "Nathan Oostendorp", "url": "https://sourceforge.net/u/n_oostendorp/"}, {"username": "joergehrlich", "name": "J\u00f6rg Ehrlich", "url": "https://sourceforge.net/u/joergehrlich/"}], "tools": [{"name": "discussion", "mount_point": "discussion", "url": "/adobe/adobexmp/discussion/", "icons": {"24": "images/forums_24.png", "32": "images/forums_32.png", "48": "images/forums_48.png"}, "installable": true, "tool_label": "Discussion", "mount_label": "Discussion"}, {"name": "wiki", "mount_point": "wiki", "url": "/adobe/adobexmp/wiki/", "icons": {"24": "images/wiki_24.png", "32": "images/wiki_32.png", "48": "images/wiki_48.png"}, "installable": true, "tool_label": "Wiki", "mount_label": "Wiki"}, {"name": "wiki", "mount_point": "home", "url": "/adobe/adobexmp/home/", "icons": {"24": "images/wiki_24.png", "32": "images/wiki_32.png", "48": "images/wiki_48.png"}, "installable": true, "tool_label": "Wiki", "mount_label": "Home"}], "labels": ["xmp", "metadata", "adobe"], 
"categories": {"audience": [], "developmentstatus": [], "environment": [], "language": [], "license": [], "translation": [], "os": [], "database": [], "topic": []}, "icon_url": "https://sourceforge.net/adobe/adobexmp/icon", "screenshots": []}

View file

@ -0,0 +1 @@
{"shortname": "backapps/website", "name": "BackApps website", "_id": "4e5b4c310594ca11c1000f67", "url": "https://sourceforge.net/p/backapps/website/", "private": true, "short_description": "BackApps website is the front end of the BackApps service that supplies information for mobile application developers of pros and cons of BackApps service and how to use it. ", "creation_date": "2011-08-29", "summary": "", "external_homepage": "www.backapps.com", "video_url": "", "socialnetworks": [], "status": "active", "moved_to_url": "", "preferred_support_tool": "_url", "preferred_support_url": "www.backapps.com", "developers": [{"username": "shaiamar", "name": "Shai", "url": "https://sourceforge.net/u/shaiamar/"}], "tools": [{"name": "svn", "mount_point": "code", "url": "/p/backapps/website/code/", "icons": {"24": "images/code_24.png", "32": "images/code_32.png", "48": "images/code_48.png"}, "installable": true, "tool_label": "SVN", "mount_label": "Code"}, {"name": "wiki", "mount_point": "home", "url": "/p/backapps/website/home/", "icons": {"24": "images/wiki_24.png", "32": "images/wiki_32.png", "48": "images/wiki_48.png"}, "installable": true, "tool_label": "Wiki", "mount_label": "Home"}, {"name": "reviews", "mount_point": "reviews", "url": "/p/backapps/website/reviews/", "icons": {"24": "images/sftheme/24x24/blog_24.png", "32": "images/sftheme/32x32/blog_32.png", "48": "images/sftheme/48x48/blog_48.png"}, "installable": false, "tool_label": "Reviews", "mount_label": "Reviews"}, {"name": "summary", "mount_point": "summary", "url": "/p/backapps/website/summary/", "icons": {"24": "images/sftheme/24x24/blog_24.png", "32": "images/sftheme/32x32/blog_32.png", "48": "images/sftheme/48x48/blog_48.png"}, "installable": false, "tool_label": "Summary", "mount_label": "Summary", "sourceforge_group_id": 586632}, {"name": "support", "mount_point": "support", "url": "/p/backapps/website/support/", "icons": {"24": "images/sftheme/24x24/blog_24.png", "32": 
"images/sftheme/32x32/blog_32.png", "48": "images/sftheme/48x48/blog_48.png"}, "installable": false, "tool_label": "Support", "mount_label": "Support"}, {"name": "files", "mount_point": "files", "url": "/p/backapps/website/files/", "icons": {"24": "images/downloads_24.png", "32": "images/downloads_32.png", "48": "images/downloads_48.png"}, "installable": false, "tool_label": "Files", "mount_label": "Files"}, {"name": "activity", "mount_point": "activity", "url": "/p/backapps/website/activity/", "icons": {"24": "images/admin_24.png", "32": "images/admin_32.png", "48": "images/admin_48.png"}, "installable": false, "tool_label": "Tool", "mount_label": "Activity"}], "labels": ["backapps", "mobile", "data sharing", "iphone", "ios", "android", "java", "server", "social", "application server", "social application server"], "categories": {"audience": [{"id": 5, "shortname": "other", "fullname": "Other Audience", "fullpath": "Intended Audience :: Other Audience"}], "developmentstatus": [{"id": 9, "shortname": "alpha", "fullname": "3 - Alpha", "fullpath": "Development Status :: 3 - Alpha"}], "environment": [{"id": 237, "shortname": "web", "fullname": "Web-based", "fullpath": "User Interface :: Web-based"}], "language": [{"id": 198, "shortname": "java", "fullname": "Java", "fullpath": "Programming Language :: Java"}], "license": [{"id": 16, "shortname": "lgpl", "fullname": "GNU Library or Lesser General Public License version 2.0 (LGPLv2)", "fullpath": "License :: OSI-Approved Open Source :: GNU Library or Lesser General Public License version 2.0 (LGPLv2)"}], "translation": [], "os": [], "database": [{"id": 502, "shortname": "db_api_jdbc", "fullname": "JDBC", "fullpath": "Database Environment :: Database API :: JDBC"}], "topic": [{"id": 68, "shortname": "frontends", "fullname": "Front-Ends", "fullpath": "Topic :: Database :: Front-Ends"}, {"id": 606, "shortname": "frameworks", "fullname": "Frameworks", "fullpath": "Topic :: Software Development :: Frameworks"}]}, "icon_url": 
null, "screenshots": []}

File diff suppressed because one or more lines are too long

View file

@ -0,0 +1,10 @@
<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<sitemap>
<loc>https://sourceforge.net/allura_sitemap/sitemap-0.xml</loc>
<lastmod>2021-03-18</lastmod>
</sitemap><sitemap>
<loc>https://sourceforge.net/allura_sitemap/sitemap-1.xml</loc>
<lastmod>2021-03-18</lastmod>
</sitemap>
</sitemapindex>

View file

@ -0,0 +1 @@
{"shortname": "mojunk", "name": "mojunk", "_id": "4c34ecc60594ca5c18000572", "url": "https://sourceforge.net/p/mojunk/", "private": false, "short_description": "This is a test project", "creation_date": "2010-07-07", "summary": "", "external_homepage": "", "video_url": "", "socialnetworks": [], "status": "active", "moved_to_url": "", "preferred_support_tool": "", "preferred_support_url": "", "developers": [{"username": "matthewmoore", "name": "Matthew S. Moore", "url": "https://sourceforge.net/u/matthewmoore/"}], "tools": [{"name": "svn", "mount_point": "svn", "url": "/p/mojunk/svn/", "icons": {"24": "images/code_24.png", "32": "images/code_32.png", "48": "images/code_48.png"}, "installable": true, "tool_label": "SVN", "mount_label": "Svn"}, {"name": "git", "mount_point": "git", "url": "/p/mojunk/git/", "icons": {"24": "images/code_24.png", "32": "images/code_32.png", "48": "images/code_48.png"}, "installable": true, "tool_label": "Git", "mount_label": "Git"}, {"name": "git", "mount_point": "git2", "url": "/p/mojunk/git2/", "icons": {"24": "images/code_24.png", "32": "images/code_32.png", "48": "images/code_48.png"}, "installable": true, "tool_label": "Git", "mount_label": "Git2-Label"}, {"name": "wiki", "mount_point": "home", "url": "/p/mojunk/home/", "icons": {"24": "images/wiki_24.png", "32": "images/wiki_32.png", "48": "images/wiki_48.png"}, "installable": true, "tool_label": "Wiki", "mount_label": "Home"}, {"name": "support", "mount_point": "support", "url": "/p/mojunk/support/", "icons": {"24": "images/sftheme/24x24/blog_24.png", "32": "images/sftheme/32x32/blog_32.png", "48": "images/sftheme/48x48/blog_48.png"}, "installable": false, "tool_label": "Support", "mount_label": "Support"}, {"name": "summary", "mount_point": "summary", "url": "/p/mojunk/summary/", "icons": {"24": "images/sftheme/24x24/blog_24.png", "32": "images/sftheme/32x32/blog_32.png", "48": "images/sftheme/48x48/blog_48.png"}, "installable": false, "tool_label": "Summary", "mount_label": 
"Summary", "sourceforge_group_id": 333464}, {"name": "files", "mount_point": "files", "url": "/p/mojunk/files/", "icons": {"24": "images/downloads_24.png", "32": "images/downloads_32.png", "48": "images/downloads_48.png"}, "installable": false, "tool_label": "Files", "mount_label": "Files"}, {"name": "reviews", "mount_point": "reviews", "url": "/p/mojunk/reviews/", "icons": {"24": "images/sftheme/24x24/blog_24.png", "32": "images/sftheme/32x32/blog_32.png", "48": "images/sftheme/48x48/blog_48.png"}, "installable": false, "tool_label": "Reviews", "mount_label": "Reviews"}, {"name": "activity", "mount_point": "activity", "url": "/p/mojunk/activity/", "icons": {"24": "images/admin_24.png", "32": "images/admin_32.png", "48": "images/admin_48.png"}, "installable": false, "tool_label": "Tool", "mount_label": "Activity"}], "labels": [""], "categories": {"audience": [], "developmentstatus": [], "environment": [], "language": [], "license": [], "translation": [], "os": [], "database": [], "topic": []}, "icon_url": null, "screenshots": []}

View file

@ -0,0 +1 @@
{"shortname": "mramm", "name": "mramm", "_id": "4bf5c0b51be1ce31a900028f", "url": "https://sourceforge.net/p/mramm/", "private": false, "short_description": "", "creation_date": "2010-11-10", "summary": "", "external_homepage": "", "video_url": "", "socialnetworks": [], "status": "active", "moved_to_url": "", "preferred_support_tool": "", "preferred_support_url": "", "developers": [{"username": "noostendorp", "name": "Nathan Oostendorp", "url": "https://sourceforge.net/u/noostendorp/"}, {"username": "rick446", "name": "Rick Copeland \u2615", "url": "https://sourceforge.net/u/rick446/"}, {"username": "jonathanbeard", "name": "Jonathan T. Beard", "url": "https://sourceforge.net/u/jonathanbeard/"}, {"username": "mramm", "name": "Mark Ramm", "url": "https://sourceforge.net/u/mramm/"}, {"username": "yesjustwolf", "name": "Wolf ", "url": "https://sourceforge.net/u/yesjustwolf/"}, {"username": "robinbriggs", "name": "Robin Briggs", "url": "https://sourceforge.net/u/robinbriggs/"}], "tools": [{"name": "wiki", "mount_point": "reviews", "url": "/p/mramm/reviews/", "icons": {"24": "images/wiki_24.png", "32": "images/wiki_32.png", "48": "images/wiki_48.png"}, "installable": true, "tool_label": "Wiki", "mount_label": "Reviews"}, {"name": "tickets", "mount_point": "todo", "url": "/p/mramm/todo/", "icons": {"24": "images/tickets_24.png", "32": "images/tickets_32.png", "48": "images/tickets_48.png"}, "installable": true, "tool_label": "Tickets", "mount_label": "Todo"}, {"name": "wiki", "mount_point": "notes", "url": "/p/mramm/notes/", "icons": {"24": "images/wiki_24.png", "32": "images/wiki_32.png", "48": "images/wiki_48.png"}, "installable": true, "tool_label": "Wiki", "mount_label": "Notes"}, {"name": "discussion", "mount_point": "discussion", "url": "/p/mramm/discussion/", "icons": {"24": "images/forums_24.png", "32": "images/forums_32.png", "48": "images/forums_48.png"}, "installable": true, "tool_label": "Discussion", "mount_label": "Discussion"}, {"name": "git", 
"mount_point": "files", "url": "/p/mramm/files/", "icons": {"24": "images/code_24.png", "32": "images/code_32.png", "48": "images/code_48.png"}, "installable": true, "tool_label": "Git", "mount_label": "Files"}, {"name": "svn", "mount_point": "svn", "url": "/p/mramm/svn/", "icons": {"24": "images/code_24.png", "32": "images/code_32.png", "48": "images/code_48.png"}, "installable": true, "tool_label": "SVN", "mount_label": "SVN"}, {"name": "git", "mount_point": "git", "url": "/p/mramm/git/", "icons": {"24": "images/code_24.png", "32": "images/code_32.png", "48": "images/code_48.png"}, "installable": true, "tool_label": "Git", "mount_label": "Git"}, {"name": "wiki", "mount_point": "home", "url": "/p/mramm/home/", "icons": {"24": "images/wiki_24.png", "32": "images/wiki_32.png", "48": "images/wiki_48.png"}, "installable": true, "tool_label": "Wiki", "mount_label": "Home"}, {"name": "summary", "mount_point": "summary", "url": "/p/mramm/summary/", "icons": {"24": "images/sftheme/24x24/blog_24.png", "32": "images/sftheme/32x32/blog_32.png", "48": "images/sftheme/48x48/blog_48.png"}, "installable": false, "tool_label": "Summary", "mount_label": "Summary", "sourceforge_group_id": 372420}, {"name": "support", "mount_point": "support", "url": "/p/mramm/support/", "icons": {"24": "images/sftheme/24x24/blog_24.png", "32": "images/sftheme/32x32/blog_32.png", "48": "images/sftheme/48x48/blog_48.png"}, "installable": false, "tool_label": "Support", "mount_label": "Support"}, {"name": "activity", "mount_point": "activity", "url": "/p/mramm/activity/", "icons": {"24": "images/admin_24.png", "32": "images/admin_32.png", "48": "images/admin_48.png"}, "installable": false, "tool_label": "Tool", "mount_label": "Activity"}], "labels": [""], "categories": {"audience": [], "developmentstatus": [], "environment": [], "language": [], "license": [], "translation": [], "os": [], "database": [], "topic": []}, "icon_url": null, "screenshots": []}

View file

@ -0,0 +1 @@
{"shortname": "os3dmodels", "name": "Open Source 3D Models", "_id": "4bf3fc291be1ce2f10000050", "url": "https://sourceforge.net/p/os3dmodels/", "private": false, "short_description": "This is a set of parametric 3D printable models created for the RepRap/Makerbot", "creation_date": "2010-11-10", "summary": "", "external_homepage": "", "video_url": "", "socialnetworks": [], "status": "active", "moved_to_url": "", "preferred_support_tool": "", "preferred_support_url": "", "developers": [{"username": "n_oostendorp", "name": "Nathan Oostendorp", "url": "https://sourceforge.net/u/n_oostendorp/"}], "tools": [{"name": "tickets", "mount_point": "tickets", "url": "/p/os3dmodels/tickets/", "icons": {"24": "images/tickets_24.png", "32": "images/tickets_32.png", "48": "images/tickets_48.png"}, "installable": true, "tool_label": "Tickets", "mount_label": "Tickets"}, {"name": "git", "mount_point": "git", "url": "/p/os3dmodels/git/", "icons": {"24": "images/code_24.png", "32": "images/code_32.png", "48": "images/code_48.png"}, "installable": true, "tool_label": "Git", "mount_label": "Git"}, {"name": "svn", "mount_point": "svn", "url": "/p/os3dmodels/svn/", "icons": {"24": "images/code_24.png", "32": "images/code_32.png", "48": "images/code_48.png"}, "installable": true, "tool_label": "SVN", "mount_label": "Svn"}, {"name": "wiki", "mount_point": "home", "url": "/p/os3dmodels/home/", "icons": {"24": "images/wiki_24.png", "32": "images/wiki_32.png", "48": "images/wiki_48.png"}, "installable": true, "tool_label": "Wiki", "mount_label": "Home"}, {"name": "reviews", "mount_point": "reviews", "url": "/p/os3dmodels/reviews/", "icons": {"24": "images/sftheme/24x24/blog_24.png", "32": "images/sftheme/32x32/blog_32.png", "48": "images/sftheme/48x48/blog_48.png"}, "installable": false, "tool_label": "Reviews", "mount_label": "Reviews"}, {"name": "support", "mount_point": "support", "url": "/p/os3dmodels/support/", "icons": {"24": "images/sftheme/24x24/blog_24.png", "32": 
"images/sftheme/32x32/blog_32.png", "48": "images/sftheme/48x48/blog_48.png"}, "installable": false, "tool_label": "Support", "mount_label": "Support"}, {"name": "files", "mount_point": "files", "url": "/p/os3dmodels/files/", "icons": {"24": "images/downloads_24.png", "32": "images/downloads_32.png", "48": "images/downloads_48.png"}, "installable": false, "tool_label": "Files", "mount_label": "Files"}, {"name": "summary", "mount_point": "summary", "url": "/p/os3dmodels/summary/", "icons": {"24": "images/sftheme/24x24/blog_24.png", "32": "images/sftheme/32x32/blog_32.png", "48": "images/sftheme/48x48/blog_48.png"}, "installable": false, "tool_label": "Summary", "mount_label": "Summary", "sourceforge_group_id": 372436}, {"name": "activity", "mount_point": "activity", "url": "/p/os3dmodels/activity/", "icons": {"24": "images/admin_24.png", "32": "images/admin_32.png", "48": "images/admin_48.png"}, "installable": false, "tool_label": "Tool", "mount_label": "Activity"}], "labels": ["makerbot", "reprap", "3d models"], "categories": {"audience": [], "developmentstatus": [], "environment": [], "language": [], "license": [], "translation": [], "os": [], "database": [], "topic": []}, "icon_url": null, "screenshots": []}

View file

@ -0,0 +1,58 @@
<?xml version="1.0" encoding="utf-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url>
<loc>https://sourceforge.net/projects/os3dmodels/files/</loc>
<lastmod>2017-03-31</lastmod>
<changefreq>daily</changefreq>
</url>
<url>
<loc>https://sourceforge.net/p/os3dmodels/home/</loc>
<lastmod>2017-03-31</lastmod>
<changefreq>daily</changefreq>
</url>
<url>
<loc>https://sourceforge.net/p/os3dmodels/tickets/</loc>
<lastmod>2017-03-31</lastmod>
<changefreq>daily</changefreq>
</url>
<url>
<loc>https://sourceforge.net/p/mramm/home/</loc>
<lastmod>2019-04-04</lastmod>
<changefreq>daily</changefreq>
</url>
<url>
<loc>https://sourceforge.net/p/mramm/todo/</loc>
<lastmod>2019-04-04</lastmod>
<changefreq>daily</changefreq>
</url>
<url>
<loc>https://sourceforge.net/p/mramm/notes/</loc>
<lastmod>2019-04-04</lastmod>
<changefreq>daily</changefreq>
</url>
<url>
<loc>https://sourceforge.net/p/mramm/reviews/</loc>
<lastmod>2019-04-04</lastmod>
<changefreq>daily</changefreq>
</url>
<url>
<loc>https://sourceforge.net/p/mramm/discussion/</loc>
<lastmod>2019-04-04</lastmod>
<changefreq>daily</changefreq>
</url>
<url>
<loc>https://sourceforge.net/adobe/adobexmp/home/</loc>
<lastmod>2017-10-17</lastmod>
<changefreq>daily</changefreq>
</url>
<url>
<loc>https://sourceforge.net/adobe/adobexmp/wiki/</loc>
<lastmod>2017-10-17</lastmod>
<changefreq>daily</changefreq>
</url>
<url>
<loc>https://sourceforge.net/adobe/adobexmp/discussion/</loc>
<lastmod>2017-10-17</lastmod>
<changefreq>daily</changefreq>
</url>
</urlset>

View file

@ -0,0 +1,38 @@
<?xml version="1.0" encoding="utf-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url>
<loc>https://sourceforge.net/projects/backapps/files/</loc>
<lastmod>2021-02-11</lastmod>
<changefreq>daily</changefreq>
</url>
<url>
<loc>https://sourceforge.net/p/backapps/tickets/</loc>
<lastmod>2021-02-11</lastmod>
<changefreq>daily</changefreq>
</url>
<url>
<loc>https://sourceforge.net/p/backapps/chat/</loc>
<lastmod>2021-02-11</lastmod>
<changefreq>daily</changefreq>
</url>
<url>
<loc>https://sourceforge.net/p/backapps/website/files/</loc>
<lastmod>2021-02-11</lastmod>
<changefreq>daily</changefreq>
</url>
<url>
<loc>https://sourceforge.net/p/backapps/website/tickets/</loc>
<lastmod>2021-02-11</lastmod>
<changefreq>daily</changefreq>
</url>
<url>
<loc>https://sourceforge.net/projects/mojunk/files/</loc>
<lastmod>2017-12-31</lastmod>
<changefreq>daily</changefreq>
</url>
<url>
<loc>https://sourceforge.net/p/mojunk/home/</loc>
<lastmod>2017-12-31</lastmod>
<changefreq>daily</changefreq>
</url>
</urlset>

View file

@ -0,0 +1,180 @@
# Copyright (C) 2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import functools
import json
from pathlib import Path
import re
import pytest
from requests.exceptions import HTTPError
from swh.lister import USER_AGENT
from swh.lister.sourceforge.lister import (
MAIN_SITEMAP_URL,
PROJECT_REST_URL_FORMAT,
SourceForgeLister,
)
# Namespace of every project (or subproject) used by the tests, keyed by
# project name
TEST_PROJECTS = {
    "adobexmp": "adobe",
    "backapps": "p",
    "backapps/website": "p",
    "mojunk": "p",
    "mramm": "p",
    "os3dmodels": "p",
}

# REST endpoint URL of every test project, mapped back to the project name
URLS_MATCHER = {
    PROJECT_REST_URL_FORMAT.format(namespace=ns, project=name): name
    for name, ns in TEST_PROJECTS.items()
}
def get_main_sitemap(datadir):
    """Return the text of the `main-sitemap.xml` fixture found in `datadir`."""
    fixture = Path(datadir) / "main-sitemap.xml"
    return fixture.read_text()
def get_subsitemap_0(datadir):
    """Return the text of the `subsitemap-0.xml` fixture found in `datadir`."""
    fixture = Path(datadir) / "subsitemap-0.xml"
    return fixture.read_text()
def get_subsitemap_1(datadir):
    """Return the text of the `subsitemap-1.xml` fixture found in `datadir`."""
    fixture = Path(datadir) / "subsitemap-1.xml"
    return fixture.read_text()
def get_project_json(datadir, request, context):
    """Load the JSON fixture of the project whose REST endpoint is requested.

    Meant to be used as a `json` callback for `requests_mock`, with `datadir`
    bound beforehand; `context` is not used.
    """
    url = request.url
    project = URLS_MATCHER.get(url)
    assert project is not None, f"Url '{url}' could not be matched"
    # Subproject names hold a slash, which fixture file names replace by a dash
    file_stem = project.replace("/", "-")
    fixture = Path(datadir) / f"{file_stem}.json"
    return json.loads(fixture.read_text())
def _check_request_headers(request):
    """Tell whether `request` carries the lister's `User-Agent` header."""
    user_agent = request.headers.get("User-Agent")
    return user_agent == USER_AGENT
def test_sourceforge_lister_full(swh_scheduler, requests_mock, datadir):
    """
    Simulate a full listing of an artificially restricted sourceforge.
    There are 6 different projects (counting the `backapps/website`
    subproject), spread over two sub-sitemaps, a few of which have multiple
    VCS listed, two have none, one is outside of the standard `/p/` namespace,
    some with custom mount points.
    All non-interesting but related entries have been kept.
    """
    lister = SourceForgeLister(scheduler=swh_scheduler)

    # Main sitemap first, then both of the sub-sitemaps it references
    sitemaps = {
        MAIN_SITEMAP_URL: get_main_sitemap(datadir),
        "https://sourceforge.net/allura_sitemap/sitemap-0.xml": get_subsitemap_0(
            datadir
        ),
        "https://sourceforge.net/allura_sitemap/sitemap-1.xml": get_subsitemap_1(
            datadir
        ),
    }
    for sitemap_url, sitemap_text in sitemaps.items():
        requests_mock.get(
            sitemap_url,
            text=sitemap_text,
            additional_matcher=_check_request_headers,
        )

    # Any REST query is answered with the JSON fixture of the matching project
    requests_mock.get(
        re.compile("https://sourceforge.net/rest/.*"),
        json=functools.partial(get_project_json, datadir),
        additional_matcher=_check_request_headers,
    )

    stats = lister.run()
    # - os3dmodels (2 repos),
    # - mramm (3 repos),
    # - mojunk (3 repos),
    # - backapps/website (1 repo).
    # adobe and backapps itself have no repos.
    assert stats.pages == 4
    assert stats.origins == 9

    listed = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
    found = {
        origin.url: (origin.visit_type, str(origin.last_update.date()))
        for origin in listed
    }
    assert found == {
        "svn.code.sf.net/p/backapps/website/code": ("svn", "2021-02-11"),
        "git.code.sf.net/p/os3dmodels/git": ("git", "2017-03-31"),
        "svn.code.sf.net/p/os3dmodels/svn": ("svn", "2017-03-31"),
        "git.code.sf.net/p/mramm/files": ("git", "2019-04-04"),
        "git.code.sf.net/p/mramm/git": ("git", "2019-04-04"),
        "svn.code.sf.net/p/mramm/svn": ("svn", "2019-04-04"),
        "git.code.sf.net/p/mojunk/git": ("git", "2017-12-31"),
        "git.code.sf.net/p/mojunk/git2": ("git", "2017-12-31"),
        "svn.code.sf.net/p/mojunk/svn": ("svn", "2017-12-31"),
    }
def test_sourceforge_lister_retry(swh_scheduler, requests_mock, mocker, datadir):
    """Check that the listing completes when every URL first answers HTTP 429."""
    # Exponential retries take a long time, so stub time.sleep
    sleep_mock = mocker.patch("time.sleep", return_value=None)

    lister = SourceForgeLister(scheduler=swh_scheduler)

    # For each mocked URL, the successive responses it serves
    mocked_responses = [
        (
            MAIN_SITEMAP_URL,
            [
                {"status_code": 429},
                {"status_code": 429},
                {"text": get_main_sitemap(datadir)},
            ],
        ),
        (
            # NOTE(review): the 301 looks deliberate (a non-200 answer that is
            # not an HTTP error) - confirm.
            "https://sourceforge.net/allura_sitemap/sitemap-0.xml",
            [
                {"status_code": 429},
                {"text": get_subsitemap_0(datadir), "status_code": 301},
            ],
        ),
        (
            "https://sourceforge.net/allura_sitemap/sitemap-1.xml",
            [{"status_code": 429}, {"text": get_subsitemap_1(datadir)}],
        ),
        (
            re.compile("https://sourceforge.net/rest/.*"),
            [
                {"status_code": 429},
                {"json": functools.partial(get_project_json, datadir)},
            ],
        ),
    ]
    for mocked_url, responses in mocked_responses:
        requests_mock.get(
            mocked_url, responses, additional_matcher=_check_request_headers,
        )

    stats = lister.run()
    # - os3dmodels (2 repos),
    # - mramm (3 repos),
    # - mojunk (3 repos),
    # - backapps/website (1 repo).
    # adobe and backapps itself have no repos.
    assert stats.pages == 4
    assert stats.origins == 9

    listed = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
    visit_types = {origin.url: origin.visit_type for origin in listed}
    assert visit_types == {
        "svn.code.sf.net/p/backapps/website/code": "svn",
        "git.code.sf.net/p/os3dmodels/git": "git",
        "svn.code.sf.net/p/os3dmodels/svn": "svn",
        "git.code.sf.net/p/mramm/files": "git",
        "git.code.sf.net/p/mramm/git": "git",
        "svn.code.sf.net/p/mramm/svn": "svn",
        "git.code.sf.net/p/mojunk/git": "git",
        "git.code.sf.net/p/mojunk/git2": "git",
        "svn.code.sf.net/p/mojunk/svn": "svn",
    }

    # Test `time.sleep` is called with exponential retries
    expected_sleeps = [1.0, 10.0, 1.0, 1.0]
    sleep_mock.assert_has_calls(
        [mocker.call(duration) for duration in expected_sleeps]
    )
@pytest.mark.parametrize("status_code", [500, 503, 504, 403, 404])
def test_sourceforge_lister_http_error(swh_scheduler, requests_mock, status_code):
    """Check that an error status on the main sitemap makes the listing fail."""
    requests_mock.get(MAIN_SITEMAP_URL, status_code=status_code)
    lister = SourceForgeLister(scheduler=swh_scheduler)

    with pytest.raises(HTTPError):
        lister.run()

View file

@ -0,0 +1,34 @@
# Copyright (C) 2019-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from swh.lister.pattern import ListerStats
def test_sourceforge_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
    """Check that the `ping` task can be sent and answers "OK"."""
    task_name = "swh.lister.sourceforge.tasks.ping"
    res = swh_scheduler_celery_app.send_task(task_name)
    assert res

    res.wait()
    assert res.successful()
    assert res.result == "OK"
def test_sourceforge_full_lister_task(
    swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker
):
    """Check that the full listing task runs the lister and returns its stats."""
    expected_stats = ListerStats(pages=10, origins=900)

    # The class is replaced by a mock that stands in for its own instance too
    lister_mock = mocker.patch("swh.lister.sourceforge.tasks.SourceForgeLister")
    lister_mock.from_configfile.return_value = lister_mock
    lister_mock.run.return_value = expected_stats

    res = swh_scheduler_celery_app.send_task(
        "swh.lister.sourceforge.tasks.FullSourceForgeLister"
    )
    assert res
    res.wait()
    assert res.successful()

    lister_mock.from_configfile.assert_called_once()
    lister_mock.run.assert_called_once()
    assert res.result == expected_stats.dict()