Validate origin URLs before sending to the scheduler

This commit is contained in:
Valentin Lorentz 2022-11-04 13:48:14 +01:00
parent 60707a45dd
commit 8ea4200909
8 changed files with 339 additions and 12 deletions

View file

@ -0,0 +1,30 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd" xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<modelVersion>4.0.0</modelVersion>
<groupId>al.aldi</groupId>
<artifactId>sprova4j</artifactId>
<version>0.1.0</version>
<name>sprova4j</name>
<description>Java client for Sprova Test Management</description>
<url>https://github.com/aldialimucaj/sprova4j</url>
<inceptionYear>2018</inceptionYear>
<licenses>
<license>
<name>The Apache Software License, Version 2.0</name>
<url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
<distribution>repo</distribution>
</license>
</licenses>
<developers>
<developer>
<id>aldi</id>
<name>Aldi Alimucaj</name>
<email>aldi.alimucaj@gmail.com</email>
</developer>
</developers>
<scm>
<connection>scm:git@github.com/aldialimucaj/sprova4j.git</connection>
<url>git@github.com/aldialimucaj/sprova4j</url>
</scm>
</project>

View file

@ -170,6 +170,53 @@ def test_maven_full_listing_malformed(
assert scheduler_state.last_seen_pom == -1
def test_maven_ignore_invalid_url(
swh_scheduler,
requests_mock,
datadir,
):
"""Covers full listing of multiple pages, checking that a pom whose scm entry
holds an invalid URL does not produce an origin."""
lister = MavenLister(
scheduler=swh_scheduler,
url=MVN_URL,
instance="maven.org",
index_url=INDEX_URL,
incremental=False,
)
# Set up test: URL_POM_1 is answered with the pom fixture carrying the invalid
# scm URL.
requests_mock.get(
URL_POM_1, content=Path(datadir, "sprova4j-0.1.0.invalidurl.pom").read_bytes()
)
# Then run the lister.
stats = lister.run()
# Start test checks.
assert stats.pages == 5
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
origin_urls = [origin.url for origin in scheduler_origins]
# 1 git origin (the other one is ignored) + 1 maven origin with 2 releases
# (one per jar)
assert set(origin_urls) == {ORIGIN_GIT_INCR, ORIGIN_SRC}
# No origin URL may be recorded twice.
assert len(origin_urls) == len(set(origin_urls))
for origin in scheduler_origins:
if origin.visit_type == "maven":
for src in LIST_SRC_DATA:
last_update_src = iso8601.parse_date(src["time"])
assert last_update_src <= origin.last_update
assert origin.extra_loader_arguments["artifacts"] == list(LIST_SRC_DATA)
# The lister was created with incremental=False; both state markers are
# expected to still be -1 after the run.
scheduler_state = lister.get_state_from_scheduler()
assert scheduler_state is not None
assert scheduler_state.last_seen_doc == -1
assert scheduler_state.last_seen_pom == -1
def test_maven_incremental_listing(
swh_scheduler,
requests_mock,

View file

@ -48,7 +48,7 @@ def test_mock_init_repository_update(mock_opam, tmp_path, datadir):
mock_init, mock_popen = mock_opam
instance = "fake_opam_repo"
instance_url = f"file://{datadir}/{instance}"
instance_url = f"http://example.org/{instance}"
opam_root = str(tmp_path / "test-opam")
os.makedirs(opam_root, exist_ok=True)
@ -112,8 +112,17 @@ def test_urls(swh_scheduler, mock_opam, tmp_path):
assert expected_urls == result_urls
def test_opam_binary(datadir, swh_scheduler, tmp_path):
instance_url = f"file://{datadir}/fake_opam_repo"
def test_opam_binary(datadir, swh_scheduler, tmp_path, mocker):
from swh.lister.opam.lister import opam_init
instance_url = "http://example.org/fake_opam_repo"
# Stand-in for opam_init: checks that it is called with the HTTP instance URL,
# then delegates to the real opam_init using the local fixture directory.
def mock_opam_init(opam_root, instance, url, env):
assert url == instance_url
return opam_init(opam_root, instance, f"{datadir}/fake_opam_repo", env)
# Patch opam_init to use the local directory
mocker.patch("swh.lister.opam.lister.opam_init", side_effect=mock_opam_init)
lister = OpamLister(
swh_scheduler,
@ -141,8 +150,17 @@ def test_opam_binary(datadir, swh_scheduler, tmp_path):
assert expected_urls == result_urls
def test_opam_multi_instance(datadir, swh_scheduler, tmp_path):
instance_url = f"file://{datadir}/fake_opam_repo"
def test_opam_multi_instance(datadir, swh_scheduler, tmp_path, mocker):
from swh.lister.opam.lister import opam_init
instance_url = "http://example.org/fake_opam_repo"
# Stand-in for opam_init: checks that it is called with the HTTP instance URL,
# then delegates to the real opam_init using the local fixture directory.
def mock_opam_init(opam_root, instance, url, env):
assert url == instance_url
return opam_init(opam_root, instance, f"{datadir}/fake_opam_repo", env)
# Patch opam_init to use the local directory
mocker.patch("swh.lister.opam.lister.opam_init", side_effect=mock_opam_init)
lister = OpamLister(
swh_scheduler,

View file

@ -0,0 +1,151 @@
{
"packages": {
"payrix/payrix-php": {
"dev-master": {
"name": "payrix/payrix-php",
"description": "PayrixPHP PHP SDK package",
"keywords": [],
"homepage": "https://portal.payrix.com",
"version": "dev-master",
"version_normalized": "9999999-dev",
"license": [
"Apache-2.0"
],
"authors": [],
"source": {
"url": "git@gitlab.com:payrix/public/payrix-php.git",
"type": "git",
"reference": "cf02195d3c32424396932e087824bf581966e703"
},
"dist": {
"url": "https://gitlab.com/api/v4/projects/payrix%2Fpublic%2Fpayrix-php/repository/archive.zip?sha=cf02195d3c32424396932e087824bf581966e703",
"type": "zip",
"shasum": "",
"reference": "cf02195d3c32424396932e087824bf581966e703"
},
"type": "library",
"time": "2021-05-25T14:12:28+00:00",
"autoload": {
"psr-4": {
"PayrixPHP\\": "lib/"
}
},
"default-branch": true,
"require": {
"php": ">=5.4.0",
"ext-curl": "*",
"ext-openssl": "*"
},
"uid": 4416889
},
"v2.0.0": {
"name": "payrix/payrix-php",
"description": "PayrixPHP PHP SDK package",
"keywords": [],
"homepage": "https://portal.payrix.com",
"version": "v2.0.0",
"version_normalized": "2.0.0.0",
"license": [
"Apache-2.0"
],
"authors": [],
"source": {
"url": "https://gitlab.com/payrix/public/payrix-php.git",
"type": "git",
"reference": "4b40ad457a5cdbddb384b4d8f2c62d8d8c04ce68"
},
"dist": {
"url": "https://gitlab.com/api/v4/projects/payrix%2Fpublic%2Fpayrix-php/repository/archive.zip?sha=4b40ad457a5cdbddb384b4d8f2c62d8d8c04ce68",
"type": "zip",
"shasum": "",
"reference": "4b40ad457a5cdbddb384b4d8f2c62d8d8c04ce68"
},
"type": "library",
"time": "2020-09-03T11:26:52+00:00",
"autoload": {
"psr-4": {
"PayrixPHP\\": "lib/"
}
},
"require": {
"php": ">=5.4.0",
"ext-curl": "*",
"ext-openssl": "*"
},
"uid": 4416947
},
"v2.0.1": {
"name": "payrix/payrix-php",
"description": "PayrixPHP PHP SDK package",
"keywords": [],
"homepage": "https://portal.payrix.com",
"version": "v2.0.1",
"version_normalized": "2.0.1.0",
"license": [
"Apache-2.0"
],
"authors": [],
"source": {
"url": "https://gitlab.com/payrix/public/payrix-php.git",
"type": "git",
"reference": "9693f2dff0a589e16c88a9bf838069ab89166103"
},
"dist": {
"url": "https://gitlab.com/api/v4/projects/payrix%2Fpublic%2Fpayrix-php/repository/archive.zip?sha=9693f2dff0a589e16c88a9bf838069ab89166103",
"type": "zip",
"shasum": "",
"reference": "9693f2dff0a589e16c88a9bf838069ab89166103"
},
"type": "library",
"time": "2021-05-10T02:32:57+00:00",
"autoload": {
"psr-4": {
"PayrixPHP\\": "lib/"
}
},
"require": {
"php": ">=5.4.0",
"ext-curl": "*",
"ext-openssl": "*"
},
"uid": 5183918
},
"v2.0.2": {
"name": "payrix/payrix-php",
"description": "PayrixPHP PHP SDK package",
"keywords": [],
"homepage": "https://portal.payrix.com",
"version": "v2.0.2",
"version_normalized": "2.0.2.0",
"license": [
"Apache-2.0"
],
"authors": [],
"source": {
"url": "https://gitlab.com/payrix/public/payrix-php.git",
"type": "git",
"reference": "cf02195d3c32424396932e087824bf581966e703"
},
"dist": {
"url": "https://gitlab.com/api/v4/projects/payrix%2Fpublic%2Fpayrix-php/repository/archive.zip?sha=cf02195d3c32424396932e087824bf581966e703",
"type": "zip",
"shasum": "",
"reference": "cf02195d3c32424396932e087824bf581966e703"
},
"type": "library",
"time": "2021-05-25T10:12:28+00:00",
"autoload": {
"psr-4": {
"PayrixPHP\\": "lib/"
}
},
"require": {
"php": ">=5.4.0",
"ext-curl": "*",
"ext-openssl": "*"
},
"uid": 5232658
}
}
}
}

View file

@ -0,0 +1,24 @@
{
"packages": {
"ycms/module-main": {
"dev-master": {
"name": "with/invalid_url",
"description": "",
"keywords": [],
"homepage": "",
"version": "dev-master",
"version_normalized": "9999999-dev",
"license": [],
"authors": [],
"source": {
"type": "git",
"url": "git@example.org/invalid/url.git",
"reference": "0000000000000000000000000000000000000000"
},
"time": "2015-08-23T04:42:33+00:00",
"default-branch": true,
"uid": 4064797
}
}
}
}

View file

@ -14,7 +14,9 @@ _packages_list = {
"ljjackson/linnworks",
"lky/wx_article",
"spryker-eco/computop-api",
"idevlab/essential",
"idevlab/essential", # Git SSH URL
"payrix/payrix-php",
"with/invalid_url", # invalid URL
]
}
@ -49,7 +51,7 @@ def test_packagist_lister(swh_scheduler, requests_mock, datadir, requests_mock_d
stats = lister.run()
assert stats.pages == 1
assert stats.origins == len(_packages_list["packageNames"])
assert stats.origins == len(_packages_list["packageNames"]) - 2
assert lister.updated
expected_origins = {
@ -69,9 +71,9 @@ def test_packagist_lister(swh_scheduler, requests_mock, datadir, requests_mock_d
datetime.datetime.fromisoformat("2020-06-22T15:50:29+00:00"),
),
(
"git@gitlab.com:idevlab/Essential.git", # not GitHub
"https://gitlab.com/payrix/public/payrix-php.git", # not GitHub
"git",
datetime.datetime.fromisoformat("2022-10-12T10:34:29+00:00"),
datetime.datetime.fromisoformat("2021-05-25T14:12:28+00:00"),
),
}

View file

@ -20,7 +20,7 @@ from swh.scheduler import get_scheduler, model
from swh.scheduler.interface import SchedulerInterface
from . import USER_AGENT_TEMPLATE
from .utils import http_retry
from .utils import http_retry, is_valid_origin_url
logger = logging.getLogger(__name__)
@ -277,8 +277,15 @@ class Lister(Generic[StateType, PageType]):
Returns:
the list of origin URLs recorded in scheduler database
"""
valid_origins = []
for origin in origins:
if is_valid_origin_url(origin.url):
valid_origins.append(origin)
else:
logger.warning("Skipping invalid origin: %s", origin.url)
recorded_origins = []
for batch_origins in grouper(origins, n=1000):
for batch_origins in grouper(valid_origins, n=1000):
ret = self.scheduler.record_listed_origins(batch_origins)
recorded_origins += [origin.url for origin in ret]

View file

@ -2,7 +2,8 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from typing import Callable, Iterator, Tuple
from typing import Callable, Iterator, Optional, Tuple
import urllib.parse
from requests.exceptions import ConnectionError, HTTPError
from requests.status_codes import codes
@ -111,3 +112,50 @@ def http_retry(
"""
return tenacity_retry(retry=retry, wait=wait, stop=stop, reraise=True, **retry_args)
def is_valid_origin_url(url: Optional[str]) -> bool:
    """Returns whether the given string is a valid origin URL.

    This excludes Git SSH URLs and pseudo-URLs (eg. ``ssh://git@example.org:foo``
    and ``git@example.org:foo``), as they are not supported by the Git loader
    and usually require authentication.

    Args:
        url: the candidate origin URL; may be ``None`` or empty.

    Returns:
        ``True`` if the URL is absolute (has a network location), can be parsed,
        and does not use an SSH transport scheme; ``False`` otherwise. This
        function never raises on malformed input.

    All HTTP URLs are allowed:

    >>> is_valid_origin_url("http://example.org/repo.git")
    True
    >>> is_valid_origin_url("http://example.org/repo")
    True
    >>> is_valid_origin_url("https://example.org/repo")
    True
    >>> is_valid_origin_url("https://foo:bar@example.org/repo")
    True

    Scheme-less URLs are rejected:

    >>> is_valid_origin_url("example.org/repo")
    False
    >>> is_valid_origin_url("example.org:repo")
    False

    Git SSH URLs and pseudo-URLs are rejected, including the ``git+ssh://`` and
    ``ssh+git://`` spellings of the SSH transport:

    >>> is_valid_origin_url("git@example.org:repo")
    False
    >>> is_valid_origin_url("ssh://git@example.org:repo")
    False
    >>> is_valid_origin_url("git+ssh://git@example.org/repo")
    False
    >>> is_valid_origin_url("ssh+git://git@example.org/repo")
    False

    Empty, missing and unparsable values are rejected:

    >>> is_valid_origin_url("")
    False
    >>> is_valid_origin_url(None)
    False
    >>> is_valid_origin_url("http://[example.org/repo")
    False
    """
    if not url:
        # Empty or None
        return False

    try:
        parsed = urllib.parse.urlparse(url)
    except ValueError:
        # urlparse raises on malformed network locations (eg. unbalanced
        # brackets); such a URL must be skipped, not crash the lister.
        return False

    if not parsed.netloc:
        # Is parsed as a relative URL
        return False

    # urlparse lower-cases the scheme, so a plain membership test is enough.
    if parsed.scheme in ("ssh", "git+ssh", "ssh+git"):
        # Git SSH URL
        return False

    return True