Validate origin URLs before sending to the scheduler
This commit is contained in:
parent
60707a45dd
commit
8ea4200909
8 changed files with 339 additions and 12 deletions
30
swh/lister/maven/tests/data/sprova4j-0.1.0.invalidurl.pom
Normal file
30
swh/lister/maven/tests/data/sprova4j-0.1.0.invalidurl.pom
Normal file
|
@ -0,0 +1,30 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd" xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<groupId>al.aldi</groupId>
|
||||
<artifactId>sprova4j</artifactId>
|
||||
<version>0.1.0</version>
|
||||
<name>sprova4j</name>
|
||||
<description>Java client for Sprova Test Management</description>
|
||||
<url>https://github.com/aldialimucaj/sprova4j</url>
|
||||
<inceptionYear>2018</inceptionYear>
|
||||
<licenses>
|
||||
<license>
|
||||
<name>The Apache Software License, Version 2.0</name>
|
||||
<url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
|
||||
<distribution>repo</distribution>
|
||||
</license>
|
||||
</licenses>
|
||||
<developers>
|
||||
<developer>
|
||||
<id>aldi</id>
|
||||
<name>Aldi Alimucaj</name>
|
||||
<email>aldi.alimucaj@gmail.com</email>
|
||||
</developer>
|
||||
</developers>
|
||||
<scm>
|
||||
<connection>scm:git@github.com/aldialimucaj/sprova4j.git</connection>
|
||||
<url>git@github.com/aldialimucaj/sprova4j</url>
|
||||
</scm>
|
||||
</project>
|
||||
|
|
@ -170,6 +170,53 @@ def test_maven_full_listing_malformed(
|
|||
assert scheduler_state.last_seen_pom == -1
|
||||
|
||||
|
||||
def test_maven_ignore_invalid_url(
    swh_scheduler,
    requests_mock,
    datadir,
):
    """Run a full listing where one pom declares an scm URL that is not a valid
    origin URL, and check that no origin gets recorded for that entry."""
    lister = MavenLister(
        scheduler=swh_scheduler,
        url=MVN_URL,
        instance="maven.org",
        index_url=INDEX_URL,
        incremental=False,
    )

    # Serve the pom fixture carrying the invalid scm URL behind URL_POM_1.
    invalid_url_pom = Path(datadir, "sprova4j-0.1.0.invalidurl.pom")
    requests_mock.get(URL_POM_1, content=invalid_url_pom.read_bytes())

    # Run the lister, then inspect what it reported and recorded.
    stats = lister.run()
    assert stats.pages == 5

    listed = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
    listed_urls = [entry.url for entry in listed]

    # A single git origin is left (the other one is ignored), next to the
    # maven origin with 2 releases (one per jar); no URL is listed twice.
    assert set(listed_urls) == {ORIGIN_GIT_INCR, ORIGIN_SRC}
    assert len(listed_urls) == len(set(listed_urls))

    expected_artifacts = list(LIST_SRC_DATA)
    for entry in listed:
        if entry.visit_type != "maven":
            continue
        # The origin's last update must not predate any of its artifacts.
        for artifact in LIST_SRC_DATA:
            assert iso8601.parse_date(artifact["time"]) <= entry.last_update
        assert entry.extra_loader_arguments["artifacts"] == expected_artifacts

    state = lister.get_state_from_scheduler()
    assert state is not None
    assert state.last_seen_doc == -1
    assert state.last_seen_pom == -1
|
||||
|
||||
|
||||
def test_maven_incremental_listing(
|
||||
swh_scheduler,
|
||||
requests_mock,
|
||||
|
|
|
@ -48,7 +48,7 @@ def test_mock_init_repository_update(mock_opam, tmp_path, datadir):
|
|||
mock_init, mock_popen = mock_opam
|
||||
|
||||
instance = "fake_opam_repo"
|
||||
instance_url = f"file://{datadir}/{instance}"
|
||||
instance_url = f"http://example.org/{instance}"
|
||||
opam_root = str(tmp_path / "test-opam")
|
||||
|
||||
os.makedirs(opam_root, exist_ok=True)
|
||||
|
@ -112,8 +112,17 @@ def test_urls(swh_scheduler, mock_opam, tmp_path):
|
|||
assert expected_urls == result_urls
|
||||
|
||||
|
||||
def test_opam_binary(datadir, swh_scheduler, tmp_path):
|
||||
instance_url = f"file://{datadir}/fake_opam_repo"
|
||||
def test_opam_binary(datadir, swh_scheduler, tmp_path, mocker):
|
||||
from swh.lister.opam.lister import opam_init
|
||||
|
||||
instance_url = "http://example.org/fake_opam_repo"
|
||||
|
||||
def mock_opam_init(opam_root, instance, url, env):
|
||||
assert url == instance_url
|
||||
return opam_init(opam_root, instance, f"{datadir}/fake_opam_repo", env)
|
||||
|
||||
# Patch opam_init to use the local directory
|
||||
mocker.patch("swh.lister.opam.lister.opam_init", side_effect=mock_opam_init)
|
||||
|
||||
lister = OpamLister(
|
||||
swh_scheduler,
|
||||
|
@ -141,8 +150,17 @@ def test_opam_binary(datadir, swh_scheduler, tmp_path):
|
|||
assert expected_urls == result_urls
|
||||
|
||||
|
||||
def test_opam_multi_instance(datadir, swh_scheduler, tmp_path):
|
||||
instance_url = f"file://{datadir}/fake_opam_repo"
|
||||
def test_opam_multi_instance(datadir, swh_scheduler, tmp_path, mocker):
|
||||
from swh.lister.opam.lister import opam_init
|
||||
|
||||
instance_url = "http://example.org/fake_opam_repo"
|
||||
|
||||
def mock_opam_init(opam_root, instance, url, env):
|
||||
assert url == instance_url
|
||||
return opam_init(opam_root, instance, f"{datadir}/fake_opam_repo", env)
|
||||
|
||||
# Patch opam_init to use the local directory
|
||||
mocker.patch("swh.lister.opam.lister.opam_init", side_effect=mock_opam_init)
|
||||
|
||||
lister = OpamLister(
|
||||
swh_scheduler,
|
||||
|
|
151
swh/lister/packagist/tests/data/payrix_payrix-php.json
Normal file
151
swh/lister/packagist/tests/data/payrix_payrix-php.json
Normal file
|
@ -0,0 +1,151 @@
|
|||
{
|
||||
"packages": {
|
||||
"payrix/payrix-php": {
|
||||
"dev-master": {
|
||||
"name": "payrix/payrix-php",
|
||||
"description": "PayrixPHP PHP SDK package",
|
||||
"keywords": [],
|
||||
"homepage": "https://portal.payrix.com",
|
||||
"version": "dev-master",
|
||||
"version_normalized": "9999999-dev",
|
||||
"license": [
|
||||
"Apache-2.0"
|
||||
],
|
||||
"authors": [],
|
||||
"source": {
|
||||
"url": "git@gitlab.com:payrix/public/payrix-php.git",
|
||||
"type": "git",
|
||||
"reference": "cf02195d3c32424396932e087824bf581966e703"
|
||||
},
|
||||
"dist": {
|
||||
"url": "https://gitlab.com/api/v4/projects/payrix%2Fpublic%2Fpayrix-php/repository/archive.zip?sha=cf02195d3c32424396932e087824bf581966e703",
|
||||
"type": "zip",
|
||||
"shasum": "",
|
||||
"reference": "cf02195d3c32424396932e087824bf581966e703"
|
||||
},
|
||||
"type": "library",
|
||||
"time": "2021-05-25T14:12:28+00:00",
|
||||
"autoload": {
|
||||
"psr-4": {
|
||||
"PayrixPHP\\": "lib/"
|
||||
}
|
||||
},
|
||||
"default-branch": true,
|
||||
"require": {
|
||||
"php": ">=5.4.0",
|
||||
"ext-curl": "*",
|
||||
"ext-openssl": "*"
|
||||
},
|
||||
"uid": 4416889
|
||||
},
|
||||
"v2.0.0": {
|
||||
"name": "payrix/payrix-php",
|
||||
"description": "PayrixPHP PHP SDK package",
|
||||
"keywords": [],
|
||||
"homepage": "https://portal.payrix.com",
|
||||
"version": "v2.0.0",
|
||||
"version_normalized": "2.0.0.0",
|
||||
"license": [
|
||||
"Apache-2.0"
|
||||
],
|
||||
"authors": [],
|
||||
"source": {
|
||||
"url": "https://gitlab.com/payrix/public/payrix-php.git",
|
||||
"type": "git",
|
||||
"reference": "4b40ad457a5cdbddb384b4d8f2c62d8d8c04ce68"
|
||||
},
|
||||
"dist": {
|
||||
"url": "https://gitlab.com/api/v4/projects/payrix%2Fpublic%2Fpayrix-php/repository/archive.zip?sha=4b40ad457a5cdbddb384b4d8f2c62d8d8c04ce68",
|
||||
"type": "zip",
|
||||
"shasum": "",
|
||||
"reference": "4b40ad457a5cdbddb384b4d8f2c62d8d8c04ce68"
|
||||
},
|
||||
"type": "library",
|
||||
"time": "2020-09-03T11:26:52+00:00",
|
||||
"autoload": {
|
||||
"psr-4": {
|
||||
"PayrixPHP\\": "lib/"
|
||||
}
|
||||
},
|
||||
"require": {
|
||||
"php": ">=5.4.0",
|
||||
"ext-curl": "*",
|
||||
"ext-openssl": "*"
|
||||
},
|
||||
"uid": 4416947
|
||||
},
|
||||
"v2.0.1": {
|
||||
"name": "payrix/payrix-php",
|
||||
"description": "PayrixPHP PHP SDK package",
|
||||
"keywords": [],
|
||||
"homepage": "https://portal.payrix.com",
|
||||
"version": "v2.0.1",
|
||||
"version_normalized": "2.0.1.0",
|
||||
"license": [
|
||||
"Apache-2.0"
|
||||
],
|
||||
"authors": [],
|
||||
"source": {
|
||||
"url": "https://gitlab.com/payrix/public/payrix-php.git",
|
||||
"type": "git",
|
||||
"reference": "9693f2dff0a589e16c88a9bf838069ab89166103"
|
||||
},
|
||||
"dist": {
|
||||
"url": "https://gitlab.com/api/v4/projects/payrix%2Fpublic%2Fpayrix-php/repository/archive.zip?sha=9693f2dff0a589e16c88a9bf838069ab89166103",
|
||||
"type": "zip",
|
||||
"shasum": "",
|
||||
"reference": "9693f2dff0a589e16c88a9bf838069ab89166103"
|
||||
},
|
||||
"type": "library",
|
||||
"time": "2021-05-10T02:32:57+00:00",
|
||||
"autoload": {
|
||||
"psr-4": {
|
||||
"PayrixPHP\\": "lib/"
|
||||
}
|
||||
},
|
||||
"require": {
|
||||
"php": ">=5.4.0",
|
||||
"ext-curl": "*",
|
||||
"ext-openssl": "*"
|
||||
},
|
||||
"uid": 5183918
|
||||
},
|
||||
"v2.0.2": {
|
||||
"name": "payrix/payrix-php",
|
||||
"description": "PayrixPHP PHP SDK package",
|
||||
"keywords": [],
|
||||
"homepage": "https://portal.payrix.com",
|
||||
"version": "v2.0.2",
|
||||
"version_normalized": "2.0.2.0",
|
||||
"license": [
|
||||
"Apache-2.0"
|
||||
],
|
||||
"authors": [],
|
||||
"source": {
|
||||
"url": "https://gitlab.com/payrix/public/payrix-php.git",
|
||||
"type": "git",
|
||||
"reference": "cf02195d3c32424396932e087824bf581966e703"
|
||||
},
|
||||
"dist": {
|
||||
"url": "https://gitlab.com/api/v4/projects/payrix%2Fpublic%2Fpayrix-php/repository/archive.zip?sha=cf02195d3c32424396932e087824bf581966e703",
|
||||
"type": "zip",
|
||||
"shasum": "",
|
||||
"reference": "cf02195d3c32424396932e087824bf581966e703"
|
||||
},
|
||||
"type": "library",
|
||||
"time": "2021-05-25T10:12:28+00:00",
|
||||
"autoload": {
|
||||
"psr-4": {
|
||||
"PayrixPHP\\": "lib/"
|
||||
}
|
||||
},
|
||||
"require": {
|
||||
"php": ">=5.4.0",
|
||||
"ext-curl": "*",
|
||||
"ext-openssl": "*"
|
||||
},
|
||||
"uid": 5232658
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
24
swh/lister/packagist/tests/data/with_invalid_url.json
Normal file
24
swh/lister/packagist/tests/data/with_invalid_url.json
Normal file
|
@ -0,0 +1,24 @@
|
|||
{
|
||||
"packages": {
|
||||
"ycms/module-main": {
|
||||
"dev-master": {
|
||||
"name": "with/invalid_url",
|
||||
"description": "",
|
||||
"keywords": [],
|
||||
"homepage": "",
|
||||
"version": "dev-master",
|
||||
"version_normalized": "9999999-dev",
|
||||
"license": [],
|
||||
"authors": [],
|
||||
"source": {
|
||||
"type": "git",
|
||||
"url": "git@example.org/invalid/url.git",
|
||||
"reference": "0000000000000000000000000000000000000000"
|
||||
},
|
||||
"time": "2015-08-23T04:42:33+00:00",
|
||||
"default-branch": true,
|
||||
"uid": 4064797
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -14,7 +14,9 @@ _packages_list = {
|
|||
"ljjackson/linnworks",
|
||||
"lky/wx_article",
|
||||
"spryker-eco/computop-api",
|
||||
"idevlab/essential",
|
||||
"idevlab/essential", # Git SSH URL
|
||||
"payrix/payrix-php",
|
||||
"with/invalid_url", # invalid URL
|
||||
]
|
||||
}
|
||||
|
||||
|
@ -49,7 +51,7 @@ def test_packagist_lister(swh_scheduler, requests_mock, datadir, requests_mock_d
|
|||
stats = lister.run()
|
||||
|
||||
assert stats.pages == 1
|
||||
assert stats.origins == len(_packages_list["packageNames"])
|
||||
assert stats.origins == len(_packages_list["packageNames"]) - 2
|
||||
assert lister.updated
|
||||
|
||||
expected_origins = {
|
||||
|
@ -69,9 +71,9 @@ def test_packagist_lister(swh_scheduler, requests_mock, datadir, requests_mock_d
|
|||
datetime.datetime.fromisoformat("2020-06-22T15:50:29+00:00"),
|
||||
),
|
||||
(
|
||||
"git@gitlab.com:idevlab/Essential.git", # not GitHub
|
||||
"https://gitlab.com/payrix/public/payrix-php.git", # not GitHub
|
||||
"git",
|
||||
datetime.datetime.fromisoformat("2022-10-12T10:34:29+00:00"),
|
||||
datetime.datetime.fromisoformat("2021-05-25T14:12:28+00:00"),
|
||||
),
|
||||
}
|
||||
|
||||
|
|
|
@ -20,7 +20,7 @@ from swh.scheduler import get_scheduler, model
|
|||
from swh.scheduler.interface import SchedulerInterface
|
||||
|
||||
from . import USER_AGENT_TEMPLATE
|
||||
from .utils import http_retry
|
||||
from .utils import http_retry, is_valid_origin_url
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
@ -277,8 +277,15 @@ class Lister(Generic[StateType, PageType]):
|
|||
Returns:
|
||||
the list of origin URLs recorded in scheduler database
|
||||
"""
|
||||
valid_origins = []
|
||||
for origin in origins:
|
||||
if is_valid_origin_url(origin.url):
|
||||
valid_origins.append(origin)
|
||||
else:
|
||||
logger.warning("Skipping invalid origin: %s", origin.url)
|
||||
|
||||
recorded_origins = []
|
||||
for batch_origins in grouper(origins, n=1000):
|
||||
for batch_origins in grouper(valid_origins, n=1000):
|
||||
ret = self.scheduler.record_listed_origins(batch_origins)
|
||||
recorded_origins += [origin.url for origin in ret]
|
||||
|
||||
|
|
|
@ -2,7 +2,8 @@
|
|||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
from typing import Callable, Iterator, Tuple
|
||||
from typing import Callable, Iterator, Optional, Tuple
|
||||
import urllib.parse
|
||||
|
||||
from requests.exceptions import ConnectionError, HTTPError
|
||||
from requests.status_codes import codes
|
||||
|
@ -111,3 +112,50 @@ def http_retry(
|
|||
|
||||
"""
|
||||
return tenacity_retry(retry=retry, wait=wait, stop=stop, reraise=True, **retry_args)
|
||||
|
||||
|
||||
def is_valid_origin_url(url: Optional[str]) -> bool:
    """Returns whether the given string is a valid origin URL.

    This excludes Git SSH URLs and pseudo-URLs (eg. ``ssh://git@example.org:foo``
    and ``git@example.org:foo``), as they are not supported by the Git loader
    and usually require authentication.

    Args:
        url: the candidate origin URL; may be ``None`` or empty

    Returns:
        ``True`` if the URL has a network location and does not use the
        ``ssh`` scheme, ``False`` otherwise. This function never raises on
        string input: URLs that cannot be parsed are reported as invalid.

    All HTTP URLs are allowed:

    >>> is_valid_origin_url("http://example.org/repo.git")
    True
    >>> is_valid_origin_url("http://example.org/repo")
    True
    >>> is_valid_origin_url("https://example.org/repo")
    True
    >>> is_valid_origin_url("https://foo:bar@example.org/repo")
    True

    Scheme-less URLs are rejected:

    >>> is_valid_origin_url("example.org/repo")
    False
    >>> is_valid_origin_url("example.org:repo")
    False

    Git SSH URLs and pseudo-URLs are rejected:

    >>> is_valid_origin_url("git@example.org:repo")
    False
    >>> is_valid_origin_url("ssh://git@example.org:repo")
    False

    Empty values and strings that cannot be parsed as a URL are rejected too:

    >>> is_valid_origin_url(None)
    False
    >>> is_valid_origin_url("")
    False
    >>> is_valid_origin_url("http://[example.org/repo")
    False
    """
    if not url:
        # Empty or None
        return False

    try:
        parsed = urllib.parse.urlparse(url)
    except ValueError:
        # urlparse raises on malformed network locations (eg. unbalanced
        # IPv6 brackets); such a string is an invalid origin, not an error
        return False

    if not parsed.netloc:
        # Is parsed as a relative URL
        return False

    if parsed.scheme == "ssh":
        # Git SSH URL
        return False

    return True
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue