Adapt maven lister to list canonical GitHub URLs, if any

That means detected GitHub URLs of the form {https,git,http}://github.com/${user_repo}(.git) are
canonicalized to the https://github.com/${user_repo} format.

This avoids duplication of origins.

Related to T4232
This commit is contained in:
Antoine R. Dumont (@ardumont) 2022-05-20 16:37:15 +02:00
parent 2ffe9c2aea
commit 263db667d0
No known key found for this signature in database
GPG key ID: 52E2E9840D10C3B8
3 changed files with 92 additions and 38 deletions

View file

@ -1,2 +1,2 @@
swh.core[db,github] >= 2.6
swh.core[db,github] >= 2.8
swh.scheduler >= 0.8

View file

@ -14,6 +14,7 @@ import requests
from tenacity.before_sleep import before_sleep_log
import xmltodict
from swh.core.github.utils import GitHubSession
from swh.lister.utils import throttling_retry
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
@ -25,6 +26,8 @@ logger = logging.getLogger(__name__)
RepoPage = Dict[str, Any]
SUPPORTED_SCM_TYPES = ("git", "svn", "hg", "cvs", "bzr")
@dataclass
class MavenListerState:
@ -98,6 +101,9 @@ class MavenLister(Lister[MavenListerState, RepoPage]):
)
self.jar_origins: Dict[str, ListedOrigin] = {}
self.github_session = GitHubSession(
credentials=self.credentials, user_agent=USER_AGENT
)
def state_from_dict(self, d: Dict[str, Any]) -> MavenListerState:
    """Build a MavenListerState, passing the entries of ``d`` as keyword arguments."""
    state = MavenListerState(**d)
    return state
@ -271,35 +277,63 @@ class MavenLister(Lister[MavenListerState, RepoPage]):
except xmltodict.expat.ExpatError as error:
logger.info("Could not parse POM %s XML: %s. Next.", pom, error)
def get_origins_from_page(self, page: RepoPage) -> Iterator[ListedOrigin]:
"""Convert a page of Maven repositories into a list of ListedOrigins."""
def get_scm(self, page: RepoPage) -> Optional[ListedOrigin]:
    """Retrieve the scm origin out of the page information.

    Only called when the type of the page is scm.

    Try and detect an scm/vcs repository. Note that the official format is in the
    form ``scm:{type}:git://example.org/{user}/{repo}.git`` but some projects put
    the repo url directly after the ``scm:`` prefix (without the type), so we have
    to check against the content to extract the type and url properly.

    Args:
        page: page information; its "type" and "url" entries are read.

    Raises:
        AssertionError when the type of the page is not 'scm'

    Returns:
        ListedOrigin with proper canonical scm url (for github) if any is found,
        None otherwise.

    """
    assert page["type"] == "scm"

    page_url = page["url"]
    m_scm = re.match(r"^scm:(?P<type>[^:]+):(?P<url>.*)$", page_url)
    if m_scm is None:
        return None

    visit_type: Optional[str] = None
    url: Optional[str] = None

    scm_type = m_scm.group("type")
    if scm_type and scm_type in SUPPORTED_SCM_TYPES:
        # Official format: the type and the url are both explicit.
        url = m_scm.group("url")
        visit_type = scm_type
    elif page_url.endswith(".git"):
        # Drop exactly the "scm:" prefix (the regexp above guarantees it is
        # present). Do not use str.lstrip("scm:") here: it strips any leading
        # run of the characters 's', 'c', 'm' and ':', which mangles urls such
        # as "scm:ssh://host/repo.git" into "h://host/repo.git".
        url = page_url[len("scm:"):]
        visit_type = "git"
    else:
        return None

    if url and visit_type == "git":
        # Non-github urls will be returned as is, github ones will be canonical ones
        url = self.github_session.get_canonical_url(url)

    if not url:
        return None

    assert visit_type is not None
    assert self.lister_obj.id is not None
    return ListedOrigin(
        lister_id=self.lister_obj.id,
        url=url,
        visit_type=visit_type,
    )
def get_origins_from_page(self, page: RepoPage) -> Iterator[ListedOrigin]:
"""Convert a page of Maven repositories into a list of ListedOrigins."""
if page["type"] == "scm":
# If origin is a scm url: detect scm type and yield.
# Note that the official format is:
# scm:git:git://github.com/openengsb/openengsb-framework.git
# but many, many projects directly put the repo url, so we have to
# detect the content to match it properly.
m_scm = re.match(r"^scm:(?P<type>[^:]+):(?P<url>.*)$", page["url"])
if m_scm is not None:
scm_type = m_scm.group("type")
if scm_type in scm_types_ok:
scm_url = m_scm.group("url")
origin = ListedOrigin(
lister_id=self.lister_obj.id,
url=scm_url,
visit_type=scm_type,
)
yield origin
else:
if page["url"].endswith(".git"):
origin = ListedOrigin(
lister_id=self.lister_obj.id,
url=page["url"],
visit_type="git",
)
yield origin
listed_origin = self.get_scm(page)
if listed_origin:
yield listed_origin
else:
# Origin is gathering source archives:
last_update_dt = None
@ -326,6 +360,7 @@ class MavenLister(Lister[MavenListerState, RepoPage]):
if origin_url not in self.jar_origins:
# Create ListedOrigin instance if we did not see that origin yet
assert self.lister_obj.id is not None
jar_origin = ListedOrigin(
lister_id=self.lister_obj.id,
url=origin_url,

View file

@ -18,12 +18,17 @@ URL_POM_1 = MVN_URL + "al/aldi/sprova4j/0.1.0/sprova4j-0.1.0.pom"
URL_POM_2 = MVN_URL + "al/aldi/sprova4j/0.1.1/sprova4j-0.1.1.pom"
URL_POM_3 = MVN_URL + "com/arangodb/arangodb-graphql/1.2/arangodb-graphql-1.2.pom"
LIST_GIT = (
"git://github.com/aldialimucaj/sprova4j.git",
"https://github.com/aldialimucaj/sprova4j.git",
)
LIST_GIT_INCR = ("git://github.com/ArangoDB-Community/arangodb-graphql-java.git",)
USER_REPO0 = "aldialimucaj/sprova4j"
GIT_REPO_URL0_HTTPS = f"https://github.com/{USER_REPO0}"
GIT_REPO_URL0_API = f"https://api.github.com/repos/{USER_REPO0}"
LIST_GIT = (GIT_REPO_URL0_HTTPS,)
USER_REPO1 = "ArangoDB-Community/arangodb-graphql-java"
GIT_REPO_URL1_HTTPS = f"https://github.com/{USER_REPO1}"
GIT_REPO_URL1_GIT = f"git://github.com/{USER_REPO1}.git"
GIT_REPO_URL1_API = f"https://api.github.com/repos/{USER_REPO1}"
LIST_GIT_INCR = (GIT_REPO_URL1_HTTPS,)
LIST_SRC = (MVN_URL + "al/aldi/sprova4j",)
@ -86,6 +91,20 @@ def maven_pom_3(datadir) -> bytes:
return Path(datadir, "https_maven.org", "arangodb-graphql-1.2.pom").read_bytes()
@pytest.fixture
def requests_mock(requests_mock):
    """Register, for each configured scm repository, a github api response whose
    "html_url" entry is the canonical url of that repository."""
    canonical_url_by_api_url = {
        GIT_REPO_URL0_API: GIT_REPO_URL0_HTTPS,
        GIT_REPO_URL1_API: GIT_REPO_URL1_HTTPS,
    }
    for api_url, canonical_url in canonical_url_by_api_url.items():
        requests_mock.get(api_url, json={"html_url": canonical_url})

    yield requests_mock
@pytest.fixture(autouse=True)
def network_requests_mock(
requests_mock, maven_index_full, maven_pom_1, maven_pom_2, maven_pom_3
@ -118,7 +137,7 @@ def test_maven_full_listing(swh_scheduler):
origin_urls = [origin.url for origin in scheduler_origins]
# 2 git origins + 1 maven origin with 2 releases (one per jar)
assert len(origin_urls) == 4
assert len(origin_urls) == 3
assert sorted(origin_urls) == sorted(LIST_GIT + LIST_GIT_INCR + LIST_SRC)
for origin in scheduler_origins:
@ -164,7 +183,7 @@ def test_maven_full_listing_malformed(
# 2 git origins + 1 maven origin with 2 releases (one per jar)
assert len(origin_urls) == 3
assert sorted(origin_urls) == sorted((LIST_GIT[1],) + LIST_GIT_INCR + LIST_SRC)
assert sorted(origin_urls) == sorted(LIST_GIT + LIST_GIT_INCR + LIST_SRC)
for origin in scheduler_origins:
if origin.visit_type == "maven":
@ -212,7 +231,7 @@ def test_maven_incremental_listing(
# 1 git origins + 1 maven origin with 1 release (one per jar)
assert len(origin_urls) == 2
assert sorted(origin_urls) == sorted((LIST_GIT[0],) + LIST_SRC)
assert sorted(origin_urls) == sorted(LIST_GIT + LIST_SRC)
for origin in scheduler_origins:
if origin.visit_type == "maven":
@ -294,7 +313,7 @@ def test_maven_list_http_error_artifacts(
# If the maven_index_full step succeeded but not the get_pom step,
# then we get only one maven-jar origin and one git origin.
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
assert len(scheduler_origins) == 3
assert len(scheduler_origins) == 2
def test_maven_lister_null_mtime(swh_scheduler, requests_mock, maven_index_null_mtime):
@ -331,4 +350,4 @@ def test_maven_list_pom_bad_encoding(swh_scheduler, requests_mock, maven_pom_1):
# If the maven_index_full step succeeded but not the pom parsing step,
# then we get only one maven-jar origin and one git origin.
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
assert len(scheduler_origins) == 3
assert len(scheduler_origins) == 2