Adapt maven lister to list canonical GitHub URLs, if any
Detected GitHub URLs of the form {https,git,http}://github.com/${user_repo}(.git) are canonicalized to the https://github.com/${user_repo} format, which avoids listing the same origin multiple times. Related to T4232
This commit is contained in:
parent
2ffe9c2aea
commit
263db667d0
3 changed files with 92 additions and 38 deletions
|
@ -1,2 +1,2 @@
|
|||
swh.core[db,github] >= 2.6
|
||||
swh.core[db,github] >= 2.8
|
||||
swh.scheduler >= 0.8
|
||||
|
|
|
@ -14,6 +14,7 @@ import requests
|
|||
from tenacity.before_sleep import before_sleep_log
|
||||
import xmltodict
|
||||
|
||||
from swh.core.github.utils import GitHubSession
|
||||
from swh.lister.utils import throttling_retry
|
||||
from swh.scheduler.interface import SchedulerInterface
|
||||
from swh.scheduler.model import ListedOrigin
|
||||
|
@ -25,6 +26,8 @@ logger = logging.getLogger(__name__)
|
|||
|
||||
RepoPage = Dict[str, Any]
|
||||
|
||||
SUPPORTED_SCM_TYPES = ("git", "svn", "hg", "cvs", "bzr")
|
||||
|
||||
|
||||
@dataclass
|
||||
class MavenListerState:
|
||||
|
@ -98,6 +101,9 @@ class MavenLister(Lister[MavenListerState, RepoPage]):
|
|||
)
|
||||
|
||||
self.jar_origins: Dict[str, ListedOrigin] = {}
|
||||
self.github_session = GitHubSession(
|
||||
credentials=self.credentials, user_agent=USER_AGENT
|
||||
)
|
||||
|
||||
def state_from_dict(self, d: Dict[str, Any]) -> MavenListerState:
    """Rebuild the lister state from its dict form.

    Every key of ``d`` is passed to ``MavenListerState`` as a keyword argument.
    """
    state = MavenListerState(**d)
    return state
|
||||
|
@ -271,35 +277,63 @@ class MavenLister(Lister[MavenListerState, RepoPage]):
|
|||
except xmltodict.expat.ExpatError as error:
|
||||
logger.info("Could not parse POM %s XML: %s. Next.", pom, error)
|
||||
|
||||
def get_origins_from_page(self, page: RepoPage) -> Iterator[ListedOrigin]:
|
||||
"""Convert a page of Maven repositories into a list of ListedOrigins."""
|
||||
def get_scm(self, page: RepoPage) -> Optional[ListedOrigin]:
    """Retrieve the scm origin out of the page information. Only called when the
    type of the page is scm.

    Try and detect an scm/vcs repository. Note that the official format is in the
    form scm:{type}:git://example.org/{user}/{repo}.git but some projects put the
    repo url directly after the "scm:" prefix (without the type), so we have to
    check against the content to extract the type and url properly.

    Args
        page: dict with at least the "type" and "url" keys; "url" holds the raw
            scm connection string.

    Raises
        AssertionError when the type of the page is not 'scm'

    Returns
        ListedOrigin with proper canonical scm url (for github) if any is found,
        None otherwise.

    """
    assert page["type"] == "scm"

    scm_prefix = "scm:"
    raw_url = page["url"]

    m_scm = re.match(r"^scm:(?P<type>[^:]+):(?P<url>.*)$", raw_url)
    if m_scm is None:
        return None

    visit_type: Optional[str] = None
    url: Optional[str] = None

    scm_type = m_scm.group("type")
    if scm_type and scm_type in SUPPORTED_SCM_TYPES:
        url = m_scm.group("url")
        visit_type = scm_type
    elif raw_url.endswith(".git"):
        # The regex matched, so raw_url is known to start with "scm:". Slice that
        # exact prefix off: str.lstrip("scm:") would strip any leading run of the
        # characters 's', 'c', 'm' and ':' and mangle urls such as
        # "scm:ssh://host/repo.git" into "h://host/repo.git".
        url = raw_url[len(scm_prefix):]
        visit_type = "git"
    else:
        return None

    if url and visit_type == "git":
        # Non-github urls will be returned as is, github ones will be canonical ones
        url = self.github_session.get_canonical_url(url)

    if not url:
        return None

    assert visit_type is not None
    assert self.lister_obj.id is not None
    return ListedOrigin(
        lister_id=self.lister_obj.id,
        url=url,
        visit_type=visit_type,
    )
|
||||
|
||||
def get_origins_from_page(self, page: RepoPage) -> Iterator[ListedOrigin]:
|
||||
|
||||
"""Convert a page of Maven repositories into a list of ListedOrigins."""
|
||||
if page["type"] == "scm":
|
||||
# If origin is a scm url: detect scm type and yield.
|
||||
# Note that the official format is:
|
||||
# scm:git:git://github.com/openengsb/openengsb-framework.git
|
||||
# but many, many projects directly put the repo url, so we have to
|
||||
# detect the content to match it properly.
|
||||
m_scm = re.match(r"^scm:(?P<type>[^:]+):(?P<url>.*)$", page["url"])
|
||||
if m_scm is not None:
|
||||
scm_type = m_scm.group("type")
|
||||
if scm_type in scm_types_ok:
|
||||
scm_url = m_scm.group("url")
|
||||
origin = ListedOrigin(
|
||||
lister_id=self.lister_obj.id,
|
||||
url=scm_url,
|
||||
visit_type=scm_type,
|
||||
)
|
||||
yield origin
|
||||
else:
|
||||
if page["url"].endswith(".git"):
|
||||
origin = ListedOrigin(
|
||||
lister_id=self.lister_obj.id,
|
||||
url=page["url"],
|
||||
visit_type="git",
|
||||
)
|
||||
yield origin
|
||||
listed_origin = self.get_scm(page)
|
||||
if listed_origin:
|
||||
yield listed_origin
|
||||
else:
|
||||
# Origin is gathering source archives:
|
||||
last_update_dt = None
|
||||
|
@ -326,6 +360,7 @@ class MavenLister(Lister[MavenListerState, RepoPage]):
|
|||
|
||||
if origin_url not in self.jar_origins:
|
||||
# Create ListedOrigin instance if we did not see that origin yet
|
||||
assert self.lister_obj.id is not None
|
||||
jar_origin = ListedOrigin(
|
||||
lister_id=self.lister_obj.id,
|
||||
url=origin_url,
|
||||
|
|
|
@ -18,12 +18,17 @@ URL_POM_1 = MVN_URL + "al/aldi/sprova4j/0.1.0/sprova4j-0.1.0.pom"
|
|||
URL_POM_2 = MVN_URL + "al/aldi/sprova4j/0.1.1/sprova4j-0.1.1.pom"
|
||||
URL_POM_3 = MVN_URL + "com/arangodb/arangodb-graphql/1.2/arangodb-graphql-1.2.pom"
|
||||
|
||||
LIST_GIT = (
|
||||
"git://github.com/aldialimucaj/sprova4j.git",
|
||||
"https://github.com/aldialimucaj/sprova4j.git",
|
||||
)
|
||||
|
||||
LIST_GIT_INCR = ("git://github.com/ArangoDB-Community/arangodb-graphql-java.git",)
|
||||
USER_REPO0 = "aldialimucaj/sprova4j"
|
||||
GIT_REPO_URL0_HTTPS = f"https://github.com/{USER_REPO0}"
|
||||
GIT_REPO_URL0_API = f"https://api.github.com/repos/{USER_REPO0}"
|
||||
LIST_GIT = (GIT_REPO_URL0_HTTPS,)
|
||||
|
||||
USER_REPO1 = "ArangoDB-Community/arangodb-graphql-java"
|
||||
GIT_REPO_URL1_HTTPS = f"https://github.com/{USER_REPO1}"
|
||||
GIT_REPO_URL1_GIT = f"git://github.com/{USER_REPO1}.git"
|
||||
GIT_REPO_URL1_API = f"https://api.github.com/repos/{USER_REPO1}"
|
||||
LIST_GIT_INCR = (GIT_REPO_URL1_HTTPS,)
|
||||
|
||||
LIST_SRC = (MVN_URL + "al/aldi/sprova4j",)
|
||||
|
||||
|
@ -86,6 +91,20 @@ def maven_pom_3(datadir) -> bytes:
|
|||
return Path(datadir, "https_maven.org", "arangodb-graphql-1.2.pom").read_bytes()
|
||||
|
||||
|
||||
@pytest.fixture
def requests_mock(requests_mock):
    """Register github api answers for the configured scm repositories.

    Each mocked api url replies with a json document whose "html_url" entry is the
    canonical url of the matching repository.
    """
    canonical_url_by_api_url = {
        GIT_REPO_URL0_API: GIT_REPO_URL0_HTTPS,
        GIT_REPO_URL1_API: GIT_REPO_URL1_HTTPS,
    }
    for api_url, canonical_url in canonical_url_by_api_url.items():
        requests_mock.get(api_url, json={"html_url": canonical_url})
    yield requests_mock
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def network_requests_mock(
|
||||
requests_mock, maven_index_full, maven_pom_1, maven_pom_2, maven_pom_3
|
||||
|
@ -118,7 +137,7 @@ def test_maven_full_listing(swh_scheduler):
|
|||
origin_urls = [origin.url for origin in scheduler_origins]
|
||||
|
||||
# 2 git origins + 1 maven origin with 2 releases (one per jar)
|
||||
assert len(origin_urls) == 4
|
||||
assert len(origin_urls) == 3
|
||||
assert sorted(origin_urls) == sorted(LIST_GIT + LIST_GIT_INCR + LIST_SRC)
|
||||
|
||||
for origin in scheduler_origins:
|
||||
|
@ -164,7 +183,7 @@ def test_maven_full_listing_malformed(
|
|||
|
||||
# 2 git origins + 1 maven origin with 2 releases (one per jar)
|
||||
assert len(origin_urls) == 3
|
||||
assert sorted(origin_urls) == sorted((LIST_GIT[1],) + LIST_GIT_INCR + LIST_SRC)
|
||||
assert sorted(origin_urls) == sorted(LIST_GIT + LIST_GIT_INCR + LIST_SRC)
|
||||
|
||||
for origin in scheduler_origins:
|
||||
if origin.visit_type == "maven":
|
||||
|
@ -212,7 +231,7 @@ def test_maven_incremental_listing(
|
|||
|
||||
# 1 git origin + 1 maven origin with 1 release (one per jar)
|
||||
assert len(origin_urls) == 2
|
||||
assert sorted(origin_urls) == sorted((LIST_GIT[0],) + LIST_SRC)
|
||||
assert sorted(origin_urls) == sorted(LIST_GIT + LIST_SRC)
|
||||
|
||||
for origin in scheduler_origins:
|
||||
if origin.visit_type == "maven":
|
||||
|
@ -294,7 +313,7 @@ def test_maven_list_http_error_artifacts(
|
|||
# If the maven_index_full step succeeded but not the get_pom step,
|
||||
# then we get only one maven-jar origin and one git origin.
|
||||
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
|
||||
assert len(scheduler_origins) == 3
|
||||
assert len(scheduler_origins) == 2
|
||||
|
||||
|
||||
def test_maven_lister_null_mtime(swh_scheduler, requests_mock, maven_index_null_mtime):
|
||||
|
@ -331,4 +350,4 @@ def test_maven_list_pom_bad_encoding(swh_scheduler, requests_mock, maven_pom_1):
|
|||
# If the maven_index_full step succeeded but not the pom parsing step,
|
||||
# then we get only one maven-jar origin and one git origin.
|
||||
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
|
||||
assert len(scheduler_origins) == 3
|
||||
assert len(scheduler_origins) == 2
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue