cgit: Add support for last_update information during listing
Related to T2988
This commit is contained in:
parent
bb0184c004
commit
91fcde8341
2 changed files with 125 additions and 12 deletions
|
@ -2,8 +2,10 @@
|
|||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
from datetime import datetime, timezone
|
||||
import logging
|
||||
from typing import Iterator, List, Optional
|
||||
import re
|
||||
from typing import Any, Dict, Iterator, List, Optional
|
||||
from urllib.parse import urljoin, urlparse
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
@ -16,7 +18,7 @@ from swh.scheduler.model import ListedOrigin
|
|||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
Repositories = List[str]
|
||||
Repositories = List[Dict[str, Any]]
|
||||
|
||||
|
||||
class CGitLister(StatelessLister[Repositories]):
|
||||
|
@ -68,7 +70,8 @@ class CGitLister(StatelessLister[Repositories]):
|
|||
|
||||
def get_pages(self) -> Iterator[Repositories]:
|
||||
"""Generate git 'project' URLs found on the current CGit server
|
||||
|
||||
The last_update date is retrieved on the list of repo page to avoid
|
||||
to compute it on the repository details which only give a date per branch
|
||||
"""
|
||||
next_page: Optional[str] = self.url
|
||||
while next_page:
|
||||
|
@ -78,7 +81,16 @@ class CGitLister(StatelessLister[Repositories]):
|
|||
for tr in bs_idx.find("div", {"class": "content"}).find_all(
|
||||
"tr", {"class": ""}
|
||||
):
|
||||
page_results.append(urljoin(self.url, tr.find("a")["href"]))
|
||||
url = urljoin(self.url, tr.find("a")["href"])
|
||||
span = tr.find("span", {"class": re.compile("age-")})
|
||||
if span:
|
||||
last_updated_date = span["title"]
|
||||
else:
|
||||
last_updated_date = None
|
||||
|
||||
page_results.append(
|
||||
{"url": url, "last_updated_date": last_updated_date}
|
||||
)
|
||||
|
||||
yield page_results
|
||||
|
||||
|
@ -99,8 +111,8 @@ class CGitLister(StatelessLister[Repositories]):
|
|||
"""Convert a page of cgit repositories into a list of ListedOrigins."""
|
||||
assert self.lister_obj.id is not None
|
||||
|
||||
for repository_url in repositories:
|
||||
origin_url = self._get_origin_from_repository_url(repository_url)
|
||||
for repository in repositories:
|
||||
origin_url = self._get_origin_from_repository_url(repository["url"])
|
||||
if not origin_url:
|
||||
continue
|
||||
|
||||
|
@ -108,7 +120,7 @@ class CGitLister(StatelessLister[Repositories]):
|
|||
lister_id=self.lister_obj.id,
|
||||
url=origin_url,
|
||||
visit_type="git",
|
||||
last_update=None,
|
||||
last_update=_parse_last_updated_date(repository),
|
||||
)
|
||||
|
||||
def _get_origin_from_repository_url(self, repository_url: str) -> Optional[str]:
|
||||
|
@ -134,3 +146,28 @@ class CGitLister(StatelessLister[Repositories]):
|
|||
# otherwise, choose the first one
|
||||
origin_url = urls[0]
|
||||
return origin_url
|
||||
|
||||
|
||||
def _parse_last_updated_date(repository: Dict[str, Any]) -> Optional[datetime]:
|
||||
"""Parse the last updated date"""
|
||||
date = repository.get("last_updated_date")
|
||||
if not date:
|
||||
return None
|
||||
|
||||
parsed_date = None
|
||||
for date_format in ("%Y-%m-%d %H:%M:%S %z", "%Y-%m-%d %H:%M:%S (%Z)"):
|
||||
try:
|
||||
parsed_date = datetime.strptime(date, date_format)
|
||||
# force UTC to avoid naive datetime
|
||||
if not parsed_date.tzinfo:
|
||||
parsed_date = parsed_date.replace(tzinfo=timezone.utc)
|
||||
break
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if not parsed_date:
|
||||
logger.warning(
|
||||
"Could not parse %s last_updated date: %s", repository["url"], date,
|
||||
)
|
||||
|
||||
return parsed_date
|
||||
|
|
|
@ -2,10 +2,13 @@
|
|||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from typing import List
|
||||
|
||||
import pytest
|
||||
|
||||
from swh.lister import __version__
|
||||
from swh.lister.cgit.lister import CGitLister
|
||||
from swh.lister.cgit.lister import CGitLister, _parse_last_updated_date
|
||||
from swh.lister.pattern import ListerStats
|
||||
|
||||
|
||||
|
@ -17,13 +20,15 @@ def test_lister_cgit_get_pages_one_page(requests_mock_datadir, swh_scheduler):
|
|||
flattened_repos = sum(repos, [])
|
||||
assert len(flattened_repos) == 977
|
||||
|
||||
assert flattened_repos[0] == "https://git.savannah.gnu.org/cgit/elisp-es.git/"
|
||||
assert (
|
||||
flattened_repos[0]["url"] == "https://git.savannah.gnu.org/cgit/elisp-es.git/"
|
||||
)
|
||||
# note the url below is NOT a subpath of /cgit/
|
||||
assert (
|
||||
flattened_repos[-1] == "https://git.savannah.gnu.org/path/to/yetris.git/"
|
||||
flattened_repos[-1]["url"] == "https://git.savannah.gnu.org/path/to/yetris.git/"
|
||||
) # noqa
|
||||
# note the url below is NOT on the same server
|
||||
assert flattened_repos[-2] == "http://example.org/cgit/xstarcastle.git/"
|
||||
assert flattened_repos[-2]["url"] == "http://example.org/cgit/xstarcastle.git/"
|
||||
|
||||
|
||||
def test_lister_cgit_get_pages_with_pages(requests_mock_datadir, swh_scheduler):
|
||||
|
@ -37,7 +42,7 @@ def test_lister_cgit_get_pages_with_pages(requests_mock_datadir, swh_scheduler):
|
|||
assert len(flattened_repos) == 16
|
||||
|
||||
|
||||
def test_lister_cgit_run(requests_mock_datadir, swh_scheduler):
|
||||
def test_lister_cgit_run_with_page(requests_mock_datadir, swh_scheduler):
|
||||
"""cgit lister supports pagination"""
|
||||
|
||||
url = "https://git.tizen/cgit/"
|
||||
|
@ -66,3 +71,74 @@ def test_lister_cgit_run(requests_mock_datadir, swh_scheduler):
|
|||
user_agent = request.headers["User-Agent"]
|
||||
assert "Software Heritage Lister" in user_agent
|
||||
assert __version__ in user_agent
|
||||
|
||||
|
||||
def test_lister_cgit_run_populates_last_update(requests_mock_datadir, swh_scheduler):
    """A full cgit listing run records a last update date where one is expected.

    Every listed origin must carry a last_update value, except the few origins
    enumerated below, which are expected to have none.

    """
    url = "https://git.tizen/cgit"

    # origins expected to be listed without any last update date
    urls_without_date = [
        f"https://git.tizen.org/cgit/{suffix_url}"
        for suffix_url in ("All-Projects", "All-Users", "Lock-Projects")
    ]

    lister_cgit = CGitLister(swh_scheduler, url=url)
    stats = lister_cgit.run()

    expected_nb_origins = 16
    assert stats == ListerStats(pages=3, origins=expected_nb_origins)

    # what the run recorded in the scheduler
    listed = swh_scheduler.get_listed_origins(lister_cgit.lister_obj.id)
    scheduler_origins = listed.results
    assert len(scheduler_origins) == expected_nb_origins

    # check the date of every listed repository
    for listed_origin in scheduler_origins:
        if listed_origin.url not in urls_without_date:
            assert listed_origin.last_update is not None
            continue
        assert listed_origin.last_update is None
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "date_str,expected_date",
    [
        # falsy value: nothing to parse
        ({}, None),
        # strings matching none of the supported formats
        ("unexpected date", None),
        ("2020-0140-10 10:10:10 (GMT)", None),
        # timezone given as a name between parentheses
        (
            "2020-01-10 10:10:10 (GMT)",
            datetime(2020, 1, 10, 10, 10, 10, tzinfo=timezone.utc),
        ),
        # timezone given as a numeric offset
        (
            "2019-08-04 05:10:41 +0100",
            datetime(2019, 8, 4, 5, 10, 41, tzinfo=timezone(timedelta(hours=1))),
        ),
    ],
)
def test_lister_cgit_date_parsing(date_str, expected_date):
    """Each raw date value is parsed into the expected datetime, or None."""
    parsed = _parse_last_updated_date({"url": "url", "last_updated_date": date_str})

    assert parsed == expected_date
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue