cgit: Add support for last_update information during listing

Related to T2988
This commit is contained in:
Vincent SELLIER 2021-01-26 18:03:27 +01:00
parent bb0184c004
commit 91fcde8341
No known key found for this signature in database
GPG key ID: 3F13C434EADAD17D
2 changed files with 125 additions and 12 deletions

View file

@ -2,8 +2,10 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from datetime import datetime, timezone
import logging
from typing import Iterator, List, Optional
import re
from typing import Any, Dict, Iterator, List, Optional
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
@ -16,7 +18,7 @@ from swh.scheduler.model import ListedOrigin
logger = logging.getLogger(__name__)
Repositories = List[str]
Repositories = List[Dict[str, Any]]
class CGitLister(StatelessLister[Repositories]):
@ -68,7 +70,8 @@ class CGitLister(StatelessLister[Repositories]):
def get_pages(self) -> Iterator[Repositories]:
"""Generate git 'project' URLs found on the current CGit server
The last_update date is read from the repository listing page, which avoids
computing it from each repository's details page, where only a date per branch is given
"""
next_page: Optional[str] = self.url
while next_page:
@ -78,7 +81,16 @@ class CGitLister(StatelessLister[Repositories]):
for tr in bs_idx.find("div", {"class": "content"}).find_all(
"tr", {"class": ""}
):
page_results.append(urljoin(self.url, tr.find("a")["href"]))
url = urljoin(self.url, tr.find("a")["href"])
span = tr.find("span", {"class": re.compile("age-")})
if span:
last_updated_date = span["title"]
else:
last_updated_date = None
page_results.append(
{"url": url, "last_updated_date": last_updated_date}
)
yield page_results
@ -99,8 +111,8 @@ class CGitLister(StatelessLister[Repositories]):
"""Convert a page of cgit repositories into a list of ListedOrigins."""
assert self.lister_obj.id is not None
for repository_url in repositories:
origin_url = self._get_origin_from_repository_url(repository_url)
for repository in repositories:
origin_url = self._get_origin_from_repository_url(repository["url"])
if not origin_url:
continue
@ -108,7 +120,7 @@ class CGitLister(StatelessLister[Repositories]):
lister_id=self.lister_obj.id,
url=origin_url,
visit_type="git",
last_update=None,
last_update=_parse_last_updated_date(repository),
)
def _get_origin_from_repository_url(self, repository_url: str) -> Optional[str]:
@ -134,3 +146,28 @@ class CGitLister(StatelessLister[Repositories]):
# otherwise, choose the first one
origin_url = urls[0]
return origin_url
def _parse_last_updated_date(repository: Dict[str, Any]) -> Optional[datetime]:
    """Parse the ``last_updated_date`` entry of a listed repository.

    Args:
        repository: mapping describing a repository. The ``last_updated_date``
            entry is parsed; the ``url`` entry is only used in the warning
            message and may be absent.

    Returns:
        A timezone-aware datetime, or None when the entry is missing, empty,
        or matches none of the supported formats (a warning is logged in
        that last case).
    """
    date = repository.get("last_updated_date")
    if not date:
        return None

    for date_format in ("%Y-%m-%d %H:%M:%S %z", "%Y-%m-%d %H:%M:%S (%Z)"):
        try:
            parsed_date = datetime.strptime(date, date_format)
        except (ValueError, TypeError):
            # ValueError: the string does not match this format.
            # TypeError: the value is not a string at all.
            # Either way, fall through to the next candidate format.
            continue
        # force UTC to avoid naive datetime (the "(%Z)" format leaves
        # tzinfo unset)
        if parsed_date.tzinfo is None:
            parsed_date = parsed_date.replace(tzinfo=timezone.utc)
        return parsed_date

    # .get() so a repository without a "url" entry cannot turn a parsing
    # warning into a KeyError.
    logger.warning(
        "Could not parse %s last_updated date: %s", repository.get("url"), date,
    )
    return None

View file

@ -2,10 +2,13 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from datetime import datetime, timedelta, timezone
from typing import List
import pytest
from swh.lister import __version__
from swh.lister.cgit.lister import CGitLister
from swh.lister.cgit.lister import CGitLister, _parse_last_updated_date
from swh.lister.pattern import ListerStats
@ -17,13 +20,15 @@ def test_lister_cgit_get_pages_one_page(requests_mock_datadir, swh_scheduler):
flattened_repos = sum(repos, [])
assert len(flattened_repos) == 977
assert flattened_repos[0] == "https://git.savannah.gnu.org/cgit/elisp-es.git/"
assert (
flattened_repos[0]["url"] == "https://git.savannah.gnu.org/cgit/elisp-es.git/"
)
# note the url below is NOT a subpath of /cgit/
assert (
flattened_repos[-1] == "https://git.savannah.gnu.org/path/to/yetris.git/"
flattened_repos[-1]["url"] == "https://git.savannah.gnu.org/path/to/yetris.git/"
) # noqa
# note the url below is NOT on the same server
assert flattened_repos[-2] == "http://example.org/cgit/xstarcastle.git/"
assert flattened_repos[-2]["url"] == "http://example.org/cgit/xstarcastle.git/"
def test_lister_cgit_get_pages_with_pages(requests_mock_datadir, swh_scheduler):
@ -37,7 +42,7 @@ def test_lister_cgit_get_pages_with_pages(requests_mock_datadir, swh_scheduler):
assert len(flattened_repos) == 16
def test_lister_cgit_run(requests_mock_datadir, swh_scheduler):
def test_lister_cgit_run_with_page(requests_mock_datadir, swh_scheduler):
"""cgit lister supports pagination"""
url = "https://git.tizen/cgit/"
@ -66,3 +71,74 @@ def test_lister_cgit_run(requests_mock_datadir, swh_scheduler):
user_agent = request.headers["User-Agent"]
assert "Software Heritage Lister" in user_agent
assert __version__ in user_agent
def test_lister_cgit_run_populates_last_update(requests_mock_datadir, swh_scheduler):
    """Run the cgit lister and check which listed origins carry a last_update."""
    origin_count = 16
    # Origins expected to be recorded without any last_update value.
    undated_urls = [
        "https://git.tizen.org/cgit/All-Projects",
        "https://git.tizen.org/cgit/All-Users",
        "https://git.tizen.org/cgit/Lock-Projects",
    ]

    lister = CGitLister(swh_scheduler, url="https://git.tizen/cgit")
    assert lister.run() == ListerStats(pages=3, origins=origin_count)

    # Everything the lister found must have been recorded in the scheduler.
    recorded = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
    assert len(recorded) == origin_count

    # A date is expected on every origin except the undated ones.
    for origin in recorded:
        date_missing = origin.last_update is None
        assert date_missing == (origin.url in undated_urls)
@pytest.mark.parametrize(
    "date_str,expected_date",
    [
        # falsy value: no date to parse
        ({}, None),
        # strings matching none of the supported formats
        ("unexpected date", None),
        ("2020-0140-10 10:10:10 (GMT)", None),
        # timezone given by name
        (
            "2020-01-10 10:10:10 (GMT)",
            datetime(2020, 1, 10, 10, 10, 10, tzinfo=timezone.utc),
        ),
        # timezone given as a numeric offset
        (
            "2019-08-04 05:10:41 +0100",
            datetime(2019, 8, 4, 5, 10, 41, tzinfo=timezone(timedelta(hours=1))),
        ),
    ],
)
def test_lister_cgit_date_parsing(date_str, expected_date):
    """Check the value parsed from a repository's last_updated_date entry."""
    parsed = _parse_last_updated_date({"url": "url", "last_updated_date": date_str})
    assert parsed == expected_date