Add non-incremental Golang modules lister

This lister uses the Go module index at https://index.golang.org. An associated
loader, which is expected to rely on the GOPROXY protocol [1], will be sent in
the near future, as well as an incremental version of this lister.

[1] https://go.dev/ref/mod#goproxy-protocol
This commit is contained in:
Raphaël Gomès 2022-03-09 22:35:40 +01:00
parent 0acf5b0f4f
commit 60405e78ae
11 changed files with 319 additions and 1 deletions

View file

@ -17,6 +17,7 @@ following Python modules:
- `swh.lister.github`
- `swh.lister.gitlab`
- `swh.lister.gnu`
- `swh.lister.golang`
- `swh.lister.launchpad`
- `swh.lister.maven`
- `swh.lister.npm`
@ -38,7 +39,7 @@ Local deployment
## lister configuration
Each lister implemented so far by Software Heritage (`bitbucket`, `cgit`, `cran`, `debian`,
`gitea`, `github`, `gitlab`, `gnu`, `launchpad`, `npm`, `packagist`, `phabricator`, `pypi`, `tuleap`, `maven`)
`gitea`, `github`, `gitlab`, `gnu`, `golang`, `launchpad`, `npm`, `packagist`, `phabricator`, `pypi`, `tuleap`, `maven`)
must be configured by following the instructions below (please note that you have to replace
`<lister_name>` with one of the lister names introduced above).

View file

@ -67,6 +67,7 @@ setup(
lister.github=swh.lister.github:register
lister.gitlab=swh.lister.gitlab:register
lister.gnu=swh.lister.gnu:register
lister.golang=swh.lister.golang:register
lister.launchpad=swh.lister.launchpad:register
lister.npm=swh.lister.npm:register
lister.opam=swh.lister.opam:register

View file

@ -0,0 +1,12 @@
# Copyright (C) 2022 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
def register():
    """Return the registration data for the Golang lister.

    The lister class is imported lazily, inside the function, and returned
    together with the dotted path of this package's ``tasks`` module.
    """
    from .lister import GolangLister

    tasks_module = f"{__name__}.tasks"
    return {"lister": GolangLister, "task_modules": [tasks_module]}

145
swh/lister/golang/lister.py Normal file
View file

@ -0,0 +1,145 @@
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from datetime import datetime
import json
import logging
from typing import Any, Dict, Iterator, List, Optional, Tuple
import iso8601
import requests
from tenacity import before_sleep_log
from swh.lister.utils import retry_policy_generic, throttling_retry
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
from .. import USER_AGENT
from ..pattern import CredentialsType, StatelessLister
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# One page of results: a list of decoded JSON entries from the index.
GolangPageType = List[Dict[str, Any]]
class GolangLister(StatelessLister[GolangPageType]):
    """
    List all Golang modules and send associated origins to scheduler.

    The lister queries the Golang module index, whose documentation can be found
    at https://index.golang.org
    """

    GOLANG_MODULES_INDEX_URL = "https://index.golang.org/index"
    # `limit` seems to be... limited to 2000.
    GOLANG_MODULES_INDEX_LIMIT = 2000
    LISTER_NAME = "Golang"

    def __init__(
        self, scheduler: SchedulerInterface, credentials: CredentialsType = None,
    ):
        """Set up the lister and the HTTP session used to query the index.

        Args:
            scheduler: scheduler instance the listed origins are sent to
            credentials: optional credentials, forwarded to the base lister
        """
        super().__init__(
            scheduler=scheduler,
            url=self.GOLANG_MODULES_INDEX_URL,
            instance="Golang",
            credentials=credentials,
        )

        self.session = requests.Session()
        self.session.headers.update(
            {"Accept": "application/json", "User-Agent": USER_AGENT}
        )

    @throttling_retry(
        retry=retry_policy_generic,
        before_sleep=before_sleep_log(logger, logging.WARNING),
    )
    def api_request(self, url: str) -> List[str]:
        """Fetch ``url`` and return the whitespace-separated chunks of its body.

        The index answers with one JSON document per line, so each returned
        string is expected to be one JSON-encoded entry.

        Raises:
            requests.HTTPError: if the response carries an error status code
                (raised by ``raise_for_status``); the retry decorator may
                re-issue the request before the error reaches the caller.
        """
        logger.debug("Fetching URL %s", url)

        response = self.session.get(url)

        if response.status_code not in (200, 304):
            # Log the status code and the final URL to ease debugging
            logger.warning(
                "Unexpected HTTP status code %s for URL %s",
                response.status_code,
                response.url,
            )

        response.raise_for_status()

        return response.text.split()

    def get_single_page(
        self, since: Optional[datetime] = None
    ) -> Tuple[GolangPageType, Optional[datetime]]:
        """Return a page from the API and the timestamp of its last entry.

        Since all entries are sorted by chronological order, the timestamp is useful
        both for pagination and later for incremental runs.

        Args:
            since: if given, only entries whose timestamp is greater than or
                equal to this (UTC, timezone-aware) datetime are requested

        Returns:
            A ``(page, since)`` tuple. Each entry of ``page`` has its
            ``"Timestamp"`` value replaced by the parsed datetime. When the
            index returns no entry, the page is empty and ``since`` is
            returned unchanged.
        """
        url = f"{self.url}?limit={self.GOLANG_MODULES_INDEX_LIMIT}"
        if since is not None:
            # The Golang index does not understand `+00:00` for some reason
            # and expects the "timezone zero" notation instead. This works
            # because all times are UTC.
            utc_offset = since.utcoffset()
            assert (
                utc_offset is not None and utc_offset.total_seconds() == 0
            ), "Non-UTC datetime"
            as_date = since.isoformat().replace("+00:00", "Z")
            url = f"{url}&since={as_date}"

        entries = self.api_request(url)
        page: GolangPageType = []
        if not entries:
            return page, since

        for as_json in entries:
            entry = json.loads(as_json)
            timestamp = iso8601.parse_date(entry["Timestamp"])
            # We've already parsed it and we'll need the datetime later, save it
            entry["Timestamp"] = timestamp
            page.append(entry)

        # The index is guaranteed to be sorted in chronological order, so the
        # timestamp of the last parsed entry is the most recent one of the page
        since = timestamp

        return page, since

    def get_pages(self) -> Iterator[GolangPageType]:
        """Yield every page of the index, from the oldest entries onwards.

        Iteration stops when the index returns an empty page, or when a
        request does not move the pagination timestamp forward.
        """
        page, since = self.get_single_page()
        last_since = since
        while page:
            yield page
            page, since = self.get_single_page(since=since)
            if last_since == since:
                # The index returns packages whose timestamp are greater or
                # equal to the date provided as parameter, which will create
                # an infinite loop if not stopped here.
                # A bare `return` is used: a value returned from a generator
                # is never seen by code that simply iterates over it.
                return
            last_since = since

    def get_origins_from_page(self, page: GolangPageType) -> Iterator[ListedOrigin]:
        """
        Iterate on all Golang projects and yield ListedOrigin instances.
        """
        assert self.lister_obj.id is not None

        for module in page:
            path = module["Path"]
            # The loader will be expected to use the golang proxy to do the
            # actual downloading. We're using `pkg.go.dev` so that the URL points
            # to somewhere useful for a human instead of an (incomplete) API path.
            origin_url = f"https://pkg.go.dev/{path}"

            # Since the Go index lists versions and not just packages, there will
            # be duplicates. Fortunately, `ListedOrigins` are "upserted" server-side,
            # so only the last timestamp will be used, with no duplicates.
            # Performance should not be an issue as they are sent to the db in bulk.
            yield ListedOrigin(
                lister_id=self.lister_obj.id,
                url=origin_url,
                visit_type="golang",
                last_update=module["Timestamp"],
            )

View file

@ -0,0 +1,18 @@
# Copyright (C) 2022 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from celery import shared_task
from .lister import GolangLister
@shared_task(name=f"{__name__}.FullGolangLister")
def list_golang(**lister_args):
    """List the Golang module registry.

    The lister is built from the configuration file with ``lister_args``,
    run once, and the resulting statistics are returned as a dict.
    """
    lister = GolangLister.from_configfile(**lister_args)
    stats = lister.run()
    return stats.dict()
@shared_task(name=f"{__name__}.ping")
def _ping():
    """Trivial task that always answers ``"OK"``."""
    return "OK"

View file

View file

@ -0,0 +1,5 @@
{"Path":"golang.org/x/text","Version":"v0.3.0","Timestamp":"2019-04-10T19:08:52.997264Z"}
{"Path":"github.com/oklog/ulid","Version":"v1.3.1","Timestamp":"2019-04-11T18:47:23.234198Z"}
{"Path":"collectd.org","Version":"v0.3.0","Timestamp":"2019-04-11T18:47:25.450546Z"}
{"Path":"github.com/nats-io/nuid","Version":"v1.0.1","Timestamp":"2019-04-11T18:47:28.102348Z"}
{"Path":"github.com/bmizerany/pat","Version":"v0.0.0-20170815010413-6226ea591a40","Timestamp":"2019-04-11T18:47:29.390564Z"}

View file

@ -0,0 +1,4 @@
{"Path":"github.com/djherbis/buffer","Version":"v1.0.0","Timestamp":"2019-04-11T18:47:29.974874Z"}
{"Path":"github.com/djherbis/nio","Version":"v2.0.3+incompatible","Timestamp":"2019-04-11T18:47:32.283312Z"}
{"Path":"github.com/gobuffalo/buffalo-plugins","Version":"v1.13.0","Timestamp":"2019-04-15T13:54:34.222985Z"}
{"Path":"github.com/markbates/refresh","Version":"v1.7.1","Timestamp":"2019-04-15T13:54:35.250835Z"}

View file

@ -0,0 +1,10 @@
{"Path":"github.com/mitchellh/go-homedir","Version":"v1.1.0","Timestamp":"2019-04-15T13:54:35.678214Z"}
{"Path":"github.com/gobuffalo/packr","Version":"v1.22.0","Timestamp":"2019-04-15T13:54:35.6889Z"}
{"Path":"golang.org/x/sys","Version":"v0.0.0-20190220154126-629670e5acc5","Timestamp":"2019-04-15T13:54:37.555525Z"}
{"Path":"github.com/gobuffalo/genny","Version":"v0.0.0-20190104222617-a71664fc38e7","Timestamp":"2019-04-15T13:54:37.841547Z"}
{"Path":"github.com/blang/semver","Version":"v3.5.1+incompatible","Timestamp":"2019-04-15T13:54:39.107258Z"}
{"Path":"github.com/gobuffalo/buffalo-pop","Version":"v1.3.0","Timestamp":"2019-04-15T13:54:39.135792Z"}
{"Path":"golang.org/x/tools","Version":"v0.0.0-20190131142011-8dbcc66f33bb","Timestamp":"2019-04-15T13:54:39.250757Z"}
{"Path":"github.com/gobuffalo/clara","Version":"v0.4.1","Timestamp":"2019-04-15T13:54:40.651916Z"}
{"Path":"golang.org/x/tools","Version":"v0.0.0-20181213190329-bbccd8cae4a9","Timestamp":"2019-04-15T13:54:41.905064Z"}
{"Path":"github.com/pkg/errors","Version":"v0.0.0-20161002052512-839d9e913e06","Timestamp":"2019-04-18T02:07:41.336899Z"}

View file

@ -0,0 +1,90 @@
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from pathlib import Path
import iso8601
from swh.lister.golang.lister import GolangLister
from swh.lister.tests.test_utils import assert_sleep_calls
from swh.lister.utils import WAIT_EXP_BASE
# Expected (module path, last update) pairs, sorted by module path.
# The https://pkg.go.dev prefix is omitted from the paths.
expected_listed = [
    ("collectd.org", "2019-04-11T18:47:25.450546+00:00"),
    ("github.com/blang/semver", "2019-04-15T13:54:39.107258+00:00"),
    ("github.com/bmizerany/pat", "2019-04-11T18:47:29.390564+00:00"),
    ("github.com/djherbis/buffer", "2019-04-11T18:47:29.974874+00:00"),
    ("github.com/djherbis/nio", "2019-04-11T18:47:32.283312+00:00"),
    ("github.com/gobuffalo/buffalo-plugins", "2019-04-15T13:54:34.222985+00:00"),
    ("github.com/gobuffalo/buffalo-pop", "2019-04-15T13:54:39.135792+00:00"),
    ("github.com/gobuffalo/clara", "2019-04-15T13:54:40.651916+00:00"),
    ("github.com/gobuffalo/genny", "2019-04-15T13:54:37.841547+00:00"),
    ("github.com/gobuffalo/packr", "2019-04-15T13:54:35.688900+00:00"),
    ("github.com/markbates/refresh", "2019-04-15T13:54:35.250835+00:00"),
    ("github.com/mitchellh/go-homedir", "2019-04-15T13:54:35.678214+00:00"),
    ("github.com/nats-io/nuid", "2019-04-11T18:47:28.102348+00:00"),
    ("github.com/oklog/ulid", "2019-04-11T18:47:23.234198+00:00"),
    ("github.com/pkg/errors", "2019-04-18T02:07:41.336899+00:00"),
    ("golang.org/x/sys", "2019-04-15T13:54:37.555525+00:00"),
    ("golang.org/x/text", "2019-04-10T19:08:52.997264+00:00"),
    # Only one x/tools origin is listed even though the index contains two
    # versions of it, and only the latest one's timestamp is used.
    ("golang.org/x/tools", "2019-04-15T13:54:41.905064+00:00"),
]
def _generate_responses(datadir, requests_mock):
    """Register mocked index responses built from the ``page-*.txt`` fixtures.

    Each fixture page is preceded by a 429 and a 500 response, so that the
    lister has to retry before it actually gets the page.

    Args:
        datadir: directory containing the ``page-*.txt`` fixture files
        requests_mock: the ``requests_mock`` fixture on which the responses
            are registered for the index URL
    """
    responses = []
    # The order in which `glob` yields files depends on the file system; sort
    # them so that pages are always served in the same order.
    for file in sorted(Path(datadir).glob("page-*.txt")):
        # Test that throttling and server errors are retried
        responses.append({"text": "", "status_code": 429})
        responses.append({"text": "", "status_code": 500})
        # Also test that the lister appropriately gets out of the infinite loop
        responses.append({"text": file.read_text(), "status_code": 200})

    requests_mock.get(GolangLister.GOLANG_MODULES_INDEX_URL, responses)
def test_golang_lister(swh_scheduler, mocker, requests_mock, datadir):
    """Run a full listing twice and check pages, origins and retry sleeps."""
    # First listing: one origin per package is expected
    lister = GolangLister(scheduler=swh_scheduler)
    # Exponential retries take a long time, so stub time.sleep
    mocked_sleep = mocker.patch.object(lister.api_request.retry, "sleep")
    _generate_responses(datadir, requests_mock)

    stats = lister.run()

    assert stats.pages == 3
    # The two `golang.org/x/tools` versions are *not* listed as separate origins
    assert stats.origins == 18

    listed = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
    scheduler_origins = sorted(listed, key=lambda origin: origin.url)

    for origin, (path, last_update) in zip(scheduler_origins, expected_listed):
        assert origin.url == f"https://pkg.go.dev/{path}"
        assert origin.last_update == iso8601.parse_date(last_update)
        assert origin.visit_type == "golang"

    assert len(scheduler_origins) == len(expected_listed)

    # Test `time.sleep` is called with exponential retries:
    # one (1, WAIT_EXP_BASE) pair of sleeps per fixture page
    expected_sleeps = [1, WAIT_EXP_BASE] * 3
    assert_sleep_calls(mocker, mocked_sleep, expected_sleeps)

    # Doing it all again (without incremental) should give us the same result
    lister = GolangLister(scheduler=swh_scheduler)
    mocked_sleep = mocker.patch.object(lister.api_request.retry, "sleep")
    _generate_responses(datadir, requests_mock)

    stats = lister.run()

    assert stats.pages == 3
    assert stats.origins == 18

View file

@ -0,0 +1,32 @@
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from swh.lister.pattern import ListerStats
def test_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
    """The ping task of the Golang lister runs and answers "OK"."""
    task = swh_scheduler_celery_app.send_task("swh.lister.golang.tasks.ping")
    assert task

    # Block until the worker has processed the task
    task.wait()

    assert task.successful()
    assert task.result == "OK"
def test_golang_full_listing_task(
    swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker
):
    """The full listing task builds the lister, runs it and returns its stats."""
    expected_stats = ListerStats(pages=1, origins=28000)

    # Replace the lister class used by the task module with a mock that
    # returns itself from `from_configfile` and canned stats from `run`
    mocked_lister = mocker.patch("swh.lister.golang.tasks.GolangLister")
    mocked_lister.from_configfile.return_value = mocked_lister
    mocked_lister.run.return_value = expected_stats

    task = swh_scheduler_celery_app.send_task(
        "swh.lister.golang.tasks.FullGolangLister"
    )
    assert task

    # Block until the worker has processed the task
    task.wait()

    assert task.successful()
    assert task.result == expected_stats.dict()

    mocked_lister.from_configfile.assert_called_once_with()
    mocked_lister.run.assert_called_once_with()