Add non-incremental Golang modules lister
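This uses the Go module index at https://index.golang.org. An associated loader, which is expected to download modules through the Go module proxy protocol [1], will be sent in the near future, as well as an incremental version of this lister.

[1] https://go.dev/ref/mod#goproxy-protocol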
This commit is contained in:
parent
0acf5b0f4f
commit
60405e78ae
11 changed files with 319 additions and 1 deletions
|
@ -17,6 +17,7 @@ following Python modules:
|
|||
- `swh.lister.github`
|
||||
- `swh.lister.gitlab`
|
||||
- `swh.lister.gnu`
|
||||
- `swh.lister.golang`
|
||||
- `swh.lister.launchpad`
|
||||
- `swh.lister.maven`
|
||||
- `swh.lister.npm`
|
||||
|
@ -38,7 +39,7 @@ Local deployment
|
|||
## lister configuration
|
||||
|
||||
Each lister implemented so far by Software Heritage (`bitbucket`, `cgit`, `cran`, `debian`,
|
||||
`gitea`, `github`, `gitlab`, `gnu`, `launchpad`, `npm`, `packagist`, `phabricator`, `pypi`, `tuleap`, `maven`)
|
||||
`gitea`, `github`, `gitlab`, `gnu`, `golang`, `launchpad`, `npm`, `packagist`, `phabricator`, `pypi`, `tuleap`, `maven`)
|
||||
must be configured by following the instructions below (please note that you have to replace
|
||||
`<lister_name>` by one of the lister name introduced above).
|
||||
|
||||
|
|
1
setup.py
1
setup.py
|
@ -67,6 +67,7 @@ setup(
|
|||
lister.github=swh.lister.github:register
|
||||
lister.gitlab=swh.lister.gitlab:register
|
||||
lister.gnu=swh.lister.gnu:register
|
||||
lister.golang=swh.lister.golang:register
|
||||
lister.launchpad=swh.lister.launchpad:register
|
||||
lister.npm=swh.lister.npm:register
|
||||
lister.opam=swh.lister.opam:register
|
||||
|
|
12
swh/lister/golang/__init__.py
Normal file
12
swh/lister/golang/__init__.py
Normal file
|
@ -0,0 +1,12 @@
|
|||
# Copyright (C) 2022 the Software Heritage developers
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
|
||||
def register():
    """Describe the Golang lister plugin.

    Returns a dict holding the lister class under ``"lister"`` and the list of
    task module names under ``"task_modules"``.
    """
    # Imported lazily so that importing this package does not pull in the lister.
    from .lister import GolangLister

    task_modules = ["%s.tasks" % __name__]
    return {"lister": GolangLister, "task_modules": task_modules}
|
145
swh/lister/golang/lister.py
Normal file
145
swh/lister/golang/lister.py
Normal file
|
@ -0,0 +1,145 @@
|
|||
# Copyright (C) 2022 The Software Heritage developers
|
||||
# See the AUTHORS file at the top-level directory of this distribution
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
from datetime import datetime
|
||||
import json
|
||||
import logging
|
||||
from typing import Any, Dict, Iterator, List, Optional, Tuple
|
||||
|
||||
import iso8601
|
||||
import requests
|
||||
from tenacity import before_sleep_log
|
||||
|
||||
from swh.lister.utils import retry_policy_generic, throttling_retry
|
||||
from swh.scheduler.interface import SchedulerInterface
|
||||
from swh.scheduler.model import ListedOrigin
|
||||
|
||||
from .. import USER_AGENT
|
||||
from ..pattern import CredentialsType, StatelessLister
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
GolangPageType = List[Dict[str, Any]]
|
||||
|
||||
|
||||
class GolangLister(StatelessLister[GolangPageType]):
    """
    List all Golang modules and send associated origins to scheduler.

    The lister queries the Golang module index, whose documentation can be found
    at https://index.golang.org

    A page is the list of index entries decoded from one index response, in
    which each ``Timestamp`` value has been replaced by its parsed ``datetime``.
    """

    GOLANG_MODULES_INDEX_URL = "https://index.golang.org/index"
    # The index does not seem to return more than 2000 entries per request.
    GOLANG_MODULES_INDEX_LIMIT = 2000
    LISTER_NAME = "Golang"

    def __init__(
        self, scheduler: SchedulerInterface, credentials: CredentialsType = None,
    ):
        super().__init__(
            scheduler=scheduler,
            url=self.GOLANG_MODULES_INDEX_URL,
            # Same value as LISTER_NAME; reuse it rather than repeating the literal.
            instance=self.LISTER_NAME,
            credentials=credentials,
        )

        self.session = requests.Session()
        self.session.headers.update(
            {"Accept": "application/json", "User-Agent": USER_AGENT}
        )

    @throttling_retry(
        retry=retry_policy_generic,
        before_sleep=before_sleep_log(logger, logging.WARNING),
    )
    def api_request(self, url: str) -> List[str]:
        """Fetch ``url`` and return the non-blank lines of the response body.

        Failed requests are retried according to ``retry_policy_generic``.

        Args:
            url: full URL of the index query to send.

        Returns:
            One string per entry, each expected to hold a single JSON document.

        Raises:
            requests.HTTPError: from ``raise_for_status`` when the final
                response carries an error status.
        """
        logger.debug("Fetching URL %s", url)

        response = self.session.get(url)

        if response.status_code not in (200, 304):
            # Log the status code and the final URL to ease debugging
            logger.warning(
                "Unexpected HTTP status code %s for URL %s",
                response.status_code,
                response.url,
            )

        response.raise_for_status()

        # The index answers with one JSON document per line. Split on newlines
        # only (not on arbitrary whitespace) so that an entry containing a space
        # is not cut in half, and drop blank lines such as the trailing one.
        return [line for line in response.text.split("\n") if line.strip()]

    def get_single_page(
        self, since: Optional[datetime] = None
    ) -> Tuple[GolangPageType, Optional[datetime]]:
        """Return a page from the API and the timestamp of its last entry.

        Since all entries are sorted by chronological order, the timestamp is useful
        both for pagination and later for incremental runs.

        Args:
            since: timezone-aware UTC datetime from which to list entries, or
                ``None`` to start from the beginning of the index.

        Returns:
            A ``(page, since)`` tuple. When the index returns no entry, ``page``
            is empty and ``since`` is the value that was passed in.

        Raises:
            ValueError: if ``since`` is naive or not expressed in UTC.
        """
        url = f"{self.url}?limit={self.GOLANG_MODULES_INDEX_LIMIT}"
        if since is not None:
            # The Golang index does not understand `+00:00` for some reason
            # and expects the "timezone zero" notation instead. This works
            # because all times are UTC.
            utc_offset = since.utcoffset()
            # Explicit check instead of `assert`, which is stripped under `-O`.
            if utc_offset is None or utc_offset.total_seconds() != 0:
                raise ValueError("Non-UTC datetime")
            as_date = since.isoformat().replace("+00:00", "Z")
            url = f"{url}&since={as_date}"

        entries = self.api_request(url)
        page: GolangPageType = []
        if not entries:
            return page, since

        for as_json in entries:
            entry = json.loads(as_json)
            timestamp = iso8601.parse_date(entry["Timestamp"])
            # We've already parsed it and we'll need the datetime later, save it
            entry["Timestamp"] = timestamp
            page.append(entry)
            # The index is guaranteed to be sorted in chronological order
            since = timestamp

        return page, since

    def get_pages(self) -> Iterator[GolangPageType]:
        """Yield pages of index entries until the index stops advancing."""
        page, since = self.get_single_page()
        last_since = since
        while page:
            yield page
            page, since = self.get_single_page(since=since)
            if last_since == since:
                # The index returns packages whose timestamp are greater or
                # equal to the date provided as parameter, which will create
                # an infinite loop if not stopped here.
                return
            last_since = since

    def get_origins_from_page(self, page: GolangPageType) -> Iterator[ListedOrigin]:
        """
        Iterate on all Golang projects and yield ListedOrigin instances.
        """
        assert self.lister_obj.id is not None

        for module in page:
            path = module["Path"]
            # The loader will be expected to use the golang proxy to do the
            # actual downloading. We're using `pkg.go.dev` so that the URL points
            # to somewhere useful for a human instead of an (incomplete) API path.
            origin_url = f"https://pkg.go.dev/{path}"

            # Since the Go index lists versions and not just packages, there will
            # be duplicates. Fortunately, `ListedOrigins` are "upserted" server-side,
            # so only the last timestamp will be used, with no duplicates.
            # Performance should not be an issue as they are sent to the db in bulk.
            yield ListedOrigin(
                lister_id=self.lister_obj.id,
                url=origin_url,
                visit_type="golang",
                last_update=module["Timestamp"],
            )
|
18
swh/lister/golang/tasks.py
Normal file
18
swh/lister/golang/tasks.py
Normal file
|
@ -0,0 +1,18 @@
|
|||
# Copyright (C) 2022 the Software Heritage developers
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
from celery import shared_task
|
||||
|
||||
from .lister import GolangLister
|
||||
|
||||
|
||||
@shared_task(name=__name__ + ".FullGolangLister")
def list_golang(**lister_args):
    """List the Golang module registry.

    Builds the lister from its configuration file with ``lister_args``, runs
    it and returns the resulting statistics as a dict.
    """
    lister = GolangLister.from_configfile(**lister_args)
    stats = lister.run()
    return stats.dict()
|
||||
|
||||
|
||||
@shared_task(name=__name__ + ".ping")
def _ping():
    """Answer with a constant string so callers can check the task is reachable."""
    reply = "OK"
    return reply
|
0
swh/lister/golang/tests/__init__.py
Normal file
0
swh/lister/golang/tests/__init__.py
Normal file
5
swh/lister/golang/tests/data/page-1.txt
Normal file
5
swh/lister/golang/tests/data/page-1.txt
Normal file
|
@ -0,0 +1,5 @@
|
|||
{"Path":"golang.org/x/text","Version":"v0.3.0","Timestamp":"2019-04-10T19:08:52.997264Z"}
|
||||
{"Path":"github.com/oklog/ulid","Version":"v1.3.1","Timestamp":"2019-04-11T18:47:23.234198Z"}
|
||||
{"Path":"collectd.org","Version":"v0.3.0","Timestamp":"2019-04-11T18:47:25.450546Z"}
|
||||
{"Path":"github.com/nats-io/nuid","Version":"v1.0.1","Timestamp":"2019-04-11T18:47:28.102348Z"}
|
||||
{"Path":"github.com/bmizerany/pat","Version":"v0.0.0-20170815010413-6226ea591a40","Timestamp":"2019-04-11T18:47:29.390564Z"}
|
4
swh/lister/golang/tests/data/page-2.txt
Normal file
4
swh/lister/golang/tests/data/page-2.txt
Normal file
|
@ -0,0 +1,4 @@
|
|||
{"Path":"github.com/djherbis/buffer","Version":"v1.0.0","Timestamp":"2019-04-11T18:47:29.974874Z"}
|
||||
{"Path":"github.com/djherbis/nio","Version":"v2.0.3+incompatible","Timestamp":"2019-04-11T18:47:32.283312Z"}
|
||||
{"Path":"github.com/gobuffalo/buffalo-plugins","Version":"v1.13.0","Timestamp":"2019-04-15T13:54:34.222985Z"}
|
||||
{"Path":"github.com/markbates/refresh","Version":"v1.7.1","Timestamp":"2019-04-15T13:54:35.250835Z"}
|
10
swh/lister/golang/tests/data/page-3.txt
Normal file
10
swh/lister/golang/tests/data/page-3.txt
Normal file
|
@ -0,0 +1,10 @@
|
|||
{"Path":"github.com/mitchellh/go-homedir","Version":"v1.1.0","Timestamp":"2019-04-15T13:54:35.678214Z"}
|
||||
{"Path":"github.com/gobuffalo/packr","Version":"v1.22.0","Timestamp":"2019-04-15T13:54:35.6889Z"}
|
||||
{"Path":"golang.org/x/sys","Version":"v0.0.0-20190220154126-629670e5acc5","Timestamp":"2019-04-15T13:54:37.555525Z"}
|
||||
{"Path":"github.com/gobuffalo/genny","Version":"v0.0.0-20190104222617-a71664fc38e7","Timestamp":"2019-04-15T13:54:37.841547Z"}
|
||||
{"Path":"github.com/blang/semver","Version":"v3.5.1+incompatible","Timestamp":"2019-04-15T13:54:39.107258Z"}
|
||||
{"Path":"github.com/gobuffalo/buffalo-pop","Version":"v1.3.0","Timestamp":"2019-04-15T13:54:39.135792Z"}
|
||||
{"Path":"golang.org/x/tools","Version":"v0.0.0-20190131142011-8dbcc66f33bb","Timestamp":"2019-04-15T13:54:39.250757Z"}
|
||||
{"Path":"github.com/gobuffalo/clara","Version":"v0.4.1","Timestamp":"2019-04-15T13:54:40.651916Z"}
|
||||
{"Path":"golang.org/x/tools","Version":"v0.0.0-20181213190329-bbccd8cae4a9","Timestamp":"2019-04-15T13:54:41.905064Z"}
|
||||
{"Path":"github.com/pkg/errors","Version":"v0.0.0-20161002052512-839d9e913e06","Timestamp":"2019-04-18T02:07:41.336899Z"}
|
90
swh/lister/golang/tests/test_lister.py
Normal file
90
swh/lister/golang/tests/test_lister.py
Normal file
|
@ -0,0 +1,90 @@
|
|||
# Copyright (C) 2022 The Software Heritage developers
|
||||
# See the AUTHORS file at the top-level directory of this distribution
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import iso8601
|
||||
|
||||
from swh.lister.golang.lister import GolangLister
|
||||
from swh.lister.tests.test_utils import assert_sleep_calls
|
||||
from swh.lister.utils import WAIT_EXP_BASE
|
||||
|
||||
# Expected (module path, last update) pairs, sorted by path.
# The https://pkg.go.dev prefix is omitted.
expected_listed = [
    ("collectd.org", "2019-04-11T18:47:25.450546+00:00"),
    ("github.com/blang/semver", "2019-04-15T13:54:39.107258+00:00"),
    ("github.com/bmizerany/pat", "2019-04-11T18:47:29.390564+00:00"),
    ("github.com/djherbis/buffer", "2019-04-11T18:47:29.974874+00:00"),
    ("github.com/djherbis/nio", "2019-04-11T18:47:32.283312+00:00"),
    ("github.com/gobuffalo/buffalo-plugins", "2019-04-15T13:54:34.222985+00:00"),
    ("github.com/gobuffalo/buffalo-pop", "2019-04-15T13:54:39.135792+00:00"),
    ("github.com/gobuffalo/clara", "2019-04-15T13:54:40.651916+00:00"),
    ("github.com/gobuffalo/genny", "2019-04-15T13:54:37.841547+00:00"),
    ("github.com/gobuffalo/packr", "2019-04-15T13:54:35.688900+00:00"),
    ("github.com/markbates/refresh", "2019-04-15T13:54:35.250835+00:00"),
    ("github.com/mitchellh/go-homedir", "2019-04-15T13:54:35.678214+00:00"),
    ("github.com/nats-io/nuid", "2019-04-11T18:47:28.102348+00:00"),
    ("github.com/oklog/ulid", "2019-04-11T18:47:23.234198+00:00"),
    ("github.com/pkg/errors", "2019-04-18T02:07:41.336899+00:00"),
    ("golang.org/x/sys", "2019-04-15T13:54:37.555525+00:00"),
    ("golang.org/x/text", "2019-04-10T19:08:52.997264+00:00"),
    # Only one x/tools entry is listed even though the index holds two
    # versions, and only the latest one's timestamp is used.
    ("golang.org/x/tools", "2019-04-15T13:54:41.905064+00:00"),
]
|
||||
|
||||
|
||||
def _generate_responses(datadir, requests_mock):
    """Register mocked index responses built from the ``page-*.txt`` fixtures.

    Each fixture page is preceded by a 429 and a 500 response so that the
    lister's retry logic is exercised before every successful answer.

    Args:
        datadir: directory holding the ``page-*.txt`` fixture files.
        requests_mock: mocker on which the GET responses are registered.
    """
    responses = []
    # `Path.glob` yields files in an arbitrary, filesystem-dependent order:
    # sort them so that pages are always served as page-1, page-2, page-3.
    for file in sorted(Path(datadir).glob("page-*.txt")):
        # Test that throttling and server errors are retried
        responses.append({"text": "", "status_code": 429})
        responses.append({"text": "", "status_code": 500})
        # Also test that the lister appropriately gets out of the infinite loop
        responses.append({"text": file.read_text(), "status_code": 200})

    requests_mock.get(GolangLister.GOLANG_MODULES_INDEX_URL, responses)
|
||||
|
||||
|
||||
def test_golang_lister(swh_scheduler, mocker, requests_mock, datadir):
    """Run the lister twice against the mocked index and check what it lists."""
    # first listing, should return one origin per package
    lister = GolangLister(scheduler=swh_scheduler)

    # Exponential retries take a long time, so stub time.sleep
    mocked_sleep = mocker.patch.object(lister.api_request.retry, "sleep")

    _generate_responses(datadir, requests_mock)

    first_run = lister.run()

    assert first_run.pages == 3
    # The two `golang.org/x/tools` versions are *not* listed as separate origins
    assert first_run.origins == 18

    listed = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
    scheduler_origins = sorted(listed, key=lambda origin: origin.url)

    for scheduled, expected in zip(scheduler_origins, expected_listed):
        path, timestamp = expected
        assert scheduled.url == f"https://pkg.go.dev/{path}"
        assert scheduled.last_update == iso8601.parse_date(timestamp)
        assert scheduled.visit_type == "golang"

    assert len(scheduler_origins) == len(expected_listed)

    # Test `time.sleep` is called with exponential retries: every one of the
    # three pages is preceded by two failed attempts.
    expected_sleeps = [1, WAIT_EXP_BASE] * 3
    assert_sleep_calls(mocker, mocked_sleep, expected_sleeps)

    # doing it all again (without incremental) should give us the same result
    lister = GolangLister(scheduler=swh_scheduler)
    mocker.patch.object(lister.api_request.retry, "sleep")
    _generate_responses(datadir, requests_mock)
    second_run = lister.run()

    assert second_run.pages == 3
    assert second_run.origins == 18
|
32
swh/lister/golang/tests/test_tasks.py
Normal file
32
swh/lister/golang/tests/test_tasks.py
Normal file
|
@ -0,0 +1,32 @@
|
|||
# Copyright (C) 2022 The Software Heritage developers
|
||||
# See the AUTHORS file at the top-level directory of this distribution
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
from swh.lister.pattern import ListerStats
|
||||
|
||||
|
||||
def test_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
    """The ping task of the Golang lister completes and answers "OK"."""
    task_name = "swh.lister.golang.tasks.ping"
    result = swh_scheduler_celery_app.send_task(task_name)
    assert result

    # Block until the task has completed before inspecting its outcome
    result.wait()
    assert result.successful()
    assert result.result == "OK"
|
||||
|
||||
|
||||
def test_golang_full_listing_task(
    swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker
):
    """The full listing task builds the lister, runs it and returns its stats."""
    expected_stats = ListerStats(pages=1, origins=28000)

    # Replace the lister class so that no real listing takes place
    lister = mocker.patch("swh.lister.golang.tasks.GolangLister")
    lister.from_configfile.return_value = lister
    lister.run.return_value = expected_stats

    task_name = "swh.lister.golang.tasks.FullGolangLister"
    result = swh_scheduler_celery_app.send_task(task_name)
    assert result

    result.wait()
    assert result.successful()
    assert result.result == expected_stats.dict()

    lister.from_configfile.assert_called_once_with()
    lister.run.assert_called_once_with()
|
Loading…
Add table
Add a link
Reference in a new issue