docs: Add new "how to write a lister" tutorial with unified lister API

This adds a new tutorial detailing how to write listers with the current unified
API (both incremental and stateless). It also proposes a Python template file to
bootstrap a new lister.

Finally, this renames the previous tutorial to tutorial-2017.

Related to T3073
tenma 2021-02-26 14:59:13 +01:00 committed by Antoine R. Dumont (@ardumont)
parent 5b4dc289b7
commit 2e17729e97
3 changed files with 853 additions and 321 deletions

docs/new_lister_template.py (new file)
@@ -0,0 +1,166 @@
# Copyright (C) 2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

from dataclasses import asdict, dataclass
import logging
from typing import Any, Dict, Iterator, List
from urllib.parse import urljoin

import requests
from tenacity.before_sleep import before_sleep_log

from swh.lister.utils import throttling_retry
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin

from .. import USER_AGENT
from ..pattern import CredentialsType, Lister

logger = logging.getLogger(__name__)

# Aliasing the page results returned by the `get_pages` method of the lister.
NewForgeListerPage = List[Dict[str, Any]]

@dataclass
class NewForgeListerState:
    """The NewForgeLister instance state. This is used for incremental listing."""

    current: str = ""
    """Id of the last origin listed on an incremental pass"""


# If there is no need to keep state, subclass StatelessLister[NewForgeListerPage]
class NewForgeLister(Lister[NewForgeListerState, NewForgeListerPage]):
    """List origins from the "NewForge" forge."""

    # Part of the lister API, that identifies this lister
    LISTER_NAME = ""
    # (Optional) VCS type of the origins listed by this lister, if constant
    VISIT_TYPE = ""

    # Instance URLs include the hostname and the common path prefix of processed URLs
    EXAMPLE_BASE_URL = "https://netloc/api/v1/"
    # Path of a specific resource to process, to join the base URL with
    EXAMPLE_PATH = "origins/list"

    def __init__(
        self,
        # Required
        scheduler: SchedulerInterface,
        # Instance URL, required for multi-instance listers (e.g. gitlab, ...)
        url: str,
        # Instance name (free form), required for multi-instance listers,
        # or computed from `url`
        instance: str,
        # Required whether the lister supports authentication or not
        credentials: CredentialsType = None,
    ):
        super().__init__(
            scheduler=scheduler, credentials=credentials, url=url, instance=instance,
        )

        self.session = requests.Session()
        # Declaring the USER_AGENT is more sysadmin-friendly for the forge we list
        self.session.headers.update(
            {"Accept": "application/json", "User-Agent": USER_AGENT}
        )

    def state_from_dict(self, d: Dict[str, Any]) -> NewForgeListerState:
        return NewForgeListerState(**d)

    def state_to_dict(self, state: NewForgeListerState) -> Dict[str, Any]:
        return asdict(state)

    @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
    def page_request(self, url, params) -> requests.Response:
        # Do the network resource request under a retrying decorator
        # to handle rate limiting and transient errors up to a limit.
        # `throttling_retry` by default uses the `requests` library to check
        # only for rate-limiting, with a base-10 exponential waiting strategy.
        # This can be customized by passing waiting, retrying and logging
        # strategies as functions. See the `tenacity` library documentation.

        # Log the listed URL to ease debugging
        logger.debug("Fetching URL %s with params %s", url, params)
        response = self.session.get(url, params=params)

        if response.status_code != 200:
            # Log the response content to ease debugging
            logger.warning(
                "Unexpected HTTP status code %s on %s: %s",
                response.status_code,
                response.url,
                response.content,
            )
        # The lister must fail on blocking errors
        response.raise_for_status()

        return response

    def get_pages(self) -> Iterator[NewForgeListerPage]:
        # The algorithm depends on the service, but should request data reliably,
        # following pagination if relevant and yielding pages in a streaming fashion.
        # If incremental listing is supported, initialize from saved lister state.
        # Make use of any next page URL provided.
        # Simplify the results early to ease testing and debugging.

        # Initialize from the lister saved state
        current = ""
        if self.state.current is not None:
            current = self.state.current

        # Construct the URL of a service endpoint; the lister can have others to fetch
        url = urljoin(self.url, self.EXAMPLE_PATH)

        while current is not None:
            # Parametrize the request for incremental listing
            body = self.page_request(url, {"current": current}).json()

            # Simplify the page if possible to only the necessary elements
            # and yield it
            yield body

            # Get the next page parameter or end the loop when there is none
            current = body.get("next")

    def get_origins_from_page(
        self, page: NewForgeListerPage
    ) -> Iterator[ListedOrigin]:
        """Convert a page of NewForgeLister repositories into a list of ListedOrigins"""
        assert self.lister_obj.id is not None

        for element in page:
            yield ListedOrigin(
                # Required. Should use this value.
                lister_id=self.lister_obj.id,
                # Required. Visit type of the currently processed origin
                visit_type=self.VISIT_TYPE,
                # Required. URL corresponding to the origin for loaders to ingest
                url=...,
                # Should be set if the service provides it and if it induces no
                # substantial additional processing cost
                last_update=...,
            )

    def commit_page(self, page: NewForgeListerPage) -> None:
        # Update the lister state to the latest `current`
        current = page[-1]["current"]
        if current > self.state.current:
            self.state.current = current

    def finalize(self) -> None:
        # Pull fresh lister state from the scheduler backend, in case multiple
        # listers run concurrently
        scheduler_state = self.get_state_from_scheduler()

        # Update the lister state in the backend only if `current` is fresher than
        # the one stored in the database.
        if self.state.current > scheduler_state.current:
            self.updated = True

docs/tutorial-2017.rst (new file)
@@ -0,0 +1,366 @@
.. _lister-tutorial-2017:

Tutorial: list the content of your favorite forge in just a few steps
=====================================================================

(the `original version
<https://www.softwareheritage.org/2017/03/24/list-the-content-of-your-favorite-forge-in-just-a-few-steps/>`_
of this article appeared on the Software Heritage blog)
Back in November 2016, Nicolas Dandrimont wrote about structural code changes
`leading to a massive (+15 million!) upswing in the number of repositories
archived by Software Heritage
<https://www.softwareheritage.org/2016/11/09/listing-47-million-repositories-refactoring-our-github-lister/>`_
through a combination of automatic linkage between the listing and loading
scheduler, new understanding of how to deal with extremely large repository
hosts like `GitHub <https://github.com/>`_, and activating a new set of
repositories that had previously been skipped over.
In the post, Nicolas outlined the three major phases of work in Software
Heritage's preservation process (listing, scheduling updates, loading) and
highlighted that the ability to preserve the world's free software heritage
depends on our ability to find and list the repositories.
At the time, Software Heritage was only able to list projects on
GitHub. Focusing early on GitHub, one of the largest and most active forges in
the world, allowed for a big value-to-effort ratio and a rapid launch for the
archive. As the old Italian proverb goes, "Il meglio è nemico del bene," or in
modern English parlance, "Perfect is the enemy of good," right? Right. So the
plan from the beginning was to implement a lister for GitHub, then maybe
implement another one, and then take a few giant steps backward and squint our
eyes.
Why? Because source code hosting services don't behave according to a unified
standard. Each new service requires dedicated development time to implement a
new scraping client for the non-transferable requirements and intricacies of
that service's API. At the time, doing it in an extensible and adaptable way
required a level of exposure to the myriad differences between these services
that we just didn't think we had yet.
Nicolas' post closed by saying "We haven't carved out a stable API yet that
allows you to just fill in the blanks, as we only have the GitHub lister
currently, and a proven API will emerge organically only once we have some
diversity."
That has since changed. As of March 6, 2017, the Software Heritage **lister
code has been aggressively restructured, abstracted, and commented** to make
creating new listers significantly easier. There may yet be a few kinks to iron
out, but **now making a new lister is practically like filling in the blanks**.
Fundamentally, a basic lister must follow these steps:
1. Issue a network request for a service endpoint.
2. Convert the response into a canonical format.
3. Populate a work queue for fetching and ingesting source repositories.
Steps 1 and 3 are generic problems, so they can get generic solutions hidden
away in the base code, most of which never needs to change. That leaves us to
implement step 2, which can be trivially done now for services with clean web
APIs.
In the new code, we've tried to hide away as much generic functionality as
possible, turning it into set-and-forget plumbing between a few simple
customized elements. Different hosting services might use different network
protocols, rate-limit messages, or pagination schemes, but, as long as there is
some way to get a list of the hosted repositories, we think that the new base
code will make getting those repositories much easier.
First, let me give you the 30,000 foot view…
The old GitHub-specific lister code looked like this (265 lines of Python):
.. figure:: images/old_github_lister.png
By contrast, the new GitHub-specific code looks like this (34 lines of Python):
.. figure:: images/new_github_lister.png
And the new BitBucket-specific code is even shorter and looks like this (24 lines of Python):
.. figure:: images/new_bitbucket_lister.png
And now this is common shared code in a few abstract base classes, with some new
features and loads of docstring comments (in red):
.. figure:: images/new_base.png
So how does the lister code work now, and **how might a contributing developer
go about making a new one?**
The first thing to know is that we now have a generic lister base class and ORM
model. A subclass of the lister base should already be able to do almost
everything needed to complete a listing task for a single service
request/response cycle with the following implementation requirements:
1. A member variable must be declared called ``MODEL``, which is equal to a
subclass (Note: type, not instance) of the base ORM model. The reason for
using a subclass is mostly that different services use different,
incompatible primary identifiers for their repositories. The model
subclasses are typically only one or two additional variable declarations.
2. A method called ``transport_request`` must be implemented, which takes the
complete target identifier (e.g., a URL) and tries to request it one time
using whatever transport protocol is required for interacting with the
service. It should not attempt to retry on timeouts or do anything else with
the response (that is already done for you). It should just either return
the response or raise a ``FetchError`` exception.
3. A method called ``transport_response_to_string`` must be implemented, which
takes the entire response of the request in (1) and converts it to a string
for logging purposes.
4. A method called ``transport_quota_check`` must be implemented, which takes
the entire response of the request in (1) and checks to see if the process
has run afoul of any query quotas or rate limits. If the service says to
wait before making more requests, the method should return ``True`` and also
the number of seconds to wait, otherwise it returns ``False``.
5. A method called ``transport_response_simplified`` must be implemented, which
also takes the entire response of the request in (1) and converts it to a
Python list of dicts (one dict for each repository) with keys given
according to the aforementioned ``MODEL`` class members.
Because 2, 3, and 4 are basically dependent only on the chosen network
protocol, we also have an HTTP mix-in module, which supplements the lister base
and provides default implementations for those methods along with optional
request header injection using the Python Requests library. The
``transport_quota_check`` method as provided follows the IETF standard for
communicating rate limits with `HTTP code 429
<https://tools.ietf.org/html/rfc6585#section-4>`_ which some hosting services
have chosen not to follow, so it's possible that a specific lister will need to
override it.
On top of all of that, we also provide another layer over the base lister class
which adds support for sequentially looping over indices. What are indices?
Well, some services (`BitBucket <https://bitbucket.org/>`_ and GitHub for
example) don't send you the entire list of all of their repositories at once,
because that server response would be unwieldy. Instead they paginate their
results, and they also allow you to query their APIs like this:
``https://server_address.tld/query_type?start_listing_from_id=foo``. Changing
the value of 'foo' lets you fetch a set of repositories starting from there. We
call 'foo' an index, and we call a service that works this way an indexing
service. GitHub uses the repository unique identifier and BitBucket uses the
repository creation time, but a service can really use anything as long as the
values monotonically increase with new repositories. A good indexing service
also includes the URL of the next page with a later 'foo' in its responses. For
these indexing services we provide another intermediate lister called the
indexing lister. Instead of inheriting from :class:`ListerBase
<swh.lister.core.lister_base.ListerBase>`, the lister class would inherit
from :class:`IndexingLister
<swh.lister.core.indexing_lister.IndexingLister>`. Along with the
requirements of the lister base, the indexing lister base adds one extra
requirement:
1. A method called ``get_next_target_from_response`` must be defined, which
takes a complete request response and returns the index ('foo' above) of the
next page.
So those are all the basic requirements. There are, of course, a few other
little bits and pieces (covered for now in the code's docstring comments), but
for the most part that's it. It sounds like a lot of information to absorb and
implement, but remember that most of the implementation requirements mentioned
above are already provided for 99% of services by the HTTP mix-in module. It
looks much simpler when we look at the actual implementations of the two
new-style indexing listers we currently have…
When developing a new lister, it's important to test. For this, add the tests
(check `swh/lister/*/tests/`) and register the celery tasks in the main
conftest.py (`swh/lister/core/tests/conftest.py`).
Another important step is to actually run it within the
docker-dev (:ref:`run-lister-tutorial`).
This is the entire source code for the BitBucket repository lister::
    # Copyright (C) 2017 the Software Heritage developers
    # License: GNU General Public License version 3 or later
    # See top-level LICENSE file for more information

    from urllib import parse

    from swh.lister.bitbucket.models import BitBucketModel
    from swh.lister.core.indexing_lister import IndexingHttpLister


    class BitBucketLister(IndexingHttpLister):
        PATH_TEMPLATE = '/repositories?after=%s'
        MODEL = BitBucketModel

        def get_model_from_repo(self, repo):
            return {'uid': repo['uuid'],
                    'indexable': repo['created_on'],
                    'name': repo['name'],
                    'full_name': repo['full_name'],
                    'html_url': repo['links']['html']['href'],
                    'origin_url': repo['links']['clone'][0]['href'],
                    'origin_type': repo['scm'],
                    'description': repo['description']}

        def get_next_target_from_response(self, response):
            body = response.json()
            if 'next' in body:
                return parse.unquote(body['next'].split('after=')[1])
            else:
                return None

        def transport_response_simplified(self, response):
            repos = response.json()['values']
            return [self.get_model_from_repo(repo) for repo in repos]
And this is the entire source code for the GitHub repository lister::
    # Copyright (C) 2017 the Software Heritage developers
    # License: GNU General Public License version 3 or later
    # See top-level LICENSE file for more information

    import time

    from swh.lister.core.indexing_lister import IndexingHttpLister
    from swh.lister.github.models import GitHubModel


    class GitHubLister(IndexingHttpLister):
        PATH_TEMPLATE = '/repositories?since=%d'
        MODEL = GitHubModel

        def get_model_from_repo(self, repo):
            return {'uid': repo['id'],
                    'indexable': repo['id'],
                    'name': repo['name'],
                    'full_name': repo['full_name'],
                    'html_url': repo['html_url'],
                    'origin_url': repo['html_url'],
                    'origin_type': 'git',
                    'description': repo['description']}

        def get_next_target_from_response(self, response):
            if 'next' in response.links:
                next_url = response.links['next']['url']
                return int(next_url.split('since=')[1])
            else:
                return None

        def transport_response_simplified(self, response):
            repos = response.json()
            return [self.get_model_from_repo(repo) for repo in repos]

        def request_headers(self):
            return {'Accept': 'application/vnd.github.v3+json'}

        def transport_quota_check(self, response):
            remain = int(response.headers['X-RateLimit-Remaining'])
            if response.status_code == 403 and remain == 0:
                reset_at = int(response.headers['X-RateLimit-Reset'])
                delay = min(reset_at - time.time(), 3600)
                return True, delay
            else:
                return False, 0
We can see that there are some common elements:
* Both use the HTTP transport mixin (:class:`IndexingHttpLister
  <swh.lister.core.indexing_lister.IndexingHttpLister>` just combines
  :class:`ListerHttpTransport
  <swh.lister.core.lister_transports.ListerHttpTransport>` and
  :class:`IndexingLister
  <swh.lister.core.indexing_lister.IndexingLister>`) to get most of the
  network request functionality for free.
* Both also define ``MODEL`` and ``PATH_TEMPLATE`` variables. It should be
clear to developers that ``PATH_TEMPLATE``, when combined with the base
service URL (e.g., ``https://some_service.com``) and passed a value (the
'foo' index described earlier) results in a complete identifier for making
API requests to these services. It is required by our HTTP module.
* Both services respond using JSON, so both implementations of
``transport_response_simplified`` are similar and quite short.
We can also see that there are a few differences:
* GitHub sends the next URL as part of the response header, while BitBucket
sends it in the response body.
* GitHub differentiates API versions with a request header (our HTTP
transport mix-in will automatically use any headers provided by an
optional request_headers method that we implement here), while
BitBucket has it as part of their base service URL. BitBucket uses
the IETF standard HTTP 429 response code for their rate limit
notifications (the HTTP transport mix-in automatically handles
that), while GitHub uses their own custom response headers that need
special treatment.
* But look at them! 58 lines of Python code, combined, to absorb all
repositories from two of the largest and most influential source code hosting
services.
Ok, so what is going on behind the scenes?
To trace the operation of the code, let's start with a sample instantiation and
progress from there to see which methods get called when. What follows will be
a series of extremely reductionist pseudocode methods. This is not what the
code actually looks like (it's not even real code), but it does have the same
basic flow. Bear with me while I try to lay out lister operation in a
quasi-linear way…::

    # main task
    ghl = GitHubLister(lister_name='github.com',
                       api_baseurl='https://github.com')
    ghl.run()

⇓ (IndexingLister.run)::

    # IndexingLister.run
    identifier = None
    do
        response, repos = ListerBase.ingest_data(identifier)
        identifier = GitHubLister.get_next_target_from_response(response)
    while(identifier)

⇓ (ListerBase.ingest_data)::

    # ListerBase.ingest_data
    response = ListerBase.safely_issue_request(identifier)
    repos = GitHubLister.transport_response_simplified(response)
    injected = ListerBase.inject_repo_data_into_db(repos)
    return response, injected

⇓ (ListerBase.safely_issue_request)::

    # ListerBase.safely_issue_request
    repeat:
        resp = ListerHttpTransport.transport_request(identifier)
        retry, delay = ListerHttpTransport.transport_quota_check(resp)
        if retry:
            sleep(delay)
    until((not retry) or too_many_retries)
    return resp

⇓ (ListerHttpTransport.transport_request)::

    # ListerHttpTransport.transport_request
    path = ListerBase.api_baseurl
           + ListerHttpTransport.PATH_TEMPLATE % identifier
    headers = ListerHttpTransport.request_headers()
    return http.get(path, headers)

(Oh look, there's our ``PATH_TEMPLATE``)

⇓ (ListerHttpTransport.request_headers)::

    # ListerHttpTransport.request_headers
    override → GitHubLister.request_headers

↑↑ (ListerBase.safely_issue_request)

⇓ (ListerHttpTransport.transport_quota_check)::

    # ListerHttpTransport.transport_quota_check
    override → GitHubLister.transport_quota_check

And then we're done. From start to finish, I hope this helps you understand how
the few customized pieces fit into the new shared plumbing.
Now you can go and write up a lister for a code hosting site we don't have yet!

docs/tutorial.rst
@@ -3,363 +3,363 @@
Tutorial: list the content of your favorite forge in just a few steps
=====================================================================
Overview
--------
The three major phases of work in Software Heritage's preservation process, on the
technical side, are *listing software sources*, *scheduling updates* and *loading the
software artifacts into the archive*.
A previous effort in 2017 consisted in designing the framework to make writing a
lister a straightforward "fill in the blanks" process, based on experience gained
from the diversity found in the listed services. This is the second iteration on
the lister framework design, comprising a library and an API which is easier to
work with and less "magic" (read: implicit). This new design is part of a larger
effort in redesigning the scheduling system for the recurring tasks updating the
content of the archive.
.. _fundamentals:
Fundamentals
------------
Fundamentally, a basic lister must follow these steps:
1. Issue a network request for a service endpoint.
2. Convert the response data into a model object.
3. Send the model object to the scheduler.
Steps 1 and 3 are generic problems that are often already solved by helpers or in
other listers. That leaves us mainly to implement step 2, which is simple when the
remote service provides an API.
.. _prerequisites:
Prerequisites
-------------
Skills:
* object-oriented Python
* requesting remote services through HTTP
* scraping if no API is offered
Analysis of the target service. Prepare the following elements to write the lister:
* instance names and URLs
* requesting scheme: base URL, path, query_string, POST data, headers
* authentication types and which one to support, if any
* rate-limiting: HTTP codes and headers used
* data format: JSON/XML/HTML/...?
* mapping between remote data and needed data (ListedOrigin model, internal state)
We will now walk through the steps to build a new lister.
Please use this template to start with: :download:`new_lister_template.py`
.. _lister-declaration:
Lister declaration
------------------
In order to write a lister, two basic elements are required. These are the
:py:class:`Lister` base class and the :py:class:`ListedOrigin` scheduler model class.
Optionally, for listers that need to keep a state and support incremental listing, an
additional object :py:class:`ListerState` will come into play.
Each lister must subclass :py:class:`Lister <swh.lister.pattern.Lister>` either directly
or through a subclass such as :py:class:`StatelessLister
<swh.lister.pattern.StatelessLister>` for stateless ones.
We extensively type-annotate our listers, as we do any new code. This makes it
prominent that those lister classes are generic and take the following parameters:
* :py:class:`Lister`: the lister state type, the page type
* :py:class:`StatelessLister`: only the page type
You can start by declaring a stateless lister and leave the implementation of state
for later if the listing needs it. We will see how in :ref:`handling-lister-state`.
Both the lister state type and the page type are user-defined types. However, while the
page type may only exist as a type annotation, the state type for a stateful lister must
be associated with a concrete object. The state type is commonly defined as a dataclass
whereas the page type is often a mere annotation, potentially given a nice alias.
Example lister declaration::
    NewForgePage = List[Dict[str, Any]]

    @dataclass
    class NewForgeListerState:
        ...

    class NewForgeLister(Lister[NewForgeListerState, NewForgePage]):
        LISTER_NAME = "My"
        ...
The new lister must declare a name through the :py:attr:`LISTER_NAME` class attribute.
.. _lister-construction:
Lister construction
-------------------
The lister constructor is only required to ask for a :py:class:`SchedulerInterface`
object to pass to the base class. But that does not mean it is all that's needed
for it to be useful. A lister needs information on which remote service to talk
to: it needs a URL.
Some services are centralized and offered by a single organization. Think of GitHub.
Others are offered by many people across the Internet, each using a different
hosting, each providing specific data. Think of the many GitLab instances. We need
a name to identify each instance, and even if there is only one, we need its URL
to access it concretely.
Now, you may think of any strategy to infer the information or hardcode it, but the
base class needs a URL and an instance name. In any case, for a multi-instance
service, you had better be explicit and require the URL as a constructor argument.
We recommend that the URL be some form of base URL, to be concatenated with any
variable part appearing either because there exist multiple instances or because
the URL needs recomputation in the listing process.
If you need any credentials to access a remote service, and want to do so in our
polite but persistent fashion (remember that we want fresh information), you are
encouraged to provide support for authenticated access. The base class supports
handling credentials as a set of identifier/secret pairs. It knows how to load,
from a secrets store, the right ones for the current ("lister name", "instance
name") setting, if none were originally provided through the task parameters. You
can ask for other types of access tokens in a separate parameter, but then you
lose this advantage.
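
For reference, here is a sketch of the shape of such a credentials mapping; the
nesting follows the ("lister name", "instance name") setting described above, and
all concrete keys and values here are illustrative::

    credentials = {
        "newforge": {                    # LISTER_NAME
            "newforge.example.org": [    # instance name
                {"username": "bot", "password": "s3cret"},
            ],
        },
    }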
Example of a typical lister constructor::
    def __init__(
        self,
        scheduler: SchedulerInterface,
        url: str,
        instance: str,
        credentials: CredentialsType = None,
    ):
        super().__init__(
            scheduler=scheduler, url=url, instance=instance, credentials=credentials,
        )
        ...
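
If you prefer inferring the instance name over requiring it, one possible strategy
(a sketch, not part of the lister API) is to derive it from the URL host::

    from urllib.parse import urlparse

    def instance_from_url(url: str) -> str:
        # e.g. "https://newforge.example.org/api/v1/" -> "newforge.example.org"
        return urlparse(url).netloc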
.. _core-lister-functionality:
Core lister functionality
-------------------------
For the lister to contribute data to the archive, you now have to write the logic
to fetch data from the remote service, and format it in the canonical form the
scheduler expects, as outlined in :ref:`fundamentals`. To this purpose, the two
methods to implement are::
    def get_pages(self) -> Iterator[NewForgePage]:
        ...

    def get_origins_from_page(self, page: NewForgePage) -> Iterator[ListedOrigin]:
        ...
Those two core functions are called by the principal lister method,
:py:meth:`Lister.run`, found in the base class.
:py:meth:`get_pages` is the guts of the lister. It takes no arguments and must
produce data pages. An iterator is fine here, as the :py:meth:`Lister.run` method
only means to iterate over it in a single pass. This method gets its input from a
network request to a remote service's endpoint to retrieve the data we long for.
Depending on the service, getting data that is adequately structured for our
purpose can be tricky. Here you may have to show off your data scraping skills, or
just consume a well-designed API. Those aspects are discussed more specifically in
the section :ref:`handling-specific-topics`.
In any case, we want the data we return to be usefully filtered and structured. The
easiest way to create an iterator is to use the `yield` keyword. Yield each data page
you have structured in accordance with the page type you have declared. The page type
exists only for static type checking of data passed from :py:meth:`get_pages` to
:py:meth:`get_origins_from_page`; you can choose whatever fits the bill.
:py:meth:`get_origins_from_page` is simpler. For each individual software origin you
have received in the page, you convert and yield a :py:class:`ListedOrigin` model
object. This datatype has the following mandatory fields:
* lister id: you generally fill this with the value of :py:attr:`self.lister_obj.id`
* visit type: the type of software distribution format the service provides. For use by
a corresponding loader. It is an identifier, so you have to either use an existing
value or craft a new one if you get off the beaten track and tackle a new software
source. But then you will have to discuss the name with the core developers.
Example: Phabricator is a forge that can handle Git or SVN repositories. The visit
type would be "git" when listing such a repo that provides a Git URL that we can load.
* origin URL: a URL that, combined with the visit type, will serve as the input of
a loader.
This datatype can be further detailed with the optional fields:
* last update date: freshness information on this origin, which is useful to the
scheduler for optimizing its scheduling decisions. Fill it if provided by the service,
at no substantial additional runtime cost, e.g. in the same request.
* extra loader arguments: extra parameters to be passed to the loader for it to be
able to load the origin. It is needed for example when additional context is needed
along with the URL to effectively load from the origin.
See the definition of ListedOrigin_.
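
Putting the mandatory and optional fields together, here is a sketch of yielding a
:py:class:`ListedOrigin` from a hypothetical JSON record; the ``repo`` keys and
the use of the ``iso8601`` parser are illustrative::

    import iso8601

    def get_origins_from_page(self, page: NewForgePage) -> Iterator[ListedOrigin]:
        assert self.lister_obj.id is not None
        for repo in page:
            yield ListedOrigin(
                lister_id=self.lister_obj.id,
                visit_type="git",                 # assuming Git repositories
                url=repo["clone_url"],            # hypothetical field name
                last_update=iso8601.parse_date(repo["updated_at"]),
            )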
Now that we have shown how those two methods operate, let's put it together by
showing how they fit in the principal :py:meth:`Lister.run` method::
    def run(self) -> ListerStats:

        full_stats = ListerStats()

        try:
            for page in self.get_pages():
                full_stats.pages += 1
                origins = self.get_origins_from_page(page)
                full_stats.origins += self.send_origins(origins)
                self.commit_page(page)
        finally:
            self.finalize()
            if self.updated:
                self.set_state_in_scheduler()

        return full_stats
:py:meth:`Lister.send_origins` is the method that sends listed origins to the scheduler.
The :py:class:`ListerStats` datastructure, defined along the base lister class, is
used to count the number of listed pages and origins in a single lister run. It is
useful both to the scheduler, which automatically collects this information, and
for testing the lister.
You see that the bulk of a lister run consists in streaming data gathered from the
remote service to the scheduler. And this is done under a ``try...finally`` construct to
have the lister state reliably recorded in case of unhandled error. We will explain the
role of the remaining methods and attributes appearing here in the next section as it is
related to the lister state.
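
For local experimentation, you can instantiate your lister and call
:py:meth:`run` directly; a sketch with illustrative arguments::

    lister = NewForgeLister(
        scheduler=scheduler,  # a SchedulerInterface instance
        url="https://newforge.example.org/api/v1/",
        instance="newforge.example.org",
    )
    stats = lister.run()
    logger.info("listed %d origins over %d pages", stats.origins, stats.pages)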
.. _ListedOrigin: https://archive.softwareheritage.org/browse/swh:1:rev:03460207a17d82635ef5a6f12358392143eb9eef/?origin_url=https://forge.softwareheritage.org/source/swh-scheduler.git&path=swh/scheduler/model.py&revision=03460207a17d82635ef5a6f12358392143eb9eef#L134-L177
.. _handling-lister-state:
Handling lister state
---------------------
With what we have covered until now you can write a stateless lister.
Unfortunately, some services provide too much data to efficiently deal with in a
one-shot fashion. Listing a given software source can take several hours or days
to process. A lister can also produce valid output but fail on an unexpected
condition, and would have to start over. As we want to be able to resume the
listing process from a given element, provided by the remote service and
guaranteed to be ordered, such as a date or a numeric identifier, we need to deal
with state.
The remaining part of the lister API is reserved for dealing with lister state.
If the service to list has no pagination, then the data set to handle is small enough to
not require keeping lister state. In the opposite case, you will have to determine which
piece of information should be recorded in the lister state. As said earlier, we
recommend declaring a dataclass for the lister state::
    @dataclass
    class NewForgeListerState:
        current: str = ""

    class NewForgeLister(Lister[NewForgeListerState, NewForgePage]):
        ...
A pair of methods, :py:meth:`state_from_dict` and :py:meth:`state_to_dict` are used to
respectively import lister state from the scheduler and export lister state to the
scheduler. Some fields may need help to be serialized to the scheduler, such as dates,
so this needs to be handled there.
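
As an illustration, assuming a hypothetical ``last_seen`` datetime field in the
state, the conversions could look like this sketch (the ``iso8601`` parser is
illustrative)::

    def state_from_dict(self, d: Dict[str, Any]) -> NewForgeListerState:
        # Datetimes come back from the scheduler as ISO-8601 strings
        last_seen = d.get("last_seen")
        if last_seen is not None:
            d["last_seen"] = iso8601.parse_date(last_seen)
        return NewForgeListerState(**d)

    def state_to_dict(self, state: NewForgeListerState) -> Dict[str, Any]:
        d = asdict(state)
        if d["last_seen"] is not None:
            d["last_seen"] = d["last_seen"].isoformat()
        return d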
Where is the state used? Taking the general case of a paginating service, the lister
state is used at the beginning of the :py:meth:`get_pages` method to initialize the
variables associated with the last listing progress. That way we can start from an
arbitrary element, or just the first one if there is no last lister state.
The :py:meth:`commit_page` method is called on successful page processing, after
the new origins are sent to the scheduler. Here you should mainly update the
lister state by taking into account the new page processed, e.g. advance a date or
serial field.
Finally, upon either completion or error, :py:meth:`finalize` is called. There you
must set the :py:attr:`updated` attribute to True if you were successful in
advancing the listing process. To do this you will commonly retrieve the latest
saved lister state from the scheduler and compare it with your current lister
state. If the lister state was updated, the current lister state will ultimately
be recorded in the scheduler.
We have now seen the stateful lister API. Note that some listers may implement
more flexibility in the use of lister state. Some allow an `incremental` parameter
that governs whether we will do a stateful listing or not. It is up to you to
support additional functionality if it seems relevant.
.. _handling-specific-topics:
Handling specific topics
------------------------
Here is a quick coverage of common topics left out from lister construction and
:py:meth:`get_pages` descriptions.
Sessions
^^^^^^^^
When requesting a web service repeatedly, most parameters, including headers, do
not change and can be set up once initially. We recommend setting up, e.g., an
HTTP session as an instance attribute, so that further requesting code can focus
on what really changes.
Some ubiquitous HTTP headers include "Accept" to set to the service response format and
"User-Agent" for which we provide a recommended value :py:const:`USER_AGENT` to be
imported from :py:mod:`swh.lister`. Authentication is also commonly provided through
headers, so you can also set it up in the session.
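
As done in the template, the session can be set up once in the constructor::

    self.session = requests.Session()
    self.session.headers.update(
        {"Accept": "application/json", "User-Agent": USER_AGENT}
    )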
Transport error handling
^^^^^^^^^^^^^^^^^^^^^^^^
We generally recommend logging every unhandleable error with the response content and
then immediately stop the listing by doing an equivalent of
:py:meth:`Response.raise_for_status` from the `requests` library. As for rate-limiting
errors, we have a strategy of using a flexible decorator to handle the retrying for us.
It is based on the `tenacity` library and accessible as :py:func:`throttling_retry` from
:py:mod:`swh.lister.utils`.
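
The template's ``page_request`` method combines both recommendations: retry on
rate-limiting through the decorator, log unexpected responses, and fail on
blocking errors::

    @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
    def page_request(self, url, params) -> requests.Response:
        logger.debug("Fetching URL %s with params %s", url, params)
        response = self.session.get(url, params=params)
        if response.status_code != 200:
            logger.warning(
                "Unexpected HTTP status code %s on %s: %s",
                response.status_code, response.url, response.content,
            )
        # Fail the listing on blocking errors
        response.raise_for_status()
        return response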
Pagination
^^^^^^^^^^
This one is a moving target. You have to understand how the pagination mechanics of the
particular service works. Some guidelines though. The identifier may be minimal (an id
to pass as query parameter), compound (a set of such parameters) or complete (a whole
URL). If the service provides the next URL, use it. The piece of information may be
found either in the response body, or in a header. Once identified, you still have to
implement the logic of requesting and extracting it in a loop and quitting the loop when
there is no more data to fetch.
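
Here is a sketch of such a loop, assuming a service that returns the URL of the
next page under a hypothetical ``next`` key of the response body::

    def get_pages(self) -> Iterator[NewForgePage]:
        url = urljoin(self.url, "origins/list")  # hypothetical endpoint path
        while url is not None:
            body = self.page_request(url, {}).json()
            yield body["results"]  # hypothetical key holding the page data
            # Quit the loop when the service provides no further page
            url = body.get("next")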
Page results
^^^^^^^^^^^^
First, when retrieving page results, which involves some protocols and parsing
logic, please make sure that any deviation from what was expected results in an
informational error. You also have to simplify the results, both by filtering
request parameters if the service supports it, and by extracting from the response
only the information needed into a structured page. This all makes for easier
debugging.
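
For instance, a hypothetical helper narrowing each record down to the needed
information could look like::

    def _simplify(self, record: Dict[str, Any]) -> Dict[str, Any]:
        # Keep only what get_origins_from_page needs; keys are illustrative
        return {
            "url": record["clone_url"],
            "last_update": record["updated_at"],
        }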
Testing your lister
-------------------
When developing a new lister, it's important to test. For this, add the tests
(check `swh/lister/*/tests/`) and register the celery tasks in the main
conftest.py (`swh/lister/core/tests/conftest.py`).
Another important step is to actually run it within the docker-dev
(:ref:`run-lister-tutorial`).
More about listers
------------------
See currently implemented listers as examples (GitHub_, Bitbucket_, CGit_, GitLab_).
Old (2017) lister tutorial: :ref:`lister-tutorial-2017`
.. _GitHub: https://forge.softwareheritage.org/source/swh-lister/browse/master/swh/lister/github/lister.py
.. _Bitbucket: https://forge.softwareheritage.org/source/swh-lister/browse/master/swh/lister/bitbucket/lister.py
.. _CGit: https://forge.softwareheritage.org/source/swh-lister/browse/master/swh/lister/cgit/lister.py
.. _GitLab: https://forge.softwareheritage.org/source/swh-lister/browse/master/swh/lister/gitlab/lister.py