From 5f567b3c3425f7840f0639a766dabbdcb7688fb3 Mon Sep 17 00:00:00 2001
From: Nicolas Dandrimont
Date: Wed, 1 Dec 2021 16:00:14 +0100
Subject: [PATCH] Deduplicate origins in the GitHub lister

In some circumstances, GitHub will return two separate repos with the same
html_url in the same page. This makes the lister fail with a cardinality
error.
---
 swh/lister/github/lister.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/swh/lister/github/lister.py b/swh/lister/github/lister.py
index bbb1f63..f4246a2 100644
--- a/swh/lister/github/lister.py
+++ b/swh/lister/github/lister.py
@@ -8,7 +8,7 @@ import datetime
 import logging
 import random
 import time
-from typing import Any, Dict, Iterator, List, Optional
+from typing import Any, Dict, Iterator, List, Optional, Set
 from urllib.parse import parse_qs, urlparse
 
 import iso8601
@@ -305,11 +305,17 @@ class GitHubLister(Lister[GitHubListerState, List[Dict[str, Any]]]):
         """
         assert self.lister_obj.id is not None
 
+        seen_in_page: Set[str] = set()
+
         for repo in page:
             if not repo:
                 # null repositories in listings happen sometimes...
                 continue
 
+            if repo["html_url"] in seen_in_page:
+                continue
+            seen_in_page.add(repo["html_url"])
+
             pushed_at_str = repo.get("pushed_at")
             pushed_at: Optional[datetime.datetime] = None
             if pushed_at_str: