my.github: some work in progress on generating consistent ids
sadly it seems that there are several issues: - gdpr has less detailed data so it's hard to generate a proper ID at times - sometimes there is a small (1s?) discrepancy between created_at between the same event in GDPR and API - some API events can have duplicate payload, but different id, which violates uniqueness
This commit is contained in:
parent
386234970b
commit
5ef2775265
3 changed files with 38 additions and 10 deletions
|
@ -1,6 +1,9 @@
|
||||||
"""
|
"""
|
||||||
Github events and their metadata: comments/issues/pull requests
|
Github events and their metadata: comments/issues/pull requests
|
||||||
"""
|
"""
|
||||||
|
from ..core import __NOT_HPI_MODULE__
|
||||||
|
|
||||||
|
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import Optional, NamedTuple, Iterable, Set, Tuple
|
from typing import Optional, NamedTuple, Iterable, Set, Tuple
|
||||||
|
|
||||||
|
@ -48,4 +51,12 @@ def parse_dt(s: str) -> datetime:
|
||||||
return pytz.utc.localize(datetime.strptime(s, '%Y-%m-%dT%H:%M:%SZ'))
|
return pytz.utc.localize(datetime.strptime(s, '%Y-%m-%dT%H:%M:%SZ'))
|
||||||
|
|
||||||
|
|
||||||
from ..core import __NOT_HPI_MODULE__
|
# experimental way of supporting event ids... not sure
|
||||||
|
class EventIds:
    """
    Experimental helpers for generating consistent event ids.

    Ids are derived from stable event attributes (timestamp, repo name, PR url)
    rather than API-assigned ids, so the same logical event gets the same id
    whether it came from the GDPR export or the API
    (API-assigned ids are absent from GDPR data and are sometimes duplicated).
    """

    @staticmethod
    def repo_created(*, dts: str, name: str, ref_type: str, ref: Optional[str]) -> str:
        # dts is the raw 'created_at' string; ref may be None when the source
        # (e.g. GDPR data) doesn't expose a ref for the creation event
        return f'{dts}_repocreated_{name}_{ref_type}_{ref}'

    @staticmethod
    def pr(*, dts: str, action: str, url: str) -> str:
        # action: e.g. 'opened'; url disambiguates PRs across repositories
        return f'{dts}_pr{action}_{url}'
|
|
|
@ -8,7 +8,7 @@ from typing import Iterable, Dict, Any
|
||||||
from ..core.error import Res
|
from ..core.error import Res
|
||||||
from ..core import get_files
|
from ..core import get_files
|
||||||
|
|
||||||
from .common import Event, parse_dt
|
from .common import Event, parse_dt, EventIds
|
||||||
|
|
||||||
# TODO later, use a separate user config? (github_gdpr)
|
# TODO later, use a separate user config? (github_gdpr)
|
||||||
from my.config import github as user_config
|
from my.config import github as user_config
|
||||||
|
@ -87,11 +87,14 @@ def _parse_common(d: Dict) -> Dict:
|
||||||
def _parse_repository(d: Dict) -> Event:
    """
    Convert a GDPR 'repository' record into a repo-creation Event.

    The event id is generated via EventIds so it stays consistent with the
    id generated for the corresponding API event.
    """
    pref = 'https://github.com/'
    url = d['url']
    dts = d['created_at']
    rt = d['type']
    # repo name is the url path relative to github.com
    assert url.startswith(pref); name = url[len(pref):]
    # ref is not available in GDPR data, so pass None
    # NOTE(review): this only matches the API-side id when the API event also has ref=None
    eid = EventIds.repo_created(dts=dts, name=name, ref_type=rt, ref=None)
    return Event(  # type: ignore[misc]
        **_parse_common(d),
        summary='created ' + name,
        eid=eid,
    )
|
||||||
|
|
||||||
|
|
||||||
|
@ -119,6 +122,7 @@ def _parse_issue(d: Dict) -> Event:
|
||||||
|
|
||||||
|
|
||||||
def _parse_pull_request(d: Dict) -> Event:
|
def _parse_pull_request(d: Dict) -> Event:
|
||||||
|
dts = d['created_at']
|
||||||
url = d['url']
|
url = d['url']
|
||||||
title = d['title']
|
title = d['title']
|
||||||
is_bot = "[bot]" in d["user"]
|
is_bot = "[bot]" in d["user"]
|
||||||
|
@ -127,7 +131,7 @@ def _parse_pull_request(d: Dict) -> Event:
|
||||||
# TODO distinguish incoming/outgoing?
|
# TODO distinguish incoming/outgoing?
|
||||||
# TODO action? opened/closed??
|
# TODO action? opened/closed??
|
||||||
summary=f'opened PR {title}',
|
summary=f'opened PR {title}',
|
||||||
eid='pull_request_' + url,
|
eid=EventIds.pr(dts=dts, action='opened', url=url),
|
||||||
is_bot=is_bot,
|
is_bot=is_bot,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -146,6 +150,7 @@ def _parse_project(d: Dict) -> Event:
|
||||||
is_bot=is_bot,
|
is_bot=is_bot,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def _parse_release(d: Dict) -> Event:
|
def _parse_release(d: Dict) -> Event:
|
||||||
tag = d['tag_name']
|
tag = d['tag_name']
|
||||||
return Event( # type: ignore[misc]
|
return Event( # type: ignore[misc]
|
||||||
|
|
|
@ -46,7 +46,7 @@ from typing import Tuple, Dict, Sequence, Optional
|
||||||
from my.core import get_files, Path, LazyLogger
|
from my.core import get_files, Path, LazyLogger
|
||||||
from my.core.common import mcachew
|
from my.core.common import mcachew
|
||||||
|
|
||||||
from .common import Event, parse_dt, Results
|
from .common import Event, parse_dt, Results, EventIds
|
||||||
|
|
||||||
|
|
||||||
logger = LazyLogger(__name__)
|
logger = LazyLogger(__name__)
|
||||||
|
@ -61,9 +61,16 @@ def _dal() -> dal.DAL:
|
||||||
return dal.DAL(sources)
|
return dal.DAL(sources)
|
||||||
|
|
||||||
|
|
||||||
# todo cachew: hmm. not good, need to be lazier?...
|
|
||||||
@mcachew(depends_on=lambda: inputs())
def events() -> Results:
    """
    Yield all github events from the API data, cached via cachew.

    A uniqueness check on event ids is intentionally disabled for now:
    sometimes API events are repeated with exactly the same payload but a
    different id, which violates uniqueness. The import and key function are
    kept so the check can be re-enabled once ids are deduplicated properly.
    """
    from my.core.common import ensure_unique
    # exceptions are given a unique sentinel key so they never collide
    key = lambda e: object() if isinstance(e, Exception) else e.eid
    # yield from ensure_unique(_events(), key=key)
    yield from _events()
|
||||||
|
|
||||||
|
|
||||||
|
def _events() -> Results:
|
||||||
dal = _dal()
|
dal = _dal()
|
||||||
for d in dal.events():
|
for d in dal.events():
|
||||||
if isinstance(d, Exception):
|
if isinstance(d, Exception):
|
||||||
|
@ -93,6 +100,7 @@ EventId = str
|
||||||
Body = str
|
Body = str
|
||||||
def _get_summary(e) -> Tuple[str, Optional[Link], Optional[EventId], Optional[Body]]:
|
def _get_summary(e) -> Tuple[str, Optional[Link], Optional[EventId], Optional[Body]]:
|
||||||
# TODO would be nice to give access to raw event within timeline
|
# TODO would be nice to give access to raw event within timeline
|
||||||
|
dts = e['created_at']
|
||||||
eid = e['id']
|
eid = e['id']
|
||||||
tp = e['type']
|
tp = e['type']
|
||||||
pl = e['payload']
|
pl = e['payload']
|
||||||
|
@ -119,10 +127,13 @@ def _get_summary(e) -> Tuple[str, Optional[Link], Optional[EventId], Optional[Bo
|
||||||
what = mapping[tp]
|
what = mapping[tp]
|
||||||
rt = pl['ref_type']
|
rt = pl['ref_type']
|
||||||
ref = pl['ref']
|
ref = pl['ref']
|
||||||
# TODO link to branch? only contains weird API link though
|
if what == 'created':
|
||||||
# TODO hmm. include timestamp instead?
|
# FIXME should handle deletion?...
|
||||||
|
eid = EventIds.repo_created(dts=dts, name=rname, ref_type=rt, ref=ref)
|
||||||
|
mref = '' if ref is None else ' ' + ref
|
||||||
|
# todo link to branch? only contains weird API link though
|
||||||
# TODO combine automatically instead
|
# TODO combine automatically instead
|
||||||
return f"{rname}: {what} {rt} {ref}", None, f'{rname}_{what}_{rt}_{ref}_{eid}', None
|
return f"{rname}: {what} {rt}{mref}", None, eid, None
|
||||||
elif tp == 'PullRequestEvent':
|
elif tp == 'PullRequestEvent':
|
||||||
pr = pl['pull_request']
|
pr = pl['pull_request']
|
||||||
title = pr['title']
|
title = pr['title']
|
||||||
|
@ -130,7 +141,8 @@ def _get_summary(e) -> Tuple[str, Optional[Link], Optional[EventId], Optional[Bo
|
||||||
link = pr['html_url']
|
link = pr['html_url']
|
||||||
body = pr['body']
|
body = pr['body']
|
||||||
action = pl['action']
|
action = pl['action']
|
||||||
return f"{rname}: {action} PR: {title}", link, f'{rname}_{action}_pr_{link}', body
|
eid = EventIds.pr(dts=dts, action=action, url=link)
|
||||||
|
return f"{rname}: {action} PR: {title}", link, eid, body
|
||||||
elif tp == 'PullRequestReviewEvent':
|
elif tp == 'PullRequestReviewEvent':
|
||||||
pr = pl['pull_request']
|
pr = pl['pull_request']
|
||||||
title = pr['title']
|
title = pr['title']
|
||||||
|
|
Loading…
Add table
Reference in a new issue