github: start moving to a proper arbitrated module

Dima Gerasimov 2020-06-01 22:10:29 +01:00
parent 67cf4d0c04
commit d7aff1be3f
5 changed files with 27 additions and 13 deletions

View file

@@ -116,6 +116,7 @@ from ..kython.klogging import setup_logger, LazyLogger
 Paths = Union[Sequence[PathIsh], PathIsh]

 # TODO support '' for emtpy path
+DEFAULT_GLOB = '*'
 def get_files(pp: Paths, glob: str=DEFAULT_GLOB, sort: bool=True) -> Tuple[Path, ...]:
     """

View file

@@ -1,7 +1,7 @@
 """
 Github events and their metadata: comments/issues/pull requests
 """
-from typing import Dict, Any, NamedTuple, Tuple, Optional, Iterator, TypeVar, Set
+from typing import Dict, Any, NamedTuple, Tuple, Optional, Iterable, TypeVar, Set
 from datetime import datetime
 import json
@@ -10,7 +10,7 @@ import pytz
 from ..kython.klogging import LazyLogger
 from ..kython.kompress import CPath
 from ..common import get_files, mcachew
-from ..error import Res
+from ..core.error import Res, sort_res_by

 from my.config import github as config
 import my.config.repos.ghexport.dal as ghexport
@@ -197,7 +197,7 @@ def _parse_event(d: Dict) -> Event:
     )

-def iter_gdpr_events() -> Iterator[Res[Event]]:
+def iter_gdpr_events() -> Iterable[Res[Event]]:
     """
     Parses events from GDPR export (https://github.com/settings/admin)
     """
@@ -240,12 +240,12 @@ def iter_gdpr_events() -> Iterator[Res[Event]]:
 # TODO hmm. not good, need to be lazier?...
 @mcachew(config.cache_dir, hashf=lambda dal: dal.sources)
-def iter_backup_events(dal=_dal()) -> Iterator[Event]:
+def iter_backup_events(dal=_dal()) -> Iterable[Event]:
     for d in dal.events():
         yield _parse_event(d)


-def iter_events() -> Iterator[Res[Event]]:
+def events() -> Iterable[Res[Event]]:
     from itertools import chain
     emitted: Set[Tuple[datetime, str]] = set()
     for e in chain(iter_gdpr_events(), iter_backup_events()):
@@ -260,13 +260,16 @@ def iter_events() -> Iterator[Res[Event]]:
             logger.debug('ignoring %s: %s', key, e)
             continue
         yield e
-        emitted.add(key)
+        emitted.add(key) # todo more_itertools


-def get_events():
-    return sorted(iter_events(), key=lambda e: e.dt)
+def get_events() -> Iterable[Res[Event]]:
+    return sort_res_by(events(), key=lambda e: e.dt)


 # TODO mm. ok, not much point in deserializing as github.Event as it's basically a fancy dict wrapper?
 # from github.Event import Event as GEvent # type: ignore
 # # see https://github.com/PyGithub/PyGithub/blob/master/github/GithubObject.py::GithubObject.__init__
 # e = GEvent(None, None, raw_event, True)
+
+# todo deprecate
+iter_events = events
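
get_events can no longer use plain sorted(): the stream now mixes Events with Exceptions, and exceptions have no .dt to key on, hence sort_res_by. Below is a simplified reimplementation of the idea, not the actual my.core.error code (which is smarter about where the errors end up):

```python
from typing import Callable, Iterable, List, TypeVar, Union

T = TypeVar('T')

def sort_res_by_sketch(items: Iterable[Union[T, Exception]],
                       key: Callable[[T], object]) -> List[Union[T, Exception]]:
    # sort the successful values by key; exceptions can't be keyed,
    # so tack them on at the end
    good = [x for x in items if not isinstance(x, Exception)]
    bad = [x for x in items if isinstance(x, Exception)]
    return sorted(good, key=key) + bad  # type: ignore[type-var]
```

The `iter_events = events` alias at the end is the matching deprecation shim, so existing callers keep working while the new name settles in.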

View file

@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 from datetime import datetime
-from typing import NamedTuple, List
+from typing import NamedTuple, List, Iterable

 from ..google.takeout.html import read_html
 from ..google.takeout.paths import get_last_takeout
@@ -16,7 +16,7 @@ class Watched(NamedTuple):
         return f'{self.url}-{self.when.isoformat()}'


-def get_watched():
+def watched() -> Iterable[Watched]:
     # TODO need to use a glob? to make up for old takouts that didn't start with Takeout/
     path = 'Takeout/My Activity/YouTube/MyActivity.html' # looks like this one doesn't have retention? so enough to use the last
     # TODO YouTube/history/watch-history.html, also YouTube/history/watch-history.json
@@ -30,6 +30,10 @@ def get_watched():
     return list(sorted(watches, key=lambda e: e.when))

+# todo deprecate
+get_watched = watched
+

 def main():
     # TODO shit. a LOT of watches...
     for w in get_watched():
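
The youtube provider gets the same rename treatment: watched() is the real function and get_watched survives as a deprecation alias. A usage sketch (assuming a configured takeout; per the eid property above, Watched carries at least url and when):

```python
from my.media.youtube import watched

# print the ten most recent watches
for w in sorted(watched(), key=lambda w: w.when)[-10:]:
    print(w.when.isoformat(), w.url)
```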

View file

@@ -1,5 +1,12 @@
 #!/usr/bin/env python3
-from my.coding.github import get_events
+from more_itertools import ilen
+
+from my.coding.github import get_events, iter_gdpr_events
+
+
+def test_gdpr():
+    assert ilen(iter_gdpr_events()) > 100
+

 def test():
     events = get_events()
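
more_itertools.ilen counts an iterable by consuming it, which suits the lazy Iterable-returning API without materializing a list just to measure it. For illustration:

```python
from more_itertools import ilen

# counts lazily; no intermediate list is built
assert ilen(x for x in range(1000) if x % 7 == 0) == 143
```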

View file

@@ -1,5 +1,4 @@
-# TODO move elsewhere?
 # these tests would only make sense with some existing data? although some of them would work for everyone..
 # not sure what's a good way of handling this..
@@ -7,7 +6,7 @@ from my.media.youtube import get_watched, Watched

 def test():
-    watched = get_watched()
+    watched = list(get_watched())
     assert len(watched) > 1000

     from datetime import datetime
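
Wrapping get_watched() in list() future-proofs the test: len() needs a sized container, so it would break the moment the provider becomes a true generator. A tiny illustration:

```python
def gen():
    yield from range(3)

g = gen()
# len(g) would raise TypeError: object of type 'generator' has no len()
assert len(list(g)) == 3
```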