github: start moving to a proper arbitrated module

This commit is contained in:
Dima Gerasimov 2020-06-01 22:10:29 +01:00
parent 67cf4d0c04
commit d7aff1be3f
5 changed files with 27 additions and 13 deletions

View file

@ -116,6 +116,7 @@ from ..kython.klogging import setup_logger, LazyLogger
Paths = Union[Sequence[PathIsh], PathIsh] Paths = Union[Sequence[PathIsh], PathIsh]
# TODO support '' for empty path
DEFAULT_GLOB = '*' DEFAULT_GLOB = '*'
def get_files(pp: Paths, glob: str=DEFAULT_GLOB, sort: bool=True) -> Tuple[Path, ...]: def get_files(pp: Paths, glob: str=DEFAULT_GLOB, sort: bool=True) -> Tuple[Path, ...]:
""" """

View file

@ -1,7 +1,7 @@
""" """
Github events and their metadata: comments/issues/pull requests Github events and their metadata: comments/issues/pull requests
""" """
from typing import Dict, Any, NamedTuple, Tuple, Optional, Iterator, TypeVar, Set from typing import Dict, Any, NamedTuple, Tuple, Optional, Iterable, TypeVar, Set
from datetime import datetime from datetime import datetime
import json import json
@ -10,7 +10,7 @@ import pytz
from ..kython.klogging import LazyLogger from ..kython.klogging import LazyLogger
from ..kython.kompress import CPath from ..kython.kompress import CPath
from ..common import get_files, mcachew from ..common import get_files, mcachew
from ..error import Res from ..core.error import Res, sort_res_by
from my.config import github as config from my.config import github as config
import my.config.repos.ghexport.dal as ghexport import my.config.repos.ghexport.dal as ghexport
@ -197,7 +197,7 @@ def _parse_event(d: Dict) -> Event:
) )
def iter_gdpr_events() -> Iterator[Res[Event]]: def iter_gdpr_events() -> Iterable[Res[Event]]:
""" """
Parses events from GDPR export (https://github.com/settings/admin) Parses events from GDPR export (https://github.com/settings/admin)
""" """
@ -240,12 +240,12 @@ def iter_gdpr_events() -> Iterator[Res[Event]]:
# TODO hmm. not good, need to be lazier?... # TODO hmm. not good, need to be lazier?...
@mcachew(config.cache_dir, hashf=lambda dal: dal.sources) @mcachew(config.cache_dir, hashf=lambda dal: dal.sources)
def iter_backup_events(dal=_dal()) -> Iterator[Event]: def iter_backup_events(dal=_dal()) -> Iterable[Event]:
for d in dal.events(): for d in dal.events():
yield _parse_event(d) yield _parse_event(d)
def iter_events() -> Iterator[Res[Event]]: def events() -> Iterable[Res[Event]]:
from itertools import chain from itertools import chain
emitted: Set[Tuple[datetime, str]] = set() emitted: Set[Tuple[datetime, str]] = set()
for e in chain(iter_gdpr_events(), iter_backup_events()): for e in chain(iter_gdpr_events(), iter_backup_events()):
@ -260,13 +260,16 @@ def iter_events() -> Iterator[Res[Event]]:
logger.debug('ignoring %s: %s', key, e) logger.debug('ignoring %s: %s', key, e)
continue continue
yield e yield e
emitted.add(key) emitted.add(key) # todo more_itertools
def get_events(): def get_events() -> Iterable[Res[Event]]:
return sorted(iter_events(), key=lambda e: e.dt) return sort_res_by(events(), key=lambda e: e.dt)
# TODO mm. ok, not much point in deserializing as github.Event as it's basically a fancy dict wrapper? # TODO mm. ok, not much point in deserializing as github.Event as it's basically a fancy dict wrapper?
# from github.Event import Event as GEvent # type: ignore # from github.Event import Event as GEvent # type: ignore
# # see https://github.com/PyGithub/PyGithub/blob/master/github/GithubObject.py::GithubObject.__init__ # # see https://github.com/PyGithub/PyGithub/blob/master/github/GithubObject.py::GithubObject.__init__
# e = GEvent(None, None, raw_event, True) # e = GEvent(None, None, raw_event, True)
# todo deprecate
iter_events = events

View file

@ -1,6 +1,6 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
from datetime import datetime from datetime import datetime
from typing import NamedTuple, List from typing import NamedTuple, List, Iterable
from ..google.takeout.html import read_html from ..google.takeout.html import read_html
from ..google.takeout.paths import get_last_takeout from ..google.takeout.paths import get_last_takeout
@ -16,7 +16,7 @@ class Watched(NamedTuple):
return f'{self.url}-{self.when.isoformat()}' return f'{self.url}-{self.when.isoformat()}'
def get_watched(): def watched() -> Iterable[Watched]:
# TODO need to use a glob? to make up for old takeouts that didn't start with Takeout/ # TODO need to use a glob? to make up for old takeouts that didn't start with Takeout/
path = 'Takeout/My Activity/YouTube/MyActivity.html' # looks like this one doesn't have retention? so enough to use the last path = 'Takeout/My Activity/YouTube/MyActivity.html' # looks like this one doesn't have retention? so enough to use the last
# TODO YouTube/history/watch-history.html, also YouTube/history/watch-history.json # TODO YouTube/history/watch-history.html, also YouTube/history/watch-history.json
@ -30,6 +30,10 @@ def get_watched():
return list(sorted(watches, key=lambda e: e.when)) return list(sorted(watches, key=lambda e: e.when))
# todo deprecate
get_watched = watched
def main(): def main():
# TODO shit. a LOT of watches... # TODO shit. a LOT of watches...
for w in get_watched(): for w in get_watched():

View file

@ -1,5 +1,12 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
from my.coding.github import get_events from more_itertools import ilen
from my.coding.github import get_events, iter_gdpr_events
def test_gdpr():
assert ilen(iter_gdpr_events()) > 100
def test(): def test():
events = get_events() events = get_events()

View file

@ -1,5 +1,4 @@
# TODO move elsewhere? # TODO move elsewhere?
# these tests would only make sense with some existing data? although some of them would work for everyone.. # these tests would only make sense with some existing data? although some of them would work for everyone..
# not sure what's a good way of handling this.. # not sure what's a good way of handling this..
@ -7,7 +6,7 @@ from my.media.youtube import get_watched, Watched
def test(): def test():
watched = get_watched() watched = list(get_watched())
assert len(watched) > 1000 assert len(watched) > 1000
from datetime import datetime from datetime import datetime