From d7aff1be3ff5cd4a51574bf97ecdb1395e73a91f Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Mon, 1 Jun 2020 22:10:29 +0100 Subject: [PATCH 1/4] github: start moving to a proper artbitrated module --- my/core/common.py | 1 + my/{coding/github.py => github/common.py} | 19 +++++++++++-------- my/media/youtube.py | 8 ++++++-- tests/github.py | 9 ++++++++- tests/youtube.py | 3 +-- 5 files changed, 27 insertions(+), 13 deletions(-) rename my/{coding/github.py => github/common.py} (95%) diff --git a/my/core/common.py b/my/core/common.py index 64e7b23..74aac5e 100644 --- a/my/core/common.py +++ b/my/core/common.py @@ -116,6 +116,7 @@ from ..kython.klogging import setup_logger, LazyLogger Paths = Union[Sequence[PathIsh], PathIsh] +# TODO support '' for emtpy path DEFAULT_GLOB = '*' def get_files(pp: Paths, glob: str=DEFAULT_GLOB, sort: bool=True) -> Tuple[Path, ...]: """ diff --git a/my/coding/github.py b/my/github/common.py similarity index 95% rename from my/coding/github.py rename to my/github/common.py index 3f5dd63..1f05a19 100644 --- a/my/coding/github.py +++ b/my/github/common.py @@ -1,7 +1,7 @@ """ Github events and their metadata: comments/issues/pull requests """ -from typing import Dict, Any, NamedTuple, Tuple, Optional, Iterator, TypeVar, Set +from typing import Dict, Any, NamedTuple, Tuple, Optional, Iterable, TypeVar, Set from datetime import datetime import json @@ -10,7 +10,7 @@ import pytz from ..kython.klogging import LazyLogger from ..kython.kompress import CPath from ..common import get_files, mcachew -from ..error import Res +from ..core.error import Res, sort_res_by from my.config import github as config import my.config.repos.ghexport.dal as ghexport @@ -197,7 +197,7 @@ def _parse_event(d: Dict) -> Event: ) -def iter_gdpr_events() -> Iterator[Res[Event]]: +def iter_gdpr_events() -> Iterable[Res[Event]]: """ Parses events from GDPR export (https://github.com/settings/admin) """ @@ -240,12 +240,12 @@ def iter_gdpr_events() -> Iterator[Res[Event]]: # TODO hmm. not good, need to be lazier?... @mcachew(config.cache_dir, hashf=lambda dal: dal.sources) -def iter_backup_events(dal=_dal()) -> Iterator[Event]: +def iter_backup_events(dal=_dal()) -> Iterable[Event]: for d in dal.events(): yield _parse_event(d) -def iter_events() -> Iterator[Res[Event]]: +def events() -> Iterable[Res[Event]]: from itertools import chain emitted: Set[Tuple[datetime, str]] = set() for e in chain(iter_gdpr_events(), iter_backup_events()): @@ -260,13 +260,16 @@ def iter_events() -> Iterator[Res[Event]]: logger.debug('ignoring %s: %s', key, e) continue yield e - emitted.add(key) + emitted.add(key) # todo more_itertools -def get_events(): - return sorted(iter_events(), key=lambda e: e.dt) +def get_events() -> Iterable[Res[Event]]: + return sort_res_by(events(), key=lambda e: e.dt) # TODO mm. ok, not much point in deserializing as github.Event as it's basically a fancy dict wrapper? 
# from github.Event import Event as GEvent # type: ignore # # see https://github.com/PyGithub/PyGithub/blob/master/github/GithubObject.py::GithubObject.__init__ # e = GEvent(None, None, raw_event, True) + +# todo deprecate +iter_events = events diff --git a/my/media/youtube.py b/my/media/youtube.py index ffe2740..faeb09a 100755 --- a/my/media/youtube.py +++ b/my/media/youtube.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 from datetime import datetime -from typing import NamedTuple, List +from typing import NamedTuple, List, Iterable from ..google.takeout.html import read_html from ..google.takeout.paths import get_last_takeout @@ -16,7 +16,7 @@ class Watched(NamedTuple): return f'{self.url}-{self.when.isoformat()}' -def get_watched(): +def watched() -> Iterable[Watched]: # TODO need to use a glob? to make up for old takouts that didn't start with Takeout/ path = 'Takeout/My Activity/YouTube/MyActivity.html' # looks like this one doesn't have retention? so enough to use the last # TODO YouTube/history/watch-history.html, also YouTube/history/watch-history.json @@ -30,6 +30,10 @@ def get_watched(): return list(sorted(watches, key=lambda e: e.when)) +# todo deprecate +get_watched = watched + + def main(): # TODO shit. a LOT of watches... for w in get_watched(): diff --git a/tests/github.py b/tests/github.py index d296096..a007a42 100644 --- a/tests/github.py +++ b/tests/github.py @@ -1,5 +1,12 @@ #!/usr/bin/env python3 -from my.coding.github import get_events +from more_itertools import ilen + +from my.coding.github import get_events, iter_gdpr_events + + +def test_gdpr(): + assert ilen(iter_gdpr_events()) > 100 + def test(): events = get_events() diff --git a/tests/youtube.py b/tests/youtube.py index 104f2d8..b8c1aa8 100644 --- a/tests/youtube.py +++ b/tests/youtube.py @@ -1,5 +1,4 @@ # TODO move elsewhere? - # these tests would only make sense with some existing data? although some of them would work for everyone.. # not sure what's a good way of handling this.. @@ -7,7 +6,7 @@ from my.media.youtube import get_watched, Watched def test(): - watched = get_watched() + watched = list(get_watched()) assert len(watched) > 1000 from datetime import datetime From ca39187c6347c645ba37d92246235b97e81eed84 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Mon, 1 Jun 2020 22:42:45 +0100 Subject: [PATCH 2/4] github: DEPRECATE my.coding.github Instead my.github.all should be used (still backward compatible) The reasons are a) I don't feel that grouping (i.e. my.coding.*) makes much sense b) using .all pattern (same way as twitter) allows for more composable and cleaner separation of GDPR and API data --- my/coding/github.py | 8 ++ my/github/all.py | 17 +++ my/github/common.py | 257 +++--------------------------------------- my/github/gdpr.py | 128 +++++++++++++++++++++ my/github/ghexport.py | 111 ++++++++++++++++++ my/twitter/all.py | 2 +- tests/github.py | 7 +- 7 files changed, 286 insertions(+), 244 deletions(-) create mode 100644 my/coding/github.py create mode 100644 my/github/all.py create mode 100644 my/github/gdpr.py create mode 100644 my/github/ghexport.py diff --git a/my/coding/github.py b/my/coding/github.py new file mode 100644 index 0000000..e1e0d77 --- /dev/null +++ b/my/coding/github.py @@ -0,0 +1,8 @@ +import warnings + +warnings.warn('my.coding.github is deprecated! 
Please use my.github.all instead!', DeprecationWarning) + +from ..github.all import events, get_events + +# todo deprecate properly +iter_events = events diff --git a/my/github/all.py b/my/github/all.py new file mode 100644 index 0000000..61dcef3 --- /dev/null +++ b/my/github/all.py @@ -0,0 +1,17 @@ +from . import gdpr, ghexport + +from .common import merge_events, Results + + +def events() -> Results: + yield from merge_events( + gdpr.events(), + ghexport.events(), + ) + + +# todo hmm. not sure, maybe should be named sorted_events or something.. +# also, not great that it's in all.py... think of a better way... +def get_events() -> Results: + from ..core.error import sort_res_by + return sort_res_by(events(), key=lambda e: e.dt) diff --git a/my/github/common.py b/my/github/common.py index 1f05a19..6e003d3 100644 --- a/my/github/common.py +++ b/my/github/common.py @@ -1,22 +1,13 @@ """ Github events and their metadata: comments/issues/pull requests """ -from typing import Dict, Any, NamedTuple, Tuple, Optional, Iterable, TypeVar, Set from datetime import datetime -import json +from typing import Optional, NamedTuple, Iterable, Set, Tuple import pytz -from ..kython.klogging import LazyLogger -from ..kython.kompress import CPath -from ..common import get_files, mcachew -from ..core.error import Res, sort_res_by - -from my.config import github as config -import my.config.repos.ghexport.dal as ghexport - - -logger = LazyLogger(__name__) +from ..core import warn_if_empty +from ..core.error import Res class Event(NamedTuple): @@ -28,227 +19,15 @@ class Event(NamedTuple): is_bot: bool = False -# TODO hmm. need some sort of abstract syntax for this... -# TODO split further, title too -def _get_summary(e) -> Tuple[str, Optional[str], Optional[str]]: - # TODO would be nice to give access to raw event withing timeline - eid = e['id'] - tp = e['type'] - pl = e['payload'] - rname = e['repo']['name'] +Results = Iterable[Res[Event]] - mapping = { - 'CreateEvent': 'created', - 'DeleteEvent': 'deleted', - } - - if tp == 'ForkEvent': - url = e['payload']['forkee']['html_url'] - return f"{rname}: forked", url, None - elif tp == 'PushEvent': - commits = pl['commits'] - messages = [c['message'] for c in commits] - body = '\n'.join(messages) - return f"{rname}: pushed\n{body}", None, None - elif tp == 'WatchEvent': - return f"{rname}: watching", None, None - elif tp in mapping: - what = mapping[tp] - rt = pl['ref_type'] - ref = pl['ref'] - # TODO link to branch? only contains weird API link though - # TODO hmm. include timestamp instead? - # breakpoint() - # TODO combine automatically instead - return f"{rname}: {what} {rt} {ref}", None, f'{rname}_{what}_{rt}_{ref}_{eid}' - elif tp == 'PullRequestEvent': - pr = pl['pull_request'] - action = pl['action'] - link = pr['html_url'] - title = pr['title'] - return f"{rname}: {action} PR {title}", link, f'{rname}_{action}_pr_{link}' - elif tp == "IssuesEvent": - action = pl['action'] - iss = pl['issue'] - link = iss['html_url'] - title = iss['title'] - return f"{rname}: {action} issue {title}", link, None - elif tp == "IssueCommentEvent": - com = pl['comment'] - link = com['html_url'] - iss = pl['issue'] - title = iss['title'] - return f"{rname}: commented on issue {title}", link, f'issue_comment_' + link - elif tp == "ReleaseEvent": - action = pl['action'] - rel = pl['release'] - tag = rel['tag_name'] - link = rel['html_url'] - return f"{rname}: {action} [{tag}]", link, None - elif tp in 'PublicEvent': - return f'{tp} {e}', None, None # TODO ??? 
- else: - return tp, None, None - - -def inputs(): - return get_files(config.export_dir) - - -def _dal(): - sources = inputs() - sources = list(map(CPath, sources)) # TODO maybe move it to get_files? e.g. compressed=True arg? - return ghexport.DAL(sources) - - -def _parse_dt(s: str) -> datetime: - # TODO isoformat? - return pytz.utc.localize(datetime.strptime(s, '%Y-%m-%dT%H:%M:%SZ')) - - -# TODO extract to separate gdpr module? -# TODO typing.TypedDict could be handy here.. -def _parse_common(d: Dict) -> Dict: - url = d['url'] - body = d.get('body') - return { - 'dt' : _parse_dt(d['created_at']), - 'link': url, - 'body': body, - } - - -def _parse_repository(d: Dict) -> Event: - pref = 'https://github.com/' - url = d['url'] - assert url.startswith(pref); name = url[len(pref):] - return Event( # type: ignore[misc] - **_parse_common(d), - summary='created ' + name, - eid='created_' + name, # TODO ?? - ) - -def _parse_issue_comment(d: Dict) -> Event: - url = d['url'] - is_bot = "[bot]" in d["user"] - return Event( # type: ignore[misc] - **_parse_common(d), - summary=f'commented on issue {url}', - eid='issue_comment_' + url, - is_bot=is_bot, - ) - - -def _parse_issue(d: Dict) -> Event: - url = d['url'] - title = d['title'] - is_bot = "[bot]" in d["user"] - return Event( # type: ignore[misc] - **_parse_common(d), - summary=f'opened issue {title}', - eid='issue_comment_' + url, - is_bot=is_bot, - ) - - -def _parse_pull_request(d: Dict) -> Event: - url = d['url'] - title = d['title'] - is_bot = "[bot]" in d["user"] - return Event( # type: ignore[misc] - **_parse_common(d), - # TODO distinguish incoming/outgoing? - # TODO action? opened/closed?? - summary=f'opened PR {title}', - eid='pull_request_' + url, - is_bot=is_bot, - ) - - -def _parse_release(d: Dict) -> Event: - tag = d['tag_name'] - return Event( # type: ignore[misc] - **_parse_common(d), - summary=f'released {tag}', - eid='release_' + tag, - ) - - -def _parse_commit_comment(d: Dict) -> Event: - url = d['url'] - return Event( # type: ignore[misc] - **_parse_common(d), - summary=f'commented on {url}', - eid='commoit_comment_' + url, - ) - - -def _parse_event(d: Dict) -> Event: - summary, link, eid = _get_summary(d) - if eid is None: - eid = d['id'] - body = d.get('payload', {}).get('comment', {}).get('body') - return Event( - dt=_parse_dt(d['created_at']), - summary=summary, - link=link, - eid=eid, - body=body, - ) - - -def iter_gdpr_events() -> Iterable[Res[Event]]: - """ - Parses events from GDPR export (https://github.com/settings/admin) - """ - # TODO allow using archive here? - files = get_files(config.gdpr_dir, glob='*.json') - handler_map = { - 'schema' : None, - 'issue_events_': None, # eh, doesn't seem to have any useful bodies - 'attachments_' : None, # not sure if useful - 'users' : None, # just contains random users - 'repositories_' : _parse_repository, - 'issue_comments_': _parse_issue_comment, - 'issues_' : _parse_issue, - 'pull_requests_' : _parse_pull_request, - 'releases_' : _parse_release, - 'commit_comments': _parse_commit_comment, - } - for f in files: - handler: Any - for prefix, h in handler_map.items(): - if not f.name.startswith(prefix): - continue - handler = h - break - else: - yield RuntimeError(f'Unhandled file: {f}') - continue - - if handler is None: - # ignored - continue - - j = json.loads(f.read_text()) - for r in j: - try: - yield handler(r) - except Exception as e: - yield e - - -# TODO hmm. not good, need to be lazier?... 
-@mcachew(config.cache_dir, hashf=lambda dal: dal.sources) -def iter_backup_events(dal=_dal()) -> Iterable[Event]: - for d in dal.events(): - yield _parse_event(d) - - -def events() -> Iterable[Res[Event]]: +@warn_if_empty +def merge_events(*sources: Results) -> Results: + from ..kython.klogging import LazyLogger + logger = LazyLogger(__name__) from itertools import chain emitted: Set[Tuple[datetime, str]] = set() - for e in chain(iter_gdpr_events(), iter_backup_events()): + for e in chain(*sources): if isinstance(e, Exception): yield e continue @@ -260,16 +39,14 @@ def events() -> Iterable[Res[Event]]: logger.debug('ignoring %s: %s', key, e) continue yield e - emitted.add(key) # todo more_itertools + emitted.add(key) + # todo use unique_everseen? Might be tricky with Exception etc.. -def get_events() -> Iterable[Res[Event]]: - return sort_res_by(events(), key=lambda e: e.dt) +def parse_dt(s: str) -> datetime: + # TODO isoformat? + return pytz.utc.localize(datetime.strptime(s, '%Y-%m-%dT%H:%M:%SZ')) -# TODO mm. ok, not much point in deserializing as github.Event as it's basically a fancy dict wrapper? -# from github.Event import Event as GEvent # type: ignore -# # see https://github.com/PyGithub/PyGithub/blob/master/github/GithubObject.py::GithubObject.__init__ -# e = GEvent(None, None, raw_event, True) - -# todo deprecate -iter_events = events +# TODO not sure +# def get_events() -> Iterable[Res[Event]]: +# return sort_res_by(events(), key=lambda e: e.dt) diff --git a/my/github/gdpr.py b/my/github/gdpr.py new file mode 100644 index 0000000..b1504e9 --- /dev/null +++ b/my/github/gdpr.py @@ -0,0 +1,128 @@ +from datetime import datetime +import json +from typing import Iterable, Dict, Any + +from ..core.error import Res +from ..core import get_files + +from .common import Event, parse_dt + +from my.config import github as config + + +def events() -> Iterable[Res[Event]]: + """ + Parses events from GDPR export (https://github.com/settings/admin) + """ + # TODO allow using archive here? + files = get_files(config.gdpr_dir, glob='*.json') + handler_map = { + 'schema' : None, + 'issue_events_': None, # eh, doesn't seem to have any useful bodies + 'attachments_' : None, # not sure if useful + 'users' : None, # just contains random users + 'repositories_' : _parse_repository, + 'issue_comments_': _parse_issue_comment, + 'issues_' : _parse_issue, + 'pull_requests_' : _parse_pull_request, + 'releases_' : _parse_release, + 'commit_comments': _parse_commit_comment, + } + for f in files: + handler: Any + for prefix, h in handler_map.items(): + if not f.name.startswith(prefix): + continue + handler = h + break + else: + yield RuntimeError(f'Unhandled file: {f}') + continue + + if handler is None: + # ignored + continue + + j = json.loads(f.read_text()) + for r in j: + try: + yield handler(r) + except Exception as e: + yield e + + +# TODO typing.TypedDict could be handy here.. +def _parse_common(d: Dict) -> Dict: + url = d['url'] + body = d.get('body') + return { + 'dt' : parse_dt(d['created_at']), + 'link': url, + 'body': body, + } + + +def _parse_repository(d: Dict) -> Event: + pref = 'https://github.com/' + url = d['url'] + assert url.startswith(pref); name = url[len(pref):] + return Event( # type: ignore[misc] + **_parse_common(d), + summary='created ' + name, + eid='created_' + name, # TODO ?? 
+ ) + + +def _parse_issue_comment(d: Dict) -> Event: + url = d['url'] + is_bot = "[bot]" in d["user"] + return Event( # type: ignore[misc] + **_parse_common(d), + summary=f'commented on issue {url}', + eid='issue_comment_' + url, + is_bot=is_bot, + ) + + +def _parse_issue(d: Dict) -> Event: + url = d['url'] + title = d['title'] + is_bot = "[bot]" in d["user"] + return Event( # type: ignore[misc] + **_parse_common(d), + summary=f'opened issue {title}', + eid='issue_comment_' + url, + is_bot=is_bot, + ) + + +def _parse_pull_request(d: Dict) -> Event: + url = d['url'] + title = d['title'] + is_bot = "[bot]" in d["user"] + return Event( # type: ignore[misc] + **_parse_common(d), + # TODO distinguish incoming/outgoing? + # TODO action? opened/closed?? + summary=f'opened PR {title}', + eid='pull_request_' + url, + is_bot=is_bot, + ) + + +def _parse_release(d: Dict) -> Event: + tag = d['tag_name'] + return Event( # type: ignore[misc] + **_parse_common(d), + summary=f'released {tag}', + eid='release_' + tag, + ) + + +def _parse_commit_comment(d: Dict) -> Event: + url = d['url'] + return Event( # type: ignore[misc] + **_parse_common(d), + summary=f'commented on {url}', + eid='commoit_comment_' + url, + ) diff --git a/my/github/ghexport.py b/my/github/ghexport.py new file mode 100644 index 0000000..2a7c239 --- /dev/null +++ b/my/github/ghexport.py @@ -0,0 +1,111 @@ +from pathlib import Path +from typing import Tuple, Optional, Iterable, Dict, Sequence + +from ..core import get_files +from ..core.common import mcachew +from ..kython.kompress import CPath + +from .common import Event, parse_dt, Results + +from my.config import github as config +import my.config.repos.ghexport.dal as ghexport + + +def inputs() -> Sequence[Path]: + return get_files(config.export_dir) + + +def _dal(): + sources = inputs() + sources = list(map(CPath, sources)) # TODO maybe move it to get_files? e.g. compressed=True arg? + return ghexport.DAL(sources) + + +# TODO hmm. not good, need to be lazier?... +@mcachew(config.cache_dir, hashf=lambda dal: dal.sources) +def events(dal=_dal()) -> Results: + for d in dal.events(): + yield _parse_event(d) + + +# TODO hmm. need some sort of abstract syntax for this... +# TODO split further, title too +def _get_summary(e) -> Tuple[str, Optional[str], Optional[str]]: + # TODO would be nice to give access to raw event withing timeline + eid = e['id'] + tp = e['type'] + pl = e['payload'] + rname = e['repo']['name'] + + mapping = { + 'CreateEvent': 'created', + 'DeleteEvent': 'deleted', + } + + if tp == 'ForkEvent': + url = e['payload']['forkee']['html_url'] + return f"{rname}: forked", url, None + elif tp == 'PushEvent': + commits = pl['commits'] + messages = [c['message'] for c in commits] + body = '\n'.join(messages) + return f"{rname}: pushed\n{body}", None, None + elif tp == 'WatchEvent': + return f"{rname}: watching", None, None + elif tp in mapping: + what = mapping[tp] + rt = pl['ref_type'] + ref = pl['ref'] + # TODO link to branch? only contains weird API link though + # TODO hmm. include timestamp instead? 
+ # breakpoint() + # TODO combine automatically instead + return f"{rname}: {what} {rt} {ref}", None, f'{rname}_{what}_{rt}_{ref}_{eid}' + elif tp == 'PullRequestEvent': + pr = pl['pull_request'] + action = pl['action'] + link = pr['html_url'] + title = pr['title'] + return f"{rname}: {action} PR {title}", link, f'{rname}_{action}_pr_{link}' + elif tp == "IssuesEvent": + action = pl['action'] + iss = pl['issue'] + link = iss['html_url'] + title = iss['title'] + return f"{rname}: {action} issue {title}", link, None + elif tp == "IssueCommentEvent": + com = pl['comment'] + link = com['html_url'] + iss = pl['issue'] + title = iss['title'] + return f"{rname}: commented on issue {title}", link, f'issue_comment_' + link + elif tp == "ReleaseEvent": + action = pl['action'] + rel = pl['release'] + tag = rel['tag_name'] + link = rel['html_url'] + return f"{rname}: {action} [{tag}]", link, None + elif tp in 'PublicEvent': + return f'{tp} {e}', None, None # TODO ??? + else: + return tp, None, None + + +def _parse_event(d: Dict) -> Event: + summary, link, eid = _get_summary(d) + if eid is None: + eid = d['id'] + body = d.get('payload', {}).get('comment', {}).get('body') + return Event( + dt=parse_dt(d['created_at']), + summary=summary, + link=link, + eid=eid, + body=body, + ) + + +# TODO mm. ok, not much point in deserializing as github.Event as it's basically a fancy dict wrapper? +# from github.Event import Event as GEvent # type: ignore +# # see https://github.com/PyGithub/PyGithub/blob/master/github/GithubObject.py::GithubObject.__init__ +# e = GEvent(None, None, raw_event, True) diff --git a/my/twitter/all.py b/my/twitter/all.py index 5c8103c..0899454 100644 --- a/my/twitter/all.py +++ b/my/twitter/all.py @@ -7,13 +7,13 @@ from . import twint, archive from .common import merge_tweets + def tweets(): yield from merge_tweets( twint .tweets(), archive.tweets(), ) -from .common import merge_tweets def likes(): yield from merge_tweets( diff --git a/tests/github.py b/tests/github.py index a007a42..5817756 100644 --- a/tests/github.py +++ b/tests/github.py @@ -1,15 +1,16 @@ #!/usr/bin/env python3 from more_itertools import ilen -from my.coding.github import get_events, iter_gdpr_events +from my.coding.github import get_events def test_gdpr(): - assert ilen(iter_gdpr_events()) > 100 + import my.github.gdpr as gdpr + assert ilen(gdpr.events()) > 100 def test(): events = get_events() - assert len(events) > 100 + assert ilen(events) > 100 for e in events: print(e) From a267aeec5b87f8bda555aff281131f9e1ec57731 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Mon, 1 Jun 2020 23:33:34 +0100 Subject: [PATCH 3/4] github: add config templates + docs - ghexport: use export_path (export_dir is still supported) --- doc/MODULES.org | 32 +++++++++++++++++++++ my/github/all.py | 4 +++ my/github/gdpr.py | 25 ++++++++++++---- my/github/ghexport.py | 67 ++++++++++++++++++++++++++++++++++++++----- my/twitter/archive.py | 3 +- my/twitter/twint.py | 1 + 6 files changed, 118 insertions(+), 14 deletions(-) diff --git a/doc/MODULES.org b/doc/MODULES.org index 763bebd..a30e814 100644 --- a/doc/MODULES.org +++ b/doc/MODULES.org @@ -25,6 +25,8 @@ If you have some issues with the setup, see [[file:SETUP.org::#troubleshooting][ - [[#mylastfm][my.lastfm]] - [[#myreadingpolar][my.reading.polar]] - [[#myinstapaper][my.instapaper]] + - [[#mygithubgdpr][my.github.gdpr]] + - [[#mygithubghexport][my.github.ghexport]] :END: * Intro @@ -74,6 +76,8 @@ modules = [ ('lastfm' , 'my.lastfm' ), ('polar' , 'my.reading.polar' ), ('instapaper' , 
'my.instapaper' ), + ('github' , 'my.github.gdpr' ), + ('github' , 'my.github.ghexport' ), ] def indent(s, spaces=4): @@ -227,3 +231,31 @@ for cls, p in modules: # alternatively, you can put the repository (or a symlink) in $MY_CONFIG/my/config/repos/instapexport instapexport: Optional[PathIsh] = None #+end_src +** [[file:../my/github/gdpr.py][my.github.gdpr]] + + Github data (uses [[https://github.com/settings/admin][official GDPR export]]) + + #+begin_src python + class github: + gdpr_dir: PathIsh # path to unpacked GDPR archive + #+end_src +** [[file:../my/github/ghexport.py][my.github.ghexport]] + + Github data: events, comments, etc. (API data) + + #+begin_src python + class github: + ''' + Uses [[https://github.com/karlicoss/ghexport][ghexport]] outputs. + ''' + # path[s]/glob to the exported JSON data + export_path: Paths + + # path to a local clone of ghexport + # alternatively, you can put the repository (or a symlink) in $MY_CONFIG/my/config/repos/ghexport + ghexport : Optional[PathIsh] = None + + # path to a cache directory + # if omitted, will use /tmp + cache_dir: Optional[PathIsh] = None + #+end_src diff --git a/my/github/all.py b/my/github/all.py index 61dcef3..f885dde 100644 --- a/my/github/all.py +++ b/my/github/all.py @@ -1,3 +1,7 @@ +""" +Unified Github data (merged from GDPR export and periodic API updates) +""" + from . import gdpr, ghexport from .common import merge_events, Results diff --git a/my/github/gdpr.py b/my/github/gdpr.py index b1504e9..cc813a8 100644 --- a/my/github/gdpr.py +++ b/my/github/gdpr.py @@ -1,3 +1,7 @@ +""" +Github data (uses [[https://github.com/settings/admin][official GDPR export]]) +""" + from datetime import datetime import json from typing import Iterable, Dict, Any @@ -7,14 +11,25 @@ from ..core import get_files from .common import Event, parse_dt -from my.config import github as config +# TODO later, use a separate user config? (github_gdpr) +from my.config import github as user_config + +from dataclasses import dataclass +from ..core import PathIsh + +@dataclass +class github(user_config): + gdpr_dir: PathIsh # path to unpacked GDPR archive + +### + + +from ..core.cfg import make_config +config = make_config(github) def events() -> Iterable[Res[Event]]: - """ - Parses events from GDPR export (https://github.com/settings/admin) - """ - # TODO allow using archive here? + # TODO FIXME allow using archive here? files = get_files(config.gdpr_dir, glob='*.json') handler_map = { 'schema' : None, diff --git a/my/github/ghexport.py b/my/github/ghexport.py index 2a7c239..30fd76c 100644 --- a/my/github/ghexport.py +++ b/my/github/ghexport.py @@ -1,5 +1,61 @@ +""" +Github data: events, comments, etc. (API data) +""" +from dataclasses import dataclass +from typing import Optional + +from ..core import Paths, PathIsh + +from my.config import github as user_config + + +@dataclass +class github(user_config): + ''' + Uses [[https://github.com/karlicoss/ghexport][ghexport]] outputs. 
+ ''' + # path[s]/glob to the exported JSON data + export_path: Paths + + # path to a local clone of ghexport + # alternatively, you can put the repository (or a symlink) in $MY_CONFIG/my/config/repos/ghexport + ghexport : Optional[PathIsh] = None + + # path to a cache directory + # if omitted, will use /tmp + cache_dir: Optional[PathIsh] = None + + @property + def dal_module(self): + rpath = self.ghexport + if rpath is not None: + from .core.common import import_dir + return import_dir(rpath, '.dal') + else: + import my.config.repos.ghexport.dal as dal + return dal +### + +# TODO perhaps using /tmp in case of None isn't ideal... maybe it should be treated as if cache is off + +from ..core.cfg import make_config, Attrs +def migration(attrs: Attrs) -> Attrs: + if 'export_dir' in attrs: # legacy name + attrs['export_path'] = attrs['export_dir'] + return attrs +config = make_config(github, migration=migration) + + +from typing import TYPE_CHECKING +if TYPE_CHECKING: + import my.config.repos.ghexport.dal as dal +else: + dal = config.dal_module + +############################ + from pathlib import Path -from typing import Tuple, Optional, Iterable, Dict, Sequence +from typing import Tuple, Iterable, Dict, Sequence from ..core import get_files from ..core.common import mcachew @@ -7,18 +63,15 @@ from ..kython.kompress import CPath from .common import Event, parse_dt, Results -from my.config import github as config -import my.config.repos.ghexport.dal as ghexport - def inputs() -> Sequence[Path]: - return get_files(config.export_dir) + return get_files(config.export_path) -def _dal(): +def _dal() -> dal.DAL: sources = inputs() sources = list(map(CPath, sources)) # TODO maybe move it to get_files? e.g. compressed=True arg? - return ghexport.DAL(sources) + return dal.DAL(sources) # TODO hmm. not good, need to be lazier?... diff --git a/my/twitter/archive.py b/my/twitter/archive.py index 031701f..c44272c 100755 --- a/my/twitter/archive.py +++ b/my/twitter/archive.py @@ -18,9 +18,8 @@ except ImportError as e: from dataclasses import dataclass -from ..core.common import Paths +from ..core import Paths -# TODO perhaps rename to twitter_archive? dunno @dataclass class twitter_archive(user_config): export_path: Paths # path[s]/glob to the twitter archive takeout diff --git a/my/twitter/twint.py b/my/twitter/twint.py index 0c45a0d..3a2b327 100644 --- a/my/twitter/twint.py +++ b/my/twitter/twint.py @@ -14,6 +14,7 @@ from my.config import twint as user_config class twint(user_config): export_path: Paths # path[s]/glob to the twint Sqlite database +#### from ..core.cfg import make_config config = make_config(twint) From 3d7844b71130f75b914a138871898dec3f956007 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Mon, 1 Jun 2020 23:45:26 +0100 Subject: [PATCH 4/4] core: support '' for explicitly set empty path set --- doc/MODULES.org | 4 +++- doc/SETUP.org | 3 +-- my/core/common.py | 13 +++++++++---- tests/get_files.py | 3 +++ 4 files changed, 16 insertions(+), 7 deletions(-) diff --git a/doc/MODULES.org b/doc/MODULES.org index a30e814..4b33143 100644 --- a/doc/MODULES.org +++ b/doc/MODULES.org @@ -46,7 +46,9 @@ Some explanations: - =/a/path/to/directory/=, so the module will consume all files from this directory - a list of files/directories (it will be flattened) - a [[https://docs.python.org/3/library/glob.html?highlight=glob#glob.glob][glob]] string, so you can be flexible about the format of your data on disk (e.g. if you want to keep it compressed) - - empty sequence (e.g. 
~export_path = ()~), this is useful for modules that merge multiple data sources (for example, =my.twitter=) + - empty string (e.g. ~export_path = ''~), this will prevent the module from consuming any data + + This can be useful for modules that merge multiple data sources (for example, =my.twitter= or =my.github=) Typically, such variable will be passed to =get_files= to actually extract the list of real files to use. You can see usage examples [[https://github.com/karlicoss/HPI/blob/master/tests/get_files.py][here]]. diff --git a/doc/SETUP.org b/doc/SETUP.org index bacb489..bd4c6fd 100644 --- a/doc/SETUP.org +++ b/doc/SETUP.org @@ -474,8 +474,7 @@ Since you have two different sources of raw data, you need to specify two bits o : class twitter_archive: : export_path = '/backups/twitter-archives/*.zip' -Note that you can also just use =my.twitter.archive= or =my.twitter.twint= directly, or set either of paths to 'empty path': =()= -# TODO empty string? +Note that you can also just use =my.twitter.archive= or =my.twitter.twint= directly, or set either of paths to empty string: =''= # (TODO mypy-safe?) # #addingmodifying-modules diff --git a/my/core/common.py b/my/core/common.py index 74aac5e..324ae26 100644 --- a/my/core/common.py +++ b/my/core/common.py @@ -125,11 +125,16 @@ def get_files(pp: Paths, glob: str=DEFAULT_GLOB, sort: bool=True) -> Tuple[Path, Tuple as return type is a bit friendlier for hashing/caching, so hopefully makes sense """ # TODO FIXME mm, some wrapper to assert iterator isn't empty? - sources: List[Path] = [] - if isinstance(pp, (str, Path)): - sources.append(Path(pp)) + sources: List[Path] + if isinstance(pp, Path): + sources = [pp] + elif isinstance(pp, str): + if pp == '': + # special case -- makes sense for optional data sources, etc + return () # early return to prevent warnings etc + sources = [Path(pp)] else: - sources.extend(map(Path, pp)) + sources = [Path(p) for p in pp] def caller() -> str: import traceback diff --git a/tests/get_files.py b/tests/get_files.py index 14f2711..aa71e7b 100644 --- a/tests/get_files.py +++ b/tests/get_files.py @@ -102,6 +102,9 @@ def test_no_files(): ''' Test for empty matches. They work, but should result in warning ''' + assert get_files('') == () + + # todo test these for warnings? assert get_files([]) == () assert get_files('bad*glob') == ()
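
Usage sketch (illustration only, not part of the patches above): assuming `my.config.github` provides `gdpr_dir` and `export_path` (either can be set to '' to disable that source, per the get_files('') change in the last patch), the merged module introduced here could be consumed roughly like this:

    #!/usr/bin/env python3
    # Rough sketch -- module/field names follow the patches above
    # (my.github.all.events(), Event.dt / .summary / .is_bot);
    # the filtering and the recent_summaries helper are hypothetical, for illustration only.
    from my.github.all import events

    def recent_summaries(n: int = 10):
        ok = []
        for e in events():               # Res[Event]: each item is an Event or an Exception
            if isinstance(e, Exception):
                print('error while parsing:', e)
                continue
            if e.is_bot:                 # skip bot-generated comments/issues
                continue
            ok.append(e)
        ok.sort(key=lambda e: e.dt)      # get_events() does roughly the same via sort_res_by
        return [(e.dt, e.summary) for e in ok[-n:]]

    if __name__ == '__main__':
        for dt, summary in recent_summaries():
            print(dt, summary)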