Merge pull request #61 from karlicoss/updates

github module: cleanup and proper modular layout
karlicoss 2020-06-01 23:52:07 +01:00 committed by GitHub
commit a94b64c273
15 changed files with 453 additions and 284 deletions


@ -25,6 +25,8 @@ If you have some issues with the setup, see [[file:SETUP.org::#troubleshooting][
- [[#mylastfm][my.lastfm]]
- [[#myreadingpolar][my.reading.polar]]
- [[#myinstapaper][my.instapaper]]
- [[#mygithubgdpr][my.github.gdpr]]
- [[#mygithubghexport][my.github.ghexport]]
:END:
* Intro
@ -44,7 +46,9 @@ Some explanations:
- =/a/path/to/directory/=, so the module will consume all files from this directory
- a list of files/directories (it will be flattened)
- a [[https://docs.python.org/3/library/glob.html?highlight=glob#glob.glob][glob]] string, so you can be flexible about the format of your data on disk (e.g. if you want to keep it compressed)
- empty sequence (e.g. ~export_path = ()~), this is useful for modules that merge multiple data sources (for example, =my.twitter=)
- empty string (e.g. ~export_path = ''~), this will prevent the module from consuming any data
  This can be useful for modules that merge multiple data sources (for example, =my.twitter= or =my.github=)

Typically, such variable will be passed to =get_files= to actually extract the list of real files to use. You can see usage examples [[https://github.com/karlicoss/HPI/blob/master/tests/get_files.py][here]].
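
As a rough sketch of how these values resolve (the backup paths below are made up; =get_files= is the same helper the modules import from =my.core=):

#+begin_src python
from my.core import get_files

get_files('/backups/github/')            # all files in a directory
get_files('/backups/github/*.json.xz')   # a glob, e.g. only the compressed exports
get_files(['/backups/github/2019/',      # a list of files/directories, flattened
           '/backups/github/2020/'])
get_files('')                            # empty string: resolves to (), i.e. no data consumed
#+end_src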
@ -74,6 +78,8 @@ modules = [
    ('lastfm' , 'my.lastfm' ),
    ('polar' , 'my.reading.polar' ),
    ('instapaper' , 'my.instapaper' ),
    ('github' , 'my.github.gdpr' ),
    ('github' , 'my.github.ghexport' ),
]

def indent(s, spaces=4):
@ -227,3 +233,31 @@ for cls, p in modules:
    # alternatively, you can put the repository (or a symlink) in $MY_CONFIG/my/config/repos/instapexport
    instapexport: Optional[PathIsh] = None
#+end_src
** [[file:../my/github/gdpr.py][my.github.gdpr]]
Github data (uses [[https://github.com/settings/admin][official GDPR export]])
#+begin_src python
class github:
gdpr_dir: PathIsh # path to unpacked GDPR archive
#+end_src
** [[file:../my/github/ghexport.py][my.github.ghexport]]
Github data: events, comments, etc. (API data)
#+begin_src python
class github:
'''
Uses [[https://github.com/karlicoss/ghexport][ghexport]] outputs.
'''
# path[s]/glob to the exported JSON data
export_path: Paths
# path to a local clone of ghexport
# alternatively, you can put the repository (or a symlink) in $MY_CONFIG/my/config/repos/ghexport
ghexport : Optional[PathIsh] = None
# path to a cache directory
# if omitted, will use /tmp
cache_dir: Optional[PathIsh] = None
#+end_src
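
Putting the two together, a filled-in user config could look roughly like this (the paths are hypothetical):

#+begin_src python
class github:
    gdpr_dir    = '/backups/github-gdpr'       # unpacked GDPR archive
    export_path = '/backups/ghexport/*.json'   # periodic exports produced by ghexport
#+end_src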


@ -474,8 +474,7 @@ Since you have two different sources of raw data, you need to specify two bits o
: class twitter_archive:
:     export_path = '/backups/twitter-archives/*.zip'
Note that you can also just use =my.twitter.archive= or =my.twitter.twint= directly, or set either of paths to 'empty path': =()=
Note that you can also just use =my.twitter.archive= or =my.twitter.twint= directly, or set either of paths to empty string: =''=
# TODO empty string?
# (TODO mypy-safe?)
# #addingmodifying-modules
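
For instance, to rely on the archive alone and disable the twint source, something like this should do (hypothetical config):

: class twint:
:     export_path = ''  # no twint database; my.twitter.all will only use the archive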


@ -1,272 +1,8 @@
""" import warnings
Github events and their metadata: comments/issues/pull requests
"""
from typing import Dict, Any, NamedTuple, Tuple, Optional, Iterator, TypeVar, Set
from datetime import datetime
import json
import pytz warnings.warn('my.coding.github is deprecated! Please use my.github.all instead!', DeprecationWarning)
from ..kython.klogging import LazyLogger from ..github.all import events, get_events
from ..kython.kompress import CPath
from ..common import get_files, mcachew
from ..error import Res
from my.config import github as config # todo deprecate properly
import my.config.repos.ghexport.dal as ghexport iter_events = events
logger = LazyLogger(__name__)
class Event(NamedTuple):
dt: datetime
summary: str
eid: str
link: Optional[str]
body: Optional[str]=None
is_bot: bool = False
# TODO hmm. need some sort of abstract syntax for this...
# TODO split further, title too
def _get_summary(e) -> Tuple[str, Optional[str], Optional[str]]:
# TODO would be nice to give access to raw event withing timeline
eid = e['id']
tp = e['type']
pl = e['payload']
rname = e['repo']['name']
mapping = {
'CreateEvent': 'created',
'DeleteEvent': 'deleted',
}
if tp == 'ForkEvent':
url = e['payload']['forkee']['html_url']
return f"{rname}: forked", url, None
elif tp == 'PushEvent':
commits = pl['commits']
messages = [c['message'] for c in commits]
body = '\n'.join(messages)
return f"{rname}: pushed\n{body}", None, None
elif tp == 'WatchEvent':
return f"{rname}: watching", None, None
elif tp in mapping:
what = mapping[tp]
rt = pl['ref_type']
ref = pl['ref']
# TODO link to branch? only contains weird API link though
# TODO hmm. include timestamp instead?
# breakpoint()
# TODO combine automatically instead
return f"{rname}: {what} {rt} {ref}", None, f'{rname}_{what}_{rt}_{ref}_{eid}'
elif tp == 'PullRequestEvent':
pr = pl['pull_request']
action = pl['action']
link = pr['html_url']
title = pr['title']
return f"{rname}: {action} PR {title}", link, f'{rname}_{action}_pr_{link}'
elif tp == "IssuesEvent":
action = pl['action']
iss = pl['issue']
link = iss['html_url']
title = iss['title']
return f"{rname}: {action} issue {title}", link, None
elif tp == "IssueCommentEvent":
com = pl['comment']
link = com['html_url']
iss = pl['issue']
title = iss['title']
return f"{rname}: commented on issue {title}", link, f'issue_comment_' + link
elif tp == "ReleaseEvent":
action = pl['action']
rel = pl['release']
tag = rel['tag_name']
link = rel['html_url']
return f"{rname}: {action} [{tag}]", link, None
elif tp in 'PublicEvent':
return f'{tp} {e}', None, None # TODO ???
else:
return tp, None, None
def inputs():
return get_files(config.export_dir)
def _dal():
sources = inputs()
sources = list(map(CPath, sources)) # TODO maybe move it to get_files? e.g. compressed=True arg?
return ghexport.DAL(sources)
def _parse_dt(s: str) -> datetime:
# TODO isoformat?
return pytz.utc.localize(datetime.strptime(s, '%Y-%m-%dT%H:%M:%SZ'))
# TODO extract to separate gdpr module?
# TODO typing.TypedDict could be handy here..
def _parse_common(d: Dict) -> Dict:
url = d['url']
body = d.get('body')
return {
'dt' : _parse_dt(d['created_at']),
'link': url,
'body': body,
}
def _parse_repository(d: Dict) -> Event:
pref = 'https://github.com/'
url = d['url']
assert url.startswith(pref); name = url[len(pref):]
return Event( # type: ignore[misc]
**_parse_common(d),
summary='created ' + name,
eid='created_' + name, # TODO ??
)
def _parse_issue_comment(d: Dict) -> Event:
url = d['url']
is_bot = "[bot]" in d["user"]
return Event( # type: ignore[misc]
**_parse_common(d),
summary=f'commented on issue {url}',
eid='issue_comment_' + url,
is_bot=is_bot,
)
def _parse_issue(d: Dict) -> Event:
url = d['url']
title = d['title']
is_bot = "[bot]" in d["user"]
return Event( # type: ignore[misc]
**_parse_common(d),
summary=f'opened issue {title}',
eid='issue_comment_' + url,
is_bot=is_bot,
)
def _parse_pull_request(d: Dict) -> Event:
url = d['url']
title = d['title']
is_bot = "[bot]" in d["user"]
return Event( # type: ignore[misc]
**_parse_common(d),
# TODO distinguish incoming/outgoing?
# TODO action? opened/closed??
summary=f'opened PR {title}',
eid='pull_request_' + url,
is_bot=is_bot,
)
def _parse_release(d: Dict) -> Event:
tag = d['tag_name']
return Event( # type: ignore[misc]
**_parse_common(d),
summary=f'released {tag}',
eid='release_' + tag,
)
def _parse_commit_comment(d: Dict) -> Event:
url = d['url']
return Event( # type: ignore[misc]
**_parse_common(d),
summary=f'commented on {url}',
eid='commoit_comment_' + url,
)
def _parse_event(d: Dict) -> Event:
summary, link, eid = _get_summary(d)
if eid is None:
eid = d['id']
body = d.get('payload', {}).get('comment', {}).get('body')
return Event(
dt=_parse_dt(d['created_at']),
summary=summary,
link=link,
eid=eid,
body=body,
)
def iter_gdpr_events() -> Iterator[Res[Event]]:
"""
Parses events from GDPR export (https://github.com/settings/admin)
"""
# TODO allow using archive here?
files = get_files(config.gdpr_dir, glob='*.json')
handler_map = {
'schema' : None,
'issue_events_': None, # eh, doesn't seem to have any useful bodies
'attachments_' : None, # not sure if useful
'users' : None, # just contains random users
'repositories_' : _parse_repository,
'issue_comments_': _parse_issue_comment,
'issues_' : _parse_issue,
'pull_requests_' : _parse_pull_request,
'releases_' : _parse_release,
'commit_comments': _parse_commit_comment,
}
for f in files:
handler: Any
for prefix, h in handler_map.items():
if not f.name.startswith(prefix):
continue
handler = h
break
else:
yield RuntimeError(f'Unhandled file: {f}')
continue
if handler is None:
# ignored
continue
j = json.loads(f.read_text())
for r in j:
try:
yield handler(r)
except Exception as e:
yield e
# TODO hmm. not good, need to be lazier?...
@mcachew(config.cache_dir, hashf=lambda dal: dal.sources)
def iter_backup_events(dal=_dal()) -> Iterator[Event]:
for d in dal.events():
yield _parse_event(d)
def iter_events() -> Iterator[Res[Event]]:
from itertools import chain
emitted: Set[Tuple[datetime, str]] = set()
for e in chain(iter_gdpr_events(), iter_backup_events()):
if isinstance(e, Exception):
yield e
continue
if e.is_bot:
continue
key = (e.dt, e.eid) # use both just in case
# TODO wtf?? some minor (e.g. 1 sec) discrepancies (e.g. create repository events)
if key in emitted:
logger.debug('ignoring %s: %s', key, e)
continue
yield e
emitted.add(key)
def get_events():
return sorted(iter_events(), key=lambda e: e.dt)
# TODO mm. ok, not much point in deserializing as github.Event as it's basically a fancy dict wrapper?
# from github.Event import Event as GEvent # type: ignore
# # see https://github.com/PyGithub/PyGithub/blob/master/github/GithubObject.py::GithubObject.__init__
# e = GEvent(None, None, raw_event, True)


@ -116,6 +116,7 @@ from ..kython.klogging import setup_logger, LazyLogger
Paths = Union[Sequence[PathIsh], PathIsh]
# TODO support '' for emtpy path
DEFAULT_GLOB = '*'
def get_files(pp: Paths, glob: str=DEFAULT_GLOB, sort: bool=True) -> Tuple[Path, ...]:
    """
@ -124,11 +125,16 @@ def get_files(pp: Paths, glob: str=DEFAULT_GLOB, sort: bool=True) -> Tuple[Path,
    Tuple as return type is a bit friendlier for hashing/caching, so hopefully makes sense
    """
    # TODO FIXME mm, some wrapper to assert iterator isn't empty?
    sources: List[Path] = []
    sources: List[Path]
    if isinstance(pp, (str, Path)):
    if isinstance(pp, Path):
        sources.append(Path(pp))
        sources = [pp]
    elif isinstance(pp, str):
        if pp == '':
            # special case -- makes sense for optional data sources, etc
            return () # early return to prevent warnings etc
        sources = [Path(pp)]
    else:
        sources.extend(map(Path, pp))
        sources = [Path(p) for p in pp]

    def caller() -> str:
        import traceback

my/github/all.py Normal file

@ -0,0 +1,21 @@
"""
Unified Github data (merged from GDPR export and periodic API updates)
"""
from . import gdpr, ghexport
from .common import merge_events, Results
def events() -> Results:
yield from merge_events(
gdpr.events(),
ghexport.events(),
)
# todo hmm. not sure, maybe should be named sorted_events or something..
# also, not great that it's in all.py... think of a better way...
def get_events() -> Results:
from ..core.error import sort_res_by
return sort_res_by(events(), key=lambda e: e.dt)
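
A minimal sketch of consuming the merged stream; errors are yielded as values rather than raised, so the caller decides what to do with them:

#+begin_src python
from my.github.all import get_events

for e in get_events():
    if isinstance(e, Exception):
        print('error:', e)
    else:
        print(e.dt, e.summary)
#+end_src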

my/github/common.py Normal file

@ -0,0 +1,52 @@
"""
Github events and their metadata: comments/issues/pull requests
"""
from datetime import datetime
from typing import Optional, NamedTuple, Iterable, Set, Tuple
import pytz
from ..core import warn_if_empty
from ..core.error import Res
class Event(NamedTuple):
dt: datetime
summary: str
eid: str
link: Optional[str]
body: Optional[str]=None
is_bot: bool = False
Results = Iterable[Res[Event]]
@warn_if_empty
def merge_events(*sources: Results) -> Results:
from ..kython.klogging import LazyLogger
logger = LazyLogger(__name__)
from itertools import chain
emitted: Set[Tuple[datetime, str]] = set()
for e in chain(*sources):
if isinstance(e, Exception):
yield e
continue
if e.is_bot:
continue
key = (e.dt, e.eid) # use both just in case
# TODO wtf?? some minor (e.g. 1 sec) discrepancies (e.g. create repository events)
if key in emitted:
logger.debug('ignoring %s: %s', key, e)
continue
yield e
emitted.add(key)
# todo use unique_everseen? Might be tricky with Exception etc..
def parse_dt(s: str) -> datetime:
# TODO isoformat?
return pytz.utc.localize(datetime.strptime(s, '%Y-%m-%dT%H:%M:%SZ'))
# TODO not sure
# def get_events() -> Iterable[Res[Event]]:
# return sort_res_by(events(), key=lambda e: e.dt)
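
For what it's worth, a tiny sketch of the dedup behaviour in =merge_events= (the event below is fabricated): the same ~(dt, eid)~ pair arriving from both sources is only emitted once.

#+begin_src python
from datetime import datetime
import pytz

from my.github.common import Event, merge_events

e = Event(dt=pytz.utc.localize(datetime(2020, 6, 1)), summary='created repo', eid='created_x', link=None)
assert len(list(merge_events([e], [e]))) == 1
#+end_src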

my/github/gdpr.py Normal file

@ -0,0 +1,143 @@
"""
Github data (uses [[https://github.com/settings/admin][official GDPR export]])
"""
from datetime import datetime
import json
from typing import Iterable, Dict, Any
from ..core.error import Res
from ..core import get_files
from .common import Event, parse_dt
# TODO later, use a separate user config? (github_gdpr)
from my.config import github as user_config
from dataclasses import dataclass
from ..core import PathIsh
@dataclass
class github(user_config):
gdpr_dir: PathIsh # path to unpacked GDPR archive
###
from ..core.cfg import make_config
config = make_config(github)
def events() -> Iterable[Res[Event]]:
# TODO FIXME allow using archive here?
files = get_files(config.gdpr_dir, glob='*.json')
handler_map = {
'schema' : None,
'issue_events_': None, # eh, doesn't seem to have any useful bodies
'attachments_' : None, # not sure if useful
'users' : None, # just contains random users
'repositories_' : _parse_repository,
'issue_comments_': _parse_issue_comment,
'issues_' : _parse_issue,
'pull_requests_' : _parse_pull_request,
'releases_' : _parse_release,
'commit_comments': _parse_commit_comment,
}
for f in files:
handler: Any
for prefix, h in handler_map.items():
if not f.name.startswith(prefix):
continue
handler = h
break
else:
yield RuntimeError(f'Unhandled file: {f}')
continue
if handler is None:
# ignored
continue
j = json.loads(f.read_text())
for r in j:
try:
yield handler(r)
except Exception as e:
yield e
# TODO typing.TypedDict could be handy here..
def _parse_common(d: Dict) -> Dict:
url = d['url']
body = d.get('body')
return {
'dt' : parse_dt(d['created_at']),
'link': url,
'body': body,
}
def _parse_repository(d: Dict) -> Event:
pref = 'https://github.com/'
url = d['url']
assert url.startswith(pref); name = url[len(pref):]
return Event( # type: ignore[misc]
**_parse_common(d),
summary='created ' + name,
eid='created_' + name, # TODO ??
)
def _parse_issue_comment(d: Dict) -> Event:
url = d['url']
is_bot = "[bot]" in d["user"]
return Event( # type: ignore[misc]
**_parse_common(d),
summary=f'commented on issue {url}',
eid='issue_comment_' + url,
is_bot=is_bot,
)
def _parse_issue(d: Dict) -> Event:
url = d['url']
title = d['title']
is_bot = "[bot]" in d["user"]
return Event( # type: ignore[misc]
**_parse_common(d),
summary=f'opened issue {title}',
eid='issue_comment_' + url,
is_bot=is_bot,
)
def _parse_pull_request(d: Dict) -> Event:
url = d['url']
title = d['title']
is_bot = "[bot]" in d["user"]
return Event( # type: ignore[misc]
**_parse_common(d),
# TODO distinguish incoming/outgoing?
# TODO action? opened/closed??
summary=f'opened PR {title}',
eid='pull_request_' + url,
is_bot=is_bot,
)
def _parse_release(d: Dict) -> Event:
tag = d['tag_name']
return Event( # type: ignore[misc]
**_parse_common(d),
summary=f'released {tag}',
eid='release_' + tag,
)
def _parse_commit_comment(d: Dict) -> Event:
url = d['url']
return Event( # type: ignore[misc]
**_parse_common(d),
summary=f'commented on {url}',
eid='commoit_comment_' + url,
)

my/github/ghexport.py Normal file

@ -0,0 +1,164 @@
"""
Github data: events, comments, etc. (API data)
"""
from dataclasses import dataclass
from typing import Optional
from ..core import Paths, PathIsh
from my.config import github as user_config
@dataclass
class github(user_config):
'''
Uses [[https://github.com/karlicoss/ghexport][ghexport]] outputs.
'''
# path[s]/glob to the exported JSON data
export_path: Paths
# path to a local clone of ghexport
# alternatively, you can put the repository (or a symlink) in $MY_CONFIG/my/config/repos/ghexport
ghexport : Optional[PathIsh] = None
# path to a cache directory
# if omitted, will use /tmp
cache_dir: Optional[PathIsh] = None
@property
def dal_module(self):
rpath = self.ghexport
if rpath is not None:
from .core.common import import_dir
return import_dir(rpath, '.dal')
else:
import my.config.repos.ghexport.dal as dal
return dal
###
# TODO perhaps using /tmp in case of None isn't ideal... maybe it should be treated as if cache is off
from ..core.cfg import make_config, Attrs
def migration(attrs: Attrs) -> Attrs:
if 'export_dir' in attrs: # legacy name
attrs['export_path'] = attrs['export_dir']
return attrs
config = make_config(github, migration=migration)
from typing import TYPE_CHECKING
if TYPE_CHECKING:
import my.config.repos.ghexport.dal as dal
else:
dal = config.dal_module
############################
from pathlib import Path
from typing import Tuple, Iterable, Dict, Sequence
from ..core import get_files
from ..core.common import mcachew
from ..kython.kompress import CPath
from .common import Event, parse_dt, Results
def inputs() -> Sequence[Path]:
return get_files(config.export_path)
def _dal() -> dal.DAL:
sources = inputs()
sources = list(map(CPath, sources)) # TODO maybe move it to get_files? e.g. compressed=True arg?
return dal.DAL(sources)
# TODO hmm. not good, need to be lazier?...
@mcachew(config.cache_dir, hashf=lambda dal: dal.sources)
def events(dal=_dal()) -> Results:
for d in dal.events():
yield _parse_event(d)
# TODO hmm. need some sort of abstract syntax for this...
# TODO split further, title too
def _get_summary(e) -> Tuple[str, Optional[str], Optional[str]]:
# TODO would be nice to give access to raw event withing timeline
eid = e['id']
tp = e['type']
pl = e['payload']
rname = e['repo']['name']
mapping = {
'CreateEvent': 'created',
'DeleteEvent': 'deleted',
}
if tp == 'ForkEvent':
url = e['payload']['forkee']['html_url']
return f"{rname}: forked", url, None
elif tp == 'PushEvent':
commits = pl['commits']
messages = [c['message'] for c in commits]
body = '\n'.join(messages)
return f"{rname}: pushed\n{body}", None, None
elif tp == 'WatchEvent':
return f"{rname}: watching", None, None
elif tp in mapping:
what = mapping[tp]
rt = pl['ref_type']
ref = pl['ref']
# TODO link to branch? only contains weird API link though
# TODO hmm. include timestamp instead?
# breakpoint()
# TODO combine automatically instead
return f"{rname}: {what} {rt} {ref}", None, f'{rname}_{what}_{rt}_{ref}_{eid}'
elif tp == 'PullRequestEvent':
pr = pl['pull_request']
action = pl['action']
link = pr['html_url']
title = pr['title']
return f"{rname}: {action} PR {title}", link, f'{rname}_{action}_pr_{link}'
elif tp == "IssuesEvent":
action = pl['action']
iss = pl['issue']
link = iss['html_url']
title = iss['title']
return f"{rname}: {action} issue {title}", link, None
elif tp == "IssueCommentEvent":
com = pl['comment']
link = com['html_url']
iss = pl['issue']
title = iss['title']
return f"{rname}: commented on issue {title}", link, f'issue_comment_' + link
elif tp == "ReleaseEvent":
action = pl['action']
rel = pl['release']
tag = rel['tag_name']
link = rel['html_url']
return f"{rname}: {action} [{tag}]", link, None
elif tp in 'PublicEvent':
return f'{tp} {e}', None, None # TODO ???
else:
return tp, None, None
def _parse_event(d: Dict) -> Event:
summary, link, eid = _get_summary(d)
if eid is None:
eid = d['id']
body = d.get('payload', {}).get('comment', {}).get('body')
return Event(
dt=parse_dt(d['created_at']),
summary=summary,
link=link,
eid=eid,
body=body,
)
# TODO mm. ok, not much point in deserializing as github.Event as it's basically a fancy dict wrapper?
# from github.Event import Event as GEvent # type: ignore
# # see https://github.com/PyGithub/PyGithub/blob/master/github/GithubObject.py::GithubObject.__init__
# e = GEvent(None, None, raw_event, True)


@ -1,6 +1,6 @@
#!/usr/bin/env python3
from datetime import datetime
from typing import NamedTuple, List
from typing import NamedTuple, List, Iterable
from ..google.takeout.html import read_html
from ..google.takeout.paths import get_last_takeout
@ -16,7 +16,7 @@ class Watched(NamedTuple):
        return f'{self.url}-{self.when.isoformat()}'

def get_watched():
def watched() -> Iterable[Watched]:
    # TODO need to use a glob? to make up for old takouts that didn't start with Takeout/
    path = 'Takeout/My Activity/YouTube/MyActivity.html' # looks like this one doesn't have retention? so enough to use the last
    # TODO YouTube/history/watch-history.html, also YouTube/history/watch-history.json
@ -30,6 +30,10 @@ def get_watched():
    return list(sorted(watches, key=lambda e: e.when))
# todo deprecate
get_watched = watched
def main():
    # TODO shit. a LOT of watches...
    for w in get_watched():


@ -7,13 +7,13 @@ from . import twint, archive
from .common import merge_tweets

def tweets():
    yield from merge_tweets(
        twint .tweets(),
        archive.tweets(),
    )
from .common import merge_tweets
def likes():
    yield from merge_tweets(


@ -18,9 +18,8 @@ except ImportError as e:
from dataclasses import dataclass
from ..core.common import Paths
from ..core import Paths
# TODO perhaps rename to twitter_archive? dunno
@dataclass
class twitter_archive(user_config):
    export_path: Paths # path[s]/glob to the twitter archive takeout


@ -14,6 +14,7 @@ from my.config import twint as user_config
class twint(user_config):
    export_path: Paths # path[s]/glob to the twint Sqlite database

####
from ..core.cfg import make_config
config = make_config(twint)


@ -102,6 +102,9 @@ def test_no_files():
    '''
    Test for empty matches. They work, but should result in warning
    '''
    assert get_files('') == ()
    # todo test these for warnings?
    assert get_files([]) == ()
    assert get_files('bad*glob') == ()


@ -1,8 +1,16 @@
#!/usr/bin/env python3
from more_itertools import ilen

from my.coding.github import get_events


def test_gdpr():
    import my.github.gdpr as gdpr
    assert ilen(gdpr.events()) > 100


def test():
    events = get_events()
    assert len(events) > 100
    assert ilen(events) > 100
    for e in events:
        print(e)


@ -1,5 +1,4 @@
# TODO move elsewhere?
# these tests would only make sense with some existing data? although some of them would work for everyone..
# not sure what's a good way of handling this..
@ -7,7 +6,7 @@ from my.media.youtube import get_watched, Watched
def test():
    watched = get_watched()
    watched = list(get_watched())
    assert len(watched) > 1000
    from datetime import datetime