Merge pull request #37 from karlicoss/updates

various updates: implicit globs for get-files, mcachew type checking, modules cleanup
karlicoss, 2020-05-03 17:19:55 +01:00 (committed via GitHub)
commit 5aecc037e9
16 changed files with 285 additions and 170 deletions
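The headline change is to get_files: glob now defaults to '*', a '*' inside the path itself is expanded, and the result is a tuple. A quick sketch of the call patterns this enables (paths hypothetical):

from my.common import get_files

get_files('/data/exports')                 # directory: every file inside, sorted, as a tuple
get_files('/data/exports', glob='*.json')  # directory + explicit glob, as before
get_files('/data/exports/*.json.xz')       # new: '*' in the path triggers globbing implicitly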


@ -20,8 +20,7 @@ from my.config import github as config
import my.config.repos.ghexport.dal as ghexport
logger = LazyLogger('my.github')
# TODO __package__???
logger = LazyLogger(__name__)
class Event(NamedTuple):
@ -32,56 +31,75 @@ class Event(NamedTuple):
body: Optional[str]=None
# TODO hmm. need some sort of abstract syntax for this...
# TODO split further, title too
def _get_summary(e) -> Tuple[str, Optional[str], Optional[str]]:
# TODO would be nice to give access to raw event within timeline
eid = e['id']
tp = e['type']
pl = e['payload']
rname = e['repo']['name']
mapping = {
'CreateEvent': 'created',
'DeleteEvent': 'deleted',
}
if tp == 'ForkEvent':
url = e['payload']['forkee']['html_url']
return f"forked {rname}", url, None
return f"{rname}: forked", url, None
elif tp == 'PushEvent':
return f"pushed to {rname}", None, None
commits = pl['commits']
messages = [c['message'] for c in commits]
body = '\n'.join(messages)
return f"{rname}: pushed\n{body}", None, None
elif tp == 'WatchEvent':
return f"watching {rname}", None, None
elif tp == 'CreateEvent':
# TODO eh, only weird API link?
return f"created {rname}", None, f'created_{rname}'
return f"{rname}: watching", None, None
elif tp in mapping:
what = mapping[tp]
rt = pl['ref_type']
ref = pl['ref']
# TODO link to branch? only contains weird API link though
# TODO hmm. include timestamp instead?
# breakpoint()
# TODO combine automatically instead
return f"{rname}: {what} {rt} {ref}", None, f'{rname}_{what}_{rt}_{ref}_{eid}'
elif tp == 'PullRequestEvent':
pr = pl['pull_request']
action = pl['action']
link = pr['html_url']
title = pr['title']
return f"{action} PR {title}", link, f'pull_request_{link}'
return f"{rname}: {action} PR {title}", link, f'{rname}_{action}_pr_{link}'
elif tp == "IssuesEvent":
action = pl['action']
iss = pl['issue']
link = iss['html_url']
title = iss['title']
return f"{action} issue {title}", link, None
return f"{rname}: {action} issue {title}", link, None
elif tp == "IssueCommentEvent":
com = pl['comment']
link = com['html_url']
iss = pl['issue']
title = iss['title']
return f"commented on issue {title}", link, f'issue_comment_' + link
return f"{rname}: commented on issue {title}", link, f'issue_comment_{link}'
elif tp == "ReleaseEvent":
action = pl['action']
rel = pl['release']
tag = rel['tag_name']
link = rel['html_url']
return f"{action} {rname} [{tag}]", link, None
elif tp in (
"DeleteEvent",
"PublicEvent",
):
return tp, None, None # TODO ???
return f"{rname}: {action} [{tag}]", link, None
elif tp == 'PublicEvent':
return f'{tp} {e}', None, None # TODO ???
else:
return tp, None, None
def get_dal():
sources = get_files(config.export_dir, glob='*.json*')
def inputs():
return get_files(config.export_dir, glob='*.json*')
def _dal():
sources = inputs()
sources = list(map(CPath, sources)) # TODO maybe move it to get_files? e.g. compressed=True arg?
return ghexport.DAL(sources)
@ -218,7 +236,7 @@ def iter_gdpr_events() -> Iterator[Res[Event]]:
# TODO hmm. not good, need to be lazier?...
@mcachew(config.cache_dir, hashf=lambda dal: dal.sources)
def iter_backup_events(dal=get_dal()) -> Iterator[Event]:
def iter_backup_events(dal=_dal()) -> Iterator[Event]:
for d in dal.events():
yield _parse_event(d)


@ -1,7 +1,9 @@
from glob import glob as do_glob
from pathlib import Path
import functools
import types
from typing import Union, Callable, Dict, Iterable, TypeVar, Sequence, List, Optional, Any, cast
from typing import Union, Callable, Dict, Iterable, TypeVar, Sequence, List, Optional, Any, cast, Tuple
import warnings
from . import init
@ -46,6 +48,7 @@ def the(l: Iterable[T]) -> T:
return first
# TODO more_itertools.bucket?
def group_by_key(l: Iterable[T], key: Callable[[T], K]) -> Dict[K, List[T]]:
res: Dict[K, List[T]] = {}
for i in l:
@ -106,9 +109,12 @@ from .kython.klogging import setup_logger, LazyLogger
Paths = Union[Sequence[PathIsh], PathIsh]
def get_files(pp: Paths, glob: str, sort: bool=True) -> List[Path]:
DEFAULT_GLOB = '*'
def get_files(pp: Paths, glob: str=DEFAULT_GLOB, sort: bool=True) -> Tuple[Path, ...]:
"""
Helper function to avoid boilerplate.
Tuple as return type is a bit friendlier for hashing/caching, so hopefully it makes sense
"""
# TODO FIXME mm, some wrapper to assert iterator isn't empty?
sources: List[Path] = []
@ -122,17 +128,38 @@ def get_files(pp: Paths, glob: str, sort: bool=True) -> List[Path]:
if src.is_dir():
gp: Iterable[Path] = src.glob(glob)
paths.extend(gp)
else:
ss = str(src)
if '*' in ss:
if glob != DEFAULT_GLOB:
warnings.warn(f"Treating {ss} as glob path. Explicit glob={glob} argument is ignored!")
paths.extend(map(Path, do_glob(ss)))
else:
assert src.is_file(), src
# TODO FIXME assert matches glob??
# todo assert matches glob??
paths.append(src)
if sort:
paths = list(sorted(paths))
return paths
return tuple(paths)
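Illustrating the branch above: a path containing '*' is handed to glob.glob directly, and an explicit glob= passed alongside it is ignored with a warning (paths hypothetical):

from my.common import get_files

files = get_files('/data/exports/*.json', glob='*.xz')
# UserWarning: Treating /data/exports/*.json as glob path. Explicit glob=*.xz argument is ignored!
# files is a sorted tuple of pathlib.Path -- tuples being hashable is what
# makes the result friendlier for caching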
def mcachew(*args, **kwargs):
# TODO annotate it, perhaps use 'dependent' type (for @doublewrap stuff)
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from typing import Callable, TypeVar
from typing_extensions import Protocol
# TODO reuse types from cachew? although not sure if we want hard dependency on it in typecheck time..
# I guess, later just define pass through once this is fixed: https://github.com/python/typing/issues/270
# ok, that's actually a super nice 'pattern'
F = TypeVar('F')
class McachewType(Protocol):
def __call__(self, cache_path: Any=None, *, hashf: Any=None, chunk_by: int=0, logger: Any=None) -> Callable[[F], F]:
...
mcachew: McachewType
def mcachew(*args, **kwargs): # type: ignore[no-redef]
"""
Stands for 'Maybe cachew'.
Defensive wrapper around @cachew to make it an optional dependency.
@ -140,7 +167,6 @@ def mcachew(*args, **kwargs):
try:
import cachew
except ModuleNotFoundError:
import warnings
warnings.warn('cachew library not found. You might want to install it to speed things up. See https://github.com/karlicoss/cachew')
return lambda orig_func: orig_func
else:
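The effect of the TYPE_CHECKING block: at runtime mcachew stays a plain *args/**kwargs wrapper, but mypy sees it through the McachewType protocol, so decorated functions keep their signatures. A minimal sketch (names hypothetical; cachew itself expects functions returning iterators of supported types such as NamedTuples):

from typing import Iterator, NamedTuple
from my.common import mcachew

class Item(NamedTuple):
    value: int

@mcachew(cache_path='/tmp/hpi_demo_cache')
def items() -> Iterator[Item]:
    yield Item(1)

# mypy still types items as Callable[[], Iterator[Item]], because
# McachewType.__call__ is declared to return Callable[[F], F]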


@ -5,26 +5,21 @@
Consumes data exported by https://github.com/karlicoss/backup-emfit
"""
import json
import logging
from collections import OrderedDict as odict
from dataclasses import dataclass
from datetime import date, datetime, time, timedelta
from itertools import groupby
from pathlib import Path
from typing import Dict, Iterator, List, NamedTuple, Any, cast
import pytz
from more_itertools import bucket
from ..common import get_files, LazyLogger, cproperty, group_by_key, mcachew
from ..common import get_files, LazyLogger, cproperty, mcachew
from my.config import emfit as config
logger = LazyLogger('my.emfit', level='info')
# TODO FIXME remove?
import kython
timed = lambda f: kython.timed(f, logger=logger)
logger = LazyLogger(__name__, level='info')
def hhmm(minutes):
@ -35,13 +30,10 @@ AWAKE = 4
Sid = str
# TODO use tz provider for that?
_TZ = pytz.timezone(config.tz)
# TODO use common tz thing?
def fromts(ts) -> datetime:
dt = datetime.fromtimestamp(ts)
return _TZ.localize(dt)
dt = datetime.fromtimestamp(ts, tz=pytz.utc)
return dt
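The behaviour change is subtle: the old code converted the epoch to local wall time and then stamped config.tz onto it, which is only right when the machine's zone happens to match config.tz; the new code produces an aware UTC datetime directly. For example:

from datetime import datetime
import pytz

ts = 1588521595  # arbitrary epoch seconds
dt = datetime.fromtimestamp(ts, tz=pytz.utc)
assert dt.tzinfo is not None  # aware, unambiguous; callers convert zones as needed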
class Mixin:
@ -295,14 +287,14 @@ class Emfit(Mixin):
# TODO move to common?
def dir_hash(path: Path):
mtimes = tuple(p.stat().st_mtime for p in sorted(path.glob('*.json')))
mtimes = tuple(p.stat().st_mtime for p in get_files(path, glob='*.json'))
return mtimes
# TODO take __file__ into account somehow?
@mcachew(cache_path=config.cache_path, hashf=dir_hash, logger=logger)
def iter_datas_cached(path: Path) -> Iterator[Emfit]:
# TODO use get_files?
for f in sorted(path.glob('*.json')):
def iter_datas(path: Path=config.export_path) -> Iterator[Emfit]:
for f in get_files(path, glob='*.json'):
sid = f.stem
if sid in config.excluded_sids:
continue
@ -311,20 +303,17 @@ def iter_datas_cached(path: Path) -> Iterator[Emfit]:
yield from Emfit.make(em)
def iter_datas(path=config.export_path) -> Iterator[Emfit]:
yield from iter_datas_cached(path)
def get_datas() -> List[Emfit]:
return list(sorted(iter_datas(), key=lambda e: e.start))
# TODO move away old entries if there is a diff??
@timed
def by_night() -> Dict[date, Emfit]:
res: Dict[date, Emfit] = odict()
res: Dict[date, Emfit] = {}
# TODO shit. I need some sort of interrupted sleep detection?
for dd, sleeps in group_by_key(get_datas(), key=lambda s: s.date).items():
grouped = bucket(get_datas(), key=lambda s: s.date)
for dd in grouped:
sleeps = list(grouped[dd])
if len(sleeps) > 1:
logger.warning("multiple sleeps per night, not handled yet: %s", sleeps)
continue
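For reference, more_itertools.bucket (replacing group_by_key here) partitions an iterable lazily: iterating the bucket yields the distinct keys, indexing yields that key's items. A tiny sketch (data hypothetical):

from more_itertools import bucket

records = ['2020-05-01/a', '2020-05-01/b', '2020-05-02/c']
grouped = bucket(records, key=lambda s: s.split('/')[0])
for day in grouped:             # '2020-05-01', '2020-05-02'
    items = list(grouped[day])  # e.g. ['2020-05-01/a', '2020-05-01/b']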


@ -15,10 +15,10 @@ from .common import get_files, LazyLogger
from my.config import foursquare as config
logger = LazyLogger(__package__)
logger = LazyLogger(__name__)
def _get_exports() -> List[Path]:
def inputs():
return get_files(config.export_path, '*.json')
@ -62,7 +62,7 @@ class Place:
def get_raw(fname=None):
if fname is None:
fname = max(_get_exports())
fname = max(inputs())
j = json.loads(Path(fname).read_text())
assert isinstance(j, list)


@ -3,50 +3,41 @@
"""
from . import init
from .common import PathIsh
import my.config.repos.hypexport as hypexport
from my.config.repos.hypexport import dal
from .common import get_files
from .error import Res, sort_res_by
import my.config.repos.hypexport.dal as hypexport
from my.config import hypothesis as config
export_path: PathIsh = config.export_path
###
from typing import List
from .common import get_files, cproperty, group_by_key
from .error import Res, sort_res_by
# TODO weird. not sure why e.g. from dal import Highlight doesn't work..
Highlight = dal.Highlight
DAL = dal.DAL
Page = dal.Page
Highlight = hypexport.Highlight
Page = hypexport.Page
# TODO eh. not sure if I should rename everything to dao/DAO or not...
def dao() -> DAL:
sources = get_files(export_path, '*.json')
model = DAL(sources)
return model
def _dal() -> hypexport.DAL:
sources = get_files(config.export_path, '*.json')
return hypexport.DAL(sources)
def get_highlights() -> List[Res[Highlight]]:
return sort_res_by(dao().highlights(), key=lambda h: h.created)
def highlights() -> List[Res[Highlight]]:
return sort_res_by(_dal().highlights(), key=lambda h: h.created)
# TODO eh. always provide iterators? although sort_res_by could be neat too...
def get_pages() -> List[Res[Page]]:
return sort_res_by(dao().pages(), key=lambda h: h.created)
def pages() -> List[Res[Page]]:
return sort_res_by(_dal().pages(), key=lambda h: h.created)
# TODO move to side tests?
def test():
get_pages()
get_highlights()
list(pages())
list(highlights())
def _main():
@ -55,3 +46,6 @@ def _main():
if __name__ == '__main__':
_main()
get_highlights = highlights # TODO deprecate
get_pages = pages # TODO deprecate
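The bare aliases keep old call sites working silently; if the TODO'd deprecation were made loud, a shim along these lines would do it (hypothetical, not part of the commit):

import warnings

def get_highlights():
    warnings.warn('get_highlights() is deprecated, use highlights()', DeprecationWarning)
    return highlights()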


@ -1,55 +1,32 @@
"""
Instapaper bookmarks, highlights and annotations
"""
from pathlib import Path
from typing import NamedTuple, Optional, List, Iterator
from .common import group_by_key, PathIsh, get_files
from .common import get_files
from my.config import instapaper as config
import my.config.repos.instapexport.dal as dal
def _get_files():
return get_files(config.export_path, glob='*.json')
Highlight = dal.Highlight
Bookmark = dal.Bookmark
def get_dal() -> dal.DAL:
return dal.DAL(_get_files())
def inputs():
return get_files(config.export_path)
# TODO meh, come up with better name...
class HighlightWithBm(NamedTuple):
highlight: dal.Highlight
bookmark: dal.Bookmark
def _dal() -> dal.DAL:
return dal.DAL(inputs())
def iter_highlights(**kwargs) -> Iterator[HighlightWithBm]:
# meh...
dl = get_dal()
hls = dl.highlights()
bms = dl.bookmarks()
for _, h in hls.items():
yield HighlightWithBm(highlight=h, bookmark=bms[h.bid])
def pages():
return _dal().pages()
get_pages = pages # todo also deprecate..
# def get_highlights(**kwargs) -> List[Highlight]:
# return list(iter_highlights(**kwargs))
def get_pages():
return get_dal().pages()
def get_todos() -> Iterator[HighlightWithBm]:
def is_todo(hl: HighlightWithBm):
h = hl.highlight
note = h.note or ''
# TODO dunno, move this to private?
def is_todo(hl: Highlight) -> bool:
note = hl.note or ''
note = note.lstrip().lower()
return note.startswith('todo')
return filter(is_todo, iter_highlights())
def main():
for h in get_todos():
print(h)


@ -2,27 +2,31 @@
Last.fm scrobbles
'''
from .. import init
from functools import lru_cache
from typing import NamedTuple, Dict, Any
from ..common import get_files, mcachew, Json
from datetime import datetime
from pathlib import Path
import json
from pathlib import Path
from typing import NamedTuple, Any, Sequence, Iterable
import pytz
from my.config import lastfm as config
# TODO Json type?
# TODO memoised properties?
# TODO lazy mode and eager mode?
# lazy is a bit nicer in terms of more flexibility and less processing?
# eager is a bit more explicit for error handling
class Scrobble(NamedTuple):
raw: Dict[str, Any]
def inputs() -> Sequence[Path]:
return get_files(config.export_path)
class Scrobble(NamedTuple):
raw: Json
# TODO mm, no timezone? hopefully it's UTC
@property
def dt(self) -> datetime:
ts = int(self.raw['date'])
@ -45,22 +49,10 @@ class Scrobble(NamedTuple):
# TODO could also be nice to make generic? maybe even depending on eagerness
# TODO memoise...?
# TODO watch out, if we keep the app running it might expire
def _iter_scrobbles():
# TODO use get_files
last = max(Path(config.export_path).glob('*.json'))
# TODO mm, no timezone? hopefully it's UTC
@mcachew(hashf=lambda: inputs())
def scrobbles() -> Iterable[Scrobble]:
last = max(inputs())
j = json.loads(last.read_text())
for raw in j:
yield Scrobble(raw=raw)
@lru_cache(1)
def get_scrobbles():
return list(sorted(_iter_scrobbles(), key=lambda s: s.dt))
def test():
assert len(get_scrobbles()) > 1000


@ -1,11 +1,11 @@
#!/usr/bin/env python3
# pip install influxdb
from influxdb import InfluxDBClient # type: ignore
from my.lastfm import get_scrobbles
from my.lastfm import scrobbles
def main():
scrobbles = get_scrobbles()
def main() -> None:
scrs = scrobbles()  # note: 'scrobbles = scrobbles()' would shadow the import and raise UnboundLocalError
client = InfluxDBClient()
# TODO client.create_database('lastfm')
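Presumably the script goes on to write the scrobbles out; a hedged sketch of what that could look like with the influxdb client (measurement and field names are assumptions, not taken from the script):

points = [{
    'measurement': 'scrobble',
    'time': s.dt.isoformat(),
    'fields': {'artist': s.raw.get('artist', '')},
} for s in scrs]
client.write_points(points, database='lastfm')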


@ -17,7 +17,7 @@ from ..error import Res
from my.config import photos as config
log = LazyLogger('my.photos')
log = LazyLogger(__name__)
@ -46,13 +46,12 @@ class Photo(NamedTuple):
raise RuntimeError(f"Weird path {self.path}, can't match against anything")
@property
def linkname(self) -> str:
def name(self) -> str:
return self._basename.strip('/')
@property
def url(self) -> str:
PHOTOS_URL = 'TODO FIXME'
return PHOTOS_URL + self._basename
return f'{config.base_url}{self._basename}'
from .utils import get_exif_from_file, ExifTags, Exif, dt_from_path, convert_ref


@ -8,7 +8,6 @@ from typing import List, Dict, Iterator, NamedTuple, Sequence, Optional
import json
import pytz
# TODO declare DEPENDS = [pytz??]
from ..common import LazyLogger, get_files


@ -15,13 +15,14 @@ import my.config.repos.rexport.dal as rexport
def get_sources() -> Sequence[Path]:
# TODO use zstd?
# TODO maybe add assert to get_files? (and allow to suppress it)
files = get_files(config.export_dir, glob='*.json.xz')
# TODO rename to export_path?
files = get_files(config.export_dir)
res = list(map(CPath, files)); assert len(res) > 0
# todo move the assert to get_files?
return tuple(res)
logger = LazyLogger(__package__, level='debug')
logger = LazyLogger(__name__, level='debug')
Sid = rexport.Sid
@ -31,7 +32,7 @@ Submission = rexport.Submission
Upvote = rexport.Upvote
def dal():
def dal() -> rexport.DAL:
# TODO lru cache? but be careful when it runs continuously
return rexport.DAL(get_sources())
@ -173,12 +174,12 @@ def get_events(*args, **kwargs) -> List[Event]:
return list(sorted(evit, key=lambda e: e.cmp_key))
def test():
def test() -> None:
get_events(backups=get_sources()[-1:])
list(saved())
def test_unfav():
def test_unfav() -> None:
events = get_events()
url = 'https://reddit.com/r/QuantifiedSelf/comments/acxy1v/personal_dashboard/'
uevents = [e for e in events if e.url == url]
@ -188,15 +189,15 @@ def test_unfav():
uf = uevents[1]
assert uf.text == 'unfavorited'
def test_get_all_saves():
# TODO move out..
def test_get_all_saves() -> None:
# TODO not sure if this is necessary anymore?
saves = list(saved())
# just check that they are unique..
make_dict(saves, key=lambda s: s.sid)
def test_disappearing():
def test_disappearing() -> None:
# eh. so for instance, 'metro line colors' is missing from reddit-20190402005024.json for no reason
# but I guess it was just a short glitch... so whatever
saves = get_events()
@ -205,14 +206,14 @@ def test_disappearing():
assert deal_with_it.backup_dt == datetime(2019, 4, 1, 23, 10, 25, tzinfo=pytz.utc)
def test_unfavorite():
def test_unfavorite() -> None:
events = get_events()
unfavs = [s for s in events if s.text == 'unfavorited']
[xxx] = [u for u in unfavs if u.eid == 'unf-19ifop']
assert xxx.dt == datetime(2019, 1, 28, 8, 10, 20, tzinfo=pytz.utc)
def main():
def main() -> None:
# TODO eh. not sure why but parallel on seems to mess glumov up and cause OOM...
events = get_events(parallel=False)
print(len(events))


@ -18,7 +18,7 @@ from my.config import rescuetime as config
log = LazyLogger(__package__, level='info')
def _get_exports() -> List[Path]:
def inputs():
return get_files(config.export_path, '*.json')
@ -28,7 +28,7 @@ Model = rescuexport.Model
# TODO cache?
def get_model(last=0) -> Model:
return Model(_get_exports()[-last:])
return Model(inputs()[-last:])
def _without_errors():


@ -4,8 +4,9 @@
from setuptools import setup, find_namespace_packages # type: ignore
INSTALL_REQUIRES = [
'appdirs',
'pytz', # even though it's not needed by the core, it's so common anyway...
'appdirs', # very common, and makes it portable
'more-itertools', # it's just too useful and very common anyway
]

tests/common.py (new file, 113 additions)

@ -0,0 +1,113 @@
from pathlib import Path
from my.common import get_files
import pytest # type: ignore
def test_single_file():
'''
Regular file path is just returned as is.
'''
"Exception if it doesn't exist"
with pytest.raises(Exception):
get_files('/tmp/hpi_test/file.ext')
create('/tmp/hpi_test/file.ext')
'''
Couple of things:
1. Return type is a tuple, it's friendlier for hashing/caching
2. It always returns pathlib.Path instead of plain strings
'''
assert get_files('/tmp/hpi_test/file.ext') == (
Path('/tmp/hpi_test/file.ext'),
)
def test_multiple_files():
'''
If you pass a directory/multiple directories, it flattens the contents
'''
create('/tmp/hpi_test/dir1/')
create('/tmp/hpi_test/dir1/zzz')
create('/tmp/hpi_test/dir1/yyy')
# create('/tmp/hpi_test/dir1/whatever/') # TODO not sure about this... should really allow extra dirs
create('/tmp/hpi_test/dir2/')
create('/tmp/hpi_test/dir2/mmm')
create('/tmp/hpi_test/dir2/nnn')
create('/tmp/hpi_test/dir3/')
create('/tmp/hpi_test/dir3/ttt')
assert get_files([
Path('/tmp/hpi_test/dir3'), # it takes in Path as well as str
'/tmp/hpi_test/dir1',
]) == (
# the paths are always returned in sorted order (unless you pass sort=False)
Path('/tmp/hpi_test/dir1/yyy'),
Path('/tmp/hpi_test/dir1/zzz'),
Path('/tmp/hpi_test/dir3/ttt'),
)
def test_explicit_glob():
'''
You can pass a glob to restrict the extensions
'''
create('/tmp/hpi_test/file_3.zip')
create('/tmp/hpi_test/file_2.zip')
create('/tmp/hpi_test/ignoreme')
create('/tmp/hpi_test/file.zip')
# todo walrus operator would be great here...
expected = (
Path('/tmp/hpi_test/file_2.zip'),
Path('/tmp/hpi_test/file_3.zip'),
)
assert get_files('/tmp/hpi_test', 'file_*.zip') == expected
"named argument should work too"
assert get_files('/tmp/hpi_test', glob='file_*.zip') == expected
def test_implicit_glob():
'''
Asterisk in the path results in globbing too.
'''
# todo hopefully that makes sense? dunno why anyone would actually rely on asterisks in names..
# this is very convenient in configs, so people don't have to use some special types
create('/tmp/hpi_test/123/')
create('/tmp/hpi_test/123/dummy')
create('/tmp/hpi_test/123/file.zip')
create('/tmp/hpi_test/456/')
create('/tmp/hpi_test/456/dummy')
create('/tmp/hpi_test/456/file.zip')
assert get_files(['/tmp/hpi_test/*/*.zip']) == (
Path('/tmp/hpi_test/123/file.zip'),
Path('/tmp/hpi_test/456/file.zip'),
)
# TODO not sure if should uniquify if the filenames end up same?
# TODO not sure about the symlinks? and hidden files?
test_path = Path('/tmp/hpi_test')
def setup():
teardown()
test_path.mkdir()
def teardown():
import shutil
if test_path.is_dir():
shutil.rmtree(test_path)
def create(f: str) -> None:
if f.endswith('/'):
Path(f).mkdir()
else:
Path(f).touch()


@ -1,6 +1,5 @@
from my.instapaper import get_todos
from my.instapaper import pages
def test_get_todos():
for t in get_todos():
print(t)
def test_pages():
assert len(list(pages())) > 3

tests/lastfm.py (new file, 7 additions)

@ -0,0 +1,7 @@
from more_itertools import ilen
from my.lastfm import scrobbles
def test():
assert ilen(scrobbles()) > 1000