From 4244f403ed7206b95dbf77a1482720aed09ee986 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sun, 3 May 2020 08:22:15 +0100 Subject: [PATCH 01/11] simplify instapaper module --- my/instapaper.py | 51 +++++++++++++----------------------------------- 1 file changed, 14 insertions(+), 37 deletions(-) diff --git a/my/instapaper.py b/my/instapaper.py index aa70527..1ad402a 100644 --- a/my/instapaper.py +++ b/my/instapaper.py @@ -1,55 +1,32 @@ """ Instapaper bookmarks, highlights and annotations """ -from pathlib import Path -from typing import NamedTuple, Optional, List, Iterator - -from .common import group_by_key, PathIsh, get_files +from .common import get_files from my.config import instapaper as config import my.config.repos.instapexport.dal as dal +Highlight = dal.Highlight +Bookmark = dal.Bookmark + + def _get_files(): return get_files(config.export_path, glob='*.json') -def get_dal() -> dal.DAL: +def _dal() -> dal.DAL: return dal.DAL(_get_files()) -# TODO meh, come up with better name... -class HighlightWithBm(NamedTuple): - highlight: dal.Highlight - bookmark: dal.Bookmark +def pages(): + return _dal().pages() +get_pages = pages # todo also deprecate.. -def iter_highlights(**kwargs) -> Iterator[HighlightWithBm]: - # meh... - dl = get_dal() - hls = dl.highlights() - bms = dl.bookmarks() - for _, h in hls.items(): - yield HighlightWithBm(highlight=h, bookmark=bms[h.bid]) - - -# def get_highlights(**kwargs) -> List[Highlight]: -# return list(iter_highlights(**kwargs)) -def get_pages(): - return get_dal().pages() - - - -def get_todos() -> Iterator[HighlightWithBm]: - def is_todo(hl: HighlightWithBm): - h = hl.highlight - note = h.note or '' - note = note.lstrip().lower() - return note.startswith('todo') - return filter(is_todo, iter_highlights()) - - -def main(): - for h in get_todos(): - print(h) +# TODO dunno, move this to private? 
+def is_todo(hl: Highlight) -> bool: + note = hl.note or '' + note = note.lstrip().lower() + return note.startswith('todo') From a521885aa09b37044702ed07a7a45f5b6581536d Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sun, 3 May 2020 09:29:23 +0100 Subject: [PATCH 02/11] prettify github extractors --- my/coding/github.py | 58 +++++++++++++++++++++++++++++---------------- my/instapaper.py | 4 ++-- 2 files changed, 40 insertions(+), 22 deletions(-) diff --git a/my/coding/github.py b/my/coding/github.py index 735ee93..15bfab6 100644 --- a/my/coding/github.py +++ b/my/coding/github.py @@ -20,8 +20,7 @@ from my.config import github as config import my.config.repos.ghexport.dal as ghexport -logger = LazyLogger('my.github') -# TODO __package__??? +logger = LazyLogger(__name__) class Event(NamedTuple): @@ -32,56 +31,75 @@ class Event(NamedTuple): body: Optional[str]=None +# TODO hmm. need some sort of abstract syntax for this... # TODO split further, title too def _get_summary(e) -> Tuple[str, Optional[str], Optional[str]]: + # TODO would be nice to give access to raw event withing timeline + eid = e['id'] tp = e['type'] pl = e['payload'] rname = e['repo']['name'] + + mapping = { + 'CreateEvent': 'created', + 'DeleteEvent': 'deleted', + } + if tp == 'ForkEvent': url = e['payload']['forkee']['html_url'] - return f"forked {rname}", url, None + return f"{rname}: forked", url, None elif tp == 'PushEvent': - return f"pushed to {rname}", None, None + commits = pl['commits'] + messages = [c['message'] for c in commits] + body = '\n'.join(messages) + return f"{rname}: pushed\n{body}", None, None elif tp == 'WatchEvent': - return f"watching {rname}", None, None - elif tp == 'CreateEvent': - # TODO eh, only weird API link? - return f"created {rname}", None, f'created_{rname}' + return f"{rname}: watching", None, None + elif tp in mapping: + what = mapping[tp] + rt = pl['ref_type'] + ref = pl['ref'] + # TODO link to branch? only contains weird API link though + # TODO hmm. 
include timestamp instead? + # breakpoint() + # TODO combine automatically instead + return f"{rname}: {what} {rt} {ref}", None, f'{rname}_{what}_{rt}_{ref}_{eid}' elif tp == 'PullRequestEvent': pr = pl['pull_request'] action = pl['action'] link = pr['html_url'] title = pr['title'] - return f"{action} PR {title}", link, f'pull_request_{link}' + return f"{rname}: {action} PR {title}", link, f'{rname}_{action}_pr_{link}' elif tp == "IssuesEvent": action = pl['action'] iss = pl['issue'] link = iss['html_url'] title = iss['title'] - return f"{action} issue {title}", link, None + return f"{rname}: {action} issue {title}", link, None elif tp == "IssueCommentEvent": com = pl['comment'] link = com['html_url'] iss = pl['issue'] title = iss['title'] - return f"commented on issue {title}", link, f'issue_comment_' + link + return f"{rname}: commented on issue {title}", link, f'issue_comment_' + link elif tp == "ReleaseEvent": action = pl['action'] rel = pl['release'] tag = rel['tag_name'] link = rel['html_url'] - return f"{action} {rname} [{tag}]", link, None - elif tp in ( - "DeleteEvent", - "PublicEvent", - ): - return tp, None, None # TODO ??? + return f"{rname}: {action} [{tag}]", link, None + elif tp == 'PublicEvent': + return f'{tp} {e}', None, None # TODO ??? else: return tp, None, None -def get_dal(): - sources = get_files(config.export_dir, glob='*.json*') +def inputs(): + return get_files(config.export_dir, glob='*.json*') + + +def _dal(): + sources = inputs() sources = list(map(CPath, sources)) # TODO maybe move it to get_files? e.g. compressed=True arg? return ghexport.DAL(sources) @@ -218,7 +236,7 @@ def iter_gdpr_events() -> Iterator[Res[Event]]: # TODO hmm. not good, need to be lazier?... 
@mcachew(config.cache_dir, hashf=lambda dal: dal.sources) -def iter_backup_events(dal=get_dal()) -> Iterator[Event]: +def iter_backup_events(dal=_dal()) -> Iterator[Event]: for d in dal.events(): yield _parse_event(d) diff --git a/my/instapaper.py b/my/instapaper.py index 1ad402a..1564be7 100644 --- a/my/instapaper.py +++ b/my/instapaper.py @@ -12,12 +12,12 @@ Highlight = dal.Highlight Bookmark = dal.Bookmark -def _get_files(): +def inputs(): return get_files(config.export_path, glob='*.json') def _dal() -> dal.DAL: - return dal.DAL(_get_files()) + return dal.DAL(inputs()) def pages(): From 22e2d68e5deed31e353d82c7207417c0cd8c0a83 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sun, 3 May 2020 10:27:58 +0100 Subject: [PATCH 03/11] cleanup hypothesis module --- my/hypothesis.py | 40 +++++++++++++++++----------------------- 1 file changed, 17 insertions(+), 23 deletions(-) diff --git a/my/hypothesis.py b/my/hypothesis.py index 16b48cd..46e00bc 100644 --- a/my/hypothesis.py +++ b/my/hypothesis.py @@ -3,50 +3,41 @@ """ from . import init -from .common import PathIsh - -import my.config.repos.hypexport as hypexport -from my.config.repos.hypexport import dal +from .common import get_files +from .error import Res, sort_res_by +import my.config.repos.hypexport.dal as hypexport from my.config import hypothesis as config -export_path: PathIsh = config.export_path ### from typing import List -from .common import get_files, cproperty, group_by_key -from .error import Res, sort_res_by - - - # TODO weird. not sure why e.g. from dal import Highlight doesn't work.. -Highlight = dal.Highlight -DAL = dal.DAL -Page = dal.Page +Highlight = hypexport.Highlight +Page = hypexport.Page # TODO eh. not sure if I should rename everything to dao/DAO or not... 
-def dao() -> DAL: - sources = get_files(export_path, '*.json') - model = DAL(sources) - return model +def _dal() -> hypexport.DAL: + sources = get_files(config.export_path, '*.json') + return hypexport.DAL(sources) -def get_highlights() -> List[Res[Highlight]]: - return sort_res_by(dao().highlights(), key=lambda h: h.created) +def highlights() -> List[Res[Highlight]]: + return sort_res_by(_dal().highlights(), key=lambda h: h.created) # TODO eh. always provide iterators? although sort_res_by could be neat too... -def get_pages() -> List[Res[Page]]: - return sort_res_by(dao().pages(), key=lambda h: h.created) +def pages() -> List[Res[Page]]: + return sort_res_by(_dal().pages(), key=lambda h: h.created) # TODO move to side tests? def test(): - get_pages() - get_highlights() + list(pages()) + list(highlights()) def _main(): @@ -55,3 +46,6 @@ def _main(): if __name__ == '__main__': _main() + +get_highlights = highlights # TODO deprecate +get_pages = pages # TODO deprecate From 2bf62e2db3b05679a20c2018d63bc54b2ff14c7c Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sun, 3 May 2020 12:26:18 +0100 Subject: [PATCH 04/11] fix photo link --- my/photos/__init__.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/my/photos/__init__.py b/my/photos/__init__.py index 7f4e1fe..c11fe4c 100644 --- a/my/photos/__init__.py +++ b/my/photos/__init__.py @@ -17,7 +17,7 @@ from ..error import Res from my.config import photos as config -log = LazyLogger('my.photos') +log = LazyLogger(__name__) @@ -46,13 +46,12 @@ class Photo(NamedTuple): raise RuntimeError(f'Weird path {self.path}, cant match against anything') @property - def linkname(self) -> str: + def name(self) -> str: return self._basename.strip('/') @property def url(self) -> str: - PHOTOS_URL = 'TODO FIXME' - return PHOTOS_URL + self._basename + return f'{config.base_url}{self._basename}' from .utils import get_exif_from_file, ExifTags, Exif, dt_from_path, convert_ref From 
78dbbd3c55a11cb736ad78fd80bf3f0355dcb2b3 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sun, 3 May 2020 13:36:54 +0100 Subject: [PATCH 05/11] prettify emfit provider --- my/common.py | 9 ++++++--- my/emfit/__init__.py | 39 ++++++++++++++------------------------- my/foursquare.py | 6 +++--- my/rescuetime.py | 4 ++-- setup.py | 5 +++-- 5 files changed, 28 insertions(+), 35 deletions(-) diff --git a/my/common.py b/my/common.py index 2c241cd..89ee916 100644 --- a/my/common.py +++ b/my/common.py @@ -1,7 +1,7 @@ from pathlib import Path import functools import types -from typing import Union, Callable, Dict, Iterable, TypeVar, Sequence, List, Optional, Any, cast +from typing import Union, Callable, Dict, Iterable, TypeVar, Sequence, List, Optional, Any, cast, Tuple from . import init @@ -46,6 +46,7 @@ def the(l: Iterable[T]) -> T: return first +# TODO more_itertools.bucket? def group_by_key(l: Iterable[T], key: Callable[[T], K]) -> Dict[K, List[T]]: res: Dict[K, List[T]] = {} for i in l: @@ -106,9 +107,11 @@ from .kython.klogging import setup_logger, LazyLogger Paths = Union[Sequence[PathIsh], PathIsh] -def get_files(pp: Paths, glob: str, sort: bool=True) -> List[Path]: +def get_files(pp: Paths, glob: str, sort: bool=True) -> Tuple[Path, ...]: """ Helper function to avoid boilerplate. + + Tuple as return type is a bit friendlier for hashing/caching, so hopefully makes sense """ # TODO FIXME mm, some wrapper to assert iterator isn't empty? 
sources: List[Path] = [] @@ -129,7 +132,7 @@ def get_files(pp: Paths, glob: str, sort: bool=True) -> List[Path]: if sort: paths = list(sorted(paths)) - return paths + return tuple(paths) def mcachew(*args, **kwargs): diff --git a/my/emfit/__init__.py b/my/emfit/__init__.py index b245824..2b8f5a8 100755 --- a/my/emfit/__init__.py +++ b/my/emfit/__init__.py @@ -5,26 +5,21 @@ Consumes data exported by https://github.com/karlicoss/backup-emfit """ import json -import logging -from collections import OrderedDict as odict from dataclasses import dataclass from datetime import date, datetime, time, timedelta +from itertools import groupby from pathlib import Path from typing import Dict, Iterator, List, NamedTuple, Any, cast import pytz +from more_itertools import bucket -from ..common import get_files, LazyLogger, cproperty, group_by_key, mcachew +from ..common import get_files, LazyLogger, cproperty, mcachew from my.config import emfit as config -logger = LazyLogger('my.emfit', level='info') - - -# TODO FIXME remove? -import kython -timed = lambda f: kython.timed(f, logger=logger) +logger = LazyLogger(__name__, level='info') def hhmm(minutes): @@ -35,13 +30,10 @@ AWAKE = 4 Sid = str -# TODO use tz provider for that? -_TZ = pytz.timezone(config.tz) - # TODO use common tz thing? def fromts(ts) -> datetime: - dt = datetime.fromtimestamp(ts) - return _TZ.localize(dt) + dt = datetime.fromtimestamp(ts, tz=pytz.utc) + return dt class Mixin: @@ -295,14 +287,14 @@ class Emfit(Mixin): # TODO move to common? def dir_hash(path: Path): - mtimes = tuple(p.stat().st_mtime for p in sorted(path.glob('*.json'))) + mtimes = tuple(p.stat().st_mtime for p in get_files(path, glob='*.json')) return mtimes +# TODO take __file__ into account somehow? @mcachew(cache_path=config.cache_path, hashf=dir_hash, logger=logger) -def iter_datas_cached(path: Path) -> Iterator[Emfit]: - # TODO use get_files? 
- for f in sorted(path.glob('*.json')): +def iter_datas(path: Path=config.export_path) -> Iterator[Emfit]: + for f in get_files(path, glob='*.json'): sid = f.stem if sid in config.excluded_sids: continue @@ -311,20 +303,17 @@ def iter_datas_cached(path: Path) -> Iterator[Emfit]: yield from Emfit.make(em) -def iter_datas(path=config.export_path) -> Iterator[Emfit]: - yield from iter_datas_cached(path) - - def get_datas() -> List[Emfit]: return list(sorted(iter_datas(), key=lambda e: e.start)) # TODO move away old entries if there is a diff?? -@timed def by_night() -> Dict[date, Emfit]: - res: Dict[date, Emfit] = odict() + res: Dict[date, Emfit] = {} # TODO shit. I need some sort of interrupted sleep detection? - for dd, sleeps in group_by_key(get_datas(), key=lambda s: s.date).items(): + grouped = bucket(get_datas(), key=lambda s: s.date) + for dd in grouped: + sleeps = list(grouped[dd]) if len(sleeps) > 1: logger.warning("multiple sleeps per night, not handled yet: %s", sleeps) continue diff --git a/my/foursquare.py b/my/foursquare.py index 03cc312..ed55a24 100755 --- a/my/foursquare.py +++ b/my/foursquare.py @@ -15,10 +15,10 @@ from .common import get_files, LazyLogger from my.config import foursquare as config -logger = LazyLogger(__package__) +logger = LazyLogger(__name__) -def _get_exports() -> List[Path]: +def inputs(): return get_files(config.export_path, '*.json') @@ -62,7 +62,7 @@ class Place: def get_raw(fname=None): if fname is None: - fname = max(_get_exports()) + fname = max(inputs()) j = json.loads(Path(fname).read_text()) assert isinstance(j, list) diff --git a/my/rescuetime.py b/my/rescuetime.py index 3ee2730..5bf136c 100644 --- a/my/rescuetime.py +++ b/my/rescuetime.py @@ -18,7 +18,7 @@ from my.config import rescuetime as config log = LazyLogger(__package__, level='info') -def _get_exports() -> List[Path]: +def inputs(): return get_files(config.export_path, '*.json') @@ -28,7 +28,7 @@ Model = rescuexport.Model # TODO cache? 
def get_model(last=0) -> Model: - return Model(_get_exports()[-last:]) + return Model(inputs()[-last:]) def _without_errors(): diff --git a/setup.py b/setup.py index ddf25fb..233829a 100644 --- a/setup.py +++ b/setup.py @@ -4,8 +4,9 @@ from setuptools import setup, find_namespace_packages # type: ignore INSTALL_REQUIRES = [ - 'appdirs', - 'pytz', # even though it's not needed by the core, it's so common anyway... + 'pytz', # even though it's not needed by the core, it's so common anyway... + 'appdirs', # very common, and makes it portable + 'more-itertools', # it's just too useful and very common anyway ] From 19e90eb64728e8805be260353013f1aaedf4fcf3 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sun, 3 May 2020 15:57:11 +0100 Subject: [PATCH 06/11] improvements to @mcachew type checking --- my/common.py | 17 ++++++++++++++++- my/reddit.py | 26 +++++++++++++------------- 2 files changed, 29 insertions(+), 14 deletions(-) diff --git a/my/common.py b/my/common.py index 89ee916..172af06 100644 --- a/my/common.py +++ b/my/common.py @@ -135,7 +135,22 @@ def get_files(pp: Paths, glob: str, sort: bool=True) -> Tuple[Path, ...]: return tuple(paths) -def mcachew(*args, **kwargs): +# TODO annotate it, perhaps use 'dependent' type (for @doublewrap stuff) +from typing import TYPE_CHECKING +if TYPE_CHECKING: + from typing import Callable, TypeVar + from typing_extensions import Protocol + # TODO reuse types from cachew? although not sure if we want hard dependency on it in typecheck time.. + # I guess, later just define pass through once this is fixed: https://github.com/python/typing/issues/270 + # ok, that's actually a super nice 'pattern' + F = TypeVar('F') + class McachewType(Protocol): + def __call__(self, cache_path: Any=None, *, hashf: Any=None, chunk_by: int=0, logger: Any=None) -> Callable[[F], F]: + ... + + mcachew: McachewType + +def mcachew(*args, **kwargs): # type: ignore[no-redef] """ Stands for 'Maybe cachew'. 
Defensive wrapper around @cachew to make it an optional dependency. diff --git a/my/reddit.py b/my/reddit.py index 36921e9..e4e5590 100755 --- a/my/reddit.py +++ b/my/reddit.py @@ -21,17 +21,17 @@ def get_sources() -> Sequence[Path]: return tuple(res) -logger = LazyLogger(__package__, level='debug') +logger = LazyLogger(__name__, level='debug') -Sid = rexport.Sid -Save = rexport.Save -Comment = rexport.Comment +Sid = rexport.Sid +Save = rexport.Save +Comment = rexport.Comment Submission = rexport.Submission -Upvote = rexport.Upvote +Upvote = rexport.Upvote -def dal(): +def dal() -> rexport.DAL: # TODO lru cache? but be careful when it runs continuously return rexport.DAL(get_sources()) @@ -173,12 +173,12 @@ def get_events(*args, **kwargs) -> List[Event]: return list(sorted(evit, key=lambda e: e.cmp_key)) -def test(): +def test() -> None: get_events(backups=get_sources()[-1:]) list(saved()) -def test_unfav(): +def test_unfav() -> None: events = get_events() url = 'https://reddit.com/r/QuantifiedSelf/comments/acxy1v/personal_dashboard/' uevents = [e for e in events if e.url == url] @@ -188,15 +188,15 @@ def test_unfav(): uf = uevents[1] assert uf.text == 'unfavorited' - -def test_get_all_saves(): +# TODO move out.. +def test_get_all_saves() -> None: # TODO not sure if this is necesasry anymore? saves = list(saved()) # just check that they are unique.. make_dict(saves, key=lambda s: s.sid) -def test_disappearing(): +def test_disappearing() -> None: # eh. so for instance, 'metro line colors' is missing from reddit-20190402005024.json for no reason # but I guess it was just a short glitch... 
so whatever saves = get_events() @@ -205,14 +205,14 @@ def test_disappearing(): assert deal_with_it.backup_dt == datetime(2019, 4, 1, 23, 10, 25, tzinfo=pytz.utc) -def test_unfavorite(): +def test_unfavorite() -> None: events = get_events() unfavs = [s for s in events if s.text == 'unfavorited'] [xxx] = [u for u in unfavs if u.eid == 'unf-19ifop'] assert xxx.dt == datetime(2019, 1, 28, 8, 10, 20, tzinfo=pytz.utc) -def main(): +def main() -> None: # TODO eh. not sure why but parallel on seems to mess glumov up and cause OOM... events = get_events(parallel=False) print(len(events)) From 5c6eec62ee33e5d5a286b4af2aaf33afa7f97bf1 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sun, 3 May 2020 16:17:48 +0100 Subject: [PATCH 07/11] start testing get_files --- my/common.py | 2 +- tests/common.py | 45 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 1 deletion(-) create mode 100644 tests/common.py diff --git a/my/common.py b/my/common.py index 172af06..24dd0ce 100644 --- a/my/common.py +++ b/my/common.py @@ -107,7 +107,7 @@ from .kython.klogging import setup_logger, LazyLogger Paths = Union[Sequence[PathIsh], PathIsh] -def get_files(pp: Paths, glob: str, sort: bool=True) -> Tuple[Path, ...]: +def get_files(pp: Paths, glob: str='*', sort: bool=True) -> Tuple[Path, ...]: """ Helper function to avoid boilerplate. diff --git a/tests/common.py b/tests/common.py new file mode 100644 index 0000000..6486deb --- /dev/null +++ b/tests/common.py @@ -0,0 +1,45 @@ +from pathlib import Path +from my.common import get_files + +import pytest # type: ignore + + +def test_single_file(): + ''' + Regular file path is just returned as is. + ''' + + "Exception if it doesn't exist" + with pytest.raises(Exception): + get_files('/tmp/hpi_test/file.ext') + + + create('/tmp/hpi_test/file.ext') + + ''' + Couple of things: + 1. Return type is a tuple, it's friendlier for hashing/caching + 2. 
It always returns pathlib.Path instead of plain strings + ''' + assert get_files('/tmp/hpi_test/file.ext') == ( + Path('/tmp/hpi_test/file.ext'), + ) + + + + +test_path = Path('/tmp/hpi_test') +def setup(): + teardown() + test_path.mkdir() + + +def teardown(): + import shutil + if test_path.is_dir(): + shutil.rmtree(test_path) + + +from my.common import PathIsh +def create(f: PathIsh) -> None: + Path(f).touch() From c2961cb1cfbefb00b89f3f30e16c664c63233637 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sun, 3 May 2020 16:29:39 +0100 Subject: [PATCH 08/11] properly test get_files --- tests/common.py | 50 ++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 47 insertions(+), 3 deletions(-) diff --git a/tests/common.py b/tests/common.py index 6486deb..4daa0fb 100644 --- a/tests/common.py +++ b/tests/common.py @@ -26,6 +26,48 @@ def test_single_file(): ) +def test_multiple_files(): + ''' + If you pass a directory/multiple directories, it flattens the contents + ''' + create('/tmp/hpi_test/dir1/') + create('/tmp/hpi_test/dir1/zzz') + create('/tmp/hpi_test/dir1/yyy') + # create('/tmp/hpi_test/dir1/whatever/') # TODO not sure about this... 
should really allow extra dirs + create('/tmp/hpi_test/dir2/') + create('/tmp/hpi_test/dir2/mmm') + create('/tmp/hpi_test/dir2/nnn') + create('/tmp/hpi_test/dir3/') + create('/tmp/hpi_test/dir3/ttt') + + assert get_files([ + Path('/tmp/hpi_test/dir3'), # it takes in Path as well as str + '/tmp/hpi_test/dir1', + ]) == ( + # the paths are always returned in sorted order (unless you pass sort=False) + Path('/tmp/hpi_test/dir1/yyy'), + Path('/tmp/hpi_test/dir1/zzz'), + Path('/tmp/hpi_test/dir3/ttt'), + ) + + +def test_glob(): + ''' + You can pass a blog to restrict the extensions + ''' + + create('/tmp/hpi_test/file_3.zip') + create('/tmp/hpi_test/file_2.zip') + create('/tmp/hpi_test/ignoreme') + create('/tmp/hpi_test/file.zip') + + assert get_files('/tmp/hpi_test', 'file_*.zip') == ( + Path('/tmp/hpi_test/file_2.zip'), + Path('/tmp/hpi_test/file_3.zip'), + ) + + # named argument should work too + assert len(get_files('/tmp/hpi_test', glob='file_*.zip')) > 0 test_path = Path('/tmp/hpi_test') @@ -40,6 +82,8 @@ def teardown(): shutil.rmtree(test_path) -from my.common import PathIsh -def create(f: PathIsh) -> None: - Path(f).touch() +def create(f: str) -> None: + if f.endswith('/'): + Path(f).mkdir() + else: + Path(f).touch() From 5706f690e7d1b25ecb916112cfbc8f7c6c61b40c Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sun, 3 May 2020 16:52:09 +0100 Subject: [PATCH 09/11] support implicit globs! --- my/common.py | 11 ++++++++--- tests/common.py | 34 +++++++++++++++++++++++++++++----- 2 files changed, 37 insertions(+), 8 deletions(-) diff --git a/my/common.py b/my/common.py index 24dd0ce..a2c85cc 100644 --- a/my/common.py +++ b/my/common.py @@ -1,3 +1,4 @@ +from glob import glob as do_glob from pathlib import Path import functools import types @@ -126,9 +127,13 @@ def get_files(pp: Paths, glob: str='*', sort: bool=True) -> Tuple[Path, ...]: gp: Iterable[Path] = src.glob(glob) paths.extend(gp) else: - assert src.is_file(), src - # TODO FIXME assert matches glob?? 
- paths.append(src) + ss = str(src) + if '*' in ss: + paths.extend(map(Path, do_glob(ss))) + else: + assert src.is_file(), src + # todo assert matches glob?? + paths.append(src) if sort: paths = list(sorted(paths)) diff --git a/tests/common.py b/tests/common.py index 4daa0fb..2d4abcf 100644 --- a/tests/common.py +++ b/tests/common.py @@ -51,9 +51,9 @@ def test_multiple_files(): ) -def test_glob(): +def test_explicit_glob(): ''' - You can pass a blog to restrict the extensions + You can pass a glob to restrict the extensions ''' create('/tmp/hpi_test/file_3.zip') @@ -61,15 +61,39 @@ def test_glob(): create('/tmp/hpi_test/ignoreme') create('/tmp/hpi_test/file.zip') - assert get_files('/tmp/hpi_test', 'file_*.zip') == ( + # todo walrus operator would be great here... + expected = ( Path('/tmp/hpi_test/file_2.zip'), Path('/tmp/hpi_test/file_3.zip'), ) + assert get_files('/tmp/hpi_test', 'file_*.zip') == expected - # named argument should work too - assert len(get_files('/tmp/hpi_test', glob='file_*.zip')) > 0 + "named argument should work too" + assert get_files('/tmp/hpi_test', glob='file_*.zip') == expected +def test_implicit_blog(): + ''' + Asterisk in the path results in globbing too. + ''' + # todo hopefully that makes sense? dunno why would anyone actually rely on asterisks in names.. + # this is very convenient in configs, so people don't have to use some special types + + create('/tmp/hpi_test/123/') + create('/tmp/hpi_test/123/dummy') + create('/tmp/hpi_test/123/file.zip') + create('/tmp/hpi_test/456/') + create('/tmp/hpi_test/456/dummy') + create('/tmp/hpi_test/456/file.zip') + + assert get_files(['/tmp/hpi_test/*/*.zip']) == ( + Path('/tmp/hpi_test/123/file.zip'), + Path('/tmp/hpi_test/456/file.zip'), + ) + +# TODO not sure if should uniquify if the filenames end up same? +# TODO not sure about the symlinks? and hidden files? 
+ test_path = Path('/tmp/hpi_test') def setup(): teardown() From 9bd61940b88f3451ae68ce45ac970457d49c2783 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sun, 3 May 2020 16:56:05 +0100 Subject: [PATCH 10/11] rely on implicit glob for my.reddit --- my/common.py | 7 +++++-- my/reddit.py | 5 +++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/my/common.py b/my/common.py index a2c85cc..0d26310 100644 --- a/my/common.py +++ b/my/common.py @@ -3,6 +3,7 @@ from pathlib import Path import functools import types from typing import Union, Callable, Dict, Iterable, TypeVar, Sequence, List, Optional, Any, cast, Tuple +import warnings from . import init @@ -108,7 +109,8 @@ from .kython.klogging import setup_logger, LazyLogger Paths = Union[Sequence[PathIsh], PathIsh] -def get_files(pp: Paths, glob: str='*', sort: bool=True) -> Tuple[Path, ...]: +DEFAULT_GLOB = '*' +def get_files(pp: Paths, glob: str=DEFAULT_GLOB, sort: bool=True) -> Tuple[Path, ...]: """ Helper function to avoid boilerplate. @@ -129,6 +131,8 @@ def get_files(pp: Paths, glob: str='*', sort: bool=True) -> Tuple[Path, ...]: else: ss = str(src) if '*' in ss: + if glob != DEFAULT_GLOB: + warnings.warn(f"Treating {ss} as glob path. Explicit glob={glob} argument is ignored!") paths.extend(map(Path, do_glob(ss))) else: assert src.is_file(), src @@ -163,7 +167,6 @@ def mcachew(*args, **kwargs): # type: ignore[no-redef] try: import cachew except ModuleNotFoundError: - import warnings warnings.warn('cachew library not found. You might want to install it to speed things up. See https://github.com/karlicoss/cachew') return lambda orig_func: orig_func else: diff --git a/my/reddit.py b/my/reddit.py index e4e5590..143f120 100755 --- a/my/reddit.py +++ b/my/reddit.py @@ -15,9 +15,10 @@ import my.config.repos.rexport.dal as rexport def get_sources() -> Sequence[Path]: # TODO use zstd? - # TODO maybe add assert to get_files? 
(and allow to suppress it) - files = get_files(config.export_dir, glob='*.json.xz') + # TODO rename to export_path? + files = get_files(config.export_dir) res = list(map(CPath, files)); assert len(res) > 0 + # todo move the assert to get_files? return tuple(res) From 0b61dd9e42c2c9a74915995a383bd4b952edaff6 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sun, 3 May 2020 17:15:51 +0100 Subject: [PATCH 11/11] more minor tweaks, benefit from get_files --- my/instapaper.py | 2 +- my/lastfm/__init__.py | 36 ++++++++++++++---------------------- my/lastfm/fill_influxdb.py | 6 +++--- my/reading/polar.py | 1 - tests/instapaper.py | 7 +++---- tests/lastfm.py | 7 +++++++ 6 files changed, 28 insertions(+), 31 deletions(-) create mode 100644 tests/lastfm.py diff --git a/my/instapaper.py b/my/instapaper.py index 1564be7..364c402 100644 --- a/my/instapaper.py +++ b/my/instapaper.py @@ -13,7 +13,7 @@ Bookmark = dal.Bookmark def inputs(): - return get_files(config.export_path, glob='*.json') + return get_files(config.export_path) def _dal() -> dal.DAL: diff --git a/my/lastfm/__init__.py b/my/lastfm/__init__.py index 12239b0..d55fef4 100755 --- a/my/lastfm/__init__.py +++ b/my/lastfm/__init__.py @@ -2,27 +2,31 @@ Last.fm scrobbles ''' -from .. import init -from functools import lru_cache -from typing import NamedTuple, Dict, Any +from ..common import get_files, mcachew, Json + from datetime import datetime -from pathlib import Path import json +from pathlib import Path +from typing import NamedTuple, Any, Sequence, Iterable import pytz from my.config import lastfm as config -# TODO Json type? # TODO memoised properties? # TODO lazy mode and eager mode? # lazy is a bit nicer in terms of more flexibility and less processing? # eager is a bit more explicit for error handling -class Scrobble(NamedTuple): - raw: Dict[str, Any] +def inputs() -> Sequence[Path]: + return get_files(config.export_path) + +class Scrobble(NamedTuple): + raw: Json + + # TODO mm, no timezone? 
hopefuly it's UTC @property def dt(self) -> datetime: ts = int(self.raw['date']) @@ -45,22 +49,10 @@ class Scrobble(NamedTuple): # TODO could also be nice to make generic? maybe even depending on eagerness -# TODO memoise...? -# TODO watch out, if we keep the app running it might expire -def _iter_scrobbles(): - # TODO use get_files - last = max(Path(config.export_path).glob('*.json')) - # TODO mm, no timezone? hopefuly it's UTC +@mcachew(hashf=lambda: inputs()) +def scrobbles() -> Iterable[Scrobble]: + last = max(inputs()) j = json.loads(last.read_text()) for raw in j: yield Scrobble(raw=raw) - - -@lru_cache(1) -def get_scrobbles(): - return list(sorted(_iter_scrobbles(), key=lambda s: s.dt)) - - -def test(): - assert len(get_scrobbles()) > 1000 diff --git a/my/lastfm/fill_influxdb.py b/my/lastfm/fill_influxdb.py index c20e39f..7754760 100755 --- a/my/lastfm/fill_influxdb.py +++ b/my/lastfm/fill_influxdb.py @@ -1,11 +1,11 @@ #!/usr/bin/env python3 # pip install influxdb from influxdb import InfluxDBClient # type: ignore -from my.lastfm import get_scrobbles +from my.lastfm import scrobbles as iter_scrobbles -def main(): - scrobbles = get_scrobbles() +def main() -> None: + scrobbles = iter_scrobbles() client = InfluxDBClient() # TODO client.create_database('lastfm') diff --git a/my/reading/polar.py b/my/reading/polar.py index 9eb4783..d2b2d60 100755 --- a/my/reading/polar.py +++ b/my/reading/polar.py @@ -8,7 +8,6 @@ from typing import List, Dict, Iterator, NamedTuple, Sequence, Optional import json import pytz -# TODO declare DEPENDS = [pytz??] 
from ..common import LazyLogger, get_files diff --git a/tests/instapaper.py b/tests/instapaper.py index 2c492e8..ae685e6 100644 --- a/tests/instapaper.py +++ b/tests/instapaper.py @@ -1,6 +1,5 @@ -from my.instapaper import get_todos +from my.instapaper import pages -def test_get_todos(): - for t in get_todos(): - print(t) +def test_pages(): + assert len(list(pages())) > 3 diff --git a/tests/lastfm.py b/tests/lastfm.py new file mode 100644 index 0000000..e94c3c5 --- /dev/null +++ b/tests/lastfm.py @@ -0,0 +1,7 @@ +from more_itertools import ilen + +from my.lastfm import scrobbles + + +def test(): + assert ilen(scrobbles()) > 1000