diff --git a/doc/CONFIGURING.org b/doc/CONFIGURING.org index 6c7c70e..83342e0 100644 --- a/doc/CONFIGURING.org +++ b/doc/CONFIGURING.org @@ -224,7 +224,7 @@ My conclusion was using a *combined approach*: Inheritance is a standard mechanism, which doesn't require any extra frameworks and plays well with other Python concepts. As a specific example: -,#+begin_src python +#+begin_src python from my.config import bluemaestro as user_config @dataclass @@ -256,24 +256,27 @@ I claim this solves pretty much everything: - *(6)*: the dataclass header is easily readable, and it's possible to generate the docs automatically Downsides: -- inheriting from ~user_config~ means early import of =my.config= +- inheriting from ~user_config~ means an early import of =my.config= Generally it's better to keep everything as lazy as possible and defer loading to the first time the config is used. This might be annoying at times, e.g. if you have a top-level import of you module, but no config. But considering that in 99% of cases config is going to be on the disk - and it's possible to do something dynamic like =del sys.modules['my.bluemastro']= to reload the config, I think it's a minor issue. - # TODO demonstrate in a test? + and it's [[https://github.com/karlicoss/HPI/blob/1e6e0bd381d20437343473878c7f63b1f9d6362b/tests/demo.py#L22-L25][possible]] to do something dynamic like =del sys.modules['my.bluemastro']= to reload the config, I think it's a minor issue. - =make_config= allows for some mypy false negatives in the user config E.g. if you forgot =export_path= attribute, mypy would miss it. But you'd have a runtime failure, and the downstream code using config is still correctly type checked. - Perhaps it will be better when [[https://github.com/python/mypy/issues/5374][this]] is fixed. + Perhaps it will be better when [[https://github.com/python/mypy/issues/5374][this mypy issue]] is fixed. - the =make_config= bit is a little scary and manual However, it's extracted in a generic helper, and [[https://github.com/karlicoss/HPI/blob/d6f071e3b12ba1cd5a86ad80e3821bec004e6a6d/my/twitter/archive.py#L17][ends up pretty simple]] + # In addition, it's not even necessary if you don't have optional attributes, you can simply use the class variables (i.e. ~bluemaestro.export_path~) + # upd. ugh, you can't, it doesn't handle default attributes overriding correctly (see tests/demo.py) + # eh. basically all I need is class level dataclass?? + - inheriting from ~user_config~ requires it to be a =class= rather than an =object= A practical downside is you can't use something like ~SimpleNamespace~. diff --git a/doc/MODULES.org b/doc/MODULES.org index 7d97f29..ddff2bd 100644 --- a/doc/MODULES.org +++ b/doc/MODULES.org @@ -32,6 +32,7 @@ modules = [ ('reddit' , 'my.reddit' ), ('twint' , 'my.twitter.twint' ), ('twitter', 'my.twitter.archive' ), + ('lastfm' , 'my.lastfm' ), ] def indent(s, spaces=4): @@ -105,4 +106,15 @@ for cls, p in modules: class twitter: export_path: Paths # path[s]/glob to the twitter archive takeout #+end_src +- [[file:../my/lastfm][my.lastfm]] + + Last.fm scrobbles + + #+begin_src python + class lastfm: + """ + Uses [[https://github.com/karlicoss/lastfm-backup][lastfm-backup]] outputs + """ + export_path: Paths + #+end_src :end: diff --git a/my/_rss.py b/my/_rss.py deleted file mode 100644 index cabf53a..0000000 --- a/my/_rss.py +++ /dev/null @@ -1,10 +0,0 @@ -# shared Rss stuff -from typing import NamedTuple - -class Subscription(NamedTuple): - # TODO date? - title: str - url: str - id: str - subscribed: bool=True - diff --git a/my/core/common.py b/my/core/common.py index 83c77d7..918f4b2 100644 --- a/my/core/common.py +++ b/my/core/common.py @@ -1,5 +1,6 @@ from glob import glob as do_glob from pathlib import Path +from datetime import datetime import functools import types from typing import Union, Callable, Dict, Iterable, TypeVar, Sequence, List, Optional, Any, cast, Tuple @@ -219,3 +220,28 @@ class classproperty(Generic[_R]): # # def __get__(self) -> _R: # return self.f() + +# TODO maybe use opaque mypy alias? +tzdatetime = datetime + + +fromisoformat: Callable[[str], datetime] +import sys +if sys.version_info.minor >= 7: + # prevent mypy on py3.6 from complaining... + fromisoformat_real = datetime.fromisoformat # type: ignore[attr-defined] + fromisoformat = fromisoformat_real +else: + from .py37 import fromisoformat + + +# TODO doctests? +def isoparse(s: str) -> tzdatetime: + """ + Parses timestamps formatted like 2020-05-01T10:32:02.925961Z + """ + # TODO could use dateutil? but it's quite slow as far as I remember.. + # TODO support non-utc.. somehow? + assert s.endswith('Z'), s + s = s[:-1] + '+00:00' + return fromisoformat(s) diff --git a/my/core/py37.py b/my/core/py37.py new file mode 100644 index 0000000..6a52593 --- /dev/null +++ b/my/core/py37.py @@ -0,0 +1,122 @@ +# borrowed from /usr/lib/python3.7/datetime.py +from datetime import datetime, timezone, timedelta + +def _parse_isoformat_date(dtstr): + # It is assumed that this function will only be called with a + # string of length exactly 10, and (though this is not used) ASCII-only + year = int(dtstr[0:4]) + if dtstr[4] != '-': + raise ValueError('Invalid date separator: %s' % dtstr[4]) + + month = int(dtstr[5:7]) + + if dtstr[7] != '-': + raise ValueError('Invalid date separator') + + day = int(dtstr[8:10]) + + return [year, month, day] + + +def _parse_hh_mm_ss_ff(tstr): + # Parses things of the form HH[:MM[:SS[.fff[fff]]]] + len_str = len(tstr) + + time_comps = [0, 0, 0, 0] + pos = 0 + for comp in range(0, 3): + if (len_str - pos) < 2: + raise ValueError('Incomplete time component') + + time_comps[comp] = int(tstr[pos:pos+2]) + + pos += 2 + next_char = tstr[pos:pos+1] + + if not next_char or comp >= 2: + break + + if next_char != ':': + raise ValueError('Invalid time separator: %c' % next_char) + + pos += 1 + + if pos < len_str: + if tstr[pos] != '.': + raise ValueError('Invalid microsecond component') + else: + pos += 1 + + len_remainder = len_str - pos + if len_remainder not in (3, 6): + raise ValueError('Invalid microsecond component') + + time_comps[3] = int(tstr[pos:]) + if len_remainder == 3: + time_comps[3] *= 1000 + + return time_comps + + +def _parse_isoformat_time(tstr): + # Format supported is HH[:MM[:SS[.fff[fff]]]][+HH:MM[:SS[.ffffff]]] + len_str = len(tstr) + if len_str < 2: + raise ValueError('Isoformat time too short') + + # This is equivalent to re.search('[+-]', tstr), but faster + tz_pos = (tstr.find('-') + 1 or tstr.find('+') + 1) + timestr = tstr[:tz_pos-1] if tz_pos > 0 else tstr + + time_comps = _parse_hh_mm_ss_ff(timestr) + + tzi = None + if tz_pos > 0: + tzstr = tstr[tz_pos:] + + # Valid time zone strings are: + # HH:MM len: 5 + # HH:MM:SS len: 8 + # HH:MM:SS.ffffff len: 15 + + if len(tzstr) not in (5, 8, 15): + raise ValueError('Malformed time zone string') + + tz_comps = _parse_hh_mm_ss_ff(tzstr) + if all(x == 0 for x in tz_comps): + tzi = timezone.utc + else: + tzsign = -1 if tstr[tz_pos - 1] == '-' else 1 + + td = timedelta(hours=tz_comps[0], minutes=tz_comps[1], + seconds=tz_comps[2], microseconds=tz_comps[3]) + + tzi = timezone(tzsign * td) + + time_comps.append(tzi) + + return time_comps + +def fromisoformat(date_string, cls=datetime): + """Construct a datetime from the output of datetime.isoformat().""" + if not isinstance(date_string, str): + raise TypeError('fromisoformat: argument must be str') + + # Split this at the separator + dstr = date_string[0:10] + tstr = date_string[11:] + + try: + date_components = _parse_isoformat_date(dstr) + except ValueError: + raise ValueError('Invalid isoformat string: %s' % date_string) + + if tstr: + try: + time_components = _parse_isoformat_time(tstr) + except ValueError: + raise ValueError('Invalid isoformat string: %s' % date_string) + else: + time_components = [0, 0, 0, 0, None] + + return cls(*(date_components + time_components)) diff --git a/my/demo.py b/my/demo.py index ae57b67..811e9e2 100644 --- a/my/demo.py +++ b/my/demo.py @@ -16,13 +16,8 @@ class demo(user_config): username: str timezone: tzinfo = pytz.utc - -def config() -> demo: - from .core.cfg import make_config - config = make_config(demo) - return config - - +from .core.cfg import make_config +config = make_config(demo) from pathlib import Path from typing import Sequence, Iterable @@ -40,17 +35,17 @@ class Item: def inputs() -> Sequence[Path]: - return get_files(config().data_path) + return get_files(config.data_path) import json def items() -> Iterable[Item]: for f in inputs(): - dt = datetime.fromtimestamp(f.stat().st_mtime, tz=config().timezone) + dt = datetime.fromtimestamp(f.stat().st_mtime, tz=config.timezone) j = json.loads(f.read_text()) for raw in j: yield Item( - username=config().username, + username=config.username, raw=raw, dt=dt, ) diff --git a/my/feedbin.py b/my/feedbin.py deleted file mode 100644 index 3492afb..0000000 --- a/my/feedbin.py +++ /dev/null @@ -1,36 +0,0 @@ -""" -Feedbin RSS reader -""" - -from .common import listify -from ._rss import Subscription - -from my.config import feedbin as config - -import json -from pathlib import Path -from typing import Dict, List -from datetime import datetime -from dateutil.parser import isoparse - - -@listify -def parse_file(f: Path): - raw = json.loads(f.read_text()) - for r in raw: - yield Subscription( - # TODO created_at? - title=r['title'], - url=r['site_url'], - id=r['id'], - ) - -def get_states() -> Dict[datetime, List[Subscription]]: - res = {} - # TODO use get_files - for f in sorted(Path(config.export_dir).glob('*.json')): - dts = f.stem.split('_')[-1] - dt = isoparse(dts) - subs = parse_file(f) - res[dt] = subs - return res diff --git a/my/lastfm/__init__.py b/my/lastfm/__init__.py index d55fef4..a208e50 100755 --- a/my/lastfm/__init__.py +++ b/my/lastfm/__init__.py @@ -2,8 +2,21 @@ Last.fm scrobbles ''' +from ..core.common import Paths +from dataclasses import dataclass +from my.config import lastfm as user_config + +@dataclass +class lastfm(user_config): + """ + Uses [[https://github.com/karlicoss/lastfm-backup][lastfm-backup]] outputs + """ + export_path: Paths + + +from ..core.cfg import make_config +config = make_config(lastfm) -from ..common import get_files, mcachew, Json from datetime import datetime import json @@ -12,16 +25,17 @@ from typing import NamedTuple, Any, Sequence, Iterable import pytz -from my.config import lastfm as config +from ..core.common import mcachew, Json, get_files + +def inputs() -> Sequence[Path]: + return get_files(config.export_path) + # TODO memoised properties? # TODO lazy mode and eager mode? # lazy is a bit nicer in terms of more flexibility and less processing? # eager is a bit more explicit for error handling -def inputs() -> Sequence[Path]: - return get_files(config.export_path) - class Scrobble(NamedTuple): raw: Json @@ -54,5 +68,5 @@ def scrobbles() -> Iterable[Scrobble]: last = max(inputs()) j = json.loads(last.read_text()) - for raw in j: + for raw in reversed(j): yield Scrobble(raw=raw) diff --git a/my/reading/polar.py b/my/reading/polar.py index d2b2d60..7ba4fc2 100755 --- a/my/reading/polar.py +++ b/my/reading/polar.py @@ -21,6 +21,7 @@ _POLAR_DIR = Path('~').expanduser() / '.polar' logger = LazyLogger(__name__) +# TODO use core.isoparse def parse_dt(s: str) -> datetime: return pytz.utc.localize(datetime.strptime(s, '%Y-%m-%dT%H:%M:%S.%fZ')) diff --git a/my/rss.py b/my/rss.py deleted file mode 100644 index db577f6..0000000 --- a/my/rss.py +++ /dev/null @@ -1,29 +0,0 @@ -from itertools import chain -from typing import List, Dict - -from ._rss import Subscription - -from . import feedbin -from . import feedly -# TODO google reader? - - -def get_all_subscriptions() -> List[Subscription]: - """ - Keeps track of everything I ever subscribed to. It's useful to keep track of unsubscribed too - so you don't try to subscribe again (or at least take into account why you unsubscribed before) - """ - states = {} - states.update(feedly.get_states()) - states.update(feedbin.get_states()) - by_url: Dict[str, Subscription] = {} - for d, feeds in sorted(states.items()): - for f in feeds: - if f.url not in by_url: - by_url[f.url] = f - res = [] - last = {x.url: x for x in max(states.items())[1]} - for u, x in sorted(by_url.items()): - present = u in last - res.append(x._replace(subscribed=present)) - return res diff --git a/my/rss/all.py b/my/rss/all.py new file mode 100644 index 0000000..90f5efa --- /dev/null +++ b/my/rss/all.py @@ -0,0 +1,11 @@ +''' +Unified RSS data, merged from different services I used historically +''' +from typing import Iterable +from .common import Subscription, compute_subscriptions + + +def subscriptions() -> Iterable[Subscription]: + from . import feedbin, feedly + # TODO google reader? + yield from compute_subscriptions(feedbin.states(), feedly.states()) diff --git a/my/rss/common.py b/my/rss/common.py new file mode 100644 index 0000000..3dc761c --- /dev/null +++ b/my/rss/common.py @@ -0,0 +1,44 @@ +# shared Rss stuff +from datetime import datetime +from typing import NamedTuple, Optional, List, Dict + + +class Subscription(NamedTuple): + title: str + url: str + id: str # TODO not sure about it... + # eh, not all of them got reasonable 'created' time + created_at: Optional[datetime] + subscribed: bool=True + +from typing import Iterable, Tuple, Sequence + +# snapshot of subscriptions at time +SubscriptionState = Tuple[datetime, Sequence[Subscription]] + + +def compute_subscriptions(*sources: Iterable[SubscriptionState]) -> List[Subscription]: + """ + Keeps track of everything I ever subscribed to. + In addition, keeps track of unsubscribed as well (so you'd remember when and why you unsubscribed) + """ + from itertools import chain + states = list(chain.from_iterable(sources)) + # TODO keep 'source'/'provider'/'service' attribute? + + by_url: Dict[str, Subscription] = {} + # ah. dates are used for sorting + for when, state in sorted(states): + # TODO use 'when'? + for feed in state: + if feed.url not in by_url: + by_url[feed.url] = feed + + _, last_state = max(states, key=lambda x: x[0]) + last_urls = {f.url for f in last_state} + + res = [] + for u, x in sorted(by_url.items()): + present = u in last_urls + res.append(x._replace(subscribed=present)) + return res diff --git a/my/rss/feedbin.py b/my/rss/feedbin.py new file mode 100644 index 0000000..5a2f117 --- /dev/null +++ b/my/rss/feedbin.py @@ -0,0 +1,42 @@ +""" +Feedbin RSS reader +""" + +from my.config import feedbin as config + +from pathlib import Path +from typing import Sequence + +from ..core.common import listify, get_files, isoparse +from .common import Subscription + + +def inputs() -> Sequence[Path]: + return get_files(config.export_path) + + +import json + +@listify +def parse_file(f: Path): + raw = json.loads(f.read_text()) + for r in raw: + yield Subscription( + created_at=isoparse(r['created_at']), + title=r['title'], + url=r['site_url'], + id=r['id'], + ) + + +from typing import Iterable +from .common import SubscriptionState +def states() -> Iterable[SubscriptionState]: + # meh + from dateutil.parser import isoparse # type: ignore + for f in inputs(): + # TODO ugh. depends on my naming. not sure if useful? + dts = f.stem.split('_')[-1] + dt = isoparse(dts) + subs = parse_file(f) + yield dt, subs diff --git a/my/feedly.py b/my/rss/feedly.py similarity index 60% rename from my/feedly.py rename to my/rss/feedly.py index 93f8823..cc9331f 100644 --- a/my/feedly.py +++ b/my/rss/feedly.py @@ -2,16 +2,20 @@ Feedly RSS reader """ -from .common import listify -from ._rss import Subscription - from my.config import feedly as config -import json from pathlib import Path -from typing import Dict, List -from datetime import datetime -import pytz +from typing import Sequence + +from ..core.common import listify, get_files, isoparse +from .common import Subscription + + +def inputs() -> Sequence[Path]: + return get_files(config.export_path) + + +import json @listify @@ -22,19 +26,21 @@ def parse_file(f: Path): rid = r['id'] website = r.get('website', rid) # meh yield Subscription( - # TODO created_at? + created_at=None, title=r['title'], url=website, id=rid, ) -def get_states() -> Dict[datetime, List[Subscription]]: - res = {} - # TODO use get_files - for f in sorted(Path(config.export_dir).glob('*.json')): + +from datetime import datetime +from typing import Iterable +from .common import SubscriptionState +def states() -> Iterable[SubscriptionState]: + import pytz + for f in inputs(): dts = f.stem.split('_')[-1] dt = datetime.strptime(dts, '%Y%m%d%H%M%S') dt = pytz.utc.localize(dt) subs = parse_file(f) - res[dt] = subs - return res + yield dt, subs diff --git a/my/twitter/all.py b/my/twitter/all.py index f2e0469..be4bdbf 100644 --- a/my/twitter/all.py +++ b/my/twitter/all.py @@ -7,10 +7,9 @@ from . import twint from . import archive -from more_itertools import unique_everseen - - +# TODO move to .common? def merge_tweets(*sources): + from more_itertools import unique_everseen yield from unique_everseen( chain(*sources), key=lambda t: t.id_str, diff --git a/tests/config.py b/tests/config.py index 523797a..5df0e04 100644 --- a/tests/config.py +++ b/tests/config.py @@ -76,7 +76,7 @@ class feedly: os.environ['MY_CONFIG'] = str(tmp_path) # should not raise at least - import my.feedly + import my.rss.feedly @pytest.fixture diff --git a/tests/demo.py b/tests/demo.py index 93b777c..4dfae6d 100644 --- a/tests/demo.py +++ b/tests/demo.py @@ -54,7 +54,40 @@ def test_dynamic_config_simplenamespace(tmp_path: Path) -> None: my.config.demo = user_config # type: ignore[misc, assignment] from my.demo import config - assert config().username == 'user3' + assert config.username == 'user3' + + +# make sure our config handling pattern does it as expected +def test_attribute_handling(tmp_path: Path) -> None: + # doesn't work without it! + # because the config from test_dybamic_config_1 is cached in my.demo.demo + del sys.modules['my.demo'] + + import pytz + nytz = pytz.timezone('America/New_York') + + import my.config + class user_config: + # check that override is taken into the account + timezone = nytz + + irrelevant = 'hello' + + username = 'UUU' + data_path = f'{tmp_path}/*.json' + + + my.config.demo = user_config # type: ignore[misc, assignment] + + from my.demo import config + + assert config.username == 'UUU' + + # mypy doesn't know about it, but the attribute is there + assert getattr(config, 'irrelevant') == 'hello' + + # check that overriden default attribute is actually getting overridden + assert config.timezone == nytz diff --git a/tests/lastfm.py b/tests/lastfm.py index e94c3c5..4b37e24 100644 --- a/tests/lastfm.py +++ b/tests/lastfm.py @@ -1,3 +1,7 @@ +from my.core.cachew import disable_cachew +# TODO need something nicer and integrated inside cachew.. +disable_cachew() # meh + from more_itertools import ilen from my.lastfm import scrobbles @@ -5,3 +9,9 @@ from my.lastfm import scrobbles def test(): assert ilen(scrobbles()) > 1000 + + +def test_datetime_ascending(): + from more_itertools import pairwise + for a, b in pairwise(scrobbles()): + assert a.dt <= b.dt