Merge pull request #48 from karlicoss/configuration

lastfmupdates: docs, lastfm, rss module
2020-05-13 23:07:50 +01:00 · 2020-05-13 23:07:50 +01:00 · d0427855e8
commit d0427855e8
parent 1e6e0bd381 63d4198fd9
18 changed files with 358 additions and 115 deletions
--- a/doc/CONFIGURING.org
+++ b/doc/CONFIGURING.org
@ -224,7 +224,7 @@ My conclusion was using a *combined approach*:
 Inheritance is a standard mechanism, which doesn't require any extra frameworks and plays well with other Python concepts. As a specific example:
-,#+begin_src python
+#+begin_src python
 from my.config import bluemaestro as user_config
@dataclass
@ -256,24 +256,27 @@ I claim this solves pretty much everything:
 - *(6)*: the dataclass header is easily readable, and it's possible to generate the docs automatically
 Downsides:
- inheriting from ~user_config~ means early import of =my.config=
+- inheriting from ~user_config~ means an early import of =my.config=
  Generally it's better to keep everything as lazy as possible and defer loading to the first time the config is used.
  This might be annoying at times, e.g. if you have a top-level import of your module, but no config.
  But considering that in 99% of cases config is going to be on the disk
-  and it's possible to do something dynamic like =del sys.modules['my.bluemastro']= to reload the config, I think it's a minor issue.
+  and it's [[https://github.com/karlicoss/HPI/blob/1e6e0bd381d20437343473878c7f63b1f9d6362b/tests/demo.py#L22-L25][possible]] to do something dynamic like =del sys.modules['my.bluemaestro']= to reload the config, I think it's a minor issue.
  # TODO demonstrate in a test?
 - =make_config= allows for some mypy false negatives in the user config
  E.g. if you forgot =export_path= attribute, mypy would miss it. But you'd have a runtime failure, and the downstream code using config is still correctly type checked.
-  Perhaps it will be better when [[https://github.com/python/mypy/issues/5374][this]] is fixed.
+  Perhaps it will be better when [[https://github.com/python/mypy/issues/5374][this mypy issue]] is fixed.
 - the =make_config= bit is a little scary and manual
  However, it's extracted in a generic helper, and [[https://github.com/karlicoss/HPI/blob/d6f071e3b12ba1cd5a86ad80e3821bec004e6a6d/my/twitter/archive.py#L17][ends up pretty simple]]
  # In addition, it's not even necessary if you don't have optional attributes, you can simply use the class variables (i.e. ~bluemaestro.export_path~)
  # upd. ugh, you can't, it doesn't handle default attributes overriding correctly (see tests/demo.py)
  # eh. basically all I need is class level dataclass??
 - inheriting from ~user_config~ requires it to be a =class= rather than an =object=
  A practical downside is you can't use something like ~SimpleNamespace~.
--- a/doc/MODULES.org
+++ b/doc/MODULES.org
@ -32,6 +32,7 @@ modules = [
    ('reddit' , 'my.reddit'              ),
    ('twint'  , 'my.twitter.twint'       ),
    ('twitter', 'my.twitter.archive'     ),
    ('lastfm' , 'my.lastfm'              ),
 ]
 def indent(s, spaces=4):
@ -105,4 +106,15 @@ for cls, p in modules:
    class twitter:
        export_path: Paths # path[s]/glob to the twitter archive takeout
    #+end_src
 - [[file:../my/lastfm][my.lastfm]]
    Last.fm scrobbles
    #+begin_src python
    class lastfm:
        """
        Uses [[https://github.com/karlicoss/lastfm-backup][lastfm-backup]] outputs
        """
        export_path: Paths
    #+end_src
 :end:
--- a/my/_rss.py
+++ b/my/_rss.py
@ -1,10 +0,0 @@
 # shared Rss stuff
 from typing import NamedTuple
 class Subscription(NamedTuple):
    # TODO date?
    title: str
    url: str
    id: str
    subscribed: bool=True
--- a/my/core/common.py
+++ b/my/core/common.py
@ -1,5 +1,6 @@
 from glob import glob as do_glob
 from pathlib import Path
 from datetime import datetime
 import functools
 import types
 from typing import Union, Callable, Dict, Iterable, TypeVar, Sequence, List, Optional, Any, cast, Tuple
@ -219,3 +220,28 @@ class classproperty(Generic[_R]):
 #
 #     def __get__(self) -> _R:
 #         return self.f()
 # TODO maybe use opaque mypy alias?
 # Alias used in signatures for datetimes that are presumably expected to be timezone-aware.
 # NOTE(review): nothing enforces awareness, at runtime this is plain `datetime`
 tzdatetime = datetime
 # Parser for `datetime.isoformat()` output; bound below depending on the interpreter version
 fromisoformat: Callable[[str], datetime]
 import sys
 # Compare the whole version tuple rather than only `.minor`: the latter ignores the major
 # version, and the tuple form is the one mypy narrows on, so the stdlib branch is simply
 # skipped when checking against py3.6 (no `type: ignore` needed).
 if sys.version_info >= (3, 7):
    fromisoformat_real = datetime.fromisoformat
    fromisoformat = fromisoformat_real
 else:
    # datetime.fromisoformat is missing here, fall back to the copy vendored in py37.py
    from .py37 import fromisoformat
 # TODO doctests?
 def isoparse(s: str) -> tzdatetime:
    """
    Parses UTC timestamps formatted like 2020-05-01T10:32:02.925961Z
    The trailing 'Z' is rewritten to '+00:00' and the result is passed to fromisoformat.
    Raises AssertionError (carrying the offending string) if s doesn't end with 'Z'.
    """
    # TODO could use dateutil? but it's quite slow as far as I remember..
    # TODO support non-utc.. somehow?
    # Explicit raise rather than an `assert` statement: asserts are stripped under `python -O`,
    # and then the last character of a malformed string would be silently chopped off below.
    if not s.endswith('Z'):
        raise AssertionError(s)
    return fromisoformat(s[:-1] + '+00:00')
--- a/my/core/py37.py
+++ b/my/core/py37.py
@ -0,0 +1,122 @@
 # borrowed from /usr/lib/python3.7/datetime.py
 from datetime import datetime, timezone, timedelta
 def _parse_isoformat_date(dtstr):
    # It is assumed that this function will only be called with a
    # string of length exactly 10, and (though this is not used) ASCII-only
    year = int(dtstr[0:4])
    if dtstr[4] != '-':
        raise ValueError('Invalid date separator: %s' % dtstr[4])
    month = int(dtstr[5:7])
    if dtstr[7] != '-':
        raise ValueError('Invalid date separator')
    day = int(dtstr[8:10])
    return [year, month, day]
 def _parse_hh_mm_ss_ff(tstr):
    # Parses things of the form HH[:MM[:SS[.fff[fff]]]]
    len_str = len(tstr)
    time_comps = [0, 0, 0, 0]
    pos = 0
    for comp in range(0, 3):
        if (len_str - pos) < 2:
            raise ValueError('Incomplete time component')
        time_comps[comp] = int(tstr[pos:pos+2])
        pos += 2
        next_char = tstr[pos:pos+1]
        if not next_char or comp >= 2:
            break
        if next_char != ':':
            raise ValueError('Invalid time separator: %c' % next_char)
        pos += 1
    if pos < len_str:
        if tstr[pos] != '.':
            raise ValueError('Invalid microsecond component')
        else:
            pos += 1
            len_remainder = len_str - pos
            if len_remainder not in (3, 6):
                raise ValueError('Invalid microsecond component')
            time_comps[3] = int(tstr[pos:])
            if len_remainder == 3:
                time_comps[3] *= 1000
    return time_comps
 def _parse_isoformat_time(tstr):
    """
    Parse the time part of an isoformat string, optionally followed by a UTC offset.
    Returns [hours, minutes, seconds, microseconds, tzinfo]; tzinfo is None when there is
    no offset, timezone.utc for an all-zero offset and a fixed-offset timezone otherwise.
    Raises ValueError if the time or the offset is malformed.
    """
    # Format supported is HH[:MM[:SS[.fff[fff]]]][+HH:MM[:SS[.ffffff]]]
    if len(tstr) < 2:
        raise ValueError('Isoformat time too short')
    # Index right after the offset sign, or 0 if there is no sign ('-' is looked up first, then '+')
    after_sign = (tstr.find('-') + 1 or tstr.find('+') + 1)
    has_offset = after_sign > 0
    clock = tstr[:after_sign - 1] if has_offset else tstr
    result = _parse_hh_mm_ss_ff(clock)
    zone = None
    if has_offset:
        offset_str = tstr[after_sign:]
        # Valid time zone strings are:
        # HH:MM               len: 5
        # HH:MM:SS            len: 8
        # HH:MM:SS.ffffff     len: 15
        if len(offset_str) not in (5, 8, 15):
            raise ValueError('Malformed time zone string')
        hh, mm, ss, us = _parse_hh_mm_ss_ff(offset_str)
        if hh == 0 and mm == 0 and ss == 0 and us == 0:
            zone = timezone.utc
        else:
            sign = -1 if tstr[after_sign - 1] == '-' else 1
            delta = timedelta(hours=hh, minutes=mm, seconds=ss, microseconds=us)
            zone = timezone(sign * delta)
    result.append(zone)
    return result
 def fromisoformat(date_string, cls=datetime):
    """
    Construct a datetime from the output of datetime.isoformat().
    cls is called with the parsed components (year, month, day, hour, minute, second,
    microsecond, tzinfo) and its result is returned.
    Raises TypeError for non-str input and ValueError if the helpers reject the string.
    """
    if not isinstance(date_string, str):
        raise TypeError('fromisoformat: argument must be str')
    # First 10 chars are the date, char 10 is the separator (not inspected), the rest is the time
    date_part, time_part = date_string[0:10], date_string[11:]
    try:
        ymd = _parse_isoformat_date(date_part)
    except ValueError:
        raise ValueError('Invalid isoformat string: %s' % date_string)
    if not time_part:
        # date only: midnight, no timezone
        rest = [0, 0, 0, 0, None]
    else:
        try:
            rest = _parse_isoformat_time(time_part)
        except ValueError:
            raise ValueError('Invalid isoformat string: %s' % date_string)
    return cls(*ymd, *rest)
--- a/my/demo.py
+++ b/my/demo.py
@ -16,13 +16,8 @@ class demo(user_config):
    username: str
    timezone: tzinfo = pytz.utc
-
+from .core.cfg import make_config
-def config() -> demo:
+config = make_config(demo)
    from .core.cfg import make_config
    config = make_config(demo)
    return config
 from pathlib import Path
 from typing import Sequence, Iterable
@ -40,17 +35,17 @@ class Item:
 def inputs() -> Sequence[Path]:
-    return get_files(config().data_path)
+    return get_files(config.data_path)
 import json
 def items() -> Iterable[Item]:
    for f in inputs():
-        dt = datetime.fromtimestamp(f.stat().st_mtime, tz=config().timezone)
+        dt = datetime.fromtimestamp(f.stat().st_mtime, tz=config.timezone)
        j = json.loads(f.read_text())
        for raw in j:
            yield Item(
-                username=config().username,
+                username=config.username,
                raw=raw,
                dt=dt,
            )
--- a/my/feedbin.py
+++ b/my/feedbin.py
@ -1,36 +0,0 @@
 """
 Feedbin RSS reader
 """
 from .common import listify
 from ._rss import Subscription
 from my.config import feedbin as config
 import json
 from pathlib import Path
 from typing import Dict, List
 from datetime import datetime
 from dateutil.parser import isoparse
@listify
 def parse_file(f: Path):
    raw = json.loads(f.read_text())
    for r in raw:
        yield Subscription(
            # TODO created_at?
            title=r['title'],
            url=r['site_url'],
            id=r['id'],
        )
 def get_states() -> Dict[datetime, List[Subscription]]:
    res = {}
    # TODO use get_files
    for f in sorted(Path(config.export_dir).glob('*.json')):
        dts = f.stem.split('_')[-1]
        dt = isoparse(dts)
        subs = parse_file(f)
        res[dt] = subs
    return res
--- a/my/lastfm/init.py
+++ b/my/lastfm/init.py
@ -2,8 +2,21 @@
 Last.fm scrobbles
 '''
 from ..core.common import Paths
 from dataclasses import dataclass
 from my.config import lastfm as user_config
@dataclass
 # Typed view of the user's config: inherits whatever my.config.lastfm defines
 class lastfm(user_config):
    """
    Uses [[https://github.com/karlicoss/lastfm-backup][lastfm-backup]] outputs
    """
    # location of the exported data; passed to get_files by inputs()
    # presumably path[s]/glob, same as other Paths settings -- confirm against get_files
    export_path: Paths
 from ..core.cfg import make_config
 config = make_config(lastfm)
 from ..common import get_files, mcachew, Json
 from datetime import datetime
 import json
@ -12,16 +25,17 @@ from typing import NamedTuple, Any, Sequence, Iterable
 import pytz
-from my.config import lastfm as config
+from ..core.common import mcachew, Json, get_files
 def inputs() -> Sequence[Path]:
    return get_files(config.export_path)
 # TODO memoised properties?
 # TODO lazy mode and eager mode?
 # lazy is a bit nicer in terms of more flexibility and less processing?
 # eager is a bit more explicit for error handling
 def inputs() -> Sequence[Path]:
    return get_files(config.export_path)
 class Scrobble(NamedTuple):
    raw: Json
@ -54,5 +68,5 @@ def scrobbles() -> Iterable[Scrobble]:
    last = max(inputs())
    j = json.loads(last.read_text())
-    for raw in j:
+    for raw in reversed(j):
        yield Scrobble(raw=raw)
--- a/my/reading/polar.py
+++ b/my/reading/polar.py
@ -21,6 +21,7 @@ _POLAR_DIR = Path('~').expanduser() / '.polar'
 logger = LazyLogger(__name__)
 # TODO use core.isoparse
 def parse_dt(s: str) -> datetime:
    return pytz.utc.localize(datetime.strptime(s, '%Y-%m-%dT%H:%M:%S.%fZ'))
--- a/my/rss.py
+++ b/my/rss.py
@ -1,29 +0,0 @@
 from itertools import chain
 from typing import List, Dict
 from ._rss import Subscription
 from . import feedbin
 from . import feedly
 # TODO google reader?
 def get_all_subscriptions() -> List[Subscription]:
    """
    Keeps track of everything I ever subscribed to. It's useful to keep track of unsubscribed too
    so you don't try to subscribe again (or at least take into account why you unsubscribed before)
    """
    states = {}
    states.update(feedly.get_states())
    states.update(feedbin.get_states())
    by_url: Dict[str, Subscription] = {}
    for d, feeds in sorted(states.items()):
        for f in feeds:
            if f.url not in by_url:
                by_url[f.url] = f
    res = []
    last = {x.url: x for x in max(states.items())[1]}
    for u, x in sorted(by_url.items()):
        present = u in last
        res.append(x._replace(subscribed=present))
    return res
--- a/my/rss/all.py
+++ b/my/rss/all.py
@ -0,0 +1,11 @@
 '''
 Unified RSS data, merged from different services I used historically
 '''
 from typing import Iterable
 from .common import Subscription, compute_subscriptions
 def subscriptions() -> Iterable[Subscription]:
    """
    Yields the subscriptions merged from all supported services (currently feedbin and feedly)
    """
    # service modules are only imported once the generator is actually consumed
    from . import feedbin, feedly
    # TODO google reader?
    sources = [feedbin.states(), feedly.states()]
    yield from compute_subscriptions(*sources)
--- a/my/rss/common.py
+++ b/my/rss/common.py
@ -0,0 +1,44 @@
 # shared Rss stuff
 from datetime import datetime
 from typing import NamedTuple, Optional, List, Dict
 class Subscription(NamedTuple):
    """
    A single feed subscription, as parsed from an RSS reader export
    """
    # feed title
    title: str
    # feed/site url; compute_subscriptions uses it as the key when merging snapshots
    url: str
    id: str # TODO not sure about it...
    # eh, not all of them got reasonable 'created' time
    created_at: Optional[datetime]
    # whether the feed is still subscribed to;
    # compute_subscriptions overrides it based on the most recent snapshot
    subscribed: bool=True
 from typing import Iterable, Tuple, Sequence
 # snapshot of subscriptions at time
 SubscriptionState = Tuple[datetime, Sequence[Subscription]]
 def compute_subscriptions(*sources: Iterable[SubscriptionState]) -> List[Subscription]:
    """
    Keeps track of everything I ever subscribed to.
    In addition, keeps track of unsubscribed as well (so you'd remember when and why you unsubscribed)
    Each source yields (timestamp, feeds) snapshots. The result has one entry per url (taken from
    the oldest snapshot containing it), sorted by url, with 'subscribed' reflecting whether the
    url is present in the most recent snapshot. Returns an empty list if there are no snapshots.
    """
    from itertools import chain
    from operator import itemgetter
    timestamp = itemgetter(0)
    states = list(chain.from_iterable(sources))
    if not states:
        # nothing to merge (max() below would raise on an empty sequence)
        return []
    # TODO keep 'source'/'provider'/'service' attribute?
    by_url: Dict[str, Subscription] = {}
    # Order by the timestamp only. Sorting the bare tuples falls back to comparing the
    # subscriptions themselves when timestamps are equal, which raises TypeError
    # as soon as created_at is None in one of them and a datetime in the other.
    for _when, state in sorted(states, key=timestamp):
        # TODO use 'when'?
        for feed in state:
            # the first (i.e. oldest) occurrence of the url wins
            by_url.setdefault(feed.url, feed)
    _, last_state = max(states, key=timestamp)
    last_urls = {f.url for f in last_state}
    return [
        sub._replace(subscribed=url in last_urls)
        for url, sub in sorted(by_url.items())
    ]
--- a/my/rss/feedbin.py
+++ b/my/rss/feedbin.py
@ -0,0 +1,42 @@
 """
 Feedbin RSS reader
 """
 from my.config import feedbin as config
 from pathlib import Path
 from typing import Sequence
 from ..core.common import listify, get_files, isoparse
 from .common import Subscription
 def inputs() -> Sequence[Path]:
    """
    Files to process, resolved by get_files from the export_path setting of the feedbin config
    """
    export_path = config.export_path
    return get_files(export_path)
 import json
@listify
 def parse_file(f: Path):
    raw = json.loads(f.read_text())
    for r in raw:
        yield Subscription(
            created_at=isoparse(r['created_at']),
            title=r['title'],
            url=r['site_url'],
            id=r['id'],
        )
 from typing import Iterable
 from .common import SubscriptionState
 def states() -> Iterable[SubscriptionState]:
    """
    Yields a (timestamp, subscriptions) snapshot for every input file.
    The timestamp is parsed from the part of the file name (without extension) after the last '_'.
    """
    # meh
    # NOTE: within this function, this shadows the isoparse imported at the module level
    from dateutil.parser import isoparse # type: ignore
    for path in inputs():
        # TODO ugh. depends on my naming. not sure if useful?
        *_, stamp = path.stem.split('_')
        yield isoparse(stamp), parse_file(path)
--- a/my/rss/feedly.py
+++ b/my/rss/feedly.py
@ -2,16 +2,20 @@
 Feedly RSS reader
 """
 from .common import listify
 from ._rss import Subscription
 from my.config import feedly as config
 import json
 from pathlib import Path
-from typing import Dict, List
+from typing import Sequence
-from datetime import datetime
+
-import pytz
+from ..core.common import listify, get_files, isoparse
 from .common import Subscription
 def inputs() -> Sequence[Path]:
    return get_files(config.export_path)
 import json
@listify
@ -22,19 +26,21 @@ def parse_file(f: Path):
        rid = r['id']
        website = r.get('website', rid) # meh
        yield Subscription(
-            # TODO created_at?
+            created_at=None,
            title=r['title'],
            url=website,
            id=rid,
        )
-def get_states() -> Dict[datetime, List[Subscription]]:
+
-    res = {}
+from datetime import datetime
-    # TODO use get_files
+from typing import Iterable
-    for f in sorted(Path(config.export_dir).glob('*.json')):
+from .common import SubscriptionState
 def states() -> Iterable[SubscriptionState]:
    import pytz
    for f in inputs():
        dts = f.stem.split('_')[-1]
        dt = datetime.strptime(dts, '%Y%m%d%H%M%S')
        dt = pytz.utc.localize(dt)
        subs = parse_file(f)
-        res[dt] = subs
+        yield dt, subs
    return res
--- a/my/twitter/all.py
+++ b/my/twitter/all.py
@ -7,10 +7,9 @@ from . import twint
 from . import archive
-from more_itertools import unique_everseen
+# TODO move to .common?
 def merge_tweets(*sources):
    from more_itertools import unique_everseen
    yield from unique_everseen(
        chain(*sources),
        key=lambda t: t.id_str,
--- a/tests/config.py
+++ b/tests/config.py
@ -76,7 +76,7 @@ class feedly:
    os.environ['MY_CONFIG'] = str(tmp_path)
    # should not raise at least
-    import my.feedly
+    import my.rss.feedly
@pytest.fixture
--- a/tests/demo.py
+++ b/tests/demo.py
@ -54,7 +54,40 @@ def test_dynamic_config_simplenamespace(tmp_path: Path) -> None:
    my.config.demo = user_config # type: ignore[misc, assignment]
    from my.demo import config
-    assert config().username == 'user3'
+    assert config.username == 'user3'
 # make sure our config handling pattern does it as expected
 def test_attribute_handling(tmp_path: Path) -> None:
    """
    Checks that a user config class assigned to my.config.demo is picked up by my.demo:
    required attributes, extra attributes and overridden defaults are all visible on config.
    """
    # doesn't work without it!
    # because the config from an earlier dynamic config test is cached in my.demo.demo
    del sys.modules['my.demo']
    import pytz
    nytz = pytz.timezone('America/New_York')
    import my.config
    class user_config:
        # check that override is taken into the account
        timezone = nytz
        irrelevant = 'hello'
        username = 'UUU'
        data_path = f'{tmp_path}/*.json'
    my.config.demo = user_config # type: ignore[misc, assignment]
    # has to be imported after the config is set up: my.demo builds its config at import time
    from my.demo import config
    assert config.username == 'UUU'
    # mypy doesn't know about it, but the attribute is there
    assert getattr(config, 'irrelevant') == 'hello'
    # check that overridden default attribute is actually getting overridden
    assert config.timezone == nytz
--- a/tests/lastfm.py
+++ b/tests/lastfm.py
@ -1,3 +1,7 @@
 from my.core.cachew import disable_cachew
 # TODO need something nicer and integrated inside cachew..
 disable_cachew()  # meh
 from more_itertools import ilen
 from my.lastfm import scrobbles
@ -5,3 +9,9 @@ from my.lastfm import scrobbles
 def test():
    # sanity check: the export is expected to contain more than a thousand scrobbles
    total = ilen(scrobbles())
    assert total > 1000
 def test_datetime_ascending():
    # scrobbles are expected to come out in chronological order (oldest first)
    from more_itertools import pairwise
    for earlier, later in pairwise(scrobbles()):
        assert earlier.dt <= later.dt