Merge pull request #48 from karlicoss/configuration

lastfmupdates: docs, lastfm, rss module
2020-05-13 23:07:50 +01:00 · 2020-05-13 23:07:50 +01:00 · d0427855e8
commit d0427855e8
parent 1e6e0bd381 63d4198fd9
18 changed files with 358 additions and 115 deletions
--- a/doc/CONFIGURING.org
+++ b/doc/CONFIGURING.org
@ -224,7 +224,7 @@ My conclusion was using a *combined approach*:

 Inheritance is a standard mechanism, which doesn't require any extra frameworks and plays well with other Python concepts. As a specific example:

-,#+begin_src python
+#+begin_src python
 from my.config import bluemaestro as user_config

@dataclass
@ -256,24 +256,27 @@ I claim this solves pretty much everything:
 - *(6)*: the dataclass header is easily readable, and it's possible to generate the docs automatically

 Downsides:
- inheriting from ~user_config~ means early import of =my.config=
+- inheriting from ~user_config~ means an early import of =my.config=

  Generally it's better to keep everything as lazy as possible and defer loading to the first time the config is used.
  This might be annoying at times, e.g. if you have a top-level import of you module, but no config.

  But considering that in 99% of cases config is going to be on the disk
-  and it's possible to do something dynamic like =del sys.modules['my.bluemastro']= to reload the config, I think it's a minor issue.
-  # TODO demonstrate in a test?
+  and it's [[https://github.com/karlicoss/HPI/blob/1e6e0bd381d20437343473878c7f63b1f9d6362b/tests/demo.py#L22-L25][possible]] to do something dynamic like =del sys.modules['my.bluemaestro']= to reload the config, I think it's a minor issue.

 - =make_config= allows for some mypy false negatives in the user config

  E.g. if you forgot =export_path= attribute, mypy would miss it. But you'd have a runtime failure, and the downstream code using config is still correctly type checked.

-  Perhaps it will be better when [[https://github.com/python/mypy/issues/5374][this]] is fixed.
+  Perhaps it will be better when [[https://github.com/python/mypy/issues/5374][this mypy issue]] is fixed.
 - the =make_config= bit is a little scary and manual

  However, it's extracted in a generic helper, and [[https://github.com/karlicoss/HPI/blob/d6f071e3b12ba1cd5a86ad80e3821bec004e6a6d/my/twitter/archive.py#L17][ends up pretty simple]]

+  # In addition, it's not even necessary if you don't have optional attributes, you can simply use the class variables (i.e. ~bluemaestro.export_path~)
+  # upd. ugh, you can't, it doesn't handle default attributes overriding correctly (see tests/demo.py)
+  # eh. basically all I need is class level dataclass??
+
 - inheriting from ~user_config~ requires it to be a =class= rather than an =object=

  A practical downside is you can't use something like ~SimpleNamespace~.
--- a/doc/MODULES.org
+++ b/doc/MODULES.org
@ -32,6 +32,7 @@ modules = [
    ('reddit' , 'my.reddit'              ),
    ('twint'  , 'my.twitter.twint'       ),
    ('twitter', 'my.twitter.archive'     ),
+    ('lastfm' , 'my.lastfm'              ),
 ]

 def indent(s, spaces=4):
@ -105,4 +106,15 @@ for cls, p in modules:
    class twitter:
        export_path: Paths # path[s]/glob to the twitter archive takeout
    #+end_src
+- [[file:../my/lastfm][my.lastfm]]
+
+    Last.fm scrobbles
+
+    #+begin_src python
+    class lastfm:
+        """
+        Uses [[https://github.com/karlicoss/lastfm-backup][lastfm-backup]] outputs
+        """
+        export_path: Paths
+    #+end_src
 :end:
--- a/my/_rss.py
+++ b/my/_rss.py
@ -1,10 +0,0 @@
-# shared Rss stuff
-from typing import NamedTuple
-
-class Subscription(NamedTuple):
-    # TODO date?
-    title: str
-    url: str
-    id: str
-    subscribed: bool=True
-
--- a/my/core/common.py
+++ b/my/core/common.py
@ -1,5 +1,6 @@
 from glob import glob as do_glob
 from pathlib import Path
+from datetime import datetime
 import functools
 import types
 from typing import Union, Callable, Dict, Iterable, TypeVar, Sequence, List, Optional, Any, cast, Tuple
@ -219,3 +220,28 @@ class classproperty(Generic[_R]):
 #
 #     def __get__(self) -> _R:
 #         return self.f()
+
+# Alias used in signatures to flag datetimes that are presumably timezone-aware.
+# It is the very same class as datetime, so nothing is enforced at runtime.
+# TODO maybe use opaque mypy alias?
+tzdatetime = datetime
+
+
+# datetime.fromisoformat is only used on Python 3.7+; older interpreters fall back
+# to the copy bundled in .py37
+fromisoformat: Callable[[str], datetime]
+import sys
+# compare the whole version tuple: looking at .minor alone would pick the wrong
+# branch on any future major version whose minor number is below 7
+if sys.version_info >= (3, 7):
+    # prevent mypy on py3.6 from complaining...
+    fromisoformat_real = datetime.fromisoformat # type: ignore[attr-defined]
+    fromisoformat = fromisoformat_real
+else:
+    from .py37 import fromisoformat
+
+
+# TODO doctests?
+def isoparse(s: str) -> tzdatetime:
+    """
+    Parses timestamps formatted like 2020-05-01T10:32:02.925961Z
+
+    Only the UTC 'Z' suffix is supported: it is rewritten to '+00:00' before the
+    string is handed to fromisoformat.
+
+    Raises ValueError if s does not end with 'Z'.
+    """
+    # TODO could use dateutil? but it's quite slow as far as I remember..
+    # TODO support non-utc.. somehow?
+    if not s.endswith('Z'):
+        # explicit check instead of assert: asserts are stripped under python -O,
+        # and the slice below would then silently drop a real character
+        raise ValueError(f'expected a timestamp ending with Z, got: {s!r}')
+    s = s[:-1] + '+00:00'
+    return fromisoformat(s)
--- a/my/core/py37.py
+++ b/my/core/py37.py
@ -0,0 +1,122 @@
+# borrowed from /usr/lib/python3.7/datetime.py
+from datetime import datetime, timezone, timedelta
+
+def _parse_isoformat_date(dtstr):
+    """
+    Parse a 'YYYY-MM-DD' string.
+
+    Returns [year, month, day] as a list of ints.
+    Raises ValueError if a separator is not '-' or a field is not a valid integer.
+    """
+    # It is assumed that this function will only be called with a
+    # string of length exactly 10, and (though this is not used) ASCII-only
+    year = int(dtstr[0:4])
+    if dtstr[4] != '-':
+        raise ValueError('Invalid date separator: %s' % dtstr[4])
+
+    month = int(dtstr[5:7])
+
+    if dtstr[7] != '-':
+        raise ValueError('Invalid date separator')
+
+    day = int(dtstr[8:10])
+
+    return [year, month, day]
+
+
+def _parse_hh_mm_ss_ff(tstr):
+    """
+    Parse a time (or UTC offset) string into [hour, minute, second, microsecond].
+
+    Components that are absent from tstr stay 0.
+    Raises ValueError on a truncated component, a bad separator, or a fractional
+    part that is not exactly 3 or 6 characters long.
+    """
+    # Parses things of the form HH[:MM[:SS[.fff[fff]]]]
+    len_str = len(tstr)
+
+    time_comps = [0, 0, 0, 0]
+    pos = 0
+    for comp in range(0, 3):
+        if (len_str - pos) < 2:
+            raise ValueError('Incomplete time component')
+
+        time_comps[comp] = int(tstr[pos:pos+2])
+
+        pos += 2
+        # slicing (rather than indexing) yields '' at the end of the string instead of raising
+        next_char = tstr[pos:pos+1]
+
+        # stop at the end of input, or once the third component (seconds) has been read
+        if not next_char or comp >= 2:
+            break
+
+        if next_char != ':':
+            raise ValueError('Invalid time separator: %c' % next_char)
+
+        pos += 1
+
+    # anything left over has to be a fractional part introduced by '.'
+    if pos < len_str:
+        if tstr[pos] != '.':
+            raise ValueError('Invalid microsecond component')
+        else:
+            pos += 1
+
+            len_remainder = len_str - pos
+            if len_remainder not in (3, 6):
+                raise ValueError('Invalid microsecond component')
+
+            time_comps[3] = int(tstr[pos:])
+            # a 3-character fraction is milliseconds: scale it up to microseconds
+            if len_remainder == 3:
+                time_comps[3] *= 1000
+
+    return time_comps
+
+
+def _parse_isoformat_time(tstr):
+    """
+    Parse the time part of an ISO timestamp, including an optional UTC offset.
+
+    Returns [hour, minute, second, microsecond, tzinfo]; tzinfo is None when tstr
+    carries no offset, timezone.utc for an all-zero offset, and a fixed-offset
+    timezone otherwise.
+    Raises ValueError if the time or the offset is malformed.
+    """
+    # Format supported is HH[:MM[:SS[.fff[fff]]]][+HH:MM[:SS[.ffffff]]]
+    len_str = len(tstr)
+    if len_str < 2:
+        raise ValueError('Isoformat time too short')
+
+    # This is equivalent to re.search('[+-]', tstr), but faster
+    # find() returns -1 when the sign is absent, so '+ 1' gives 0 (falsy) and 'or'
+    # falls through; tz_pos ends up as the index right after the sign, or 0 if none
+    tz_pos = (tstr.find('-') + 1 or tstr.find('+') + 1)
+    timestr = tstr[:tz_pos-1] if tz_pos > 0 else tstr
+
+    time_comps = _parse_hh_mm_ss_ff(timestr)
+
+    tzi = None
+    if tz_pos > 0:
+        tzstr = tstr[tz_pos:]
+
+        # Valid time zone strings are:
+        # HH:MM               len: 5
+        # HH:MM:SS            len: 8
+        # HH:MM:SS.ffffff     len: 15
+
+        if len(tzstr) not in (5, 8, 15):
+            raise ValueError('Malformed time zone string')
+
+        # the offset has the same HH:MM[:SS[.ffffff]] shape, so the time parser is reused
+        tz_comps = _parse_hh_mm_ss_ff(tzstr)
+        if all(x == 0 for x in tz_comps):
+            tzi = timezone.utc
+        else:
+            # the character just before tz_pos is the sign that was located above
+            tzsign = -1 if tstr[tz_pos - 1] == '-' else 1
+
+            td = timedelta(hours=tz_comps[0], minutes=tz_comps[1],
+                           seconds=tz_comps[2], microseconds=tz_comps[3])
+
+            tzi = timezone(tzsign * td)
+
+    time_comps.append(tzi)
+
+    return time_comps
+
+def fromisoformat(date_string, cls=datetime):
+    """Construct a datetime from the output of datetime.isoformat().
+
+    date_string must be a str, otherwise TypeError is raised.
+    cls is called as cls(year, month, day, hour, minute, second, microsecond, tzinfo)
+    to build the result.
+    Raises ValueError('Invalid isoformat string: ...') when the date or time part
+    fails to parse.
+    """
+    if not isinstance(date_string, str):
+        raise TypeError('fromisoformat: argument must be str')
+
+    # Split this at the separator
+    # the character at index 10 (the date/time separator) is skipped without being checked
+    dstr = date_string[0:10]
+    tstr = date_string[11:]
+
+    # NOTE(review): only ValueError is translated here; a string shorter than a full
+    # date looks like it could surface IndexError from the date parser instead -- confirm
+    try:
+        date_components = _parse_isoformat_date(dstr)
+    except ValueError:
+        raise ValueError('Invalid isoformat string: %s' % date_string)
+
+    if tstr:
+        try:
+            time_components = _parse_isoformat_time(tstr)
+        except ValueError:
+            raise ValueError('Invalid isoformat string: %s' % date_string)
+    else:
+        # date-only input: midnight, no tzinfo
+        time_components = [0, 0, 0, 0, None]
+
+    return cls(*(date_components + time_components))
--- a/my/demo.py
+++ b/my/demo.py
@ -16,13 +16,8 @@ class demo(user_config):
    username: str
    timezone: tzinfo = pytz.utc

-
-def config() -> demo:
-    from .core.cfg import make_config
-    config = make_config(demo)
-    return config
-
-
+from .core.cfg import make_config
+config = make_config(demo)

 from pathlib import Path
 from typing import Sequence, Iterable
@ -40,17 +35,17 @@ class Item:


 def inputs() -> Sequence[Path]:
-    return get_files(config().data_path)
+    return get_files(config.data_path)


 import json
 def items() -> Iterable[Item]:
    for f in inputs():
-        dt = datetime.fromtimestamp(f.stat().st_mtime, tz=config().timezone)
+        dt = datetime.fromtimestamp(f.stat().st_mtime, tz=config.timezone)
        j = json.loads(f.read_text())
        for raw in j:
            yield Item(
-                username=config().username,
+                username=config.username,
                raw=raw,
                dt=dt,
            )
--- a/my/feedbin.py
+++ b/my/feedbin.py
@ -1,36 +0,0 @@
-"""
-Feedbin RSS reader
-"""
-
-from .common import listify
-from ._rss import Subscription
-
-from my.config import feedbin as config
-
-import json
-from pathlib import Path
-from typing import Dict, List
-from datetime import datetime
-from dateutil.parser import isoparse
-
-
-@listify
-def parse_file(f: Path):
-    raw = json.loads(f.read_text())
-    for r in raw:
-        yield Subscription(
-            # TODO created_at?
-            title=r['title'],
-            url=r['site_url'],
-            id=r['id'],
-        )
-
-def get_states() -> Dict[datetime, List[Subscription]]:
-    res = {}
-    # TODO use get_files
-    for f in sorted(Path(config.export_dir).glob('*.json')):
-        dts = f.stem.split('_')[-1]
-        dt = isoparse(dts)
-        subs = parse_file(f)
-        res[dt] = subs
-    return res
--- a/my/lastfm/init.py
+++ b/my/lastfm/init.py
@ -2,8 +2,21 @@
 Last.fm scrobbles
 '''

+from ..core.common import Paths
+from dataclasses import dataclass
+from my.config import lastfm as user_config
+
+@dataclass
+class lastfm(user_config):
+    """
+    Uses [[https://github.com/karlicoss/lastfm-backup][lastfm-backup]] outputs
+    """
+    # resolved through get_files() by inputs(); the newest match is read as JSON
+    export_path: Paths
+
+
+from ..core.cfg import make_config
+config = make_config(lastfm)

-from ..common import get_files, mcachew, Json

 from datetime import datetime
 import json
@ -12,16 +25,17 @@ from typing import NamedTuple, Any, Sequence, Iterable

 import pytz

-from my.config import lastfm as config
+from ..core.common import mcachew, Json, get_files
+
+def inputs() -> Sequence[Path]:
+    """Resolve the configured export_path into concrete files via get_files."""
+    return get_files(config.export_path)
+

 # TODO memoised properties?
 # TODO lazy mode and eager mode?
 # lazy is a bit nicer in terms of more flexibility and less processing?
 # eager is a bit more explicit for error handling

-def inputs() -> Sequence[Path]:
-    return get_files(config.export_path)
-

 class Scrobble(NamedTuple):
    raw: Json
@ -54,5 +68,5 @@ def scrobbles() -> Iterable[Scrobble]:
    last = max(inputs())
    j = json.loads(last.read_text())

-    for raw in j:
+    for raw in reversed(j):
        yield Scrobble(raw=raw)
--- a/my/reading/polar.py
+++ b/my/reading/polar.py
@ -21,6 +21,7 @@ _POLAR_DIR = Path('~').expanduser() / '.polar'
 logger = LazyLogger(__name__)


+# TODO use core.isoparse
 def parse_dt(s: str) -> datetime:
    return pytz.utc.localize(datetime.strptime(s, '%Y-%m-%dT%H:%M:%S.%fZ'))

--- a/my/rss.py
+++ b/my/rss.py
@ -1,29 +0,0 @@
-from itertools import chain
-from typing import List, Dict
-
-from ._rss import Subscription
-
-from . import feedbin
-from . import feedly
-# TODO google reader?
-
-
-def get_all_subscriptions() -> List[Subscription]:
-    """
-    Keeps track of everything I ever subscribed to. It's useful to keep track of unsubscribed too
-    so you don't try to subscribe again (or at least take into account why you unsubscribed before)
-    """
-    states = {}
-    states.update(feedly.get_states())
-    states.update(feedbin.get_states())
-    by_url: Dict[str, Subscription] = {}
-    for d, feeds in sorted(states.items()):
-        for f in feeds:
-            if f.url not in by_url:
-                by_url[f.url] = f
-    res = []
-    last = {x.url: x for x in max(states.items())[1]}
-    for u, x in sorted(by_url.items()):
-        present = u in last
-        res.append(x._replace(subscribed=present))
-    return res
--- a/my/rss/all.py
+++ b/my/rss/all.py
@ -0,0 +1,11 @@
+'''
+Unified RSS data, merged from different services I used historically
+'''
+from typing import Iterable
+from .common import Subscription, compute_subscriptions
+
+
+def subscriptions() -> Iterable[Subscription]:
+    """Yield the subscriptions computed from the feedbin and feedly snapshots combined."""
+    # imported inside the function, so the provider modules are only loaded when this is called
+    from . import feedbin, feedly
+    # TODO google reader?
+    yield from compute_subscriptions(feedbin.states(), feedly.states())
--- a/my/rss/common.py
+++ b/my/rss/common.py
@ -0,0 +1,44 @@
+# shared Rss stuff
+from datetime import datetime
+from typing import NamedTuple, Optional, List, Dict
+
+
+class Subscription(NamedTuple):
+    """A single feed subscription as parsed from a provider's export."""
+    title: str
+    url: str
+    id: str # TODO not sure about it...
+    # eh, not all of them got reasonable 'created' time
+    created_at: Optional[datetime]
+    # overwritten by compute_subscriptions: True iff the url is in the most recent snapshot
+    subscribed: bool=True
+
+from typing import Iterable, Tuple, Sequence
+
+# snapshot of subscriptions at time
+SubscriptionState = Tuple[datetime, Sequence[Subscription]]
+
+
+def compute_subscriptions(*sources: Iterable[SubscriptionState]) -> List[Subscription]:
+    """
+    Keeps track of everything I ever subscribed to.
+    In addition, keeps track of unsubscribed as well (so you'd remember when and why you unsubscribed)
+
+    Each source yields (timestamp, subscriptions) snapshots. For every feed url the
+    entry from the earliest snapshot containing it is kept, and its 'subscribed' flag
+    is set to whether the url still appears in the most recent snapshot.
+
+    Returns the subscriptions ordered by url; an empty list if there are no snapshots.
+    """
+    from itertools import chain
+    from operator import itemgetter
+    states = list(chain.from_iterable(sources))
+    # TODO keep 'source'/'provider'/'service' attribute?
+    if not states:
+        # nothing to merge; max() below would raise ValueError on an empty sequence
+        return []
+
+    snapshot_time = itemgetter(0)
+
+    by_url: Dict[str, Subscription] = {}
+    # order by the timestamp only: comparing whole tuples would fall back to comparing
+    # the subscription sequences whenever two snapshots share a timestamp, which can
+    # raise TypeError (e.g. created_at being None in one entry and a datetime in another)
+    for when, state in sorted(states, key=snapshot_time):
+        # TODO use 'when'?
+        for feed in state:
+            # first occurrence wins, i.e. the one from the earliest snapshot
+            by_url.setdefault(feed.url, feed)
+
+    _, last_state = max(states, key=snapshot_time)
+    last_urls = {f.url for f in last_state}
+
+    return [
+        by_url[url]._replace(subscribed=url in last_urls)
+        for url in sorted(by_url)
+    ]
--- a/my/rss/feedbin.py
+++ b/my/rss/feedbin.py
@ -0,0 +1,42 @@
+"""
+Feedbin RSS reader
+"""
+
+from my.config import feedbin as config
+
+from pathlib import Path
+from typing import Sequence
+
+from ..core.common import listify, get_files, isoparse
+from .common import Subscription
+
+
+def inputs() -> Sequence[Path]:
+    """Resolve the configured Feedbin export_path into concrete files via get_files."""
+    return get_files(config.export_path)
+
+
+import json
+
+@listify
+def parse_file(f: Path):
+    """
+    Parse one Feedbin export file (JSON) and yield a Subscription per entry.
+
+    presumably @listify collects the yielded items into a list -- confirm against core.common
+    """
+    raw = json.loads(f.read_text())
+    for r in raw:
+        yield Subscription(
+            # module-level isoparse from core.common: only accepts timestamps ending in 'Z'
+            created_at=isoparse(r['created_at']),
+            title=r['title'],
+            url=r['site_url'],
+            id=r['id'],
+        )
+
+
+from typing import Iterable
+from .common import SubscriptionState
+def states() -> Iterable[SubscriptionState]:
+    """
+    Yield a (timestamp, subscriptions) snapshot for every input file.
+
+    The timestamp is taken from the file name: the part of the stem after the last '_'.
+    """
+    # meh
+    # NOTE: inside this function the dateutil parser shadows the module-level isoparse
+    # imported from core.common (parse_file still uses the core one)
+    from dateutil.parser import isoparse # type: ignore
+    for f in inputs():
+        # TODO ugh. depends on my naming. not sure if useful?
+        dts = f.stem.split('_')[-1]
+        dt = isoparse(dts)
+        subs = parse_file(f)
+        yield dt, subs
--- a/my/rss/feedly.py
+++ b/my/rss/feedly.py
@ -2,16 +2,20 @@
 Feedly RSS reader
 """

-from .common import listify
-from ._rss import Subscription
-
 from my.config import feedly as config

-import json
 from pathlib import Path
-from typing import Dict, List
-from datetime import datetime
-import pytz
+from typing import Sequence
+
+from ..core.common import listify, get_files, isoparse
+from .common import Subscription
+
+
+def inputs() -> Sequence[Path]:
+    """Resolve the configured Feedly export_path into concrete files via get_files."""
+    return get_files(config.export_path)
+
+
+import json


@listify
@ -22,19 +26,21 @@ def parse_file(f: Path):
        rid = r['id']
        website = r.get('website', rid) # meh
        yield Subscription(
-            # TODO created_at?
+            created_at=None,
            title=r['title'],
            url=website,
            id=rid,
        )

-def get_states() -> Dict[datetime, List[Subscription]]:
-    res = {}
-    # TODO use get_files
-    for f in sorted(Path(config.export_dir).glob('*.json')):
+
+from datetime import datetime
+from typing import Iterable
+from .common import SubscriptionState
+def states() -> Iterable[SubscriptionState]:
+    import pytz
+    for f in inputs():
        dts = f.stem.split('_')[-1]
        dt = datetime.strptime(dts, '%Y%m%d%H%M%S')
        dt = pytz.utc.localize(dt)
        subs = parse_file(f)
-        res[dt] = subs
-    return res
+        yield dt, subs
--- a/my/twitter/all.py
+++ b/my/twitter/all.py
@ -7,10 +7,9 @@ from . import twint
 from . import archive


-from more_itertools import unique_everseen
-
-
+# TODO move to .common?
 def merge_tweets(*sources):
+    from more_itertools import unique_everseen
    yield from unique_everseen(
        chain(*sources),
        key=lambda t: t.id_str,
--- a/tests/config.py
+++ b/tests/config.py
@ -76,7 +76,7 @@ class feedly:
    os.environ['MY_CONFIG'] = str(tmp_path)

    # should not raise at least
-    import my.feedly
+    import my.rss.feedly


@pytest.fixture
--- a/tests/demo.py
+++ b/tests/demo.py
@ -54,7 +54,40 @@ def test_dynamic_config_simplenamespace(tmp_path: Path) -> None:
    my.config.demo = user_config # type: ignore[misc, assignment]

    from my.demo import config
-    assert config().username == 'user3'
+    assert config.username == 'user3'
+
+
+# make sure our config handling pattern does it as expected
+def test_attribute_handling(tmp_path: Path) -> None:
+    """Check that user config attributes (including overridden defaults) reach my.demo.config."""
+    # doesn't work without it!
+    # because the config from test_dynamic_config_1 is cached in my.demo.demo
+    del sys.modules['my.demo']
+
+    import pytz
+    nytz = pytz.timezone('America/New_York')
+
+    import my.config
+    class user_config:
+        # check that the override is taken into account
+        timezone = nytz
+
+        irrelevant = 'hello'
+
+        username = 'UUU'
+        data_path = f'{tmp_path}/*.json'
+
+
+    my.config.demo = user_config # type: ignore[misc, assignment]
+
+    from my.demo import config
+
+    assert config.username == 'UUU'
+
+    # mypy doesn't know about it, but the attribute is there
+    assert getattr(config, 'irrelevant') == 'hello'
+
+    # check that the overridden default attribute is actually getting overridden
+    assert config.timezone == nytz



--- a/tests/lastfm.py
+++ b/tests/lastfm.py
@ -1,3 +1,7 @@
+from my.core.cachew import disable_cachew
+# TODO need something nicer and integrated inside cachew..
+disable_cachew()  # meh
+
 from more_itertools import ilen

 from my.lastfm import scrobbles
@ -5,3 +9,9 @@ from my.lastfm import scrobbles

 def test():
    assert ilen(scrobbles()) > 1000
+
+
+def test_datetime_ascending():
+    """Scrobbles must come out in non-decreasing datetime order."""
+    from more_itertools import pairwise
+    for a, b in pairwise(scrobbles()):
+        assert a.dt <= b.dt