Merge pull request #48 from karlicoss/configuration

lastfmupdates: docs, lastfm, rss module
This commit is contained in:
karlicoss 2020-05-13 23:07:50 +01:00 committed by GitHub
commit d0427855e8
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
18 changed files with 358 additions and 115 deletions

View file

@ -224,7 +224,7 @@ My conclusion was using a *combined approach*:
Inheritance is a standard mechanism, which doesn't require any extra frameworks and plays well with other Python concepts. As a specific example: Inheritance is a standard mechanism, which doesn't require any extra frameworks and plays well with other Python concepts. As a specific example:
,#+begin_src python #+begin_src python
from my.config import bluemaestro as user_config from my.config import bluemaestro as user_config
@dataclass @dataclass
@ -256,24 +256,27 @@ I claim this solves pretty much everything:
- *(6)*: the dataclass header is easily readable, and it's possible to generate the docs automatically - *(6)*: the dataclass header is easily readable, and it's possible to generate the docs automatically
Downsides: Downsides:
- inheriting from ~user_config~ means early import of =my.config= - inheriting from ~user_config~ means an early import of =my.config=
Generally it's better to keep everything as lazy as possible and defer loading to the first time the config is used. Generally it's better to keep everything as lazy as possible and defer loading to the first time the config is used.
This might be annoying at times, e.g. if you have a top-level import of your module, but no config. This might be annoying at times, e.g. if you have a top-level import of your module, but no config.
But considering that in 99% of cases config is going to be on the disk But considering that in 99% of cases config is going to be on the disk
and it's possible to do something dynamic like =del sys.modules['my.bluemaestro']= to reload the config, I think it's a minor issue. and it's [[https://github.com/karlicoss/HPI/blob/1e6e0bd381d20437343473878c7f63b1f9d6362b/tests/demo.py#L22-L25][possible]] to do something dynamic like =del sys.modules['my.bluemaestro']= to reload the config, I think it's a minor issue.
# TODO demonstrate in a test?
- =make_config= allows for some mypy false negatives in the user config - =make_config= allows for some mypy false negatives in the user config
E.g. if you forgot =export_path= attribute, mypy would miss it. But you'd have a runtime failure, and the downstream code using config is still correctly type checked. E.g. if you forgot =export_path= attribute, mypy would miss it. But you'd have a runtime failure, and the downstream code using config is still correctly type checked.
Perhaps it will be better when [[https://github.com/python/mypy/issues/5374][this]] is fixed. Perhaps it will be better when [[https://github.com/python/mypy/issues/5374][this mypy issue]] is fixed.
- the =make_config= bit is a little scary and manual - the =make_config= bit is a little scary and manual
However, it's extracted in a generic helper, and [[https://github.com/karlicoss/HPI/blob/d6f071e3b12ba1cd5a86ad80e3821bec004e6a6d/my/twitter/archive.py#L17][ends up pretty simple]] However, it's extracted in a generic helper, and [[https://github.com/karlicoss/HPI/blob/d6f071e3b12ba1cd5a86ad80e3821bec004e6a6d/my/twitter/archive.py#L17][ends up pretty simple]]
# In addition, it's not even necessary if you don't have optional attributes, you can simply use the class variables (i.e. ~bluemaestro.export_path~)
# upd. ugh, you can't, it doesn't handle default attributes overriding correctly (see tests/demo.py)
# eh. basically all I need is class level dataclass??
- inheriting from ~user_config~ requires it to be a =class= rather than an =object= - inheriting from ~user_config~ requires it to be a =class= rather than an =object=
A practical downside is you can't use something like ~SimpleNamespace~. A practical downside is you can't use something like ~SimpleNamespace~.

View file

@ -32,6 +32,7 @@ modules = [
('reddit' , 'my.reddit' ), ('reddit' , 'my.reddit' ),
('twint' , 'my.twitter.twint' ), ('twint' , 'my.twitter.twint' ),
('twitter', 'my.twitter.archive' ), ('twitter', 'my.twitter.archive' ),
('lastfm' , 'my.lastfm' ),
] ]
def indent(s, spaces=4): def indent(s, spaces=4):
@ -105,4 +106,15 @@ for cls, p in modules:
class twitter: class twitter:
export_path: Paths # path[s]/glob to the twitter archive takeout export_path: Paths # path[s]/glob to the twitter archive takeout
#+end_src #+end_src
- [[file:../my/lastfm][my.lastfm]]
Last.fm scrobbles
#+begin_src python
class lastfm:
"""
Uses [[https://github.com/karlicoss/lastfm-backup][lastfm-backup]] outputs
"""
export_path: Paths
#+end_src
:end: :end:

View file

@ -1,10 +0,0 @@
# shared Rss stuff
from typing import NamedTuple
class Subscription(NamedTuple):
    """A single RSS feed subscription record, shared between the RSS modules."""
    # TODO date?
    title: str
    url: str
    id: str
    # True unless explicitly overridden (e.g. through _replace)
    subscribed: bool=True

View file

@ -1,5 +1,6 @@
from glob import glob as do_glob from glob import glob as do_glob
from pathlib import Path from pathlib import Path
from datetime import datetime
import functools import functools
import types import types
from typing import Union, Callable, Dict, Iterable, TypeVar, Sequence, List, Optional, Any, cast, Tuple from typing import Union, Callable, Dict, Iterable, TypeVar, Sequence, List, Optional, Any, cast, Tuple
@ -219,3 +220,28 @@ class classproperty(Generic[_R]):
# #
# def __get__(self) -> _R: # def __get__(self) -> _R:
# return self.f() # return self.f()
# TODO maybe use opaque mypy alias?
tzdatetime = datetime

# Parser for strings produced by datetime.isoformat(): the builtin classmethod when the
# interpreter provides it, otherwise the backport from .py37
fromisoformat: Callable[[str], datetime]
import sys
# Compare the whole version tuple: looking only at `.minor` would wrongly select the
# backport on any major release whose minor version happens to be below 7.
if sys.version_info >= (3, 7):
    # prevent mypy on py3.6 from complaining...
    fromisoformat_real = datetime.fromisoformat # type: ignore[attr-defined]
    fromisoformat = fromisoformat_real
else:
    from .py37 import fromisoformat
# TODO doctests?
def isoparse(s: str) -> tzdatetime:
    """
    Parses timestamps formatted like 2020-05-01T10:32:02.925961Z

    Only timestamps carrying the trailing 'Z' (UTC) designator are supported.

    :param s: timestamp string ending with 'Z'
    :return: the datetime produced by fromisoformat for s with 'Z' replaced by '+00:00'
    :raises ValueError: if s does not end with 'Z'
    """
    # TODO could use dateutil? but it's quite slow as far as I remember..
    # TODO support non-utc.. somehow?
    # Explicit check instead of assert: asserts are stripped under `python -O`, and the
    # slicing below would then silently chop the last character off any other input.
    if not s.endswith('Z'):
        raise ValueError(f"expected a UTC timestamp ending with 'Z', got {s!r}")
    s = s[:-1] + '+00:00'
    return fromisoformat(s)

122
my/core/py37.py Normal file
View file

@ -0,0 +1,122 @@
# borrowed from /usr/lib/python3.7/datetime.py
from datetime import datetime, timezone, timedelta
def _parse_isoformat_date(dtstr):
# It is assumed that this function will only be called with a
# string of length exactly 10, and (though this is not used) ASCII-only
year = int(dtstr[0:4])
if dtstr[4] != '-':
raise ValueError('Invalid date separator: %s' % dtstr[4])
month = int(dtstr[5:7])
if dtstr[7] != '-':
raise ValueError('Invalid date separator')
day = int(dtstr[8:10])
return [year, month, day]
def _parse_hh_mm_ss_ff(tstr):
# Parses things of the form HH[:MM[:SS[.fff[fff]]]]
len_str = len(tstr)
time_comps = [0, 0, 0, 0]
pos = 0
for comp in range(0, 3):
if (len_str - pos) < 2:
raise ValueError('Incomplete time component')
time_comps[comp] = int(tstr[pos:pos+2])
pos += 2
next_char = tstr[pos:pos+1]
if not next_char or comp >= 2:
break
if next_char != ':':
raise ValueError('Invalid time separator: %c' % next_char)
pos += 1
if pos < len_str:
if tstr[pos] != '.':
raise ValueError('Invalid microsecond component')
else:
pos += 1
len_remainder = len_str - pos
if len_remainder not in (3, 6):
raise ValueError('Invalid microsecond component')
time_comps[3] = int(tstr[pos:])
if len_remainder == 3:
time_comps[3] *= 1000
return time_comps
def _parse_isoformat_time(tstr):
    """
    Parse 'HH[:MM[:SS[.fff[fff]]]][+HH:MM[:SS[.ffffff]]]'.

    Returns ``[hour, minute, second, microsecond, tzinfo]``, where tzinfo is None
    when the string carries no UTC offset. Raises ValueError on malformed input.
    """
    if len(tstr) < 2:
        raise ValueError('Isoformat time too short')
    # 1-based position of the first '-' if there is one, otherwise of the first '+';
    # 0 when the string has neither (cheaper than a regex search for [+-])
    sign_at = tstr.find('-') + 1
    if not sign_at:
        sign_at = tstr.find('+') + 1
    has_offset = sign_at > 0
    clock_text = tstr[:sign_at - 1] if has_offset else tstr
    parsed = _parse_hh_mm_ss_ff(clock_text)
    if not has_offset:
        parsed.append(None)
        return parsed
    offset_text = tstr[sign_at:]
    # Valid time zone strings are:
    # HH:MM len: 5
    # HH:MM:SS len: 8
    # HH:MM:SS.ffffff len: 15
    if len(offset_text) not in (5, 8, 15):
        raise ValueError('Malformed time zone string')
    hh, mm, ss, us = _parse_hh_mm_ss_ff(offset_text)
    if hh == 0 and mm == 0 and ss == 0 and us == 0:
        zone = timezone.utc
    else:
        delta = timedelta(hours=hh, minutes=mm, seconds=ss, microseconds=us)
        zone = timezone(-delta if tstr[sign_at - 1] == '-' else delta)
    parsed.append(zone)
    return parsed
def fromisoformat(date_string, cls=datetime):
    """Construct a datetime from the output of datetime.isoformat()."""
    if not isinstance(date_string, str):
        raise TypeError('fromisoformat: argument must be str')
    # First 10 characters are the date; character 10 is the separator and is skipped
    date_part, time_part = date_string[0:10], date_string[11:]
    try:
        fields = _parse_isoformat_date(date_part)
    except ValueError:
        raise ValueError('Invalid isoformat string: %s' % date_string)
    if not time_part:
        # date only: midnight, no tzinfo
        clock = [0, 0, 0, 0, None]
    else:
        try:
            clock = _parse_isoformat_time(time_part)
        except ValueError:
            raise ValueError('Invalid isoformat string: %s' % date_string)
    return cls(*fields, *clock)

View file

@ -16,13 +16,8 @@ class demo(user_config):
username: str username: str
timezone: tzinfo = pytz.utc timezone: tzinfo = pytz.utc
from .core.cfg import make_config
def config() -> demo: config = make_config(demo)
from .core.cfg import make_config
config = make_config(demo)
return config
from pathlib import Path from pathlib import Path
from typing import Sequence, Iterable from typing import Sequence, Iterable
@ -40,17 +35,17 @@ class Item:
def inputs() -> Sequence[Path]: def inputs() -> Sequence[Path]:
return get_files(config().data_path) return get_files(config.data_path)
import json import json
def items() -> Iterable[Item]: def items() -> Iterable[Item]:
for f in inputs(): for f in inputs():
dt = datetime.fromtimestamp(f.stat().st_mtime, tz=config().timezone) dt = datetime.fromtimestamp(f.stat().st_mtime, tz=config.timezone)
j = json.loads(f.read_text()) j = json.loads(f.read_text())
for raw in j: for raw in j:
yield Item( yield Item(
username=config().username, username=config.username,
raw=raw, raw=raw,
dt=dt, dt=dt,
) )

View file

@ -1,36 +0,0 @@
"""
Feedbin RSS reader
"""
from .common import listify
from ._rss import Subscription
from my.config import feedbin as config
import json
from pathlib import Path
from typing import Dict, List
from datetime import datetime
from dateutil.parser import isoparse
@listify
def parse_file(f: Path):
    """Yield a Subscription for every entry of the JSON array stored in f."""
    entries = json.loads(f.read_text())
    # TODO created_at?
    yield from (
        Subscription(
            title=entry['title'],
            url=entry['site_url'],
            id=entry['id'],
        )
        for entry in entries
    )
def get_states() -> Dict[datetime, List[Subscription]]:
    """
    Map each export's timestamp to the subscriptions parsed from that export.

    The timestamp is taken from the last '_'-separated piece of the file stem.
    """
    snapshots = {}
    # TODO use get_files
    export_dir = Path(config.export_dir)
    for export in sorted(export_dir.glob('*.json')):
        stamp = export.stem.rsplit('_', 1)[-1]
        taken_at = isoparse(stamp)
        snapshots[taken_at] = parse_file(export)
    return snapshots

View file

@ -2,8 +2,21 @@
Last.fm scrobbles Last.fm scrobbles
''' '''
from ..core.common import Paths
from dataclasses import dataclass
from my.config import lastfm as user_config
@dataclass
class lastfm(user_config):
"""
Uses [[https://github.com/karlicoss/lastfm-backup][lastfm-backup]] outputs
"""
export_path: Paths
from ..core.cfg import make_config
config = make_config(lastfm)
from ..common import get_files, mcachew, Json
from datetime import datetime from datetime import datetime
import json import json
@ -12,16 +25,17 @@ from typing import NamedTuple, Any, Sequence, Iterable
import pytz import pytz
from my.config import lastfm as config from ..core.common import mcachew, Json, get_files
def inputs() -> Sequence[Path]:
return get_files(config.export_path)
# TODO memoised properties? # TODO memoised properties?
# TODO lazy mode and eager mode? # TODO lazy mode and eager mode?
# lazy is a bit nicer in terms of more flexibility and less processing? # lazy is a bit nicer in terms of more flexibility and less processing?
# eager is a bit more explicit for error handling # eager is a bit more explicit for error handling
def inputs() -> Sequence[Path]:
return get_files(config.export_path)
class Scrobble(NamedTuple): class Scrobble(NamedTuple):
raw: Json raw: Json
@ -54,5 +68,5 @@ def scrobbles() -> Iterable[Scrobble]:
last = max(inputs()) last = max(inputs())
j = json.loads(last.read_text()) j = json.loads(last.read_text())
for raw in j: for raw in reversed(j):
yield Scrobble(raw=raw) yield Scrobble(raw=raw)

View file

@ -21,6 +21,7 @@ _POLAR_DIR = Path('~').expanduser() / '.polar'
logger = LazyLogger(__name__) logger = LazyLogger(__name__)
# TODO use core.isoparse
def parse_dt(s: str) -> datetime: def parse_dt(s: str) -> datetime:
return pytz.utc.localize(datetime.strptime(s, '%Y-%m-%dT%H:%M:%S.%fZ')) return pytz.utc.localize(datetime.strptime(s, '%Y-%m-%dT%H:%M:%S.%fZ'))

View file

@ -1,29 +0,0 @@
from itertools import chain
from typing import List, Dict
from ._rss import Subscription
from . import feedbin
from . import feedly
# TODO google reader?
def get_all_subscriptions() -> List[Subscription]:
    """
    Return every feed that shows up in any feedly/feedbin snapshot, ordered by url.

    Feeds absent from the most recent snapshot are kept but marked as unsubscribed, so
    you don't try to subscribe again (or at least take into account why you unsubscribed before)
    """
    # feedbin snapshots replace feedly ones that have the same timestamp
    states = {**feedly.get_states(), **feedbin.get_states()}
    by_url: Dict[str, Subscription] = {}
    for _when, feeds in sorted(states.items()):
        for feed in feeds:
            # the earliest record of a url is the one that is kept
            by_url.setdefault(feed.url, feed)
    latest_feeds = max(states.items())[1]
    last = {feed.url: feed for feed in latest_feeds}
    return [
        feed._replace(subscribed=url in last)
        for url, feed in sorted(by_url.items())
    ]

11
my/rss/all.py Normal file
View file

@ -0,0 +1,11 @@
'''
Unified RSS data, merged from different services I used historically
'''
from typing import Iterable
from .common import Subscription, compute_subscriptions
def subscriptions() -> Iterable[Subscription]:
    """Yield the subscriptions computed from the feedbin and feedly snapshots."""
    # service modules are only imported once iteration starts
    from . import feedbin, feedly
    # TODO google reader?
    merged = compute_subscriptions(feedbin.states(), feedly.states())
    for subscription in merged:
        yield subscription

44
my/rss/common.py Normal file
View file

@ -0,0 +1,44 @@
# shared Rss stuff
from datetime import datetime
from typing import NamedTuple, Optional, List, Dict
class Subscription(NamedTuple):
    """A single RSS feed subscription record shared by the service-specific RSS modules."""
    title: str
    url: str
    id: str # TODO not sure about it...
    # eh, not all of them got reasonable 'created' time
    created_at: Optional[datetime]
    # True by default; compute_subscriptions overwrites it via _replace
    subscribed: bool=True
from typing import Iterable, Tuple, Sequence
# snapshot of subscriptions at time
SubscriptionState = Tuple[datetime, Sequence[Subscription]]
def compute_subscriptions(*sources: Iterable[SubscriptionState]) -> List[Subscription]:
    """
    Keeps track of everything I ever subscribed to.
    In addition, keeps track of unsubscribed as well (so you'd remember when and why you unsubscribed)

    :param sources: iterables of (timestamp, feeds) snapshots, typically one iterable per service
    :return: one entry per distinct feed url, ordered by url. Each entry is the record from the
        earliest snapshot containing that url, with ``subscribed`` set to whether the url is
        present in the most recent snapshot. Empty list when there are no snapshots at all.
    """
    from itertools import chain
    from operator import itemgetter
    taken_at = itemgetter(0)
    states = list(chain.from_iterable(sources))
    if not states:
        # nothing to merge; max() below would raise on an empty sequence
        return []
    # TODO keep 'source'/'provider'/'service' attribute?
    by_url: Dict[str, Subscription] = {}
    # ah. dates are used for sorting
    # Sort on the timestamp alone: comparing whole tuples falls through to the feed lists
    # when two snapshots share a timestamp, and feeds are not reliably orderable
    # (e.g. created_at may be None in one record and a datetime in another).
    for _when, state in sorted(states, key=taken_at):
        # TODO use 'when'?
        for feed in state:
            # the first (earliest) record of a url wins
            by_url.setdefault(feed.url, feed)
    _, last_state = max(states, key=taken_at)
    last_urls = {f.url for f in last_state}
    return [
        feed._replace(subscribed=url in last_urls)
        for url, feed in sorted(by_url.items())
    ]

42
my/rss/feedbin.py Normal file
View file

@ -0,0 +1,42 @@
"""
Feedbin RSS reader
"""
from my.config import feedbin as config
from pathlib import Path
from typing import Sequence
from ..core.common import listify, get_files, isoparse
from .common import Subscription
def inputs() -> Sequence[Path]:
    """Return whatever get_files resolves from the configured feedbin export_path."""
    export_files = get_files(config.export_path)
    return export_files
import json
@listify
def parse_file(f: Path):
    """Yield a Subscription for every entry of the JSON array stored in f."""
    entries = json.loads(f.read_text())
    yield from (
        Subscription(
            created_at=isoparse(entry['created_at']),
            title=entry['title'],
            url=entry['site_url'],
            id=entry['id'],
        )
        for entry in entries
    )
from typing import Iterable
from .common import SubscriptionState
def states() -> Iterable[SubscriptionState]:
    """Yield a (timestamp, subscriptions) pair for each input file."""
    # meh
    from dateutil.parser import isoparse # type: ignore
    for export in inputs():
        # TODO ugh. depends on my naming. not sure if useful?
        # the timestamp is the last '_'-separated piece of the file stem
        stamp = export.stem.rsplit('_', 1)[-1]
        yield isoparse(stamp), parse_file(export)

View file

@ -2,16 +2,20 @@
Feedly RSS reader Feedly RSS reader
""" """
from .common import listify
from ._rss import Subscription
from my.config import feedly as config from my.config import feedly as config
import json
from pathlib import Path from pathlib import Path
from typing import Dict, List from typing import Sequence
from datetime import datetime
import pytz from ..core.common import listify, get_files, isoparse
from .common import Subscription
def inputs() -> Sequence[Path]:
return get_files(config.export_path)
import json
@listify @listify
@ -22,19 +26,21 @@ def parse_file(f: Path):
rid = r['id'] rid = r['id']
website = r.get('website', rid) # meh website = r.get('website', rid) # meh
yield Subscription( yield Subscription(
# TODO created_at? created_at=None,
title=r['title'], title=r['title'],
url=website, url=website,
id=rid, id=rid,
) )
def get_states() -> Dict[datetime, List[Subscription]]:
res = {} from datetime import datetime
# TODO use get_files from typing import Iterable
for f in sorted(Path(config.export_dir).glob('*.json')): from .common import SubscriptionState
def states() -> Iterable[SubscriptionState]:
import pytz
for f in inputs():
dts = f.stem.split('_')[-1] dts = f.stem.split('_')[-1]
dt = datetime.strptime(dts, '%Y%m%d%H%M%S') dt = datetime.strptime(dts, '%Y%m%d%H%M%S')
dt = pytz.utc.localize(dt) dt = pytz.utc.localize(dt)
subs = parse_file(f) subs = parse_file(f)
res[dt] = subs yield dt, subs
return res

View file

@ -7,10 +7,9 @@ from . import twint
from . import archive from . import archive
from more_itertools import unique_everseen # TODO move to .common?
def merge_tweets(*sources): def merge_tweets(*sources):
from more_itertools import unique_everseen
yield from unique_everseen( yield from unique_everseen(
chain(*sources), chain(*sources),
key=lambda t: t.id_str, key=lambda t: t.id_str,

View file

@ -76,7 +76,7 @@ class feedly:
os.environ['MY_CONFIG'] = str(tmp_path) os.environ['MY_CONFIG'] = str(tmp_path)
# should not raise at least # should not raise at least
import my.feedly import my.rss.feedly
@pytest.fixture @pytest.fixture

View file

@ -54,7 +54,40 @@ def test_dynamic_config_simplenamespace(tmp_path: Path) -> None:
my.config.demo = user_config # type: ignore[misc, assignment] my.config.demo = user_config # type: ignore[misc, assignment]
from my.demo import config from my.demo import config
assert config().username == 'user3' assert config.username == 'user3'
# make sure our config handling pattern does it as expected
def test_attribute_handling(tmp_path: Path) -> None:
    """
    Check that attributes of a user-supplied config class are visible through my.demo.config:
    plain ones, extra ones the module doesn't declare, and ones overriding a default.
    """
    # doesn't work without it!
    # because the config from test_dynamic_config_1 is cached in my.demo.demo
    del sys.modules['my.demo']
    import pytz
    nytz = pytz.timezone('America/New_York')
    import my.config
    class user_config:
        # check that the override is taken into account
        timezone = nytz
        irrelevant = 'hello'
        username = 'UUU'
        data_path = f'{tmp_path}/*.json'
    # must be assigned before my.demo is imported below
    my.config.demo = user_config # type: ignore[misc, assignment]
    from my.demo import config
    assert config.username == 'UUU'
    # mypy doesn't know about it, but the attribute is there
    assert getattr(config, 'irrelevant') == 'hello'
    # check that the overridden default attribute is actually getting overridden
    assert config.timezone == nytz

View file

@ -1,3 +1,7 @@
from my.core.cachew import disable_cachew
# TODO need something nicer and integrated inside cachew..
disable_cachew() # meh
from more_itertools import ilen from more_itertools import ilen
from my.lastfm import scrobbles from my.lastfm import scrobbles
@ -5,3 +9,9 @@ from my.lastfm import scrobbles
def test(): def test():
assert ilen(scrobbles()) > 1000 assert ilen(scrobbles()) > 1000
def test_datetime_ascending():
    """Each scrobble's dt must be no earlier than that of the scrobble before it."""
    from more_itertools import pairwise
    for earlier, later in pairwise(scrobbles()):
        assert earlier.dt <= later.dt