Merge pull request #48 from karlicoss/configuration

lastfmupdates: docs, lastfm, rss module
This commit is contained in:
karlicoss 2020-05-13 23:07:50 +01:00 committed by GitHub
commit d0427855e8
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
18 changed files with 358 additions and 115 deletions

View file

@ -224,7 +224,7 @@ My conclusion was using a *combined approach*:
Inheritance is a standard mechanism, which doesn't require any extra frameworks and plays well with other Python concepts. As a specific example:
,#+begin_src python
#+begin_src python
from my.config import bluemaestro as user_config
@dataclass
@ -256,24 +256,27 @@ I claim this solves pretty much everything:
- *(6)*: the dataclass header is easily readable, and it's possible to generate the docs automatically
Downsides:
- inheriting from ~user_config~ means early import of =my.config=
- inheriting from ~user_config~ means an early import of =my.config=
Generally it's better to keep everything as lazy as possible and defer loading to the first time the config is used.
This might be annoying at times, e.g. if you have a top-level import of your module, but no config.
But considering that in 99% of cases config is going to be on the disk
and it's possible to do something dynamic like =del sys.modules['my.bluemastro']= to reload the config, I think it's a minor issue.
# TODO demonstrate in a test?
and it's [[https://github.com/karlicoss/HPI/blob/1e6e0bd381d20437343473878c7f63b1f9d6362b/tests/demo.py#L22-L25][possible]] to do something dynamic like =del sys.modules['my.bluemaestro']= to reload the config, I think it's a minor issue.
- =make_config= allows for some mypy false negatives in the user config
E.g. if you forget the =export_path= attribute, mypy would miss it. But you'd have a runtime failure, and the downstream code using config is still correctly type checked.
Perhaps it will be better when [[https://github.com/python/mypy/issues/5374][this]] is fixed.
Perhaps it will be better when [[https://github.com/python/mypy/issues/5374][this mypy issue]] is fixed.
- the =make_config= bit is a little scary and manual
However, it's extracted in a generic helper, and [[https://github.com/karlicoss/HPI/blob/d6f071e3b12ba1cd5a86ad80e3821bec004e6a6d/my/twitter/archive.py#L17][ends up pretty simple]]
# In addition, it's not even necessary if you don't have optional attributes, you can simply use the class variables (i.e. ~bluemaestro.export_path~)
# upd. ugh, you can't, it doesn't handle default attributes overriding correctly (see tests/demo.py)
# eh. basically all I need is class level dataclass??
- inheriting from ~user_config~ requires it to be a =class= rather than an =object=
A practical downside is you can't use something like ~SimpleNamespace~.

View file

@ -32,6 +32,7 @@ modules = [
('reddit' , 'my.reddit' ),
('twint' , 'my.twitter.twint' ),
('twitter', 'my.twitter.archive' ),
('lastfm' , 'my.lastfm' ),
]
def indent(s, spaces=4):
@ -105,4 +106,15 @@ for cls, p in modules:
class twitter:
export_path: Paths # path[s]/glob to the twitter archive takeout
#+end_src
- [[file:../my/lastfm][my.lastfm]]
Last.fm scrobbles
#+begin_src python
class lastfm:
"""
Uses [[https://github.com/karlicoss/lastfm-backup][lastfm-backup]] outputs
"""
export_path: Paths
#+end_src
:end:

View file

@ -1,10 +0,0 @@
# shared Rss stuff
from typing import NamedTuple
class Subscription(NamedTuple):
    """Shared record describing a single RSS subscription."""
    # TODO date?
    title: str
    url: str
    id: str
    # subscription status flag; True unless stated otherwise
    subscribed: bool = True

View file

@ -1,5 +1,6 @@
from glob import glob as do_glob
from pathlib import Path
from datetime import datetime
import functools
import types
from typing import Union, Callable, Dict, Iterable, TypeVar, Sequence, List, Optional, Any, cast, Tuple
@ -219,3 +220,28 @@ class classproperty(Generic[_R]):
#
# def __get__(self) -> _R:
# return self.f()
# TODO maybe use opaque mypy alias?
# Plain alias of datetime; used as the return annotation of isoparse below
tzdatetime = datetime


# Declared up front so that both branches below bind one uniformly typed name
fromisoformat: Callable[[str], datetime]
import sys
# Compare the whole version tuple instead of only `.minor`: the check then stays
# correct regardless of the major version and follows the conventional form.
if sys.version_info >= (3, 7):
    # prevent mypy on py3.6 from complaining...
    fromisoformat_real = datetime.fromisoformat # type: ignore[attr-defined]
    fromisoformat = fromisoformat_real
else:
    # older interpreters: fall back to the copy kept in the sibling py37 module
    from .py37 import fromisoformat


# TODO doctests?
def isoparse(s: str) -> tzdatetime:
    """
    Parses timestamps formatted like 2020-05-01T10:32:02.925961Z

    The trailing 'Z' is replaced by an explicit '+00:00' offset before the
    string is handed to fromisoformat, so the result carries a zero UTC offset.

    Input that does not end in 'Z' is rejected by an assertion.
    """
    # TODO could use dateutil? but it's quite slow as far as I remember..
    # TODO support non-utc.. somehow?
    assert s.endswith('Z'), s
    s = s[:-1] + '+00:00'
    return fromisoformat(s)

122
my/core/py37.py Normal file
View file

@ -0,0 +1,122 @@
# borrowed from /usr/lib/python3.7/datetime.py
from datetime import datetime, timezone, timedelta
def _parse_isoformat_date(dtstr):
# It is assumed that this function will only be called with a
# string of length exactly 10, and (though this is not used) ASCII-only
year = int(dtstr[0:4])
if dtstr[4] != '-':
raise ValueError('Invalid date separator: %s' % dtstr[4])
month = int(dtstr[5:7])
if dtstr[7] != '-':
raise ValueError('Invalid date separator')
day = int(dtstr[8:10])
return [year, month, day]
def _parse_hh_mm_ss_ff(tstr):
# Parses things of the form HH[:MM[:SS[.fff[fff]]]]
len_str = len(tstr)
time_comps = [0, 0, 0, 0]
pos = 0
for comp in range(0, 3):
if (len_str - pos) < 2:
raise ValueError('Incomplete time component')
time_comps[comp] = int(tstr[pos:pos+2])
pos += 2
next_char = tstr[pos:pos+1]
if not next_char or comp >= 2:
break
if next_char != ':':
raise ValueError('Invalid time separator: %c' % next_char)
pos += 1
if pos < len_str:
if tstr[pos] != '.':
raise ValueError('Invalid microsecond component')
else:
pos += 1
len_remainder = len_str - pos
if len_remainder not in (3, 6):
raise ValueError('Invalid microsecond component')
time_comps[3] = int(tstr[pos:])
if len_remainder == 3:
time_comps[3] *= 1000
return time_comps
def _parse_isoformat_time(tstr):
# Format supported is HH[:MM[:SS[.fff[fff]]]][+HH:MM[:SS[.ffffff]]]
len_str = len(tstr)
if len_str < 2:
raise ValueError('Isoformat time too short')
# This is equivalent to re.search('[+-]', tstr), but faster
tz_pos = (tstr.find('-') + 1 or tstr.find('+') + 1)
timestr = tstr[:tz_pos-1] if tz_pos > 0 else tstr
time_comps = _parse_hh_mm_ss_ff(timestr)
tzi = None
if tz_pos > 0:
tzstr = tstr[tz_pos:]
# Valid time zone strings are:
# HH:MM len: 5
# HH:MM:SS len: 8
# HH:MM:SS.ffffff len: 15
if len(tzstr) not in (5, 8, 15):
raise ValueError('Malformed time zone string')
tz_comps = _parse_hh_mm_ss_ff(tzstr)
if all(x == 0 for x in tz_comps):
tzi = timezone.utc
else:
tzsign = -1 if tstr[tz_pos - 1] == '-' else 1
td = timedelta(hours=tz_comps[0], minutes=tz_comps[1],
seconds=tz_comps[2], microseconds=tz_comps[3])
tzi = timezone(tzsign * td)
time_comps.append(tzi)
return time_comps
def fromisoformat(date_string, cls=datetime):
    """
    Construct a datetime from the output of datetime.isoformat().

    The first 10 characters are parsed as the date, the character at index 10
    is skipped as the separator, and the remainder (if any) is parsed as the time.
    Raises TypeError for non-str input and ValueError for unparseable strings.
    """
    if not isinstance(date_string, str):
        raise TypeError('fromisoformat: argument must be str')

    # Split this at the separator
    date_part, time_part = date_string[0:10], date_string[11:]

    try:
        ymd = _parse_isoformat_date(date_part)
    except ValueError:
        raise ValueError('Invalid isoformat string: %s' % date_string)

    if not time_part:
        # date-only input: midnight, no tzinfo
        hmsf_tz = [0, 0, 0, 0, None]
    else:
        try:
            hmsf_tz = _parse_isoformat_time(time_part)
        except ValueError:
            raise ValueError('Invalid isoformat string: %s' % date_string)

    return cls(*ymd, *hmsf_tz)

View file

@ -16,13 +16,8 @@ class demo(user_config):
username: str
timezone: tzinfo = pytz.utc
def config() -> demo:
from .core.cfg import make_config
config = make_config(demo)
return config
from .core.cfg import make_config
config = make_config(demo)
from pathlib import Path
from typing import Sequence, Iterable
@ -40,17 +35,17 @@ class Item:
def inputs() -> Sequence[Path]:
return get_files(config().data_path)
return get_files(config.data_path)
import json
def items() -> Iterable[Item]:
for f in inputs():
dt = datetime.fromtimestamp(f.stat().st_mtime, tz=config().timezone)
dt = datetime.fromtimestamp(f.stat().st_mtime, tz=config.timezone)
j = json.loads(f.read_text())
for raw in j:
yield Item(
username=config().username,
username=config.username,
raw=raw,
dt=dt,
)

View file

@ -1,36 +0,0 @@
"""
Feedbin RSS reader
"""
from .common import listify
from ._rss import Subscription
from my.config import feedbin as config
import json
from pathlib import Path
from typing import Dict, List
from datetime import datetime
from dateutil.parser import isoparse
@listify
def parse_file(f: Path):
    """
    Yield one Subscription per entry of the JSON array stored in ``f``
    (presumably collected into a list by @listify).
    """
    for entry in json.loads(f.read_text()):
        # TODO created_at?
        yield Subscription(
            title=entry['title'],
            url=entry['site_url'],
            id=entry['id'],
        )
def get_states() -> Dict[datetime, List[Subscription]]:
    """
    Map each '*.json' file under ``config.export_dir`` to its parsed subscriptions,
    keyed by the timestamp taken from the last '_'-separated part of the file stem.
    """
    # TODO use get_files
    export_dir = Path(config.export_dir)
    states = {}
    for snapshot in sorted(export_dir.glob('*.json')):
        *_, stamp = snapshot.stem.split('_')
        when = isoparse(stamp)
        states[when] = parse_file(snapshot)
    return states

View file

@ -2,8 +2,21 @@
Last.fm scrobbles
'''
from ..core.common import Paths
from dataclasses import dataclass
from my.config import lastfm as user_config
@dataclass
class lastfm(user_config):
    """
    Uses [[https://github.com/karlicoss/lastfm-backup][lastfm-backup]] outputs
    """
    # path[s]/glob to the lastfm-backup outputs
    export_path: Paths
from ..core.cfg import make_config
config = make_config(lastfm)
from ..common import get_files, mcachew, Json
from datetime import datetime
import json
@ -12,16 +25,17 @@ from typing import NamedTuple, Any, Sequence, Iterable
import pytz
from my.config import lastfm as config
from ..core.common import mcachew, Json, get_files
def inputs() -> Sequence[Path]:
return get_files(config.export_path)
# TODO memoised properties?
# TODO lazy mode and eager mode?
# lazy is a bit nicer in terms of more flexibility and less processing?
# eager is a bit more explicit for error handling
def inputs() -> Sequence[Path]:
return get_files(config.export_path)
class Scrobble(NamedTuple):
raw: Json
@ -54,5 +68,5 @@ def scrobbles() -> Iterable[Scrobble]:
last = max(inputs())
j = json.loads(last.read_text())
for raw in j:
for raw in reversed(j):
yield Scrobble(raw=raw)

View file

@ -21,6 +21,7 @@ _POLAR_DIR = Path('~').expanduser() / '.polar'
logger = LazyLogger(__name__)
# TODO use core.isoparse
def parse_dt(s: str) -> datetime:
    """Parse a timestamp like 2020-05-01T10:32:02.925961Z and attach the UTC timezone."""
    naive = datetime.strptime(s, '%Y-%m-%dT%H:%M:%S.%fZ')
    return pytz.utc.localize(naive)

View file

@ -1,29 +0,0 @@
from itertools import chain
from typing import List, Dict
from ._rss import Subscription
from . import feedbin
from . import feedly
# TODO google reader?
def get_all_subscriptions() -> List[Subscription]:
    """
    Keeps track of everything I ever subscribed to. It's useful to keep track of unsubscribed too
    so you don't try to subscribe again (or at least take into account why you unsubscribed before)
    """
    # feedbin entries win over feedly ones when both have the same key
    states = {**feedly.get_states(), **feedbin.get_states()}

    # walking the states in key order, remember the first record seen for each url
    first_seen: Dict[str, Subscription] = {}
    for _, feeds in sorted(states.items()):
        for feed in feeds:
            first_seen.setdefault(feed.url, feed)

    # a feed counts as still subscribed iff it is present in the greatest-keyed state
    _, latest_feeds = max(states.items())
    current_urls = {feed.url for feed in latest_feeds}

    return [
        sub._replace(subscribed=url in current_urls)
        for url, sub in sorted(first_seen.items())
    ]

11
my/rss/all.py Normal file
View file

@ -0,0 +1,11 @@
'''
Unified RSS data, merged from different services I used historically
'''
from typing import Iterable
from .common import Subscription, compute_subscriptions
def subscriptions() -> Iterable[Subscription]:
    """Yield the subscriptions computed from the feedbin and feedly states combined."""
    from . import feedbin, feedly
    # TODO google reader?
    per_service = (feedbin.states(), feedly.states())
    yield from compute_subscriptions(*per_service)

44
my/rss/common.py Normal file
View file

@ -0,0 +1,44 @@
# shared Rss stuff
from datetime import datetime
from typing import NamedTuple, Optional, List, Dict
class Subscription(NamedTuple):
    """A single RSS feed subscription."""
    title: str
    url: str
    id: str # TODO not sure about it...
    # eh, not all of them got reasonable 'created' time
    created_at: Optional[datetime]
    # whether the feed is present in the most recent state (see compute_subscriptions)
    subscribed: bool=True


from typing import Iterable, Tuple, Sequence
# snapshot of subscriptions at time
SubscriptionState = Tuple[datetime, Sequence[Subscription]]


def compute_subscriptions(*sources: Iterable[SubscriptionState]) -> List[Subscription]:
    """
    Keeps track of everything I ever subscribed to.
    In addition, keeps track of unsubscribed as well (so you'd remember when and why you unsubscribed)

    Each source yields (timestamp, subscriptions) states. For every url, the
    record from the earliest state containing it is kept; its ``subscribed``
    flag is True iff the url is present in the state with the latest timestamp.
    The result is ordered by url. Returns an empty list when there are no states.
    """
    from itertools import chain
    states = list(chain.from_iterable(sources))
    if not states:
        # nothing to merge (and max() below would raise on an empty sequence)
        return []
    # TODO keep 'source'/'provider'/'service' attribute?

    def timestamp_of(state: SubscriptionState) -> datetime:
        return state[0]

    by_url: Dict[str, Subscription] = {}
    # ah. dates are used for sorting
    # Order by the timestamp only: comparing whole tuples would fall back to
    # comparing the Subscription sequences when two timestamps are equal, which
    # fails on fields like created_at (None vs datetime). The sort is stable, so
    # states with equal timestamps keep the order in which the sources gave them.
    for when, state in sorted(states, key=timestamp_of):
        # TODO use 'when'?
        for feed in state:
            if feed.url not in by_url:
                by_url[feed.url] = feed

    _, last_state = max(states, key=timestamp_of)
    last_urls = {f.url for f in last_state}

    return [
        sub._replace(subscribed=url in last_urls)
        for url, sub in sorted(by_url.items())
    ]

42
my/rss/feedbin.py Normal file
View file

@ -0,0 +1,42 @@
"""
Feedbin RSS reader
"""
from my.config import feedbin as config
from pathlib import Path
from typing import Sequence
from ..core.common import listify, get_files, isoparse
from .common import Subscription
def inputs() -> Sequence[Path]:
    """Return the files that ``get_files`` finds for ``config.export_path``."""
    export_files = get_files(config.export_path)
    return export_files
import json
@listify
def parse_file(f: Path):
    """
    Yield one Subscription per entry of the JSON array stored in ``f``
    (presumably collected into a list by @listify).
    """
    for entry in json.loads(f.read_text()):
        created = isoparse(entry['created_at'])
        yield Subscription(
            title=entry['title'],
            url=entry['site_url'],
            id=entry['id'],
            created_at=created,
        )
from typing import Iterable
from .common import SubscriptionState
def states() -> Iterable[SubscriptionState]:
    """
    Yield a (timestamp, subscriptions) pair for every input file; the timestamp
    is parsed from the last '_'-separated part of the file stem.
    """
    # meh
    from dateutil.parser import isoparse # type: ignore
    for export in inputs():
        # TODO ugh. depends on my naming. not sure if useful?
        stamp = export.stem.rsplit('_', 1)[-1]
        when = isoparse(stamp)
        yield when, parse_file(export)

View file

@ -2,16 +2,20 @@
Feedly RSS reader
"""
from .common import listify
from ._rss import Subscription
from my.config import feedly as config
import json
from pathlib import Path
from typing import Dict, List
from datetime import datetime
import pytz
from typing import Sequence
from ..core.common import listify, get_files, isoparse
from .common import Subscription
def inputs() -> Sequence[Path]:
    """Return the files that ``get_files`` finds for ``config.export_path``."""
    export_files = get_files(config.export_path)
    return export_files
import json
@listify
@ -22,19 +26,21 @@ def parse_file(f: Path):
rid = r['id']
website = r.get('website', rid) # meh
yield Subscription(
# TODO created_at?
created_at=None,
title=r['title'],
url=website,
id=rid,
)
def get_states() -> Dict[datetime, List[Subscription]]:
res = {}
# TODO use get_files
for f in sorted(Path(config.export_dir).glob('*.json')):
from datetime import datetime
from typing import Iterable
from .common import SubscriptionState
def states() -> Iterable[SubscriptionState]:
import pytz
for f in inputs():
dts = f.stem.split('_')[-1]
dt = datetime.strptime(dts, '%Y%m%d%H%M%S')
dt = pytz.utc.localize(dt)
subs = parse_file(f)
res[dt] = subs
return res
yield dt, subs

View file

@ -7,10 +7,9 @@ from . import twint
from . import archive
from more_itertools import unique_everseen
# TODO move to .common?
def merge_tweets(*sources):
from more_itertools import unique_everseen
yield from unique_everseen(
chain(*sources),
key=lambda t: t.id_str,

View file

@ -76,7 +76,7 @@ class feedly:
os.environ['MY_CONFIG'] = str(tmp_path)
# should not raise at least
import my.feedly
import my.rss.feedly
@pytest.fixture

View file

@ -54,7 +54,40 @@ def test_dynamic_config_simplenamespace(tmp_path: Path) -> None:
my.config.demo = user_config # type: ignore[misc, assignment]
from my.demo import config
assert config().username == 'user3'
assert config.username == 'user3'
# make sure our config handling pattern does it as expected
def test_attribute_handling(tmp_path: Path) -> None:
    """
    Check that attributes of a user-supplied config class are visible on
    my.demo.config: a declared one, an undeclared extra one, and an override
    of an attribute that has a default.
    """
    # doesn't work without it!
    # because the config from test_dynamic_config_1 is cached in my.demo.demo
    del sys.modules['my.demo']

    import pytz
    nytz = pytz.timezone('America/New_York')

    import my.config
    class user_config:
        # check that override is taken into account
        timezone = nytz

        irrelevant = 'hello'

        username = 'UUU'
        data_path = f'{tmp_path}/*.json'

    # install the class above as the user config before my.demo gets imported
    my.config.demo = user_config # type: ignore[misc, assignment]

    from my.demo import config

    assert config.username == 'UUU'

    # mypy doesn't know about it, but the attribute is there
    assert getattr(config, 'irrelevant') == 'hello'

    # check that overridden default attribute is actually getting overridden
    assert config.timezone == nytz

View file

@ -1,3 +1,7 @@
from my.core.cachew import disable_cachew
# TODO need something nicer and integrated inside cachew..
disable_cachew() # meh
from more_itertools import ilen
from my.lastfm import scrobbles
@ -5,3 +9,9 @@ from my.lastfm import scrobbles
def test():
    """Sanity check: more than 1000 scrobbles are produced."""
    total = ilen(scrobbles())
    assert total > 1000
def test_datetime_ascending():
    """Every scrobble must be dated no earlier than the one yielded before it."""
    from more_itertools import pairwise
    for earlier, later in pairwise(scrobbles()):
        assert earlier.dt <= later.dt