cachy extraction

Dima Gerasimov 2019-04-23 22:10:25 +01:00
parent 39adb48044
commit 687e008f13


@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
-from typing import List, Dict, Union, Iterable, Iterator, NamedTuple, Any
+from typing import List, Dict, Union, Iterable, Iterator, NamedTuple, Any, Sequence
 import json
+from functools import lru_cache
 from collections import OrderedDict
 from pathlib import Path
 import pytz
@@ -10,6 +11,7 @@ import logging
 from multiprocessing import Pool
 from kython import kompress, cproperty, make_dict
+from kython.klogging import setup_logzero

 # TODO hmm. apparently decompressing takes quite a bit of time...
@@ -22,8 +24,8 @@ def reddit(suffix: str) -> str:
     return 'https://reddit.com' + suffix

-def _get_backups(all_=True) -> List[Path]:
-    bfiles = list(sorted(BPATH.glob('reddit-*.json.xz')))
+def _get_backups(all_=True) -> Sequence[Path]:
+    bfiles = tuple(sorted(BPATH.glob('reddit-*.json.xz')))  # TODO switch to that new compression format?
     if all_:
         return bfiles
     else:
@@ -139,9 +141,9 @@ def get_state(bfile: Path) -> Dict[Sid, Save]:
     return OrderedDict()

-def get_events(all_=True, parallel=True) -> List[Event]:
-    backups = _get_backups(all_=all_)
-    assert len(backups) > 0
+@lru_cache(1)
+def _get_events(backups: Sequence[Path], parallel: bool) -> List[Event]:
+    logger = get_logger()
     events: List[Event] = []
     prev_saves: Dict[Sid, Save] = {}
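
The tuple in _get_backups above is what makes this hunk work: functools.lru_cache keys its cache on the argument values, so every argument must be hashable, and a list of Paths is not while a tuple is. A minimal standalone sketch of the pattern (load_events and its body are hypothetical stand-ins for the real backup parsing, not this module's code):

from functools import lru_cache
from pathlib import Path
from typing import List, Sequence

@lru_cache(1)
def load_events(backups: Sequence[Path]) -> List[str]:
    print('parsing', len(backups), 'backups')  # runs once per distinct argument
    return [p.stem for p in backups]           # stand-in for the expensive decompression

backups = tuple(sorted(Path('.').glob('*')))
load_events(backups)   # does the work
load_events(backups)   # same tuple: served from the cache, no print
# load_events(list(backups)) would raise TypeError: unhashable type: 'list'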
@@ -188,10 +190,15 @@ def get_events(all_=True, parallel=True) -> List[Event]:
     # TODO a bit awkward, favorited should compare lower than unfavorited?
     return list(sorted(events, key=lambda e: e.cmp_key))

-def get_saves(all_=True) -> List[Save]:
+def get_events(*args, all_=True, parallel=True):
+    backups = _get_backups(all_=all_)
+    assert len(backups) > 0
+    return _get_events(backups=backups, parallel=parallel)
+
+def get_saves(**kwargs) -> List[Save]:
     logger = get_logger()
-    events = get_events(all_=all_)
+    events = get_events(**kwargs)
     saves: Dict[Sid, Save] = OrderedDict()
     for e in events:
         if e.text.startswith('favorited'):
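
The two-layer shape here, a memoized private worker behind a thin public wrapper, keeps the old keyword API while handing the cache hashable arguments; get_saves then only needs to forward **kwargs. A rough sketch of the same pattern with made-up names (compute/_compute and the glob are illustrative, not the module's real interface):

from functools import lru_cache
from pathlib import Path
from typing import List, Sequence

@lru_cache(1)
def _compute(backups: Sequence[Path], parallel: bool) -> List[str]:
    return [p.name for p in backups]  # the expensive pass over the backups would live here

def compute(all_=True, parallel=True) -> List[str]:
    backups = tuple(sorted(Path('.').glob('*')))  # tuple, so it can serve as a cache key
    if not all_:
        backups = backups[-1:]  # e.g. keep only the most recent backup
    assert len(backups) > 0
    return _compute(backups=backups, parallel=parallel)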
@@ -212,8 +219,6 @@ def test():
     get_saves(all_=False)

-# TODO fuck. pytest is broken??
-# right, apparently I need pytest.ini file...
 def test_unfav():
     events = get_events(all_=True)
     url = 'https://reddit.com/r/QuantifiedSelf/comments/acxy1v/personal_dashboard/'
@@ -224,14 +229,27 @@ def test_unfav():
     uf = uevents[1]
     assert uf.text == 'unfavorited'

 def test_get_all_saves():
     saves = get_saves(all_=True)
     # just check that they are unique..
     make_dict(saves, key=lambda s: s.sid)

+# TODO cache?
+def test_disappearing():
+    # eh. so for instance, 'metro line colors' is missing from reddit-20190402005024.json for no reason
+    # but I guess it was just a short glitch... so whatever
+    saves = get_events(all_=True)
+    favs = [s.kind for s in saves if s.text == 'favorited']
+    [deal_with_it] = [f for f in favs if f.title == '"Deal with it!"']
+    assert deal_with_it.backup_dt == datetime(2019, 4, 1, 23, 10, 25, tzinfo=pytz.utc)
+
 def main():
-    events = get_events()
+    setup_logzero(get_logger(), level=logging.DEBUG)
+    # TODO eh. not sure why but parallel on seems to mess glumov up and cause OOM...
+    events = get_events(parallel=False)
     print(len(events))
     for e in events:
         print(e.text, e.url)