From 48eec1868f34996a5a4bad02308658a58daf0316 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Tue, 4 Sep 2018 17:53:37 +0400 Subject: [PATCH 01/19] initial --- .gitignore | 178 +++++++++++++++++++++++++++++++++++++++++++++ ci.sh | 10 +++ reddit/__init__.py | 0 reddit/__main__.py | 0 run | 6 ++ 5 files changed, 194 insertions(+) create mode 100644 .gitignore create mode 100755 ci.sh create mode 100644 reddit/__init__.py create mode 100644 reddit/__main__.py create mode 100755 run diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..6fdb0e0 --- /dev/null +++ b/.gitignore @@ -0,0 +1,178 @@ + +# Created by https://www.gitignore.io/api/emacs,python + +### Emacs ### +# -*- mode: gitignore; -*- +*~ +\#*\# +/.emacs.desktop +/.emacs.desktop.lock +*.elc +auto-save-list +tramp +.\#* + +# Org-mode +.org-id-locations +*_archive + +# flymake-mode +*_flymake.* + +# eshell files +/eshell/history +/eshell/lastdir + +# elpa packages +/elpa/ + +# reftex files +*.rel + +# AUCTeX auto folder +/auto/ + +# cask packages +.cask/ +dist/ + +# Flycheck +flycheck_*.el + +# server auth directory +/server/ + +# projectiles files +.projectile + +# directory configuration +.dir-locals.el + +### Python ### +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +### Python Patch ### +.venv/ + +### Python.VirtualEnv Stack ### +# Virtualenv +# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/ +[Bb]in +[Ii]nclude +[Ll]ib +[Ll]ib64 +[Ll]ocal +[Ss]cripts +pyvenv.cfg +pip-selfcheck.json + + +# End of https://www.gitignore.io/api/emacs,python diff --git a/ci.sh b/ci.sh new file mode 100755 index 0000000..d526562 --- /dev/null +++ b/ci.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +cd "$(this_dir)" || exit + +. 
~/bash_ci + +ci_run mypy reddit +ci_run pylint -E reddit + +ci_report_errors diff --git a/reddit/__init__.py b/reddit/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/reddit/__main__.py b/reddit/__main__.py new file mode 100644 index 0000000..e69de29 diff --git a/run b/run new file mode 100755 index 0000000..a9f51cc --- /dev/null +++ b/run @@ -0,0 +1,6 @@ +#!/bin/bash +set -eu + +cd "$(dirname "$0")" + +python3 -m location From a4693b9b44a4989a31bcc8017aecb3f6710324ba Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Tue, 4 Sep 2018 19:32:13 +0400 Subject: [PATCH 02/19] initial --- reddit/__init__.py | 87 ++++++++++++++++++++++++++++++++++++++++++++++ reddit/__main__.py | 6 ++++ run | 2 +- 3 files changed, 94 insertions(+), 1 deletion(-) diff --git a/reddit/__init__.py b/reddit/__init__.py index e69de29..1788e42 100644 --- a/reddit/__init__.py +++ b/reddit/__init__.py @@ -0,0 +1,87 @@ +from typing import List, Dict, Union, Iterable, Iterator, NamedTuple + +BPATH = "/L/backups/reddit" + +def iter_backups() -> Iterator[str]: + import os + for f in sorted(os.listdir(BPATH)): + if f.endswith('.json'): + yield os.path.join(BPATH, f) + + +from datetime import datetime + +class Save(NamedTuple): + dt: datetime + link: str + +class Misc(NamedTuple): + pass + +EventKind = Union[Save, Misc] + +class Event(NamedTuple): + dt: datetime + text: str + kind: EventKind + +from kython import JSONType, json_load + +def get_some(d, *keys): + for k in keys: + v = d.get(k, None) + if v is not None: + return v + else: + return None + + +def get_state(bfile: str): + saves: Dict[str, Save] = {} + json: JSONType + with open(bfile, 'r') as fo: + json = json_load(fo) + + saved = json['saved'] + for s in saved: + dt = datetime.utcfromtimestamp(s['created_utc']) + link = get_some(s, 'link_permalink', 'url') # TODO link title or title + save = Save(dt=dt, link=link) + saves[save.link] = save + + # "created_utc": 1535055017.0, + # link_title + # link_text + return saves + + +def get_events(): + backups = list(iter_backups()) + + events: List[Event] = [] + prev_saves: Dict[str, Save] = {} + # TODO suppress first batch?? + + for b in backups: # TODO when date... + saves = get_state(b) + for l in set(prev_saves.keys()).symmetric_difference(set(saves.keys())): + if l in prev_saves: + s = prev_saves[l] + # TODO use backup date, that is more precise... 
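# [editor's note, not part of the patch] get_events above derives events by
# diffing consecutive snapshots: a key present only in prev_saves was unsaved
# since the last backup, a key present only in saves was newly saved. The same
# idea in isolation (hypothetical data):
#     prev = {'a', 'b'}
#     curr = {'b', 'c'}
#     for k in prev ^ curr:  # symmetric difference, as in the loop above
#         print('removed' if k in prev else 'added', k)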
+ events.append(Event( + dt=s.dt, + text=f"Unfavorited {s.link}", + kind=s, + )) + else: # in saves + s = saves[l] + events.append(Event( + dt=s.dt, + text=f"Favorited {s.link}", + kind=s, + )) + prev_saves = saves + + return list(sorted(events, key=lambda e: e.dt)) + + diff --git a/reddit/__main__.py b/reddit/__main__.py index e69de29..fea3d2d 100644 --- a/reddit/__main__.py +++ b/reddit/__main__.py @@ -0,0 +1,6 @@ +from reddit import get_events + +import sys, ipdb, traceback; exec("def info(type, value, tb):\n traceback.print_exception(type, value, tb)\n ipdb.pm()"); sys.excepthook = info # type: ignore + +for e in get_events(): + print(e) diff --git a/run b/run index a9f51cc..4a4b7e5 100755 --- a/run +++ b/run @@ -3,4 +3,4 @@ set -eu cd "$(dirname "$0")" -python3 -m location +python3 -m reddit From 661cb381ebd1e90afb6bb6e31ebb2017fe7b443c Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Tue, 4 Sep 2018 20:44:49 +0400 Subject: [PATCH 03/19] more stuff, changed events --- reddit/__init__.py | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/reddit/__init__.py b/reddit/__init__.py index 1788e42..740c7d7 100644 --- a/reddit/__init__.py +++ b/reddit/__init__.py @@ -1,4 +1,5 @@ from typing import List, Dict, Union, Iterable, Iterator, NamedTuple +import pytz BPATH = "/L/backups/reddit" @@ -44,7 +45,7 @@ def get_state(bfile: str): saved = json['saved'] for s in saved: - dt = datetime.utcfromtimestamp(s['created_utc']) + dt = pytz.utc.localize(datetime.utcfromtimestamp(s['created_utc'])) link = get_some(s, 'link_permalink', 'url') # TODO link title or title save = Save(dt=dt, link=link) saves[save.link] = save @@ -55,29 +56,44 @@ def get_state(bfile: str): return saves +import re + +RE = re.compile(r'reddit-(\d{14}).json') + def get_events(): backups = list(iter_backups()) events: List[Event] = [] prev_saves: Dict[str, Save] = {} # TODO suppress first batch?? + # TODO for initial batch, treat event time as creation time - for b in backups: # TODO when date... + for i, b in enumerate(backups): # TODO when date... + btime = pytz.utc.localize(datetime.strptime(RE.search(b).group(1), "%Y%m%d%H%M%S")) + + first = i == 0 saves = get_state(b) + + def etime(dt: datetime): + if first: + return dt + else: + return btime + for l in set(prev_saves.keys()).symmetric_difference(set(saves.keys())): if l in prev_saves: s = prev_saves[l] # TODO use backup date, that is more precise... 
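# [editor's note, not part of the patch] the RE/strptime pair added in this
# patch recovers the backup time from the filename: fourteen digits parsed as
# "%Y%m%d%H%M%S", then pinned to UTC. Sketch with a hypothetical filename
# reddit-20180904175337.json:
#     from datetime import datetime
#     import pytz
#     pytz.utc.localize(datetime.strptime('20180904175337', "%Y%m%d%H%M%S"))
#     # -> 2018-09-04 17:53:37+00:00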
events.append(Event( - dt=s.dt, + dt=etime(s.dt), text=f"Unfavorited {s.link}", kind=s, )) else: # in saves s = saves[l] events.append(Event( - dt=s.dt, - text=f"Favorited {s.link}", + dt=etime(s.dt), + text=f"Favorited {s.link} {' [initial]' if first else ''}", kind=s, )) prev_saves = saves From 273c71587f151ae6aee03deafd81239b0321ad8b Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Tue, 4 Sep 2018 22:11:49 +0400 Subject: [PATCH 04/19] extract from xz archive --- reddit/__init__.py | 17 ++++++++--------- reddit/__main__.py | 2 -- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/reddit/__init__.py b/reddit/__init__.py index 740c7d7..e9e6d99 100644 --- a/reddit/__init__.py +++ b/reddit/__init__.py @@ -3,10 +3,14 @@ import pytz BPATH = "/L/backups/reddit" + +import re +RE = re.compile(r'reddit-(\d{14}).json.xz') + def iter_backups() -> Iterator[str]: import os for f in sorted(os.listdir(BPATH)): - if f.endswith('.json'): + if RE.match(f): yield os.path.join(BPATH, f) @@ -26,7 +30,7 @@ class Event(NamedTuple): text: str kind: EventKind -from kython import JSONType, json_load +from kython import JSONType, load_json_file def get_some(d, *keys): for k in keys: @@ -39,9 +43,7 @@ def get_some(d, *keys): def get_state(bfile: str): saves: Dict[str, Save] = {} - json: JSONType - with open(bfile, 'r') as fo: - json = json_load(fo) + json: JSONType = load_json_file(bfile) saved = json['saved'] for s in saved: @@ -56,12 +58,9 @@ def get_state(bfile: str): return saves -import re - -RE = re.compile(r'reddit-(\d{14}).json') - def get_events(): backups = list(iter_backups()) + assert len(backups) > 0 events: List[Event] = [] prev_saves: Dict[str, Save] = {} diff --git a/reddit/__main__.py b/reddit/__main__.py index fea3d2d..811327b 100644 --- a/reddit/__main__.py +++ b/reddit/__main__.py @@ -1,6 +1,4 @@ from reddit import get_events -import sys, ipdb, traceback; exec("def info(type, value, tb):\n traceback.print_exception(type, value, tb)\n ipdb.pm()"); sys.excepthook = info # type: ignore - for e in get_events(): print(e) From 1143c07a1b432126efeeb1ba3947f624563af233 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sun, 3 Feb 2019 22:40:24 +0000 Subject: [PATCH 05/19] add link and title to reddit --- reddit/__init__.py | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/reddit/__init__.py b/reddit/__init__.py index e9e6d99..993b07e 100644 --- a/reddit/__init__.py +++ b/reddit/__init__.py @@ -18,7 +18,9 @@ from datetime import datetime class Save(NamedTuple): dt: datetime - link: str + title: str + url: str + sid: str class Misc(NamedTuple): pass @@ -29,6 +31,9 @@ class Event(NamedTuple): dt: datetime text: str kind: EventKind + eid: str + title: str + url: str from kython import JSONType, load_json_file @@ -48,9 +53,15 @@ def get_state(bfile: str): saved = json['saved'] for s in saved: dt = pytz.utc.localize(datetime.utcfromtimestamp(s['created_utc'])) - link = get_some(s, 'link_permalink', 'url') # TODO link title or title - save = Save(dt=dt, link=link) - saves[save.link] = save + url = get_some(s, 'link_permalink', 'url') + title = get_some(s, 'link_title', 'title') + save = Save( + dt=dt, + title=title, + url=url, + sid=s['id'], + ) + saves[save.url] = save # "created_utc": 1535055017.0, # link_title @@ -85,15 +96,21 @@ def get_events(): # TODO use backup date, that is more precise... 
events.append(Event( dt=etime(s.dt), - text=f"Unfavorited {s.link}", + text=f"unfavorited", kind=s, + eid=f'unf-{s.sid}', + url=s.url, + title=s.title, )) else: # in saves s = saves[l] events.append(Event( dt=etime(s.dt), - text=f"Favorited {s.link} {' [initial]' if first else ''}", + text=f"favorited {' [initial]' if first else ''}", kind=s, + eid=f'fav-{s.sid}', + url=s.url, + title=s.title, )) prev_saves = saves From 073c19bf5d29d8b39d73cc051f576a92422298fa Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Tue, 12 Mar 2019 12:09:17 +0000 Subject: [PATCH 06/19] make ruci happy --- reddit/__init__.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/reddit/__init__.py b/reddit/__init__.py index 993b07e..e9251ad 100644 --- a/reddit/__init__.py +++ b/reddit/__init__.py @@ -1,4 +1,6 @@ from typing import List, Dict, Union, Iterable, Iterator, NamedTuple +import json +from pathlib import Path import pytz BPATH = "/L/backups/reddit" @@ -48,9 +50,10 @@ def get_some(d, *keys): def get_state(bfile: str): saves: Dict[str, Save] = {} - json: JSONType = load_json_file(bfile) + with Path(bfile).open() as fo: + jj = json.load(fo) - saved = json['saved'] + saved = jj['saved'] for s in saved: dt = pytz.utc.localize(datetime.utcfromtimestamp(s['created_utc'])) url = get_some(s, 'link_permalink', 'url') @@ -79,7 +82,9 @@ def get_events(): # TODO for initial batch, treat event time as creation time for i, b in enumerate(backups): # TODO when date... - btime = pytz.utc.localize(datetime.strptime(RE.search(b).group(1), "%Y%m%d%H%M%S")) + match = RE.search(b) + assert match is not None + btime = pytz.utc.localize(datetime.strptime(match.group(1), "%Y%m%d%H%M%S")) first = i == 0 saves = get_state(b) From ef270f4d019a9525a54f1c05c7803aa31f267a06 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Thu, 14 Mar 2019 20:11:30 +0000 Subject: [PATCH 07/19] Fix; use kython.kompress, nicer code using Path --- ci.sh | 10 --------- reddit/__init__.py | 53 +++++++++++++++++++++++++++++----------------- reddit/__main__.py | 4 ---- run | 6 ------ 4 files changed, 33 insertions(+), 40 deletions(-) delete mode 100755 ci.sh delete mode 100644 reddit/__main__.py delete mode 100755 run diff --git a/ci.sh b/ci.sh deleted file mode 100755 index d526562..0000000 --- a/ci.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash - -cd "$(this_dir)" || exit - -. ~/bash_ci - -ci_run mypy reddit -ci_run pylint -E reddit - -ci_report_errors diff --git a/reddit/__init__.py b/reddit/__init__.py index e9251ad..de71442 100644 --- a/reddit/__init__.py +++ b/reddit/__init__.py @@ -2,22 +2,23 @@ from typing import List, Dict, Union, Iterable, Iterator, NamedTuple import json from pathlib import Path import pytz - -BPATH = "/L/backups/reddit" - - import re -RE = re.compile(r'reddit-(\d{14}).json.xz') - -def iter_backups() -> Iterator[str]: - import os - for f in sorted(os.listdir(BPATH)): - if RE.match(f): - yield os.path.join(BPATH, f) - - from datetime import datetime +from kython import kompress + + +BPATH = Path("/L/backups/reddit") + + +def _get_backups(all_=True) -> List[Path]: + bfiles = list(sorted(BPATH.glob('reddit-*.json.xz'))) + if all_: + return bfiles + else: + return bfiles[-1:] + + class Save(NamedTuple): dt: datetime title: str @@ -37,8 +38,8 @@ class Event(NamedTuple): title: str url: str -from kython import JSONType, load_json_file +# TODO kython? 
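# [editor's note, not part of the patch] get_some below returns the first
# non-None value among the given keys; if it does migrate into kython per the
# TODO above, it is essentially this one-liner (equivalent sketch, not in the
# patch):
#     next((d[k] for k in keys if d.get(k) is not None), None)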
def get_some(d, *keys): for k in keys: v = d.get(k, None) @@ -48,9 +49,9 @@ def get_some(d, *keys): return None -def get_state(bfile: str): +def get_state(bfile: Path): saves: Dict[str, Save] = {} - with Path(bfile).open() as fo: + with kompress.open(bfile) as fo: jj = json.load(fo) saved = jj['saved'] @@ -68,12 +69,12 @@ def get_state(bfile: str): # "created_utc": 1535055017.0, # link_title - # link_text + # link_text return saves -def get_events(): - backups = list(iter_backups()) +def get_events(all_=True): + backups = _get_backups(all_=all_) assert len(backups) > 0 events: List[Event] = [] @@ -81,8 +82,9 @@ def get_events(): # TODO suppress first batch?? # TODO for initial batch, treat event time as creation time + RE = re.compile(r'reddit-(\d{14})') for i, b in enumerate(backups): # TODO when date... - match = RE.search(b) + match = RE.search(b.stem) assert match is not None btime = pytz.utc.localize(datetime.strptime(match.group(1), "%Y%m%d%H%M%S")) @@ -122,3 +124,14 @@ def get_events(): return list(sorted(events, key=lambda e: e.dt)) +def test(): + get_events(all_=False) + + +def main(): + for e in get_events(): + print(e) + + +if __name__ == '__main__': + main() diff --git a/reddit/__main__.py b/reddit/__main__.py deleted file mode 100644 index 811327b..0000000 --- a/reddit/__main__.py +++ /dev/null @@ -1,4 +0,0 @@ -from reddit import get_events - -for e in get_events(): - print(e) diff --git a/run b/run deleted file mode 100755 index 4a4b7e5..0000000 --- a/run +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/bash -set -eu - -cd "$(dirname "$0")" - -python3 -m reddit From aac4807b5ddc8c2832c25374ddbab055db4df698 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Thu, 14 Mar 2019 20:59:41 +0000 Subject: [PATCH 08/19] extract text, use cproperty, function to get saves --- reddit/__init__.py | 45 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 40 insertions(+), 5 deletions(-) diff --git a/reddit/__init__.py b/reddit/__init__.py index de71442..a5ad170 100644 --- a/reddit/__init__.py +++ b/reddit/__init__.py @@ -1,11 +1,12 @@ -from typing import List, Dict, Union, Iterable, Iterator, NamedTuple +from typing import List, Dict, Union, Iterable, Iterator, NamedTuple, Any import json +from collections import OrderedDict from pathlib import Path import pytz import re from datetime import datetime -from kython import kompress +from kython import kompress, cproperty BPATH = Path("/L/backups/reddit") @@ -24,6 +25,24 @@ class Save(NamedTuple): title: str url: str sid: str + json: Any = None + # TODO subreddit-display name? + + def __hash__(self): + return hash(self.sid) + + @cproperty + def text(self) -> str: + bb = self.json.get('body', None) + st = self.json.get('selftext', None) + if bb is not None and st is not None: + raise RuntimeError(f'wtf, both body and selftext are not None: {bb}; {st}') + return bb or st + + @cproperty + def subreddit(self) -> str: + return self.json['subreddit']['display_name'] + class Misc(NamedTuple): pass @@ -41,6 +60,7 @@ class Event(NamedTuple): # TODO kython? def get_some(d, *keys): + # TODO only one should be non None?? 
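# [editor's note, not part of the patch] one way the "only one should be non
# None" TODO above could be enforced -- an assumption-flagged sketch, not
# present in any patch:
#     vals = [d[k] for k in keys if d.get(k) is not None]
#     assert len(vals) <= 1, f'expected at most one of {keys}, got {vals}'
#     return vals[0] if vals else None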
for k in keys: v = d.get(k, None) if v is not None: @@ -49,8 +69,11 @@ def get_some(d, *keys): return None -def get_state(bfile: Path): - saves: Dict[str, Save] = {} +Url = str + +# TODO OrderedDict +def get_state(bfile: Path) -> Dict[Url, Save]: + saves: Dict[Url, Save] = {} with kompress.open(bfile) as fo: jj = json.load(fo) @@ -64,13 +87,14 @@ def get_state(bfile: Path): title=title, url=url, sid=s['id'], + json=s, ) saves[save.url] = save # "created_utc": 1535055017.0, # link_title # link_text - return saves + return OrderedDict(sorted(saves.items(), key=lambda p: p[1].dt)) def get_events(all_=True): @@ -123,9 +147,20 @@ def get_events(all_=True): return list(sorted(events, key=lambda e: e.dt)) +def get_saves(all_=True) -> List[Save]: + # TODO hmm.... do we want ALL reddit saves I ever had? + # TODO for now even last ones would be ok + assert all_ is False, 'all saves are not supported yet...' + backups = _get_backups(all_=all_) + [backup] = backups + + saves = get_state(backup) + return list(saves.values()) + def test(): get_events(all_=False) + get_saves(all_=False) def main(): From b9587939caa5b9b28d4fb96f6eb60a0b1cf8d97c Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Thu, 14 Mar 2019 21:39:55 +0000 Subject: [PATCH 09/19] fix ruci --- reddit/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/reddit/__init__.py b/reddit/__init__.py index a5ad170..6b876ed 100644 --- a/reddit/__init__.py +++ b/reddit/__init__.py @@ -41,6 +41,8 @@ class Save(NamedTuple): @cproperty def subreddit(self) -> str: + assert self.json is not None + # pylint: disable=unsubscriptable-object return self.json['subreddit']['display_name'] From ee99518cf5cb34914279e07c02430f5caf0c06e8 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sat, 16 Mar 2019 10:56:29 +0000 Subject: [PATCH 10/19] fix saves retrieval, use save id instead of url --- reddit/__init__.py | 85 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 59 insertions(+), 26 deletions(-) mode change 100644 => 100755 reddit/__init__.py diff --git a/reddit/__init__.py b/reddit/__init__.py old mode 100644 new mode 100755 index 6b876ed..bfd112c --- a/reddit/__init__.py +++ b/reddit/__init__.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 from typing import List, Dict, Union, Iterable, Iterator, NamedTuple, Any import json from collections import OrderedDict @@ -6,11 +7,15 @@ import pytz import re from datetime import datetime -from kython import kompress, cproperty +from kython import kompress, cproperty, make_dict +# TODO hmm. apparently decompressing takes quite a bit of time... BPATH = Path("/L/backups/reddit") +def reddit(suffix: str) -> str: + return 'https://reddit.com' + suffix + def _get_backups(all_=True) -> List[Path]: bfiles = list(sorted(BPATH.glob('reddit-*.json.xz'))) @@ -23,14 +28,18 @@ def _get_backups(all_=True) -> List[Path]: class Save(NamedTuple): dt: datetime title: str - url: str sid: str json: Any = None - # TODO subreddit-display name? def __hash__(self): return hash(self.sid) + @cproperty + def url(self) -> str: + # pylint: disable=unsubscriptable-object + pl = self.json['permalink'] + return reddit(pl) + @cproperty def text(self) -> str: bb = self.json.get('body', None) @@ -59,6 +68,10 @@ class Event(NamedTuple): title: str url: str + @property + def cmp_key(self): + return (self.dt, (1 if 'unfavorited' in self.text else 0)) + # TODO kython? def get_some(d, *keys): @@ -73,33 +86,34 @@ def get_some(d, *keys): Url = str -# TODO OrderedDict +# TODO shit. there does seem to be a difference... 
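# [editor's note, not part of the patch] on Event.cmp_key, added earlier in
# this patch: tuples compare lexicographically, so at an identical timestamp
# the secondary 0/1 component makes a 'favorited' event sort before an
# 'unfavorited' one. Illustration with hypothetical values:
#     from datetime import datetime
#     dt = datetime(2019, 1, 1)
#     assert (dt, 0) < (dt, 1)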
def get_state(bfile: Path) -> Dict[Url, Save]: - saves: Dict[Url, Save] = {} + saves: List[Save] = [] with kompress.open(bfile) as fo: jj = json.load(fo) saved = jj['saved'] for s in saved: dt = pytz.utc.localize(datetime.utcfromtimestamp(s['created_utc'])) - url = get_some(s, 'link_permalink', 'url') + # TODO need permalink + # url = get_some(s, 'link_permalink', 'url') # this was original url... title = get_some(s, 'link_title', 'title') save = Save( dt=dt, title=title, - url=url, sid=s['id'], json=s, ) - saves[save.url] = save + saves.append(save) - # "created_utc": 1535055017.0, - # link_title - # link_text - return OrderedDict(sorted(saves.items(), key=lambda p: p[1].dt)) + return make_dict( + sorted(saves, key=lambda p: p.dt), # TODO make helper to create lambda from property? + key=lambda s: s.sid, + ) + return OrderedDict() -def get_events(all_=True): +def get_events(all_=True) -> List[Event]: backups = _get_backups(all_=all_) assert len(backups) > 0 @@ -123,23 +137,23 @@ def get_events(all_=True): else: return btime - for l in set(prev_saves.keys()).symmetric_difference(set(saves.keys())): - if l in prev_saves: - s = prev_saves[l] + for key in set(prev_saves.keys()).symmetric_difference(set(saves.keys())): + ps = prev_saves.get(key, None) + if ps is not None: # TODO use backup date, that is more precise... events.append(Event( - dt=etime(s.dt), + dt=etime(ps.dt), text=f"unfavorited", - kind=s, - eid=f'unf-{s.sid}', - url=s.url, - title=s.title, + kind=ps, + eid=f'unf-{ps.sid}', + url=ps.url, + title=ps.title, )) else: # in saves - s = saves[l] + s = saves[key] events.append(Event( dt=etime(s.dt), - text=f"favorited {' [initial]' if first else ''}", + text=f"favorited {'[initial]' if first else ''}", kind=s, eid=f'fav-{s.sid}', url=s.url, @@ -147,7 +161,8 @@ def get_events(all_=True): )) prev_saves = saves - return list(sorted(events, key=lambda e: e.dt)) + # TODO a bit awkward, favorited should compare lower than unfavorited? + return list(sorted(events, key=lambda e: e.cmp_key)) def get_saves(all_=True) -> List[Save]: # TODO hmm.... do we want ALL reddit saves I ever had? @@ -165,9 +180,27 @@ def test(): get_saves(all_=False) +# TODO fuck. pytest is broken?? +def test_unfav(): + events = get_events(all_=True) + url = 'https://reddit.com/r/QuantifiedSelf/comments/acxy1v/personal_dashboard/' + uevents = [e for e in events if e.url == url] + assert len(uevents) == 2 + ff = uevents[0] + assert ff.text == 'favorited [initial]' + uf = uevents[1] + assert uf.text == 'unfavorited' + + def main(): - for e in get_events(): - print(e) + events = get_events() + print(len(events)) + for e in events: + print(e.text, e.url) + # for e in get_ + # 509 with urls.. + # for e in get_events(): + # print(e) if __name__ == '__main__': From 3541d2a55aca69391613a84d02a2798115c6ef4a Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sat, 16 Mar 2019 11:01:03 +0000 Subject: [PATCH 11/19] add support for getting all saves --- reddit/__init__.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/reddit/__init__.py b/reddit/__init__.py index bfd112c..67b2977 100755 --- a/reddit/__init__.py +++ b/reddit/__init__.py @@ -165,14 +165,15 @@ def get_events(all_=True) -> List[Event]: return list(sorted(events, key=lambda e: e.cmp_key)) def get_saves(all_=True) -> List[Save]: - # TODO hmm.... do we want ALL reddit saves I ever had? - # TODO for now even last ones would be ok - assert all_ is False, 'all saves are not supported yet...' 
- backups = _get_backups(all_=all_) - [backup] = backups - - saves = get_state(backup) - return list(saves.values()) + events = get_events(all_=all_) + saves = [] + for e in events: + if e.text.startswith('favorited'): + ss = e.kind + assert isinstance(ss, Save) + saves.append(ss) + assert len(saves) > 0 + return saves def test(): From ea06df9be0968e9e6eb252f62f7d70a9d9bd8ff1 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sat, 16 Mar 2019 13:16:21 +0000 Subject: [PATCH 12/19] remove duplicate saves --- reddit/__init__.py | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/reddit/__init__.py b/reddit/__init__.py index 67b2977..f57e342 100755 --- a/reddit/__init__.py +++ b/reddit/__init__.py @@ -6,6 +6,7 @@ from pathlib import Path import pytz import re from datetime import datetime +import logging from kython import kompress, cproperty, make_dict @@ -13,6 +14,9 @@ from kython import kompress, cproperty, make_dict BPATH = Path("/L/backups/reddit") +def get_logger(): + return logging.getLogger('reddit-provider') + def reddit(suffix: str) -> str: return 'https://reddit.com' + suffix @@ -24,11 +28,12 @@ def _get_backups(all_=True) -> List[Path]: else: return bfiles[-1:] +Sid = str class Save(NamedTuple): dt: datetime title: str - sid: str + sid: Sid json: Any = None def __hash__(self): @@ -87,7 +92,8 @@ def get_some(d, *keys): Url = str # TODO shit. there does seem to be a difference... -def get_state(bfile: Path) -> Dict[Url, Save]: +# TODO do it in multiple threads?? +def get_state(bfile: Path) -> Dict[Sid, Save]: saves: List[Save] = [] with kompress.open(bfile) as fo: jj = json.load(fo) @@ -118,7 +124,7 @@ def get_events(all_=True) -> List[Event]: assert len(backups) > 0 events: List[Event] = [] - prev_saves: Dict[str, Save] = {} + prev_saves: Dict[Sid, Save] = {} # TODO suppress first batch?? # TODO for initial batch, treat event time as creation time @@ -165,15 +171,22 @@ def get_events(all_=True) -> List[Event]: return list(sorted(events, key=lambda e: e.cmp_key)) def get_saves(all_=True) -> List[Save]: + logger = get_logger() + events = get_events(all_=all_) - saves = [] + saves: Dict[Sid, Save] = OrderedDict() for e in events: if e.text.startswith('favorited'): ss = e.kind assert isinstance(ss, Save) - saves.append(ss) + if ss.sid in saves: + # apparently we can get duplicates if we saved/unsaved multiple times... + logger.warning(f'ignoring duplicate save %s, title %s, url %s', ss.sid, ss.title, ss.url) + else: + saves[ss.sid] = ss assert len(saves) > 0 - return saves + + return list(saves.values()) def test(): @@ -192,6 +205,11 @@ def test_unfav(): uf = uevents[1] assert uf.text == 'unfavorited' +def test_get_all_saves(): + saves = get_saves(all_=True) + # just check that they are unique.. + make_dict(saves, key=lambda s: s.sid) + def main(): events = get_events() From b429ef23f6050ef9aa526825ca95fc7997121f01 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sat, 16 Mar 2019 13:25:33 +0000 Subject: [PATCH 13/19] use multiprocessing to speed up unpacking --- reddit/__init__.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/reddit/__init__.py b/reddit/__init__.py index f57e342..4e6e10d 100755 --- a/reddit/__init__.py +++ b/reddit/__init__.py @@ -7,6 +7,7 @@ import pytz import re from datetime import datetime import logging +from multiprocessing import Pool from kython import kompress, cproperty, make_dict @@ -128,14 +129,16 @@ def get_events(all_=True) -> List[Event]: # TODO suppress first batch?? 
# TODO for initial batch, treat event time as creation time + with Pool() as p: + states = p.map(get_state, backups) + RE = re.compile(r'reddit-(\d{14})') - for i, b in enumerate(backups): # TODO when date... + for i, (b, saves) in enumerate(zip(backups, states)): # TODO when date... match = RE.search(b.stem) assert match is not None btime = pytz.utc.localize(datetime.strptime(match.group(1), "%Y%m%d%H%M%S")) first = i == 0 - saves = get_state(b) def etime(dt: datetime): if first: From 1ad2a5bffabb5d9077ce278e224017e974c7a4a7 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sun, 17 Mar 2019 15:09:24 +0000 Subject: [PATCH 14/19] make parallel processing optional --- reddit/__init__.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/reddit/__init__.py b/reddit/__init__.py index 4e6e10d..ac8d480 100755 --- a/reddit/__init__.py +++ b/reddit/__init__.py @@ -120,7 +120,7 @@ def get_state(bfile: Path) -> Dict[Sid, Save]: return OrderedDict() -def get_events(all_=True) -> List[Event]: +def get_events(all_=True, parallel=True) -> List[Event]: backups = _get_backups(all_=all_) assert len(backups) > 0 @@ -129,8 +129,13 @@ def get_events(all_=True) -> List[Event]: # TODO suppress first batch?? # TODO for initial batch, treat event time as creation time - with Pool() as p: - states = p.map(get_state, backups) + states: Iterable[Dict[Sid, Save]] + if parallel: + with Pool() as p: + states = p.map(get_state, backups) + else: + # also make it lazy... + states = map(get_state, backups) RE = re.compile(r'reddit-(\d{14})') for i, (b, saves) in enumerate(zip(backups, states)): # TODO when date... From 6e7029b74d47ef5e216dd6e68cfd423127891113 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sun, 17 Mar 2019 16:06:42 +0000 Subject: [PATCH 15/19] add save_dt computation to Save --- reddit/__init__.py | 44 ++++++++++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/reddit/__init__.py b/reddit/__init__.py index ac8d480..a4c7abe 100755 --- a/reddit/__init__.py +++ b/reddit/__init__.py @@ -32,7 +32,8 @@ def _get_backups(all_=True) -> List[Path]: Sid = str class Save(NamedTuple): - dt: datetime + dt: datetime # TODO misleading name... this is creation dt, not saving dt + backup_dt: datetime title: str sid: Sid json: Any = None @@ -40,6 +41,11 @@ class Save(NamedTuple): def __hash__(self): return hash(self.sid) + @cproperty + def save_dt(self) -> datetime: + assert self.dt <= self.backup_dt + return max(self.dt, self.backup_dt) + @cproperty def url(self) -> str: # pylint: disable=unsubscriptable-object @@ -95,6 +101,11 @@ Url = str # TODO shit. there does seem to be a difference... # TODO do it in multiple threads?? def get_state(bfile: Path) -> Dict[Sid, Save]: + RE = re.compile(r'reddit-(\d{14})') + match = RE.search(bfile.stem) + assert match is not None + bdt = pytz.utc.localize(datetime.strptime(match.group(1), "%Y%m%d%H%M%S")) + saves: List[Save] = [] with kompress.open(bfile) as fo: jj = json.load(fo) @@ -107,6 +118,7 @@ def get_state(bfile: Path) -> Dict[Sid, Save]: title = get_some(s, 'link_title', 'title') save = Save( dt=dt, + backup_dt=bdt, title=title, sid=s['id'], json=s, @@ -137,26 +149,17 @@ def get_events(all_=True, parallel=True) -> List[Event]: # also make it lazy... states = map(get_state, backups) - RE = re.compile(r'reddit-(\d{14})') - for i, (b, saves) in enumerate(zip(backups, states)): # TODO when date... 
- match = RE.search(b.stem) - assert match is not None - btime = pytz.utc.localize(datetime.strptime(match.group(1), "%Y%m%d%H%M%S")) + for i, saves in enumerate(states): # TODO when date... first = i == 0 - def etime(dt: datetime): - if first: - return dt - else: - return btime - for key in set(prev_saves.keys()).symmetric_difference(set(saves.keys())): ps = prev_saves.get(key, None) if ps is not None: # TODO use backup date, that is more precise... + # eh. I guess just take max and it will always be correct? events.append(Event( - dt=etime(ps.dt), + dt=ps.save_dt, text=f"unfavorited", kind=ps, eid=f'unf-{ps.sid}', @@ -166,8 +169,8 @@ def get_events(all_=True, parallel=True) -> List[Event]: else: # in saves s = saves[key] events.append(Event( - dt=etime(s.dt), - text=f"favorited {'[initial]' if first else ''}", + dt=s.save_dt, + text=f"favorited{' [initial]' if first else ''}", kind=s, eid=f'fav-{s.sid}', url=s.url, @@ -203,20 +206,21 @@ def test(): # TODO fuck. pytest is broken?? +# right, apparently I need pytest.ini file... def test_unfav(): events = get_events(all_=True) url = 'https://reddit.com/r/QuantifiedSelf/comments/acxy1v/personal_dashboard/' uevents = [e for e in events if e.url == url] assert len(uevents) == 2 ff = uevents[0] - assert ff.text == 'favorited [initial]' + assert ff.text == 'favorited' uf = uevents[1] assert uf.text == 'unfavorited' -def test_get_all_saves(): - saves = get_saves(all_=True) - # just check that they are unique.. - make_dict(saves, key=lambda s: s.sid) +# def test_get_all_saves(): +# saves = get_saves(all_=True) +# # just check that they are unique.. +# make_dict(saves, key=lambda s: s.sid) def main(): From 39adb48044c5fe5da755827df3b5184f1caf9b15 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sun, 17 Mar 2019 16:23:08 +0000 Subject: [PATCH 16/19] add created; use special saved handling for first backup --- reddit/__init__.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/reddit/__init__.py b/reddit/__init__.py index a4c7abe..fc73f3e 100755 --- a/reddit/__init__.py +++ b/reddit/__init__.py @@ -41,10 +41,14 @@ class Save(NamedTuple): def __hash__(self): return hash(self.sid) + @cproperty + def created(self) -> datetime: + return self.dt + @cproperty def save_dt(self) -> datetime: - assert self.dt <= self.backup_dt - return max(self.dt, self.backup_dt) + # TODO not exactly precise... but whatever I guess + return self.backup_dt @cproperty def url(self) -> str: @@ -101,6 +105,9 @@ Url = str # TODO shit. there does seem to be a difference... # TODO do it in multiple threads?? def get_state(bfile: Path) -> Dict[Sid, Save]: + logger = get_logger() + logger.debug('handling %s', bfile) + RE = re.compile(r'reddit-(\d{14})') match = RE.search(bfile.stem) assert match is not None @@ -159,7 +166,7 @@ def get_events(all_=True, parallel=True) -> List[Event]: # TODO use backup date, that is more precise... # eh. I guess just take max and it will always be correct? 
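# [editor's note, not part of the patch] the save_dt change in this patch
# returns the same value the patch-15 version computed whenever its assertion
# held: given assert self.dt <= self.backup_dt, max(self.dt, self.backup_dt)
# is always self.backup_dt, so returning backup_dt directly is equivalent
# (minus the assertion):
#     assert max(1, 2) == 2  # max(a, b) == b whenever a <= b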
events.append(Event( - dt=ps.save_dt, + dt=ps.created if first else ps.save_dt, text=f"unfavorited", kind=ps, eid=f'unf-{ps.sid}', @@ -169,7 +176,7 @@ def get_events(all_=True, parallel=True) -> List[Event]: else: # in saves s = saves[key] events.append(Event( - dt=s.save_dt, + dt=s.created if first else s.save_dt, text=f"favorited{' [initial]' if first else ''}", kind=s, eid=f'fav-{s.sid}', @@ -217,10 +224,10 @@ def test_unfav(): uf = uevents[1] assert uf.text == 'unfavorited' -# def test_get_all_saves(): -# saves = get_saves(all_=True) -# # just check that they are unique.. -# make_dict(saves, key=lambda s: s.sid) +def test_get_all_saves(): + saves = get_saves(all_=True) + # just check that they are unique.. + make_dict(saves, key=lambda s: s.sid) def main(): From 687e008f133f49c556c0086ecfeaef97fe93a646 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Tue, 23 Apr 2019 22:10:25 +0100 Subject: [PATCH 17/19] cachy extraction --- reddit/__init__.py | 40 +++++++++++++++++++++++++++++----------- 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/reddit/__init__.py b/reddit/__init__.py index fc73f3e..a10c1ca 100755 --- a/reddit/__init__.py +++ b/reddit/__init__.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 -from typing import List, Dict, Union, Iterable, Iterator, NamedTuple, Any +from typing import List, Dict, Union, Iterable, Iterator, NamedTuple, Any, Sequence import json +from functools import lru_cache from collections import OrderedDict from pathlib import Path import pytz @@ -10,6 +11,7 @@ import logging from multiprocessing import Pool from kython import kompress, cproperty, make_dict +from kython.klogging import setup_logzero # TODO hmm. apparently decompressing takes quite a bit of time... @@ -22,8 +24,8 @@ def reddit(suffix: str) -> str: return 'https://reddit.com' + suffix -def _get_backups(all_=True) -> List[Path]: - bfiles = list(sorted(BPATH.glob('reddit-*.json.xz'))) +def _get_backups(all_=True) -> Sequence[Path]: + bfiles = tuple(sorted(BPATH.glob('reddit-*.json.xz'))) # TODO switch to that new compression format? if all_: return bfiles else: @@ -139,9 +141,9 @@ def get_state(bfile: Path) -> Dict[Sid, Save]: return OrderedDict() -def get_events(all_=True, parallel=True) -> List[Event]: - backups = _get_backups(all_=all_) - assert len(backups) > 0 +@lru_cache(1) +def _get_events(backups: Sequence[Path], parallel: bool) -> List[Event]: + logger = get_logger() events: List[Event] = [] prev_saves: Dict[Sid, Save] = {} @@ -188,10 +190,15 @@ def get_events(all_=True, parallel=True) -> List[Event]: # TODO a bit awkward, favorited should compare lower than unfavorited? return list(sorted(events, key=lambda e: e.cmp_key)) -def get_saves(all_=True) -> List[Save]: +def get_events(*args, all_=True, parallel=True): + backups = _get_backups(all_=all_) + assert len(backups) > 0 + return _get_events(backups=backups, parallel=parallel) + +def get_saves(**kwargs) -> List[Save]: logger = get_logger() - events = get_events(all_=all_) + events = get_events(**kwargs) saves: Dict[Sid, Save] = OrderedDict() for e in events: if e.text.startswith('favorited'): @@ -212,8 +219,6 @@ def test(): get_saves(all_=False) -# TODO fuck. pytest is broken?? -# right, apparently I need pytest.ini file... 
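# [editor's note, not part of the patch] on the caching introduced in this
# patch: _get_backups now returns a tuple rather than a list, presumably so
# that @lru_cache(1) on _get_events can hash its arguments -- tuples are
# hashable, lists are not. Minimal illustration (hypothetical function):
#     from functools import lru_cache
#     @lru_cache(1)
#     def f(xs: tuple) -> int:
#         return sum(xs)
#     f((1, 2, 3))    # a second identical call is served from the cache
#     # f([1, 2, 3])  # would raise TypeError: unhashable type: 'list'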
def test_unfav(): events = get_events(all_=True) url = 'https://reddit.com/r/QuantifiedSelf/comments/acxy1v/personal_dashboard/' @@ -224,14 +229,27 @@ def test_unfav(): uf = uevents[1] assert uf.text == 'unfavorited' + def test_get_all_saves(): saves = get_saves(all_=True) # just check that they are unique.. make_dict(saves, key=lambda s: s.sid) +# TODO cache? +def test_disappearing(): + # eh. so for instance, 'metro line colors' is missing from reddit-20190402005024.json for no reason + # but I guess it was just a short glitch... so whatever + saves = get_events(all_=True) + favs = [s.kind for s in saves if s.text == 'favorited'] + [deal_with_it] = [f for f in favs if f.title == '"Deal with it!"'] + assert deal_with_it.backup_dt == datetime(2019, 4, 1, 23, 10, 25, tzinfo=pytz.utc) + + def main(): - events = get_events() + setup_logzero(get_logger(), level=logging.DEBUG) + # TODO eh. not sure why but parallel on seems to mess glumov up and cause OOM... + events = get_events(parallel=False) print(len(events)) for e in events: print(e.text, e.url) From f151e495743db10b5439c4cf67165d062cd1aaea Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Tue, 23 Apr 2019 23:04:39 +0100 Subject: [PATCH 18/19] extra test, better handling for unfavorited date --- reddit/__init__.py | 48 +++++++++++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/reddit/__init__.py b/reddit/__init__.py index a10c1ca..f66efd3 100755 --- a/reddit/__init__.py +++ b/reddit/__init__.py @@ -10,13 +10,14 @@ from datetime import datetime import logging from multiprocessing import Pool -from kython import kompress, cproperty, make_dict +from kython import kompress, cproperty, make_dict, numbers from kython.klogging import setup_logzero # TODO hmm. apparently decompressing takes quite a bit of time... BPATH = Path("/L/backups/reddit") + def get_logger(): return logging.getLogger('reddit-provider') @@ -34,7 +35,7 @@ def _get_backups(all_=True) -> Sequence[Path]: Sid = str class Save(NamedTuple): - dt: datetime # TODO misleading name... this is creation dt, not saving dt + created: datetime backup_dt: datetime title: str sid: Sid @@ -43,10 +44,6 @@ class Save(NamedTuple): def __hash__(self): return hash(self.sid) - @cproperty - def created(self) -> datetime: - return self.dt - @cproperty def save_dt(self) -> datetime: # TODO not exactly precise... but whatever I guess @@ -104,16 +101,19 @@ def get_some(d, *keys): Url = str -# TODO shit. there does seem to be a difference... -# TODO do it in multiple threads?? -def get_state(bfile: Path) -> Dict[Sid, Save]: - logger = get_logger() - logger.debug('handling %s', bfile) - +def _get_bdate(bfile: Path) -> datetime: RE = re.compile(r'reddit-(\d{14})') match = RE.search(bfile.stem) assert match is not None bdt = pytz.utc.localize(datetime.strptime(match.group(1), "%Y%m%d%H%M%S")) + return bdt + + +def _get_state(bfile: Path) -> Dict[Sid, Save]: + logger = get_logger() + logger.debug('handling %s', bfile) + + bdt = _get_bdate(bfile) saves: List[Save] = [] with kompress.open(bfile) as fo: @@ -121,12 +121,12 @@ def get_state(bfile: Path) -> Dict[Sid, Save]: saved = jj['saved'] for s in saved: - dt = pytz.utc.localize(datetime.utcfromtimestamp(s['created_utc'])) + created = pytz.utc.localize(datetime.utcfromtimestamp(s['created_utc'])) # TODO need permalink # url = get_some(s, 'link_permalink', 'url') # this was original url... 
title = get_some(s, 'link_title', 'title') save = Save( - dt=dt, + created=created, backup_dt=bdt, title=title, sid=s['id'], @@ -135,7 +135,7 @@ def get_state(bfile: Path) -> Dict[Sid, Save]: saves.append(save) return make_dict( - sorted(saves, key=lambda p: p.dt), # TODO make helper to create lambda from property? + sorted(saves, key=lambda p: p.created), key=lambda s: s.sid, ) return OrderedDict() @@ -153,12 +153,13 @@ def _get_events(backups: Sequence[Path], parallel: bool) -> List[Event]: states: Iterable[Dict[Sid, Save]] if parallel: with Pool() as p: - states = p.map(get_state, backups) + states = p.map(_get_state, backups) else: # also make it lazy... - states = map(get_state, backups) + states = map(_get_state, backups) - for i, saves in enumerate(states): # TODO when date... + for i, bfile, saves in zip(numbers(), backups, states): + bdt = _get_bdate(bfile) first = i == 0 @@ -167,8 +168,9 @@ def _get_events(backups: Sequence[Path], parallel: bool) -> List[Event]: if ps is not None: # TODO use backup date, that is more precise... # eh. I guess just take max and it will always be correct? + assert not first events.append(Event( - dt=ps.created if first else ps.save_dt, + dt=bdt, # TODO average wit ps.save_dt? text=f"unfavorited", kind=ps, eid=f'unf-{ps.sid}', @@ -236,7 +238,6 @@ def test_get_all_saves(): make_dict(saves, key=lambda s: s.sid) -# TODO cache? def test_disappearing(): # eh. so for instance, 'metro line colors' is missing from reddit-20190402005024.json for no reason # but I guess it was just a short glitch... so whatever @@ -246,6 +247,13 @@ def test_disappearing(): assert deal_with_it.backup_dt == datetime(2019, 4, 1, 23, 10, 25, tzinfo=pytz.utc) +def test_unfavorite(): + events = get_events(all_=True) + unfavs = [s for s in events if s.text == 'unfavorited'] + [xxx] = [u for u in unfavs if u.eid == 'unf-19ifop'] + assert xxx.dt == datetime(2019, 1, 28, 8, 10, 20, tzinfo=pytz.utc) + + def main(): setup_logzero(get_logger(), level=logging.DEBUG) # TODO eh. not sure why but parallel on seems to mess glumov up and cause OOM... From 006e35cc877d19e7fc733a6ee8cae83f2457b6b2 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Wed, 28 Aug 2019 16:21:08 +0200 Subject: [PATCH 19/19] attemt to use cachew --- reddit/__init__.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/reddit/__init__.py b/reddit/__init__.py index f66efd3..4c3169c 100755 --- a/reddit/__init__.py +++ b/reddit/__init__.py @@ -40,6 +40,8 @@ class Save(NamedTuple): title: str sid: Sid json: Any = None + # TODO ugh. not sure how to support this in cachew... could try serializing dicts of simple types automatically.. but json can't be properly typed + # TODO why would json be none? def __hash__(self): return hash(self.sid) @@ -70,10 +72,12 @@ class Save(NamedTuple): return self.json['subreddit']['display_name'] -class Misc(NamedTuple): - pass +# class Misc(NamedTuple): +# pass -EventKind = Union[Save, Misc] +# EventKind = Union[Save, Misc] + +EventKind = Save class Event(NamedTuple): dt: datetime @@ -140,6 +144,9 @@ def _get_state(bfile: Path) -> Dict[Sid, Save]: ) return OrderedDict() +# from cachew import cachew +# TODO hmm. how to combine cachew and lru_cache?.... +# @cachew('/L/data/.cache/reddit-events.cache') @lru_cache(1) def _get_events(backups: Sequence[Path], parallel: bool) -> List[Event]:
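
[editor's note] The series ends with cachew wired up but commented out: per the TODOs in this last patch, Save's untyped `json: Any` field has no obvious serialization for cachew (which persists results to an on-disk cache based on the type annotations), and it is unclear how the decorator should compose with the existing @lru_cache. A hedged sketch of what enabling it might eventually look like, reusing only the cache path from the commented-out lines above (exact cachew behavior and signature assumed, not verified here):

    from cachew import cachew

    @cachew('/L/data/.cache/reddit-events.cache')
    def _get_events(backups: Sequence[Path], parallel: bool) -> List[Event]:
        ...

How, or whether, this should stack with the @lru_cache(1) already on _get_events is exactly the open question the patch's TODO records.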