From f151e495743db10b5439c4cf67165d062cd1aaea Mon Sep 17 00:00:00 2001
From: Dima Gerasimov
Date: Tue, 23 Apr 2019 23:04:39 +0100
Subject: [PATCH] extra test, better handling for unfavorited date

---
 reddit/__init__.py | 48 ++++++++++++++++++++++++++++--------------------
 1 file changed, 28 insertions(+), 20 deletions(-)

diff --git a/reddit/__init__.py b/reddit/__init__.py
index a10c1ca..f66efd3 100755
--- a/reddit/__init__.py
+++ b/reddit/__init__.py
@@ -10,13 +10,14 @@ from datetime import datetime
 import logging
 from multiprocessing import Pool
 
-from kython import kompress, cproperty, make_dict
+from kython import kompress, cproperty, make_dict, numbers
 from kython.klogging import setup_logzero
 
 # TODO hmm. apparently decompressing takes quite a bit of time...
 
 BPATH = Path("/L/backups/reddit")
 
+
 def get_logger():
     return logging.getLogger('reddit-provider')
 
@@ -34,7 +35,7 @@ def _get_backups(all_=True) -> Sequence[Path]:
 Sid = str
 
 class Save(NamedTuple):
-    dt: datetime # TODO misleading name... this is creation dt, not saving dt
+    created: datetime
     backup_dt: datetime
     title: str
     sid: Sid
@@ -43,10 +44,6 @@ class Save(NamedTuple):
     def __hash__(self):
         return hash(self.sid)
 
-    @cproperty
-    def created(self) -> datetime:
-        return self.dt
-
     @cproperty
     def save_dt(self) -> datetime:
         # TODO not exactly precise... but whatever I guess
@@ -104,16 +101,19 @@ def get_some(d, *keys):
 
 Url = str
 
-# TODO shit. there does seem to be a difference...
-# TODO do it in multiple threads??
-def get_state(bfile: Path) -> Dict[Sid, Save]:
-    logger = get_logger()
-    logger.debug('handling %s', bfile)
-
+def _get_bdate(bfile: Path) -> datetime:
     RE = re.compile(r'reddit-(\d{14})')
     match = RE.search(bfile.stem)
     assert match is not None
     bdt = pytz.utc.localize(datetime.strptime(match.group(1), "%Y%m%d%H%M%S"))
+    return bdt
+
+
+def _get_state(bfile: Path) -> Dict[Sid, Save]:
+    logger = get_logger()
+    logger.debug('handling %s', bfile)
+
+    bdt = _get_bdate(bfile)
 
     saves: List[Save] = []
     with kompress.open(bfile) as fo:
@@ -121,12 +121,12 @@
         saved = jj['saved']
 
         for s in saved:
-            dt = pytz.utc.localize(datetime.utcfromtimestamp(s['created_utc']))
+            created = pytz.utc.localize(datetime.utcfromtimestamp(s['created_utc']))
             # TODO need permalink
             # url = get_some(s, 'link_permalink', 'url') # this was original url...
             title = get_some(s, 'link_title', 'title')
             save = Save(
-                dt=dt,
+                created=created,
                 backup_dt=bdt,
                 title=title,
                 sid=s['id'],
@@ -135,7 +135,7 @@
             saves.append(save)
 
         return make_dict(
-            sorted(saves, key=lambda p: p.dt), # TODO make helper to create lambda from property?
+            sorted(saves, key=lambda p: p.created),
             key=lambda s: s.sid,
         )
     return OrderedDict()
@@ -153,12 +153,13 @@ def _get_events(backups: Sequence[Path], parallel: bool) -> List[Event]:
     states: Iterable[Dict[Sid, Save]]
     if parallel:
         with Pool() as p:
-            states = p.map(get_state, backups)
+            states = p.map(_get_state, backups)
     else:
         # also make it lazy...
-        states = map(get_state, backups)
+        states = map(_get_state, backups)
 
-    for i, saves in enumerate(states): # TODO when date...
+    for i, bfile, saves in zip(numbers(), backups, states):
+        bdt = _get_bdate(bfile)
 
         first = i == 0
 
@@ -167,8 +168,9 @@
         if ps is not None:
             # TODO use backup date, that is more precise...
             # eh. I guess just take max and it will always be correct?
+            assert not first
             events.append(Event(
-                dt=ps.created if first else ps.save_dt,
+                dt=bdt, # TODO average with ps.save_dt?
                 text=f"unfavorited",
                 kind=ps,
                 eid=f'unf-{ps.sid}',
@@ -236,7 +238,6 @@ def test_get_all_saves():
     make_dict(saves, key=lambda s: s.sid)
 
 
-# TODO cache?
 def test_disappearing():
     # eh. so for instance, 'metro line colors' is missing from reddit-20190402005024.json for no reason
     # but I guess it was just a short glitch... so whatever
@@ -246,6 +247,13 @@
     assert deal_with_it.backup_dt == datetime(2019, 4, 1, 23, 10, 25, tzinfo=pytz.utc)
 
 
+def test_unfavorite():
+    events = get_events(all_=True)
+    unfavs = [s for s in events if s.text == 'unfavorited']
+    [xxx] = [u for u in unfavs if u.eid == 'unf-19ifop']
+    assert xxx.dt == datetime(2019, 1, 28, 8, 10, 20, tzinfo=pytz.utc)
+
+
 def main():
     setup_logzero(get_logger(), level=logging.DEBUG)
     # TODO eh. not sure why but parallel on seems to mess glumov up and cause OOM...
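
For reference, a minimal standalone sketch of the backup-timestamp parsing
that the new _get_bdate helper performs. The regex, the "%Y%m%d%H%M%S"
format, and the UTC interpretation are taken from the hunk above; the helper
name parse_backup_date and the bare .json filename are illustrative:

    import re
    from datetime import datetime
    from pathlib import Path

    import pytz

    def parse_backup_date(bfile: Path) -> datetime:
        # the 14-digit timestamp embedded in the filename is interpreted as UTC
        match = re.search(r'reddit-(\d{14})', bfile.stem)
        assert match is not None
        return pytz.utc.localize(datetime.strptime(match.group(1), "%Y%m%d%H%M%S"))

    # e.g. for the backup file mentioned in test_disappearing:
    assert parse_backup_date(Path('reddit-20190402005024.json')) == \
        datetime(2019, 4, 2, 0, 50, 24, tzinfo=pytz.utc)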
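
As for the unfavorited events themselves, the idea suggested by the hunks
above is a comparison of consecutive backup states: a save id present in the
previous state but missing from the current one was presumably unfavorited in
between, and the event is now dated with the current backup's timestamp (bdt)
rather than the save's own save_dt. A sketch of that set difference, with a
hypothetical helper name and a simplified state type:

    from typing import Dict, Set

    def unfavorited_sids(prev_state: Dict[str, object],
                         cur_state: Dict[str, object]) -> Set[str]:
        # ids that disappeared between two consecutive backups; the current
        # backup's date is an upper bound on when the unfavorite happened
        return set(prev_state) - set(cur_state)

    assert unfavorited_sids({'s1': ..., 's2': ...}, {'s1': ...}) == {'s2'}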