diff --git a/foursquare/__init__.py b/foursquare/__init__.py
index 630e0cb..d1dbb86 100644
--- a/foursquare/__init__.py
+++ b/foursquare/__init__.py
@@ -1,19 +1,23 @@
 from datetime import datetime, timezone, timedelta
 # TODO pytz for timezone???
-from typing import List, Dict, NamedTuple, Union, Any
+from typing import List, Dict, NamedTuple, Union, Any, Tuple
 
-from kython import safe_get, flatten
+from kython import safe_get, flatten, load_json_file
 from kython.data import get_last_file
 
 # TODO actually i'm parsing FSQ in my gmaps thing
 _BPATH = '/L/backups/4sq'
 
+def get_logger():
+    import logging
+    return logging.getLogger("fsq-provider")
+
 class Checkin:
     def __init__(self, j) -> None:
         self.j = j
 
     @property
-    def _summary(self) -> str:
+    def summary(self) -> str:
         return "checked into " + safe_get(self.j, 'venue', 'name', default="NO_NAME") + " " + self.j.get('shout', "") # TODO should should be bold...
 
     # TODO maybe return htmlish? if not html, interpret as string
@@ -26,8 +30,157 @@ class Checkin:
         # TODO localize??
         return datetime.fromtimestamp(created, tz=tz)
 
-def get_checkins():
-    j = get_last_file(_BPATH)
-    everything = flatten([x['response']['checkins']['items'] for x in j])
+def get_raw(fname=None):
+    if fname is None:
+        fname = get_last_file(_BPATH, '.json')
+    j = load_json_file(fname)
+
+    assert isinstance(j, list)
+    for chunk in j:
+        del chunk['meta']
+        del chunk['notifications']
+        assert chunk.keys() == {'response'}
+        assert chunk['response'].keys() == {'checkins'}
+
+    return flatten([x['response']['checkins']['items'] for x in j])
+
+
+# TODO not sure how to make it generic..
+def get_checkins(*args, **kwargs):
+    everything = get_raw(*args, **kwargs)
     checkins = sorted([Checkin(i) for i in everything], key=lambda c: c.dt)
     return checkins
+
+
+# def extract(j):
+#     assert isinstance(j, list)
+#     for chunk in j:
+
+class JsonComparer:
+    def __init__(self, ignored=None):
+        import re
+        self.ignored = set() if ignored is None else {
+            re.compile(i) for i in ignored
+        }
+        self.logger = get_logger()
+
+    # TODO ugh, maybe just check if it dominates? and comparison if both dominate each other...
+    def compare(self, a, b, path: str=""):
+        # TODO not so sure about contains...
+        if any(i.match(path) for i in self.ignored):
+            self.logger.debug(f"ignoring path {path}")
+            return True
+        if a == b:
+            return True
+        alleq = True
+        if isinstance(a, (int, float, bool, type(None), str)):
+            self.logger.warning(f"at path {path}: {a} != {b}")
+            alleq = False
+        elif isinstance(a, list) or isinstance(b, list):
+            if a is None or b is None or len(a) != len(b):
+                alleq = False
+            else:
+                for i in range(len(a)):
+                    if not self.compare(a[i], b[i], path + f"[{i}]"):
+                        self.logger.warning(f"at path {path}")
+                        alleq = False
+        elif isinstance(a, dict) or isinstance(b, dict):
+            ka = set(a.keys())
+            kb = set(b.keys())
+            if ka != kb:
+                # import ipdb; ipdb.set_trace()
+                self.logger.warning(f"at path {path}")
+                alleq = False
+            else:
+                for k in ka:
+                    if not self.compare(a[k], b[k], path + f".{k}"):
+                        alleq = False
+        else:
+            raise RuntimeError(f"Type mismatch: {type(a)} vs {type(b)}")
+
+        return alleq
+
+
+# TODO ok, so it's stats changing... I guess I can handle it same way I handle reddit...
+def get_comparer():
+    def chregex(rest: str):
+        return r"^.\w+" + rest
+    c = JsonComparer(ignored={
+        chregex('.venue.stats'),
+        chregex('.venue.menu.url'),
+
+        # not so sure about these, but I guess makes sense. maybe add a sanity check that they are not too different??
+        chregex('.venue.location.lat'),
+        chregex('.venue.location.lng'),
+        chregex('.venue.location.labeledLatLngs'),
+
+        # TODO isMayor?
+    })
+    return c
+
+# TODO right, I should only compare equivalent entries...
+from kython import JSONType
+def check_backups(backups: List[Tuple[JSONType, str]]):
+    logger = get_logger()
+    if len(backups) < 2:
+        logger.info(f"Nothing to check: only {len(backups)} left")
+        return []
+    lastj, lastf = backups[-1]
+    tocleanup: List[str] = []
+    comp = get_comparer()
+    for prevj, prevf in backups[-2::-1]:
+        logger.info(f"Comparing {lastf} vs {prevf}")
+        cres = comp.compare(prevj, lastj)
+        if cres:
+            logger.info(f"Removing {prevf}")
+        else:
+            logger.info(f"{lastf} differs from {prevf}")
+
+
+def get_cid_map(bfile: str):
+    raw = get_raw(bfile)
+    return {i['id']: i for i in raw}
+
+
+def cleanup_backups():
+    from kython.data import get_all_files
+    from pprint import pprint
+    prev = None
+
+    # ok, so. pick last
+    # compare against prev. if there are no differences, delete prev. otherwise, choose prev as last. repeat
+
+    bfiles = get_all_files(_BPATH, 'checkins_2018-08')
+    backups = [(get_cid_map(bfile), bfile) for bfile in bfiles]
+    for (pv, _), (nx, _) in zip(backups, backups[1:]):
+        torm = set()
+        for cid in nx:
+            if cid not in pv:
+                torm.add(cid)
+        for cid in torm:
+            del nx[cid] # meh?
+    check_backups(backups)
+    return
+
+    for f in bfiles:
+        print(f"Processing {f}")
+        cur = {ch['id']: ch for ch in get_raw(f)}
+        count = 0
+        if prev is not None:
+            for cid, c in cur.items():
+                if cid not in prev:
+                    print(f"new checkin {cid}!")
+                else:
+                    pc = prev[cid]
+                    if pc != c:
+                        compare_jsons(pc, c)
+                        # import ipdb; ipdb.set_trace()
+                        # print("WTF")
+                        # pprint(pc)
+                        # pprint(c)
+                        # print("-----------")
+        # pres = c in prev
+        # if not pres:
+        #     count += 1
+        print(f"Difference: {count}")
+        prev = cur
diff --git a/foursquare/__main__.py b/foursquare/__main__.py
index 2ac915e..5cff636 100644
--- a/foursquare/__main__.py
+++ b/foursquare/__main__.py
@@ -1,4 +1,13 @@
-from foursquare import get_checkins
+from foursquare import get_checkins, get_logger, cleanup_backups
+
+import logging
+from kython.logging import setup_logzero
+
+logger = get_logger()
+setup_logzero(logger, level=logging.INFO)
+
+cleanup_backups()
+
+# for c in get_checkins():
+#     print(c)
 
-for c in get_checkins():
-    print(c)
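
A note on JsonComparer above: it walks two parsed backups in parallel, building a dotted path for every node it visits, and a mismatch is waved through when that path matches one of the ignored patterns (venue stats, menu URL, slowly drifting coordinates). The standalone sketch below illustrates the same idea on made-up data; diff_paths, IGNORED and the example payloads are illustrative, not part of the module.

import re

# Ignore patterns in the same shape as chregex() builds them: a leading
# top-level key, then a literal sub-path that is allowed to differ.
IGNORED = {re.compile(r"^\.\w+" + rest) for rest in (
    r"\.venue\.stats",
    r"\.venue\.location\.lat",
)}

def diff_paths(a, b, path=""):
    # Collect the paths at which two parsed checkin payloads disagree,
    # skipping anything that matches an ignored pattern.
    if any(p.match(path) for p in IGNORED):
        return []
    if isinstance(a, dict) and isinstance(b, dict) and a.keys() == b.keys():
        return [d for k in a for d in diff_paths(a[k], b[k], f"{path}.{k}")]
    if isinstance(a, list) and isinstance(b, list) and len(a) == len(b):
        return [d for i, (x, y) in enumerate(zip(a, b))
                for d in diff_paths(x, y, f"{path}[{i}]")]
    return [] if a == b else [path]

# Made-up payloads: only the (ignored) venue stats changed between backups.
old = {'c1': {'venue': {'name': 'Some Cafe', 'stats': {'checkinsCount': 10}}}}
new = {'c1': {'venue': {'name': 'Some Cafe', 'stats': {'checkinsCount': 12}}}}
print(diff_paths(old, new))  # [] -- the two backups count as equivalent

Returning the offending paths instead of logging them makes the sketch easy to assert on in a quick test.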
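
check_backups follows the plan spelled out in the cleanup_backups comments: take the newest backup as the reference, compare it against progressively older ones, treat an older file as removable when nothing meaningful differs, and otherwise promote that file to be the new reference. As written it only logs its decisions and never fills tocleanup. The sketch below is one way the completed scan could look; redundant_backups and same are illustrative names, with the comparison function passed in (e.g. get_comparer().compare).

from typing import Any, Callable, List, Tuple

def redundant_backups(backups: List[Tuple[Any, str]],
                      same: Callable[[Any, Any], bool]) -> List[str]:
    # Walk from the newest snapshot towards the oldest; an older file whose
    # parsed content matches the current reference adds nothing and can go.
    if len(backups) < 2:
        return []
    refj = backups[-1][0]
    removable: List[str] = []
    for prevj, prevf in reversed(backups[:-1]):
        if same(prevj, refj):
            removable.append(prevf)
        else:
            refj = prevj  # this file stays; older ones are compared against it
    return removable

# e.g.: to_remove = redundant_backups(backups, get_comparer().compare)

Returning the file names rather than deleting them on the spot keeps the destructive step separate and easy to dry-run.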
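
The zip loop in cleanup_backups narrows each newer snapshot to the checkin ids that already exist in the previous one before comparing, since a brand-new checkin is an expected difference rather than drift in old data. Here is a small sketch of that normalisation, assuming id-keyed dicts like the ones get_cid_map returns; restrict_to_common and the file names are hypothetical.

from typing import Any, Dict

def restrict_to_common(older: Dict[str, Any], newer: Dict[str, Any]) -> Dict[str, Any]:
    # Keep only checkins whose id was already present in the older snapshot,
    # so the comparison measures drift in shared entries, not new activity.
    return {cid: checkin for cid, checkin in newer.items() if cid in older}

# e.g. (hypothetical file names):
# older = get_cid_map('/L/backups/4sq/checkins_2018-08-01.json')
# newer = restrict_to_common(older, get_cid_map('/L/backups/4sq/checkins_2018-08-02.json'))

Unlike the in-place del in the loop above, this returns a filtered copy and leaves the parsed backups untouched.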