attempt to dedup

This commit is contained in:
Dima Gerasimov 2018-09-09 16:57:26 +03:00
parent a0da879a0e
commit 416b363362
2 changed files with 171 additions and 9 deletions

View file

@ -1,19 +1,23 @@
from datetime import datetime, timezone, timedelta
# TODO pytz for timezone???
from typing import List, Dict, NamedTuple, Union, Any
from typing import List, Dict, NamedTuple, Union, Any, Tuple
from kython import safe_get, flatten
from kython import safe_get, flatten, load_json_file
from kython.data import get_last_file
# TODO actually i'm parsing FSQ in my gmaps thing
_BPATH = '/L/backups/4sq'
def get_logger():
import logging
return logging.getLogger("fsq-provider")
class Checkin:
def __init__(self, j) -> None:
self.j = j
@property
def _summary(self) -> str:
def summary(self) -> str:
return "checked into " + safe_get(self.j, 'venue', 'name', default="NO_NAME") + " " + self.j.get('shout', "") # TODO should should be bold...
# TODO maybe return htmlish? if not html, interpret as string
@ -26,8 +30,157 @@ class Checkin:
# TODO localize??
return datetime.fromtimestamp(created, tz=tz)
def get_checkins():
j = get_last_file(_BPATH)
everything = flatten([x['response']['checkins']['items'] for x in j])
def get_raw(fname=None):
if fname is None:
fname = get_last_file(_BPATH, '.json')
j = load_json_file(fname)
assert isinstance(j, list)
for chunk in j:
del chunk['meta']
del chunk['notifications']
assert chunk.keys() == {'response'}
assert chunk['response'].keys() == {'checkins'}
return flatten([x['response']['checkins']['items'] for x in j])
# TODO not sure how to make it generic..
def get_checkins(*args, **kwargs):
everything = get_raw(*args, **kwargs)
checkins = sorted([Checkin(i) for i in everything], key=lambda c: c.dt)
return checkins
# def extract(j):
# assert isinstance(j, list)
# for chunk in j:
class JsonComparer:
def __init__(self, ignored=None):
import re
self.ignored = {} if ignored is None else {
re.compile(i) for i in ignored
}
self.logger = get_logger()
# TODO ugh, maybe just check if it dominates? and comparison if both dominate each other...
def compare(self, a, b, path: str=""):
# TODO not so sure about contains...
if any(i.match(path) for i in self.ignored):
self.logger.debug(f"ignoring path {path}")
return True
if a == b:
return True
alleq = True
if isinstance(a, (int, float, bool, type(None), str)):
self.logger.warning(f"at path {path}: {a} != {b}")
alleq = False
elif isinstance(a, list) or isinstance(b, list):
if a is None or b is None or len(a) != len(b):
alleq = False
else:
for i in range(len(a)):
if not self.compare(a[i], b[i], path + f"[]"):
self.logger.warning(f"at path {path}")
alleq = False
elif isinstance(a, dict) or isinstance(b, dict):
ka = set(a.keys())
kb = set(b.keys())
if ka != kb:
import ipdb; ipdb.set_trace()
self.logger.warning(f"at path {path}")
alleq = False
else:
for k in ka:
if not self.compare(a[k], b[k], path + f".{k}"):
alleq = False
else:
raise RuntimeError(f"Type mismatch: {type(a)} vs {type(b)}")
return alleq
# TODO ok, so it's stats changing... I guess I can handle it same way I handle reddit...
def get_comparer():
def chregex(rest: str):
return r"^.\w+" + rest
c = JsonComparer(ignored={
chregex('.venue.stats'),
chregex('.venue.menu.url'),
# not so sure about these, but I guess makes sense. maybe add a sanity check that they are not too different??
chregex('.venue.location.lat'),
chregex('.venue.location.lng'),
chregex('.venue.location.labeledLatLngs'),
# TODO isMayor?
})
return c
# TODO right, I should only compare equivalent entries...
from kython import JSONType
def check_backups(backups: List[Tuple[JSONType, str]]):
logger = get_logger()
if len(backups) < 1:
logger.info(f"Nothing to check: only {len(backups)} left")
return []
lastj, lastf = backups[-1]
tocleanup: List[str] = []
comp = get_comparer()
for prevj, prevf in backups[-2::-1]:
logger.info(f"Comparing {lastf} vs {prevf}")
cres = comp.compare(prevj, lastj)
if cres:
logger.info(f"Removing {prevf}")
else:
logger.info(f"{lastf} differs from {prevf}")
def get_cid_map(bfile: str):
raw = get_raw(bfile)
return {i['id']: i for i in raw}
def cleanup_backups():
from kython.data import get_all_files
from pprint import pprint
prev = None
# ok, so. pick last
# compare against prev. if there are no differences, delete prev. otherwise, choose prev as last. repeat
bfiles = get_all_files(_BPATH, 'checkins_2018-08')
backups = [(get_cid_map(bfile), bfile) for bfile in bfiles]
for (pv, _), (nx, _) in zip(backups, backups[1:]):
torm = set()
for cid in nx:
if cid not in pv:
torm.add(cid)
for cid in torm:
del nx[cid] # meh?
check_backups(backups)
return
for f in bfiles:
print(f"Processing {f}")
cur = {ch['id']: ch for ch in get_raw(f)}
count = 0
if prev is not None:
for cid, c in cur.items():
if cid not in prev:
print(f"new checkin {cid}!")
else:
pc = prev[cid]
if pc != c:
compare_jsons(pc, c)
# import ipdb; ipdb.set_trace()
# print("WTF")
# pprint(pc)
# pprint(c)
# print("-----------")
# pres = c in prev
# if not pres:
# count += 1
print(f"Difference: {count}")
prev = cur

View file

@ -1,4 +1,13 @@
from foursquare import get_checkins
from foursquare import get_checkins, get_logger, cleanup_backups
import logging
from kython.logging import setup_logzero
logger = get_logger()
setup_logzero(logger, level=logging.INFO)
cleanup_backups()
# for c in get_checkins():
# print(c)
for c in get_checkins():
print(c)