simplify provider

This commit is contained in:
Dima Gerasimov 2019-05-08 20:50:10 +01:00
parent 416b363362
commit 8d79c750c4
4 changed files with 24 additions and 160 deletions

10
ci.sh
View file

@@ -1,10 +0,0 @@
#!/bin/bash
# CI entry point: type-check and lint the foursquare package.
# fix: "$(this_dir)" is not a command — use the same idiom as the `run`
# script to cd into the directory containing this script.
cd "$(dirname "$0")" || exit
. ~/bash_ci
ci_run mypy foursquare
ci_run pylint -E foursquare
ci_report_errors

155
foursquare/__init__.py Normal file → Executable file
View file

@@ -1,17 +1,21 @@
#!/usr/bin/env python3
from datetime import datetime, timezone, timedelta
# TODO pytz for timezone???
from typing import List, Dict, NamedTuple, Union, Any, Tuple
from typing import List, Dict, NamedTuple, Union, Any, Tuple, Set
import json
from pathlib import Path
from kython import safe_get, flatten, load_json_file
from kython.data import get_last_file
# TODO pytz for timezone???
from kython import safe_get, flatten
# TODO actually i'm parsing FSQ in my gmaps thing
_BPATH = '/L/backups/4sq'
_BPATH = Path('/L/backups/4sq')
def get_logger():
    """Return the module-wide logger for this provider.

    The import is kept local so importing the package stays cheap.
    """
    import logging
    logger = logging.getLogger("fsq-provider")
    return logger
class Checkin:
    # NOTE(review): only a fragment of this class is visible here — the rest
    # is split across the diff hunk below.
    def __init__(self, j) -> None:
        # j: raw checkin JSON dict from a 4sq backup — TODO confirm schema
        self.j = j
@@ -30,10 +34,15 @@ class Checkin:
# TODO localize??
return datetime.fromtimestamp(created, tz=tz)
    @property
    def cid(self) -> str:
        # checkin identifier taken verbatim from the raw JSON payload
        return self.j['id']
def get_raw(fname=None):
    # Load one 4sq backup JSON file; defaults to the newest *.json in _BPATH.
    if fname is None:
        # NOTE(review): the diff extraction interleaved the pre-change loader
        # (kython get_last_file/load_json_file) with its pathlib/json
        # replacement below — only one of the two paths should remain;
        # reconcile against the actual commit.
        fname = get_last_file(_BPATH, '.json')
        j = load_json_file(fname)
        fname = max(_BPATH.glob('*.json'))
    with Path(fname).open() as fo:
        j = json.load(fo)
    assert isinstance(j, list)
    for chunk in j:
        # body continues in the next hunk — truncated in this view
@@ -52,135 +61,19 @@ def get_checkins(*args, **kwargs):
return checkins
# def extract(j):
# assert isinstance(j, list)
# for chunk in j:
class JsonComparer:
    """Recursively compare two JSON-like values, logging every difference.

    Paths matching any of the ``ignored`` regexes are considered equal
    regardless of content (useful for fields that churn between backups).
    """

    def __init__(self, ignored=None):
        import re
        import logging
        # fix: the original fell back to an empty *dict* ({}) where a *set*
        # of compiled patterns is intended; iteration happened to work, but
        # the type was wrong.
        self.ignored = set() if ignored is None else {
            re.compile(i) for i in ignored
        }
        # same logger name as the module-level get_logger()
        self.logger = logging.getLogger("fsq-provider")

    # TODO ugh, maybe just check if it dominates? and comparison if both dominate each other...
    def compare(self, a, b, path: str = "") -> bool:
        """Return True when `a` and `b` match up to the ignored paths.

        Differences are logged as warnings; the walk continues past the
        first mismatch so every difference gets reported.
        """
        # TODO not so sure about contains...
        if any(i.match(path) for i in self.ignored):
            self.logger.debug(f"ignoring path {path}")
            return True
        if a == b:
            return True
        alleq = True
        if isinstance(a, (int, float, bool, type(None), str)):
            # scalar mismatch (exact equality was already ruled out above)
            self.logger.warning(f"at path {path}: {a} != {b}")
            alleq = False
        elif isinstance(a, list) or isinstance(b, list):
            if a is None or b is None or len(a) != len(b):
                alleq = False
            else:
                for x, y in zip(a, b):
                    if not self.compare(x, y, path + "[]"):
                        self.logger.warning(f"at path {path}")
                        alleq = False
        elif isinstance(a, dict) or isinstance(b, dict):
            ka = set(a.keys())
            kb = set(b.keys())
            if ka != kb:
                # fix: removed a leftover `import ipdb; ipdb.set_trace()`
                # debug breakpoint that crashed in any environment without
                # ipdb installed.
                self.logger.warning(f"at path {path}")
                alleq = False
            else:
                for k in ka:
                    if not self.compare(a[k], b[k], path + f".{k}"):
                        alleq = False
        else:
            raise RuntimeError(f"Type mismatch: {type(a)} vs {type(b)}")
        return alleq
# TODO ok, so it's stats changing... I guess I can handle it same way I handle reddit...
def get_comparer():
    """A JsonComparer preconfigured to ignore checkin fields that churn between backups."""
    def chregex(rest: str):
        # every checkin path starts with one ".<key>" component
        return r"^.\w+" + rest
    ignored_paths = {
        chregex('.venue.stats'),
        chregex('.venue.menu.url'),
        # not so sure about these, but I guess makes sense. maybe add a sanity check that they are not too different??
        chregex('.venue.location.lat'),
        chregex('.venue.location.lng'),
        chregex('.venue.location.labeledLatLngs'),
        # TODO isMayor?
    }
    return JsonComparer(ignored=ignored_paths)
# TODO right, I should only compare equivalent entries...
from kython import JSONType
def check_backups(backups: List[Tuple[JSONType, str]]):
    """Compare every older backup against the newest one.

    Returns the list of filenames whose content matched the newest backup
    (i.e. candidates for removal). Differences are only logged.
    """
    logger = get_logger()
    if len(backups) < 1:
        logger.info(f"Nothing to check: only {len(backups)} left")
        return []
    lastj, lastf = backups[-1]
    tocleanup: List[str] = []
    comp = get_comparer()
    # walk older backups newest-first
    for prevj, prevf in backups[-2::-1]:
        logger.info(f"Comparing {lastf} vs {prevf}")
        cres = comp.compare(prevj, lastj)
        if cres:
            logger.info(f"Removing {prevf}")
            # fix: removal candidates were logged but never recorded
            tocleanup.append(prevf)
        else:
            logger.info(f"{lastf} differs from {prevf}")
    # fix: previously returned [] on the early path but None here
    return tocleanup
# TODO do I need this??
def get_cid_map(bfile: str):
    """Map checkin id -> raw checkin dict for one backup file."""
    mapping = {}
    for item in get_raw(bfile):
        mapping[item['id']] = item
    return mapping
def cleanup_backups():
    # Prune backup files that add no information over the latest one.
    # NOTE(review): only the head of this function is visible — the diff
    # view interleaves its remainder with `test()` and removed lines below.
    from kython.data import get_all_files
    from pprint import pprint
    prev = None
def test():
    # smoke check: a full export is expected to hold a sizable history
    all_checkins = get_checkins()
    assert len(all_checkins) > 100
# NOTE(review): the lines below are the displaced remainder of
# cleanup_backups — the diff extraction dropped them to module level;
# indentation reconstructed, reconcile against the actual commit.
# TODO cid_map??
# ok, so. pick last
# compare against prev. if there are no differences, delete prev. otherwise, choose prev as last. repeat
bfiles = get_all_files(_BPATH, 'checkins_2018-08')
backups = [(get_cid_map(bfile), bfile) for bfile in bfiles]
for (pv, _), (nx, _) in zip(backups, backups[1:]):
    # drop checkins present only in the newer backup so the dicts line up
    torm = set()
    for cid in nx:
        if cid not in pv:
            torm.add(cid)
    for cid in torm:
        del nx[cid] # meh?
check_backups(backups)
return
def main():
    """CLI entry point: print every parsed checkin."""
    checkins = get_checkins()
    print(checkins)
# NOTE(review): everything down to the __main__ guard is removed code from
# the previous version of cleanup_backups, shown here by the diff view;
# indentation reconstructed.
for f in bfiles:
    print(f"Processing {f}")
    cur = {ch['id']: ch for ch in get_raw(f)}
    count = 0
    if prev is not None:
        for cid, c in cur.items():
            if cid not in prev:
                print(f"new checkin {cid}!")
            else:
                pc = prev[cid]
                if pc != c:
                    compare_jsons(pc, c)
                    # import ipdb; ipdb.set_trace()
                    # print("WTF")
                    # pprint(pc)
                    # pprint(c)
                    # print("-----------")
        # pres = c in prev
        # if not pres:
        #     count += 1
    print(f"Difference: {count}")
    prev = cur
if __name__ == '__main__':
    main()

View file

@@ -1,13 +0,0 @@
# Package entry point: configure logging and prune redundant 4sq backups.
from foursquare import get_checkins, get_logger, cleanup_backups
import logging
from kython.logging import setup_logzero

logger = get_logger()
# logzero-style handler/formatter at INFO level
setup_logzero(logger, level=logging.INFO)
cleanup_backups()
# for c in get_checkins():
#     print(c)

6
run
View file

@@ -1,6 +0,0 @@
#!/bin/bash
# Run the foursquare provider as a module, regardless of the caller's cwd.
set -eu
cd "$(dirname "$0")"
python3 -m foursquare