simplify provider
This commit is contained in:
parent
416b363362
commit
8d79c750c4
4 changed files with 24 additions and 160 deletions
10
ci.sh
10
ci.sh
|
@ -1,10 +0,0 @@
|
||||||
#!/bin/bash
|
|
||||||
|
|
||||||
cd "$(this_dir)" || exit
|
|
||||||
|
|
||||||
. ~/bash_ci
|
|
||||||
|
|
||||||
ci_run mypy foursquare
|
|
||||||
ci_run pylint -E foursquare
|
|
||||||
|
|
||||||
ci_report_errors
|
|
155
foursquare/__init__.py
Normal file → Executable file
155
foursquare/__init__.py
Normal file → Executable file
|
@ -1,17 +1,21 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
from datetime import datetime, timezone, timedelta
|
from datetime import datetime, timezone, timedelta
|
||||||
# TODO pytz for timezone???
|
from typing import List, Dict, NamedTuple, Union, Any, Tuple, Set
|
||||||
from typing import List, Dict, NamedTuple, Union, Any, Tuple
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
from kython import safe_get, flatten, load_json_file
|
# TODO pytz for timezone???
|
||||||
from kython.data import get_last_file
|
|
||||||
|
from kython import safe_get, flatten
|
||||||
|
|
||||||
# TODO actually i'm parsing FSQ in my gmaps thing
|
# TODO actually i'm parsing FSQ in my gmaps thing
|
||||||
_BPATH = '/L/backups/4sq'
|
_BPATH = Path('/L/backups/4sq')
|
||||||
|
|
||||||
def get_logger():
|
def get_logger():
|
||||||
import logging
|
import logging
|
||||||
return logging.getLogger("fsq-provider")
|
return logging.getLogger("fsq-provider")
|
||||||
|
|
||||||
|
|
||||||
class Checkin:
|
class Checkin:
|
||||||
def __init__(self, j) -> None:
|
def __init__(self, j) -> None:
|
||||||
self.j = j
|
self.j = j
|
||||||
|
@ -30,10 +34,15 @@ class Checkin:
|
||||||
# TODO localize??
|
# TODO localize??
|
||||||
return datetime.fromtimestamp(created, tz=tz)
|
return datetime.fromtimestamp(created, tz=tz)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def cid(self) -> str:
|
||||||
|
return self.j['id']
|
||||||
|
|
||||||
def get_raw(fname=None):
|
def get_raw(fname=None):
|
||||||
if fname is None:
|
if fname is None:
|
||||||
fname = get_last_file(_BPATH, '.json')
|
fname = max(_BPATH.glob('*.json'))
|
||||||
j = load_json_file(fname)
|
with Path(fname).open() as fo:
|
||||||
|
j = json.load(fo)
|
||||||
|
|
||||||
assert isinstance(j, list)
|
assert isinstance(j, list)
|
||||||
for chunk in j:
|
for chunk in j:
|
||||||
|
@ -52,135 +61,19 @@ def get_checkins(*args, **kwargs):
|
||||||
return checkins
|
return checkins
|
||||||
|
|
||||||
|
|
||||||
# def extract(j):
|
# TODO do I need this??
|
||||||
# assert isinstance(j, list)
|
|
||||||
# for chunk in j:
|
|
||||||
|
|
||||||
class JsonComparer:
|
|
||||||
def __init__(self, ignored=None):
|
|
||||||
import re
|
|
||||||
self.ignored = {} if ignored is None else {
|
|
||||||
re.compile(i) for i in ignored
|
|
||||||
}
|
|
||||||
self.logger = get_logger()
|
|
||||||
|
|
||||||
# TODO ugh, maybe just check if it dominates? and comparison if both dominate each other...
|
|
||||||
def compare(self, a, b, path: str=""):
|
|
||||||
# TODO not so sure about contains...
|
|
||||||
if any(i.match(path) for i in self.ignored):
|
|
||||||
self.logger.debug(f"ignoring path {path}")
|
|
||||||
return True
|
|
||||||
if a == b:
|
|
||||||
return True
|
|
||||||
alleq = True
|
|
||||||
if isinstance(a, (int, float, bool, type(None), str)):
|
|
||||||
self.logger.warning(f"at path {path}: {a} != {b}")
|
|
||||||
alleq = False
|
|
||||||
elif isinstance(a, list) or isinstance(b, list):
|
|
||||||
if a is None or b is None or len(a) != len(b):
|
|
||||||
alleq = False
|
|
||||||
else:
|
|
||||||
for i in range(len(a)):
|
|
||||||
if not self.compare(a[i], b[i], path + f"[]"):
|
|
||||||
self.logger.warning(f"at path {path}")
|
|
||||||
alleq = False
|
|
||||||
elif isinstance(a, dict) or isinstance(b, dict):
|
|
||||||
ka = set(a.keys())
|
|
||||||
kb = set(b.keys())
|
|
||||||
if ka != kb:
|
|
||||||
import ipdb; ipdb.set_trace()
|
|
||||||
self.logger.warning(f"at path {path}")
|
|
||||||
alleq = False
|
|
||||||
else:
|
|
||||||
for k in ka:
|
|
||||||
if not self.compare(a[k], b[k], path + f".{k}"):
|
|
||||||
alleq = False
|
|
||||||
else:
|
|
||||||
raise RuntimeError(f"Type mismatch: {type(a)} vs {type(b)}")
|
|
||||||
|
|
||||||
return alleq
|
|
||||||
|
|
||||||
|
|
||||||
# TODO ok, so it's stats changing... I guess I can handle it same way I handle reddit...
|
|
||||||
def get_comparer():
|
|
||||||
def chregex(rest: str):
|
|
||||||
return r"^.\w+" + rest
|
|
||||||
c = JsonComparer(ignored={
|
|
||||||
chregex('.venue.stats'),
|
|
||||||
chregex('.venue.menu.url'),
|
|
||||||
|
|
||||||
# not so sure about these, but I guess makes sense. maybe add a sanity check that they are not too different??
|
|
||||||
chregex('.venue.location.lat'),
|
|
||||||
chregex('.venue.location.lng'),
|
|
||||||
chregex('.venue.location.labeledLatLngs'),
|
|
||||||
|
|
||||||
# TODO isMayor?
|
|
||||||
})
|
|
||||||
return c
|
|
||||||
|
|
||||||
# TODO right, I should only compare equivalent entries...
|
|
||||||
from kython import JSONType
|
|
||||||
def check_backups(backups: List[Tuple[JSONType, str]]):
|
|
||||||
logger = get_logger()
|
|
||||||
if len(backups) < 1:
|
|
||||||
logger.info(f"Nothing to check: only {len(backups)} left")
|
|
||||||
return []
|
|
||||||
lastj, lastf = backups[-1]
|
|
||||||
tocleanup: List[str] = []
|
|
||||||
comp = get_comparer()
|
|
||||||
for prevj, prevf in backups[-2::-1]:
|
|
||||||
logger.info(f"Comparing {lastf} vs {prevf}")
|
|
||||||
cres = comp.compare(prevj, lastj)
|
|
||||||
if cres:
|
|
||||||
logger.info(f"Removing {prevf}")
|
|
||||||
else:
|
|
||||||
logger.info(f"{lastf} differs from {prevf}")
|
|
||||||
|
|
||||||
|
|
||||||
def get_cid_map(bfile: str):
|
def get_cid_map(bfile: str):
|
||||||
raw = get_raw(bfile)
|
raw = get_raw(bfile)
|
||||||
return {i['id']: i for i in raw}
|
return {i['id']: i for i in raw}
|
||||||
|
|
||||||
|
|
||||||
def cleanup_backups():
|
def test():
|
||||||
from kython.data import get_all_files
|
assert len(get_checkins()) > 100
|
||||||
from pprint import pprint
|
# TODO cid_map??
|
||||||
prev = None
|
|
||||||
|
|
||||||
# ok, so. pick last
|
|
||||||
# compare against prev. if there are no differences, delete prev. otherwise, choose prev as last. repeat
|
|
||||||
|
|
||||||
bfiles = get_all_files(_BPATH, 'checkins_2018-08')
|
def main():
|
||||||
backups = [(get_cid_map(bfile), bfile) for bfile in bfiles]
|
print(get_checkins())
|
||||||
for (pv, _), (nx, _) in zip(backups, backups[1:]):
|
|
||||||
torm = set()
|
|
||||||
for cid in nx:
|
|
||||||
if cid not in pv:
|
|
||||||
torm.add(cid)
|
|
||||||
for cid in torm:
|
|
||||||
del nx[cid] # meh?
|
|
||||||
check_backups(backups)
|
|
||||||
return
|
|
||||||
|
|
||||||
for f in bfiles:
|
if __name__ == '__main__':
|
||||||
print(f"Processing {f}")
|
main()
|
||||||
cur = {ch['id']: ch for ch in get_raw(f)}
|
|
||||||
count = 0
|
|
||||||
if prev is not None:
|
|
||||||
for cid, c in cur.items():
|
|
||||||
if cid not in prev:
|
|
||||||
print(f"new checkin {cid}!")
|
|
||||||
else:
|
|
||||||
pc = prev[cid]
|
|
||||||
if pc != c:
|
|
||||||
compare_jsons(pc, c)
|
|
||||||
# import ipdb; ipdb.set_trace()
|
|
||||||
# print("WTF")
|
|
||||||
# pprint(pc)
|
|
||||||
# pprint(c)
|
|
||||||
# print("-----------")
|
|
||||||
# pres = c in prev
|
|
||||||
# if not pres:
|
|
||||||
# count += 1
|
|
||||||
print(f"Difference: {count}")
|
|
||||||
prev = cur
|
|
||||||
|
|
|
@ -1,13 +0,0 @@
|
||||||
from foursquare import get_checkins, get_logger, cleanup_backups
|
|
||||||
|
|
||||||
import logging
|
|
||||||
from kython.logging import setup_logzero
|
|
||||||
|
|
||||||
logger = get_logger()
|
|
||||||
setup_logzero(logger, level=logging.INFO)
|
|
||||||
|
|
||||||
cleanup_backups()
|
|
||||||
|
|
||||||
# for c in get_checkins():
|
|
||||||
# print(c)
|
|
||||||
|
|
6
run
6
run
|
@ -1,6 +0,0 @@
|
||||||
#!/bin/bash
|
|
||||||
set -eu
|
|
||||||
|
|
||||||
cd "$(dirname "$0")"
|
|
||||||
|
|
||||||
python3 -m foursquare
|
|
Loading…
Add table
Reference in a new issue