diff --git a/ci.sh b/ci.sh deleted file mode 100755 index 760804a..0000000 --- a/ci.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash - -cd "$(this_dir)" || exit - -. ~/bash_ci - -ci_run mypy foursquare -ci_run pylint -E foursquare - -ci_report_errors diff --git a/foursquare/__init__.py b/foursquare/__init__.py old mode 100644 new mode 100755 index d1dbb86..1729fad --- a/foursquare/__init__.py +++ b/foursquare/__init__.py @@ -1,17 +1,21 @@ +#!/usr/bin/env python3 from datetime import datetime, timezone, timedelta -# TODO pytz for timezone??? -from typing import List, Dict, NamedTuple, Union, Any, Tuple +from typing import List, Dict, NamedTuple, Union, Any, Tuple, Set +import json +from pathlib import Path -from kython import safe_get, flatten, load_json_file -from kython.data import get_last_file +# TODO pytz for timezone??? + +from kython import safe_get, flatten # TODO actually i'm parsing FSQ in my gmaps thing -_BPATH = '/L/backups/4sq' +_BPATH = Path('/L/backups/4sq') def get_logger(): import logging return logging.getLogger("fsq-provider") + class Checkin: def __init__(self, j) -> None: self.j = j @@ -30,10 +34,15 @@ class Checkin: # TODO localize?? return datetime.fromtimestamp(created, tz=tz) + @property + def cid(self) -> str: + return self.j['id'] + def get_raw(fname=None): if fname is None: - fname = get_last_file(_BPATH, '.json') - j = load_json_file(fname) + fname = max(_BPATH.glob('*.json')) + with Path(fname).open() as fo: + j = json.load(fo) assert isinstance(j, list) for chunk in j: @@ -52,135 +61,19 @@ def get_checkins(*args, **kwargs): return checkins -# def extract(j): -# assert isinstance(j, list) -# for chunk in j: - -class JsonComparer: - def __init__(self, ignored=None): - import re - self.ignored = {} if ignored is None else { - re.compile(i) for i in ignored - } - self.logger = get_logger() - - # TODO ugh, maybe just check if it dominates? and comparison if both dominate each other... - def compare(self, a, b, path: str=""): - # TODO not so sure about contains... - if any(i.match(path) for i in self.ignored): - self.logger.debug(f"ignoring path {path}") - return True - if a == b: - return True - alleq = True - if isinstance(a, (int, float, bool, type(None), str)): - self.logger.warning(f"at path {path}: {a} != {b}") - alleq = False - elif isinstance(a, list) or isinstance(b, list): - if a is None or b is None or len(a) != len(b): - alleq = False - else: - for i in range(len(a)): - if not self.compare(a[i], b[i], path + f"[]"): - self.logger.warning(f"at path {path}") - alleq = False - elif isinstance(a, dict) or isinstance(b, dict): - ka = set(a.keys()) - kb = set(b.keys()) - if ka != kb: - import ipdb; ipdb.set_trace() - self.logger.warning(f"at path {path}") - alleq = False - else: - for k in ka: - if not self.compare(a[k], b[k], path + f".{k}"): - alleq = False - else: - raise RuntimeError(f"Type mismatch: {type(a)} vs {type(b)}") - - return alleq - - -# TODO ok, so it's stats changing... I guess I can handle it same way I handle reddit... -def get_comparer(): - def chregex(rest: str): - return r"^.\w+" + rest - c = JsonComparer(ignored={ - chregex('.venue.stats'), - chregex('.venue.menu.url'), - - # not so sure about these, but I guess makes sense. maybe add a sanity check that they are not too different?? - chregex('.venue.location.lat'), - chregex('.venue.location.lng'), - chregex('.venue.location.labeledLatLngs'), - - # TODO isMayor? - }) - return c - -# TODO right, I should only compare equivalent entries... -from kython import JSONType -def check_backups(backups: List[Tuple[JSONType, str]]): - logger = get_logger() - if len(backups) < 1: - logger.info(f"Nothing to check: only {len(backups)} left") - return [] - lastj, lastf = backups[-1] - tocleanup: List[str] = [] - comp = get_comparer() - for prevj, prevf in backups[-2::-1]: - logger.info(f"Comparing {lastf} vs {prevf}") - cres = comp.compare(prevj, lastj) - if cres: - logger.info(f"Removing {prevf}") - else: - logger.info(f"{lastf} differs from {prevf}") - - +# TODO do I need this?? def get_cid_map(bfile: str): raw = get_raw(bfile) return {i['id']: i for i in raw} -def cleanup_backups(): - from kython.data import get_all_files - from pprint import pprint - prev = None +def test(): + assert len(get_checkins()) > 100 + # TODO cid_map?? - # ok, so. pick last - # compare against prev. if there are no differences, delete prev. otherwise, choose prev as last. repeat - bfiles = get_all_files(_BPATH, 'checkins_2018-08') - backups = [(get_cid_map(bfile), bfile) for bfile in bfiles] - for (pv, _), (nx, _) in zip(backups, backups[1:]): - torm = set() - for cid in nx: - if cid not in pv: - torm.add(cid) - for cid in torm: - del nx[cid] # meh? - check_backups(backups) - return +def main(): + print(get_checkins()) - for f in bfiles: - print(f"Processing {f}") - cur = {ch['id']: ch for ch in get_raw(f)} - count = 0 - if prev is not None: - for cid, c in cur.items(): - if cid not in prev: - print(f"new checkin {cid}!") - else: - pc = prev[cid] - if pc != c: - compare_jsons(pc, c) - # import ipdb; ipdb.set_trace() - # print("WTF") - # pprint(pc) - # pprint(c) - # print("-----------") - # pres = c in prev - # if not pres: - # count += 1 - print(f"Difference: {count}") - prev = cur +if __name__ == '__main__': + main() diff --git a/foursquare/__main__.py b/foursquare/__main__.py deleted file mode 100644 index 5cff636..0000000 --- a/foursquare/__main__.py +++ /dev/null @@ -1,13 +0,0 @@ -from foursquare import get_checkins, get_logger, cleanup_backups - -import logging -from kython.logging import setup_logzero - -logger = get_logger() -setup_logzero(logger, level=logging.INFO) - -cleanup_backups() - -# for c in get_checkins(): -# print(c) - diff --git a/run b/run deleted file mode 100755 index 8a06e5c..0000000 --- a/run +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/bash -set -eu - -cd "$(dirname "$0")" - -python3 -m foursquare