From f7a6782b714e9c95bcb7115ebae17cc17a85b7de Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sun, 9 Sep 2018 13:11:04 +0300 Subject: [PATCH 1/5] initial --- .gitignore | 178 +++++++++++++++++++++++++++++++++++++++++ ci.sh | 10 +++ foursquare/__init__.py | 0 foursquare/__main__.py | 0 run | 6 ++ 5 files changed, 194 insertions(+) create mode 100644 .gitignore create mode 100755 ci.sh create mode 100644 foursquare/__init__.py create mode 100644 foursquare/__main__.py create mode 100755 run diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..236a11e --- /dev/null +++ b/.gitignore @@ -0,0 +1,178 @@ + +# Created by https://www.gitignore.io/api/python,emacs + +### Emacs ### +# -*- mode: gitignore; -*- +*~ +\#*\# +/.emacs.desktop +/.emacs.desktop.lock +*.elc +auto-save-list +tramp +.\#* + +# Org-mode +.org-id-locations +*_archive + +# flymake-mode +*_flymake.* + +# eshell files +/eshell/history +/eshell/lastdir + +# elpa packages +/elpa/ + +# reftex files +*.rel + +# AUCTeX auto folder +/auto/ + +# cask packages +.cask/ +dist/ + +# Flycheck +flycheck_*.el + +# server auth directory +/server/ + +# projectiles files +.projectile + +# directory configuration +.dir-locals.el + +### Python ### +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +### Python Patch ### +.venv/ + +### Python.VirtualEnv Stack ### +# Virtualenv +# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/ +[Bb]in +[Ii]nclude +[Ll]ib +[Ll]ib64 +[Ll]ocal +[Ss]cripts +pyvenv.cfg +pip-selfcheck.json + + +# End of https://www.gitignore.io/api/python,emacs diff --git a/ci.sh b/ci.sh new file mode 100755 index 0000000..760804a --- /dev/null +++ b/ci.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +cd "$(this_dir)" || exit + +. ~/bash_ci + +ci_run mypy foursquare +ci_run pylint -E foursquare + +ci_report_errors diff --git a/foursquare/__init__.py b/foursquare/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/foursquare/__main__.py b/foursquare/__main__.py new file mode 100644 index 0000000..e69de29 diff --git a/run b/run new file mode 100755 index 0000000..8a06e5c --- /dev/null +++ b/run @@ -0,0 +1,6 @@ +#!/bin/bash +set -eu + +cd "$(dirname "$0")" + +python3 -m foursquare From a0da879a0e9a4c116814e836d7d460105d26c6c8 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sun, 9 Sep 2018 13:40:50 +0300 Subject: [PATCH 2/5] initial --- foursquare/__init__.py | 33 +++++++++++++++++++++++++++++++++ foursquare/__main__.py | 4 ++++ 2 files changed, 37 insertions(+) diff --git a/foursquare/__init__.py b/foursquare/__init__.py index e69de29..630e0cb 100644 --- a/foursquare/__init__.py +++ b/foursquare/__init__.py @@ -0,0 +1,33 @@ +from datetime import datetime, timezone, timedelta +# TODO pytz for timezone??? +from typing import List, Dict, NamedTuple, Union, Any + +from kython import safe_get, flatten +from kython.data import get_last_file + +# TODO actually i'm parsing FSQ in my gmaps thing +_BPATH = '/L/backups/4sq' + +class Checkin: + def __init__(self, j) -> None: + self.j = j + + @property + def _summary(self) -> str: + return "checked into " + safe_get(self.j, 'venue', 'name', default="NO_NAME") + " " + self.j.get('shout', "") # TODO should should be bold... + # TODO maybe return htmlish? if not html, interpret as string + + @property + def dt(self) -> datetime: + created = self.j['createdAt'] # this is local time + offset = self.j['timeZoneOffset'] + tz = timezone(timedelta(minutes=offset)) + # a bit meh, but seems to work.. + # TODO localize?? + return datetime.fromtimestamp(created, tz=tz) + +def get_checkins(): + j = get_last_file(_BPATH) + everything = flatten([x['response']['checkins']['items'] for x in j]) + checkins = sorted([Checkin(i) for i in everything], key=lambda c: c.dt) + return checkins diff --git a/foursquare/__main__.py b/foursquare/__main__.py index e69de29..2ac915e 100644 --- a/foursquare/__main__.py +++ b/foursquare/__main__.py @@ -0,0 +1,4 @@ +from foursquare import get_checkins + +for c in get_checkins(): + print(c) From 416b36336293376e9844b1bab1c1d70aee167792 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sun, 9 Sep 2018 16:57:26 +0300 Subject: [PATCH 3/5] attempt to dedup --- foursquare/__init__.py | 165 +++++++++++++++++++++++++++++++++++++++-- foursquare/__main__.py | 15 +++- 2 files changed, 171 insertions(+), 9 deletions(-) diff --git a/foursquare/__init__.py b/foursquare/__init__.py index 630e0cb..d1dbb86 100644 --- a/foursquare/__init__.py +++ b/foursquare/__init__.py @@ -1,19 +1,23 @@ from datetime import datetime, timezone, timedelta # TODO pytz for timezone??? -from typing import List, Dict, NamedTuple, Union, Any +from typing import List, Dict, NamedTuple, Union, Any, Tuple -from kython import safe_get, flatten +from kython import safe_get, flatten, load_json_file from kython.data import get_last_file # TODO actually i'm parsing FSQ in my gmaps thing _BPATH = '/L/backups/4sq' +def get_logger(): + import logging + return logging.getLogger("fsq-provider") + class Checkin: def __init__(self, j) -> None: self.j = j @property - def _summary(self) -> str: + def summary(self) -> str: return "checked into " + safe_get(self.j, 'venue', 'name', default="NO_NAME") + " " + self.j.get('shout', "") # TODO should should be bold... # TODO maybe return htmlish? if not html, interpret as string @@ -26,8 +30,157 @@ class Checkin: # TODO localize?? return datetime.fromtimestamp(created, tz=tz) -def get_checkins(): - j = get_last_file(_BPATH) - everything = flatten([x['response']['checkins']['items'] for x in j]) +def get_raw(fname=None): + if fname is None: + fname = get_last_file(_BPATH, '.json') + j = load_json_file(fname) + + assert isinstance(j, list) + for chunk in j: + del chunk['meta'] + del chunk['notifications'] + assert chunk.keys() == {'response'} + assert chunk['response'].keys() == {'checkins'} + + return flatten([x['response']['checkins']['items'] for x in j]) + + +# TODO not sure how to make it generic.. +def get_checkins(*args, **kwargs): + everything = get_raw(*args, **kwargs) checkins = sorted([Checkin(i) for i in everything], key=lambda c: c.dt) return checkins + + +# def extract(j): +# assert isinstance(j, list) +# for chunk in j: + +class JsonComparer: + def __init__(self, ignored=None): + import re + self.ignored = {} if ignored is None else { + re.compile(i) for i in ignored + } + self.logger = get_logger() + + # TODO ugh, maybe just check if it dominates? and comparison if both dominate each other... + def compare(self, a, b, path: str=""): + # TODO not so sure about contains... + if any(i.match(path) for i in self.ignored): + self.logger.debug(f"ignoring path {path}") + return True + if a == b: + return True + alleq = True + if isinstance(a, (int, float, bool, type(None), str)): + self.logger.warning(f"at path {path}: {a} != {b}") + alleq = False + elif isinstance(a, list) or isinstance(b, list): + if a is None or b is None or len(a) != len(b): + alleq = False + else: + for i in range(len(a)): + if not self.compare(a[i], b[i], path + f"[]"): + self.logger.warning(f"at path {path}") + alleq = False + elif isinstance(a, dict) or isinstance(b, dict): + ka = set(a.keys()) + kb = set(b.keys()) + if ka != kb: + import ipdb; ipdb.set_trace() + self.logger.warning(f"at path {path}") + alleq = False + else: + for k in ka: + if not self.compare(a[k], b[k], path + f".{k}"): + alleq = False + else: + raise RuntimeError(f"Type mismatch: {type(a)} vs {type(b)}") + + return alleq + + +# TODO ok, so it's stats changing... I guess I can handle it same way I handle reddit... +def get_comparer(): + def chregex(rest: str): + return r"^.\w+" + rest + c = JsonComparer(ignored={ + chregex('.venue.stats'), + chregex('.venue.menu.url'), + + # not so sure about these, but I guess makes sense. maybe add a sanity check that they are not too different?? + chregex('.venue.location.lat'), + chregex('.venue.location.lng'), + chregex('.venue.location.labeledLatLngs'), + + # TODO isMayor? + }) + return c + +# TODO right, I should only compare equivalent entries... +from kython import JSONType +def check_backups(backups: List[Tuple[JSONType, str]]): + logger = get_logger() + if len(backups) < 1: + logger.info(f"Nothing to check: only {len(backups)} left") + return [] + lastj, lastf = backups[-1] + tocleanup: List[str] = [] + comp = get_comparer() + for prevj, prevf in backups[-2::-1]: + logger.info(f"Comparing {lastf} vs {prevf}") + cres = comp.compare(prevj, lastj) + if cres: + logger.info(f"Removing {prevf}") + else: + logger.info(f"{lastf} differs from {prevf}") + + +def get_cid_map(bfile: str): + raw = get_raw(bfile) + return {i['id']: i for i in raw} + + +def cleanup_backups(): + from kython.data import get_all_files + from pprint import pprint + prev = None + + # ok, so. pick last + # compare against prev. if there are no differences, delete prev. otherwise, choose prev as last. repeat + + bfiles = get_all_files(_BPATH, 'checkins_2018-08') + backups = [(get_cid_map(bfile), bfile) for bfile in bfiles] + for (pv, _), (nx, _) in zip(backups, backups[1:]): + torm = set() + for cid in nx: + if cid not in pv: + torm.add(cid) + for cid in torm: + del nx[cid] # meh? + check_backups(backups) + return + + for f in bfiles: + print(f"Processing {f}") + cur = {ch['id']: ch for ch in get_raw(f)} + count = 0 + if prev is not None: + for cid, c in cur.items(): + if cid not in prev: + print(f"new checkin {cid}!") + else: + pc = prev[cid] + if pc != c: + compare_jsons(pc, c) + # import ipdb; ipdb.set_trace() + # print("WTF") + # pprint(pc) + # pprint(c) + # print("-----------") + # pres = c in prev + # if not pres: + # count += 1 + print(f"Difference: {count}") + prev = cur diff --git a/foursquare/__main__.py b/foursquare/__main__.py index 2ac915e..5cff636 100644 --- a/foursquare/__main__.py +++ b/foursquare/__main__.py @@ -1,4 +1,13 @@ -from foursquare import get_checkins +from foursquare import get_checkins, get_logger, cleanup_backups + +import logging +from kython.logging import setup_logzero + +logger = get_logger() +setup_logzero(logger, level=logging.INFO) + +cleanup_backups() + +# for c in get_checkins(): +# print(c) -for c in get_checkins(): - print(c) From 8d79c750c4886b89ce230e11b3492743264231e3 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Wed, 8 May 2019 20:50:10 +0100 Subject: [PATCH 4/5] simplify provider --- ci.sh | 10 --- foursquare/__init__.py | 155 +++++++---------------------------------- foursquare/__main__.py | 13 ---- run | 6 -- 4 files changed, 24 insertions(+), 160 deletions(-) delete mode 100755 ci.sh mode change 100644 => 100755 foursquare/__init__.py delete mode 100644 foursquare/__main__.py delete mode 100755 run diff --git a/ci.sh b/ci.sh deleted file mode 100755 index 760804a..0000000 --- a/ci.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash - -cd "$(this_dir)" || exit - -. ~/bash_ci - -ci_run mypy foursquare -ci_run pylint -E foursquare - -ci_report_errors diff --git a/foursquare/__init__.py b/foursquare/__init__.py old mode 100644 new mode 100755 index d1dbb86..1729fad --- a/foursquare/__init__.py +++ b/foursquare/__init__.py @@ -1,17 +1,21 @@ +#!/usr/bin/env python3 from datetime import datetime, timezone, timedelta -# TODO pytz for timezone??? -from typing import List, Dict, NamedTuple, Union, Any, Tuple +from typing import List, Dict, NamedTuple, Union, Any, Tuple, Set +import json +from pathlib import Path -from kython import safe_get, flatten, load_json_file -from kython.data import get_last_file +# TODO pytz for timezone??? + +from kython import safe_get, flatten # TODO actually i'm parsing FSQ in my gmaps thing -_BPATH = '/L/backups/4sq' +_BPATH = Path('/L/backups/4sq') def get_logger(): import logging return logging.getLogger("fsq-provider") + class Checkin: def __init__(self, j) -> None: self.j = j @@ -30,10 +34,15 @@ class Checkin: # TODO localize?? return datetime.fromtimestamp(created, tz=tz) + @property + def cid(self) -> str: + return self.j['id'] + def get_raw(fname=None): if fname is None: - fname = get_last_file(_BPATH, '.json') - j = load_json_file(fname) + fname = max(_BPATH.glob('*.json')) + with Path(fname).open() as fo: + j = json.load(fo) assert isinstance(j, list) for chunk in j: @@ -52,135 +61,19 @@ def get_checkins(*args, **kwargs): return checkins -# def extract(j): -# assert isinstance(j, list) -# for chunk in j: - -class JsonComparer: - def __init__(self, ignored=None): - import re - self.ignored = {} if ignored is None else { - re.compile(i) for i in ignored - } - self.logger = get_logger() - - # TODO ugh, maybe just check if it dominates? and comparison if both dominate each other... - def compare(self, a, b, path: str=""): - # TODO not so sure about contains... - if any(i.match(path) for i in self.ignored): - self.logger.debug(f"ignoring path {path}") - return True - if a == b: - return True - alleq = True - if isinstance(a, (int, float, bool, type(None), str)): - self.logger.warning(f"at path {path}: {a} != {b}") - alleq = False - elif isinstance(a, list) or isinstance(b, list): - if a is None or b is None or len(a) != len(b): - alleq = False - else: - for i in range(len(a)): - if not self.compare(a[i], b[i], path + f"[]"): - self.logger.warning(f"at path {path}") - alleq = False - elif isinstance(a, dict) or isinstance(b, dict): - ka = set(a.keys()) - kb = set(b.keys()) - if ka != kb: - import ipdb; ipdb.set_trace() - self.logger.warning(f"at path {path}") - alleq = False - else: - for k in ka: - if not self.compare(a[k], b[k], path + f".{k}"): - alleq = False - else: - raise RuntimeError(f"Type mismatch: {type(a)} vs {type(b)}") - - return alleq - - -# TODO ok, so it's stats changing... I guess I can handle it same way I handle reddit... -def get_comparer(): - def chregex(rest: str): - return r"^.\w+" + rest - c = JsonComparer(ignored={ - chregex('.venue.stats'), - chregex('.venue.menu.url'), - - # not so sure about these, but I guess makes sense. maybe add a sanity check that they are not too different?? - chregex('.venue.location.lat'), - chregex('.venue.location.lng'), - chregex('.venue.location.labeledLatLngs'), - - # TODO isMayor? - }) - return c - -# TODO right, I should only compare equivalent entries... -from kython import JSONType -def check_backups(backups: List[Tuple[JSONType, str]]): - logger = get_logger() - if len(backups) < 1: - logger.info(f"Nothing to check: only {len(backups)} left") - return [] - lastj, lastf = backups[-1] - tocleanup: List[str] = [] - comp = get_comparer() - for prevj, prevf in backups[-2::-1]: - logger.info(f"Comparing {lastf} vs {prevf}") - cres = comp.compare(prevj, lastj) - if cres: - logger.info(f"Removing {prevf}") - else: - logger.info(f"{lastf} differs from {prevf}") - - +# TODO do I need this?? def get_cid_map(bfile: str): raw = get_raw(bfile) return {i['id']: i for i in raw} -def cleanup_backups(): - from kython.data import get_all_files - from pprint import pprint - prev = None +def test(): + assert len(get_checkins()) > 100 + # TODO cid_map?? - # ok, so. pick last - # compare against prev. if there are no differences, delete prev. otherwise, choose prev as last. repeat - bfiles = get_all_files(_BPATH, 'checkins_2018-08') - backups = [(get_cid_map(bfile), bfile) for bfile in bfiles] - for (pv, _), (nx, _) in zip(backups, backups[1:]): - torm = set() - for cid in nx: - if cid not in pv: - torm.add(cid) - for cid in torm: - del nx[cid] # meh? - check_backups(backups) - return +def main(): + print(get_checkins()) - for f in bfiles: - print(f"Processing {f}") - cur = {ch['id']: ch for ch in get_raw(f)} - count = 0 - if prev is not None: - for cid, c in cur.items(): - if cid not in prev: - print(f"new checkin {cid}!") - else: - pc = prev[cid] - if pc != c: - compare_jsons(pc, c) - # import ipdb; ipdb.set_trace() - # print("WTF") - # pprint(pc) - # pprint(c) - # print("-----------") - # pres = c in prev - # if not pres: - # count += 1 - print(f"Difference: {count}") - prev = cur +if __name__ == '__main__': + main() diff --git a/foursquare/__main__.py b/foursquare/__main__.py deleted file mode 100644 index 5cff636..0000000 --- a/foursquare/__main__.py +++ /dev/null @@ -1,13 +0,0 @@ -from foursquare import get_checkins, get_logger, cleanup_backups - -import logging -from kython.logging import setup_logzero - -logger = get_logger() -setup_logzero(logger, level=logging.INFO) - -cleanup_backups() - -# for c in get_checkins(): -# print(c) - diff --git a/run b/run deleted file mode 100755 index 8a06e5c..0000000 --- a/run +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/bash -set -eu - -cd "$(dirname "$0")" - -python3 -m foursquare From 4b954ed6de4daaf013d7aee6cc999314f3930c3c Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sun, 30 Jun 2019 10:01:29 +0100 Subject: [PATCH 5/5] attempt to add lists --- foursquare/__init__.py | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/foursquare/__init__.py b/foursquare/__init__.py index 1729fad..f7a4617 100755 --- a/foursquare/__init__.py +++ b/foursquare/__init__.py @@ -1,12 +1,13 @@ #!/usr/bin/env python3 from datetime import datetime, timezone, timedelta from typing import List, Dict, NamedTuple, Union, Any, Tuple, Set +from itertools import chain import json from pathlib import Path # TODO pytz for timezone??? -from kython import safe_get, flatten +from kython import safe_get # TODO actually i'm parsing FSQ in my gmaps thing _BPATH = Path('/L/backups/4sq') @@ -38,20 +39,34 @@ class Checkin: def cid(self) -> str: return self.j['id'] + def __repr__(self): + return repr(self.j) + + +class Place: + def __init__(self, j) -> None: + self.j = j + + +# TODO ugh. I'm not backing up lists, apparently... +# def test_places(): +# raise RuntimeError() + + + def get_raw(fname=None): if fname is None: fname = max(_BPATH.glob('*.json')) - with Path(fname).open() as fo: - j = json.load(fo) - + j = json.loads(Path(fname).read_text()) assert isinstance(j, list) + for chunk in j: del chunk['meta'] del chunk['notifications'] assert chunk.keys() == {'response'} assert chunk['response'].keys() == {'checkins'} - return flatten([x['response']['checkins']['items'] for x in j]) + return chain.from_iterable(x['response']['checkins']['items'] for x in j) # TODO not sure how to make it generic.. @@ -67,8 +82,10 @@ def get_cid_map(bfile: str): return {i['id']: i for i in raw} -def test(): - assert len(get_checkins()) > 100 +def test_checkins(): + checkins = get_checkins() + assert len(checkins) > 100 + assert any('Victoria Park' in c.summary for c in checkins) # TODO cid_map??