my.location: move the takeout provider into a separate my.location.google; add CI test & enable mypy
parent 90ada92110
commit ba9acc3445
7 changed files with 82 additions and 35 deletions
my/location/google.py  (new file, 279 lines)
@@ -0,0 +1,279 @@
"""
|
||||
Location data from Google Takeout
|
||||
"""
|
||||
|
||||
import json
|
||||
from collections import deque
|
||||
from datetime import datetime, timezone
|
||||
from itertools import islice
|
||||
from pathlib import Path
|
||||
from subprocess import Popen, PIPE
|
||||
from typing import Any, Collection, Deque, Iterable, Iterator, List, NamedTuple, Optional, Sequence, IO, Tuple
|
||||
import re
|
||||
|
||||
# pip3 install geopy
|
||||
import geopy # type: ignore
|
||||
import geopy.distance # type: ignore
|
||||
|
||||
from ..core.common import get_files, LazyLogger, mcachew
|
||||
from ..core.cachew import cache_dir
|
||||
from ..google.takeout.paths import get_last_takeout
|
||||
from ..kython import kompress
|
||||
|
||||
|
||||
# otherwise uses ijson
|
||||
# todo move to config??
|
||||
USE_GREP = False
|
||||
|
||||
|
||||
logger = LazyLogger(__name__)
|
||||
|
||||
|
||||
Tag = Optional[str]
|
||||
|
||||
# todo maybe don't tag by default?
class Location(NamedTuple):
    dt: datetime
    lat: float
    lon: float
    alt: Optional[float]
    tag: Tag


# (timestamp in milliseconds, latitude E7, longitude E7), as stored in the takeout JSON
TsLatLon = Tuple[int, int, int]


def _iter_via_ijson(fo) -> Iterator[TsLatLon]:
    # ijson version takes 25 seconds for 1M items (without processing)
    try:
        # pip3 install ijson cffi
        import ijson.backends.yajl2_cffi as ijson  # type: ignore
    except ImportError:
        import warnings
        warnings.warn("Falling back to the default ijson backend because the 'cffi' backend isn't available. It's up to 2x faster, so you might want to install it.")
        import ijson  # type: ignore

    for d in ijson.items(fo, 'locations.item'):
        yield (
            int(d['timestampMs']),
            d['latitudeE7'],
            d['longitudeE7'],
        )


# todo ugh. fragile, not sure, maybe should do some assert in advance?
def _iter_via_grep(fo) -> Iterator[TsLatLon]:
    # grep version takes 5 seconds for 1M items (without processing)
    x = [-1, -1, -1]
    for i, line in enumerate(fo):
        if i > 0 and i % 3 == 0:
            yield tuple(x)  # type: ignore[misc]
        n = re.search(b': "?(-?\\d+)"?,?$', line)  # meh. somewhat fragile...
        assert n is not None
        j = i % 3
        x[j] = int(n.group(1).decode('ascii'))
    # make sure we've read a whole number of triples
    assert (i + 1) % 3 == 0
    yield tuple(x)  # type: ignore[misc]
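
# For reference: the grep pipeline above assumes the pretty-printed takeout
# JSON, where each location entry yields three consecutive matching lines in
# this order (illustrative sample, not real data):
#     "timestampMs" : "1578737700000",
#     "latitudeE7" : 515074560,
#     "longitudeE7" : -1277935890,
# which is what the `i % 3` arithmetic relies on.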


# todo could also use a pool? not sure if that would really be faster...
# each thread could process 100K at once?
# would need to find out a way to know when to stop? process in some sort of sqrt progression??


def _iter_locations_fo(fit) -> Iterator[Location]:
    total = 0
    errors = 0

    try:
        from my.config.locations import LOCATIONS as known_locations
    except ModuleNotFoundError as e:
        name = 'my.config.locations'
        if e.name != name:
            raise e
        logger.warning("'%s' isn't found. setting known_locations to empty list", name)
        known_locations = []
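
    # Illustrative sketch of the expected config shape (hypothetical values):
    # each entry is a (latitude, longitude, radius in meters, tag) tuple,
    # matching the unpacking in tagger() below:
    #     LOCATIONS = [
    #         (55.7558, 37.6173, 200.0, 'home'),
    #     ]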

    # TODO tagging should be takeout-agnostic
    def tagger(dt: datetime, point: geopy.Point) -> Tag:
        '''
        Tag points with known locations (e.g. work/home/etc)
        '''
        for lat, lon, dist, tag in known_locations:
            # TODO use something more efficient?
            if geopy.distance.distance((lat, lon), point).m < dist:
                return tag
        return None

    for tsMs, latE7, lonE7 in fit:
        dt = datetime.fromtimestamp(tsMs / 1000, tz=timezone.utc)
        total += 1
        if total % 10000 == 0:
            logger.info('processing item %d %s', total, dt)

        try:
            # coordinates are E7 fixed-point: integers scaled by 1e7
            lat = float(latE7 / 1e7)
            lon = float(lonE7 / 1e7)
            # note: geopy is quite slow..
            point = geopy.Point(lat, lon)  # kinda sanity check that coordinates are ok
        except Exception as e:
            logger.exception(e)
            errors += 1
            # abort if more than 1% of the entries fail to parse
            if float(errors) / total > 0.01:
                raise RuntimeError('too many errors! aborting')
            else:
                continue

        # todo support later
        # alt = j.get("altitude", None)
        alt = None
        # todo enable tags later
        # tag = tagger(dt, point)  # TODO take accuracy into account??
        tag = None
        yield Location(
            dt=dt,
            lat=lat,
            lon=lon,
            alt=alt,
            tag=tag,
        )


_LOCATION_JSON = 'Takeout/Location History/Location History.json'

# todo if start != 0, disable cache? again this is where nicer caching would come in handy
# TODO hope they are sorted... (could assert for it)
@mcachew(cache_dir() / 'google_location.cache', logger=logger)
def _iter_locations(path: Path, start=0, stop=None) -> Iterator[Location]:
    ctx: IO[str]
    if path.suffix == '.json':
        # todo: to support this, should perhaps provide it as input= to Popen
        raise RuntimeError("Temporarily not supported")
        ctx = path.open('r')
    else:  # must be a takeout archive
        # todo CPath? although not sure if it can be iterative?
        ctx = kompress.open(path, _LOCATION_JSON)

    if USE_GREP:
        unzip = f'unzip -p "{path}" "{_LOCATION_JSON}"'
        extract = "grep -E '^ .(timestampMs|latitudeE7|longitudeE7)'"
        with Popen(f'{unzip} | {extract}', shell=True, stdout=PIPE) as p:
            out = p.stdout
            assert out is not None
            fit = _iter_via_grep(out)
            fit = islice(fit, start, stop)
            yield from _iter_locations_fo(fit)
    else:
        with ctx as fo:
            # todo need to open as bytes
            fit = _iter_via_ijson(fo)
            fit = islice(fit, start, stop)
            yield from _iter_locations_fo(fit)
    # todo wonder if old takeouts could contribute as well??


def locations(**kwargs) -> Iterator[Location]:
    # TODO need to include older data
    last_takeout = get_last_takeout(path=_LOCATION_JSON)

    return _iter_locations(path=last_takeout, **kwargs)


# todo deprecate?
def get_locations(*args, **kwargs) -> Sequence[Location]:
    return list(locations(*args, **kwargs))


class LocInterval(NamedTuple):
    from_: Location
    to: Location


# TODO use more_itertools
# TODO kython? nicer interface?
class Window:
    def __init__(self, it):
        self.it = it
        self.storage: Deque[Any] = deque()
        self.start = 0
        self.end = 0

    # TODO need check for existence?
    def load_to(self, to):
        # buffer items from the underlying iterator until index `to` is available
        while to >= self.end:
            try:
                ii = next(self.it)
                self.storage.append(ii)
                self.end += 1
            except StopIteration:
                break

    def exists(self, i):
        self.load_to(i)
        return i < self.end

    def consume_to(self, i):
        # drop buffered items before index i
        self.load_to(i)
        consumed = i - self.start
        self.start = i
        for _ in range(consumed):
            self.storage.popleft()

    def __getitem__(self, i):
        self.load_to(i)
        ii = i - self.start
        assert ii >= 0
        return self.storage[ii]
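
# Behavior sketch of Window on a toy iterator (illustrative):
#     w = Window(iter(range(100)))
#     w[5]             # buffers items 0..5 and returns 5
#     w.consume_to(5)  # drops buffered items 0..4
#     w[5]             # still returns 5; earlier indices are now gone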


# todo cachew as well?
# TODO maybe if tag is none, we just don't care?
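# Grouping approach: walk the location stream keeping a current group of
# same-tagged points; before closing a group, look ahead up to 10 points for
# one with a matching tag, so isolated outliers don't split an interval.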
def get_groups(*args, **kwargs) -> List[LocInterval]:
    all_locations = iter(locations(*args, **kwargs))
    locsi = Window(all_locations)
    i = 0
    groups: List[LocInterval] = []
    curg: List[Location] = []

    def add_to_group(x):
        nonlocal curg
        if len(curg) < 2:
            curg.append(x)
        else:
            curg[-1] = x

    def dump_group():
        nonlocal curg
        if len(curg) > 0:
            # print("new group")
            groups.append(LocInterval(from_=curg[0], to=curg[-1]))
            curg = []

    while locsi.exists(i):
        if i % 10000 == 0:
            logger.debug('grouping item %d', i)

        locsi.consume_to(i)

        last = None if len(curg) == 0 else curg[-1]
        cur = locsi[i]
        j = i
        match = False
        while not match and locsi.exists(j) and j < i + 10:  # TODO FIXME time distance here... e.g. half an hour?
            cur = locsi[j]
            if last is None or cur.tag == last.tag:
                # ok
                add_to_group(cur)
                i = j + 1
                match = True
            else:
                j += 1
        # if we made it here without advancing, the current group is over
        if not match:
            dump_group()
            i += 1
    dump_group()
    return groups
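
# Minimal usage sketch (assumes my.config is set up and a takeout archive is
# available; illustrative only):
#     from my.location.google import locations, get_groups
#     loc = next(locations())   # -> Location(dt=..., lat=..., lon=..., alt=None, tag=None)
#     groups = get_groups()     # -> List[LocInterval]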