HPI/my/location/takeout.py

from typing import NamedTuple, Iterator, List, Iterable, Collection, Sequence, Deque, Any, Optional
from collections import deque
from itertools import islice
from datetime import datetime
from zipfile import ZipFile
import logging
import csv
import re
import json
from pathlib import Path
import pytz


from kython import kompress

from cachew import cachew, mtime_hash


# pip3 install geopy
import geopy # type: ignore
import geopy.distance # type: ignore
# pip3 install ijson
import ijson # type: ignore

def get_logger():
    """Return the logger shared by every function in this module."""
    logger_name = "location"
    return logging.getLogger(logger_name)


# Directory that iter_locations() searches for 'takeout*.zip' archives.
TAKEOUTS_PATH = Path("/path/to/takeout")
# Cache file passed to the cachew decorator on _iter_locations(); update_cache() deletes it.
CACHE_PATH = Path('/L/data/.cache/location.sqlite')


# A tag is a plain string label attached to a location (tagger() produces it).
Tag = str

class Location(NamedTuple):
    """A single tagged location record parsed from the location history."""
    # timestamp of the record; _iter_locations_fo() makes it timezone-aware (UTC)
    dt: datetime
    # latitude: the record's "latitudeE7" value divided by 10000000
    lat: float
    # longitude: the record's "longitudeE7" value divided by 10000000
    lon: float
    # the record's "altitude" value, None when the record has none
    alt: Optional[float]
    # label returned by tagger() for this point
    tag: Tag


def tagger(dt: datetime, point: geopy.Point) -> Tag:
    """
    Label ``point`` with the tag of the first known place it falls within.

    The place table holds ``(coord, dist, tag)`` triples; a point matches an
    entry when its geopy distance to ``coord`` (read through ``.m``) is below
    ``dist``. Entries are tried in order and the first match wins. When
    nothing matches the result is "other".

    ``dt`` is accepted for the caller's convenience but is not consulted.
    """
    known_places = [
        # removed
    ]
    for center, radius, label in known_places:
        separation = geopy.distance.distance(center, point).m
        if separation < radius:
            return label
    # no entry matched (or the table is empty)
    return "other"


def _iter_locations_fo(fo, start, stop) -> Iterator[Location]:
    """
    Stream ``Location`` records out of an open location-history JSON document.

    Items under the top-level ``locations`` array are read incrementally with
    ijson and sliced with ``islice(..., start, stop)``.

    :param fo: open file object holding the JSON document
    :param start: index of the first item to process (``islice`` semantics)
    :param stop: index one past the last item to process, ``None`` for no limit
    :return: iterator of ``Location`` with timezone-aware (UTC) timestamps
    :raises RuntimeError: when more than 1% of the items processed so far had
        coordinates that could not be parsed (so any failure among the first
        hundred items aborts immediately)
    """
    logger = get_logger()
    total = 0
    errors = 0

    for j in islice(ijson.items(fo, 'locations.item'), start, stop):
        # "timestampMs" is divided by 1000 to get seconds. Build the aware UTC
        # datetime in one step: datetime.utcfromtimestamp() is deprecated since
        # Python 3.12 and only returns a naive value that then needs localizing.
        dt = datetime.fromtimestamp(int(j["timestampMs"]) / 1000, tz=pytz.utc)
        if total % 10000 == 0:
            logger.info('processing item %d %s', total, dt)
        total += 1

        try:
            lat = float(j["latitudeE7"] / 10000000)
            lon = float(j["longitudeE7"] / 10000000)
            point = geopy.Point(lat, lon) # kinda sanity check that coordinates are ok
        except Exception as e:
            # Best effort: log and skip the bad item, but give up once the
            # failure rate shows the input is broadly unusable.
            logger.exception(e)
            errors += 1
            if errors / total > 0.01:
                raise RuntimeError('too many errors! aborting') from e
            continue

        alt = j.get("altitude", None)
        tag = tagger(dt, point) # TODO take accuracy into account??
        yield Location(
            dt=dt,
            lat=lat,
            lon=lon,
            alt=alt,
            tag=tag
        )

# TODO hope they are sorted...
@cachew(CACHE_PATH, hashf=mtime_hash, cls=Location, chunk_by=10000, logger=get_logger())
def _iter_locations(path: Path, start=0, stop=None) -> Iterator[Location]:
    """
    Yield the locations stored in ``path``, going through the cachew cache.

    ``path`` is either a bare ``.json`` file, which is opened directly, or
    anything else, which is treated as a takeout archive and opened through
    ``kompress`` at the location-history member. ``start`` and ``stop`` are
    forwarded unchanged to ``_iter_locations_fo``.
    """
    source = (
        path.open('r')
        if path.suffix == '.json'
        # anything that is not plain json must be a takeout archive
        else kompress.open(path, 'Takeout/Location History/Location History.json')
    )

    with source as fo:
        yield from _iter_locations_fo(fo, start=start, stop=stop)
    # TODO wonder if old takeouts could contribute as well??


def iter_locations(**kwargs) -> Iterator[Location]:
    """
    Iterate over the locations of the latest takeout archive.

    "Latest" means the ``takeout*.zip`` path under ``TAKEOUTS_PATH`` that
    compares greatest. Keyword arguments are passed on to ``_iter_locations``.
    """
    archives = TAKEOUTS_PATH.glob('takeout*.zip')
    newest = max(archives)
    return _iter_locations(path=newest, **kwargs)


def get_locations() -> Sequence[Location]:
    """Return everything ``iter_locations()`` yields, materialised as a list."""
    return [*iter_locations()]

class LocInterval(NamedTuple):
    """A group of locations represented by its two endpoints; get_groups() builds these."""
    # first location added to the group
    from_: Location
    # most recent location added to the group (same as from_ for a one-item group)
    to: Location


# TODO kython? nicer interface?
class Window:
    """
    Index-addressable sliding buffer over an iterator.

    Items are pulled from the underlying iterator on demand and kept in a
    deque. ``start`` is the absolute index of the oldest item still buffered
    and ``end`` is one past the newest item pulled so far, so the buffer holds
    the absolute indices ``start .. end - 1``.
    """

    def __init__(self, it):
        # `it` must support next(); it is consumed lazily by load_to()
        self.it = it
        self.storage: Deque[Any] = deque()
        self.start = 0
        self.end = 0

    # TODO need check for existence?
    def load_to(self, to):
        """Pull items until index ``to`` is buffered or the iterator runs out."""
        while to >= self.end:
            try:
                item = next(self.it)
            except StopIteration:
                break
            self.storage.append(item)
            self.end += 1

    def exists(self, i):
        """
        Return whether the iterator produces an item with index ``i``.

        This only checks against ``end``; it does not tell whether the item
        is still buffered or has already been discarded by ``consume_to``.
        """
        self.load_to(i)
        return i < self.end

    def consume_to(self, i):
        """
        Discard every buffered item with an index below ``i``.

        Asking for an index at or below ``start`` is a no-op (those items are
        already gone), and asking for an index past the end of the iterator
        discards everything that was available and leaves ``start == end``.
        """
        self.load_to(i)
        # never move past what was actually loaded, and never move backwards
        target = min(i, self.end)
        while self.start < target:
            self.storage.popleft()
            self.start += 1

    def __getitem__(self, i):
        """
        Return the item with absolute index ``i``.

        :raises IndexError: if the item was already discarded by
            ``consume_to`` or lies past the end of the iterator
        """
        self.load_to(i)
        offset = i - self.start
        if offset < 0:
            # an explicit check rather than an assert: a negative offset would
            # otherwise wrap around and return an unrelated item under -O
            raise IndexError(f'item {i} was already consumed (window starts at {self.start})')
        return self.storage[offset]


# TODO maybe if tag is none, we just don't care?
def get_groups() -> List[LocInterval]:
    """
    Collapse the location stream into intervals of locations sharing a tag.

    Locations are read through a Window so that only a small look-ahead
    buffer is kept. A group keeps absorbing locations while one with the same
    tag as its latest member is found among the next 10 items; differently
    tagged items in between are skipped over. Each finished group is recorded
    as a LocInterval made of its first and last member.

    :return: the intervals in the order they were closed
    """
    logger = get_logger()

    all_locations = iter(iter_locations()) # TODO
    locsi = Window(all_locations)
    # absolute index of the next location to examine
    i = 0
    groups: List[LocInterval] = []
    # current group; holds at most two items: its first and its latest member
    curg: List[Location] = []

    def add_to_group(x):
        # keep the first member fixed and overwrite the latest one
        nonlocal curg
        if len(curg) < 2:
            curg.append(x)
        else:
            curg[-1] = x

    def dump_group():
        # close the current group (if it has any members) and start an empty one
        nonlocal curg
        if len(curg) > 0:
            # print("new group")
            groups.append(LocInterval(from_=curg[0], to=curg[-1]))
            curg = []

    while locsi.exists(i):
        if i % 10000 == 0:
            logger.debug('grouping item %d', i)

        # items before i are no longer needed; let the window drop them
        locsi.consume_to(i)

        last = None if len(curg) == 0 else curg[-1]
        cur = locsi[i]
        j = i
        match = False
        # look ahead over items i .. i + 9 for one that continues the current group
        while not match and locsi.exists(j) and j < i + 10: # TODO FIXME time distance here... e.g. half an hour?
            cur = locsi[j]
            if last is None or cur.tag == last.tag:
                # ok
                add_to_group(cur)
                # resume right after the matched item; anything between i and j is skipped
                i = j + 1
                match = True
            else:
                j += 1
        # if we made here without advancing
        if not match:
            dump_group()
            # NOTE(review): the item at i is stepped over here without being added to
            # any group, so the next group starts at i + 1 -- confirm this is intended
            i += 1
        else:
            pass
    # flush whatever group is still open once the input is exhausted
    dump_group()
    return groups


def update_cache():
    """
    Remove the cache file if it exists, then read every location once.

    The locations themselves are thrown away; the iteration is done only so
    that the cached iterator behind ``iter_locations()`` runs to completion.
    """
    # TODO perhaps set hash to null instead, that's a bit less intrusive
    cache_present = CACHE_PATH.exists()
    if cache_present:
        CACHE_PATH.unlink()
    # a zero-length deque exhausts the iterator without keeping any item
    deque(iter_locations(), maxlen=0)