From 1f2e595be9d0e914ac604e358c05e9fd46e40a9a Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Mon, 5 Oct 2020 23:21:09 +0100 Subject: [PATCH] Initial my.time.tz provider, infer from location with daily resolution --- my/google/takeout/paths.py | 7 ++- my/location/google.py | 5 +- my/time/tz/main.py | 9 +++ my/time/tz/via_location.py | 118 +++++++++++++++++++++++++++++++++++++ tests/tz.py | 62 +++++++++++++++++++ tox.ini | 7 ++- 6 files changed, 202 insertions(+), 6 deletions(-) create mode 100644 my/time/tz/main.py create mode 100644 my/time/tz/via_location.py create mode 100644 tests/tz.py diff --git a/my/google/takeout/paths.py b/my/google/takeout/paths.py index e36e22c..994d5d3 100644 --- a/my/google/takeout/paths.py +++ b/my/google/takeout/paths.py @@ -7,6 +7,9 @@ from ...core.common import Paths, get_files from ...core.util import __NOT_HPI_MODULE__ from my.config import google as user_config + +from more_itertools import last + @dataclass class google(user_config): takeout_path: Paths # path/paths/glob for the takeout zips @@ -35,9 +38,7 @@ def get_takeouts(*, path: Optional[str]=None) -> Iterable[Path]: def get_last_takeout(*, path: Optional[str]=None) -> Path: - # TODO more_itertools? - matching = list(get_takeouts(path=path)) - return matching[-1] + return last(get_takeouts(path=path)) # TODO might be a good idea to merge across multiple takeouts... 
diff --git a/my/location/google.py b/my/location/google.py index 5572514..0bd68f3 100644 --- a/my/location/google.py +++ b/my/location/google.py @@ -18,7 +18,6 @@ import geopy # type: ignore from ..core.common import LazyLogger, mcachew from ..core.cachew import cache_dir -from ..google.takeout.paths import get_last_takeout from ..kython import kompress @@ -148,7 +147,9 @@ def _iter_locations(path: Path, start=0, stop=None) -> Iterator[Location]: def locations(**kwargs) -> Iterator[Location]: - # TODO need to include older data + # NOTE: if this import isn't lazy, tests/tz.py breaks because it can't override config + # very weird, as if this function captures the values of globals somehow?? investigate later. + from ..google.takeout.paths import get_last_takeout last_takeout = get_last_takeout(path=_LOCATION_JSON) return _iter_locations(path=last_takeout, **kwargs) diff --git a/my/time/tz/main.py b/my/time/tz/main.py new file mode 100644 index 0000000..87d8a17 --- /dev/null +++ b/my/time/tz/main.py @@ -0,0 +1,9 @@ +''' +Timezone data provider +''' +from datetime import datetime + +def localize(dt: datetime) -> datetime: + # For now, it's user's responsibility to check that it actually managed to localize + from . import via_location as L + return L.localize(dt) diff --git a/my/time/tz/via_location.py b/my/time/tz/via_location.py new file mode 100644 index 0000000..15ffe5f --- /dev/null +++ b/my/time/tz/via_location.py @@ -0,0 +1,118 @@ +''' +Timezone data provider, useful for localizing UTC-only/timezone unaware dates.
+'''
+REQUIRES = [
+    # for determining timezone by coordinate
+    'timezonefinder',
+]
+
+
+from collections import Counter
+from datetime import date, datetime
+from functools import lru_cache
+from itertools import groupby, islice
+from pathlib import Path
+from typing import Dict, Iterator, List, NamedTuple, Optional, Tuple
+
+from more_itertools import seekable
+import pytz
+
+from ...core.common import LazyLogger
+from ...location.google import locations
+
+
+logger = LazyLogger(__name__, level='debug')
+
+
+# todo should move to config? not sure
+_FASTER: bool = False
+@lru_cache(2)
+def _timezone_finder(fast: Optional[bool] = None): # fast=None falls back to the module-level _FASTER flag
+    from timezonefinder import TimezoneFinder as Finder # type: ignore
+    if (_FASTER if fast is None else fast):
+        from timezonefinder import TimezoneFinderL as Finder # type: ignore
+    return Finder(in_memory=True)
+
+
+Zone = str
+
+
+# NOTE: for now only daily resolution is supported... later will implement something more efficient
+class DayWithZone(NamedTuple):
+    day: date
+    zone: Zone
+
+
+def _iter_local_dates(start=0, stop=None) -> Iterator[DayWithZone]:
+    finder = _timezone_finder(fast=_FASTER) # flag is read at call time; the finder is cached per flag value
+    pdt = None
+    warnings = []
+    # todo allow to skip if not too many errors in a row?
+    for l in locations(start=start, stop=stop):
+        # TODO right. it's _very_ slow...
+        zone = finder.timezone_at(lng=l.lon, lat=l.lat)
+        if zone is None:
+            warnings.append(f"Couldn't figure out tz for {l}")
+            continue
+        tz = pytz.timezone(zone)
+        ldt = l.dt.astimezone(tz)
+        ndate = ldt.date()
+        if pdt is not None and ndate < pdt.date():
+            # TODO for now just drop and collect the stats
+            # I guess we'd have minor drops while air travel...
+            warnings.append(f"local time goes backwards {ldt} ({tz}) < {pdt}")
+            continue
+        pdt = ldt
+        yield DayWithZone(day=ndate, zone=tz.zone)
+
+
+def most_common(l):
+    res, count = Counter(l).most_common(1)[0] # type: ignore[var-annotated]
+    return res
+
+
+def _iter_tzs() -> Iterator[DayWithZone]:
+    for d, gr in groupby(_iter_local_dates(), key=lambda p: p.day):
+        logger.info('processed %s', d)
+        zone = most_common(list(gr)).zone
+        yield DayWithZone(day=d, zone=zone)
+
+
+@lru_cache(1)
+def loc_tz_getter() -> Iterator[DayWithZone]:
+    # seekable makes it cache the emitted values
+    return seekable(_iter_tzs())
+
+
+# todo expose zone names too?
+@lru_cache(maxsize=None)
+def _get_day_tz(d: date) -> Optional[pytz.BaseTzInfo]:
+    sit = loc_tz_getter()
+    # todo hmm. seeking is not super efficient... might need to use some smarter dict-based cache
+    # hopefully, this method itself caches stuff for the users, so won't be too bad
+    sit.seek(0) # type: ignore
+
+    zone: Optional[str] = None
+    for x, tz in sit: # x is the day, tz is the zone name chosen for that day
+        if x == d:
+            zone = tz
+        if x >= d: # days never go backwards (_iter_local_dates drops those), so nothing later can match
+            break
+    return None if zone is None else pytz.timezone(zone)
+
+
+def _get_tz(dt: datetime) -> Optional[pytz.BaseTzInfo]:
+    return _get_day_tz(d=dt.date())
+
+
+def localize(dt: datetime) -> datetime:
+    # todo not sure. warn instead?
+ assert dt.tzinfo is None, dt + tz = _get_tz(dt) + if tz is None: + return dt + else: + return tz.localize(dt) + + +# TODO: cache stuff diff --git a/tests/tz.py b/tests/tz.py new file mode 100644 index 0000000..b117a63 --- /dev/null +++ b/tests/tz.py @@ -0,0 +1,62 @@ +from datetime import datetime, timedelta +from pathlib import Path +import sys + +import pytest # type: ignore + +import my.time.tz.main as TZ +import my.time.tz.via_location as LTZ + + +def test_iter_tzs() -> None: + ll = list(LTZ._iter_tzs()) + assert len(ll) > 3 + + +def test_future() -> None: + fut = datetime.now() + timedelta(days=100) + # shouldn't crash at least: when no zone is known, localize() hands the naive datetime back + assert TZ.localize(fut) is not None + + +def test_tz() -> None: + # not present in the test data + tz = LTZ._get_tz(D('20200101 10:00:00')) + assert tz is None + + tz = LTZ._get_tz(D('20170801 11:00:00')) + assert tz is not None + assert tz.zone == 'Europe/Vienna' + + tz = LTZ._get_tz(D('20170730 10:00:00')) + assert tz is not None + assert tz.zone == 'Europe/Rome' + + +def D(dstr: str) -> datetime: + return datetime.strptime(dstr, '%Y%m%d %H:%M:%S') + + +# TODO copy pasted from location.py, need to extract some common provider +@pytest.fixture(autouse=True) +def prepare(tmp_path: Path): + LTZ._FASTER = True + + from more_itertools import one + testdata = Path(__file__).absolute().parent.parent / 'testdata' + assert testdata.exists(), testdata + + track = one(testdata.rglob('italy-slovenia-2017-07-29.json')) + + # todo ugh. unnecessary zipping, but at the moment takeout provider doesn't support plain dirs + import zipfile + with zipfile.ZipFile(tmp_path / 'takeout.zip', 'w') as zf: + zf.writestr('Takeout/Location History/Location History.json', track.read_bytes()) + + # FIXME ugh. early import/inheritance of user_config in my.google.takeout.paths messes things up..
+ from my.cfg import config + class user_config: + takeout_path = tmp_path + config.google = user_config # type: ignore + + yield diff --git a/tox.ini b/tox.ini index 3f26068..b9e8637 100644 --- a/tox.ini +++ b/tox.ini @@ -17,6 +17,9 @@ commands = # my.location.google deps pip install geopy ijson + # my.time.tz.via_location dep + pip install timezonefinder + python3 -m pytest \ tests/core.py \ tests/misc.py \ @@ -25,7 +28,8 @@ commands = tests/config.py::test_environment_variable \ tests/demo.py \ tests/bluemaestro.py \ - tests/location.py + tests/location.py \ + tests/tz.py # TODO add; once I figure out porg depdencency?? tests/config.py # TODO run demo.py? just make sure with_my is a bit cleverer? # TODO e.g. under CI, rely on installing @@ -63,6 +67,7 @@ commands = -p my.body.exercise.cross_trainer \ -p my.bluemaestro \ -p my.location.google \ + -p my.time.tz.via_location \ --txt-report .mypy-coverage \ --html-report .mypy-coverage \ {posargs}