location fallback (#263)

see https://github.com/karlicoss/HPI/issues/262

* move home to fallback/via_home.py
* move via_ip to fallback
* add fallback model
* add stub via_ip file
* add fallback_locations for via_ip
* use protocol for locations
* estimate_from helper, via_home estimator, all.py
* via_home: add accuracy, cache history
* add datasources to gpslogger/google_takeout
* tz/via_location.py: update import to fallback
* denylist docs/installation instructions
* tz.via_location: let user customize cachew refresh time
* add via_ip.estimate_location using binary search
* use estimate_location in via_home.get_location
* tests: add gpslogger to location config stub
* tests: install tz related libs in test env
* tz: add regression test for broken windows dates

* vendorize bisect_left from python src
doesnt have a 'key' parameter till python3.10
This commit is contained in:
seanbreckenridge 2023-02-27 20:30:06 -08:00 committed by GitHub
parent 6dc5e7575f
commit 98b086f746
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
25 changed files with 1166 additions and 190 deletions

View file

@ -41,17 +41,23 @@ class config(user_config):
# if the accuracy for the location is more than 5km, don't use
require_accuracy: float = 5_000
# how often (hours) to refresh the cachew timezone cache
# this may be removed in the future if we opt for dict-based caching
_iter_tz_refresh_time: int = 6
from collections import Counter
from datetime import date, datetime
from functools import lru_cache
from itertools import groupby
from typing import Iterator, NamedTuple, Optional, Tuple, Any, List, Iterable
from typing import Iterator, NamedTuple, Optional, Tuple, Any, List, Iterable, Set
from more_itertools import seekable
import heapq
import pytz
from more_itertools import seekable
from my.core.common import LazyLogger, mcachew, tzdatetime
from my.core.source import import_source
logger = LazyLogger(__name__, level='warning')
@ -102,23 +108,13 @@ def _sorted_locations() -> List[Tuple[LatLon, datetime]]:
return list(sorted(_locations(), key=lambda x: x[1]))
# Note: this takes a while, as the upstream since _locations isn't sorted, so this
# has to do an iterative sort of the entire my.locations.all list
def _iter_local_dates() -> Iterator[DayWithZone]:
finder = _timezone_finder(fast=config.fast) # rely on the default
#pdt = None
# TODO: warnings doesnt actually warn?
warnings = []
locs: Iterable[Tuple[LatLon, datetime]]
locs = _sorted_locations() if config.sort_locations else _locations()
# todo allow to skip if not noo many errors in row?
def _find_tz_for_locs(finder: Any, locs: Iterable[Tuple[LatLon, datetime]]) -> Iterator[DayWithZone]:
for (lat, lon), dt in locs:
# TODO right. its _very_ slow...
zone = finder.timezone_at(lat=lat, lng=lon)
# todo allow to skip if not noo many errors in row?
if zone is None:
warnings.append(f"Couldn't figure out tz for {lat}, {lon}")
# warnings.append(f"Couldn't figure out tz for {lat}, {lon}")
continue
tz = pytz.timezone(zone)
# TODO this is probably a bit expensive... test & benchmark
@ -133,6 +129,33 @@ def _iter_local_dates() -> Iterator[DayWithZone]:
z = tz.zone; assert z is not None
yield DayWithZone(day=ndate, zone=z)
# Note: this takes a while, as the upstream since _locations isn't sorted, so this
# has to do an iterative sort of the entire my.locations.all list
def _iter_local_dates() -> Iterator[DayWithZone]:
finder = _timezone_finder(fast=config.fast) # rely on the default
#pdt = None
# TODO: warnings doesnt actually warn?
# warnings = []
locs: Iterable[Tuple[LatLon, datetime]]
locs = _sorted_locations() if config.sort_locations else _locations()
yield from _find_tz_for_locs(finder, locs)
# my.location.fallback.estimate_location could be used here
# but iterating through all the locations is faster since this
# is saved behind cachew
@import_source(module_name="my.location.fallback.all")
def _iter_local_dates_fallback() -> Iterator[DayWithZone]:
from my.location.fallback.all import fallback_locations as flocs
def _fallback_locations() -> Iterator[Tuple[LatLon, datetime]]:
for loc in sorted(flocs(), key=lambda x: x.dt):
yield ((loc.lat, loc.lon), loc.dt)
yield from _find_tz_for_locs(_timezone_finder(fast=config.fast), _fallback_locations())
def most_common(lst: List[DayWithZone]) -> DayWithZone:
res, _ = Counter(lst).most_common(1)[0] # type: ignore[var-annotated]
@ -142,27 +165,43 @@ def most_common(lst: List[DayWithZone]) -> DayWithZone:
def _iter_tz_depends_on() -> str:
"""
Since you might get new data which specifies a new timezone sometime
in the day, this causes _iter_tzs to refresh every 6 hours, like:
in the day, this causes _iter_tzs to refresh every _iter_tz_refresh_time hours
(default 6), like:
2022-04-26_00
2022-04-26_06
2022-04-26_12
2022-04-26_18
"""
mod = config._iter_tz_refresh_time
assert mod >= 1
day = str(date.today())
hr = datetime.now().hour
hr_truncated = hr // 6 * 6
hr_truncated = hr // mod * mod
return "{}_{}".format(day, hr_truncated)
# refresh _iter_tzs every 6 hours -- don't think a better depends_on is possible dynamically
# refresh _iter_tzs every few hours -- don't think a better depends_on is possible dynamically
@mcachew(logger=logger, depends_on=_iter_tz_depends_on)
def _iter_tzs() -> Iterator[DayWithZone]:
# since we have no control over what order the locations are returned,
# we need to sort them first before we can do a groupby
local_dates: List[DayWithZone] = list(_iter_local_dates())
local_dates.sort(key=lambda p: p.day)
for d, gr in groupby(local_dates, key=lambda p: p.day):
logger.info('processed %s', d)
logger.debug(f"no. of items using exact locations: {len(local_dates)}")
local_dates_fallback: List[DayWithZone] = list(_iter_local_dates_fallback())
local_dates_fallback.sort(key=lambda p: p.day)
# find days that are in fallback but not in local_dates (i.e., missing days)
local_dates_set: Set[date] = set(d.day for d in local_dates)
use_fallback_days: List[DayWithZone] = [d for d in local_dates_fallback if d.day not in local_dates_set]
logger.debug(f"no. of items being used from fallback locations: {len(use_fallback_days)}")
# combine local_dates and missing days from fallback into a sorted list
all_dates = heapq.merge(local_dates, use_fallback_days, key=lambda p: p.day)
for d, gr in groupby(all_dates, key=lambda p: p.day):
logger.info(f"processed {d}{', using fallback' if d in local_dates_set else ''}")
zone = most_common(list(gr)).zone
yield DayWithZone(day=d, zone=zone)
@ -192,7 +231,7 @@ def _get_day_tz(d: date) -> Optional[pytz.BaseTzInfo]:
# ok to cache, there are only a few home locations?
@lru_cache(maxsize=None)
def _get_home_tz(loc) -> Optional[pytz.BaseTzInfo]:
def _get_home_tz(loc: LatLon) -> Optional[pytz.BaseTzInfo]:
(lat, lng) = loc
finder = _timezone_finder(fast=False) # ok to use slow here for better precision
zone = finder.timezone_at(lat=lat, lng=lng)
@ -211,9 +250,17 @@ def _get_tz(dt: datetime) -> Optional[pytz.BaseTzInfo]:
if res is not None:
return res
# fallback to home tz
from ...location import home
loc = home.get_location(dt)
return _get_home_tz(loc=loc)
# note: the fallback to fallback.via_home.estimate_location is still needed, since
# _iter_local_dates_fallback only returns days which we actually have a datetime for
# (e.g. there was an IP address within a day of that datetime)
#
# given a datetime, fallback.via_home.estimate_location will find which home location
# that datetime is between, else fallback on your first home location, so it acts
# as a last resort
from my.location.fallback import via_home as home
loc = list(home.estimate_location(dt))
assert len(loc) == 1, f"should only have one home location, received {loc}"
return _get_home_tz(loc=(loc[0].lat, loc[0].lon))
# expose as 'public' function
get_tz = _get_tz