time.tz.via_location: more consistent behaviour wrt caching

previously it was possible for cachew to never properly initialize the cache if you only queried some dates in the past,
because we never made it to the end of _iter_tzs

also some minor cleanup
This commit is contained in:
karlicoss 2023-11-10 22:09:48 +00:00
parent 70bb9ed0c5
commit 996169aa29

View file

@ -6,6 +6,24 @@ REQUIRES = [
'timezonefinder', 'timezonefinder',
] ]
from collections import Counter
from dataclasses import dataclass
from datetime import date, datetime
from functools import lru_cache
import heapq
from itertools import groupby
import os
from typing import Iterator, Optional, Tuple, Any, List, Iterable, Set, Dict
import pytz
from my.core import make_logger, stat, Stats, datetime_aware
from my.core.common import mcachew
from my.core.source import import_source
from my.core.warnings import high
from my.location.common import LatLon
## user might not have tz config section, so makes sense to be more defensive about it ## user might not have tz config section, so makes sense to be more defensive about it
# todo might be useful to extract a helper for this # todo might be useful to extract a helper for this
@ -27,8 +45,6 @@ if 'user_config' not in globals():
## ##
from my.core import dataclass
@dataclass @dataclass
class config(user_config): class config(user_config):
# less precise, but faster # less precise, but faster
@ -46,55 +62,33 @@ class config(user_config):
_iter_tz_refresh_time: int = 6 _iter_tz_refresh_time: int = 6
from collections import Counter logger = make_logger(__name__)
from datetime import date, datetime
from functools import lru_cache
from itertools import groupby
from typing import Iterator, NamedTuple, Optional, Tuple, Any, List, Iterable, Set
import heapq
import pytz
from more_itertools import seekable
from my.core.common import LazyLogger, mcachew, tzdatetime @lru_cache(None)
from my.core.source import import_source
logger = LazyLogger(__name__, level='warning')
@lru_cache(2)
def _timezone_finder(fast: bool) -> Any: def _timezone_finder(fast: bool) -> Any:
if fast: if fast:
# less precise, but faster # less precise, but faster
from timezonefinder import TimezoneFinderL as Finder from timezonefinder import TimezoneFinderL as Finder
else: else:
from timezonefinder import TimezoneFinder as Finder # type: ignore from timezonefinder import TimezoneFinder as Finder # type: ignore
return Finder(in_memory=True) return Finder(in_memory=True)
# todo move to common?
Zone = str
# NOTE: for now only daily resolution is supported... later will implement something more efficient
class DayWithZone(NamedTuple):
day: date
zone: Zone
from my.location.common import LatLon
# for backwards compatibility # for backwards compatibility
def _locations() -> Iterator[Tuple[LatLon, datetime]]: def _locations() -> Iterator[Tuple[LatLon, datetime_aware]]:
try: try:
import my.location.all import my.location.all
for loc in my.location.all.locations(): for loc in my.location.all.locations():
if loc.accuracy is not None and loc.accuracy > config.require_accuracy: if loc.accuracy is not None and loc.accuracy > config.require_accuracy:
continue continue
yield ((loc.lat, loc.lon), loc.dt) yield ((loc.lat, loc.lon), loc.dt)
except Exception as e: except Exception as e:
from my.core.warnings import high logger.exception(
logger.exception("Could not setup via_location using my.location.all provider, falling back to legacy google implementation", exc_info=e) "Could not setup via_location using my.location.all provider, falling back to legacy google implementation", exc_info=e
)
high("Setup my.google.takeout.parser, then my.location.all for better google takeout/location data") high("Setup my.google.takeout.parser, then my.location.all for better google takeout/location data")
import my.location.google import my.location.google
@ -102,10 +96,22 @@ def _locations() -> Iterator[Tuple[LatLon, datetime]]:
for gloc in my.location.google.locations(): for gloc in my.location.google.locations():
yield ((gloc.lat, gloc.lon), gloc.dt) yield ((gloc.lat, gloc.lon), gloc.dt)
# TODO: could use heapmerge or sort the underlying iterators somehow? # TODO: could use heapmerge or sort the underlying iterators somehow?
# see https://github.com/karlicoss/HPI/pull/237#discussion_r858372934 # see https://github.com/karlicoss/HPI/pull/237#discussion_r858372934
def _sorted_locations() -> List[Tuple[LatLon, datetime]]: def _sorted_locations() -> List[Tuple[LatLon, datetime_aware]]:
return list(sorted(_locations(), key=lambda x: x[1])) return sorted(_locations(), key=lambda x: x[1])
# todo move to common?
Zone = str
# NOTE: for now only daily resolution is supported... later will implement something more efficient
@dataclass(unsafe_hash=True)
class DayWithZone:
day: date
zone: Zone
def _find_tz_for_locs(finder: Any, locs: Iterable[Tuple[LatLon, datetime]]) -> Iterator[DayWithZone]: def _find_tz_for_locs(finder: Any, locs: Iterable[Tuple[LatLon, datetime]]) -> Iterator[DayWithZone]:
@ -120,20 +126,22 @@ def _find_tz_for_locs(finder: Any, locs: Iterable[Tuple[LatLon, datetime]]) -> I
# TODO this is probably a bit expensive... test & benchmark # TODO this is probably a bit expensive... test & benchmark
ldt = dt.astimezone(tz) ldt = dt.astimezone(tz)
ndate = ldt.date() ndate = ldt.date()
#if pdt is not None and ndate < pdt.date(): # if pdt is not None and ndate < pdt.date():
# # TODO for now just drop and collect the stats # # TODO for now just drop and collect the stats
# # I guess we'd have minor drops while air travel... # # I guess we'd have minor drops while air travel...
# warnings.append("local time goes backwards {ldt} ({tz}) < {pdt}") # warnings.append("local time goes backwards {ldt} ({tz}) < {pdt}")
# continue # continue
#pdt = ldt # pdt = ldt
z = tz.zone; assert z is not None z = tz.zone
assert z is not None
yield DayWithZone(day=ndate, zone=z) yield DayWithZone(day=ndate, zone=z)
# Note: this takes a while, as the upstream since _locations isn't sorted, so this # Note: this takes a while, as the upstream since _locations isn't sorted, so this
# has to do an iterative sort of the entire my.locations.all list # has to do an iterative sort of the entire my.locations.all list
def _iter_local_dates() -> Iterator[DayWithZone]: def _iter_local_dates() -> Iterator[DayWithZone]:
finder = _timezone_finder(fast=config.fast) # rely on the default finder = _timezone_finder(fast=config.fast) # rely on the default
#pdt = None # pdt = None
# TODO: warnings doesn't actually warn? # TODO: warnings doesn't actually warn?
# warnings = [] # warnings = []
@ -157,7 +165,7 @@ def _iter_local_dates_fallback() -> Iterator[DayWithZone]:
yield from _find_tz_for_locs(_timezone_finder(fast=config.fast), _fallback_locations()) yield from _find_tz_for_locs(_timezone_finder(fast=config.fast), _fallback_locations())
def most_common(lst: List[DayWithZone]) -> DayWithZone: def most_common(lst: Iterator[DayWithZone]) -> DayWithZone:
res, _ = Counter(lst).most_common(1)[0] res, _ = Counter(lst).most_common(1)[0]
return res return res
@ -181,59 +189,49 @@ def _iter_tz_depends_on() -> str:
# refresh _iter_tzs every few hours -- don't think a better depends_on is possible dynamically # refresh _iter_tzs every few hours -- don't think a better depends_on is possible dynamically
@mcachew(logger=logger, depends_on=_iter_tz_depends_on) @mcachew(depends_on=_iter_tz_depends_on)
def _iter_tzs() -> Iterator[DayWithZone]: def _iter_tzs() -> Iterator[DayWithZone]:
# since we have no control over what order the locations are returned, # since we have no control over what order the locations are returned,
# we need to sort them first before we can do a groupby # we need to sort them first before we can do a groupby
local_dates: List[DayWithZone] = list(_iter_local_dates()) by_day = lambda p: p.day
local_dates.sort(key=lambda p: p.day)
local_dates: List[DayWithZone] = sorted(_iter_local_dates(), key=by_day)
logger.debug(f"no. of items using exact locations: {len(local_dates)}") logger.debug(f"no. of items using exact locations: {len(local_dates)}")
local_dates_fallback: List[DayWithZone] = list(_iter_local_dates_fallback()) local_dates_fallback: List[DayWithZone] = sorted(_iter_local_dates_fallback(), key=by_day)
local_dates_fallback.sort(key=lambda p: p.day)
# find days that are in fallback but not in local_dates (i.e., missing days) # find days that are in fallback but not in local_dates (i.e., missing days)
local_dates_set: Set[date] = set(d.day for d in local_dates) local_dates_set: Set[date] = {d.day for d in local_dates}
use_fallback_days: List[DayWithZone] = [d for d in local_dates_fallback if d.day not in local_dates_set] use_fallback_days: List[DayWithZone] = [d for d in local_dates_fallback if d.day not in local_dates_set]
logger.debug(f"no. of items being used from fallback locations: {len(use_fallback_days)}") logger.debug(f"no. of items being used from fallback locations: {len(use_fallback_days)}")
# combine local_dates and missing days from fallback into a sorted list # combine local_dates and missing days from fallback into a sorted list
all_dates = heapq.merge(local_dates, use_fallback_days, key=lambda p: p.day) all_dates = heapq.merge(local_dates, use_fallback_days, key=by_day)
# todo could probably use heapify here instead of heapq.merge?
for d, gr in groupby(all_dates, key=lambda p: p.day): for d, gr in groupby(all_dates, key=by_day):
logger.info(f"processed {d}{', using fallback' if d in local_dates_set else ''}") logger.debug(f"processed {d}{', using fallback' if d in local_dates_set else ''}")
zone = most_common(list(gr)).zone zone = most_common(gr).zone
yield DayWithZone(day=d, zone=zone) yield DayWithZone(day=d, zone=zone)
@lru_cache(1) @lru_cache(1)
def loc_tz_getter() -> Iterator[DayWithZone]: def _day2zone() -> Dict[date, pytz.BaseTzInfo]:
# seekable makes it cache the emitted values # NOTE: kinda unfortunate that this will have to process all days before returning result for just one
return seekable(_iter_tzs()) # however otherwise cachew cache might never be initialized properly
# so we'll always end up recomputing everything during subsequent runs
return {dz.day: pytz.timezone(dz.zone) for dz in _iter_tzs()}
# todo expose zone names too?
@lru_cache(maxsize=None)
def _get_day_tz(d: date) -> Optional[pytz.BaseTzInfo]: def _get_day_tz(d: date) -> Optional[pytz.BaseTzInfo]:
sit = loc_tz_getter() return _day2zone().get(d)
# todo hmm. seeking is not super efficient... might need to use some smarter dict-based cache
# hopefully, this method itself caches stuff for the users, so won't be too bad
sit.seek(0) # type: ignore
zone: Optional[str] = None
for x, tz in sit:
if x == d:
zone = tz
if x >= d:
break
return None if zone is None else pytz.timezone(zone)
# ok to cache, there are only a few home locations? # ok to cache, there are only a few home locations?
@lru_cache(maxsize=None) @lru_cache(None)
def _get_home_tz(loc: LatLon) -> Optional[pytz.BaseTzInfo]: def _get_home_tz(loc: LatLon) -> Optional[pytz.BaseTzInfo]:
(lat, lng) = loc (lat, lng) = loc
finder = _timezone_finder(fast=False) # ok to use slow here for better precision finder = _timezone_finder(fast=False) # ok to use slow here for better precision
zone = finder.timezone_at(lat=lat, lng=lng) zone = finder.timezone_at(lat=lat, lng=lng)
if zone is None: if zone is None:
# TODO shouldn't really happen, warn? # TODO shouldn't really happen, warn?
@ -242,7 +240,7 @@ def _get_home_tz(loc: LatLon) -> Optional[pytz.BaseTzInfo]:
return pytz.timezone(zone) return pytz.timezone(zone)
def _get_tz(dt: datetime) -> Optional[pytz.BaseTzInfo]: def get_tz(dt: datetime) -> Optional[pytz.BaseTzInfo]:
''' '''
Given a datetime, returns the timezone for that date. Given a datetime, returns the timezone for that date.
''' '''
@ -258,16 +256,14 @@ def _get_tz(dt: datetime) -> Optional[pytz.BaseTzInfo]:
# that datetime is between, else fallback on your first home location, so it acts # that datetime is between, else fallback on your first home location, so it acts
# as a last resort # as a last resort
from my.location.fallback import via_home as home from my.location.fallback import via_home as home
loc = list(home.estimate_location(dt)) loc = list(home.estimate_location(dt))
assert len(loc) == 1, f"should only have one home location, received {loc}" assert len(loc) == 1, f"should only have one home location, received {loc}"
return _get_home_tz(loc=(loc[0].lat, loc[0].lon)) return _get_home_tz(loc=(loc[0].lat, loc[0].lon))
# expose as 'public' function
get_tz = _get_tz
def localize(dt: datetime) -> datetime_aware:
def localize(dt: datetime) -> tzdatetime: tz = get_tz(dt)
tz = _get_tz(dt)
if tz is None: if tz is None:
# TODO -- this shouldn't really happen.. think about it carefully later # TODO -- this shouldn't really happen.. think about it carefully later
return dt return dt
@ -275,20 +271,17 @@ def localize(dt: datetime) -> tzdatetime:
return tz.localize(dt) return tz.localize(dt)
from ...core import stat, Stats def stats(quick: bool = False) -> Stats:
def stats(quick: bool=False) -> Stats:
if quick: if quick:
prev, config.sort_locations = config.sort_locations, False prev, config.sort_locations = config.sort_locations, False
res = { res = {'first': next(_iter_local_dates())}
'first': next(_iter_local_dates())
}
config.sort_locations = prev config.sort_locations = prev
return res return res
# TODO not sure what would be a good stat() for this module... # TODO not sure what would be a good stat() for this module...
# might be nice to print some actual timezones? # might be nice to print some actual timezones?
# there aren't really any great iterables to expose # there aren't really any great iterables to expose
import os
VIA_LOCATION_START_YEAR = int(os.environ.get("VIA_LOCATION_START_YEAR", 1990)) VIA_LOCATION_START_YEAR = int(os.environ.get("VIA_LOCATION_START_YEAR", 1990))
def localized_years(): def localized_years():
last = datetime.now().year + 2 last = datetime.now().year + 2
# note: deliberately take + 2 years, so the iterator exhausts. otherwise stuff might never get cached # note: deliberately take + 2 years, so the iterator exhausts. otherwise stuff might never get cached
@ -296,4 +289,9 @@ def stats(quick: bool=False) -> Stats:
for Y in range(VIA_LOCATION_START_YEAR, last): for Y in range(VIA_LOCATION_START_YEAR, last):
dt = datetime.fromisoformat(f'{Y}-01-01 01:01:01') dt = datetime.fromisoformat(f'{Y}-01-01 01:01:01')
yield localize(dt) yield localize(dt)
return stat(localized_years) return stat(localized_years)
# deprecated -- still used in some other modules so need to keep
_get_tz = get_tz