time.tz.via_location: more consistent behaviour wrt caching

previously it was possible for cachew to never properly initialize the cache if you only queried some dates in the past,
because we never made it to the end of _iter_tzs

also some minor cleanup
This commit is contained in:
karlicoss 2023-11-10 22:09:48 +00:00
parent 70bb9ed0c5
commit 996169aa29

View file

@ -6,6 +6,24 @@ REQUIRES = [
'timezonefinder',
]
from collections import Counter
from dataclasses import dataclass
from datetime import date, datetime
from functools import lru_cache
import heapq
from itertools import groupby
import os
from typing import Iterator, Optional, Tuple, Any, List, Iterable, Set, Dict
import pytz
from my.core import make_logger, stat, Stats, datetime_aware
from my.core.common import mcachew
from my.core.source import import_source
from my.core.warnings import high
from my.location.common import LatLon
## user might not have tz config section, so makes sense to be more defensive about it
# todo might be useful to extract a helper for this
@ -27,8 +45,6 @@ if 'user_config' not in globals():
##
from my.core import dataclass
@dataclass
class config(user_config):
# less precise, but faster
@ -46,22 +62,10 @@ class config(user_config):
_iter_tz_refresh_time: int = 6
from collections import Counter
from datetime import date, datetime
from functools import lru_cache
from itertools import groupby
from typing import Iterator, NamedTuple, Optional, Tuple, Any, List, Iterable, Set
logger = make_logger(__name__)
import heapq
import pytz
from more_itertools import seekable
from my.core.common import LazyLogger, mcachew, tzdatetime
from my.core.source import import_source
logger = LazyLogger(__name__, level='warning')
@lru_cache(2)
@lru_cache(None)
def _timezone_finder(fast: bool) -> Any:
if fast:
# less precise, but faster
@ -71,30 +75,20 @@ def _timezone_finder(fast: bool) -> Any:
return Finder(in_memory=True)
# todo move to common?
Zone = str
# NOTE: for now only daily resolution is supported... later will implement something more efficient
class DayWithZone(NamedTuple):
day: date
zone: Zone
from my.location.common import LatLon
# for backwards compatibility
def _locations() -> Iterator[Tuple[LatLon, datetime]]:
def _locations() -> Iterator[Tuple[LatLon, datetime_aware]]:
try:
import my.location.all
for loc in my.location.all.locations():
if loc.accuracy is not None and loc.accuracy > config.require_accuracy:
continue
yield ((loc.lat, loc.lon), loc.dt)
except Exception as e:
from my.core.warnings import high
logger.exception("Could not setup via_location using my.location.all provider, falling back to legacy google implementation", exc_info=e)
logger.exception(
"Could not setup via_location using my.location.all provider, falling back to legacy google implementation", exc_info=e
)
high("Setup my.google.takeout.parser, then my.location.all for better google takeout/location data")
import my.location.google
@ -102,10 +96,22 @@ def _locations() -> Iterator[Tuple[LatLon, datetime]]:
for gloc in my.location.google.locations():
yield ((gloc.lat, gloc.lon), gloc.dt)
# TODO: could use heapmerge or sort the underlying iterators somehow?
# see https://github.com/karlicoss/HPI/pull/237#discussion_r858372934
def _sorted_locations() -> List[Tuple[LatLon, datetime]]:
return list(sorted(_locations(), key=lambda x: x[1]))
def _sorted_locations() -> List[Tuple[LatLon, datetime_aware]]:
    """
    Collect everything yielded by _locations() into a list,
    ordered by the datetime (the second element of each pair).
    """
    located = list(_locations())
    # list.sort is stable, same as sorted(), so items with equal datetimes keep their relative order
    located.sort(key=lambda pair: pair[1])
    return located
# todo move to common?
# timezone name, kept as a plain string
Zone = str


# NOTE: for now only daily resolution is supported... later will implement something more efficient
@dataclass(unsafe_hash=True)
class DayWithZone:
    """
    A calendar day paired with the name of the timezone assigned to that day.
    """

    # unsafe_hash=True generates __hash__ from the fields, so instances can be used
    # as dict/set/Counter keys even though the dataclass is not frozen
    day: date
    zone: Zone
def _find_tz_for_locs(finder: Any, locs: Iterable[Tuple[LatLon, datetime]]) -> Iterator[DayWithZone]:
@ -126,9 +132,11 @@ def _find_tz_for_locs(finder: Any, locs: Iterable[Tuple[LatLon, datetime]]) -> I
# warnings.append("local time goes backwards {ldt} ({tz}) < {pdt}")
# continue
# pdt = ldt
z = tz.zone; assert z is not None
z = tz.zone
assert z is not None
yield DayWithZone(day=ndate, zone=z)
# Note: this takes a while: since the upstream _locations isn't sorted, this
# has to do an iterative sort of the entire my.location.all list
def _iter_local_dates() -> Iterator[DayWithZone]:
@ -157,7 +165,7 @@ def _iter_local_dates_fallback() -> Iterator[DayWithZone]:
yield from _find_tz_for_locs(_timezone_finder(fast=config.fast), _fallback_locations())
def most_common(lst: List[DayWithZone]) -> DayWithZone:
def most_common(lst: Iterator[DayWithZone]) -> DayWithZone:
    """
    Return the item that occurs most often in lst.

    Raises IndexError if lst yields nothing.
    """
    tally = Counter(lst)
    # most_common(1) is a list holding at most one (item, count) pair
    top_pair = tally.most_common(1)[0]
    return top_pair[0]
@ -181,56 +189,46 @@ def _iter_tz_depends_on() -> str:
# refresh _iter_tzs every few hours -- don't think a better depends_on is possible dynamically
@mcachew(logger=logger, depends_on=_iter_tz_depends_on)
@mcachew(depends_on=_iter_tz_depends_on)
def _iter_tzs() -> Iterator[DayWithZone]:
# since we have no control over what order the locations are returned,
# we need to sort them first before we can do a groupby
local_dates: List[DayWithZone] = list(_iter_local_dates())
local_dates.sort(key=lambda p: p.day)
by_day = lambda p: p.day
local_dates: List[DayWithZone] = sorted(_iter_local_dates(), key=by_day)
logger.debug(f"no. of items using exact locations: {len(local_dates)}")
local_dates_fallback: List[DayWithZone] = list(_iter_local_dates_fallback())
local_dates_fallback.sort(key=lambda p: p.day)
local_dates_fallback: List[DayWithZone] = sorted(_iter_local_dates_fallback(), key=by_day)
# find days that are in fallback but not in local_dates (i.e., missing days)
local_dates_set: Set[date] = set(d.day for d in local_dates)
local_dates_set: Set[date] = {d.day for d in local_dates}
use_fallback_days: List[DayWithZone] = [d for d in local_dates_fallback if d.day not in local_dates_set]
logger.debug(f"no. of items being used from fallback locations: {len(use_fallback_days)}")
# combine local_dates and missing days from fallback into a sorted list
all_dates = heapq.merge(local_dates, use_fallback_days, key=lambda p: p.day)
all_dates = heapq.merge(local_dates, use_fallback_days, key=by_day)
# todo could probably use heapify here instead of heapq.merge?
for d, gr in groupby(all_dates, key=lambda p: p.day):
logger.info(f"processed {d}{', using fallback' if d in local_dates_set else ''}")
zone = most_common(list(gr)).zone
for d, gr in groupby(all_dates, key=by_day):
logger.debug(f"processed {d}{', using fallback' if d in local_dates_set else ''}")
zone = most_common(gr).zone
yield DayWithZone(day=d, zone=zone)
@lru_cache(1)
def loc_tz_getter() -> Iterator[DayWithZone]:
# seekable makes it cache the emitted values
return seekable(_iter_tzs())
def _day2zone() -> Dict[date, pytz.BaseTzInfo]:
    """
    Map each day yielded by _iter_tzs() to the pytz timezone object for that day's zone name.
    """
    # NOTE: kinda unfortunate that this has to process all days before returning the result for just one;
    # however otherwise the cachew cache might never be initialized properly,
    # so we'd always end up recomputing everything during subsequent runs
    day_to_tz: Dict[date, pytz.BaseTzInfo] = {}
    for entry in _iter_tzs():
        day_to_tz[entry.day] = pytz.timezone(entry.zone)
    return day_to_tz
# todo expose zone names too?
@lru_cache(maxsize=None)
def _get_day_tz(d: date) -> Optional[pytz.BaseTzInfo]:
sit = loc_tz_getter()
# todo hmm. seeking is not super efficient... might need to use some smarter dict-based cache
# hopefully, this method itself caches stuff for the users, so won't be too bad
sit.seek(0) # type: ignore
zone: Optional[str] = None
for x, tz in sit:
if x == d:
zone = tz
if x >= d:
break
return None if zone is None else pytz.timezone(zone)
return _day2zone().get(d)
# ok to cache, there are only a few home locations?
@lru_cache(maxsize=None)
@lru_cache(None)
def _get_home_tz(loc: LatLon) -> Optional[pytz.BaseTzInfo]:
(lat, lng) = loc
finder = _timezone_finder(fast=False) # ok to use slow here for better precision
@ -242,7 +240,7 @@ def _get_home_tz(loc: LatLon) -> Optional[pytz.BaseTzInfo]:
return pytz.timezone(zone)
def _get_tz(dt: datetime) -> Optional[pytz.BaseTzInfo]:
def get_tz(dt: datetime) -> Optional[pytz.BaseTzInfo]:
'''
Given a datetime, returns the timezone for that date.
'''
@ -258,16 +256,14 @@ def _get_tz(dt: datetime) -> Optional[pytz.BaseTzInfo]:
# that datetime is between, else fallback on your first home location, so it acts
# as a last resort
from my.location.fallback import via_home as home
loc = list(home.estimate_location(dt))
assert len(loc) == 1, f"should only have one home location, received {loc}"
return _get_home_tz(loc=(loc[0].lat, loc[0].lon))
# expose as 'public' function
get_tz = _get_tz
def localize(dt: datetime) -> tzdatetime:
tz = _get_tz(dt)
def localize(dt: datetime) -> datetime_aware:
tz = get_tz(dt)
if tz is None:
# TODO -- this shouldn't really happen.. think about it carefully later
return dt
@ -275,20 +271,17 @@ def localize(dt: datetime) -> tzdatetime:
return tz.localize(dt)
from ...core import stat, Stats
def stats(quick: bool = False) -> Stats:
if quick:
prev, config.sort_locations = config.sort_locations, False
res = {
'first': next(_iter_local_dates())
}
res = {'first': next(_iter_local_dates())}
config.sort_locations = prev
return res
# TODO not sure what would be a good stat() for this module...
# might be nice to print some actual timezones?
# there aren't really any great iterables to expose
import os
VIA_LOCATION_START_YEAR = int(os.environ.get("VIA_LOCATION_START_YEAR", 1990))
def localized_years():
last = datetime.now().year + 2
# note: deliberately take + 2 years, so the iterator exhausts. otherwise stuff might never get cached
@ -296,4 +289,9 @@ def stats(quick: bool=False) -> Stats:
for Y in range(VIA_LOCATION_START_YEAR, last):
dt = datetime.fromisoformat(f'{Y}-01-01 01:01:01')
yield localize(dt)
return stat(localized_years)
# deprecated -- still used in some other modules so need to keep
_get_tz = get_tz