diff --git a/my/time/tz/via_location.py b/my/time/tz/via_location.py index 1ed1ba7..612341a 100644 --- a/my/time/tz/via_location.py +++ b/my/time/tz/via_location.py @@ -6,6 +6,24 @@ REQUIRES = [ 'timezonefinder', ] +from collections import Counter +from dataclasses import dataclass +from datetime import date, datetime +from functools import lru_cache +import heapq +from itertools import groupby +import os +from typing import Iterator, Optional, Tuple, Any, List, Iterable, Set, Dict + +import pytz + +from my.core import make_logger, stat, Stats, datetime_aware +from my.core.common import mcachew +from my.core.source import import_source +from my.core.warnings import high + +from my.location.common import LatLon + ## user might not have tz config section, so makes sense to be more defensive about it # todo might be useful to extract a helper for this @@ -27,8 +45,6 @@ if 'user_config' not in globals(): ## -from my.core import dataclass - @dataclass class config(user_config): # less precise, but faster @@ -46,55 +62,33 @@ class config(user_config): _iter_tz_refresh_time: int = 6 -from collections import Counter -from datetime import date, datetime -from functools import lru_cache -from itertools import groupby -from typing import Iterator, NamedTuple, Optional, Tuple, Any, List, Iterable, Set +logger = make_logger(__name__) -import heapq -import pytz -from more_itertools import seekable -from my.core.common import LazyLogger, mcachew, tzdatetime -from my.core.source import import_source - -logger = LazyLogger(__name__, level='warning') - -@lru_cache(2) +@lru_cache(None) def _timezone_finder(fast: bool) -> Any: if fast: # less precise, but faster from timezonefinder import TimezoneFinderL as Finder else: - from timezonefinder import TimezoneFinder as Finder # type: ignore + from timezonefinder import TimezoneFinder as Finder # type: ignore return Finder(in_memory=True) -# todo move to common? -Zone = str - - -# NOTE: for now only daily resolution is supported... 
later will implement something more efficient -class DayWithZone(NamedTuple): - day: date - zone: Zone - - -from my.location.common import LatLon - # for backwards compatibility -def _locations() -> Iterator[Tuple[LatLon, datetime]]: +def _locations() -> Iterator[Tuple[LatLon, datetime_aware]]: try: import my.location.all + for loc in my.location.all.locations(): if loc.accuracy is not None and loc.accuracy > config.require_accuracy: continue yield ((loc.lat, loc.lon), loc.dt) except Exception as e: - from my.core.warnings import high - logger.exception("Could not setup via_location using my.location.all provider, falling back to legacy google implementation", exc_info=e) + logger.exception( + "Could not setup via_location using my.location.all provider, falling back to legacy google implementation", exc_info=e + ) high("Setup my.google.takeout.parser, then my.location.all for better google takeout/location data") import my.location.google @@ -102,10 +96,22 @@ def _locations() -> Iterator[Tuple[LatLon, datetime]]: for gloc in my.location.google.locations(): yield ((gloc.lat, gloc.lon), gloc.dt) + # TODO: could use heapmerge or sort the underlying iterators somehow? # see https://github.com/karlicoss/HPI/pull/237#discussion_r858372934 -def _sorted_locations() -> List[Tuple[LatLon, datetime]]: - return list(sorted(_locations(), key=lambda x: x[1])) +def _sorted_locations() -> List[Tuple[LatLon, datetime_aware]]: + return sorted(_locations(), key=lambda x: x[1]) + + +# todo move to common? +Zone = str + + +# NOTE: for now only daily resolution is supported... later will implement something more efficient +@dataclass(unsafe_hash=True) +class DayWithZone: + day: date + zone: Zone def _find_tz_for_locs(finder: Any, locs: Iterable[Tuple[LatLon, datetime]]) -> Iterator[DayWithZone]: @@ -120,20 +126,22 @@ def _find_tz_for_locs(finder: Any, locs: Iterable[Tuple[LatLon, datetime]]) -> I # TODO this is probably a bit expensive... 
test & benchmark ldt = dt.astimezone(tz) ndate = ldt.date() - #if pdt is not None and ndate < pdt.date(): + # if pdt is not None and ndate < pdt.date(): # # TODO for now just drop and collect the stats # # I guess we'd have minor drops while air travel... # warnings.append("local time goes backwards {ldt} ({tz}) < {pdt}") # continue - #pdt = ldt - z = tz.zone; assert z is not None + # pdt = ldt + z = tz.zone + assert z is not None yield DayWithZone(day=ndate, zone=z) + # Note: this takes a while, as the upstream since _locations isn't sorted, so this # has to do an iterative sort of the entire my.locations.all list def _iter_local_dates() -> Iterator[DayWithZone]: - finder = _timezone_finder(fast=config.fast) # rely on the default - #pdt = None + finder = _timezone_finder(fast=config.fast) # rely on the default + # pdt = None # TODO: warnings doesn't actually warn? # warnings = [] @@ -157,7 +165,7 @@ def _iter_local_dates_fallback() -> Iterator[DayWithZone]: yield from _find_tz_for_locs(_timezone_finder(fast=config.fast), _fallback_locations()) -def most_common(lst: List[DayWithZone]) -> DayWithZone: +def most_common(lst: Iterator[DayWithZone]) -> DayWithZone: res, _ = Counter(lst).most_common(1)[0] return res @@ -181,59 +189,49 @@ def _iter_tz_depends_on() -> str: # refresh _iter_tzs every few hours -- don't think a better depends_on is possible dynamically -@mcachew(logger=logger, depends_on=_iter_tz_depends_on) +@mcachew(depends_on=_iter_tz_depends_on) def _iter_tzs() -> Iterator[DayWithZone]: # since we have no control over what order the locations are returned, # we need to sort them first before we can do a groupby - local_dates: List[DayWithZone] = list(_iter_local_dates()) - local_dates.sort(key=lambda p: p.day) + by_day = lambda p: p.day + + local_dates: List[DayWithZone] = sorted(_iter_local_dates(), key=by_day) logger.debug(f"no. 
of items using exact locations: {len(local_dates)}") - local_dates_fallback: List[DayWithZone] = list(_iter_local_dates_fallback()) - local_dates_fallback.sort(key=lambda p: p.day) + local_dates_fallback: List[DayWithZone] = sorted(_iter_local_dates_fallback(), key=by_day) # find days that are in fallback but not in local_dates (i.e., missing days) - local_dates_set: Set[date] = set(d.day for d in local_dates) + local_dates_set: Set[date] = {d.day for d in local_dates} use_fallback_days: List[DayWithZone] = [d for d in local_dates_fallback if d.day not in local_dates_set] logger.debug(f"no. of items being used from fallback locations: {len(use_fallback_days)}") # combine local_dates and missing days from fallback into a sorted list - all_dates = heapq.merge(local_dates, use_fallback_days, key=lambda p: p.day) + all_dates = heapq.merge(local_dates, use_fallback_days, key=by_day) + # todo could probably use heapify here instead of heapq.merge? - for d, gr in groupby(all_dates, key=lambda p: p.day): - logger.info(f"processed {d}{', using fallback' if d in local_dates_set else ''}") - zone = most_common(list(gr)).zone + for d, gr in groupby(all_dates, key=by_day): + logger.debug(f"processed {d}{', using fallback' if d in local_dates_set else ''}") + zone = most_common(gr).zone yield DayWithZone(day=d, zone=zone) @lru_cache(1) -def loc_tz_getter() -> Iterator[DayWithZone]: - # seekable makes it cache the emitted values - return seekable(_iter_tzs()) +def _day2zone() -> Dict[date, pytz.BaseTzInfo]: + # NOTE: kinda unfortunate that this will have to process all days before returning result for just one + # however otherwise cachew cache might never be initialized properly + # so we'll always end up recomputing everything during subsequent runs + return {dz.day: pytz.timezone(dz.zone) for dz in _iter_tzs()} -# todo expose zone names too? -@lru_cache(maxsize=None) def _get_day_tz(d: date) -> Optional[pytz.BaseTzInfo]: - sit = loc_tz_getter() - # todo hmm. 
seeking is not super efficient... might need to use some smarter dict-based cache - # hopefully, this method itself caches stuff forthe users, so won't be too bad - sit.seek(0) # type: ignore - - zone: Optional[str] = None - for x, tz in sit: - if x == d: - zone = tz - if x >= d: - break - return None if zone is None else pytz.timezone(zone) + return _day2zone().get(d) # ok to cache, there are only a few home locations? -@lru_cache(maxsize=None) +@lru_cache(None) def _get_home_tz(loc: LatLon) -> Optional[pytz.BaseTzInfo]: (lat, lng) = loc - finder = _timezone_finder(fast=False) # ok to use slow here for better precision + finder = _timezone_finder(fast=False) # ok to use slow here for better precision zone = finder.timezone_at(lat=lat, lng=lng) if zone is None: # TODO shouldn't really happen, warn? @@ -242,7 +240,7 @@ def _get_home_tz(loc: LatLon) -> Optional[pytz.BaseTzInfo]: return pytz.timezone(zone) -def _get_tz(dt: datetime) -> Optional[pytz.BaseTzInfo]: +def get_tz(dt: datetime) -> Optional[pytz.BaseTzInfo]: ''' Given a datetime, returns the timezone for that date. ''' @@ -258,16 +256,14 @@ def _get_tz(dt: datetime) -> Optional[pytz.BaseTzInfo]: # that datetime is between, else fallback on your first home location, so it acts # as a last resort from my.location.fallback import via_home as home + loc = list(home.estimate_location(dt)) assert len(loc) == 1, f"should only have one home location, received {loc}" return _get_home_tz(loc=(loc[0].lat, loc[0].lon)) -# expose as 'public' function -get_tz = _get_tz - -def localize(dt: datetime) -> tzdatetime: - tz = _get_tz(dt) +def localize(dt: datetime) -> datetime_aware: + tz = get_tz(dt) if tz is None: # TODO -- this shouldn't really happen.. 
think about it carefully later return dt @@ -275,20 +271,17 @@ def localize(dt: datetime) -> tzdatetime: return tz.localize(dt) -from ...core import stat, Stats -def stats(quick: bool=False) -> Stats: +def stats(quick: bool = False) -> Stats: if quick: prev, config.sort_locations = config.sort_locations, False - res = { - 'first': next(_iter_local_dates()) - } + res = {'first': next(_iter_local_dates())} config.sort_locations = prev return res # TODO not sure what would be a good stat() for this module... # might be nice to print some actual timezones? # there aren't really any great iterables to expose - import os VIA_LOCATION_START_YEAR = int(os.environ.get("VIA_LOCATION_START_YEAR", 1990)) + def localized_years(): last = datetime.now().year + 2 # note: deliberately take + 2 years, so the iterator exhausts. otherwise stuff might never get cached @@ -296,4 +289,9 @@ def stats(quick: bool=False) -> Stats: for Y in range(VIA_LOCATION_START_YEAR, last): dt = datetime.fromisoformat(f'{Y}-01-01 01:01:01') yield localize(dt) + return stat(localized_years) + + +# deprecated -- still used in some other modules so need to keep +_get_tz = get_tz