core/time: more flexible support for resolving TZ abbreviation -> TZ ambiguities

addresses https://github.com/karlicoss/HPI/issues/103 for now via the experimental time.tz.force_abbreviations config variable; not sure if this whole thing can ever be resolved properly
2021-03-07 21:50:39 +00:00 · 2021-03-07 21:50:39 +00:00 · 1fd2a9f643
commit 1fd2a9f643
parent 5ef638694e
3 changed files with 54 additions and 12 deletions
--- a/my/core/time.py
+++ b/my/core/time.py
@ -1,20 +1,50 @@
 from functools import lru_cache
 from datetime import datetime, tzinfo
 from typing import Sequence
 import pytz # type: ignore
-# https://gist.github.com/edwardabraham/8680198
+
-tz_lookup = {
+def user_forced() -> Sequence[str]:
-    pytz.timezone(x).localize(datetime.now()).tzname(): pytz.timezone(x)
+    # conversion from abbreviations is always ambiguous
-    for x in pytz.all_timezones
+    # https://stackoverflow.com/questions/36067621/python-all-possible-timezone-abbreviations-for-given-timezone-name-and-vise-ve
-}
+    try:
-tz_lookup['UTC'] = pytz.utc # ugh. otherwise it'z Zulu...
+        from my.config import time as user_config
        return user_config.tz.force_abbreviations # type: ignore[attr-defined]
    except:
        # todo log/apply policy
        return []
-# TODO dammit, lru_cache interferes with mypy?
+@lru_cache(1)
 def _abbr_to_timezone_map():
    """
    Build a dict mapping timezone abbreviations to pytz timezone objects.

    Timezone names are processed in this order: everything in ``pytz.all_timezones``,
    then ``'UTC'``, then the names returned by ``user_forced()``.
    Entries are plain dict assignments, so when several timezones share an abbreviation,
    the one processed last wins -- i.e. user-forced timezones take priority.

    Raises whatever ``pytz.timezone`` raises if a name (e.g. a user-supplied one) is not recognised.
    """
    # also force UTC to always correspond to utc
    # this makes more sense than the Zulu it ends up as by default
    timezones = pytz.all_timezones + ['UTC'] + list(user_forced())
    res = {}
    for tzname in timezones:
        tz = pytz.timezone(tzname)
        # NOTE: _tzinfos is a private pytz attribute; fall back to nothing if it's missing
        infos = getattr(tz, '_tzinfos', []) # not sure if we can rely on the attr always being present?
        for info in infos:
            # the last element of each entry is used as the abbreviation
            abbr = info[-1]
            # todo could support this with a better error handling strategy?
            # (i.e. detect conflicting abbreviations instead of silently overwriting)
            # otz = res.get(abbr, tz)
            # if otz != tz:
            #     raise RuntimeError(abbr, tz, otz)
            res[abbr] = tz
        # ugh. also necessary, e.g. for Zulu?? why is it not in _tzinfos?
        # note: somehow this is not the same as the tzname!
        # (tzname here is the loop variable, i.e. the zone name, whereas _tzname is the private attribute on the tz object)
        tzn = getattr(tz, '_tzname', None)
        if tzn is not None:
            res[tzn] = tz
    return res
 # todo dammit, lru_cache interferes with mypy?
@lru_cache(None)
 def abbr_to_timezone(abbr: str) -> tzinfo:
-    return tz_lookup[abbr]
+    return _abbr_to_timezone_map()[abbr]
 def zone_to_countrycode(zone: str) -> str:
@ -30,3 +60,6 @@ def _zones_to_countrycode():
        for timezone in timezones:
            res[timezone] = countrycode
    return res
 # todo stuff here could be a bit more defensive? e.g. dependent on policy
--- a/my/google/takeout/html.py
+++ b/my/google/takeout/html.py
@ -29,22 +29,29 @@ def parse_dt(s: str) -> datetime:
    if end == ' PM' or end == ' AM':
        # old takeouts didn't have timezone
        # hopefully it was utc? Legacy, so not that much of an issue anymore..
        # todo although maybe worth adding timezone from location provider?
        tz = pytz.utc
    else:
        s, tzabbr = s.rsplit(maxsplit=1)
        tz = abbr_to_timezone(tzabbr)
    dt = datetime.strptime(s, fmt)
-    dt = tz.localize(dt)
+    return tz.localize(dt)
    return dt
-def test_parse_dt():
+def test_parse_dt() -> None:
    parse_dt('Jun 23, 2015, 2:43:45 PM')
    parse_dt('Jan 25, 2019, 8:23:48 AM GMT')
    parse_dt('Jan 22, 2020, 8:34:00 PM UTC')
    parse_dt('Sep 10, 2019, 8:51:45 PM MSK')
    # these test cases are interesting: in pytz, abbr resolution might depend on the _current_ date!
    # so these used to fail during winter
    # you can see all the different values used in the _tzinfos field
    parse_dt('Jun 01, 2018, 11:00:00 PM BST')
    parse_dt('Jun 01, 2018, 11:00:00 PM PDT')
    parse_dt('Feb 01, 2018, 11:00:00 PM PST')
 class State(Enum):
    OUTSIDE = 0
--- a/tests/takeout.py
+++ b/tests/takeout.py
@ -3,7 +3,7 @@ from datetime import datetime
 from itertools import islice
 import pytz
-import my.location.takeout as LT
+import my.location.google as LT
 from my.google.takeout.html import read_html
 from my.google.takeout.paths import get_last_takeout
@ -69,3 +69,5 @@ def parse_takeout_xmllint(data: str):
    out = res.stdout.decode('utf8')
    # out = data
    return out.split('<div class="content-cell')
 from my.google.takeout.html import test_parse_dt