location fallback (#263)

see https://github.com/karlicoss/HPI/issues/262

* move home to fallback/via_home.py
* move via_ip to fallback
* add fallback model
* add stub via_ip file
* add fallback_locations for via_ip
* use protocol for locations
* estimate_from helper, via_home estimator, all.py
* via_home: add accuracy, cache history
* add datasources to gpslogger/google_takeout
* tz/via_location.py: update import to fallback
* denylist docs/installation instructions
* tz.via_location: let user customize cachew refresh time
* add via_ip.estimate_location using binary search
* use estimate_location in via_home.get_location
* tests: add gpslogger to location config stub
* tests: install tz related libs in test env
* tz: add regression test for broken windows dates

* vendorize bisect_left from the Python source
bisect_left doesn't have a 'key' parameter until Python 3.10
This commit is contained in:
seanbreckenridge 2023-02-27 20:30:06 -08:00 committed by GitHub
parent 6dc5e7575f
commit 98b086f746
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
25 changed files with 1166 additions and 190 deletions

106
tests/core/test_denylist.py Normal file
View file

@ -0,0 +1,106 @@
import warnings
import json
from pathlib import Path
from datetime import datetime
from typing import NamedTuple, Iterator
from my.core.denylist import DenyList
class IP(NamedTuple):
    """Sample record used as input data for the denylist test."""

    # the IP address, as a dotted string
    addr: str
    # the datetime attached to this record
    dt: datetime
def data() -> Iterator[IP]:
    """Yield ten sample IP records, dated the first of each month from January to October 2020."""
    # random IP addresses
    addrs = (
        "67.98.113.0",
        "59.40.113.87",
        "161.235.192.228",
        "165.243.139.87",
        "69.69.141.154",
        "50.72.224.80",
        "221.67.89.168",
        "177.113.119.251",
        "93.200.246.215",
        "127.105.171.61",
    )
    # the n-th address (1-based) is dated the first day of month n
    for month, addr in enumerate(addrs, start=1):
        yield IP(addr=addr, dt=datetime(2020, month, 1))
def test_denylist(tmp_path: Path) -> None:
    """Round-trip a DenyList through deny/write/load and check filtering after each step."""
    denylist_file = (tmp_path / "denylist.json").absolute()

    with warnings.catch_warnings(record=True):
        # create empty denylist (though file does not have to exist for denylist to work)
        denylist_file.write_text("[]")

        d = DenyList(denylist_file)
        d.load()
        assert dict(d._deny_map) == {}
        assert d._deny_raw_list == []

        # with nothing denied, filtering is a no-op
        assert list(d.filter(data())) == list(data())
        # no data in denylist yet
        assert len(d._deny_map) == 0
        assert len(d._deny_raw_list) == 0

        # add some data
        d.deny(key="addr", value="67.98.113.0")
        # write and reload to update _deny_map, _deny_raw_list
        d.write()
        d.load()

        assert len(d._deny_map) == 1
        assert len(d._deny_raw_list) == 1
        assert d._deny_raw_list == [{"addr": "67.98.113.0"}]

        remaining = list(d.filter(data()))
        assert len(remaining) == 9
        assert "67.98.113.0" not in {ip.addr for ip in remaining}

        assert dict(d._deny_map) == {"addr": {"67.98.113.0"}}

        # invert=True returns only the items that were denied
        rejected = list(d.filter(data(), invert=True))
        assert len(rejected) == 1
        assert rejected[0] == IP(addr="67.98.113.0", dt=datetime(2020, 1, 1))

        # add some non-JSON primitive data
        d.deny(key="dt", value=datetime(2020, 2, 1))

        # test internal behavior, _deny_raw_list should have been updated,
        # but _deny_map doesn't get updated by a call to .deny
        #
        # if we change this just update the test, this is just here to ensure
        # this is the behaviour
        assert len(d._deny_map) == 1

        # write and load to update _deny_map
        d.write()
        d.load()

        assert len(d._deny_map) == 2
        assert len(d._deny_raw_list) == 2
        # the datetime was serialized to an ISO string
        assert d._deny_raw_list[-1] == {"dt": "2020-02-01T00:00:00"}

        remaining = list(d.filter(data()))
        assert len(remaining) == 8
        assert "59.40.113.87" not in {ip.addr for ip in remaining}

        # the file on disk should hold exactly the two denied entries
        data_json = json.loads(denylist_file.read_text())
        assert data_json == [
            {
                "addr": "67.98.113.0",
            },
            {
                "dt": "2020-02-01T00:00:00",
            },
        ]

View file

@ -1,7 +1,5 @@
from pathlib import Path
from more_itertools import one
import pytest # type: ignore
@ -20,26 +18,11 @@ def test() -> None:
@pytest.fixture(autouse=True)
def prepare(tmp_path: Path):
from .common import reset_modules
reset_modules()
user_config = _prepare_google_config(tmp_path)
from .shared_config import temp_config
user_config = temp_config(tmp_path)
import my.core.cfg as C
with C.tmp_config() as config:
config.google = user_config # type: ignore
config.google = user_config.google
yield
def _prepare_google_config(tmp_path: Path):
from .common import testdata
track = one(testdata().rglob('italy-slovenia-2017-07-29.json'))
# todo ugh. unnecessary zipping, but at the moment takeout provider doesn't support plain dirs
import zipfile
with zipfile.ZipFile(tmp_path / 'takeout.zip', 'w') as zf:
zf.writestr('Takeout/Location History/Location History.json', track.read_bytes())
class google_config:
takeout_path = tmp_path
return google_config

125
tests/location_fallback.py Normal file
View file

@ -0,0 +1,125 @@
"""
To test my.location.fallback.all
"""
from typing import Iterator
from datetime import datetime, timezone, timedelta
from more_itertools import ilen
from my.ip.common import IP
def data() -> Iterator[IP]:
    """Yield five sample IP records with timezone-aware (UTC) datetimes, in chronological order."""
    # random IP addresses, paired with (year, month, day, hour)
    samples = (
        ("67.98.113.0", (2020, 1, 1, 12)),
        ("67.98.112.0", (2020, 1, 15, 12)),
        ("59.40.113.87", (2020, 2, 1, 12)),
        ("59.40.139.87", (2020, 2, 1, 16)),
        ("161.235.192.228", (2020, 3, 1, 12)),
    )
    for addr, (year, month, day, hour) in samples:
        yield IP(addr=addr, dt=datetime(year, month, day, hour, 0, 0, tzinfo=timezone.utc))
# redefine the my.ip.all.ips function to return the test data defined above,
# so the tests in this file run against a fixed, known set of IPs
import my.ip.all as ip_module
ip_module.ips = data
# NOTE(review): via_ip is imported only after the override above -- presumably so
# that it picks up the patched ips function; confirm before reordering these lines
from my.location.fallback import via_ip
# these are all tests for the bisect algorithm defined in via_ip.py
# to make sure we can correctly find IPs that are within the 'for_duration' of a given datetime
def test_ip_fallback() -> None:
    """Check via_ip's time-window lookup and its interop with the other fallback estimators."""

    def utc(*parts: int) -> datetime:
        # build a timezone-aware UTC datetime from positional date/time parts
        return datetime(*parts, tzinfo=timezone.utc)

    def ip_estimates(when: datetime) -> list:
        # materialize everything via_ip yields for the given datetime
        return list(via_ip.estimate_location(when))

    # make sure that the data override works
    assert ilen(ip_module.ips()) == ilen(data())
    assert ilen(ip_module.ips()) == ilen(via_ip.fallback_locations())
    assert ilen(via_ip.fallback_locations()) == 5
    assert ilen(via_ip._sorted_fallback_locations()) == 5

    # confirm duration from via_ip since that is used for bisect
    assert via_ip.config.for_duration == timedelta(hours=24)

    # basic tests

    # try estimating slightly before the first IP
    assert len(ip_estimates(utc(2020, 1, 1, 11, 59, 59))) == 0

    # during the duration for the first IP
    assert len(ip_estimates(utc(2020, 1, 1, 12, 30, 0))) == 1

    # right after the 'for_duration' for an IP
    just_expired = utc(2020, 1, 1, 12, 0, 0) + via_ip.config.for_duration + timedelta(seconds=1)
    assert len(ip_estimates(just_expired)) == 0

    # on 2020-02-01, there's one IP if before 16:00
    assert len(ip_estimates(utc(2020, 2, 1, 12, 30, 0))) == 1

    # and two if after 16:00
    assert len(ip_estimates(utc(2020, 2, 1, 17, 0, 0))) == 2

    # the 12:00 IP should 'expire' before the 16:00 IP, use 3:30PM on the next day
    assert len(ip_estimates(utc(2020, 2, 2, 15, 30, 0))) == 1

    use_dt = utc(2020, 3, 1, 12, 15, 0)

    # test last IP
    est = ip_estimates(use_dt)
    assert len(est) == 1

    # datetime should be the IP's, not the passed datetime (via_home, by contrast, uses the passed dt)
    assert est[0].dt != use_dt

    # test interop with other fallback estimators/all.py
    #
    # redefine fallback_estimators to prevent possible namespace packages the user
    # may have installed from having side effects testing this
    from my.location.fallback import all as fallback_all
    from my.location.fallback import via_home

    def _fe() -> Iterator[fallback_all.LocationEstimator]:
        yield via_ip.estimate_location
        yield via_home.estimate_location

    fallback_all.fallback_estimators = _fe
    assert ilen(fallback_all.fallback_estimators()) == 2

    # test that all.estimate_location has access to both IPs
    #
    # just passing via_ip should give one IP
    from my.location.fallback.common import _iter_estimate_from

    raw_est = list(_iter_estimate_from(use_dt, (via_ip.estimate_location,)))
    assert len(raw_est) == 1
    assert raw_est[0].datasource == "via_ip"
    assert raw_est[0].accuracy == 15_000

    # passing home should give one
    home_est = list(_iter_estimate_from(use_dt, (via_home.estimate_location,)))
    assert len(home_est) == 1
    assert home_est[0].accuracy == 30_000

    # make sure ip accuracy is more accurate
    assert raw_est[0].accuracy < home_est[0].accuracy

    # passing both should give two
    both = (via_ip.estimate_location, via_home.estimate_location)
    raw_est = list(_iter_estimate_from(use_dt, both))
    assert len(raw_est) == 2

    # shouldn't raise value error
    all_est = fallback_all.estimate_location(use_dt)
    # should have used the IP from via_ip since it was more accurate
    assert all_est.datasource == "via_ip"

    # test that a home defined in shared_config.py is used if no IP is found
    loc = fallback_all.estimate_location(utc(2021, 1, 1, 12, 30, 0))
    assert loc.datasource == "via_home"

    # test a different home using location.fallback.all
    bulgaria = fallback_all.estimate_location(utc(2006, 1, 1, 12, 30, 0))
    assert bulgaria.datasource == "via_home"
    assert (bulgaria.lat, bulgaria.lon) == (42.697842, 23.325973)
    assert (loc.lat, loc.lon) != (bulgaria.lat, bulgaria.lon)
# re-use prepare fixture for overriding config from shared_config.py
from .tz import prepare

65
tests/shared_config.py Normal file
View file

@ -0,0 +1,65 @@
# Defines some shared config for tests
from datetime import datetime, date, timezone
from pathlib import Path
from typing import Any, NamedTuple
import my.time.tz.via_location as LTZ
from more_itertools import one
class SharedConfig(NamedTuple):
    """Bundle of the stub config sections that the tests assign onto the temporary config."""

    # stub for the `google` config section
    google: Any
    # stub for the `location` config section
    location: Any
    # stub for the `time` config section
    time: Any
def _prepare_google_config(tmp_path: Path):
    """Build a stub ``google`` config class pointing at a freshly written takeout zip.

    Finds ``italy-slovenia-2017-07-29.json`` under the test data directory and
    writes it to ``tmp_path / 'takeout.zip'`` under the path
    ``Takeout/Location History/Location History.json``.

    Args:
        tmp_path: directory in which the zip is created; also used as ``takeout_path``.

    Returns:
        A class with a single ``takeout_path`` attribute set to ``tmp_path``.

    Raises:
        RuntimeError: if the lookup for the test data file raises ``ValueError``.
    """
    from .common import testdata

    try:
        track = one(testdata().rglob('italy-slovenia-2017-07-29.json'))
    except ValueError as e:
        # chain explicitly so the traceback reports the failed lookup as the direct cause
        raise RuntimeError('testdata not found, setup git submodules?') from e

    # todo ugh. unnecessary zipping, but at the moment takeout provider doesn't support plain dirs
    import zipfile

    with zipfile.ZipFile(tmp_path / 'takeout.zip', 'w') as zf:
        zf.writestr('Takeout/Location History/Location History.json', track.read_bytes())

    class google_config:
        takeout_path = tmp_path

    return google_config
# pass tmp_path from pytest to this helper function
# see tests/tz.py as an example
def temp_config(temp_path: Path) -> Any:
    """Build the shared google/location/time config stubs used by the tests.

    Calls ``reset_modules()`` and sets ``LTZ.config.fast = True`` before
    constructing the stubs.

    Args:
        temp_path: temporary directory (pytest's ``tmp_path``), passed on to
            ``_prepare_google_config``.

    Returns:
        A ``SharedConfig`` holding the three stub config sections.
    """
    from .common import reset_modules

    reset_modules()
    LTZ.config.fast = True

    class location:
        home_accuracy = 30_000
        # note: order doesn't matter, will be sorted in the data provider
        home = (
            # supports ISO strings
            ('2005-12-04', (42.697842, 23.325973)),  # Bulgaria, Sofia
            # supports date/datetime objects
            (date(year=1980, month=2, day=15), (40.7128, -74.0060)),  # NY
            # check tz handling..
            (datetime.fromtimestamp(1600000000, tz=timezone.utc), (55.7558, 37.6173)),  # Moscow, Russia
        )

        class via_ip:
            accuracy = 15_000

        class gpslogger:
            pass

    class time:
        class tz:
            class via_location:
                pass  # just rely on the defaults...

    google = _prepare_google_config(temp_path)
    return SharedConfig(google=google, location=location, time=time)

View file

@ -1,4 +1,5 @@
from datetime import datetime, timedelta, date, timezone
import sys
from datetime import datetime, timedelta
from pathlib import Path
import pytest # type: ignore
@ -46,8 +47,15 @@ def test_tz() -> None:
tz = LTZ._get_tz(D('20201001 14:15:16'))
assert tz is not None
tz = LTZ._get_tz(datetime.min)
assert tz is not None
on_windows = sys.platform == 'win32'
if not on_windows:
tz = LTZ._get_tz(datetime.min)
assert tz is not None
else:
# seems this fails because Windows doesn't support the same date ranges
# https://stackoverflow.com/a/41400321/
with pytest.raises(OSError):
LTZ._get_tz(datetime.min)
def test_policies() -> None:
@ -73,36 +81,15 @@ def D(dstr: str) -> datetime:
return datetime.strptime(dstr, '%Y%m%d %H:%M:%S')
# TODO copy pasted from location.py, need to extract some common provider
@pytest.fixture(autouse=True)
def prepare(tmp_path: Path):
from .common import reset_modules
reset_modules()
LTZ.config.fast = True
from .location import _prepare_google_config
google = _prepare_google_config(tmp_path)
class location:
home = (
# supports ISO strings
('2005-12-04' , (42.697842, 23.325973)), # Bulgaria, Sofia
# supports date/datetime objects
(date(year=1980, month=2, day=15) , (40.7128 , -74.0060 )), # NY
# check tz handling..
(datetime.fromtimestamp(1600000000, tz=timezone.utc), (55.7558 , 37.6173 )), # Moscow, Russia
)
# note: order doesn't matter, will be sorted in the data provider
class time:
class tz:
class via_location:
pass # just rely on the defaults...
from .shared_config import temp_config
conf = temp_config(tmp_path)
import my.core.cfg as C
with C.tmp_config() as config:
config.google = google
config.time = time
config.location = location
config.google = conf.google
config.time = conf.time
config.location = conf.location
yield