From ca10d524a4a5ccf2d65b7252c53933323f9f3926 Mon Sep 17 00:00:00 2001 From: Sean Breckenridge Date: Mon, 25 Apr 2022 18:21:52 -0700 Subject: [PATCH] location: add all.py, using takeout/gpslogger/ip --- my/config.py | 11 ++++- my/ip/all.py | 28 +++++++++++++ my/ip/common.py | 39 ++++++++++++++++++ my/location/all.py | 46 +++++++++++++++++++++ my/location/common.py | 17 ++++++++ my/location/google.py | 7 ++++ my/location/google_takeout.py | 33 +++++++++++++++ my/location/gpslogger.py | 75 +++++++++++++++++++++++++++++++++++ my/location/home.py | 8 +--- my/location/via_ip.py | 39 ++++++++++++++++++ my/time/tz/via_location.py | 74 ++++++++++++++++++++++++---------- tox.ini | 7 ++++ 12 files changed, 357 insertions(+), 27 deletions(-) create mode 100644 my/ip/all.py create mode 100644 my/ip/common.py create mode 100644 my/location/all.py create mode 100644 my/location/common.py create mode 100644 my/location/google_takeout.py create mode 100644 my/location/gpslogger.py create mode 100644 my/location/via_ip.py diff --git a/my/config.py b/my/config.py index 5bb316f..0746803 100644 --- a/my/config.py +++ b/my/config.py @@ -72,10 +72,19 @@ class location: # and we can't import the types from the module itself, otherwise would be circular. common module? 
home: Union[LatLon, Sequence[Tuple[DateIsh, LatLon]]] = (1.0, -1.0) + class via_ip: + accuracy: float + + class gpslogger: + export_path: Paths = '' + accuracy: float + class time: class tz: - pass + class via_location: + fast: bool + require_accuracy: float class orgmode: diff --git a/my/ip/all.py b/my/ip/all.py new file mode 100644 index 0000000..a7ac679 --- /dev/null +++ b/my/ip/all.py @@ -0,0 +1,28 @@ +""" +An example all.py stub module that provides ip data + +To use this, you'd add IP providers that yield IPs to the 'ips' function + +For an example of how this could be used, see https://github.com/seanbreckenridge/HPI/tree/master/my/ip +""" + +REQUIRES = ["git+https://github.com/seanbreckenridge/ipgeocache"] + + +from typing import Iterator + +from my.core.common import Stats + +from .common import IP + + +def ips() -> Iterator[IP]: + yield from () + + +def stats() -> Stats: + from my.core import stat + + return { + **stat(ips), + } diff --git a/my/ip/common.py b/my/ip/common.py new file mode 100644 index 0000000..fb57406 --- /dev/null +++ b/my/ip/common.py @@ -0,0 +1,39 @@ +""" +Provides location/timezone data from IP addresses, using [[https://github.com/seanbreckenridge/ipgeocache][ipgeocache]] +""" + +REQUIRES = ["git+https://github.com/seanbreckenridge/ipgeocache"] + +from my.core import __NOT_HPI_MODULE__ + +import ipaddress +from typing import NamedTuple, Iterator +from datetime import datetime + +import ipgeocache + +from my.core import Json + + +class IP(NamedTuple): + dt: datetime + addr: str # an IP address + + # TODO: could cache? 
not sure if it's worth it + def ipgeocache(self) -> Json: + return ipgeocache.get(self.addr) + + @property + def tz(self) -> str: + tz: str = self.ipgeocache()["timezone"] + return tz + + +def drop_private(ips: Iterator[IP]) -> Iterator[IP]: + """ + Helper function that can be used to filter out private IPs + """ + for ip in ips: + if ipaddress.ip_address(ip.addr).is_private: + continue + yield ip diff --git a/my/location/all.py b/my/location/all.py new file mode 100644 index 0000000..d52dce9 --- /dev/null +++ b/my/location/all.py @@ -0,0 +1,46 @@ +""" +Merges location data from multiple sources +""" + +from typing import Iterator + +from my.core import Stats, LazyLogger +from my.core.source import import_source + +from my.location.via_ip import locations + +from .common import Location + + +logger = LazyLogger(__name__, level="warning") + + +def locations() -> Iterator[Location]: + yield from _takeout_locations() + yield from _gpslogger_locations() + yield from _ip_locations() + + +@import_source(module_name="my.location.via_ip") +def _ip_locations() -> Iterator[Location]: + from . import via_ip + yield from via_ip.locations() + + +@import_source(module_name="my.location.google_takeout") +def _takeout_locations() -> Iterator[Location]: + from . import google_takeout + yield from google_takeout.locations() + + +@import_source(module_name="my.location.gpslogger") +def _gpslogger_locations() -> Iterator[Location]: + from . import gpslogger + yield from gpslogger.locations() + +def stats() -> Stats: + from my.core import stat + + return { + **stat(locations), + } diff --git a/my/location/common.py b/my/location/common.py new file mode 100644 index 0000000..b5cf3bd --- /dev/null +++ b/my/location/common.py @@ -0,0 +1,17 @@ +from datetime import date, datetime +from typing import Union, Tuple, NamedTuple, Optional + +from my.core import __NOT_HPI_MODULE__ + +DateIsh = Union[datetime, date, str] + +LatLon = Tuple[float, float] + + +# TODO: add timezone to this? 
can use timezonefinder in tz provider instead though +class Location(NamedTuple): + lon: float + lat: float + dt: datetime + accuracy: Optional[float] + elevation: Optional[float] diff --git a/my/location/google.py b/my/location/google.py index f196301..21ba3ed 100644 --- a/my/location/google.py +++ b/my/location/google.py @@ -1,6 +1,9 @@ """ Location data from Google Takeout + +DEPRECATED: setup my.google.takeout.parser and use my.location.google_takeout instead """ + REQUIRES = [ 'geopy', # checking that coordinates are valid 'ijson', @@ -20,6 +23,10 @@ from ..core.common import LazyLogger, mcachew from ..core.cachew import cache_dir from ..core import kompress +from my.core.warnings import high + +high("Please set up my.google.takeout.parser module for better takeout support") + # otherwise uses ijson # todo move to config?? diff --git a/my/location/google_takeout.py b/my/location/google_takeout.py new file mode 100644 index 0000000..8b24e5f --- /dev/null +++ b/my/location/google_takeout.py @@ -0,0 +1,33 @@ +""" +Extracts locations using google_takeout_parser -- no shared code with the deprecated my.location.google +""" + +REQUIRES = ["git+https://github.com/seanbreckenridge/google_takeout_parser"] + +from typing import Iterator + +from my.google.takeout.parser import events, _cachew_depends_on +from google_takeout_parser.models import Location as GoogleLocation + +from my.core.common import mcachew, LazyLogger, Stats +from .common import Location + +logger = LazyLogger(__name__) + + +@mcachew( + depends_on=_cachew_depends_on, + logger=logger, +) +def locations() -> Iterator[Location]: + for g in events(): + if isinstance(g, GoogleLocation) and not isinstance(g, Exception): + yield Location( + lon=g.lng, lat=g.lat, dt=g.dt, accuracy=g.accuracy, elevation=None + ) + + +def stats() -> Stats: + from my.core import stat + + return {**stat(locations)} diff --git a/my/location/gpslogger.py b/my/location/gpslogger.py new file mode 100644 index 0000000..d0eebed --- 
/dev/null +++ b/my/location/gpslogger.py @@ -0,0 +1,75 @@ +""" +Parse [[https://github.com/mendhak/gpslogger][gpslogger]] .gpx (xml) files +""" + +REQUIRES = ["gpxpy"] + +from my.config import location +from my.core import Paths, dataclass + + +@dataclass +class config(location.gpslogger): + # path[s]/glob to the synced gpx (XML) files + export_path: Paths + + # default accuracy for gpslogger + accuracy: float = 50.0 + + +from itertools import chain +from datetime import datetime, timezone +from pathlib import Path +from typing import Iterator, Sequence, List + +import gpxpy # type: ignore[import] +from more_itertools import unique_everseen + +from my.core import Stats, LazyLogger +from my.core.common import get_files, mcachew +from my.utils.input_source import InputSource +from .common import Location + + +logger = LazyLogger(__name__, level="warning") + + +def inputs() -> Sequence[Path]: + return get_files(config.export_path, glob="*.gpx") + + +def _cachew_depends_on(from_paths: InputSource) -> List[float]: + return [p.stat().st_mtime for p in from_paths()] + + +# TODO: could use a better cachew key/this has to recompute every file whenever the newest one changes +@mcachew(depends_on=_cachew_depends_on, logger=logger) +def locations(from_paths: InputSource = inputs) -> Iterator[Location]: + yield from unique_everseen( + chain(*map(_extract_locations, from_paths())), key=lambda loc: loc.dt + ) + + +def _extract_locations(path: Path) -> Iterator[Location]: + with path.open("r") as gf: + gpx_obj = gpxpy.parse(gf) + for track in gpx_obj.tracks: + for segment in track.segments: + for point in segment.points: + if point.time is None: + continue + # hmm - for gpslogger, seems that timezone is always SimpleTZ('Z'), which + # specifies UTC -- see https://github.com/tkrajina/gpxpy/blob/cb243b22841bd2ce9e603fe3a96672fc75edecf2/gpxpy/gpxfield.py#L38 + yield Location( + lat=point.latitude, + lon=point.longitude, + accuracy=config.accuracy, + elevation=point.elevation, + 
dt=datetime.replace(point.time, tzinfo=timezone.utc), + ) + + +def stats() -> Stats: + from my.core import stat + + return {**stat(locations)} diff --git a/my/location/home.py b/my/location/home.py index dd7209f..ac0fcb8 100644 --- a/my/location/home.py +++ b/my/location/home.py @@ -2,17 +2,13 @@ Simple location provider, serving as a fallback when more detailed data isn't available ''' from dataclasses import dataclass -from datetime import datetime, date, time, timezone +from datetime import datetime, time, timezone from functools import lru_cache from typing import Sequence, Tuple, Union, cast from my.config import location as user_config - -DateIsh = Union[datetime, date, str] - -# todo hopefully reasonable? might be nice to add name or something too -LatLon = Tuple[float, float] +from my.location.common import LatLon, DateIsh @dataclass class Config(user_config): diff --git a/my/location/via_ip.py b/my/location/via_ip.py new file mode 100644 index 0000000..3cb5539 --- /dev/null +++ b/my/location/via_ip.py @@ -0,0 +1,39 @@ +""" +Converts IP addresses provided by my.ip.all to estimated locations +""" + +REQUIRES = ["git+https://github.com/seanbreckenridge/ipgeocache"] + +from my.core import dataclass, Stats +from my.config import location + + +@dataclass +class config(location.via_ip): + # no real science to this, just a guess of ~15km accuracy for IP addresses + accuracy: int = 15_000 + + +from typing import Iterator + +from .common import Location +from my.ip.all import ips + + +def locations() -> Iterator[Location]: + for ip in ips(): + loc: str = ip.ipgeocache()["loc"] + lat, _, lon = loc.partition(",") + yield Location( + lat=float(lat), + lon=float(lon), + dt=ip.dt, + accuracy=config.accuracy, + elevation=None, + ) + + +def stats() -> Stats: + from my.core import stat + + return {**stat(locations)} diff --git a/my/time/tz/via_location.py b/my/time/tz/via_location.py index e390c43..274a560 100644 --- a/my/time/tz/via_location.py +++ 
b/my/time/tz/via_location.py @@ -7,27 +7,34 @@ REQUIRES = [ ] +from my.config import time +from my.core import dataclass + + +@dataclass +class config(time.tz.via_location): + # less precise, but faster + fast: bool = True + + # if the accuracy for the location is more than 5km, don't use + require_accuracy: float = 5_000 + + from collections import Counter from datetime import date, datetime from functools import lru_cache from itertools import groupby -from typing import Iterator, NamedTuple, Optional +from typing import Iterator, NamedTuple, Optional, Tuple, Any, List from more_itertools import seekable import pytz -from ...core.common import LazyLogger, mcachew, tzdatetime -from ...core.cachew import cache_dir -from ...location.google import locations +from my.core.common import LazyLogger, mcachew, tzdatetime +logger = LazyLogger(__name__, level='warning') -logger = LazyLogger(__name__, level='debug') - - -# todo should move to config? not sure -_FASTER: bool = True @lru_cache(2) -def _timezone_finder(fast: bool): +def _timezone_finder(fast: bool) -> Any: if fast: # less precise, but faster from timezonefinder import TimezoneFinderL as Finder # type: ignore @@ -46,20 +53,40 @@ class DayWithZone(NamedTuple): zone: Zone -def _iter_local_dates(start=0, stop=None) -> Iterator[DayWithZone]: - finder = _timezone_finder(fast=_FASTER) # rely on the default +from my.location.common import LatLon + +# for backwards compatibility +def _locations() -> Iterator[Tuple[LatLon, datetime]]: + try: + import my.location.all + for loc in my.location.all.locations(): + yield ((loc.lat, loc.lon), loc.dt) + + except Exception as e: + from my.core.warnings import high + logger.exception("Could not setup via_location using my.location.all provider, falling back to legacy google implemetation", exc_info=e) + high("Setup my.google.takeout.parser, then my.location.all for better google takeout/location data") + + import my.location.google + + for loc in my.location.google.locations(): + 
yield ((loc.lat, loc.lon), loc.dt) + + +def _iter_local_dates() -> Iterator[DayWithZone]: + finder = _timezone_finder(fast=config.fast) # rely on the default pdt = None warnings = [] # todo allow to skip if not noo many errors in row? - for l in locations(start=start, stop=stop): + for (lat, lon), dt in _locations(): # TODO right. its _very_ slow... - zone = finder.timezone_at(lng=l.lon, lat=l.lat) + zone = finder.timezone_at(lat=lat, lng=lon) if zone is None: - warnings.append(f"Couldn't figure out tz for {l}") + warnings.append(f"Couldn't figure out tz for {lat}, {lon}") continue tz = pytz.timezone(zone) # TODO this is probably a bit expensive... test & benchmark - ldt = l.dt.astimezone(tz) + ldt = dt.astimezone(tz) ndate = ldt.date() if pdt is not None and ndate < pdt.date(): # TODO for now just drop and collect the stats @@ -71,12 +98,13 @@ def _iter_local_dates(start=0, stop=None) -> Iterator[DayWithZone]: yield DayWithZone(day=ndate, zone=z) -def most_common(l): - res, count = Counter(l).most_common(1)[0] # type: ignore[var-annotated] +def most_common(lst: List[DayWithZone]) -> DayWithZone: + res, _ = Counter(lst).most_common(1)[0] # type: ignore[var-annotated] return res -@mcachew(cache_path=cache_dir()) +# refresh _iter_tzs once per day -- don't think a better depends_on is possible dynamically +@mcachew(logger=logger, depends_on=lambda: str(date.today())) def _iter_tzs() -> Iterator[DayWithZone]: for d, gr in groupby(_iter_local_dates(), key=lambda p: p.day): logger.info('processed %s', d) @@ -106,6 +134,7 @@ def _get_day_tz(d: date) -> Optional[pytz.BaseTzInfo]: break return None if zone is None else pytz.timezone(zone) + # ok to cache, there are only a few home locations? @lru_cache(maxsize=None) def _get_home_tz(loc) -> Optional[pytz.BaseTzInfo]: @@ -119,8 +148,10 @@ def _get_home_tz(loc) -> Optional[pytz.BaseTzInfo]: return pytz.timezone(zone) -# TODO expose? to main as well? 
def _get_tz(dt: datetime) -> Optional[pytz.BaseTzInfo]: + ''' + Given a datetime, returns the timezone for that date. + ''' res = _get_day_tz(d=dt.date()) if res is not None: return res @@ -129,6 +160,9 @@ def _get_tz(dt: datetime) -> Optional[pytz.BaseTzInfo]: loc = home.get_location(dt) return _get_home_tz(loc=loc) +# expose as 'public' function +get_tz = _get_tz + def localize(dt: datetime) -> tzdatetime: tz = _get_tz(dt) diff --git a/tox.ini b/tox.ini index b8c89db..52bfdfb 100644 --- a/tox.ini +++ b/tox.ini @@ -100,6 +100,9 @@ commands = hpi module install my.goodreads hpi module install my.pdfs hpi module install my.smscalls + hpi module install my.location.gpslogger + hpi module install my.location.via_ip + hpi module install my.google.takeout.parser # todo fuck. -p my.github isn't checking the subpackages?? wtf... # guess it wants .pyi file?? @@ -118,6 +121,10 @@ commands = -p my.body.exercise.cross_trainer \ -p my.bluemaestro \ -p my.location.google \ + -p my.location.google_takeout \ + -p my.location.via_ip \ + -p my.location.gpslogger \ + -p my.ip.common \ -p my.time.tz.via_location \ -p my.calendar.holidays \ -p my.arbtt \