diff --git a/doc/MODULES.org b/doc/MODULES.org index e4bcdad..a6dcd9d 100644 --- a/doc/MODULES.org +++ b/doc/MODULES.org @@ -16,9 +16,12 @@ If you have some issues with the setup, see [[file:SETUP.org::#troubleshooting][ - [[#toc][TOC]] - [[#intro][Intro]] - [[#configs][Configs]] - - [[#mygoogletakeoutpaths][my.google.takeout.paths]] + - [[#mygoogletakeoutparser][my.google.takeout.parser]] - [[#myhypothesis][my.hypothesis]] - [[#myreddit][my.reddit]] + - [[#mybrowser][my.browser]] + - [[#mylocation][my.location]] + - [[#mytimetzvia_location][my.time.tz.via_location]] - [[#mypocket][my.pocket]] - [[#mytwittertwint][my.twitter.twint]] - [[#mytwitterarchive][my.twitter.archive]] @@ -90,12 +93,12 @@ For an extensive/complex example, you can check out ~@seanbreckenridge~'s [[http export_path: Paths #+end_src + ** [[file:../my/browser/][my.browser]] Parses browser history using [[http://github.com/seanbreckenridge/browserexport][browserexport]] #+begin_src python - @dataclass class browser: class export: # path[s]/glob to your backed up browser history sqlite files @@ -108,6 +111,80 @@ For an extensive/complex example, you can check out ~@seanbreckenridge~'s [[http # active_databases = Firefox.locate_database() export_path: Paths #+end_src +** [[file:../my/location][my.location]] + + Merged location history from lots of sources. + + The main sources here are + [[https://github.com/mendhak/gpslogger][gpslogger]] .gpx (XML) files, and + google takeout (using =my.google.takeout.parser=), with a fallback on + manually defined home locations. + + You might also be able to use [[file:../my/location/via_ip.py][my.location.via_ip]] which uses =my.ip.all= to + provide geolocation data for IPs (though no IPs are provided from any + of the sources here).
For an example of usage, see [[https://github.com/seanbreckenridge/HPI/tree/master/my/ip][here]] + + #+begin_src python + class location: + home = ( + # supports ISO strings + ('2005-12-04' , (42.697842, 23.325973)), # Bulgaria, Sofia + # supports date/datetime objects + (date(year=1980, month=2, day=15) , (40.7128 , -74.0060 )), # NY + (datetime.fromtimestamp(1600000000, tz=timezone.utc), (55.7558 , 37.6173 )), # Moscow, Russia + ) + # note: order doesn't matter, will be sorted in the data provider + + class gpslogger: + # path[s]/glob to the exported gpx files + export_path: Paths + + # default accuracy for gpslogger + accuracy: float = 50.0 + + class via_ip: + # guess ~15km accuracy for IP addresses + accuracy: float = 15_000 + #+end_src +** [[file:../my/time/tz/via_location.py][my.time.tz.via_location]] + + Uses the =my.location= module to determine the timezone for a location. + + This can be used to 'localize' timezones. Most modules here return + datetimes in UTC, to prevent confusion whether or not it's a local + timezone, one from UTC, or one in your timezone. + + Depending on the specific data provider and your level of paranoia you might expect different behaviour..
E.g.: + - if your objects already have tz info, you might not need to call localize() at all + - it's safer when either all of your objects are tz aware or all are tz unaware, not a mixture + - you might trust your original timezone, or it might just be UTC, and you want to use something more reasonable + + #+begin_src python + TzPolicy = Literal[ + 'keep' , # if datetime is tz aware, just preserve it + 'convert', # if datetime is tz aware, convert to provider's tz + 'throw' , # if datetime is tz aware, throw exception + ] + #+end_src + + This is still a work in progress, plan is to integrate it with =hpi query= + so that you can easily convert/localize timezones for some module/data + + #+begin_src python + class time: + class tz: + policy = 'keep' + + class via_location: + # less precise, but faster + fast: bool = True + + # if the accuracy for the location is more than 5km (this + # isn't an accurate location, so shouldn't use it to determine + # timezone), don't use + require_accuracy: float = 5_000 + #+end_src + # TODO hmm. drawer raw means it can output outlines, but then have to manually erase the generated results. ugh. @@ -163,7 +240,6 @@ for cls, p in modules: #+RESULTS: - ** [[file:../my/google/takeout/parser.py][my.google.takeout.parser]] Parses Google Takeout using [[https://github.com/seanbreckenridge/google_takeout_parser][google_takeout_parser]] diff --git a/my/config.py b/my/config.py index 5bb316f..0746803 100644 --- a/my/config.py +++ b/my/config.py @@ -72,10 +72,19 @@ class location: # and we can't import the types from the module itself, otherwise would be circular. common module?
home: Union[LatLon, Sequence[Tuple[DateIsh, LatLon]]] = (1.0, -1.0) + class via_ip: + accuracy: float + + class gpslogger: + export_path: Paths = '' + accuracy: float + class time: class tz: - pass + class via_location: + fast: bool + require_accuracy: float class orgmode: diff --git a/my/ip/all.py b/my/ip/all.py new file mode 100644 index 0000000..b21b543 --- /dev/null +++ b/my/ip/all.py @@ -0,0 +1,29 @@ +""" +An example all.py stub module that provides ip data + +To use this, you'd add IP providers that yield IPs to the 'ips' function + +For an example of how this could be used, see https://github.com/seanbreckenridge/HPI/tree/master/my/ip +""" + +REQUIRES = ["git+https://github.com/seanbreckenridge/ipgeocache"] + + +from typing import Iterator + +from my.core.common import Stats, warn_if_empty + +from .common import IP + + +@warn_if_empty +def ips() -> Iterator[IP]: + yield from () + + +def stats() -> Stats: + from my.core import stat + + return { + **stat(ips), + } diff --git a/my/ip/common.py b/my/ip/common.py new file mode 100644 index 0000000..82008e2 --- /dev/null +++ b/my/ip/common.py @@ -0,0 +1,39 @@ +""" +Provides location/timezone data from IP addresses, using [[https://github.com/seanbreckenridge/ipgeocache][ipgeocache]] +""" + +REQUIRES = ["git+https://github.com/seanbreckenridge/ipgeocache"] + +from my.core import __NOT_HPI_MODULE__ + +import ipaddress +from typing import NamedTuple, Iterator +from datetime import datetime + +import ipgeocache + +from my.core import Json + + +class IP(NamedTuple): + dt: datetime + addr: str # an IP address + + # TODO: could cache? 
not sure if it's worth it + def ipgeocache(self) -> Json: + return ipgeocache.get(self.addr) + + @property + def tzname(self) -> str: + tz: str = self.ipgeocache()["timezone"] + return tz + + +def drop_private(ips: Iterator[IP]) -> Iterator[IP]: + """ + Helper function that can be used to filter out private IPs + """ + for ip in ips: + if ipaddress.ip_address(ip.addr).is_private: + continue + yield ip diff --git a/my/location/all.py b/my/location/all.py new file mode 100644 index 0000000..bd9364e --- /dev/null +++ b/my/location/all.py @@ -0,0 +1,48 @@ +""" +Merges location data from multiple sources +""" + +from typing import Iterator + +from my.core import Stats, LazyLogger +from my.core.source import import_source + +from my.location.via_ip import locations + +from .common import Location + + +logger = LazyLogger(__name__, level="warning") + + +def locations() -> Iterator[Location]: + # can add/comment out sources here to disable them, or use core.disabled_modules + yield from _takeout_locations() + yield from _gpslogger_locations() + yield from _ip_locations() + + +@import_source(module_name="my.location.google_takeout") +def _takeout_locations() -> Iterator[Location]: + from . import google_takeout + yield from google_takeout.locations() + + +@import_source(module_name="my.location.gpslogger") +def _gpslogger_locations() -> Iterator[Location]: + from . import gpslogger + yield from gpslogger.locations() + + +@import_source(module_name="my.location.via_ip") +def _ip_locations() -> Iterator[Location]: + from . 
import via_ip + yield from via_ip.locations() + + +def stats() -> Stats: + from my.core import stat + + return { + **stat(locations), + } diff --git a/my/location/common.py b/my/location/common.py new file mode 100644 index 0000000..b0676ec --- /dev/null +++ b/my/location/common.py @@ -0,0 +1,17 @@ +from datetime import date, datetime +from typing import Union, Tuple, NamedTuple, Optional + +from my.core import __NOT_HPI_MODULE__ + +DateIsh = Union[datetime, date, str] + +LatLon = Tuple[float, float] + + +# TODO: add timezone to this? can use timezonefinder in tz provider instead though +class Location(NamedTuple): + lat: float + lon: float + dt: datetime + accuracy: Optional[float] + elevation: Optional[float] diff --git a/my/location/google.py b/my/location/google.py index f196301..21ba3ed 100644 --- a/my/location/google.py +++ b/my/location/google.py @@ -1,6 +1,9 @@ """ Location data from Google Takeout + +DEPRECATED: setup my.google.takeout.parser and use my.location.google_takeout instead """ + REQUIRES = [ 'geopy', # checking that coordinates are valid 'ijson', @@ -20,6 +23,10 @@ from ..core.common import LazyLogger, mcachew from ..core.cachew import cache_dir from ..core import kompress +from my.core.warnings import high + +high("Please set up my.google.takeout.parser module for better takeout support") + # otherwise uses ijson # todo move to config?? 
diff --git a/my/location/google_takeout.py b/my/location/google_takeout.py new file mode 100644 index 0000000..80b31cb --- /dev/null +++ b/my/location/google_takeout.py @@ -0,0 +1,33 @@ +""" +Extracts locations using google_takeout_parser -- no shared code with the deprecated my.location.google +""" + +REQUIRES = ["git+https://github.com/seanbreckenridge/google_takeout_parser"] + +from typing import Iterator + +from my.google.takeout.parser import events, _cachew_depends_on +from google_takeout_parser.models import Location as GoogleLocation + +from my.core.common import mcachew, LazyLogger, Stats +from .common import Location + +logger = LazyLogger(__name__) + + +@mcachew( + depends_on=_cachew_depends_on, + logger=logger, +) +def locations() -> Iterator[Location]: + for g in events(): + if isinstance(g, GoogleLocation): + yield Location( + lon=g.lng, lat=g.lat, dt=g.dt, accuracy=g.accuracy, elevation=None + ) + + +def stats() -> Stats: + from my.core import stat + + return {**stat(locations)} diff --git a/my/location/gpslogger.py b/my/location/gpslogger.py new file mode 100644 index 0000000..95f4474 --- /dev/null +++ b/my/location/gpslogger.py @@ -0,0 +1,74 @@ +""" +Parse [[https://github.com/mendhak/gpslogger][gpslogger]] .gpx (xml) files +""" + +REQUIRES = ["gpxpy"] + +from my.config import location +from my.core import Paths, dataclass + + +@dataclass +class config(location.gpslogger): + # path[s]/glob to the synced gpx (XML) files + export_path: Paths + + # default accuracy for gpslogger + accuracy: float = 50.0 + + +from itertools import chain +from datetime import datetime, timezone +from pathlib import Path +from typing import Iterator, Sequence, List + +import gpxpy # type: ignore[import] +from more_itertools import unique_everseen + +from my.core import Stats, LazyLogger +from my.core.common import get_files, mcachew +from .common import Location + + +logger = LazyLogger(__name__, level="warning") + + +def inputs() -> Sequence[Path]: + return 
get_files(config.export_path, glob="*.gpx") + + +def _cachew_depends_on() -> List[float]: + return [p.stat().st_mtime for p in inputs()] + + +# TODO: could use a better cachew key/this has to recompute every file whenever the newest one changes +@mcachew(depends_on=_cachew_depends_on, logger=logger) +def locations() -> Iterator[Location]: + yield from unique_everseen( + chain(*map(_extract_locations, inputs())), key=lambda loc: loc.dt + ) + + +def _extract_locations(path: Path) -> Iterator[Location]: + with path.open("r") as gf: + gpx_obj = gpxpy.parse(gf) + for track in gpx_obj.tracks: + for segment in track.segments: + for point in segment.points: + if point.time is None: + continue + # hmm - for gpslogger, seems that timezone is always SimpleTZ('Z'), which + # specifies UTC -- see https://github.com/tkrajina/gpxpy/blob/cb243b22841bd2ce9e603fe3a96672fc75edecf2/gpxpy/gpxfield.py#L38 + yield Location( + lat=point.latitude, + lon=point.longitude, + accuracy=config.accuracy, + elevation=point.elevation, + dt=datetime.replace(point.time, tzinfo=timezone.utc), + ) + + +def stats() -> Stats: + from my.core import stat + + return {**stat(locations)} diff --git a/my/location/home.py b/my/location/home.py index dd7209f..ac0fcb8 100644 --- a/my/location/home.py +++ b/my/location/home.py @@ -2,17 +2,13 @@ Simple location provider, serving as a fallback when more detailed data isn't available ''' from dataclasses import dataclass -from datetime import datetime, date, time, timezone +from datetime import datetime, time, timezone from functools import lru_cache from typing import Sequence, Tuple, Union, cast from my.config import location as user_config - -DateIsh = Union[datetime, date, str] - -# todo hopefully reasonable? 
might be nice to add name or something too -LatLon = Tuple[float, float] +from my.location.common import LatLon, DateIsh @dataclass class Config(user_config): diff --git a/my/location/via_ip.py b/my/location/via_ip.py new file mode 100644 index 0000000..e882cdb --- /dev/null +++ b/my/location/via_ip.py @@ -0,0 +1,39 @@ +""" +Converts IP addresses provided by my.location.ip to estimated locations +""" + +REQUIRES = ["git+https://github.com/seanbreckenridge/ipgeocache"] + +from my.core import dataclass, Stats +from my.config import location + + +@dataclass +class config(location.via_ip): + # no real science to this, just a guess of ~15km accuracy for IP addresses + accuracy: float = 15_000.0 + + +from typing import Iterator + +from .common import Location +from my.ip.all import ips + + +def locations() -> Iterator[Location]: + for ip in ips(): + loc: str = ip.ipgeocache()["loc"] + lat, _, lon = loc.partition(",") + yield Location( + lat=float(lat), + lon=float(lon), + dt=ip.dt, + accuracy=config.accuracy, + elevation=None, + ) + + +def stats() -> Stats: + from my.core import stat + + return {**stat(locations)} diff --git a/my/time/tz/common.py b/my/time/tz/common.py index b6ebbe5..e2c428d 100644 --- a/my/time/tz/common.py +++ b/my/time/tz/common.py @@ -10,24 +10,27 @@ Depending on the specific data provider and your level of paranoia you might exp - it's safer when either all of your objects are tz aware or all are tz unware, not a mixture - you might trust your original timezone, or it might just be UTC, and you want to use something more reasonable ''' -Policy = Literal[ +TzPolicy = Literal[ 'keep' , # if datetime is tz aware, just preserve it 'convert', # if datetime is tz aware, convert to provider's tz 'throw' , # if datetime is tz aware, throw exception # todo 'warn'? 
not sure if very useful ] -def default_policy() -> Policy: +# backwards compatibility +Policy = TzPolicy + +def default_policy() -> TzPolicy: try: from my.config import time as user_config - return cast(Policy, user_config.tz.policy) + return cast(TzPolicy, user_config.tz.policy) except Exception as e: # todo meh.. need to think how to do this more carefully # rationale: do not mess with user's data unless they want return 'keep' -def localize_with_policy(lfun: Callable[[datetime], tzdatetime], dt: datetime, policy: Policy=default_policy()) -> tzdatetime: +def localize_with_policy(lfun: Callable[[datetime], tzdatetime], dt: datetime, policy: TzPolicy=default_policy()) -> tzdatetime: tz = dt.tzinfo if tz is None: return lfun(dt) diff --git a/my/time/tz/via_location.py b/my/time/tz/via_location.py index e390c43..0e91193 100644 --- a/my/time/tz/via_location.py +++ b/my/time/tz/via_location.py @@ -7,27 +7,34 @@ REQUIRES = [ ] +from my.config import time +from my.core import dataclass + + +@dataclass +class config(time.tz.via_location): + # less precise, but faster + fast: bool = True + + # if the accuracy for the location is more than 5km, don't use + require_accuracy: float = 5_000 + + from collections import Counter from datetime import date, datetime from functools import lru_cache from itertools import groupby -from typing import Iterator, NamedTuple, Optional +from typing import Iterator, NamedTuple, Optional, Tuple, Any, List from more_itertools import seekable import pytz -from ...core.common import LazyLogger, mcachew, tzdatetime -from ...core.cachew import cache_dir -from ...location.google import locations +from my.core.common import LazyLogger, mcachew, tzdatetime +logger = LazyLogger(__name__, level='warning') -logger = LazyLogger(__name__, level='debug') - - -# todo should move to config? 
not sure -_FASTER: bool = True @lru_cache(2) -def _timezone_finder(fast: bool): +def _timezone_finder(fast: bool) -> Any: if fast: # less precise, but faster from timezonefinder import TimezoneFinderL as Finder # type: ignore @@ -46,39 +53,89 @@ class DayWithZone(NamedTuple): zone: Zone -def _iter_local_dates(start=0, stop=None) -> Iterator[DayWithZone]: - finder = _timezone_finder(fast=_FASTER) # rely on the default - pdt = None +from my.location.common import LatLon + +# for backwards compatibility +def _locations() -> Iterator[Tuple[LatLon, datetime]]: + try: + import my.location.all + for loc in my.location.all.locations(): + if loc.accuracy is not None and loc.accuracy > config.require_accuracy: + continue + yield ((loc.lat, loc.lon), loc.dt) + + except Exception as e: + from my.core.warnings import high + logger.exception("Could not setup via_location using my.location.all provider, falling back to legacy google implemetation", exc_info=e) + high("Setup my.google.takeout.parser, then my.location.all for better google takeout/location data") + + import my.location.google + + for gloc in my.location.google.locations(): + yield ((gloc.lat, gloc.lon), gloc.dt) + +# TODO: could use heapmerge or sort the underlying iterators somehow? +# see https://github.com/karlicoss/HPI/pull/237#discussion_r858372934 +def _sorted_locations() -> List[Tuple[LatLon, datetime]]: + return list(sorted(_locations(), key=lambda x: x[1])) + + +# Note: this takes a while, as the upstream since _locations isn't sorted, so this +# has to do an iterative sort of the entire my.locations.all list +def _iter_local_dates() -> Iterator[DayWithZone]: + finder = _timezone_finder(fast=config.fast) # rely on the default + #pdt = None + # TODO: warnings doesnt actually warn? warnings = [] # todo allow to skip if not noo many errors in row? - for l in locations(start=start, stop=stop): + for (lat, lon), dt in _sorted_locations(): # TODO right. its _very_ slow... 
- zone = finder.timezone_at(lng=l.lon, lat=l.lat) + zone = finder.timezone_at(lat=lat, lng=lon) if zone is None: - warnings.append(f"Couldn't figure out tz for {l}") + warnings.append(f"Couldn't figure out tz for {lat}, {lon}") continue tz = pytz.timezone(zone) # TODO this is probably a bit expensive... test & benchmark - ldt = l.dt.astimezone(tz) + ldt = dt.astimezone(tz) ndate = ldt.date() - if pdt is not None and ndate < pdt.date(): - # TODO for now just drop and collect the stats - # I guess we'd have minor drops while air travel... - warnings.append("local time goes backwards {ldt} ({tz}) < {pdt}") - continue - pdt = ldt + #if pdt is not None and ndate < pdt.date(): + # # TODO for now just drop and collect the stats + # # I guess we'd have minor drops while air travel... + # warnings.append("local time goes backwards {ldt} ({tz}) < {pdt}") + # continue + #pdt = ldt z = tz.zone; assert z is not None yield DayWithZone(day=ndate, zone=z) -def most_common(l): - res, count = Counter(l).most_common(1)[0] # type: ignore[var-annotated] +def most_common(lst: List[DayWithZone]) -> DayWithZone: + res, _ = Counter(lst).most_common(1)[0] # type: ignore[var-annotated] return res -@mcachew(cache_path=cache_dir()) +def _iter_tz_depends_on() -> str: + """ + Since you might get new data which specifies a new timezone sometime + in the day, this causes _iter_tzs to refresh every 6 hours, like: + 2022-04-26_00 + 2022-04-26_06 + 2022-04-26_12 + 2022-04-26_18 + """ + day = str(date.today()) + hr = datetime.now().hour + hr_truncated = hr // 6 * 6 + return "{}_{}".format(day, hr_truncated) + + +# refresh _iter_tzs every 6 hours -- don't think a better depends_on is possible dynamically +@mcachew(logger=logger, depends_on=_iter_tz_depends_on) def _iter_tzs() -> Iterator[DayWithZone]: - for d, gr in groupby(_iter_local_dates(), key=lambda p: p.day): + # since we have no control over what order the locations are returned, + # we need to sort them first before we can do a groupby + 
local_dates: List[DayWithZone] = list(_iter_local_dates()) + local_dates.sort(key=lambda p: p.day) + for d, gr in groupby(local_dates, key=lambda p: p.day): logger.info('processed %s', d) zone = most_common(list(gr)).zone yield DayWithZone(day=d, zone=zone) @@ -106,6 +163,7 @@ def _get_day_tz(d: date) -> Optional[pytz.BaseTzInfo]: break return None if zone is None else pytz.timezone(zone) + # ok to cache, there are only a few home locations? @lru_cache(maxsize=None) def _get_home_tz(loc) -> Optional[pytz.BaseTzInfo]: @@ -119,8 +177,10 @@ def _get_home_tz(loc) -> Optional[pytz.BaseTzInfo]: return pytz.timezone(zone) -# TODO expose? to main as well? def _get_tz(dt: datetime) -> Optional[pytz.BaseTzInfo]: + ''' + Given a datetime, returns the timezone for that date. + ''' res = _get_day_tz(d=dt.date()) if res is not None: return res @@ -129,6 +189,9 @@ def _get_tz(dt: datetime) -> Optional[pytz.BaseTzInfo]: loc = home.get_location(dt) return _get_home_tz(loc=loc) +# expose as 'public' function +get_tz = _get_tz + def localize(dt: datetime) -> tzdatetime: tz = _get_tz(dt) @@ -144,11 +207,13 @@ def stats() -> Stats: # TODO not sure what would be a good stat() for this module... # might be nice to print some actual timezones? # there aren't really any great iterables to expose + import os + VIA_LOCATION_START_YEAR = int(os.environ.get("VIA_LOCATION_START_YEAR", 1990)) def localized_years(): last = datetime.now().year + 2 # note: deliberately take + 2 years, so the iterator exhausts. otherwise stuff might never get cached # need to think about it... 
- for Y in range(1990, last): + for Y in range(VIA_LOCATION_START_YEAR, last): dt = datetime.fromisoformat(f'{Y}-01-01 01:01:01') yield localize(dt) return stat(localized_years) diff --git a/tests/tz.py b/tests/tz.py index cb8c513..0ea2b40 100644 --- a/tests/tz.py +++ b/tests/tz.py @@ -1,6 +1,5 @@ from datetime import datetime, timedelta, date, timezone from pathlib import Path -import sys import pytest # type: ignore import pytz # type: ignore @@ -80,7 +79,7 @@ def prepare(tmp_path: Path): from .common import reset_modules reset_modules() - LTZ._FASTER = True + LTZ.config.fast = True from .location import _prepare_google_config google = _prepare_google_config(tmp_path) @@ -98,7 +97,8 @@ def prepare(tmp_path: Path): class time: class tz: - pass # just rely on the default.. + class via_location: + pass # just rely on the defaults... import my.core.cfg as C with C.tmp_config() as config: diff --git a/tox.ini b/tox.ini index b8c89db..52bfdfb 100644 --- a/tox.ini +++ b/tox.ini @@ -100,6 +100,9 @@ commands = hpi module install my.goodreads hpi module install my.pdfs hpi module install my.smscalls + hpi module install my.location.gpslogger + hpi module install my.location.via_ip + hpi module install my.google.takeout.parser # todo fuck. -p my.github isn't checking the subpackages?? wtf... # guess it wants .pyi file?? @@ -118,6 +121,10 @@ commands = -p my.body.exercise.cross_trainer \ -p my.bluemaestro \ -p my.location.google \ + -p my.location.google_takeout \ + -p my.location.via_ip \ + -p my.location.gpslogger \ + -p my.ip.common \ -p my.time.tz.via_location \ -p my.calendar.holidays \ -p my.arbtt \