location: add all.py, using takeout/gpslogger/ip (#237)

* location: add all.py, using takeout/gpslogger/ip, update docs
This commit is contained in:
seanbreckenridge 2022-04-26 13:11:35 -07:00 committed by GitHub
parent 66a00c6ada
commit 2cb836181b
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
15 changed files with 488 additions and 46 deletions

View file

@ -16,9 +16,12 @@ If you have some issues with the setup, see [[file:SETUP.org::#troubleshooting][
- [[#toc][TOC]]
- [[#intro][Intro]]
- [[#configs][Configs]]
- [[#mygoogletakeoutpaths][my.google.takeout.paths]]
- [[#mygoogletakeoutparser][my.google.takeout.parser]]
- [[#myhypothesis][my.hypothesis]]
- [[#myreddit][my.reddit]]
- [[#mybrowser][my.browser]]
- [[#mylocation][my.location]]
- [[#mytimetzvia_location][my.time.tz.via_location]]
- [[#mypocket][my.pocket]]
- [[#mytwittertwint][my.twitter.twint]]
- [[#mytwitterarchive][my.twitter.archive]]
@ -90,12 +93,12 @@ For an extensive/complex example, you can check out ~@seanbreckenridge~'s [[http
export_path: Paths
#+end_src
** [[file:../my/browser/][my.browser]]
Parses browser history using [[http://github.com/seanbreckenridge/browserexport][browserexport]]
#+begin_src python
@dataclass
class browser:
class export:
# path[s]/glob to your backed up browser history sqlite files
@ -108,6 +111,80 @@ For an extensive/complex example, you can check out ~@seanbreckenridge~'s [[http
# active_databases = Firefox.locate_database()
export_path: Paths
#+end_src
** [[file:../my/location][my.location]]
Merged location history from lots of sources.
The main sources here are
[[https://github.com/mendhak/gpslogger][gpslogger]] .gpx (XML) files, and
google takeout (using =my.google.takeout.parser=), with a fallback on
manually defined home locations.
You might also be able to use [[file:../my/location/via_ip.py][my.location.via_ip]] which uses =my.ip.all= to
provide geolocation data for IPs (though no IPs are provided by any
of the sources here). For an example of usage, see [[https://github.com/seanbreckenridge/HPI/tree/master/my/ip][here]]
#+begin_src python
class location:
home = (
# supports ISO strings
('2005-12-04' , (42.697842, 23.325973)), # Bulgaria, Sofia
# supports date/datetime objects
(date(year=1980, month=2, day=15) , (40.7128 , -74.0060 )), # NY
(datetime.fromtimestamp(1600000000, tz=timezone.utc), (55.7558 , 37.6173 )), # Moscow, Russia
)
# note: order doesn't matter, will be sorted in the data provider
class gpslogger:
# path[s]/glob to the exported gpx files
export_path: Paths
# default accuracy for gpslogger
accuracy: float = 50.0
class via_ip:
# guess ~15km accuracy for IP addresses
accuracy: float = 15_000
#+end_src
** [[file:../my/time/tz/via_location.py][my.time.tz.via_location]]
Uses the =my.location= module to determine the timezone for a location.
This can be used to 'localize' timezones. Most modules here return
datetimes in UTC, to prevent confusion whether or not it's a local
timezone, one from UTC, or one in your timezone.
Depending on the specific data provider and your level of paranoia you might expect different behaviour.. E.g.:
- if your objects already have tz info, you might not need to call localize() at all
- it's safer when either all of your objects are tz aware or all are tz unaware, not a mixture
- you might trust your original timezone, or it might just be UTC, and you want to use something more reasonable
#+begin_src python
TzPolicy = Literal[
'keep' , # if datetime is tz aware, just preserve it
'convert', # if datetime is tz aware, convert to provider's tz
'throw' , # if datetime is tz aware, throw exception
]
#+end_src
This is still a work in progress, plan is to integrate it with =hpi query=
so that you can easily convert/localize timezones for some module/data
#+begin_src python
class time:
class tz:
policy = 'keep'
class via_location:
# less precise, but faster
fast: bool = True
# if the accuracy for the location is more than 5km (this
# isn't an accurate location, so shouldn't use it to determine
# timezone), don't use
require_accuracy: float = 5_000
#+end_src
# TODO hmm. drawer raw means it can output outlines, but then have to manually erase the generated results. ugh.
@ -163,7 +240,6 @@ for cls, p in modules:
#+RESULTS:
** [[file:../my/google/takeout/parser.py][my.google.takeout.parser]]
Parses Google Takeout using [[https://github.com/seanbreckenridge/google_takeout_parser][google_takeout_parser]]

View file

@ -72,10 +72,19 @@ class location:
# and we can't import the types from the module itself, otherwise would be circular. common module?
home: Union[LatLon, Sequence[Tuple[DateIsh, LatLon]]] = (1.0, -1.0)
class via_ip:
accuracy: float
class gpslogger:
export_path: Paths = ''
accuracy: float
class time:
class tz:
pass
class via_location:
fast: bool
require_accuracy: float
class orgmode:

29
my/ip/all.py Normal file
View file

@ -0,0 +1,29 @@
"""
An example all.py stub module that provides ip data
To use this, you'd add IP providers that yield IPs to the 'ips' function
For an example of how this could be used, see https://github.com/seanbreckenridge/HPI/tree/master/my/ip
"""
REQUIRES = ["git+https://github.com/seanbreckenridge/ipgeocache"]
from typing import Iterator
from my.core.common import Stats, warn_if_empty
from .common import IP
@warn_if_empty
def ips() -> Iterator[IP]:
    """
    Stub provider: yields no IPs.

    To use this, add your own sources (anything that yields IP objects) here.
    """
    yield from ()
def stats() -> Stats:
    """Return whatever my.core.stat computes for the ips() provider."""
    from my.core import stat

    ip_stats: Stats = {}
    ip_stats.update(stat(ips))
    return ip_stats

39
my/ip/common.py Normal file
View file

@ -0,0 +1,39 @@
"""
Provides location/timezone data from IP addresses, using [[https://github.com/seanbreckenridge/ipgeocache][ipgeocache]]
"""
REQUIRES = ["git+https://github.com/seanbreckenridge/ipgeocache"]
from my.core import __NOT_HPI_MODULE__
import ipaddress
from typing import NamedTuple, Iterator
from datetime import datetime
import ipgeocache
from my.core import Json
class IP(NamedTuple):
    """An IP address paired with a datetime."""

    dt: datetime
    addr: str  # an IP address

    # TODO: could cache? not sure if it's worth it
    def ipgeocache(self) -> Json:
        """Return the result of ipgeocache.get for this address."""
        # inside the method body the name 'ipgeocache' resolves to the imported
        # module (class attributes are not part of the function's scope)
        info: Json = ipgeocache.get(self.addr)
        return info

    @property
    def tzname(self) -> str:
        """The "timezone" entry of the ipgeocache lookup for this address."""
        zone_name: str = self.ipgeocache()["timezone"]
        return zone_name
def drop_private(ips: Iterator[IP]) -> Iterator[IP]:
    """
    Helper function that lazily filters IPs, dropping every item whose
    address is private (according to ipaddress's is_private)
    """
    yield from (
        ip for ip in ips if not ipaddress.ip_address(ip.addr).is_private
    )

48
my/location/all.py Normal file
View file

@ -0,0 +1,48 @@
"""
Merges location data from multiple sources
"""
from typing import Iterator
from my.core import Stats, LazyLogger
from my.core.source import import_source
from my.location.via_ip import locations
from .common import Location
logger = LazyLogger(__name__, level="warning")
def locations() -> Iterator[Location]:
    """Yield locations from each source in turn: takeout, gpslogger, then IPs."""
    # can add/comment out sources here to disable them, or use core.disabled_modules
    sources = (
        _takeout_locations,
        _gpslogger_locations,
        _ip_locations,
    )
    for source in sources:
        yield from source()
@import_source(module_name="my.location.google_takeout")
def _takeout_locations() -> Iterator[Location]:
    # the import lives in the body, so it only runs once this generator is advanced
    from .google_takeout import locations as takeout_locations

    yield from takeout_locations()
@import_source(module_name="my.location.gpslogger")
def _gpslogger_locations() -> Iterator[Location]:
    # the import lives in the body, so it only runs once this generator is advanced
    from .gpslogger import locations as gpslogger_locations

    yield from gpslogger_locations()
@import_source(module_name="my.location.via_ip")
def _ip_locations() -> Iterator[Location]:
    # the import lives in the body, so it only runs once this generator is advanced
    from .via_ip import locations as ip_locations

    yield from ip_locations()
def stats() -> Stats:
    """Return whatever my.core.stat computes for the merged locations()."""
    from my.core import stat

    return dict(stat(locations))

17
my/location/common.py Normal file
View file

@ -0,0 +1,17 @@
from datetime import date, datetime
from typing import Union, Tuple, NamedTuple, Optional
from my.core import __NOT_HPI_MODULE__
DateIsh = Union[datetime, date, str]
LatLon = Tuple[float, float]
# TODO: add timezone to this? can use timezonefinder in tz provider instead though
class Location(NamedTuple):
    """A single location record, shared by all location providers."""

    # coordinates
    lat: float
    lon: float
    # timestamp of the record
    dt: datetime
    # both may be None when a provider has no value for them
    accuracy: Optional[float]
    elevation: Optional[float]

View file

@ -1,6 +1,9 @@
"""
Location data from Google Takeout
DEPRECATED: setup my.google.takeout.parser and use my.location.google_takeout instead
"""
REQUIRES = [
'geopy', # checking that coordinates are valid
'ijson',
@ -20,6 +23,10 @@ from ..core.common import LazyLogger, mcachew
from ..core.cachew import cache_dir
from ..core import kompress
from my.core.warnings import high
high("Please set up my.google.takeout.parser module for better takeout support")
# otherwise uses ijson
# todo move to config??

View file

@ -0,0 +1,33 @@
"""
Extracts locations using google_takeout_parser -- no shared code with the deprecated my.location.google
"""
REQUIRES = ["git+https://github.com/seanbreckenridge/google_takeout_parser"]
from typing import Iterator
from my.google.takeout.parser import events, _cachew_depends_on
from google_takeout_parser.models import Location as GoogleLocation
from my.core.common import mcachew, LazyLogger, Stats
from .common import Location
logger = LazyLogger(__name__)
@mcachew(
    depends_on=_cachew_depends_on,
    logger=logger,
)
def locations() -> Iterator[Location]:
    """Yield a Location for every google_takeout_parser Location event."""
    for event in events():
        # events() also yields other kinds of takeout events, skip those
        if not isinstance(event, GoogleLocation):
            continue
        yield Location(
            lat=event.lat,
            lon=event.lng,
            dt=event.dt,
            accuracy=event.accuracy,
            elevation=None,
        )
def stats() -> Stats:
    """Return whatever my.core.stat computes for locations()."""
    from my.core import stat

    location_stats = stat(locations)
    return {**location_stats}

74
my/location/gpslogger.py Normal file
View file

@ -0,0 +1,74 @@
"""
Parse [[https://github.com/mendhak/gpslogger][gpslogger]] .gpx (xml) files
"""
REQUIRES = ["gpxpy"]
from my.config import location
from my.core import Paths, dataclass
@dataclass
class config(location.gpslogger):
    # where the synced gpx (XML) files live: path[s] or a glob
    export_path: Paths

    # accuracy assigned to every gpslogger point, unless overridden by the user
    accuracy: float = 50.0
from itertools import chain
from datetime import datetime, timezone
from pathlib import Path
from typing import Iterator, Sequence, List
import gpxpy # type: ignore[import]
from more_itertools import unique_everseen
from my.core import Stats, LazyLogger
from my.core.common import get_files, mcachew
from .common import Location
logger = LazyLogger(__name__, level="warning")
def inputs() -> Sequence[Path]:
    """Return the .gpx files found under config.export_path."""
    gpx_files = get_files(config.export_path, glob="*.gpx")
    return gpx_files
def _cachew_depends_on() -> List[float]:
    """Cache key: the modification time of every input file."""
    mtimes: List[float] = []
    for gpx_file in inputs():
        mtimes.append(gpx_file.stat().st_mtime)
    return mtimes
# TODO: could use a better cachew key/this has to recompute every file whenever the newest one changes
@mcachew(depends_on=_cachew_depends_on, logger=logger)
def locations() -> Iterator[Location]:
    """Yield the locations from all input files, one per distinct timestamp."""
    per_file = map(_extract_locations, inputs())
    merged = chain.from_iterable(per_file)
    # drop any point whose timestamp was already seen
    yield from unique_everseen(merged, key=lambda loc: loc.dt)
def _extract_locations(path: Path) -> Iterator[Location]:
    """
    Parse a single .gpx file and yield a Location for each track point
    that has a timestamp.

    Points without a timestamp are skipped. Accuracy is not read from the
    file; config.accuracy is used for every point. The yielded datetimes
    always carry tzinfo=timezone.utc.
    """
    # parse inside the 'with', iterate outside it, so the file handle isn't
    # kept open while this generator is suspended between yields
    with path.open("r") as gf:
        gpx_obj = gpxpy.parse(gf)

    for track in gpx_obj.tracks:
        for segment in track.segments:
            for point in segment.points:
                when = point.time
                if when is None:
                    continue
                # hmm - for gpslogger, seems that timezone is always SimpleTZ('Z'), which
                # specifies UTC -- see https://github.com/tkrajina/gpxpy/blob/cb243b22841bd2ce9e603fe3a96672fc75edecf2/gpxpy/gpxfield.py#L38
                if when.utcoffset() is None:
                    # naive timestamp: treat it as UTC
                    utc_dt = when.replace(tzinfo=timezone.utc)
                else:
                    # aware timestamp: convert instead of relabelling, so a
                    # non-zero offset still ends up at the correct instant
                    # (for 'Z' timestamps this is the same value as before)
                    utc_dt = when.astimezone(timezone.utc)
                yield Location(
                    lat=point.latitude,
                    lon=point.longitude,
                    accuracy=config.accuracy,
                    elevation=point.elevation,
                    dt=utc_dt,
                )
def stats() -> Stats:
    """Return whatever my.core.stat computes for locations()."""
    from my.core import stat

    gpx_stats: Stats = {}
    gpx_stats.update(stat(locations))
    return gpx_stats

View file

@ -2,17 +2,13 @@
Simple location provider, serving as a fallback when more detailed data isn't available
'''
from dataclasses import dataclass
from datetime import datetime, date, time, timezone
from datetime import datetime, time, timezone
from functools import lru_cache
from typing import Sequence, Tuple, Union, cast
from my.config import location as user_config
DateIsh = Union[datetime, date, str]
# todo hopefully reasonable? might be nice to add name or something too
LatLon = Tuple[float, float]
from my.location.common import LatLon, DateIsh
@dataclass
class Config(user_config):

39
my/location/via_ip.py Normal file
View file

@ -0,0 +1,39 @@
"""
Converts IP addresses provided by my.ip.all to estimated locations
"""
REQUIRES = ["git+https://github.com/seanbreckenridge/ipgeocache"]
from my.core import dataclass, Stats
from my.config import location
@dataclass
class config(location.via_ip):
    # a rough guess rather than a measured value: ~15km accuracy for IP addresses
    accuracy: float = 15_000.0
from typing import Iterator
from .common import Location
from my.ip.all import ips
def locations() -> Iterator[Location]:
    """Yield an estimated Location for every IP returned by ips()."""
    for ip in ips():
        # "loc" is parsed as "<lat>,<lon>"
        coords: str = ip.ipgeocache()["loc"]
        raw_lat, _sep, raw_lon = coords.partition(",")
        yield Location(
            lat=float(raw_lat),
            lon=float(raw_lon),
            dt=ip.dt,
            accuracy=config.accuracy,
            elevation=None,
        )
def stats() -> Stats:
    """Return whatever my.core.stat computes for locations()."""
    from my.core import stat

    return dict(stat(locations))

View file

@ -10,24 +10,27 @@ Depending on the specific data provider and your level of paranoia you might exp
- it's safer when either all of your objects are tz aware or all are tz unaware, not a mixture
- you might trust your original timezone, or it might just be UTC, and you want to use something more reasonable
'''
Policy = Literal[
TzPolicy = Literal[
'keep' , # if datetime is tz aware, just preserve it
'convert', # if datetime is tz aware, convert to provider's tz
'throw' , # if datetime is tz aware, throw exception
# todo 'warn'? not sure if very useful
]
def default_policy() -> Policy:
# backwards compatibility
Policy = TzPolicy
def default_policy() -> TzPolicy:
try:
from my.config import time as user_config
return cast(Policy, user_config.tz.policy)
return cast(TzPolicy, user_config.tz.policy)
except Exception as e:
# todo meh.. need to think how to do this more carefully
# rationale: do not mess with user's data unless they want
return 'keep'
def localize_with_policy(lfun: Callable[[datetime], tzdatetime], dt: datetime, policy: Policy=default_policy()) -> tzdatetime:
def localize_with_policy(lfun: Callable[[datetime], tzdatetime], dt: datetime, policy: TzPolicy=default_policy()) -> tzdatetime:
tz = dt.tzinfo
if tz is None:
return lfun(dt)

View file

@ -7,27 +7,34 @@ REQUIRES = [
]
from my.config import time
from my.core import dataclass
@dataclass
class config(time.tz.via_location):
    # trade precision for speed when looking up timezones
    fast: bool = True

    # locations whose accuracy value is above this (5km) are not used
    require_accuracy: float = 5_000
from collections import Counter
from datetime import date, datetime
from functools import lru_cache
from itertools import groupby
from typing import Iterator, NamedTuple, Optional
from typing import Iterator, NamedTuple, Optional, Tuple, Any, List
from more_itertools import seekable
import pytz
from ...core.common import LazyLogger, mcachew, tzdatetime
from ...core.cachew import cache_dir
from ...location.google import locations
from my.core.common import LazyLogger, mcachew, tzdatetime
logger = LazyLogger(__name__, level='warning')
logger = LazyLogger(__name__, level='debug')
# todo should move to config? not sure
_FASTER: bool = True
@lru_cache(2)
def _timezone_finder(fast: bool):
def _timezone_finder(fast: bool) -> Any:
if fast:
# less precise, but faster
from timezonefinder import TimezoneFinderL as Finder # type: ignore
@ -46,39 +53,89 @@ class DayWithZone(NamedTuple):
zone: Zone
def _iter_local_dates(start=0, stop=None) -> Iterator[DayWithZone]:
finder = _timezone_finder(fast=_FASTER) # rely on the default
pdt = None
from my.location.common import LatLon
# for backwards compatibility
def _locations() -> Iterator[Tuple[LatLon, datetime]]:
    """
    Yield ((lat, lon), dt) pairs from my.location.all, skipping locations
    whose accuracy is above config.require_accuracy.

    If anything raises along the way, log/warn and yield everything from the
    legacy my.location.google provider instead.
    """
    try:
        import my.location.all

        for loc in my.location.all.locations():
            too_coarse = loc.accuracy is not None and loc.accuracy > config.require_accuracy
            if not too_coarse:
                yield ((loc.lat, loc.lon), loc.dt)
    except Exception as e:
        from my.core.warnings import high

        logger.exception("Could not setup via_location using my.location.all provider, falling back to legacy google implemetation", exc_info=e)
        high("Setup my.google.takeout.parser, then my.location.all for better google takeout/location data")

        from my.location import google as legacy_google

        for gloc in legacy_google.locations():
            yield ((gloc.lat, gloc.lon), gloc.dt)
# TODO: could use heapmerge or sort the underlying iterators somehow?
# see https://github.com/karlicoss/HPI/pull/237#discussion_r858372934
def _sorted_locations() -> List[Tuple[LatLon, datetime]]:
    """Return every item from _locations() as a list, ordered by datetime."""
    by_dt = sorted(_locations(), key=lambda pair: pair[1])
    return by_dt
# Note: this takes a while -- the upstream _locations isn't sorted, so this
# has to sort the entire my.location.all list first
def _iter_local_dates() -> Iterator[DayWithZone]:
finder = _timezone_finder(fast=config.fast) # rely on the default
#pdt = None
# TODO: warnings doesnt actually warn?
warnings = []
# todo allow to skip if not noo many errors in row?
for l in locations(start=start, stop=stop):
for (lat, lon), dt in _sorted_locations():
# TODO right. its _very_ slow...
zone = finder.timezone_at(lng=l.lon, lat=l.lat)
zone = finder.timezone_at(lat=lat, lng=lon)
if zone is None:
warnings.append(f"Couldn't figure out tz for {l}")
warnings.append(f"Couldn't figure out tz for {lat}, {lon}")
continue
tz = pytz.timezone(zone)
# TODO this is probably a bit expensive... test & benchmark
ldt = l.dt.astimezone(tz)
ldt = dt.astimezone(tz)
ndate = ldt.date()
if pdt is not None and ndate < pdt.date():
# TODO for now just drop and collect the stats
# I guess we'd have minor drops while air travel...
warnings.append("local time goes backwards {ldt} ({tz}) < {pdt}")
continue
pdt = ldt
#if pdt is not None and ndate < pdt.date():
# # TODO for now just drop and collect the stats
# # I guess we'd have minor drops while air travel...
# warnings.append("local time goes backwards {ldt} ({tz}) < {pdt}")
# continue
#pdt = ldt
z = tz.zone; assert z is not None
yield DayWithZone(day=ndate, zone=z)
def most_common(l):
res, count = Counter(l).most_common(1)[0] # type: ignore[var-annotated]
def most_common(lst: List[DayWithZone]) -> DayWithZone:
    """Return the item that occurs most often in lst (lst must not be empty)."""
    ranked = Counter(lst).most_common(1)  # type: ignore[var-annotated]
    winner, _count = ranked[0]
    return winner
@mcachew(cache_path=cache_dir())
def _iter_tz_depends_on() -> str:
    """
    Since you might get new data which specifies a new timezone sometime
    in the day, this causes _iter_tzs to refresh every 6 hours, like:
    2022-04-26_0
    2022-04-26_6
    2022-04-26_12
    2022-04-26_18

    Returns the current local date and the hour truncated down to a
    multiple of 6 (not zero-padded), joined by an underscore.
    """
    # read the clock once, so the date and the hour always come from the same
    # instant (two separate reads can disagree right around midnight)
    now = datetime.now()
    hr_truncated = now.hour // 6 * 6
    return "{}_{}".format(now.date(), hr_truncated)
# refresh _iter_tzs every 6 hours -- don't think a better depends_on is possible dynamically
@mcachew(logger=logger, depends_on=_iter_tz_depends_on)
def _iter_tzs() -> Iterator[DayWithZone]:
for d, gr in groupby(_iter_local_dates(), key=lambda p: p.day):
# since we have no control over what order the locations are returned,
# we need to sort them first before we can do a groupby
local_dates: List[DayWithZone] = list(_iter_local_dates())
local_dates.sort(key=lambda p: p.day)
for d, gr in groupby(local_dates, key=lambda p: p.day):
logger.info('processed %s', d)
zone = most_common(list(gr)).zone
yield DayWithZone(day=d, zone=zone)
@ -106,6 +163,7 @@ def _get_day_tz(d: date) -> Optional[pytz.BaseTzInfo]:
break
return None if zone is None else pytz.timezone(zone)
# ok to cache, there are only a few home locations?
@lru_cache(maxsize=None)
def _get_home_tz(loc) -> Optional[pytz.BaseTzInfo]:
@ -119,8 +177,10 @@ def _get_home_tz(loc) -> Optional[pytz.BaseTzInfo]:
return pytz.timezone(zone)
# TODO expose? to main as well?
def _get_tz(dt: datetime) -> Optional[pytz.BaseTzInfo]:
'''
Given a datetime, returns the timezone for that date.
'''
res = _get_day_tz(d=dt.date())
if res is not None:
return res
@ -129,6 +189,9 @@ def _get_tz(dt: datetime) -> Optional[pytz.BaseTzInfo]:
loc = home.get_location(dt)
return _get_home_tz(loc=loc)
# expose as 'public' function
get_tz = _get_tz
def localize(dt: datetime) -> tzdatetime:
tz = _get_tz(dt)
@ -144,11 +207,13 @@ def stats() -> Stats:
# TODO not sure what would be a good stat() for this module...
# might be nice to print some actual timezones?
# there aren't really any great iterables to expose
import os
VIA_LOCATION_START_YEAR = int(os.environ.get("VIA_LOCATION_START_YEAR", 1990))
def localized_years():
last = datetime.now().year + 2
# note: deliberately take + 2 years, so the iterator exhausts. otherwise stuff might never get cached
# need to think about it...
for Y in range(1990, last):
for Y in range(VIA_LOCATION_START_YEAR, last):
dt = datetime.fromisoformat(f'{Y}-01-01 01:01:01')
yield localize(dt)
return stat(localized_years)

View file

@ -1,6 +1,5 @@
from datetime import datetime, timedelta, date, timezone
from pathlib import Path
import sys
import pytest # type: ignore
import pytz # type: ignore
@ -80,7 +79,7 @@ def prepare(tmp_path: Path):
from .common import reset_modules
reset_modules()
LTZ._FASTER = True
LTZ.config.fast = True
from .location import _prepare_google_config
google = _prepare_google_config(tmp_path)
@ -98,7 +97,8 @@ def prepare(tmp_path: Path):
class time:
class tz:
pass # just rely on the default..
class via_location:
pass # just rely on the defaults...
import my.core.cfg as C
with C.tmp_config() as config:

View file

@ -100,6 +100,9 @@ commands =
hpi module install my.goodreads
hpi module install my.pdfs
hpi module install my.smscalls
hpi module install my.location.gpslogger
hpi module install my.location.via_ip
hpi module install my.google.takeout.parser
# todo fuck. -p my.github isn't checking the subpackages?? wtf...
# guess it wants .pyi file??
@ -118,6 +121,10 @@ commands =
-p my.body.exercise.cross_trainer \
-p my.bluemaestro \
-p my.location.google \
-p my.location.google_takeout \
-p my.location.via_ip \
-p my.location.gpslogger \
-p my.ip.common \
-p my.time.tz.via_location \
-p my.calendar.holidays \
-p my.arbtt \