cleanup locations.takeout a bit
This commit is contained in:
parent
89efa11a28
commit
64cd390db5
4 changed files with 63 additions and 53 deletions
|
@ -1,11 +1,12 @@
|
||||||
import sys
|
import sys
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
from location import get_logger, get_locations, iter_locations, get_groups
|
from .takeout import get_logger, get_locations, iter_locations, get_groups
|
||||||
|
|
||||||
from kython.klogging import setup_logzero
|
from kython.klogging import setup_logzero
|
||||||
|
|
||||||
|
|
||||||
|
# TODO remove this?
|
||||||
def main():
|
def main():
|
||||||
logger = get_logger()
|
logger = get_logger()
|
||||||
setup_logzero(logger, level=logging.DEBUG)
|
setup_logzero(logger, level=logging.DEBUG)
|
||||||
|
@ -14,8 +15,9 @@ def main():
|
||||||
if len(sys.argv) > 1:
|
if len(sys.argv) > 1:
|
||||||
cmd = sys.argv[1]
|
cmd = sys.argv[1]
|
||||||
# TODO ok, update cache makes sense just to refresh in case of code changes...
|
# TODO ok, update cache makes sense just to refresh in case of code changes...
|
||||||
|
# TODO don't even need it anymore? cachew handles this..
|
||||||
if cmd == "update_cache":
|
if cmd == "update_cache":
|
||||||
from location import update_cache, get_locations
|
from .takeout import update_cache, get_locations
|
||||||
update_cache()
|
update_cache()
|
||||||
else:
|
else:
|
||||||
raise RuntimeError(f"Unknown command {cmd}")
|
raise RuntimeError(f"Unknown command {cmd}")
|
||||||
|
|
|
@ -1,6 +0,0 @@
|
||||||
#!/bin/bash
|
|
||||||
set -eu
|
|
||||||
|
|
||||||
cd "$(dirname "$0")"
|
|
||||||
|
|
||||||
python3 -m location
|
|
|
@ -1,36 +1,37 @@
|
||||||
from typing import NamedTuple, Iterator, List, Iterable, Collection, Sequence, Deque, Any, Optional
|
|
||||||
from collections import deque
|
|
||||||
from itertools import islice
|
|
||||||
from datetime import datetime
|
|
||||||
from zipfile import ZipFile
|
|
||||||
import logging
|
|
||||||
import csv
|
|
||||||
import re
|
|
||||||
import json
|
import json
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from collections import deque
|
||||||
|
from datetime import datetime
|
||||||
|
from itertools import islice
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from typing import Any, Collection, Deque, Iterable, Iterator, List, NamedTuple, Optional, Sequence
|
||||||
|
from zipfile import ZipFile
|
||||||
import pytz
|
import pytz
|
||||||
|
|
||||||
|
|
||||||
from kython import kompress
|
|
||||||
|
|
||||||
from cachew import cachew, mtime_hash
|
|
||||||
|
|
||||||
|
|
||||||
# pip3 install geopy
|
# pip3 install geopy
|
||||||
import geopy # type: ignore
|
import geopy # type: ignore
|
||||||
import geopy.distance # type: ignore
|
import geopy.distance # type: ignore
|
||||||
# pip3 install ijson
|
# pip3 install ijson cffi
|
||||||
import ijson # type: ignore
|
# cffi backend is almost 2x faster than default
|
||||||
|
import ijson.backends.yajl2_cffi as ijson # type: ignore
|
||||||
|
|
||||||
|
from cachew import cachew, mtime_hash
|
||||||
|
from kython import kompress # TODO
|
||||||
|
|
||||||
|
from ..common import get_files
|
||||||
|
|
||||||
|
|
||||||
def get_logger():
|
def get_logger():
|
||||||
return logging.getLogger("location")
|
return logging.getLogger("location")
|
||||||
|
|
||||||
|
|
||||||
TAKEOUTS_PATH = Path("/path/to/takeout")
|
def cache_path(*args, **kwargs):
|
||||||
CACHE_PATH = Path('/L/data/.cache/location.sqlite')
|
from mycfg import paths
|
||||||
|
return paths.location.cache_path
|
||||||
|
|
||||||
|
|
||||||
Tag = str
|
Tag = Optional[str]
|
||||||
|
|
||||||
class Location(NamedTuple):
|
class Location(NamedTuple):
|
||||||
dt: datetime
|
dt: datetime
|
||||||
|
@ -40,22 +41,35 @@ class Location(NamedTuple):
|
||||||
tag: Tag
|
tag: Tag
|
||||||
|
|
||||||
|
|
||||||
def tagger(dt: datetime, point: geopy.Point) -> Tag:
|
# TODO use pool? not sure if that would really be faster...
|
||||||
TAGS = [
|
|
||||||
# removed
|
|
||||||
]
|
|
||||||
for coord, dist, tag in TAGS:
|
|
||||||
if geopy.distance.distance(coord, point).m < dist:
|
|
||||||
return tag
|
|
||||||
else:
|
|
||||||
return "other"
|
|
||||||
|
|
||||||
|
|
||||||
def _iter_locations_fo(fo, start, stop) -> Iterator[Location]:
|
def _iter_locations_fo(fo, start, stop) -> Iterator[Location]:
|
||||||
logger = get_logger()
|
logger = get_logger()
|
||||||
|
|
||||||
total = 0
|
total = 0
|
||||||
errors = 0
|
errors = 0
|
||||||
|
|
||||||
|
try:
|
||||||
|
from mycfg.locations import LOCATIONS as known_locations
|
||||||
|
except ModuleNotFoundError as e:
|
||||||
|
name = 'mycfg.locations'
|
||||||
|
if e.name != name:
|
||||||
|
raise e
|
||||||
|
logger.warning("'%s' isn't found. setting known_locations to empty list", name)
|
||||||
|
known_locations = []
|
||||||
|
|
||||||
|
# TODO tagging should be takeout-agnostic
|
||||||
|
def tagger(dt: datetime, point: geopy.Point) -> Tag:
|
||||||
|
'''
|
||||||
|
Tag points with known locations (e.g. work/home/etc)
|
||||||
|
'''
|
||||||
|
for lat, lon, dist, tag in known_locations:
|
||||||
|
# TODO use something more efficient?
|
||||||
|
if geopy.distance.distance((lat, lon), point).m < dist:
|
||||||
|
return tag
|
||||||
|
else:
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
for j in islice(ijson.items(fo, 'locations.item'), start, stop):
|
for j in islice(ijson.items(fo, 'locations.item'), start, stop):
|
||||||
dt = datetime.utcfromtimestamp(int(j["timestampMs"]) / 1000)
|
dt = datetime.utcfromtimestamp(int(j["timestampMs"]) / 1000)
|
||||||
if total % 10000 == 0:
|
if total % 10000 == 0:
|
||||||
|
@ -86,7 +100,8 @@ def _iter_locations_fo(fo, start, stop) -> Iterator[Location]:
|
||||||
)
|
)
|
||||||
|
|
||||||
# TODO hope they are sorted...
|
# TODO hope they are sorted...
|
||||||
@cachew(CACHE_PATH, hashf=mtime_hash, cls=Location, chunk_by=10000, logger=get_logger())
|
# TODO CACHEW_OFF env variable?
|
||||||
|
@cachew(cache_path, hashf=mtime_hash, cls=Location, chunk_by=10000, logger=get_logger())
|
||||||
def _iter_locations(path: Path, start=0, stop=None) -> Iterator[Location]:
|
def _iter_locations(path: Path, start=0, stop=None) -> Iterator[Location]:
|
||||||
if path.suffix == '.json':
|
if path.suffix == '.json':
|
||||||
ctx = path.open('r')
|
ctx = path.open('r')
|
||||||
|
@ -99,18 +114,22 @@ def _iter_locations(path: Path, start=0, stop=None) -> Iterator[Location]:
|
||||||
|
|
||||||
|
|
||||||
def iter_locations(**kwargs) -> Iterator[Location]:
|
def iter_locations(**kwargs) -> Iterator[Location]:
|
||||||
last_takeout = max(TAKEOUTS_PATH.glob('takeout*.zip'))
|
from mycfg import paths
|
||||||
|
# TODO need to include older data
|
||||||
|
last_takeout = max(get_files(paths.google.takeout_path, glob='takeout*.zip'))
|
||||||
|
|
||||||
return _iter_locations(path=last_takeout, **kwargs)
|
return _iter_locations(path=last_takeout, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
def get_locations() -> Sequence[Location]:
|
def get_locations(*args, **kwargs) -> Sequence[Location]:
|
||||||
return list(iter_locations())
|
return list(iter_locations(*args, **kwargs))
|
||||||
|
|
||||||
class LocInterval(NamedTuple):
|
class LocInterval(NamedTuple):
|
||||||
from_: Location
|
from_: Location
|
||||||
to: Location
|
to: Location
|
||||||
|
|
||||||
|
|
||||||
|
# TODO use this advanced iterators library?
|
||||||
# TODO kython? nicer interface?
|
# TODO kython? nicer interface?
|
||||||
class Window:
|
class Window:
|
||||||
def __init__(self, it):
|
def __init__(self, it):
|
||||||
|
@ -147,11 +166,12 @@ class Window:
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# TODO cachew as well?
|
||||||
# TODO maybe if tag is none, we just don't care?
|
# TODO maybe if tag is none, we just don't care?
|
||||||
def get_groups() -> List[LocInterval]:
|
def get_groups(*args, **kwargs) -> List[LocInterval]:
|
||||||
logger = get_logger()
|
logger = get_logger()
|
||||||
|
|
||||||
all_locations = iter(iter_locations()) # TODO
|
all_locations = iter(iter_locations(*args, **kwargs))
|
||||||
locsi = Window(all_locations)
|
locsi = Window(all_locations)
|
||||||
i = 0
|
i = 0
|
||||||
groups: List[LocInterval] = []
|
groups: List[LocInterval] = []
|
||||||
|
@ -202,7 +222,8 @@ def get_groups() -> List[LocInterval]:
|
||||||
|
|
||||||
def update_cache():
|
def update_cache():
|
||||||
# TODO perhaps set hash to null instead, that's a bit less intrusive
|
# TODO perhaps set hash to null instead, that's a bit less intrusive
|
||||||
if CACHE_PATH.exists():
|
cp = cache_path()
|
||||||
CACHE_PATH.unlink()
|
if cp.exists():
|
||||||
|
cp.unlink()
|
||||||
for _ in iter_locations():
|
for _ in iter_locations():
|
||||||
pass
|
pass
|
||||||
|
|
|
@ -1,7 +0,0 @@
|
||||||
#!/bin/bash
|
|
||||||
set -eu
|
|
||||||
|
|
||||||
cd "$(dirname "$0")"
|
|
||||||
|
|
||||||
python3 -m location update_cache
|
|
||||||
|
|
Loading…
Add table
Reference in a new issue