Collect from zip; iteratively

This commit is contained in:
Dima Gerasimov 2018-10-20 09:40:43 +01:00
parent 70a09a80ba
commit 96c8c324f3

View file

@ -1,13 +1,21 @@
from typing import NamedTuple, Iterator, List, Iterable, Collection, Sequence from typing import NamedTuple, Iterator, List, Iterable, Collection, Sequence
from datetime import datetime from datetime import datetime
from os import listdir
from os.path import join
from zipfile import ZipFile
import logging import logging
import csv import csv
import re
import json
import geopy.distance # type: ignore import geopy.distance # type: ignore
# pip3 install ijson
import ijson # type: ignore
def get_logger(): def get_logger():
return logging.getLogger("location") return logging.getLogger("location")
PATH = "/L/data/location/location.csv" TAKEOUTS_PATH = "/path/to/takeout"
CACHE_PATH = "/L/.cache/location.cache" CACHE_PATH = "/L/.cache/location.cache"
# TODO need to cache? # TODO need to cache?
@ -35,15 +43,14 @@ def tagger(dt: datetime, lat: float, lon: float) -> Tag:
# TODO hope they are sorted... # TODO hope they are sorted...
# TODO that could also serve as basis for timezone provider. # TODO that could also serve as basis for timezone provider.
def iter_locations() -> Iterator[Location]: def iter_locations() -> Iterator[Location]:
with open(PATH) as fo: last_takeout = max([f for f in listdir(TAKEOUTS_PATH) if re.match('takeout.*.zip', f)])
reader = csv.reader(fo) jdata = None
next(reader) # skip header with ZipFile(join(TAKEOUTS_PATH, last_takeout)).open('Takeout/Location History/Location History.json') as fo:
for ll in reader: for j in ijson.items(fo, 'locations.item'):
[ts, lats, lons] = ll # TODO eh, not very streaming?..
# TODO hmm, is it local?? dt = datetime.fromtimestamp(int(j["timestampMs"]) / 1000) # TODO utc??
dt = datetime.strptime(ts, "%Y-%m-%d %H:%M:%S") lat = float(j["latitudeE7"] / 10000000)
lat = float(lats) lon = float(j["longitudeE7"] / 10000000)
lon = float(lons)
tag = tagger(dt, lat, lon) tag = tagger(dt, lat, lon)
yield Location( yield Location(
dt=dt, dt=dt,
@ -65,6 +72,7 @@ class LocInterval(NamedTuple):
from_: Location from_: Location
to: Location to: Location
# TODO could cache groups too?... using 16% cpu is a bit annoying.. could also use some sliding window here
def get_groups(cached: bool=False) -> List[LocInterval]: def get_groups(cached: bool=False) -> List[LocInterval]:
locs = get_locations(cached=cached) locs = get_locations(cached=cached)
i = 0 i = 0