"""
Extracts data from the official Google Maps app for Android (uses gmm_sync.db for now)
"""
from __future__ import annotations
REQUIRES = [
"protobuf", # for parsing blobs from the database
]
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Iterator, Optional, Sequence
from urllib.parse import quote
from my.core import datetime_aware, get_files, LazyLogger, Paths, Res
from my.core.common import unique_everseen
from my.core.sqlite import sqlite_connection
import my.config
from ._android_protobuf import parse_labeled, parse_list, parse_place
logger = LazyLogger(__name__)


@dataclass
class config(my.config.google.maps.android):
    # path[s]/glob to the exported sqlite databases
    export_path: Paths


def inputs() -> Sequence[Path]:
    # TODO not sure if we need to use all dbs? possibly the last one contains everything?
    return get_files(config.export_path)
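
# A minimal sketch of the matching user config, assuming the usual HPI
# convention of nested classes in my.config (the glob below is hypothetical):
#
#     class google:
#         class maps:
#             class android:
#                 export_path: Paths = '/backups/google_maps/gmm_sync*.db'
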
PlaceId = str
ListId = str
ListName = str


@dataclass(eq=True, frozen=True)
class Location:
    lat: float
    lon: float

    @property
    def url(self) -> str:
        return f'https://maps.google.com/?q={self.lat},{self.lon}'


@dataclass(unsafe_hash=True)
class Place:
    id: PlaceId
    list_name: ListName  # TODO maybe best to keep list id?
    created_at: datetime_aware  # TODO double check it's utc?
    updated_at: datetime_aware  # TODO double check it's utc?
    title: str
    location: Location
    address: Optional[str]
    note: Optional[str]

    @property
    def place_url(self) -> str:
        title = quote(self.title)
        return f'https://www.google.com/maps/place/{title}/data=!4m2!3m1!1s{self.id}'

    @property
    def location_url(self) -> str:
        return self.location.url


def _process_one(f: Path) -> Iterator[Place]:
    with sqlite_connection(f, row_factory='row') as conn:
        msg: Any

        # first pass: build a mapping from list id to list name
        lists: dict[ListId, ListName] = {}
        for row in conn.execute('SELECT * FROM sync_item_data WHERE corpus == 13'):  # 13 looks like lists (e.g. saved/favorited etc)
            server_id = row['server_id']

            if server_id is None:
                # this is the case for Travel plans, Followed places, Offers
                # todo alternatively could use string_index column instead maybe?
                continue

            blob = row['item_proto']
            msg = parse_list(blob)
            name = msg.f1.name
            lists[server_id] = name

        for row in conn.execute('SELECT * FROM sync_item_data WHERE corpus == 11'):  # this looks like the 'Labeled' list
            ts = row['timestamp'] / 1000  # timestamp column is in milliseconds
            created = datetime.fromtimestamp(ts, tz=timezone.utc)

            server_id = row['server_id']
            [item_type, item_id] = server_id.split(':')
            if item_type != '3':
                # the ones that are not 3 are home/work address?
                continue

            blob = row['item_proto']
            msg = parse_labeled(blob)
            address = msg.address.full
            if address == '':
                address = None

            # coordinates are stored as integer degrees * 1e6
            location = Location(lat=row['latitude_e6'] / 1e6, lon=row['longitude_e6'] / 1e6)

            yield Place(
                id=item_id,
                list_name='Labeled',
                created_at=created,
                updated_at=created,  # doesn't look like it has 'updated'?
                title=msg.title,
                location=location,
                address=address,
                note=None,  # don't think these allow notes
            )

        for row in conn.execute('SELECT * FROM sync_item_data WHERE corpus == 14'):  # this looks like actual individual places
            server_id = row['server_id']
            [list_id, _, id1, id2] = server_id.split(':')
            item_id = f'{id1}:{id2}'

            list_name = lists[list_id]

            blob = row['item_proto']
            msg = parse_place(blob)
            title = msg.f1.title
            note = msg.f1.note
            if note == '':  # seems that protobuf does that?
                note = None

            # TODO double check timezone
            created = datetime.fromtimestamp(msg.f1.created.seconds, tz=timezone.utc).replace(microsecond=msg.f1.created.nanos // 1000)

            # NOTE: this one seems to be the same as row['timestamp']
            updated = datetime.fromtimestamp(msg.f1.updated.seconds, tz=timezone.utc).replace(microsecond=msg.f1.updated.nanos // 1000)

            address = msg.f2.addr1  # NOTE: there is also addr2, but they seem identical :shrug:
            if address == '':
                address = None

            location = Location(lat=row['latitude_e6'] / 1e6, lon=row['longitude_e6'] / 1e6)

            place = Place(
                id=item_id,
                list_name=list_name,
                created_at=created,
                updated_at=updated,
                title=title,
                location=location,
                address=address,
                note=note,
            )

            # ugh. in my case it's violated by one place by about 1 second??
            # assert place.created_at <= place.updated_at
            yield place
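
# For reference, the sync_item_data columns this module relies on
# (inferred from the queries above; the actual schema may contain more):
#   corpus                     -- item category (11: labeled places, 13: lists, 14: places within lists)
#   server_id                  -- colon-separated identifier
#   item_proto                 -- protobuf blob with the item payload
#   timestamp                  -- unix epoch, in milliseconds
#   latitude_e6 / longitude_e6 -- coordinates as integer degrees * 1e6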


def saved() -> Iterator[Res[Place]]:
    def it() -> Iterator[Res[Place]]:
        paths = inputs()
        total = len(paths)
        width = len(str(total))
        for idx, path in enumerate(paths):
            logger.info(f'processing [{idx:>{width}}/{total:>{width}}] {path}')
            yield from _process_one(path)

    return unique_everseen(it)
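
# A minimal usage sketch: saved() yields Res[Place], i.e. exceptions may be
# interleaved with results, so filter them out (or raise) before use:
#
#     from my.google.maps.android import saved
#     for place in saved():
#         if isinstance(place, Exception):
#             continue
#         print(place.created_at, place.title, place.place_url)
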
# Summary of databases on Android (as of 20240101)
# -1_optimized_threads.notifications.db -- empty
# 1_optimized_threads.notifications.db -- empty
# 1_tasks.notifications.db -- empty
# -1_threads.notifications.db -- empty
# 1_threads.notifications.db -- doesn't look like anything interesting, some trip anniversaries etc?
# 1_thread_surveys.notifications.db -- empty
# 2_threads.notifications.db -- empty
# accounts.notifications.db -- just one row with account id
# brella_example_store -- empty
# gmm_myplaces.db -- contains just a few places? I think it's a subset of "Labeled"
# gmm_storage.db -- pretty huge, like 50MB. I suspect it contains a cache for places on the map or something
# gmm_sync.db -- processed above
# gnp_fcm_database -- list of accounts
# google_app_measurement_local.db -- empty
# inbox_notifications.db -- nothing interesting
# <email>_room_notifications.db -- trip anniversaries?
# lighter_messaging_1.db -- empty
# lighter_messaging_2.db -- empty
# lighter_registration.db -- empty
# peopleCache_<email>_com.google_14.db -- contacts cache or something
# portable_geller_<email>.db -- looks like analytics
# primes_example_store -- looks like analytics
# pseudonymous_room_notifications.db -- looks like analytics
# ue3.db -- empty
# ugc_photos_location_data.db -- empty
# ugc-sync.db -- empty
# updates-tab-visit.db -- empty