""" Extracts data from the official Google Maps app for Android (uses gmm_sync.db for now) """ from __future__ import annotations REQUIRES = [ "protobuf", # for parsing blobs from the database ] from dataclasses import dataclass from datetime import datetime, timezone from pathlib import Path from typing import Any, Iterator, Optional, Sequence from urllib.parse import quote from my.core import datetime_aware, get_files, LazyLogger, Paths, Res from my.core.common import unique_everseen from my.core.sqlite import sqlite_connection import my.config from ._android_protobuf import parse_labeled, parse_list, parse_place logger = LazyLogger(__name__) @dataclass class config(my.config.google.maps.android): # paths[s]/glob to the exported sqlite databases export_path: Paths def inputs() -> Sequence[Path]: # TODO note sure if need to use all dbs? possibly the last one contains everything? return get_files(config.export_path) PlaceId = str ListId = str ListName = str @dataclass(eq=True, frozen=True) class Location: lat: float lon: float @property def url(self) -> str: return f'https://maps.google.com/?q={self.lat},{self.lon}' @dataclass(unsafe_hash=True) class Place: id: PlaceId list_name: ListName # TODO maybe best to keep list id? created_at: datetime_aware # TODO double check it's utc? updated_at: datetime_aware # TODO double check it's utc? title: str location: Location address: Optional[str] note: Optional[str] @property def place_url(self) -> str: title = quote(self.title) return f'https://www.google.com/maps/place/{title}/data=!4m2!3m1!1s{self.id}' @property def location_url(self) -> str: return self.location.url def _process_one(f: Path): with sqlite_connection(f, row_factory='row') as conn: msg: Any lists: dict[ListId, ListName] = {} for row in conn.execute('SELECT * FROM sync_item_data WHERE corpus == 13'): # 13 looks like lists (e.g. saved/favorited etc) server_id = row['server_id'] if server_id is None: # this is the case for Travel plans, Followed places, Offers # todo alternatively could use string_index column instead maybe? continue blob = row['item_proto'] msg = parse_list(blob) name = msg.f1.name lists[server_id] = name for row in conn.execute('SELECT * FROM sync_item_data WHERE corpus == 11'): # this looks like 'Labeled' list ts = row['timestamp'] / 1000 created = datetime.fromtimestamp(ts, tz=timezone.utc) server_id = row['server_id'] [item_type, item_id] = server_id.split(':') if item_type != '3': # the ones that are not 3 are home/work address? continue blob = row['item_proto'] msg = parse_labeled(blob) address = msg.address.full if address == '': address = None location = Location(lat=row['latitude_e6'] / 1e6, lon=row['longitude_e6'] / 1e6) yield Place( id=item_id, list_name='Labeled', created_at=created, updated_at=created, # doesn't look like it has 'updated'? title=msg.title, location=location, address=address, note=None, # don't think these allow notes ) for row in conn.execute('SELECT * FROM sync_item_data WHERE corpus == 14'): # this looks like actual individual places server_id = row['server_id'] [list_id, _, id1, id2] = server_id.split(':') item_id = f'{id1}:{id2}' list_name = lists[list_id] blob = row['item_proto'] msg = parse_place(blob) title = msg.f1.title note = msg.f1.note if note == '': # seems that protobuf does that? 
note = None # TODO double check timezone created = datetime.fromtimestamp(msg.f1.created.seconds, tz=timezone.utc).replace(microsecond=msg.f1.created.nanos // 1000) # NOTE: this one seems to be the same as row['timestamp'] updated = datetime.fromtimestamp(msg.f1.updated.seconds, tz=timezone.utc).replace(microsecond=msg.f1.updated.nanos // 1000) address = msg.f2.addr1 # NOTE: there is also addr2, but they seem identical :shrug: if address == '': address = None location = Location(lat=row['latitude_e6'] / 1e6, lon=row['longitude_e6'] / 1e6) place = Place( id=item_id, list_name=list_name, created_at=created, updated_at=updated, title=title, location=location, address=address, note=note, ) # ugh. in my case it's violated by one place by about 1 second?? # assert place.created_at <= place.updated_at yield place def saved() -> Iterator[Res[Place]]: def it() -> Iterator[Res[Place]]: paths = inputs() total = len(paths) width = len(str(total)) for idx, path in enumerate(paths): logger.info(f'processing [{idx:>{width}}/{total:>{width}}] {path}') yield from _process_one(path) return unique_everseen(it) # Summary of databases on Android (as of 20240101) # -1_optimized_threads.notifications.db -- empty # 1_optimized_threads.notifications.db -- empty # 1_tasks.notifications.db -- empty # -1_threads.notifications.db -- empty # 1_threads.notifications.db -- doesn't look like anything interested, some trip anniversaries etc? # 1_thread_surveys.notifications.db -- empty # 2_threads.notifications.db -- empty # accounts.notifications.db -- just one row with account id # brella_example_store -- empty # gmm_myplaces.db -- contains just a few places? I think it's a subset of "Labeled" # gmm_storage.db -- pretty huge, like 50Mb. I suspect it contains cache for places on maps or something # gmm_sync.db -- processed above # gnp_fcm_database -- list of accounts # google_app_measurement_local.db -- empty # inbox_notifications.db -- nothing interesting # _room_notifications.db -- trip anniversaties? # lighter_messaging_1.db -- empty # lighter_messaging_2.db -- empty # lighter_registration.db -- empty # peopleCache__com.google_14.db -- contacts cache or something # portable_geller_.db -- looks like analytics # primes_example_store -- looks like analytics # pseudonymous_room_notifications.db -- looks like analytics # ue3.db -- empty # ugc_photos_location_data.db -- empty # ugc-sync.db -- empty # updates-tab-visit.db -- empty
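

# A minimal usage sketch, assuming export_path in the my.config.google.maps.android
# block points at the exported gmm_sync.db database(s). It prints each saved place
# and logs errors yielded via Res instead of crashing on them.
if __name__ == '__main__':
    for res in saved():
        if isinstance(res, Exception):
            logger.warning('error while processing place: %s', res)
            continue
        print(f'{res.created_at} [{res.list_name}] {res.title} {res.place_url}')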