diff --git a/my/config.py b/my/config.py
index e9b0ec8..a92b2bc 100644
--- a/my/config.py
+++ b/my/config.py
@@ -68,6 +68,10 @@ class pinboard:
     export_dir: Paths = ''
 
 class google:
+    class maps:
+        class android:
+            export_path: Paths = ''
+
     takeout_path: Paths = ''
 
 
diff --git a/my/google/maps/_android_protobuf.py b/my/google/maps/_android_protobuf.py
new file mode 100644
index 0000000..1d43ae0
--- /dev/null
+++ b/my/google/maps/_android_protobuf.py
@@ -0,0 +1,113 @@
+from my.core import __NOT_HPI_MODULE__
+
+# NOTE: this tool was quite useful https://github.com/aj3423/aproto
+
+from google.protobuf import descriptor_pool, descriptor_pb2, message_factory
+
+TYPE_STRING = descriptor_pb2.FieldDescriptorProto.TYPE_STRING
+TYPE_BYTES = descriptor_pb2.FieldDescriptorProto.TYPE_BYTES
+TYPE_UINT64 = descriptor_pb2.FieldDescriptorProto.TYPE_UINT64
+TYPE_MESSAGE = descriptor_pb2.FieldDescriptorProto.TYPE_MESSAGE
+
+OPTIONAL = descriptor_pb2.FieldDescriptorProto.LABEL_OPTIONAL
+REQUIRED = descriptor_pb2.FieldDescriptorProto.LABEL_REQUIRED
+
+
+def get_place_protos():
+    f1 = descriptor_pb2.DescriptorProto(name='xf1')
+    # TODO 2 -> 5 is address? 2 -> 6 is a pair of coordinates
+    f1.field.add(name='title', number=3, type=TYPE_STRING, label=REQUIRED)
+    f1.field.add(name='note' , number=4, type=TYPE_STRING, label=OPTIONAL)
+    # TODO what's the difference between required and optional? doesn't impact decoding?
+
+    ts = descriptor_pb2.DescriptorProto(name='Timestamp')
+    ts.field.add(name='seconds', number=1, type=TYPE_UINT64, label=REQUIRED)
+    ts.field.add(name='nanos' , number=2, type=TYPE_UINT64, label=REQUIRED)
+
+    f1.field.add(name='created', number=10, type=TYPE_MESSAGE, label=REQUIRED, type_name=ts.name)
+    f1.field.add(name='updated', number=11, type=TYPE_MESSAGE, label=REQUIRED, type_name=ts.name)
+
+    f2 = descriptor_pb2.DescriptorProto(name='xf2')
+    f2.field.add(name='addr1', number=2, type=TYPE_STRING, label=REQUIRED)
+    f2.field.add(name='addr2', number=3, type=TYPE_STRING, label=REQUIRED)
+    f2.field.add(name='f21' , number=4, type=TYPE_BYTES , label=REQUIRED)
+    f2.field.add(name='f22' , number=5, type=TYPE_UINT64, label=REQUIRED)
+    f2.field.add(name='f23' , number=6, type=TYPE_STRING, label=REQUIRED)
+    # NOTE: this also contains place ID
+
+    f3 = descriptor_pb2.DescriptorProto(name='xf3')
+    # NOTE: looks like it's the same as 'updated' from above??
+    f3.field.add(name='f31', number=1, type=TYPE_UINT64, label=OPTIONAL)
+
+    descriptor_proto = descriptor_pb2.DescriptorProto(name='PlaceParser')
+    descriptor_proto.field.add(name='f1', number=1, type=TYPE_MESSAGE, label=REQUIRED, type_name=f1.name)
+    descriptor_proto.field.add(name='f2', number=2, type=TYPE_MESSAGE, label=REQUIRED, type_name=f2.name)
+    descriptor_proto.field.add(name='f3', number=3, type=TYPE_MESSAGE, label=OPTIONAL, type_name=f3.name)
+    descriptor_proto.field.add(name='f4', number=4, type=TYPE_STRING , label=OPTIONAL)
+    # NOTE: f4 is the list id
+
+    return [descriptor_proto, ts, f1, f2, f3]
+
+
+def get_labeled_protos():
+    address = descriptor_pb2.DescriptorProto(name='address')
+    # 1: address
+    # 2: parts of address (multiple)
+    # 3: full address
+    address.field.add(name='full', number=3, type=TYPE_STRING, label=REQUIRED)
+
+    main = descriptor_pb2.DescriptorProto(name='LabeledParser')
+    # field 1 contains item type and item id
+    main.field.add(name='title' , number=3, type=TYPE_STRING , label=REQUIRED)
+    main.field.add(name='address', number=5, type=TYPE_MESSAGE, label=OPTIONAL, type_name=address.name)
+
+    return [main, address]
+
+
+def get_list_protos():
+    f1 = descriptor_pb2.DescriptorProto(name='xf1')
+    f1.field.add(name='name', number=5, type=TYPE_STRING, label=REQUIRED)
+
+    main = descriptor_pb2.DescriptorProto(name='ListParser')
+    main.field.add(name='f1', number=1, type=TYPE_MESSAGE, label=REQUIRED, type_name=f1.name)
+    main.field.add(name='f2', number=2, type=TYPE_STRING , label=REQUIRED)
+
+    return [main, f1]
+
+
+def make_parser(main, *extras):
+    file_descriptor_proto = descriptor_pb2.FileDescriptorProto(name='dynamic.proto', package='dynamic_package')
+    for proto in [main, *extras]:
+        file_descriptor_proto.message_type.add().CopyFrom(proto)
+
+    pool = descriptor_pool.DescriptorPool()
+    pool.Add(file_descriptor_proto)
+
+    message_descriptor = pool.FindMessageTypeByName(f'{file_descriptor_proto.package}.{main.name}')
+    # NOTE: MessageFactory(...).GetPrototype was deprecated and removed in protobuf>=5.26 -- GetMessageClass is its replacement
+    dynamic_message_class = message_factory.GetMessageClass(message_descriptor)
+
+    return dynamic_message_class
+
+
+place_parser_class = make_parser(*get_place_protos())
+labeled_parser_class = make_parser(*get_labeled_protos())
+list_parser_class = make_parser(*get_list_protos())
+
+
+def parse_place(blob: bytes):
+    m = place_parser_class()
+    m.ParseFromString(blob)
+    return m
+
+
+def parse_labeled(blob: bytes):
+    m = labeled_parser_class()
+    m.ParseFromString(blob)
+    return m
+
+
+def parse_list(blob: bytes):
+    msg = list_parser_class()
+    msg.ParseFromString(blob)
+    return msg
diff --git a/my/google/maps/android.py b/my/google/maps/android.py
new file mode 100644
index 0000000..279231a
--- /dev/null
+++ b/my/google/maps/android.py
@@ -0,0 +1,202 @@
+"""
+Extracts data from the official Google Maps app for Android (uses gmm_sync.db for now)
+"""
+from __future__ import annotations
+
+REQUIRES = [
+    "protobuf",  # for parsing blobs from the database
+]
+
+from dataclasses import dataclass
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any, Iterator, Optional, Sequence
+from urllib.parse import quote
+
+from my.core import datetime_aware, get_files, LazyLogger, Paths, Res
+from my.core.common import unique_everseen
+from my.core.sqlite import sqlite_connection
+
+import my.config
+
+from ._android_protobuf import parse_labeled, parse_list, parse_place
+
+
+logger = LazyLogger(__name__)
+
+
+@dataclass
+class config(my.config.google.maps.android):
+    # path[s]/glob to the exported sqlite databases
+    export_path: Paths
+
+
+def inputs() -> Sequence[Path]:
+    # TODO not sure if we need to use all dbs? possibly the last one contains everything?
+    return get_files(config.export_path)
+
+
+PlaceId = str
+ListId = str
+ListName = str
+
+
+@dataclass(eq=True, frozen=True)
+class Location:
+    lat: float
+    lon: float
+
+    @property
+    def url(self) -> str:
+        return f'https://maps.google.com/?q={self.lat},{self.lon}'
+
+
+@dataclass(unsafe_hash=True)
+class Place:
+    id: PlaceId
+    list_name: ListName  # TODO maybe best to keep list id?
+    created_at: datetime_aware  # TODO double check it's utc?
+    updated_at: datetime_aware  # TODO double check it's utc?
+    title: str
+    location: Location
+    address: Optional[str]
+    note: Optional[str]
+
+    @property
+    def place_url(self) -> str:
+        title = quote(self.title)
+        return f'https://www.google.com/maps/place/{title}/data=!4m2!3m1!1s{self.id}'
+
+    @property
+    def location_url(self) -> str:
+        return self.location.url
+
+
+def _process_one(f: Path):
+    with sqlite_connection(f, row_factory='row') as conn:
+        msg: Any
+
+        lists: dict[ListId, ListName] = {}
+        for row in conn.execute('SELECT * FROM sync_item_data WHERE corpus == 13'):  # 13 looks like lists (e.g. saved/favorited etc)
+            server_id = row['server_id']
+
+            if server_id is None:
+                # this is the case for Travel plans, Followed places, Offers
+                # todo alternatively could use string_index column instead maybe?
+                continue
+
+            blob = row['item_proto']
+            msg = parse_list(blob)
+            name = msg.f1.name
+            lists[server_id] = name
+
+        for row in conn.execute('SELECT * FROM sync_item_data WHERE corpus == 11'):  # this looks like 'Labeled' list
+            ts = row['timestamp'] / 1000
+            created = datetime.fromtimestamp(ts, tz=timezone.utc)
+
+            server_id = row['server_id']
+            [item_type, item_id] = server_id.split(':')
+            if item_type != '3':
+                # the ones that are not 3 are home/work address?
+                continue
+
+            blob = row['item_proto']
+            msg = parse_labeled(blob)
+            address = msg.address.full
+            if address == '':
+                address = None
+
+            location = Location(lat=row['latitude_e6'] / 1e6, lon=row['longitude_e6'] / 1e6)
+
+            yield Place(
+                id=item_id,
+                list_name='Labeled',
+                created_at=created,
+                updated_at=created,  # doesn't look like it has 'updated'?
+                title=msg.title,
+                location=location,
+                address=address,
+                note=None,  # don't think these allow notes
+            )
+
+        for row in conn.execute('SELECT * FROM sync_item_data WHERE corpus == 14'):  # this looks like actual individual places
+            server_id = row['server_id']
+            [list_id, _, id1, id2] = server_id.split(':')
+            item_id = f'{id1}:{id2}'
+
+            list_name = lists[list_id]
+
+            blob = row['item_proto']
+            msg = parse_place(blob)
+            title = msg.f1.title
+            note = msg.f1.note
+            if note == '':  # seems that protobuf does that?
+                note = None
+
+            # TODO double check timezone
+            created = datetime.fromtimestamp(msg.f1.created.seconds, tz=timezone.utc).replace(microsecond=msg.f1.created.nanos // 1000)
+
+            # NOTE: this one seems to be the same as row['timestamp']
+            updated = datetime.fromtimestamp(msg.f1.updated.seconds, tz=timezone.utc).replace(microsecond=msg.f1.updated.nanos // 1000)
+
+            address = msg.f2.addr1  # NOTE: there is also addr2, but they seem identical :shrug:
+            if address == '':
+                address = None
+
+            location = Location(lat=row['latitude_e6'] / 1e6, lon=row['longitude_e6'] / 1e6)
+
+            place = Place(
+                id=item_id,
+                list_name=list_name,
+                created_at=created,
+                updated_at=updated,
+                title=title,
+                location=location,
+                address=address,
+                note=note,
+            )
+
+            # ugh. in my case it's violated by one place by about 1 second??
+            # assert place.created_at <= place.updated_at
+            yield place
+
+
+def saved() -> Iterator[Res[Place]]:
+    def it() -> Iterator[Res[Place]]:
+        paths = inputs()
+        total = len(paths)
+        width = len(str(total))
+        for idx, path in enumerate(paths):
+            logger.info(f'processing [{idx:>{width}}/{total:>{width}}] {path}')
+            yield from _process_one(path)
+    return unique_everseen(it)
+
+
+# Summary of databases on Android (as of 20240101)
+# -1_optimized_threads.notifications.db -- empty
+# 1_optimized_threads.notifications.db -- empty
+# 1_tasks.notifications.db -- empty
+# -1_threads.notifications.db -- empty
+# 1_threads.notifications.db -- doesn't look like anything interesting, some trip anniversaries etc?
+# 1_thread_surveys.notifications.db -- empty
+# 2_threads.notifications.db -- empty
+# accounts.notifications.db -- just one row with account id
+# brella_example_store -- empty
+# gmm_myplaces.db -- contains just a few places? I think it's a subset of "Labeled"
+# gmm_storage.db -- pretty huge, like 50Mb. I suspect it contains cache for places on maps or something
+# gmm_sync.db -- processed above
+# gnp_fcm_database -- list of accounts
+# google_app_measurement_local.db -- empty
+# inbox_notifications.db -- nothing interesting
+# _room_notifications.db -- trip anniversaries?
+# lighter_messaging_1.db -- empty
+# lighter_messaging_2.db -- empty
+# lighter_registration.db -- empty
+# peopleCache__com.google_14.db -- contacts cache or something
+# portable_geller_.db -- looks like analytics
+# primes_example_store -- looks like analytics
+# pseudonymous_room_notifications.db -- looks like analytics
+# ue3.db -- empty
+# ugc_photos_location_data.db -- empty
+# ugc-sync.db -- empty
+# updates-tab-visit.db -- empty
diff --git a/tox.ini b/tox.ini
index e51d0b6..25874f4 100644
--- a/tox.ini
+++ b/tox.ini
@@ -143,6 +143,7 @@ commands =
     my.fbmessenger.export \
     my.github.ghexport \
    my.goodreads \
+    my.google.maps.android \
     my.google.takeout.parser \
     my.hackernews.harmonic \
     my.hypothesis \