my.google.maps: initial module for extracting places data from Android app
commit 87a8a7781b
parent 93e475795d

4 changed files with 320 additions and 0 deletions
@@ -68,6 +68,10 @@ class pinboard:
     export_dir: Paths = ''


 class google:
+    class maps:
+        class android:
+            export_path: Paths = ''
+
     takeout_path: Paths = ''

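For context (editor's note): this only adds an empty config stub, which an HPI user would then override in their own my.config module. A minimal sketch of such an override, with a made-up path:

    # in the user's own my.config (the path below is a placeholder)
    class google:
        class maps:
            class android:
                # path[s]/glob to the sqlite databases exported from the phone
                export_path = '/backups/google-maps/*/gmm_sync.db'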
my/google/maps/_android_protobuf.py (new file, 113 lines)
@@ -0,0 +1,113 @@
from my.core import __NOT_HPI_MODULE__

# NOTE: this tool was quite useful https://github.com/aj3423/aproto

from google.protobuf import descriptor_pool, descriptor_pb2, message_factory

TYPE_STRING = descriptor_pb2.FieldDescriptorProto.TYPE_STRING
TYPE_BYTES = descriptor_pb2.FieldDescriptorProto.TYPE_BYTES
TYPE_UINT64 = descriptor_pb2.FieldDescriptorProto.TYPE_UINT64
TYPE_MESSAGE = descriptor_pb2.FieldDescriptorProto.TYPE_MESSAGE

OPTIONAL = descriptor_pb2.FieldDescriptorProto.LABEL_OPTIONAL
REQUIRED = descriptor_pb2.FieldDescriptorProto.LABEL_REQUIRED


def get_place_protos():
    f1 = descriptor_pb2.DescriptorProto(name='xf1')
    # TODO 2 -> 5 is address? 2 -> 6 is a pair of coordinates
    f1.field.add(name='title', number=3, type=TYPE_STRING, label=REQUIRED)
    f1.field.add(name='note' , number=4, type=TYPE_STRING, label=OPTIONAL)
    # TODO what's the difference between required and optional? doesn't impact decoding?

    ts = descriptor_pb2.DescriptorProto(name='Timestamp')
    ts.field.add(name='seconds', number=1, type=TYPE_UINT64, label=REQUIRED)
    ts.field.add(name='nanos'  , number=2, type=TYPE_UINT64, label=REQUIRED)

    f1.field.add(name='created', number=10, type=TYPE_MESSAGE, label=REQUIRED, type_name=ts.name)
    f1.field.add(name='updated', number=11, type=TYPE_MESSAGE, label=REQUIRED, type_name=ts.name)

    f2 = descriptor_pb2.DescriptorProto(name='xf2')
    f2.field.add(name='addr1', number=2, type=TYPE_STRING, label=REQUIRED)
    f2.field.add(name='addr2', number=3, type=TYPE_STRING, label=REQUIRED)
    f2.field.add(name='f21'  , number=4, type=TYPE_BYTES , label=REQUIRED)
    f2.field.add(name='f22'  , number=5, type=TYPE_UINT64, label=REQUIRED)
    f2.field.add(name='f23'  , number=6, type=TYPE_STRING, label=REQUIRED)
    # NOTE: this also contains place ID

    f3 = descriptor_pb2.DescriptorProto(name='xf3')
    # NOTE: looks like it's the same as 'updated' from above??
    f3.field.add(name='f31', number=1, type=TYPE_UINT64, label=OPTIONAL)

    descriptor_proto = descriptor_pb2.DescriptorProto(name='PlaceParser')
    descriptor_proto.field.add(name='f1', number=1, type=TYPE_MESSAGE, label=REQUIRED, type_name=f1.name)
    descriptor_proto.field.add(name='f2', number=2, type=TYPE_MESSAGE, label=REQUIRED, type_name=f2.name)
    descriptor_proto.field.add(name='f3', number=3, type=TYPE_MESSAGE, label=OPTIONAL, type_name=f3.name)
    descriptor_proto.field.add(name='f4', number=4, type=TYPE_STRING , label=OPTIONAL)
    # NOTE: f4 is the list id

    return [descriptor_proto, ts, f1, f2, f3]


def get_labeled_protos():
    address = descriptor_pb2.DescriptorProto(name='address')
    # 1: address
    # 2: parts of address (multiple)
    # 3: full address
    address.field.add(name='full', number=3, type=TYPE_STRING, label=REQUIRED)

    main = descriptor_pb2.DescriptorProto(name='LabeledParser')
    # field 1 contains item type and item id
    main.field.add(name='title'  , number=3, type=TYPE_STRING , label=REQUIRED)
    main.field.add(name='address', number=5, type=TYPE_MESSAGE, label=OPTIONAL, type_name=address.name)

    return [main, address]


def get_list_protos():
    f1 = descriptor_pb2.DescriptorProto(name='xf1')
    f1.field.add(name='name', number=5, type=TYPE_STRING, label=REQUIRED)

    main = descriptor_pb2.DescriptorProto(name='ListParser')
    main.field.add(name='f1', number=1, type=TYPE_MESSAGE, label=REQUIRED, type_name=f1.name)
    main.field.add(name='f2', number=2, type=TYPE_STRING , label=REQUIRED)

    return [main, f1]


def make_parser(main, *extras):
    file_descriptor_proto = descriptor_pb2.FileDescriptorProto(name='dynamic.proto', package='dynamic_package')
    for proto in [main, *extras]:
        file_descriptor_proto.message_type.add().CopyFrom(proto)

    pool = descriptor_pool.DescriptorPool()
    file_descriptor = pool.Add(file_descriptor_proto)

    message_descriptor = pool.FindMessageTypeByName(f'{file_descriptor_proto.package}.{main.name}')
    factory = message_factory.MessageFactory(pool)
    dynamic_message_class = factory.GetPrototype(message_descriptor)

    return dynamic_message_class
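As an aside (editor's sketch, not part of this commit): the descriptor_pool / message_factory mechanics in make_parser can be sanity-checked on a throwaway message. The 'Demo' message and its field below are invented for illustration, and this assumes a protobuf version where MessageFactory.GetPrototype is available, which the module above already relies on.

    from google.protobuf import descriptor_pb2
    from my.google.maps._android_protobuf import make_parser

    demo = descriptor_pb2.DescriptorProto(name='Demo')  # made-up message, not from the Maps schema
    demo.field.add(
        name='greeting', number=1,
        type=descriptor_pb2.FieldDescriptorProto.TYPE_STRING,
        label=descriptor_pb2.FieldDescriptorProto.LABEL_OPTIONAL,
    )
    Demo = make_parser(demo)

    original = Demo(greeting='hello')
    parsed = Demo()
    parsed.ParseFromString(original.SerializeToString())
    assert parsed.greeting == 'hello'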

place_parser_class = make_parser(*get_place_protos())
labeled_parser_class = make_parser(*get_labeled_protos())
list_parser_class = make_parser(*get_list_protos())


def parse_place(blob: bytes):
    m = place_parser_class()
    m.ParseFromString(blob)
    return m


def parse_labeled(blob: bytes):
    m = labeled_parser_class()
    m.ParseFromString(blob)
    return m


def parse_list(blob: bytes):
    msg = list_parser_class()
    msg.ParseFromString(blob)
    return msg
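For orientation (editor's sketch, not part of this commit): these parsers are meant to be fed raw item_proto blobs from gmm_sync.db, which is what my/google/maps/android.py below does. A minimal standalone decode might look like this; the database path is a placeholder, and it assumes the sync_item_data table contains at least one corpus == 14 row:

    import sqlite3
    from my.google.maps._android_protobuf import parse_place

    with sqlite3.connect('/path/to/gmm_sync.db') as conn:  # placeholder path
        conn.row_factory = sqlite3.Row
        row = conn.execute('SELECT * FROM sync_item_data WHERE corpus == 14 LIMIT 1').fetchone()

    msg = parse_place(row['item_proto'])
    print(msg.f1.title, msg.f1.created.seconds)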
my/google/maps/android.py (new file, 202 lines)
@@ -0,0 +1,202 @@
"""
|
||||||
|
Extracts data from the official Google Maps app for Android (uses gmm_sync.db for now)
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
REQUIRES = [
|
||||||
|
"protobuf", # for parsing blobs from the database
|
||||||
|
]
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Iterator, Optional, Sequence
|
||||||
|
from urllib.parse import quote
|
||||||
|
|
||||||
|
from my.core import datetime_aware, get_files, LazyLogger, Paths, Res
|
||||||
|
from my.core.common import unique_everseen
|
||||||
|
from my.core.sqlite import sqlite_connection
|
||||||
|
|
||||||
|
import my.config
|
||||||
|
|
||||||
|
from ._android_protobuf import parse_labeled, parse_list, parse_place
|
||||||
|
|
||||||
|
|
||||||
|
logger = LazyLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class config(my.config.google.maps.android):
|
||||||
|
# paths[s]/glob to the exported sqlite databases
|
||||||
|
export_path: Paths
|
||||||
|
|
||||||
|
|
||||||
|
def inputs() -> Sequence[Path]:
|
||||||
|
# TODO note sure if need to use all dbs? possibly the last one contains everything?
|
||||||
|
return get_files(config.export_path)
|
||||||
|
|
||||||
|
|
||||||
|
PlaceId = str
|
||||||
|
ListId = str
|
||||||
|
ListName = str
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(eq=True, frozen=True)
|
||||||
|
class Location:
|
||||||
|
lat: float
|
||||||
|
lon: float
|
||||||
|
|
||||||
|
@property
|
||||||
|
def url(self) -> str:
|
||||||
|
return f'https://maps.google.com/?q={self.lat},{self.lon}'
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(unsafe_hash=True)
|
||||||
|
class Place:
|
||||||
|
id: PlaceId
|
||||||
|
list_name: ListName # TODO maybe best to keep list id?
|
||||||
|
created_at: datetime_aware # TODO double check it's utc?
|
||||||
|
updated_at: datetime_aware # TODO double check it's utc?
|
||||||
|
title: str
|
||||||
|
location: Location
|
||||||
|
address: Optional[str]
|
||||||
|
note: Optional[str]
|
||||||
|
|
||||||
|
@property
|
||||||
|
def place_url(self) -> str:
|
||||||
|
title = quote(self.title)
|
||||||
|
return f'https://www.google.com/maps/place/{title}/data=!4m2!3m1!1s{self.id}'
|
||||||
|
|
||||||
|
@property
|
||||||
|
def location_url(self) -> str:
|
||||||
|
return self.location.url
|
||||||
|
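A quick worked example (editor's note) of the microdegree convention used below: the database apparently stores latitude_e6 / longitude_e6 as integer microdegrees, so dividing by 1e6 recovers degrees. With made-up values:

    loc = Location(lat=51_500_000 / 1e6, lon=-123_456 / 1e6)
    print(loc.url)  # https://maps.google.com/?q=51.5,-0.123456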


def _process_one(f: Path):
    with sqlite_connection(f, row_factory='row') as conn:
        msg: Any

        lists: dict[ListId, ListName] = {}
        for row in conn.execute('SELECT * FROM sync_item_data WHERE corpus == 13'):  # 13 looks like lists (e.g. saved/favorited etc)
            server_id = row['server_id']

            if server_id is None:
                # this is the case for Travel plans, Followed places, Offers
                # todo alternatively could use string_index column instead maybe?
                continue

            blob = row['item_proto']
            msg = parse_list(blob)
            name = msg.f1.name
            lists[server_id] = name

        for row in conn.execute('SELECT * FROM sync_item_data WHERE corpus == 11'):  # this looks like 'Labeled' list
            ts = row['timestamp'] / 1000
            created = datetime.fromtimestamp(ts, tz=timezone.utc)

            server_id = row['server_id']
            [item_type, item_id] = server_id.split(':')
            if item_type != '3':
                # the ones that are not 3 are home/work address?
                continue

            blob = row['item_proto']
            msg = parse_labeled(blob)
            address = msg.address.full
            if address == '':
                address = None

            location = Location(lat=row['latitude_e6'] / 1e6, lon=row['longitude_e6'] / 1e6)

            yield Place(
                id=item_id,
                list_name='Labeled',
                created_at=created,
                updated_at=created,  # doesn't look like it has 'updated'?
                title=msg.title,
                location=location,
                address=address,
                note=None,  # don't think these allow notes
            )

        for row in conn.execute('SELECT * FROM sync_item_data WHERE corpus == 14'):  # this looks like actual individual places
            server_id = row['server_id']
            [list_id, _, id1, id2] = server_id.split(':')
            item_id = f'{id1}:{id2}'

            list_name = lists[list_id]

            blob = row['item_proto']
            msg = parse_place(blob)
            title = msg.f1.title
            note = msg.f1.note
            if note == '':  # seems that protobuf does that?
                note = None

            # TODO double check timezone
            created = datetime.fromtimestamp(msg.f1.created.seconds, tz=timezone.utc).replace(microsecond=msg.f1.created.nanos // 1000)

            # NOTE: this one seems to be the same as row['timestamp']
            updated = datetime.fromtimestamp(msg.f1.updated.seconds, tz=timezone.utc).replace(microsecond=msg.f1.updated.nanos // 1000)

            address = msg.f2.addr1  # NOTE: there is also addr2, but they seem identical :shrug:
            if address == '':
                address = None

            location = Location(lat=row['latitude_e6'] / 1e6, lon=row['longitude_e6'] / 1e6)

            place = Place(
                id=item_id,
                list_name=list_name,
                created_at=created,
                updated_at=updated,
                title=title,
                location=location,
                address=address,
                note=note,
            )

            # ugh. in my case it's violated by one place by about 1 second??
            # assert place.created_at <= place.updated_at
            yield place
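As an aside (editor's sketch, not part of this commit): the corpus ids hardcoded above (13 for lists, 11 for 'Labeled', 14 for individual places) could be double-checked against one's own database with a quick query; the path is a placeholder:

    import sqlite3

    with sqlite3.connect('/path/to/gmm_sync.db') as conn:  # placeholder path
        for corpus, count in conn.execute('SELECT corpus, COUNT(*) FROM sync_item_data GROUP BY corpus ORDER BY corpus'):
            print(corpus, count)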


def saved() -> Iterator[Res[Place]]:
    def it() -> Iterator[Res[Place]]:
        paths = inputs()
        total = len(paths)
        width = len(str(total))
        for idx, path in enumerate(paths):
            logger.info(f'processing [{idx:>{width}}/{total:>{width}}] {path}')
            yield from _process_one(path)

    return unique_everseen(it)
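Typical consumption (editor's sketch): saved() follows the usual HPI pattern of yielding Res[Place], so callers should be prepared for exceptions among the results:

    import my.google.maps.android as gmaps

    for p in gmaps.saved():
        if isinstance(p, Exception):  # Res[Place] may carry an error instead of a Place
            print('error:', p)
            continue
        print(p.created_at, p.list_name, p.title, p.location_url)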


# Summary of databases on Android (as of 20240101)
# -1_optimized_threads.notifications.db -- empty
# 1_optimized_threads.notifications.db -- empty
# 1_tasks.notifications.db -- empty
# -1_threads.notifications.db -- empty
# 1_threads.notifications.db -- doesn't look like anything interesting, some trip anniversaries etc?
# 1_thread_surveys.notifications.db -- empty
# 2_threads.notifications.db -- empty
# accounts.notifications.db -- just one row with account id
# brella_example_store -- empty
# gmm_myplaces.db -- contains just a few places? I think it's a subset of "Labeled"
# gmm_storage.db -- pretty huge, like 50Mb. I suspect it contains cache for places on maps or something
# gmm_sync.db -- processed above
# gnp_fcm_database -- list of accounts
# google_app_measurement_local.db -- empty
# inbox_notifications.db -- nothing interesting
# <email>_room_notifications.db -- trip anniversaries?
# lighter_messaging_1.db -- empty
# lighter_messaging_2.db -- empty
# lighter_registration.db -- empty
# peopleCache_<email>_com.google_14.db -- contacts cache or something
# portable_geller_<email>.db -- looks like analytics
# primes_example_store -- looks like analytics
# pseudonymous_room_notifications.db -- looks like analytics
# ue3.db -- empty
# ugc_photos_location_data.db -- empty
# ugc-sync.db -- empty
# updates-tab-visit.db -- empty
tox.ini (1 addition)
@@ -143,6 +143,7 @@ commands =
     my.fbmessenger.export \
     my.github.ghexport \
     my.goodreads \
+    my.google.maps.android \
     my.google.takeout.parser \
     my.hackernews.harmonic \
     my.hypothesis \