my.tinder: initial module for android databases

This commit is contained in:
Dima Gerasimov 2022-06-04 16:57:10 +01:00 committed by karlicoss
parent b9d788efd0
commit fd0c65d176
3 changed files with 225 additions and 0 deletions

View file

@ -122,6 +122,11 @@ class bumble:
export_path: Paths export_path: Paths
class tinder:
class android:
export_path: Paths
class instagram: class instagram:
class android: class android:
export_path: Paths export_path: Paths

218
my/tinder/android.py Normal file
View file

@ -0,0 +1,218 @@
"""
Tinder data from Android app database (in =/data/data/com.tinder/databases/tinder-3.db=)
"""
from __future__ import annotations
# third-party packages this module depends on
# NOTE(review): presumably this is what `hpi module install my.tinder.android` reads -- confirm
REQUIRES = ['dataset']
from collections import defaultdict
from dataclasses import dataclass
from datetime import datetime, timezone
from itertools import chain
from pathlib import Path
from typing import Sequence, Iterator, Union, Dict, List, Mapping
from more_itertools import unique_everseen
from my.core import Paths, get_files, Res, assert_never, stat, Stats, datetime_aware
from my.core.dataset import connect_readonly, DatabaseT
from my.config import tinder as user_config
@dataclass
class config(user_config.android):
    """Settings for this module, inherited from the user's ``tinder.android`` config section."""

    # path[s]/glob to the exported sqlite databases
    export_path: Paths
@dataclass(unsafe_hash=True)
class Person:
    """A Tinder user: either the account owner or somebody they matched with.

    Hashable, so instances can be deduplicated and used as dictionary keys.
    """

    id: str
    name: str
    # todo bio? it might change, not sure what do we want here
@dataclass(unsafe_hash=True)
class _BaseMatch:
    """Fields common to the raw (`_Match`) and the resolved (`Match`) match records."""

    # moment the match happened
    # (for android, checked directly shortly after a match)
    when: datetime_aware
    id: str
# unsafe_hash=True: a plain @dataclass (eq without hash) is unhashable, which
# pushes unique_everseen onto its slow linear-scan path when deduplicating
@dataclass(unsafe_hash=True)
class _Match(_BaseMatch):
    """Raw match row: the counterpart is referenced by id only, not yet resolved to a `Person`."""

    person_id: str
@dataclass(unsafe_hash=True)
class Match(_BaseMatch):
    """A match whose counterpart has been resolved to a full `Person` object."""

    person: Person
# todo again, not sure what's the 'optimal' field order? perhaps the one which gives the most natural sort?
# so either match id or datetime
@dataclass
class _BaseMessage:
    """Fields common to the raw (`_Message`) and the resolved (`Message`) message records."""

    # moment the message was sent
    # (looks like gdpr takeout does contain GMT -- compared against google maps data)
    sent: datetime_aware
    id: str
    text: str
# unsafe_hash=True: a plain @dataclass (eq without hash) is unhashable, which
# pushes unique_everseen onto its slow linear-scan path -- quadratic in the
# number of messages when the same rows show up in several database snapshots
@dataclass(unsafe_hash=True)
class _Message(_BaseMessage):
    """Raw message row: match, sender and recipient are referenced by id only."""

    match_id: str
    from_id: str
    to_id: str
# hashable like Person and Match, so resolved messages can be deduplicated
# or collected into sets by callers
@dataclass(unsafe_hash=True)
class Message(_BaseMessage):
    """A message whose match, sender and recipient have been resolved to full objects."""

    match: Match
    # trailing underscore because `from` is a reserved word
    from_: Person
    to: Person
def inputs() -> Sequence[Path]:
    """Return the database files that ``config.export_path`` resolves to (via `get_files`)."""
    export_path = config.export_path
    return get_files(export_path)
# records as parsed from a single database: cross-references are plain ids
_Entity = Union[
    Person,
    _Match,
    _Message,
]
# records after resolution: cross-references point at the actual objects
Entity = Union[
    Person,
    Match,
    Message,
]
def _entities() -> Iterator[Res[_Entity]]:
    """Stream raw entities (or parse errors) from every input database, one database at a time."""
    for path in inputs():
        # the read-only connection stays open only while this database is being consumed
        with connect_readonly(path) as database:
            yield from _handle_db(database)
def _handle_db(db: DatabaseT) -> Iterator[Res[_Entity]]:
    """Parse persons, then matches, then messages out of a single database.

    A row that fails to parse is yielded as the raised exception instead of
    aborting the iteration, so the remaining rows are still processed.
    """

    def _parsed(rows, parse) -> Iterator[Res[_Entity]]:
        # apply `parse` to each row, converting failures into yielded errors
        for row in rows:
            try:
                yield parse(row)
            except Exception as err:
                # todo attach error context?
                yield err

    # profile_user_view contains our own user id
    yield from _parsed(chain(db['profile_user_view'], db['match_person']), _parse_person)
    yield from _parsed(db['match'], _parse_match)
    yield from _parsed(db['message'], _parse_msg)
def _parse_person(row) -> Person:
    """Build a `Person` from a row exposing ``id`` and ``name`` keys."""
    person_id = row['id']
    person_name = row['name']
    return Person(id=person_id, name=person_name)
def _parse_match(row) -> _Match:
    """Build a raw `_Match` from a row exposing ``id``, ``person_id`` and ``creation_date`` keys."""
    match_id = row['id']
    person_id = row['person_id']
    # creation_date is divided by 1000 before being read as a UTC epoch timestamp
    created = datetime.fromtimestamp(row['creation_date'] / 1000, tz=timezone.utc)
    return _Match(id=match_id, person_id=person_id, when=created)
def _parse_msg(row) -> _Message:
    """Build a raw `_Message` from a row of the ``message`` table."""
    # note it also has raw_message_data -- not sure which is best to use..
    sent_raw = row['sent_date']
    # sent_date is divided by 1000 before being read as a UTC epoch timestamp
    sent_at = datetime.fromtimestamp(sent_raw / 1000, tz=timezone.utc)
    # remaining columns map one-to-one onto the dataclass fields
    rest = {key: row[key] for key in ('id', 'text', 'match_id', 'from_id', 'to_id')}
    return _Message(sent=sent_at, **rest)
# todo maybe it's rich_entities method?
def entities() -> Iterator[Res[Entity]]:
    """Yield deduplicated persons, matches and messages with cross-references resolved.

    Raw records are resolved against the persons and matches seen so far.
    Errors from parsing are passed through unchanged. A record whose
    reference cannot be looked up is replaced by the lookup exception.
    """
    persons: Dict[str, Person] = {}
    matches: Dict[str, Match] = {}
    for raw in unique_everseen(_entities()):
        if isinstance(raw, Exception):
            yield raw
        elif isinstance(raw, Person):
            persons[raw.id] = raw
            yield raw
        elif isinstance(raw, _Match):
            try:
                counterpart = persons[raw.person_id]
            except Exception as err:
                yield err
            else:
                resolved = Match(
                    id=raw.id,
                    when=raw.when,
                    person=counterpart,
                )
                matches[raw.id] = resolved
                yield resolved
        elif isinstance(raw, _Message):
            try:
                match = matches[raw.match_id]
                sender = persons[raw.from_id]
                recipient = persons[raw.to_id]
            except Exception as err:
                yield err
            else:
                yield Message(
                    sent=raw.sent,
                    match=match,
                    id=raw.id,
                    text=raw.text,
                    from_=sender,
                    to=recipient,
                )
        else:
            # every member of the raw entity union is handled above
            assert_never(raw)
def messages() -> Iterator[Res[Message]]:
    """Yield only the messages (and errors) produced by `entities`, skipping everything else."""
    for item in entities():
        if not isinstance(item, (Exception, Message)):
            continue
        yield item
# todo not sure, maybe it's not fundamental enough to keep here...
def match2messages() -> Iterator[Res[Mapping[Match, Sequence[Message]]]]:
    """Group messages by the match they belong to.

    Errors coming from `entities` are yielded as they are encountered. The
    final item yielded is the mapping itself, holding every match seen
    (including matches without any messages) and its messages in the order
    they were encountered.
    """
    res: Dict[Match, List[Message]] = defaultdict(list)
    for x in entities():
        if isinstance(x, Exception):
            yield x
        elif isinstance(x, Match):
            # match might happen without messages so makes sense to handle here
            res.setdefault(x, [])
        elif isinstance(x, Message):
            # res is a defaultdict, so this lookup cannot fail on a missing key
            res[x.match].append(x)
        # anything else (i.e. a Person) is not relevant to the grouping
    yield res
    # TODO maybe a more natural return type is Iterator[Res[Tuple[Key, Value]]]
    # but this doesn't work straight away because the key might have no corresponding values
def stats() -> Stats:
    """Return whatever `stat` reports for the `messages` provider."""
    report = stat(messages)
    return report

View file

@ -101,6 +101,7 @@ commands =
hpi module install my.reddit.rexport hpi module install my.reddit.rexport
hpi module install my.reddit.pushshift hpi module install my.reddit.pushshift
hpi module install my.stackexchange.stexport hpi module install my.stackexchange.stexport
hpi module install my.tinder.android
hpi module install my.pinboard hpi module install my.pinboard
hpi module install my.arbtt hpi module install my.arbtt
hpi module install my.coding.commits hpi module install my.coding.commits
@ -140,6 +141,7 @@ commands =
-p my.goodreads \ -p my.goodreads \
-p my.pdfs \ -p my.pdfs \
-p my.bumble.android \ -p my.bumble.android \
-p my.tinder.android \
--txt-report .coverage.mypy-misc \ --txt-report .coverage.mypy-misc \
--html-report .coverage.mypy-misc \ --html-report .coverage.mypy-misc \
{posargs} {posargs}