my.tinder: initial module for android databases
This commit is contained in:
parent
b9d788efd0
commit
fd0c65d176
3 changed files with 225 additions and 0 deletions
|
@ -122,6 +122,11 @@ class bumble:
|
|||
export_path: Paths
|
||||
|
||||
|
||||
class tinder:
|
||||
class android:
|
||||
export_path: Paths
|
||||
|
||||
|
||||
class instagram:
|
||||
class android:
|
||||
export_path: Paths
|
||||
|
|
218
my/tinder/android.py
Normal file
218
my/tinder/android.py
Normal file
|
@ -0,0 +1,218 @@
|
|||
"""
|
||||
Tinder data from Android app database (in =/data/data/com.tinder/databases/tinder-3.db=)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
REQUIRES = ['dataset']
|
||||
|
||||
from collections import defaultdict
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime, timezone
|
||||
from itertools import chain
|
||||
from pathlib import Path
|
||||
from typing import Sequence, Iterator, Union, Dict, List, Mapping
|
||||
|
||||
from more_itertools import unique_everseen
|
||||
|
||||
from my.core import Paths, get_files, Res, assert_never, stat, Stats, datetime_aware
|
||||
from my.core.dataset import connect_readonly, DatabaseT
|
||||
|
||||
|
||||
from my.config import tinder as user_config
|
||||
@dataclass
|
||||
class config(user_config.android):
|
||||
# paths[s]/glob to the exported sqlite databases
|
||||
export_path: Paths
|
||||
|
||||
|
||||
@dataclass(unsafe_hash=True)
|
||||
class Person:
|
||||
id: str
|
||||
name: str
|
||||
# todo bio? it might change, not sure what do we want here
|
||||
|
||||
|
||||
@dataclass(unsafe_hash=True)
|
||||
class _BaseMatch:
|
||||
# for android, checked directly shortly after a match
|
||||
when: datetime_aware
|
||||
id: str
|
||||
|
||||
|
||||
@dataclass
|
||||
class _Match(_BaseMatch):
|
||||
person_id: str
|
||||
|
||||
|
||||
@dataclass(unsafe_hash=True)
|
||||
class Match(_BaseMatch):
|
||||
person: Person
|
||||
|
||||
|
||||
# todo again, not sure what's the 'optimal' field order? perhaps the one which gives the most natural sort?
|
||||
# so either match id or datetime
|
||||
@dataclass
|
||||
class _BaseMessage:
|
||||
# looks like gdpr takeout does contain GMT (compared against google maps data)
|
||||
sent: datetime_aware
|
||||
id: str
|
||||
text: str
|
||||
|
||||
|
||||
@dataclass
|
||||
class _Message(_BaseMessage):
|
||||
match_id: str
|
||||
from_id: str
|
||||
to_id: str
|
||||
|
||||
|
||||
@dataclass
|
||||
class Message(_BaseMessage):
|
||||
match: Match
|
||||
from_: Person
|
||||
to: Person
|
||||
|
||||
|
||||
def inputs() -> Sequence[Path]:
|
||||
return get_files(config.export_path)
|
||||
|
||||
|
||||
_Entity = Union[Person, _Match, _Message]
|
||||
Entity = Union[Person, Match, Message]
|
||||
|
||||
|
||||
def _entities() -> Iterator[Res[_Entity]]:
|
||||
for db_file in inputs():
|
||||
with connect_readonly(db_file) as db:
|
||||
yield from _handle_db(db)
|
||||
|
||||
|
||||
def _handle_db(db: DatabaseT) -> Iterator[Res[_Entity]]:
|
||||
# profile_user_view contains our own user id
|
||||
for row in chain(db['profile_user_view'], db['match_person']):
|
||||
try:
|
||||
yield _parse_person(row)
|
||||
except Exception as e:
|
||||
# todo attach error contex?
|
||||
yield e
|
||||
|
||||
for row in db['match']:
|
||||
try:
|
||||
yield _parse_match(row)
|
||||
except Exception as e:
|
||||
yield e
|
||||
|
||||
for row in db['message']:
|
||||
try:
|
||||
yield _parse_msg(row)
|
||||
except Exception as e:
|
||||
yield e
|
||||
|
||||
|
||||
def _parse_person(row) -> Person:
|
||||
return Person(
|
||||
id=row['id'],
|
||||
name=row['name'],
|
||||
)
|
||||
|
||||
|
||||
def _parse_match(row) -> _Match:
|
||||
return _Match(
|
||||
id=row['id'],
|
||||
person_id=row['person_id'],
|
||||
when=datetime.fromtimestamp(row['creation_date'] / 1000, tz=timezone.utc),
|
||||
)
|
||||
|
||||
|
||||
def _parse_msg(row) -> _Message:
|
||||
# note it also has raw_message_data -- not sure which is best to use..
|
||||
sent = row['sent_date']
|
||||
return _Message(
|
||||
sent=datetime.fromtimestamp(sent / 1000, tz=timezone.utc),
|
||||
id=row['id'],
|
||||
text=row['text'],
|
||||
match_id=row['match_id'],
|
||||
from_id=row['from_id'],
|
||||
to_id=row['to_id'],
|
||||
)
|
||||
|
||||
|
||||
# todo maybe it's rich_entities method?
|
||||
def entities() -> Iterator[Res[Entity]]:
|
||||
id2person: Dict[str, Person] = {}
|
||||
id2match : Dict[str, Match ] = {}
|
||||
for x in unique_everseen(_entities()):
|
||||
if isinstance(x, Exception):
|
||||
yield x
|
||||
continue
|
||||
if isinstance(x, Person):
|
||||
id2person[x.id] = x
|
||||
yield x
|
||||
continue
|
||||
if isinstance(x, _Match):
|
||||
try:
|
||||
person = id2person[x.person_id]
|
||||
except Exception as e:
|
||||
yield e
|
||||
continue
|
||||
m = Match(
|
||||
id=x.id,
|
||||
when=x.when,
|
||||
person=person,
|
||||
)
|
||||
id2match[x.id] = m
|
||||
yield m
|
||||
continue
|
||||
if isinstance(x, _Message):
|
||||
try:
|
||||
match = id2match[x.match_id]
|
||||
from_ = id2person[x.from_id]
|
||||
to = id2person[x.to_id]
|
||||
except Exception as e:
|
||||
yield e
|
||||
continue
|
||||
yield Message(
|
||||
sent=x.sent,
|
||||
match=match,
|
||||
id=x.id,
|
||||
text=x.text,
|
||||
from_=from_,
|
||||
to=to,
|
||||
)
|
||||
continue
|
||||
assert_never(x)
|
||||
|
||||
|
||||
def messages() -> Iterator[Res[Message]]:
|
||||
for x in entities():
|
||||
if isinstance(x, (Exception, Message)):
|
||||
yield x
|
||||
continue
|
||||
|
||||
|
||||
# todo not sure, maybe it's not fundamental enough to keep here...
|
||||
def match2messages() -> Iterator[Res[Mapping[Match, Sequence[Message]]]]:
|
||||
res: Dict[Match, List[Message]] = defaultdict(list)
|
||||
for x in entities():
|
||||
if isinstance(x, Exception):
|
||||
yield x
|
||||
continue
|
||||
if isinstance(x, Match):
|
||||
# match might happen without messages so makes sense to handle here
|
||||
res[x] # just trigger creation
|
||||
continue
|
||||
if isinstance(x, Message):
|
||||
try:
|
||||
ml = res[x.match]
|
||||
except Exception as e:
|
||||
yield e
|
||||
continue
|
||||
ml.append(x)
|
||||
continue
|
||||
yield res
|
||||
# TODO maybe a more natural return type is Iterator[Res[Tuple[Key, Value]]]
|
||||
# but this doesn't work straight away because the key might have no corresponding values
|
||||
|
||||
|
||||
def stats() -> Stats:
|
||||
return stat(messages)
|
2
tox.ini
2
tox.ini
|
@ -101,6 +101,7 @@ commands =
|
|||
hpi module install my.reddit.rexport
|
||||
hpi module install my.reddit.pushshift
|
||||
hpi module install my.stackexchange.stexport
|
||||
hpi module install my.tinder.android
|
||||
hpi module install my.pinboard
|
||||
hpi module install my.arbtt
|
||||
hpi module install my.coding.commits
|
||||
|
@ -140,6 +141,7 @@ commands =
|
|||
-p my.goodreads \
|
||||
-p my.pdfs \
|
||||
-p my.bumble.android \
|
||||
-p my.tinder.android \
|
||||
--txt-report .coverage.mypy-misc \
|
||||
--html-report .coverage.mypy-misc \
|
||||
{posargs}
|
||||
|
|
Loading…
Add table
Reference in a new issue