some enhancements for facebook/instagram modules

figured out that datetimes are naive
better username handling + investigation of thread names
This commit is contained in:
Dima Gerasimov 2022-06-04 10:29:50 +01:00 committed by karlicoss
parent 7323e99504
commit b9d788efd0
4 changed files with 29 additions and 13 deletions

View file

@ -38,10 +38,12 @@ class Thread:
name: Optional[str] name: Optional[str]
# todo not sure about order of fields... # todo not sure about order of fields...
from ..core import datetime_naive
@dataclass @dataclass
class _BaseMessage: class _BaseMessage:
id: str id: str
dt: datetime # checked against a message sent on 4 may 2022, and it does look naive
dt: datetime_naive
text: Optional[str] text: Optional[str]

View file

@ -43,3 +43,9 @@ def _merge_messages(*sources: Iterator[Res[Message]]) -> Iterator[Res[Message]]:
# similar to twitter, might make sense to generify/document as a pattern # similar to twitter, might make sense to generify/document as a pattern
return (r.id, r.dt) return (r.id, r.dt)
yield from unique_everseen(chain(*sources), key=key) yield from unique_everseen(chain(*sources), key=key)
# TODO some notes about gdpr export (since there is no module yet)
# ugh, messages seem to go from new to old in messages_N.json files as N increases :facepalm:
# seems like it's storing local timestamp :facepalm:
# checked against a message sent on 4 may 2022

View file

@ -32,11 +32,13 @@ class User:
full_name: str full_name: str
from ..core import datetime_naive
# todo not sure about order of fields... # todo not sure about order of fields...
@dataclass @dataclass
class _BaseMessage: class _BaseMessage:
id: str id: str
created: datetime # NOTE: ffs, looks like they keep naive timestamps in the db (checked some random messages)
created: datetime_naive
text: str text: str
thread_id: str thread_id: str
@ -82,7 +84,6 @@ def _parse_message(j: Json) -> Optional[_Message]:
t = j['item_type'] t = j['item_type']
tid = j['thread_key']['thread_id'] tid = j['thread_key']['thread_id']
uid = j['user_id'] uid = j['user_id']
# TODO not sure if utc??
created = datetime.fromtimestamp(int(j['timestamp']) / 1_000_000) created = datetime.fromtimestamp(int(j['timestamp']) / 1_000_000)
text: str text: str
if t == 'text': if t == 'text':
@ -120,14 +121,11 @@ def _entities() -> Iterator[Res[Union[User, _Message]]]:
with sqlite_connect_immutable(f) as db: with sqlite_connect_immutable(f) as db:
for (self_uid, thread_json) in select(('user_id', 'thread_info'), 'FROM threads', db=db): for (self_uid, thread_json) in select(('user_id', 'thread_info'), 'FROM threads', db=db):
# ugh wtf?? no easier way to extract your own user id/name??
yield User(
id=str(self_uid),
full_name='You',
username='you',
)
j = json.loads(thread_json) j = json.loads(thread_json)
for r in j['recipients']: # todo in principle should leave the thread attached to the message?
# since thread is a group of users?
# inviter usually contains our own user
for r in [j['inviter'], *j['recipients']]:
yield User( yield User(
id=str(r['id']), # for some reason it's int in the db id=str(r['id']), # for some reason it's int in the db
full_name=r['full_name'], full_name=r['full_name'],

View file

@ -31,12 +31,15 @@ class User:
full_name: str full_name: str
from ..core import datetime_naive
@dataclass @dataclass
class _BaseMessage: class _BaseMessage:
# TODO id is missing? # ugh, this is insane, but does look like it's just keeping local device time???
created: datetime # checked against a message sent on 3 June, which should be UTC+1, but timestamp seems local
created: datetime_naive
text: str text: str
thread_id: str thread_id: str
# NOTE: doesn't look like there are any meaningful message ids in the export
@dataclass(unsafe_hash=True) @dataclass(unsafe_hash=True)
@ -100,7 +103,14 @@ def _entities() -> Iterator[Res[Union[User, _Message]]]:
j = json.loads(ffile.read_text()) j = json.loads(ffile.read_text())
id_len = 10 id_len = 10
# NOTE: no match in android db/api responses? # NOTE: I'm not actually sure it's other user's id.., since it corresponds to the whole conversation
# but I stared a bit at these ids vs database ids and can't see any way to find the correspondence :(
# so basically the only way to merge is to actually try some magic and correlate timestamps/message texts?
# another option is perhaps to query user id from username with some free API
# it's still fragile: e.g. if user deletes themselves there is no more username (it becomes "instagramuser")
# if we use older exports we might be able to figure it out though... so think about it?
# it also names grouped ones like instagramuserchrisfoodishblogand25others_einihreoog
# so I feel like there is just no guaranteed way to correlate :(
other_id = fname[-id_len:] other_id = fname[-id_len:]
# NOTE: no match in android db? # NOTE: no match in android db?
other_username = fname[:-id_len - 1] other_username = fname[:-id_len - 1]