some enhancements for facebook/instagram modules
figured out that datetimes are naive; better username handling + investigation of thread names
This commit is contained in:
parent
7323e99504
commit
b9d788efd0
4 changed files with 29 additions and 13 deletions
|
@ -38,10 +38,12 @@ class Thread:
|
||||||
name: Optional[str]
|
name: Optional[str]
|
||||||
|
|
||||||
# todo not sure about order of fields...
|
# todo not sure about order of fields...
|
||||||
|
from ..core import datetime_naive
|
||||||
@dataclass
|
@dataclass
|
||||||
class _BaseMessage:
|
class _BaseMessage:
|
||||||
id: str
|
id: str
|
||||||
dt: datetime
|
# checked against a message sent on 4 may 2022, and it does look naive
|
||||||
|
dt: datetime_naive
|
||||||
text: Optional[str]
|
text: Optional[str]
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -43,3 +43,9 @@ def _merge_messages(*sources: Iterator[Res[Message]]) -> Iterator[Res[Message]]:
|
||||||
# similar to twitter, might make sense to generify/document as a pattern
|
# similar to twitter, might make sense to generify/document as a pattern
|
||||||
return (r.id, r.dt)
|
return (r.id, r.dt)
|
||||||
yield from unique_everseen(chain(*sources), key=key)
|
yield from unique_everseen(chain(*sources), key=key)
|
||||||
|
|
||||||
|
|
||||||
|
# TODO some notes about gdpr export (since there is no module yet)
|
||||||
|
# ugh, messages seem to go from new to old in messages_N.json files as N increases :facepalm:
|
||||||
|
# seems like it's storing local timestamp :facepalm:
|
||||||
|
# checked against a message sent on 4 may 2022
|
||||||
|
|
|
@ -32,11 +32,13 @@ class User:
|
||||||
full_name: str
|
full_name: str
|
||||||
|
|
||||||
|
|
||||||
|
from ..core import datetime_naive
|
||||||
# todo not sure about order of fields...
|
# todo not sure about order of fields...
|
||||||
@dataclass
|
@dataclass
|
||||||
class _BaseMessage:
|
class _BaseMessage:
|
||||||
id: str
|
id: str
|
||||||
created: datetime
|
# NOTE: ffs, looks like they keep naive timestamps in the db (checked some random messages)
|
||||||
|
created: datetime_naive
|
||||||
text: str
|
text: str
|
||||||
thread_id: str
|
thread_id: str
|
||||||
|
|
||||||
|
@ -82,7 +84,6 @@ def _parse_message(j: Json) -> Optional[_Message]:
|
||||||
t = j['item_type']
|
t = j['item_type']
|
||||||
tid = j['thread_key']['thread_id']
|
tid = j['thread_key']['thread_id']
|
||||||
uid = j['user_id']
|
uid = j['user_id']
|
||||||
# TODO not sure if utc??
|
|
||||||
created = datetime.fromtimestamp(int(j['timestamp']) / 1_000_000)
|
created = datetime.fromtimestamp(int(j['timestamp']) / 1_000_000)
|
||||||
text: str
|
text: str
|
||||||
if t == 'text':
|
if t == 'text':
|
||||||
|
@ -120,14 +121,11 @@ def _entities() -> Iterator[Res[Union[User, _Message]]]:
|
||||||
with sqlite_connect_immutable(f) as db:
|
with sqlite_connect_immutable(f) as db:
|
||||||
|
|
||||||
for (self_uid, thread_json) in select(('user_id', 'thread_info'), 'FROM threads', db=db):
|
for (self_uid, thread_json) in select(('user_id', 'thread_info'), 'FROM threads', db=db):
|
||||||
# ugh wtf?? no easier way to extract your own user id/name??
|
|
||||||
yield User(
|
|
||||||
id=str(self_uid),
|
|
||||||
full_name='You',
|
|
||||||
username='you',
|
|
||||||
)
|
|
||||||
j = json.loads(thread_json)
|
j = json.loads(thread_json)
|
||||||
for r in j['recipients']:
|
# todo in principle should leave the thread attached to the message?
|
||||||
|
# since thread is a group of users?
|
||||||
|
# inviter usually contains our own user
|
||||||
|
for r in [j['inviter'], *j['recipients']]:
|
||||||
yield User(
|
yield User(
|
||||||
id=str(r['id']), # for some reason it's int in the db
|
id=str(r['id']), # for some reason it's int in the db
|
||||||
full_name=r['full_name'],
|
full_name=r['full_name'],
|
||||||
|
|
|
@ -31,12 +31,15 @@ class User:
|
||||||
full_name: str
|
full_name: str
|
||||||
|
|
||||||
|
|
||||||
|
from ..core import datetime_naive
|
||||||
@dataclass
|
@dataclass
|
||||||
class _BaseMessage:
|
class _BaseMessage:
|
||||||
# TODO id is missing?
|
# ugh, this is insane, but does look like it's just keeping local device time???
|
||||||
created: datetime
|
# checked against a message sent on 3 June, which should be UTC+1, but timestamp seems local
|
||||||
|
created: datetime_naive
|
||||||
text: str
|
text: str
|
||||||
thread_id: str
|
thread_id: str
|
||||||
|
# NOTE: doesn't look like there are any meaningful message ids in the export
|
||||||
|
|
||||||
|
|
||||||
@dataclass(unsafe_hash=True)
|
@dataclass(unsafe_hash=True)
|
||||||
|
@ -100,7 +103,14 @@ def _entities() -> Iterator[Res[Union[User, _Message]]]:
|
||||||
j = json.loads(ffile.read_text())
|
j = json.loads(ffile.read_text())
|
||||||
|
|
||||||
id_len = 10
|
id_len = 10
|
||||||
# NOTE: no match in android db/api responses?
|
# NOTE: I'm not actually sure it's the other user's id.., since it corresponds to the whole conversation
|
||||||
|
# but I stared a bit at these ids vs database ids and can't see any way to find the correspondence :(
|
||||||
|
# so basically the only way to merge is to actually try some magic and correlate timestamps/message texts?
|
||||||
|
# another option is perhaps to query user id from username with some free API
|
||||||
|
# it's still fragile: e.g. if user deletes themselves there is no more username (it becomes "instagramuser")
|
||||||
|
# if we use older exports we might be able to figure it out though... so think about it?
|
||||||
|
# it also names grouped ones like instagramuserchrisfoodishblogand25others_einihreoog
|
||||||
|
# so I feel like there is just no guaranteed way to correlate :(
|
||||||
other_id = fname[-id_len:]
|
other_id = fname[-id_len:]
|
||||||
# NOTE: no match in android db?
|
# NOTE: no match in android db?
|
||||||
other_username = fname[:-id_len - 1]
|
other_username = fname[:-id_len - 1]
|
||||||
|
|
Loading…
Add table
Reference in a new issue