my.twitter.android: some initial work on parsing sqlite databases from official Android app

This commit is contained in:
karlicoss 2023-12-23 02:33:23 +00:00
parent a8f8858cb1
commit 3d75abafe9

93
my/twitter/android.py Normal file
View file

@ -0,0 +1,93 @@
"""
Data from the official app for Android
"""
from struct import unpack_from, calcsize
from my.core.sqlite import sqlite_connect_immutable
def _parse_content(data: bytes) -> str:
    """
    Extract the text from a serialized content blob, print it and return it.

    The format is reverse engineered and only partially understood. What this
    function relies on:

    - the first two bytes are skipped (seems to always be 4a03?)
    - the next byte is a type marker which determines how wide the following
      length prefix is (one or two bytes, big-endian)
    - after the length prefix comes utf8 text, terminated by the byte ``I``
      followed by either 0x08 or 0x09

    The length from the prefix is only sanity checked, it is NOT used to slice
    the text: sometimes it doesn't match the actual length of the string, so the
    end of the text is located by searching for the terminator instead.

    :param data: raw blob to parse
    :return: decoded text (the same value is also printed to stdout)
    :raises struct.error: if the blob is too short to contain the header
    :raises KeyError: if the type marker isn't one of the known values
    :raises AssertionError: if the length prefix is not within 1..10000,
        or if no terminator follows the text
    :raises UnicodeDecodeError: if the extracted bytes are not valid utf8
    """
    # type marker -> struct format of the length prefix that follows it
    # wtf is this... maybe it's a bitmask?
    # (in the values known so far, the lowest bit being set goes with the two-byte prefix)
    length_formats = {
        66: '>B',
        67: '>H',
        106: '>B',
        107: '>H',
    }
    # the text runs up to whichever of these occurs first
    separators = (b'I\x08', b'I\x09')

    pos = 2  # always starts with 4a03?

    (type_marker,) = unpack_from('B', data, offset=pos)
    pos += 1
    # print("TYPE:", type_marker)

    # an unknown marker raises KeyError on purpose: it means the table above is incomplete
    length_format = length_formats[type_marker]

    (expected_len,) = unpack_from(length_format, data, offset=pos)
    pos += calcsize(length_format)

    # explicit raise instead of `assert`, so the sanity check still works under `python -O`
    if not 0 < expected_len <= 10000:
        raise AssertionError(f'suspicious string length in header: {expected_len}')

    # soo, ideally we would simply read expected_len bytes from here.
    # however sometimes there is a discrepancy between the string length in the header
    # and the actual length (if you stare at the data), example is 1725868458246570412
    # so instead, search for the terminator. Searching from pos (rather than slicing
    # data[pos:] first) avoids copying the rest of the blob for every separator.
    found = [idx for idx in (data.find(sep, pos) for sep in separators) if idx != -1]
    if not found:
        raise AssertionError('no terminator found after the string')
    end = min(found)
    # print("EXPECTED LEN", expected_len, "GOT", end - pos, "DIFF", end - pos - expected_len)

    text = data[pos:end].decode('utf8')
    print(text)
    return text
# placeholder, needs to be replaced with the location of the actual database
PATH_TO_DB = '/path/to/db'

# NOTE: exploratory code at module level -- it runs as soon as the module is imported.
# For each bookmarked status in timeline_view, prints a separator and the status id,
# then hands the content blob over to _parse_content.
with sqlite_connect_immutable(PATH_TO_DB) as db:
    # TODO use statuses table instead?
    # has r_ent_content??
    # TODO hmm r_ent_content contains expanded urls?
    # but they are still ellipsized? e.g. you can check 1692905005479580039
    # TODO also I think content table has mappings from short urls to full, need to extract
    for (tid, blob, blob2) in db.execute(
        # plain string: there is nothing to interpolate, so no f-prefix
        'SELECT statuses_status_id, CAST(statuses_content AS BLOB), CAST(statuses_r_ent_content AS BLOB) FROM timeline_view WHERE statuses_bookmarked = 1',
    ):
        # blob2 (statuses_r_ent_content) is selected, but not parsed yet, see TODOs above
        if blob is None:  # TODO exclude in sql query?
            continue
        print("----")
        try:
            print("PARSING", tid)
            _parse_content(blob)
            # _parse_content(blob2)
        except UnicodeDecodeError:
            # just propagate for now (bare raise keeps the original traceback)
            # to debug, bind the exception and print its .object (the bytes that failed to decode) along with tid
            raise