"""
Data from official app for Android
"""
from struct import unpack_from, calcsize

from my.core.sqlite import sqlite_connect_immutable


def _parse_content(data: bytes) -> None:
    """
    Extract the text stored near the start of a serialized content blob and print it.

    The layout below is reverse engineered and unconfirmed:
    - 2 bytes that are skipped (blobs seen so far seem to always start with 4a03)
    - 1 type byte, which selects the width of the length header
    - a big-endian length header, 1 or 2 bytes wide
    - utf8 text, terminated by one of the markers listed in ``seps``

    Raises KeyError for an unknown type byte, struct.error if the blob is too short,
    AssertionError if a sanity check fails, UnicodeDecodeError if the text isn't valid utf8.
    """
    pos = 2  # skip the first two bytes; always starts with 4a03?

    (type_byte,) = unpack_from('B', data, offset=pos)
    pos += 1

    # wtf is this... maybe it's a bitmask?
    # maps the type byte to the struct format of the length header that follows it
    len_fmt = {
        66 : '>B',
        67 : '>H',
        106: '>B',
        107: '>H',
    }[type_byte]

    (sz,) = unpack_from(len_fmt, data, offset=pos)
    pos += calcsize(len_fmt)
    assert sz > 0
    assert sz <= 10000  # sanity check?

    # NOTE: ideally the text would simply be the next `sz` bytes, i.e.
    #   (ss,) = unpack_from(f'{sz}s', data, offset=pos)
    # however sometimes there is a discrepancy between the string length in the header
    # and the actual length (if you stare at the data), e.g. status 1725868458246570412.
    # So `sz` is only sanity checked, and the text is cut at the nearest separator instead.
    seps = [
        b'I\x08',
        b'I\x09',
    ]
    # search from `pos` in place rather than slicing, which would copy the rest of the blob
    sep_idxs = [idx for idx in (data.find(sep, pos) for sep in seps) if idx != -1]
    assert len(sep_idxs) > 0
    end = min(sep_idxs)
    # to debug the length discrepancy, compare `sz` against `end - pos`

    print(data[pos:end].decode('utf8'))


PATH_TO_DB = '/path/to/db'


with sqlite_connect_immutable(PATH_TO_DB) as db:
    # TODO use statuses table instead?
    # has r_ent_content??
    # TODO hmm r_ent_content contains expanded urls?
    # but they are still ellipsized? e.g. you can check 1692905005479580039
    # TODO also I think content table has mappings from short urls to full, need to extract
    for (tid, blob, _blob2) in db.execute(
        'SELECT statuses_status_id, CAST(statuses_content AS BLOB), CAST(statuses_r_ent_content AS BLOB) FROM timeline_view WHERE statuses_bookmarked = 1',
    ):
        if blob is None:  # TODO exclude in sql query?
            continue
        print("----")
        print("PARSING", tid)
        # a UnicodeDecodeError is left to propagate; the id printed above identifies the offending row
        _parse_content(blob)
        # TODO also parse _blob2 (statuses_r_ent_content)?