my.twitter.android: extract entities

2023-12-23 23:14:21 +00:00 · 2023-12-23 23:14:21 +00:00 · a4a7bc41b9
commit a4a7bc41b9
parent 3d75abafe9
1 changed files with 47 additions and 5 deletions
--- a/my/twitter/android.py
+++ b/my/twitter/android.py
@ -1,6 +1,7 @@
 """
 Data from official app for Android
 """
 import re
 from struct import unpack_from, calcsize
 from my.core.sqlite import sqlite_connect_immutable
@ -46,6 +47,7 @@ def _parse_content(data: bytes):
        # print("EXPECTED LEN", sz, "GOT", sep_idx, "DIFF", sep_idx - sz)
        zz = data[pos : pos + sep_idx]
        skip(sep_idx)
        return zz.decode('utf8')
    skip(2)  # always starts with 4a03?
@ -62,11 +64,51 @@ def _parse_content(data: bytes):
        107: 2,
    }[xx]
-    try:
+    text = getstring(slen=slen)
-        print(getstring(slen=slen))
+
-    finally:
+    # after the main tweet text it contains entities (e.g. shortened urls)
-        pass
+    # however couldn't reverse engineer the schema properly, the links are kinda all over the place
-        # print(data[pos:])
+
    # TODO this also contains image alt descriptions?
    # see 1665029077034565633
    extracted = []
    linksep = 0x6a
    while True:
        m = re.search(b'\x6a.http', data[pos:])
        if m is None:
            break
        qq = m.start()
        pos += qq
        while True:
            if data[pos] != linksep:
                break
            pos += 1
            (sz,) = unpack_from('B', data, offset=pos)
            pos += 1
            (ss,) = unpack_from(f'{sz}s', data, offset=pos)
            pos += sz
            extracted.append(ss)
    replacements = {}
    i = 0
    while i < len(extracted):
        if b'https://t.co/' in extracted[i]:
            key = extracted[i].decode('utf8')
            value = extracted[i + 1].decode('utf8')
            i += 2
            replacements[key] = value
        else:
            i += 1
    for k, v in replacements.items():
        text = text.replace(k, v)
    assert 'https://t.co/' not in text  # make sure we detected all links
    print(text)
 PATH_TO_DB = '/path/to/db'