my.twitter.android: refactor into a proper module

for now only extracting bookmarks, will use it for some time and see how it goes
2023-12-24 00:06:29 +00:00 · 2023-12-24 00:06:29 +00:00 · 51209c547e
commit 51209c547e
parent a4a7bc41b9
3 changed files with 96 additions and 26 deletions
--- a/my/config.py
+++ b/my/config.py
@ -177,6 +177,8 @@ class twitter_archive:
 class twitter:
    class talon:
        export_path: Paths
    class android:
        export_path: Paths
 class twint:
--- a/my/core/common.py
+++ b/my/core/common.py
@ -686,6 +686,7 @@ def unique_everseen(
    if key is None:
        # todo check key return type as well? but it's more likely to be hashable
        if os.environ.get('HPI_CHECK_UNIQUE_EVERSEEN') is not None:
            # TODO return better error here, e.g. if there is no return type it crashes
            _check_all_hashable(fun)
    return more_itertools.unique_everseen(iterable=iterable, key=key)
--- a/my/twitter/android.py
+++ b/my/twitter/android.py
@ -1,13 +1,49 @@
 """
-Data from offficial app for Android
+Twitter data from official app for Android
 """
-import re
+from __future__ import annotations
 from struct import unpack_from, calcsize
 from dataclasses import dataclass
 from datetime import datetime, timezone
 from pathlib import Path
 import re
 from struct import unpack_from
 from typing import Iterator, Sequence
 from my.core import datetime_aware, get_files, LazyLogger, Paths, Res
 from my.core.common import unique_everseen
 from my.core.sqlite import sqlite_connect_immutable
 import my.config
-def _parse_content(data: bytes):
+from .common import permalink
 logger = LazyLogger(__name__)
@dataclass
 class config(my.config.twitter.android):
    # path[s]/glob to the exported sqlite databases
    export_path: Paths
 def inputs() -> Sequence[Path]:
    return get_files(config.export_path)
@dataclass(unsafe_hash=True)
 class Tweet:
    id_str: str
    created_at: datetime_aware
    screen_name: str
    text: str
    @property
    def permalink(self) -> str:
        return permalink(screen_name=self.screen_name, id=self.id_str)
 def _parse_content(data: bytes) -> str:
    pos = 0
    def skip(count: int) -> None:
@ -107,29 +143,60 @@ def _parse_content(data: bytes):
        text = text.replace(k, v)
    assert 'https://t.co/' not in text  # make sure we detected all links
-    print(text)
+    return text
-
+def _process_one(f: Path) -> Iterator[Res[Tweet]]:
-PATH_TO_DB = '/path/to/db'
+    with sqlite_connect_immutable(f) as db:
        # NOTE:
        # - it also has statuses_r_ent_content which has entities' links replaced
        #   but they are still ellipsized (e.g. check 1692905005479580039)
        #   so let's just use statuses_content
        # - there is also timeline_created_at, but they look like made up timestamps
        #   don't think they represent bookmarking time
        # - not sure what's timeline_type?
        #   seems like 30 means bookmarks?
        #   there is one tweet with timeline type 18, but it has timeline_is_preview=1
        for (
            tweet_id,
            user_name,
            user_username,
            created_ms,
            blob,
        ) in db.execute(
            '''
            SELECT
            statuses_status_id,
            users_name,
            users_username,
            statuses_created,
            CAST(statuses_content AS BLOB)
            FROM timeline_view
            WHERE statuses_bookmarked = 1
            ORDER BY timeline_sort_index DESC
            ''',
        ):
            if blob is None:  # TODO exclude in sql query?
                continue
            yield Tweet(
                id_str=tweet_id,
                # TODO double check it's utc?
                created_at=datetime.fromtimestamp(created_ms / 1000, tz=timezone.utc),
                screen_name=user_username,
                text=_parse_content(blob),
            )
-with sqlite_connect_immutable(PATH_TO_DB) as db:
+def bookmarks() -> Iterator[Res[Tweet]]:
-    # TODO use statuses table instead?
+    # TODO might need to sort by timeline_sort_index again?
-    # has r_ent_content??
+    # not sure if each database contains full history of bookmarks (likely not!)
-    # TODO hmm r_ent_content contains expanded urls?
+    def it() -> Iterator[Res[Tweet]]:
-    # but they are still ellipsized? e.g. you can check 1692905005479580039
+        paths = inputs()
-    # TODO also I think content table has mappings from short urls to full, need to extract
+        total = len(paths)
-    for (tid, blob, blob2) in db.execute(
+        width = len(str(total))
-        f'SELECT statuses_status_id, CAST(statuses_content AS BLOB), CAST(statuses_r_ent_content AS BLOB) FROM timeline_view WHERE statuses_bookmarked = 1',
+        for idx, path in enumerate(paths):
-    ):
+            logger.info(f'processing [{idx:>{width}}/{total:>{width}}] {path}')
-        if blob is None:  # TODO exclude in sql query?
+            yield from _process_one(path)
-            continue
+
-        print("----")
+    # TODO hmm maybe unique_everseen should be a decorator?
-        try:
+    return unique_everseen(it)
            print("PARSING", tid)
            _parse_content(blob)
            # _parse_content(blob2)
        except UnicodeDecodeError as ue:
            raise ue
            # print("DECODING ERROR FOR ", tid, ue.object)