my.twitter.android: refactor into a proper module

for now only extracting bookmarks; will use it for some time and see how it goes
karlicoss 2023-12-24 00:06:29 +00:00
parent a4a7bc41b9
commit 51209c547e
3 changed files with 96 additions and 26 deletions

my/config.py

@@ -177,6 +177,8 @@ class twitter_archive:
 class twitter:
     class talon:
         export_path: Paths
+    class android:
+        export_path: Paths

 class twint:
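
With this stub in place, users can enable the module by adding a matching section to their own HPI config. A minimal sketch, assuming the app's sqlite databases were exported somewhere like the made-up glob below:

    class twitter:
        class android:
            # path[s]/glob to the exported sqlite databases
            export_path = '~/backups/twitter-android/*.db'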

my/core/common.py

@@ -686,6 +686,7 @@ def unique_everseen(
     if key is None:
         # todo check key return type as well? but it's more likely to be hashable
         if os.environ.get('HPI_CHECK_UNIQUE_EVERSEEN') is not None:
+            # TODO return better error here, e.g. if there is no return type it crashes
            _check_all_hashable(fun)

     return more_itertools.unique_everseen(iterable=iterable, key=key)
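
For context: this helper delegates to more_itertools.unique_everseen, which yields each element only the first time it is seen; the HPI_CHECK_UNIQUE_EVERSEEN env var additionally asserts that the yielded items are hashable. A quick illustration of the underlying primitive:

    import more_itertools

    # keeps the first occurrence of each element, preserving order
    assert list(more_itertools.unique_everseen([3, 1, 3, 2, 1])) == [3, 1, 2]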

my/twitter/android.py

@@ -1,13 +1,49 @@
 """
-Data from offficial app for Android
+Twitter data from official app for Android
 """
-import re
-from struct import unpack_from, calcsize
+from __future__ import annotations
+
+from dataclasses import dataclass
+from datetime import datetime, timezone
+from pathlib import Path
+import re
+from struct import unpack_from
+from typing import Iterator, Sequence
+
+from my.core import datetime_aware, get_files, LazyLogger, Paths, Res
+from my.core.common import unique_everseen
+from my.core.sqlite import sqlite_connect_immutable
+
+import my.config
+
+from .common import permalink
+
+logger = LazyLogger(__name__)
+
+
+@dataclass
+class config(my.config.twitter.android):
+    # path[s]/glob to the exported sqlite databases
+    export_path: Paths
+
+
+def inputs() -> Sequence[Path]:
+    return get_files(config.export_path)
+
+
+@dataclass(unsafe_hash=True)
+class Tweet:
+    id_str: str
+    created_at: datetime_aware
+    screen_name: str
+    text: str
+
+    @property
+    def permalink(self) -> str:
+        return permalink(screen_name=self.screen_name, id=self.id_str)
+
+
-def _parse_content(data: bytes):
+def _parse_content(data: bytes) -> str:
     pos = 0

     def skip(count: int) -> None:
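
The body of _parse_content (unchanged by this hunk) decodes tweet text from Twitter's binary blob by walking the bytes with a pos cursor. The blob format is undocumented, so the following is only an illustrative sketch of the unpack_from pattern with a hypothetical field layout:

    from struct import calcsize, unpack_from

    def read_prefixed_string(data: bytes, pos: int) -> tuple[str, int]:
        # hypothetical layout: a big-endian u32 length, followed by that many UTF-8 bytes
        (length,) = unpack_from('>I', data, pos)
        pos += calcsize('>I')
        end = pos + length
        return data[pos:end].decode('utf8'), end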
@@ -107,29 +143,60 @@ def _parse_content(data: bytes):
         text = text.replace(k, v)

     assert 'https://t.co/' not in text  # make sure we detected all links
-    print(text)
-
-
-PATH_TO_DB = '/path/to/db'
-
-with sqlite_connect_immutable(PATH_TO_DB) as db:
-    # TODO use statuses table instead?
-    # has r_ent_content??
-    # TODO hmm r_ent_content contains expanded urls?
-    # but they are still ellipsized? e.g. you can check 1692905005479580039
-    # TODO also I think content table has mappings from short urls to full, need to extract
-    for (tid, blob, blob2) in db.execute(
-        f'SELECT statuses_status_id, CAST(statuses_content AS BLOB), CAST(statuses_r_ent_content AS BLOB) FROM timeline_view WHERE statuses_bookmarked = 1',
-    ):
-        if blob is None:  # TODO exclude in sql query?
-            continue
-        print("----")
-        try:
-            print("PARSING", tid)
-            _parse_content(blob)
-            # _parse_content(blob2)
-        except UnicodeDecodeError as ue:
-            raise ue
-            # print("DECODING ERROR FOR ", tid, ue.object)
+    return text
+
+
+def _process_one(f: Path) -> Iterator[Res[Tweet]]:
+    with sqlite_connect_immutable(f) as db:
+        # NOTE:
+        # - it also has statuses_r_ent_content, which has entities' links replaced,
+        #   but they are still ellipsized (e.g. check 1692905005479580039),
+        #   so let's just use statuses_content
+        # - there is also timeline_created_at, but the values look like made-up timestamps;
+        #   don't think they represent bookmarking time
+        # - not sure what timeline_type means?
+        #   seems like 30 means bookmarks?
+        #   there is one tweet with timeline_type 18, but it has timeline_is_preview=1
+        for (
+            tweet_id,
+            user_name,
+            user_username,
+            created_ms,
+            blob,
+        ) in db.execute(
+            '''
+            SELECT
+                statuses_status_id,
+                users_name,
+                users_username,
+                statuses_created,
+                CAST(statuses_content AS BLOB)
+            FROM timeline_view
+            WHERE statuses_bookmarked = 1
+            ORDER BY timeline_sort_index DESC
+            ''',
+        ):
+            yield Tweet(
+                id_str=tweet_id,
+                # TODO double check it's utc?
+                created_at=datetime.fromtimestamp(created_ms / 1000, tz=timezone.utc),
+                screen_name=user_username,
+                text=_parse_content(blob),
+            )
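
statuses_created appears to hold epoch milliseconds, hence the division by 1000 before handing the value to datetime.fromtimestamp. A worked example (the millisecond value is made up):

    from datetime import datetime, timezone

    datetime.fromtimestamp(1703376000123 / 1000, tz=timezone.utc)
    # -> datetime.datetime(2023, 12, 24, 0, 0, 0, 123000, tzinfo=datetime.timezone.utc)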
+
+
+def bookmarks() -> Iterator[Res[Tweet]]:
+    # TODO might need to sort by timeline_sort_index again?
+    # not sure if each database contains the full history of bookmarks (likely not!)
+    def it() -> Iterator[Res[Tweet]]:
+        paths = inputs()
+        total = len(paths)
+        width = len(str(total))
+        for idx, path in enumerate(paths):
+            logger.info(f'processing [{idx:>{width}}/{total:>{width}}] {path}')
+            yield from _process_one(path)
+
+    # TODO hmm maybe unique_everseen should be a decorator?
+    return unique_everseen(it)
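
With the config from the first hunk in place, the module is consumed like any other HPI provider; since bookmarks() is typed Iterator[Res[Tweet]], errors (if any) would be yielded as values, so a consumer filters them out. A sketch:

    from my.twitter.android import bookmarks

    for t in bookmarks():
        if isinstance(t, Exception):  # Res[Tweet] may carry errors as values
            continue
        # permalink is built from screen_name and id_str by my.twitter.common.permalink
        print(t.created_at, t.permalink)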