diff --git a/my/config.py b/my/config.py index ac44f41..e9b0ec8 100644 --- a/my/config.py +++ b/my/config.py @@ -177,6 +177,8 @@ class twitter_archive: class twitter: class talon: export_path: Paths + class android: + export_path: Paths class twint: diff --git a/my/core/common.py b/my/core/common.py index f1441a9..c429c8c 100644 --- a/my/core/common.py +++ b/my/core/common.py @@ -686,6 +686,7 @@ def unique_everseen( if key is None: # todo check key return type as well? but it's more likely to be hashable if os.environ.get('HPI_CHECK_UNIQUE_EVERSEEN') is not None: + # TODO return better error here, e.g. if there is no return type it crashes _check_all_hashable(fun) return more_itertools.unique_everseen(iterable=iterable, key=key) diff --git a/my/twitter/android.py b/my/twitter/android.py index dbb4946..be411e3 100644 --- a/my/twitter/android.py +++ b/my/twitter/android.py @@ -1,13 +1,49 @@ """ -Data from offficial app for Android +Twitter data from official app for Android """ -import re -from struct import unpack_from, calcsize +from __future__ import annotations +from dataclasses import dataclass +from datetime import datetime, timezone +from pathlib import Path +import re +from struct import unpack_from +from typing import Iterator, Sequence + +from my.core import datetime_aware, get_files, LazyLogger, Paths, Res +from my.core.common import unique_everseen from my.core.sqlite import sqlite_connect_immutable +import my.config -def _parse_content(data: bytes): +from .common import permalink + +logger = LazyLogger(__name__) + + +@dataclass +class config(my.config.twitter.android): + # path[s]/glob to the exported sqlite databases + export_path: Paths + + +def inputs() -> Sequence[Path]: + return get_files(config.export_path) + + +@dataclass(unsafe_hash=True) +class Tweet: + id_str: str + created_at: datetime_aware + screen_name: str + text: str + + @property + def permalink(self) -> str: + return permalink(screen_name=self.screen_name, id=self.id_str) + + +def 
_parse_content(data: bytes) -> str: pos = 0 def skip(count: int) -> None: @@ -107,29 +143,60 @@ def _parse_content(data: bytes): text = text.replace(k, v) assert 'https://t.co/' not in text # make sure we detected all links - print(text) + return text - -PATH_TO_DB = '/path/to/db' +def _process_one(f: Path) -> Iterator[Res[Tweet]]: + with sqlite_connect_immutable(f) as db: + # NOTE: + # - it also has statuses_r_ent_content which has entities' links replaced + # but they are still ellipsized (e.g. check 1692905005479580039) + # so let's just use statuses_content + # - there is also timeline_created_at, but they look like made up timestamps + # don't think they represent bookmarking time + # - not sure what's timeline_type? + # seems like 30 means bookmarks? + # there is one tweet with timeline type 18, but it has timeline_is_preview=1 + for ( + tweet_id, + user_name, + user_username, + created_ms, + blob, + ) in db.execute( + ''' + SELECT + statuses_status_id, + users_name, + users_username, + statuses_created, + CAST(statuses_content AS BLOB) + FROM timeline_view + WHERE statuses_bookmarked = 1 + ORDER BY timeline_sort_index DESC + ''', + ): + if blob is None: # TODO exclude in sql query? + continue + yield Tweet( + id_str=tweet_id, + # TODO double check it's utc? + created_at=datetime.fromtimestamp(created_ms / 1000, tz=timezone.utc), + screen_name=user_username, + text=_parse_content(blob), + ) -with sqlite_connect_immutable(PATH_TO_DB) as db: - # TODO use statuses table instead? - # has r_ent_content?? - # TODO hmm r_ent_content contains expanded urls? - # but they are still ellipsized? e.g. you can check 1692905005479580039 - # TODO also I think content table has mappings from short urls to full, need to extract - for (tid, blob, blob2) in db.execute( - f'SELECT statuses_status_id, CAST(statuses_content AS BLOB), CAST(statuses_r_ent_content AS BLOB) FROM timeline_view WHERE statuses_bookmarked = 1', - ): - if blob is None: # TODO exclude in sql query? 
- continue - print("----") - try: - print("PARSING", tid) - _parse_content(blob) - # _parse_content(blob2) - except UnicodeDecodeError as ue: - raise ue - # print("DECODING ERROR FOR ", tid, ue.object) +def bookmarks() -> Iterator[Res[Tweet]]: + # TODO might need to sort by timeline_sort_index again? + # not sure if each database contains full history of bookmarks (likely not!) + def it() -> Iterator[Res[Tweet]]: + paths = inputs() + total = len(paths) + width = len(str(total)) + for idx, path in enumerate(paths): + logger.info(f'processing [{idx:>{width}}/{total:>{width}}] {path}') + yield from _process_one(path) + + # TODO hmm maybe unique_everseen should be a decorator? + return unique_everseen(it)