my.twitter.android: refactor into a proper module

for now only extracting bookmarks; will use it for some time and see how it goes
karlicoss 2023-12-24 00:06:29 +00:00
parent a4a7bc41b9
commit 51209c547e
3 changed files with 96 additions and 26 deletions

my/config.py

@@ -177,6 +177,8 @@ class twitter_archive:
 class twitter:
     class talon:
         export_path: Paths
+    class android:
+        export_path: Paths

 class twint:
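
With this stub in place, users can enable the module by adding a matching section to their own HPI config. A minimal sketch, assuming the app's sqlite databases were exported somewhere like the made-up glob below:

    class twitter:
        class android:
            # path[s]/glob to the exported sqlite databases
            export_path = '~/backups/twitter-android/*.db'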

my/core/common.py

@@ -686,6 +686,7 @@ def unique_everseen(
     if key is None:
         # todo check key return type as well? but it's more likely to be hashable
         if os.environ.get('HPI_CHECK_UNIQUE_EVERSEEN') is not None:
+            # TODO return better error here, e.g. if there is no return type it crashes
            _check_all_hashable(fun)

     return more_itertools.unique_everseen(iterable=iterable, key=key)
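
For context: this helper delegates to more_itertools.unique_everseen, which yields each element only the first time it is seen; the HPI_CHECK_UNIQUE_EVERSEEN env var additionally asserts that the yielded items are hashable. A quick illustration of the underlying primitive:

    import more_itertools

    # keeps the first occurrence of each element, preserving order
    assert list(more_itertools.unique_everseen([3, 1, 3, 2, 1])) == [3, 1, 2]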

my/twitter/android.py

@@ -1,13 +1,49 @@
 """
-Data from offficial app for Android
+Twitter data from official app for Android
 """
-import re
-from struct import unpack_from, calcsize
+from __future__ import annotations
+
+from dataclasses import dataclass
+from datetime import datetime, timezone
+from pathlib import Path
+import re
+from struct import unpack_from
+from typing import Iterator, Sequence
+
+from my.core import datetime_aware, get_files, LazyLogger, Paths, Res
+from my.core.common import unique_everseen
+from my.core.sqlite import sqlite_connect_immutable
+
+import my.config
+
+from .common import permalink
+
+logger = LazyLogger(__name__)
+
+
+@dataclass
+class config(my.config.twitter.android):
+    # path[s]/glob to the exported sqlite databases
+    export_path: Paths
+
+
+def inputs() -> Sequence[Path]:
+    return get_files(config.export_path)
+
+
+@dataclass(unsafe_hash=True)
+class Tweet:
+    id_str: str
+    created_at: datetime_aware
+    screen_name: str
+    text: str
+
+    @property
+    def permalink(self) -> str:
+        return permalink(screen_name=self.screen_name, id=self.id_str)
+
+
-def _parse_content(data: bytes):
+def _parse_content(data: bytes) -> str:
     pos = 0

     def skip(count: int) -> None:
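
The body of _parse_content (unchanged by this hunk) decodes tweet text from Twitter's binary blob by walking the bytes with a pos cursor. The blob format is undocumented, so the following is only an illustrative sketch of the unpack_from pattern with a hypothetical field layout:

    from struct import calcsize, unpack_from

    def read_prefixed_string(data: bytes, pos: int) -> tuple[str, int]:
        # hypothetical layout: a big-endian u32 length, followed by that many UTF-8 bytes
        (length,) = unpack_from('>I', data, pos)
        pos += calcsize('>I')
        end = pos + length
        return data[pos:end].decode('utf8'), end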
@@ -107,29 +143,60 @@ def _parse_content(data: bytes):
         text = text.replace(k, v)

     assert 'https://t.co/' not in text  # make sure we detected all links
-    print(text)
-
-
-PATH_TO_DB = '/path/to/db'
-
-with sqlite_connect_immutable(PATH_TO_DB) as db:
-    # TODO use statuses table instead?
-    # has r_ent_content??
-    # TODO hmm r_ent_content contains expanded urls?
-    # but they are still ellipsized? e.g. you can check 1692905005479580039
-    # TODO also I think content table has mappings from short urls to full, need to extract
-    for (tid, blob, blob2) in db.execute(
-        f'SELECT statuses_status_id, CAST(statuses_content AS BLOB), CAST(statuses_r_ent_content AS BLOB) FROM timeline_view WHERE statuses_bookmarked = 1',
-    ):
-        if blob is None:  # TODO exclude in sql query?
-            continue
-        print("----")
-        try:
-            print("PARSING", tid)
-            _parse_content(blob)
-            # _parse_content(blob2)
-        except UnicodeDecodeError as ue:
-            raise ue
-            # print("DECODING ERROR FOR ", tid, ue.object)
+    return text
+
+
+def _process_one(f: Path) -> Iterator[Res[Tweet]]:
+    with sqlite_connect_immutable(f) as db:
+        # NOTE:
+        # - it also has statuses_r_ent_content, which has entities' links replaced,
+        #   but they are still ellipsized (e.g. check 1692905005479580039),
+        #   so let's just use statuses_content
+        # - there is also timeline_created_at, but the values look like made-up timestamps;
+        #   don't think they represent bookmarking time
+        # - not sure what timeline_type means?
+        #   seems like 30 means bookmarks?
+        #   there is one tweet with timeline_type 18, but it has timeline_is_preview=1
+        for (
+            tweet_id,
+            user_name,
+            user_username,
+            created_ms,
+            blob,
+        ) in db.execute(
+            '''
+            SELECT
+                statuses_status_id,
+                users_name,
+                users_username,
+                statuses_created,
+                CAST(statuses_content AS BLOB)
+            FROM timeline_view
+            WHERE statuses_bookmarked = 1
+            ORDER BY timeline_sort_index DESC
+            ''',
+        ):
+            yield Tweet(
+                id_str=tweet_id,
+                # TODO double check it's utc?
+                created_at=datetime.fromtimestamp(created_ms / 1000, tz=timezone.utc),
+                screen_name=user_username,
+                text=_parse_content(blob),
+            )
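
statuses_created appears to hold epoch milliseconds, hence the division by 1000 before handing the value to datetime.fromtimestamp. A worked example (the millisecond value is made up):

    from datetime import datetime, timezone

    datetime.fromtimestamp(1703376000123 / 1000, tz=timezone.utc)
    # -> datetime.datetime(2023, 12, 24, 0, 0, 0, 123000, tzinfo=datetime.timezone.utc)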
+
+
+def bookmarks() -> Iterator[Res[Tweet]]:
+    # TODO might need to sort by timeline_sort_index again?
+    # not sure if each database contains the full history of bookmarks (likely not!)
+    def it() -> Iterator[Res[Tweet]]:
+        paths = inputs()
+        total = len(paths)
+        width = len(str(total))
+        for idx, path in enumerate(paths):
+            logger.info(f'processing [{idx:>{width}}/{total:>{width}}] {path}')
+            yield from _process_one(path)
+
+    # TODO hmm maybe unique_everseen should be a decorator?
+    return unique_everseen(it)
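
With the config from the first hunk in place, the module is consumed like any other HPI provider; since bookmarks() is typed Iterator[Res[Tweet]], errors (if any) would be yielded as values, so a consumer filters them out. A sketch:

    from my.twitter.android import bookmarks

    for t in bookmarks():
        if isinstance(t, Exception):  # Res[Tweet] may carry errors as values
            continue
        # permalink is built from screen_name and id_str by my.twitter.common.permalink
        print(t.created_at, t.permalink)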