my.twitter.android: refactor into a proper module
for now only extracting bookmarks, will use it for some time and see how it goes
parent a4a7bc41b9
commit 51209c547e
3 changed files with 96 additions and 26 deletions
my/config.py
@@ -177,6 +177,8 @@ class twitter_archive:
 class twitter:
     class talon:
         export_path: Paths
+    class android:
+        export_path: Paths


 class twint:
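For reference, the user-side config this stub mirrors would presumably look something like the following; the class layout follows HPI's usual config convention, and the path is a made-up example:

```python
# your HPI user config (the module resolves it via my.config.twitter.android)
class twitter:
    class android:
        # path[s]/glob pointing at sqlite databases exported from the app
        export_path = '/backups/twitter-android/*.db'
```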
my/core/common.py
@@ -686,6 +686,7 @@ def unique_everseen(
     if key is None:
         # todo check key return type as well? but it's more likely to be hashable
         if os.environ.get('HPI_CHECK_UNIQUE_EVERSEEN') is not None:
+            # TODO return better error here, e.g. if there is no return type it crashes
            _check_all_hashable(fun)

     return more_itertools.unique_everseen(iterable=iterable, key=key)
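The `HPI_CHECK_UNIQUE_EVERSEEN` check above is opt-in via the environment; the deduplication itself comes from `more_itertools`, which keeps the first occurrence of each item while preserving order. A quick illustration of both:

```python
import os

import more_itertools

# opt in to the extra hashability check (the flag unique_everseen looks for above)
os.environ['HPI_CHECK_UNIQUE_EVERSEEN'] = '1'

# first occurrence wins, order is preserved
assert list(more_itertools.unique_everseen([3, 1, 3, 2, 1])) == [3, 1, 2]
```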
my/twitter/android.py
@@ -1,13 +1,49 @@
 """
-Data from offficial app for Android
+Twitter data from official app for Android
 """
-import re
-from struct import unpack_from, calcsize
+from __future__ import annotations
+
+from dataclasses import dataclass
+from datetime import datetime, timezone
+from pathlib import Path
+import re
+from struct import unpack_from
+from typing import Iterator, Sequence

+from my.core import datetime_aware, get_files, LazyLogger, Paths, Res
+from my.core.common import unique_everseen
 from my.core.sqlite import sqlite_connect_immutable

+import my.config

-def _parse_content(data: bytes):
+from .common import permalink
+
+logger = LazyLogger(__name__)
+
+
+@dataclass
+class config(my.config.twitter.android):
+    # path[s]/glob to the exported sqlite databases
+    export_path: Paths
+
+
+def inputs() -> Sequence[Path]:
+    return get_files(config.export_path)
+
+
+@dataclass(unsafe_hash=True)
+class Tweet:
+    id_str: str
+    created_at: datetime_aware
+    screen_name: str
+    text: str
+
+    @property
+    def permalink(self) -> str:
+        return permalink(screen_name=self.screen_name, id=self.id_str)
+
+
+def _parse_content(data: bytes) -> str:
     pos = 0

     def skip(count: int) -> None:
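`Tweet` delegates URL construction to `permalink` from `my.twitter.common`, which isn't part of this diff; presumably it assembles the canonical status URL, roughly along these lines (a sketch, not the actual helper):

```python
def permalink(*, screen_name: str, id: str) -> str:
    # hypothetical stand-in for my.twitter.common.permalink
    return f'https://twitter.com/{screen_name}/status/{id}'

# so Tweet(id_str='123', ..., screen_name='example').permalink
# would evaluate to 'https://twitter.com/example/status/123'
```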
@@ -107,29 +143,60 @@ def _parse_content(data: bytes):
     text = text.replace(k, v)
     assert 'https://t.co/' not in text  # make sure we detected all links

-    print(text)
+    return text


-PATH_TO_DB = '/path/to/db'
-
-
-with sqlite_connect_immutable(PATH_TO_DB) as db:
-    # TODO use statuses table instead?
-    # has r_ent_content??
-    # TODO hmm r_ent_content contains expanded urls?
-    # but they are still ellipsized? e.g. you can check 1692905005479580039
-    # TODO also I think content table has mappings from short urls to full, need to extract
-    for (tid, blob, blob2) in db.execute(
-        f'SELECT statuses_status_id, CAST(statuses_content AS BLOB), CAST(statuses_r_ent_content AS BLOB) FROM timeline_view WHERE statuses_bookmarked = 1',
-    ):
+def _process_one(f: Path) -> Iterator[Res[Tweet]]:
+    with sqlite_connect_immutable(f) as db:
+        # NOTE:
+        # - it also has statuses_r_ent_content which has entities' links replaced
+        #   but they are still ellipsized (e.g. check 1692905005479580039)
+        #   so let's just use statuses_content
+        # - there is also timeline_created_at, but they look like made up timestamps
+        #   don't think they represent bookmarking time
+        # - not sure what's timeline_type?
+        #   seems like 30 means bookmarks?
+        #   there is one tweet with timeline type 18, but it has timeline_is_preview=1
+        for (
+            tweet_id,
+            user_name,
+            user_username,
+            created_ms,
+            blob,
+        ) in db.execute(
+            '''
+            SELECT
+                statuses_status_id,
+                users_name,
+                users_username,
+                statuses_created,
+                CAST(statuses_content AS BLOB)
+            FROM timeline_view
+            WHERE statuses_bookmarked = 1
+            ORDER BY timeline_sort_index DESC
+            ''',
+        ):
+            if blob is None:  # TODO exclude in sql query?
+                continue
-        print("----")
-        try:
-            print("PARSING", tid)
-            _parse_content(blob)
-            # _parse_content(blob2)
-        except UnicodeDecodeError as ue:
-            raise ue
-            # print("DECODING ERROR FOR ", tid, ue.object)
+            yield Tweet(
+                id_str=tweet_id,
+                # TODO double check it's utc?
+                created_at=datetime.fromtimestamp(created_ms / 1000, tz=timezone.utc),
+                screen_name=user_username,
+                text=_parse_content(blob),
+            )
+
+
+def bookmarks() -> Iterator[Res[Tweet]]:
+    # TODO might need to sort by timeline_sort_index again?
+    # not sure if each database contains full history of bookmarks (likely not!)
+    def it() -> Iterator[Res[Tweet]]:
+        paths = inputs()
+        total = len(paths)
+        width = len(str(total))
+        for idx, path in enumerate(paths):
+            logger.info(f'processing [{idx:>{width}}/{total:>{width}}] {path}')
+            yield from _process_one(path)
+
+    # TODO hmm maybe unique_everseen should be a decorator?
+    return unique_everseen(it)
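With `export_path` configured, usage is the same as for other HPI modules; since `bookmarks()` yields `Res[Tweet]`, errors arrive as values rather than being raised (a minimal sketch, assuming the config above is in place):

```python
from my.twitter.android import bookmarks

for res in bookmarks():
    if isinstance(res, Exception):  # Res[Tweet] carries errors as values
        print('error:', res)
        continue
    print(res.created_at, res.permalink)
    print(res.text)
```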