my.twitter.android: refactor into a proper module
for now only extracting bookmarks, will use it for some time and see how it goes
This commit is contained in:
parent
a4a7bc41b9
commit
51209c547e
3 changed files with 96 additions and 26 deletions
|
@ -177,6 +177,8 @@ class twitter_archive:
|
||||||
class twitter:
|
class twitter:
|
||||||
class talon:
|
class talon:
|
||||||
export_path: Paths
|
export_path: Paths
|
||||||
|
class android:
|
||||||
|
export_path: Paths
|
||||||
|
|
||||||
|
|
||||||
class twint:
|
class twint:
|
||||||
|
|
|
@ -686,6 +686,7 @@ def unique_everseen(
|
||||||
if key is None:
|
if key is None:
|
||||||
# todo check key return type as well? but it's more likely to be hashable
|
# todo check key return type as well? but it's more likely to be hashable
|
||||||
if os.environ.get('HPI_CHECK_UNIQUE_EVERSEEN') is not None:
|
if os.environ.get('HPI_CHECK_UNIQUE_EVERSEEN') is not None:
|
||||||
|
# TODO return better error here, e.g. if there is no return type it crashes
|
||||||
_check_all_hashable(fun)
|
_check_all_hashable(fun)
|
||||||
|
|
||||||
return more_itertools.unique_everseen(iterable=iterable, key=key)
|
return more_itertools.unique_everseen(iterable=iterable, key=key)
|
||||||
|
|
|
@ -1,13 +1,49 @@
|
||||||
"""
|
"""
|
||||||
Data from official app for Android
|
Twitter data from official app for Android
|
||||||
"""
|
"""
|
||||||
import re
|
from __future__ import annotations
|
||||||
from struct import unpack_from, calcsize
|
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from pathlib import Path
|
||||||
|
import re
|
||||||
|
from struct import unpack_from
|
||||||
|
from typing import Iterator, Sequence
|
||||||
|
|
||||||
|
from my.core import datetime_aware, get_files, LazyLogger, Paths, Res
|
||||||
|
from my.core.common import unique_everseen
|
||||||
from my.core.sqlite import sqlite_connect_immutable
|
from my.core.sqlite import sqlite_connect_immutable
|
||||||
|
|
||||||
|
import my.config
|
||||||
|
|
||||||
def _parse_content(data: bytes):
|
from .common import permalink
|
||||||
|
|
||||||
|
logger = LazyLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class config(my.config.twitter.android):
|
||||||
|
# path[s]/glob to the exported sqlite databases
|
||||||
|
export_path: Paths
|
||||||
|
|
||||||
|
|
||||||
|
def inputs() -> Sequence[Path]:
|
||||||
|
return get_files(config.export_path)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(unsafe_hash=True)
|
||||||
|
class Tweet:
|
||||||
|
id_str: str
|
||||||
|
created_at: datetime_aware
|
||||||
|
screen_name: str
|
||||||
|
text: str
|
||||||
|
|
||||||
|
@property
|
||||||
|
def permalink(self) -> str:
|
||||||
|
return permalink(screen_name=self.screen_name, id=self.id_str)
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_content(data: bytes) -> str:
|
||||||
pos = 0
|
pos = 0
|
||||||
|
|
||||||
def skip(count: int) -> None:
|
def skip(count: int) -> None:
|
||||||
|
@ -107,29 +143,60 @@ def _parse_content(data: bytes):
|
||||||
text = text.replace(k, v)
|
text = text.replace(k, v)
|
||||||
assert 'https://t.co/' not in text # make sure we detected all links
|
assert 'https://t.co/' not in text # make sure we detected all links
|
||||||
|
|
||||||
print(text)
|
return text
|
||||||
|
|
||||||
|
|
||||||
|
def _process_one(f: Path) -> Iterator[Res[Tweet]]:
|
||||||
PATH_TO_DB = '/path/to/db'
|
with sqlite_connect_immutable(f) as db:
|
||||||
|
# NOTE:
|
||||||
|
# - it also has statuses_r_ent_content which has entities' links replaced
|
||||||
|
# but they are still ellipsized (e.g. check 1692905005479580039)
|
||||||
|
# so let's just use statuses_content
|
||||||
|
# - there is also timeline_created_at, but they look like made up timestamps
|
||||||
|
# don't think they represent bookmarking time
|
||||||
|
# - not sure what's timeline_type?
|
||||||
|
# seems like 30 means bookmarks?
|
||||||
|
# there is one tweet with timeline type 18, but it has timeline_is_preview=1
|
||||||
|
for (
|
||||||
|
tweet_id,
|
||||||
|
user_name,
|
||||||
|
user_username,
|
||||||
|
created_ms,
|
||||||
|
blob,
|
||||||
|
) in db.execute(
|
||||||
|
'''
|
||||||
|
SELECT
|
||||||
|
statuses_status_id,
|
||||||
|
users_name,
|
||||||
|
users_username,
|
||||||
|
statuses_created,
|
||||||
|
CAST(statuses_content AS BLOB)
|
||||||
|
FROM timeline_view
|
||||||
|
WHERE statuses_bookmarked = 1
|
||||||
|
ORDER BY timeline_sort_index DESC
|
||||||
|
''',
|
||||||
|
):
|
||||||
|
if blob is None: # TODO exclude in sql query?
|
||||||
|
continue
|
||||||
|
yield Tweet(
|
||||||
|
id_str=tweet_id,
|
||||||
|
# TODO double check it's utc?
|
||||||
|
created_at=datetime.fromtimestamp(created_ms / 1000, tz=timezone.utc),
|
||||||
|
screen_name=user_username,
|
||||||
|
text=_parse_content(blob),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
with sqlite_connect_immutable(PATH_TO_DB) as db:
|
def bookmarks() -> Iterator[Res[Tweet]]:
|
||||||
# TODO use statuses table instead?
|
# TODO might need to sort by timeline_sort_index again?
|
||||||
# has r_ent_content??
|
# not sure if each database contains full history of bookmarks (likely not!)
|
||||||
# TODO hmm r_ent_content contains expanded urls?
|
def it() -> Iterator[Res[Tweet]]:
|
||||||
# but they are still ellipsized? e.g. you can check 1692905005479580039
|
paths = inputs()
|
||||||
# TODO also I think content table has mappings from short urls to full, need to extract
|
total = len(paths)
|
||||||
for (tid, blob, blob2) in db.execute(
|
width = len(str(total))
|
||||||
f'SELECT statuses_status_id, CAST(statuses_content AS BLOB), CAST(statuses_r_ent_content AS BLOB) FROM timeline_view WHERE statuses_bookmarked = 1',
|
for idx, path in enumerate(paths):
|
||||||
):
|
logger.info(f'processing [{idx:>{width}}/{total:>{width}}] {path}')
|
||||||
if blob is None: # TODO exclude in sql query?
|
yield from _process_one(path)
|
||||||
continue
|
|
||||||
print("----")
|
# TODO hmm maybe unique_everseen should be a decorator?
|
||||||
try:
|
return unique_everseen(it)
|
||||||
print("PARSING", tid)
|
|
||||||
_parse_content(blob)
|
|
||||||
# _parse_content(blob2)
|
|
||||||
except UnicodeDecodeError as ue:
|
|
||||||
raise ue
|
|
||||||
# print("DECODING ERROR FOR ", tid, ue.object)
|
|
||||||
|
|
Loading…
Add table
Reference in a new issue