switch from using dataset to raw sqlite3 module
dataset is kinda unmaintained and currently broken due to sqlalchemy 2.0 changes; resolves https://github.com/karlicoss/HPI/issues/264
This commit is contained in:
parent
9c432027b5
commit
5c82d0faa9
8 changed files with 123 additions and 103 deletions
|
@ -1,17 +1,19 @@
|
||||||
from .common import assert_subpackage; assert_subpackage(__name__)
|
from .common import assert_subpackage; assert_subpackage(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
from contextlib import contextmanager
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import shutil
|
import shutil
|
||||||
import sqlite3
|
import sqlite3
|
||||||
from tempfile import TemporaryDirectory
|
from tempfile import TemporaryDirectory
|
||||||
|
from typing import Tuple, Any, Iterator, Callable, Optional, Union
|
||||||
|
|
||||||
|
|
||||||
from .common import PathIsh
|
from .common import PathIsh, assert_never
|
||||||
|
from .compat import Literal
|
||||||
|
|
||||||
|
|
||||||
def sqlite_connect_immutable(db: PathIsh) -> sqlite3.Connection:
|
def sqlite_connect_immutable(db: PathIsh) -> sqlite3.Connection:
|
||||||
# https://www.sqlite.org/draft/uri.html#uriimmutable
|
|
||||||
return sqlite3.connect(f'file:{db}?immutable=1', uri=True)
|
return sqlite3.connect(f'file:{db}?immutable=1', uri=True)
|
||||||
|
|
||||||
|
|
||||||
|
@ -30,6 +32,42 @@ def test_sqlite_connect_immutable(tmp_path: Path) -> None:
|
||||||
conn.execute('DROP TABLE testtable')
|
conn.execute('DROP TABLE testtable')
|
||||||
|
|
||||||
|
|
||||||
|
SqliteRowFactory = Callable[[sqlite3.Cursor, sqlite3.Row], Any]
|
||||||
|
|
||||||
|
def dict_factory(cursor: sqlite3.Cursor, row: Tuple[Any, ...]) -> dict:
    """
    Row factory that turns a result row into a dict keyed by column name.

    Meant to be assigned to ``sqlite3.Connection.row_factory``.

    :param cursor: cursor that produced the row; only ``cursor.description`` is read
    :param row: the values of a single result row, in column order
    :return: mapping of column name to value; if the query yields the same
        column name twice, the later column wins
    """
    # the first item of each cursor.description entry is the column name
    fields = [column[0] for column in cursor.description]
    return dict(zip(fields, row))
|
||||||
|
|
||||||
|
|
||||||
|
Factory = Union[SqliteRowFactory, Literal['row', 'dict']]
|
||||||
|
|
||||||
|
@contextmanager
def sqlite_connection(db: PathIsh, *, immutable: bool=False, row_factory: Optional[Factory]=None) -> Iterator[sqlite3.Connection]:
    """
    Open a sqlite connection to ``db`` and close it once the ``with`` block is left.

    :param db: path to the database; interpolated as is into a ``file:`` URI
    :param immutable: if True, ``?immutable=1`` is appended to the URI
    :param row_factory: ``'row'`` to get ``sqlite3.Row`` rows, ``'dict'`` to get
        dicts (via ``dict_factory``), a callable to be used as the row factory
        directly, or None to leave sqlite3's default in place
    :return: context manager yielding the open ``sqlite3.Connection``
    """
    dbp = f'file:{db}'
    # https://www.sqlite.org/draft/uri.html#uriimmutable
    if immutable:
        dbp = f'{dbp}?immutable=1'

    row_factory_: Any = None
    if row_factory is not None:
        if callable(row_factory):
            row_factory_ = row_factory
        elif row_factory == 'row':
            row_factory_ = sqlite3.Row
        elif row_factory == 'dict':
            row_factory_ = dict_factory
        else:
            # hand over the unexpected value so the failure can name it
            # NOTE(review): assumes .common.assert_never takes the unhandled value as its
            # single argument (like typing.assert_never) -- confirm
            assert_never(row_factory)

    conn = sqlite3.connect(dbp, uri=True)
    try:
        conn.row_factory = row_factory_
        with conn:
            yield conn
    finally:
        # Connection context manager isn't actually closing the connection, only keeps transaction
        conn.close()
|
||||||
|
|
||||||
|
|
||||||
# TODO come up with a better name?
|
# TODO come up with a better name?
|
||||||
# NOTE: this is tested by tests/sqlite.py::test_sqlite_read_with_wal
|
# NOTE: this is tested by tests/sqlite.py::test_sqlite_read_with_wal
|
||||||
def sqlite_copy_and_open(db: PathIsh) -> sqlite3.Connection:
|
def sqlite_copy_and_open(db: PathIsh) -> sqlite3.Connection:
|
||||||
|
@ -52,8 +90,6 @@ def sqlite_copy_and_open(db: PathIsh) -> sqlite3.Connection:
|
||||||
return dest
|
return dest
|
||||||
|
|
||||||
|
|
||||||
from typing import Tuple, Any, Iterator
|
|
||||||
|
|
||||||
# NOTE hmm, so this kinda works
|
# NOTE hmm, so this kinda works
|
||||||
# V = TypeVar('V', bound=Tuple[Any, ...])
|
# V = TypeVar('V', bound=Tuple[Any, ...])
|
||||||
# def select(cols: V, rest: str, *, db: sqlite3.Connetion) -> Iterator[V]:
|
# def select(cols: V, rest: str, *, db: sqlite3.Connetion) -> Iterator[V]:
|
||||||
|
|
|
@ -3,25 +3,27 @@ Messenger data from Android app database (in =/data/data/com.facebook.orca/datab
|
||||||
"""
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
REQUIRES = ['dataset']
|
|
||||||
|
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import Iterator, Sequence, Optional, Dict
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
import sqlite3
|
||||||
|
from typing import Iterator, Sequence, Optional, Dict, Union
|
||||||
|
|
||||||
|
from more_itertools import unique_everseen
|
||||||
|
|
||||||
|
from my.core import get_files, Paths, datetime_naive, Res, assert_never
|
||||||
|
from my.core.sqlite import sqlite_connection
|
||||||
|
|
||||||
from my.config import fbmessenger as user_config
|
from my.config import fbmessenger as user_config
|
||||||
|
|
||||||
|
|
||||||
from ..core import Paths
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class config(user_config.android):
|
class config(user_config.android):
|
||||||
# paths[s]/glob to the exported sqlite databases
|
# paths[s]/glob to the exported sqlite databases
|
||||||
export_path: Paths
|
export_path: Paths
|
||||||
|
|
||||||
|
|
||||||
from ..core import get_files
|
|
||||||
from pathlib import Path
|
|
||||||
def inputs() -> Sequence[Path]:
|
def inputs() -> Sequence[Path]:
|
||||||
return get_files(config.export_path)
|
return get_files(config.export_path)
|
||||||
|
|
||||||
|
@ -38,7 +40,6 @@ class Thread:
|
||||||
name: Optional[str]
|
name: Optional[str]
|
||||||
|
|
||||||
# todo not sure about order of fields...
|
# todo not sure about order of fields...
|
||||||
from ..core import datetime_naive
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class _BaseMessage:
|
class _BaseMessage:
|
||||||
id: str
|
id: str
|
||||||
|
@ -63,22 +64,18 @@ class Message(_BaseMessage):
|
||||||
reply_to: Optional[Message]
|
reply_to: Optional[Message]
|
||||||
|
|
||||||
|
|
||||||
import json
|
|
||||||
from typing import Union
|
|
||||||
from ..core import Res, assert_never
|
|
||||||
from ..core.dataset import connect_readonly, DatabaseT
|
|
||||||
Entity = Union[Sender, Thread, _Message]
|
Entity = Union[Sender, Thread, _Message]
|
||||||
def _entities() -> Iterator[Res[Entity]]:
|
def _entities() -> Iterator[Res[Entity]]:
|
||||||
for f in inputs():
|
for f in inputs():
|
||||||
with connect_readonly(f) as db:
|
with sqlite_connection(f, immutable=True, row_factory='row') as db:
|
||||||
yield from _process_db(db)
|
yield from _process_db(db)
|
||||||
|
|
||||||
|
|
||||||
def _process_db(db: DatabaseT) -> Iterator[Res[Entity]]:
|
def _process_db(db: sqlite3.Connection) -> Iterator[Res[Entity]]:
|
||||||
# works both for GROUP:group_id and ONE_TO_ONE:other_user:your_user
|
# works both for GROUP:group_id and ONE_TO_ONE:other_user:your_user
|
||||||
threadkey2id = lambda key: key.split(':')[1]
|
threadkey2id = lambda key: key.split(':')[1]
|
||||||
|
|
||||||
for r in db['threads'].find():
|
for r in db.execute('SELECT * FROM threads'):
|
||||||
try:
|
try:
|
||||||
yield Thread(
|
yield Thread(
|
||||||
id=threadkey2id(r['thread_key']),
|
id=threadkey2id(r['thread_key']),
|
||||||
|
@ -88,7 +85,7 @@ def _process_db(db: DatabaseT) -> Iterator[Res[Entity]]:
|
||||||
yield e
|
yield e
|
||||||
continue
|
continue
|
||||||
|
|
||||||
for r in db['messages'].find(order_by='timestamp_ms'):
|
for r in db.execute('SELECT * FROM messages ORDER BY timestamp_ms'):
|
||||||
mtype: int = r['msg_type']
|
mtype: int = r['msg_type']
|
||||||
if mtype == -1:
|
if mtype == -1:
|
||||||
# likely immediately deleted or something? doesn't have any data at all
|
# likely immediately deleted or something? doesn't have any data at all
|
||||||
|
@ -133,7 +130,6 @@ def _process_db(db: DatabaseT) -> Iterator[Res[Entity]]:
|
||||||
yield e
|
yield e
|
||||||
|
|
||||||
|
|
||||||
from more_itertools import unique_everseen
|
|
||||||
def messages() -> Iterator[Res[Message]]:
|
def messages() -> Iterator[Res[Message]]:
|
||||||
senders: Dict[str, Sender] = {}
|
senders: Dict[str, Sender] = {}
|
||||||
msgs: Dict[str, Message] = {}
|
msgs: Dict[str, Message] = {}
|
||||||
|
|
|
@ -5,13 +5,15 @@ from __future__ import annotations
|
||||||
|
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import Iterator, Sequence, Optional, Dict
|
from pathlib import Path
|
||||||
|
from typing import Iterator, Sequence, Optional
|
||||||
|
|
||||||
|
from my.core import get_files, Paths, Res
|
||||||
|
from my.core.sqlite import sqlite_connection
|
||||||
|
|
||||||
from my.config import hackernews as user_config
|
from my.config import hackernews as user_config
|
||||||
|
|
||||||
|
|
||||||
from ..core import Paths
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class config(user_config.dogsheep):
|
class config(user_config.dogsheep):
|
||||||
# paths[s]/glob to the dogsheep database
|
# paths[s]/glob to the dogsheep database
|
||||||
|
@ -20,8 +22,6 @@ class config(user_config.dogsheep):
|
||||||
|
|
||||||
# todo so much boilerplate... really need some common wildcard imports?...
|
# todo so much boilerplate... really need some common wildcard imports?...
|
||||||
# at least for stuff which realistically is used in each module like get_files/Sequence/Paths/dataclass/Iterator/Optional
|
# at least for stuff which realistically is used in each module like get_files/Sequence/Paths/dataclass/Iterator/Optional
|
||||||
from ..core import get_files
|
|
||||||
from pathlib import Path
|
|
||||||
def inputs() -> Sequence[Path]:
|
def inputs() -> Sequence[Path]:
|
||||||
return get_files(config.export_path)
|
return get_files(config.export_path)
|
||||||
|
|
||||||
|
@ -44,15 +44,15 @@ class Item:
|
||||||
@property
|
@property
|
||||||
def permalink(self) -> str:
|
def permalink(self) -> str:
|
||||||
return hackernews_link(self.id)
|
return hackernews_link(self.id)
|
||||||
|
# TODO hmm kinda annoying that permalink isn't getting serialized
|
||||||
|
# maybe won't be such a big problem if we used hpi query directly on objects, without jsons?
|
||||||
|
# so we could just take .permalink thing
|
||||||
|
|
||||||
|
|
||||||
from ..core.error import Res
|
|
||||||
from ..core.dataset import connect_readonly
|
|
||||||
def items() -> Iterator[Res[Item]]:
|
def items() -> Iterator[Res[Item]]:
|
||||||
f = max(inputs())
|
f = max(inputs())
|
||||||
with connect_readonly(f) as db:
|
with sqlite_connection(f, immutable=True, row_factory='row') as conn:
|
||||||
items = db['items']
|
for r in conn.execute('SELECT * FROM items ORDER BY time'):
|
||||||
for r in items.all(order_by='time'):
|
|
||||||
yield Item(
|
yield Item(
|
||||||
id=r['id'],
|
id=r['id'],
|
||||||
type=r['type'],
|
type=r['type'],
|
||||||
|
|
|
@ -1,20 +1,17 @@
|
||||||
"""
|
"""
|
||||||
[[https://play.google.com/store/apps/details?id=io.github.hidroh.materialistic][Materialistic]] app for Hackernews
|
[[https://play.google.com/store/apps/details?id=io.github.hidroh.materialistic][Materialistic]] app for Hackernews
|
||||||
"""
|
"""
|
||||||
|
from datetime import datetime, timezone
|
||||||
REQUIRES = ['dataset']
|
from pathlib import Path
|
||||||
|
|
||||||
from datetime import datetime
|
|
||||||
from typing import Any, Dict, Iterator, NamedTuple, Sequence
|
from typing import Any, Dict, Iterator, NamedTuple, Sequence
|
||||||
|
|
||||||
import pytz
|
from my.core import get_files
|
||||||
|
from my.core.sqlite import sqlite_connection
|
||||||
|
|
||||||
from my.config import materialistic as config
|
from my.config import materialistic as config
|
||||||
# todo migrate config to my.hackernews.materialistic
|
# todo migrate config to my.hackernews.materialistic
|
||||||
|
|
||||||
|
|
||||||
from ..core import get_files
|
|
||||||
from pathlib import Path
|
|
||||||
def inputs() -> Sequence[Path]:
|
def inputs() -> Sequence[Path]:
|
||||||
return get_files(config.export_path)
|
return get_files(config.export_path)
|
||||||
|
|
||||||
|
@ -28,7 +25,7 @@ class Saved(NamedTuple):
|
||||||
@property
|
@property
|
||||||
def when(self) -> datetime:
|
def when(self) -> datetime:
|
||||||
ts = int(self.row['time']) / 1000
|
ts = int(self.row['time']) / 1000
|
||||||
return datetime.fromtimestamp(ts, tz=pytz.utc)
|
return datetime.fromtimestamp(ts, tz=timezone.utc)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def uid(self) -> str:
|
def uid(self) -> str:
|
||||||
|
@ -47,13 +44,11 @@ class Saved(NamedTuple):
|
||||||
return hackernews_link(self.uid)
|
return hackernews_link(self.uid)
|
||||||
|
|
||||||
|
|
||||||
from ..core.dataset import connect_readonly
|
|
||||||
def raw() -> Iterator[Row]:
|
def raw() -> Iterator[Row]:
|
||||||
last = max(inputs())
|
last = max(inputs())
|
||||||
with connect_readonly(last) as db:
|
with sqlite_connection(last, immutable=True, row_factory='dict') as conn:
|
||||||
saved = db['saved']
|
yield from conn.execute('SELECT * FROM saved ORDER BY time')
|
||||||
# TODO wonder if it's 'save time' or creation time?
|
# TODO wonder if it's 'save time' or creation time?
|
||||||
yield from saved.all(order_by='time')
|
|
||||||
|
|
||||||
|
|
||||||
def saves() -> Iterator[Saved]:
|
def saves() -> Iterator[Saved]:
|
||||||
|
|
14
my/taplog.py
14
my/taplog.py
|
@ -1,11 +1,11 @@
|
||||||
'''
|
'''
|
||||||
[[https://play.google.com/store/apps/details?id=com.waterbear.taglog][Taplog]] app data
|
[[https://play.google.com/store/apps/details?id=com.waterbear.taglog][Taplog]] app data
|
||||||
'''
|
'''
|
||||||
|
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import NamedTuple, Dict, Optional, Iterable
|
from typing import NamedTuple, Dict, Optional, Iterable
|
||||||
|
|
||||||
from .core import get_files
|
from my.core import get_files, stat, Stats
|
||||||
|
from my.core.sqlite import sqlite_connection
|
||||||
|
|
||||||
from my.config import taplog as user_config
|
from my.config import taplog as user_config
|
||||||
|
|
||||||
|
@ -46,11 +46,10 @@ class Entry(NamedTuple):
|
||||||
|
|
||||||
def entries() -> Iterable[Entry]:
|
def entries() -> Iterable[Entry]:
|
||||||
last = max(get_files(user_config.export_path))
|
last = max(get_files(user_config.export_path))
|
||||||
from .core.dataset import connect_readonly
|
with sqlite_connection(last, immutable=True, row_factory='dict') as db:
|
||||||
db = connect_readonly(last)
|
# todo is it sorted by timestamp?
|
||||||
# todo is it sorted by timestamp?
|
for row in db.execute('SELECT * FROM Log'):
|
||||||
for row in db['Log'].all():
|
yield Entry(row)
|
||||||
yield Entry(row)
|
|
||||||
|
|
||||||
|
|
||||||
# I guess worth having as top level considering it would be quite common?
|
# I guess worth having as top level considering it would be quite common?
|
||||||
|
@ -60,6 +59,5 @@ def by_button(button: str) -> Iterable[Entry]:
|
||||||
yield e
|
yield e
|
||||||
|
|
||||||
|
|
||||||
from .core import stat, Stats
|
|
||||||
def stats() -> Stats:
|
def stats() -> Stats:
|
||||||
return stat(entries)
|
return stat(entries)
|
||||||
|
|
|
@ -3,19 +3,18 @@ Tinder data from Android app database (in =/data/data/com.tinder/databases/tinde
|
||||||
"""
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
REQUIRES = ['dataset']
|
|
||||||
|
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from datetime import datetime, timezone
|
from datetime import datetime, timezone
|
||||||
from itertools import chain
|
from itertools import chain
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
import sqlite3
|
||||||
from typing import Sequence, Iterator, Union, Dict, List, Mapping
|
from typing import Sequence, Iterator, Union, Dict, List, Mapping
|
||||||
|
|
||||||
from more_itertools import unique_everseen
|
from more_itertools import unique_everseen
|
||||||
|
|
||||||
from my.core import Paths, get_files, Res, assert_never, stat, Stats, datetime_aware
|
from my.core import Paths, get_files, Res, assert_never, stat, Stats, datetime_aware
|
||||||
from my.core.dataset import connect_readonly, DatabaseT
|
from my.core.sqlite import sqlite_connection
|
||||||
|
|
||||||
|
|
||||||
from my.config import tinder as user_config
|
from my.config import tinder as user_config
|
||||||
|
@ -73,6 +72,8 @@ class Message(_BaseMessage):
|
||||||
to: Person
|
to: Person
|
||||||
|
|
||||||
|
|
||||||
|
# todo hmm I have a suspicion it might be cumulative?
|
||||||
|
# although still possible that the user might remove/install app back, so need to keep that in mind
|
||||||
def inputs() -> Sequence[Path]:
|
def inputs() -> Sequence[Path]:
|
||||||
return get_files(config.export_path)
|
return get_files(config.export_path)
|
||||||
|
|
||||||
|
@ -83,40 +84,43 @@ Entity = Union[Person, Match, Message]
|
||||||
|
|
||||||
def _entities() -> Iterator[Res[_Entity]]:
|
def _entities() -> Iterator[Res[_Entity]]:
|
||||||
for db_file in inputs():
|
for db_file in inputs():
|
||||||
with connect_readonly(db_file) as db:
|
with sqlite_connection(db_file, immutable=True, row_factory='row') as db:
|
||||||
yield from _handle_db(db)
|
yield from _handle_db(db)
|
||||||
|
|
||||||
|
|
||||||
def _handle_db(db: DatabaseT) -> Iterator[Res[_Entity]]:
|
def _handle_db(db: sqlite3.Connection) -> Iterator[Res[_Entity]]:
|
||||||
# profile_user_view contains our own user id
|
# profile_user_view contains our own user id
|
||||||
for row in chain(db['profile_user_view'], db['match_person']):
|
for row in chain(
|
||||||
|
db.execute('SELECT * FROM profile_user_view'),
|
||||||
|
db.execute('SELECT * FROM match_person'),
|
||||||
|
):
|
||||||
try:
|
try:
|
||||||
yield _parse_person(row)
|
yield _parse_person(row)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# todo attach error contex?
|
# todo attach error contex?
|
||||||
yield e
|
yield e
|
||||||
|
|
||||||
for row in db['match']:
|
for row in db.execute('SELECT * FROM match'):
|
||||||
try:
|
try:
|
||||||
yield _parse_match(row)
|
yield _parse_match(row)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
yield e
|
yield e
|
||||||
|
|
||||||
for row in db['message']:
|
for row in db.execute('SELECT * FROM message'):
|
||||||
try:
|
try:
|
||||||
yield _parse_msg(row)
|
yield _parse_msg(row)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
yield e
|
yield e
|
||||||
|
|
||||||
|
|
||||||
def _parse_person(row) -> Person:
|
def _parse_person(row: sqlite3.Row) -> Person:
|
||||||
return Person(
|
return Person(
|
||||||
id=row['id'],
|
id=row['id'],
|
||||||
name=row['name'],
|
name=row['name'],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def _parse_match(row) -> _Match:
|
def _parse_match(row: sqlite3.Row) -> _Match:
|
||||||
return _Match(
|
return _Match(
|
||||||
id=row['id'],
|
id=row['id'],
|
||||||
person_id=row['person_id'],
|
person_id=row['person_id'],
|
||||||
|
@ -124,7 +128,7 @@ def _parse_match(row) -> _Match:
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def _parse_msg(row) -> _Message:
|
def _parse_msg(row: sqlite3.Row) -> _Message:
|
||||||
# note it also has raw_message_data -- not sure which is best to use..
|
# note it also has raw_message_data -- not sure which is best to use..
|
||||||
sent = row['sent_date']
|
sent = row['sent_date']
|
||||||
return _Message(
|
return _Message(
|
||||||
|
|
|
@ -4,31 +4,32 @@ Twitter data from Talon app database (in =/data/data/com.klinker.android.twitter
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from datetime import datetime
|
from datetime import datetime, timezone
|
||||||
import re
|
import re
|
||||||
from typing import Iterator, Sequence, Optional, Dict
|
import sqlite3
|
||||||
|
from typing import Iterator, Sequence, Union
|
||||||
|
|
||||||
import pytz
|
from more_itertools import unique_everseen
|
||||||
|
|
||||||
|
from my.core import Paths, Res, datetime_aware, get_files
|
||||||
|
from my.core.sqlite import sqlite_connection
|
||||||
|
|
||||||
|
from .common import TweetId, permalink
|
||||||
|
|
||||||
from my.config import twitter as user_config
|
from my.config import twitter as user_config
|
||||||
|
|
||||||
|
|
||||||
from ..core import Paths, Res, datetime_aware
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class config(user_config.talon):
|
class config(user_config.talon):
|
||||||
# paths[s]/glob to the exported sqlite databases
|
# paths[s]/glob to the exported sqlite databases
|
||||||
export_path: Paths
|
export_path: Paths
|
||||||
|
|
||||||
|
|
||||||
from ..core import get_files
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
def inputs() -> Sequence[Path]:
|
def inputs() -> Sequence[Path]:
|
||||||
return get_files(config.export_path)
|
return get_files(config.export_path)
|
||||||
|
|
||||||
|
|
||||||
from .common import TweetId, permalink
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass(unsafe_hash=True)
|
@dataclass(unsafe_hash=True)
|
||||||
class Tweet:
|
class Tweet:
|
||||||
id_str: TweetId
|
id_str: TweetId
|
||||||
|
@ -51,8 +52,6 @@ class _IsFavorire:
|
||||||
tweet: Tweet
|
tweet: Tweet
|
||||||
|
|
||||||
|
|
||||||
from typing import Union
|
|
||||||
from ..core.dataset import connect_readonly
|
|
||||||
Entity = Union[_IsTweet, _IsFavorire]
|
Entity = Union[_IsTweet, _IsFavorire]
|
||||||
def _entities() -> Iterator[Res[Entity]]:
|
def _entities() -> Iterator[Res[Entity]]:
|
||||||
for f in inputs():
|
for f in inputs():
|
||||||
|
@ -67,35 +66,36 @@ def _process_one(f: Path) -> Iterator[Res[Entity]]:
|
||||||
fname = f.name
|
fname = f.name
|
||||||
handler = handlers.get(fname)
|
handler = handlers.get(fname)
|
||||||
if handler is None:
|
if handler is None:
|
||||||
yield RuntimeError(f"Coulnd't find handler for {fname}")
|
yield RuntimeError(f"Could not find handler for {fname}")
|
||||||
return
|
return
|
||||||
with connect_readonly(f) as db:
|
with sqlite_connection(f, immutable=True, row_factory='row') as db:
|
||||||
yield from handler(db)
|
yield from handler(db)
|
||||||
|
|
||||||
|
|
||||||
def _process_user_tweets(db) -> Iterator[Res[Entity]]:
|
def _process_user_tweets(db: sqlite3.Connection) -> Iterator[Res[Entity]]:
|
||||||
# dunno why it's called 'lists'
|
# dunno why it's called 'lists'
|
||||||
for r in db['lists'].all(order_by='time'):
|
for r in db.execute('SELECT * FROM lists ORDER BY time'):
|
||||||
try:
|
try:
|
||||||
yield _IsTweet(_parse_tweet(r))
|
yield _IsTweet(_parse_tweet(r))
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
yield e
|
yield e
|
||||||
|
|
||||||
|
|
||||||
def _process_favorite_tweets(db) -> Iterator[Res[Entity]]:
|
def _process_favorite_tweets(db: sqlite3.Connection) -> Iterator[Res[Entity]]:
|
||||||
for r in db['favorite_tweets'].all(order_by='time'):
|
for r in db.execute('SELECT * FROM favorite_tweets ORDER BY time'):
|
||||||
try:
|
try:
|
||||||
yield _IsFavorire(_parse_tweet(r))
|
yield _IsFavorire(_parse_tweet(r))
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
yield e
|
yield e
|
||||||
|
|
||||||
def _parse_tweet(row) -> Tweet:
|
|
||||||
|
def _parse_tweet(row: sqlite3.Row) -> Tweet:
|
||||||
# ok so looks like it's tz aware..
|
# ok so looks like it's tz aware..
|
||||||
# https://github.com/klinker24/talon-for-twitter-android/blob/c3b0612717ba3ea93c0cae6d907d7d86d640069e/app/src/main/java/com/klinker/android/twitter_l/data/sq_lite/FavoriteTweetsDataSource.java#L95
|
# https://github.com/klinker24/talon-for-twitter-android/blob/c3b0612717ba3ea93c0cae6d907d7d86d640069e/app/src/main/java/com/klinker/android/twitter_l/data/sq_lite/FavoriteTweetsDataSource.java#L95
|
||||||
# uses https://docs.oracle.com/javase/7/docs/api/java/util/Date.html#getTime()
|
# uses https://docs.oracle.com/javase/7/docs/api/java/util/Date.html#getTime()
|
||||||
# and it's created here, so looks like it's properly parsed from the api
|
# and it's created here, so looks like it's properly parsed from the api
|
||||||
# https://github.com/Twitter4J/Twitter4J/blob/8376fade8d557896bb9319fb46e39a55b134b166/twitter4j-core/src/internal-json/java/twitter4j/ParseUtil.java#L69-L79
|
# https://github.com/Twitter4J/Twitter4J/blob/8376fade8d557896bb9319fb46e39a55b134b166/twitter4j-core/src/internal-json/java/twitter4j/ParseUtil.java#L69-L79
|
||||||
created_at = datetime.fromtimestamp(row['time'] / 1000, tz=pytz.utc)
|
created_at = datetime.fromtimestamp(row['time'] / 1000, tz=timezone.utc)
|
||||||
text = row['text']
|
text = row['text']
|
||||||
|
|
||||||
# try explanding URLs.. sadly there are no positions in the db
|
# try explanding URLs.. sadly there are no positions in the db
|
||||||
|
@ -132,7 +132,6 @@ def _parse_tweet(row) -> Tweet:
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
from more_itertools import unique_everseen
|
|
||||||
def tweets() -> Iterator[Res[Tweet]]:
|
def tweets() -> Iterator[Res[Tweet]]:
|
||||||
for x in unique_everseen(_entities()):
|
for x in unique_everseen(_entities()):
|
||||||
if isinstance(x, Exception):
|
if isinstance(x, Exception):
|
||||||
|
@ -140,6 +139,7 @@ def tweets() -> Iterator[Res[Tweet]]:
|
||||||
elif isinstance(x, _IsTweet):
|
elif isinstance(x, _IsTweet):
|
||||||
yield x.tweet
|
yield x.tweet
|
||||||
|
|
||||||
|
|
||||||
def likes() -> Iterator[Res[Tweet]]:
|
def likes() -> Iterator[Res[Tweet]]:
|
||||||
for x in unique_everseen(_entities()):
|
for x in unique_everseen(_entities()):
|
||||||
if isinstance(x, Exception):
|
if isinstance(x, Exception):
|
||||||
|
|
|
@ -1,12 +1,16 @@
|
||||||
"""
|
"""
|
||||||
Twitter data (tweets and favorites). Uses [[https://github.com/twintproject/twint][Twint]] data export.
|
Twitter data (tweets and favorites). Uses [[https://github.com/twintproject/twint][Twint]] data export.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
REQUIRES = ['dataset']
|
|
||||||
|
|
||||||
from ..core.common import Paths
|
|
||||||
from ..core.error import Res
|
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import NamedTuple, Iterator, List
|
||||||
|
|
||||||
|
|
||||||
|
from my.core import Paths, Res, get_files, LazyLogger, Json, datetime_aware, stat, Stats
|
||||||
|
from my.core.cfg import make_config
|
||||||
|
from my.core.sqlite import sqlite_connection
|
||||||
|
|
||||||
from my.config import twint as user_config
|
from my.config import twint as user_config
|
||||||
|
|
||||||
# TODO move to twitter.twint config structure
|
# TODO move to twitter.twint config structure
|
||||||
|
@ -17,16 +21,9 @@ class twint(user_config):
|
||||||
|
|
||||||
####
|
####
|
||||||
|
|
||||||
from ..core.cfg import make_config
|
|
||||||
config = make_config(twint)
|
config = make_config(twint)
|
||||||
|
|
||||||
|
|
||||||
from datetime import datetime, timezone
|
|
||||||
from typing import NamedTuple, Iterator, List
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
from ..core.common import get_files, LazyLogger, Json, datetime_aware
|
|
||||||
|
|
||||||
log = LazyLogger(__name__)
|
log = LazyLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
@ -110,25 +107,19 @@ WHERE {where}
|
||||||
ORDER BY T.created_at
|
ORDER BY T.created_at
|
||||||
'''
|
'''
|
||||||
|
|
||||||
def _get_db():
|
|
||||||
from ..core.dataset import connect_readonly
|
|
||||||
db_path = get_db_path()
|
|
||||||
return connect_readonly(db_path)
|
|
||||||
|
|
||||||
|
|
||||||
def tweets() -> Iterator[Res[Tweet]]:
|
def tweets() -> Iterator[Res[Tweet]]:
|
||||||
db = _get_db()
|
with sqlite_connection(get_db_path(), immutable=True, row_factory='row') as db:
|
||||||
res = db.query(_QUERY.format(where='F.tweet_id IS NULL'))
|
res = db.execute(_QUERY.format(where='F.tweet_id IS NULL'))
|
||||||
yield from map(Tweet, res)
|
yield from map(Tweet, res)
|
||||||
|
|
||||||
|
|
||||||
def likes() -> Iterator[Res[Tweet]]:
|
def likes() -> Iterator[Res[Tweet]]:
|
||||||
db = _get_db()
|
with sqlite_connection(get_db_path(), immutable=True, row_factory='row') as db:
|
||||||
res = db.query(_QUERY.format(where='F.tweet_id IS NOT NULL'))
|
res = db.execute(_QUERY.format(where='F.tweet_id IS NOT NULL'))
|
||||||
yield from map(Tweet, res)
|
yield from map(Tweet, res)
|
||||||
|
|
||||||
|
|
||||||
from ..core import stat, Stats
|
|
||||||
def stats() -> Stats:
|
def stats() -> Stats:
|
||||||
return {
|
return {
|
||||||
**stat(tweets),
|
**stat(tweets),
|
||||||
|
|
Loading…
Add table
Reference in a new issue