Merge branch 'master' into location-fallback
This commit is contained in:
commit
f05e81cee5
36 changed files with 626 additions and 232 deletions
|
@ -13,7 +13,7 @@ import pandas as pd # type: ignore
|
|||
import orgparse
|
||||
|
||||
|
||||
from my.config import blood as config
|
||||
from my.config import blood as config # type: ignore[attr-defined]
|
||||
|
||||
|
||||
class Entry(NamedTuple):
|
||||
|
|
|
@ -10,7 +10,7 @@ from ..core.error import Res, set_error_datetime, extract_error_datetime
|
|||
|
||||
from .. import orgmode
|
||||
|
||||
from my.config import weight as config
|
||||
from my.config import weight as config # type: ignore[attr-defined]
|
||||
|
||||
|
||||
log = LazyLogger('my.body.weight')
|
||||
|
|
|
@ -4,4 +4,4 @@ warnings.high('my.books.kobo is deprecated! Please use my.kobo instead!')
|
|||
|
||||
from ..core.util import __NOT_HPI_MODULE__
|
||||
|
||||
from ..kobo import *
|
||||
from ..kobo import * # type: ignore[no-redef]
|
||||
|
|
|
@ -1,11 +1,13 @@
|
|||
#!/usr/bin/env python3
|
||||
from my.config import codeforces as config
|
||||
from my.config import codeforces as config # type: ignore[attr-defined]
|
||||
|
||||
|
||||
from datetime import datetime, timezone
|
||||
from typing import NamedTuple
|
||||
import json
|
||||
from typing import Dict, Iterator
|
||||
|
||||
|
||||
from ..core import get_files, Res, unwrap
|
||||
from ..core.compat import cached_property
|
||||
from ..core.konsume import ignore, wrap
|
||||
|
|
|
@ -1,11 +1,13 @@
|
|||
#!/usr/bin/env python3
|
||||
from my.config import topcoder as config
|
||||
from my.config import topcoder as config # type: ignore[attr-defined]
|
||||
|
||||
|
||||
from datetime import datetime
|
||||
from typing import NamedTuple
|
||||
import json
|
||||
from typing import Dict, Iterator
|
||||
|
||||
|
||||
from ..core import get_files, Res, unwrap, Json
|
||||
from ..core.compat import cached_property
|
||||
from ..core.error import Res, unwrap
|
||||
|
|
90
my/config.py
90
my/config.py
|
@ -14,8 +14,14 @@ from my.core import init
|
|||
###
|
||||
|
||||
|
||||
from datetime import tzinfo
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
|
||||
|
||||
from my.core import Paths, PathIsh
|
||||
|
||||
|
||||
class hypothesis:
|
||||
# expects outputs from https://github.com/karlicoss/hypexport
|
||||
# (it's just the standard Hypothes.is export format)
|
||||
|
@ -141,9 +147,14 @@ class hackernews:
|
|||
export_path: Paths
|
||||
|
||||
|
||||
class materialistic:
|
||||
export_path: Paths
|
||||
|
||||
|
||||
class fbmessenger:
|
||||
class fbmessengerexport:
|
||||
export_db: PathIsh
|
||||
facebook_id: Optional[str]
|
||||
class android:
|
||||
export_path: Paths
|
||||
|
||||
|
@ -156,8 +167,87 @@ class twitter:
|
|||
class talon:
|
||||
export_path: Paths
|
||||
|
||||
|
||||
class twint:
|
||||
export_path: Paths
|
||||
|
||||
|
||||
class browser:
|
||||
class export:
|
||||
export_path: Paths = ''
|
||||
class active_browser:
|
||||
export_path: Paths = ''
|
||||
|
||||
|
||||
class telegram:
|
||||
class telegram_backup:
|
||||
export_path: PathIsh = ''
|
||||
|
||||
|
||||
class demo:
|
||||
data_path: Paths
|
||||
username: str
|
||||
timezone: tzinfo
|
||||
|
||||
|
||||
class simple:
|
||||
count: int
|
||||
|
||||
|
||||
class vk_messages_backup:
|
||||
storage_path: Path
|
||||
|
||||
|
||||
class kobo:
|
||||
export_path: Paths
|
||||
|
||||
|
||||
class feedly:
|
||||
export_path: Paths
|
||||
|
||||
|
||||
class feedbin:
|
||||
export_path: Paths
|
||||
|
||||
|
||||
class taplog:
|
||||
export_path: Paths
|
||||
|
||||
|
||||
class lastfm:
|
||||
export_path: Paths
|
||||
|
||||
|
||||
class rescuetime:
|
||||
export_path: Paths
|
||||
|
||||
|
||||
class runnerup:
|
||||
export_path: Paths
|
||||
|
||||
|
||||
class emfit:
|
||||
export_path: Path
|
||||
timezone: tzinfo
|
||||
excluded_sids: List[str]
|
||||
|
||||
|
||||
class foursquare:
|
||||
export_path: Paths
|
||||
|
||||
|
||||
class rtm:
|
||||
export_path: Paths
|
||||
|
||||
|
||||
class imdb:
|
||||
export_path: Paths
|
||||
|
||||
|
||||
class roamresearch:
|
||||
export_path: Paths
|
||||
username: str
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -344,8 +344,8 @@ def _requires(modules: Sequence[str]) -> Sequence[str]:
|
|||
|
||||
reqs = mod.requires
|
||||
if reqs is None:
|
||||
error(f"Module {mod.name} has no REQUIRES specification")
|
||||
sys.exit(1)
|
||||
warning(f"Module {mod.name} has no REQUIRES specification")
|
||||
continue
|
||||
for r in reqs:
|
||||
if r not in res:
|
||||
res.append(r)
|
||||
|
@ -369,6 +369,10 @@ def module_install(*, user: bool, module: Sequence[str], parallel: bool=False) -
|
|||
|
||||
requirements = _requires(module)
|
||||
|
||||
if len(requirements) == 0:
|
||||
warning('requirements list is empty, no need to install anything')
|
||||
return
|
||||
|
||||
pre_cmd = [
|
||||
sys.executable, '-m', 'pip',
|
||||
'install',
|
||||
|
|
|
@ -28,7 +28,7 @@ F = TypeVar('F')
|
|||
from contextlib import contextmanager
|
||||
from typing import Iterator
|
||||
@contextmanager
|
||||
def override_config(config: F) -> Iterator[F]:
|
||||
def _override_config(config: F) -> Iterator[F]:
|
||||
'''
|
||||
Temporary override for config's parameters, useful for testing/fake data/etc.
|
||||
'''
|
||||
|
@ -44,12 +44,53 @@ def override_config(config: F) -> Iterator[F]:
|
|||
delattr(config, k)
|
||||
|
||||
|
||||
# helper for tests? not sure if could be useful elsewhere
|
||||
import importlib
|
||||
import sys
|
||||
from typing import Optional, Set
|
||||
ModuleRegex = str
|
||||
@contextmanager
|
||||
def tmp_config():
|
||||
import my.config as C
|
||||
with override_config(C):
|
||||
yield C # todo not sure?
|
||||
def _reload_modules(modules: ModuleRegex) -> Iterator[None]:
|
||||
def loaded_modules() -> Set[str]:
|
||||
return {name for name in sys.modules if re.fullmatch(modules, name)}
|
||||
|
||||
modules_before = loaded_modules()
|
||||
|
||||
for m in modules_before:
|
||||
importlib.reload(sys.modules[m])
|
||||
|
||||
try:
|
||||
yield
|
||||
finally:
|
||||
modules_after = loaded_modules()
|
||||
for m in modules_after:
|
||||
if m in modules_before:
|
||||
# was previously loaded, so need to reload to pick up old config
|
||||
importlib.reload(sys.modules[m])
|
||||
else:
|
||||
# wasn't previously loaded, so need to unload it
|
||||
# otherwise it might fail due to missing config etc
|
||||
sys.modules.pop(m, None)
|
||||
|
||||
|
||||
from contextlib import ExitStack
|
||||
import re
|
||||
@contextmanager
|
||||
def tmp_config(*, modules: Optional[ModuleRegex]=None, config=None):
|
||||
if modules is None:
|
||||
assert config is None
|
||||
if modules is not None:
|
||||
assert config is not None
|
||||
|
||||
import my.config
|
||||
with ExitStack() as module_reload_stack, _override_config(my.config) as new_config:
|
||||
if config is not None:
|
||||
overrides = {k: v for k, v in vars(config).items() if not k.startswith('__')}
|
||||
for k, v in overrides.items():
|
||||
setattr(new_config, k, v)
|
||||
|
||||
if modules is not None:
|
||||
module_reload_stack.enter_context(_reload_modules(modules))
|
||||
yield new_config
|
||||
|
||||
|
||||
def test_tmp_config() -> None:
|
||||
|
@ -63,3 +104,8 @@ def test_tmp_config() -> None:
|
|||
# todo hmm. not sure what should do about new properties??
|
||||
assert not hasattr(c, 'extra')
|
||||
assert c.google != 'whatever'
|
||||
|
||||
|
||||
###
|
||||
# todo properly deprecate, this isn't really meant for public use
|
||||
override_config = _override_config
|
||||
|
|
|
@ -123,8 +123,8 @@ from contextlib import contextmanager as ctx
|
|||
@ctx
|
||||
def _reset_config() -> Iterator[Config]:
|
||||
# todo maybe have this decorator for the whole of my.config?
|
||||
from .cfg import override_config
|
||||
with override_config(config) as cc:
|
||||
from .cfg import _override_config
|
||||
with _override_config(config) as cc:
|
||||
cc.enabled_modules = None
|
||||
cc.disabled_modules = None
|
||||
cc.cache_dir = None
|
||||
|
|
|
@ -1,17 +1,19 @@
|
|||
from .common import assert_subpackage; assert_subpackage(__name__)
|
||||
|
||||
|
||||
from contextlib import contextmanager
|
||||
from pathlib import Path
|
||||
import shutil
|
||||
import sqlite3
|
||||
from tempfile import TemporaryDirectory
|
||||
from typing import Tuple, Any, Iterator, Callable, Optional, Union
|
||||
|
||||
|
||||
from .common import PathIsh
|
||||
from .common import PathIsh, assert_never
|
||||
from .compat import Literal
|
||||
|
||||
|
||||
def sqlite_connect_immutable(db: PathIsh) -> sqlite3.Connection:
|
||||
# https://www.sqlite.org/draft/uri.html#uriimmutable
|
||||
return sqlite3.connect(f'file:{db}?immutable=1', uri=True)
|
||||
|
||||
|
||||
|
@ -30,6 +32,44 @@ def test_sqlite_connect_immutable(tmp_path: Path) -> None:
|
|||
conn.execute('DROP TABLE testtable')
|
||||
|
||||
|
||||
SqliteRowFactory = Callable[[sqlite3.Cursor, sqlite3.Row], Any]
|
||||
|
||||
def dict_factory(cursor, row):
|
||||
fields = [column[0] for column in cursor.description]
|
||||
return {key: value for key, value in zip(fields, row)}
|
||||
|
||||
|
||||
Factory = Union[SqliteRowFactory, Literal['row', 'dict']]
|
||||
|
||||
@contextmanager
|
||||
def sqlite_connection(db: PathIsh, *, immutable: bool=False, row_factory: Optional[Factory]=None) -> Iterator[sqlite3.Connection]:
|
||||
dbp = f'file:{db}'
|
||||
# https://www.sqlite.org/draft/uri.html#uriimmutable
|
||||
if immutable:
|
||||
# assert results in nicer error than sqlite3.OperationalError
|
||||
assert Path(db).exists(), db
|
||||
dbp = f'{dbp}?immutable=1'
|
||||
row_factory_: Any = None
|
||||
if row_factory is not None:
|
||||
if callable(row_factory):
|
||||
row_factory_ = row_factory
|
||||
elif row_factory == 'row':
|
||||
row_factory_ = sqlite3.Row
|
||||
elif row_factory == 'dict':
|
||||
row_factory_ = dict_factory
|
||||
else:
|
||||
assert_never()
|
||||
|
||||
conn = sqlite3.connect(dbp, uri=True)
|
||||
try:
|
||||
conn.row_factory = row_factory_
|
||||
with conn:
|
||||
yield conn
|
||||
finally:
|
||||
# Connection context manager isn't actually closing the connection, only keeps transaction
|
||||
conn.close()
|
||||
|
||||
|
||||
# TODO come up with a better name?
|
||||
# NOTE: this is tested by tests/sqlite.py::test_sqlite_read_with_wal
|
||||
def sqlite_copy_and_open(db: PathIsh) -> sqlite3.Connection:
|
||||
|
@ -52,8 +92,6 @@ def sqlite_copy_and_open(db: PathIsh) -> sqlite3.Connection:
|
|||
return dest
|
||||
|
||||
|
||||
from typing import Tuple, Any, Iterator
|
||||
|
||||
# NOTE hmm, so this kinda works
|
||||
# V = TypeVar('V', bound=Tuple[Any, ...])
|
||||
# def select(cols: V, rest: str, *, db: sqlite3.Connetion) -> Iterator[V]:
|
||||
|
|
|
@ -3,6 +3,11 @@
|
|||
|
||||
Consumes data exported by https://github.com/karlicoss/emfitexport
|
||||
"""
|
||||
|
||||
REQUIRES = [
|
||||
'git+https://github.com/karlicoss/emfitexport',
|
||||
]
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Iterable, Any, Optional
|
||||
|
||||
|
@ -140,16 +145,20 @@ def stats() -> Stats:
|
|||
from contextlib import contextmanager
|
||||
from typing import Iterator
|
||||
@contextmanager
|
||||
def fake_data(nights: int=500) -> Iterator[None]:
|
||||
from ..core.cfg import override_config
|
||||
def fake_data(nights: int=500) -> Iterator:
|
||||
from my.core.cfg import tmp_config
|
||||
from tempfile import TemporaryDirectory
|
||||
with override_config(config) as cfg, TemporaryDirectory() as td:
|
||||
with TemporaryDirectory() as td:
|
||||
tdir = Path(td)
|
||||
cfg.export_path = tdir
|
||||
|
||||
gen = dal.FakeData()
|
||||
gen.fill(tdir, count=nights)
|
||||
yield
|
||||
|
||||
class override:
|
||||
class emfit:
|
||||
export_path = tdir
|
||||
|
||||
with tmp_config(modules=__name__, config=override) as cfg:
|
||||
yield cfg
|
||||
|
||||
|
||||
# TODO remove/deprecate it? I think used by timeline
|
||||
|
|
|
@ -87,20 +87,24 @@ def stats() -> Stats:
|
|||
# TODO make sure it's possible to 'advise' functions and override stuff
|
||||
|
||||
from contextlib import contextmanager
|
||||
from typing import Iterator
|
||||
@contextmanager
|
||||
def fake_data(count: int=100):
|
||||
from .core.cfg import override_config
|
||||
def fake_data(count: int=100) -> Iterator:
|
||||
from my.core.cfg import tmp_config
|
||||
from tempfile import TemporaryDirectory
|
||||
import json
|
||||
with override_config(endomondo) as cfg, TemporaryDirectory() as td:
|
||||
with TemporaryDirectory() as td:
|
||||
tdir = Path(td)
|
||||
cfg.export_path = tdir
|
||||
|
||||
# todo would be nice to somehow expose the generator so it's possible to hack from the outside?
|
||||
fd = dal.FakeData()
|
||||
data = fd.generate(count=count)
|
||||
|
||||
jf = tdir / 'data.json'
|
||||
jf.write_text(json.dumps(data))
|
||||
|
||||
yield
|
||||
class override:
|
||||
class endomondo:
|
||||
export_path = tdir
|
||||
|
||||
with tmp_config(modules=__name__, config=override) as cfg:
|
||||
# todo would be nice to somehow expose the generator so it's possible to hack from the outside?
|
||||
yield cfg
|
||||
|
|
|
@ -3,25 +3,37 @@ Messenger data from Android app database (in =/data/data/com.facebook.orca/datab
|
|||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
REQUIRES = ['dataset']
|
||||
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from typing import Iterator, Sequence, Optional, Dict
|
||||
from pathlib import Path
|
||||
import sqlite3
|
||||
from typing import Iterator, Sequence, Optional, Dict, Union, List
|
||||
|
||||
from more_itertools import unique_everseen
|
||||
|
||||
from my.core import get_files, Paths, datetime_naive, Res, assert_never, LazyLogger, make_config
|
||||
from my.core.error import echain
|
||||
from my.core.sqlite import sqlite_connection
|
||||
|
||||
from my.config import fbmessenger as user_config
|
||||
|
||||
|
||||
from ..core import Paths
|
||||
logger = LazyLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
class config(user_config.android):
|
||||
class Config(user_config.android):
|
||||
# paths[s]/glob to the exported sqlite databases
|
||||
export_path: Paths
|
||||
|
||||
facebook_id: Optional[str] = None
|
||||
|
||||
|
||||
# hmm. this is necessary for default value (= None) to work
|
||||
# otherwise Config.facebook_id is always None..
|
||||
config = make_config(Config)
|
||||
|
||||
|
||||
from ..core import get_files
|
||||
from pathlib import Path
|
||||
def inputs() -> Sequence[Path]:
|
||||
return get_files(config.export_path)
|
||||
|
||||
|
@ -35,10 +47,9 @@ class Sender:
|
|||
@dataclass(unsafe_hash=True)
|
||||
class Thread:
|
||||
id: str
|
||||
name: Optional[str]
|
||||
name: Optional[str] # isn't set for groups or one to one messages
|
||||
|
||||
# todo not sure about order of fields...
|
||||
from ..core import datetime_naive
|
||||
@dataclass
|
||||
class _BaseMessage:
|
||||
id: str
|
||||
|
@ -63,77 +74,92 @@ class Message(_BaseMessage):
|
|||
reply_to: Optional[Message]
|
||||
|
||||
|
||||
import json
|
||||
from typing import Union
|
||||
from ..core import Res, assert_never
|
||||
from ..core.dataset import connect_readonly, DatabaseT
|
||||
Entity = Union[Sender, Thread, _Message]
|
||||
def _entities() -> Iterator[Res[Entity]]:
|
||||
for f in inputs():
|
||||
with connect_readonly(f) as db:
|
||||
yield from _process_db(db)
|
||||
dbs = inputs()
|
||||
for i, f in enumerate(dbs):
|
||||
logger.debug(f'processing {f} {i}/{len(dbs)}')
|
||||
with sqlite_connection(f, immutable=True, row_factory='row') as db:
|
||||
try:
|
||||
yield from _process_db(db)
|
||||
except Exception as e:
|
||||
yield echain(RuntimeError(f'While processing {f}'), cause=e)
|
||||
|
||||
|
||||
def _process_db(db: DatabaseT) -> Iterator[Res[Entity]]:
|
||||
def _normalise_user_id(ukey: str) -> str:
|
||||
# trying to match messages.author from fbchat
|
||||
prefix = 'FACEBOOK:'
|
||||
assert ukey.startswith(prefix), ukey
|
||||
return ukey[len(prefix):]
|
||||
|
||||
|
||||
def _normalise_thread_id(key) -> str:
|
||||
# works both for GROUP:group_id and ONE_TO_ONE:other_user:your_user
|
||||
threadkey2id = lambda key: key.split(':')[1]
|
||||
return key.split(':')[1]
|
||||
|
||||
for r in db['threads'].find():
|
||||
try:
|
||||
yield Thread(
|
||||
id=threadkey2id(r['thread_key']),
|
||||
name=r['name'],
|
||||
)
|
||||
except Exception as e:
|
||||
yield e
|
||||
|
||||
def _process_db(db: sqlite3.Connection) -> Iterator[Res[Entity]]:
|
||||
senders: Dict[str, Sender] = {}
|
||||
for r in db.execute('''SELECT * FROM thread_users'''):
|
||||
# for messaging_actor_type == 'REDUCED_MESSAGING_ACTOR', name is None
|
||||
# but they are still referenced, so need to keep
|
||||
name = r['name'] or '<NAME UNAVAILABLE>'
|
||||
user_key = r['user_key']
|
||||
s = Sender(
|
||||
id=_normalise_user_id(user_key),
|
||||
name=name,
|
||||
)
|
||||
senders[user_key] = s
|
||||
yield s
|
||||
|
||||
self_id = config.facebook_id
|
||||
thread_users: Dict[str, List[Sender]] = {}
|
||||
for r in db.execute('SELECT * from thread_participants'):
|
||||
thread_key = r['thread_key']
|
||||
user_key = r['user_key']
|
||||
if self_id is not None and user_key == f'FACEBOOK:{self_id}':
|
||||
# exclude yourself, otherwise it's just spammy to show up in all participants
|
||||
continue
|
||||
|
||||
for r in db['messages'].find(order_by='timestamp_ms'):
|
||||
mtype: int = r['msg_type']
|
||||
if mtype == -1:
|
||||
# likely immediately deleted or something? doesn't have any data at all
|
||||
ll = thread_users.get(thread_key)
|
||||
if ll is None:
|
||||
ll = []
|
||||
thread_users[thread_key] = ll
|
||||
ll.append(senders[user_key])
|
||||
|
||||
for r in db.execute('SELECT * FROM threads'):
|
||||
thread_key = r['thread_key']
|
||||
thread_type = thread_key.split(':')[0]
|
||||
if thread_type == 'MONTAGE': # no idea what this is?
|
||||
continue
|
||||
name = r['name'] # seems that it's only set for some groups
|
||||
if name is None:
|
||||
users = thread_users[thread_key]
|
||||
name = ', '.join([u.name for u in users])
|
||||
yield Thread(
|
||||
id=_normalise_thread_id(thread_key),
|
||||
name=name,
|
||||
)
|
||||
|
||||
user_id = None
|
||||
try:
|
||||
# todo could use thread_users?
|
||||
sj = json.loads(r['sender'])
|
||||
ukey: str = sj['user_key']
|
||||
prefix = 'FACEBOOK:'
|
||||
assert ukey.startswith(prefix), ukey
|
||||
user_id = ukey[len(prefix):]
|
||||
yield Sender(
|
||||
id=user_id,
|
||||
name=sj['name'],
|
||||
)
|
||||
except Exception as e:
|
||||
yield e
|
||||
continue
|
||||
|
||||
thread_id = None
|
||||
try:
|
||||
thread_id = threadkey2id(r['thread_key'])
|
||||
except Exception as e:
|
||||
yield e
|
||||
continue
|
||||
|
||||
try:
|
||||
assert user_id is not None
|
||||
assert thread_id is not None
|
||||
yield _Message(
|
||||
id=r['msg_id'],
|
||||
dt=datetime.fromtimestamp(r['timestamp_ms'] / 1000),
|
||||
# is_incoming=False, TODO??
|
||||
text=r['text'],
|
||||
thread_id=thread_id,
|
||||
sender_id=user_id,
|
||||
reply_to_id=r['message_replied_to_id']
|
||||
)
|
||||
except Exception as e:
|
||||
yield e
|
||||
for r in db.execute('''
|
||||
SELECT *, json_extract(sender, "$.user_key") AS user_key FROM messages
|
||||
WHERE msg_type NOT IN (
|
||||
-1, /* these don't have any data at all, likely immediately deleted or something? */
|
||||
2 /* these are 'left group' system messages, also a bit annoying since they might reference nonexistent users */
|
||||
)
|
||||
ORDER BY timestamp_ms /* they aren't in order in the database, so need to sort */
|
||||
'''):
|
||||
yield _Message(
|
||||
id=r['msg_id'],
|
||||
dt=datetime.fromtimestamp(r['timestamp_ms'] / 1000),
|
||||
# is_incoming=False, TODO??
|
||||
text=r['text'],
|
||||
thread_id=_normalise_thread_id(r['thread_key']),
|
||||
sender_id=_normalise_user_id(r['user_key']),
|
||||
reply_to_id=r['message_replied_to_id']
|
||||
)
|
||||
|
||||
|
||||
from more_itertools import unique_everseen
|
||||
def messages() -> Iterator[Res[Message]]:
|
||||
senders: Dict[str, Sender] = {}
|
||||
msgs: Dict[str, Message] = {}
|
||||
|
@ -150,12 +176,12 @@ def messages() -> Iterator[Res[Message]]:
|
|||
continue
|
||||
if isinstance(x, _Message):
|
||||
reply_to_id = x.reply_to_id
|
||||
# hmm, reply_to be missing due to the synthetic nature of export, so have to be defensive
|
||||
reply_to = None if reply_to_id is None else msgs.get(reply_to_id)
|
||||
# also would be interesting to merge together entities rather than resuling messages from different sources..
|
||||
# then the merging thing could be moved to common?
|
||||
try:
|
||||
sender = senders[x.sender_id]
|
||||
# hmm, reply_to be missing due to the synthetic nature of export
|
||||
# also would be interesting to merge together entities rather than resuling messages from different sources..
|
||||
# then the merging thing could be moved to common?
|
||||
reply_to = None if reply_to_id is None else msgs[reply_to_id]
|
||||
thread = threads[x.thread_id]
|
||||
except Exception as e:
|
||||
yield e
|
||||
|
|
|
@ -7,10 +7,13 @@ REQUIRES = [
|
|||
'git+https://github.com/karlicoss/fbmessengerexport',
|
||||
]
|
||||
|
||||
from contextlib import ExitStack, contextmanager
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Iterator
|
||||
|
||||
from my.core import PathIsh, Res, stat, Stats
|
||||
from my.core.warnings import high
|
||||
from my.config import fbmessenger as user_config
|
||||
|
||||
import fbmessengerexport.dal as messenger
|
||||
|
@ -22,7 +25,6 @@ _new_section = getattr(user_config, 'fbmessengerexport', None)
|
|||
_old_attr = getattr(user_config, 'export_db', None)
|
||||
|
||||
if _new_section is None and _old_attr is not None:
|
||||
from my.core.warnings import high
|
||||
high("""DEPRECATED! Please modify your fbmessenger config to look like:
|
||||
|
||||
class fbmessenger:
|
||||
|
@ -35,24 +37,26 @@ class fbmessenger:
|
|||
###
|
||||
|
||||
|
||||
from ..core import PathIsh
|
||||
@dataclass
|
||||
class config(user_config.fbmessengerexport):
|
||||
export_db: PathIsh
|
||||
|
||||
|
||||
def _dal() -> messenger.DAL:
|
||||
return messenger.DAL(config.export_db)
|
||||
@contextmanager
|
||||
def _dal() -> Iterator[messenger.DAL]:
|
||||
model = messenger.DAL(config.export_db)
|
||||
with ExitStack() as stack:
|
||||
if hasattr(model, '__dal__'): # defensive to support legacy fbmessengerexport
|
||||
stack.enter_context(model)
|
||||
yield model
|
||||
|
||||
|
||||
from ..core import Res
|
||||
def messages() -> Iterator[Res[messenger.Message]]:
|
||||
model = _dal()
|
||||
for t in model.iter_threads():
|
||||
yield from t.iter_messages()
|
||||
with _dal() as model:
|
||||
for t in model.iter_threads():
|
||||
yield from t.iter_messages()
|
||||
|
||||
|
||||
from ..core import stat, Stats
|
||||
def stats() -> Stats:
|
||||
return stat(messages)
|
||||
|
||||
|
@ -75,11 +79,9 @@ def dump_chat_history(where: PathIsh) -> None:
|
|||
p = Path(where)
|
||||
assert not p.exists() or p.is_dir()
|
||||
|
||||
model = _dal()
|
||||
|
||||
from shutil import rmtree
|
||||
from tempfile import TemporaryDirectory
|
||||
with TemporaryDirectory() as tdir:
|
||||
with TemporaryDirectory() as tdir, _dal() as model:
|
||||
td = Path(tdir)
|
||||
_dump_helper(model, td)
|
||||
|
||||
|
|
|
@ -5,13 +5,15 @@ from __future__ import annotations
|
|||
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from typing import Iterator, Sequence, Optional, Dict
|
||||
from pathlib import Path
|
||||
from typing import Iterator, Sequence, Optional
|
||||
|
||||
from my.core import get_files, Paths, Res
|
||||
from my.core.sqlite import sqlite_connection
|
||||
|
||||
from my.config import hackernews as user_config
|
||||
|
||||
|
||||
from ..core import Paths
|
||||
@dataclass
|
||||
class config(user_config.dogsheep):
|
||||
# paths[s]/glob to the dogsheep database
|
||||
|
@ -20,8 +22,6 @@ class config(user_config.dogsheep):
|
|||
|
||||
# todo so much boilerplate... really need some common wildcard imports?...
|
||||
# at least for stuff which realistically is used in each module like get_files/Sequence/Paths/dataclass/Iterator/Optional
|
||||
from ..core import get_files
|
||||
from pathlib import Path
|
||||
def inputs() -> Sequence[Path]:
|
||||
return get_files(config.export_path)
|
||||
|
||||
|
@ -44,15 +44,15 @@ class Item:
|
|||
@property
|
||||
def permalink(self) -> str:
|
||||
return hackernews_link(self.id)
|
||||
# TODO hmm kinda annoying that permalink isn't getting serialized
|
||||
# maybe won't be such a big problem if we used hpi query directly on objects, without jsons?
|
||||
# so we could just take .permalink thing
|
||||
|
||||
|
||||
from ..core.error import Res
|
||||
from ..core.dataset import connect_readonly
|
||||
def items() -> Iterator[Res[Item]]:
|
||||
f = max(inputs())
|
||||
with connect_readonly(f) as db:
|
||||
items = db['items']
|
||||
for r in items.all(order_by='time'):
|
||||
with sqlite_connection(f, immutable=True, row_factory='row') as conn:
|
||||
for r in conn.execute('SELECT * FROM items ORDER BY time'):
|
||||
yield Item(
|
||||
id=r['id'],
|
||||
type=r['type'],
|
||||
|
|
|
@ -1,20 +1,17 @@
|
|||
"""
|
||||
[[https://play.google.com/store/apps/details?id=io.github.hidroh.materialistic][Materialistic]] app for Hackernews
|
||||
"""
|
||||
|
||||
REQUIRES = ['dataset']
|
||||
|
||||
from datetime import datetime
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Iterator, NamedTuple, Sequence
|
||||
|
||||
import pytz
|
||||
from my.core import get_files
|
||||
from my.core.sqlite import sqlite_connection
|
||||
|
||||
from my.config import materialistic as config
|
||||
# todo migrate config to my.hackernews.materialistic
|
||||
|
||||
|
||||
from ..core import get_files
|
||||
from pathlib import Path
|
||||
def inputs() -> Sequence[Path]:
|
||||
return get_files(config.export_path)
|
||||
|
||||
|
@ -28,7 +25,7 @@ class Saved(NamedTuple):
|
|||
@property
|
||||
def when(self) -> datetime:
|
||||
ts = int(self.row['time']) / 1000
|
||||
return datetime.fromtimestamp(ts, tz=pytz.utc)
|
||||
return datetime.fromtimestamp(ts, tz=timezone.utc)
|
||||
|
||||
@property
|
||||
def uid(self) -> str:
|
||||
|
@ -47,13 +44,11 @@ class Saved(NamedTuple):
|
|||
return hackernews_link(self.uid)
|
||||
|
||||
|
||||
from ..core.dataset import connect_readonly
|
||||
def raw() -> Iterator[Row]:
|
||||
last = max(inputs())
|
||||
with connect_readonly(last) as db:
|
||||
saved = db['saved']
|
||||
with sqlite_connection(last, immutable=True, row_factory='dict') as conn:
|
||||
yield from conn.execute('SELECT * FROM saved ORDER BY time')
|
||||
# TODO wonder if it's 'save time' or creation time?
|
||||
yield from saved.all(order_by='time')
|
||||
|
||||
|
||||
def saves() -> Iterator[Saved]:
|
||||
|
|
|
@ -119,15 +119,17 @@ def _entities() -> Iterator[Res[Union[User, _Message]]]:
|
|||
# todo use TypedDict?
|
||||
for f in inputs():
|
||||
with sqlite_connect_immutable(f) as db:
|
||||
|
||||
for (self_uid, thread_json) in select(('user_id', 'thread_info'), 'FROM threads', db=db):
|
||||
j = json.loads(thread_json)
|
||||
# todo in principle should leave the thread attached to the message?
|
||||
# since thread is a group of users?
|
||||
# inviter usually contains our own user
|
||||
for r in [j['inviter'], *j['recipients']]:
|
||||
# id disappeared and seems that pk_id is in use now (around december 2022)
|
||||
uid = r.get('id') or r.get('pk_id')
|
||||
assert uid is not None
|
||||
yield User(
|
||||
id=str(r['id']), # for some reason it's int in the db
|
||||
id=str(uid), # for some reason it's int in the db
|
||||
full_name=r['full_name'],
|
||||
username=r['username'],
|
||||
)
|
||||
|
|
|
@ -10,7 +10,7 @@ from ..core.common import LazyLogger
|
|||
|
||||
logger = LazyLogger(__name__)
|
||||
|
||||
from my.config import jawbone as config
|
||||
from my.config import jawbone as config # type: ignore[attr-defined]
|
||||
|
||||
|
||||
BDIR = config.export_dir
|
||||
|
|
|
@ -85,7 +85,7 @@ def iter_useful(data_file: str):
|
|||
|
||||
# TODO <<< hmm. these files do contain deep and light sleep??
|
||||
# also steps stats??
|
||||
from my.config import jawbone as config
|
||||
from my.config import jawbone as config # type: ignore[attr-defined]
|
||||
|
||||
p = config.export_dir / 'old_csv'
|
||||
# TODO with_my?
|
||||
|
@ -95,7 +95,7 @@ files = [
|
|||
p / "2017.csv",
|
||||
]
|
||||
|
||||
from kython import concat, parse_date
|
||||
from kython import concat, parse_date # type: ignore
|
||||
useful = concat(*(list(iter_useful(str(f))) for f in files))
|
||||
|
||||
# for u in useful:
|
||||
|
@ -108,7 +108,7 @@ dates = [parse_date(u.date, yearfirst=True, dayfirst=False) for u in useful]
|
|||
# TODO filter outliers?
|
||||
|
||||
# TODO don't need this anymore? it's gonna be in dashboards package
|
||||
from kython.plotting import plot_timestamped
|
||||
from kython.plotting import plot_timestamped # type: ignore
|
||||
for attr, lims, mavg, fig in [ # type: ignore
|
||||
('light', (0, 400), 5, None),
|
||||
('deep', (0, 600), 5, None),
|
||||
|
|
|
@ -19,7 +19,7 @@ from ..core.common import LazyLogger, mcachew, fastermime
|
|||
from ..core.error import Res, sort_res_by
|
||||
from ..core.cachew import cache_dir
|
||||
|
||||
from my.config import photos as config
|
||||
from my.config import photos as config # type: ignore[attr-defined]
|
||||
|
||||
|
||||
logger = LazyLogger(__name__)
|
||||
|
|
|
@ -58,22 +58,27 @@ def stats() -> Stats:
|
|||
|
||||
# basically, hack config and populate it with fake data? fake data generated by DAL, but the rest is handled by this?
|
||||
|
||||
from typing import Iterator
|
||||
from contextlib import contextmanager
|
||||
from typing import Iterator
|
||||
# todo take seed, or what?
|
||||
@contextmanager
|
||||
def fake_data(rows: int=1000) -> Iterator[None]:
|
||||
def fake_data(rows: int=1000) -> Iterator:
|
||||
# todo also disable cachew automatically for such things?
|
||||
from .core.cachew import disabled_cachew
|
||||
from .core.cfg import override_config
|
||||
from my.core.cfg import tmp_config
|
||||
from my.core.cachew import disabled_cachew
|
||||
from tempfile import TemporaryDirectory
|
||||
with disabled_cachew(), override_config(config) as cfg, TemporaryDirectory() as td:
|
||||
import json
|
||||
with disabled_cachew(), TemporaryDirectory() as td:
|
||||
tdir = Path(td)
|
||||
cfg.export_path = tdir
|
||||
f = tdir / 'rescuetime.json'
|
||||
import json
|
||||
f.write_text(json.dumps(dal.fake_data_generator(rows=rows)))
|
||||
yield
|
||||
|
||||
class override:
|
||||
class rescuetime:
|
||||
export_path = tdir
|
||||
|
||||
with tmp_config(modules=__name__, config=override) as cfg:
|
||||
yield cfg
|
||||
# TODO ok, now it's something that actually could run on CI!
|
||||
# todo would be kinda nice if doctor could run against the fake data, to have a basic health check of the module?
|
||||
|
||||
|
|
|
@ -13,7 +13,7 @@ from typing import Iterable
|
|||
from .core import Res, get_files
|
||||
from .core.common import isoparse, Json
|
||||
|
||||
import tcxparser
|
||||
import tcxparser # type: ignore[import]
|
||||
|
||||
from my.config import runnerup as config
|
||||
|
||||
|
|
21
my/simple.py
Normal file
21
my/simple.py
Normal file
|
@ -0,0 +1,21 @@
|
|||
'''
|
||||
Just a demo module for testing and documentation purposes
|
||||
'''
|
||||
from dataclasses import dataclass
|
||||
from typing import Iterator
|
||||
|
||||
from my.core import make_config
|
||||
|
||||
from my.config import simple as user_config
|
||||
|
||||
|
||||
@dataclass
|
||||
class simple(user_config):
|
||||
count: int
|
||||
|
||||
|
||||
config = make_config(simple)
|
||||
|
||||
|
||||
def items() -> Iterator[int]:
|
||||
yield from range(config.count)
|
14
my/taplog.py
14
my/taplog.py
|
@ -1,11 +1,11 @@
|
|||
'''
|
||||
[[https://play.google.com/store/apps/details?id=com.waterbear.taglog][Taplog]] app data
|
||||
'''
|
||||
|
||||
from datetime import datetime
|
||||
from typing import NamedTuple, Dict, Optional, Iterable
|
||||
|
||||
from .core import get_files
|
||||
from my.core import get_files, stat, Stats
|
||||
from my.core.sqlite import sqlite_connection
|
||||
|
||||
from my.config import taplog as user_config
|
||||
|
||||
|
@ -46,11 +46,10 @@ class Entry(NamedTuple):
|
|||
|
||||
def entries() -> Iterable[Entry]:
|
||||
last = max(get_files(user_config.export_path))
|
||||
from .core.dataset import connect_readonly
|
||||
db = connect_readonly(last)
|
||||
# todo is it sorted by timestamp?
|
||||
for row in db['Log'].all():
|
||||
yield Entry(row)
|
||||
with sqlite_connection(last, immutable=True, row_factory='dict') as db:
|
||||
# todo is it sorted by timestamp?
|
||||
for row in db.execute('SELECT * FROM Log'):
|
||||
yield Entry(row)
|
||||
|
||||
|
||||
# I guess worth having as top level considering it would be quite common?
|
||||
|
@ -60,6 +59,5 @@ def by_button(button: str) -> Iterable[Entry]:
|
|||
yield e
|
||||
|
||||
|
||||
from .core import stat, Stats
|
||||
def stats() -> Stats:
|
||||
return stat(entries)
|
||||
|
|
103
my/telegram/telegram_backup.py
Normal file
103
my/telegram/telegram_backup.py
Normal file
|
@ -0,0 +1,103 @@
|
|||
"""
|
||||
Telegram data via [fabianonline/telegram_backup](https://github.com/fabianonline/telegram_backup) tool
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime, timezone
|
||||
import sqlite3
|
||||
from typing import Dict, Iterator, Optional
|
||||
|
||||
from my.core import datetime_aware, PathIsh
|
||||
from my.core.sqlite import sqlite_connection
|
||||
|
||||
from my.config import telegram as user_config
|
||||
|
||||
|
||||
@dataclass
|
||||
class config(user_config.telegram_backup):
|
||||
# path to the export database.sqlite
|
||||
export_path: PathIsh
|
||||
|
||||
|
||||
@dataclass
|
||||
class Chat:
|
||||
id: str
|
||||
name: Optional[str]
|
||||
# not all users have short handle + groups don't have them either?
|
||||
# TODO hmm some groups have it -- it's just the tool doesn't dump them??
|
||||
handle: Optional[str]
|
||||
# not sure if need type?
|
||||
|
||||
|
||||
@dataclass
|
||||
class User:
|
||||
id: str
|
||||
name: Optional[str]
|
||||
|
||||
|
||||
@dataclass
|
||||
class Message:
|
||||
# NOTE: message id is NOT unique globally -- only with respect to chat!
|
||||
id: int
|
||||
time: datetime_aware
|
||||
chat: Chat
|
||||
sender: User
|
||||
text: str
|
||||
|
||||
@property
|
||||
def permalink(self) -> str:
|
||||
handle = self.chat.handle
|
||||
if handle is None:
|
||||
clink = str(self.chat.id)
|
||||
else:
|
||||
# FIXME add c/
|
||||
clink = f'{handle}'
|
||||
|
||||
# NOTE: don't think deep links to messages work for private conversations sadly https://core.telegram.org/api/links#message-links
|
||||
# NOTE: doesn't look like this works with private groups at all, doesn't even jump into it
|
||||
return f'https://t.me/{clink}/{self.id}'
|
||||
|
||||
|
||||
|
||||
Chats = Dict[str, Chat]
|
||||
def _message_from_row(r: sqlite3.Row, *, chats: Chats) -> Message:
|
||||
ts = r['time']
|
||||
time = datetime.fromtimestamp(ts, tz=timezone.utc)
|
||||
chat = chats[r['source_id']]
|
||||
sender = chats[r['sender_id']]
|
||||
return Message(
|
||||
id=r['message_id'],
|
||||
time=time,
|
||||
chat=chat,
|
||||
sender=User(id=sender.id, name=sender.name),
|
||||
text=r['text'],
|
||||
)
|
||||
|
||||
|
||||
def messages() -> Iterator[Message]:
|
||||
with sqlite_connection(config.export_path, immutable=True, row_factory='row') as db:
|
||||
|
||||
chats: Chats = {}
|
||||
for r in db.execute('SELECT * FROM chats'):
|
||||
chat = Chat(id=r['id'], name=r['name'], handle=None)
|
||||
assert chat.id not in chats
|
||||
chats[chat.id] = chat
|
||||
|
||||
for r in db.execute('SELECT * FROM users'):
|
||||
first = r["first_name"]
|
||||
last = r["last_name"]
|
||||
name: Optional[str]
|
||||
if first is not None and last is not None:
|
||||
name = f'{first} {last}'
|
||||
else:
|
||||
name = first or last
|
||||
|
||||
chat = Chat(id=r['id'], name=name, handle=r['username'])
|
||||
assert chat.id not in chats
|
||||
chats[chat.id] = chat
|
||||
|
||||
# TODO order by? not sure
|
||||
for r in db.execute('SELECT * FROM messages WHERE message_type NOT IN ("service_message", "empty_message")'):
|
||||
# seems like the only remaining have message_type = 'message'
|
||||
yield _message_from_row(r, chats=chats)
|
||||
|
|
@ -3,19 +3,21 @@ Tinder data from Android app database (in =/data/data/com.tinder/databases/tinde
|
|||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
REQUIRES = ['dataset']
|
||||
|
||||
from collections import defaultdict
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime, timezone
|
||||
from itertools import chain
|
||||
from pathlib import Path
|
||||
import sqlite3
|
||||
from typing import Sequence, Iterator, Union, Dict, List, Mapping
|
||||
|
||||
from more_itertools import unique_everseen
|
||||
|
||||
from my.core import Paths, get_files, Res, assert_never, stat, Stats, datetime_aware
|
||||
from my.core.dataset import connect_readonly, DatabaseT
|
||||
from my.core import Paths, get_files, Res, assert_never, stat, Stats, datetime_aware, LazyLogger
|
||||
from my.core.sqlite import sqlite_connection
|
||||
|
||||
|
||||
logger = LazyLogger(__name__)
|
||||
|
||||
|
||||
from my.config import tinder as user_config
|
||||
|
@ -39,7 +41,7 @@ class _BaseMatch:
|
|||
id: str
|
||||
|
||||
|
||||
@dataclass
|
||||
@dataclass(unsafe_hash=True)
|
||||
class _Match(_BaseMatch):
|
||||
person_id: str
|
||||
|
||||
|
@ -59,7 +61,7 @@ class _BaseMessage:
|
|||
text: str
|
||||
|
||||
|
||||
@dataclass
|
||||
@dataclass(unsafe_hash=True)
|
||||
class _Message(_BaseMessage):
|
||||
match_id: str
|
||||
from_id: str
|
||||
|
@ -73,6 +75,8 @@ class Message(_BaseMessage):
|
|||
to: Person
|
||||
|
||||
|
||||
# todo hmm I have a suspicion it might be cumulative?
|
||||
# although still possible that the user might remove/install app back, so need to keep that in mind
|
||||
def inputs() -> Sequence[Path]:
|
||||
return get_files(config.export_path)
|
||||
|
||||
|
@ -82,41 +86,46 @@ Entity = Union[Person, Match, Message]
|
|||
|
||||
|
||||
def _entities() -> Iterator[Res[_Entity]]:
|
||||
for db_file in inputs():
|
||||
with connect_readonly(db_file) as db:
|
||||
dbs = inputs()
|
||||
for i, db_file in enumerate(dbs):
|
||||
logger.debug(f'processing {db_file} {i}/{len(dbs)}')
|
||||
with sqlite_connection(db_file, immutable=True, row_factory='row') as db:
|
||||
yield from _handle_db(db)
|
||||
|
||||
|
||||
def _handle_db(db: DatabaseT) -> Iterator[Res[_Entity]]:
|
||||
def _handle_db(db: sqlite3.Connection) -> Iterator[Res[_Entity]]:
|
||||
# profile_user_view contains our own user id
|
||||
for row in chain(db['profile_user_view'], db['match_person']):
|
||||
for row in chain(
|
||||
db.execute('SELECT * FROM profile_user_view'),
|
||||
db.execute('SELECT * FROM match_person'),
|
||||
):
|
||||
try:
|
||||
yield _parse_person(row)
|
||||
except Exception as e:
|
||||
# todo attach error contex?
|
||||
yield e
|
||||
|
||||
for row in db['match']:
|
||||
for row in db.execute('SELECT * FROM match'):
|
||||
try:
|
||||
yield _parse_match(row)
|
||||
except Exception as e:
|
||||
yield e
|
||||
|
||||
for row in db['message']:
|
||||
for row in db.execute('SELECT * FROM message'):
|
||||
try:
|
||||
yield _parse_msg(row)
|
||||
except Exception as e:
|
||||
yield e
|
||||
|
||||
|
||||
def _parse_person(row) -> Person:
|
||||
def _parse_person(row: sqlite3.Row) -> Person:
|
||||
return Person(
|
||||
id=row['id'],
|
||||
name=row['name'],
|
||||
)
|
||||
|
||||
|
||||
def _parse_match(row) -> _Match:
|
||||
def _parse_match(row: sqlite3.Row) -> _Match:
|
||||
return _Match(
|
||||
id=row['id'],
|
||||
person_id=row['person_id'],
|
||||
|
@ -124,7 +133,7 @@ def _parse_match(row) -> _Match:
|
|||
)
|
||||
|
||||
|
||||
def _parse_msg(row) -> _Message:
|
||||
def _parse_msg(row: sqlite3.Row) -> _Message:
|
||||
# note it also has raw_message_data -- not sure which is best to use..
|
||||
sent = row['sent_date']
|
||||
return _Message(
|
||||
|
|
|
@ -12,7 +12,7 @@ except ImportError as ie:
|
|||
# must be caused by something else
|
||||
raise ie
|
||||
try:
|
||||
from my.config import twitter as user_config # type: ignore[misc]
|
||||
from my.config import twitter as user_config # type: ignore[misc,assignment]
|
||||
except ImportError:
|
||||
raise ie # raise the original exception.. must be something else
|
||||
else:
|
||||
|
|
|
@ -4,31 +4,32 @@ Twitter data from Talon app database (in =/data/data/com.klinker.android.twitter
|
|||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from datetime import datetime, timezone
|
||||
import re
|
||||
from typing import Iterator, Sequence, Optional, Dict
|
||||
import sqlite3
|
||||
from typing import Iterator, Sequence, Union
|
||||
|
||||
import pytz
|
||||
from more_itertools import unique_everseen
|
||||
|
||||
from my.core import Paths, Res, datetime_aware, get_files
|
||||
from my.core.sqlite import sqlite_connection
|
||||
|
||||
from .common import TweetId, permalink
|
||||
|
||||
from my.config import twitter as user_config
|
||||
|
||||
|
||||
from ..core import Paths, Res, datetime_aware
|
||||
@dataclass
|
||||
class config(user_config.talon):
|
||||
# paths[s]/glob to the exported sqlite databases
|
||||
export_path: Paths
|
||||
|
||||
|
||||
from ..core import get_files
|
||||
from pathlib import Path
|
||||
def inputs() -> Sequence[Path]:
|
||||
return get_files(config.export_path)
|
||||
|
||||
|
||||
from .common import TweetId, permalink
|
||||
|
||||
|
||||
@dataclass(unsafe_hash=True)
|
||||
class Tweet:
|
||||
id_str: TweetId
|
||||
|
@ -51,8 +52,6 @@ class _IsFavorire:
|
|||
tweet: Tweet
|
||||
|
||||
|
||||
from typing import Union
|
||||
from ..core.dataset import connect_readonly
|
||||
Entity = Union[_IsTweet, _IsFavorire]
|
||||
def _entities() -> Iterator[Res[Entity]]:
|
||||
for f in inputs():
|
||||
|
@ -67,35 +66,36 @@ def _process_one(f: Path) -> Iterator[Res[Entity]]:
|
|||
fname = f.name
|
||||
handler = handlers.get(fname)
|
||||
if handler is None:
|
||||
yield RuntimeError(f"Coulnd't find handler for {fname}")
|
||||
yield RuntimeError(f"Could not find handler for {fname}")
|
||||
return
|
||||
with connect_readonly(f) as db:
|
||||
with sqlite_connection(f, immutable=True, row_factory='row') as db:
|
||||
yield from handler(db)
|
||||
|
||||
|
||||
def _process_user_tweets(db) -> Iterator[Res[Entity]]:
|
||||
def _process_user_tweets(db: sqlite3.Connection) -> Iterator[Res[Entity]]:
|
||||
# dunno why it's called 'lists'
|
||||
for r in db['lists'].all(order_by='time'):
|
||||
for r in db.execute('SELECT * FROM lists ORDER BY time'):
|
||||
try:
|
||||
yield _IsTweet(_parse_tweet(r))
|
||||
except Exception as e:
|
||||
yield e
|
||||
|
||||
|
||||
def _process_favorite_tweets(db) -> Iterator[Res[Entity]]:
|
||||
for r in db['favorite_tweets'].all(order_by='time'):
|
||||
def _process_favorite_tweets(db: sqlite3.Connection) -> Iterator[Res[Entity]]:
|
||||
for r in db.execute('SELECT * FROM favorite_tweets ORDER BY time'):
|
||||
try:
|
||||
yield _IsFavorire(_parse_tweet(r))
|
||||
except Exception as e:
|
||||
yield e
|
||||
|
||||
def _parse_tweet(row) -> Tweet:
|
||||
|
||||
def _parse_tweet(row: sqlite3.Row) -> Tweet:
|
||||
# ok so looks like it's tz aware..
|
||||
# https://github.com/klinker24/talon-for-twitter-android/blob/c3b0612717ba3ea93c0cae6d907d7d86d640069e/app/src/main/java/com/klinker/android/twitter_l/data/sq_lite/FavoriteTweetsDataSource.java#L95
|
||||
# uses https://docs.oracle.com/javase/7/docs/api/java/util/Date.html#getTime()
|
||||
# and it's created here, so looks like it's properly parsed from the api
|
||||
# https://github.com/Twitter4J/Twitter4J/blob/8376fade8d557896bb9319fb46e39a55b134b166/twitter4j-core/src/internal-json/java/twitter4j/ParseUtil.java#L69-L79
|
||||
created_at = datetime.fromtimestamp(row['time'] / 1000, tz=pytz.utc)
|
||||
created_at = datetime.fromtimestamp(row['time'] / 1000, tz=timezone.utc)
|
||||
text = row['text']
|
||||
|
||||
# try explanding URLs.. sadly there are no positions in the db
|
||||
|
@ -132,7 +132,6 @@ def _parse_tweet(row) -> Tweet:
|
|||
)
|
||||
|
||||
|
||||
from more_itertools import unique_everseen
|
||||
def tweets() -> Iterator[Res[Tweet]]:
|
||||
for x in unique_everseen(_entities()):
|
||||
if isinstance(x, Exception):
|
||||
|
@ -140,6 +139,7 @@ def tweets() -> Iterator[Res[Tweet]]:
|
|||
elif isinstance(x, _IsTweet):
|
||||
yield x.tweet
|
||||
|
||||
|
||||
def likes() -> Iterator[Res[Tweet]]:
|
||||
for x in unique_everseen(_entities()):
|
||||
if isinstance(x, Exception):
|
||||
|
|
|
@ -1,12 +1,16 @@
|
|||
"""
|
||||
Twitter data (tweets and favorites). Uses [[https://github.com/twintproject/twint][Twint]] data export.
|
||||
"""
|
||||
|
||||
REQUIRES = ['dataset']
|
||||
|
||||
from ..core.common import Paths
|
||||
from ..core.error import Res
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import NamedTuple, Iterator, List
|
||||
|
||||
|
||||
from my.core import Paths, Res, get_files, LazyLogger, Json, datetime_aware, stat, Stats
|
||||
from my.core.cfg import make_config
|
||||
from my.core.sqlite import sqlite_connection
|
||||
|
||||
from my.config import twint as user_config
|
||||
|
||||
# TODO move to twitter.twint config structure
|
||||
|
@ -17,16 +21,9 @@ class twint(user_config):
|
|||
|
||||
####
|
||||
|
||||
from ..core.cfg import make_config
|
||||
config = make_config(twint)
|
||||
|
||||
|
||||
from datetime import datetime, timezone
|
||||
from typing import NamedTuple, Iterator, List
|
||||
from pathlib import Path
|
||||
|
||||
from ..core.common import get_files, LazyLogger, Json, datetime_aware
|
||||
|
||||
log = LazyLogger(__name__)
|
||||
|
||||
|
||||
|
@ -110,25 +107,19 @@ WHERE {where}
|
|||
ORDER BY T.created_at
|
||||
'''
|
||||
|
||||
def _get_db():
|
||||
from ..core.dataset import connect_readonly
|
||||
db_path = get_db_path()
|
||||
return connect_readonly(db_path)
|
||||
|
||||
|
||||
def tweets() -> Iterator[Res[Tweet]]:
|
||||
db = _get_db()
|
||||
res = db.query(_QUERY.format(where='F.tweet_id IS NULL'))
|
||||
yield from map(Tweet, res)
|
||||
with sqlite_connection(get_db_path(), immutable=True, row_factory='row') as db:
|
||||
res = db.execute(_QUERY.format(where='F.tweet_id IS NULL'))
|
||||
yield from map(Tweet, res)
|
||||
|
||||
|
||||
def likes() -> Iterator[Res[Tweet]]:
|
||||
db = _get_db()
|
||||
res = db.query(_QUERY.format(where='F.tweet_id IS NOT NULL'))
|
||||
yield from map(Tweet, res)
|
||||
with sqlite_connection(get_db_path(), immutable=True, row_factory='row') as db:
|
||||
res = db.execute(_QUERY.format(where='F.tweet_id IS NOT NULL'))
|
||||
yield from map(Tweet, res)
|
||||
|
||||
|
||||
from ..core import stat, Stats
|
||||
def stats() -> Stats:
|
||||
return {
|
||||
**stat(tweets),
|
||||
|
|
|
@ -3,7 +3,8 @@ from datetime import datetime
|
|||
import json
|
||||
from typing import NamedTuple, Iterable, Sequence, Optional
|
||||
|
||||
from my.config import vk as config
|
||||
|
||||
from my.config import vk as config # type: ignore[attr-defined]
|
||||
|
||||
|
||||
class Favorite(NamedTuple):
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue