diff --git a/my/fbmessenger/common.py b/my/fbmessenger/common.py
index 0f3ec1b..5f8bd85 100644
--- a/my/fbmessenger/common.py
+++ b/my/fbmessenger/common.py
@@ -1,3 +1,5 @@
+from my.core import __NOT_HPI_MODULE__
+
 from datetime import datetime
 from typing import Iterator, Optional, TYPE_CHECKING
 
@@ -35,7 +37,9 @@ class Message(Protocol):
 from itertools import chain
 from more_itertools import unique_everseen
 
-from my.core import Res
+from my.core import warn_if_empty, Res
+
+@warn_if_empty
 def _merge_messages(*sources: Iterator[Res[Message]]) -> Iterator[Res[Message]]:
     # todo might be nice to dump some stats for debugging, e.g. how many were overlapping?
     def key(r: Res[Message]):
diff --git a/my/twitter/all.py b/my/twitter/all.py
index 0899454..efdc991 100644
--- a/my/twitter/all.py
+++ b/my/twitter/all.py
@@ -1,22 +1,51 @@
 """
 Unified Twitter data (merged from the archive and periodic updates)
 """
 
+from typing import Iterator
+from ..core import Res
+from ..core.source import import_source
+from .common import merge_tweets, Tweet
+
 # NOTE: you can comment out the sources you don't need
-from . import twint, archive
-
-from .common import merge_tweets
+src_twint = import_source(module_name=f'my.twitter.twint')
+src_archive = import_source(module_name=f'my.twitter.archive')
 
 
-def tweets():
+@src_twint
+def _tweets_twint() -> Iterator[Res[Tweet]]:
+    from . import twint as src
+    return src.tweets()
+
+@src_archive
+def _tweets_archive() -> Iterator[Res[Tweet]]:
+    from . import archive as src
+    return src.tweets()
+
+
+@src_twint
+def _likes_twint() -> Iterator[Res[Tweet]]:
+    from . import twint as src
+    return src.likes()
+
+@src_archive
+def _likes_archive() -> Iterator[Res[Tweet]]:
+    from . import archive as src
+    return src.likes()
+
+
+def tweets() -> Iterator[Res[Tweet]]:
     yield from merge_tweets(
-        twint  .tweets(),
-        archive.tweets(),
+        _tweets_twint(),
+        _tweets_archive(),
     )
 
 
-def likes():
+def likes() -> Iterator[Res[Tweet]]:
     yield from merge_tweets(
-        twint  .likes(),
-        archive.likes(),
+        _likes_twint(),
+        _likes_archive(),
     )
+
+
+# TODO maybe to avoid all the boilerplate above could use some sort of module Protocol?
diff --git a/my/twitter/archive.py b/my/twitter/archive.py
index 0bb3151..f2434d5 100644
--- a/my/twitter/archive.py
+++ b/my/twitter/archive.py
@@ -18,7 +18,7 @@ except ImportError as e:
 
 
 from dataclasses import dataclass
-from ..core import Paths
+from ..core import Paths, Res, datetime_aware
 
 @dataclass
 class twitter_archive(user_config):
@@ -32,7 +32,7 @@ config = make_config(twitter_archive)
 
 
 from datetime import datetime
-from typing import List, Optional, Iterable, NamedTuple, Sequence
+from typing import List, Optional, NamedTuple, Sequence, Iterator
 from pathlib import Path
 import json
 
@@ -61,7 +61,7 @@ class Tweet(NamedTuple):
         return self.raw['id_str']
 
     @property
-    def created_at(self) -> datetime:
+    def created_at(self) -> datetime_aware:
         dts = self.raw['created_at']
         return datetime.strptime(dts, '%a %b %d %H:%M:%S %z %Y')
 
@@ -159,12 +159,12 @@ class ZipExport:
         [acc] = self.raw('account')
         return acc['username']
 
-    def tweets(self) -> Iterable[Tweet]:
+    def tweets(self) -> Iterator[Tweet]:
         for r in self.raw('tweet'):
             yield Tweet(r, screen_name=self.screen_name())
 
 
-    def likes(self) -> Iterable[Like]:
+    def likes(self) -> Iterator[Like]:
         # TODO ugh. would be nice to unify Tweet/Like interface
         # however, akeout only got tweetId, full text and url
         for r in self.raw('like'):
@@ -172,18 +172,18 @@ class ZipExport:
 
 
 # todo not sure about list and sorting? although can't hurt considering json is not iterative?
-def tweets() -> Iterable[Tweet]:
+def tweets() -> Iterator[Res[Tweet]]:
     for inp in inputs():
         yield from sorted(ZipExport(inp).tweets(), key=lambda t: t.dt)
 
 
-def likes() -> Iterable[Like]:
+def likes() -> Iterator[Res[Like]]:
     for inp in inputs():
         yield from ZipExport(inp).likes()
 
 
-def stats():
-    from ..core import stat
+from ..core import stat, Stats
+def stats() -> Stats:
     return {
         **stat(tweets),
         **stat(likes),
diff --git a/my/twitter/common.py b/my/twitter/common.py
index 4feb544..5fd7daa 100644
--- a/my/twitter/common.py
+++ b/my/twitter/common.py
@@ -1,12 +1,21 @@
+from my.core import __NOT_HPI_MODULE__
+
 from itertools import chain
+from typing import Iterator, Any
 
 from more_itertools import unique_everseen
 
-from ..core import warn_if_empty, __NOT_HPI_MODULE__
+# TODO add proper Protocol for Tweet
+Tweet = Any
+
+
+from my.core import warn_if_empty, Res
 
 @warn_if_empty
-def merge_tweets(*sources):
-    yield from unique_everseen(
-        chain(*sources),
-        key=lambda t: t.id_str,
-    )
+def merge_tweets(*sources: Iterator[Res[Tweet]]) -> Iterator[Res[Tweet]]:
+    def key(r: Res[Tweet]):
+        if isinstance(r, Exception):
+            return str(r)
+        else:
+            return r.id_str
+    yield from unique_everseen(chain(*sources), key=key)
diff --git a/my/twitter/talon.py b/my/twitter/talon.py
index a369f69..4b42b1f 100644
--- a/my/twitter/talon.py
+++ b/my/twitter/talon.py
@@ -7,11 +7,12 @@ from dataclasses import dataclass
 from datetime import datetime
 from typing import Iterator, Sequence, Optional, Dict
 
+import pytz
 
 from my.config import twitter as user_config
 
-from ..core import Paths
+from ..core import Paths, Res, datetime_aware
 
 @dataclass
 class config(user_config.talon):
     # paths[s]/glob to the exported sqlite databases
@@ -28,8 +29,7 @@ def inputs() -> Sequence[Path]:
 @dataclass(unsafe_hash=True)
 class Tweet:
     id_str: str
-    # TODO figure out if utc
-    created_at: datetime
+    created_at: datetime_aware
     screen_name: str
     text: str
     urls: Sequence[str]
@@ -45,7 +45,6 @@ class _IsFavorire:
 
 
 from typing import Union
-from ..core.error import Res
 from ..core.dataset import connect_readonly
 Entity = Union[_IsTweet, _IsFavorire]
 def _entities() -> Iterator[Res[Entity]]:
@@ -86,9 +85,17 @@ def _process_favorite_tweets(db) -> Iterator[Res[Entity]]:
 def _parse_tweet(row) -> Tweet:
     # TODO row['retweeter] if not empty, would be user's name and means retweet?
     # screen name would be the actual tweet's author
+
+    # ok so looks like it's tz aware..
+    # https://github.com/klinker24/talon-for-twitter-android/blob/c3b0612717ba3ea93c0cae6d907d7d86d640069e/app/src/main/java/com/klinker/android/twitter_l/data/sq_lite/FavoriteTweetsDataSource.java#L95
+    # uses https://docs.oracle.com/javase/7/docs/api/java/util/Date.html#getTime()
+    # and it's created here, so looks like it's properly parsed from the api
+    # https://github.com/Twitter4J/Twitter4J/blob/8376fade8d557896bb9319fb46e39a55b134b166/twitter4j-core/src/internal-json/java/twitter4j/ParseUtil.java#L69-L79
+    created_at = datetime.fromtimestamp(row['time'] / 1000, tz=pytz.utc)
+
     return Tweet(
         id_str=str(row['tweet_id']),
-        created_at=datetime.fromtimestamp(row['time'] / 1000),
+        created_at=created_at,
         screen_name=row['screen_name'],
         text=row['text'],
         # todo hmm text sometimes is trimmed with ellipsis? at least urls
diff --git a/my/twitter/twint.py b/my/twitter/twint.py
index c8d426e..ee84ea1 100644
--- a/my/twitter/twint.py
+++ b/my/twitter/twint.py
@@ -5,6 +5,7 @@ Twitter data (tweets and favorites).
 Uses [[https://github.com/twintproject/twint][Twint]] data export.
 """
 REQUIRES = ['dataset']
 
 from ..core.common import Paths
+from ..core.error import Res
 from dataclasses import dataclass
 from my.config import twint as user_config
@@ -21,10 +22,10 @@ config = make_config(twint)
 
 
 from datetime import datetime
-from typing import NamedTuple, Iterable, List
+from typing import NamedTuple, Iterator, List
 from pathlib import Path
 
-from ..core.common import get_files, LazyLogger, Json
+from ..core.common import get_files, LazyLogger, Json, datetime_aware
 from ..core.time import abbr_to_timezone
 
 log = LazyLogger(__name__)
@@ -42,7 +43,7 @@ class Tweet(NamedTuple):
         return self.row['id_str']
 
     @property
-    def created_at(self) -> datetime:
+    def created_at(self) -> datetime_aware:
         seconds = self.row['created_at'] / 1000
         tz_abbr = self.row['timezone']
         tz = abbr_to_timezone(tz_abbr)
@@ -97,20 +98,20 @@ def _get_db():
     return connect_readonly(db_path)
 
 
-def tweets() -> Iterable[Tweet]:
+def tweets() -> Iterator[Res[Tweet]]:
     db = _get_db()
     res = db.query(_QUERY.format(where='F.tweet_id IS NULL'))
     yield from map(Tweet, res)
 
 
-def likes() -> Iterable[Tweet]:
+def likes() -> Iterator[Res[Tweet]]:
     db = _get_db()
     res = db.query(_QUERY.format(where='F.tweet_id IS NOT NULL'))
     yield from map(Tweet, res)
 
 
-def stats():
-    from ..core import stat
+from ..core import stat, Stats
+def stats() -> Stats:
     return {
         **stat(tweets),
         **stat(likes),
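The Res-aware key function introduced in my/twitter/common.py is what lets merge_tweets deduplicate tweets across sources without dropping errors. A minimal standalone sketch of the same idea, assuming plain dicts stand in for tweet objects (the real helper keys on r.id_str and uses my.core's Res type, which are not reproduced here):

    from itertools import chain
    from typing import Iterator, Union

    from more_itertools import unique_everseen

    Res = Union[dict, Exception]  # stand-in for my.core.Res[Tweet]

    def merge(*sources: Iterator[Res]) -> Iterator[Res]:
        def key(r: Res):
            # exceptions have no id_str; keying them on their message keeps them
            # in the merged stream instead of crashing the dedup
            return str(r) if isinstance(r, Exception) else r['id_str']
        yield from unique_everseen(chain(*sources), key=key)

    twint_src   = iter([{'id_str': '1'}, {'id_str': '2'}])
    archive_src = iter([{'id_str': '2'}, RuntimeError('corrupt export')])
    # the duplicate '2' is collapsed, the error is passed through for the caller to handle
    print(list(merge(twint_src, archive_src)))

Since unique_everseen is lazy, the merge stays streaming even for large exports.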
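With tweets() and likes() now returning Iterator[Res[Tweet]] all the way up through my.twitter.all, downstream code is expected to check for exceptions inline rather than assuming a clean stream. A hypothetical consumer, assuming HPI is configured with at least one of the twint/archive sources and that both source Tweet types expose created_at and text:

    from my.twitter.all import tweets

    for t in tweets():
        if isinstance(t, Exception):
            # errors from either source arrive inline instead of aborting the merge
            print('error while reading tweets:', t)
            continue
        print(t.created_at, t.text)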