twitter: use import_source and proper merging for tweets from different sources
+ use proper datetime_aware for created_at
This commit is contained in:
parent
afdf9d4334
commit
b9852f45cf
6 changed files with 87 additions and 37 deletions
|
@ -1,3 +1,5 @@
|
|||
from my.core import __NOT_HPI_MODULE__
|
||||
|
||||
from datetime import datetime
|
||||
from typing import Iterator, Optional, TYPE_CHECKING
|
||||
|
||||
|
@ -35,7 +37,9 @@ class Message(Protocol):
|
|||
|
||||
from itertools import chain
|
||||
from more_itertools import unique_everseen
|
||||
from my.core import Res
|
||||
from my.core import warn_if_empty, Res
|
||||
|
||||
@warn_if_empty
|
||||
def _merge_messages(*sources: Iterator[Res[Message]]) -> Iterator[Res[Message]]:
|
||||
# todo might be nice to dump some stats for debugging, e.g. how many were overlapping?
|
||||
def key(r: Res[Message]):
|
||||
|
|
|
@ -1,22 +1,51 @@
|
|||
"""
|
||||
Unified Twitter data (merged from the archive and periodic updates)
|
||||
"""
|
||||
from typing import Iterator
|
||||
from ..core import Res
|
||||
from ..core.source import import_source
|
||||
from .common import merge_tweets, Tweet
|
||||
|
||||
|
||||
# NOTE: you can comment out the sources you don't need
|
||||
from . import twint, archive
|
||||
|
||||
from .common import merge_tweets
|
||||
# Source decorators for the two tweet providers; they wrap the accessor
# functions that import the corresponding module on demand.
# NOTE(review): presumably import_source makes a missing/unconfigured module
# non-fatal for the merged view -- confirm against my.core.source.
src_twint = import_source(module_name='my.twitter.twint')
src_archive = import_source(module_name='my.twitter.archive')
|
||||
|
||||
|
||||
def tweets():
|
||||
@src_twint
def _tweets_twint() -> Iterator[Res[Tweet]]:
    """Return the tweet stream produced by the sibling ``twint`` module.

    The import lives inside the function body, so it is only executed when
    the body of this accessor actually runs.
    """
    from . import twint as twint_source
    return twint_source.tweets()
|
||||
|
||||
@src_archive
def _tweets_archive() -> Iterator[Res[Tweet]]:
    """Return the tweet stream produced by the sibling ``archive`` module.

    The import lives inside the function body, so it is only executed when
    the body of this accessor actually runs.
    """
    from . import archive as archive_source
    return archive_source.tweets()
|
||||
|
||||
|
||||
@src_twint
def _likes_twint() -> Iterator[Res[Tweet]]:
    """Return the likes stream produced by the sibling ``twint`` module.

    The import lives inside the function body, so it is only executed when
    the body of this accessor actually runs.
    """
    from . import twint as twint_source
    return twint_source.likes()
|
||||
|
||||
@src_archive
def _likes_archive() -> Iterator[Res[Tweet]]:
    """Return the likes stream produced by the sibling ``archive`` module.

    The import lives inside the function body, so it is only executed when
    the body of this accessor actually runs.
    """
    from . import archive as archive_source
    return archive_source.likes()
|
||||
|
||||
|
||||
def tweets() -> Iterator[Res[Tweet]]:
    """Yield tweets merged from the twint and archive sources.

    Each source is reached through its ``_tweets_*`` accessor (not through a
    directly imported module), and both streams are handed to
    ``merge_tweets``, which is responsible for combining them.
    """
    yield from merge_tweets(
        _tweets_twint(),
        _tweets_archive(),
    )
|
||||
|
||||
|
||||
def likes() -> Iterator[Res[Tweet]]:
    """Yield likes merged from the twint and archive sources.

    Each source is reached through its ``_likes_*`` accessor (not through a
    directly imported module), and both streams are handed to
    ``merge_tweets``, which is responsible for combining them.
    """
    yield from merge_tweets(
        _likes_twint(),
        _likes_archive(),
    )
|
||||
|
||||
|
||||
# TODO maybe to avoid all the boilerplate above could use some sort of module Protocol?
|
||||
|
|
|
@ -18,7 +18,7 @@ except ImportError as e:
|
|||
|
||||
|
||||
from dataclasses import dataclass
|
||||
from ..core import Paths
|
||||
from ..core import Paths, Res, datetime_aware
|
||||
|
||||
@dataclass
|
||||
class twitter_archive(user_config):
|
||||
|
@ -32,7 +32,7 @@ config = make_config(twitter_archive)
|
|||
|
||||
|
||||
from datetime import datetime
|
||||
from typing import List, Optional, Iterable, NamedTuple, Sequence
|
||||
from typing import List, Optional, NamedTuple, Sequence, Iterator
|
||||
from pathlib import Path
|
||||
import json
|
||||
|
||||
|
@ -61,7 +61,7 @@ class Tweet(NamedTuple):
|
|||
return self.raw['id_str']
|
||||
|
||||
@property
def created_at(self) -> datetime_aware:
    """Creation time parsed from the raw ``created_at`` field.

    The format string includes ``%z``, so the parsed value carries the UTC
    offset present in the raw string (i.e. it is timezone-aware).
    """
    dts = self.raw['created_at']
    # expected shape per the format below: 'Wed Oct 10 20:19:24 +0000 2018'
    return datetime.strptime(dts, '%a %b %d %H:%M:%S %z %Y')
|
||||
|
||||
|
@ -159,12 +159,12 @@ class ZipExport:
|
|||
[acc] = self.raw('account')
|
||||
return acc['username']
|
||||
|
||||
def tweets(self) -> Iterator[Tweet]:
    """Yield a ``Tweet`` for every raw ``'tweet'`` record of this export.

    Each tweet is tagged with the export's screen name.
    """
    for r in self.raw('tweet'):
        yield Tweet(r, screen_name=self.screen_name())
|
||||
|
||||
|
||||
def likes(self) -> Iterable[Like]:
|
||||
def likes(self) -> Iterator[Like]:
|
||||
# TODO ugh. would be nice to unify Tweet/Like interface
|
||||
# however, takeout only got tweetId, full text and url
|
||||
for r in self.raw('like'):
|
||||
|
@ -172,18 +172,18 @@ class ZipExport:
|
|||
|
||||
|
||||
# todo not sure about list and sorting? although can't hurt considering json is not iterative?
|
||||
def tweets() -> Iterator[Res[Tweet]]:
    """Yield tweets from every input, processed one input at a time.

    Tweets of a single input are sorted by their ``dt`` attribute before
    being yielded; no ordering is established across different inputs.
    """
    for inp in inputs():
        # sorted() materialises the tweets of one export before yielding them
        yield from sorted(ZipExport(inp).tweets(), key=lambda t: t.dt)
|
||||
|
||||
|
||||
def likes() -> Iterator[Res[Like]]:
    """Yield likes from every input, in the order each export produces them."""
    for inp in inputs():
        yield from ZipExport(inp).likes()
|
||||
|
||||
|
||||
def stats():
|
||||
from ..core import stat
|
||||
from ..core import stat, Stats
|
||||
def stats() -> Stats:
|
||||
return {
|
||||
**stat(tweets),
|
||||
**stat(likes),
|
||||
|
|
|
@ -1,12 +1,21 @@
|
|||
from my.core import __NOT_HPI_MODULE__
|
||||
|
||||
from itertools import chain
|
||||
from typing import Iterator, Any
|
||||
|
||||
from more_itertools import unique_everseen
|
||||
|
||||
from ..core import warn_if_empty, __NOT_HPI_MODULE__
|
||||
|
||||
# TODO add proper Protocol for Tweet
|
||||
Tweet = Any
|
||||
|
||||
|
||||
from my.core import warn_if_empty, Res
|
||||
@warn_if_empty
def merge_tweets(*sources: Iterator[Res[Tweet]]) -> Iterator[Res[Tweet]]:
    """Chain several tweet streams into one, dropping duplicates.

    Items are either tweets or ``Exception`` instances (errors passed along
    as values). De-duplication is done by ``unique_everseen`` over the
    chained sources:

    - tweets are keyed by ``id_str``;
    - errors are keyed by their string form, since they have no ``id_str``.
    """
    def key(r: Res[Tweet]):
        if isinstance(r, Exception):
            # errors don't carry an id; fall back to their message
            return str(r)
        else:
            return r.id_str

    yield from unique_everseen(chain(*sources), key=key)
|
||||
|
|
|
@ -7,11 +7,12 @@ from dataclasses import dataclass
|
|||
from datetime import datetime
|
||||
from typing import Iterator, Sequence, Optional, Dict
|
||||
|
||||
import pytz
|
||||
|
||||
from my.config import twitter as user_config
|
||||
|
||||
|
||||
from ..core import Paths
|
||||
from ..core import Paths, Res, datetime_aware
|
||||
@dataclass
|
||||
class config(user_config.talon):
|
||||
# path[s]/glob to the exported sqlite databases
|
||||
|
@ -28,8 +29,7 @@ def inputs() -> Sequence[Path]:
|
|||
@dataclass(unsafe_hash=True)
|
||||
class Tweet:
|
||||
id_str: str
|
||||
# TODO figure out if utc
|
||||
created_at: datetime
|
||||
created_at: datetime_aware
|
||||
screen_name: str
|
||||
text: str
|
||||
urls: Sequence[str]
|
||||
|
@ -45,7 +45,6 @@ class _IsFavorire:
|
|||
|
||||
|
||||
from typing import Union
|
||||
from ..core.error import Res
|
||||
from ..core.dataset import connect_readonly
|
||||
Entity = Union[_IsTweet, _IsFavorire]
|
||||
def _entities() -> Iterator[Res[Entity]]:
|
||||
|
@ -86,9 +85,17 @@ def _process_favorite_tweets(db) -> Iterator[Res[Entity]]:
|
|||
def _parse_tweet(row) -> Tweet:
|
||||
# TODO row['retweeter'] if not empty, would be user's name and means retweet?
|
||||
# screen name would be the actual tweet's author
|
||||
|
||||
# ok so looks like it's tz aware..
|
||||
# https://github.com/klinker24/talon-for-twitter-android/blob/c3b0612717ba3ea93c0cae6d907d7d86d640069e/app/src/main/java/com/klinker/android/twitter_l/data/sq_lite/FavoriteTweetsDataSource.java#L95
|
||||
# uses https://docs.oracle.com/javase/7/docs/api/java/util/Date.html#getTime()
|
||||
# and it's created here, so looks like it's properly parsed from the api
|
||||
# https://github.com/Twitter4J/Twitter4J/blob/8376fade8d557896bb9319fb46e39a55b134b166/twitter4j-core/src/internal-json/java/twitter4j/ParseUtil.java#L69-L79
|
||||
created_at = datetime.fromtimestamp(row['time'] / 1000, tz=pytz.utc)
|
||||
|
||||
return Tweet(
|
||||
id_str=str(row['tweet_id']),
|
||||
created_at=datetime.fromtimestamp(row['time'] / 1000),
|
||||
created_at=created_at,
|
||||
screen_name=row['screen_name'],
|
||||
text=row['text'],
|
||||
# todo hmm text sometimes is trimmed with ellipsis? at least urls
|
||||
|
|
|
@ -5,6 +5,7 @@ Twitter data (tweets and favorites). Uses [[https://github.com/twintproject/twin
|
|||
REQUIRES = ['dataset']
|
||||
|
||||
from ..core.common import Paths
|
||||
from ..core.error import Res
|
||||
from dataclasses import dataclass
|
||||
from my.config import twint as user_config
|
||||
|
||||
|
@ -21,10 +22,10 @@ config = make_config(twint)
|
|||
|
||||
|
||||
from datetime import datetime
|
||||
from typing import NamedTuple, Iterable, List
|
||||
from typing import NamedTuple, Iterator, List
|
||||
from pathlib import Path
|
||||
|
||||
from ..core.common import get_files, LazyLogger, Json
|
||||
from ..core.common import get_files, LazyLogger, Json, datetime_aware
|
||||
from ..core.time import abbr_to_timezone
|
||||
|
||||
log = LazyLogger(__name__)
|
||||
|
@ -42,7 +43,7 @@ class Tweet(NamedTuple):
|
|||
return self.row['id_str']
|
||||
|
||||
@property
|
||||
def created_at(self) -> datetime:
|
||||
def created_at(self) -> datetime_aware:
|
||||
seconds = self.row['created_at'] / 1000
|
||||
tz_abbr = self.row['timezone']
|
||||
tz = abbr_to_timezone(tz_abbr)
|
||||
|
@ -97,20 +98,20 @@ def _get_db():
|
|||
return connect_readonly(db_path)
|
||||
|
||||
|
||||
def tweets() -> Iterator[Res[Tweet]]:
    """Yield a ``Tweet`` for each row of the query filtered by ``F.tweet_id IS NULL``.

    NOTE(review): presumably ``F`` is the favorites side of the join in
    ``_QUERY``, so this selects non-favorited tweets -- confirm against ``_QUERY``.
    """
    db = _get_db()
    res = db.query(_QUERY.format(where='F.tweet_id IS NULL'))
    yield from map(Tweet, res)
|
||||
|
||||
|
||||
def likes() -> Iterator[Res[Tweet]]:
    """Yield a ``Tweet`` for each row of the query filtered by ``F.tweet_id IS NOT NULL``.

    NOTE(review): presumably ``F`` is the favorites side of the join in
    ``_QUERY``, so this selects favorited tweets -- confirm against ``_QUERY``.
    """
    db = _get_db()
    res = db.query(_QUERY.format(where='F.tweet_id IS NOT NULL'))
    yield from map(Tweet, res)
|
||||
|
||||
|
||||
def stats():
|
||||
from ..core import stat
|
||||
from ..core import stat, Stats
|
||||
def stats() -> Stats:
|
||||
return {
|
||||
**stat(tweets),
|
||||
**stat(likes),
|
||||
|
|
Loading…
Add table
Reference in a new issue