move twitter stuff to twitter subdir

Dima Gerasimov 2020-04-14 21:31:40 +01:00
parent 6f8c2e2f24
commit 56b6ab9aaf
2 changed files with 0 additions and 0 deletions

193
my/twitter/archive.py Executable file

@@ -0,0 +1,193 @@
"""
Twitter data (uses official twitter archive export)
See https://help.twitter.com/en/managing-your-account/how-to-download-your-twitter-archive
"""
from . import init
from datetime import date, datetime
from typing import Union, List, Dict, Set, Optional, Iterator, Any, NamedTuple
from pathlib import Path
import json
import zipfile
import pytz
from .common import PathIsh, get_files, LazyLogger, Json
from .kython import kompress
logger = LazyLogger(__name__)
def _get_export() -> Path:
from my.config import twitter as config
return max(get_files(config.export_path, '*.zip'))
Tid = str
# TODO make sure it's not used anywhere else and simplify interface
class Tweet(NamedTuple):
raw: Json
# TODO deprecate tid?
@property
def tid(self) -> Tid:
return self.raw['id_str']
@property
def permalink(self) -> str:
return f'https://twitter.com/i/web/status/{self.tid}'
# TODO deprecate dt?
@property
def dt(self) -> datetime:
dts = self.raw['created_at']
return datetime.strptime(dts, '%a %b %d %H:%M:%S %z %Y')
@property
def text(self) -> str:
return self.raw['full_text']
# TODO not sure if I need them...
@property
def entities(self):
return self.raw['entities']
def __str__(self) -> str:
return str(self.raw)
def __repr__(self) -> str:
return repr(self.raw)
class Like(NamedTuple):
raw: Json
# TODO need to make permalink/link/url consistent across my stuff..
@property
def permalink(self) -> str:
# doesn'tseem like link it export is more specific...
return f'https://twitter.com/i/web/status/{self.tid}'
@property
def tid(self) -> Tid:
return self.raw['tweetId']
@property
def text(self) -> Optional[str]:
# ugh. I think none means that tweet was deleted?
return self.raw.get('fullText')
class ZipExport:
def __init__(self) -> None:
self.epath = _get_export()
self.old_format = False # changed somewhere around 2020.03
if not kompress.kexists(self.epath, 'Your archive.html'):
self.old_format = True
def raw(self, what: str): # TODO Json in common?
logger.info('processing: %s %s', self.epath, what)
path = what
if not self.old_format:
path = 'data/' + path
path += '.js'
with kompress.kopen(self.epath, path) as fo:
ddd = fo.read().decode('utf8')
start = ddd.index('[')
ddd = ddd[start:]
for j in json.loads(ddd):
if set(j.keys()) == {what}:
# newer format
yield j[what]
else:
# older format
yield j
def tweets(self) -> Iterator[Tweet]:
for r in self.raw('tweet'):
yield Tweet(r)
def likes(self) -> Iterator[Like]:
# TODO ugh. would be nice to unify Tweet/Like interface
# however, akeout only got tweetId, full text and url
for r in self.raw('like'):
yield Like(r)
def tweets() -> List[Tweet]:
return list(sorted(ZipExport().tweets(), key=lambda t: t.dt))
def likes() -> List[Like]:
return list(ZipExport().likes())
def predicate(p) -> List[Tweet]:
return [t for t in tweets() if p(t)]
def predicate_date(p) -> List[Tweet]: # TODO rename to by_date?
return predicate(lambda t: p(t.dt.date()))
# TODO move these to private tests?
Datish = Union[date, str]
def tweets_on(*dts: Datish) -> List[Tweet]:
from kython import parse_date_new
# TODO how to make sure we don't miss on 29 feb?
dates = {parse_date_new(d) for d in dts}
return predicate_date(lambda d: d in dates)
on = tweets_on
def test_tweet():
raw = """
{
"retweeted" : false,
"entities" : {
"hashtags" : [ ],
"symbols" : [ ],
"user_mentions" : [ ],
"urls" : [ {
"url" : "https://t.co/vUg4W6nxwU",
"expanded_url" : "https://intelligence.org/2013/12/13/aaronson/",
"display_url" : "intelligence.org/2013/12/13/aar…",
"indices" : [ "120", "143" ]
}
]
},
"display_text_range" : [ "0", "90" ],
"favorite_count" : "0",
"in_reply_to_status_id_str" : "24123424",
"id_str" : "2328934829084",
"in_reply_to_user_id" : "23423424",
"truncated" : false,
"retweet_count" : "0",
"id" : "23492349032940",
"in_reply_to_status_id" : "23482984932084",
"created_at" : "Thu Aug 30 07:12:48 +0000 2012",
"favorited" : false,
"full_text" : "this is a test tweet",
"lang" : "ru",
"in_reply_to_screen_name" : "whatever",
"in_reply_to_user_id_str" : "3748274"
}
"""
t = Tweet(json.loads(raw))
assert t.permalink is not None
assert t.dt == datetime(year=2012, month=8, day=30, hour=7, minute=12, second=48, tzinfo=pytz.utc)
assert t.text == 'this is a test tweet'
assert t.tid == '2328934829084'
assert t.entities is not None
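
A minimal usage sketch for this module (not part of the commit), assuming my.config.twitter.export_path points at the downloaded *.zip archive and that the module is importable from its new my.twitter.archive location:

# usage sketch, assumes my.config.twitter.export_path is configured
from my.twitter.archive import tweets, likes, tweets_on

ts = tweets()                     # all tweets, sorted by date
print(len(ts), ts[-1].permalink, ts[-1].text)

for like in likes():              # liked tweets from the export
    print(like.permalink, like.text)

# tweets posted on a given date; the string is parsed by kython.parse_date_new
print(tweets_on('2012-08-30'))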

65
my/twitter/twint.py Normal file

@@ -0,0 +1,65 @@
"""
Twitter data (tweets and favorites). Uses [[https://github.com/twintproject/twint][Twint]] data export.
"""
from datetime import datetime
from typing import NamedTuple, Iterable
from pathlib import Path
from .common import PathIsh, get_files, LazyLogger, Json
from .core.time import abbr_to_timezone
from my.config import twint as config
log = LazyLogger(__name__)
def get_db_path() -> Path:
# TODO don't like the hardcoded extension. maybe, config should decide?
# or, glob only applies to directories?
return max(get_files(config.export_path, glob='*.db'))
class Tweet(NamedTuple):
row: Json
@property
def id_str(self) -> str:
return self.row['id_str']
@property
def created_at(self) -> datetime:
seconds = self.row['created_at'] / 1000
tz_abbr = self.row['timezone']
tz = abbr_to_timezone(tz_abbr)
dt = datetime.fromtimestamp(seconds, tz=tz)
return dt
# TODO permalink -- take user into account?
@property
def screen_name(self) -> str:
return self.row['screen_name']
@property
def text(self) -> str:
return self.row['tweet']
@property
def permalink(self) -> str:
return f'https://twitter.com/{self.screen_name}/status/{self.id_str}'
# TODO urls
def __repr__(self):
return f'Tweet(id_str={self.id_str}, created_at={self.created_at}, text={self.text})'
def tweets() -> Iterable[Tweet]:
import dataset # type: ignore
db_path = get_db_path()
# TODO check that exists?
db = dataset.connect(f'sqlite:///{db_path}')
tdb = db.load_table('tweets')
yield from map(Tweet, tdb.all(order_by='created_at'))
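
Similarly, a minimal usage sketch for the twint module (not part of the commit), assuming my.config.twint.export_path points at the sqlite *.db file produced by twint and that the relative imports resolve from the new my.twitter.twint location:

# usage sketch, assumes my.config.twint.export_path is configured
from my.twitter.twint import tweets

for t in tweets():                # yielded in created_at order
    print(t.created_at, t.permalink)
    print(t.text)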