From 14a5a916855b1efae66b25d5441c6c956369fb6d Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sat, 11 Jan 2020 17:01:50 +0000 Subject: [PATCH] add like processing --- my/twitter.py | 56 +++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 41 insertions(+), 15 deletions(-) diff --git a/my/twitter.py b/my/twitter.py index 55230f9..c85ecd1 100755 --- a/my/twitter.py +++ b/my/twitter.py @@ -10,7 +10,7 @@ Expects path to be set from datetime import date, datetime -from typing import Union, List, Dict, Set, Optional, Iterator, Any +from typing import Union, List, Dict, Set, Optional, Iterator, Any, NamedTuple from pathlib import Path import json import zipfile @@ -41,14 +41,18 @@ def _get_export() -> Path: Tid = str + +# TODO a bit messy... perhaps we do need DAL for twitter exports +Json = Dict[str, Any] + + # TODO make sure it's not used anywhere else and simplify interface -class Tweet: - def __init__(self, tw: Dict[str, Any]) -> None: - self.tw = tw +class Tweet(NamedTuple): + raw: Json @property def tid(self) -> Tid: - return self.tw['id_str'] + return self.raw['id_str'] @property def permalink(self) -> str: @@ -56,33 +60,44 @@ class Tweet: @property def dt(self) -> datetime: - dts = self.tw['created_at'] + dts = self.raw['created_at'] return datetime.strptime(dts, '%a %b %d %H:%M:%S %z %Y') @property def text(self) -> str: - return self.tw['full_text'] + return self.raw['full_text'] @property def entities(self): - return self.tw['entities'] + return self.raw['entities'] def __str__(self) -> str: - return str(self.tw) + return str(self.raw) def __repr__(self) -> str: - return repr(self.tw) + return repr(self.raw) + + +class Like(NamedTuple): + raw: Json + + @property + def tid(self) -> Tid: + return self.raw['tweetId'] + + @property + def text(self) -> str: + return self.raw['fullText'] -# TODO a bit messy... perhaps we do need DAL for twitter exports class ZipExport: def __init__(self) -> None: pass - def raw(self): # TODO Json in common? 
+ def raw(self, what: str): # TODO Json in common? epath = _get_export() - logger.info('processing: %s', epath) - ddd = zipfile.ZipFile(epath).read('tweet.js').decode('utf8') + logger.info('processing: %s %s', epath, what) + ddd = zipfile.ZipFile(epath).read(what).decode('utf8') start = ddd.index('[') ddd = ddd[start:] for j in json.loads(ddd): @@ -90,14 +105,25 @@ class ZipExport: def tweets(self) -> Iterator[Tweet]: - for r in self.raw(): + for r in self.raw('tweet.js'): yield Tweet(r) + def likes(self) -> Iterator[Like]: + # TODO ugh. would be nice to unify Tweet/Like interface + # however, takeout only got tweetId, full text and url + for r in self.raw('like.js'): + yield Like(r) + + def tweets_all() -> List[Tweet]: return list(sorted(ZipExport().tweets(), key=lambda t: t.dt)) +def likes_all() -> List[Like]: + return list(ZipExport().likes()) + + def predicate(p) -> List[Tweet]: return [t for t in tweets_all() if p(t)]