add like processing

This commit is contained in:
Dima Gerasimov 2020-01-11 17:01:50 +00:00
parent a1f65754f9
commit 14a5a91685

View file

@ -10,7 +10,7 @@ Expects path to be set
from datetime import date, datetime from datetime import date, datetime
from typing import Union, List, Dict, Set, Optional, Iterator, Any from typing import Union, List, Dict, Set, Optional, Iterator, Any, NamedTuple
from pathlib import Path from pathlib import Path
import json import json
import zipfile import zipfile
@ -41,14 +41,18 @@ def _get_export() -> Path:
Tid = str Tid = str
# TODO a bit messy... perhaps we do need DAL for twitter exports
Json = Dict[str, Any]
# TODO make sure it's not used anywhere else and simplify interface # TODO make sure it's not used anywhere else and simplify interface
class Tweet: class Tweet(NamedTuple):
def __init__(self, tw: Dict[str, Any]) -> None: raw: Json
self.tw = tw
@property @property
def tid(self) -> Tid: def tid(self) -> Tid:
return self.tw['id_str'] return self.raw['id_str']
@property @property
def permalink(self) -> str: def permalink(self) -> str:
@ -56,33 +60,44 @@ class Tweet:
@property @property
def dt(self) -> datetime: def dt(self) -> datetime:
dts = self.tw['created_at'] dts = self.raw['created_at']
return datetime.strptime(dts, '%a %b %d %H:%M:%S %z %Y') return datetime.strptime(dts, '%a %b %d %H:%M:%S %z %Y')
@property @property
def text(self) -> str: def text(self) -> str:
return self.tw['full_text'] return self.raw['full_text']
@property @property
def entities(self): def entities(self):
return self.tw['entities'] return self.raw['entities']
def __str__(self) -> str: def __str__(self) -> str:
return str(self.tw) return str(self.raw)
def __repr__(self) -> str: def __repr__(self) -> str:
return repr(self.tw) return repr(self.raw)
class Like(NamedTuple):
    """A liked tweet, exposed as a thin read-only view over its raw JSON record."""

    # Record as parsed from the export; the properties below read its
    # 'tweetId' and 'fullText' keys.
    raw: Json

    @property
    def tid(self) -> Tid:
        """Id of the liked tweet, taken from the record's ``tweetId`` key."""
        record = self.raw
        return record['tweetId']

    @property
    def text(self) -> str:
        """Text of the liked tweet, taken from the record's ``fullText`` key."""
        record = self.raw
        return record['fullText']
# TODO a bit messy... perhaps we do need DAL for twitter exports
class ZipExport: class ZipExport:
def __init__(self) -> None:
    """Create the exporter; nothing is stored on the instance."""
def raw(self): # TODO Json in common? def raw(self, what: str): # TODO Json in common?
epath = _get_export() epath = _get_export()
logger.info('processing: %s', epath) logger.info('processing: %s %s', epath, what)
ddd = zipfile.ZipFile(epath).read('tweet.js').decode('utf8') ddd = zipfile.ZipFile(epath).read(what).decode('utf8')
start = ddd.index('[') start = ddd.index('[')
ddd = ddd[start:] ddd = ddd[start:]
for j in json.loads(ddd): for j in json.loads(ddd):
@ -90,14 +105,25 @@ class ZipExport:
def tweets(self) -> Iterator[Tweet]:
    """Lazily yield a ``Tweet`` for each raw record read from ``tweet.js``."""
    yield from map(Tweet, self.raw('tweet.js'))
def likes(self) -> Iterator[Like]:
    """Lazily yield a ``Like`` for each raw record read from ``like.js``."""
    # TODO ugh. would be nice to unify Tweet/Like interface
    # however, takeout only got tweetId, full text and url
    yield from (Like(entry) for entry in self.raw('like.js'))
def tweets_all() -> List[Tweet]:
    """Return every tweet from the export, ordered by ``Tweet.dt`` ascending.

    Returns:
        A new list of ``Tweet`` objects produced by ``ZipExport().tweets()``.
    """
    # sorted() already builds and returns a fresh list, so no extra list() copy is needed.
    return sorted(ZipExport().tweets(), key=lambda t: t.dt)
def likes_all() -> List[Like]:
    """Collect everything yielded by ``ZipExport().likes()`` into a list.

    The order is whatever the export iterator produces; no sorting is applied.
    """
    export = ZipExport()
    return [*export.likes()]
def predicate(p) -> List[Tweet]:
    """Return the tweets from ``tweets_all()`` for which ``p(tweet)`` is truthy.

    Args:
        p: callable taking a single ``Tweet``; its result is used as a boolean.
    """
    matching: List[Tweet] = []
    for tweet in tweets_all():
        if p(tweet):
            matching.append(tweet)
    return matching