handle updated twitter archive

Dima Gerasimov 2020-03-16 23:36:18 +00:00
parent 66790cb9f4
commit 6c5d713a17
2 changed files with 34 additions and 9 deletions

@@ -45,3 +45,12 @@ class CPath(PosixPath):
     open = kopen # TODO FIXME remove?
+
+# meh
+def kexists(path: PathIsh, subpath: str) -> bool:
+    try:
+        kopen(path, subpath)
+        return True
+    except Exception:
+        return False
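
Note: for reference, a rough sketch of how the new kexists helper might be used to probe an archive before deciding how to parse it. This is illustration only: the archive path is made up, and the `from my.kython import kompress` import path is inferred from the relative `from .kython import kompress` import added in the second file.

    from my.kython import kompress  # import path assumed; mirrors the relative import added below

    archive = '/path/to/twitter-archive.zip'  # hypothetical path, for illustration only

    # kexists simply tries kopen and reports whether it succeeded, so it is
    # safe to call on an archive member that may not exist
    if kompress.kexists(archive, 'Your archive.html'):
        print('newer (2020.03+) archive layout')
    else:
        print('older archive layout')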

@@ -21,6 +21,7 @@ import zipfile
 import pytz
 
 from .common import PathIsh, get_files, LazyLogger
+from .kython import kompress
 
 logger = LazyLogger('my.twitter')
@@ -102,29 +103,44 @@ class Like(NamedTuple):
 
 class ZipExport:
     def __init__(self) -> None:
-        pass
+        self.epath = _get_export()
+
+        self.old_format = False # changed somewhere around 2020.03
+        if not kompress.kexists(self.epath, 'Your archive.html'):
+            self.old_format = True
 
     def raw(self, what: str): # TODO Json in common?
-        epath = _get_export()
-        logger.info('processing: %s %s', epath, what)
-        ddd = zipfile.ZipFile(epath).read(what).decode('utf8')
+        logger.info('processing: %s %s', self.epath, what)
+
+        path = what
+        if not self.old_format:
+            path = 'data/' + path
+        path += '.js'
+
+        with kompress.kopen(self.epath, path) as fo:
+            ddd = fo.read().decode('utf8')
         start = ddd.index('[')
         ddd = ddd[start:]
         for j in json.loads(ddd):
-            yield j
+            if set(j.keys()) == {what}:
+                # newer format
+                yield j[what]
+            else:
+                # older format
+                yield j
 
     def tweets(self) -> Iterator[Tweet]:
-        for r in self.raw('tweet.js'):
+        for r in self.raw('tweet'):
             yield Tweet(r)
 
     def likes(self) -> Iterator[Like]:
         # TODO ugh. would be nice to unify Tweet/Like interface
         # however, akeout only got tweetId, full text and url
-        for r in self.raw('like.js'):
-            assert set(r.keys()) == {'like'}
-            yield Like(r['like'])
+        for r in self.raw('like'):
+            yield Like(r)
 
 def tweets() -> List[Tweet]:
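
Note: to make the new behaviour in raw() easier to follow, here is a minimal, self-contained sketch of the same format handling using the standard zipfile module directly (the real code goes through kompress.kopen). The 'Your archive.html' marker, the data/ prefix, the .js suffix and the per-entry {what: ...} wrapping are taken from the diff above; the function name and paths are illustrative.

    import json
    import zipfile
    from typing import Any, Iterator

    def read_entries(archive_path: str, what: str) -> Iterator[Any]:
        # sketch of ZipExport.raw(): 'what' is a section name such as 'tweet' or 'like'
        with zipfile.ZipFile(archive_path) as zf:
            # newer (2020.03+) exports ship 'Your archive.html' and keep data under data/
            old_format = 'Your archive.html' not in zf.namelist()
            member = f'{what}.js' if old_format else f'data/{what}.js'
            ddd = zf.read(member).decode('utf8')
        # the .js payload is a JS assignment; cut everything before the JSON array
        ddd = ddd[ddd.index('['):]
        for j in json.loads(ddd):
            if set(j.keys()) == {what}:
                yield j[what]  # newer format: each entry is wrapped, e.g. {'tweet': {...}}
            else:
                yield j        # older format: bare objects

    # e.g. read_entries('/path/to/twitter-archive.zip', 'tweet') yields raw tweet dicts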