From 711157e0f5c063d52a75c8a566206365bee632bc Mon Sep 17 00:00:00 2001
From: Dima Gerasimov <karlicoss@gmail.com>
Date: Tue, 31 May 2022 12:46:21 +0100
Subject: [PATCH] my.twitter.archive: switch to zippath, add config section,
 better mypy coverage

---
 misc/check-twitter.sh |  8 ++++++++
 my/config.py          |  4 ++++
 my/twitter/archive.py | 40 +++++++++++++++++++++++-----------------
 3 files changed, 35 insertions(+), 17 deletions(-)

diff --git a/misc/check-twitter.sh b/misc/check-twitter.sh
index 1aec2fb..318ff71 100755
--- a/misc/check-twitter.sh
+++ b/misc/check-twitter.sh
@@ -93,5 +93,13 @@ check '2022-02-02 Wed 18:28.*You are in luck!.*https://deepmind.com/blog/article
 # check link which is only in twidump
 check '2013-06-24 Mon 14:13.*RT @gorod095: Нашел недавно в букинист'
 
+# some older statuses, useful to test that all input data is properly detected
+check '2010-04-01 Thu 11:34'
+check '2010-06-28 Mon 23:42'
+
+# https://twitter.com/karlicoss/status/22916704915
+# this one is weird, just disappeared for no reason between 2021-12-22 and 2022-03-15
+# and the account isn't suspended etc. maybe it was temporarily private or something?
+check '2010-09-03 Fri 20:11.*Джобс'
 
 # TODO check likes as well
diff --git a/my/config.py b/my/config.py
index b1c17d2..1a8e49a 100644
--- a/my/config.py
+++ b/my/config.py
@@ -139,6 +139,10 @@ class fbmessenger:
         export_path: Paths
 
 
+class twitter_archive:
+    export_path: Paths
+
+
 class twitter:
     class talon:
         export_path: Paths
diff --git a/my/twitter/archive.py b/my/twitter/archive.py
index 70f55db..0583214 100644
--- a/my/twitter/archive.py
+++ b/my/twitter/archive.py
@@ -4,23 +4,28 @@ Twitter data (uses [[https://help.twitter.com/en/managing-your-account/how-to-do
 
 
 # before this config was named 'twitter', doesn't make too much sense for archive
-# try to import it defensively..
+# todo unify with other code like this, e.g. time.tz.via_location
 try:
     from my.config import twitter_archive as user_config
-except ImportError as e:
+except ImportError as ie:
+    if ie.name != 'twitter_archive':
+        raise ie
     try:
-        from my.config import twitter as user_config
+        from my.config import twitter as user_config # type: ignore[misc]
     except ImportError:
-        raise e # raise the original exception.. must be something else
+        raise ie # raise the original exception.. must be something else
     else:
         from ..core import warnings
         warnings.high('my.config.twitter is deprecated! Please rename it to my.config.twitter_archive in your config')
+##
 
 
 from dataclasses import dataclass
+from functools import lru_cache
 import html
 from ..core.common import Paths, datetime_aware
 from ..core.error import Res
+from ..core.kompress import ZipPath
 
 @dataclass
 class twitter_archive(user_config):
@@ -39,7 +44,6 @@ from pathlib import Path
 import json
 
 from ..core.common import get_files, LazyLogger, Json
-from ..core import kompress
 
 
 
@@ -47,7 +51,7 @@ logger = LazyLogger(__name__, level="warning")
 
 
 def inputs() -> Sequence[Path]:
-    return get_files(config.export_path)[-1:]
+    return get_files(config.export_path)
 
 
 from .common import TweetId, permalink
@@ -73,7 +77,7 @@ class Tweet(NamedTuple):
 
     @property
     def text(self) -> str:
-        res = self.raw['full_text']
+        res: str = self.raw['full_text']
 
         ## replace shortened URLS
         repls = [] # from, to, what
@@ -145,7 +149,7 @@ class Like(NamedTuple):
     def text(self) -> Optional[str]:
         # NOTE: likes basically don't have anything except text and url
         # ugh. I think none means that tweet was deleted?
-        res = self.raw.get('fullText')
+        res: Optional[str] = self.raw.get('fullText')
         if res is None:
             return None
         res = html.unescape(res)
@@ -157,27 +161,27 @@ class Like(NamedTuple):
         return self.id_str
 
 
-from functools import lru_cache
 class ZipExport:
     def __init__(self, archive_path: Path) -> None:
-        # TODO use ZipPath
-        self.epath = archive_path
+        # todo maybe this should be inside get_files instead, perhaps covered with a flag?
+        self.zpath = ZipPath(archive_path)
 
+        if (self.zpath / 'tweets.csv').exists():
+            from ..core.warnings import high
+            high("NOTE: CSV format (pre ~Aug 2018) isn't supported yet, this is likely not going to work.")
         self.old_format = False # changed somewhere around 2020.03
-        if not kompress.kexists(self.epath, 'Your archive.html'):
+        if not (self.zpath / 'Your archive.html').exists():
             self.old_format = True
 
-
-    def raw(self, what: str): # TODO Json in common?
-        logger.info('processing: %s %s', self.epath, what)
+    def raw(self, what: str) -> Iterator[Json]:
+        logger.info('processing: %s %s', self.zpath, what)
 
         path = what
         if not self.old_format:
             path = 'data/' + path
         path += '.js'
 
-        with kompress.kopen(self.epath, path) as fo:
-            ddd = fo.read()
+        ddd = (self.zpath / path).read_text()
         start = ddd.index('[')
         ddd = ddd[start:]
         for j in json.loads(ddd):
@@ -194,6 +198,8 @@ class ZipExport:
         return acc['username']
 
     def tweets(self) -> Iterator[Tweet]:
+        # NOTE: for some reason, created_at doesn't seem to be in order
+        # it mostly is, but there are a bunch of one-off random tweets where the time decreases (typically at the very end)
         for r in self.raw('tweet'):
             yield Tweet(r, screen_name=self.screen_name())