twitter.archive: deduplicate results via json.dumps

this speeds up processing quite a bit (from 40s to 20s for me), plus removes tons of identical outputs

interestingly enough, using the raw object as the key (without json.dumps) brings unique_everseen to a crawl...
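
For context, a minimal sketch of the trick (the sample data and plain json.dumps here are made up for illustration; the actual code keys on my.core.serialize.dumps): dicts aren't hashable, so unique_everseen without a key falls back to a linear scan over everything seen so far, while a serialized string key lets it use a set.

    import json
    from more_itertools import unique_everseen

    # made-up sample: duplicate entries, as you'd get from overlapping archive exports
    items = [
        {'id': 1, 'full_text': 'hello'},
        {'id': 2, 'full_text': 'world'},
        {'id': 1, 'full_text': 'hello'},  # duplicate
    ]

    # dicts are unhashable, so without a key unique_everseen keeps a list of seen
    # items and compares each new one against all of them -- quadratic on large inputs
    slow = list(unique_everseen(items))

    # serializing each item to a string gives a hashable key, so dedup is a set lookup
    fast = list(unique_everseen(items, key=json.dumps))

    assert slow == fast  # same two unique items either way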
Dima Gerasimov 2023-10-24 01:15:27 +01:00 committed by karlicoss
parent 0e94e0a9ea
commit a5c04e789a


@@ -22,31 +22,48 @@ except ImportError as ie:
 from dataclasses import dataclass
+from datetime import datetime
+from itertools import chain
+import json # hmm interesting enough, orjson didn't give much speedup here?
+from pathlib import Path
 from functools import cached_property
 import html
-from ..core.common import Paths, datetime_aware
-from ..core.error import Res
+from typing import (
+    Iterator,
+    List,
+    Optional,
+    Sequence,
+)
+from more_itertools import unique_everseen
+from my.core import (
+    datetime_aware,
+    get_files,
+    make_logger,
+    stat,
+    Json,
+    Paths,
+    Res,
+    Stats,
+)
+from my.core import warnings
+from my.core.cfg import make_config
+from my.core.serialize import dumps as json_dumps
+from .common import TweetId, permalink
 @dataclass
 class twitter_archive(user_config):
-    export_path: Paths # path[s]/glob to the twitter archive takeout
+    export_path: Paths  # path[s]/glob to the twitter archive takeout
 ###
-from ..core.cfg import make_config
 config = make_config(twitter_archive)
-from datetime import datetime
-from typing import List, Optional, NamedTuple, Sequence, Iterator
-from pathlib import Path
-import json
-from my.core import get_files, make_logger, Json
 logger = make_logger(__name__)
@@ -54,11 +71,9 @@ def inputs() -> Sequence[Path]:
     return get_files(config.export_path)
-from .common import TweetId, permalink
 # TODO make sure it's not used anywhere else and simplify interface
-class Tweet(NamedTuple):
+@dataclass
+class Tweet:
     raw: Json
     screen_name: str
@@ -80,7 +95,7 @@ class Tweet(NamedTuple):
         res: str = self.raw['full_text']
         ## replace shortened URLS
-        repls = [] # from, to, what
+        repls = []  # from, to, what
         for ue in self.entities['urls']:
             [fr, to] = map(int, ue['indices'])
             repls.append((fr, to, ue['expanded_url']))
@@ -94,7 +109,7 @@ class Tweet(NamedTuple):
         parts = []
         idx = 0
         for fr, to, what in repls:
-            parts.append(res[idx: fr])
+            parts.append(res[idx:fr])
             parts.append(what)
             idx = to
         parts.append(res[idx:])
@@ -132,7 +147,8 @@ class Tweet(NamedTuple):
         return self.created_at
-class Like(NamedTuple):
+@dataclass
+class Like:
     raw: Json
     screen_name: str
@@ -165,13 +181,12 @@ class ZipExport:
     def __init__(self, archive_path: Path) -> None:
         self.zpath = archive_path
         if (self.zpath / 'tweets.csv').exists():
-            from ..core.warnings import high
-            high("NOTE: CSV format (pre ~Aug 2018) isn't supported yet, this is likely not going to work.")
-        self.old_format = False # changed somewhere around 2020.03
+            warnings.high("NOTE: CSV format (pre ~Aug 2018) isn't supported yet, this is likely not going to work.")
+        self.old_format = False  # changed somewhere around 2020.03
         if not (self.zpath / 'Your archive.html').exists():
             self.old_format = True
-    def raw(self, what: str, *, fname: Optional[str]=None) -> Iterator[Json]:
+    def raw(self, what: str, *, fname: Optional[str] = None) -> Iterator[Json]:
         logger.info(f'{self.zpath} : processing {what}')
         path = fname or what
@@ -213,16 +228,18 @@ class ZipExport:
 # todo not sure about list and sorting? although can't hurt considering json is not iterative?
 def tweets() -> Iterator[Res[Tweet]]:
-    for inp in inputs():
-        yield from sorted(ZipExport(inp).tweets(), key=lambda t: t.dt)
+    _all = chain.from_iterable(ZipExport(i).tweets() for i in inputs())
+    res = unique_everseen(_all, key=json_dumps)
+    yield from sorted(res, key=lambda t: t.dt)
 def likes() -> Iterator[Res[Like]]:
-    for inp in inputs():
-        yield from ZipExport(inp).likes()
+    _all = chain.from_iterable(ZipExport(i).likes() for i in inputs())
+    res = unique_everseen(_all, key=json_dumps)
+    # ugh. likes don't have datetimes..
+    yield from res
-from ..core import stat, Stats
 def stats() -> Stats:
     return {
         **stat(tweets),