twitter.archive: deduplicate results via json.dumps

This speeds up processing quite a bit — from 40s to 20s for me — plus removes tons of identical outputs.

Interestingly enough, using the raw object (without json.dumps) as the key brings unique_everseen to a crawl...
This commit is contained in:
Dima Gerasimov 2023-10-24 01:15:27 +01:00 committed by karlicoss
parent 0e94e0a9ea
commit a5c04e789a

View file

@ -22,31 +22,48 @@ except ImportError as ie:
from dataclasses import dataclass from dataclasses import dataclass
from datetime import datetime
from itertools import chain
import json # hmm interesting enough, orjson didn't give much speedup here?
from pathlib import Path
from functools import cached_property from functools import cached_property
import html import html
from ..core.common import Paths, datetime_aware from typing import (
from ..core.error import Res Iterator,
List,
Optional,
Sequence,
)
from more_itertools import unique_everseen
from my.core import (
datetime_aware,
get_files,
make_logger,
stat,
Json,
Paths,
Res,
Stats,
)
from my.core import warnings
from my.core.cfg import make_config
from my.core.serialize import dumps as json_dumps
from .common import TweetId, permalink
@dataclass @dataclass
class twitter_archive(user_config): class twitter_archive(user_config):
export_path: Paths # path[s]/glob to the twitter archive takeout export_path: Paths # path[s]/glob to the twitter archive takeout
### ###
from ..core.cfg import make_config
config = make_config(twitter_archive) config = make_config(twitter_archive)
from datetime import datetime
from typing import List, Optional, NamedTuple, Sequence, Iterator
from pathlib import Path
import json
from my.core import get_files, make_logger, Json
logger = make_logger(__name__) logger = make_logger(__name__)
@ -54,11 +71,9 @@ def inputs() -> Sequence[Path]:
return get_files(config.export_path) return get_files(config.export_path)
from .common import TweetId, permalink
# TODO make sure it's not used anywhere else and simplify interface # TODO make sure it's not used anywhere else and simplify interface
class Tweet(NamedTuple): @dataclass
class Tweet:
raw: Json raw: Json
screen_name: str screen_name: str
@ -80,7 +95,7 @@ class Tweet(NamedTuple):
res: str = self.raw['full_text'] res: str = self.raw['full_text']
## replace shortened URLS ## replace shortened URLS
repls = [] # from, to, what repls = [] # from, to, what
for ue in self.entities['urls']: for ue in self.entities['urls']:
[fr, to] = map(int, ue['indices']) [fr, to] = map(int, ue['indices'])
repls.append((fr, to, ue['expanded_url'])) repls.append((fr, to, ue['expanded_url']))
@ -94,7 +109,7 @@ class Tweet(NamedTuple):
parts = [] parts = []
idx = 0 idx = 0
for fr, to, what in repls: for fr, to, what in repls:
parts.append(res[idx: fr]) parts.append(res[idx:fr])
parts.append(what) parts.append(what)
idx = to idx = to
parts.append(res[idx:]) parts.append(res[idx:])
@ -132,7 +147,8 @@ class Tweet(NamedTuple):
return self.created_at return self.created_at
class Like(NamedTuple): @dataclass
class Like:
raw: Json raw: Json
screen_name: str screen_name: str
@ -165,13 +181,12 @@ class ZipExport:
def __init__(self, archive_path: Path) -> None: def __init__(self, archive_path: Path) -> None:
self.zpath = archive_path self.zpath = archive_path
if (self.zpath / 'tweets.csv').exists(): if (self.zpath / 'tweets.csv').exists():
from ..core.warnings import high warnings.high("NOTE: CSV format (pre ~Aug 2018) isn't supported yet, this is likely not going to work.")
high("NOTE: CSV format (pre ~Aug 2018) isn't supported yet, this is likely not going to work.") self.old_format = False # changed somewhere around 2020.03
self.old_format = False # changed somewhere around 2020.03
if not (self.zpath / 'Your archive.html').exists(): if not (self.zpath / 'Your archive.html').exists():
self.old_format = True self.old_format = True
def raw(self, what: str, *, fname: Optional[str]=None) -> Iterator[Json]: def raw(self, what: str, *, fname: Optional[str] = None) -> Iterator[Json]:
logger.info(f'{self.zpath} : processing {what}') logger.info(f'{self.zpath} : processing {what}')
path = fname or what path = fname or what
@ -213,16 +228,18 @@ class ZipExport:
# todo not sure about list and sorting? although can't hurt considering json is not iterative? # todo not sure about list and sorting? although can't hurt considering json is not iterative?
def tweets() -> Iterator[Res[Tweet]]: def tweets() -> Iterator[Res[Tweet]]:
for inp in inputs(): _all = chain.from_iterable(ZipExport(i).tweets() for i in inputs())
yield from sorted(ZipExport(inp).tweets(), key=lambda t: t.dt) res = unique_everseen(_all, key=json_dumps)
yield from sorted(res, key=lambda t: t.dt)
def likes() -> Iterator[Res[Like]]: def likes() -> Iterator[Res[Like]]:
for inp in inputs(): _all = chain.from_iterable(ZipExport(i).likes() for i in inputs())
yield from ZipExport(inp).likes() res = unique_everseen(_all, key=json_dumps)
# ugh. likes don't have datetimes..
yield from res
from ..core import stat, Stats
def stats() -> Stats: def stats() -> Stats:
return { return {
**stat(tweets), **stat(tweets),