my.twitter.archive: cleanup linting and use proper configuration via abstract class

This commit is contained in:
Dima Gerasimov 2024-09-22 02:03:28 +01:00 committed by karlicoss
parent 239e6617fe
commit 02dabe9f2b

View file

@ -2,73 +2,75 @@
Twitter data (uses [[https://help.twitter.com/en/managing-your-account/how-to-download-your-twitter-archive][official twitter archive export]]) Twitter data (uses [[https://help.twitter.com/en/managing-your-account/how-to-download-your-twitter-archive][official twitter archive export]])
""" """
from __future__ import annotations
# before this config was named 'twitter', doesn't make too much sense for archive import html
# todo unify with other code like this, e.g. time.tz.via_location import json # hmm interesting enough, orjson didn't give much speedup here?
try: from abc import abstractmethod
from my.config import twitter_archive as user_config
except ImportError as ie:
if not (ie.name == 'my.config' and 'twitter_archive' in str(ie)):
# must be caused by something else
raise ie
try:
from my.config import twitter as user_config # type: ignore[assignment]
except ImportError:
raise ie # raise the original exception.. must be something else # noqa: B904
else:
from my.core import warnings
warnings.high('my.config.twitter is deprecated! Please rename it to my.config.twitter_archive in your config')
##
from dataclasses import dataclass from dataclasses import dataclass
from datetime import datetime from datetime import datetime
from itertools import chain
import json # hmm interesting enough, orjson didn't give much speedup here?
from pathlib import Path
from functools import cached_property from functools import cached_property
import html from itertools import chain
from pathlib import Path
from typing import ( from typing import (
TYPE_CHECKING,
Iterator, Iterator,
List,
Optional,
Sequence, Sequence,
) )
from more_itertools import unique_everseen from more_itertools import unique_everseen
from my.core import ( from my.core import (
datetime_aware,
get_files,
make_logger,
stat,
Json, Json,
Paths, Paths,
Res, Res,
Stats, Stats,
datetime_aware,
get_files,
make_logger,
stat,
warnings,
) )
from my.core import warnings
from my.core.cfg import make_config
from my.core.serialize import dumps as json_dumps from my.core.serialize import dumps as json_dumps
from .common import TweetId, permalink from .common import TweetId, permalink
@dataclass
class twitter_archive(user_config):
export_path: Paths # path[s]/glob to the twitter archive takeout
###
config = make_config(twitter_archive)
logger = make_logger(__name__) logger = make_logger(__name__)
class config:
@property
@abstractmethod
def export_path(self) -> Paths:
"""path[s]/glob to the twitter archive takeout"""
raise NotImplementedError
def make_config() -> config:
# before this config was named 'twitter', doesn't make too much sense for archive
# todo unify with other code like this, e.g. time.tz.via_location
try:
from my.config import twitter_archive as user_config
except ImportError as ie:
if not (ie.name == 'my.config' and 'twitter_archive' in str(ie)):
# must be caused by something else
raise ie
try:
from my.config import twitter as user_config # type: ignore[assignment]
except ImportError:
raise ie # raise the original exception.. must be something else # noqa: B904
else:
warnings.high('my.config.twitter is deprecated! Please rename it to my.config.twitter_archive in your config')
##
class combined_config(user_config, config):
pass
return combined_config()
def inputs() -> Sequence[Path]: def inputs() -> Sequence[Path]:
return get_files(config.export_path) return get_files(make_config().export_path)
# TODO make sure it's not used anywhere else and simplify interface # TODO make sure it's not used anywhere else and simplify interface
@ -121,7 +123,7 @@ class Tweet:
return res return res
@property @property
def urls(self) -> List[str]: def urls(self) -> list[str]:
ents = self.entities ents = self.entities
us = ents['urls'] us = ents['urls']
return [u['expanded_url'] for u in us] return [u['expanded_url'] for u in us]
@ -162,10 +164,10 @@ class Like:
return self.raw['tweetId'] return self.raw['tweetId']
@property @property
def text(self) -> Optional[str]: def text(self) -> str | None:
# NOTE: likes basically don't have anything except text and url # NOTE: likes basically don't have anything except text and url
# ugh. I think none means that tweet was deleted? # ugh. I think none means that tweet was deleted?
res: Optional[str] = self.raw.get('fullText') res: str | None = self.raw.get('fullText')
if res is None: if res is None:
return None return None
res = html.unescape(res) res = html.unescape(res)
@ -186,7 +188,7 @@ class ZipExport:
if not (self.zpath / 'Your archive.html').exists(): if not (self.zpath / 'Your archive.html').exists():
self.old_format = True self.old_format = True
def raw(self, what: str, *, fname: Optional[str] = None) -> Iterator[Json]: def raw(self, what: str, *, fname: str | None = None) -> Iterator[Json]:
logger.info(f'{self.zpath} : processing {what}') logger.info(f'{self.zpath} : processing {what}')
path = fname or what path = fname or what
@ -317,4 +319,5 @@ def stats() -> Stats:
## Deprecated stuff ## Deprecated stuff
Tid = TweetId if not TYPE_CHECKING:
Tid = TweetId