my.twitter.archive: cleanup linting and use proper configuration via abstract class

This commit is contained in:
Dima Gerasimov 2024-09-22 02:03:28 +01:00 committed by karlicoss
parent 239e6617fe
commit 02dabe9f2b

View file

@ -2,12 +2,56 @@
Twitter data (uses [[https://help.twitter.com/en/managing-your-account/how-to-download-your-twitter-archive][official twitter archive export]]) Twitter data (uses [[https://help.twitter.com/en/managing-your-account/how-to-download-your-twitter-archive][official twitter archive export]])
""" """
from __future__ import annotations
# before this config was named 'twitter', doesn't make too much sense for archive import html
# todo unify with other code like this, e.g. time.tz.via_location import json # hmm interesting enough, orjson didn't give much speedup here?
try: from abc import abstractmethod
from dataclasses import dataclass
from datetime import datetime
from functools import cached_property
from itertools import chain
from pathlib import Path
from typing import (
TYPE_CHECKING,
Iterator,
Sequence,
)
from more_itertools import unique_everseen
from my.core import (
Json,
Paths,
Res,
Stats,
datetime_aware,
get_files,
make_logger,
stat,
warnings,
)
from my.core.serialize import dumps as json_dumps
from .common import TweetId, permalink
logger = make_logger(__name__)
class config:
@property
@abstractmethod
def export_path(self) -> Paths:
"""path[s]/glob to the twitter archive takeout"""
raise NotImplementedError
def make_config() -> config:
# before this config was named 'twitter', doesn't make too much sense for archive
# todo unify with other code like this, e.g. time.tz.via_location
try:
from my.config import twitter_archive as user_config from my.config import twitter_archive as user_config
except ImportError as ie: except ImportError as ie:
if not (ie.name == 'my.config' and 'twitter_archive' in str(ie)): if not (ie.name == 'my.config' and 'twitter_archive' in str(ie)):
# must be caused by something else # must be caused by something else
raise ie raise ie
@ -16,59 +60,17 @@ except ImportError as ie:
except ImportError: except ImportError:
raise ie # raise the original exception.. must be something else # noqa: B904 raise ie # raise the original exception.. must be something else # noqa: B904
else: else:
from my.core import warnings
warnings.high('my.config.twitter is deprecated! Please rename it to my.config.twitter_archive in your config') warnings.high('my.config.twitter is deprecated! Please rename it to my.config.twitter_archive in your config')
## ##
class combined_config(user_config, config):
pass
from dataclasses import dataclass return combined_config()
from datetime import datetime
from itertools import chain
import json # hmm interesting enough, orjson didn't give much speedup here?
from pathlib import Path
from functools import cached_property
import html
from typing import (
Iterator,
List,
Optional,
Sequence,
)
from more_itertools import unique_everseen
from my.core import (
datetime_aware,
get_files,
make_logger,
stat,
Json,
Paths,
Res,
Stats,
)
from my.core import warnings
from my.core.cfg import make_config
from my.core.serialize import dumps as json_dumps
from .common import TweetId, permalink
@dataclass
class twitter_archive(user_config):
export_path: Paths # path[s]/glob to the twitter archive takeout
###
config = make_config(twitter_archive)
logger = make_logger(__name__)
def inputs() -> Sequence[Path]: def inputs() -> Sequence[Path]:
return get_files(config.export_path) return get_files(make_config().export_path)
# TODO make sure it's not used anywhere else and simplify interface # TODO make sure it's not used anywhere else and simplify interface
@ -121,7 +123,7 @@ class Tweet:
return res return res
@property @property
def urls(self) -> List[str]: def urls(self) -> list[str]:
ents = self.entities ents = self.entities
us = ents['urls'] us = ents['urls']
return [u['expanded_url'] for u in us] return [u['expanded_url'] for u in us]
@ -162,10 +164,10 @@ class Like:
return self.raw['tweetId'] return self.raw['tweetId']
@property @property
def text(self) -> Optional[str]: def text(self) -> str | None:
# NOTE: likes basically don't have anything except text and url # NOTE: likes basically don't have anything except text and url
# ugh. I think none means that tweet was deleted? # ugh. I think none means that tweet was deleted?
res: Optional[str] = self.raw.get('fullText') res: str | None = self.raw.get('fullText')
if res is None: if res is None:
return None return None
res = html.unescape(res) res = html.unescape(res)
@ -186,7 +188,7 @@ class ZipExport:
if not (self.zpath / 'Your archive.html').exists(): if not (self.zpath / 'Your archive.html').exists():
self.old_format = True self.old_format = True
def raw(self, what: str, *, fname: Optional[str] = None) -> Iterator[Json]: def raw(self, what: str, *, fname: str | None = None) -> Iterator[Json]:
logger.info(f'{self.zpath} : processing {what}') logger.info(f'{self.zpath} : processing {what}')
path = fname or what path = fname or what
@ -317,4 +319,5 @@ def stats() -> Stats:
## Deprecated stuff ## Deprecated stuff
Tid = TweetId if not TYPE_CHECKING:
Tid = TweetId