my.twitter.archive: cleanup linting and use proper configuration via abstract class
parent 239e6617fe
commit 02dabe9f2b
1 changed file with 50 additions and 47 deletions
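Note: the user-facing side of the configuration is not touched by this rework; my.config is still expected to provide a twitter_archive class exposing export_path, and the module now merges it with an abstract base instead of wrapping it in a dataclass. A minimal sketch of that user side (the glob value is made up, not from the repo):

# my/config.py -- hypothetical user config, the path is illustrative only
class twitter_archive:
    export_path = '~/backups/twitter/twitter-*.zip'  # path[s]/glob to the twitter archive takeout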
my/twitter/archive.py
@@ -2,12 +2,56 @@
 Twitter data (uses [[https://help.twitter.com/en/managing-your-account/how-to-download-your-twitter-archive][official twitter archive export]])
 """

-
-# before this config was named 'twitter', doesn't make too much sense for archive
-# todo unify with other code like this, e.g. time.tz.via_location
-try:
-    from my.config import twitter_archive as user_config
-except ImportError as ie:
-    if not (ie.name == 'my.config' and 'twitter_archive' in str(ie)):
-        # must be caused by something else
-        raise ie
+from __future__ import annotations
+
+import html
+import json  # hmm interesting enough, orjson didn't give much speedup here?
+from abc import abstractmethod
+from dataclasses import dataclass
+from datetime import datetime
+from functools import cached_property
+from itertools import chain
+from pathlib import Path
+from typing import (
+    TYPE_CHECKING,
+    Iterator,
+    Sequence,
+)
+
+from more_itertools import unique_everseen
+
+from my.core import (
+    Json,
+    Paths,
+    Res,
+    Stats,
+    datetime_aware,
+    get_files,
+    make_logger,
+    stat,
+    warnings,
+)
+from my.core.serialize import dumps as json_dumps
+
+from .common import TweetId, permalink
+
+logger = make_logger(__name__)
+
+
+class config:
+    @property
+    @abstractmethod
+    def export_path(self) -> Paths:
+        """path[s]/glob to the twitter archive takeout"""
+        raise NotImplementedError
+
+
+def make_config() -> config:
+    # before this config was named 'twitter', doesn't make too much sense for archive
+    # todo unify with other code like this, e.g. time.tz.via_location
+    try:
+        from my.config import twitter_archive as user_config
+    except ImportError as ie:
+        if not (ie.name == 'my.config' and 'twitter_archive' in str(ie)):
+            # must be caused by something else
+            raise ie
@@ -16,59 +60,17 @@ except ImportError as ie:
-    except ImportError:
-        raise ie  # raise the original exception.. must be something else  # noqa: B904
-    else:
-        from my.core import warnings
-        warnings.high('my.config.twitter is deprecated! Please rename it to my.config.twitter_archive in your config')
-##
-
-
-from dataclasses import dataclass
-from datetime import datetime
-from itertools import chain
-import json  # hmm interesting enough, orjson didn't give much speedup here?
-from pathlib import Path
-from functools import cached_property
-import html
-from typing import (
-    Iterator,
-    List,
-    Optional,
-    Sequence,
-)
-
-from more_itertools import unique_everseen
-
-from my.core import (
-    datetime_aware,
-    get_files,
-    make_logger,
-    stat,
-    Json,
-    Paths,
-    Res,
-    Stats,
-)
-from my.core import warnings
-from my.core.cfg import make_config
-from my.core.serialize import dumps as json_dumps
-
-from .common import TweetId, permalink
-
-
-@dataclass
-class twitter_archive(user_config):
-    export_path: Paths  # path[s]/glob to the twitter archive takeout
-
-
-###
-
-config = make_config(twitter_archive)
-
-
-logger = make_logger(__name__)
+        except ImportError:
+            raise ie  # raise the original exception.. must be something else  # noqa: B904
+        else:
+            warnings.high('my.config.twitter is deprecated! Please rename it to my.config.twitter_archive in your config')
+    ##
+
+    class combined_config(user_config, config):
+        pass
+
+    return combined_config()


 def inputs() -> Sequence[Path]:
-    return get_files(config.export_path)
+    return get_files(make_config().export_path)


 # TODO make sure it's not used anywhere else and simplify interface
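The two hunks above replace the @dataclass plus my.core.cfg.make_config wiring with an abstract config base and a combined_config subclass built inside make_config(). A standalone sketch of why that merge works follows; the class names mirror the diff, but export_path is typed as a plain str and its value is invented, to keep the sketch dependency-free:

from abc import abstractmethod


class config:
    @property
    @abstractmethod
    def export_path(self) -> str:
        """path[s]/glob to the twitter archive takeout"""
        raise NotImplementedError


class user_config:
    # stand-in for my.config.twitter_archive; the value is made up
    export_path = '~/backups/twitter/twitter-*.zip'


class combined_config(user_config, config):
    # user_config comes first in the MRO, so its plain class attribute
    # shadows the abstract property; config only declares the interface
    pass


print(combined_config().export_path)  # prints the user's glob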
@@ -121,7 +123,7 @@ class Tweet:
         return res

     @property
-    def urls(self) -> List[str]:
+    def urls(self) -> list[str]:
         ents = self.entities
         us = ents['urls']
         return [u['expanded_url'] for u in us]
@@ -162,10 +164,10 @@ class Like:
         return self.raw['tweetId']

     @property
-    def text(self) -> Optional[str]:
+    def text(self) -> str | None:
         # NOTE: likes basically don't have anything except text and url
         # ugh. I think none means that tweet was deleted?
-        res: Optional[str] = self.raw.get('fullText')
+        res: str | None = self.raw.get('fullText')
         if res is None:
             return None
         res = html.unescape(res)
@@ -186,7 +188,7 @@ class ZipExport:
         if not (self.zpath / 'Your archive.html').exists():
             self.old_format = True

-    def raw(self, what: str, *, fname: Optional[str] = None) -> Iterator[Json]:
+    def raw(self, what: str, *, fname: str | None = None) -> Iterator[Json]:
         logger.info(f'{self.zpath} : processing {what}')

         path = fname or what
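The three small hunks above are the typing cleanup: typing.List and typing.Optional are swapped for the builtin and PEP 604 spellings. The changed function signatures stay compatible with interpreters older than 3.10 because of the from __future__ import annotations added at the top of the file, which stores annotations as strings instead of evaluating them (the local variable annotation in Like.text is never evaluated anyway). A tiny illustration, not code from this module:

from __future__ import annotations  # PEP 563: annotations are stored as strings, not evaluated


def text(raw: dict) -> str | None:  # accepted even on Python 3.8/3.9 thanks to the future import
    return raw.get('fullText')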
@@ -317,4 +319,5 @@ def stats() -> Stats:


 ## Deprecated stuff
-Tid = TweetId
+if not TYPE_CHECKING:
+    Tid = TweetId
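The last hunk keeps the deprecated Tid alias importable at runtime while hiding it from type checkers, which appears to be the point of the TYPE_CHECKING guard: existing callers keep working, but statically checked code gets steered towards TweetId. The idiom in isolation, with TweetId stubbed out:

from typing import TYPE_CHECKING

TweetId = str  # stand-in for the alias from my.twitter.common

if not TYPE_CHECKING:
    # runtime-only branch: mypy never sees it, so new uses of Tid get flagged
    Tid = TweetId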