my.google.takeout.parser: speed up event merging on newer google_takeout_parser versions

This commit is contained in:
Dima Gerasimov 2024-09-13 01:18:40 +01:00 committed by karlicoss
parent 71fdeca5e1
commit 27178c0939
2 changed files with 21 additions and 10 deletions

View file

@ -31,6 +31,7 @@ ABBR_TIMEZONES.extend(user_forced())
import google_takeout_parser
from google_takeout_parser.path_dispatch import TakeoutParser
from google_takeout_parser.merge import GoogleEventSet, CacheResults
from google_takeout_parser.models import BaseEvent
# see https://github.com/seanbreckenridge/dotfiles/blob/master/.config/my/my/config/__init__.py for an example
from my.config import google as user_config
@ -95,6 +96,17 @@ def events(disable_takeout_cache: bool = DISABLE_TAKEOUT_CACHE) -> CacheResults:
error_policy = config.error_policy
count = 0
emitted = GoogleEventSet()
try:
emitted_add = emitted.add_if_not_present
except AttributeError:
# compat for older versions of google_takeout_parser which didn't have this method
def emitted_add(other: BaseEvent) -> bool:
    """Record `other` in `emitted` unless it is already there.

    Returns True when the event was newly added, False when it was
    already present (in which case `emitted` is left untouched).
    """
    already_seen = other in emitted
    if not already_seen:
        emitted.add(other)
    return not already_seen
# the reversed order probably shouldn't matter, but the logic is to prefer
# newer takeouts if they're named according to date, since JSON Activity
# is nicer than HTML Activity
@ -123,10 +135,9 @@ def events(disable_takeout_cache: bool = DISABLE_TAKEOUT_CACHE) -> CacheResults:
elif error_policy == 'drop':
pass
continue
if event in emitted:
continue
emitted.add(event)
yield event # type: ignore[misc]
if emitted_add(event):
yield event # type: ignore[misc]
logger.debug(
f"HPI Takeout merge: from a total of {count} events, removed {count - len(emitted)} duplicates"
)

View file

@ -1,10 +1,10 @@
from typing import NamedTuple, List, Iterable, TYPE_CHECKING
from ..core import datetime_aware, Res, LazyLogger
from ..core.compat import removeprefix
from my.core import datetime_aware, make_logger, stat, Res, Stats
from my.core.compat import deprecated, removeprefix
logger = LazyLogger(__name__)
logger = make_logger(__name__)
class Watched(NamedTuple):
@ -93,7 +93,6 @@ def watched() -> Iterable[Res[Watched]]:
)
from ..core import stat, Stats
def stats() -> Stats:
    """Return whatever `stat` reports for the `watched` provider."""
    summary = stat(watched)
    return summary
@ -101,8 +100,9 @@ def stats() -> Stats:
### deprecated stuff (keep in my.media.youtube)
if not TYPE_CHECKING:
# "deprecate" by hiding from mypy
get_watched = watched
@deprecated("use 'watched' instead")
def get_watched(*args, **kwargs):
    """Deprecated entry point: forwards every argument to `watched`."""
    # Kept as a real function (rather than a plain alias) so the
    # `deprecated` decorator can wrap the call.
    forwarded = watched(*args, **kwargs)
    return forwarded
def _watched_legacy() -> Iterable[Watched]: