"""
Parses Google Takeout using [[https://github.com/seanbreckenridge/google_takeout_parser][google_takeout_parser]]
See [[https://github.com/seanbreckenridge/google_takeout_parser][google_takeout_parser]] for more information
about how to export and organize your takeouts
If the DISABLE_TAKEOUT_CACHE environment variable is set, this won't cache individual
exports in ~/.cache/google_takeout_parser
The directory set as takeout_path can be unpacked directories, or
zip files of the exports, which are temporarily unpacked while creating
the cachew cache
"""
REQUIRES = ["git+https://github.com/seanbreckenridge/google_takeout_parser"]
import os
from collections.abc import Sequence
from contextlib import ExitStack
from dataclasses import dataclass
from pathlib import Path
from typing import cast
from google_takeout_parser.parse_html.html_time_utils import ABBR_TIMEZONES
from my.core import Paths, Stats, get_files, make_config, make_logger, stat
from my.core.cachew import mcachew
from my.core.error import ErrorPolicy
from my.core.structure import match_structure
from my.core.time import user_forced
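
# extend the timezone abbreviations used when parsing HTML exports
# with any the user has forced via my.core.time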
ABBR_TIMEZONES.extend(user_forced())

import google_takeout_parser
from google_takeout_parser.merge import CacheResults, GoogleEventSet
from google_takeout_parser.models import BaseEvent
from google_takeout_parser.path_dispatch import TakeoutParser

# see https://github.com/seanbreckenridge/dotfiles/blob/master/.config/my/my/config/__init__.py for an example
from my.config import google as user_config
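
# a minimal, illustrative sketch of what that user config might look like
# (the takeout_path glob here is hypothetical; point it at your own exports):
#
#     class google:
#         takeout_path = '~/data/google_takeout/*'
#         error_policy = 'yield'

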
@dataclass
class google(user_config):
# directory which includes unpacked/zipped takeouts
takeout_path: Paths
error_policy: ErrorPolicy = 'yield'
# experimental flag to use core.kompress.ZipPath
# instead of unpacking to a tmp dir via match_structure
_use_zippath: bool = False

config = make_config(google)

logger = make_logger(__name__, level="warning")

# patch the takeout parser logger to match the computed loglevel
from google_takeout_parser.log import setup as setup_takeout_logger

setup_takeout_logger(logger.level)

DISABLE_TAKEOUT_CACHE = "DISABLE_TAKEOUT_CACHE" in os.environ


def inputs() -> Sequence[Path]:
    return get_files(config.takeout_path)


try:
from google_takeout_parser.locales.main import get_paths_for_functions
EXPECTED = tuple(get_paths_for_functions())
except ImportError:
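    # fallback for older google_takeout_parser versions which lack the locales module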
EXPECTED = (
"My Activity",
"Chrome",
"Location History",
"Youtube",
"YouTube and YouTube Music",
    )

google_takeout_version = str(getattr(google_takeout_parser, '__version__', 'unknown'))


def _cachew_depends_on() -> list[str]:
exports = sorted([str(p) for p in inputs()])
    # add the google_takeout_parser pip version to the hash, so the cache is recreated on breaking changes
exports.insert(0, f"google_takeout_version: {google_takeout_version}")
    return exports


# CacheResults is an iterator over all of the models in google_takeout_parser
# (plus any exceptions, depending on the error policy)
@mcachew(depends_on=_cachew_depends_on, logger=logger, force_file=True)
def events(disable_takeout_cache: bool = DISABLE_TAKEOUT_CACHE) -> CacheResults: # noqa: FBT001
error_policy = config.error_policy
count = 0
emitted = GoogleEventSet()
try:
emitted_add = emitted.add_if_not_present
except AttributeError:
# compat for older versions of google_takeout_parser which didn't have this method
def emitted_add(other: BaseEvent) -> bool:
if other in emitted:
return False
emitted.add(other)
return True
    # reversed shouldn't really matter much, but the intent is to prefer newer
    # takeouts when they're named according to date, since JSON Activity is
    # nicer to parse than HTML Activity
for path in reversed(inputs()):
with ExitStack() as exit_stack:
if config._use_zippath:
                # for newer takeouts it's just a 'Takeout' dir,
                # but older (pre-2015) exports include the email/date in the subdir name
results = tuple(cast(Sequence[Path], path.iterdir()))
else:
results = exit_stack.enter_context(match_structure(path, expected=EXPECTED, partial=True))
for m in results:
                # e.g. '/home/sean/data/google_takeout/Takeout-1634932457.zip' -> 'Takeout-1634932457',
                # which means that zipped takeouts get nice filenames in the cachew cache
cw_id, _, _ = path.name.rpartition(".")
# each takeout result is cached as well, in individual databases per-type
tk = TakeoutParser(m, cachew_identifier=cw_id, error_policy=error_policy)
# TODO might be nice to pass hpi cache dir?
for event in tk.parse(cache=not disable_takeout_cache):
count += 1
if isinstance(event, Exception):
if error_policy == 'yield':
yield event
elif error_policy == 'raise':
raise event
elif error_policy == 'drop':
pass
continue
if emitted_add(event):
yield event # type: ignore[misc]
logger.debug(
f"HPI Takeout merge: from a total of {count} events, removed {count - len(emitted)} duplicates"
)
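

# a rough usage sketch (illustrative):
#
#     from my.google.takeout.parser import events
#     for e in events():
#         if isinstance(e, Exception):  # yielded inline when error_policy == 'yield'
#             continue
#         ...  # otherwise e is one of the google_takeout_parser models

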
def stats() -> Stats:
return {
**stat(events),
}