HPI/my/youtube/takeout.py
2024-11-26 21:51:40 +00:00

183 lines
6.1 KiB
Python

from __future__ import annotations
from collections.abc import Iterable, Iterator
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any
from my.core import Res, Stats, datetime_aware, make_logger, stat, warnings
from my.core.compat import deprecated
logger = make_logger(__name__)
@dataclass
class Watched:
url: str
title: str
when: datetime_aware
@property
def eid(self) -> str:
return f'{self.url}-{self.when.isoformat()}'
def is_deleted(self) -> bool:
return self.title == self.url
# todo define error policy?
# although it has one from google takeout module.. so not sure
def watched() -> Iterator[Res[Watched]]:
emitted: dict[Any, Watched] = {}
for w in _watched():
if isinstance(w, Exception):
yield w # TODO also make unique?
continue
# older exports (e.g. html) didn't have microseconds
# whereas newer json ones do have them
# seconds resolution is enough to distinguish watched videos
# also we're processing takeouts in HPI in reverse order, so first seen watch would contain microseconds, resulting in better data
without_microsecond = w.when.replace(microsecond=0)
key = w.url, without_microsecond
prev = emitted.get(key, None)
if prev is not None:
# NOTE: some video titles start with 'Liked ' for liked videos activity
# but they'd have different timestamp, so fine not to handle them as a special case here
if w.title in prev.title:
# often more stuff added to the title, like 'Official Video'
# in this case not worth emitting the change
# also handles the case when titles match
continue
# otherwise if title changed completely, just emit the change... not sure what else we could do?
# could merge titles in the 'titles' field and update dynamically? but a bit complicated, maybe later..
# TODO would also be nice to handle is_deleted here somehow...
# but for that would need to process data in direct order vs reversed..
# not sure, maybe this could use a special mode or something?
emitted[key] = w
yield w
def _watched() -> Iterator[Res[Watched]]:
try:
from google_takeout_parser.models import Activity
from ..google.takeout.parser import events
except ModuleNotFoundError as ex:
logger.exception(ex)
warnings.high("Please set up my.google.takeout.parser module for better youtube support. Falling back to legacy implementation.")
yield from _watched_legacy() # type: ignore[name-defined]
return
YOUTUBE_VIDEO_LINK = '://www.youtube.com/watch?v='
# TODO would be nice to filter, e.g. it's kinda pointless to process Location events
for e in events():
if isinstance(e, Exception):
yield e
continue
if not isinstance(e, Activity):
continue
url = e.titleUrl
if url is None:
continue
header = e.header
if header in {'Image Search', 'Search', 'Chrome'}:
# sometimes results in youtube links.. but definitely not watch history
continue
if header not in {'YouTube', 'youtube.com'}:
# TODO hmm -- wonder if these would end up in dupes in takeout? would be nice to check
# perhaps this would be easier once we have universal ids
if YOUTUBE_VIDEO_LINK in url:
# TODO maybe log in this case or something?
pass
continue
title = e.title
if header == 'youtube.com' and title.startswith('Visited '):
continue
if title.startswith('Searched for') and url.startswith('https://www.youtube.com/results'):
# search activity, don't need it here
continue
if title.startswith('Subscribed to') and url.startswith('https://www.youtube.com/channel/'):
# todo might be interesting to process somewhere?
continue
# all titles contain it, so pointless to include 'Watched '
# also compatible with legacy titles
title = title.removeprefix('Watched ')
# watches originating from some activity end with this, remove it for consistency
title = title.removesuffix(' - YouTube')
if YOUTUBE_VIDEO_LINK not in url:
if 'youtube.com/post/' in url:
# some sort of channel updates?
continue
if 'youtube.com/playlist' in url:
# 'saved playlist' actions
continue
if 'music.youtube.com' in url:
# todo maybe allow it?
continue
if any('From Google Ads' in d for d in e.details):
# weird, sometimes results in odd urls
continue
if title == 'Used YouTube':
continue
yield RuntimeError(f'Unexpected url: {e}')
continue
# TODO contribute to takeout parser? seems that these still might happen in json data
title = title.replace("\xa0", " ")
yield Watched(
url=url,
title=title,
when=e.time,
)
def stats() -> Stats:
return stat(watched)
### deprecated stuff (keep in my.media.youtube)
if not TYPE_CHECKING:
@deprecated("use 'watched' instead")
def get_watched(*args, **kwargs):
return watched(*args, **kwargs)
def _watched_legacy() -> Iterable[Watched]:
from ..google.takeout.html import read_html
from ..google.takeout.paths import get_last_takeout
# todo looks like this one doesn't have retention? so enough to use the last
path = 'Takeout/My Activity/YouTube/MyActivity.html'
last = get_last_takeout(path=path)
if last is None:
return []
watches: list[Watched] = []
for dt, url, title in read_html(last, path):
watches.append(Watched(url=url, title=title, when=dt))
# todo hmm they already come sorted.. wonder if should just rely on it..
return sorted(watches, key=lambda e: e.when)