diff --git a/my/core/compat.py b/my/core/compat.py
index eccaf07..29d4f04 100644
--- a/my/core/compat.py
+++ b/my/core/compat.py
@@ -22,12 +22,17 @@ if not TYPE_CHECKING:
         source.backup(dest, **kwargs)
 
 
-# can remove after python3.9 (although need to keep the method itself for bwd compat)
+## can remove after python3.9 (although need to keep the method itself for bwd compat)
 def removeprefix(text: str, prefix: str) -> str:
     if text.startswith(prefix):
         return text[len(prefix) :]
     return text
 
 
+def removesuffix(text: str, suffix: str) -> str:
+    if text.endswith(suffix):
+        return text[:-len(suffix)]
+    return text
+##
 ## used to have compat function before 3.8 for these, keeping for runtime back compatibility
 if not TYPE_CHECKING:
diff --git a/my/media/youtube.py b/my/media/youtube.py
index efaa74b..3ddbc14 100644
--- a/my/media/youtube.py
+++ b/my/media/youtube.py
@@ -1,5 +1,10 @@
-from ..core.warnings import high
-high("DEPRECATED! Please use my.youtube.takeout instead.")
-from ..core.util import __NOT_HPI_MODULE__
+from my.core import __NOT_HPI_MODULE__
 
-from ..youtube.takeout import *
+from typing import TYPE_CHECKING
+
+from my.core.warnings import high
+
+high("DEPRECATED! Please use my.youtube.takeout instead.")
+
+if not TYPE_CHECKING:
+    from my.youtube.takeout import *
diff --git a/my/youtube/takeout.py b/my/youtube/takeout.py
index 284c082..bbce46a 100644
--- a/my/youtube/takeout.py
+++ b/my/youtube/takeout.py
@@ -1,13 +1,16 @@
-from typing import NamedTuple, List, Iterable, TYPE_CHECKING
+from __future__ import annotations
 
-from my.core import datetime_aware, make_logger, stat, Res, Stats
-from my.core.compat import deprecated, removeprefix
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any, Iterable, Iterator
+from my.core import Res, Stats, datetime_aware, make_logger, stat, warnings
+from my.core.compat import deprecated, removeprefix, removesuffix
 
 
 logger = make_logger(__name__)
 
 
-class Watched(NamedTuple):
+@dataclass
+class Watched:
     url: str
     title: str
     when: datetime_aware
@@ -16,19 +19,57 @@ class Watched(NamedTuple):
     def eid(self) -> str:
         return f'{self.url}-{self.when.isoformat()}'
 
+    def is_deleted(self) -> bool:
+        return self.title == self.url
+
 
 # todo define error policy?
 # although it has one from google takeout module.. so not sure
-def watched() -> Iterable[Res[Watched]]:
+
+def watched() -> Iterator[Res[Watched]]:
+    emitted: dict[Any, Watched] = {}
+    for w in _watched():
+        if isinstance(w, Exception):
+            yield w  # TODO also make unique?
+            continue
+
+        # older exports (e.g. html) didn't have microseconds
+        # wheras newer json ones do have them
+        # seconds resolution is enough to distinguish watched videos
+        # also we're processing takeouts in HPI in reverse order, so first seen watch would contain microseconds, resulting in better data
+        without_microsecond = w.when.replace(microsecond=0)
+
+        key = w.url, without_microsecond
+        prev = emitted.get(key, None)
+        if prev is not None:
+            # NOTE: some video titles start with 'Liked ' for liked videos activity
+            # but they'd have different timestamp, so fine not to handle them as a special case here
+            if w.title in prev.title:
+                # often more stuff added to the title, like 'Official Video'
+                # in this case not worth emitting the change
+                # also handles the case when titles match
+                continue
+            # otherwise if title changed completely, just emit the change... not sure what else we could do?
+            # could merge titles in the 'titles' field and update dynamically? but a bit complicated, maybe later..
+
+        # TODO would also be nice to handle is_deleted here somehow...
+        # but for that would need to process data in direct order vs reversed..
+        # not sure, maybe this could use a special mode or something?
+
+        emitted[key] = w
+        yield w
+
+
+def _watched() -> Iterator[Res[Watched]]:
     try:
-        from ..google.takeout.parser import events
         from google_takeout_parser.models import Activity
+
+        from ..google.takeout.parser import events
     except ModuleNotFoundError as ex:
         logger.exception(ex)
-        from ..core.warnings import high
-        high("Please set up my.google.takeout.parser module for better youtube support. Falling back to legacy implementation.")
-        yield from _watched_legacy()
+        warnings.high("Please set up my.google.takeout.parser module for better youtube support. Falling back to legacy implementation.")
+        yield from _watched_legacy()  # type: ignore[name-defined]
         return
 
     YOUTUBE_VIDEO_LINK = '://www.youtube.com/watch?v='
 
@@ -43,12 +84,12 @@ def watched() -> Iterable[Res[Watched]]:
             continue
 
         url = e.titleUrl
-        header = e.header
-        title = e.title
 
         if url is None:
             continue
 
+        header = e.header
+
         if header in {'Image Search', 'Search', 'Chrome'}:
             # sometimes results in youtube links.. but definitely not watch history
             continue
@@ -61,6 +102,8 @@ def watched() -> Iterable[Res[Watched]]:
                 pass
             continue
 
+        title = e.title
+
         if header == 'youtube.com' and title.startswith('Visited '):
             continue
 
@@ -76,16 +119,32 @@ def watched() -> Iterable[Res[Watched]]:
 
         # also compatible with legacy titles
         title = removeprefix(title, 'Watched ')
 
+        # watches originating from some activity end with this, remove it for consistency
+        title = removesuffix(title, ' - YouTube')
+
         if YOUTUBE_VIDEO_LINK not in url:
-            if e.details == ['From Google Ads']:
-                # weird, sometimes results in odd
+            if 'youtube.com/post/' in url:
+                # some sort of channel updates?
                 continue
-            if title == 'Used YouTube' and e.products == ['Android']:
+            if 'youtube.com/playlist' in url:
+                # 'saved playlist' actions
+                continue
+            if 'music.youtube.com' in url:
+                # todo maybe allow it?
+                continue
+            if any('From Google Ads' in d for d in e.details):
+                # weird, sometimes results in odd urls
+                continue
+
+            if title == 'Used YouTube':
                 continue
             yield RuntimeError(f'Unexpected url: {e}')
             continue
 
+        # TODO contribute to takeout parser? seems that these still might happen in json data
+        title = title.replace("\xa0", " ")
+
         yield Watched(
             url=url,
             title=title,
@@ -100,24 +159,24 @@ def stats() -> Stats:
 ### deprecated stuff (keep in my.media.youtube)
 
 if not TYPE_CHECKING:
+
     @deprecated("use 'watched' instead")
     def get_watched(*args, **kwargs):
         return watched(*args, **kwargs)
 
+    def _watched_legacy() -> Iterable[Watched]:
+        from ..google.takeout.html import read_html
+        from ..google.takeout.paths import get_last_takeout
 
-def _watched_legacy() -> Iterable[Watched]:
-    from ..google.takeout.html import read_html
-    from ..google.takeout.paths import get_last_takeout
+        # todo looks like this one doesn't have retention? so enough to use the last
+        path = 'Takeout/My Activity/YouTube/MyActivity.html'
+        last = get_last_takeout(path=path)
+        if last is None:
+            return []
 
-    # todo looks like this one doesn't have retention? so enough to use the last
-    path = 'Takeout/My Activity/YouTube/MyActivity.html'
-    last = get_last_takeout(path=path)
-    if last is None:
-        return []
+        watches: list[Watched] = []
+        for dt, url, title in read_html(last, path):
+            watches.append(Watched(url=url, title=title, when=dt))
 
-    watches: List[Watched] = []
-    for dt, url, title in read_html(last, path):
-        watches.append(Watched(url=url, title=title, when=dt))
-
-    # todo hmm they already come sorted.. wonder if should just rely on it..
-    return sorted(watches, key=lambda e: e.when)
+        # todo hmm they already come sorted.. wonder if should just rely on it..
+        return sorted(watches, key=lambda e: e.when)
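
Illustrative sketch (not part of the patch; the names below are simplified stand-ins): the new watched() wrapper deduplicates on (url, timestamp truncated to seconds) and skips an entry whose title is already contained in the previously emitted title, while titles get the 'Watched ' prefix, the ' - YouTube' suffix and non-breaking spaces stripped. A minimal, self-contained approximation of that logic:

    from dataclasses import dataclass
    from datetime import datetime, timezone


    # Simplified stand-in for my.youtube.takeout.Watched, for illustration only.
    @dataclass
    class Watch:
        url: str
        title: str
        when: datetime


    def clean_title(title: str) -> str:
        # mirrors the patch's cleanup steps: legacy 'Watched ' prefix, ' - YouTube' suffix,
        # and non-breaking spaces (uses the 3.9+ str builtins instead of my.core.compat)
        title = title.removeprefix('Watched ')
        title = title.removesuffix(' - YouTube')
        return title.replace('\xa0', ' ')


    def dedup(watches: list[Watch]) -> list[Watch]:
        # key on (url, second-resolution timestamp): older HTML exports have no
        # microseconds while newer JSON exports do, so seconds are the common ground
        emitted: dict[tuple[str, datetime], Watch] = {}
        out: list[Watch] = []
        for w in watches:
            key = (w.url, w.when.replace(microsecond=0))
            prev = emitted.get(key)
            if prev is not None and w.title in prev.title:
                # same watch event, title subsumed by the one already emitted -> skip
                continue
            emitted[key] = w
            out.append(w)
        return out


    if __name__ == '__main__':
        ts = datetime(2024, 1, 1, 12, 0, 0, tzinfo=timezone.utc)
        # newer export: has microseconds and a longer title
        a = Watch('https://www.youtube.com/watch?v=abc',
                  clean_title('Watched Some\xa0Video (Official Video) - YouTube'),
                  ts.replace(microsecond=123456))
        # older HTML export of the same watch: second resolution, shorter title
        b = Watch('https://www.youtube.com/watch?v=abc',
                  clean_title('Watched Some\xa0Video'),
                  ts)
        assert a.title == 'Some Video (Official Video)'
        assert dedup([a, b]) == [a]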