my.youtube.takeout: deduplicate watched videos and sort out a few minor errors

2024-09-22 17:47:05 +01:00 · 2024-09-22 17:47:05 +01:00 · 8ed9e1947e
commit 8ed9e1947e
parent 75639a3d5e
3 changed files with 102 additions and 33 deletions
--- a/my/core/compat.py
+++ b/my/core/compat.py
@ -22,12 +22,17 @@ if not TYPE_CHECKING:
        source.backup(dest, **kwargs)
-# can remove after python3.9 (although need to keep the method itself for bwd compat)
+## can remove after python3.9 (although need to keep the method itself for bwd compat)
 def removeprefix(text: str, prefix: str) -> str:
    """Return ``text`` with a leading ``prefix`` stripped; ``text`` is returned as is when it doesn't start with it."""
    return text[len(prefix) :] if text.startswith(prefix) else text
 def removesuffix(text: str, suffix: str) -> str:
    """
    Return ``text`` with a trailing ``suffix`` stripped, mirroring ``str.removesuffix`` (python3.9+).

    ``text`` is returned unchanged when it doesn't end with ``suffix``, or when ``suffix`` is empty.
    """
    # the empty suffix needs a guard: every string "ends with" '',
    # and text[:-0] is text[:0], i.e. '' -- which would wipe the whole string
    if suffix and text.endswith(suffix):
        return text[: len(text) - len(suffix)]
    return text
 ##
 ## used to have compat function before 3.8 for these, keeping for runtime back compatibility
 if not TYPE_CHECKING:
--- a/my/media/youtube.py
+++ b/my/media/youtube.py
@ -1,5 +1,10 @@
-from ..core.warnings import high
+from my.core import __NOT_HPI_MODULE__
 high("DEPRECATED! Please use my.youtube.takeout instead.")
 from ..core.util import __NOT_HPI_MODULE__
-from ..youtube.takeout import *
+from typing import TYPE_CHECKING
 from my.core.warnings import high
 high("DEPRECATED! Please use my.youtube.takeout instead.")
 if not TYPE_CHECKING:
    from my.youtube.takeout import *
--- a/my/youtube/takeout.py
+++ b/my/youtube/takeout.py
@ -1,13 +1,16 @@
-from typing import NamedTuple, List, Iterable, TYPE_CHECKING
+from __future__ import annotations
-from my.core import datetime_aware, make_logger, stat, Res, Stats
+from dataclasses import dataclass
-from my.core.compat import deprecated, removeprefix
+from typing import TYPE_CHECKING, Any, Iterable, Iterator
 from my.core import Res, Stats, datetime_aware, make_logger, stat, warnings
 from my.core.compat import deprecated, removeprefix, removesuffix
 logger = make_logger(__name__)
-class Watched(NamedTuple):
+@dataclass
 class Watched:
    url: str
    title: str
    when: datetime_aware
@ -16,19 +19,57 @@ class Watched(NamedTuple):
    def eid(self) -> str:
        return f'{self.url}-{self.when.isoformat()}'
    def is_deleted(self) -> bool:
        return self.title == self.url
 # todo define error policy?
 # although it has one from google takeout module.. so not sure
-def watched() -> Iterable[Res[Watched]]:
+
 def watched() -> Iterator[Res[Watched]]:
    """
    Yield items from ``_watched()``, skipping repeats of watches that were already emitted.

    Errors are passed through untouched. Two watches are treated as the same one when they
    share the url and the timestamp truncated to whole seconds; such a repeat is dropped
    when its title is contained in the title of the last emitted item with the same key.
    """
    seen: dict[Any, Watched] = {}
    for item in _watched():
        if isinstance(item, Exception):
            yield item  # TODO also make unique?
            continue

        # older exports (e.g. html) didn't have microseconds, whereas newer json ones do have them
        # seconds resolution is enough to distinguish watched videos
        # also we're processing takeouts in HPI in reverse order, so first seen watch would contain microseconds, resulting in better data
        when_seconds = item.when.replace(microsecond=0)
        key = (item.url, when_seconds)

        last = seen.get(key)
        # NOTE: some video titles start with 'Liked ' for liked videos activity
        # but they'd have different timestamp, so fine not to handle them as a special case here
        if last is not None and item.title in last.title:
            # often more stuff added to the title, like 'Official Video'
            # in this case not worth emitting the change
            # also handles the case when titles match
            continue

        # otherwise (first occurrence, or the title changed completely) just emit the item... not sure what else we could do?
        # could merge titles in the 'titles' field and update dynamically? but a bit complicated, maybe later..
        # TODO would also be nice to handle is_deleted here somehow...
        # but for that would need to process data in direct order vs reversed..
        # not sure, maybe this could use a special mode or something?
        seen[key] = item
        yield item
 def _watched() -> Iterator[Res[Watched]]:
    try:
        from ..google.takeout.parser import events
        from google_takeout_parser.models import Activity
        from ..google.takeout.parser import events
    except ModuleNotFoundError as ex:
        logger.exception(ex)
-        from ..core.warnings import high
+        warnings.high("Please set up my.google.takeout.parser module for better youtube support. Falling back to legacy implementation.")
-        high("Please set up my.google.takeout.parser module for better youtube support. Falling back to legacy implementation.")
+        yield from _watched_legacy()  # type: ignore[name-defined]
        yield from _watched_legacy()
        return
    YOUTUBE_VIDEO_LINK = '://www.youtube.com/watch?v='
@ -43,12 +84,12 @@ def watched() -> Iterable[Res[Watched]]:
            continue
        url = e.titleUrl
        header = e.header
        title = e.title
        if url is None:
            continue
        header = e.header
        if header in {'Image Search', 'Search', 'Chrome'}:
            # sometimes results in youtube links.. but definitely not watch history
            continue
@ -61,6 +102,8 @@ def watched() -> Iterable[Res[Watched]]:
                pass
            continue
        title = e.title
        if header == 'youtube.com' and title.startswith('Visited '):
            continue
@ -76,16 +119,32 @@ def watched() -> Iterable[Res[Watched]]:
        # also compatible with legacy titles
        title = removeprefix(title, 'Watched ')
        # watches originating from some activity end with this, remove it for consistency
        title = removesuffix(title, ' - YouTube')
        if YOUTUBE_VIDEO_LINK not in url:
-            if e.details == ['From Google Ads']:
+            if 'youtube.com/post/' in url:
-                # weird, sometimes results in odd
+                # some sort of channel updates?
                continue
-            if title == 'Used YouTube' and e.products == ['Android']:
+            if 'youtube.com/playlist' in url:
                # 'saved playlist' actions
                continue
            if 'music.youtube.com' in url:
                # todo maybe allow it?
                continue
            if any('From Google Ads' in d for d in e.details):
                # weird, sometimes results in odd urls
                continue
            if title == 'Used YouTube':
                continue
            yield RuntimeError(f'Unexpected url: {e}')
            continue
        # TODO contribute to takeout parser? seems that these still might happen in json data
        title = title.replace("\xa0", " ")
        yield Watched(
            url=url,
            title=title,
@ -100,24 +159,24 @@ def stats() -> Stats:
 ### deprecated stuff (keep in my.media.youtube)
 if not TYPE_CHECKING:
    @deprecated("use 'watched' instead")
    def get_watched(*args, **kwargs):
        return watched(*args, **kwargs)
    def _watched_legacy() -> Iterable[Watched]:
        from ..google.takeout.html import read_html
        from ..google.takeout.paths import get_last_takeout
-def _watched_legacy() -> Iterable[Watched]:
+        # todo looks like this one doesn't have retention? so enough to use the last
-    from ..google.takeout.html import read_html
+        path = 'Takeout/My Activity/YouTube/MyActivity.html'
-    from ..google.takeout.paths import get_last_takeout
+        last = get_last_takeout(path=path)
        if last is None:
            return []
-    # todo looks like this one doesn't have retention? so enough to use the last
+        watches: list[Watched] = []
-    path = 'Takeout/My Activity/YouTube/MyActivity.html'
+        for dt, url, title in read_html(last, path):
-    last = get_last_takeout(path=path)
+            watches.append(Watched(url=url, title=title, when=dt))
    if last is None:
        return []
-    watches: List[Watched] = []
+        # todo hmm they already come sorted.. wonder if should just rely on it..
-    for dt, url, title in read_html(last, path):
+        return sorted(watches, key=lambda e: e.when)
        watches.append(Watched(url=url, title=title, when=dt))
    # todo hmm they already come sorted.. wonder if should just rely on it..
    return sorted(watches, key=lambda e: e.when)