my.youtube.takeout: deduplicate watched videos and sort out a few minor errors

Dima Gerasimov 2024-09-22 17:47:05 +01:00 committed by karlicoss
parent 75639a3d5e
commit 8ed9e1947e
3 changed files with 102 additions and 33 deletions

my/core/compat.py

@@ -22,12 +22,17 @@ if not TYPE_CHECKING:
         source.backup(dest, **kwargs)
 
-# can remove after python3.9 (although need to keep the method itself for bwd compat)
+## can remove after python3.9 (although need to keep the method itself for bwd compat)
 def removeprefix(text: str, prefix: str) -> str:
     if text.startswith(prefix):
         return text[len(prefix) :]
     return text
+
+def removesuffix(text: str, suffix: str) -> str:
+    if text.endswith(suffix):
+        return text[:-len(suffix)]
+    return text
+##
 
 ## used to have compat function before 3.8 for these, keeping for runtime back compatibility
 if not TYPE_CHECKING:
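
For reference, these backports mirror str.removeprefix/str.removesuffix from Python 3.9. A quick illustration (input values are made up):

    from my.core.compat import removeprefix, removesuffix

    assert removeprefix('Watched foo', 'Watched ') == 'foo'
    assert removesuffix('foo - YouTube', ' - YouTube') == 'foo'
    assert removesuffix('foo', ' - YouTube') == 'foo'  # no such suffix: returned unchanged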

my/media/youtube.py

@@ -1,5 +1,10 @@
-from ..core.warnings import high
-high("DEPRECATED! Please use my.youtube.takeout instead.")
-from ..core.util import __NOT_HPI_MODULE__
+from my.core import __NOT_HPI_MODULE__
 
-from ..youtube.takeout import *
+from typing import TYPE_CHECKING
+
+from my.core.warnings import high
+
+high("DEPRECATED! Please use my.youtube.takeout instead.")
+
+if not TYPE_CHECKING:
+    from my.youtube.takeout import *
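
The net effect of the shim (a hedged sketch; only the warning text above is from this commit): at runtime the star import keeps old code working while emitting the deprecation warning, and the `if not TYPE_CHECKING` guard hides the re-export from static type checkers, steering annotations towards the new module.

    import my.media.youtube  # still works, but emits the warning via my.core.warnings.high
    from my.youtube.takeout import watched  # preferred import going forward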

my/youtube/takeout.py

@@ -1,13 +1,16 @@
-from typing import NamedTuple, List, Iterable, TYPE_CHECKING
+from __future__ import annotations
 
-from ..core import datetime_aware, make_logger, stat, Res, Stats
-from ..core.compat import deprecated, removeprefix
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any, Iterable, Iterator
+
+from my.core import Res, Stats, datetime_aware, make_logger, stat, warnings
+from my.core.compat import deprecated, removeprefix, removesuffix
 
 logger = make_logger(__name__)
 
 
-class Watched(NamedTuple):
+@dataclass
+class Watched:
     url: str
     title: str
     when: datetime_aware
@@ -16,19 +19,57 @@ class Watched(NamedTuple):
     def eid(self) -> str:
         return f'{self.url}-{self.when.isoformat()}'
 
+    def is_deleted(self) -> bool:
+        return self.title == self.url
+
 
 # todo define error policy?
 # although it has one from google takeout module.. so not sure
 
 
-def watched() -> Iterable[Res[Watched]]:
+def watched() -> Iterator[Res[Watched]]:
+    emitted: dict[Any, Watched] = {}
+    for w in _watched():
+        if isinstance(w, Exception):
+            yield w  # TODO also make unique?
+            continue
+
+        # older exports (e.g. html) didn't have microseconds,
+        # whereas newer json ones do have them
+        # seconds resolution is enough to distinguish watched videos
+        # also we're processing takeouts in HPI in reverse order, so the first watch we see contains microseconds, resulting in better data
+        without_microsecond = w.when.replace(microsecond=0)
+
+        key = w.url, without_microsecond
+        prev = emitted.get(key, None)
+        if prev is not None:
+            # NOTE: some video titles start with 'Liked ' for liked videos activity,
+            # but they'd have a different timestamp, so it's fine not to handle them as a special case here
+            if w.title in prev.title:
+                # often more stuff is added to the title, like 'Official Video';
+                # in this case it's not worth emitting the change
+                # (this also handles the case when the titles match)
+                continue
+            # otherwise, if the title changed completely, just emit the change... not sure what else we could do?
+            # could merge titles into a 'titles' field and update dynamically? but a bit complicated, maybe later..
+
+        # TODO would also be nice to handle is_deleted here somehow...
+        # but for that we'd need to process data in direct order rather than reversed..
+        # not sure, maybe this could use a special mode or something?
+        emitted[key] = w
+        yield w
+
+
+def _watched() -> Iterator[Res[Watched]]:
     try:
-        from ..google.takeout.parser import events
         from google_takeout_parser.models import Activity
+
+        from ..google.takeout.parser import events
     except ModuleNotFoundError as ex:
         logger.exception(ex)
-        from ..core.warnings import high
-        high("Please set up my.google.takeout.parser module for better youtube support. Falling back to legacy implementation.")
+        warnings.high("Please set up my.google.takeout.parser module for better youtube support. Falling back to legacy implementation.")
+
         yield from _watched_legacy()  # type: ignore[name-defined]
         return
 
     YOUTUBE_VIDEO_LINK = '://www.youtube.com/watch?v='
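
To make the dedup key concrete, a minimal sketch with made-up values (the html-export entry lacks microseconds, the json one has them, yet both collapse onto the same key):

    from datetime import datetime, timezone

    from my.youtube.takeout import Watched

    url = 'https://www.youtube.com/watch?v=XXXXXXXXXXX'
    w_json = Watched(url=url, title='some video (Official Video)',
                     when=datetime(2024, 9, 22, 17, 47, 5, 123456, tzinfo=timezone.utc))
    w_html = Watched(url=url, title='some video',
                     when=datetime(2024, 9, 22, 17, 47, 5, tzinfo=timezone.utc))

    key = lambda w: (w.url, w.when.replace(microsecond=0))
    assert key(w_json) == key(w_html)
    # takeouts are processed newest-first, so w_json is seen and emitted first;
    # w_html's title is a substring of the emitted title, so it's skipped
    assert w_html.title in w_json.title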
@@ -43,12 +84,12 @@ def watched() -> Iterable[Res[Watched]]:
             continue
 
         url = e.titleUrl
-        header = e.header
-        title = e.title
-
         if url is None:
             continue
 
+        header = e.header
+
         if header in {'Image Search', 'Search', 'Chrome'}:
             # sometimes results in youtube links.. but definitely not watch history
             continue
@@ -61,6 +102,8 @@ def watched() -> Iterable[Res[Watched]]:
                 pass
             continue
 
+        title = e.title
+
         if header == 'youtube.com' and title.startswith('Visited '):
             continue
@@ -76,16 +119,32 @@ def watched() -> Iterable[Res[Watched]]:
         # also compatible with legacy titles
         title = removeprefix(title, 'Watched ')
 
+        # watches originating from some activity end with this, remove it for consistency
+        title = removesuffix(title, ' - YouTube')
+
         if YOUTUBE_VIDEO_LINK not in url:
-            if e.details == ['From Google Ads']:
-                # weird, sometimes results in odd
+            if 'youtube.com/post/' in url:
+                # some sort of channel updates?
                 continue
-            if title == 'Used YouTube' and e.products == ['Android']:
+            if 'youtube.com/playlist' in url:
+                # 'saved playlist' actions
+                continue
+            if 'music.youtube.com' in url:
+                # todo maybe allow it?
+                continue
+            if any('From Google Ads' in d for d in e.details):
+                # weird, sometimes results in odd urls
+                continue
+            if title == 'Used YouTube':
                 continue
 
             yield RuntimeError(f'Unexpected url: {e}')
             continue
 
+        # TODO contribute to takeout parser? seems that these still might happen in json data
+        title = title.replace("\xa0", " ")
+
         yield Watched(
             url=url,
             title=title,
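
Putting the title cleanup steps together on a made-up raw title (non-breaking space included):

    from my.core.compat import removeprefix, removesuffix

    raw = 'Watched Some\xa0Video - YouTube'
    title = removeprefix(raw, 'Watched ')
    title = removesuffix(title, ' - YouTube')
    title = title.replace('\xa0', ' ')
    assert title == 'Some Video'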
@@ -100,11 +159,11 @@ def stats() -> Stats:
 ### deprecated stuff (keep in my.media.youtube)
 
 if not TYPE_CHECKING:
+
     @deprecated("use 'watched' instead")
     def get_watched(*args, **kwargs):
         return watched(*args, **kwargs)
 
-
 def _watched_legacy() -> Iterable[Watched]:
     from ..google.takeout.html import read_html
     from ..google.takeout.paths import get_last_takeout
@@ -115,7 +174,7 @@ def _watched_legacy() -> Iterable[Watched]:
     if last is None:
         return []
 
-    watches: List[Watched] = []
+    watches: list[Watched] = []
     for dt, url, title in read_html(last, path):
         watches.append(Watched(url=url, title=title, when=dt))
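
A hedged usage sketch of the new deduplicated stream; since the error policy is left to the caller (see the todo above), exceptions are simply skipped here:

    from my.youtube.takeout import watched

    for w in watched():
        if isinstance(w, Exception):
            continue  # or log/collect, depending on your error policy
        print(w.when.isoformat(), w.title, w.url)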