my.youtube.takeout: deduplicate watched videos and sort out a few minor errors

This commit is contained in:
Dima Gerasimov 2024-09-22 17:47:05 +01:00 committed by karlicoss
parent 75639a3d5e
commit 8ed9e1947e
3 changed files with 102 additions and 33 deletions

View file

@ -22,12 +22,17 @@ if not TYPE_CHECKING:
source.backup(dest, **kwargs) source.backup(dest, **kwargs)
# can remove after python3.9 (although need to keep the method itself for bwd compat) ## can remove after python3.9 (although need to keep the method itself for bwd compat)
def removeprefix(text: str, prefix: str) -> str: def removeprefix(text: str, prefix: str) -> str:
if text.startswith(prefix): if text.startswith(prefix):
return text[len(prefix) :] return text[len(prefix) :]
return text return text
def removesuffix(text: str, suffix: str) -> str:
    """
    Backport of ``str.removesuffix`` (available natively from python3.9).

    Returns ``text`` with ``suffix`` stripped from its end if it ends with it,
    otherwise returns ``text`` unchanged.
    """
    # an empty suffix always 'matches' via endswith, but text[:-0] is text[:0] == '',
    # which would wipe the whole string -- so require a non-empty suffix, same as the builtin
    if suffix and text.endswith(suffix):
        return text[: -len(suffix)]
    return text
##
## used to have compat function before 3.8 for these, keeping for runtime back compatibility ## used to have compat function before 3.8 for these, keeping for runtime back compatibility
if not TYPE_CHECKING: if not TYPE_CHECKING:

View file

@ -1,5 +1,10 @@
from ..core.warnings import high from my.core import __NOT_HPI_MODULE__
high("DEPRECATED! Please use my.youtube.takeout instead.")
from ..core.util import __NOT_HPI_MODULE__
from ..youtube.takeout import * from typing import TYPE_CHECKING
from my.core.warnings import high
high("DEPRECATED! Please use my.youtube.takeout instead.")
if not TYPE_CHECKING:
from my.youtube.takeout import *

View file

@ -1,13 +1,16 @@
from typing import NamedTuple, List, Iterable, TYPE_CHECKING from __future__ import annotations
from my.core import datetime_aware, make_logger, stat, Res, Stats from dataclasses import dataclass
from my.core.compat import deprecated, removeprefix from typing import TYPE_CHECKING, Any, Iterable, Iterator
from my.core import Res, Stats, datetime_aware, make_logger, stat, warnings
from my.core.compat import deprecated, removeprefix, removesuffix
logger = make_logger(__name__) logger = make_logger(__name__)
class Watched(NamedTuple): @dataclass
class Watched:
url: str url: str
title: str title: str
when: datetime_aware when: datetime_aware
@ -16,19 +19,57 @@ class Watched(NamedTuple):
def eid(self) -> str: def eid(self) -> str:
return f'{self.url}-{self.when.isoformat()}' return f'{self.url}-{self.when.isoformat()}'
def is_deleted(self) -> bool:
    """
    True when the title is identical to the url.

    NOTE(review): presumably takeout falls back to the url as the title once the video
    is no longer available, hence the name -- confirm against real takeout data.
    """
    return self.title == self.url
# todo define error policy? # todo define error policy?
# although it has one from google takeout module.. so not sure # although it has one from google takeout module.. so not sure
def watched() -> Iterable[Res[Watched]]:
def watched() -> Iterator[Res[Watched]]:
    """
    Deduplicating wrapper around ``_watched()``.

    Errors are passed through untouched. A watch is dropped when an item with the same
    url and the same timestamp (truncated to whole seconds) was already emitted,
    and its title is contained in the title of that previously emitted item.
    """
    seen: dict[Any, Watched] = {}
    for item in _watched():
        if isinstance(item, Exception):
            yield item  # TODO also make unique?
            continue

        # older exports (e.g. html) didn't have microseconds, whereas newer json ones do have them
        # seconds resolution is enough to distinguish watched videos
        # also we're processing takeouts in HPI in reverse order,
        # so first seen watch would contain microseconds, resulting in better data
        when_seconds = item.when.replace(microsecond=0)
        dedup_key = (item.url, when_seconds)

        earlier = seen.get(dedup_key)
        # NOTE: some video titles start with 'Liked ' for liked videos activity
        # but they'd have different timestamp, so fine not to handle them as a special case here
        if earlier is not None and item.title in earlier.title:
            # often more stuff added to the title, like 'Official Video'
            # in this case not worth emitting the change
            # also handles the case when titles match
            continue

        # either a brand new key, or the title changed completely -- in the latter case
        # just emit the change... not sure what else we could do?
        # could merge titles in the 'titles' field and update dynamically? but a bit complicated, maybe later..
        # TODO would also be nice to handle is_deleted here somehow...
        # but for that would need to process data in direct order vs reversed..
        # not sure, maybe this could use a special mode or something?
        seen[dedup_key] = item
        yield item
def _watched() -> Iterator[Res[Watched]]:
try: try:
from ..google.takeout.parser import events
from google_takeout_parser.models import Activity from google_takeout_parser.models import Activity
from ..google.takeout.parser import events
except ModuleNotFoundError as ex: except ModuleNotFoundError as ex:
logger.exception(ex) logger.exception(ex)
from ..core.warnings import high warnings.high("Please set up my.google.takeout.parser module for better youtube support. Falling back to legacy implementation.")
high("Please set up my.google.takeout.parser module for better youtube support. Falling back to legacy implementation.") yield from _watched_legacy() # type: ignore[name-defined]
yield from _watched_legacy()
return return
YOUTUBE_VIDEO_LINK = '://www.youtube.com/watch?v=' YOUTUBE_VIDEO_LINK = '://www.youtube.com/watch?v='
@ -43,12 +84,12 @@ def watched() -> Iterable[Res[Watched]]:
continue continue
url = e.titleUrl url = e.titleUrl
header = e.header
title = e.title
if url is None: if url is None:
continue continue
header = e.header
if header in {'Image Search', 'Search', 'Chrome'}: if header in {'Image Search', 'Search', 'Chrome'}:
# sometimes results in youtube links.. but definitely not watch history # sometimes results in youtube links.. but definitely not watch history
continue continue
@ -61,6 +102,8 @@ def watched() -> Iterable[Res[Watched]]:
pass pass
continue continue
title = e.title
if header == 'youtube.com' and title.startswith('Visited '): if header == 'youtube.com' and title.startswith('Visited '):
continue continue
@ -76,16 +119,32 @@ def watched() -> Iterable[Res[Watched]]:
# also compatible with legacy titles # also compatible with legacy titles
title = removeprefix(title, 'Watched ') title = removeprefix(title, 'Watched ')
# watches originating from some activity end with this, remove it for consistency
title = removesuffix(title, ' - YouTube')
if YOUTUBE_VIDEO_LINK not in url: if YOUTUBE_VIDEO_LINK not in url:
if e.details == ['From Google Ads']: if 'youtube.com/post/' in url:
# weird, sometimes results in odd # some sort of channel updates?
continue continue
if title == 'Used YouTube' and e.products == ['Android']: if 'youtube.com/playlist' in url:
# 'saved playlist' actions
continue
if 'music.youtube.com' in url:
# todo maybe allow it?
continue
if any('From Google Ads' in d for d in e.details):
# weird, sometimes results in odd urls
continue
if title == 'Used YouTube':
continue continue
yield RuntimeError(f'Unexpected url: {e}') yield RuntimeError(f'Unexpected url: {e}')
continue continue
# TODO contribute to takeout parser? seems that these still might happen in json data
title = title.replace("\xa0", " ")
yield Watched( yield Watched(
url=url, url=url,
title=title, title=title,
@ -100,24 +159,24 @@ def stats() -> Stats:
### deprecated stuff (keep in my.media.youtube) ### deprecated stuff (keep in my.media.youtube)
if not TYPE_CHECKING: if not TYPE_CHECKING:
@deprecated("use 'watched' instead") @deprecated("use 'watched' instead")
def get_watched(*args, **kwargs): def get_watched(*args, **kwargs):
return watched(*args, **kwargs) return watched(*args, **kwargs)
def _watched_legacy() -> Iterable[Watched]:
from ..google.takeout.html import read_html
from ..google.takeout.paths import get_last_takeout
def _watched_legacy() -> Iterable[Watched]: # todo looks like this one doesn't have retention? so enough to use the last
from ..google.takeout.html import read_html path = 'Takeout/My Activity/YouTube/MyActivity.html'
from ..google.takeout.paths import get_last_takeout last = get_last_takeout(path=path)
if last is None:
return []
# todo looks like this one doesn't have retention? so enough to use the last watches: list[Watched] = []
path = 'Takeout/My Activity/YouTube/MyActivity.html' for dt, url, title in read_html(last, path):
last = get_last_takeout(path=path) watches.append(Watched(url=url, title=title, when=dt))
if last is None:
return []
watches: List[Watched] = [] # todo hmm they already come sorted.. wonder if should just rely on it..
for dt, url, title in read_html(last, path): return sorted(watches, key=lambda e: e.when)
watches.append(Watched(url=url, title=title, when=dt))
# todo hmm they already come sorted.. wonder if should just rely on it..
return sorted(watches, key=lambda e: e.when)