my.youtube.takeout: deduplicate watched videos and sort out a few minor errors
This commit is contained in:
parent
75639a3d5e
commit
8ed9e1947e
3 changed files with 102 additions and 33 deletions
|
@ -22,12 +22,17 @@ if not TYPE_CHECKING:
|
||||||
source.backup(dest, **kwargs)
|
source.backup(dest, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
# can remove after python3.9 (although need to keep the method itself for bwd compat)
|
## can remove after python3.9 (although need to keep the method itself for bwd compat)
|
||||||
def removeprefix(text: str, prefix: str) -> str:
|
def removeprefix(text: str, prefix: str) -> str:
|
||||||
if text.startswith(prefix):
|
if text.startswith(prefix):
|
||||||
return text[len(prefix) :]
|
return text[len(prefix) :]
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
def removesuffix(text: str, suffix: str) -> str:
    """Return ``text`` with a trailing ``suffix`` stripped, if it ends with it.

    Compat stand-in for ``str.removesuffix`` (available from Python 3.9).

    :param text: the string to strip
    :param suffix: the suffix to remove; an empty suffix leaves ``text`` untouched
    :return: ``text`` without the trailing ``suffix``, or ``text`` unchanged
        when it does not end with ``suffix``
    """
    # The emptiness check matters: ``text.endswith('')`` is always True and
    # ``text[:-0]`` is ``text[:0]``, which would wrongly return an empty string.
    if suffix and text.endswith(suffix):
        return text[: -len(suffix)]
    return text
|
||||||
|
##
|
||||||
|
|
||||||
## used to have compat function before 3.8 for these, keeping for runtime back compatibility
|
## used to have compat function before 3.8 for these, keeping for runtime back compatibility
|
||||||
if not TYPE_CHECKING:
|
if not TYPE_CHECKING:
|
||||||
|
|
|
@ -1,5 +1,10 @@
|
||||||
from ..core.warnings import high
|
from my.core import __NOT_HPI_MODULE__
|
||||||
high("DEPRECATED! Please use my.youtube.takeout instead.")
|
|
||||||
from ..core.util import __NOT_HPI_MODULE__
|
|
||||||
|
|
||||||
from ..youtube.takeout import *
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
|
from my.core.warnings import high
|
||||||
|
|
||||||
|
high("DEPRECATED! Please use my.youtube.takeout instead.")
|
||||||
|
|
||||||
|
if not TYPE_CHECKING:
|
||||||
|
from my.youtube.takeout import *
|
||||||
|
|
|
@ -1,13 +1,16 @@
|
||||||
from typing import NamedTuple, List, Iterable, TYPE_CHECKING
|
from __future__ import annotations
|
||||||
|
|
||||||
from my.core import datetime_aware, make_logger, stat, Res, Stats
|
from dataclasses import dataclass
|
||||||
from my.core.compat import deprecated, removeprefix
|
from typing import TYPE_CHECKING, Any, Iterable, Iterator
|
||||||
|
|
||||||
|
from my.core import Res, Stats, datetime_aware, make_logger, stat, warnings
|
||||||
|
from my.core.compat import deprecated, removeprefix, removesuffix
|
||||||
|
|
||||||
logger = make_logger(__name__)
|
logger = make_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class Watched(NamedTuple):
|
@dataclass
|
||||||
|
class Watched:
|
||||||
url: str
|
url: str
|
||||||
title: str
|
title: str
|
||||||
when: datetime_aware
|
when: datetime_aware
|
||||||
|
@ -16,19 +19,57 @@ class Watched(NamedTuple):
|
||||||
def eid(self) -> str:
|
def eid(self) -> str:
|
||||||
return f'{self.url}-{self.when.isoformat()}'
|
return f'{self.url}-{self.when.isoformat()}'
|
||||||
|
|
||||||
|
def is_deleted(self) -> bool:
|
||||||
|
return self.title == self.url
|
||||||
|
|
||||||
|
|
||||||
# todo define error policy?
|
# todo define error policy?
|
||||||
# although it has one from google takeout module.. so not sure
|
# although it has one from google takeout module.. so not sure
|
||||||
|
|
||||||
def watched() -> Iterable[Res[Watched]]:
|
|
||||||
|
def watched() -> Iterator[Res[Watched]]:
|
||||||
|
emitted: dict[Any, Watched] = {}
|
||||||
|
for w in _watched():
|
||||||
|
if isinstance(w, Exception):
|
||||||
|
yield w # TODO also make unique?
|
||||||
|
continue
|
||||||
|
|
||||||
|
# older exports (e.g. html) didn't have microseconds
|
||||||
|
# whereas newer json ones do have them
|
||||||
|
# seconds resolution is enough to distinguish watched videos
|
||||||
|
# also we're processing takeouts in HPI in reverse order, so first seen watch would contain microseconds, resulting in better data
|
||||||
|
without_microsecond = w.when.replace(microsecond=0)
|
||||||
|
|
||||||
|
key = w.url, without_microsecond
|
||||||
|
prev = emitted.get(key, None)
|
||||||
|
if prev is not None:
|
||||||
|
# NOTE: some video titles start with 'Liked ' for liked videos activity
|
||||||
|
# but they'd have different timestamp, so fine not to handle them as a special case here
|
||||||
|
if w.title in prev.title:
|
||||||
|
# often more stuff added to the title, like 'Official Video'
|
||||||
|
# in this case not worth emitting the change
|
||||||
|
# also handles the case when titles match
|
||||||
|
continue
|
||||||
|
# otherwise if title changed completely, just emit the change... not sure what else we could do?
|
||||||
|
# could merge titles in the 'titles' field and update dynamically? but a bit complicated, maybe later..
|
||||||
|
|
||||||
|
# TODO would also be nice to handle is_deleted here somehow...
|
||||||
|
# but for that would need to process data in direct order vs reversed..
|
||||||
|
# not sure, maybe this could use a special mode or something?
|
||||||
|
|
||||||
|
emitted[key] = w
|
||||||
|
yield w
|
||||||
|
|
||||||
|
|
||||||
|
def _watched() -> Iterator[Res[Watched]]:
|
||||||
try:
|
try:
|
||||||
from ..google.takeout.parser import events
|
|
||||||
from google_takeout_parser.models import Activity
|
from google_takeout_parser.models import Activity
|
||||||
|
|
||||||
|
from ..google.takeout.parser import events
|
||||||
except ModuleNotFoundError as ex:
|
except ModuleNotFoundError as ex:
|
||||||
logger.exception(ex)
|
logger.exception(ex)
|
||||||
from ..core.warnings import high
|
warnings.high("Please set up my.google.takeout.parser module for better youtube support. Falling back to legacy implementation.")
|
||||||
high("Please set up my.google.takeout.parser module for better youtube support. Falling back to legacy implementation.")
|
yield from _watched_legacy() # type: ignore[name-defined]
|
||||||
yield from _watched_legacy()
|
|
||||||
return
|
return
|
||||||
|
|
||||||
YOUTUBE_VIDEO_LINK = '://www.youtube.com/watch?v='
|
YOUTUBE_VIDEO_LINK = '://www.youtube.com/watch?v='
|
||||||
|
@ -43,12 +84,12 @@ def watched() -> Iterable[Res[Watched]]:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
url = e.titleUrl
|
url = e.titleUrl
|
||||||
header = e.header
|
|
||||||
title = e.title
|
|
||||||
|
|
||||||
if url is None:
|
if url is None:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
header = e.header
|
||||||
|
|
||||||
if header in {'Image Search', 'Search', 'Chrome'}:
|
if header in {'Image Search', 'Search', 'Chrome'}:
|
||||||
# sometimes results in youtube links.. but definitely not watch history
|
# sometimes results in youtube links.. but definitely not watch history
|
||||||
continue
|
continue
|
||||||
|
@ -61,6 +102,8 @@ def watched() -> Iterable[Res[Watched]]:
|
||||||
pass
|
pass
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
title = e.title
|
||||||
|
|
||||||
if header == 'youtube.com' and title.startswith('Visited '):
|
if header == 'youtube.com' and title.startswith('Visited '):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
@ -76,16 +119,32 @@ def watched() -> Iterable[Res[Watched]]:
|
||||||
# also compatible with legacy titles
|
# also compatible with legacy titles
|
||||||
title = removeprefix(title, 'Watched ')
|
title = removeprefix(title, 'Watched ')
|
||||||
|
|
||||||
|
# watches originating from some activity end with this, remove it for consistency
|
||||||
|
title = removesuffix(title, ' - YouTube')
|
||||||
|
|
||||||
if YOUTUBE_VIDEO_LINK not in url:
|
if YOUTUBE_VIDEO_LINK not in url:
|
||||||
if e.details == ['From Google Ads']:
|
if 'youtube.com/post/' in url:
|
||||||
# weird, sometimes results in odd
|
# some sort of channel updates?
|
||||||
continue
|
continue
|
||||||
if title == 'Used YouTube' and e.products == ['Android']:
|
if 'youtube.com/playlist' in url:
|
||||||
|
# 'saved playlist' actions
|
||||||
|
continue
|
||||||
|
if 'music.youtube.com' in url:
|
||||||
|
# todo maybe allow it?
|
||||||
|
continue
|
||||||
|
if any('From Google Ads' in d for d in e.details):
|
||||||
|
# weird, sometimes results in odd urls
|
||||||
|
continue
|
||||||
|
|
||||||
|
if title == 'Used YouTube':
|
||||||
continue
|
continue
|
||||||
|
|
||||||
yield RuntimeError(f'Unexpected url: {e}')
|
yield RuntimeError(f'Unexpected url: {e}')
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# TODO contribute to takeout parser? seems that these still might happen in json data
|
||||||
|
title = title.replace("\xa0", " ")
|
||||||
|
|
||||||
yield Watched(
|
yield Watched(
|
||||||
url=url,
|
url=url,
|
||||||
title=title,
|
title=title,
|
||||||
|
@ -100,12 +159,12 @@ def stats() -> Stats:
|
||||||
### deprecated stuff (keep in my.media.youtube)
|
### deprecated stuff (keep in my.media.youtube)
|
||||||
|
|
||||||
if not TYPE_CHECKING:
|
if not TYPE_CHECKING:
|
||||||
|
|
||||||
@deprecated("use 'watched' instead")
|
@deprecated("use 'watched' instead")
|
||||||
def get_watched(*args, **kwargs):
|
def get_watched(*args, **kwargs):
|
||||||
return watched(*args, **kwargs)
|
return watched(*args, **kwargs)
|
||||||
|
|
||||||
|
def _watched_legacy() -> Iterable[Watched]:
|
||||||
def _watched_legacy() -> Iterable[Watched]:
|
|
||||||
from ..google.takeout.html import read_html
|
from ..google.takeout.html import read_html
|
||||||
from ..google.takeout.paths import get_last_takeout
|
from ..google.takeout.paths import get_last_takeout
|
||||||
|
|
||||||
|
@ -115,7 +174,7 @@ def _watched_legacy() -> Iterable[Watched]:
|
||||||
if last is None:
|
if last is None:
|
||||||
return []
|
return []
|
||||||
|
|
||||||
watches: List[Watched] = []
|
watches: list[Watched] = []
|
||||||
for dt, url, title in read_html(last, path):
|
for dt, url, title in read_html(last, path):
|
||||||
watches.append(Watched(url=url, title=title, when=dt))
|
watches.append(Watched(url=url, title=title, when=dt))
|
||||||
|
|
||||||
|
|
Loading…
Add table
Reference in a new issue