my.youtube: use new my.google.takeout.parser module for its data

- fall back on the old logic if google_takeout_parser isn't available
- move to my.youtube.takeout (possibly mixing in other sources later)
- keep my.media.youtube, but issue deprecation warning
  currently used in orger etc, so doesn't hurt to keep
- also fixes https://github.com/karlicoss/HPI/issues/113
This commit is contained in:
Dima Gerasimov 2022-04-20 21:58:10 +01:00 committed by karlicoss
parent 915cfe69b3
commit 78f6ae96d1
5 changed files with 154 additions and 51 deletions

View file

@ -83,3 +83,10 @@ def sqlite_backup(*, source: sqlite3.Connection, dest: sqlite3.Connection, **kwa
dest.cursor().executescript(tempfile.read())
dest.commit()
# can remove once python3.9 is the minimum supported version (it has str.removeprefix built in)
def removeprefix(text: str, prefix: str) -> str:
    """Return ``text`` with a leading ``prefix`` cut off; ``text`` itself if it doesn't start with it."""
    has_prefix = text.startswith(prefix)
    return text[len(prefix):] if has_prefix else text

View file

46
my/media/youtube.py Executable file → Normal file
View file

@ -1,43 +1,5 @@
#!/usr/bin/env python3
from datetime import datetime
from typing import NamedTuple, List, Iterable
from ..google.takeout.html import read_html
from ..google.takeout.paths import get_last_takeout
class Watched(NamedTuple):
    """One item of the watch history: what was watched and when."""
    url: str
    title: str
    when: datetime

    @property
    def eid(self) -> str:
        """Identifier of the entry, built from the url and the ISO formatted timestamp."""
        return '-'.join((self.url, self.when.isoformat()))
def watched() -> Iterable[Watched]:
    """
    Return the watch history read from the YouTube activity page of a takeout, ordered by time.

    The takeout is whatever ``get_last_takeout`` picks for the activity path;
    an empty list is returned when it finds nothing.
    """
    # TODO need to use a glob? to make up for old takeouts that didn't start with Takeout/
    # TODO YouTube/history/watch-history.html, also YouTube/history/watch-history.json
    # looks like this one doesn't have retention? so enough to use the last
    activity_path = 'Takeout/My Activity/YouTube/MyActivity.html'

    takeout = get_last_takeout(path=activity_path)
    if takeout is None:
        return []

    entries: List[Watched] = [
        Watched(url=link, title=name, when=ts)
        for ts, link, name in read_html(takeout, activity_path)
    ]
    # TODO hmm they already come sorted.. wonder if should just rely on it..
    entries.sort(key=lambda w: w.when)
    return entries
from ..core import stat, Stats
def stats() -> Stats:
    """Return whatever ``stat`` computes for the ``watched`` data source."""
    watched_stats = stat(watched)
    return watched_stats
# todo deprecate
# older name for watched(); just an alias pointing at the same function
get_watched = watched
# This module is only kept for backwards compatibility:
# it warns on import and re-exports the contents of my.youtube.takeout
from ..core.warnings import high
high("DEPRECATED! Please use my.youtube.takeout instead.")
# NOTE(review): presumably marks this file as not being a proper HPI module anymore -- confirm against my.core.util
from ..core.util import __NOT_HPI_MODULE__
# star import on purpose, so names that used to be imported from here still resolve
from ..youtube.takeout import *

120
my/youtube/takeout.py Executable file
View file

@ -0,0 +1,120 @@
from typing import NamedTuple, List, Iterable
from ..core import datetime_aware, Res, LazyLogger
from ..core.compat import removeprefix
logger = LazyLogger(__name__)
class Watched(NamedTuple):
    """A watched video: its link, its title and the moment it was watched."""
    url: str
    title: str
    when: datetime_aware

    @property
    def eid(self) -> str:
        """Entry id in the form ``<url>-<ISO formatted timestamp>``."""
        return '{}-{}'.format(self.url, self.when.isoformat())
# todo define error policy?
# although it has one from google takeout module.. so not sure
def watched() -> Iterable[Res[Watched]]:
    """
    Yield the YouTube watch history extracted from Google Takeout activity events.

    Items are either ``Watched`` entries or exceptions:
    - exceptions coming out of the events stream are passed through as is
    - YouTube activity with a url that isn't a video link (and isn't one of the
      known harmless cases) is reported as a ``RuntimeError`` item, not raised

    If the takeout parser modules can't be imported, the error is logged, a
    warning is issued, and the results of ``_watched_legacy`` are yielded instead.
    """
    try:
        from ..google.takeout.parser import events
        from google_takeout_parser.models import Activity
    except ModuleNotFoundError as ex:
        logger.exception(ex)
        from ..core.warnings import high
        high("Please set up my.google.takeout.parser module for better youtube support. Falling back to legacy implementation.")
        yield from _watched_legacy()
        return

    video_link = '://www.youtube.com/watch?v='
    # sometimes results in youtube links.. but definitely not watch history
    not_watch_headers = {'Image Search', 'Search', 'Chrome'}
    youtube_headers = {'YouTube', 'youtube.com'}

    # TODO would be nice to filter, e.g. it's kinda pointless to process Location events
    for e in events():
        if isinstance(e, Exception):
            # pass errors through to the caller and move on to the next event
            yield e
            continue
        if not isinstance(e, Activity):
            continue

        url, header, title = e.titleUrl, e.header, e.title

        if url is None:
            continue
        if header in not_watch_headers:
            continue
        if header not in youtube_headers:
            # TODO hmm -- wonder if these would end up in dupes in takeout? would be nice to check
            # perhaps this would be easier once we have universal ids
            # TODO maybe log or something if the url still contains a video link?
            continue

        if header == 'youtube.com' and title.startswith('Visited '):
            continue
        if title.startswith('Searched for') and url.startswith('https://www.youtube.com/results'):
            # search activity, don't need it here
            continue
        if title.startswith('Subscribed to') and url.startswith('https://www.youtube.com/channel/'):
            # todo might be interesting to process somewhere?
            continue

        # all titles contain it, so pointless to include 'Watched '
        # also compatible with legacy titles
        title = removeprefix(title, 'Watched ')

        if video_link in url:
            yield Watched(url=url, title=title, when=e.time)
            continue

        # from here on the url isn't a video link: skip the known cases, report the rest
        if e.details == ['From Google Ads']:
            # weird, sometimes results in odd
            continue
        if title == 'Used YouTube' and e.products == ['Android']:
            continue
        yield RuntimeError(f'Unexpected url: {e}')
from ..core import stat, Stats
def stats() -> Stats:
    """Return the result of running ``stat`` over the ``watched`` function."""
    summary = stat(watched)
    return summary
### deprecated stuff (keep in my.media.youtube)
# older name for watched(); just an alias pointing at the same function
get_watched = watched
def _watched_legacy() -> Iterable[Watched]:
    """
    Legacy implementation: read the watch history from the activity html page of a takeout.

    Returns a list ordered by watch time; the list is empty when
    ``get_last_takeout`` doesn't find anything for the activity path.
    """
    from ..google.takeout.html import read_html
    from ..google.takeout.paths import get_last_takeout

    # todo looks like this one doesn't have retention? so enough to use the last
    activity_html = 'Takeout/My Activity/YouTube/MyActivity.html'
    takeout = get_last_takeout(path=activity_html)
    if takeout is None:
        return []

    parsed = (
        Watched(url=url, title=title, when=dt)
        for dt, url, title in read_html(takeout, activity_html)
    )
    # todo hmm they already come sorted.. wonder if should just rely on it..
    return sorted(parsed, key=lambda item: item.when)

View file

@ -1,22 +1,36 @@
# TODO move elsewhere?
# these tests would only make sense with some existing data? although some of them would work for everyone..
# not sure what's a good way of handling this..
from datetime import datetime
import pytz
from more_itertools import bucket
from .common import skip_if_not_karlicoss as pytestmark
# TODO ugh. if i uncomment this here (on top level), then this test vvv fails
# from my.media.youtube import get_watched, Watched
# HPI_TESTS_KARLICOSS=true pytest -raps tests/tz.py tests/youtube.py
def test() -> None:
from my.media.youtube import get_watched, Watched
watched = list(get_watched())
assert len(watched) > 1000
from datetime import datetime
import pytz
w = Watched(
def test() -> None:
from my.youtube.takeout import watched, Watched
videos = [w for w in watched() if not isinstance(w, Exception)]
assert len(videos) > 1000
# results in nicer errors, otherwise annoying to check against thousands of videos
grouped = bucket(videos, key=lambda w: (w.url, w.title))
w1 = Watched(
url='https://www.youtube.com/watch?v=hTGJfRPLe08',
title='Jamie xx - Gosh',
when=datetime(year=2018, month=6, day=21, hour=5, minute=48, second=34, tzinfo=pytz.utc),
when=pytz.timezone('Europe/London').localize(datetime(year=2018, month=6, day=21, hour=6, minute=48, second=34)),
)
assert w in watched
assert w1 in list(grouped[(w1.url, w1.title)])
w2 = Watched(
url='https://www.youtube.com/watch?v=IZ_8b_Ydsv0',
title='Why LESS Sensitive Tests Might Be Better',
when=pytz.utc.localize(datetime(year=2021, month=1, day=15, hour=17, minute=54, second=12)),
)
assert w2 in list(grouped[(w2.url, w2.title)])