my.youtube: use new my.google.takeout.parser module for its data
- fall back on the old logic if google_takeout_parser isn't available
- move to my.youtube.takeout (possibly mixing in other sources later)
- keep my.media.youtube, but issue a deprecation warning; it's currently used in orger etc., so it doesn't hurt to keep
- also fixes https://github.com/karlicoss/HPI/issues/113
This commit is contained in:
parent
915cfe69b3
commit
78f6ae96d1
5 changed files with 154 additions and 51 deletions
|
@ -83,3 +83,10 @@ def sqlite_backup(*, source: sqlite3.Connection, dest: sqlite3.Connection, **kwa
|
||||||
|
|
||||||
dest.cursor().executescript(tempfile.read())
|
dest.cursor().executescript(tempfile.read())
|
||||||
dest.commit()
|
dest.commit()
|
||||||
|
|
||||||
|
|
||||||
|
# can remove after python3.9
|
||||||
|
def removeprefix(text: str, prefix: str) -> str:
    """
    Return ``text`` with a leading ``prefix`` stripped off.

    When ``text`` does not start with ``prefix`` it is returned unchanged.
    Only a single occurrence of the prefix is removed.
    """
    has_prefix = text.startswith(prefix)
    return text[len(prefix):] if has_prefix else text
|
||||||
|
|
46
my/media/youtube.py
Executable file → Normal file
46
my/media/youtube.py
Executable file → Normal file
|
@ -1,43 +1,5 @@
|
||||||
#!/usr/bin/env python3
|
from ..core.warnings import high
|
||||||
from datetime import datetime
|
high("DEPRECATED! Please use my.youtube.takeout instead.")
|
||||||
from typing import NamedTuple, List, Iterable
|
from ..core.util import __NOT_HPI_MODULE__
|
||||||
|
|
||||||
from ..google.takeout.html import read_html
|
|
||||||
from ..google.takeout.paths import get_last_takeout
|
|
||||||
|
|
||||||
|
|
||||||
class Watched(NamedTuple):
    """One entry of the YouTube watch history."""

    # url of the entry
    url: str
    # title of the entry
    title: str
    # timestamp of the entry
    when: datetime

    @property
    def eid(self) -> str:
        """Entry id: the url joined with the ISO-formatted timestamp by a dash."""
        timestamp = self.when.isoformat()
        return f'{self.url}-{timestamp}'
|
|
||||||
|
|
||||||
|
|
||||||
def watched() -> Iterable[Watched]:
    """
    Read the watch history from the takeout returned by ``get_last_takeout``.

    Returns an empty list when no takeout is found, otherwise a list of
    ``Watched`` entries sorted by their ``when`` timestamp.
    """
    # TODO need to use a glob? to make up for old takeouts that didn't start with Takeout/
    # TODO YouTube/history/watch-history.html, also YouTube/history/watch-history.json
    # looks like this one doesn't have retention? so enough to use the last
    activity_path = 'Takeout/My Activity/YouTube/MyActivity.html'

    takeout = get_last_takeout(path=activity_path)
    if takeout is None:
        return []

    entries: List[Watched] = [
        Watched(url=url, title=title, when=dt)
        for dt, url, title in read_html(takeout, activity_path)
    ]
    # TODO hmm they already come sorted.. wonder if should just rely on it..
    entries.sort(key=lambda entry: entry.when)
    return entries
|
|
||||||
|
|
||||||
|
|
||||||
from ..core import stat, Stats
|
|
||||||
def stats() -> Stats:
    """Return whatever ``stat`` reports for the ``watched`` data source."""
    summary: Stats = stat(watched)
    return summary
|
|
||||||
|
|
||||||
|
|
||||||
# todo deprecate
|
|
||||||
get_watched = watched
|
|
||||||
|
|
||||||
|
from ..youtube.takeout import *
|
||||||
|
|
120
my/youtube/takeout.py
Executable file
120
my/youtube/takeout.py
Executable file
|
@ -0,0 +1,120 @@
|
||||||
|
from typing import NamedTuple, List, Iterable
|
||||||
|
|
||||||
|
from ..core import datetime_aware, Res, LazyLogger
|
||||||
|
from ..core.compat import removeprefix
|
||||||
|
|
||||||
|
|
||||||
|
logger = LazyLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class Watched(NamedTuple):
    """One entry of the YouTube watch history."""

    # url of the entry
    url: str
    # title of the entry
    title: str
    # timestamp of the entry (annotated as timezone-aware)
    when: datetime_aware

    @property
    def eid(self) -> str:
        """Entry id: the url joined with the ISO-formatted timestamp by a dash."""
        timestamp = self.when.isoformat()
        return f'{self.url}-{timestamp}'
|
||||||
|
|
||||||
|
|
||||||
|
# todo define error policy?
|
||||||
|
# although it has one from google takeout module.. so not sure
|
||||||
|
|
||||||
|
def watched() -> Iterable[Res[Watched]]:
    """
    Yield YouTube watch history entries extracted from takeout events.

    Events come from ``my.google.takeout.parser``. If that module (or
    ``google_takeout_parser``) cannot be imported, the error is logged, a
    warning is issued and the results of ``_watched_legacy`` are yielded instead.

    Exceptions produced by the event source are passed through to the caller,
    and an activity that survives all filters but does not link to a video is
    reported as a ``RuntimeError`` item rather than raised.
    """
    try:
        from ..google.takeout.parser import events
        from google_takeout_parser.models import Activity
    except ModuleNotFoundError as ex:
        logger.exception(ex)
        from ..core.warnings import high
        high("Please set up my.google.takeout.parser module for better youtube support. Falling back to legacy implementation.")
        yield from _watched_legacy()
        return

    YOUTUBE_VIDEO_LINK = '://www.youtube.com/watch?v='
    # activity from these sometimes results in youtube links.. but definitely not watch history
    search_like_headers = {'Image Search', 'Search', 'Chrome'}
    youtube_headers = {'YouTube', 'youtube.com'}

    # TODO would be nice to filter, e.g. it's kinda pointless to process Location events
    for e in events():
        if isinstance(e, Exception):
            # forward the error; the type check right below moves on to the next event
            yield e

        if not isinstance(e, Activity):
            continue

        url, header, title = e.titleUrl, e.header, e.title

        if url is None or header in search_like_headers:
            continue

        if header not in youtube_headers:
            # TODO hmm -- wonder if these would end up in dupes in takeout? would be nice to check
            # perhaps this would be easier once we have universal ids
            if YOUTUBE_VIDEO_LINK in url:
                # TODO maybe log in this case or something?
                pass
            continue

        if (
            # plain page visits
            (header == 'youtube.com' and title.startswith('Visited '))
            # search activity, don't need it here
            or (title.startswith('Searched for') and url.startswith('https://www.youtube.com/results'))
            # todo subscriptions might be interesting to process somewhere?
            or (title.startswith('Subscribed to') and url.startswith('https://www.youtube.com/channel/'))
        ):
            continue

        # all titles contain it, so pointless to include 'Watched '
        # also compatible with legacy titles
        title = removeprefix(title, 'Watched ')

        if YOUTUBE_VIDEO_LINK in url:
            yield Watched(url=url, title=title, when=e.time)
            continue

        if e.details == ['From Google Ads']:
            # weird, sometimes results in odd urls
            continue

        if title == 'Used YouTube' and e.products == ['Android']:
            continue

        yield RuntimeError(f'Unexpected url: {e}')
|
||||||
|
|
||||||
|
|
||||||
|
from ..core import stat, Stats
|
||||||
|
def stats() -> Stats:
    """Return whatever ``stat`` reports for the ``watched`` data source."""
    summary: Stats = stat(watched)
    return summary
|
||||||
|
|
||||||
|
|
||||||
|
### deprecated stuff (keep in my.media.youtube)
|
||||||
|
|
||||||
|
get_watched = watched
|
||||||
|
|
||||||
|
|
||||||
|
def _watched_legacy() -> Iterable[Watched]:
    """
    Read the watch history from the takeout returned by ``get_last_takeout``.

    Returns an empty list when no takeout is found, otherwise a list of
    ``Watched`` entries sorted by their ``when`` timestamp.
    """
    from ..google.takeout.html import read_html
    from ..google.takeout.paths import get_last_takeout

    # todo looks like this one doesn't have retention? so enough to use the last
    activity_path = 'Takeout/My Activity/YouTube/MyActivity.html'

    takeout = get_last_takeout(path=activity_path)
    if takeout is None:
        return []

    entries: List[Watched] = [
        Watched(url=url, title=title, when=dt)
        for dt, url, title in read_html(takeout, activity_path)
    ]
    # todo hmm they already come sorted.. wonder if should just rely on it..
    entries.sort(key=lambda entry: entry.when)
    return entries
|
|
@ -1,22 +1,36 @@
|
||||||
# TODO move elsewhere?
|
# TODO move elsewhere?
|
||||||
# these tests would only make sense with some existing data? although some of them would work for everyone..
|
# these tests would only make sense with some existing data? although some of them would work for everyone..
|
||||||
# not sure what's a good way of handling this..
|
# not sure what's a good way of handling this..
|
||||||
|
from datetime import datetime
|
||||||
|
import pytz
|
||||||
|
from more_itertools import bucket
|
||||||
|
|
||||||
|
|
||||||
from .common import skip_if_not_karlicoss as pytestmark
|
from .common import skip_if_not_karlicoss as pytestmark
|
||||||
|
|
||||||
# TODO ugh. if i uncomment this here (on top level), then this test vvv fails
|
# TODO ugh. if i uncomment this here (on top level), then this test vvv fails
|
||||||
# from my.media.youtube import get_watched, Watched
|
# from my.media.youtube import get_watched, Watched
|
||||||
# HPI_TESTS_KARLICOSS=true pytest -raps tests/tz.py tests/youtube.py
|
# HPI_TESTS_KARLICOSS=true pytest -raps tests/tz.py tests/youtube.py
|
||||||
|
|
||||||
def test() -> None:
|
|
||||||
from my.media.youtube import get_watched, Watched
|
|
||||||
watched = list(get_watched())
|
|
||||||
assert len(watched) > 1000
|
|
||||||
|
|
||||||
from datetime import datetime
|
def test() -> None:
|
||||||
import pytz
|
from my.youtube.takeout import watched, Watched
|
||||||
w = Watched(
|
videos = [w for w in watched() if not isinstance(w, Exception)]
|
||||||
|
assert len(videos) > 1000
|
||||||
|
|
||||||
|
# results in nicer errors, otherwise annoying to check against thousands of videos
|
||||||
|
grouped = bucket(videos, key=lambda w: (w.url, w.title))
|
||||||
|
|
||||||
|
w1 = Watched(
|
||||||
url='https://www.youtube.com/watch?v=hTGJfRPLe08',
|
url='https://www.youtube.com/watch?v=hTGJfRPLe08',
|
||||||
title='Jamie xx - Gosh',
|
title='Jamie xx - Gosh',
|
||||||
when=datetime(year=2018, month=6, day=21, hour=5, minute=48, second=34, tzinfo=pytz.utc),
|
when=pytz.timezone('Europe/London').localize(datetime(year=2018, month=6, day=21, hour=6, minute=48, second=34)),
|
||||||
)
|
)
|
||||||
assert w in watched
|
assert w1 in list(grouped[(w1.url, w1.title)])
|
||||||
|
|
||||||
|
w2 = Watched(
|
||||||
|
url='https://www.youtube.com/watch?v=IZ_8b_Ydsv0',
|
||||||
|
title='Why LESS Sensitive Tests Might Be Better',
|
||||||
|
when=pytz.utc.localize(datetime(year=2021, month=1, day=15, hour=17, minute=54, second=12)),
|
||||||
|
)
|
||||||
|
assert w2 in list(grouped[(w2.url, w2.title)])
|
||||||
|
|
Loading…
Add table
Reference in a new issue