From 9e5cd60ff248b2b925b10577c43e218c630144f3 Mon Sep 17 00:00:00 2001 From: seanbreckenridge Date: Sun, 13 Feb 2022 15:56:05 -0800 Subject: [PATCH] browser: parse browser history using browserexport (#216) * browser: parse browser history using browserexport from seanbreckenridge/HPI module: https://github.com/seanbreckenridge/HPI/blob/1fba8ccf2fe93d67198aaa77355cac5f5910e7ac/my/browser/export.py --- doc/MODULES.org | 68 +++++++++++++++++++++++------------- my/browser/active_browser.py | 50 ++++++++++++++++++++++++++ my/browser/all.py | 35 +++++++++++++++++++ my/browser/common.py | 11 ++++++ my/browser/export.py | 50 ++++++++++++++++++++++++++ my/config.py | 6 ++++ tox.ini | 2 ++ 7 files changed, 198 insertions(+), 24 deletions(-) create mode 100644 my/browser/active_browser.py create mode 100644 my/browser/all.py create mode 100644 my/browser/common.py create mode 100644 my/browser/export.py diff --git a/doc/MODULES.org b/doc/MODULES.org index 1f69559..1f31931 100644 --- a/doc/MODULES.org +++ b/doc/MODULES.org @@ -63,6 +63,50 @@ The config snippets below are meant to be modified accordingly and *pasted into You don't have to set up all modules at once, it's recommended to do it gradually, to get the feel of how HPI works. +# Nested Configurations before the doc generation using the block below +** [[file:../my/reddit][my.reddit]] + + Reddit data: saved items/comments/upvotes/etc. + + # Note: can't be generated as easily since this is a nested configuration object + #+begin_src python + class reddit: + class rexport: + ''' + Uses [[https://github.com/karlicoss/rexport][rexport]] output. 
+ ''' + + # path[s]/glob to the exported JSON data + export_path: Paths + + class pushshift: + ''' + Uses [[https://github.com/seanbreckenridge/pushshift_comment_export][pushshift]] to get access to old comments + ''' + + # path[s]/glob to the exported JSON data + export_path: Paths + + #+end_src +** [[file:../my/browser/][my.browser]] + + Parses browser history using [[http://github.com/seanbreckenridge/browserexport][browserexport]] + + #+begin_src python + @dataclass + class browser: + class export: + # path[s]/glob to your backed up browser history sqlite files + export_path: Paths + + class active_browser: + # paths to sqlite database files which you use actively + # to read from. For example: + # from browserexport.browsers.all import Firefox + # export_path = Firefox.locate_database() + export_path: Paths + #+end_src + # TODO hmm. drawer raw means it can output outlines, but then have to manually erase the generated results. ugh. #+begin_src python :dir .. :results output drawer raw :exports result @@ -139,30 +183,6 @@ for cls, p in modules: # paths[s]/glob to the exported JSON data export_path: Paths #+end_src -** [[file:../my/reddit][my.reddit]] - - Reddit data: saved items/comments/upvotes/etc. - - # Note: can't be generated as easily since this is a nested configuration object - #+begin_src python - class reddit: - class rexport: - ''' - Uses [[https://github.com/karlicoss/rexport][rexport]] output. 
- ''' - - # path[s]/glob to the exported JSON data - export_path: Paths - - class pushshift: - ''' - Uses [[https://github.com/seanbreckenridge/pushshift_comment_export][pushshift]] to get access to old comments - ''' - - # path[s]/glob to the exported JSON data - export_path: Paths - - #+end_src ** [[file:../my/pocket.py][my.pocket]] [[https://getpocket.com][Pocket]] bookmarks and highlights diff --git a/my/browser/active_browser.py b/my/browser/active_browser.py new file mode 100644 index 0000000..7005573 --- /dev/null +++ b/my/browser/active_browser.py @@ -0,0 +1,50 @@ +""" +Parses active browser history by backing it up with [[http://github.com/seanbreckenridge/sqlite_backup][sqlite_backup]] +""" + +REQUIRES = ["browserexport", "sqlite_backup"] + + +from my.config import browser as user_config +from my.core import Paths, dataclass + + +@dataclass +class config(user_config.active_browser): + # paths to sqlite database files which you use actively + # to read from. For example: + # from browserexport.browsers.all import Firefox + # export_path = Firefox.locate_database() + export_path: Paths + + +from pathlib import Path +from typing import Sequence, Iterator + +from my.core import get_files, Stats +from browserexport.merge import read_visits, Visit +from sqlite_backup import sqlite_backup + +from .common import _patch_browserexport_logs + +_patch_browserexport_logs() + + +def inputs() -> Sequence[Path]: + return get_files(config.export_path) + + +def history() -> Iterator[Visit]: + for ad in inputs(): + conn = sqlite_backup(ad) + assert conn is not None + try: + yield from read_visits(conn) + finally: + conn.close() + + +def stats() -> Stats: + from my.core import stat + + return {**stat(history)} diff --git a/my/browser/all.py b/my/browser/all.py new file mode 100644 index 0000000..a7d12b4 --- /dev/null +++ b/my/browser/all.py @@ -0,0 +1,35 @@ +from typing import Iterator + +from my.core import Stats +from my.core.source import import_source +from 
browserexport.merge import merge_visits, Visit + + +src_export = import_source(module_name="my.browser.export") +src_active = import_source(module_name="my.browser.active_browser") + + +@src_export +def _visits_export() -> Iterator[Visit]: + from . import export + return export.history() + + +@src_active +def _visits_active() -> Iterator[Visit]: + from . import active_browser + return active_browser.history() + + +# NOTE: you can comment out the sources you don't need +def history() -> Iterator[Visit]: + yield from merge_visits([ + _visits_active(), + _visits_export(), + ]) + + +def stats() -> Stats: + from my.core import stat + + return {**stat(history)} diff --git a/my/browser/common.py b/my/browser/common.py new file mode 100644 index 0000000..9427f61 --- /dev/null +++ b/my/browser/common.py @@ -0,0 +1,11 @@ +import os +from my.core.util import __NOT_HPI_MODULE__ + + +def _patch_browserexport_logs(): + # patch browserexport logs if HPI_LOGS is present + if "HPI_LOGS" in os.environ: + from browserexport.log import setup as setup_browserexport_logger + from my.core.logging import mklevel + + setup_browserexport_logger(mklevel(os.environ["HPI_LOGS"])) diff --git a/my/browser/export.py b/my/browser/export.py new file mode 100644 index 0000000..3185d53 --- /dev/null +++ b/my/browser/export.py @@ -0,0 +1,50 @@ +""" +Parses browser history using [[http://github.com/seanbreckenridge/browserexport][browserexport]] +""" + +REQUIRES = ["browserexport"] + +from my.config import browser as user_config +from my.core import Paths, dataclass + + +@dataclass +class config(user_config.export): + # path[s]/glob to your backed up browser history sqlite files + export_path: Paths + + +from pathlib import Path +from typing import Iterator, Sequence, List + +from my.core import Stats, get_files, LazyLogger +from my.core.common import mcachew + +from browserexport.merge import read_and_merge, Visit + +from .common import _patch_browserexport_logs + + +logger = LazyLogger(__name__, 
level="warning") + +_patch_browserexport_logs() + + +# all of my backed up databases +def inputs() -> Sequence[Path]: + return get_files(config.export_path) + + +def _cachew_depends_on() -> List[str]: + return [str(f) for f in inputs()] + + +@mcachew(depends_on=_cachew_depends_on, logger=logger) +def history() -> Iterator[Visit]: + yield from read_and_merge(inputs()) + + +def stats() -> Stats: + from my.core import stat + + return {**stat(history)} diff --git a/my/config.py b/my/config.py index 7201a84..5bb316f 100644 --- a/my/config.py +++ b/my/config.py @@ -129,3 +129,9 @@ class fbmessenger: class twitter: class talon: export_path: Paths + +class browser: + class export: + export_path: Paths = '' + class active_browser: + export_path: Paths = '' diff --git a/tox.ini b/tox.ini index 64e1ab3..b8c89db 100644 --- a/tox.ini +++ b/tox.ini @@ -84,6 +84,7 @@ commands = commands = pip install -e .[testing,optional] + hpi module install my.browser.export hpi module install my.orgmode hpi module install my.endomondo hpi module install my.github.ghexport @@ -103,6 +104,7 @@ commands = # todo fuck. -p my.github isn't checking the subpackages?? wtf... # guess it wants .pyi file?? python3 -m mypy --install-types --non-interactive \ + -p my.browser \ -p my.endomondo \ -p my.github.ghexport \ -p my.hypothesis \