From b1a71b4d6f413d6e114573300e4136ce0a32724d Mon Sep 17 00:00:00 2001
From: Sean Breckenridge
Date: Sun, 13 Feb 2022 04:51:15 -0800
Subject: [PATCH] browser: parse browser history using browserexport

from my modules:
https://github.com/seanbreckenridge/HPI/blob/1fba8ccf2fe93d67198aaa77355cac5f5910e7ac/my/browser/export.py
---
 doc/MODULES.org      | 69 +++++++++++++++++++++------------
 my/browser/export.py | 90 ++++++++++++++++++++++++++++++++++++++++++++
 my/config.py         |  5 +++
 3 files changed, 140 insertions(+), 24 deletions(-)
 create mode 100644 my/browser/export.py

diff --git a/doc/MODULES.org b/doc/MODULES.org
index 1f69559..fce8f51 100644
--- a/doc/MODULES.org
+++ b/doc/MODULES.org
@@ -63,6 +63,50 @@ The config snippets below are meant to be modified accordingly and *pasted into
 
 You don't have to set up all modules at once, it's recommended to do it gradually, to get the feel of how HPI works.
 
+# Nested Configurations before the doc generation using the block below
+** [[file:../my/reddit][my.reddit]]
+
+    Reddit data: saved items/comments/upvotes/etc.
+
+    # Note: can't be generated as easily since this is a nested configuration object
+    #+begin_src python
+    class reddit:
+        class rexport:
+            '''
+            Uses [[https://github.com/karlicoss/rexport][rexport]] output.
+            '''
+
+            # path[s]/glob to the exported JSON data
+            export_path: Paths
+
+        class pushshift:
+            '''
+            Uses [[https://github.com/seanbreckenridge/pushshift_comment_export][pushshift]] to get access to old comments
+            '''
+
+            # path[s]/glob to the exported JSON data
+            export_path: Paths
+
+    #+end_src
+** [[file:../my/browser/export.py][my.browser.export]]
+
+    Parses Browser history using [[http://github.com/seanbreckenridge/browserexport][browserexport]]
+
+    #+begin_src python
+    @dataclass
+    class browser:
+        class export:
+            # path[s]/glob to your backed up browser history sqlite files
+            export_path: Paths
+
+            # paths to sqlite database files which you
+            # use actively, which should be combined into your history
+            # For example:
+            # from browserexport.browsers.all import Firefox
+            # active_databases = Firefox.locate_database()
+            active_databases: Paths
+    #+end_src
+
 # TODO hmm. drawer raw means it can output outlines, but then have to manually erase the generated results. ugh.
 
 #+begin_src python :dir .. :results output drawer raw :exports result
@@ -139,30 +183,6 @@ for cls, p in modules:
     # paths[s]/glob to the exported JSON data
     export_path: Paths
     #+end_src
-** [[file:../my/reddit][my.reddit]]
-
-    Reddit data: saved items/comments/upvotes/etc.
-
-    # Note: can't be generated as easily since this is a nested configuration object
-    #+begin_src python
-    class reddit:
-        class rexport:
-            '''
-            Uses [[https://github.com/karlicoss/rexport][rexport]] output.
-            '''
-
-            # path[s]/glob to the exported JSON data
-            export_path: Paths
-
-        class pushshift:
-            '''
-            Uses [[https://github.com/seanbreckenridge/pushshift_comment_export][pushshift]] to get access to old comments
-            '''
-
-            # path[s]/glob to the exported JSON data
-            export_path: Paths
-
-    #+end_src
 ** [[file:../my/pocket.py][my.pocket]]
 
     [[https://getpocket.com][Pocket]] bookmarks and highlights
@@ -267,3 +287,4 @@ for cls, p in modules:
     # path[s]/glob to the exported databases
     export_path: Paths
     #+end_src
+
diff --git a/my/browser/export.py b/my/browser/export.py
new file mode 100644
index 0000000..864dc89
--- /dev/null
+++ b/my/browser/export.py
@@ -0,0 +1,90 @@
+"""
+Parses Browser history using [[http://github.com/seanbreckenridge/browserexport][browserexport]]
+"""
+
+REQUIRES = ["browserexport"]
+
+from my.config import browser as user_config
+from my.core import Paths, dataclass
+
+
+@dataclass
+class config(user_config.export):
+    # path[s]/glob to your backed up browser history sqlite files
+    export_path: Paths
+
+    # paths to sqlite database files which you
+    # use actively, which should be combined into your history
+    # For example:
+    # from browserexport.browsers.all import Firefox
+    # active_databases = Firefox.locate_database()
+    active_databases: Paths
+
+
+import os
+from pathlib import Path
+from typing import Iterator, List
+
+from sqlite_backup import sqlite_backup
+
+from my.core import Stats, get_files, LazyLogger
+from my.core.common import mcachew
+
+
+# patch browserexport logs if HPI_LOGS is present
+if "HPI_LOGS" in os.environ:
+    from browserexport.log import setup as setup_browserexport_logger
+    from my.core.logging import mklevel
+
+    setup_browserexport_logger(mklevel(os.environ["HPI_LOGS"]))
+
+
+logger = LazyLogger(__name__, level="warning")
+
+
+from browserexport.merge import read_and_merge, merge_visits, Visit
+from browserexport.parse import read_visits
+
+
+# all of my backed up databases
+def inputs() -> List[Path]:
+    return list(get_files(config.export_path))
+
+
+# return the visits from the active sqlite database,
+# copying the active database into memory using
+# https://github.com/seanbreckenridge/sqlite_backup
+def _active_visits() -> List[Visit]:
+    visits: List[Visit] = []
+    active_dbs = get_files(config.active_databases or "")
+    logger.debug(f"Reading from active databases: {active_dbs}")
+    for ad in active_dbs:
+        conn = sqlite_backup(ad)
+        assert conn is not None
+        try:
+            # read visits, so can close the in-memory connection
+            visits.extend(list(read_visits(conn)))
+        finally:
+            conn.close()
+    logger.debug(f"Read {len(visits)} visits from active databases")
+    return visits
+
+
+Results = Iterator[Visit]
+
+
+# don't put this behind cachew, since the active history database(s)
+# are merged when this is called, whose contents may constantly change
+def history() -> Results:
+    yield from merge_visits([_history_from_backups(), _active_visits()])
+
+
+@mcachew(depends_on=lambda: sorted(map(str, inputs())), logger=logger)
+def _history_from_backups() -> Results:
+    yield from read_and_merge(inputs())
+
+
+def stats() -> Stats:
+    from my.core import stat
+
+    return {**stat(history)}
diff --git a/my/config.py b/my/config.py
index 7201a84..cac9bc6 100644
--- a/my/config.py
+++ b/my/config.py
@@ -129,3 +129,8 @@ class fbmessenger:
 class twitter:
     class talon:
         export_path: Paths
+
+class browser:
+    class export:
+        export_path: Paths = ''
+        active_databases: Paths = ''
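For reference, once this patch is applied and =browser.export.export_path= in your config points at some databases backed up with browserexport, the new module can be exercised roughly like below. This is only an illustrative sketch, not part of the patch: the glob in the comment is a placeholder, and the =dt=/=url= attributes come from browserexport's =Visit= type.

#+begin_src python
# Usage sketch (illustrative, not part of the patch).
# Assumes my.config.browser.export.export_path points at backed up
# databases, e.g. '~/data/browsing/*.sqlite' (placeholder path).
from my.browser.export import history, inputs, stats

print(f"backed up databases: {len(inputs())}")

# history() merges the backed up databases with any configured
# active_databases, deduplicating visits across them
visits = list(history())
print(f"{len(visits)} visits total")

# dt is the visit time, url the visited page (from browserexport's Visit)
latest = max(visits, key=lambda v: v.dt)
print(latest.dt, latest.url)

print(stats())  # quick count, useful for `hpi doctor`-style sanity checks
#+end_src

Setting =active_databases= as well (for example via =Firefox.locate_database()=, as the config comment suggests) additionally pulls recent visits out of the live browser profile through =sqlite_backup=, so =history()= stays current without needing a fresh export.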