From b1a71b4d6f413d6e114573300e4136ce0a32724d Mon Sep 17 00:00:00 2001
From: Sean Breckenridge
Date: Sun, 13 Feb 2022 04:51:15 -0800
Subject: [PATCH] browser: parse browser history using browserexport

from my modules:
https://github.com/seanbreckenridge/HPI/blob/1fba8ccf2fe93d67198aaa77355cac5f5910e7ac/my/browser/export.py
---
 doc/MODULES.org      | 69 +++++++++++++++++++++------------
 my/browser/export.py | 90 ++++++++++++++++++++++++++++++++++++++++++++
 my/config.py         |  5 +++
 3 files changed, 140 insertions(+), 24 deletions(-)
 create mode 100644 my/browser/export.py

diff --git a/doc/MODULES.org b/doc/MODULES.org
index 1f69559..fce8f51 100644
--- a/doc/MODULES.org
+++ b/doc/MODULES.org
@@ -63,6 +63,50 @@ The config snippets below are meant to be modified accordingly and *pasted into
 
 You don't have to set up all modules at once, it's recommended to do it gradually, to get the feel of how HPI works.
 
+# Nested Configurations before the doc generation using the block below
+** [[file:../my/reddit][my.reddit]]
+
+    Reddit data: saved items/comments/upvotes/etc.
+
+    # Note: can't be generated as easily since this is a nested configuration object
+    #+begin_src python
+    class reddit:
+        class rexport:
+            '''
+            Uses [[https://github.com/karlicoss/rexport][rexport]] output.
+            '''
+
+            # path[s]/glob to the exported JSON data
+            export_path: Paths
+
+        class pushshift:
+            '''
+            Uses [[https://github.com/seanbreckenridge/pushshift_comment_export][pushshift]] to get access to old comments
+            '''
+
+            # path[s]/glob to the exported JSON data
+            export_path: Paths
+
+    #+end_src
+** [[file:../my/browser/export.py][my.browser.export]]
+
+    Parses Browser history using [[http://github.com/seanbreckenridge/browserexport][browserexport]]
+
+    #+begin_src python
+    @dataclass
+    class browser:
+        class export:
+            # path[s]/glob to your backed up browser history sqlite files
+            export_path: Paths
+
+            # paths to sqlite database files which you
+            # use actively, which should be combined into your history
+            # For example:
+            # from browserexport.browsers.all import Firefox
+            # active_databases = Firefox.locate_database()
+            active_databases: Paths
+    #+end_src
+
 # TODO hmm. drawer raw means it can output outlines, but then have to manually erase the generated results. ugh.
 
 #+begin_src python :dir .. :results output drawer raw :exports result
@@ -139,30 +183,6 @@ for cls, p in modules:
     # paths[s]/glob to the exported JSON data
     export_path: Paths
     #+end_src
-** [[file:../my/reddit][my.reddit]]
-
-    Reddit data: saved items/comments/upvotes/etc.
-
-    # Note: can't be generated as easily since this is a nested configuration object
-    #+begin_src python
-    class reddit:
-        class rexport:
-            '''
-            Uses [[https://github.com/karlicoss/rexport][rexport]] output.
-            '''
-
-            # path[s]/glob to the exported JSON data
-            export_path: Paths
-
-        class pushshift:
-            '''
-            Uses [[https://github.com/seanbreckenridge/pushshift_comment_export][pushshift]] to get access to old comments
-            '''
-
-            # path[s]/glob to the exported JSON data
-            export_path: Paths
-
-    #+end_src
 ** [[file:../my/pocket.py][my.pocket]]
 
     [[https://getpocket.com][Pocket]] bookmarks and highlights
@@ -267,3 +287,4 @@ for cls, p in modules:
     # path[s]/glob to the exported databases
     export_path: Paths
     #+end_src
+
diff --git a/my/browser/export.py b/my/browser/export.py
new file mode 100644
index 0000000..864dc89
--- /dev/null
+++ b/my/browser/export.py
@@ -0,0 +1,90 @@
+"""
+Parses Browser history using [[http://github.com/seanbreckenridge/browserexport][browserexport]]
+"""
+
+REQUIRES = ["browserexport"]
+
+from my.config import browser as user_config
+from my.core import Paths, dataclass
+
+
+@dataclass
+class config(user_config.export):
+    # path[s]/glob to your backed up browser history sqlite files
+    export_path: Paths
+
+    # paths to sqlite database files which you
+    # use actively, which should be combined into your history
+    # For example:
+    # from browserexport.browsers.all import Firefox
+    # active_databases = Firefox.locate_database()
+    active_databases: Paths
+
+
+import os
+from pathlib import Path
+from typing import Iterator, List
+
+from sqlite_backup import sqlite_backup
+
+from my.core import Stats, get_files, LazyLogger
+from my.core.common import mcachew
+
+
+# patch browserexport logs if HPI_LOGS is present
+if "HPI_LOGS" in os.environ:
+    from browserexport.log import setup as setup_browserexport_logger
+    from my.core.logging import mklevel
+
+    setup_browserexport_logger(mklevel(os.environ["HPI_LOGS"]))
+
+
+logger = LazyLogger(__name__, level="warning")
+
+
+from browserexport.merge import read_and_merge, merge_visits, Visit
+from browserexport.parse import read_visits
+
+
+# all of my backed up databases
+def inputs() -> List[Path]:
+    return list(get_files(config.export_path))
+
+
+# return the visits from the active sqlite database,
+# copying the active database into memory using
+# https://github.com/seanbreckenridge/sqlite_backup
+def _active_visits() -> List[Visit]:
+    visits: List[Visit] = []
+    active_dbs = get_files(config.active_databases or "")
+    logger.debug(f"Reading from active databases: {active_dbs}")
+    for ad in active_dbs:
+        conn = sqlite_backup(ad)
+        assert conn is not None
+        try:
+            # read visits, so can close the in-memory connection
+            visits.extend(list(read_visits(conn)))
+        finally:
+            conn.close()
+    logger.debug(f"Read {len(visits)} visits from active databases")
+    return visits
+
+
+Results = Iterator[Visit]
+
+
+# don't put this behind cachew, since the active history database(s)
+# are merged when this is called, whose contents may constantly change
+def history() -> Results:
+    yield from merge_visits([_history_from_backups(), _active_visits()])
+
+
+@mcachew(depends_on=lambda: sorted(map(str, inputs())), logger=logger)
+def _history_from_backups() -> Results:
+    yield from read_and_merge(inputs())
+
+
+def stats() -> Stats:
+    from my.core import stat
+
+    return {**stat(history)}
diff --git a/my/config.py b/my/config.py
index 7201a84..cac9bc6 100644
--- a/my/config.py
+++ b/my/config.py
@@ -129,3 +129,8 @@ class fbmessenger:
 class twitter:
     class talon:
         export_path: Paths
+
+class browser:
+    class export:
+        export_path: Paths = ''
+        active_databases: Paths = ''
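For reference, once this patch is applied and =browser.export.export_path= in your config points at some databases backed up with browserexport, the new module can be exercised roughly like below. This is only an illustrative sketch, not part of the patch: the glob in the comment is a placeholder, and the =dt=/=url= attributes come from browserexport's =Visit= type.

#+begin_src python
# Usage sketch (illustrative, not part of the patch).
# Assumes my.config.browser.export.export_path points at backed up
# databases, e.g. '~/data/browsing/*.sqlite' (placeholder path).
from my.browser.export import history, inputs, stats

print(f"backed up databases: {len(inputs())}")

# history() merges the backed up databases with any configured
# active_databases, deduplicating visits across them
visits = list(history())
print(f"{len(visits)} visits total")

# dt is the visit time, url the visited page (from browserexport's Visit)
latest = max(visits, key=lambda v: v.dt)
print(latest.dt, latest.url)

print(stats())  # quick count, useful for `hpi doctor`-style sanity checks
#+end_src

Setting =active_databases= as well (for example via =Firefox.locate_database()=, as the config comment suggests) additionally pulls recent visits out of the live browser profile through =sqlite_backup=, so =history()= stays current without needing a fresh export.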