browser: parse browser history using browserexport (#216)

* browser: parse browser history using browserexport from seanbreckenridge/HPI module: 1fba8ccf2f/my/browser/export.py
2022-02-13 15:56:05 -08:00 · 2022-02-13 15:56:05 -08:00 · 9e5cd60ff2
commit 9e5cd60ff2
parent 059c4ae791
7 changed files with 198 additions and 24 deletions
--- a/doc/MODULES.org
+++ b/doc/MODULES.org
@ -63,6 +63,50 @@ The config snippets below are meant to be modified accordingly and *pasted into

 You don't have to set up all modules at once, it's recommended to do it gradually, to get the feel of how HPI works.

+# Nested configurations are documented by hand here, before the doc-generation block below
+** [[file:../my/reddit][my.reddit]]
+
+    Reddit data: saved items/comments/upvotes/etc.
+
+    # Note: can't be generated as easily since this is a nested configuration object
+    #+begin_src python
+    class reddit:
+        class rexport:
+            '''
+            Uses [[https://github.com/karlicoss/rexport][rexport]] output.
+            '''
+
+            # path[s]/glob to the exported JSON data
+            export_path: Paths
+
+        class pushshift:
+            '''
+            Uses [[https://github.com/seanbreckenridge/pushshift_comment_export][pushshift]] to get access to old comments
+            '''
+
+            # path[s]/glob to the exported JSON data
+            export_path: Paths
+
+    #+end_src
+** [[file:../my/browser/][my.browser]]
+
+    Parses browser history using [[http://github.com/seanbreckenridge/browserexport][browserexport]]
+
+    #+begin_src python
+    @dataclass
+    class browser:
+        class export:
+            # path[s]/glob to your backed up browser history sqlite files
+            export_path: Paths
+
+        class active_browser:
+            # paths to sqlite database files which you use actively
+            # to read from. For example:
+            # from browserexport.browsers.all import Firefox
+            # export_path = Firefox.locate_database()
+            export_path: Paths
+    #+end_src
+
 # TODO hmm. drawer raw means it can output outlines, but then have to manually erase the generated results. ugh.

 #+begin_src python :dir .. :results output drawer raw :exports result
@ -139,30 +183,6 @@ for cls, p in modules:
        # paths[s]/glob to the exported JSON data
        export_path: Paths
    #+end_src
-** [[file:../my/reddit][my.reddit]]
-
-    Reddit data: saved items/comments/upvotes/etc.
-
-    # Note: can't be generated as easily since this is a nested configuration object
-    #+begin_src python
-    class reddit:
-        class rexport:
-            '''
-            Uses [[https://github.com/karlicoss/rexport][rexport]] output.
-            '''
-
-            # path[s]/glob to the exported JSON data
-            export_path: Paths
-
-        class pushshift:
-            '''
-            Uses [[https://github.com/seanbreckenridge/pushshift_comment_export][pushshift]] to get access to old comments
-            '''
-
-            # path[s]/glob to the exported JSON data
-            export_path: Paths
-
-    #+end_src
 ** [[file:../my/pocket.py][my.pocket]]

    [[https://getpocket.com][Pocket]] bookmarks and highlights
--- a/my/browser/active_browser.py
+++ b/my/browser/active_browser.py
@ -0,0 +1,50 @@
+"""
+Parses active browser history by backing it up with [[http://github.com/seanbreckenridge/sqlite_backup][sqlite_backup]]
+"""
+
+REQUIRES = ["browserexport", "sqlite_backup"]
+
+
+from my.config import browser as user_config
+from my.core import Paths, dataclass
+
+
+# Config for this module; extends the user's `browser.active_browser` section from my.config.
+@dataclass
+class config(user_config.active_browser):
+    # paths to sqlite database files which you use actively
+    # to read from. For example:
+    # from browserexport.browsers.all import Firefox
+    # export_path = Firefox.locate_database()
+    export_path: Paths
+
+
+from pathlib import Path
+from typing import Sequence, Iterator
+
+from my.core import get_files, Stats
+from browserexport.merge import read_visits, Visit
+from sqlite_backup import sqlite_backup
+
+from .common import _patch_browserexport_logs
+
+_patch_browserexport_logs()
+
+
+def inputs() -> Sequence[Path]:
+    """Return the files that get_files resolves from config.export_path."""
+    return get_files(config.export_path)
+
+
+def history() -> Iterator[Visit]:
+    """
+    Yield visits read from every database returned by inputs().
+
+    Each path is passed to sqlite_backup, and visits are read from the
+    connection it returns via read_visits. The connection is closed once
+    reading finishes, fails, or the generator is closed early.
+    """
+    for ad in inputs():
+        conn = sqlite_backup(ad)
+        # NOTE(review): presumably sqlite_backup's return type is Optional,
+        # hence the check before use -- confirm against sqlite_backup
+        assert conn is not None
+        try:
+            yield from read_visits(conn)
+        finally:
+            # runs on normal exhaustion, on error, and when the generator is closed
+            conn.close()
+
+
+def stats() -> Stats:
+    """Return a new dict holding the result of my.core.stat applied to history."""
+    # imported here rather than at module level
+    from my.core import stat
+
+    return {**stat(history)}
--- a/my/browser/all.py
+++ b/my/browser/all.py
@ -0,0 +1,35 @@
+from typing import Iterator
+
+from my.core import Stats
+from my.core.source import import_source
+from browserexport.merge import merge_visits, Visit
+
+
+src_export = import_source(module_name="my.browser.export")
+src_active = import_source(module_name="my.browser.active_browser")
+
+
+# Decorated with src_export, i.e. import_source(module_name="my.browser.export").
+# NOTE(review): import_source presumably lets this source be skipped when the
+# module can't be imported -- confirm in my.core.source
+@src_export
+def _visits_export() -> Iterator[Visit]:
+    """Return the visits from my.browser.export.history()."""
+    # imported inside the function, so .export is only imported when this is called
+    from . import export
+    return export.history()
+
+
+# Decorated with src_active, i.e. import_source(module_name="my.browser.active_browser").
+# NOTE(review): import_source presumably lets this source be skipped when the
+# module can't be imported -- confirm in my.core.source
+@src_active
+def _visits_active() -> Iterator[Visit]:
+    """Return the visits from my.browser.active_browser.history()."""
+    # imported inside the function, so .active_browser is only imported when this is called
+    from . import active_browser
+    return active_browser.history()
+
+
+# NOTE: you can comment out the sources you don't need
+def history() -> Iterator[Visit]:
+    """Yield the visits that merge_visits produces from the active and export sources."""
+    yield from merge_visits([
+        _visits_active(),
+        _visits_export(),
+    ])
+
+
+def stats() -> Stats:
+    """Return a new dict holding the result of my.core.stat applied to the merged history."""
+    # imported here rather than at module level
+    from my.core import stat
+
+    return {**stat(history)}
--- a/my/browser/common.py
+++ b/my/browser/common.py
@ -0,0 +1,11 @@
+import os
+from my.core.util import __NOT_HPI_MODULE__
+
+
+def _patch_browserexport_logs():
+    """
+    Configure browserexport's logger from the HPI_LOGS environment variable.
+
+    Does nothing when HPI_LOGS is not set.
+    """
+    # patch browserexport logs if HPI_LOGS is present
+    if "HPI_LOGS" in os.environ:
+        # imported only on this branch, so they are not loaded when HPI_LOGS is unset
+        from browserexport.log import setup as setup_browserexport_logger
+        from my.core.logging import mklevel
+
+        # mklevel converts the HPI_LOGS value into the argument passed to browserexport's setup
+        setup_browserexport_logger(mklevel(os.environ["HPI_LOGS"]))
--- a/my/browser/export.py
+++ b/my/browser/export.py
@ -0,0 +1,50 @@
+"""
+Parses browser history using [[http://github.com/seanbreckenridge/browserexport][browserexport]]
+"""
+
+REQUIRES = ["browserexport"]
+
+from my.config import browser as user_config
+from my.core import Paths, dataclass
+
+
+# Config for this module; extends the user's `browser.export` section from my.config.
+@dataclass
+class config(user_config.export):
+    # path[s]/glob to your backed up browser history sqlite files
+    export_path: Paths
+
+
+from pathlib import Path
+from typing import Iterator, Sequence, List
+
+from my.core import Stats, get_files, LazyLogger
+from my.core.common import mcachew
+
+from browserexport.merge import read_and_merge, Visit
+
+from .common import _patch_browserexport_logs
+
+
+logger = LazyLogger(__name__, level="warning")
+
+_patch_browserexport_logs()
+
+
+# all of my backed up databases
+def inputs() -> Sequence[Path]:
+    """Return the files that get_files resolves from config.export_path."""
+    return get_files(config.export_path)
+
+
+def _cachew_depends_on() -> List[str]:
+    """Return the input paths as strings; passed to mcachew as depends_on for history()."""
+    return [str(f) for f in inputs()]
+
+
+# NOTE(review): presumably mcachew reuses cached results while the value returned by
+# _cachew_depends_on (the list of input paths) is unchanged -- confirm against my.core.common
+@mcachew(depends_on=_cachew_depends_on, logger=logger)
+def history() -> Iterator[Visit]:
+    """Yield the visits that read_and_merge produces from all inputs()."""
+    yield from read_and_merge(inputs())
+
+
+def stats() -> Stats:
+    """Return a new dict holding the result of my.core.stat applied to history."""
+    # imported here rather than at module level
+    from my.core import stat
+
+    return {**stat(history)}
--- a/my/config.py
+++ b/my/config.py
@ -129,3 +129,9 @@ class fbmessenger:
 class twitter:
    class talon:
        export_path: Paths
+
+# Config section read by my.browser.export and my.browser.active_browser
+# (both import `browser` from my.config); export_path defaults to an empty string.
+class browser:
+    # subclassed by my.browser.export.config
+    class export:
+        export_path: Paths = ''
+    # subclassed by my.browser.active_browser.config
+    class active_browser:
+        export_path: Paths = ''
--- a/tox.ini
+++ b/tox.ini
@ -84,6 +84,7 @@ commands =
 commands =
    pip install -e .[testing,optional]

+    hpi module install my.browser.export
    hpi module install my.orgmode
    hpi module install my.endomondo
    hpi module install my.github.ghexport
@ -103,6 +104,7 @@ commands =
    # todo fuck. -p my.github isn't checking the subpackages?? wtf...
    # guess it wants .pyi file??
    python3 -m mypy --install-types --non-interactive \
+                    -p my.browser                     \
                    -p my.endomondo                   \
                    -p my.github.ghexport             \
                    -p my.hypothesis                  \