browser: parse browser history using browserexport

from my modules: 1fba8ccf2f/my/browser/export.py
2022-02-13 04:51:15 -08:00 · 2022-02-13 04:51:15 -08:00 · b1a71b4d6f
commit b1a71b4d6f
parent 059c4ae791
3 changed files with 140 additions and 24 deletions
--- a/doc/MODULES.org
+++ b/doc/MODULES.org
@ -63,6 +63,50 @@ The config snippets below are meant to be modified accordingly and *pasted into
 You don't have to set up all modules at once, it's recommended to do it gradually, to get the feel of how HPI works.
 # Nested configurations are documented by hand here, before the docs generated by the block below
 ** [[file:../my/reddit][my.reddit]]
    Reddit data: saved items/comments/upvotes/etc.
    # Note: can't be generated as easily since this is a nested configuration object
    #+begin_src python
    class reddit:
        class rexport:
            '''
            Uses [[https://github.com/karlicoss/rexport][rexport]] output.
            '''
            # path[s]/glob to the exported JSON data
            export_path: Paths
        class pushshift:
            '''
            Uses [[https://github.com/seanbreckenridge/pushshift_comment_export][pushshift]] to get access to old comments
            '''
            # path[s]/glob to the exported JSON data
            export_path: Paths
    #+end_src
 ** [[file:../my/browser/export.py][my.browser.export]]
    Parses Browser history using [[http://github.com/seanbreckenridge/browserexport][browserexport]]
    #+begin_src python
    @dataclass
    class browser:
        class export:
            # path[s]/glob to your backed up browser history sqlite files
            export_path: Paths
            # paths to sqlite database files which you
            # use actively, which should be combined into your history
            # For example:
            # from browserexport.browsers.all import Firefox
            # active_databases = Firefox.locate_database()
            active_databases: Paths
    #+end_src
 # TODO hmm. drawer raw means it can output outlines, but then have to manually erase the generated results. ugh.
 #+begin_src python :dir .. :results output drawer raw :exports result
@ -139,30 +183,6 @@ for cls, p in modules:
        # path[s]/glob to the exported JSON data
        export_path: Paths
    #+end_src
 ** [[file:../my/reddit][my.reddit]]
    Reddit data: saved items/comments/upvotes/etc.
    # Note: can't be generated as easily since this is a nested configuration object
    #+begin_src python
    class reddit:
        class rexport:
            '''
            Uses [[https://github.com/karlicoss/rexport][rexport]] output.
            '''
            # path[s]/glob to the exported JSON data
            export_path: Paths
        class pushshift:
            '''
            Uses [[https://github.com/seanbreckenridge/pushshift_comment_export][pushshift]] to get access to old comments
            '''
            # path[s]/glob to the exported JSON data
            export_path: Paths
    #+end_src
 ** [[file:../my/pocket.py][my.pocket]]
    [[https://getpocket.com][Pocket]] bookmarks and highlights
@ -267,3 +287,4 @@ for cls, p in modules:
        # path[s]/glob to the exported databases
        export_path: Paths
    #+end_src
--- a/my/browser/export.py
+++ b/my/browser/export.py
@ -0,0 +1,90 @@
 """
 Parses Browser history using [[http://github.com/seanbreckenridge/browserexport][browserexport]]
 """
 # Packages this module needs at import time. sqlite_backup is imported
 # directly by this module (to snapshot active databases), so it is declared
 # explicitly rather than relying on it being installed transitively.
 REQUIRES = ["browserexport", "sqlite_backup"]
 from my.config import browser as user_config
 from my.core import Paths, dataclass
# Settings for this module; extends the user's `browser.export` config class
# imported above as `user_config`.
@dataclass
 class config(user_config.export):
    # path[s]/glob to your backed up browser history sqlite files
    export_path: Paths
    # paths to sqlite database files which you
    # use actively, which should be combined into your history
    # For example:
    # from browserexport.browsers.all import Firefox
    # active_databases = Firefox.locate_database()
    active_databases: Paths
 import os
 from pathlib import Path
 from typing import Iterator, List
 from sqlite_backup import sqlite_backup
 from my.core import Stats, get_files, LazyLogger
 from my.core.common import mcachew
 # patch browserexport logs if HPI_LOGS is present
 # (runs at import time; the level is derived from the HPI_LOGS value via mklevel)
 if "HPI_LOGS" in os.environ:
    # imported lazily so these are only loaded when HPI_LOGS is set
    from browserexport.log import setup as setup_browserexport_logger
    from my.core.logging import mklevel
    setup_browserexport_logger(mklevel(os.environ["HPI_LOGS"]))
 # module-level logger, created with level "warning"
 logger = LazyLogger(__name__, level="warning")
 from browserexport.merge import read_and_merge, merge_visits, Visit
 from browserexport.parse import read_visits
 # every backed up database matched by config.export_path
 def inputs() -> List[Path]:
     backups = get_files(config.export_path)
     return list(backups)
 # collect the visits stored in the active (in-use) sqlite databases;
 # each database is first copied into memory using
 # https://github.com/seanbreckenridge/sqlite_backup
 def _active_visits() -> List[Visit]:
     live_dbs = get_files(config.active_databases or "")
     logger.debug(f"Reading from active databases: {live_dbs}")
     collected: List[Visit] = []
     for db_path in live_dbs:
         snapshot = sqlite_backup(db_path)
         assert snapshot is not None
         try:
             # extend() drains the visit iterator completely, so the
             # in-memory connection can be closed right afterwards
             collected.extend(read_visits(snapshot))
         finally:
             snapshot.close()
     logger.debug(f"Read {len(collected)} visits from active databases")
     return collected
 Results = Iterator[Visit]
 # deliberately not wrapped in cachew: every call merges in the active
 # history database(s), whose contents may change at any time
 def history() -> Results:
     from_backups = _history_from_backups()
     from_active = _active_visits()
     yield from merge_visits([from_backups, from_active])
@mcachew(depends_on=lambda: sorted(str(db) for db in inputs()), logger=logger)
def _history_from_backups() -> Results:
    """Yield the merged visits read from the backed up databases.

    Wrapped in mcachew, which is given the sorted string paths of
    inputs() as its dependency.
    """
    backups = inputs()
    yield from read_and_merge(backups)
 def stats() -> Stats:
     """Return a fresh dict built from the stat() result for history."""
     from my.core import stat
     history_stats = stat(history)
     return dict(history_stats)
--- a/my/config.py
+++ b/my/config.py
@ -129,3 +129,8 @@ class fbmessenger:
 # stub config section for twitter; only the nested `talon` settings are declared here
 class twitter:
    class talon:
        # presumably path[s]/glob to the talon export -- TODO confirm against the talon module
        export_path: Paths
 # stub config section consumed by my.browser.export (imported there as `user_config`)
 class browser:
    class export:
        # path[s]/glob to backed up browser history sqlite files; empty by default
        export_path: Paths = ''
        # paths to actively used sqlite databases to combine into the history; empty by default
        active_databases: Paths = ''