browser: parse browser history using browserexport

from my modules:
1fba8ccf2f/my/browser/export.py
This commit is contained in:
Sean Breckenridge 2022-02-13 04:51:15 -08:00
parent 059c4ae791
commit b1a71b4d6f
3 changed files with 140 additions and 24 deletions

View file

@ -63,6 +63,50 @@ The config snippets below are meant to be modified accordingly and *pasted into
You don't have to set up all modules at once; it's recommended to do it gradually, to get a feel for how HPI works.
# Nested Configurations before the doc generation using the block below
** [[file:../my/reddit][my.reddit]]
Reddit data: saved items/comments/upvotes/etc.
# Note: can't be generated as easily since this is a nested configuration object
#+begin_src python
class reddit:
class rexport:
'''
Uses [[https://github.com/karlicoss/rexport][rexport]] output.
'''
# path[s]/glob to the exported JSON data
export_path: Paths
class pushshift:
'''
Uses [[https://github.com/seanbreckenridge/pushshift_comment_export][pushshift]] to get access to old comments
'''
# path[s]/glob to the exported JSON data
export_path: Paths
#+end_src
** [[file:../my/browser/export.py][my.browser.export]]
Parses browser history using [[https://github.com/seanbreckenridge/browserexport][browserexport]]
#+begin_src python
@dataclass
class browser:
class export:
# path[s]/glob to your backed up browser history sqlite files
export_path: Paths
# paths to sqlite database files which you
# use actively, which should be combined into your history
# For example:
# from browserexport.browsers.all import Firefox
# active_databases = Firefox.locate_database()
active_databases: Paths
#+end_src
# TODO hmm. drawer raw means it can output outlines, but then have to manually erase the generated results. ugh. # TODO hmm. drawer raw means it can output outlines, but then have to manually erase the generated results. ugh.
#+begin_src python :dir .. :results output drawer raw :exports result #+begin_src python :dir .. :results output drawer raw :exports result
@ -139,30 +183,6 @@ for cls, p in modules:
# path[s]/glob to the exported JSON data
export_path: Paths export_path: Paths
#+end_src #+end_src
** [[file:../my/reddit][my.reddit]]
Reddit data: saved items/comments/upvotes/etc.
# Note: can't be generated as easily since this is a nested configuration object
#+begin_src python
class reddit:
class rexport:
'''
Uses [[https://github.com/karlicoss/rexport][rexport]] output.
'''
# path[s]/glob to the exported JSON data
export_path: Paths
class pushshift:
'''
Uses [[https://github.com/seanbreckenridge/pushshift_comment_export][pushshift]] to get access to old comments
'''
# path[s]/glob to the exported JSON data
export_path: Paths
#+end_src
** [[file:../my/pocket.py][my.pocket]] ** [[file:../my/pocket.py][my.pocket]]
[[https://getpocket.com][Pocket]] bookmarks and highlights [[https://getpocket.com][Pocket]] bookmarks and highlights
@ -267,3 +287,4 @@ for cls, p in modules:
# path[s]/glob to the exported databases # path[s]/glob to the exported databases
export_path: Paths export_path: Paths
#+end_src #+end_src

90
my/browser/export.py Normal file
View file

@ -0,0 +1,90 @@
"""
Parses Browser history using [[http://github.com/seanbreckenridge/browserexport][browserexport]]
"""
REQUIRES = ["browserexport"]
from my.config import browser as user_config
from my.core import Paths, dataclass
@dataclass
class config(user_config.export):
    # Backed up browser history sqlite files: a path, several paths, or a glob.
    export_path: Paths

    # Sqlite database files of the browsers you actively use; their visits
    # are combined into your history.
    # For example:
    # from browserexport.browsers.all import Firefox
    # active_databases = Firefox.locate_database()
    active_databases: Paths
import os
from pathlib import Path
from typing import Iterator, List
from sqlite_backup import sqlite_backup
from my.core import Stats, get_files, LazyLogger
from my.core.common import mcachew
# When HPI_LOGS is present in the environment, hand its value (converted
# with mklevel) to browserexport's own logging setup.
if os.environ.get("HPI_LOGS") is not None:
    from browserexport.log import setup as setup_browserexport_logger
    from my.core.logging import mklevel

    setup_browserexport_logger(mklevel(os.environ["HPI_LOGS"]))

# logger for this module, created with level "warning"
logger = LazyLogger(__name__, level="warning")
from browserexport.merge import read_and_merge, merge_visits, Visit
from browserexport.parse import read_visits
def inputs() -> List[Path]:
    """Return all backed up database files resolved from ``config.export_path``."""
    backups = get_files(config.export_path)
    return list(backups)
def _active_visits() -> List[Visit]:
    """Return the visits from the active sqlite database(s).

    Each file resolved from ``config.active_databases`` is copied into
    memory using https://github.com/seanbreckenridge/sqlite_backup, read
    with ``read_visits``, and the in-memory connection is closed afterwards.

    Returns:
        The visits read from every active database, accumulated in the
        order the databases were processed.
    """
    # function-scope import so this block does not depend on the file header
    from contextlib import closing

    visits: List[Visit] = []
    # a falsy ``active_databases`` is passed to get_files as ""
    active_dbs = get_files(config.active_databases or "")
    logger.debug(f"Reading from active databases: {active_dbs}")
    for ad in active_dbs:
        conn = sqlite_backup(ad)
        # sqlite_backup's result is treated as optional here; a connection
        # is required to read anything
        assert conn is not None
        # ``extend`` consumes the iterator fully before ``closing`` closes
        # the connection, so no intermediate list is needed
        with closing(conn):
            visits.extend(read_visits(conn))
    logger.debug(f"Read {len(visits)} visits from active databases")
    return visits
# iterator of visits, used as the return type of the history functions
Results = Iterator[Visit]


# Not placed behind cachew: the active history database(s) are merged in
# on every call, and their contents may constantly change.
def history() -> Results:
    """Yield the merge of the backed up visits and the active-database visits."""
    backed_up = _history_from_backups()
    live = _active_visits()
    yield from merge_visits([backed_up, live])
@mcachew(depends_on=lambda: sorted(str(path) for path in inputs()), logger=logger)
def _history_from_backups() -> Results:
    """Yield the visits read and merged from every file in ``inputs()``.

    Wrapped with ``mcachew``; ``depends_on`` is the sorted list of the
    input paths as strings.
    """
    backups = inputs()
    yield from read_and_merge(backups)
def stats() -> Stats:
    """Return the ``stat`` summary of ``history`` as a new dict."""
    from my.core import stat

    summary: Stats = {}
    summary.update(stat(history))
    return summary

View file

@ -129,3 +129,8 @@ class fbmessenger:
class twitter: class twitter:
class talon: class talon:
export_path: Paths export_path: Paths
class browser:
    class export:
        # backed up browser history databases; defaults to an empty string
        export_path: Paths = ""
        # actively used browser databases; defaults to an empty string
        active_databases: Paths = ""