browser: parse browser history using browserexport
from my modules:
1fba8ccf2f/my/browser/export.py
This commit is contained in:
parent
059c4ae791
commit
b1a71b4d6f
3 changed files with 140 additions and 24 deletions
|
@ -63,6 +63,50 @@ The config snippets below are meant to be modified accordingly and *pasted into
|
|||
|
||||
You don't have to set up all modules at once, it's recommended to do it gradually, to get the feel of how HPI works.
|
||||
|
||||
# Nested configurations are documented by hand here, before the doc generation block below
|
||||
** [[file:../my/reddit][my.reddit]]
|
||||
|
||||
Reddit data: saved items/comments/upvotes/etc.
|
||||
|
||||
# Note: can't be generated as easily since this is a nested configuration object
|
||||
#+begin_src python
|
||||
class reddit:
|
||||
class rexport:
|
||||
'''
|
||||
Uses [[https://github.com/karlicoss/rexport][rexport]] output.
|
||||
'''
|
||||
|
||||
# path[s]/glob to the exported JSON data
|
||||
export_path: Paths
|
||||
|
||||
class pushshift:
|
||||
'''
|
||||
Uses [[https://github.com/seanbreckenridge/pushshift_comment_export][pushshift]] to get access to old comments
|
||||
'''
|
||||
|
||||
# path[s]/glob to the exported JSON data
|
||||
export_path: Paths
|
||||
|
||||
#+end_src
|
||||
** [[file:../my/browser/export.py][my.browser.export]]
|
||||
|
||||
Parses Browser history using [[http://github.com/seanbreckenridge/browserexport][browserexport]]
|
||||
|
||||
#+begin_src python
|
||||
@dataclass
|
||||
class browser:
|
||||
class export:
|
||||
# path[s]/glob to your backed up browser history sqlite files
|
||||
export_path: Paths
|
||||
|
||||
# paths to sqlite database files which you
|
||||
# use actively, which should be combined into your history
|
||||
# For example:
|
||||
# from browserexport.browsers.all import Firefox
|
||||
# active_databases = Firefox.locate_database()
|
||||
active_databases: Paths
|
||||
#+end_src
|
||||
|
||||
# TODO hmm. drawer raw means it can output outlines, but then have to manually erase the generated results. ugh.
|
||||
|
||||
#+begin_src python :dir .. :results output drawer raw :exports result
|
||||
|
@ -139,30 +183,6 @@ for cls, p in modules:
|
|||
# path[s]/glob to the exported JSON data
|
||||
export_path: Paths
|
||||
#+end_src
|
||||
** [[file:../my/reddit][my.reddit]]
|
||||
|
||||
Reddit data: saved items/comments/upvotes/etc.
|
||||
|
||||
# Note: can't be generated as easily since this is a nested configuration object
|
||||
#+begin_src python
|
||||
class reddit:
|
||||
class rexport:
|
||||
'''
|
||||
Uses [[https://github.com/karlicoss/rexport][rexport]] output.
|
||||
'''
|
||||
|
||||
# path[s]/glob to the exported JSON data
|
||||
export_path: Paths
|
||||
|
||||
class pushshift:
|
||||
'''
|
||||
Uses [[https://github.com/seanbreckenridge/pushshift_comment_export][pushshift]] to get access to old comments
|
||||
'''
|
||||
|
||||
# path[s]/glob to the exported JSON data
|
||||
export_path: Paths
|
||||
|
||||
#+end_src
|
||||
** [[file:../my/pocket.py][my.pocket]]
|
||||
|
||||
[[https://getpocket.com][Pocket]] bookmarks and highlights
|
||||
|
@ -267,3 +287,4 @@ for cls, p in modules:
|
|||
# path[s]/glob to the exported databases
|
||||
export_path: Paths
|
||||
#+end_src
|
||||
|
||||
|
|
90
my/browser/export.py
Normal file
90
my/browser/export.py
Normal file
|
@ -0,0 +1,90 @@
|
|||
"""
|
||||
Parses Browser history using [[http://github.com/seanbreckenridge/browserexport][browserexport]]
|
||||
"""
|
||||
|
||||
REQUIRES = ["browserexport"]
|
||||
|
||||
from my.config import browser as user_config
|
||||
from my.core import Paths, dataclass
|
||||
|
||||
|
||||
@dataclass
class config(user_config.export):
    # Backed up browser history sqlite files: path[s] or a glob.
    export_path: Paths

    # Sqlite database files that are still in active use and whose visits
    # should be combined into the history. For example:
    #   from browserexport.browsers.all import Firefox
    #   active_databases = Firefox.locate_database()
    active_databases: Paths
|
||||
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Iterator, List
|
||||
|
||||
from sqlite_backup import sqlite_backup
|
||||
|
||||
from my.core import Stats, get_files, LazyLogger
|
||||
from my.core.common import mcachew
|
||||
|
||||
|
||||
# patch browserexport logs if HPI_LOGS is present
|
||||
if "HPI_LOGS" in os.environ:
|
||||
from browserexport.log import setup as setup_browserexport_logger
|
||||
from my.core.logging import mklevel
|
||||
|
||||
setup_browserexport_logger(mklevel(os.environ["HPI_LOGS"]))
|
||||
|
||||
|
||||
logger = LazyLogger(__name__, level="warning")
|
||||
|
||||
|
||||
from browserexport.merge import read_and_merge, merge_visits, Visit
|
||||
from browserexport.parse import read_visits
|
||||
|
||||
|
||||
# all of my backed up databases
|
||||
def inputs() -> List[Path]:
    """Return the backed up history databases matched by ``config.export_path``."""
    backups = get_files(config.export_path)
    return [*backups]
|
||||
|
||||
|
||||
# return the visits from the active sqlite database,
|
||||
# copying the active database into memory using
|
||||
# https://github.com/seanbreckenridge/sqlite_backup
|
||||
def _active_visits() -> List[Visit]:
    """Collect the visits stored in the actively used browser databases.

    Each path resolved from ``config.active_databases`` is passed to
    ``sqlite_backup`` and the visits are read from the connection it returns.
    The connection is always closed, even if reading fails.
    """
    live_dbs = get_files(config.active_databases or "")
    logger.debug(f"Reading from active databases: {live_dbs}")
    collected: List[Visit] = []
    for db_path in live_dbs:
        snapshot = sqlite_backup(db_path)
        assert snapshot is not None
        try:
            # materialize the visits now, so the connection can be closed
            collected += list(read_visits(snapshot))
        finally:
            snapshot.close()
    logger.debug(f"Read {len(collected)} visits from active databases")
    return collected
|
||||
|
||||
|
||||
Results = Iterator[Visit]
|
||||
|
||||
|
||||
# don't put this behind cachew, since the active history database(s)
|
||||
# are merged when this is called, whose contents may constantly change
|
||||
def history() -> Results:
    """Yield the merged visits from the backed up and the active databases."""
    backed_up = _history_from_backups()
    live = _active_visits()
    yield from merge_visits([backed_up, live])
|
||||
|
||||
|
||||
@mcachew(depends_on=lambda: sorted(str(path) for path in inputs()), logger=logger)
def _history_from_backups() -> Results:
    """Yield the merged visits read from every backed up database.

    The cache depends on the sorted list of input paths.
    """
    backups = inputs()
    yield from read_and_merge(backups)
|
||||
|
||||
|
||||
def stats() -> Stats:
    """Return the statistics computed by ``my.core.stat`` for ``history``."""
    from my.core import stat

    return dict(stat(history))
|
|
@ -129,3 +129,8 @@ class fbmessenger:
|
|||
class twitter:
|
||||
class talon:
|
||||
export_path: Paths
|
||||
|
||||
class browser:
    class export:
        # path[s]/glob to backed up browser history sqlite files
        export_path: Paths = ''
        # paths to sqlite database files that are in active use
        active_databases: Paths = ''
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue